Bug 693057 - Upgrade libvpx decoder to v0.9.7-p1, r=cpearce,khuey
authorTimothy B. Terriberry <tterribe@vt.edu>
Thu, 13 Oct 2011 17:37:21 -0700
changeset 78719 64603f6ddda1641bfd0e925d89f30d6c7c49134f
parent 78718 5f16d668ce5a18668d35b2ba495dfeb9d618c374
child 78720 f5eded31718ae2fd1edd6955f3ec533c4951e853
push idunknown
push userunknown
push dateunknown
reviewerscpearce, khuey
bugs693057
milestone10.0a1
Bug 693057 - Upgrade libvpx decoder to v0.9.7-p1, r=cpearce,khuey
configure.in
media/libvpx/Makefile.in
media/libvpx/build/make/ads2gas.pl
media/libvpx/build/make/obj_int_extract.c
media/libvpx/compile_errors.patch
media/libvpx/solaris.patch
media/libvpx/update.sh
media/libvpx/vp8/common/alloccommon.c
media/libvpx/vp8/common/arm/arm_systemdependent.c
media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
media/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
media/libvpx/vp8/common/arm/armv6/filter_v6.asm
media/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
media/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
media/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
media/libvpx/vp8/common/arm/bilinearfilter_arm.c
media/libvpx/vp8/common/arm/bilinearfilter_arm.h
media/libvpx/vp8/common/arm/filter_arm.c
media/libvpx/vp8/common/arm/loopfilter_arm.c
media/libvpx/vp8/common/arm/loopfilter_arm.h
media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
media/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
media/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
media/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
media/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
media/libvpx/vp8/common/arm/neon/recon_neon.c
media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
media/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
media/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
media/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
media/libvpx/vp8/common/arm/reconintra_arm.c
media/libvpx/vp8/common/arm/vpx_asm_offsets.c
media/libvpx/vp8/common/blockd.c
media/libvpx/vp8/common/blockd.h
media/libvpx/vp8/common/coefupdateprobs.h
media/libvpx/vp8/common/debugmodes.c
media/libvpx/vp8/common/defaultcoefcounts.c
media/libvpx/vp8/common/defaultcoefcounts.h
media/libvpx/vp8/common/entropy.c
media/libvpx/vp8/common/entropy.h
media/libvpx/vp8/common/entropymode.c
media/libvpx/vp8/common/entropymode.h
media/libvpx/vp8/common/entropymv.h
media/libvpx/vp8/common/extend.c
media/libvpx/vp8/common/extend.h
media/libvpx/vp8/common/filter.c
media/libvpx/vp8/common/filter.h
media/libvpx/vp8/common/filter_c.c
media/libvpx/vp8/common/findnearmv.c
media/libvpx/vp8/common/findnearmv.h
media/libvpx/vp8/common/generic/systemdependent.c
media/libvpx/vp8/common/loopfilter.c
media/libvpx/vp8/common/loopfilter.h
media/libvpx/vp8/common/loopfilter_filters.c
media/libvpx/vp8/common/mbpitch.c
media/libvpx/vp8/common/mv.h
media/libvpx/vp8/common/onyx.h
media/libvpx/vp8/common/onyxc_int.h
media/libvpx/vp8/common/onyxd.h
media/libvpx/vp8/common/postproc.c
media/libvpx/vp8/common/postproc.h
media/libvpx/vp8/common/ppflags.h
media/libvpx/vp8/common/predictdc.c
media/libvpx/vp8/common/predictdc.h
media/libvpx/vp8/common/preproc.h
media/libvpx/vp8/common/recon.h
media/libvpx/vp8/common/reconinter.c
media/libvpx/vp8/common/reconinter.h
media/libvpx/vp8/common/reconintra.h
media/libvpx/vp8/common/reconintra4x4.c
media/libvpx/vp8/common/threading.h
media/libvpx/vp8/common/vpxerrors.h
media/libvpx/vp8/common/x86/idctllm_mmx.asm
media/libvpx/vp8/common/x86/idctllm_sse2.asm
media/libvpx/vp8/common/x86/iwalsh_sse2.asm
media/libvpx/vp8/common/x86/loopfilter_mmx.asm
media/libvpx/vp8/common/x86/loopfilter_sse2.asm
media/libvpx/vp8/common/x86/loopfilter_x86.c
media/libvpx/vp8/common/x86/loopfilter_x86.h
media/libvpx/vp8/common/x86/postproc_sse2.asm
media/libvpx/vp8/common/x86/recon_sse2.asm
media/libvpx/vp8/common/x86/recon_wrapper_sse2.c
media/libvpx/vp8/common/x86/recon_x86.h
media/libvpx/vp8/common/x86/subpixel_mmx.asm
media/libvpx/vp8/common/x86/subpixel_sse2.asm
media/libvpx/vp8/common/x86/subpixel_ssse3.asm
media/libvpx/vp8/common/x86/vp8_asm_stubs.c
media/libvpx/vp8/common/x86/x86_systemdependent.c
media/libvpx/vp8/decoder/arm/arm_dsystemdependent.c
media/libvpx/vp8/decoder/arm/armv6/idct_blk_v6.c
media/libvpx/vp8/decoder/arm/dboolhuff_arm.h
media/libvpx/vp8/decoder/arm/dequantize_arm.c
media/libvpx/vp8/decoder/arm/detokenize.asm
media/libvpx/vp8/decoder/arm/detokenize_arm.h
media/libvpx/vp8/decoder/arm/neon/dequant_idct_neon.asm
media/libvpx/vp8/decoder/arm/neon/idct_blk_neon.c
media/libvpx/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
media/libvpx/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
media/libvpx/vp8/decoder/dboolhuff.c
media/libvpx/vp8/decoder/dboolhuff.h
media/libvpx/vp8/decoder/decodemv.c
media/libvpx/vp8/decoder/decoderthreading.h
media/libvpx/vp8/decoder/decodframe.c
media/libvpx/vp8/decoder/dequantize.c
media/libvpx/vp8/decoder/dequantize.h
media/libvpx/vp8/decoder/detokenize.c
media/libvpx/vp8/decoder/detokenize.h
media/libvpx/vp8/decoder/generic/dsystemdependent.c
media/libvpx/vp8/decoder/idct_blk.c
media/libvpx/vp8/decoder/onyxd_if.c
media/libvpx/vp8/decoder/onyxd_int.h
media/libvpx/vp8/decoder/reconintra_mt.c
media/libvpx/vp8/decoder/threading.c
media/libvpx/vp8/decoder/treereader.h
media/libvpx/vp8/decoder/x86/idct_blk_mmx.c
media/libvpx/vp8/decoder/x86/idct_blk_sse2.c
media/libvpx/vp8/decoder/x86/x86_dsystemdependent.c
media/libvpx/vp8/vp8_dx_iface.c
media/libvpx/vpx/internal/vpx_codec_internal.h
media/libvpx/vpx/src/vpx_codec.c
media/libvpx/vpx/src/vpx_decoder.c
media/libvpx/vpx/src/vpx_decoder_compat.c
media/libvpx/vpx/src/vpx_encoder.c
media/libvpx/vpx/vp8.h
media/libvpx/vpx/vp8cx.h
media/libvpx/vpx/vp8dx.h
media/libvpx/vpx/vp8e.h
media/libvpx/vpx/vpx_codec.h
media/libvpx/vpx/vpx_decoder.h
media/libvpx/vpx/vpx_decoder_compat.h
media/libvpx/vpx/vpx_encoder.h
media/libvpx/vpx/vpx_image.h
media/libvpx/vpx/vpx_integer.h
media/libvpx/vpx_config_arm-linux-gcc.h
media/libvpx/vpx_config_generic-gnu.h
media/libvpx/vpx_config_x86-darwin9-gcc.asm
media/libvpx/vpx_config_x86-darwin9-gcc.h
media/libvpx/vpx_config_x86-linux-gcc.asm
media/libvpx/vpx_config_x86-linux-gcc.h
media/libvpx/vpx_config_x86-win32-vs8.asm
media/libvpx/vpx_config_x86-win32-vs8.h
media/libvpx/vpx_config_x86_64-darwin9-gcc.asm
media/libvpx/vpx_config_x86_64-darwin9-gcc.h
media/libvpx/vpx_config_x86_64-linux-gcc.asm
media/libvpx/vpx_config_x86_64-linux-gcc.h
media/libvpx/vpx_config_x86_64-win64-vs8.asm
media/libvpx/vpx_config_x86_64-win64-vs8.h
media/libvpx/vpx_ports/vpx_timer.h
media/libvpx/vpx_ports/x86.h
media/libvpx/vpx_ports/x86_abi_support.asm
media/libvpx/vpx_ports/x86_abi_support_win32.asm
media/libvpx/vpx_scale/generic/scalesystemdependant.c
media/libvpx/vpx_scale/generic/scalesystemdependent.c
media/libvpx/vpx_scale/generic/yv12config.c
media/libvpx/vpx_scale/generic/yv12extend.c
media/libvpx/vpx_scale/yv12config.h
--- a/configure.in
+++ b/configure.in
@@ -5351,26 +5351,51 @@ if test -n "$MOZ_WEBM"; then
         _SAVE_CFLAGS=$CFLAGS
         _SAVE_LDFLAGS=$LDFLAGS
         _SAVE_LIBS=$LIBS
         if test "${LIBVPX_DIR}" = "yes"; then
             LIBVPX_DIR=/usr
         fi
         CFLAGS="-I${LIBVPX_DIR}/include $CFLAGS"
         LDFLAGS="-L${LIBVPX_DIR}/lib $LDFLAGS"
+        MOZ_NATIVE_LIBVPX_DEC_TEST=
         MOZ_CHECK_HEADER(vpx/vpx_decoder.h,
             [if test ! -f "${LIBVPX_DIR}/include/vpx/vpx_decoder.h"; then
              AC_MSG_ERROR([vpx/vpx_decoder.h found, but is not in ${LIBVPX_DIR}/include])
             fi],
             AC_MSG_ERROR([--with-system-libvpx requested but vpx/vpx_decoder.h not found]))
         AC_CHECK_LIB(vpx, vpx_codec_dec_init_ver,
-                     [MOZ_NATIVE_LIBVPX=1
-                      MOZ_LIBVPX_INCLUDES="-I${LIBVPX_DIR}/include"
-                      MOZ_LIBVPX_LIBS="-L${LIBVPX_DIR}/lib -lvpx"],
+                     [MOZ_NATIVE_LIBVPX_DEC_TEST=1],
                      ([--with-system-libvpx requested but symbol vpx_codec_dec_init_ver not found]))
+        if test -n "$MOZ_NATIVE_LIBVPX_DEC_TEST" ; then
+            AC_MSG_CHECKING([for libvpx version >= v0.9.7])
+            dnl We need at least v0.9.7 to fix several crash bugs (for which we
+            dnl had local patches prior to v0.9.7).
+            dnl
+            dnl This is a terrible test for the library version, but we don't
+            dnl have a good one. There is no version number in a public header,
+            dnl and testing the headers still doesn't guarantee we link against
+            dnl the right version. While we could call vpx_codec_version() at
+            dnl run-time, that would break cross-compiling. There are no
+            dnl additional exported symbols between the v0.9.7 release and the
+            dnl v0.9.6 one to check for.
+            AC_TRY_COMPILE([
+                #include <vpx/vpx_decoder.h>
+                #if !defined(VPX_CODEC_USE_INPUT_PARTITION)
+                    #error "test failed."
+                #endif
+                ],
+                [return 0;],
+                [AC_MSG_RESULT([yes])
+                 MOZ_NATIVE_LIBVPX=1
+                 MOZ_LIBVPX_INCLUDES="-I${LIBVPX_DIR}/include"
+                 MOZ_LIBVPX_LIBS="-L${LIBVPX_DIR}/lib -lvpx"],
+                [AC_MSG_RESULT([no])
+                 AC_MSG_ERROR([--with-system-libvpx requested but it is not v0.9.7 or later])])
+        fi
         CFLAGS=$_SAVE_CFLAGS
         LDFLAGS=$_SAVE_LDFLAGS
         LIBS=$_SAVE_LIBS
     fi
 fi
 
 AC_SUBST(MOZ_NATIVE_LIBVPX)
 AC_SUBST(MOZ_LIBVPX_INCLUDES)
--- a/media/libvpx/Makefile.in
+++ b/media/libvpx/Makefile.in
@@ -125,32 +125,32 @@ EXPORTS_vpx = \
   $(NULL)
 
 CSRCS += \
   vpx_config_c.c \
   systemdependent.c \
   alloccommon.c \
   blockd.c \
   debugmodes.c \
+  defaultcoefcounts.c \
   dsystemdependent.c \
   entropy.c \
   entropymode.c \
   entropymv.c \
   extend.c \
-  filter_c.c \
+  filter.c \
   findnearmv.c \
   idctllm.c \
   invtrans.c \
   loopfilter.c \
   loopfilter_filters.c \
   mbpitch.c \
   modecont.c \
   modecontext.c \
   postproc.c \
-  predictdc.c \
   quant_common.c \
   recon.c \
   reconinter.c \
   reconintra.c \
   reconintra4x4.c \
   setupintrarecon.c \
   swapyv12buffer.c \
   textblit.c \
@@ -168,29 +168,30 @@ CSRCS += \
   vpx_codec.c \
   vpx_decoder.c \
   vpx_decoder_compat.c \
   vpx_encoder.c \
   vpx_image.c \
   vpx_mem.c \
   gen_scalers.c \
   vpxscale.c \
-  scalesystemdependant.c \
+  scalesystemdependent.c \
   yv12config.c \
   yv12extend.c \
   $(NULL)
 
 ifdef VPX_X86_ASM
 # Building on an x86 platform with a supported assembler, include
 # the optimized assembly in the build.
 
 CSRCS += \
   idct_blk_mmx.c \
   idct_blk_sse2.c \
   loopfilter_x86.c \
+  recon_wrapper_sse2.c \
   vp8_asm_stubs.c \
   x86_systemdependent.c \
   x86_dsystemdependent.c \
   $(NULL)
 
 ASFILES += \
   idctllm_mmx.asm \
   idctllm_sse2.asm \
@@ -279,47 +280,35 @@ VPX_ASFILES = \
   idct_dequant_dc_full_2x_neon.asm \
   idct_dequant_dc_0_2x_neon.asm \
   dequant_idct_neon.asm \
   idct_dequant_full_2x_neon.asm \
   idct_dequant_0_2x_neon.asm \
   dequantizeb_neon.asm \
   $(NULL)
 
-# The ARM asm needs to extract the offsets of various C struct members.
-# We need a program that runs on the host to pull them out of a .o file.
-HOST_CSRCS = obj_int_extract.c
-HOST_PROGRAM = host_obj_int_extract$(HOST_BIN_SUFFIX)
-
 ifdef VPX_AS_CONVERSION
 # The ARM asm is written in ARM RVCT syntax, but we actually build it with
 # gas using GNU syntax. Add some rules to perform the conversion.
 VPX_CONVERTED_ASFILES = $(addsuffix .$(ASM_SUFFIX), $(VPX_ASFILES))
 
 ASFILES += $(VPX_CONVERTED_ASFILES)
 GARBAGE += $(VPX_CONVERTED_ASFILES)
 
 %.asm.$(ASM_SUFFIX): %.asm
 	$(VPX_AS_CONVERSION) < $< > $@
 else
 ASFILES += $(VPX_ASFILES)
 endif
 
-GARBAGE += vpx_asm_offsets.$(OBJ_SUFFIX) vpx_asm_offsets.asm
-
 endif
 
 include $(topsrcdir)/config/rules.mk
 
 # Workaround a bug of Sun Studio (CR 6963410)
 ifdef SOLARIS_SUNPRO_CC
 ifeq (86,$(findstring 86,$(OS_TEST)))
 filter_c.o: filter_c.c Makefile.in
 	$(REPORT_BUILD)
 	@$(MAKE_DEPS_AUTO_CC)
 	$(CC) -o $@ -c $(patsubst -xO[45],-xO3,$(COMPILE_CFLAGS)) $<
 endif
 endif
-
-ifdef VPX_ARM_ASM
-vpx_asm_offsets.asm: vpx_asm_offsets.$(OBJ_SUFFIX) $(HOST_PROGRAM)
-	./$(HOST_PROGRAM) rvds $< $(if $(VPX_AS_CONVERSION),| $(VPX_AS_CONVERSION)) > $@
-endif
old mode 100644
new mode 100755
--- a/media/libvpx/build/make/ads2gas.pl
+++ b/media/libvpx/build/make/ads2gas.pl
@@ -16,18 +16,24 @@
 # Convert ARM Developer Suite 1.0.1 syntax assembly source to GNU as format
 #
 # Usage: cat inputfile | perl ads2gas.pl > outputfile
 #
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas.pl script.\n";
 print "\t.equ DO1STROUNDING, 0\n";
 
+# Stack of procedure names.
+@proc_stack = ();
+
 while (<STDIN>)
 {
+    # Load and store alignment
+    s/@/,:/g;
+
     # Comment character
     s/;/@/g;
 
     # Hexadecimal constants prefaced by 0x
     s/#&/#0x/g;
 
     # Convert :OR: to |
     s/:OR:/ | /g;
@@ -112,32 +118,46 @@ while (<STDIN>)
     # underscore
     s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
 
     # Labels need trailing colon
 #   s/^(\w+)/$1:/ if !/EQU/;
     # put the colon at the end of the line in the macro
     s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
 
-    # Strip ALIGN
-    s/\sALIGN/@ ALIGN/g;
+    # ALIGN directive
+    s/ALIGN/.balign/g;
 
     # Strip ARM
     s/\sARM/@ ARM/g;
 
     # Strip REQUIRE8
     #s/\sREQUIRE8/@ REQUIRE8/g;
     s/\sREQUIRE8/@ /g;      #EQU cause problem
 
     # Strip PRESERVE8
     s/\sPRESERVE8/@ PRESERVE8/g;
 
-    # Strip PROC and ENDPROC
-    s/\sPROC/@/g;
-    s/\sENDP/@/g;
+    # Use PROC and ENDP to give the symbols a .size directive.
+    # This makes them show up properly in debugging tools like gdb and valgrind.
+    if (/\bPROC\b/)
+    {
+        my $proc;
+        /^_([\.0-9A-Z_a-z]\w+)\b/;
+        $proc = $1;
+        push(@proc_stack, $proc) if ($proc);
+        s/\bPROC\b/@ $&/;
+    }
+    if (/\bENDP\b/)
+    {
+        my $proc;
+        s/\bENDP\b/@ $&/;
+        $proc = pop(@proc_stack);
+        $_ = "\t.size $proc, .-$proc".$_ if ($proc);
+    }
 
     # EQU directive
     s/(.*)EQU(.*)/.equ $1, $2/;
 
     # Begin macro definition
     if (/MACRO/) {
         $_ = <STDIN>;
         s/^/.macro/;
@@ -146,8 +166,11 @@ while (<STDIN>)
     }
 
     # For macros, use \ to reference formal params
     s/\$/\\/g;                  # End macro definition
     s/MEND/.endm/;              # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
     print;
 }
+
+# Mark that this object doesn't need an executable stack.
+printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");
--- a/media/libvpx/build/make/obj_int_extract.c
+++ b/media/libvpx/build/make/obj_int_extract.c
@@ -4,35 +4,23 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "vpx_config.h"
-
-#if defined(_MSC_VER)
-#include <io.h>
-#include <share.h>
 #include "vpx/vpx_integer.h"
-#else
-#include <stdint.h>
-#include <unistd.h>
-#endif
-
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdarg.h>
 
 typedef enum
 {
     OUTPUT_FMT_PLAIN,
     OUTPUT_FMT_RVDS,
     OUTPUT_FMT_GAS,
 } output_fmt_t;
 
@@ -42,188 +30,194 @@ int log_msg(const char *fmt, ...)
     va_list ap;
     va_start(ap, fmt);
     res = vfprintf(stderr, fmt, ap);
     va_end(ap);
     return res;
 }
 
 #if defined(__GNUC__) && __GNUC__
-
 #if defined(__MACH__)
 
 #include <mach-o/loader.h>
 #include <mach-o/nlist.h>
 
 int parse_macho(uint8_t *base_buf, size_t sz)
 {
     int i, j;
     struct mach_header header;
     uint8_t *buf = base_buf;
     int base_data_section = 0;
+    int bits = 0;
 
+    /* We can read in mach_header for 32 and 64 bit architectures
+     * because it's identical to mach_header_64 except for the last
+     * element (uint32_t reserved), which we don't use. Then, when
+     * we know which architecture we're looking at, increment buf
+     * appropriately.
+     */
     memcpy(&header, buf, sizeof(struct mach_header));
-    buf += sizeof(struct mach_header);
 
-    if (header.magic != MH_MAGIC)
+    if (header.magic == MH_MAGIC)
     {
-        log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n",
-                header.magic, MH_MAGIC);
-        goto bail;
+        if (header.cputype == CPU_TYPE_ARM
+            || header.cputype == CPU_TYPE_X86)
+        {
+            bits = 32;
+            buf += sizeof(struct mach_header);
+        }
+        else
+        {
+            log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
+            goto bail;
+        }
     }
-
-    if (header.cputype != CPU_TYPE_ARM)
+    else if (header.magic == MH_MAGIC_64)
     {
-        log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n");
+        if (header.cputype == CPU_TYPE_X86_64)
+        {
+            bits = 64;
+            buf += sizeof(struct mach_header_64);
+        }
+        else
+        {
+            log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
+            goto bail;
+        }
+    }
+    else
+    {
+        log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
+                MH_MAGIC, MH_MAGIC_64, header.magic);
         goto bail;
     }
 
     if (header.filetype != MH_OBJECT)
     {
         log_msg("Bad filetype for object file. Currently only tested for MH_OBJECT.\n");
         goto bail;
     }
 
     for (i = 0; i < header.ncmds; i++)
     {
         struct load_command lc;
-        struct symtab_command sc;
-        struct segment_command seg_c;
 
         memcpy(&lc, buf, sizeof(struct load_command));
 
         if (lc.cmd == LC_SEGMENT)
         {
             uint8_t *seg_buf = buf;
             struct section s;
+            struct segment_command seg_c;
 
-            memcpy(&seg_c, buf, sizeof(struct segment_command));
-
+            memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
             seg_buf += sizeof(struct segment_command);
 
-            for (j = 0; j < seg_c.nsects; j++)
+            /* Although each section is given it's own offset, nlist.n_value
+             * references the offset of the first section. This isn't
+             * apparent without debug information because the offset of the
+             * data section is the same as the first section. However, with
+             * debug sections mixed in, the offset of the debug section
+             * increases but n_value still references the first section.
+             */
+            if (seg_c.nsects < 1)
             {
-                memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section));
+                log_msg("Not enough sections\n");
+                goto bail;
+            }
 
-                // Need to get this offset which is the start of the symbol table
-                // before matching the strings up with symbols.
-                base_data_section = s.offset;
+            memcpy(&s, seg_buf, sizeof(struct section));
+            base_data_section = s.offset;
+        }
+        else if (lc.cmd == LC_SEGMENT_64)
+        {
+            uint8_t *seg_buf = buf;
+            struct section_64 s;
+            struct segment_command_64 seg_c;
+
+            memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
+            seg_buf += sizeof(struct segment_command_64);
+
+            /* Explanation in LG_SEGMENT */
+            if (seg_c.nsects < 1)
+            {
+                log_msg("Not enough sections\n");
+                goto bail;
             }
+
+            memcpy(&s, seg_buf, sizeof(struct section_64));
+            base_data_section = s.offset;
         }
         else if (lc.cmd == LC_SYMTAB)
         {
-            uint8_t *sym_buf = base_buf;
-            uint8_t *str_buf = base_buf;
-
             if (base_data_section != 0)
             {
+                struct symtab_command sc;
+                uint8_t *sym_buf = base_buf;
+                uint8_t *str_buf = base_buf;
+
                 memcpy(&sc, buf, sizeof(struct symtab_command));
 
                 if (sc.cmdsize != sizeof(struct symtab_command))
+                {
                     log_msg("Can't find symbol table!\n");
+                    goto bail;
+                }
 
                 sym_buf += sc.symoff;
                 str_buf += sc.stroff;
 
                 for (j = 0; j < sc.nsyms; j++)
                 {
-                    struct nlist nl;
-                    int val;
+                    /* Location of string is cacluated each time from the
+                     * start of the string buffer.  On darwin the symbols
+                     * are prefixed by "_", so we bump the pointer by 1.
+                     * The target value is defined as an int in asm_*_offsets.c,
+                     * which is 4 bytes on all targets we currently use.
+                     */
+                    if (bits == 32)
+                    {
+                        struct nlist nl;
+                        int val;
 
-                    memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist));
-
-                    val = *((int *)(base_buf + base_data_section + nl.n_value));
+                        memcpy(&nl, sym_buf, sizeof(struct nlist));
+                        sym_buf += sizeof(struct nlist);
 
-                    // Location of string is cacluated each time from the
-                    // start of the string buffer.  On darwin the symbols
-                    // are prefixed by "_".  On other platforms it is not
-                    // so it needs to be removed.  That is the reason for
-                    // the +1.
-                    printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val);
+                        memcpy(&val, base_buf + base_data_section + nl.n_value,
+                               sizeof(val));
+                        printf("%-40s EQU %5d\n",
+                               str_buf + nl.n_un.n_strx + 1, val);
+                    }
+                    else /* if (bits == 64) */
+                    {
+                        struct nlist_64 nl;
+                        int val;
+
+                        memcpy(&nl, sym_buf, sizeof(struct nlist_64));
+                        sym_buf += sizeof(struct nlist_64);
+
+                        memcpy(&val, base_buf + base_data_section + nl.n_value,
+                               sizeof(val));
+                        printf("%-40s EQU %5d\n",
+                               str_buf + nl.n_un.n_strx + 1, val);
+                    }
                 }
             }
         }
 
         buf += lc.cmdsize;
     }
 
     return 0;
 bail:
     return 1;
 
 }
 
-int main(int argc, char **argv)
-{
-    int fd;
-    char *f;
-    struct stat stat_buf;
-    uint8_t *file_buf;
-    int res;
-
-    if (argc < 2 || argc > 3)
-    {
-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tMachO format object file to parse\n");
-        fprintf(stderr, "Output Formats:\n");
-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");
-        fprintf(stderr, "  rvds - compatible with armasm\n");
-        goto bail;
-    }
-
-    f = argv[2];
-
-    if (!((!strcmp(argv[1], "rvds")) || (!strcmp(argv[1], "gas"))))
-        f = argv[1];
-
-    fd = open(f, O_RDONLY);
-
-    if (fd < 0)
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
-
-    if (fstat(fd, &stat_buf))
-    {
-        perror("stat");
-        goto bail;
-    }
-
-    file_buf = malloc(stat_buf.st_size);
-
-    if (!file_buf)
-    {
-        perror("malloc");
-        goto bail;
-    }
-
-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
-    {
-        perror("read");
-        goto bail;
-    }
-
-    if (close(fd))
-    {
-        perror("close");
-        goto bail;
-    }
-
-    res = parse_macho(file_buf, stat_buf.st_size);
-    free(file_buf);
-
-    if (!res)
-        return EXIT_SUCCESS;
-
-bail:
-    return EXIT_FAILURE;
-}
-
-#else
+#elif defined(__ELF__)
 #include "elf.h"
 
 #define COPY_STRUCT(dst, buf, ofst, sz) do {\
         if(ofst + sizeof((*(dst))) > sz) goto bail;\
         memcpy(dst, buf+ofst, sizeof((*(dst))));\
     } while(0)
 
 #define ENDIAN_ASSIGN(val, memb) do {\
@@ -232,342 +226,485 @@ bail:
     } while(0)
 
 #define ENDIAN_ASSIGN_IN_PLACE(memb) do {\
         ENDIAN_ASSIGN(memb, memb);\
     } while(0)
 
 typedef struct
 {
-    uint8_t     *buf; /* Buffer containing ELF data */
-    size_t       sz;  /* Buffer size */
-    int          le_data;   /* Data is little-endian */
-    Elf32_Ehdr   hdr;
+    uint8_t      *buf; /* Buffer containing ELF data */
+    size_t        sz;  /* Buffer size */
+    int           le_data; /* Data is little-endian */
+    unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
+    int           bits; /* 32 or 64 */
+    Elf32_Ehdr    hdr32;
+    Elf64_Ehdr    hdr64;
 } elf_obj_t;
 
-int parse_elf32_header(elf_obj_t *elf)
+int parse_elf_header(elf_obj_t *elf)
 {
     int res;
-    /* Verify ELF32 header */
-    COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz);
-    res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0;
-    res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1;
-    res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2;
-    res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3;
-    res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32;
-    res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB
-           || elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB;
+    /* Verify ELF Magic numbers */
+    COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
+    res = elf->e_ident[EI_MAG0] == ELFMAG0;
+    res &= elf->e_ident[EI_MAG1] == ELFMAG1;
+    res &= elf->e_ident[EI_MAG2] == ELFMAG2;
+    res &= elf->e_ident[EI_MAG3] == ELFMAG3;
+    res &= elf->e_ident[EI_CLASS] == ELFCLASS32
+        || elf->e_ident[EI_CLASS] == ELFCLASS64;
+    res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
 
     if (!res) goto bail;
 
-    elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB;
+    elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
+
+    /* Read in relevant values */
+    if (elf->e_ident[EI_CLASS] == ELFCLASS32)
+    {
+        elf->bits = 32;
+        COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
+
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
+    }
+    else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
+    {
+        elf->bits = 64;
+        COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
+
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
+    }
 
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx);
+    return 0;
+bail:
+    log_msg("Failed to parse ELF file header");
+    return 1;
+}
+
+int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64)
+{
+    if (hdr32)
+    {
+        if (idx >= elf->hdr32.e_shnum)
+            goto bail;
+
+        COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
+                    elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
+    }
+    else /* if (hdr64) */
+    {
+        if (idx >= elf->hdr64.e_shnum)
+            goto bail;
+
+        COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
+                    elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
+    }
+
     return 0;
 bail:
     return 1;
 }
 
-int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr)
+char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx)
 {
-    if (idx >= elf->hdr.e_shnum)
-        goto bail;
+    if (elf->bits == 32)
+    {
+        Elf32_Shdr shdr;
+
+        if (parse_elf_section(elf, s_idx, &shdr, NULL))
+        {
+            log_msg("Failed to parse ELF string table: section %d, index %d\n",
+                    s_idx, idx);
+            return "";
+        }
+
+        return (char *)(elf->buf + shdr.sh_offset + idx);
+    }
+    else /* if (elf->bits == 64) */
+    {
+        Elf64_Shdr shdr;
+
+        if (parse_elf_section(elf, s_idx, NULL, &shdr))
+        {
+            log_msg("Failed to parse ELF string table: section %d, index %d\n",
+                    s_idx, idx);
+            return "";
+        }
 
-    COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize,
-                elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize);
+        return (char *)(elf->buf + shdr.sh_offset + idx);
+    }
+}
+
+int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64)
+{
+    if (sym32)
+    {
+        COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
+    }
+    else /* if (sym64) */
+    {
+        COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
+    }
     return 0;
 bail:
     return 1;
 }
 
-char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx)
-{
-    Elf32_Shdr shdr;
-
-    if (parse_elf32_section(elf, s_idx, &shdr))
-    {
-        log_msg("Failed to parse ELF string table: section %d, index %d\n",
-                s_idx, idx);
-        return "";
-    }
-
-    return (char *)(elf->buf + shdr.sh_offset + idx);
-}
-
-int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym)
+int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode)
 {
-    COPY_STRUCT(sym, elf->buf, ofst, elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_name);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_value);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_size);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_info);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_other);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx);
-    return 0;
-bail:
-    return 1;
-}
-
-int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
-{
-    elf_obj_t  elf;
-    Elf32_Shdr shdr;
+    elf_obj_t    elf;
     unsigned int ofst;
-    int         i;
-    Elf32_Off strtab_off;   /* save String Table offset for later use */
+    int          i;
+    Elf32_Off    strtab_off32;
+    Elf64_Off    strtab_off64; /* save String Table offset for later use */
 
     memset(&elf, 0, sizeof(elf));
     elf.buf = buf;
     elf.sz = sz;
 
     /* Parse Header */
-    if (parse_elf32_header(&elf))
+    if (parse_elf_header(&elf))
+      goto bail;
+
+    if (elf.bits == 32)
     {
-        log_msg("Parse error: File does not appear to be valid ELF32\n");
-        return 1;
-    }
+        Elf32_Shdr shdr;
+        for (i = 0; i < elf.hdr32.e_shnum; i++)
+        {
+            parse_elf_section(&elf, i, &shdr, NULL);
 
-    for (i = 0; i < elf.hdr.e_shnum; i++)
-    {
-        parse_elf32_section(&elf, i, &shdr);
+            if (shdr.sh_type == SHT_STRTAB)
+            {
+                char strtsb_name[128];
+
+                strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
 
-        if (shdr.sh_type == SHT_STRTAB)
+                if (!(strcmp(strtsb_name, ".shstrtab")))
+                {
+                    /* log_msg("found section: %s\n", strtsb_name); */
+                    strtab_off32 = shdr.sh_offset;
+                    break;
+                }
+            }
+        }
+    }
+    else /* if (elf.bits == 64) */
+    {
+        Elf64_Shdr shdr;
+        for (i = 0; i < elf.hdr64.e_shnum; i++)
         {
-            char strtsb_name[128];
-
-            strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+            parse_elf_section(&elf, i, NULL, &shdr);
 
-            if (!(strcmp(strtsb_name, ".shstrtab")))
+            if (shdr.sh_type == SHT_STRTAB)
             {
-                log_msg("found section: %s\n", strtsb_name);
-                strtab_off = shdr.sh_offset;
-                break;
+                char strtsb_name[128];
+
+                strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+
+                if (!(strcmp(strtsb_name, ".shstrtab")))
+                {
+                    /* log_msg("found section: %s\n", strtsb_name); */
+                    strtab_off64 = shdr.sh_offset;
+                    break;
+                }
             }
         }
     }
 
     /* Parse all Symbol Tables */
-    for (i = 0; i < elf.hdr.e_shnum; i++)
+    if (elf.bits == 32)
     {
-
-        parse_elf32_section(&elf, i, &shdr);
-
-        if (shdr.sh_type == SHT_SYMTAB)
+        Elf32_Shdr shdr;
+        for (i = 0; i < elf.hdr32.e_shnum; i++)
         {
-            for (ofst = shdr.sh_offset;
-                 ofst < shdr.sh_offset + shdr.sh_size;
-                 ofst += shdr.sh_entsize)
+            parse_elf_section(&elf, i, &shdr, NULL);
+
+            if (shdr.sh_type == SHT_SYMTAB)
             {
-                Elf32_Sym sym;
+                for (ofst = shdr.sh_offset;
+                     ofst < shdr.sh_offset + shdr.sh_size;
+                     ofst += shdr.sh_entsize)
+                {
+                    Elf32_Sym sym;
+
+                    parse_elf_symbol(&elf, ofst, &sym, NULL);
 
-                parse_elf32_symbol(&elf, ofst, &sym);
+                    /* For all OBJECTS (data objects), extract the value from the
+                     * proper data segment.
+                     */
+                    /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+                        log_msg("found data object %s\n",
+                                parse_elf_string_table(&elf,
+                                                       shdr.sh_link,
+                                                       sym.st_name));
+                     */
 
-                /* For all OBJECTS (data objects), extract the value from the
-                 * proper data segment.
-                 */
-                if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
-                    log_msg("found data object %s\n",
-                            parse_elf32_string_table(&elf,
-                                                     shdr.sh_link,
-                                                     sym.st_name));
+                    if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
+                        && sym.st_size == 4)
+                    {
+                        Elf32_Shdr dhdr;
+                        int val = 0;
+                        char section_name[128];
+
+                        parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
+
+                        /* For explanition - refer to _MSC_VER version of code */
+                        strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
+                        /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
 
-                if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
-                    && sym.st_size == 4)
-                {
-                    Elf32_Shdr dhdr;
-                    int32_t      val;
-                    char section_name[128];
+                        if (strcmp(section_name, ".bss"))
+                        {
+                            if (sizeof(val) != sym.st_size)
+                            {
+                                /* The target value is declared as an int in
+                                 * asm_*_offsets.c, which is 4 bytes on all
+                                 * targets we currently use. Complain loudly if
+                                 * this is not true.
+                                 */
+                                log_msg("Symbol size is wrong\n");
+                                goto bail;
+                            }
 
-                    parse_elf32_section(&elf, sym.st_shndx, &dhdr);
+                            memcpy(&val,
+                                   elf.buf + dhdr.sh_offset + sym.st_value,
+                                   sym.st_size);
+                        }
 
-                    /* For explanition - refer to _MSC_VER version of code */
-                    strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name));
-                    log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type);
+                        if (!elf.le_data)
+                        {
+                            log_msg("Big Endian data not supported yet!\n");
+                            goto bail;
+                        }
 
-                    if (!(strcmp(section_name, ".bss")))
-                    {
-                        val = 0;
+                        switch (mode)
+                        {
+                            case OUTPUT_FMT_RVDS:
+                                printf("%-40s EQU %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            case OUTPUT_FMT_GAS:
+                                printf(".equ %-40s, %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            default:
+                                printf("%s = %d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                        }
                     }
-                    else
-                    {
-                        memcpy(&val,
-                               elf.buf + dhdr.sh_offset + sym.st_value,
-                               sizeof(val));
-                    }
+                }
+            }
+        }
+    }
+    else /* if (elf.bits == 64) */
+    {
+        Elf64_Shdr shdr;
+        for (i = 0; i < elf.hdr64.e_shnum; i++)
+        {
+            parse_elf_section(&elf, i, NULL, &shdr);
 
-                    if (!elf.le_data)
-                    {
-                        log_msg("Big Endian data not supported yet!\n");
-                        goto bail;
-                    }\
+            if (shdr.sh_type == SHT_SYMTAB)
+            {
+                for (ofst = shdr.sh_offset;
+                     ofst < shdr.sh_offset + shdr.sh_size;
+                     ofst += shdr.sh_entsize)
+                {
+                    Elf64_Sym sym;
+
+                    parse_elf_symbol(&elf, ofst, NULL, &sym);
 
-                    switch (mode)
+                    /* For all OBJECTS (data objects), extract the value from the
+                     * proper data segment.
+                     */
+                    /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+                        log_msg("found data object %s\n",
+                                parse_elf_string_table(&elf,
+                                                       shdr.sh_link,
+                                                       sym.st_name));
+                     */
+
+                    if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
+                        && sym.st_size == 4)
                     {
-                    case OUTPUT_FMT_RVDS:
-                        printf("%-40s EQU %5d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
-                        break;
-                    case OUTPUT_FMT_GAS:
-                        printf(".equ %-40s, %5d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
-                        break;
-                    default:
-                        printf("%s = %d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
+                        Elf64_Shdr dhdr;
+                        int val = 0;
+                        char section_name[128];
+
+                        parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
+
+                        /* For explanition - refer to _MSC_VER version of code */
+                        strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
+                        /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+                        if ((strcmp(section_name, ".bss")))
+                        {
+                            if (sizeof(val) != sym.st_size)
+                            {
+                                /* The target value is declared as an int in
+                                 * asm_*_offsets.c, which is 4 bytes on all
+                                 * targets we currently use. Complain loudly if
+                                 * this is not true.
+                                 */
+                                log_msg("Symbol size is wrong\n");
+                                goto bail;
+                            }
+
+                            memcpy(&val,
+                                   elf.buf + dhdr.sh_offset + sym.st_value,
+                                   sym.st_size);
+                        }
+
+                        if (!elf.le_data)
+                        {
+                            log_msg("Big Endian data not supported yet!\n");
+                            goto bail;
+                        }
+
+                        switch (mode)
+                        {
+                            case OUTPUT_FMT_RVDS:
+                                printf("%-40s EQU %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            case OUTPUT_FMT_GAS:
+                                printf(".equ %-40s, %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            default:
+                                printf("%s = %d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                        }
                     }
                 }
             }
         }
     }
 
     if (mode == OUTPUT_FMT_RVDS)
         printf("    END\n");
 
     return 0;
 bail:
-    log_msg("Parse error: File does not appear to be valid ELF32\n");
+    log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
     return 1;
 }
 
-int main(int argc, char **argv)
-{
-    int fd;
-    output_fmt_t mode;
-    char *f;
-    struct stat stat_buf;
-    uint8_t *file_buf;
-    int res;
-
-    if (argc < 2 || argc > 3)
-    {
-        fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");
-        fprintf(stderr, "Output Formats:\n");
-        fprintf(stderr, "  gas  - compatible with GNU assembler\n");
-        fprintf(stderr, "  rvds - compatible with armasm\n");
-        goto bail;
-    }
-
-    f = argv[2];
-
-    if (!strcmp(argv[1], "rvds"))
-        mode = OUTPUT_FMT_RVDS;
-    else if (!strcmp(argv[1], "gas"))
-        mode = OUTPUT_FMT_GAS;
-    else
-        f = argv[1];
+#endif
+#endif /* defined(__GNUC__) && __GNUC__ */
 
 
-    fd = open(f, O_RDONLY);
-
-    if (fd < 0)
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
-
-    if (fstat(fd, &stat_buf))
-    {
-        perror("stat");
-        goto bail;
-    }
-
-    file_buf = malloc(stat_buf.st_size);
-
-    if (!file_buf)
-    {
-        perror("malloc");
-        goto bail;
-    }
-
-    if (read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
-    {
-        perror("read");
-        goto bail;
-    }
-
-    if (close(fd))
-    {
-        perror("close");
-        goto bail;
-    }
-
-    res = parse_elf32(file_buf, stat_buf.st_size, mode);
-    //res = parse_coff(file_buf, stat_buf.st_size);
-    free(file_buf);
-
-    if (!res)
-        return EXIT_SUCCESS;
-
-bail:
-    return EXIT_FAILURE;
-}
-#endif
-#endif
-
-
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
 /*  See "Microsoft Portable Executable and Common Object File Format Specification"
     for reference.
 */
 #define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )
 #define get_le16(x) ((*(x)) | (*(x+1)) << 8)
 
-int parse_coff(unsigned __int8 *buf, size_t sz)
+int parse_coff(uint8_t *buf, size_t sz)
 {
     unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;
     unsigned int sectionrawdata_ptr;
     unsigned int i;
-    unsigned __int8 *ptr;
-    unsigned __int32 symoffset;
-    FILE *fp;
+    uint8_t *ptr;
+    uint32_t symoffset;
 
     char **sectionlist;  //this array holds all section names in their correct order.
     //it is used to check if the symbol is in .bss or .data section.
 
     nsections = get_le16(buf + 2);
     symtab_ptr = get_le32(buf + 8);
     symtab_sz = get_le32(buf + 12);
     strtab_ptr = symtab_ptr + symtab_sz * 18;
 
     if (nsections > 96)
-        goto bail;
+    {
+        log_msg("Too many sections\n");
+        return 1;
+    }
 
-    sectionlist = malloc(nsections * sizeof * sectionlist);
+    sectionlist = malloc(nsections * sizeof(sectionlist));
+
+    if (sectionlist == NULL)
+    {
+        log_msg("Allocating first level of section list failed\n");
+        return 1;
+    }
 
     //log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
 
     /*
     The size of optional header is always zero for an obj file. So, the section header
     follows the file header immediately.
     */
 
@@ -575,34 +712,32 @@ int parse_coff(unsigned __int8 *buf, siz
 
     for (i = 0; i < nsections; i++)
     {
         char sectionname[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
         strncpy(sectionname, ptr, 8);
         //log_msg("COFF: Parsing section %s\n",sectionname);
 
         sectionlist[i] = malloc(strlen(sectionname) + 1);
+
+        if (sectionlist[i] == NULL)
+        {
+            log_msg("Allocating storage for %s failed\n", sectionname);
+            goto bail;
+        }
         strcpy(sectionlist[i], sectionname);
 
         if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20);
 
         ptr += 40;
     }
 
     //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
     //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
 
-    fp = fopen("vpx_asm_offsets.asm", "w");
-
-    if (fp == NULL)
-    {
-        perror("open file");
-        goto bail;
-    }
-
     /*  The compiler puts the data with non-zero offset in .data section, but puts the data with
         zero offset in .bss section. So, if the data in in .bss section, set offset=0.
         Note from Wiki: In an object module compiled from C, the bss section contains
         the local variables (but not functions) that were declared with the static keyword,
         except for those with non-zero initial values. (In C, static variables are initialized
         to zero by default.) It also contains the non-local (both extern and static) variables
         that are also initialized to zero (either explicitly or by default).
         */
@@ -615,58 +750,67 @@ int parse_coff(unsigned __int8 *buf, siz
         14          Type
         16          StorageClass
         17          NumberOfAuxSymbols
         */
     ptr = buf + symtab_ptr;
 
     for (i = 0; i < symtab_sz; i++)
     {
-        __int16 section = get_le16(ptr + 12); //section number
+        int16_t section = get_le16(ptr + 12); //section number
 
         if (section > 0 && ptr[16] == 2)
         {
             //if(section > 0 && ptr[16] == 3 && get_le32(ptr+8)) {
 
             if (get_le32(ptr))
             {
                 char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
                 strncpy(name, ptr, 8);
                 //log_msg("COFF: Parsing symbol %s\n",name);
-                fprintf(fp, "%-40s EQU ", name);
+                /* The 64bit Windows compiler doesn't prefix with an _.
+                 * Check what's there, and bump if necessary
+                 */
+                if (name[0] == '_')
+                    printf("%-40s EQU ", name + 1);
+                else
+                    printf("%-40s EQU ", name);
             }
             else
             {
                 //log_msg("COFF: Parsing symbol %s\n",
                 //        buf + strtab_ptr + get_le32(ptr+4));
-                fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
+                if ((buf + strtab_ptr + get_le32(ptr + 4))[0] == '_')
+                    printf("%-40s EQU ",
+                           buf + strtab_ptr + get_le32(ptr + 4) + 1);
+                else
+                    printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
             }
 
             if (!(strcmp(sectionlist[section-1], ".bss")))
             {
                 symoffset = 0;
             }
             else
             {
                 symoffset = get_le32(buf + sectionrawdata_ptr + get_le32(ptr + 8));
             }
 
             //log_msg("      Section: %d\n",section);
             //log_msg("      Class:   %d\n",ptr[16]);
             //log_msg("      Address: %u\n",get_le32(ptr+8));
             //log_msg("      Offset: %u\n", symoffset);
 
-            fprintf(fp, "%5d\n", symoffset);
+            printf("%5d\n", symoffset);
         }
 
         ptr += 18;
     }
 
-    fprintf(fp, "    END\n");
-    fclose(fp);
+    printf("    END\n");
 
     for (i = 0; i < nsections; i++)
     {
         free(sectionlist[i]);
     }
 
     free(sectionlist);
 
@@ -677,80 +821,94 @@ bail:
     {
         free(sectionlist[i]);
     }
 
     free(sectionlist);
 
     return 1;
 }
+#endif /* defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__) */
 
 int main(int argc, char **argv)
 {
-    int fd;
-    output_fmt_t mode;
+    output_fmt_t mode = OUTPUT_FMT_PLAIN;
     const char *f;
-    struct _stat stat_buf;
-    unsigned __int8 *file_buf;
+    uint8_t *file_buf;
     int res;
+    FILE *fp;
+    long int file_size;
 
     if (argc < 2 || argc > 3)
     {
         fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
-        fprintf(stderr, "  <obj file>\tELF format object file to parse\n");
+        fprintf(stderr, "  <obj file>\tobject file to parse\n");
         fprintf(stderr, "Output Formats:\n");
         fprintf(stderr, "  gas  - compatible with GNU assembler\n");
         fprintf(stderr, "  rvds - compatible with armasm\n");
         goto bail;
     }
 
     f = argv[2];
 
     if (!strcmp(argv[1], "rvds"))
         mode = OUTPUT_FMT_RVDS;
     else if (!strcmp(argv[1], "gas"))
         mode = OUTPUT_FMT_GAS;
     else
         f = argv[1];
 
-    if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE))
+    fp = fopen(f, "rb");
+
+    if (!fp)
     {
         perror("Unable to open file");
         goto bail;
     }
 
-    if (_fstat(fd, &stat_buf))
+    if (fseek(fp, 0, SEEK_END))
     {
         perror("stat");
         goto bail;
     }
 
-    file_buf = malloc(stat_buf.st_size);
+    file_size = ftell(fp);
+    file_buf = malloc(file_size);
 
     if (!file_buf)
     {
         perror("malloc");
         goto bail;
     }
 
-    if (_read(fd, file_buf, stat_buf.st_size) != stat_buf.st_size)
+    rewind(fp);
+
+    if (fread(file_buf, sizeof(char), file_size, fp) != file_size)
     {
         perror("read");
         goto bail;
     }
 
-    if (_close(fd))
+    if (fclose(fp))
     {
         perror("close");
         goto bail;
     }
 
-    res = parse_coff(file_buf, stat_buf.st_size);
+#if defined(__GNUC__) && __GNUC__
+#if defined(__MACH__)
+    res = parse_macho(file_buf, file_size);
+#elif defined(__ELF__)
+    res = parse_elf(file_buf, file_size, mode);
+#endif
+#endif
+#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
+    res = parse_coff(file_buf, file_size);
+#endif
 
     free(file_buf);
 
     if (!res)
         return EXIT_SUCCESS;
 
 bail:
     return EXIT_FAILURE;
 }
-#endif
new file mode 100644
--- /dev/null
+++ b/media/libvpx/compile_errors.patch
@@ -0,0 +1,39 @@
+diff --git a/media/libvpx/vpx/vp8.h b/media/libvpx/vpx/vp8.h
+--- a/media/libvpx/vpx/vp8.h
++++ b/media/libvpx/vpx/vp8.h
+@@ -41,33 +41,33 @@ enum vp8_com_control_id
+     VP8_SET_REFERENCE           = 1,    /**< pass in an external frame into decoder to be used as reference frame */
+     VP8_COPY_REFERENCE          = 2,    /**< get a copy of reference frame from the decoder */
+     VP8_SET_POSTPROC            = 3,    /**< set the decoder's post processing settings  */
+     VP8_SET_DBG_COLOR_REF_FRAME = 4,    /**< set the reference frames to color for each macroblock */
+     VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */
+     VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which blocks modes to color */
+     VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */
+     VP8_COMMON_CTRL_ID_MAX,
+-    VP8_DECODER_CTRL_ID_START   = 256,
++    VP8_DECODER_CTRL_ID_START   = 256
+ };
+ 
+ /*!\brief post process flags
+  *
+  * The set of macros define VP8 decoder post processing flags
+  */
+ enum vp8_postproc_level
+ {
+     VP8_NOFILTERING             = 0,
+     VP8_DEBLOCK                 = 1<<0,
+     VP8_DEMACROBLOCK            = 1<<1,
+     VP8_ADDNOISE                = 1<<2,
+     VP8_DEBUG_TXT_FRAME_INFO    = 1<<3, /**< print frame information */
+     VP8_DEBUG_TXT_MBLK_MODES    = 1<<4, /**< print macro block modes over each macro block */
+     VP8_DEBUG_TXT_DC_DIFF       = 1<<5, /**< print dc diff for each macro block */
+-    VP8_DEBUG_TXT_RATE_INFO     = 1<<6, /**< print video rate info (encoder only) */
++    VP8_DEBUG_TXT_RATE_INFO     = 1<<6  /**< print video rate info (encoder only) */
+ };
+ 
+ /*!\brief post process flags
+  *
+  * This define a structure that describe the post processing settings. For
+  * the best objective measure (using the PSNR metric) set post_proc_flag
+  * to VP8_DEBLOCK and deblocking_level to 1.
+  */
--- a/media/libvpx/solaris.patch
+++ b/media/libvpx/solaris.patch
@@ -11,79 +11,16 @@ diff --git a/media/libvpx/vp8/common/loo
 +#define __inline inline
 +#endif
 +
  typedef unsigned char uc;
  
  static __inline signed char vp8_signed_char_clamp(int t)
  {
      t = (t < -128 ? -128 : t);
-diff --git a/media/libvpx/vpx/internal/vpx_codec_internal.h b/media/libvpx/vpx/internal/vpx_codec_internal.h
---- a/media/libvpx/vpx/internal/vpx_codec_internal.h
-+++ b/media/libvpx/vpx/internal/vpx_codec_internal.h
-@@ -316,17 +316,17 @@ struct vpx_codec_iface
- 
- /*!\brief Callback function pointer / user data pair storage */
- typedef struct vpx_codec_priv_cb_pair
- {
-     union
-     {
-         vpx_codec_put_frame_cb_fn_t    put_frame;
-         vpx_codec_put_slice_cb_fn_t    put_slice;
--    };
-+    } fn;
-     void                            *user_priv;
- } vpx_codec_priv_cb_pair_t;
- 
- 
- /*!\brief Instance private storage
-  *
-  * This structure is allocated by the algorithm's init function. It can be
-  * extended in one of two ways. First, a second, algorithm specific structure
-diff --git a/media/libvpx/vpx/src/vpx_decoder.c b/media/libvpx/vpx/src/vpx_decoder.c
---- a/media/libvpx/vpx/src/vpx_decoder.c
-+++ b/media/libvpx/vpx/src/vpx_decoder.c
-@@ -165,17 +165,17 @@ vpx_codec_err_t vpx_codec_register_put_f
- 
-     if (!ctx || !cb)
-         res = VPX_CODEC_INVALID_PARAM;
-     else if (!ctx->iface || !ctx->priv
-              || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME))
-         res = VPX_CODEC_ERROR;
-     else
-     {
--        ctx->priv->dec.put_frame_cb.put_frame = cb;
-+        ctx->priv->dec.put_frame_cb.fn.put_frame = cb;
-         ctx->priv->dec.put_frame_cb.user_priv = user_priv;
-         res = VPX_CODEC_OK;
-     }
- 
-     return SAVE_STATUS(ctx, res);
- }
- 
- 
-@@ -187,17 +187,17 @@ vpx_codec_err_t vpx_codec_register_put_s
- 
-     if (!ctx || !cb)
-         res = VPX_CODEC_INVALID_PARAM;
-     else if (!ctx->iface || !ctx->priv
-              || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME))
-         res = VPX_CODEC_ERROR;
-     else
-     {
--        ctx->priv->dec.put_slice_cb.put_slice = cb;
-+        ctx->priv->dec.put_slice_cb.fn.put_slice = cb;
-         ctx->priv->dec.put_slice_cb.user_priv = user_priv;
-         res = VPX_CODEC_OK;
-     }
- 
-     return SAVE_STATUS(ctx, res);
- }
- 
- 
 diff --git a/media/libvpx/vpx_ports/mem.h b/media/libvpx/vpx_ports/mem.h
 --- a/media/libvpx/vpx_ports/mem.h
 +++ b/media/libvpx/vpx_ports/mem.h
 @@ -9,17 +9,17 @@
   */
  
  
  #ifndef VPX_PORTS_MEM_H
@@ -167,18 +104,18 @@ diff --git a/media/libvpx/vpx_ports/x86.
  #if defined(__GNUC__) && __GNUC__
  #define x86_pause_hint()\
      __asm__ __volatile__ ("pause \n\t")
 +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
 +#define x86_pause_hint()\
 +    asm volatile ("pause \n\t")
  #else
  #if ARCH_X86_64
- /* No pause intrinsic for windows x64 */
- #define x86_pause_hint()
+ #define x86_pause_hint()\
+     _mm_pause();
  #else
  #define x86_pause_hint()\
      __asm pause
  #endif
 @@ -144,16 +171,29 @@ x87_set_control_word(unsigned short mode
  }
  static unsigned short
  x87_get_control_word(void)
--- a/media/libvpx/update.sh
+++ b/media/libvpx/update.sh
@@ -46,122 +46,121 @@ if [ $# -lt 1 ]; then
   echo You can configure these from objdir/$target with the following command:
   echo $ ..configure --target=$target --disable-vp8-encoder --disable-examples --disable-install-docs
   echo On Mac, you also need --enable-pic
   exit -1
 fi
 
 # These are relative to SDK source dir.
 commonFiles=(
+  vp8/vp8_dx_iface.c
   vp8/common/alloccommon.c
   vp8/common/blockd.c
   vp8/common/debugmodes.c
+  vp8/common/defaultcoefcounts.c
   vp8/common/entropy.c
   vp8/common/entropymode.c
   vp8/common/entropymv.c
   vp8/common/extend.c
-  vp8/common/filter_c.c
+  vp8/common/filter.c
   vp8/common/findnearmv.c
-  vp8/common/generic/systemdependent.c
   vp8/common/idctllm.c
   vp8/common/invtrans.c
   vp8/common/loopfilter.c
   vp8/common/loopfilter_filters.c
   vp8/common/mbpitch.c
   vp8/common/modecont.c
   vp8/common/modecontext.c
   vp8/common/postproc.c
-  vp8/common/predictdc.c
   vp8/common/quant_common.c
   vp8/common/recon.c
   vp8/common/reconinter.c
   vp8/common/reconintra4x4.c
   vp8/common/reconintra.c
   vp8/common/setupintrarecon.c
   vp8/common/swapyv12buffer.c
   vp8/common/textblit.c
   vp8/common/treecoder.c
   vp8/common/arm/arm_systemdependent.c
   vp8/common/arm/bilinearfilter_arm.c
   vp8/common/arm/filter_arm.c
   vp8/common/arm/loopfilter_arm.c
   vp8/common/arm/reconintra_arm.c
-  vp8/common/arm/vpx_asm_offsets.c
   vp8/common/arm/neon/recon_neon.c
+  vp8/common/generic/systemdependent.c
   vp8/common/x86/loopfilter_x86.c
+  vp8/common/x86/recon_wrapper_sse2.c
   vp8/common/x86/vp8_asm_stubs.c
   vp8/common/x86/x86_systemdependent.c
   vp8/decoder/dboolhuff.c
   vp8/decoder/decodemv.c
   vp8/decoder/decodframe.c
   vp8/decoder/dequantize.c
   vp8/decoder/detokenize.c
-  vp8/decoder/reconintra_mt.c
-  vp8/decoder/generic/dsystemdependent.c
   vp8/decoder/idct_blk.c
   vp8/decoder/onyxd_if.c
+  vp8/decoder/reconintra_mt.c
   vp8/decoder/threading.c
   vp8/decoder/arm/arm_dsystemdependent.c
   vp8/decoder/arm/dequantize_arm.c
   vp8/decoder/arm/armv6/idct_blk_v6.c
   vp8/decoder/arm/neon/idct_blk_neon.c
+  vp8/decoder/generic/dsystemdependent.c
   vp8/decoder/x86/idct_blk_mmx.c
   vp8/decoder/x86/idct_blk_sse2.c
   vp8/decoder/x86/x86_dsystemdependent.c
-  vp8/vp8_dx_iface.c
   vpx/src/vpx_codec.c
   vpx/src/vpx_decoder.c
   vpx/src/vpx_decoder_compat.c
   vpx/src/vpx_encoder.c
   vpx/src/vpx_image.c
   vpx_mem/vpx_mem.c
   vpx_scale/generic/gen_scalers.c
-  vpx_scale/generic/scalesystemdependant.c
+  vpx_scale/generic/scalesystemdependent.c
   vpx_scale/generic/vpxscale.c
   vpx_scale/generic/yv12config.c
   vpx_scale/generic/yv12extend.c
   vp8/common/alloccommon.h
   vp8/common/blockd.h
   vp8/common/coefupdateprobs.h
   vp8/common/common.h
   vp8/common/common_types.h
   vp8/common/defaultcoefcounts.h
   vp8/common/entropy.h
   vp8/common/entropymode.h
   vp8/common/entropymv.h
   vp8/common/extend.h
+  vp8/common/filter.h
   vp8/common/findnearmv.h
   vp8/common/g_common.h
   vp8/common/header.h
   vp8/common/idct.h
   vp8/common/invtrans.h
   vp8/common/loopfilter.h
   vp8/common/modecont.h
   vp8/common/mv.h
   vp8/common/onyxc_int.h
   vp8/common/onyxd.h
   vp8/common/onyx.h
   vp8/common/postproc.h
   vp8/common/ppflags.h
   vp8/common/pragmas.h
-  vp8/common/predictdc.h
-  vp8/common/preproc.h
   vp8/common/quant_common.h
   vp8/common/recon.h
   vp8/common/reconinter.h
   vp8/common/reconintra4x4.h
   vp8/common/reconintra.h
   vp8/common/setupintrarecon.h
   vp8/common/subpixel.h
   vp8/common/swapyv12buffer.h
   vp8/common/systemdependent.h
   vp8/common/threading.h
   vp8/common/treecoder.h
   vp8/common/type_aliases.h
-  vp8/common/vpxerrors.h
+  vp8/common/arm/bilinearfilter_arm.h
   vp8/common/arm/idct_arm.h
   vp8/common/arm/loopfilter_arm.h
   vp8/common/arm/recon_arm.h
   vp8/common/arm/subpixel_arm.h
   vp8/common/x86/idct_x86.h
   vp8/common/x86/loopfilter_x86.h
   vp8/common/x86/postproc_x86.h
   vp8/common/x86/recon_x86.h
@@ -169,19 +168,17 @@ commonFiles=(
   vp8/decoder/dboolhuff.h
   vp8/decoder/decodemv.h
   vp8/decoder/decoderthreading.h
   vp8/decoder/dequantize.h
   vp8/decoder/detokenize.h
   vp8/decoder/onyxd_int.h
   vp8/decoder/reconintra_mt.h
   vp8/decoder/treereader.h
-  vp8/decoder/arm/dboolhuff_arm.h
   vp8/decoder/arm/dequantize_arm.h
-  vp8/decoder/arm/detokenize_arm.h
   vp8/decoder/x86/dequantize_x86.h
   vpx/internal/vpx_codec_internal.h
   vpx/vp8cx.h
   vpx/vp8dx.h
   vpx/vp8e.h
   vpx/vp8.h
   vpx/vpx_codec.h
   vpx/vpx_codec_impl_bottom.h
@@ -235,17 +232,16 @@ commonFiles=(
   vp8/common/arm/neon/shortidct4x4llm_neon.asm
   vp8/common/arm/neon/sixtappredict4x4_neon.asm
   vp8/common/arm/neon/sixtappredict8x4_neon.asm
   vp8/common/arm/neon/sixtappredict8x8_neon.asm
   vp8/common/arm/neon/sixtappredict16x16_neon.asm
   vp8/common/arm/neon/recon16x16mb_neon.asm
   vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
   vp8/common/arm/neon/save_neon_reg.asm
-  vp8/decoder/arm/detokenize.asm
   vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
   vp8/decoder/arm/armv6/dequant_idct_v6.asm
   vp8/decoder/arm/armv6/dequantize_v6.asm
   vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
   vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
   vp8/decoder/arm/neon/dequant_idct_neon.asm
   vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
   vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -313,22 +309,10 @@ for f in ${commonFiles[@]}
 do
   mkdir -p -v `dirname $f`
   cp -v $1/$f $f
 done
 
 # Patch to compile with Sun Studio on Solaris
 patch -p3 < solaris.patch
 
-# Patch to fix link with xcode4
-patch -p1 < xcode4.patch
-
-# Patch to fix data race on global function pointers
-patch -p3 < bug640935.patch
-
-# Patch to avoid text relocations on ARM
-patch -p3 < bug646815.patch
-
-# Patch to fix alignment problems with using ARM asm in Thumb mode.
-patch -p3 < bug666931.patch
-
-# Patch to make chroma planes 16-byte aligned.
-patch -p3 < bug671818.patch
+# Patch to fix errors including C headers in C++
+patch -p3 < compile_errors.patch
--- a/media/libvpx/vp8/common/alloccommon.c
+++ b/media/libvpx/vp8/common/alloccommon.c
@@ -11,47 +11,51 @@
 
 #include "vpx_ports/config.h"
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 #include "onyxc_int.h"
 #include "findnearmv.h"
 #include "entropymode.h"
 #include "systemdependent.h"
-#include "vpxerrors.h"
 
 
 extern  void vp8_init_scan_order_mask();
 
-void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols)
+static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
 {
     int i;
     vpx_memset(mi - cols - 2, 0, sizeof(MODE_INFO) * (cols + 1));
 
     for (i = 0; i < rows; i++)
     {
+        /* TODO(holmer): Bug? This updates the last element of each row
+         * rather than the border element!
+         */
         vpx_memset(&mi[i*cols-1], 0, sizeof(MODE_INFO));
     }
 }
 
 void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
 {
     int i;
 
     for (i = 0; i < NUM_YV12_BUFFERS; i++)
         vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
 
     vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
     vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
 
     vpx_free(oci->above_context);
     vpx_free(oci->mip);
+    vpx_free(oci->prev_mip);
 
     oci->above_context = 0;
     oci->mip = 0;
+    oci->prev_mip = 0;
 
 }
 
 int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
 {
     int i;
 
     vp8_de_alloc_frame_buffers(oci);
@@ -61,121 +65,139 @@ int vp8_alloc_frame_buffers(VP8_COMMON *
         width += 16 - (width & 0xf);
 
     if ((height & 0xf) != 0)
         height += 16 - (height & 0xf);
 
 
     for (i = 0; i < NUM_YV12_BUFFERS; i++)
     {
-      oci->fb_idx_ref_cnt[0] = 0;
-
-      if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i],  width, height, VP8BORDERINPIXELS) < 0)
+        oci->fb_idx_ref_cnt[i] = 0;
+        oci->yv12_fb[i].flags = 0;
+        if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
         {
             vp8_de_alloc_frame_buffers(oci);
-            return ALLOC_FAILURE;
+            return 1;
         }
     }
 
     oci->new_fb_idx = 0;
     oci->lst_fb_idx = 1;
     oci->gld_fb_idx = 2;
     oci->alt_fb_idx = 3;
 
     oci->fb_idx_ref_cnt[0] = 1;
     oci->fb_idx_ref_cnt[1] = 1;
     oci->fb_idx_ref_cnt[2] = 1;
     oci->fb_idx_ref_cnt[3] = 1;
 
     if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
     if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
     oci->mb_rows = height >> 4;
     oci->mb_cols = width >> 4;
     oci->MBs = oci->mb_rows * oci->mb_cols;
     oci->mode_info_stride = oci->mb_cols + 1;
     oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
 
     if (!oci->mip)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
     oci->mi = oci->mip + oci->mode_info_stride + 1;
 
+    /* allocate memory for last frame MODE_INFO array */
+#if CONFIG_ERROR_CONCEALMENT
+    oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+    if (!oci->prev_mip)
+    {
+        vp8_de_alloc_frame_buffers(oci);
+        return 1;
+    }
+
+    oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
+#else
+    oci->prev_mip = NULL;
+    oci->prev_mi = NULL;
+#endif
 
     oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
 
     if (!oci->above_context)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
-    vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
+    update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
+#if CONFIG_ERROR_CONCEALMENT
+    update_mode_info_border(oci->prev_mi, oci->mb_rows, oci->mb_cols);
+#endif
 
     return 0;
 }
 void vp8_setup_version(VP8_COMMON *cm)
 {
     switch (cm->version)
     {
     case 0:
         cm->no_lpf = 0;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 0;
         cm->full_pixel = 0;
         break;
     case 1:
         cm->no_lpf = 0;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 0;
         break;
     case 2:
         cm->no_lpf = 1;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 0;
         break;
     case 3:
         cm->no_lpf = 1;
-        cm->simpler_lpf = 1;
+        cm->filter_type = SIMPLE_LOOPFILTER;
         cm->use_bilinear_mc_filter = 1;
         cm->full_pixel = 1;
         break;
     default:
         /*4,5,6,7 are reserved for future use*/
         cm->no_lpf = 0;
-        cm->simpler_lpf = 0;
+        cm->filter_type = NORMAL_LOOPFILTER;
         cm->use_bilinear_mc_filter = 0;
         cm->full_pixel = 0;
         break;
     }
 }
 void vp8_create_common(VP8_COMMON *oci)
 {
     vp8_machine_specific_config(oci);
     vp8_default_coef_probs(oci);
     vp8_init_mbmode_probs(oci);
     vp8_default_bmode_probs(oci->fc.bmode_prob);
 
     oci->mb_no_coeff_skip = 1;
     oci->no_lpf = 0;
-    oci->simpler_lpf = 0;
+    oci->filter_type = NORMAL_LOOPFILTER;
     oci->use_bilinear_mc_filter = 0;
     oci->full_pixel = 0;
     oci->multi_token_partition = ONE_PARTITION;
     oci->clr_type = REG_YUV;
     oci->clamp_type = RECON_CLAMP_REQUIRED;
 
     /* Initialise reference frame sign bias structure to defaults */
     vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
--- a/media/libvpx/vp8/common/arm/arm_systemdependent.c
+++ b/media/libvpx/vp8/common/arm/arm_systemdependent.c
@@ -6,37 +6,40 @@
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/arm.h"
-#include "g_common.h"
-#include "pragmas.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
+#include "vp8/common/g_common.h"
+#include "vp8/common/pragmas.h"
+#include "vp8/common/subpixel.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/onyxc_int.h"
 
 void vp8_arch_arm_common_init(VP8_COMMON *ctx)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
     VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
     int flags = arm_cpu_caps();
-    int has_edsp = flags & HAS_EDSP;
-    int has_media = flags & HAS_MEDIA;
-    int has_neon = flags & HAS_NEON;
     rtcd->flags = flags;
 
     /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV5TE
+    if (flags & HAS_EDSP)
+    {
+    }
+#endif
+
 #if HAVE_ARMV6
-    if (has_media)
+    if (flags & HAS_MEDIA)
     {
         rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_armv6;
         rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_armv6;
         rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_armv6;
         rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_armv6;
         rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_armv6;
         rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_armv6;
         rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_armv6;
@@ -46,32 +49,34 @@ void vp8_arch_arm_common_init(VP8_COMMON
         rtcd->idct.idct16       = vp8_short_idct4x4llm_v6_dual;
         rtcd->idct.iwalsh1      = vp8_short_inv_walsh4x4_1_v6;
         rtcd->idct.iwalsh16     = vp8_short_inv_walsh4x4_v6;
 
         rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_armv6;
         rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
         rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
         rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+        rtcd->loopfilter.simple_mb_v =
+                vp8_loop_filter_simple_vertical_edge_armv6;
         rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+        rtcd->loopfilter.simple_mb_h =
+                vp8_loop_filter_simple_horizontal_edge_armv6;
         rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;
 
         rtcd->recon.copy16x16   = vp8_copy_mem16x16_v6;
         rtcd->recon.copy8x8     = vp8_copy_mem8x8_v6;
         rtcd->recon.copy8x4     = vp8_copy_mem8x4_v6;
         rtcd->recon.recon       = vp8_recon_b_armv6;
         rtcd->recon.recon2      = vp8_recon2b_armv6;
         rtcd->recon.recon4      = vp8_recon4b_armv6;
     }
 #endif
 
 #if HAVE_ARMV7
-    if (has_neon)
+    if (flags & HAS_NEON)
     {
         rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_neon;
         rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_neon;
         rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_neon;
         rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_neon;
         rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_neon;
         rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_neon;
         rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_neon;
--- a/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/media/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -10,43 +10,43 @@
 
 
     EXPORT  |vp8_filter_block2d_bil_first_pass_armv6|
     EXPORT  |vp8_filter_block2d_bil_second_pass_armv6|
 
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
 ;-------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    unsigned short *output_ptr,
-; r2    unsigned int src_pixels_per_line,
-; r3    unsigned int output_height,
-; stack    unsigned int output_width,
-; stack    const short *vp8_filter
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;-------------------------------------
 ; The output is transposed stroed in output array to make it easy for second pass filtering.
 |vp8_filter_block2d_bil_first_pass_armv6| PROC
     stmdb   sp!, {r4 - r11, lr}
 
     ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width
 
     mov     r12, r3                         ; outer-loop counter
-    sub     r2, r2, r4                      ; src increment for height loop
 
-    ;;IF ARCHITECTURE=6
-    pld     [r0]
-    ;;ENDIF
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
 
     ldr     r5, [r11]                       ; load up filter coefficients
 
-    mov     r3, r3, lsl #1                  ; output_height*2
+    mov     r3, r3, lsl #1                  ; height*2
     add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
 
-    mov     r11, r1                         ; save output_ptr for each row
+    mov     r11, r1                         ; save dst_ptr for each row
 
     cmp     r5, #128                        ; if filter coef = 128, then skip the filter
     beq     bil_null_1st_filter
 
 |bil_height_loop_1st_v6|
     ldrb    r6, [r0]                        ; load source data
     ldrb    r7, [r0, #1]
     ldrb    r8, [r0, #2]
@@ -91,19 +91,18 @@
     ldrneb  r7, [r0, #1]
     ldrneb  r8, [r0, #2]
 
     bne     bil_width_loop_1st_v6
 
     add     r0, r0, r2                      ; move to next input row
     subs    r12, r12, #1
 
-    ;;IF ARCHITECTURE=6
-    pld     [r0]
-    ;;ENDIF
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
 
     add     r11, r11, #2                    ; move over to next column
     mov     r1, r11
 
     bne     bil_height_loop_1st_v6
 
     ldmia   sp!, {r4 - r11, pc}
 
@@ -135,27 +134,27 @@
 
     ldmia   sp!, {r4 - r11, pc}
 
     ENDP  ; |vp8_filter_block2d_bil_first_pass_armv6|
 
 
 ;---------------------------------
 ; r0    unsigned short *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    int output_pitch,
-; r3    unsigned int  output_height,
-; stack unsigned int  output_width,
-; stack const short *vp8_filter
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;---------------------------------
 |vp8_filter_block2d_bil_second_pass_armv6| PROC
     stmdb   sp!, {r4 - r11, lr}
 
     ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width
 
     ldr     r5, [r11]                       ; load up filter coefficients
     mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
     mov     r11, r1
 
     cmp     r5, #128                        ; if filter coef = 128, then skip the filter
     beq     bil_null_2nd_filter
 
--- a/media/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
+++ b/media/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -17,19 +17,17 @@
     AREA    Block, CODE, READONLY ; name this block of code
 ;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
 |vp8_copy_mem16x16_v6| PROC
     stmdb       sp!, {r4 - r7}
     ;push   {r4-r7}
 
     ;preload
-    pld     [r0]
-    pld     [r0, r1]
-    pld     [r0, r1, lsl #1]
+    pld     [r0, #31]                ; preload for next 16x16 block
 
     ands    r4, r0, #15
     beq     copy_mem16x16_fast
 
     ands    r4, r0, #7
     beq     copy_mem16x16_8
 
     ands    r4, r0, #3
@@ -85,16 +83,18 @@ copy_mem16x16_1_loop
 
     add     r2, r2, r3
 
     ldrneb  r4, [r0]
     ldrneb  r5, [r0, #1]
     ldrneb  r6, [r0, #2]
     ldrneb  r7, [r0, #3]
 
+    pld     [r0, #31]               ; preload for next 16x16 block
+
     bne     copy_mem16x16_1_loop
 
     ldmia       sp!, {r4 - r7}
     ;pop        {r4-r7}
     mov     pc, lr
 
 ;copy 4 bytes each time
 copy_mem16x16_4
@@ -116,16 +116,18 @@ copy_mem16x16_4_loop
 
     add     r2, r2, r3
 
     ldrne   r4, [r0]
     ldrne   r5, [r0, #4]
     ldrne   r6, [r0, #8]
     ldrne   r7, [r0, #12]
 
+    pld     [r0, #31]               ; preload for next 16x16 block
+
     bne     copy_mem16x16_4_loop
 
     ldmia       sp!, {r4 - r7}
     ;pop        {r4-r7}
     mov     pc, lr
 
 ;copy 8 bytes each time
 copy_mem16x16_8
@@ -143,16 +145,17 @@ copy_mem16x16_8_loop
 
     stmia   r2!, {r4-r5}
     subs    r12, r12, #1
     ;stm        r2, {r4-r5}
     stmia   r2!, {r6-r7}
 
     add     r2, r2, r3
 
+    pld     [r0, #31]               ; preload for next 16x16 block
     bne     copy_mem16x16_8_loop
 
     ldmia       sp!, {r4 - r7}
     ;pop        {r4-r7}
     mov     pc, lr
 
 ;copy 16 bytes each time
 copy_mem16x16_fast
@@ -166,16 +169,17 @@ copy_mem16x16_fast_loop
     ;ldm        r0, {r4-r7}
     add     r0, r0, r1
 
     subs    r12, r12, #1
     stmia   r2, {r4-r7}
     ;stm        r2, {r4-r7}
     add     r2, r2, r3
 
+    pld     [r0, #31]               ; preload for next 16x16 block
     bne     copy_mem16x16_fast_loop
 
     ldmia       sp!, {r4 - r7}
     ;pop        {r4-r7}
     mov     pc, lr
 
     ENDP  ; |vp8_copy_mem16x16_v6|
 
--- a/media/libvpx/vp8/common/arm/armv6/filter_v6.asm
+++ b/media/libvpx/vp8/common/arm/armv6/filter_v6.asm
@@ -5,16 +5,18 @@
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
     EXPORT  |vp8_filter_block2d_first_pass_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6|
+    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6|
     EXPORT  |vp8_filter_block2d_second_pass_armv6|
     EXPORT  |vp8_filter4_block2d_second_pass_armv6|
     EXPORT  |vp8_filter_block2d_first_pass_only_armv6|
     EXPORT  |vp8_filter_block2d_second_pass_only_armv6|
 
     AREA    |.text|, CODE, READONLY  ; name this block of code
 ;-------------------------------------
 ; r0    unsigned char *src_ptr
@@ -35,21 +37,16 @@
     sub     r2, r2, r3                      ; inside loop increments input array,
                                             ; so the height loop only needs to add
                                             ; r2 - width to the input pointer
 
     mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
     add     r12, r3, #16                    ; square off the output
     sub     sp, sp, #4
 
-    ;;IF ARCHITECTURE=6
-    ;pld        [r0, #-2]
-    ;;pld       [r0, #30]
-    ;;ENDIF
-
     ldr     r4, [r11]                       ; load up packed filter coefficients
     ldr     r5, [r11, #4]
     ldr     r6, [r11, #8]
 
     str     r1, [sp]                        ; push destination to stack
     mov     r7, r7, lsl #16                 ; height is top part of counter
 
 ; six tap filter
@@ -96,29 +93,210 @@
 
     strh    lr, [r1], r12                   ; result is transposed and stored, which
                                             ; will make second pass filtering easier.
     ldrneb  r10, [r0], #2
     strh    r11, [r1], r12
 
     bne     width_loop_1st_6
 
-    ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines
-    ;;IF ARCHITECTURE=6
-    ;pld        [r0, r2]
-    ;;pld       [r0, r9]
-    ;;ENDIF
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #18                     ; preload next low
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_16_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_16_6
 
     ldr     r1, [sp]                        ; load and update dst address
     subs    r7, r7, #0x10000
     add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #34                    ; adding back block width(=16)
+    pld     [r0, r11]                       ; preload next low
+
     add     r1, r1, #2                      ; move over to next column
     str     r1, [sp]
 
-    bne     height_loop_1st_6
+    bne     height_loop_1st_16_6
+
+    add     sp, sp, #4
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vp8_filter address
+    ldr     r7, [sp, #36]                   ; output height
+
+    add     r4, r2, #10                     ; preload next low
+    pld     [r0, r4]
+
+    sub     r2, r2, r3                      ; inside loop increments input array,
+                                            ; so the height loop only needs to add
+                                            ; r2 - width to the input pointer
+
+    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
+    add     r12, r3, #16                    ; square off the output
+    sub     sp, sp, #4
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r1, [sp]                        ; push destination to stack
+    mov     r7, r7, lsl #16                 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+    ldrb    r8, [r0, #-2]                   ; load source data
+    ldrb    r9, [r0, #-1]
+    ldrb    r10, [r0], #2
+    orr     r7, r7, r3, lsr #2              ; construct loop counter
+
+|width_loop_1st_8_6|
+    ldrb    r11, [r0, #-1]
+
+    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
+
+    ldrb    r9, [r0]
+
+    smuad   lr, lr, r4                      ; apply the filter
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0, #1]
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0, #2]
+
+    sub     r7, r7, #1
+
+    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r11, r10, r6, r8
+
+    ands    r10, r7, #0xff                  ; test loop counter
+
+    add     lr, lr, #0x40                   ; round_shift_and_clamp
+    ldrneb  r8, [r0, #-2]                   ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r11, r11, #0x40
+    ldrneb  r9, [r0, #-1]
+    usat    r11, #8, r11, asr #7
+
+    strh    lr, [r1], r12                   ; result is transposed and stored, which
+                                            ; will make second pass filtering easier.
+    ldrneb  r10, [r0], #2
+    strh    r11, [r1], r12
+
+    bne     width_loop_1st_8_6
+
+    ldr     r1, [sp]                        ; load and update dst address
+    subs    r7, r7, #0x10000
+    add     r0, r0, r2                      ; move to next input line
+
+    add     r11, r2, #18                    ; adding back block width(=8)
+    pld     [r0, r11]                       ; preload next low
+
+    add     r1, r1, #2                      ; move over to next column
+    str     r1, [sp]
+
+    bne     height_loop_1st_8_6
 
     add     sp, sp, #4
     ldmia   sp!, {r4 - r11, pc}
 
     ENDP
 
 ;---------------------------------
 ; r0    short         *src_ptr,
@@ -257,16 +435,20 @@
 ; r2    unsigned int src_pixels_per_line
 ; r3    unsigned int cnt,
 ; stack unsigned int output_pitch,
 ; stack const short *vp8_filter
 ;------------------------------------
 |vp8_filter_block2d_first_pass_only_armv6| PROC
     stmdb   sp!, {r4 - r11, lr}
 
+    add     r7, r2, r3                      ; preload next low
+    add     r7, r7, #2
+    pld     [r0, r7]
+
     ldr     r4, [sp, #36]                   ; output pitch
     ldr     r11, [sp, #40]                  ; HFilter address
     sub     sp, sp, #8
 
     mov     r7, r3
     sub     r2, r2, r3                      ; inside loop increments input array,
                                             ; so the height loop only needs to add
                                             ; r2 - width to the input pointer
@@ -325,26 +507,25 @@
     usat    r10, #8, r10, asr #7
 
     ldrneb  r9, [r0, #-1]
     strb    r10, [r1], #1
     ldrneb  r10, [r0], #2
 
     bne     width_loop_1st_only_6
 
-    ;;add       r9, r2, #30                 ; attempt to load 2 adjacent cache lines
-    ;;IF ARCHITECTURE=6
-    ;pld        [r0, r2]
-    ;;pld       [r0, r9]
-    ;;ENDIF
-
     ldr     lr, [sp]                        ; load back output pitch
     ldr     r12, [sp, #4]                   ; load back output pitch
     subs    r7, r7, #1
     add     r0, r0, r12                     ; updata src for next loop
+
+    add     r11, r12, r3                    ; preload next low
+    add     r11, r11, #2
+    pld     [r0, r11]
+
     add     r1, r1, lr                      ; update dst for next loop
 
     bne     height_loop_1st_only_6
 
     add     sp, sp, #8
     ldmia   sp!, {r4 - r11, pc}
     ENDP  ; |vp8_filter_block2d_first_pass_only_armv6|
 
--- a/media/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
+++ b/media/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -48,43 +48,44 @@
 
 
 src         RN  r0
 pstep       RN  r1
 count       RN  r5
 
 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
+;r2     const char *blimit,
 ;r3     const char *limit,
 ;stack  const char *thresh,
 ;stack  int  count
 
-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
     sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
     ldr         count, [sp, #40]            ; count for 8-in-parallel
     ldr         r6, [sp, #36]               ; load thresh address
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |Hnext8|
     ; vp8_filter_mask() function
     ; calculate breakout conditions
     ldr         r12, [src], pstep           ; p0
 
     uqsub8      r6, r9, r10                 ; p3 - p2
     uqsub8      r7, r10, r9                 ; p2 - p3
@@ -248,22 +249,16 @@ count       RN  r5
     sub         src, src, pstep, lsl #1
 
 |hskip_filter|
     add         src, src, #4
     sub         src, src, pstep, lsl #2
 
     subs        count, count, #1
 
-    ;pld            [src]
-    ;pld            [src, pstep]
-    ;pld            [src, pstep, lsl #1]
-    ;pld            [src, pstep, lsl #2]
-    ;pld            [src, pstep, lsl #3]
-
     ldrne       r9, [src], pstep            ; p3
     ldrne       r10, [src], pstep           ; p2
     ldrne       r11, [src], pstep           ; p1
 
     bne         Hnext8
 
     add         sp, sp, #16
     ldmia       sp!, {r4 - r11, pc}
@@ -276,24 +271,28 @@ count       RN  r5
     stmdb       sp!, {r4 - r11, lr}
 
     sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
     ldr         count, [sp, #40]            ; count for 8-in-parallel
     ldr         r6, [sp, #36]               ; load thresh address
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |MBHnext8|
 
     ; vp8_filter_mask() function
     ; calculate breakout conditions
     ldr         r12, [src], pstep           ; p0
 
     uqsub8      r6, r9, r10                 ; p3 - p2
@@ -585,25 +584,29 @@ count       RN  r5
     stmdb       sp!, {r4 - r11, lr}
 
     sub         src, src, #4                ; move src pointer down by 4
     ldr         count, [sp, #40]            ; count for 8-in-parallel
     ldr         r12, [sp, #36]              ; load thresh address
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
     ldr         lr, [src], pstep
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |Vnext8|
 
     ; vp8_filter_mask() function
     ; calculate breakout conditions
     ; transpose the source data for 4-in-parallel operation
     TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
 
@@ -852,28 +855,36 @@ count       RN  r5
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_mbloop_filter_vertical_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
     sub         src, src, #4                ; move src pointer down by 4
     ldr         count, [sp, #40]            ; count for 8-in-parallel
     ldr         r12, [sp, #36]              ; load thresh address
+    pld         [src, #23]                  ; preload for next block
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
+    pld         [src, #23]
     ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
+    pld         [src, #23]
     ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
+    pld         [src, #23]
     ldr         lr, [src], pstep
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |MBVnext8|
     ; vp8_filter_mask() function
     ; calculate breakout conditions
     ; transpose the source data for 4-in-parallel operation
     TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
 
     uqsub8      r7, r9, r10                 ; p3 - p2
@@ -903,16 +914,17 @@ count       RN  r5
     ldr         r6, [src], pstep            ; load source data
     str         r11, [sp]                   ; push r11 to stack
     ldr         r7, [src], pstep
     str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
     ldr         r8, [src], pstep
     str         lr, [sp, #8]
     ldr         lr, [src], pstep
 
+
     TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
 
     ldr         lr, [sp, #8]                ; load back (f)limit accumulator
 
     uqsub8      r6, r12, r11                ; q3 - q2
     uqsub8      r7, r11, r12                ; q2 - q3
     uqsub8      r12, r11, r10               ; q2 - q1
     uqsub8      r11, r10, r11               ; q1 - q2
@@ -951,16 +963,17 @@ count       RN  r5
     usub8       lr, r12, lr
     ldr         r9, [sp]                    ; load the compared result
     sel         lr, r11, r12                ; filter mask: lr
 
     cmp         lr, #0
     beq         mbvskip_filter               ; skip filtering
 
 
+
     ;vp8_hevmask() function
     ;calculate high edge variance
 
     sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
 
     orr         r9, r9, r10
 
     ldrh        r7, [src, #-2]
@@ -1118,16 +1131,17 @@ count       RN  r5
     mov         lr, #0x12                   ; 18
     mov         r7, #0x3f                   ; 63
 
     sxtb16      r6, r12
     sxtb16      r10, r12, ror #8
     smlabb      r8, r6, lr, r7
     smlatb      r6, r6, lr, r7
     smlabb      r9, r10, lr, r7
+
     smlatb      r10, r10, lr, r7
     ssat        r8, #8, r8, asr #7
     ssat        r6, #8, r6, asr #7
     ssat        r9, #8, r9, asr #7
     ssat        r10, #8, r10, asr #7
 
     sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
 
@@ -1237,19 +1251,23 @@ count       RN  r5
 
     ;adjust src pointer for next loop
     sub         src, src, #2
 
 |mbvskip_filter|
     sub         src, src, #4
     subs        count, count, #1
 
+    pld         [src, #23]                  ; preload for next block
     ldrne       r6, [src], pstep            ; load source data
+    pld         [src, #23]
     ldrne       r7, [src], pstep
+    pld         [src, #23]
     ldrne       r8, [src], pstep
+    pld         [src, #23]
     ldrne       lr, [src], pstep
 
     bne         MBVnext8
 
     add         sp, sp, #16
 
     ldmia       sp!, {r4 - r11, pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|
--- a/media/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/media/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -40,45 +40,38 @@
     pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
     pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3
 
     pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
     pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
     MEND
 
 
+
 src         RN  r0
 pstep       RN  r1
 
 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
-;r3     const char *limit,
-;stack  const char *thresh,
-;stack  int  count
-
-; All 16 elements in flimit are equal. So, in the code, only one load is needed
-; for flimit. Same applies to limit. thresh is not used in simple looopfilter
+;r2     const char *blimit
 
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
-    ldr         r12, [r3]                   ; limit
+    ldrb        r12, [r2]                   ; blimit
     ldr         r3, [src, -pstep, lsl #1]   ; p1
     ldr         r4, [src, -pstep]           ; p0
     ldr         r5, [src]                   ; q0
     ldr         r6, [src, pstep]            ; q1
-    ldr         r7, [r2]                    ; flimit
+    orr         r12, r12, r12, lsl #8       ; blimit
     ldr         r2, c0x80808080
-    ldr         r9, [sp, #40]               ; count for 8-in-parallel
-    uadd8       r7, r7, r7                  ; flimit * 2
-    mov         r9, r9, lsl #1              ; double the count. we're doing 4 at a time
-    uadd8       r12, r7, r12                ; flimit * 2 + limit
+    orr         r12, r12, r12, lsl #16      ; blimit
+    mov         r9, #4                      ; double the count. we're doing 4 at a time
     mov         lr, #0                      ; need 0 in a couple places
 
 |simple_hnext8|
     ; vp8_simple_filter_mask()
 
     uqsub8      r7, r3, r6                  ; p1 - q1
     uqsub8      r8, r6, r3                  ; q1 - p1
     uqsub8      r10, r4, r5                 ; p0 - q0
@@ -143,40 +136,42 @@ pstep       RN  r1
     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6|
 
 
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_simple_vertical_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
-    ldr         r12, [r2]                   ; r12: flimit
+    ldrb        r12, [r2]                   ; r12: blimit
     ldr         r2, c0x80808080
-    ldr         r7, [r3]                    ; limit
+    orr         r12, r12, r12, lsl #8
 
     ; load soure data to r7, r8, r9, r10
     ldrh        r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
     ldrh        r4, [src], pstep
-    uadd8       r12, r12, r12               ; flimit * 2
+    orr         r12, r12, r12, lsl #16
 
     ldrh        r5, [src, #-2]
+    pld         [src, #23]
     ldrh        r6, [src], pstep
-    uadd8       r12, r12, r7                ; flimit * 2 + limit
 
     pkhbt       r7, r3, r4, lsl #16
 
     ldrh        r3, [src, #-2]
+    pld         [src, #23]
     ldrh        r4, [src], pstep
-    ldr         r11, [sp, #40]              ; count (r11) for 8-in-parallel
 
     pkhbt       r8, r5, r6, lsl #16
 
     ldrh        r5, [src, #-2]
+    pld         [src, #23]
     ldrh        r6, [src], pstep
-    mov         r11, r11, lsl #1            ; 4-in-parallel
+    mov         r11, #4                     ; double the count. we're doing 4 at a time
 
 |simple_vnext8|
     ; vp8_simple_filter_mask() function
     pkhbt       r9, r3, r4, lsl #16
     pkhbt       r10, r5, r6, lsl #16
 
     ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
     TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
@@ -254,29 +249,33 @@ pstep       RN  r1
     strb        r4, [src, #-1]
     strb        r5, [src], pstep
 
 |simple_vskip_filter|
     subs        r11, r11, #1
 
     ; load soure data to r7, r8, r9, r10
     ldrneh      r3, [src, #-2]
+    pld         [src, #23]                  ; preload for next block
     ldrneh      r4, [src], pstep
 
     ldrneh      r5, [src, #-2]
+    pld         [src, #23]
     ldrneh      r6, [src], pstep
 
     pkhbt       r7, r3, r4, lsl #16
 
     ldrneh      r3, [src, #-2]
+    pld         [src, #23]
     ldrneh      r4, [src], pstep
 
     pkhbt       r8, r5, r6, lsl #16
 
     ldrneh      r5, [src, #-2]
+    pld         [src, #23]
     ldrneh      r6, [src], pstep
 
     bne         simple_vnext8
 
     ldmia       sp!, {r4 - r11, pc}
     ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6|
 
 ; Constant Pool
--- a/media/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
+++ b/media/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -30,16 +30,19 @@
     cmp         r2, #0                      ;skip first_pass filter if xoffset=0
     add         lr, sp, #4                  ;point to temporary buffer
     beq         skip_firstpass_filter
 
 ;first-pass filter
     adr         r12, filter8_coeff
     sub         r0, r0, r1, lsl #1
 
+    add         r3, r1, #10                 ; preload next low
+    pld         [r0, r3]
+
     add         r2, r12, r2, lsl #4         ;calculate filter location
     add         r0, r0, #3                  ;adjust src only for loading convinience
 
     ldr         r3, [r2]                    ; load up packed filter coefficients
     ldr         r4, [r2, #4]
     ldr         r5, [r2, #8]
 
     mov         r2, #0x90000                ; height=9 is top part of counter
@@ -105,16 +108,19 @@
     ;;ENDIF
 
     subs        r2, r2, #0x10000
 
     sub         lr, lr, #158
 
     add         r0, r0, r1                  ; move to next input line
 
+    add         r11, r1, #18                ; preload next low. adding back block width(=8), which is subtracted earlier
+    pld         [r0, r11]
+
     bne         first_pass_hloop_v6
 
 ;second pass filter
 secondpass_filter
     ldr         r3, [sp], #4                ; load back yoffset
     ldr         r0, [sp, #216]              ; load dst address from stack 180+36
     ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
 
@@ -238,18 +244,16 @@ skip_secondpass_hloop
 
     add         sp, sp, #16                 ; 180 - (160 +4)
 
     ldmia       sp!, {r4 - r11, pc}
 
     ENDP
 
 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
 ;One word each is reserved. Label filter_coeff can be used to access the data.
 ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
 filter8_coeff
     DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
     DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
     DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
     DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
     DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
--- a/media/libvpx/vp8/common/arm/bilinearfilter_arm.c
+++ b/media/libvpx/vp8/common/arm/bilinearfilter_arm.c
@@ -5,156 +5,57 @@
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include <math.h>
-#include "subpixel.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-static const short bilinear_filters[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
-
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-);
-
-#if 0
-void vp8_filter_block2d_bil_first_pass_6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = ( ( (int)src_ptr[0]          * vp8_filter[0]) +
-                               ((int)src_ptr[1] * vp8_filter[1]) +
-                                (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-void vp8_filter_block2d_bil_second_pass_6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i,j;
-    int  Temp;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply filter */
-            Temp =  ((int)src_ptr[0]         * vp8_filter[0]) +
-                    ((int)src_ptr[output_width] * vp8_filter[1]) +
-                    (VP8_FILTER_WEIGHT/2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        /*src_ptr    += src_pixels_per_line - output_width;*/
-        output_ptr += output_pitch;
-    }
-}
-#endif
+#include "vp8/common/filter.h"
+#include "vp8/common/subpixel.h"
+#include "bilinearfilter_arm.h"
 
 void vp8_filter_block2d_bil_armv6
 (
     unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
     unsigned int   dst_pitch,
-    const short      *HFilter,
-    const short      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
     int            Width,
     int            Height
 )
 {
-
-    unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */
 
     /* First filter 1-D horizontally... */
-    /* pixel_step = 1; */
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
 
     /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 
 
 void vp8_bilinear_predict4x4_armv6
 (
     unsigned char  *src_ptr,
     int   src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
 
 void vp8_bilinear_predict8x8_armv6
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
@@ -162,18 +63,18 @@ void vp8_bilinear_predict8x8_armv6
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 }
 
 void vp8_bilinear_predict8x4_armv6
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
@@ -181,18 +82,18 @@ void vp8_bilinear_predict8x4_armv6
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 }
 
 void vp8_bilinear_predict16x16_armv6
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
@@ -200,13 +101,13 @@ void vp8_bilinear_predict16x16_armv6
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short         *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
--- a/media/libvpx/vp8/common/arm/filter_arm.c
+++ b/media/libvpx/vp8/common/arm/filter_arm.c
@@ -6,37 +6,43 @@
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "vpx_ports/config.h"
 #include <math.h>
-#include "subpixel.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/subpixel.h"
 #include "vpx_ports/mem.h"
 
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
+extern void vp8_filter_block2d_first_pass_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
 
-DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
-{
-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
-    { 0, -1,   12,  123,  -6,  0 },
-};
+// 8x8
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+    unsigned char *src_ptr,
+    short         *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    const short *vp8_filter
+);
 
-
-extern void vp8_filter_block2d_first_pass_armv6
+// 16x16
+extern void vp8_filter_block2d_first_pass_16x16_armv6
 (
     unsigned char *src_ptr,
     short         *output_ptr,
     unsigned int src_pixels_per_line,
     unsigned int output_width,
     unsigned int output_height,
     const short *vp8_filter
 );
@@ -88,21 +94,21 @@ void vp8_sixtap_predict_armv6
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
 
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* Vfilter is null. First pass only */
     if (xoffset && !yoffset)
     {
         /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
         vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
 
         vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
@@ -124,93 +130,52 @@ void vp8_sixtap_predict_armv6
         else
         {
             vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
             vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
         }
     }
 }
 
-#if 0
-void vp8_sixtap_predict8x4_armv6
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
-
-
-    /*if (xoffset && !yoffset)
-    {
-        vp8_filter_block2d_first_pass_only_armv6 (  src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
-    }*/
-    /* Hfilter is null. Second pass only */
-    /*else if (!xoffset && yoffset)
-    {
-        vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
-    }
-    else
-    {
-        if (yoffset & 0x1)
-            vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
-        else*/
-
-        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
-
-        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
-    /*}*/
-}
-#endif
-
 void vp8_sixtap_predict8x8_armv6
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     if (xoffset && !yoffset)
     {
         vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
     }
     /* Hfilter is null. Second pass only */
     else if (!xoffset && yoffset)
     {
         vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
     }
     else
     {
         if (yoffset & 0x1)
         {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
             vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
         }
         else
         {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+            vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
             vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
         }
     }
 }
 
 
 void vp8_sixtap_predict16x16_armv6
 (
@@ -219,38 +184,38 @@ void vp8_sixtap_predict16x16_armv6
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     if (xoffset && !yoffset)
     {
         vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
     }
     /* Hfilter is null. Second pass only */
     else if (!xoffset && yoffset)
     {
         vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
     }
     else
     {
         if (yoffset & 0x1)
         {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
             vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
         }
         else
         {
-            vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+            vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
             vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
         }
     }
 
 }
 #endif
--- a/media/libvpx/vp8/common/arm/loopfilter_arm.c
+++ b/media/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -4,234 +4,173 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
-#include "vpx_ports/config.h"
-#include <math.h>
-#include "loopfilter.h"
-#include "onyxc_int.h"
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
 
+#if HAVE_ARMV6
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
+#endif
+
+#if HAVE_ARMV7
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+        unsigned char blimit, unsigned char limit, unsigned char thresh,
+        unsigned char *v);
 
-extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
-extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
 
-extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
-extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
-
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
 
 #if HAVE_ARMV6
 /*ARMV6 loopfilter functions*/
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
 }
 #endif
 
 #if HAVE_ARMV7
 /* NEON loopfilter functions */
 /* Horizontal MB filtering */
 void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+    vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
-}
-
-void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }
 
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    unsigned char mblim = *lfi->mblim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, v_ptr);
-}
-
-void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
 }
 
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4 * uv_stride);
-}
-
-void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+        vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
 }
 
 /* Vertical B Filtering */
 void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    unsigned char blim = *lfi->blim;
+    unsigned char lim = *lfi->lim;
+    unsigned char hev_thr = *lfi->hev_thr;
+
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+    vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, v_ptr + 4);
-}
-
-void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                              int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+        vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
 }
 #endif
--- a/media/libvpx/vp8/common/arm/loopfilter_arm.h
+++ b/media/libvpx/vp8/common/arm/loopfilter_arm.h
@@ -7,62 +7,65 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #ifndef LOOPFILTER_ARM_H
 #define LOOPFILTER_ARM_H
 
+#include "vpx_config.h"
+
 #if HAVE_ARMV6
 extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
 #define vp8_lf_normal_mb_v vp8_loop_filter_mbv_armv6
 
 #undef  vp8_lf_normal_b_v
 #define vp8_lf_normal_b_v vp8_loop_filter_bv_armv6
 
 #undef  vp8_lf_normal_mb_h
 #define vp8_lf_normal_mb_h vp8_loop_filter_mbh_armv6
 
 #undef  vp8_lf_normal_b_h
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
 
 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
 
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
 
 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
 
 #if HAVE_ARMV7
 extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
 #define vp8_lf_normal_mb_v vp8_loop_filter_mbv_neon
 
 #undef  vp8_lf_normal_b_v
 #define vp8_lf_normal_b_v vp8_loop_filter_bv_neon
 
@@ -78,12 +81,13 @@ extern prototype_loopfilter_block(vp8_lo
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_neon
 
 #undef  vp8_lf_simple_mb_h
 #define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_neon
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
-#endif
-#endif
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
 
-#endif
+#endif /* HAVE_ARMV7 */
+
+#endif /* LOOPFILTER_ARM_H */
--- a/media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -345,16 +345,13 @@ filt_blk2d_spo16x16_loop_neon
     vst1.u8         {d8, d9}, [r4], r5
 
     bne             filt_blk2d_spo16x16_loop_neon
     pop             {r4-r5,pc}
 
     ENDP
 
 ;-----------------
-    AREA    bifilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 bifilter16_coeff
     DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
 
     END
--- a/media/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -118,16 +118,13 @@ skip_secondpass_filter
     vst1.32         {d29[0]}, [r4], lr
     vst1.32         {d29[1]}, [r4], lr
 
     pop             {r4, pc}
 
     ENDP
 
 ;-----------------
-    AREA    bilinearfilters4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 bifilter4_coeff
     DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
 
     END
--- a/media/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -123,16 +123,13 @@ skip_secondpass_filter
     vst1.u8         {d24}, [r4], lr
     vst1.u8         {d25}, [r4], lr
 
     pop             {r4, pc}
 
     ENDP
 
 ;-----------------
-    AREA    bifilters8x4_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 bifilter8x4_coeff
     DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
 
     END
--- a/media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -171,16 +171,13 @@ skip_secondpass_filter
     vst1.u8         {d28}, [r4], lr
     vst1.u8         {d29}, [r4], lr
 
     pop             {r4, pc}
 
     ENDP
 
 ;-----------------
-    AREA    bifilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 bifilter8_coeff
     DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
 
     END
--- a/media/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
@@ -15,82 +15,66 @@
     PRESERVE8
 
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
 ;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
 |vp8_short_inv_walsh4x4_neon| PROC
 
     ; read in all four lines of values: d0->d3
-    vldm.64 r0, {q0, q1}
+    vld1.i16 {q0-q1}, [r0@128]
 
     ; first for loop
+    vadd.s16 d4, d0, d3 ;a = [0] + [12]
+    vadd.s16 d6, d1, d2 ;b = [4] + [8]
+    vsub.s16 d5, d0, d3 ;d = [0] - [12]
+    vsub.s16 d7, d1, d2 ;c = [4] - [8]
 
-    vadd.s16 d4, d0, d3 ;a = [0] + [12]
-    vadd.s16 d5, d1, d2 ;b = [4] + [8]
-    vsub.s16 d6, d1, d2 ;c = [4] - [8]
-    vsub.s16 d7, d0, d3 ;d = [0] - [12]
-
-    vadd.s16 d0, d4, d5 ;a + b
-    vadd.s16 d1, d6, d7 ;c + d
-    vsub.s16 d2, d4, d5 ;a - b
-    vsub.s16 d3, d7, d6 ;d - c
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
 
     vtrn.32 d0, d2 ;d0:  0  1  8  9
                    ;d2:  2  3 10 11
     vtrn.32 d1, d3 ;d1:  4  5 12 13
                    ;d3:  6  7 14 15
 
     vtrn.16 d0, d1 ;d0:  0  4  8 12
                    ;d1:  1  5  9 13
     vtrn.16 d2, d3 ;d2:  2  6 10 14
                    ;d3:  3  7 11 15
 
     ; second for loop
 
     vadd.s16 d4, d0, d3 ;a = [0] + [3]
-    vadd.s16 d5, d1, d2 ;b = [1] + [2]
-    vsub.s16 d6, d1, d2 ;c = [1] - [2]
-    vsub.s16 d7, d0, d3 ;d = [0] - [3]
+    vadd.s16 d6, d1, d2 ;b = [1] + [2]
+    vsub.s16 d5, d0, d3 ;d = [0] - [3]
+    vsub.s16 d7, d1, d2 ;c = [1] - [2]
+
+    vmov.i16 q8, #3
 
-    vadd.s16 d0, d4, d5 ;e = a + b
-    vadd.s16 d1, d6, d7 ;f = c + d
-    vsub.s16 d2, d4, d5 ;g = a - b
-    vsub.s16 d3, d7, d6 ;h = d - c
+    vadd.s16 q0, q2, q3 ; a+b d+c
+    vsub.s16 q1, q2, q3 ; a-b d-c
 
-    vmov.i16 q2, #3
-    vadd.i16 q0, q0, q2 ;e/f += 3
-    vadd.i16 q1, q1, q2 ;g/h += 3
+    vadd.i16 q0, q0, q8 ;e/f += 3
+    vadd.i16 q1, q1, q8 ;g/h += 3
 
     vshr.s16 q0, q0, #3 ;e/f >> 3
     vshr.s16 q1, q1, #3 ;g/h >> 3
 
-    vtrn.32 d0, d2
-    vtrn.32 d1, d3
-    vtrn.16 d0, d1
-    vtrn.16 d2, d3
-
-    vstmia.16 r1!, {q0}
-    vstmia.16 r1!, {q1}
+    vst4.i16 {d0,d1,d2,d3}, [r1@128]
 
     bx lr
     ENDP    ; |vp8_short_inv_walsh4x4_neon|
 
 
 ;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
 |vp8_short_inv_walsh4x4_1_neon| PROC
-    ; load a full line into a neon register
-    vld1.16  {q0}, [r0]
-    ; extract first element and replicate
-    vdup.16 q1, d0[0]
-    ; add 3 to all values
-    vmov.i16 q2, #3
-    vadd.i16 q3, q1, q2
-    ; right shift
-    vshr.s16 q3, q3, #3
-    ; write it back
-    vstmia.16 r1!, {q3}
-    vstmia.16 r1!, {q3}
-
+    ldrsh r2, [r0]          ; load input[0]
+    add r3, r2, #3          ; add 3
+    add r2, r1, #16         ; base for last 8 output
+    asr r0, r3, #3          ; right shift 3
+    vdup.16 q0, r0          ; load and duplicate
+    vst1.16 {q0}, [r1@128]  ; write back 8
+    vst1.16 {q0}, [r2@128]  ; write back last 8
     bx lr
     ENDP    ; |vp8_short_inv_walsh4x4_1_neon|
 
     END
--- a/media/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
@@ -9,166 +9,245 @@
 ;
 
 
     EXPORT  |vp8_loop_filter_horizontal_edge_y_neon|
     EXPORT  |vp8_loop_filter_horizontal_edge_uv_neon|
     EXPORT  |vp8_loop_filter_vertical_edge_y_neon|
     EXPORT  |vp8_loop_filter_vertical_edge_uv_neon|
     ARM
-    REQUIRE8
-    PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
-; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                             const signed char *flimit,
-;                                             const signed char *limit,
-;                                             const signed char *thresh,
-;                                             int count)
 ; r0    unsigned char *src
 ; r1    int pitch
-; r2    const signed char *flimit
-; r3    const signed char *limit
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_loop_filter_horizontal_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    sub         r2, r0, r1, lsl #2          ; move src pointer down by 4 lines
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1
+    add         r1, r1, r1
 
-    vld1.u8     {q3}, [r2], r1              ; p3
-    vld1.u8     {q4}, [r2], r1              ; p2
-    vld1.u8     {q5}, [r2], r1              ; p1
-    vld1.u8     {q6}, [r2], r1              ; p0
-    vld1.u8     {q7}, [r2], r1              ; q0
-    vld1.u8     {q8}, [r2], r1              ; q1
-    vld1.u8     {q9}, [r2], r1              ; q2
-    vld1.u8     {q10}, [r2]                 ; q3
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    sub         r0, r0, r1, lsl #1
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vld1.u8     {q3}, [r2@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r2@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r2@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r2@128]                  ; q2
+    vld1.u8     {q10}, [r12@128]                ; q3
+
+    sub         r2, r2, r1, lsl #1
+    sub         r12, r12, r1, lsl #1
 
     bl          vp8_loop_filter_neon
 
-    vst1.u8     {q5}, [r0], r1              ; store op1
-    vst1.u8     {q6}, [r0], r1              ; store op0
-    vst1.u8     {q7}, [r0], r1              ; store oq0
-    vst1.u8     {q8}, [r0], r1              ; store oq1
+    vst1.u8     {q5}, [r2@128], r1              ; store op1
+    vst1.u8     {q6}, [r12@128], r1             ; store op0
+    vst1.u8     {q7}, [r2@128], r1              ; store oq0
+    vst1.u8     {q8}, [r12@128], r1             ; store oq1
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
 
-; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch
-;                                              const signed char *flimit,
-;                                              const signed char *limit,
-;                                              const signed char *thresh,
-;                                              unsigned char *v)
+
 ; r0    unsigned char *u,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 ; sp+4  unsigned char *v
 |vp8_loop_filter_horizontal_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    vdup.u8     q1, r3                      ; duplicate limit
+    ldr         r12, [sp, #4]               ; load thresh
     ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q2, r12                     ; duplicate thresh
 
     sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    vld1.u8     {d6}, [r3], r1              ; p3
-    vld1.u8     {d8}, [r3], r1              ; p2
-    vld1.u8     {d10}, [r3], r1             ; p1
-    vld1.u8     {d12}, [r3], r1             ; p0
-    vld1.u8     {d14}, [r3], r1             ; q0
-    vld1.u8     {d16}, [r3], r1             ; q1
-    vld1.u8     {d18}, [r3], r1             ; q2
-    vld1.u8     {d20}, [r3]                 ; q3
-
-    ldr         r3, [sp, #4]                ; load thresh pointer
+    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
 
-    sub         r12, r2, r1, lsl #2         ; move v pointer down by 4 lines
-    vld1.u8     {d7}, [r12], r1             ; p3
-    vld1.u8     {d9}, [r12], r1             ; p2
-    vld1.u8     {d11}, [r12], r1            ; p1
-    vld1.u8     {d13}, [r12], r1            ; p0
-    vld1.u8     {d15}, [r12], r1            ; q0
-    vld1.u8     {d17}, [r12], r1            ; q1
-    vld1.u8     {d19}, [r12], r1            ; q2
-    vld1.u8     {d21}, [r12]                ; q3
-
-    vld1.s8     {d4[], d5[]}, [r3]          ; thresh
+    vld1.u8     {d6}, [r3@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1             ; p3
+    vld1.u8     {d8}, [r3@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1             ; p2
+    vld1.u8     {d10}, [r3@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1            ; p1
+    vld1.u8     {d12}, [r3@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1            ; p0
+    vld1.u8     {d14}, [r3@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1            ; q0
+    vld1.u8     {d16}, [r3@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1            ; q1
+    vld1.u8     {d18}, [r3@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1            ; q2
+    vld1.u8     {d20}, [r3@64]                 ; q3
+    vld1.u8     {d21}, [r12@64]                ; q3
 
     bl          vp8_loop_filter_neon
 
     sub         r0, r0, r1, lsl #1
     sub         r2, r2, r1, lsl #1
 
-    vst1.u8     {d10}, [r0], r1             ; store u op1
-    vst1.u8     {d11}, [r2], r1             ; store v op1
-    vst1.u8     {d12}, [r0], r1             ; store u op0
-    vst1.u8     {d13}, [r2], r1             ; store v op0
-    vst1.u8     {d14}, [r0], r1             ; store u oq0
-    vst1.u8     {d15}, [r2], r1             ; store v oq0
-    vst1.u8     {d16}, [r0]                 ; store u oq1
-    vst1.u8     {d17}, [r2]                 ; store v oq1
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r2@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r2@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r2@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64]                 ; store u oq1
+    vst1.u8     {d17}, [r2@64]                 ; store v oq1
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
 
 ; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
 ;                                           const signed char *flimit,
 ;                                           const signed char *limit,
 ;                                           const signed char *thresh,
 ;                                           int count)
-; r0    unsigned char *src,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r0    unsigned char *src
+; r1    int pitch
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+
 |vp8_loop_filter_vertical_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    sub         r2, r0, #4                  ; src ptr down by 4 columns
-    sub         r0, r0, #2                  ; dst ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    push        {lr}
+    vdup.u8     q0, r2                     ; duplicate blimit
+    vdup.u8     q1, r3                     ; duplicate limit
+    sub         r2, r0, #4                 ; src ptr down by 4 columns
+    add         r1, r1, r1
+    ldr         r3, [sp, #4]               ; load thresh
+    add         r12, r2, r1, asr #1
 
-    vld1.u8     {d6}, [r2], r1              ; load first 8-line src data
-    vld1.u8     {d8}, [r2], r1
+    vld1.u8     {d6}, [r2], r1
+    vld1.u8     {d8}, [r12], r1
     vld1.u8     {d10}, [r2], r1
-    vld1.u8     {d12}, [r2], r1
+    vld1.u8     {d12}, [r12], r1
     vld1.u8     {d14}, [r2], r1
-    vld1.u8     {d16}, [r2], r1
+    vld1.u8     {d16}, [r12], r1
     vld1.u8     {d18}, [r2], r1
-    vld1.u8     {d20}, [r2], r1
-
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
+    vld1.u8     {d20}, [r12], r1
 
     vld1.u8     {d7}, [r2], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r2], r1
+    vld1.u8     {d9}, [r12], r1
     vld1.u8     {d11}, [r2], r1
-    vld1.u8     {d13}, [r2], r1
+    vld1.u8     {d13}, [r12], r1
     vld1.u8     {d15}, [r2], r1
-    vld1.u8     {d17}, [r2], r1
-    vld1.u8     {d19}, [r2], r1
-    vld1.u8     {d21}, [r2]
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d19}, [r2]
+    vld1.u8     {d21}, [r12]
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
     vtrn.32     q4, q8
     vtrn.32     q5, q9
     vtrn.32     q6, q10
 
+    vdup.u8     q2, r3                     ; duplicate thresh
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    bl          vp8_loop_filter_neon
+
+    vswp        d12, d11
+    vswp        d16, d13
+
+    sub         r0, r0, #2                 ; dst ptr
+
+    vswp        d14, d12
+    vswp        d16, d15
+
+    add         r12, r0, r1, asr #1
+
+    ;store op1, op0, oq0, oq1
+    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
+    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
+    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
+    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
+    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
+
+; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
+;                                            const signed char *flimit,
+;                                            const signed char *limit,
+;                                            const signed char *thresh,
+;                                            unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+|vp8_loop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    vdup.u8     q0, r2                      ; duplicate blimit
+    sub         r12, r0, #4                 ; move u pointer down by 4 columns
+    ldr         r2, [sp, #8]                ; load v ptr
+    vdup.u8     q1, r3                      ; duplicate limit
+    sub         r3, r2, #4                  ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r12], r1             ;load u data
+    vld1.u8     {d7}, [r3], r1              ;load v data
+    vld1.u8     {d8}, [r12], r1
+    vld1.u8     {d9}, [r3], r1
+    vld1.u8     {d10}, [r12], r1
+    vld1.u8     {d11}, [r3], r1
+    vld1.u8     {d12}, [r12], r1
+    vld1.u8     {d13}, [r3], r1
+    vld1.u8     {d14}, [r12], r1
+    vld1.u8     {d15}, [r3], r1
+    vld1.u8     {d16}, [r12], r1
+    vld1.u8     {d17}, [r3], r1
+    vld1.u8     {d18}, [r12], r1
+    vld1.u8     {d19}, [r3], r1
+    vld1.u8     {d20}, [r12]
+    vld1.u8     {d21}, [r3]
+
+    ldr        r12, [sp, #4]               ; load thresh
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vdup.u8     q2, r12                     ; duplicate thresh
+
     vtrn.16     q3, q5
     vtrn.16     q4, q6
     vtrn.16     q7, q9
     vtrn.16     q8, q10
 
     vtrn.8      q3, q4
     vtrn.8      q5, q6
     vtrn.8      q7, q8
@@ -176,105 +255,19 @@
 
     bl          vp8_loop_filter_neon
 
     vswp        d12, d11
     vswp        d16, d13
     vswp        d14, d12
     vswp        d16, d15
 
-    ;store op1, op0, oq0, oq1
-    vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vst4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
-    vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
-    vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
-    vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
-    vst4.8      {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
-    vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
-    vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
-    vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
-    vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r0]
-
-    ldmia       sp!, {pc}
-    ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
-
-; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
-;                                            const signed char *flimit,
-;                                            const signed char *limit,
-;                                            const signed char *thresh,
-;                                            unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  unsigned char *v
-|vp8_loop_filter_vertical_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r12, r0, #4                  ; move u pointer down by 4 columns
-    vld1.s8     {d0[], d1[]}, [r2]          ; flimit
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-
-    ldr         r2, [sp, #8]                ; load v ptr
-
-    vld1.u8     {d6}, [r12], r1              ;load u data
-    vld1.u8     {d8}, [r12], r1
-    vld1.u8     {d10}, [r12], r1
-    vld1.u8     {d12}, [r12], r1
-    vld1.u8     {d14}, [r12], r1
-    vld1.u8     {d16}, [r12], r1
-    vld1.u8     {d18}, [r12], r1
-    vld1.u8     {d20}, [r12]
-
-    sub         r3, r2, #4                  ; move v pointer down by 4 columns
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d21}, [r3]
-
-    ldr         r12, [sp, #4]               ; load thresh pointer
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-
-    bl          vp8_loop_filter_neon
-
     sub         r0, r0, #2
     sub         r2, r2, #2
 
-    vswp        d12, d11
-    vswp        d16, d13
-    vswp        d14, d12
-    vswp        d16, d15
-
     ;store op1, op0, oq0, oq1
     vst4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
     vst4.8      {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
     vst4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
     vst4.8      {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
     vst4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
     vst4.8      {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
     vst4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
@@ -283,17 +276,17 @@
     vst4.8      {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
     vst4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
     vst4.8      {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
     vst4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
     vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
     vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
 
 ; void vp8_loop_filter_neon();
 ; This is a helper function for the loopfilters. The invidual functions do the
 ; necessary load, transpose (if necessary) and store.
 
 ; r0-r3 PRESERVE
 ; q0    flimit
@@ -303,105 +296,102 @@
 ; q4    p2
 ; q5    p1
 ; q6    p0
 ; q7    q0
 ; q8    q1
 ; q9    q2
 ; q10   q3
 |vp8_loop_filter_neon| PROC
-    adr         r12, lf_coeff
 
     ; vp8_filter_mask
     vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
     vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
     vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
     vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
     vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
     vabd.u8     q4, q10, q9                 ; abs(q3 - q2)
-    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
 
     vmax.u8     q11, q11, q12
     vmax.u8     q12, q13, q14
     vmax.u8     q3, q3, q4
     vmax.u8     q15, q11, q12
 
+    vabd.u8     q9, q6, q7                  ; abs(p0 - q0)
+
     ; vp8_hevmask
     vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh)*-1
     vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh)*-1
     vmax.u8     q15, q15, q3
 
-    vadd.u8     q0, q0, q0                  ; flimit * 2
-    vadd.u8     q0, q0, q1                  ; flimit * 2 + limit
-    vcge.u8     q15, q1, q15
+    vmov.u8     q10, #0x80                   ; 0x80
 
     vabd.u8     q2, q5, q8                  ; a = abs(p1 - q1)
     vqadd.u8    q9, q9, q9                  ; b = abs(p0 - q0) * 2
-    vshr.u8     q2, q2, #1                  ; a = a / 2
-    vqadd.u8    q9, q9, q2                  ; a = b + a
-    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
 
-    vld1.u8     {q0}, [r12]!
+    vcge.u8     q15, q1, q15
 
     ; vp8_filter() function
     ; convert to signed
-    veor        q7, q7, q0                  ; qs0
-    veor        q6, q6, q0                  ; ps0
-    veor        q5, q5, q0                  ; ps1
-    veor        q8, q8, q0                  ; qs1
+    veor        q7, q7, q10                 ; qs0
+    vshr.u8     q2, q2, #1                  ; a = a / 2
+    veor        q6, q6, q10                 ; ps0
 
-    vld1.u8     {q10}, [r12]!
+    veor        q5, q5, q10                 ; ps1
+    vqadd.u8    q9, q9, q2                  ; a = b + a
+
+    veor        q8, q8, q10                 ; qs1
+
+    vmov.u8     q10, #3                     ; #3
 
     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     vsubl.s8    q11, d15, d13
 
+    vcge.u8     q9, q0, q9                  ; (a > flimit * 2 + limit) * -1
+
     vmovl.u8    q4, d20
 
     vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
     vorr        q14, q13, q14               ; vp8_hevmask
 
     vmul.i16    q2, q2, q4                  ; 3 * ( qs0 - ps0)
     vmul.i16    q11, q11, q4
 
     vand        q1, q1, q14                 ; vp8_filter &= hev
     vand        q15, q15, q9                ; vp8_filter_mask
 
     vaddw.s8    q2, q2, d2
     vaddw.s8    q11, q11, d3
 
-    vld1.u8     {q9}, [r12]!
+    vmov.u8     q9, #4                      ; #4
 
     ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d2, q2
     vqmovn.s16  d3, q11
     vand        q1, q1, q15                 ; vp8_filter &= mask
 
     vqadd.s8    q2, q1, q10                 ; Filter2 = clamp(vp8_filter+3)
     vqadd.s8    q1, q1, q9                  ; Filter1 = clamp(vp8_filter+4)
     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
     vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
 
+
     vqadd.s8    q11, q6, q2                 ; u = clamp(ps0 + Filter2)
     vqsub.s8    q10, q7, q1                 ; u = clamp(qs0 - Filter1)
 
     ; outer tap adjustments: ++vp8_filter >> 1
     vrshr.s8    q1, q1, #1
     vbic        q1, q1, q14                 ; vp8_filter &= ~hev
-
+    vmov.u8     q0, #0x80                   ; 0x80
     vqadd.s8    q13, q5, q1                 ; u = clamp(ps1 + vp8_filter)
     vqsub.s8    q12, q8, q1                 ; u = clamp(qs1 - vp8_filter)
 
-    veor        q5, q13, q0                 ; *op1 = u^0x80
     veor        q6, q11, q0                 ; *op0 = u^0x80
     veor        q7, q10, q0                 ; *oq0 = u^0x80
+    veor        q5, q13, q0                 ; *op1 = u^0x80
     veor        q8, q12, q0                 ; *oq1 = u^0x80
 
     bx          lr
     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
 
-    AREA    loopfilter_dat, DATA, READONLY
-lf_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-    DCD     0x01010101, 0x01010101, 0x01010101, 0x01010101
+;-----------------
 
     END
--- a/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -4,113 +4,114 @@
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
-    EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
+    ;EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
+    EXPORT  |vp8_loop_filter_bhs_neon|
+    EXPORT  |vp8_loop_filter_mbhs_neon|
     ARM
-    REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0    unsigned char *s,
-; r1    int p, //pitch
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5)   int count --unused
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
-    sub         r0, r0, r1, lsl #1          ; move src pointer down by 2 lines
+
+    sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
 
-    adr         r12, lfhy_coeff
-    vld1.u8     {q5}, [r0], r1              ; p1
-    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
-    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
-    vld1.u8     {q6}, [r0], r1              ; p0
-    vld1.u8     {q0}, [r12]!                ; 0x80
-    vld1.u8     {q7}, [r0], r1              ; q0
-    vld1.u8     {q10}, [r12]!               ; 0x03
-    vld1.u8     {q8}, [r0]                  ; q1
+    vld1.u8     {q7}, [r0@128], r1          ; q0
+    vld1.u8     {q5}, [r3@128], r1          ; p0
+    vld1.u8     {q8}, [r0@128]              ; q1
+    vld1.u8     {q6}, [r3@128]              ; p1
 
-    ;vp8_filter_mask() function
     vabd.u8     q15, q6, q7                 ; abs(p0 - q0)
     vabd.u8     q14, q5, q8                 ; abs(p1 - q1)
+
     vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
     vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q13, #3
     vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
 
-    ;vp8_filter() function
     veor        q7, q7, q0                  ; qs0: q0 offset to convert to a signed value
     veor        q6, q6, q0                  ; ps0: p0 offset to convert to a signed value
     veor        q5, q5, q0                  ; ps1: p1 offset to convert to a signed value
     veor        q8, q8, q0                  ; qs1: q1 offset to convert to a signed value
 
-    vadd.u8     q1, q1, q1                  ; flimit * 2
-    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
-    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+    vcge.u8     q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
 
-;;;;;;;;;;
-    ;vqsub.s8   q2, q7, q6                  ; ( qs0 - ps0)
     vsubl.s8    q2, d14, d12                ; ( qs0 - ps0)
     vsubl.s8    q3, d15, d13
 
     vqsub.s8    q4, q5, q8                  ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
 
-    ;vmul.i8    q2, q2, q10                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q11, q2, q2                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q12, q3, q3
+    vmul.s16    q2, q2, q13                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q3, q3, q13
 
-    vld1.u8     {q9}, [r12]!                ; 0x04
-
-    vadd.s16    q2, q2, q11
-    vadd.s16    q3, q3, q12
+    vmov.u8     q10, #0x03                  ; 0x03
+    vmov.u8     q9, #0x04                   ; 0x04
 
     vaddw.s8    q2, q2, d8                  ; vp8_filter + 3 * ( qs0 - ps0)
     vaddw.s8    q3, q3, d9
 
-    ;vqadd.s8   q4, q4, q2                  ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d8, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d9, q3
-;;;;;;;;;;;;;
 
-    vand        q4, q4, q15                 ; vp8_filter &= mask
+    vand        q14, q4, q15                ; vp8_filter &= mask
 
-    vqadd.s8    q2, q4, q10                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q4, q4, q9                  ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vqadd.s8    q2, q14, q10                ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q3, q14, q9                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q4, q4, #3                  ; Filter1 >>= 3
+    vshr.s8     q4, q3, #3                  ; Filter1 >>= 3
 
-    sub         r0, r0, r1, lsl #1
+    sub         r0, r0, r1
 
     ;calculate output
     vqadd.s8    q11, q6, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
     vqsub.s8    q10, q7, q4                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
 
-    add         r3, r0, r1
-
     veor        q6, q11, q0                 ; *op0 = u^0x80
     veor        q7, q10, q0                 ; *oq0 = u^0x80
 
-    vst1.u8     {q6}, [r0]                  ; store op0
-    vst1.u8     {q7}, [r3]                  ; store oq0
+    vst1.u8     {q6}, [r3@128]              ; store op0
+    vst1.u8     {q7}, [r0@128]              ; store oq0
 
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
 
-;-----------------
-    AREA    hloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-lfhy_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_bhs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                    ; load blim from mem
+    vdup.s8     q1, r3                      ; duplicate blim
+
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 4 * y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 8* y_stride
+    bl          vp8_loop_filter_simple_horizontal_edge_neon
+    add         r0, r0, r1, lsl #2          ; src = y_ptr + 12 * y_stride
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbhs_neon| PROC
+    ldrb        r3, [r2]                   ; load blim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_bhs_neon|
 
     END
--- a/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -4,154 +4,151 @@
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
 
-    EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
+    ;EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
+    EXPORT |vp8_loop_filter_bvs_neon|
+    EXPORT |vp8_loop_filter_mbvs_neon|
     ARM
-    REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
-;are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-; r0    unsigned char *s,
-; r1    int p, //pitch
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; stack(r4) const signed char *thresh,
-; //stack(r5)   int count --unused
+
+; r0    unsigned char *s, PRESERVE
+; r1    int p, PRESERVE
+; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_vertical_edge_neon| PROC
     sub         r0, r0, #2                  ; move src pointer down by 2 columns
+    add         r12, r1, r1
+    add         r3, r0, r1
 
-    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
-    vld1.s8     {d2[], d3[]}, [r2]          ; flimit
-    vld1.s8     {d26[], d27[]}, [r3]        ; limit -> q13
-    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
-    adr         r12, vlfy_coeff
-    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
-    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
-    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
-    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
-    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
-    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
+    vld4.8      {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+    vld4.8      {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+    vld4.8      {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+    vld4.8      {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+    vld4.8      {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+    vld4.8      {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+    vld4.8      {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+    vld4.8      {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
 
-    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
-    vld1.u8     {q0}, [r12]!                ; 0x80
-    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
-    vld1.u8     {q11}, [r12]!               ; 0x03
-    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
-    vld1.u8     {q12}, [r12]!               ; 0x04
-    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
-    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
-    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
-    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
-    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
+    vld4.8      {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+    vld4.8      {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+    vld4.8      {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+    vld4.8      {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+    vld4.8      {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+    vld4.8      {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+    vld4.8      {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+    vld4.8      {d10[7], d11[7], d12[7], d13[7]}, [r3]
 
     vswp        d7, d10
     vswp        d12, d9
-    ;vswp       q4, q5                      ; p1:q3, p0:q5, q0:q4, q1:q6
 
     ;vp8_filter_mask() function
     ;vp8_hevmask() function
     sub         r0, r0, r1, lsl #4
     vabd.u8     q15, q5, q4                 ; abs(p0 - q0)
     vabd.u8     q14, q3, q6                 ; abs(p1 - q1)
+
     vqadd.u8    q15, q15, q15               ; abs(p0 - q0) * 2
     vshr.u8     q14, q14, #1                ; abs(p1 - q1) / 2
+    vmov.u8     q0, #0x80                   ; 0x80
+    vmov.s16    q11, #3
     vqadd.u8    q15, q15, q14               ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
 
     veor        q4, q4, q0                  ; qs0: q0 offset to convert to a signed value
     veor        q5, q5, q0                  ; ps0: p0 offset to convert to a signed value
     veor        q3, q3, q0                  ; ps1: p1 offset to convert to a signed value
     veor        q6, q6, q0                  ; qs1: q1 offset to convert to a signed value
 
-    vadd.u8     q1, q1, q1                  ; flimit * 2
-    vadd.u8     q1, q1, q13                 ; flimit * 2 + limit
     vcge.u8     q15, q1, q15                ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
 
-    ;vp8_filter() function
-;;;;;;;;;;
-    ;vqsub.s8   q2, q5, q4                  ; ( qs0 - ps0)
     vsubl.s8    q2, d8, d10                 ; ( qs0 - ps0)
     vsubl.s8    q13, d9, d11
 
-    vqsub.s8    q1, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+    vqsub.s8    q14, q3, q6                  ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+    vmul.s16    q2, q2, q11                 ;  3 * ( qs0 - ps0)
+    vmul.s16    q13, q13, q11
 
-    ;vmul.i8    q2, q2, q11                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vadd.s16    q10, q2, q2                 ;  3 * ( qs0 - ps0)
-    vadd.s16    q14, q13, q13
-    vadd.s16    q2, q2, q10
-    vadd.s16    q13, q13, q14
+    vmov.u8     q11, #0x03                  ; 0x03
+    vmov.u8     q12, #0x04                  ; 0x04
 
-    ;vqadd.s8   q1, q1, q2
-    vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
-    vaddw.s8    q13, q13, d3
+    vaddw.s8    q2, q2, d28                  ; vp8_filter + 3 * ( qs0 - ps0)
+    vaddw.s8    q13, q13, d29
 
-    vqmovn.s16  d2, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
-    vqmovn.s16  d3, q13
+    vqmovn.s16  d28, q2                      ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d29, q13
 
     add         r0, r0, #1
-    add         r2, r0, r1
-;;;;;;;;;;;
+    add         r3, r0, r1
 
-    vand        q1, q1, q15                 ; vp8_filter &= mask
+    vand        q14, q14, q15                 ; vp8_filter &= mask
 
-    vqadd.s8    q2, q1, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
-    vqadd.s8    q1, q1, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+    vqadd.s8    q2, q14, q11                 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+    vqadd.s8    q3, q14, q12                 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
     vshr.s8     q2, q2, #3                  ; Filter2 >>= 3
-    vshr.s8     q1, q1, #3                  ; Filter1 >>= 3
+    vshr.s8     q14, q3, #3                  ; Filter1 >>= 3
 
     ;calculate output
-    vqsub.s8    q10, q4, q1                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
     vqadd.s8    q11, q5, q2                 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+    vqsub.s8    q10, q4, q14                 ; u = vp8_signed_char_clamp(qs0 - Filter1)
 
-    veor        q7, q10, q0                 ; *oq0 = u^0x80
     veor        q6, q11, q0                 ; *op0 = u^0x80
-
-    add         r3, r2, r1
+    veor        q7, q10, q0                 ; *oq0 = u^0x80
+    add         r12, r1, r1
     vswp        d13, d14
-    add         r12, r3, r1
 
     ;store op1, op0, oq0, oq1
-    vst2.8      {d12[0], d13[0]}, [r0]
-    vst2.8      {d12[1], d13[1]}, [r2]
-    vst2.8      {d12[2], d13[2]}, [r3]
-    vst2.8      {d12[3], d13[3]}, [r12], r1
-    add         r0, r12, r1
-    vst2.8      {d12[4], d13[4]}, [r12]
-    vst2.8      {d12[5], d13[5]}, [r0], r1
-    add         r2, r0, r1
-    vst2.8      {d12[6], d13[6]}, [r0]
-    vst2.8      {d12[7], d13[7]}, [r2], r1
-    add         r3, r2, r1
-    vst2.8      {d14[0], d15[0]}, [r2]
-    vst2.8      {d14[1], d15[1]}, [r3], r1
-    add         r12, r3, r1
-    vst2.8      {d14[2], d15[2]}, [r3]
-    vst2.8      {d14[3], d15[3]}, [r12], r1
-    add         r0, r12, r1
-    vst2.8      {d14[4], d15[4]}, [r12]
-    vst2.8      {d14[5], d15[5]}, [r0], r1
-    add         r2, r0, r1
-    vst2.8      {d14[6], d15[6]}, [r0]
-    vst2.8      {d14[7], d15[7]}, [r2]
+    vst2.8      {d12[0], d13[0]}, [r0], r12
+    vst2.8      {d12[1], d13[1]}, [r3], r12
+    vst2.8      {d12[2], d13[2]}, [r0], r12
+    vst2.8      {d12[3], d13[3]}, [r3], r12
+    vst2.8      {d12[4], d13[4]}, [r0], r12
+    vst2.8      {d12[5], d13[5]}, [r3], r12
+    vst2.8      {d12[6], d13[6]}, [r0], r12
+    vst2.8      {d12[7], d13[7]}, [r3], r12
+    vst2.8      {d14[0], d15[0]}, [r0], r12
+    vst2.8      {d14[1], d15[1]}, [r3], r12
+    vst2.8      {d14[2], d15[2]}, [r0], r12
+    vst2.8      {d14[3], d15[3]}, [r3], r12
+    vst2.8      {d14[4], d15[4]}, [r0], r12
+    vst2.8      {d14[5], d15[5]}, [r3], r12
+    vst2.8      {d14[6], d15[6]}, [r0], r12
+    vst2.8      {d14[7], d15[7]}, [r3]
 
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
 
-;-----------------
-    AREA    vloopfiltery_dat, DATA, READWRITE           ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 16 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-vlfy_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
 
+|vp8_loop_filter_bvs_neon| PROC
+    push        {r4, lr}
+    ldrb        r3, [r2]                   ; load blim from mem
+    mov         r4, r0
+    add         r0, r0, #4
+    vdup.s8     q1, r3                     ; duplicate blim
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    ; vp8_loop_filter_simple_vertical_edge_neon preserves  r1 and q1
+    add         r0, r4, #8
+    bl          vp8_loop_filter_simple_vertical_edge_neon
+    add         r0, r4, #12
+    pop         {r4, lr}
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_bvs_neon|
+
+; r0    unsigned char *y
+; r1    int ystride
+; r2    const unsigned char *blimit
+
+|vp8_loop_filter_mbvs_neon| PROC
+    ldrb        r3, [r2]                   ; load mblim from mem
+    vdup.s8     q1, r3                     ; duplicate mblim
+    b           vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_bvs_neon|
     END
--- a/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -9,165 +9,153 @@
 ;
 
 
     EXPORT  |vp8_mbloop_filter_horizontal_edge_y_neon|
     EXPORT  |vp8_mbloop_filter_horizontal_edge_uv_neon|
     EXPORT  |vp8_mbloop_filter_vertical_edge_y_neon|
     EXPORT  |vp8_mbloop_filter_vertical_edge_uv_neon|
     ARM
-    REQUIRE8
-    PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; flimit, limit, and thresh should be positive numbers.
-; All 16 elements in these variables are equal.
-
 ; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
-;                                               const signed char *flimit,
-;                                               const signed char *limit,
-;                                               const signed char *thresh,
-;                                               int count)
+;                                               const unsigned char *blimit,
+;                                               const unsigned char *limit,
+;                                               const unsigned char *thresh)
 ; r0    unsigned char *src,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_mbloop_filter_horizontal_edge_y_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, r1, lsl #2          ; move src pointer down by 4 lines
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    push        {lr}
+    add         r1, r1, r1                  ; double stride
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
+    vdup.u8     q2, r12                     ; thresh
+    add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
+
+    vld1.u8     {q3}, [r0@128], r1              ; p3
+    vld1.u8     {q4}, [r12@128], r1             ; p2
+    vld1.u8     {q5}, [r0@128], r1              ; p1
+    vld1.u8     {q6}, [r12@128], r1             ; p0
+    vld1.u8     {q7}, [r0@128], r1              ; q0
+    vld1.u8     {q8}, [r12@128], r1             ; q1
+    vld1.u8     {q9}, [r0@128], r1              ; q2
+    vld1.u8     {q10}, [r12@128], r1            ; q3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #2
+    add         r0, r12, r1, lsr #1
+
+    vst1.u8     {q4}, [r12@128],r1         ; store op2
+    vst1.u8     {q5}, [r0@128],r1          ; store op1
+    vst1.u8     {q6}, [r12@128], r1        ; store op0
+    vst1.u8     {q7}, [r0@128],r1          ; store oq0
+    vst1.u8     {q8}, [r12@128]            ; store oq1
+    vst1.u8     {q9}, [r0@128]             ; store oq2
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
 
-    vld1.u8     {q3}, [r0], r1              ; p3
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    vld1.u8     {q4}, [r0], r1              ; p2
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    vld1.u8     {q5}, [r0], r1              ; p1
-    vld1.u8     {q6}, [r0], r1              ; p0
-    vld1.u8     {q7}, [r0], r1              ; q0
-    vld1.u8     {q8}, [r0], r1              ; q1
-    vld1.u8     {q9}, [r0], r1              ; q2
-    vld1.u8     {q10}, [r0], r1             ; q3
+; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+;                                                const unsigned char *blimit,
+;                                                const unsigned char *limit,
+;                                                const unsigned char *thresh,
+;                                                unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4  unsigned char *v
+
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]                 ; load thresh
+    sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
+    vdup.u8     q2, r12                       ; thresh
+    ldr         r12, [sp, #8]                 ; load v ptr
+    sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
+
+    vld1.u8     {d6}, [r0@64], r1              ; p3
+    vld1.u8     {d7}, [r12@64], r1              ; p3
+    vld1.u8     {d8}, [r0@64], r1              ; p2
+    vld1.u8     {d9}, [r12@64], r1              ; p2
+    vld1.u8     {d10}, [r0@64], r1             ; p1
+    vld1.u8     {d11}, [r12@64], r1             ; p1
+    vld1.u8     {d12}, [r0@64], r1             ; p0
+    vld1.u8     {d13}, [r12@64], r1             ; p0
+    vld1.u8     {d14}, [r0@64], r1             ; q0
+    vld1.u8     {d15}, [r12@64], r1             ; q0
+    vld1.u8     {d16}, [r0@64], r1             ; q1
+    vld1.u8     {d17}, [r12@64], r1             ; q1
+    vld1.u8     {d18}, [r0@64], r1             ; q2
+    vld1.u8     {d19}, [r12@64], r1             ; q2
+    vld1.u8     {d20}, [r0@64], r1             ; q3
+    vld1.u8     {d21}, [r12@64], r1             ; q3
 
     bl          vp8_mbloop_filter_neon
 
     sub         r0, r0, r1, lsl #3
-    add         r0, r0, r1
-    add         r2, r0, r1
-    add         r3, r2, r1
-
-    vst1.u8     {q4}, [r0]                  ; store op2
-    vst1.u8     {q5}, [r2]                  ; store op1
-    vst1.u8     {q6}, [r3], r1              ; store op0
-    add         r12, r3, r1
-    vst1.u8     {q7}, [r3]                  ; store oq0
-    vst1.u8     {q8}, [r12], r1             ; store oq1
-    vst1.u8     {q9}, [r12]             ; store oq2
-
-    ldmia       sp!, {pc}
-    ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
-
-; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
-;                                                const signed char *flimit,
-;                                                const signed char *limit,
-;                                                const signed char *thresh,
-;                                                unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, r1, lsl #2          ; move u pointer down by 4 lines
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    ldr         r3, [sp, #8]                ; load v ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
-    sub         r3, r3, r1, lsl #2          ; move v pointer down by 4 lines
-
-    vld1.u8     {d6}, [r0], r1              ; p3
-    vld1.u8     {d7}, [r3], r1              ; p3
-    vld1.u8     {d8}, [r0], r1              ; p2
-    vld1.u8     {d9}, [r3], r1              ; p2
-    vld1.u8     {d10}, [r0], r1             ; p1
-    vld1.u8     {d11}, [r3], r1             ; p1
-    vld1.u8     {d12}, [r0], r1             ; p0
-    vld1.u8     {d13}, [r3], r1             ; p0
-    vld1.u8     {d14}, [r0], r1             ; q0
-    vld1.u8     {d15}, [r3], r1             ; q0
-    vld1.u8     {d16}, [r0], r1             ; q1
-    vld1.u8     {d17}, [r3], r1             ; q1
-    vld1.u8     {d18}, [r0], r1             ; q2
-    vld1.u8     {d19}, [r3], r1             ; q2
-    vld1.u8     {d20}, [r0], r1             ; q3
-    vld1.u8     {d21}, [r3], r1             ; q3
-
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #3
-    sub         r3, r3, r1, lsl #3
+    sub         r12, r12, r1, lsl #3
 
     add         r0, r0, r1
-    add         r3, r3, r1
+    add         r12, r12, r1
 
-    vst1.u8     {d8}, [r0], r1              ; store u op2
-    vst1.u8     {d9}, [r3], r1              ; store v op2
-    vst1.u8     {d10}, [r0], r1             ; store u op1
-    vst1.u8     {d11}, [r3], r1             ; store v op1
-    vst1.u8     {d12}, [r0], r1             ; store u op0
-    vst1.u8     {d13}, [r3], r1             ; store v op0
-    vst1.u8     {d14}, [r0], r1             ; store u oq0
-    vst1.u8     {d15}, [r3], r1             ; store v oq0
-    vst1.u8     {d16}, [r0], r1             ; store u oq1
-    vst1.u8     {d17}, [r3], r1             ; store v oq1
-    vst1.u8     {d18}, [r0], r1             ; store u oq2
-    vst1.u8     {d19}, [r3], r1             ; store v oq2
+    vst1.u8     {d8}, [r0@64], r1              ; store u op2
+    vst1.u8     {d9}, [r12@64], r1              ; store v op2
+    vst1.u8     {d10}, [r0@64], r1             ; store u op1
+    vst1.u8     {d11}, [r12@64], r1             ; store v op1
+    vst1.u8     {d12}, [r0@64], r1             ; store u op0
+    vst1.u8     {d13}, [r12@64], r1             ; store v op0
+    vst1.u8     {d14}, [r0@64], r1             ; store u oq0
+    vst1.u8     {d15}, [r12@64], r1             ; store v oq0
+    vst1.u8     {d16}, [r0@64], r1             ; store u oq1
+    vst1.u8     {d17}, [r12@64], r1             ; store v oq1
+    vst1.u8     {d18}, [r0@64], r1             ; store u oq2
+    vst1.u8     {d19}, [r12@64], r1             ; store v oq2
 
-    ldmia       sp!, {pc}
+    pop         {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
 
 ; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
-;                                             const signed char *flimit,
-;                                             const signed char *limit,
-;                                             const signed char *thresh,
-;                                             int count)
+;                                             const unsigned char *blimit,
+;                                             const unsigned char *limit,
+;                                             const unsigned char *thresh)
 ; r0    unsigned char *src,
 ; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  int count (unused)
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
 |vp8_mbloop_filter_vertical_edge_y_neon| PROC
-    stmdb       sp!, {lr}
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
     sub         r0, r0, #4                  ; move src pointer down by 4 columns
+    vdup.s8     q2, r12                     ; thresh
+    add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
 
     vld1.u8     {d6}, [r0], r1              ; load first 8-line src data
-    ldr         r12, [sp, #4]               ; load thresh pointer
+    vld1.u8     {d7}, [r12], r1             ; load second 8-line src data
     vld1.u8     {d8}, [r0], r1
-    sub         sp, sp, #32
+    vld1.u8     {d9}, [r12], r1
     vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
     vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
     vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
     vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
     vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
     vld1.u8     {d20}, [r0], r1
-
-    vld1.u8     {d7}, [r0], r1              ; load second 8-line src data
-    vld1.u8     {d9}, [r0], r1
-    vld1.u8     {d11}, [r0], r1
-    vld1.u8     {d13}, [r0], r1
-    vld1.u8     {d15}, [r0], r1
-    vld1.u8     {d17}, [r0], r1
-    vld1.u8     {d19}, [r0], r1
-    vld1.u8     {d21}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
     vtrn.32     q4, q8
     vtrn.32     q5, q9
     vtrn.32     q6, q10
 
     vtrn.16     q3, q5
@@ -175,143 +163,21 @@
     vtrn.16     q7, q9
     vtrn.16     q8, q10
 
     vtrn.8      q3, q4
     vtrn.8      q5, q6
     vtrn.8      q7, q8
     vtrn.8      q9, q10
 
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    mov         r12, sp
-    vst1.u8     {q3}, [r12]!
-    vst1.u8     {q10}, [r12]!
+    sub         r0, r0, r1, lsl #3
 
     bl          vp8_mbloop_filter_neon
 
-    sub         r0, r0, r1, lsl #4
-
-    add         r2, r0, r1
-
-    add         r3, r2, r1
-
-    vld1.u8     {q3}, [sp]!
-    vld1.u8     {q10}, [sp]!
-
-    ;transpose to 16x8 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-    add         r12, r3, r1
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    ;store op2, op1, op0, oq0, oq1, oq2
-    vst1.8      {d6}, [r0]
-    vst1.8      {d8}, [r2]
-    vst1.8      {d10}, [r3]
-    vst1.8      {d12}, [r12], r1
-    add         r0, r12, r1
-    vst1.8      {d14}, [r12]
-    vst1.8      {d16}, [r0], r1
-    add         r2, r0, r1
-    vst1.8      {d18}, [r0]
-    vst1.8      {d20}, [r2], r1
-    add         r3, r2, r1
-    vst1.8      {d7}, [r2]
-    vst1.8      {d9}, [r3], r1
-    add         r12, r3, r1
-    vst1.8      {d11}, [r3]
-    vst1.8      {d13}, [r12], r1
-    add         r0, r12, r1
-    vst1.8      {d15}, [r12]
-    vst1.8      {d17}, [r0], r1
-    add         r2, r0, r1
-    vst1.8      {d19}, [r0]
-    vst1.8      {d21}, [r2]
-
-    ldmia       sp!, {pc}
-    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
-
-; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
-;                                              const signed char *flimit,
-;                                              const signed char *limit,
-;                                              const signed char *thresh,
-;                                              unsigned char *v)
-; r0    unsigned char *u,
-; r1    int pitch,
-; r2    const signed char *flimit,
-; r3    const signed char *limit,
-; sp    const signed char *thresh,
-; sp+4  unsigned char *v
-|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
-    stmdb       sp!, {lr}
-    sub         r0, r0, #4                  ; move src pointer down by 4 columns
-    vld1.s8     {d2[], d3[]}, [r3]          ; limit
-    ldr         r3, [sp, #8]                ; load v ptr
-    ldr         r12, [sp, #4]               ; load thresh pointer
-
-    sub         r3, r3, #4                  ; move v pointer down by 4 columns
-
-    vld1.u8     {d6}, [r0], r1              ;load u data
-    vld1.u8     {d7}, [r3], r1              ;load v data
-    vld1.u8     {d8}, [r0], r1
-    vld1.u8     {d9}, [r3], r1
-    vld1.u8     {d10}, [r0], r1
-    vld1.u8     {d11}, [r3], r1
-    vld1.u8     {d12}, [r0], r1
-    vld1.u8     {d13}, [r3], r1
-    vld1.u8     {d14}, [r0], r1
-    vld1.u8     {d15}, [r3], r1
-    vld1.u8     {d16}, [r0], r1
-    vld1.u8     {d17}, [r3], r1
-    vld1.u8     {d18}, [r0], r1
-    vld1.u8     {d19}, [r3], r1
-    vld1.u8     {d20}, [r0], r1
-    vld1.u8     {d21}, [r3], r1
-
-    ;transpose to 8x16 matrix
-    vtrn.32     q3, q7
-    vtrn.32     q4, q8
-    vtrn.32     q5, q9
-    vtrn.32     q6, q10
-
-    vtrn.16     q3, q5
-    vtrn.16     q4, q6
-    vtrn.16     q7, q9
-    vtrn.16     q8, q10
-
-    vtrn.8      q3, q4
-    vtrn.8      q5, q6
-    vtrn.8      q7, q8
-    vtrn.8      q9, q10
-
-    sub         sp, sp, #32
-    vld1.s8     {d4[], d5[]}, [r12]         ; thresh
-    mov         r12, sp
-    vst1.u8     {q3}, [r12]!
-    vst1.u8     {q10}, [r12]!
-
-    bl          vp8_mbloop_filter_neon
-
-    sub         r0, r0, r1, lsl #3
-    sub         r3, r3, r1, lsl #3
-
-    vld1.u8     {q3}, [sp]!
-    vld1.u8     {q10}, [sp]!
+    sub         r12, r12, r1, lsl #3
 
     ;transpose to 16x8 matrix
     vtrn.32     q3, q7
     vtrn.32     q4, q8
     vtrn.32     q5, q9
     vtrn.32     q6, q10
 
     vtrn.16     q3, q5
@@ -321,197 +187,283 @@
 
     vtrn.8      q3, q4
     vtrn.8      q5, q6
     vtrn.8      q7, q8
     vtrn.8      q9, q10
 
     ;store op2, op1, op0, oq0, oq1, oq2
     vst1.8      {d6}, [r0], r1
-    vst1.8      {d7}, [r3], r1
+    vst1.8      {d7}, [r12], r1
     vst1.8      {d8}, [r0], r1
-    vst1.8      {d9}, [r3], r1
+    vst1.8      {d9}, [r12], r1
     vst1.8      {d10}, [r0], r1
-    vst1.8      {d11}, [r3], r1
+    vst1.8      {d11}, [r12], r1
     vst1.8      {d12}, [r0], r1
-    vst1.8      {d13}, [r3], r1
+    vst1.8      {d13}, [r12], r1
     vst1.8      {d14}, [r0], r1
-    vst1.8      {d15}, [r3], r1
+    vst1.8      {d15}, [r12], r1
     vst1.8      {d16}, [r0], r1
-    vst1.8      {d17}, [r3], r1
+    vst1.8      {d17}, [r12], r1
     vst1.8      {d18}, [r0], r1
-    vst1.8      {d19}, [r3], r1
-    vst1.8      {d20}, [r0], r1
-    vst1.8      {d21}, [r3], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
+    ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                              const unsigned char *blimit,
+;                                              const unsigned char *limit,
+;                                              const unsigned char *thresh,
+;                                              unsigned char *v)
+; r0    unsigned char *u,
+; r1    int pitch,
+; r2    const signed char *flimit,
+; r3    const signed char *limit,
+; sp    const signed char *thresh,
+; sp+4  unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+    push        {lr}
+    ldr         r12, [sp, #4]               ; load thresh
+    sub         r0, r0, #4                  ; move u pointer down by 4 columns
+    vdup.u8     q2, r12                     ; thresh
+    ldr         r12, [sp, #8]               ; load v ptr
+    sub         r12, r12, #4                ; move v pointer down by 4 columns
+
+    vld1.u8     {d6}, [r0], r1              ;load u data
+    vld1.u8     {d7}, [r12], r1             ;load v data
+    vld1.u8     {d8}, [r0], r1
+    vld1.u8     {d9}, [r12], r1
+    vld1.u8     {d10}, [r0], r1
+    vld1.u8     {d11}, [r12], r1
+    vld1.u8     {d12}, [r0], r1
+    vld1.u8     {d13}, [r12], r1
+    vld1.u8     {d14}, [r0], r1
+    vld1.u8     {d15}, [r12], r1
+    vld1.u8     {d16}, [r0], r1
+    vld1.u8     {d17}, [r12], r1
+    vld1.u8     {d18}, [r0], r1
+    vld1.u8     {d19}, [r12], r1
+    vld1.u8     {d20}, [r0], r1
+    vld1.u8     {d21}, [r12], r1
+
+    ;transpose to 8x16 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
 
-    ldmia       sp!, {pc}
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    sub         r0, r0, r1, lsl #3
+
+    bl          vp8_mbloop_filter_neon
+
+    sub         r12, r12, r1, lsl #3
+
+    ;transpose to 16x8 matrix
+    vtrn.32     q3, q7
+    vtrn.32     q4, q8
+    vtrn.32     q5, q9
+    vtrn.32     q6, q10
+
+    vtrn.16     q3, q5
+    vtrn.16     q4, q6
+    vtrn.16     q7, q9
+    vtrn.16     q8, q10
+
+    vtrn.8      q3, q4
+    vtrn.8      q5, q6
+    vtrn.8      q7, q8
+    vtrn.8      q9, q10
+
+    ;store op2, op1, op0, oq0, oq1, oq2
+    vst1.8      {d6}, [r0], r1
+    vst1.8      {d7}, [r12], r1
+    vst1.8      {d8}, [r0], r1
+    vst1.8      {d9}, [r12], r1
+    vst1.8      {d10}, [r0], r1
+    vst1.8      {d11}, [r12], r1
+    vst1.8      {d12}, [r0], r1
+    vst1.8      {d13}, [r12], r1
+    vst1.8      {d14}, [r0], r1
+    vst1.8      {d15}, [r12], r1
+    vst1.8      {d16}, [r0], r1
+    vst1.8      {d17}, [r12], r1
+    vst1.8      {d18}, [r0], r1
+    vst1.8      {d19}, [r12], r1
+    vst1.8      {d20}, [r0]
+    vst1.8      {d21}, [r12]
+
+    pop         {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
 
 ; void vp8_mbloop_filter_neon()
 ; This is a helper function for the macroblock loopfilters. The individual
 ; functions do the necessary load, transpose (if necessary), preserve (if
 ; necessary) and store.
 
-; TODO:
-; The vertical filter writes p3/q3 back out because two 4 element writes are
-; much simpler than ordering and writing two 3 element sets (or three 2 elements
-; sets, or whichever other combinations are possible).
-; If we can preserve q3 and q10, the vertical filter will be able to avoid
-; storing those values on the stack and reading them back after the filter.
+; r0,r1 PRESERVE
+; r2    mblimit
+; r3    limit
 
-; r0,r1 PRESERVE
-; r2    flimit
-; r3    PRESERVE
-; q1    limit
 ; q2    thresh
-; q3    p3
+; q3    p3 PRESERVE
 ; q4    p2
 ; q5    p1
 ; q6    p0
 ; q7    q0
 ; q8    q1
 ; q9    q2
-; q10   q3
+; q10   q3 PRESERVE
 
 |vp8_mbloop_filter_neon| PROC
-    adr         r12, mblf_coeff
 
     ; vp8_filter_mask
     vabd.u8     q11, q3, q4                 ; abs(p3 - p2)
     vabd.u8     q12, q4, q5                 ; abs(p2 - p1)
     vabd.u8     q13, q5, q6                 ; abs(p1 - p0)
     vabd.u8     q14, q8, q7                 ; abs(q1 - q0)
-    vabd.u8     q3, q9, q8                  ; abs(q2 - q1)
+    vabd.u8     q1, q9, q8                  ; abs(q2 - q1)
     vabd.u8     q0, q10, q9                 ; abs(q3 - q2)
 
     vmax.u8     q11, q11, q12
     vmax.u8     q12, q13, q14
-    vmax.u8     q3, q3, q0
+    vmax.u8     q1, q1, q0
     vmax.u8     q15, q11, q12
 
     vabd.u8     q12, q6, q7                 ; abs(p0 - q0)
 
     ; vp8_hevmask
     vcgt.u8     q13, q13, q2                ; (abs(p1 - p0) > thresh) * -1
     vcgt.u8     q14, q14, q2                ; (abs(q1 - q0) > thresh) * -1
-    vmax.u8     q15, q15, q3
-
-    vld1.s8     {d4[], d5[]}, [r2]          ; flimit
+    vmax.u8     q15, q15, q1
 
-    vld1.u8     {q0}, [r12]!
+    vdup.u8     q1, r3                      ; limit
+    vdup.u8     q2, r2                      ; mblimit
 
-    vadd.u8     q2, q2, q2                  ; flimit * 2
-    vadd.u8     q2, q2, q1                  ; flimit * 2 +  limit
+    vmov.u8     q0, #0x80                   ; 0x80
+
     vcge.u8     q15, q1, q15
 
     vabd.u8     q1, q5, q8                  ; a = abs(p1 - q1)
     vqadd.u8    q12, q12, q12               ; b = abs(p0 - q0) * 2
-    vshr.u8     q1, q1, #1                  ; a = a / 2
-    vqadd.u8    q12, q12, q1                ; a = b + a
-    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
+    vmov.u16    q11, #3                     ; #3
 
     ; vp8_filter
     ; convert to signed
     veor        q7, q7, q0                  ; qs0
+    vshr.u8     q1, q1, #1                  ; a = a / 2
     veor        q6, q6, q0                  ; ps0
     veor        q5, q5, q0                  ; ps1
+
+    vqadd.u8    q12, q12, q1                ; a = b + a
+
     veor        q8, q8, q0                  ; qs1
     veor        q4, q4, q0                  ; ps2
     veor        q9, q9, q0                  ; qs2
 
     vorr        q14, q13, q14               ; vp8_hevmask
 
+    vcge.u8     q12, q2, q12                ; (a > flimit * 2 + limit) * -1
+
     vsubl.s8    q2, d14, d12                ; qs0 - ps0
     vsubl.s8    q13, d15, d13
 
     vqsub.s8    q1, q5, q8                  ; vp8_filter = clamp(ps1-qs1)
 
-    vadd.s16    q10, q2, q2                 ; 3 * (qs0 - ps0)
-    vadd.s16    q11, q13, q13
+    vmul.i16    q2, q2, q11                 ; 3 * ( qs0 - ps0)
+
     vand        q15, q15, q12               ; vp8_filter_mask
 
-    vadd.s16    q2, q2, q10
-    vadd.s16    q13, q13, q11
+    vmul.i16    q13, q13, q11
 
-    vld1.u8     {q12}, [r12]!               ; #3
+    vmov.u8     q12, #3                     ; #3
 
     vaddw.s8    q2, q2, d2                  ; vp8_filter + 3 * ( qs0 - ps0)
     vaddw.s8    q13, q13, d3
 
-    vld1.u8     {q11}, [r12]!               ; #4
+    vmov.u8     q11, #4                     ; #4
 
     ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
     vqmovn.s16  d2, q2
     vqmovn.s16  d3, q13
 
     vand        q1, q1, q15                 ; vp8_filter &= mask
 
-    vld1.u8     {q15}, [r12]!               ; #63
-    ;
+    vmov.u16    q15, #63                    ; #63
+
     vand        q13, q1, q14                ; Filter2 &= hev
 
-    vld1.u8     {d7}, [r12]!                ; #9
-
     vqadd.s8    q2, q13, q11                ; Filter1 = clamp(Filter2+4)
     vqadd.s8    q13, q13, q12               ; Filter2 = clamp(Filter2+3)
 
-    vld1.u8     {d6}, [r12]!                ; #18
+    vmov        q0, q15
 
     vshr.s8     q2, q2, #3                  ; Filter1 >>= 3
     vshr.s8     q13, q13, #3                ; Filter2 >>= 3
 
-    vmov        q10, q15
+    vmov        q11, q15
     vmov        q12, q15
 
     vqsub.s8    q7, q7, q2                  ; qs0 = clamp(qs0 - Filter1)
 
-    vld1.u8     {d5}, [r12]!                ; #27
-
     vqadd.s8    q6, q6, q13                 ; ps0 = clamp(ps0 + Filter2)
 
     vbic        q1, q1, q14                 ; vp8_filter &= ~hev
 
     ; roughly 1/7th difference across boundary
     ; roughly 2/7th difference across boundary
     ; roughly 3/7th difference across boundary
-    vmov        q11, q15
+
+    vmov.u8     d5, #9                      ; #9
+    vmov.u8     d4, #18                     ; #18
+
     vmov        q13, q15
     vmov        q14, q15
 
-    vmlal.s8    q10, d2, d7                 ; Filter2 * 9
-    vmlal.s8    q11, d3, d7
-    vmlal.s8    q12, d2, d6                 ; Filter2 * 18
-    vmlal.s8    q13, d3, d6
-    vmlal.s8    q14, d2, d5                 ; Filter2 * 27
+    vmlal.s8    q0, d2, d5                  ; 63 + Filter2 * 9
+    vmlal.s8    q11, d3, d5
+    vmov.u8     d5, #27                     ; #27
+    vmlal.s8    q12, d2, d4                 ; 63 + Filter2 * 18
+    vmlal.s8    q13, d3, d4
+    vmlal.s8    q14, d2, d5                 ; 63 + Filter2 * 27
     vmlal.s8    q15, d3, d5
-    vqshrn.s16  d20, q10, #7                ; u = clamp((63 + Filter2 * 9)>>7)
-    vqshrn.s16  d21, q11, #7
+
+    vqshrn.s16  d0, q0, #7                  ; u = clamp((63 + Filter2 * 9)>>7)
+    vqshrn.s16  d1, q11, #7
     vqshrn.s16  d24, q12, #7                ; u = clamp((63 + Filter2 * 18)>>7)
     vqshrn.s16  d25, q13, #7
     vqshrn.s16  d28, q14, #7                ; u = clamp((63 + Filter2 * 27)>>7)
     vqshrn.s16  d29, q15, #7
 
-    vqsub.s8    q11, q9, q10                ; s = clamp(qs2 - u)
-    vqadd.s8    q10, q4, q10                ; s = clamp(ps2 + u)
+    vmov.u8     q1, #0x80                   ; 0x80
+
+    vqsub.s8    q11, q9, q0                 ; s = clamp(qs2 - u)
+    vqadd.s8    q0, q4, q0                  ; s = clamp(ps2 + u)
     vqsub.s8    q13, q8, q12                ; s = clamp(qs1 - u)
     vqadd.s8    q12, q5, q12                ; s = clamp(ps1 + u)
     vqsub.s8    q15, q7, q14                ; s = clamp(qs0 - u)
     vqadd.s8    q14, q6, q14                ; s = clamp(ps0 + u)
-    veor        q9, q11, q0                 ; *oq2 = s^0x80
-    veor        q4, q10, q0                 ; *op2 = s^0x80
-    veor        q8, q13, q0                 ; *oq1 = s^0x80
-    veor        q5, q12, q0                 ; *op2 = s^0x80
-    veor        q7, q15, q0                 ; *oq0 = s^0x80
-    veor        q6, q14, q0                 ; *op0 = s^0x80
+
+    veor        q9, q11, q1                 ; *oq2 = s^0x80
+    veor        q4, q0, q1                  ; *op2 = s^0x80
+    veor        q8, q13, q1                 ; *oq1 = s^0x80
+    veor        q5, q12, q1                 ; *op2 = s^0x80
+    veor        q7, q15, q1                 ; *oq0 = s^0x80
+    veor        q6, q14, q1                 ; *op0 = s^0x80
 
     bx          lr
     ENDP        ; |vp8_mbloop_filter_neon|
 
-    AREA    mbloopfilter_dat, DATA, READONLY
-mblf_coeff
-    DCD     0x80808080, 0x80808080, 0x80808080, 0x80808080
-    DCD     0x03030303, 0x03030303, 0x03030303, 0x03030303
-    DCD     0x04040404, 0x04040404, 0x04040404, 0x04040404
-    DCD     0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
-    DCD     0x09090909, 0x09090909, 0x12121212, 0x12121212
-    DCD     0x1b1b1b1b, 0x1b1b1b1b
+;-----------------
 
     END
--- a/media/libvpx/vp8/common/arm/neon/recon_neon.c
+++ b/media/libvpx/vp8/common/arm/neon/recon_neon.c
@@ -5,18 +5,18 @@
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "vpx_ports/config.h"
-#include "recon.h"
-#include "blockd.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/blockd.h"
 
 extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
 
 void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 {
     unsigned char *pred_ptr = &x->predictor[0];
     short *diff_ptr = &x->diff[0];
     unsigned char *dst_ptr = x->dst.y_buffer;
--- a/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -108,18 +108,15 @@
     vst1.16         {d4}, [r12]
     vst1.16         {d5}, [r0]
 
     bx             lr
 
     ENDP
 
 ;-----------------
-    AREA    idct4x4_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
+
 idct_coeff
     DCD     0x4e7b4e7b, 0x8a8c8a8c
 
 ;20091, 20091, 35468, 35468
 
     END
--- a/media/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -10,16 +10,27 @@
 
 
     EXPORT  |vp8_sixtap_predict16x16_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
 ; r3    int  yoffset,
 ; r4    unsigned char *dst_ptr,
 ; stack(r5) int  dst_pitch
 
 ;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to
@@ -28,17 +39,17 @@
 ; that the result can be a large positive number (> 2^15-1), which could be confused as a
 ; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2,
 ; which ensures that the result stays in s16 range. Finally, saturated add the result by
 ; applying 3rd filter coeff. Same applys to other filter functions.
 
 |vp8_sixtap_predict16x16_neon| PROC
     push            {r4-r5, lr}
 
-    adrl            r12, filter16_coeff
+    adr             r12, filter16_coeff
     ldr             r4, [sp, #12]           ;load parameters from stack
     ldr             r5, [sp, #16]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter16x16_only
 
     add             r2, r12, r2, lsl #5     ;calculate filter location
 
@@ -471,23 +482,9 @@ secondpass_only_inner_loop_neon
 
     bne filt_blk2d_spo16x16_outloop_neon
 
     pop             {r4-r5,pc}
 
     ENDP
 
 ;-----------------
-    AREA    subpelfilters16_dat, DATA, READWRITE            ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-filter16_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
-
     END
--- a/media/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -10,27 +10,38 @@
 
 
     EXPORT  |vp8_sixtap_predict_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter4_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
 ; r3    int  yoffset,
 ; stack(r4) unsigned char *dst_ptr,
 ; stack(lr) int  dst_pitch
 
 |vp8_sixtap_predict_neon| PROC
     push            {r4, lr}
 
-    adrl            r12, filter4_coeff
+    adr             r12, filter4_coeff
     ldr             r4, [sp, #8]            ;load parameters from stack
     ldr             lr, [sp, #12]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter4x4_only
 
     add             r2, r12, r2, lsl #5     ;calculate filter location
 
@@ -402,23 +413,10 @@ secondpass_filter4x4_only
     vst1.32         {d4[0]}, [r1]
     vst1.32         {d4[1]}, [r2]
 
     pop             {r4, pc}
 
     ENDP
 
 ;-----------------
-    AREA    subpelfilters4_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-filter4_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
 
     END
--- a/media/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -10,27 +10,38 @@
 
 
     EXPORT  |vp8_sixtap_predict8x4_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
 ; r3    int  yoffset,
 ; r4    unsigned char *dst_ptr,
 ; stack(r5) int  dst_pitch
 
 |vp8_sixtap_predict8x4_neon| PROC
     push            {r4-r5, lr}
 
-    adrl            r12, filter8_coeff
+    adr             r12, filter8_coeff
     ldr             r4, [sp, #12]           ;load parameters from stack
     ldr             r5, [sp, #16]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x4_only
 
     add             r2, r12, r2, lsl #5     ;calculate filter location
 
@@ -453,23 +464,10 @@ secondpass_filter8x4_only
     vst1.u8         {d8}, [r4], r5
     vst1.u8         {d9}, [r4], r5
 
     pop             {r4-r5,pc}
 
     ENDP
 
 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
 
     END
--- a/media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ b/media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -10,27 +10,38 @@
 
 
     EXPORT  |vp8_sixtap_predict8x8_neon|
     ARM
     REQUIRE8
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+    DCD     0,  0,  128,    0,   0,  0,   0,  0
+    DCD     0, -6,  123,   12,  -1,  0,   0,  0
+    DCD     2, -11, 108,   36,  -8,  1,   0,  0
+    DCD     0, -9,   93,   50,  -6,  0,   0,  0
+    DCD     3, -16,  77,   77, -16,  3,   0,  0
+    DCD     0, -6,   50,   93,  -9,  0,   0,  0
+    DCD     1, -8,   36,  108, -11,  2,   0,  0
+    DCD     0, -1,   12,  123,  -6,   0,  0,  0
+
 ; r0    unsigned char  *src_ptr,
 ; r1    int  src_pixels_per_line,
 ; r2    int  xoffset,
 ; r3    int  yoffset,
 ; stack(r4) unsigned char *dst_ptr,
 ; stack(r5) int  dst_pitch
 
 |vp8_sixtap_predict8x8_neon| PROC
     push            {r4-r5, lr}
 
-    adrl            r12, filter8_coeff
+    adr             r12, filter8_coeff
 
     ldr             r4, [sp, #12]           ;load parameters from stack
     ldr             r5, [sp, #16]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x8_only
 
     add             r2, r12, r2, lsl #5     ;calculate filter location
@@ -504,23 +515,10 @@ filt_blk2d_spo8x8_loop_neon
 
     bne filt_blk2d_spo8x8_loop_neon
 
     pop             {r4-r5,pc}
 
     ENDP
 
 ;-----------------
-    AREA    subpelfilters8_dat, DATA, READWRITE         ;read/write by default
-;Data section with name data_area is specified. DCD reserves space in memory for 48 data.
-;One word each is reserved. Label filter_coeff can be used to access the data.
-;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
-filter8_coeff
-    DCD     0,  0,  128,    0,   0,  0,   0,  0
-    DCD     0, -6,  123,   12,  -1,  0,   0,  0
-    DCD     2, -11, 108,   36,  -8,  1,   0,  0
-    DCD     0, -9,   93,   50,  -6,  0,   0,  0
-    DCD     3, -16,  77,   77, -16,  3,   0,  0
-    DCD     0, -6,   50,   93,  -9,  0,   0,  0
-    DCD     1, -8,   36,  108, -11,  2,   0,  0
-    DCD     0, -1,   12,  123,  -6,   0,  0,  0
 
     END
--- a/media/libvpx/vp8/common/arm/reconintra_arm.c
+++ b/media/libvpx/vp8/common/arm/reconintra_arm.c
@@ -5,20 +5,20 @@
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "vpx_ports/config.h"
-#include "blockd.h"
-#include "reconintra.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/reconintra.h"
 #include "vpx_mem/vpx_mem.h"
-#include "recon.h"
+#include "vp8/common/recon.h"
 
 #if HAVE_ARMV7
 extern void vp8_build_intra_predictors_mby_neon_func(
     unsigned char *y_buffer,
     unsigned char *ypred_ptr,
     int y_stride,
     int mode,
     int Up,
deleted file mode 100644
--- a/media/libvpx/vp8/common/arm/vpx_asm_offsets.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include <stddef.h>
-
-#if CONFIG_VP8_ENCODER
-#include "vpx_scale/yv12config.h"
-#endif
-
-#if CONFIG_VP8_DECODER
-#include "onyxd_int.h"
-#endif
-
-#define DEFINE(sym, val) int sym = val;
-
-/*
-#define BLANK() asm volatile("\n->" : : )
-*/
-
-/*
- * int main(void)
- * {
- */
-
-#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-#endif
-
-#if CONFIG_VP8_DECODER
-DEFINE(mb_diff,                                 offsetof(MACROBLOCKD, diff));
-DEFINE(mb_predictor,                            offsetof(MACROBLOCKD, predictor));
-DEFINE(mb_dst_y_stride,                         offsetof(MACROBLOCKD, dst.y_stride));
-DEFINE(mb_dst_y_buffer,                         offsetof(MACROBLOCKD, dst.y_buffer));
-DEFINE(mb_dst_u_buffer,                         offsetof(MACROBLOCKD, dst.u_buffer));
-DEFINE(mb_dst_v_buffer,                         offsetof(MACROBLOCKD, dst.v_buffer));
-DEFINE(mb_up_available,                         offsetof(MACROBLOCKD, up_available));
-DEFINE(mb_left_available,                       offsetof(MACROBLOCKD, left_available));
-
-DEFINE(detok_scan,                              offsetof(DETOK, scan));
-DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
-DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));
-DEFINE(detok_teb_base_ptr,                      offsetof(DETOK, teb_base_ptr));
-DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
-DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
-
-DEFINE(detok_A,                                 offsetof(DETOK, A));
-DEFINE(detok_L,                                 offsetof(DETOK, L));
-
-DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
-DEFINE(detok_current_bc,                        offsetof(DETOK, current_bc));
-DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
-DEFINE(detok_eob,                               offsetof(DETOK, eob));
-
-DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
-
-DEFINE(tokenextrabits_min_val,                  offsetof(TOKENEXTRABITS, min_val));
-DEFINE(tokenextrabits_length,                   offsetof(TOKENEXTRABITS, Length));
-#endif
-
-//add asserts for any offset that is not supported by assembly code
-//add asserts for any size that is not supported by assembly code
-/*
- * return 0;
- * }
- */
--- a/media/libvpx/vp8/common/blockd.c
+++ b/media/libvpx/vp8/common/blockd.c
@@ -7,18 +7,16 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
-const int vp8_block2type[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1};
-
 const unsigned char vp8_block2left[25] =
 {
     0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 };
 const unsigned char vp8_block2above[25] =
 {
     0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
 };
--- a/media/libvpx/vp8/common/blockd.h
+++ b/media/libvpx/vp8/common/blockd.h
@@ -23,48 +23,46 @@ void vpx_log(const char *format, ...);
 
 #define TRUE    1
 #define FALSE   0
 
 /*#define DCPRED 1*/
 #define DCPREDSIMTHRESH 0
 #define DCPREDCNTTHRESH 3
 
-#define Y1CONTEXT 0
-#define UCONTEXT 1
-#define VCONTEXT 2
-#define Y2CONTEXT 3
-
 #define MB_FEATURE_TREE_PROBS   3
 #define MAX_MB_SEGMENTS         4
 
 #define MAX_REF_LF_DELTAS       4
 #define MAX_MODE_LF_DELTAS      4
 
 /* Segment Feature Masks */
 #define SEGMENT_DELTADATA   0
 #define SEGMENT_ABSDATA     1
 
 typedef struct
 {
     int r, c;
 } POS;
 
+#define PLANE_TYPE_Y_NO_DC    0
+#define PLANE_TYPE_Y2         1
+#define PLANE_TYPE_UV         2
+#define PLANE_TYPE_Y_WITH_DC  3
+
 
 typedef char ENTROPY_CONTEXT;
 typedef struct
 {
     ENTROPY_CONTEXT y1[4];
     ENTROPY_CONTEXT u[2];
     ENTROPY_CONTEXT v[2];
     ENTROPY_CONTEXT y2;
 } ENTROPY_CONTEXT_PLANES;
 
-extern const int vp8_block2type[25];
-
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];
 
 #define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
     Dest = ((A)!=0) + ((B)!=0);
 
 
 typedef enum
@@ -134,94 +132,75 @@ typedef enum
 
 #define VP8_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
 #define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
 
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
 
-typedef struct
+union b_mode_info
 {
-    B_PREDICTION_MODE mode;
-    union
-    {
-        int as_int;
-        MV  as_mv;
-    } mv;
-} B_MODE_INFO;
-
+    B_PREDICTION_MODE as_mode;
+    int_mv mv;
+};
 
 typedef enum
 {
     INTRA_FRAME = 0,
     LAST_FRAME = 1,
     GOLDEN_FRAME = 2,
     ALTREF_FRAME = 3,
     MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 
 typedef struct
 {
     MB_PREDICTION_MODE mode, uv_mode;
     MV_REFERENCE_FRAME ref_frame;
-    union
-    {
-        int as_int;
-        MV  as_mv;
-    } mv;
+    int_mv mv;
 
     unsigned char partitioning;
     unsigned char mb_skip_coeff;                                /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
-    unsigned char dc_diff;
     unsigned char need_to_clamp_mvs;
-
     unsigned char segment_id;                  /* Which set of segmentation parameters should be used for this MB */
-
-    unsigned char force_no_skip; /* encoder only */
 } MB_MODE_INFO;
 
-
 typedef struct
 {
     MB_MODE_INFO mbmi;
-    B_MODE_INFO bmi[16];
+    union b_mode_info bmi[16];
 } MODE_INFO;
 
-
 typedef struct
 {
     short *qcoeff;
     short *dqcoeff;
     unsigned char  *predictor;
     short *diff;
-    short *reference;
-
     short *dequant;
 
     /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
     unsigned char **base_pre;
     int pre;
     int pre_stride;
 
     unsigned char **base_dst;
     int dst;
     int dst_stride;
 
     int eob;
 
-    B_MODE_INFO bmi;
-
+    union b_mode_info bmi;
 } BLOCKD;
 
-typedef struct
+typedef struct MacroBlockD
 {
     DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
     DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
-/* not used    DECLARE_ALIGNED(16, short, reference[384]); */
     DECLARE_ALIGNED(16, short, qcoeff[400]);
     DECLARE_ALIGNED(16, short, dqcoeff[400]);
     DECLARE_ALIGNED(16, char,  eobs[25]);
 
     /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
     BLOCKD block[25];
 
     YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
@@ -268,27 +247,56 @@ typedef struct
     signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];                           /* 0 = BPRED, ZERO_MV, MV, SPLIT */
 
     /* Distance of MB away from frame edges */
     int mb_to_left_edge;
     int mb_to_right_edge;
     int mb_to_top_edge;
     int mb_to_bottom_edge;
 
+    int ref_frame_cost[MAX_REF_FRAMES];
+
+
     unsigned int frames_since_golden;
     unsigned int frames_till_alt_ref_frame;
     vp8_subpix_fn_t  subpixel_predict;
     vp8_subpix_fn_t  subpixel_predict8x4;
     vp8_subpix_fn_t  subpixel_predict8x8;
     vp8_subpix_fn_t  subpixel_predict16x16;
 
     void *current_bc;
 
+    int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+    /* This is an intermediate buffer currently used in sub-pixel motion search
+     * to keep a copy of the reference area. This buffer can be used for other
+     * purpose.
+     */
+    DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+
 #if CONFIG_RUNTIME_CPU_DETECT
     struct VP8_COMMON_RTCD  *rtcd;
 #endif
 } MACROBLOCKD;
 
 
 extern void vp8_build_block_doffsets(MACROBLOCKD *x);
 extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
 
+static void update_blockd_bmi(MACROBLOCKD *xd)
+{
+    int i;
+    int is_4x4;
+    is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
+              (xd->mode_info_context->mbmi.mode == B_PRED);
+
+    if (is_4x4)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            xd->block[i].bmi = xd->mode_info_context->bmi[i];
+        }
+    }
+}
+
 #endif  /* __INC_BLOCKD_H */
--- a/media/libvpx/vp8/common/coefupdateprobs.h
+++ b/media/libvpx/vp8/common/coefupdateprobs.h
@@ -7,17 +7,17 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 /* Update probabilities for the nodes in the token entropy tree.
    Generated file included by entropy.c */
 
-const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] =
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
 {
     {
         {
             {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
             {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
             {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
         },
         {
--- a/media/libvpx/vp8/common/debugmodes.c
+++ b/media/libvpx/vp8/common/debugmodes.c
@@ -92,17 +92,17 @@ void vp8_print_modes_and_motion_vectors(
             int bindex;
 
             for (b_col = 0; b_col < 4 * cols; b_col++)
             {
                 mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
                 bindex = (b_row & 3) * 4 + (b_col & 3);
 
                 if (mi[mb_index].mbmi.mode == B_PRED)
-                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].mode);
+                    fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
                 else
                     fprintf(mvs, "xx ");
 
             }
 
             fprintf(mvs, "\n");
         }
     }
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/common/defaultcoefcounts.c
@@ -0,0 +1,225 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "defaultcoefcounts.h"
+
+/* Generated file, included by entropy.c */
+
+const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
+                                          [COEF_BANDS]
+                                          [PREV_COEF_CONTEXTS]
+                                          [MAX_ENTROPY_TOKENS] =
+{
+
+    {
+        /* Block Type ( 0 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593,},
+            {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987,},
+            {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0,},
+            {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294,},
+            {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302,},
+            { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073,},
+            { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362,},
+            { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164,},
+            { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,},
+        },
+    },
+    {
+        /* Block Type ( 1 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289,},
+            {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914,},
+            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0,},
+            {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988,},
+            {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0,},
+            {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980,},
+            {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820,},
+            {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0,},
+            {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127,},
+            { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157,},
+            { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0,},
+            {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195,},
+            { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641,},
+            {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+    },
+    {
+        /* Block Type ( 2 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798,},
+            {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837,},
+            {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0,},
+            {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063,},
+            {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0,},
+            { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404,},
+            { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157,},
+            { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427,},
+            {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652,},
+            {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517,},
+            {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+        },
+    },
+    {
+        /* Block Type ( 3 ) */
+        {
+            /* Coeff Band ( 0 ) */
+            {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694,},
+            {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572,},
+            {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284,},
+        },
+        {
+            /* Coeff Band ( 1 ) */
+            {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0,},
+            {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280,},
+            {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460,},
+        },
+        {
+            /* Coeff Band ( 2 ) */
+            {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0,},
+            {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539,},
+            {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138,},
+        },
+        {
+            /* Coeff Band ( 3 ) */
+            {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0,},
+            {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181,},
+            {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267,},
+        },
+        {
+            /* Coeff Band ( 4 ) */
+            {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0,},
+            {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401,},
+            {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268,},
+        },
+        {
+            /* Coeff Band ( 5 ) */
+            {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0,},
+            {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811,},
+            {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527,},
+        },
+        {
+            /* Coeff Band ( 6 ) */
+            {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0,},
+            {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954,},
+            {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979,},
+        },
+        {
+            /* Coeff Band ( 7 ) */
+            {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
+            {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459,},
+            {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,},
+        },
+    },
+};
--- a/media/libvpx/vp8/common/defaultcoefcounts.h
+++ b/media/libvpx/vp8/common/defaultcoefcounts.h
@@ -3,219 +3,19 @@
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-/* Generated file, included by entropy.c */
+#ifndef __DEFAULTCOEFCOUNTS_H
+#define __DEFAULTCOEFCOUNTS_H
 
-static const unsigned int default_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] =
-{
+#include "entropy.h"
 
-    {
-        /* Block Type ( 0 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {30190, 26544, 225,  24,   4,   0,   0,   0,   0,   0,   0, 4171593,},
-            {26846, 25157, 1241, 130,  26,   6,   1,   0,   0,   0,   0, 149987,},
-            {10484, 9538, 1006, 160,  36,  18,   0,   0,   0,   0,   0, 15104,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            {25842, 40456, 1126,  83,  11,   2,   0,   0,   0,   0,   0,   0,},
-            {9338, 8010, 512,  73,   7,   3,   2,   0,   0,   0,   0, 43294,},
-            {1047, 751, 149,  31,  13,   6,   1,   0,   0,   0,   0, 879,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            {26136, 9826, 252,  13,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {8134, 5574, 191,  14,   2,   0,   0,   0,   0,   0,   0, 35302,},
-            { 605, 677, 116,   9,   1,   0,   0,   0,   0,   0,   0, 611,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {10263, 15463, 283,  17,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {2773, 2191, 128,   9,   2,   2,   0,   0,   0,   0,   0, 10073,},
-            { 134, 125,  32,   4,   0,   2,   0,   0,   0,   0,   0,  50,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {10483, 2663,  23,   1,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {2137, 1251,  27,   1,   1,   0,   0,   0,   0,   0,   0, 14362,},
-            { 116, 156,  14,   2,   1,   0,   0,   0,   0,   0,   0, 190,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            {40977, 27614, 412,  28,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {6113, 5213, 261,  22,   3,   0,   0,   0,   0,   0,   0, 26164,},
-            { 382, 312,  50,  14,   2,   0,   0,   0,   0,   0,   0, 345,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0,  26,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,  13,   0,   0,   0,   0,   0,   0,   0,   0,   0, 319,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,},
-        },
-    },
-    {
-        /* Block Type ( 1 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            {3268, 19382, 1043, 250,  93,  82,  49,  26,  17,   8,  25, 82289,},
-            {8758, 32110, 5436, 1832, 827, 668, 420, 153,  24,   0,   3, 52914,},
-            {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399,  59,   0,   0, 18620,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {12419, 8420, 452,  62,   9,   1,   0,   0,   0,   0,   0,   0,},
-            {11715, 8705, 693,  92,  15,   7,   2,   0,   0,   0,   0, 53988,},
-            {7603, 8585, 2306, 778, 270, 145,  39,   5,   0,   0,   0, 9136,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            {15938, 14335, 1207, 184,  55,  13,   4,   1,   0,   0,   0,   0,},
-            {7415, 6829, 1138, 244,  71,  26,   7,   0,   0,   0,   0, 9980,},
-            {1580, 1824, 655, 241,  89,  46,  10,   2,   0,   0,   0, 429,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            {19453, 5260, 201,  19,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {9173, 3758, 213,  22,   1,   1,   0,   0,   0,   0,   0, 9820,},
-            {1689, 1277, 276,  51,  17,   4,   0,   0,   0,   0,   0, 679,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {12076, 10667, 620,  85,  19,   9,   5,   0,   0,   0,   0,   0,},
-            {4665, 3625, 423,  55,  19,   9,   0,   0,   0,   0,   0, 5127,},
-            { 415, 440, 143,  34,  20,   7,   2,   0,   0,   0,   0, 101,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {12183, 4846, 115,  11,   1,   0,   0,   0,   0,   0,   0,   0,},
-            {4226, 3149, 177,  21,   2,   0,   0,   0,   0,   0,   0, 7157,},
-            { 375, 621, 189,  51,  11,   4,   1,   0,   0,   0,   0, 198,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            {61658, 37743, 1203,  94,  10,   3,   0,   0,   0,   0,   0,   0,},
-            {15514, 11563, 903, 111,  14,   5,   0,   0,   0,   0,   0, 25195,},
-            { 929, 1077, 291,  78,  14,   7,   1,   0,   0,   0,   0, 507,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0, 990,  15,   3,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0, 412,  13,   0,   0,   0,   0,   0,   0,   0,   0, 1641,},
-            {   0,  18,   7,   1,   0,   0,   0,   0,   0,   0,   0,  30,},
-        },
-    },
-    {
-        /* Block Type ( 2 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            { 953, 24519, 628, 120,  28,  12,   4,   0,   0,   0,   0, 2248798,},
-            {1525, 25654, 2647, 617, 239, 143,  42,   5,   0,   0,   0, 66837,},
-            {1180, 11011, 3001, 1237, 532, 448, 239,  54,   5,   0,   0, 7122,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {1356, 2220,  67,  10,   4,   1,   0,   0,   0,   0,   0,   0,},
-            {1450, 2544, 102,  18,   4,   3,   0,   0,   0,   0,   0, 57063,},
-            {1182, 2110, 470, 130,  41,  21,   0,   0,   0,   0,   0, 6047,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            { 370, 3378, 200,  30,   5,   4,   1,   0,   0,   0,   0,   0,},
-            { 293, 1006, 131,  29,  11,   0,   0,   0,   0,   0,   0, 5404,},
-            { 114, 387,  98,  23,   4,   8,   1,   0,   0,   0,   0, 236,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            { 579, 194,   4,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            { 395, 213,   5,   1,   0,   0,   0,   0,   0,   0,   0, 4157,},
-            { 119, 122,   4,   0,   0,   0,   0,   0,   0,   0,   0, 300,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {  38, 557,  19,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {  21, 114,  12,   1,   0,   0,   0,   0,   0,   0,   0, 427,},
-            {   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   7,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {  52,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {  18,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0, 652,},
-            {   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  30,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            { 640, 569,  10,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {  25,  77,   2,   0,   0,   0,   0,   0,   0,   0,   0, 517,},
-            {   4,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0,   3,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-        },
-    },
-    {
-        /* Block Type ( 3 ) */
-        {
-            /* Coeff Band ( 0 ) */
-            {2506, 20161, 2707, 767, 261, 178, 107,  30,  14,   3,   0, 100694,},
-            {8806, 36478, 8817, 3268, 1280, 850, 401, 114,  42,   0,   0, 58572,},
-            {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175,  32,   0,   0, 19284,},
-        },
-        {
-            /* Coeff Band ( 1 ) */
-            {9738, 11313, 959, 205,  70,  18,  11,   1,   0,   0,   0,   0,},
-            {12628, 15085, 1507, 273,  52,  19,   9,   0,   0,   0,   0, 54280,},
-            {10701, 15846, 5561, 1926, 813, 570, 249,  36,   0,   0,   0, 6460,},
-        },
-        {
-            /* Coeff Band ( 2 ) */
-            {6781, 22539, 2784, 634, 182, 123,  20,   4,   0,   0,   0,   0,},
-            {6263, 11544, 2649, 790, 259, 168,  27,   5,   0,   0,   0, 20539,},
-            {3109, 4075, 2031, 896, 457, 386, 158,  29,   0,   0,   0, 1138,},
-        },
-        {
-            /* Coeff Band ( 3 ) */
-            {11515, 4079, 465,  73,   5,  14,   2,   0,   0,   0,   0,   0,},
-            {9361, 5834, 650,  96,  24,   8,   4,   0,   0,   0,   0, 22181,},
-            {4343, 3974, 1360, 415, 132,  96,  14,   1,   0,   0,   0, 1267,},
-        },
-        {
-            /* Coeff Band ( 4 ) */
-            {4787, 9297, 823, 168,  44,  12,   4,   0,   0,   0,   0,   0,},
-            {3619, 4472, 719, 198,  60,  31,   3,   0,   0,   0,   0, 8401,},
-            {1157, 1175, 483, 182,  88,  31,   8,   0,   0,   0,   0, 268,},
-        },
-        {
-            /* Coeff Band ( 5 ) */
-            {8299, 1226,  32,   5,   1,   0,   0,   0,   0,   0,   0,   0,},
-            {3502, 1568,  57,   4,   1,   1,   0,   0,   0,   0,   0, 9811,},
-            {1055, 1070, 166,  29,   6,   1,   0,   0,   0,   0,   0, 527,},
-        },
-        {
-            /* Coeff Band ( 6 ) */
-            {27414, 27927, 1989, 347,  69,  26,   0,   0,   0,   0,   0,   0,},
-            {5876, 10074, 1574, 341,  91,  24,   4,   0,   0,   0,   0, 21954,},
-            {1571, 2171, 778, 324, 124,  65,  16,   0,   0,   0,   0, 979,},
-        },
-        {
-            /* Coeff Band ( 7 ) */
-            {   0,  29,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},
-            {   0,  23,   0,   0,   0,   0,   0,   0,   0,   0,   0, 459,},
-            {   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,  13,},
-        },
-    },
-};
+extern const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
+                                                 [COEF_BANDS]
+                                                 [PREV_COEF_CONTEXTS]
+                                                 [MAX_ENTROPY_TOKENS];
+
+#endif //__DEFAULTCOEFCOUNTS_H
--- a/media/libvpx/vp8/common/entropy.c
+++ b/media/libvpx/vp8/common/entropy.c
@@ -21,26 +21,58 @@
 
 typedef const uchar cuchar;
 typedef const uint cuint;
 
 typedef vp8_prob Prob;
 
 #include "coefupdateprobs.h"
 
-DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
-DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
+{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) =
+{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
+
+DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
+{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+
 DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
 {
     0,  1,  4,  8,
     5,  2,  3,  6,
     9, 12, 13, 10,
     7, 11, 14, 15,
 };
 
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+    1,  2,  6,  7,
+    3,  5,  8, 13,
+    4,  9, 12, 14,
+   10, 11, 15, 16
+};
+
 DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);
 
 const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
 const vp8_tree_index vp8_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
 {
@@ -52,17 +84,17 @@ const vp8_tree_index vp8_coef_tree[ 22] 
     -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
     14, 16,                                    /* 6 = HIGH_LOW */
     -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
     18, 20,                                   /* 8 = CAT_THREEFOUR */
     -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,  /* 9 = CAT_THREE */
     -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6   /* 10 = CAT_FIVE */
 };
 
-struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];
+struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
 
 /* Trees for extra bits.  Probabilities are constant and
    do not depend on previously encoded bits */
 
 static const Prob Pcat1[] = { 159};
 static const Prob Pcat2[] = { 165, 145};
 static const Prob Pcat3[] = { 173, 148, 140};
 static const Prob Pcat4[] = { 176, 155, 140, 135};
@@ -101,33 +133,30 @@ static void init_bit_trees()
     init_bit_tree(cat1, 1);
     init_bit_tree(cat2, 2);
     init_bit_tree(cat3, 3);
     init_bit_tree(cat4, 4);
     init_bit_tree(cat5, 5);
     init_bit_tree(cat6, 11);
 }
 
-
-static vp8bc_index_t bcc1[1], bcc2[2], bcc3[3], bcc4[4], bcc5[5], bcc6[11];
-
 vp8_extra_bit_struct vp8_extra_bits[12] =
 {
-    { 0, 0, 0, 0, 0},
-    { 0, 0, 0, 0, 1},
-    { 0, 0, 0, 0, 2},
-    { 0, 0, 0, 0, 3},
-    { 0, 0, 0, 0, 4},
-    { cat1, Pcat1, bcc1, 1, 5},
-    { cat2, Pcat2, bcc2, 2, 7},
-    { cat3, Pcat3, bcc3, 3, 11},
-    { cat4, Pcat4, bcc4, 4, 19},
-    { cat5, Pcat5, bcc5, 5, 35},
-    { cat6, Pcat6, bcc6, 11, 67},
-    { 0, 0, 0, 0, 0}
+    { 0, 0, 0, 0},
+    { 0, 0, 0, 1},
+    { 0, 0, 0, 2},
+    { 0, 0, 0, 3},
+    { 0, 0, 0, 4},
+    { cat1, Pcat1, 1, 5},
+    { cat2, Pcat2, 2, 7},
+    { cat3, Pcat3, 3, 11},
+    { cat4, Pcat4, 4, 19},
+    { cat5, Pcat5, 5, 35},
+    { cat6, Pcat6, 11, 67},
+    { 0, 0, 0, 0}
 };
 #include "defaultcoefcounts.h"
 
 void vp8_default_coef_probs(VP8_COMMON *pc)
 {
     int h = 0;
 
     do
@@ -135,20 +164,22 @@ void vp8_default_coef_probs(VP8_COMMON *
         int i = 0;
 
         do
         {
             int k = 0;
 
             do
             {
-                unsigned int branch_ct [vp8_coef_tokens-1] [2];
+                unsigned int branch_ct [ENTROPY_NODES] [2];
                 vp8_tree_probs_from_distribution(
-                    vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
-                    pc->fc.coef_probs [h][i][k], branch_ct, default_coef_counts [h][i][k],
+                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+                    pc->fc.coef_probs[h][i][k],
+                    branch_ct,
+                    vp8_default_coef_counts[h][i][k],
                     256, 1);
 
             }
             while (++k < PREV_COEF_CONTEXTS);
         }
         while (++i < COEF_BANDS);
     }
     while (++h < BLOCK_TYPES);
--- a/media/libvpx/vp8/common/entropy.h
+++ b/media/libvpx/vp8/common/entropy.h
@@ -19,35 +19,33 @@
 
 #define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
 #define ONE_TOKEN               1       /* 1         Extra Bits 0+1 */
 #define TWO_TOKEN               2       /* 2         Extra Bits 0+1 */
 #define THREE_TOKEN             3       /* 3         Extra Bits 0+1 */
 #define FOUR_TOKEN              4       /* 4         Extra Bits 0+1 */
 #define DCT_VAL_CATEGORY1       5       /* 5-6       Extra Bits 1+1 */
 #define DCT_VAL_CATEGORY2       6       /* 7-10      Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3       7       /* 11-26     Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY4       8       /* 11-26     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY5       9       /* 27-58     Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6       10      /* 59+       Extra Bits 11+1 */
+#define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 11+1 */
 #define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */
 
-#define vp8_coef_tokens 12
-#define MAX_ENTROPY_TOKENS vp8_coef_tokens
+#define MAX_ENTROPY_TOKENS 12
 #define ENTROPY_NODES 11
 
 extern const vp8_tree_index vp8_coef_tree[];
 
-extern struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];
+extern struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
 
 typedef struct
 {
     vp8_tree_p tree;
     const vp8_prob *prob;
-    vp8bc_index_t *prob_bc;
     int Len;
     int base_val;
 } vp8_extra_bit_struct;
 
 extern vp8_extra_bit_struct vp8_extra_bits[12];    /* indexed by token value */
 
 #define PROB_UPDATE_BASELINE_COST   7
 
@@ -81,22 +79,23 @@ extern DECLARE_ALIGNED(16, const unsigne
    coefficient will appear later in this block.  However, this shift
    in meaning is perfectly OK because our context depends also on the
    coefficient band (and since zigzag positions 0, 1, and 2 are in
    distinct bands). */
 
 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
 #   define PREV_COEF_CONTEXTS       3
 
-extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[vp8_coef_tokens]);
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
 
-extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
+extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
 
 
 struct VP8Common;
 void vp8_default_coef_probs(struct VP8Common *);
 
 extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
 extern short vp8_default_zig_zag_mask[16];
 extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
 
 void vp8_coef_tree_initialize(void);
 #endif
--- a/media/libvpx/vp8/common/entropymode.c
+++ b/media/libvpx/vp8/common/entropymode.c
@@ -28,21 +28,21 @@ typedef enum
 {
     SUBMVREF_NORMAL,
     SUBMVREF_LEFT_ZED,
     SUBMVREF_ABOVE_ZED,
     SUBMVREF_LEFT_ABOVE_SAME,
     SUBMVREF_LEFT_ABOVE_ZED
 } sumvfref_t;
 
-int vp8_mv_cont(const MV *l, const MV *a)
+int vp8_mv_cont(const int_mv *l, const int_mv *a)
 {
-    int lez = (l->row == 0 && l->col == 0);
-    int aez = (a->row == 0 && a->col == 0);
-    int lea = (l->row == a->row && l->col == a->col);
+    int lez = (l->as_int == 0);
+    int aez = (a->as_int == 0);
+    int lea = (l->as_int == a->as_int);
 
     if (lea && lez)
         return SUBMVREF_LEFT_ABOVE_ZED;
 
     if (lea)
         return SUBMVREF_LEFT_ABOVE_SAME;
 
     if (aez)
--- a/media/libvpx/vp8/common/entropymode.h
+++ b/media/libvpx/vp8/common/entropymode.h
@@ -20,17 +20,17 @@ typedef const int vp8_mbsplit[16];
 #define VP8_NUMMBSPLITS 4
 
 extern vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
 
 extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS];    /* # of subsets */
 
 extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
 
-extern int vp8_mv_cont(const MV *l, const MV *a);
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
 #define SUBMVREF_COUNT 5
 extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
 
 
 extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
 
 
 extern const vp8_tree_index vp8_bmode_tree[];
--- a/media/libvpx/vp8/common/entropymv.h
+++ b/media/libvpx/vp8/common/entropymv.h
@@ -13,16 +13,18 @@
 #define __INC_ENTROPYMV_H
 
 #include "treecoder.h"
 
 enum
 {
     mv_max  = 1023,              /* max absolute value of a MV component */
     MVvals = (2 * mv_max) + 1,   /* # possible values "" */
+    mvfp_max  = 255,              /* max absolute value of a full pixel MV component */
+    MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
 
     mvlong_width = 10,       /* Large MVs have 9 bit magnitudes */
     mvnum_short = 8,         /* magnitudes 0 through 7 */
 
     /* probability offsets for coding each MV component */
 
     mvpis_short = 0,         /* short (<= 7) vs long (>= 8) */
     MVPsign,                /* sign for non-zero */
--- a/media/libvpx/vp8/common/extend.c
+++ b/media/libvpx/vp8/common/extend.c
@@ -8,99 +8,105 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "extend.h"
 #include "vpx_mem/vpx_mem.h"
 
 
-static void extend_plane_borders
+static void copy_and_extend_plane
 (
     unsigned char *s, /* source */
-    int sp,           /* pitch */
+    int sp,           /* source pitch */
+    unsigned char *d, /* destination */
+    int dp,           /* destination pitch */
     int h,            /* height */
     int w,            /* width */
     int et,           /* extend top border */
     int el,           /* extend left border */
     int eb,           /* extend bottom border */
     int er            /* extend right border */
 )
 {
-
     int i;
     unsigned char *src_ptr1, *src_ptr2;
     unsigned char *dest_ptr1, *dest_ptr2;
     int linesize;
 
     /* copy the left and right most columns out */
     src_ptr1 = s;
     src_ptr2 = s + w - 1;
-    dest_ptr1 = s - el;
-    dest_ptr2 = s + w;
+    dest_ptr1 = d - el;
+    dest_ptr2 = d + w;
 
-    for (i = 0; i < h - 0 + 1; i++)
+    for (i = 0; i < h; i++)
     {
-        /* Some linkers will complain if we call vpx_memset with el set to a
-         * constant 0.
-         */
-        if (el)
-            vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memset(dest_ptr1, src_ptr1[0], el);
+        vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
         vpx_memset(dest_ptr2, src_ptr2[0], er);
         src_ptr1  += sp;
         src_ptr2  += sp;
-        dest_ptr1 += sp;
-        dest_ptr2 += sp;
+        dest_ptr1 += dp;
+        dest_ptr2 += dp;
     }
 
-    /* Now copy the top and bottom source lines into each line of the respective borders */
-    src_ptr1 = s - el;
-    src_ptr2 = s + sp * (h - 1) - el;
-    dest_ptr1 = s + sp * (-et) - el;
-    dest_ptr2 = s + sp * (h) - el;
-    linesize = el + er + w + 1;
+    /* Now copy the top and bottom lines into each line of the respective
+     * borders
+     */
+    src_ptr1 = d - el;
+    src_ptr2 = d + dp * (h - 1) - el;
+    dest_ptr1 = d + dp * (-et) - el;
+    dest_ptr2 = d + dp * (h) - el;
+    linesize = el + er + w;
 
-    for (i = 0; i < (int)et; i++)
+    for (i = 0; i < et; i++)
     {
         vpx_memcpy(dest_ptr1, src_ptr1, linesize);
-        dest_ptr1 += sp;
+        dest_ptr1 += dp;
     }
 
-    for (i = 0; i < (int)eb; i++)
+    for (i = 0; i < eb; i++)
     {
         vpx_memcpy(dest_ptr2, src_ptr2, linesize);
-        dest_ptr2 += sp;
+        dest_ptr2 += dp;
     }
 }
 
 
-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height)
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst)
 {
-    int er = 0xf & (16 - (width & 0xf));
-    int eb = 0xf & (16 - (height & 0xf));
+    int et = dst->border;
+    int el = dst->border;
+    int eb = dst->border + dst->y_height - src->y_height;
+    int er = dst->border + dst->y_width - src->y_width;
 
-    /* check for non multiples of 16 */
-    if (er != 0 || eb != 0)
-    {
-        extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er);
+    copy_and_extend_plane(src->y_buffer, src->y_stride,
+                          dst->y_buffer, dst->y_stride,
+                          src->y_height, src->y_width,
+                          et, el, eb, er);
 
-        /* adjust for uv */
-        height = (height + 1) >> 1;
-        width  = (width  + 1) >> 1;
-        er = 0x7 & (8 - (width  & 0x7));
-        eb = 0x7 & (8 - (height & 0x7));
+    et = dst->border >> 1;
+    el = dst->border >> 1;
+    eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+    er = (dst->border >> 1) + dst->uv_width - src->uv_width;
 
-        if (er || eb)
-        {
-            extend_plane_borders(ybf->u_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-            extend_plane_borders(ybf->v_buffer, ybf->uv_stride, height, width, 0, 0, eb, er);
-        }
-    }
+    copy_and_extend_plane(src->u_buffer, src->uv_stride,
+                          dst->u_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
+
+    copy_and_extend_plane(src->v_buffer, src->uv_stride,
+                          dst->v_buffer, dst->uv_stride,
+                          src->uv_height, src->uv_width,
+                          et, el, eb, er);
 }
 
+
 /* note the extension is only for the last row, for intra prediction purpose */
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
 {
     int i;
 
     YPtr += ybf->y_stride * 14;
     UPtr += ybf->uv_stride * 6;
     VPtr += ybf->uv_stride * 6;
--- a/media/libvpx/vp8/common/extend.h
+++ b/media/libvpx/vp8/common/extend.h
@@ -9,13 +9,13 @@
  */
 
 
 #ifndef __INC_EXTEND_H
 #define __INC_EXTEND_H
 
 #include "vpx_scale/yv12config.h"
 
-void Extend(YV12_BUFFER_CONFIG *ybf);
 void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
-void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+                               YV12_BUFFER_CONFIG *dst);
 
 #endif
rename from media/libvpx/vp8/common/filter_c.c
rename to media/libvpx/vp8/common/filter.c
--- a/media/libvpx/vp8/common/filter_c.c
+++ b/media/libvpx/vp8/common/filter.c
@@ -5,52 +5,45 @@
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
 
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-
-static const int bilinear_filters[8][2] =
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
 {
     { 128,   0 },
     { 112,  16 },
     {  96,  32 },
     {  80,  48 },
     {  64,  64 },
     {  48,  80 },
     {  32,  96 },
     {  16, 112 }
 };
 
-
-static const short sub_pel_filters[8][6] =
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
 {
 
     { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
     { 0, -6,  123,   12,  -1,  0 },
     { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
     { 0, -9,   93,   50,  -6,  0 },
     { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
     { 0, -6,   50,   93,  -9,  0 },
     { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
     { 0, -1,   12,  123,  -6,  0 },
-
-
-
 };
 
-void vp8_filter_block2d_first_pass
+static void filter_block2d_first_pass
 (
     unsigned char *src_ptr,
     int *output_ptr,
     unsigned int src_pixels_per_line,
     unsigned int pixel_step,
     unsigned int output_height,
     unsigned int output_width,
     const short *vp8_filter
@@ -84,17 +77,17 @@ void vp8_filter_block2d_first_pass
         }
 
         /* Next row... */
         src_ptr    += src_pixels_per_line - output_width;
         output_ptr += output_width;
     }
 }
 
-void vp8_filter_block2d_second_pass
+static void filter_block2d_second_pass
 (
     int *src_ptr,
     unsigned char *output_ptr,
     int output_pitch,
     unsigned int src_pixels_per_line,
     unsigned int pixel_step,
     unsigned int output_height,
     unsigned int output_width,
@@ -131,410 +124,371 @@ void vp8_filter_block2d_second_pass
 
         /* Start next row */
         src_ptr    += src_pixels_per_line - output_width;
         output_ptr += output_pitch;
     }
 }
 
 
-void vp8_filter_block2d
+static void filter_block2d
 (
     unsigned char  *src_ptr,
     unsigned char  *output_ptr,
     unsigned int src_pixels_per_line,
     int output_pitch,
     const short  *HFilter,
     const short  *VFilter
 )
 {
-    int FData[9*4]; /* Temp data bufffer used in filtering */
+    int FData[9*4]; /* Temp data buffer used in filtering */
 
     /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
 
     /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+    filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
 }
 
 
-void vp8_block_variation_c
-(
-    unsigned char  *src_ptr,
-    int   src_pixels_per_line,
-    int *HVar,
-    int *VVar
-)
-{
-    int i, j;
-    unsigned char *Ptr = src_ptr;
-
-    for (i = 0; i < 4; i++)
-    {
-        for (j = 0; j < 4; j++)
-        {
-            *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);
-            *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);
-        }
-
-        Ptr += src_pixels_per_line;
-    }
-}
-
-
-
-
 void vp8_sixtap_predict_c
 (
     unsigned char  *src_ptr,
     int   src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
-    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
 }
 void vp8_sixtap_predict8x8_c
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
 
 
     /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
 
 }
 
 void vp8_sixtap_predict8x4_c
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
+    int FData[13*16];   /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
 
 
     /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
 
 }
 
 void vp8_sixtap_predict16x16_c
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
     const short  *HFilter;
     const short  *VFilter;
-    int FData[21*24];   /* Temp data bufffer used in filtering */
+    int FData[21*24];   /* Temp data buffer used in filtering */
 
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
 
     /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+    filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
 
 }
 
 
 /****************************************************************************
  *
  *  ROUTINE       : filter_block2d_bil_first_pass
  *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
+ *  OUTPUTS       : INT32  *dst_ptr    : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
  *
  *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
  *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
  *
  ****************************************************************************/
-void vp8_filter_block2d_bil_first_pass
+static void filter_block2d_bil_first_pass
 (
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const int *vp8_filter
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
     unsigned int i, j;
 
-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
     {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
         {
             /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
             src_ptr++;
         }
 
         /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
+        src_ptr += src_stride - width;
+        dst_ptr += width;
     }
 }
 
 /****************************************************************************
  *
  *  ROUTINE       : filter_block2d_bil_second_pass
  *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : INT32  *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
  *
  *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
  *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
  *
  ****************************************************************************/
-void vp8_filter_block2d_bil_second_pass
+static void filter_block2d_bil_second_pass
 (
     unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const int *vp8_filter
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
 )
 {
     unsigned int  i, j;
     int  Temp;
 
-    for (i = 0; i < output_height; i++)
+    for (i = 0; i < height; i++)
     {
-        for (j = 0; j < output_width; j++)
+        for (j = 0; j < width; j++)
         {
             /* Apply filter */
-            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
                    (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
             src_ptr++;
         }
 
         /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_pitch;
+        dst_ptr += dst_pitch;
     }
 }
 
 
 /****************************************************************************
  *
  *  ROUTINE       : filter_block2d_bil
  *
  *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  INT32  *HFilter         : Array of 2 horizontal filter taps.
- *                  INT32  *VFilter         : Array of 2 vertical filter taps.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
+ *                  INT32  *HFilter          : Array of 2 horizontal filter taps.
+ *                  INT32  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32  Width             : Block width
+ *                  INT32  Height            : Block height
  *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : UINT16 *dst_ptr       : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
  *  FUNCTION      : 2-D filters an input block by applying a 2-tap
  *                  bi-linear filter horizontally followed by a 2-tap
  *                  bi-linear filter vertically on the result.
  *
  *  SPECIAL NOTES : The largest block size can be handled here is 16x16
  *
  ****************************************************************************/
-void vp8_filter_block2d_bil
+static void filter_block2d_bil
 (
     unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
     unsigned int   dst_pitch,
-    const int      *HFilter,
-    const int      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
     int            Width,
     int            Height
 )
 {
 
-    unsigned short FData[17*16];    /* Temp data bufffer used in filtering */
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */
 
     /* First filter 1-D horizontally... */
-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
+    filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
 
     /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
+    filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 
 
 void vp8_bilinear_predict4x4_c
 (
     unsigned char  *src_ptr,
     int   src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 #if 0
     {
         int i;
         unsigned char temp1[16];
         unsigned char temp2[16];
 
         bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-        vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+        filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
 
         for (i = 0; i < 16; i++)
         {
             if (temp1[i] != temp2[i])
             {
                 bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-                vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+                filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
             }
         }
     }
 #endif
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 
 }
 
 void vp8_bilinear_predict8x8_c
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 
 }
 
 void vp8_bilinear_predict8x4_c
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 
 }
 
 void vp8_bilinear_predict16x16_c
 (
     unsigned char  *src_ptr,
     int  src_pixels_per_line,
     int  xoffset,
     int  yoffset,
     unsigned char *dst_ptr,
     int  dst_pitch
 )
 {
-    const int  *HFilter;
-    const int  *VFilter;
+    const short *HFilter;
+    const short *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/common/filter.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7
+
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];
+
+#endif //FILTER_H
--- a/media/libvpx/vp8/common/findnearmv.c
+++ b/media/libvpx/vp8/common/findnearmv.c
@@ -6,64 +6,33 @@
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "findnearmv.h"
 
-#define FINDNEAR_SEARCH_SITES   3
+const unsigned char vp8_mbsplit_offset[4][16] = {
+    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
 
 /* Predict motion vectors using those from already-decoded nearby blocks.
    Note that we only consider one 4x4 subblock from each candidate 16x16
    macroblock.   */
-
-typedef union
-{
-    unsigned int as_int;
-    MV           as_mv;
-} int_mv;        /* facilitates rapid equality tests */
-
-static void mv_bias(const MODE_INFO *x, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
-{
-    MV xmv;
-    xmv = x->mbmi.mv.as_mv;
-
-    if (ref_frame_sign_bias[x->mbmi.ref_frame] != ref_frame_sign_bias[refframe])
-    {
-        xmv.row *= -1;
-        xmv.col *= -1;
-    }
-
-    mvp->as_mv = xmv;
-}
-
-
-void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
-{
-    if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
-        mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
-    else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
-        mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
-
-    if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
-        mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
-    else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
-        mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
-}
-
-
 void vp8_find_near_mvs
 (
     MACROBLOCKD *xd,
     const MODE_INFO *here,
-    MV *nearest,
-    MV *nearby,
-    MV *best_mv,
+    int_mv *nearest,
+    int_mv *nearby,
+    int_mv *best_mv,
     int cnt[4],
     int refframe,
     int *ref_frame_sign_bias
 )
 {
     const MODE_INFO *above = here - xd->mode_info_stride;
     const MODE_INFO *left = here - 1;
     const MODE_INFO *aboveleft = above - 1;
@@ -77,32 +46,32 @@ void vp8_find_near_mvs
     cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
 
     /* Process above */
     if (above->mbmi.ref_frame != INTRA_FRAME)
     {
         if (above->mbmi.mv.as_int)
         {
             (++mv)->as_int = above->mbmi.mv.as_int;
-            mv_bias(above, refframe, mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
             ++cntx;
         }
 
         *cntx += 2;
     }
 
     /* Process left */
     if (left->mbmi.ref_frame != INTRA_FRAME)
     {
         if (left->mbmi.mv.as_int)
         {
             int_mv this_mv;
 
             this_mv.as_int = left->mbmi.mv.as_int;
-            mv_bias(left, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
 
             if (this_mv.as_int != mv->as_int)
             {
                 (++mv)->as_int = this_mv.as_int;
                 ++cntx;
             }
 
             *cntx += 2;
@@ -114,17 +83,17 @@ void vp8_find_near_mvs
     /* Process above left */
     if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
     {
         if (aboveleft->mbmi.mv.as_int)
         {
             int_mv this_mv;
 
             this_mv.as_int = aboveleft->mbmi.mv.as_int;
-            mv_bias(aboveleft, refframe, &this_mv, ref_frame_sign_bias);
+            mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
 
             if (this_mv.as_int != mv->as_int)
             {
                 (++mv)->as_int = this_mv.as_int;
                 ++cntx;
             }
 
             *cntx += 1;
@@ -157,52 +126,30 @@ void vp8_find_near_mvs
         near_mvs[CNT_NEAR].as_int = tmp;
     }
 
     /* Use near_mvs[0] to store the "best" MV */
     if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
         near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
 
     /* Set up return values */
-    *best_mv = near_mvs[0].as_mv;
-    *nearest = near_mvs[CNT_NEAREST].as_mv;
-    *nearby = near_mvs[CNT_NEAR].as_mv;
+    best_mv->as_int = near_mvs[0].as_int;
+    nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+    nearby->as_int = near_mvs[CNT_NEAR].as_int;
 
-    vp8_clamp_mv(nearest, xd);
-    vp8_clamp_mv(nearby, xd);
-    vp8_clamp_mv(best_mv, xd); /*TODO: move this up before the copy*/
+    //TODO: move clamp outside findnearmv
+    vp8_clamp_mv2(nearest, xd);
+    vp8_clamp_mv2(nearby, xd);
+    vp8_clamp_mv2(best_mv, xd);
 }
 
 vp8_prob *vp8_mv_ref_probs(
     vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
 )
 {
     p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0];
     p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
     p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
     p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
     /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
     return p;
 }
 
-const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b)
-{
-    if (!(b & 3))
-    {
-        /* On L edge, get from MB to left of us */
-        --cur_mb;
-        b += 4;
-    }
-
-    return cur_mb->bmi + b - 1;
-}
-
-const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride)
-{
-    if (!(b >> 2))
-    {
-        /* On top edge, get from MB above us */
-        cur_mb -= mi_stride;
-        b += 16;
-    }
-
-    return cur_mb->bmi + b - 4;
-}
--- a/media/libvpx/vp8/common/findnearmv.h
+++ b/media/libvpx/vp8/common/findnearmv.h
@@ -12,31 +12,162 @@
 #ifndef __INC_FINDNEARMV_H
 #define __INC_FINDNEARMV_H
 
 #include "mv.h"
 #include "blockd.h"
 #include "modecont.h"
 #include "treecoder.h"
 
+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
+{
+    MV xmv;
+    xmv = mvp->as_mv;
+
+    if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+    {
+        xmv.row *= -1;
+        xmv.col *= -1;
+    }
+
+    mvp->as_mv = xmv;
+}
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+{
+    if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+    if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+        mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+    else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+        mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
+                         int mb_to_top_edge, int mb_to_bottom_edge)
+{
+    mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+        mb_to_left_edge : mv->as_mv.col;
+    mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+        mb_to_right_edge : mv->as_mv.col;
+    mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+        mb_to_top_edge : mv->as_mv.row;
+    mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+        mb_to_bottom_edge : mv->as_mv.row;
+}
+static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+                                int mb_to_right_edge, int mb_to_top_edge,
+                                int mb_to_bottom_edge)
+{
+    unsigned int need_to_clamp;
+    need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
+    need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
+    need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
+    need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
+    return need_to_clamp;
+}
+
 void vp8_find_near_mvs
 (
     MACROBLOCKD *xd,
     const MODE_INFO *here,
-    MV *nearest, MV *nearby, MV *best,
+    int_mv *nearest, int_mv *nearby, int_mv *best,
     int near_mv_ref_cts[4],
     int refframe,
     int *ref_frame_sign_bias
 );
 
 vp8_prob *vp8_mv_ref_probs(
     vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
 );
 
-const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);
-
-const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);
-
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
+extern const unsigned char vp8_mbsplit_offset[4][16];
 
 
+static int left_block_mv(const MODE_INFO *cur_mb, int b)
+{
+    if (!(b & 3))
+    {
+        /* On L edge, get from MB to left of us */
+        --cur_mb;
+
+        if(cur_mb->mbmi.mode != SPLITMV)
+            return cur_mb->mbmi.mv.as_int;
+        b += 4;
+    }
+
+    return (cur_mb->bmi + b - 1)->mv.as_int;
+}
+
+static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+    if (!(b >> 2))
+    {
+        /* On top edge, get from MB above us */
+        cur_mb -= mi_stride;
+
+        if(cur_mb->mbmi.mode != SPLITMV)
+            return cur_mb->mbmi.mv.as_int;
+        b += 16;
+    }
+
+    return (cur_mb->bmi + b - 4)->mv.as_int;
+}
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+{
+    if (!(b & 3))
+    {
+        /* On L edge, get from MB to left of us */
+        --cur_mb;
+        switch (cur_mb->mbmi.mode)
+        {
+            case B_PRED:
+              return (cur_mb->bmi + b + 3)->as_mode;
+            case DC_PRED:
+                return B_DC_PRED;
+            case V_PRED:
+                return B_VE_PRED;
+            case H_PRED:
+                return B_HE_PRED;
+            case TM_PRED:
+                return B_TM_PRED;
+            default:
+                return B_DC_PRED;
+        }
+    }
+
+    return (cur_mb->bmi + b - 1)->as_mode;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+    if (!(b >> 2))
+    {
+        /* On top edge, get from MB above us */
+        cur_mb -= mi_stride;
+
+        switch (cur_mb->mbmi.mode)
+        {
+            case B_PRED:
+              return (cur_mb->bmi + b + 12)->as_mode;
+            case DC_PRED:
+                return B_DC_PRED;
+            case V_PRED:
+                return B_VE_PRED;
+            case H_PRED:
+                return B_HE_PRED;
+            case TM_PRED:
+                return B_TM_PRED;
+            default:
+                return B_DC_PRED;
+        }
+    }
+
+    return (cur_mb->bmi + b - 4)->as_mode;
+}
+
 #endif
--- a/media/libvpx/vp8/common/generic/systemdependent.c
+++ b/media/libvpx/vp8/common/generic/systemdependent.c
@@ -5,26 +5,71 @@
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
 #include "vpx_ports/config.h"
-#include "g_common.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
+#include "vp8/common/g_common.h"
+#include "vp8/common/subpixel.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#endif
+#endif
 
 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
 extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
 
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+    int core_count = 16;
+
+#if HAVE_UNISTD_H
+#if defined(_SC_NPROCESSORS_ONLN)
+    core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+    core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+    {
+        PGNSI pGNSI;
+        SYSTEM_INFO sysinfo;
+
+        /* Call GetNativeSystemInfo if supported or
+         * GetSystemInfo otherwise. */
+
+        pGNSI = (PGNSI) GetProcAddress(
+                GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+        if (pGNSI != NULL)
+            pGNSI(&sysinfo);
+        else
+            GetSystemInfo(&sysinfo);
+
+        core_count = sysinfo.dwNumberOfProcessors;
+    }
+#else
+    /* other platforms */
+#endif
+
+    return core_count > 0 ? core_count : 1;
+}
+#endif
+
 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
     VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
 
     rtcd->idct.idct1        = vp8_short_idct4x4llm_1_c;
     rtcd->idct.idct16       = vp8_short_idct4x4llm_c;
     rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
@@ -38,46 +83,57 @@ void vp8_machine_specific_config(VP8_COM
     rtcd->recon.recon2      = vp8_recon2b_c;
     rtcd->recon.recon4      = vp8_recon4b_c;
     rtcd->recon.recon_mb    = vp8_recon_mb_c;
     rtcd->recon.recon_mby   = vp8_recon_mby_c;
     rtcd->recon.build_intra_predictors_mby =
         vp8_build_intra_predictors_mby;
     rtcd->recon.build_intra_predictors_mby_s =
         vp8_build_intra_predictors_mby_s;
+    rtcd->recon.build_intra_predictors_mbuv =
+        vp8_build_intra_predictors_mbuv;
+    rtcd->recon.build_intra_predictors_mbuv_s =
+        vp8_build_intra_predictors_mbuv_s;
+    rtcd->recon.intra4x4_predict =
+        vp8_intra4x4_predict;
 
     rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
     rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
     rtcd->subpix.sixtap8x4     = vp8_sixtap_predict8x4_c;
     rtcd->subpix.sixtap4x4     = vp8_sixtap_predict_c;
     rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_c;
     rtcd->subpix.bilinear8x8   = vp8_bilinear_predict8x8_c;
     rtcd->subpix.bilinear8x4   = vp8_bilinear_predict8x4_c;
     rtcd->subpix.bilinear4x4   = vp8_bilinear_predict4x4_c;
 
     rtcd->loopfilter.normal_mb_v = vp8_loop_filter_mbv_c;
     rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_c;
     rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
     rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_c;
-    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
+    rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c;
     rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_c;
-    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
+    rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c;
     rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_c;
 
-#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
-    rtcd->postproc.down        = vp8_mbpost_proc_down_c;
-    rtcd->postproc.across      = vp8_mbpost_proc_across_ip_c;
-    rtcd->postproc.downacross  = vp8_post_proc_down_and_across_c;
-    rtcd->postproc.addnoise    = vp8_plane_add_noise_c;
-    rtcd->postproc.blend_mb    = vp8_blend_mb_c;
+#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
+    rtcd->postproc.down             = vp8_mbpost_proc_down_c;
+    rtcd->postproc.across           = vp8_mbpost_proc_across_ip_c;
+    rtcd->postproc.downacross       = vp8_post_proc_down_and_across_c;
+    rtcd->postproc.addnoise         = vp8_plane_add_noise_c;
+    rtcd->postproc.blend_mb_inner   = vp8_blend_mb_inner_c;
+    rtcd->postproc.blend_mb_outer   = vp8_blend_mb_outer_c;
+    rtcd->postproc.blend_b          = vp8_blend_b_c;
 #endif
 
 #endif
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_common_init(ctx);
 #endif
 
 #if ARCH_ARM
     vp8_arch_arm_common_init(ctx);
 #endif
 
+#if CONFIG_MULTITHREAD
+    ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
 }
--- a/media/libvpx/vp8/common/loopfilter.c
+++ b/media/libvpx/vp8/common/loopfilter.c
@@ -4,601 +4,622 @@
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
 
-#include "vpx_ports/config.h"
+#include "vpx_config.h"
 #include "loopfilter.h"
 #include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
 
 typedef unsigned char uc;
 
-
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
 prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
 prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
-prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
+
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
 
 /* Horizontal MB filtering */
-void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Vertical MB Filtering */
-void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+    vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
+        vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->uvmbflim, lfi->uvlim, lfi->uvmbthr, 1);
-}
-
-void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->mbthr, 2);
+        vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
 /* Horizontal B Filtering */
-void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 /* Vertical B Filtering */
-void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                          int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                          unsigned char *v_ptr, int y_stride, int uv_stride,
+                          loop_filter_info *lfi)
 {
-    (void) simpler_lpf;
-    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
 
     if (u_ptr)
-        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 
     if (v_ptr)
-        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->uvflim, lfi->uvlim, lfi->uvthr, 1);
+        vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+                           const unsigned char *blimit)
+{
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
 }
 
-void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                           int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+static void lf_init_lut(loop_filter_info_n *lfi)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    (void) simpler_lpf;
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
-    vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
+    int filt_lvl;
+
+    for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
+    {
+        if (filt_lvl >= 40)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+        }
+        else if (filt_lvl >= 20)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+        }
+        else if (filt_lvl >= 15)
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+        }
+        else
+        {
+            lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+            lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+        }
+    }
+
+    lfi->mode_lf_lut[DC_PRED] = 1;