Bug 1540760 - Add missing aarch64 files for ffvpx; r=jya
author Dan Minor <dminor@mozilla.com>
Wed, 01 May 2019 23:06:25 +0000
changeset 531306 742b7c0a4bdbbe5f4004b038b4b5b4467ef4484b
parent 531305 f40ae51578ac27c6ea38af1e2818a12ac0b93dbd
child 531307 d000d40067de32c45c46b39a413ad6a9d2949411
push id 11265
push user ffxbld-merge
push date Mon, 13 May 2019 10:53:39 +0000
treeherder mozilla-beta@77e0fe8dbdd3
reviewers jya
bugs 1540760
milestone 68.0a1
Bug 1540760 - Add missing aarch64 files for ffvpx; r=jya

Differential Revision: https://phabricator.services.mozilla.com/D27789
media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
media/ffvpx/libavcodec/aarch64/fft_neon.S
media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
media/ffvpx/libavcodec/aarch64/h264idct_neon.S
media/ffvpx/libavcodec/aarch64/h264pred_init.c
media/ffvpx/libavcodec/aarch64/h264pred_neon.S
media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
media/ffvpx/libavcodec/aarch64/idct.h
media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
media/ffvpx/libavcodec/aarch64/mdct_neon.S
media/ffvpx/libavcodec/aarch64/neon.S
media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
media/ffvpx/libavcodec/aarch64/videodsp.S
media/ffvpx/libavcodec/aarch64/videodsp_init.c
media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
media/ffvpx/libavcodec/aarch64/vp9lpf_neon.S
media/ffvpx/libavcodec/aarch64/vp9mc_16bpp_neon.S
media/ffvpx/libavcodec/aarch64/vp9mc_neon.S
media/ffvpx/libavutil/aarch64/asm.S
media/ffvpx/libavutil/aarch64/bswap.h
media/ffvpx/libavutil/aarch64/cpu.c
media/ffvpx/libavutil/aarch64/cpu.h
media/ffvpx/libavutil/aarch64/float_dsp_init.c
media/ffvpx/libavutil/aarch64/float_dsp_neon.S
media/ffvpx/libavutil/aarch64/timer.h
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/fft_init_aarch64.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+
+#include "libavcodec/fft.h"
+
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+av_cold void ff_fft_init_aarch64(FFTContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        s->fft_permute  = ff_fft_permute_neon;
+        s->fft_calc     = ff_fft_calc_neon;
+#if CONFIG_MDCT
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
+    }
+}
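
This init file follows FFmpeg's usual runtime-dispatch pattern: the generic ff_fft_init() fills the context with C function pointers, and the per-arch init then overwrites them when the CPU supports the extension. A minimal usage sketch (not part of the patch; input setup elided):

    FFTComplex z[1 << 10];      /* in-place input/output buffer */
    FFTContext ctx;
    ff_fft_init(&ctx, 10, 0);   /* 1024-point forward FFT; on aarch64 this
                                 * ends up calling ff_fft_init_aarch64(),
                                 * which installs the NEON pointers above
                                 * when have_neon() reports support */
    ctx.fft_permute(&ctx, z);   /* bit-reversal reordering, in place */
    ctx.fft_calc(&ctx, z);      /* transform, in place */
    ff_fft_end(&ctx);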
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/fft_neon.S
@@ -0,0 +1,442 @@
+/*
+ * ARM NEON optimised FFT
+ *
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2009 Naotoshi Nojiri
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define M_SQRT1_2 0.70710678118654752440
+
+.macro transpose d0, d1, s0, s1
+        trn1            \d0, \s0, \s1
+        trn2            \d1, \s0, \s1
+.endm
+
+
+function fft4_neon
+        ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+        fadd            v4.2s,  v0.2s,  v1.2s   // r0+r1,i0+i1
+        fsub            v6.2s,  v0.2s,  v1.2s   // r0-r1,i0-i1
+
+        ext             v16.8b, v2.8b,  v3.8b,  #4
+        ext             v17.8b, v3.8b,  v2.8b,  #4
+
+        fadd            v5.2s,  v2.2s,  v3.2s   // i2+i3,r2+r3
+        fsub            v7.2s,  v16.2s, v17.2s  // r3-r2,i2-i3
+
+        fadd            v0.2s,  v4.2s,  v5.2s
+        fsub            v2.2s,  v4.2s,  v5.2s
+        fadd            v1.2s,  v6.2s,  v7.2s
+        fsub            v3.2s,  v6.2s,  v7.2s
+
+        st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
+
+        ret
+endfunc
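
For reference, a scalar sketch of the 4-point butterfly fft4_neon computes, grouped the same way as the register comments above (v4/v6 hold the z0±z1 sums and differences, v5/v7 the z2/z3 combinations):

    static void fft4_ref(FFTComplex *z)
    {
        FFTSample ar = z[0].re + z[1].re, ai = z[0].im + z[1].im; /* v4 */
        FFTSample br = z[0].re - z[1].re, bi = z[0].im - z[1].im; /* v6 */
        FFTSample cr = z[2].re + z[3].re, ci = z[2].im + z[3].im; /* v5 */
        FFTSample dr = z[2].im - z[3].im, di = z[3].re - z[2].re; /* v7 */
        z[0].re = ar + cr;  z[0].im = ai + ci;
        z[1].re = br + dr;  z[1].im = bi + di;
        z[2].re = ar - cr;  z[2].im = ai - ci;
        z[3].re = br - dr;  z[3].im = bi - di;
    }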
+
+function fft8_neon
+        mov             x1,  x0
+        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
+        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+        ext             v22.8b, v2.8b,  v3.8b,  #4
+        ext             v23.8b, v3.8b,  v2.8b,  #4
+        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
+        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
+        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
+        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
+        rev64           v27.2s, v28.2s  // ???
+        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
+        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
+        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
+        ext             v6.8b,  v4.8b,  v5.8b,  #4
+        ext             v7.8b,  v5.8b,  v4.8b,  #4
+        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
+        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
+        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
+        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
+        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
+        fadd            v0.2s,  v20.2s, v21.2s
+        fsub            v2.2s,  v20.2s, v21.2s
+        fadd            v1.2s,  v22.2s, v23.2s
+        rev64           v26.2s, v26.2s
+        rev64           v27.2s, v27.2s
+        fsub            v3.2s,  v22.2s, v23.2s
+        fsub            v6.2s,  v6.2s,  v7.2s
+        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
+        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
+        fadd            v7.2s,  v4.2s,  v5.2s
+        fsub            v18.2s, v2.2s,  v6.2s
+        ext             v26.8b, v24.8b, v25.8b, #4
+        ext             v27.8b, v25.8b, v24.8b, #4
+        fadd            v2.2s,  v2.2s,  v6.2s
+        fsub            v16.2s, v0.2s,  v7.2s
+        fadd            v5.2s,  v25.2s, v24.2s
+        fsub            v4.2s,  v26.2s, v27.2s
+        fadd            v0.2s,  v0.2s,  v7.2s
+        fsub            v17.2s, v1.2s,  v5.2s
+        fsub            v19.2s, v3.2s,  v4.2s
+        fadd            v3.2s,  v3.2s,  v4.2s
+        fadd            v1.2s,  v1.2s,  v5.2s
+
+        st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
+        st1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x1]
+
+        ret
+endfunc
+
+function fft16_neon
+        mov             x1,  x0
+        ld1             {v0.2s, v1.2s, v2.2s, v3.2s},  [x0], #32
+        ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
+        ext             v22.8b, v2.8b,  v3.8b,  #4
+        ext             v23.8b, v3.8b,  v2.8b,  #4
+        fadd            v4.2s,  v16.2s, v17.2s           // r4+r5,i4+i5
+        fadd            v5.2s,  v18.2s, v19.2s           // r6+r7,i6+i7
+        fsub            v17.2s, v16.2s, v17.2s           // r4-r5,i4-i5
+        fsub            v19.2s, v18.2s, v19.2s           // r6-r7,i6-i7
+        rev64           v27.2s, v28.2s  // ???
+        fadd            v20.2s, v0.2s,  v1.2s            // r0+r1,i0+i1
+        fadd            v21.2s, v2.2s,  v3.2s            // r2+r3,i2+i3
+        fmul            v26.2s, v17.2s, v28.2s           // -a2r*w,a2i*w
+        ext             v6.8b,  v4.8b,  v5.8b,  #4
+        ext             v7.8b,  v5.8b,  v4.8b,  #4
+        fmul            v27.2s, v19.2s, v27.2s           // a3r*w,-a3i*w
+        fsub            v23.2s, v22.2s, v23.2s           // i2-i3,r3-r2
+        fsub            v22.2s, v0.2s,  v1.2s            // r0-r1,i0-i1
+        fmul            v24.2s, v17.2s, v28.s[1]         // a2r*w,a2i*w
+        fmul            v25.2s, v19.2s, v28.s[1]         // a3r*w,a3i*w
+        fadd            v0.2s,  v20.2s, v21.2s
+        fsub            v2.2s,  v20.2s, v21.2s
+        fadd            v1.2s,  v22.2s, v23.2s
+        rev64           v26.2s, v26.2s
+        rev64           v27.2s, v27.2s
+        fsub            v3.2s,  v22.2s, v23.2s
+        fsub            v6.2s,  v6.2s,  v7.2s
+        fadd            v24.2s, v24.2s, v26.2s  // a2r+a2i,a2i-a2r   t1,t2
+        fadd            v25.2s, v25.2s, v27.2s  // a3r-a3i,a3i+a3r   t5,t6
+        fadd            v7.2s,  v4.2s,  v5.2s
+        fsub            v18.2s, v2.2s,  v6.2s
+        ld1             {v20.4s,v21.4s}, [x0], #32
+        ld1             {v22.4s,v23.4s}, [x0], #32
+        ext             v26.8b, v24.8b, v25.8b, #4
+        ext             v27.8b, v25.8b, v24.8b, #4
+        fadd            v2.2s,  v2.2s,  v6.2s
+        fsub            v16.2s, v0.2s,  v7.2s
+        fadd            v5.2s,  v25.2s, v24.2s
+        fsub            v4.2s,  v26.2s, v27.2s
+        transpose       v24.2d, v25.2d, v20.2d, v22.2d
+        transpose       v26.2d, v27.2d, v21.2d, v23.2d
+        fadd            v0.2s,  v0.2s,  v7.2s
+        fsub            v17.2s, v1.2s,  v5.2s
+        fsub            v19.2s, v3.2s,  v4.2s
+        fadd            v3.2s,  v3.2s,  v4.2s
+        fadd            v1.2s,  v1.2s,  v5.2s
+        ext             v20.16b, v21.16b, v21.16b,  #4
+        ext             v21.16b, v23.16b, v23.16b,  #4
+
+        zip1            v0.2d,  v0.2d,  v1.2d   // {z[0],   z[1]}
+        zip1            v1.2d,  v2.2d,  v3.2d   // {z[2],   z[3]}
+        zip1            v2.2d,  v16.2d, v17.2d  // {z[o1],  z[o1+1]}
+        zip1            v3.2d,  v18.2d, v19.2d  // {z[o1+2],z[o1+3]}
+
+        // 2 x fft4
+        transpose       v22.2d, v23.2d, v20.2d, v21.2d
+
+        fadd            v4.4s,  v24.4s, v25.4s
+        fadd            v5.4s,  v26.4s, v27.4s
+        fsub            v6.4s,  v24.4s, v25.4s
+        fsub            v7.4s,  v22.4s, v23.4s
+
+        ld1             {v23.4s},  [x14]
+
+        fadd            v24.4s, v4.4s,  v5.4s   // {z[o2+0],z[o2+1]}
+        fsub            v26.4s, v4.4s,  v5.4s   // {z[o2+2],z[o2+3]}
+        fadd            v25.4s, v6.4s,  v7.4s   // {z[o3+0],z[o3+1]}
+        fsub            v27.4s, v6.4s,  v7.4s   // {z[o3+2],z[o3+3]}
+
+        //fft_pass_neon_16
+        rev64           v7.4s,  v25.4s
+        fmul            v25.4s, v25.4s, v23.s[1]
+        fmul            v7.4s,  v7.4s,  v29.4s
+        fmla            v25.4s, v7.4s,  v23.s[3] // {t1a,t2a,t5a,t6a}
+
+        zip1            v20.4s, v24.4s, v25.4s
+        zip2            v21.4s, v24.4s, v25.4s
+        fneg            v22.4s, v20.4s
+        fadd            v4.4s,  v21.4s, v20.4s
+        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
+        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fsub            v20.4s, v0.4s,  v4.4s   // {z[o2],z[o2+1]}
+        fadd            v16.4s, v0.4s,  v4.4s   // {z[0], z[1]}
+        fsub            v22.4s, v2.4s,  v5.4s   // {z[o3],z[o3+1]}
+        fadd            v18.4s, v2.4s,  v5.4s   // {z[o1],z[o1+1]}
+
+//second half
+        rev64           v6.4s,  v26.4s
+        fmul            v26.4s, v26.4s, v23.s[2]
+        rev64           v7.4s,  v27.4s
+        fmul            v27.4s, v27.4s, v23.s[3]
+        fmul            v6.4s,  v6.4s,  v29.4s
+        fmul            v7.4s,  v7.4s,  v29.4s
+        fmla            v26.4s, v6.4s,  v23.s[2] // {t1,t2,t5,t6}
+        fmla            v27.4s, v7.4s,  v23.s[1] // {t1a,t2a,t5a,t6a}
+
+        zip1            v24.4s, v26.4s, v27.4s
+        zip2            v25.4s, v26.4s, v27.4s
+        fneg            v26.4s, v24.4s
+        fadd            v4.4s,  v25.4s, v24.4s
+        fsub            v6.4s,  v24.4s, v25.4s  // just the second half
+        fadd            v5.4s,  v25.4s, v26.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fadd            v17.4s, v1.4s, v4.4s    // {z[2], z[3]}
+        fsub            v21.4s, v1.4s, v4.4s    // {z[o2+2],z[o2+3]}
+        fadd            v19.4s, v3.4s, v5.4s    // {z[o1+2],z[o1+3]}
+        fsub            v23.4s, v3.4s, v5.4s    // {z[o3+2],z[o3+3]}
+
+        st1             {v16.4s,v17.4s}, [x1], #32
+        st1             {v18.4s,v19.4s}, [x1], #32
+        st1             {v20.4s,v21.4s}, [x1], #32
+        st1             {v22.4s,v23.4s}, [x1], #32
+
+        ret
+endfunc
+
+
+const  trans4_float, align=4
+        .byte    0,  1,  2,  3
+        .byte    8,  9, 10, 11
+        .byte    4,  5,  6,  7
+        .byte   12, 13, 14, 15
+endconst
+
+const  trans8_float, align=4
+        .byte   24, 25, 26, 27
+        .byte    0,  1,  2,  3
+        .byte   28, 29, 30, 31
+        .byte    4,  5,  6,  7
+endconst
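
These tables feed the tbl byte shuffles in the combine passes. trans4_float swaps the middle two 32-bit lanes of a single vector, {a,b,c,d} -> {a,c,b,d}; trans8_float gathers from the two-vector table {v5,v6}, producing {v6[2], v5[0], v6[3], v5[1]}. The trans4 case as scalar C (a sketch):

    static inline void trans4_float_ref(float v[4])
    {
        float t = v[1];     /* lane 1, bytes 4..7        */
        v[1]    = v[2];     /* bytes 8..11 move to lane 1 */
        v[2]    = t;        /* bytes 4..7  move to lane 2 */
    }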
+
+function fft_pass_neon
+        sub             x6,  x2,  #1            // n - 1, loop counter
+        lsl             x5,  x2,  #3            // 2 * n * sizeof FFTSample
+        lsl             x1,  x2,  #4            // 2 * n * sizeof FFTComplex
+        add             x5,  x4,  x5            // wim
+        add             x3,  x1,  x2,  lsl #5   // 4 * n * sizeof FFTComplex
+        add             x2,  x0,  x2,  lsl #5   // &z[o2]
+        add             x3,  x0,  x3            // &z[o3]
+        add             x1,  x0,  x1            // &z[o1]
+        ld1             {v20.4s},[x2]           // {z[o2],z[o2+1]}
+        ld1             {v22.4s},[x3]           // {z[o3],z[o3+1]}
+        ld1             {v4.2s},  [x4], #8      // {wre[0],wre[1]}
+        trn2            v25.2d, v20.2d, v22.2d
+        sub             x5,  x5,  #4            // wim--
+        trn1            v24.2d, v20.2d, v22.2d
+        ld1             {v5.s}[0],  [x5], x7    // d5[0] = wim[-1]
+        rev64           v7.4s,  v25.4s
+        fmul            v25.4s, v25.4s, v4.s[1]
+        ld1             {v16.4s}, [x0]          // {z[0],z[1]}
+        fmul            v7.4s,  v7.4s,  v29.4s
+        ld1             {v17.4s}, [x1]          // {z[o1],z[o1+1]}
+        prfm            pldl1keep, [x2, #16]
+        prfm            pldl1keep, [x3, #16]
+        fmla            v25.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
+        prfm            pldl1keep, [x0, #16]
+        prfm            pldl1keep, [x1, #16]
+
+        zip1            v20.4s, v24.4s, v25.4s
+        zip2            v21.4s, v24.4s, v25.4s
+        fneg            v22.4s, v20.4s
+        fadd            v4.4s,  v21.4s, v20.4s
+        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
+        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fadd            v20.4s, v16.4s, v4.4s
+        fsub            v22.4s, v16.4s, v4.4s
+        fadd            v21.4s, v17.4s, v5.4s
+        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
+        fsub            v23.4s, v17.4s, v5.4s
+
+        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
+        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
+        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
+1:
+        ld1             {v20.4s},[x2]    // {z[o2],z[o2+1]}
+        ld1             {v22.4s},[x3]    // {z[o3],z[o3+1]}
+        ld1             {v4.2s}, [x4], #8       // {wre[0],wre[1]}
+        transpose       v26.2d, v27.2d, v20.2d, v22.2d
+        ld1             {v5.2s}, [x5], x7       // {wim[-1],wim[0]}
+        rev64           v6.4s,  v26.4s
+        fmul            v26.4s, v26.4s, v4.s[0]
+        rev64           v7.4s,  v27.4s
+        fmul            v27.4s, v27.4s, v4.s[1]
+        fmul            v6.4s,  v6.4s,  v29.4s
+        fmul            v7.4s,  v7.4s,  v29.4s
+        ld1             {v16.4s},[x0]           // {z[0],z[1]}
+        fmla            v26.4s, v6.4s,  v5.s[1] // {t1,t2,t5,t6}
+        fmla            v27.4s, v7.4s,  v5.s[0] // {t1a,t2a,t5a,t6a}
+        ld1             {v17.4s},[x1]           // {z[o1],z[o1+1]}
+
+        subs            x6,  x6,  #1            // n--
+
+        zip1            v20.4s, v26.4s, v27.4s
+        zip2            v21.4s, v26.4s, v27.4s
+        fneg            v22.4s, v20.4s
+        fadd            v4.4s,  v21.4s, v20.4s
+        fsub            v6.4s,  v20.4s, v21.4s  // just the second half
+        fadd            v5.4s,  v21.4s, v22.4s  // just the first half
+
+        tbl             v4.16b, {v4.16b},        v30.16b // trans4_float
+        tbl             v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
+
+        fadd            v20.4s, v16.4s, v4.4s
+        fsub            v22.4s, v16.4s, v4.4s
+        fadd            v21.4s, v17.4s, v5.4s
+        st1             {v20.4s}, [x0], #16     // {z[0], z[1]}
+        fsub            v23.4s, v17.4s, v5.4s
+
+        st1             {v21.4s}, [x1], #16     // {z[o1],z[o1+1]}
+        st1             {v22.4s}, [x2], #16     // {z[o2],z[o2+1]}
+        st1             {v23.4s}, [x3], #16     // {z[o3],z[o3+1]}
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro  def_fft n, n2, n4
+function fft\n\()_neon, align=6
+        sub             sp,  sp,  #16
+        stp             x28, x30, [sp]
+        add             x28, x0,  #\n4*2*8
+        bl              fft\n2\()_neon
+        mov             x0,  x28
+        bl              fft\n4\()_neon
+        add             x0,  x28, #\n4*1*8
+        bl              fft\n4\()_neon
+        sub             x0,  x28, #\n4*2*8
+        ldp             x28, x30, [sp], #16
+        movrel          x4,  X(ff_cos_\n)
+        mov             x2,  #\n4>>1
+        b               fft_pass_neon
+endfunc
+.endm
+
+        def_fft    32,    16,     8
+        def_fft    64,    32,    16
+        def_fft   128,    64,    32
+        def_fft   256,   128,    64
+        def_fft   512,   256,   128
+        def_fft  1024,   512,   256
+        def_fft  2048,  1024,   512
+        def_fft  4096,  2048,  1024
+        def_fft  8192,  4096,  2048
+        def_fft 16384,  8192,  4096
+        def_fft 32768, 16384,  8192
+        def_fft 65536, 32768, 16384
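
def_fft encodes the usual recursive decomposition: an N-point transform is an N/2-point transform on the first half, two N/4-point transforms on the remaining quarters, and a combining pass over N/8 butterfly groups driven by the ff_cos_N twiddle table. As a C sketch (fft_base, cos_table and fft_pass are stand-in names for the fft4/8/16 leaves, the ff_cos_N tables and fft_pass_neon above):

    static void fft_recurse(FFTComplex *z, int n)
    {
        if (n <= 16) { fft_base(z, n); return; }   /* fft4/8/16 leaves */
        fft_recurse(z,             n / 2);
        fft_recurse(z + n / 2,     n / 4);
        fft_recurse(z + 3 * n / 4, n / 4);
        fft_pass(z, cos_table(n), n / 8);          /* combine the parts */
    }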
+
+function ff_fft_calc_neon, export=1
+        prfm            pldl1keep, [x1]
+        movrel          x10, trans4_float
+        ldr             w2,  [x0]
+        movrel          x11, trans8_float
+        sub             w2,  w2,  #2
+        movrel          x3,  fft_tab_neon
+        ld1             {v30.16b}, [x10]
+        mov             x7,  #-8
+        movrel          x12, pmmp
+        ldr             x3,  [x3, x2, lsl #3]
+        movrel          x13, mppm
+        movrel          x14, X(ff_cos_16)
+        ld1             {v31.16b}, [x11]
+        mov             x0,  x1
+        ld1             {v29.4s},  [x12]         // pmmp
+        ld1             {v28.4s},  [x13]
+        br              x3
+endfunc
+
+function ff_fft_permute_neon, export=1
+        mov             x6,  #1
+        ldr             w2,  [x0]       // nbits
+        ldr             x3,  [x0, #16]  // tmp_buf
+        ldr             x0,  [x0, #8]   // revtab
+        lsl             x6,  x6, x2
+        mov             x2,  x6
+1:
+        ld1             {v0.2s,v1.2s}, [x1], #16
+        ldr             w4,  [x0], #4
+        uxth            w5,  w4
+        lsr             w4,  w4,  #16
+        add             x5,  x3,  x5,  lsl #3
+        add             x4,  x3,  x4,  lsl #3
+        st1             {v0.2s}, [x5]
+        st1             {v1.2s}, [x4]
+        subs            x6,  x6, #2
+        b.gt            1b
+
+        sub             x1,  x1,  x2,  lsl #3
+1:
+        ld1             {v0.4s,v1.4s}, [x3], #32
+        st1             {v0.4s,v1.4s}, [x1], #32
+        subs            x2,  x2,  #4
+        b.gt            1b
+
+        ret
+endfunc
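
A scalar sketch of the permutation above: pairs of input complexes are scattered through the 16-bit revtab indices into tmp_buf, and tmp_buf is then copied back over the input:

    int n = 1 << s->nbits;
    for (int i = 0; i < n; i += 2) {
        s->tmp_buf[s->revtab[i]]     = z[i];
        s->tmp_buf[s->revtab[i + 1]] = z[i + 1];
    }
    memcpy(z, s->tmp_buf, n * sizeof(*z));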
+
+const   fft_tab_neon, relocate=1
+        .quad fft4_neon
+        .quad fft8_neon
+        .quad fft16_neon
+        .quad fft32_neon
+        .quad fft64_neon
+        .quad fft128_neon
+        .quad fft256_neon
+        .quad fft512_neon
+        .quad fft1024_neon
+        .quad fft2048_neon
+        .quad fft4096_neon
+        .quad fft8192_neon
+        .quad fft16384_neon
+        .quad fft32768_neon
+        .quad fft65536_neon
+endconst
+
+const   pmmp, align=4
+        .float          +1.0, -1.0, -1.0, +1.0
+endconst
+
+const   mppm, align=4
+        .float          -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+endconst
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264chroma_init_aarch64.c
@@ -0,0 +1,59 @@
+/*
+ * ARM NEON optimised H.264 chroma functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264chroma.h"
+
+#include "config.h"
+
+void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+
+void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+
+av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
+{
+    const int high_bit_depth = bit_depth > 8;
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags) && !high_bit_depth) {
+        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
+
+        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
+        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
+        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
+    }
+}
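
H.264 chroma motion compensation is bilinear interpolation with eighth-pel fractions x, y in [0,7]. The assembly in h264cmc_neon.S below materializes the four weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy (with A+B+C+D = 64) in w4/w12/w6/w7. A scalar sketch of the general mc8 path (x and y both nonzero):

    static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y), B = x * (8 - y);
        const int C = (8 - x) * y,       D = x * y;
        for (int j = 0; j < h; j++) {
            for (int i = 0; i < 8; i++)
                dst[i] = (A * src[i]          + B * src[i + 1] +
                          C * src[i + stride] + D * src[i + stride + 1] +
                          32) >> 6;            /* rshrn #6 in the asm */
            dst += stride;
            src += stride;
        }
    }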
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264cmc_neon.S
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
+.macro  h264_chroma_mc8 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
+  .ifc \type,avg
+        mov             x8,  x0
+  .endif
+        prfm            pldl1strm, [x1]
+        prfm            pldl1strm, [x1, x2]
+  .ifc \codec,rv40
+        movrel          x6,  rv40bias
+        lsr             w9,  w5,  #1
+        lsr             w10, w4,  #1
+        lsl             w9,  w9,  #3
+        lsl             w10, w10, #1
+        add             w9,  w9,  w10
+        add             x6,  x6,  w9, UXTW
+        ld1r            {v22.8H}, [x6]
+  .endif
+  .ifc \codec,vc1
+        movi            v22.8H,   #28
+  .endif
+        mul             w7,  w4,  w5
+        lsl             w14, w5,  #3
+        lsl             w13, w4,  #3
+        cmp             w7,  #0
+        sub             w6,  w14, w7
+        sub             w12, w13, w7
+        sub             w4,  w7,  w13
+        sub             w4,  w4,  w14
+        add             w4,  w4,  #64
+        b.eq            2f
+
+        dup             v0.8B,  w4
+        dup             v1.8B,  w12
+        ld1             {v4.8B, v5.8B}, [x1], x2
+        dup             v2.8B,  w6
+        dup             v3.8B,  w7
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+1:      ld1             {v6.8B, v7.8B}, [x1], x2
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v5.8B,  v1.8B
+        ext             v7.8B,  v6.8B,  v7.8B,  #1
+        ld1             {v4.8B, v5.8B}, [x1], x2
+        umlal           v16.8H, v6.8B,  v2.8B
+        prfm            pldl1strm, [x1]
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+        umlal           v16.8H, v7.8B,  v3.8B
+        umull           v17.8H, v6.8B,  v0.8B
+        subs            w3,  w3,  #2
+        umlal           v17.8H, v7.8B, v1.8B
+        umlal           v17.8H, v4.8B, v2.8B
+        umlal           v17.8H, v5.8B, v3.8B
+        prfm            pldl1strm, [x1, x2]
+  .ifc \codec,h264
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
+  .else
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
+  .endif
+  .ifc \type,avg
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
+  .endif
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
+        b.gt            1b
+        ret
+
+2:      adds            w12, w12, w6
+        dup             v0.8B, w4
+        b.eq            5f
+        tst             w6,  w6
+        dup             v1.8B, w12
+        b.eq            4f
+
+        ld1             {v4.8B}, [x1], x2
+3:      ld1             {v6.8B}, [x1], x2
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v6.8B,  v1.8B
+        ld1             {v4.8B}, [x1], x2
+        umull           v17.8H, v6.8B,  v0.8B
+        umlal           v17.8H, v4.8B,  v1.8B
+        prfm            pldl1strm, [x1]
+  .ifc \codec,h264
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
+  .else
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
+  .endif
+        prfm            pldl1strm, [x1, x2]
+  .ifc \type,avg
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
+  .endif
+        subs            w3,  w3,  #2
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
+        b.gt            3b
+        ret
+
+4:      ld1             {v4.8B, v5.8B}, [x1], x2
+        ld1             {v6.8B, v7.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B,  #1
+        ext             v7.8B,  v6.8B,  v7.8B,  #1
+        prfm            pldl1strm, [x1]
+        subs            w3,  w3,  #2
+        umull           v16.8H, v4.8B, v0.8B
+        umlal           v16.8H, v5.8B, v1.8B
+        umull           v17.8H, v6.8B, v0.8B
+        umlal           v17.8H, v7.8B, v1.8B
+        prfm            pldl1strm, [x1, x2]
+  .ifc \codec,h264
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
+  .else
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
+  .endif
+  .ifc \type,avg
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
+  .endif
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
+        b.gt            4b
+        ret
+
+5:      ld1             {v4.8B}, [x1], x2
+        ld1             {v5.8B}, [x1], x2
+        prfm            pldl1strm, [x1]
+        subs            w3,  w3,  #2
+        umull           v16.8H, v4.8B, v0.8B
+        umull           v17.8H, v5.8B, v0.8B
+        prfm            pldl1strm, [x1, x2]
+  .ifc \codec,h264
+        rshrn           v16.8B, v16.8H, #6
+        rshrn           v17.8B, v17.8H, #6
+  .else
+        add             v16.8H, v16.8H, v22.8H
+        add             v17.8H, v17.8H, v22.8H
+        shrn            v16.8B, v16.8H, #6
+        shrn            v17.8B, v17.8H, #6
+  .endif
+  .ifc \type,avg
+        ld1             {v20.8B}, [x8], x2
+        ld1             {v21.8B}, [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+        urhadd          v17.8B, v17.8B, v21.8B
+  .endif
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
+        b.gt            5b
+        ret
+endfunc
+.endm
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
+.macro  h264_chroma_mc4 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
+  .ifc \type,avg
+        mov             x8,  x0
+  .endif
+        prfm            pldl1strm, [x1]
+        prfm            pldl1strm, [x1, x2]
+  .ifc \codec,rv40
+        movrel          x6,  rv40bias
+        lsr             w9,  w5,  #1
+        lsr             w10, w4,  #1
+        lsl             w9,  w9,  #3
+        lsl             w10, w10, #1
+        add             w9,  w9,  w10
+        add             x6,  x6,  w9, UXTW
+        ld1r            {v22.8H}, [x6]
+  .endif
+  .ifc \codec,vc1
+        movi            v22.8H,   #28
+  .endif
+        mul             w7,  w4,  w5
+        lsl             w14, w5,  #3
+        lsl             w13, w4,  #3
+        cmp             w7,  #0
+        sub             w6,  w14, w7
+        sub             w12, w13, w7
+        sub             w4,  w7,  w13
+        sub             w4,  w4,  w14
+        add             w4,  w4,  #64
+        b.eq            2f
+
+        dup             v24.8B,  w4
+        dup             v25.8B,  w12
+        ld1             {v4.8B}, [x1], x2
+        dup             v26.8B,  w6
+        dup             v27.8B,  w7
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        trn1            v0.2S,  v24.2S, v25.2S
+        trn1            v2.2S,  v26.2S, v27.2S
+        trn1            v4.2S,  v4.2S,  v5.2S
+1:      ld1             {v6.8B}, [x1], x2
+        ext             v7.8B,  v6.8B,  v7.8B, #1
+        trn1            v6.2S,  v6.2S,  v7.2S
+        umull           v18.8H, v4.8B,  v0.8B
+        umlal           v18.8H, v6.8B,  v2.8B
+        ld1             {v4.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        trn1            v4.2S,  v4.2S,  v5.2S
+        prfm            pldl1strm, [x1]
+        umull           v19.8H, v6.8B,  v0.8B
+        umlal           v19.8H, v4.8B,  v2.8B
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
+  .ifc \codec,h264
+        rshrn           v16.8B, v18.8H, #6
+  .else
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
+  .endif
+        subs            w3,  w3,  #2
+        prfm            pldl1strm, [x1, x2]
+  .ifc \type,avg
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+  .endif
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
+        b.gt            1b
+        ret
+
+2:      adds            w12, w12, w6
+        dup             v30.8B, w4
+        b.eq            5f
+        tst             w6,  w6
+        dup             v31.8B, w12
+        trn1            v0.2S,  v30.2S, v31.2S
+        trn2            v1.2S,  v30.2S, v31.2S
+        b.eq            4f
+
+        ext             v1.8B,  v0.8B,  v1.8B, #4
+        ld1             {v4.S}[0], [x1], x2
+3:      ld1             {v4.S}[1], [x1], x2
+        umull           v18.8H, v4.8B,  v0.8B
+        ld1             {v4.S}[0], [x1], x2
+        umull           v19.8H, v4.8B,  v1.8B
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
+        prfm            pldl1strm, [x1]
+  .ifc \codec,h264
+        rshrn           v16.8B, v18.8H, #6
+  .else
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
+  .endif
+  .ifc \type,avg
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+  .endif
+        subs            w3,  w3,  #2
+        prfm            pldl1strm, [x1, x2]
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
+        b.gt            3b
+        ret
+
+4:      ld1             {v4.8B}, [x1], x2
+        ld1             {v6.8B}, [x1], x2
+        ext             v5.8B,  v4.8B,  v5.8B, #1
+        ext             v7.8B,  v6.8B,  v7.8B, #1
+        trn1            v4.2S,  v4.2S,  v5.2S
+        trn1            v6.2S,  v6.2S,  v7.2S
+        umull           v18.8H, v4.8B,  v0.8B
+        umull           v19.8H, v6.8B,  v0.8B
+        subs            w3,  w3,  #2
+        trn1            v30.2D, v18.2D, v19.2D
+        trn2            v31.2D, v18.2D, v19.2D
+        add             v18.8H, v30.8H, v31.8H
+        prfm            pldl1strm, [x1]
+  .ifc \codec,h264
+        rshrn           v16.8B, v18.8H, #6
+  .else
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
+  .endif
+  .ifc \type,avg
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+  .endif
+        prfm            pldl1strm, [x1]
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
+        b.gt            4b
+        ret
+
+5:      ld1             {v4.S}[0], [x1], x2
+        ld1             {v4.S}[1], [x1], x2
+        umull           v18.8H, v4.8B,  v30.8B
+        subs            w3,  w3,  #2
+        prfm            pldl1strm, [x1]
+  .ifc \codec,h264
+        rshrn           v16.8B, v18.8H, #6
+  .else
+        add             v18.8H, v18.8H, v22.8H
+        shrn            v16.8B, v18.8H, #6
+  .endif
+  .ifc \type,avg
+        ld1             {v20.S}[0], [x8], x2
+        ld1             {v20.S}[1], [x8], x2
+        urhadd          v16.8B, v16.8B, v20.8B
+  .endif
+        prfm            pldl1strm, [x1]
+        st1             {v16.S}[0], [x0], x2
+        st1             {v16.S}[1], [x0], x2
+        b.gt            5b
+        ret
+endfunc
+.endm
+
+.macro  h264_chroma_mc2 type
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+        prfm            pldl1strm, [x1]
+        prfm            pldl1strm, [x1, x2]
+        orr             w7,  w4,  w5
+        cbz             w7,  2f
+
+        mul             w7,  w4,  w5
+        lsl             w14, w5,  #3
+        lsl             w13, w4,  #3
+        sub             w6,  w14, w7
+        sub             w12, w13, w7
+        sub             w4,  w7,  w13
+        sub             w4,  w4,  w14
+        add             w4,  w4,  #64
+        dup             v0.8B,  w4
+        dup             v2.8B,  w12
+        dup             v1.8B,  w6
+        dup             v3.8B,  w7
+        trn1            v0.4H,  v0.4H,  v2.4H
+        trn1            v1.4H,  v1.4H,  v3.4H
+1:
+        ld1             {v4.S}[0],  [x1], x2
+        ld1             {v4.S}[1],  [x1], x2
+        rev64           v5.2S,  v4.2S
+        ld1             {v5.S}[1],  [x1]
+        ext             v6.8B,  v4.8B,  v5.8B,  #1
+        ext             v7.8B,  v5.8B,  v4.8B,  #1
+        trn1            v4.4H,  v4.4H,  v6.4H
+        trn1            v5.4H,  v5.4H,  v7.4H
+        umull           v16.8H, v4.8B,  v0.8B
+        umlal           v16.8H, v5.8B,  v1.8B
+  .ifc \type,avg
+        ld1             {v18.H}[0], [x0], x2
+        ld1             {v18.H}[2], [x0]
+        sub             x0,  x0,  x2
+  .endif
+        rev64           v17.4S, v16.4S
+        add             v16.8H, v16.8H, v17.8H
+        rshrn           v16.8B, v16.8H, #6
+  .ifc \type,avg
+        urhadd          v16.8B, v16.8B, v18.8B
+  .endif
+        st1             {v16.H}[0], [x0], x2
+        st1             {v16.H}[2], [x0], x2
+        subs            w3,  w3,  #2
+        b.gt            1b
+        ret
+
+2:
+        ld1             {v16.H}[0], [x1], x2
+        ld1             {v16.H}[1], [x1], x2
+  .ifc \type,avg
+        ld1             {v18.H}[0], [x0], x2
+        ld1             {v18.H}[1], [x0]
+        sub             x0,  x0,  x2
+        urhadd          v16.8B, v16.8B, v18.8B
+  .endif
+        st1             {v16.H}[0], [x0], x2
+        st1             {v16.H}[1], [x0], x2
+        subs            w3,  w3,  #2
+        b.gt            2b
+        ret
+endfunc
+.endm
+
+        h264_chroma_mc8 put
+        h264_chroma_mc8 avg
+        h264_chroma_mc4 put
+        h264_chroma_mc4 avg
+        h264_chroma_mc2 put
+        h264_chroma_mc2 avg
+
+#if CONFIG_RV40_DECODER
+const   rv40bias
+        .short           0, 16, 32, 16
+        .short          32, 28, 32, 28
+        .short           0, 32, 16, 32
+        .short          32, 28, 32, 28
+endconst
+
+        h264_chroma_mc8 put, rv40
+        h264_chroma_mc8 avg, rv40
+        h264_chroma_mc4 put, rv40
+        h264_chroma_mc4 avg, rv40
+#endif
+
+#if CONFIG_VC1DSP
+        h264_chroma_mc8 put, vc1
+        h264_chroma_mc8 avg, vc1
+        h264_chroma_mc4 put, vc1
+        h264_chroma_mc4 avg, vc1
+#endif
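
The codec argument only changes the final rounding step of each block: the h264 variants round and narrow with rshrn #6, while the rv40 and vc1 variants add a bias first (an rv40bias entry selected from the x/y halves, or the constant 28) and truncate with shrn #6. In C terms (a sketch):

    out_h264 = (acc + 32)   >> 6;   /* rshrn #6          */
    out_rv40 = (acc + bias) >> 6;   /* add v22, shrn #6  */
    out_vc1  = (acc + 28)   >> 6;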
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264dsp_init_aarch64.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264dsp.h"
+
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                     int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+                                       int beta, int8_t *tc0);
+
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+                                   int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+                                  int log2_den, int weight, int offset);
+
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                     int height, int log2_den, int weightd,
+                                     int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int height, int log2_den, int weightd,
+                                    int weights, int offset);
+
+void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
+                             int16_t *block, int stride,
+                             const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
+                                  int16_t *block, int stride,
+                                  const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+                            int16_t *block, int stride,
+                            const uint8_t nnzc[6*8]);
+
+void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
+                             int16_t *block, int stride,
+                             const uint8_t nnzc[6*8]);
+
+av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
+                                     const int chroma_format_idc)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags) && bit_depth == 8) {
+        c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
+        c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
+        c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+        if (chroma_format_idc <= 1)
+            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
+
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
+
+        c->h264_idct_add        = ff_h264_idct_add_neon;
+        c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
+        c->h264_idct_add16      = ff_h264_idct_add16_neon;
+        c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+        if (chroma_format_idc <= 1)
+            c->h264_idct_add8   = ff_h264_idct_add8_neon;
+        c->h264_idct8_add       = ff_h264_idct8_add_neon;
+        c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
+        c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
+    }
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264dsp_neon.S
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+.macro  h264_loop_filter_start
+        cmp             w2,  #0
+        ldr             w6,  [x4]
+        ccmp            w3,  #0, #0, ne
+        mov             v24.S[0], w6
+        and             w6,  w6,  w6,  lsl #16
+        b.eq            1f
+        ands            w6,  w6,  w6,  lsl #8
+        b.ge            2f
+1:
+        ret
+2:
+.endm
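
The word-sized sign trick above packs the four int8_t tc0 values and returns early when alpha == 0, beta == 0, or all four tc0[i] are negative (tc0[i] < 0 marks an edge that is not filtered). A scalar sketch:

    uint32_t tc = AV_RN32(tc0);   /* the four int8_t entries */
    if (!alpha || !beta)
        return;
    tc &= tc << 16;
    tc &= tc << 8;                /* sign bit survives only if all four < 0 */
    if ((int32_t)tc < 0)
        return;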
+
+.macro  h264_loop_filter_luma
+        dup             v22.16B, w2                     // alpha
+        uxtl            v24.8H,  v24.8B
+        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
+        uxtl            v24.4S,  v24.4H
+        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
+        sli             v24.8H,  v24.8H,  #8
+        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
+        sli             v24.4S,  v24.4S,  #16
+        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
+        dup             v22.16B, w3                     // beta
+        cmlt            v23.16B, v24.16B, #0
+        cmhi            v28.16B, v22.16B, v28.16B       // < beta
+        cmhi            v30.16B, v22.16B, v30.16B       // < beta
+        bic             v21.16B, v21.16B, v23.16B
+        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
+        and             v21.16B, v21.16B, v28.16B
+        uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
+        cmhi            v17.16B, v22.16B, v17.16B       // < beta
+        and             v21.16B, v21.16B, v30.16B
+        cmhi            v19.16B, v22.16B, v19.16B       // < beta
+        and             v17.16B, v17.16B, v21.16B
+        and             v19.16B, v19.16B, v21.16B
+        and             v24.16B, v24.16B, v21.16B
+        urhadd          v28.16B, v16.16B,  v0.16B
+        sub             v21.16B, v24.16B, v17.16B
+        uqadd           v23.16B, v18.16B, v24.16B
+        uhadd           v20.16B, v20.16B, v28.16B
+        sub             v21.16B, v21.16B, v19.16B
+        uhadd           v28.16B,  v4.16B, v28.16B
+        umin            v23.16B, v23.16B, v20.16B
+        uqsub           v22.16B, v18.16B, v24.16B
+        uqadd           v4.16B,   v2.16B, v24.16B
+        umax            v23.16B, v23.16B, v22.16B
+        uqsub           v22.16B,  v2.16B, v24.16B
+        umin            v28.16B,  v4.16B, v28.16B
+        uxtl            v4.8H,    v0.8B
+        umax            v28.16B, v28.16B, v22.16B
+        uxtl2           v20.8H,   v0.16B
+        usubw           v4.8H,    v4.8H,  v16.8B
+        usubw2          v20.8H,  v20.8H,  v16.16B
+        shl             v4.8H,    v4.8H,  #2
+        shl             v20.8H,  v20.8H,  #2
+        uaddw           v4.8H,    v4.8H,  v18.8B
+        uaddw2          v20.8H,  v20.8H,  v18.16B
+        usubw           v4.8H,    v4.8H,   v2.8B
+        usubw2          v20.8H,  v20.8H,   v2.16B
+        rshrn           v4.8B,    v4.8H,  #3
+        rshrn2          v4.16B,  v20.8H,  #3
+        bsl             v17.16B, v23.16B, v18.16B
+        bsl             v19.16B, v28.16B,  v2.16B
+        neg             v23.16B, v21.16B
+        uxtl            v28.8H,  v16.8B
+        smin            v4.16B,   v4.16B, v21.16B
+        uxtl2           v21.8H,  v16.16B
+        smax            v4.16B,   v4.16B, v23.16B
+        uxtl            v22.8H,   v0.8B
+        uxtl2           v24.8H,   v0.16B
+        saddw           v28.8H,  v28.8H,  v4.8B
+        saddw2          v21.8H,  v21.8H,  v4.16B
+        ssubw           v22.8H,  v22.8H,  v4.8B
+        ssubw2          v24.8H,  v24.8H,  v4.16B
+        sqxtun          v16.8B,  v28.8H
+        sqxtun2         v16.16B, v21.8H
+        sqxtun          v0.8B,   v22.8H
+        sqxtun2         v0.16B,  v24.8H
+.endm
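
For reference, the per-pixel operation this macro vectorizes, as a scalar sketch after the spec (the beta checks on p2/q2 also widen the clipping range, exactly as the v17/v19 masks are subtracted from v24 above):

    if (FFABS(p0 - q0) < alpha && FFABS(p1 - p0) < beta &&
        FFABS(q1 - q0) < beta) {
        int ap = FFABS(p2 - p0) < beta;
        int aq = FFABS(q2 - q0) < beta;
        int tc = tc0 + ap + aq;
        int d  = av_clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
        if (ap)   /* filter p1 too */
            p1 += av_clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc0, tc0);
        if (aq)   /* filter q1 too */
            q1 += av_clip((q2 + ((p0 + q0 + 1) >> 1) - 2 * q1) >> 1, -tc0, tc0);
        p0 = av_clip_uint8(p0 + d);
        q0 = av_clip_uint8(q0 - d);
    }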
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+        sxtw            x1,  w1
+
+        ld1             {v0.16B},  [x0], x1
+        ld1             {v2.16B},  [x0], x1
+        ld1             {v4.16B},  [x0], x1
+        sub             x0,  x0,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #1
+        ld1             {v20.16B},  [x0], x1
+        ld1             {v18.16B},  [x0], x1
+        ld1             {v16.16B},  [x0], x1
+
+        h264_loop_filter_luma
+
+        sub             x0,  x0,  x1, lsl #1
+        st1             {v17.16B},  [x0], x1
+        st1             {v16.16B}, [x0], x1
+        st1             {v0.16B},  [x0], x1
+        st1             {v19.16B}, [x0]
+
+        ret
+endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  #4
+        ld1             {v6.8B},  [x0], x1
+        ld1             {v20.8B}, [x0], x1
+        ld1             {v18.8B}, [x0], x1
+        ld1             {v16.8B}, [x0], x1
+        ld1             {v0.8B},  [x0], x1
+        ld1             {v2.8B},  [x0], x1
+        ld1             {v4.8B},  [x0], x1
+        ld1             {v26.8B}, [x0], x1
+        ld1             {v6.D}[1],  [x0], x1
+        ld1             {v20.D}[1], [x0], x1
+        ld1             {v18.D}[1], [x0], x1
+        ld1             {v16.D}[1], [x0], x1
+        ld1             {v0.D}[1],  [x0], x1
+        ld1             {v2.D}[1],  [x0], x1
+        ld1             {v4.D}[1],  [x0], x1
+        ld1             {v26.D}[1], [x0], x1
+
+        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+        h264_loop_filter_luma
+
+        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
+
+        sub             x0,  x0,  x1, lsl #4
+        add             x0,  x0,  #2
+        st1             {v17.S}[0],  [x0], x1
+        st1             {v16.S}[0], [x0], x1
+        st1             {v0.S}[0],  [x0], x1
+        st1             {v19.S}[0], [x0], x1
+        st1             {v17.S}[1],  [x0], x1
+        st1             {v16.S}[1], [x0], x1
+        st1             {v0.S}[1],  [x0], x1
+        st1             {v19.S}[1], [x0], x1
+        st1             {v17.S}[2],  [x0], x1
+        st1             {v16.S}[2], [x0], x1
+        st1             {v0.S}[2],  [x0], x1
+        st1             {v19.S}[2], [x0], x1
+        st1             {v17.S}[3],  [x0], x1
+        st1             {v16.S}[3], [x0], x1
+        st1             {v0.S}[3],  [x0], x1
+        st1             {v19.S}[3], [x0], x1
+
+        ret
+endfunc
+
+.macro  h264_loop_filter_chroma
+        dup             v22.8B, w2              // alpha
+        uxtl            v24.8H, v24.8B
+        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
+        uxtl            v4.8H,  v0.8B
+        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
+        usubw           v4.8H,  v4.8H,  v16.8B
+        sli             v24.8H, v24.8H, #8
+        shl             v4.8H,  v4.8H,  #2
+        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
+        uaddw           v4.8H,  v4.8H,  v18.8B
+        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
+        usubw           v4.8H,  v4.8H,  v2.8B
+        dup             v22.8B, w3              // beta
+        rshrn           v4.8B,  v4.8H,  #3
+        cmhi            v28.8B, v22.8B, v28.8B  // < beta
+        cmhi            v30.8B, v22.8B, v30.8B  // < beta
+        smin            v4.8B,  v4.8B,  v24.8B
+        neg             v25.8B, v24.8B
+        and             v26.8B, v26.8B, v28.8B
+        smax            v4.8B,  v4.8B,  v25.8B
+        and             v26.8B, v26.8B, v30.8B
+        uxtl            v22.8H, v0.8B
+        and             v4.8B,  v4.8B,  v26.8B
+        uxtl            v28.8H, v16.8B
+        saddw           v28.8H, v28.8H, v4.8B
+        ssubw           v22.8H, v22.8H, v4.8B
+        sqxtun          v16.8B, v28.8H
+        sqxtun          v0.8B,  v22.8H
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  x1, lsl #1
+        ld1             {v18.8B}, [x0], x1
+        ld1             {v16.8B}, [x0], x1
+        ld1             {v0.8B},  [x0], x1
+        ld1             {v2.8B},  [x0]
+
+        h264_loop_filter_chroma
+
+        sub             x0,  x0,  x1, lsl #1
+        st1             {v16.8B}, [x0], x1
+        st1             {v0.8B},  [x0], x1
+
+        ret
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+        h264_loop_filter_start
+
+        sub             x0,  x0,  #2
+        ld1             {v18.S}[0], [x0], x1
+        ld1             {v16.S}[0], [x0], x1
+        ld1             {v0.S}[0],  [x0], x1
+        ld1             {v2.S}[0],  [x0], x1
+        ld1             {v18.S}[1], [x0], x1
+        ld1             {v16.S}[1], [x0], x1
+        ld1             {v0.S}[1],  [x0], x1
+        ld1             {v2.S}[1],  [x0], x1
+
+        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
+
+        h264_loop_filter_chroma
+
+        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
+
+        sub             x0,  x0,  x1, lsl #3
+        st1             {v18.S}[0], [x0], x1
+        st1             {v16.S}[0], [x0], x1
+        st1             {v0.S}[0],  [x0], x1
+        st1             {v2.S}[0],  [x0], x1
+        st1             {v18.S}[1], [x0], x1
+        st1             {v16.S}[1], [x0], x1
+        st1             {v0.S}[1],  [x0], x1
+        st1             {v2.S}[1],  [x0], x1
+
+        ret
+endfunc
+
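+// Bidirectional weighted prediction for a \w pixel wide block:
+//   dst = clip8((src0 * w0 + src1 * w1 + rnd) >> (log2_denom + 1))
+// \macs/\macd are umlal or umlsl depending on the signs of the two weights;
+// v16 holds the rounding term and v18 the shift (see biweight_func below).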
+.macro  biweight_16     macs, macd
+        dup             v0.16B,  w5
+        dup             v1.16B,  w6
+        mov             v4.16B,  v16.16B
+        mov             v6.16B,  v16.16B
+1:      subs            w3,  w3,  #2
+        ld1             {v20.16B}, [x0], x2
+        \macd           v4.8H,   v0.8B,  v20.8B
+        \macd\()2       v6.8H,   v0.16B, v20.16B
+        ld1             {v22.16B}, [x1], x2
+        \macs           v4.8H,   v1.8B,  v22.8B
+        \macs\()2       v6.8H,   v1.16B, v22.16B
+        mov             v24.16B, v16.16B
+        ld1             {v28.16B}, [x0], x2
+        mov             v26.16B, v16.16B
+        \macd           v24.8H,  v0.8B,  v28.8B
+        \macd\()2       v26.8H,  v0.16B, v28.16B
+        ld1             {v30.16B}, [x1], x2
+        \macs           v24.8H,  v1.8B,  v30.8B
+        \macs\()2       v26.8H,  v1.16B, v30.16B
+        sshl            v4.8H,   v4.8H,  v18.8H
+        sshl            v6.8H,   v6.8H,  v18.8H
+        sqxtun          v4.8B,   v4.8H
+        sqxtun2         v4.16B,  v6.8H
+        sshl            v24.8H,  v24.8H, v18.8H
+        sshl            v26.8H,  v26.8H, v18.8H
+        sqxtun          v24.8B,  v24.8H
+        sqxtun2         v24.16B, v26.8H
+        mov             v6.16B,  v16.16B
+        st1             {v4.16B},  [x7], x2
+        mov             v4.16B,  v16.16B
+        st1             {v24.16B}, [x7], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  biweight_8      macs, macd
+        dup             v0.8B,  w5
+        dup             v1.8B,  w6
+        mov             v2.16B,  v16.16B
+        mov             v20.16B, v16.16B
+1:      subs            w3,  w3,  #2
+        ld1             {v4.8B}, [x0], x2
+        \macd           v2.8H,  v0.8B,  v4.8B
+        ld1             {v5.8B}, [x1], x2
+        \macs           v2.8H,  v1.8B,  v5.8B
+        ld1             {v6.8B}, [x0], x2
+        \macd           v20.8H, v0.8B,  v6.8B
+        ld1             {v7.8B}, [x1], x2
+        \macs           v20.8H, v1.8B,  v7.8B
+        sshl            v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        sshl            v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        mov             v20.16B, v16.16B
+        st1             {v2.8B}, [x7], x2
+        mov             v2.16B,  v16.16B
+        st1             {v4.8B}, [x7], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  biweight_4      macs, macd
+        dup             v0.8B,  w5
+        dup             v1.8B,  w6
+        mov             v2.16B, v16.16B
+        mov             v20.16B, v16.16B
+1:      subs            w3,  w3,  #4
+        ld1             {v4.S}[0], [x0], x2
+        ld1             {v4.S}[1], [x0], x2
+        \macd           v2.8H,  v0.8B,  v4.8B
+        ld1             {v5.S}[0], [x1], x2
+        ld1             {v5.S}[1], [x1], x2
+        \macs           v2.8H,  v1.8B,  v5.8B
+        b.lt            2f
+        ld1             {v6.S}[0], [x0], x2
+        ld1             {v6.S}[1], [x0], x2
+        \macd           v20.8H, v0.8B,  v6.8B
+        ld1             {v7.S}[0], [x1], x2
+        ld1             {v7.S}[1], [x1], x2
+        \macs           v20.8H, v1.8B,  v7.8B
+        sshl            v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        sshl            v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        mov             v20.16B, v16.16B
+        st1             {v2.S}[0], [x7], x2
+        st1             {v2.S}[1], [x7], x2
+        mov             v2.16B,  v16.16B
+        st1             {v4.S}[0], [x7], x2
+        st1             {v4.S}[1], [x7], x2
+        b.ne            1b
+        ret
+2:      sshl            v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        st1             {v2.S}[0], [x7], x2
+        st1             {v2.S}[1], [x7], x2
+        ret
+.endm
+
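+// Shared prologue: v16 = ((offset + 1) | 1) << log2_denom as the rounding
+// term, v18 = ~log2_denom so that sshl shifts right by log2_denom + 1, and
+// w8 encodes the signs of the two weights, selecting one of the four
+// umlal/umlsl combinations with the weights negated to keep them unsigned.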
+.macro  biweight_func   w
+function ff_biweight_h264_pixels_\w\()_neon, export=1
+        sxtw            x2,  w2
+        lsr             w8,  w5,  #31
+        add             w7,  w7,  #1
+        eor             w8,  w8,  w6,  lsr #30
+        orr             w7,  w7,  #1
+        dup             v18.8H,   w4
+        lsl             w7,  w7,  w4
+        not             v18.16B,  v18.16B
+        dup             v16.8H,   w7
+        mov             x7,  x0
+        cbz             w8,  10f
+        subs            w8,  w8,  #1
+        b.eq            20f
+        subs            w8,  w8,  #1
+        b.eq            30f
+        b               40f
+10:     biweight_\w     umlal, umlal
+20:     neg             w5, w5
+        biweight_\w     umlal, umlsl
+30:     neg             w5, w5
+        neg             w6, w6
+        biweight_\w     umlsl, umlsl
+40:     neg             w6, w6
+        biweight_\w     umlsl, umlal
+endfunc
+.endm
+
+        biweight_func   16
+        biweight_func   8
+        biweight_func   4
+
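+// Unidirectional weighted prediction:
+//   dst = clip8((src * weight + (offset << log2_denom) + rnd) >> log2_denom)
+// v16 holds offset << log2_denom, v18 the right-shift amount (stored negated
+// for srshl), and \add is add/sub, or the halving shadd/shsub when
+// log2_denom > 1.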
+.macro  weight_16       add
+        dup             v0.16B,  w4
+1:      subs            w2,  w2,  #2
+        ld1             {v20.16B}, [x0], x1
+        umull           v4.8H,   v0.8B,  v20.8B
+        umull2          v6.8H,   v0.16B, v20.16B
+        ld1             {v28.16B}, [x0], x1
+        umull           v24.8H,  v0.8B,  v28.8B
+        umull2          v26.8H,  v0.16B, v28.16B
+        \add            v4.8H,   v16.8H, v4.8H
+        srshl           v4.8H,   v4.8H,  v18.8H
+        \add            v6.8H,   v16.8H, v6.8H
+        srshl           v6.8H,   v6.8H,  v18.8H
+        sqxtun          v4.8B,   v4.8H
+        sqxtun2         v4.16B,  v6.8H
+        \add            v24.8H,  v16.8H, v24.8H
+        srshl           v24.8H,  v24.8H, v18.8H
+        \add            v26.8H,  v16.8H, v26.8H
+        srshl           v26.8H,  v26.8H, v18.8H
+        sqxtun          v24.8B,  v24.8H
+        sqxtun2         v24.16B, v26.8H
+        st1             {v4.16B},  [x5], x1
+        st1             {v24.16B}, [x5], x1
+        b.ne            1b
+        ret
+.endm
+
+.macro  weight_8        add
+        dup             v0.8B,  w4
+1:      subs            w2,  w2,  #2
+        ld1             {v4.8B}, [x0], x1
+        umull           v2.8H,  v0.8B,  v4.8B
+        ld1             {v6.8B}, [x0], x1
+        umull           v20.8H, v0.8B,  v6.8B
+        \add            v2.8H,  v16.8H,  v2.8H
+        srshl           v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        \add            v20.8H, v16.8H,  v20.8H
+        srshl           v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        st1             {v2.8B}, [x5], x1
+        st1             {v4.8B}, [x5], x1
+        b.ne            1b
+        ret
+.endm
+
+.macro  weight_4        add
+        dup             v0.8B,  w4
+1:      subs            w2,  w2,  #4
+        ld1             {v4.S}[0], [x0], x1
+        ld1             {v4.S}[1], [x0], x1
+        umull           v2.8H,  v0.8B,  v4.8B
+        b.lt            2f
+        ld1             {v6.S}[0], [x0], x1
+        ld1             {v6.S}[1], [x0], x1
+        umull           v20.8H, v0.8B,  v6.8B
+        \add            v2.8H,  v16.8H,  v2.8H
+        srshl           v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        \add            v20.8H, v16.8H,  v20.8H
+        srshl           v20.8H, v20.8H, v18.8H
+        sqxtun          v4.8B,  v20.8H
+        st1             {v2.S}[0], [x5], x1
+        st1             {v2.S}[1], [x5], x1
+        st1             {v4.S}[0], [x5], x1
+        st1             {v4.S}[1], [x5], x1
+        b.ne            1b
+        ret
+2:      \add            v2.8H,  v16.8H,  v2.8H
+        srshl           v2.8H,  v2.8H,  v18.8H
+        sqxtun          v2.8B,  v2.8H
+        st1             {v2.S}[0], [x5], x1
+        st1             {v2.S}[1], [x5], x1
+        ret
+.endm
+
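+// Entry point: branches on log2_denom (<= 1 takes the plain add/sub path,
+// > 1 folds one bit of the shift into halving shadd/shsub) and on the sign
+// of the weight, which is negated and handled with a subtract when negative.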
+.macro  weight_func     w
+function ff_weight_h264_pixels_\w\()_neon, export=1
+        sxtw            x1,  w1
+        cmp             w3,  #1
+        mov             w6,  #1
+        lsl             w5,  w5,  w3
+        dup             v16.8H,  w5
+        mov             x5,  x0
+        b.le            20f
+        sub             w6,  w6,  w3
+        dup             v18.8H,  w6
+        cmp             w4, #0
+        b.lt            10f
+        weight_\w       shadd
+10:     neg             w4,  w4
+        weight_\w       shsub
+20:     neg             w6,  w3
+        dup             v18.8H,  w6
+        cmp             w4,  #0
+        b.lt            10f
+        weight_\w       add
+10:     neg             w4,  w4
+        weight_\w       sub
+endfunc
+.endm
+
+        weight_func     16
+        weight_func     8
+        weight_func     4
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264idct_neon.S
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
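+// 4x4 residual IDCT + add: two butterfly passes with a transpose in between;
+// the coefficients are zeroed as they are read, and the results are rounded
+// with (x + 32) >> 6 before being added to the destination pixels.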
+function ff_h264_idct_add_neon, export=1
+        ld1             {v0.4H, v1.4H, v2.4H, v3.4H},  [x1]
+        sxtw            x2,     w2
+        movi            v30.8H, #0
+
+        add             v4.4H,  v0.4H,  v2.4H
+        sshr            v16.4H, v1.4H,  #1
+        st1             {v30.8H},    [x1], #16
+        sshr            v17.4H, v3.4H,  #1
+        st1             {v30.8H},    [x1], #16
+        sub             v5.4H,  v0.4H,  v2.4H
+        sub             v6.4H,  v16.4H, v3.4H
+        add             v7.4H,  v1.4H,  v17.4H
+        add             v0.4H,  v4.4H,  v7.4H
+        add             v1.4H,  v5.4H,  v6.4H
+        sub             v2.4H,  v5.4H,  v6.4H
+        sub             v3.4H,  v4.4H,  v7.4H
+
+        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
+
+        add             v4.4H,  v0.4H,  v2.4H
+        ld1             {v18.S}[0], [x0], x2
+        sshr            v16.4H,  v3.4H,  #1
+        sshr            v17.4H,  v1.4H,  #1
+        ld1             {v18.S}[1], [x0], x2
+        sub             v5.4H,  v0.4H,  v2.4H
+        ld1             {v19.S}[1], [x0], x2
+        add             v6.4H,  v16.4H, v1.4H
+        ins             v4.D[1],  v5.D[0]
+        sub             v7.4H,  v17.4H, v3.4H
+        ld1             {v19.S}[0], [x0], x2
+        ins             v6.D[1],  v7.D[0]
+        sub             x0,  x0,  x2, lsl #2
+        add             v0.8H,  v4.8H,  v6.8H
+        sub             v1.8H,  v4.8H,  v6.8H
+
+        srshr           v0.8H,  v0.8H,  #6
+        srshr           v1.8H,  v1.8H,  #6
+
+        uaddw           v0.8H,  v0.8H,  v18.8B
+        uaddw           v1.8H,  v1.8H,  v19.8B
+
+        sqxtun          v0.8B, v0.8H
+        sqxtun          v1.8B, v1.8H
+
+        st1             {v0.S}[0],  [x0], x2
+        st1             {v0.S}[1],  [x0], x2
+        st1             {v1.S}[1],  [x0], x2
+        st1             {v1.S}[0],  [x0], x2
+
+        sub             x1,  x1,  #32
+        ret
+endfunc
+
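+// DC-only case: broadcast (dc + 32) >> 6, clear the coefficient and add the
+// value to all 16 destination pixels.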
+function ff_h264_idct_dc_add_neon, export=1
+        sxtw            x2,  w2
+        mov             w3,       #0
+        ld1r            {v2.8H},  [x1]
+        strh            w3,       [x1]
+        srshr           v2.8H,  v2.8H,  #6
+        ld1             {v0.S}[0],  [x0], x2
+        ld1             {v0.S}[1],  [x0], x2
+        uaddw           v3.8H,  v2.8H,  v0.8B
+        ld1             {v1.S}[0],  [x0], x2
+        ld1             {v1.S}[1],  [x0], x2
+        uaddw           v4.8H,  v2.8H,  v1.8B
+        sqxtun          v0.8B,  v3.8H
+        sqxtun          v1.8B,  v4.8H
+        sub             x0,  x0,  x2, lsl #2
+        st1             {v0.S}[0],  [x0], x2
+        st1             {v0.S}[1],  [x0], x2
+        st1             {v1.S}[0],  [x0], x2
+        st1             {v1.S}[1],  [x0], x2
+        ret
+endfunc
+
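+// Loop over the 16 4x4 luma blocks: the non-zero-count table (x4), indexed
+// through scan8, selects per block between the full IDCT, the DC-only
+// version (nnz == 1 with a non-zero DC) and skipping the block entirely.
+// The add16intra variant below runs the full IDCT whenever nnz != 0 and
+// takes the DC path only when just the DC coefficient is set.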
+function ff_h264_idct_add16_neon, export=1
+        mov             x12, x30
+        mov             x6,  x0         // dest
+        mov             x5,  x1         // block_offset
+        mov             x1,  x2         // block
+        mov             w9,  w3         // stride
+        movrel          x7,  scan8
+        mov             x10, #16
+        movrel          x13, X(ff_h264_idct_dc_add_neon)
+        movrel          x14, X(ff_h264_idct_add_neon)
+1:      mov             w2,  w9
+        ldrb            w3,  [x7], #1
+        ldrsw           x0,  [x5], #4
+        ldrb            w3,  [x4,  w3,  uxtw]
+        subs            w3,  w3,  #1
+        b.lt            2f
+        ldrsh           w3,  [x1]
+        add             x0,  x0,  x6
+        ccmp            w3,  #0,  #4,  eq
+        csel            x15, x13, x14, ne
+        blr             x15
+2:      subs            x10, x10, #1
+        add             x1,  x1,  #32
+        b.ne            1b
+        ret             x12
+endfunc
+
+function ff_h264_idct_add16intra_neon, export=1
+        mov             x12, x30
+        mov             x6,  x0         // dest
+        mov             x5,  x1         // block_offset
+        mov             x1,  x2         // block
+        mov             w9,  w3         // stride
+        movrel          x7,  scan8
+        mov             x10, #16
+        movrel          x13, X(ff_h264_idct_dc_add_neon)
+        movrel          x14, X(ff_h264_idct_add_neon)
+1:      mov             w2,  w9
+        ldrb            w3,  [x7], #1
+        ldrsw           x0,  [x5], #4
+        ldrb            w3,  [x4,  w3,  uxtw]
+        add             x0,  x0,  x6
+        cmp             w3,  #0
+        ldrsh           w3,  [x1]
+        csel            x15, x13, x14, eq
+        ccmp            w3,  #0,  #0,  eq
+        b.eq            2f
+        blr             x15
+2:      subs            x10, x10, #1
+        add             x1,  x1,  #32
+        b.ne            1b
+        ret             x12
+endfunc
+
+function ff_h264_idct_add8_neon, export=1
+        sub             sp,  sp, #0x40
+        stp             x19, x20, [sp]
+        mov             x12, x30
+        ldp             x6,  x15, [x0]          // dest[0], dest[1]
+        add             x5,  x1,  #16*4         // block_offset
+        add             x9,  x2,  #16*32        // block
+        mov             w19, w3                 // stride
+        movrel          x13, X(ff_h264_idct_dc_add_neon)
+        movrel          x14, X(ff_h264_idct_add_neon)
+        movrel          x7,  scan8, 16
+        mov             x10, #0
+        mov             x11, #16
+1:      mov             w2,  w19
+        ldrb            w3,  [x7, x10]          // scan8[i]
+        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
+        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
+        add             x0,  x0,  x6            // block_offset[i] + dst[j-1]
+        add             x1,  x9,  x10, lsl #5   // block + i * 16
+        cmp             w3,  #0
+        ldrsh           w3,  [x1]               // block[i*16]
+        csel            x20, x13, x14, eq
+        ccmp            w3,  #0,  #0,  eq
+        b.eq            2f
+        blr             x20
+2:      add             x10, x10, #1
+        cmp             x10, #4
+        csel            x10, x11, x10, eq     // mov x10, #16
+        csel            x6,  x15, x6,  eq
+        cmp             x10, #20
+        b.lt            1b
+        ldp             x19, x20, [sp]
+        add             sp,  sp,  #0x40
+        ret             x12
+endfunc
+
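+// One pass of the 8x8 IDCT over eight columns; pass 0 also loads the last
+// two coefficient rows and zeroes them in the block, while pass 1 operates
+// on the transposed data with the roles of v18 and v30 (va/vb) swapped.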
+.macro  idct8x8_cols    pass
+  .if \pass == 0
+        va      .req    v18
+        vb      .req    v30
+        sshr            v18.8H, v26.8H, #1
+        add             v16.8H, v24.8H, v28.8H
+        ld1             {v30.8H, v31.8H}, [x1]
+        st1             {v19.8H}, [x1],  #16
+        st1             {v19.8H}, [x1],  #16
+        sub             v17.8H,  v24.8H, v28.8H
+        sshr            v19.8H,  v30.8H, #1
+        sub             v18.8H,  v18.8H,  v30.8H
+        add             v19.8H,  v19.8H,  v26.8H
+  .else
+        va      .req    v30
+        vb      .req    v18
+        sshr            v30.8H, v26.8H, #1
+        sshr            v19.8H, v18.8H, #1
+        add             v16.8H, v24.8H, v28.8H
+        sub             v17.8H, v24.8H, v28.8H
+        sub             v30.8H, v30.8H, v18.8H
+        add             v19.8H, v19.8H, v26.8H
+  .endif
+        add             v26.8H, v17.8H, va.8H
+        sub             v28.8H, v17.8H, va.8H
+        add             v24.8H, v16.8H, v19.8H
+        sub             vb.8H,  v16.8H, v19.8H
+        sub             v16.8H, v29.8H, v27.8H
+        add             v17.8H, v31.8H, v25.8H
+        sub             va.8H,  v31.8H, v25.8H
+        add             v19.8H, v29.8H, v27.8H
+        sub             v16.8H, v16.8H, v31.8H
+        sub             v17.8H, v17.8H, v27.8H
+        add             va.8H,  va.8H,  v29.8H
+        add             v19.8H, v19.8H, v25.8H
+        sshr            v25.8H, v25.8H, #1
+        sshr            v27.8H, v27.8H, #1
+        sshr            v29.8H, v29.8H, #1
+        sshr            v31.8H, v31.8H, #1
+        sub             v16.8H, v16.8H, v31.8H
+        sub             v17.8H, v17.8H, v27.8H
+        add             va.8H,  va.8H,  v29.8H
+        add             v19.8H, v19.8H, v25.8H
+        sshr            v25.8H, v16.8H, #2
+        sshr            v27.8H, v17.8H, #2
+        sshr            v29.8H, va.8H,  #2
+        sshr            v31.8H, v19.8H, #2
+        sub             v19.8H, v19.8H, v25.8H
+        sub             va.8H,  v27.8H, va.8H
+        add             v17.8H, v17.8H, v29.8H
+        add             v16.8H, v16.8H, v31.8H
+  .if \pass == 0
+        sub             v31.8H, v24.8H, v19.8H
+        add             v24.8H, v24.8H, v19.8H
+        add             v25.8H, v26.8H, v18.8H
+        sub             v18.8H, v26.8H, v18.8H
+        add             v26.8H, v28.8H, v17.8H
+        add             v27.8H, v30.8H, v16.8H
+        sub             v29.8H, v28.8H, v17.8H
+        sub             v28.8H, v30.8H, v16.8H
+  .else
+        sub             v31.8H, v24.8H, v19.8H
+        add             v24.8H, v24.8H, v19.8H
+        add             v25.8H, v26.8H, v30.8H
+        sub             v30.8H, v26.8H, v30.8H
+        add             v26.8H, v28.8H, v17.8H
+        sub             v29.8H, v28.8H, v17.8H
+        add             v27.8H, v18.8H, v16.8H
+        sub             v28.8H, v18.8H, v16.8H
+  .endif
+        .unreq          va
+        .unreq          vb
+.endm
+
+function ff_h264_idct8_add_neon, export=1
+        movi            v19.8H,   #0
+        sxtw            x2,       w2
+        ld1             {v24.8H, v25.8H}, [x1]
+        st1             {v19.8H},  [x1],   #16
+        st1             {v19.8H},  [x1],   #16
+        ld1             {v26.8H, v27.8H}, [x1]
+        st1             {v19.8H},  [x1],   #16
+        st1             {v19.8H},  [x1],   #16
+        ld1             {v28.8H, v29.8H}, [x1]
+        st1             {v19.8H},  [x1],   #16
+        st1             {v19.8H},  [x1],   #16
+
+        idct8x8_cols    0
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
+        idct8x8_cols    1
+
+        mov             x3,  x0
+        srshr           v24.8H, v24.8H, #6
+        ld1             {v0.8B},     [x0], x2
+        srshr           v25.8H, v25.8H, #6
+        ld1             {v1.8B},     [x0], x2
+        srshr           v26.8H, v26.8H, #6
+        ld1             {v2.8B},     [x0], x2
+        srshr           v27.8H, v27.8H, #6
+        ld1             {v3.8B},     [x0], x2
+        srshr           v28.8H, v28.8H, #6
+        ld1             {v4.8B},     [x0], x2
+        srshr           v29.8H, v29.8H, #6
+        ld1             {v5.8B},     [x0], x2
+        srshr           v30.8H, v30.8H, #6
+        ld1             {v6.8B},     [x0], x2
+        srshr           v31.8H, v31.8H, #6
+        ld1             {v7.8B},     [x0], x2
+        uaddw           v24.8H, v24.8H, v0.8B
+        uaddw           v25.8H, v25.8H, v1.8B
+        uaddw           v26.8H, v26.8H, v2.8B
+        sqxtun          v0.8B,  v24.8H
+        uaddw           v27.8H, v27.8H, v3.8B
+        sqxtun          v1.8B,  v25.8H
+        uaddw           v28.8H, v28.8H, v4.8B
+        sqxtun          v2.8B,  v26.8H
+        st1             {v0.8B},     [x3], x2
+        uaddw           v29.8H, v29.8H, v5.8B
+        sqxtun          v3.8B,  v27.8H
+        st1             {v1.8B},     [x3], x2
+        uaddw           v30.8H, v30.8H, v6.8B
+        sqxtun          v4.8B,  v28.8H
+        st1             {v2.8B},     [x3], x2
+        uaddw           v31.8H, v31.8H, v7.8B
+        sqxtun          v5.8B,  v29.8H
+        st1             {v3.8B},     [x3], x2
+        sqxtun          v6.8B,  v30.8H
+        sqxtun          v7.8B,  v31.8H
+        st1             {v4.8B},     [x3], x2
+        st1             {v5.8B},     [x3], x2
+        st1             {v6.8B},     [x3], x2
+        st1             {v7.8B},     [x3], x2
+
+        sub             x1,  x1,  #128
+        ret
+endfunc
+
+function ff_h264_idct8_dc_add_neon, export=1
+        mov             w3,       #0
+        sxtw            x2,       w2
+        ld1r            {v31.8H}, [x1]
+        strh            w3,       [x1]
+        ld1             {v0.8B},  [x0], x2
+        srshr           v31.8H, v31.8H, #6
+        ld1             {v1.8B},     [x0], x2
+        ld1             {v2.8B},     [x0], x2
+        uaddw           v24.8H, v31.8H, v0.8B
+        ld1             {v3.8B},     [x0], x2
+        uaddw           v25.8H, v31.8H, v1.8B
+        ld1             {v4.8B},     [x0], x2
+        uaddw           v26.8H, v31.8H, v2.8B
+        ld1             {v5.8B},     [x0], x2
+        uaddw           v27.8H, v31.8H, v3.8B
+        ld1             {v6.8B},     [x0], x2
+        uaddw           v28.8H, v31.8H, v4.8B
+        ld1             {v7.8B},     [x0], x2
+        uaddw           v29.8H, v31.8H, v5.8B
+        uaddw           v30.8H, v31.8H, v6.8B
+        uaddw           v31.8H, v31.8H, v7.8B
+        sqxtun          v0.8B,  v24.8H
+        sqxtun          v1.8B,  v25.8H
+        sqxtun          v2.8B,  v26.8H
+        sqxtun          v3.8B,  v27.8H
+        sub             x0,  x0,  x2, lsl #3
+        st1             {v0.8B},     [x0], x2
+        sqxtun          v4.8B,  v28.8H
+        st1             {v1.8B},     [x0], x2
+        sqxtun          v5.8B,  v29.8H
+        st1             {v2.8B},     [x0], x2
+        sqxtun          v6.8B,  v30.8H
+        st1             {v3.8B},     [x0], x2
+        sqxtun          v7.8B,  v31.8H
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2
+        st1             {v6.8B},     [x0], x2
+        st1             {v7.8B},     [x0], x2
+        ret
+endfunc
+
+function ff_h264_idct8_add4_neon, export=1
+        mov             x12, x30
+        mov             x6,  x0
+        mov             x5,  x1
+        mov             x1,  x2
+        mov             w2,  w3
+        movrel          x7,  scan8
+        mov             w10, #16
+        movrel          x13, X(ff_h264_idct8_dc_add_neon)
+        movrel          x14, X(ff_h264_idct8_add_neon)
+1:      ldrb            w9,  [x7], #4
+        ldrsw           x0,  [x5], #16
+        ldrb            w9,  [x4, w9, uxtw]
+        subs            w9,  w9,  #1
+        b.lt            2f
+        ldrsh           w11,  [x1]
+        add             x0,  x6,  x0
+        ccmp            w11, #0,  #4,  eq
+        csel            x15, x13, x14, ne
+        blr             x15
+2:      subs            w10, w10, #4
+        add             x1,  x1,  #128
+        b.ne            1b
+        ret             x12
+endfunc
+
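+// 4x4 block index -> position in the decoder's scan8[] cache, 16 luma
+// blocks followed by the chroma blocks.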
+const   scan8
+        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
+        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
+        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
+        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
+endconst
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264pred_init.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+
+void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
+                                        const int bit_depth,
+                                        const int chroma_format_idc)
+{
+    const int high_depth = bit_depth > 8;
+
+    if (high_depth)
+        return;
+
+    if (chroma_format_idc <= 1) {
+        h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
+        h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
+        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
+        h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
+        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
+            codec_id != AV_CODEC_ID_VP8) {
+            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
+            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+        }
+    }
+
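+    /* The pred16x16[] table is indexed with the same mode values as the
+     * 8x8 enum, hence the *_PRED8x8 names. */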
+    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
+    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
+    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
+    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
+        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
+        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
+}
+
+av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
+                                       const int bit_depth,
+                                       const int chroma_format_idc)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags))
+        h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/h264pred_neon.S
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
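+// Load \n bytes from the column at [\rs] with stride \rt into consecutive
+// byte lanes of \rd; with \n == 4, hi=1 fills lanes 4-7 instead of 0-3.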
+.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
+.if \n >= 8 || \hi == 0
+        ld1             {\rd\().b}[0],  [\rs], \rt
+        ld1             {\rd\().b}[1],  [\rs], \rt
+        ld1             {\rd\().b}[2],  [\rs], \rt
+        ld1             {\rd\().b}[3],  [\rs], \rt
+.endif
+.if \n >= 8 || \hi == 1
+        ld1             {\rd\().b}[4],  [\rs], \rt
+        ld1             {\rd\().b}[5],  [\rs], \rt
+        ld1             {\rd\().b}[6],  [\rs], \rt
+        ld1             {\rd\().b}[7],  [\rs], \rt
+.endif
+.if \n == 16
+        ld1             {\rd\().b}[8],  [\rs], \rt
+        ld1             {\rd\().b}[9],  [\rs], \rt
+        ld1             {\rd\().b}[10], [\rs], \rt
+        ld1             {\rd\().b}[11], [\rs], \rt
+        ld1             {\rd\().b}[12], [\rs], \rt
+        ld1             {\rd\().b}[13], [\rs], \rt
+        ld1             {\rd\().b}[14], [\rs], \rt
+        ld1             {\rd\().b}[15], [\rs], \rt
+.endif
+.endm
+
+function ff_pred16x16_128_dc_neon, export=1
+        movi            v0.16b,  #128
+        b               .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_top_dc_neon, export=1
+        sub             x2,  x0,  x1
+        ld1             {v0.16b},  [x2]
+        uaddlv          h0,  v0.16b
+        rshrn           v0.8b,  v0.8h,  #4
+        dup             v0.16b, v0.b[0]
+        b               .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_left_dc_neon, export=1
+        sub             x2,  x0,  #1
+        ldcol.8         v0,  x2,  x1, 16
+        uaddlv          h0,  v0.16b
+        rshrn           v0.8b,  v0.8h,  #4
+        dup             v0.16b, v0.b[0]
+        b               .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_dc_neon, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #1
+        ld1             {v0.16b}, [x2]
+        ldcol.8         v1,  x3,  x1, 16
+        uaddlv          h0,  v0.16b
+        uaddlv          h1,  v1.16b
+        add             v0.4h,  v0.4h,  v1.4h
+        rshrn           v0.8b,  v0.8h,  #5
+        dup             v0.16b, v0.b[0]
+.L_pred16x16_dc_end:
+        mov             w3,  #8
+6:      st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x0], x1
+        subs            w3,  w3,  #1
+        b.ne            6b
+        ret
+endfunc
+
+function ff_pred16x16_hor_neon, export=1
+        sub             x2,  x0,  #1
+        mov             w3,  #16
+1:      ld1r            {v0.16b}, [x2], x1
+        st1             {v0.16b}, [x0], x1
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred16x16_vert_neon, export=1
+        sub             x2,  x0,  x1
+        add             x1,  x1,  x1
+        ld1             {v0.16b}, [x2], x1
+        mov             w3,  #8
+1:      st1             {v0.16b}, [x0], x1
+        st1             {v0.16b}, [x2], x1
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+
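+// 16x16 plane prediction: derives the horizontal and vertical gradients
+// from weighted sums of the border pixels (p16weight), then sweeps the
+// resulting plane one 16-pixel row at a time.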
+function ff_pred16x16_plane_neon, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p16weight
+        add             x2,  x3,  #8
+        sub             x3,  x3,  #1
+        ld1             {v0.8b},  [x3]
+        ld1             {v2.8b},  [x2], x1
+        ldcol.8         v1,  x3,  x1
+        add             x3,  x3,  x1
+        ldcol.8         v3,  x3,  x1
+        rev64           v0.8b,  v0.8b
+        rev64           v1.8b,  v1.8b
+        uaddl           v7.8h,  v2.8b,  v3.8b
+        usubl           v2.8h,  v2.8b,  v0.8b
+        usubl           v3.8h,  v3.8b,  v1.8b
+        ld1             {v0.8h},     [x4]
+        mul             v2.8h,  v2.8h,  v0.8h
+        mul             v3.8h,  v3.8h,  v0.8h
+        addp            v2.8h,  v2.8h,  v3.8h
+        addp            v2.8h,  v2.8h,  v2.8h
+        addp            v2.4h,  v2.4h,  v2.4h
+        sshll           v3.4s,  v2.4h,  #2
+        saddw           v2.4s,  v3.4s,  v2.4h
+        rshrn           v4.4h,  v2.4s,  #6
+        trn2            v5.4h,  v4.4h,  v4.4h
+        add             v2.4h,  v4.4h,  v5.4h
+        shl             v3.4h,  v2.4h,  #3
+        ext             v7.16b, v7.16b, v7.16b, #14
+        sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
+        add             v7.4h,  v7.4h,  v0.4h
+        shl             v2.4h,  v7.4h,  #4
+        sub             v2.4h,  v2.4h,  v3.4h
+        shl             v3.4h,  v4.4h,  #4
+        ext             v0.16b, v0.16b, v0.16b, #14
+        sub             v6.4h,  v5.4h,  v3.4h
+        mov             v0.h[0],  wzr
+        mul             v0.8h,  v0.8h,  v4.h[0]
+        dup             v1.8h,  v2.h[0]
+        dup             v2.8h,  v4.h[0]
+        dup             v3.8h,  v6.h[0]
+        shl             v2.8h,  v2.8h,  #3
+        add             v1.8h,  v1.8h,  v0.8h
+        add             v3.8h,  v3.8h,  v2.8h
+        mov             w3,  #16
+1:
+        sqshrun         v0.8b,  v1.8h,  #5
+        add             v1.8h,  v1.8h,  v2.8h
+        sqshrun2        v0.16b, v1.8h,  #5
+        add             v1.8h,  v1.8h,  v3.8h
+        st1             {v0.16b}, [x0], x1
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+
+const   p16weight, align=4
+        .short          1,2,3,4,5,6,7,8
+endconst
+const   p8weight, align=4
+        .short          1,2,3,4,1,2,3,4
+endconst
+
+function ff_pred8x8_hor_neon, export=1
+        sub             x2,  x0,  #1
+        mov             w3,  #8
+1:      ld1r            {v0.8b},  [x2], x1
+        st1             {v0.8b},  [x0], x1
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred8x8_vert_neon, export=1
+        sub             x2,  x0,  x1
+        lsl             x1,  x1,  #1
+        ld1             {v0.8b},  [x2], x1
+        mov             w3,  #4
+1:      st1             {v0.8b},  [x0], x1
+        st1             {v0.8b},  [x2], x1
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred8x8_plane_neon, export=1
+        sub             x3,  x0,  x1
+        movrel          x4,  p8weight
+        movrel          x5,  p16weight
+        add             x2,  x3,  #4
+        sub             x3,  x3,  #1
+        ld1             {v0.s}[0],  [x3]
+        ld1             {v2.s}[0],  [x2], x1
+        ldcol.8         v0,  x3,  x1,  4,  hi=1
+        add             x3,  x3,  x1
+        ldcol.8         v3,  x3,  x1,  4
+        uaddl           v7.8h,  v2.8b,  v3.8b
+        rev32           v0.8b,  v0.8b
+        trn1            v2.2s,  v2.2s,  v3.2s
+        usubl           v2.8h,  v2.8b,  v0.8b
+        ld1             {v6.8h},  [x4]
+        mul             v2.8h,  v2.8h,  v6.8h
+        ld1             {v0.8h},  [x5]
+        saddlp          v2.4s,  v2.8h
+        addp            v2.4s,  v2.4s,  v2.4s
+        shl             v3.4s,  v2.4s,  #4
+        add             v2.4s,  v3.4s,  v2.4s
+        rshrn           v5.4h,  v2.4s,  #5
+        addp            v2.4h,  v5.4h,  v5.4h
+        shl             v3.4h,  v2.4h,  #1
+        add             v3.4h,  v3.4h,  v2.4h
+        rev64           v7.4h,  v7.4h
+        add             v7.4h,  v7.4h,  v0.4h
+        shl             v2.4h,  v7.4h,  #4
+        sub             v2.4h,  v2.4h,  v3.4h
+        ext             v0.16b, v0.16b, v0.16b, #14
+        mov             v0.h[0],  wzr
+        mul             v0.8h,  v0.8h,  v5.h[0]
+        dup             v1.8h,  v2.h[0]
+        dup             v2.8h,  v5.h[1]
+        add             v1.8h,  v1.8h,  v0.8h
+        mov             w3,  #8
+1:
+        sqshrun         v0.8b,  v1.8h,  #5
+        add             v1.8h,  v1.8h,  v2.8h
+        st1             {v0.8b},  [x0], x1
+        subs            w3,  w3,  #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_pred8x8_128_dc_neon, export=1
+        movi            v0.8b,  #128
+        movi            v1.8b,  #128
+        b               .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_top_dc_neon, export=1
+        sub             x2,  x0,  x1
+        ld1             {v0.8b},  [x2]
+        uaddlp          v0.4h,  v0.8b
+        addp            v0.4h,  v0.4h,  v0.4h
+        zip1            v0.8h,  v0.8h,  v0.8h
+        rshrn           v2.8b,  v0.8h,  #2
+        zip1            v0.8b,  v2.8b,  v2.8b
+        zip1            v1.8b,  v2.8b,  v2.8b
+        b               .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_left_dc_neon, export=1
+        sub             x2,  x0,  #1
+        ldcol.8         v0,  x2,  x1
+        uaddlp          v0.4h,  v0.8b
+        addp            v0.4h,  v0.4h,  v0.4h
+        rshrn           v2.8b,  v0.8h,  #2
+        dup             v1.8b,  v2.b[1]
+        dup             v0.8b,  v2.b[0]
+        b               .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_dc_neon, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #1
+        ld1             {v0.8b}, [x2]
+        ldcol.8         v1,  x3,  x1
+        uaddlp          v0.4h,  v0.8b
+        uaddlp          v1.4h,  v1.8b
+        trn1            v2.2s,  v0.2s,  v1.2s
+        trn2            v3.2s,  v0.2s,  v1.2s
+        addp            v4.4h,  v2.4h,  v3.4h
+        addp            v5.4h,  v4.4h,  v4.4h
+        rshrn           v6.8b,  v5.8h,  #3
+        rshrn           v7.8b,  v4.8h,  #2
+        dup             v0.8b,  v6.b[0]
+        dup             v2.8b,  v7.b[2]
+        dup             v1.8b,  v7.b[3]
+        dup             v3.8b,  v6.b[1]
+        zip1            v0.2s,  v0.2s,  v2.2s
+        zip1            v1.2s,  v1.2s,  v3.2s
+.L_pred8x8_dc_end:
+        mov             w3,  #4
+        add             x2,  x0,  x1,  lsl #2
+6:      st1             {v0.8b},  [x0], x1
+        st1             {v1.8b},  [x2], x1
+        subs            w3,  w3,  #1
+        b.ne            6b
+        ret
+endfunc
+
+function ff_pred8x8_l0t_dc_neon, export=1
+        sub             x2,  x0,  x1
+        sub             x3,  x0,  #1
+        ld1             {v0.8b},  [x2]
+        ldcol.8         v1,  x3,  x1,  4
+        zip1            v0.4s,  v0.4s,  v1.4s
+        uaddlp          v0.8h,  v0.16b
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v1.4h,  v0.4h,  v0.4h
+        rshrn           v2.8b,  v0.8h,  #2
+        rshrn           v3.8b,  v1.8h,  #3
+        dup             v4.8b,  v3.b[0]
+        dup             v6.8b,  v2.b[2]
+        dup             v5.8b,  v2.b[0]
+        zip1            v0.2s,  v4.2s,  v6.2s
+        zip1            v1.2s,  v5.2s,  v6.2s
+        b               .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_l00_dc_neon, export=1
+        sub             x2,  x0,  #1
+        ldcol.8         v0,  x2,  x1,  4
+        uaddlp          v0.4h,  v0.8b
+        addp            v0.4h,  v0.4h,  v0.4h
+        rshrn           v0.8b,  v0.8h,  #2
+        movi            v1.8b,  #128
+        dup             v0.8b,  v0.b[0]
+        b               .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_0lt_dc_neon, export=1
+        add             x3,  x0,  x1,  lsl #2
+        sub             x2,  x0,  x1
+        sub             x3,  x3,  #1
+        ld1             {v0.8b},  [x2]
+        ldcol.8         v1,  x3,  x1,  4,  hi=1
+        zip1            v0.4s,  v0.4s,  v1.4s
+        uaddlp          v0.8h,  v0.16b
+        addp            v0.8h,  v0.8h,  v0.8h
+        addp            v1.4h,  v0.4h,  v0.4h
+        rshrn           v2.8b,  v0.8h,  #2
+        rshrn           v3.8b,  v1.8h,  #3
+        dup             v4.8b,  v2.b[0]
+        dup             v5.8b,  v2.b[3]
+        dup             v6.8b,  v2.b[2]
+        dup             v7.8b,  v3.b[1]
+        zip1            v0.2s,  v4.2s,  v6.2s
+        zip1            v1.2s,  v5.2s,  v7.2s
+        b               .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_0l0_dc_neon, export=1
+        add             x2,  x0,  x1,  lsl #2
+        sub             x2,  x2,  #1
+        ldcol.8         v1,  x2,  x1,  4
+        uaddlp          v2.4h,  v1.8b
+        addp            v2.4h,  v2.4h,  v2.4h
+        rshrn           v1.8b,  v2.8h,  #2
+        movi            v0.8b,  #128
+        dup             v1.8b,  v1.b[0]
+        b               .L_pred8x8_dc_end
+endfunc
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_init_aarch64.c
@@ -0,0 +1,123 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/hpeldsp.h"
+
+void     ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void  ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void  ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void      ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void   ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void   ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void  ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+
+void  ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void  ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void   ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void   ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void  ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+
+void     ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void  ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void  ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void      ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void   ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void   ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+void  ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                              ptrdiff_t line_size, int h);
+
+void  ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void  ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
+                                     ptrdiff_t line_size, int h);
+
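+/*
+ * put/avg_pixels_tab[i][j]: i selects the block width (0: 16 pixels,
+ * 1: 8 pixels), j the half-pel position (0: aligned, 1: x+1/2, 2: y+1/2,
+ * 3: x+1/2 and y+1/2).
+ */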
+av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
+        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
+        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
+        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
+        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
+
+        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
+        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
+        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
+        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
+    }
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/hpeldsp_neon.S
@@ -0,0 +1,397 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
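+// Copy (put) or average into (avg=1) a 16-byte-wide block of h rows, four
+// rows per iteration. The _x2/_y2/_xy2 variants below interpolate halfway
+// between horizontal / vertical / diagonal neighbours; `avg`, `mshrn` and
+// `NRND` are supplied by the rounding/no-rounding instantiation macros.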
+.macro  pixels16        rnd=1, avg=0
+  .if \avg
+        mov             x12, x0
+  .endif
+1:      ld1             {v0.16B},  [x1], x2
+        ld1             {v1.16B},  [x1], x2
+        ld1             {v2.16B},  [x1], x2
+        ld1             {v3.16B},  [x1], x2
+  .if \avg
+        ld1             {v4.16B},  [x12], x2
+        urhadd          v0.16B,  v0.16B,  v4.16B
+        ld1             {v5.16B},  [x12], x2
+        urhadd          v1.16B,  v1.16B,  v5.16B
+        ld1             {v6.16B},  [x12], x2
+        urhadd          v2.16B,  v2.16B,  v6.16B
+        ld1             {v7.16B},  [x12], x2
+        urhadd          v3.16B,  v3.16B,  v7.16B
+  .endif
+        subs            w3,  w3,  #4
+        st1             {v0.16B},  [x0], x2
+        st1             {v1.16B},  [x0], x2
+        st1             {v2.16B},  [x0], x2
+        st1             {v3.16B},  [x0], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  pixels16_x2     rnd=1, avg=0
+1:      ld1             {v0.16B, v1.16B}, [x1], x2
+        ld1             {v2.16B, v3.16B}, [x1], x2
+        subs            w3,  w3,  #2
+        ext             v1.16B,  v0.16B,  v1.16B,  #1
+        avg             v0.16B,  v0.16B,  v1.16B
+        ext             v3.16B,  v2.16B,  v3.16B,  #1
+        avg             v2.16B,  v2.16B,  v3.16B
+  .if \avg
+        ld1             {v1.16B}, [x0], x2
+        ld1             {v3.16B}, [x0]
+        urhadd          v0.16B,  v0.16B,  v1.16B
+        urhadd          v2.16B,  v2.16B,  v3.16B
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v0.16B}, [x0], x2
+        st1             {v2.16B}, [x0], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  pixels16_y2     rnd=1, avg=0
+        sub             w3,  w3,  #2
+        ld1             {v0.16B}, [x1], x2
+        ld1             {v1.16B}, [x1], x2
+1:      subs            w3,  w3,  #2
+        avg             v2.16B,  v0.16B,  v1.16B
+        ld1             {v0.16B}, [x1], x2
+        avg             v3.16B,  v0.16B,  v1.16B
+        ld1             {v1.16B}, [x1], x2
+  .if \avg
+        ld1             {v4.16B}, [x0], x2
+        ld1             {v5.16B}, [x0]
+        urhadd          v2.16B,  v2.16B,  v4.16B
+        urhadd          v3.16B,  v3.16B,  v5.16B
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v2.16B}, [x0], x2
+        st1             {v3.16B}, [x0], x2
+        b.ne            1b
+
+        avg             v2.16B,  v0.16B,  v1.16B
+        ld1             {v0.16B}, [x1], x2
+        avg             v3.16B,  v0.16B,  v1.16B
+  .if \avg
+        ld1             {v4.16B}, [x0], x2
+        ld1             {v5.16B}, [x0]
+        urhadd          v2.16B,  v2.16B,  v4.16B
+        urhadd          v3.16B,  v3.16B,  v5.16B
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v2.16B},     [x0], x2
+        st1             {v3.16B},     [x0], x2
+
+        ret
+.endm
+
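+// Half-pel (1/2, 1/2): 2x2 box filter. The horizontal pair sums of the
+// previous row are kept in v16/v18/v20/v22 so each source row is read only
+// once; mshrn narrows with the >> 2, and the NRND-prefixed adds are only
+// emitted in the no-rounding variant.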
+.macro  pixels16_xy2    rnd=1, avg=0
+        sub             w3,  w3,  #2
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        ld1             {v4.16B, v5.16B}, [x1], x2
+NRND    movi            v26.8H, #1
+        ext             v1.16B,  v0.16B,  v1.16B,  #1
+        ext             v5.16B,  v4.16B,  v5.16B,  #1
+        uaddl           v16.8H,  v0.8B,   v1.8B
+        uaddl2          v20.8H,  v0.16B,  v1.16B
+        uaddl           v18.8H,  v4.8B,   v5.8B
+        uaddl2          v22.8H,  v4.16B,  v5.16B
+1:      subs            w3,  w3,  #2
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        add             v24.8H,  v16.8H,  v18.8H
+NRND    add             v24.8H,  v24.8H,  v26.8H
+        ext             v30.16B, v0.16B,  v1.16B,  #1
+        add             v1.8H,   v20.8H,  v22.8H
+        mshrn           v28.8B,  v24.8H,  #2
+NRND    add             v1.8H,   v1.8H,   v26.8H
+        mshrn2          v28.16B, v1.8H,   #2
+  .if \avg
+        ld1             {v16.16B},        [x0]
+        urhadd          v28.16B, v28.16B, v16.16B
+  .endif
+        uaddl           v16.8H,  v0.8B,   v30.8B
+        ld1             {v2.16B, v3.16B}, [x1], x2
+        uaddl2          v20.8H,  v0.16B,  v30.16B
+        st1             {v28.16B},        [x0], x2
+        add             v24.8H,  v16.8H,  v18.8H
+NRND    add             v24.8H,  v24.8H,  v26.8H
+        ext             v3.16B,  v2.16B,  v3.16B,  #1
+        add             v0.8H,   v20.8H,  v22.8H
+        mshrn           v30.8B,  v24.8H,  #2
+NRND    add             v0.8H,   v0.8H,   v26.8H
+        mshrn2          v30.16B, v0.8H,   #2
+  .if \avg
+        ld1             {v18.16B},        [x0]
+        urhadd          v30.16B, v30.16B, v18.16B
+  .endif
+        uaddl           v18.8H,   v2.8B,  v3.8B
+        uaddl2          v22.8H,   v2.16B, v3.16B
+        st1             {v30.16B},        [x0], x2
+        b.gt            1b
+
+        ld1             {v0.16B, v1.16B}, [x1], x2
+        add             v24.8H,  v16.8H,  v18.8H
+NRND    add             v24.8H,  v24.8H,  v26.8H
+        ext             v30.16B, v0.16B,  v1.16B,  #1
+        add             v1.8H,   v20.8H,  v22.8H
+        mshrn           v28.8B,  v24.8H,  #2
+NRND    add             v1.8H,   v1.8H,   v26.8H
+        mshrn2          v28.16B, v1.8H,   #2
+  .if \avg
+        ld1             {v16.16B},        [x0]
+        urhadd          v28.16B, v28.16B, v16.16B
+  .endif
+        uaddl           v16.8H,  v0.8B,   v30.8B
+        uaddl2          v20.8H,  v0.16B,  v30.16B
+        st1             {v28.16B},        [x0], x2
+        add             v24.8H,  v16.8H,  v18.8H
+NRND    add             v24.8H,  v24.8H,  v26.8H
+        add             v0.8H,   v20.8H,  v22.8H
+        mshrn           v30.8B,  v24.8H,  #2
+NRND    add             v0.8H,   v0.8H,   v26.8H
+        mshrn2          v30.16B, v0.8H,   #2
+  .if \avg
+        ld1             {v18.16B},        [x0]
+        urhadd          v30.16B, v30.16B, v18.16B
+  .endif
+        st1             {v30.16B},        [x0], x2
+
+        ret
+.endm
+
+.macro  pixels8         rnd=1, avg=0
+1:      ld1             {v0.8B}, [x1], x2
+        ld1             {v1.8B}, [x1], x2
+        ld1             {v2.8B}, [x1], x2
+        ld1             {v3.8B}, [x1], x2
+  .if \avg
+        ld1             {v4.8B}, [x0], x2
+        urhadd          v0.8B,  v0.8B,  v4.8B
+        ld1             {v5.8B}, [x0], x2
+        urhadd          v1.8B,  v1.8B,  v5.8B
+        ld1             {v6.8B}, [x0], x2
+        urhadd          v2.8B,  v2.8B,  v6.8B
+        ld1             {v7.8B}, [x0], x2
+        urhadd          v3.8B,  v3.8B,  v7.8B
+        sub             x0,  x0,  x2,  lsl #2
+  .endif
+        subs            w3,  w3,  #4
+        st1             {v0.8B}, [x0], x2
+        st1             {v1.8B}, [x0], x2
+        st1             {v2.8B}, [x0], x2
+        st1             {v3.8B}, [x0], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  pixels8_x2      rnd=1, avg=0
+1:      ld1             {v0.8B, v1.8B}, [x1], x2
+        ext             v1.8B,  v0.8B,  v1.8B,  #1
+        ld1             {v2.8B, v3.8B}, [x1], x2
+        ext             v3.8B,  v2.8B,  v3.8B,  #1
+        subs            w3,  w3,  #2
+        avg             v0.8B,   v0.8B,   v1.8B
+        avg             v2.8B,   v2.8B,   v3.8B
+  .if \avg
+        ld1             {v4.8B},     [x0], x2
+        ld1             {v5.8B},     [x0]
+        urhadd          v0.8B,   v0.8B,   v4.8B
+        urhadd          v2.8B,   v2.8B,   v5.8B
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v0.8B}, [x0], x2
+        st1             {v2.8B}, [x0], x2
+        b.ne            1b
+        ret
+.endm
+
+.macro  pixels8_y2      rnd=1, avg=0
+        sub             w3,  w3,  #2
+        ld1             {v0.8B},  [x1], x2
+        ld1             {v1.8B},  [x1], x2
+1:      subs            w3,  w3,  #2
+        avg             v4.8B,  v0.8B,  v1.8B
+        ld1             {v0.8B},  [x1], x2
+        avg             v5.8B,  v0.8B,  v1.8B
+        ld1             {v1.8B},  [x1], x2
+  .if \avg
+        ld1             {v2.8B},     [x0], x2
+        ld1             {v3.8B},     [x0]
+        urhadd          v4.8B,  v4.8B,  v2.8B
+        urhadd          v5.8B,  v5.8B,  v3.8B
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2
+        b.ne            1b
+
+        avg             v4.8B,  v0.8B,  v1.8B
+        ld1             {v0.8B},  [x1], x2
+        avg             v5.8B,  v0.8B,  v1.8B
+  .if \avg
+        ld1             {v2.8B},     [x0], x2
+        ld1             {v3.8B},     [x0]
+        urhadd          v4.8B,  v4.8B,  v2.8B
+        urhadd          v5.8B,  v5.8B,  v3.8B
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v4.8B},     [x0], x2
+        st1             {v5.8B},     [x0], x2
+
+        ret
+.endm
+
+.macro  pixels8_xy2     rnd=1, avg=0
+        sub             w3,  w3,  #2
+        ld1             {v0.16B},     [x1], x2
+        ld1             {v1.16B},     [x1], x2
+NRND    movi            v19.8H, #1
+        ext             v4.16B,  v0.16B,  v4.16B,  #1   // v4 = row shifted by one; upper bytes are don't-care
+        ext             v6.16B,  v1.16B,  v6.16B,  #1
+        uaddl           v16.8H,  v0.8B,  v4.8B
+        uaddl           v17.8H,  v1.8B,  v6.8B
+1:      subs            w3,  w3,  #2
+        ld1             {v0.16B},     [x1], x2
+        add             v18.8H, v16.8H,  v17.8H
+        ext             v4.16B,  v0.16B,  v4.16B,  #1
+NRND    add             v18.8H, v18.8H, v19.8H
+        uaddl           v16.8H,  v0.8B,  v4.8B
+        mshrn           v5.8B,  v18.8H, #2
+        ld1             {v1.16B},     [x1], x2
+        add             v18.8H, v16.8H,  v17.8H
+  .if \avg
+        ld1             {v7.8B},     [x0]
+        urhadd          v5.8B,  v5.8B,  v7.8B
+  .endif
+NRND    add             v18.8H, v18.8H, v19.8H
+        st1             {v5.8B},     [x0], x2
+        mshrn           v7.8B,  v18.8H, #2
+  .if \avg
+        ld1             {v5.8B},     [x0]
+        urhadd          v7.8B,  v7.8B,  v5.8B
+  .endif
+        ext             v6.16B,  v1.16B,  v6.16B,  #1
+        uaddl           v17.8H,  v1.8B,   v6.8B
+        st1             {v7.8B},     [x0], x2
+        b.gt            1b
+
+        ld1             {v0.16B},     [x1], x2
+        add             v18.8H, v16.8H, v17.8H
+        ext             v4.16B, v0.16B, v4.16B,  #1
+NRND    add             v18.8H, v18.8H, v19.8H
+        uaddl           v16.8H,  v0.8B, v4.8B
+        mshrn           v5.8B,  v18.8H, #2
+        add             v18.8H, v16.8H, v17.8H
+  .if \avg
+        ld1             {v7.8B},     [x0]
+        urhadd          v5.8B,  v5.8B,  v7.8B
+  .endif
+NRND    add             v18.8H, v18.8H, v19.8H
+        st1             {v5.8B},     [x0], x2
+        mshrn           v7.8B,  v18.8H, #2
+  .if \avg
+        ld1             {v5.8B},     [x0]
+        urhadd          v7.8B,  v7.8B,  v5.8B
+  .endif
+        st1             {v7.8B},     [x0], x2
+
+        ret
+.endm
+
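+// pixfunc emits one exported function from a pixels* body above.  The
+// helper macros avg/mshrn/mshrn2/NRND are redefined per instantiation so
+// that the same body expands either to the rounding variant (urhadd/rshrn,
+// NRND lines dropped) or to the no-rnd variant (uhadd/shrn, NRND lines
+// kept), and are purged again once the function has been emitted.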
+.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
+  .if \rnd
+    .macro avg  rd, rn, rm
+        urhadd          \rd, \rn, \rm
+    .endm
+    .macro mshrn rd, rn, rm
+        rshrn           \rd, \rn, \rm
+    .endm
+    .macro mshrn2 rd, rn, rm
+        rshrn2          \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+    .endm
+  .else
+    .macro avg  rd, rn, rm
+        uhadd           \rd, \rn, \rm
+    .endm
+    .macro mshrn rd, rn, rm
+        shrn            \rd, \rn, \rm
+    .endm
+    .macro mshrn2 rd, rn, rm
+        shrn2           \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+        \insn
+    .endm
+  .endif
+function ff_\pfx\name\suf\()_neon, export=1
+        \name           \rnd, \avg
+endfunc
+        .purgem         avg
+        .purgem         mshrn
+        .purgem         mshrn2
+        .purgem         NRND
+.endm
+
+.macro  pixfunc2        pfx, name, avg=0
+        pixfunc         \pfx, \name,          rnd=1, avg=\avg
+        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
+.endm
+
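+// The qpel mc00 cases are plain copies/averages: each stub below loads the
+// row count into w3 and, lacking a ret of its own, falls through into the
+// pixels* function that the following pixfunc emits directly after it.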
+function ff_put_h264_qpel16_mc00_neon, export=1
+        mov             w3,  #16
+endfunc
+
+        pixfunc         put_, pixels16,     avg=0
+        pixfunc2        put_, pixels16_x2,  avg=0
+        pixfunc2        put_, pixels16_y2,  avg=0
+        pixfunc2        put_, pixels16_xy2, avg=0
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+        mov             w3,  #16
+endfunc
+
+        pixfunc         avg_, pixels16,     avg=1
+        pixfunc2        avg_, pixels16_x2,  avg=1
+        pixfunc2        avg_, pixels16_y2,  avg=1
+        pixfunc2        avg_, pixels16_xy2, avg=1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+        mov             w3,  #8
+endfunc
+
+        pixfunc         put_, pixels8,     avg=0
+        pixfunc2        put_, pixels8_x2,  avg=0
+        pixfunc2        put_, pixels8_y2,  avg=0
+        pixfunc2        put_, pixels8_xy2, avg=0
+
+function ff_avg_h264_qpel8_mc00_neon, export=1
+        mov             w3,  #8
+endfunc
+
+        pixfunc         avg_, pixels8,     avg=1
+        pixfunc         avg_, pixels8_x2,  avg=1
+        pixfunc         avg_, pixels8_y2,  avg=1
+        pixfunc         avg_, pixels8_xy2, avg=1
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/idct.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_IDCT_H
+#define AVCODEC_AARCH64_IDCT_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
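+
+/*
+ * Usage sketch: the functions operate on an 8x8 block of coefficients;
+ * "block", "dst" and "stride" below are illustrative names only.
+ *
+ *     int16_t block[64];
+ *     ff_simple_idct_neon(block);                  // IDCT in place
+ *     ff_simple_idct_put_neon(dst, stride, block); // IDCT, then store
+ *     ff_simple_idct_add_neon(dst, stride, block); // IDCT, then add to dst
+ */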
+
+#endif /* AVCODEC_AARCH64_IDCT_H */
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/idctdsp_init_aarch64.c
@@ -0,0 +1,41 @@
+/*
+ * ARM-NEON-optimized IDCT functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/idctdsp.h"
+#include "idct.h"
+
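+/* Wire the NEON simple-IDCT routines into IDCTDSPContext.  The NEON code
+ * handles neither lowres nor high bit depth, and perm_type tells callers
+ * how coefficients must be permuted to match its expected layout. */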
+av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+                                     unsigned high_bit_depth)
+{
+    if (!avctx->lowres && !high_bit_depth) {
+        if (avctx->idct_algo == FF_IDCT_AUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+            c->idct_put  = ff_simple_idct_put_neon;
+            c->idct_add  = ff_simple_idct_add_neon;
+            c->idct      = ff_simple_idct_neon;
+            c->perm_type = FF_IDCT_PERM_PARTTRANS;
+        }
+    }
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/mdct_neon.S
@@ -0,0 +1,323 @@
+/*
+ * AArch64 NEON optimised MDCT
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_imdct_half_neon, export=1
+        sub             sp,  sp,  #32
+        stp             x19, x20, [sp]
+        str             x30, [sp, #16]
+        mov             x12, #1
+        ldr             w14, [x0, #28]          // mdct_bits
+        ldr             x4,  [x0, #32]          // tcos
+        ldr             x3,  [x0, #8]           // revtab
+        lsl             x12, x12, x14           // n  = 1 << nbits
+        lsr             x14, x12, #2            // n4 = n >> 2
+        add             x7,  x2,  x12,  lsl #1
+        mov             x12, #-16
+        sub             x7,  x7,  #16
+
+        ld2             {v16.2s,v17.2s}, [x7], x12 // v16=x,n1 v17=x,n0
+        ld2             {v0.2s,v1.2s},   [x2], #16 // v0 =m0,x v1 =m1,x
+        rev64           v17.2s, v17.2s
+        ld2             {v2.2s,v3.2s},   [x4], #16 // v2=c0,c1 v3=s0,s1
+        fmul            v6.2s,  v17.2s, v2.2s
+        fmul            v7.2s,  v0.2s,  v2.2s
+1:
+        subs            x14, x14, #2
+        ldr             w6,  [x3], #4
+        fmul            v4.2s,  v0.2s,  v3.2s
+        fmul            v5.2s,  v17.2s, v3.2s
+        fsub            v4.2s,  v6.2s,  v4.2s
+        fadd            v5.2s,  v5.2s,  v7.2s
+        ubfm            x8,  x6,  #16, #31
+        ubfm            x6,  x6,  #0,  #15
+        add             x8,  x1,  x8,  lsl #3
+        add             x6,  x1,  x6,  lsl #3
+        b.eq            2f
+        ld2             {v16.2s,v17.2s}, [x7], x12
+        ld2             {v0.2s,v1.2s},   [x2], #16
+        rev64           v17.2s, v17.2s
+        ld2             {v2.2s,v3.2s},   [x4], #16    // v2=c0,c1 v3=s0,s1
+        fmul            v6.2s,  v17.2s, v2.2s
+        fmul            v7.2s,  v0.2s,  v2.2s
+        st2             {v4.s,v5.s}[0], [x6]
+        st2             {v4.s,v5.s}[1], [x8]
+        b               1b
+2:
+        st2             {v4.s,v5.s}[0], [x6]
+        st2             {v4.s,v5.s}[1], [x8]
+
+        mov             x19, x0
+        mov             x20, x1
+        bl              X(ff_fft_calc_neon)
+
+        mov             x12, #1
+        ldr             w14, [x19, #28]          // mdct_bits
+        ldr             x4,  [x19, #32]          // tcos
+        lsl             x12, x12, x14            // n  = 1 << nbits
+        lsr             x14, x12, #3             // n8 = n >> 3
+
+        add             x4,  x4,  x14, lsl #3
+        add             x6,  x20, x14, lsl #3
+        sub             x1,  x4,  #16
+        sub             x3,  x6,  #16
+
+        mov             x7,  #-16
+        mov             x8,  x6
+        mov             x0,  x3
+
+        ld2             {v0.2s,v1.2s},  [x3], x7 // v0 =i1,r1 v1 =i0,r0
+        ld2             {v20.2s,v21.2s},[x6], #16 // v20=i2,r2 v21=i3,r3
+        ld2             {v16.2s,v17.2s},[x1], x7 // v16=c1,c0 v17=s1,s0
+3:
+        subs            x14, x14, #2
+        fmul            v7.2s,  v0.2s,  v17.2s
+        ld2             {v18.2s,v19.2s},[x4], #16    // v18=c2,c3 v19=s2,s3
+        fmul            v4.2s,  v1.2s,  v17.2s
+        fmul            v6.2s,  v21.2s, v19.2s
+        fmul            v5.2s,  v20.2s, v19.2s
+        fmul            v22.2s, v1.2s,  v16.2s
+        fmul            v23.2s, v21.2s, v18.2s
+        fmul            v24.2s, v0.2s,  v16.2s
+        fmul            v25.2s, v20.2s, v18.2s
+        fadd            v7.2s,  v7.2s,  v22.2s
+        fadd            v5.2s,  v5.2s,  v23.2s
+        fsub            v4.2s,  v4.2s,  v24.2s
+        fsub            v6.2s,  v6.2s,  v25.2s
+        b.eq            4f
+        ld2             {v0.2s,v1.2s},  [x3], x7
+        ld2             {v20.2s,v21.2s},[x6], #16
+        ld2             {v16.2s,v17.2s},[x1], x7 // v16=c1,c0 v17=s1,s0
+        rev64           v5.2s,  v5.2s
+        rev64           v7.2s,  v7.2s
+        st2             {v4.2s,v5.2s},  [x0], x7
+        st2             {v6.2s,v7.2s},  [x8], #16
+        b               3b
+4:
+        rev64           v5.2s,  v5.2s
+        rev64           v7.2s,  v7.2s
+        st2             {v4.2s,v5.2s},  [x0]
+        st2             {v6.2s,v7.2s},  [x8]
+
+        ldp             x19, x20, [sp]
+        ldr             x30, [sp, #16]
+        add             sp,  sp,  #32
+
+        ret
+endfunc
+
+function ff_imdct_calc_neon, export=1
+        sub             sp,  sp,  #32
+        stp             x19, x20, [sp]
+        str             x30, [sp, #16]
+        ldr             w3,  [x0, #28]          // mdct_bits
+        mov             x19, #1
+        mov             x20, x1
+        lsl             x19, x19, x3
+        add             x1,  x1,  x19
+
+        bl              X(ff_imdct_half_neon)
+
+        add             x0,  x20, x19,  lsl #2
+        add             x1,  x20, x19,  lsl #1
+        sub             x0,  x0,  #8
+        sub             x2,  x1,  #16
+        mov             x3,  #-16
+        mov             x6,  #-8
+1:
+        ld1             {v0.4s}, [x2], x3
+        prfum           pldl1keep, [x0, #-16]
+        rev64           v0.4s, v0.4s
+        ld1             {v2.2s,v3.2s}, [x1], #16
+        fneg            v4.4s,  v0.4s
+        prfum           pldl1keep, [x2, #-16]
+        rev64           v2.2s, v2.2s
+        rev64           v3.2s, v3.2s
+        ext             v4.16b, v4.16b, v4.16b, #8
+        st1             {v2.2s}, [x0], x6
+        st1             {v3.2s}, [x0], x6
+        st1             {v4.4s}, [x20], #16
+        subs            x19, x19,  #16
+        b.gt            1b
+
+        ldp             x19, x20, [sp], #16
+        ldr             x30, [sp], #16
+
+        ret
+endfunc
+
+function ff_mdct_calc_neon, export=1
+        sub             sp,  sp,  #32
+        stp             x19, x20, [sp]
+        str             x30, [sp, #16]
+
+        mov             x12, #1
+        ldr             w14, [x0, #28]          // mdct_bits
+        ldr             x4,  [x0, #32]          // tcos
+        ldr             x3,  [x0, #8]           // revtab
+        lsl             x14, x12, x14           // n  = 1 << nbits
+        add             x7,  x2,  x14           // in4u
+        sub             x9,  x7,  #16           // in4d
+        add             x2,  x7,  x14, lsl #1   // in3u
+        add             x8,  x9,  x14, lsl #1   // in3d
+        add             x5,  x4,  x14, lsl #1
+        sub             x5,  x5,  #16
+        sub             x3,  x3,  #4
+        mov             x12, #-16
+        lsr             x13, x14, #1
+
+        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
+        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
+        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
+        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
+        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
+        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
+        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
+        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
+        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
+        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
+        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
+        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
+        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
+        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
+1:
+        fmul            v7.2s,  v0.2s,  v21.2s      //  I*s
+        ldr             w10, [x3, x13]
+        fmul            v6.2s,  v2.2s,  v20.2s      // -R*c
+        ldr             w6,  [x3, #4]!
+        fmul            v4.2s,  v2.2s,  v21.2s      // -R*s
+        fmul            v5.2s,  v0.2s,  v20.2s      //  I*c
+        fmul            v24.2s, v16.2s, v30.2s      //  R*c
+        fmul            v25.2s, v18.2s, v31.2s      // -I*s
+        fmul            v22.2s, v16.2s, v31.2s      //  R*s
+        fmul            v23.2s, v18.2s, v30.2s      //  I*c
+        subs            x14, x14, #16
+        subs            x13, x13, #8
+        fsub            v6.2s,  v6.2s,  v7.2s       // -R*c-I*s
+        fadd            v7.2s,  v4.2s,  v5.2s       // -R*s+I*c
+        fsub            v24.2s, v25.2s, v24.2s      // I*s-R*c
+        fadd            v25.2s, v22.2s, v23.2s      // R*s-I*c
+        b.eq            1f
+        mov             x12, #-16
+        ld2             {v16.2s,v17.2s}, [x9], x12  // in0u0,in0u1 in4d1,in4d0
+        ld2             {v18.2s,v19.2s}, [x8], x12  // in2u0,in2u1 in3d1,in3d0
+        fneg            v7.2s,  v7.2s               //  R*s-I*c
+        ld2             {v0.2s, v1.2s},  [x7], #16  // in4u0,in4u1 in2d1,in2d0
+        rev64           v17.2s, v17.2s              // in4d0,in4d1 in3d0,in3d1
+        rev64           v19.2s, v19.2s              // in4d0,in4d1 in3d0,in3d1
+        ld2             {v2.2s, v3.2s},  [x2], #16  // in3u0,in3u1 in1d1,in1d0
+        fsub            v0.2s,  v17.2s, v0.2s       // in4d-in4u      I
+        ld2             {v20.2s,v21.2s}, [x4], #16  // c0,c1 s0,s1
+        rev64           v1.2s,  v1.2s               // in2d0,in2d1 in1d0,in1d1
+        rev64           v3.2s,  v3.2s               // in2d0,in2d1 in1d0,in1d1
+        ld2             {v30.2s,v31.2s}, [x5], x12  // c2,c3 s2,s3
+        fadd            v2.2s,  v2.2s,  v19.2s      // in3u+in3d     -R
+        fsub            v16.2s, v16.2s, v1.2s       // in0u-in2d      R
+        fadd            v18.2s, v18.2s, v3.2s       // in2u+in1d     -I
+        ubfm            x12, x6,  #16, #31
+        ubfm            x6,  x6,  #0,  #15
+        add             x12, x1,  x12, lsl #3
+        add             x6,  x1,  x6,  lsl #3
+        st2             {v6.s,v7.s}[0],   [x6]
+        st2             {v6.s,v7.s}[1],   [x12]
+        ubfm            x6,  x10, #16, #31
+        ubfm            x10, x10, #0,  #15
+        add             x6 , x1,  x6,  lsl #3
+        add             x10, x1,  x10, lsl #3
+        st2             {v24.s,v25.s}[0], [x10]
+        st2             {v24.s,v25.s}[1], [x6]
+        b               1b
+1:
+        fneg            v7.2s,  v7.2s           //  R*s-I*c
+        ubfm            x12, x6,  #16, #31
+        ubfm            x6,  x6,  #0,  #15
+        add             x12, x1,  x12, lsl #3
+        add             x6,  x1,  x6,  lsl #3
+        st2             {v6.s,v7.s}[0],   [x6]
+        st2             {v6.s,v7.s}[1],   [x12]
+        ubfm            x6,  x10, #16, #31
+        ubfm            x10, x10, #0,  #15
+        add             x6 , x1,  x6,  lsl #3
+        add             x10, x1,  x10, lsl #3
+        st2             {v24.s,v25.s}[0], [x10]
+        st2             {v24.s,v25.s}[1], [x6]
+
+        mov             x19, x0
+        mov             x20, x1
+        bl              X(ff_fft_calc_neon)
+
+        mov             x12, #1
+        ldr             w14, [x19, #28]         // mdct_bits
+        ldr             x4,  [x19, #32]         // tcos
+        lsl             x12, x12, x14           // n  = 1 << nbits
+        lsr             x14, x12, #3            // n8 = n >> 3
+
+        add             x4,  x4,  x14, lsl #3
+        add             x6,  x20, x14, lsl #3
+        sub             x1,  x4,  #16
+        sub             x3,  x6,  #16
+
+        mov             x7,  #-16
+        mov             x8,  x6
+        mov             x0,  x3
+
+        ld2             {v0.2s,v1.2s},   [x3], x7   // v0 =r1,i1 v1 =r0,i0
+        ld2             {v20.2s,v21.2s}, [x6], #16  // v20=r2,i2 v21=r3,i3
+        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
+1:
+        subs            x14, x14, #2
+        fmul            v7.2s,  v0.2s,  v17.2s      // r1*s1,r0*s0
+        ld2             {v18.2s,v19.2s}, [x4], #16  // c2,c3 s2,s3
+        fmul            v4.2s,  v1.2s,  v17.2s      // i1*s1,i0*s0
+        fmul            v6.2s,  v21.2s, v19.2s      // i2*s2,i3*s3
+        fmul            v5.2s,  v20.2s, v19.2s      // r2*s2,r3*s3
+        fmul            v24.2s, v0.2s,  v16.2s      // r1*c1,r0*c0
+        fmul            v25.2s, v20.2s, v18.2s      // r2*c2,r3*c3
+        fmul            v22.2s, v21.2s, v18.2s      // i2*c2,i3*c3
+        fmul            v23.2s, v1.2s,  v16.2s      // i1*c1,i0*c0
+        fadd            v4.2s,  v4.2s,  v24.2s      // i1*s1+r1*c1,i0*s0+r0*c0
+        fadd            v6.2s,  v6.2s,  v25.2s      // i2*s2+r2*c2,i3*s3+r3*c3
+        fsub            v5.2s,  v22.2s, v5.2s       // i2*c2-r2*s2,i3*c3-r3*s3
+        fsub            v7.2s,  v23.2s, v7.2s       // i1*c1-r1*s1,i0*c0-r0*s0
+        fneg            v4.2s,  v4.2s
+        fneg            v6.2s,  v6.2s
+        b.eq            1f
+        ld2             {v0.2s, v1.2s},  [x3], x7
+        ld2             {v20.2s,v21.2s}, [x6], #16
+        ld2             {v16.2s,v17.2s}, [x1], x7   // c1,c0 s1,s0
+        rev64           v5.2s,  v5.2s
+        rev64           v7.2s,  v7.2s
+        st2             {v4.2s,v5.2s},  [x0], x7
+        st2             {v6.2s,v7.2s},  [x8], #16
+        b               1b
+1:
+        rev64           v5.2s,  v5.2s
+        rev64           v7.2s,  v7.2s
+        st2             {v4.2s,v5.2s},  [x0]
+        st2             {v6.2s,v7.2s},  [x8]
+
+        ldp             x19, x20, [sp], #16
+        ldr             x30, [sp], #16
+        ret
+endfunc
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/neon.S
@@ -0,0 +1,149 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+.macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
+        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
+        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
+        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
+        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B
+
+        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
+        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
+        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
+        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
+        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
+        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
+        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
+        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H
+
+        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
+        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S
+
+        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
+        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S
+
+        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
+        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S
+
+        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
+        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
+.endm
+
+.macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+        trn1            \t0\().16B, \r0\().16B, \r1\().16B
+        trn2            \t1\().16B, \r0\().16B, \r1\().16B
+        trn1            \r1\().16B, \r2\().16B, \r3\().16B
+        trn2            \r3\().16B, \r2\().16B, \r3\().16B
+        trn1            \r0\().16B, \r4\().16B, \r5\().16B
+        trn2            \r5\().16B, \r4\().16B, \r5\().16B
+        trn1            \r2\().16B, \r6\().16B, \r7\().16B
+        trn2            \r7\().16B, \r6\().16B, \r7\().16B
+
+        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
+        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
+        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
+        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
+        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
+        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
+        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
+        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H
+
+        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
+        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S
+
+        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
+        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S
+
+        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
+        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S
+
+        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
+        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
+.endm
+
+.macro  transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().16B, \r0\().16B,  \r1\().16B
+        trn2            \t5\().16B, \r0\().16B,  \r1\().16B
+        trn1            \t6\().16B, \r2\().16B,  \r3\().16B
+        trn2            \t7\().16B, \r2\().16B,  \r3\().16B
+
+        trn1            \r0\().8H,  \t4\().8H,  \t6\().8H
+        trn2            \r2\().8H,  \t4\().8H,  \t6\().8H
+        trn1            \r1\().8H,  \t5\().8H,  \t7\().8H
+        trn2            \r3\().8H,  \t5\().8H,  \t7\().8H
+.endm
+
+.macro  transpose_4x8B  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \t5\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \t6\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \t7\().8B,  \r2\().8B,  \r3\().8B
+
+        trn1            \r0\().4H,  \t4\().4H,  \t6\().4H
+        trn2            \r2\().4H,  \t4\().4H,  \t6\().4H
+        trn1            \r1\().4H,  \t5\().4H,  \t7\().4H
+        trn2            \r3\().4H,  \t5\().4H,  \t7\().4H
+.endm
+
+.macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
+        trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
+        trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
+        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
+        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
+        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
+        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
+        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
+        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
+.endm
+
+.macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+        trn1            \r8\().8H,  \r0\().8H,  \r1\().8H
+        trn2            \r9\().8H,  \r0\().8H,  \r1\().8H
+        trn1            \r1\().8H,  \r2\().8H,  \r3\().8H
+        trn2            \r3\().8H,  \r2\().8H,  \r3\().8H
+        trn1            \r0\().8H,  \r4\().8H,  \r5\().8H
+        trn2            \r5\().8H,  \r4\().8H,  \r5\().8H
+        trn1            \r2\().8H,  \r6\().8H,  \r7\().8H
+        trn2            \r7\().8H,  \r6\().8H,  \r7\().8H
+
+        trn1            \r4\().4S,  \r0\().4S,  \r2\().4S
+        trn2            \r2\().4S,  \r0\().4S,  \r2\().4S
+        trn1            \r6\().4S,  \r5\().4S,  \r7\().4S
+        trn2            \r7\().4S,  \r5\().4S,  \r7\().4S
+        trn1            \r5\().4S,  \r9\().4S,  \r3\().4S
+        trn2            \r9\().4S,  \r9\().4S,  \r3\().4S
+        trn1            \r3\().4S,  \r8\().4S,  \r1\().4S
+        trn2            \r8\().4S,  \r8\().4S,  \r1\().4S
+
+        trn1            \r0\().2D,  \r3\().2D,  \r4\().2D
+        trn2            \r4\().2D,  \r3\().2D,  \r4\().2D
+
+        trn1            \r1\().2D,  \r5\().2D,  \r6\().2D
+        trn2            \r5\().2D,  \r5\().2D,  \r6\().2D
+
+        trn2            \r6\().2D,  \r8\().2D,  \r2\().2D
+        trn1            \r2\().2D,  \r8\().2D,  \r2\().2D
+
+        trn1            \r3\().2D,  \r9\().2D,  \r7\().2D
+        trn2            \r7\().2D,  \r9\().2D,  \r7\().2D
+.endm
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/simple_idct_neon.S
@@ -0,0 +1,362 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define Z1  22725  //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z2  21407  //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z3  19266  //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4  16383  //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 gives 1<<14; stored as (1<<14)-1
+#define Z5  12873  //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z6  8867   //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z7  4520   //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define Z4c ((1<<(COL_SHIFT-1))/Z4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
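+
+// Z4c folds the column-pass rounding bias into the DC term: adding Z4c to
+// the DC coefficient before the multiply by Z4 contributes
+// Z4c * Z4 ~= 1 << (COL_SHIFT - 1) to every column accumulator.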
+
+#define z1 v0.H[0]
+#define z2 v0.H[1]
+#define z3 v0.H[2]
+#define z4 v0.H[3]
+#define z5 v0.H[4]
+#define z6 v0.H[5]
+#define z7 v0.H[6]
+#define z4c v0.H[7]
+
+const   idct_coeff_neon, align=4
+        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
+endconst
+
+.macro idct_start data
+        prfm            pldl1keep, [\data]
+        mov             x10, x30
+        movrel          x3, idct_coeff_neon
+        ld1             {v0.2D}, [x3]
+.endm
+
+.macro idct_end
+        br              x10
+.endm
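+
+// The entry points reach the column helpers via bl, which clobbers x30, so
+// idct_start stashes the caller's return address in x10 and idct_end
+// returns through it with br.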
+
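+// The column pass is instantiated twice: \i = 1 works on the low .4H
+// halves, \i = 2 on the high ones.  smull\i and friends then expand either
+// to these *1 aliases of the plain instructions or to the genuine
+// smull2/smlal2/smlsl2 wide forms.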
+.macro smull1 a, b, c
+        smull           \a, \b, \c
+.endm
+
+.macro smlal1 a, b, c
+        smlal           \a, \b, \c
+.endm
+
+.macro smlsl1 a, b, c
+        smlsl           \a, \b, \c
+.endm
+
+.macro idct_col4_top y1, y2, y3, y4, i, l
+        smull\i         v7.4S,  \y3\l, z2
+        smull\i         v16.4S, \y3\l, z6
+        smull\i         v17.4S, \y2\l, z1
+        add             v19.4S, v23.4S, v7.4S
+        smull\i         v18.4S, \y2\l, z3
+        add             v20.4S, v23.4S, v16.4S
+        smull\i         v5.4S,  \y2\l, z5
+        sub             v21.4S, v23.4S, v16.4S
+        smull\i         v6.4S,  \y2\l, z7
+        sub             v22.4S, v23.4S, v7.4S
+
+        smlal\i         v17.4S, \y4\l, z3
+        smlsl\i         v18.4S, \y4\l, z7
+        smlsl\i         v5.4S,  \y4\l, z1
+        smlsl\i         v6.4S,  \y4\l, z5
+.endm
+
+.macro idct_row4_neon y1, y2, y3, y4, pass
+        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
+        movi            v23.4S, #1<<2, lsl #8   // 1 << (ROW_SHIFT - 1) rounding bias
+        orr             v5.16B, \y1\().16B, \y2\().16B
+        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
+        orr             v6.16B, \y3\().16B, \y4\().16B
+        orr             v5.16B, v5.16B, v6.16B
+        mov             x3, v5.D[1]
+        smlal           v23.4S, \y1\().4H, z4
+
+        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H
+
+        cmp             x3, #0
+        b.eq            \pass\()f
+
+        smull2          v7.4S, \y1\().8H, z4
+        smlal2          v17.4S, \y2\().8H, z5
+        smlsl2          v18.4S, \y2\().8H, z1
+        smull2          v16.4S, \y3\().8H, z2
+        smlal2          v5.4S, \y2\().8H, z7
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S
+        smlal2          v6.4S, \y2\().8H, z3
+        smull2          v7.4S, \y3\().8H, z6
+        smlal2          v17.4S, \y4\().8H, z7
+        smlsl2          v18.4S, \y4\().8H, z5
+        smlal2          v5.4S, \y4\().8H, z3
+        smlsl2          v6.4S, \y4\().8H, z1
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S
+        sub             v22.4S, v22.4S, v7.4S
+
+\pass:  add             \y3\().4S, v19.4S, v17.4S
+        add             \y4\().4S, v20.4S, v18.4S
+        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
+        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
+        add             v7.4S, v21.4S, v5.4S
+        add             v16.4S, v22.4S, v6.4S
+        shrn            \y3\().4H, v7.4S, #ROW_SHIFT
+        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
+        sub             v22.4S, v22.4S, v6.4S
+        sub             v19.4S, v19.4S, v17.4S
+        sub             v21.4S, v21.4S, v5.4S
+        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
+        sub             v20.4S, v20.4S, v18.4S
+        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
+        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
+        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT
+
+        trn1            v16.8H, \y1\().8H, \y2\().8H
+        trn2            v17.8H, \y1\().8H, \y2\().8H
+        trn1            v18.8H, \y3\().8H, \y4\().8H
+        trn2            v19.8H, \y3\().8H, \y4\().8H
+        trn1            \y1\().4S, v16.4S, v18.4S
+        trn1            \y2\().4S, v17.4S, v19.4S
+        trn2            \y3\().4S, v16.4S, v18.4S
+        trn2            \y4\().4S, v17.4S, v19.4S
+.endm
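+
+// idct_row4_neon transforms four rows.  The orr/mov pair tests whether
+// coefficients 4-7 of all four rows are zero (common for sparse blocks);
+// if so, the smull2/smlal2 half of the pass is skipped.  The trn ladder at
+// the end rearranges the results for the column pass.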
+
+.macro declare_idct_col4_neon i, l
+function idct_col4_neon\i
+        dup             v23.4H, z4c
+.if \i == 1
+        add             v23.4H, v23.4H, v24.4H
+.else
+        mov             v5.D[0], v24.D[1]
+        add             v23.4H, v23.4H, v5.4H
+.endif
+        smull           v23.4S, v23.4H, z4
+
+        idct_col4_top   v24, v25, v26, v27, \i, \l
+
+        mov             x4, v28.D[\i - 1]
+        mov             x5, v29.D[\i - 1]
+        cmp             x4, #0
+        b.eq            1f
+
+        smull\i         v7.4S,  v28\l,  z4
+        add             v19.4S, v19.4S, v7.4S
+        sub             v20.4S, v20.4S, v7.4S
+        sub             v21.4S, v21.4S, v7.4S
+        add             v22.4S, v22.4S, v7.4S
+
+1:      mov             x4, v30.D[\i - 1]
+        cmp             x5, #0
+        b.eq            2f
+
+        smlal\i         v17.4S, v29\l, z5
+        smlsl\i         v18.4S, v29\l, z1
+        smlal\i         v5.4S,  v29\l, z7
+        smlal\i         v6.4S,  v29\l, z3
+
+2:      mov             x5, v31.D[\i - 1]
+        cmp             x4, #0
+        b.eq            3f
+
+        smull\i         v7.4S,  v30\l, z6
+        smull\i         v16.4S, v30\l, z2
+        add             v19.4S, v19.4S, v7.4S
+        sub             v22.4S, v22.4S, v7.4S
+        sub             v20.4S, v20.4S, v16.4S
+        add             v21.4S, v21.4S, v16.4S
+
+3:      cmp             x5, #0
+        b.eq            4f
+
+        smlal\i         v17.4S, v31\l, z7
+        smlsl\i         v18.4S, v31\l, z5
+        smlal\i         v5.4S,  v31\l, z3
+        smlsl\i         v6.4S,  v31\l, z1
+
+4:      addhn           v7.4H, v19.4S, v17.4S
+        addhn2          v7.8H, v20.4S, v18.4S
+        subhn           v18.4H, v20.4S, v18.4S
+        subhn2          v18.8H, v19.4S, v17.4S
+
+        addhn           v16.4H, v21.4S, v5.4S
+        addhn2          v16.8H, v22.4S, v6.4S
+        subhn           v17.4H, v22.4S, v6.4S
+        subhn2          v17.8H, v21.4S, v5.4S
+
+        ret
+endfunc
+.endm
+
+declare_idct_col4_neon 1, .4H
+declare_idct_col4_neon 2, .8H
+
+function ff_simple_idct_put_neon, export=1
+        idct_start      x2
+
+        idct_row4_neon  v24, v25, v26, v27, 1
+        idct_row4_neon  v28, v29, v30, v31, 2
+        bl              idct_col4_neon1
+
+        sqshrun         v1.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2
+
+        sqshrun         v2.8B,  v7.8H, #COL_SHIFT-16
+        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
+        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
+        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16
+
+        zip1            v16.4S, v1.4S, v2.4S
+        zip2            v17.4S, v1.4S, v2.4S
+
+        st1             {v16.D}[0], [x0], x1
+        st1             {v16.D}[1], [x0], x1
+
+        zip1            v18.4S, v3.4S, v4.4S
+        zip2            v19.4S, v3.4S, v4.4S
+
+        st1             {v17.D}[0], [x0], x1
+        st1             {v17.D}[1], [x0], x1
+        st1             {v18.D}[0], [x0], x1
+        st1             {v18.D}[1], [x0], x1
+        st1             {v19.D}[0], [x0], x1
+        st1             {v19.D}[1], [x0], x1
+
+        idct_end
+endfunc
+
+function ff_simple_idct_add_neon, export=1
+        idct_start      x2
+
+        idct_row4_neon  v24, v25, v26, v27, 1
+        idct_row4_neon  v28, v29, v30, v31, 2
+        bl              idct_col4_neon1
+
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2
+
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+
+        mov             x9,  x0
+        ld1             {v19.D}[0], [x0], x1
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        ld1             {v19.D}[1], [x0], x1
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        ld1             {v20.D}[0], [x0], x1
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        ld1             {v20.D}[1], [x0], x1
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        ld1             {v21.D}[0], [x0], x1
+        uaddw           v23.8H, v23.8H, v19.8B
+        uaddw2          v24.8H, v24.8H, v19.16B
+        ld1             {v21.D}[1], [x0], x1
+        sqxtun          v23.8B, v23.8H
+        sqxtun2         v23.16B, v24.8H
+        ld1             {v22.D}[0], [x0], x1
+        uaddw           v24.8H, v25.8H, v20.8B
+        uaddw2          v25.8H, v26.8H, v20.16B
+        ld1             {v22.D}[1], [x0], x1
+        sqxtun          v24.8B, v24.8H
+        sqxtun2         v24.16B, v25.8H
+        st1             {v23.D}[0], [x9], x1
+        uaddw           v25.8H, v27.8H, v21.8B
+        uaddw2          v26.8H, v28.8H, v21.16B
+        st1             {v23.D}[1], [x9], x1
+        sqxtun          v25.8B, v25.8H
+        sqxtun2         v25.16B, v26.8H
+        st1             {v24.D}[0], [x9], x1
+        uaddw           v26.8H, v29.8H, v22.8B
+        uaddw2          v27.8H, v30.8H, v22.16B
+        st1             {v24.D}[1], [x9], x1
+        sqxtun          v26.8B, v26.8H
+        sqxtun2         v26.16B, v27.8H
+        st1             {v25.D}[0], [x9], x1
+        st1             {v25.D}[1], [x9], x1
+        st1             {v26.D}[0], [x9], x1
+        st1             {v26.D}[1], [x9], x1
+
+        idct_end
+endfunc
+
+function ff_simple_idct_neon, export=1
+        idct_start      x0
+
+        mov             x2,  x0
+        idct_row4_neon  v24, v25, v26, v27, 1
+        idct_row4_neon  v28, v29, v30, v31, 2
+        sub             x2, x2, #128
+        bl              idct_col4_neon1
+
+        sshr            v1.8H, v7.8H, #COL_SHIFT-16
+        sshr            v2.8H, v16.8H, #COL_SHIFT-16
+        sshr            v3.8H, v17.8H, #COL_SHIFT-16
+        sshr            v4.8H, v18.8H, #COL_SHIFT-16
+
+        bl              idct_col4_neon2
+
+        sshr            v7.8H, v7.8H, #COL_SHIFT-16
+        sshr            v16.8H, v16.8H, #COL_SHIFT-16
+        sshr            v17.8H, v17.8H, #COL_SHIFT-16
+        sshr            v18.8H, v18.8H, #COL_SHIFT-16
+
+        zip1            v23.2D, v1.2D, v7.2D
+        zip2            v24.2D, v1.2D, v7.2D
+        st1             {v23.2D,v24.2D}, [x2], #32
+        zip1            v25.2D, v2.2D, v16.2D
+        zip2            v26.2D, v2.2D, v16.2D
+        st1             {v25.2D,v26.2D}, [x2], #32
+        zip1            v27.2D, v3.2D, v17.2D
+        zip2            v28.2D, v3.2D, v17.2D
+        st1             {v27.2D,v28.2D}, [x2], #32
+        zip1            v29.2D, v4.2D, v18.2D
+        zip2            v30.2D, v4.2D, v18.2D
+        st1             {v29.2D,v30.2D}, [x2], #32
+
+        idct_end
+endfunc
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/vc1dsp.h"
+
+#include "config.h"
+
+void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int h, int x, int y);
+void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int h, int x, int y);
+void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                int h, int x, int y);
+
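+/* VC-1 uses the no-rounding chroma MC variants: index 0 is the 8-pixel-wide
+ * kernel, index 1 the 4-pixel-wide one.  The NEON functions are generated
+ * from the same template as the H.264 chroma MC (h264cmc_neon.S). */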
+av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
+        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+        dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+        dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+    }
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/videodsp.S
@@ -0,0 +1,28 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
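+// Prefetch h (w2) rows of image data at x0 with stride x1, two rows per
+// iteration; the loop is formed by branching back to the exported symbol
+// itself rather than to a local label.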
+function ff_prefetch_aarch64, export=1
+        subs            w2,  w2,  #2
+        prfm            pldl1strm, [x0]
+        prfm            pldl1strm, [x0,  x1]
+        add             x0,  x0,  x1,  lsl #1
+        b.gt            X(ff_prefetch_aarch64)
+        ret
+endfunc
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/videodsp_init.c
@@ -0,0 +1,32 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/videodsp.h"
+
+void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);
+
+av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_armv8(cpu_flags))
+        ctx->prefetch = ff_prefetch_aarch64;
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
+#define AVCODEC_AARCH64_VP9DSP_INIT_H
+
+#include "libavcodec/vp9dsp.h"
+
+void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
+void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);
+
+#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_10bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 10
+#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_12bpp_aarch64.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define BPP 12
+#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
+#include "vp9dsp_init_16bpp_aarch64_template.c"
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_16bpp_aarch64_template.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/aarch64/cpu.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz, suffix)                                          \
+void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                      const uint8_t *src, ptrdiff_t src_stride, \
+                                      int h, int mx, int my)
+
+#define decl_mc_func(op, filter, dir, sz, bpp)                                                   \
+void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                                       const uint8_t *src, ptrdiff_t src_stride, \
+                                                       int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz, bpp)                                      \
+static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
+                                                const uint8_t *src,                 \
+                                                ptrdiff_t src_stride,               \
+                                                int h, int mx, int my)              \
+{                                                                                   \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]);         \
+    /* We only need h + 7 lines, but the horizontal filter assumes an               \
+     * even number of rows, so filter h + 8 lines here. */                          \
+    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz,                          \
+                                             src - 3 * src_stride, src_stride,      \
+                                             h + 8, mx, 0);                         \
+    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride,                    \
+                                                temp + 3 * 2 * sz, 2 * sz,          \
+                                                h, 0, my);                          \
+}
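+/* Note on the temp buffer above: the horizontal pass writes h + 8 rows of
+ * sz pixels at 2 bytes per pixel (the 10/12 bpp code works on 16-bit
+ * samples), using a row stride of 2 * sz bytes. The (1 + (sz < 64)) factor
+ * covers blocks of width sz that are up to 2 * sz pixels tall; heights are
+ * capped at 64, so plain sz + 8 rows suffice for sz == 64. */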
+
+#define decl_filter_funcs(op, dir, sz, bpp)  \
+    decl_mc_func(op, regular, dir, sz, bpp); \
+    decl_mc_func(op, sharp,   dir, sz, bpp); \
+    decl_mc_func(op, smooth,  dir, sz, bpp)
+
+#define decl_mc_funcs(sz, bpp)           \
+    decl_filter_funcs(put, h,  sz, bpp); \
+    decl_filter_funcs(avg, h,  sz, bpp); \
+    decl_filter_funcs(put, v,  sz, bpp); \
+    decl_filter_funcs(avg, v,  sz, bpp); \
+    decl_filter_funcs(put, hv, sz, bpp); \
+    decl_filter_funcs(avg, hv, sz, bpp)
+
+#define ff_vp9_copy32_neon  ff_vp9_copy32_aarch64
+#define ff_vp9_copy64_neon  ff_vp9_copy64_aarch64
+#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
+
+declare_fpel(copy, 128, );
+declare_fpel(copy, 64,  );
+declare_fpel(copy, 32,  );
+declare_fpel(copy, 16,  );
+declare_fpel(copy, 8,   );
+declare_fpel(avg, 64, _16);
+declare_fpel(avg, 32, _16);
+declare_fpel(avg, 16, _16);
+declare_fpel(avg, 8,  _16);
+declare_fpel(avg, 4,  _16);
+
+decl_mc_funcs(64, BPP);
+decl_mc_funcs(32, BPP);
+decl_mc_funcs(16, BPP);
+decl_mc_funcs(8, BPP);
+decl_mc_funcs(4, BPP);
+
+#define define_8tap_2d_funcs(sz, bpp)        \
+    define_8tap_2d_fn(put, regular, sz, bpp) \
+    define_8tap_2d_fn(put, sharp,   sz, bpp) \
+    define_8tap_2d_fn(put, smooth,  sz, bpp) \
+    define_8tap_2d_fn(avg, regular, sz, bpp) \
+    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
+    define_8tap_2d_fn(avg, smooth,  sz, bpp)
+
+define_8tap_2d_funcs(64, BPP)
+define_8tap_2d_funcs(32, BPP)
+define_8tap_2d_funcs(16, BPP)
+define_8tap_2d_funcs(8,  BPP)
+define_8tap_2d_funcs(4,  BPP)
+
+static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#define init_fpel(idx1, idx2, sz, type, suffix)      \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix
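+/* The mc table is indexed as mc[width_idx][filter][avg][mx != 0][my != 0],
+ * where width_idx 0..4 corresponds to block widths 64 down to 4, and
+ * idx2 selects between the put (0) and avg (1) variants. */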
+
+#define init_copy(idx, sz, suffix) \
+    init_fpel(idx, 0, sz, copy, suffix)
+
+#define init_avg(idx, sz, suffix) \
+    init_fpel(idx, 1, sz, avg,  suffix)
+
+#define init_copy_avg(idx, sz1, sz2) \
+    init_copy(idx, sz2, _neon);      \
+    init_avg (idx, sz1, _16_neon)
+
+    if (have_armv8(cpu_flags)) {
+        init_copy(0, 128, _aarch64);
+        init_copy(1, 64,  _aarch64);
+        init_copy(2, 32,  _aarch64);
+    }
+
+    if (have_neon(cpu_flags)) {
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
+    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp)                                   \
+    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp)
+
+#define init_mc_funcs_dirs(idx, sz, bpp)            \
+    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
+    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp)
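+/* The hv entries pass an empty pfx because they resolve to the static 2-D
+ * wrappers generated by define_8tap_2d_fn above, not to exported
+ * ff_vp9_* assembly functions. */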
+
+
+        init_avg(0, 64, _16_neon);
+        init_avg(1, 32, _16_neon);
+        init_avg(2, 16, _16_neon);
+        init_copy_avg(3, 8, 16);
+        init_copy_avg(4, 4, 8);
+
+        init_mc_funcs_dirs(0, 64, BPP);
+        init_mc_funcs_dirs(1, 32, BPP);
+        init_mc_funcs_dirs(2, 16, BPP);
+        init_mc_funcs_dirs(3, 8,  BPP);
+        init_mc_funcs_dirs(4, 4,  BPP);
+    }
+}
+
+#define define_itxfm2(type_a, type_b, sz, bpp)                                     \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst,    \
+                                                                 ptrdiff_t stride, \
+                                                                 int16_t *_block, int eob)
+#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
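+/* The extra macro level lets the BPP argument expand before token pasting. */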
+
+#define define_itxfm_funcs(sz, bpp)      \
+    define_itxfm(idct,  idct,  sz, bpp); \
+    define_itxfm(iadst, idct,  sz, bpp); \
+    define_itxfm(idct,  iadst, sz, bpp); \
+    define_itxfm(iadst, iadst, sz, bpp)
+
+define_itxfm_funcs(4,  BPP);
+define_itxfm_funcs(8,  BPP);
+define_itxfm_funcs(16, BPP);
+define_itxfm(idct, idct, 32, BPP);
+define_itxfm(iwht, iwht, 4,  BPP);
+
+
+static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_itxfm2(tx, sz, bpp)                                               \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;  \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
+#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
+
+#define init_idct2(tx, nm, bpp)     \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
+#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
+
+        init_itxfm(TX_4X4,   4x4,   BPP);
+        init_itxfm(TX_8X8,   8x8,   BPP);
+        init_itxfm(TX_16X16, 16x16, BPP);
+        init_idct(TX_32X32, idct_idct_32x32, BPP);
+        init_idct(4,        iwht_iwht_4x4,   BPP);
+    }
+}
+
+#define define_loop_filter(dir, wd, size, bpp) \
+void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, size, bpp) \
+    define_loop_filter(h, wd, size, bpp);  \
+    define_loop_filter(v, wd, size, bpp)
+
+define_loop_filters(4,  8,  BPP);
+define_loop_filters(8,  8,  BPP);
+define_loop_filters(16, 8,  BPP);
+
+define_loop_filters(16, 16, BPP);
+
+define_loop_filters(44, 16, BPP);
+define_loop_filters(48, 16, BPP);
+define_loop_filters(84, 16, BPP);
+define_loop_filters(88, 16, BPP);
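+/* Naming: in ff_vp9_loop_filter_{h,v}_<wd>_<size>_<bpp>_neon, wd is the
+ * filter width (two digits, e.g. 48, mean a width-4 filter on the first
+ * half of the edge and a width-8 filter on the second), size is the number
+ * of pixels filtered along the edge, and E, I and H are the VP9 edge-limit,
+ * interior-limit and high-edge-variance thresholds. */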
+
+static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
+    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
+
+#define init_lpf_func_16(idx, dir, bpp) \
+    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
+
+#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
+    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
+
+#define init_lpf_funcs_8_wd(idx, wd, bpp) \
+    init_lpf_func_8(idx, 0, h, wd, bpp);  \
+    init_lpf_func_8(idx, 1, v, wd, bpp)
+
+#define init_lpf_funcs_16(bpp)   \
+    init_lpf_func_16(0, h, bpp); \
+    init_lpf_func_16(1, v, bpp)
+
+#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
+    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);  \
+    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
+
+#define init_lpf_funcs_8(bpp)        \
+    init_lpf_funcs_8_wd(0, 4,  bpp); \
+    init_lpf_funcs_8_wd(1, 8,  bpp); \
+    init_lpf_funcs_8_wd(2, 16, bpp)
+
+#define init_lpf_funcs_mix2(bpp)           \
+    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
+    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
+    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
+    init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
+
+        init_lpf_funcs_8(BPP);
+        init_lpf_funcs_16(BPP);
+        init_lpf_funcs_mix2(BPP);
+    }
+}
+
+av_cold void INIT_FUNC(VP9DSPContext *dsp)
+{
+    vp9dsp_mc_init_aarch64(dsp);
+    vp9dsp_loopfilter_init_aarch64(dsp);
+    vp9dsp_itxfm_init_aarch64(dsp);
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9dsp_init_aarch64.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "vp9dsp_init.h"
+
+#define declare_fpel(type, sz)                                          \
+void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                              const uint8_t *src, ptrdiff_t src_stride, \
+                              int h, int mx, int my)
+
+#define declare_copy_avg(sz) \
+    declare_fpel(copy, sz);  \
+    declare_fpel(avg , sz)
+
+#define decl_mc_func(op, filter, dir, sz)                                                \
+void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                               const uint8_t *src, ptrdiff_t src_stride, \
+                                               int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz)                                         \
+static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                        const uint8_t *src, ptrdiff_t src_stride, \
+                                        int h, int mx, int my)                    \
+{                                                                                 \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]);           \
+    /* We only need h + 7 lines, but the horizontal filter assumes an             \
+     * even number of rows, so filter h + 8 lines here. */                        \
+    ff_vp9_put_##filter##sz##_h_neon(temp, sz,                                    \
+                                     src - 3 * src_stride, src_stride,            \
+                                     h + 8, mx, 0);                               \
+    ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride,                          \
+                                        temp + 3 * sz, sz,                        \
+                                        h, 0, my);                                \
+}
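+/* As in the 16 bpp template, temp holds h + 8 rows of sz pixels, here at
+ * 1 byte per pixel, sized for blocks up to 2 * sz pixels tall when sz < 64. */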
+
+#define decl_filter_funcs(op, dir, sz)  \
+    decl_mc_func(op, regular, dir, sz); \
+    decl_mc_func(op, sharp,   dir, sz); \
+    decl_mc_func(op, smooth,  dir, sz)
+
+#define decl_mc_funcs(sz)           \
+    decl_filter_funcs(put, h,  sz); \
+    decl_filter_funcs(avg, h,  sz); \
+    decl_filter_funcs(put, v,  sz); \
+    decl_filter_funcs(avg, v,  sz); \
+    decl_filter_funcs(put, hv, sz); \
+    decl_filter_funcs(avg, hv, sz)
+
+#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
+#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
+
+declare_copy_avg(64);
+declare_copy_avg(32);
+declare_copy_avg(16);
+declare_copy_avg(8);
+declare_copy_avg(4);
+
+decl_mc_funcs(64);
+decl_mc_funcs(32);
+decl_mc_funcs(16);
+decl_mc_funcs(8);
+decl_mc_funcs(4);
+
+#define define_8tap_2d_funcs(sz)        \
+    define_8tap_2d_fn(put, regular, sz) \
+    define_8tap_2d_fn(put, sharp,   sz) \
+    define_8tap_2d_fn(put, smooth,  sz) \
+    define_8tap_2d_fn(avg, regular, sz) \
+    define_8tap_2d_fn(avg, sharp,   sz) \
+    define_8tap_2d_fn(avg, smooth,  sz)
+
+define_8tap_2d_funcs(64)
+define_8tap_2d_funcs(32)
+define_8tap_2d_funcs(16)
+define_8tap_2d_funcs(8)
+define_8tap_2d_funcs(4)
+
+static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#define init_fpel(idx1, idx2, sz, type, suffix)      \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix
+
+#define init_copy(idx, sz, suffix) \
+    init_fpel(idx, 0, sz, copy, suffix)
+
+#define init_avg(idx, sz, suffix) \
+    init_fpel(idx, 1, sz, avg,  suffix)
+
+#define init_copy_avg(idx, sz) \
+    init_copy(idx, sz, _neon); \
+    init_avg (idx, sz, _neon)
+
+    if (have_armv8(cpu_flags)) {
+        init_copy(0, 64, _aarch64);
+        init_copy(1, 32, _aarch64);
+    }
+
+    if (have_neon(cpu_flags)) {
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
+    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx)                                   \
+    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx)
+
+#define init_mc_funcs_dirs(idx, sz)            \
+    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_); \
+    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_); \
+    init_mc_funcs(idx, hv, 1, 1, sz,)
+
+        init_avg(0, 64, _neon);
+        init_avg(1, 32, _neon);
+        init_copy_avg(2, 16);
+        init_copy_avg(3, 8);
+        init_copy_avg(4, 4);
+
+        init_mc_funcs_dirs(0, 64);
+        init_mc_funcs_dirs(1, 32);
+        init_mc_funcs_dirs(2, 16);
+        init_mc_funcs_dirs(3, 8);
+        init_mc_funcs_dirs(4, 4);
+    }
+}
+
+#define define_itxfm(type_a, type_b, sz)                                   \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst,    \
+                                                         ptrdiff_t stride, \
+                                                         int16_t *_block, int eob)
+
+#define define_itxfm_funcs(sz)      \
+    define_itxfm(idct,  idct,  sz); \
+    define_itxfm(iadst, idct,  sz); \
+    define_itxfm(idct,  iadst, sz); \
+    define_itxfm(iadst, iadst, sz)
+
+define_itxfm_funcs(4);
+define_itxfm_funcs(8);
+define_itxfm_funcs(16);
+define_itxfm(idct, idct, 32);
+define_itxfm(iwht, iwht, 4);
+
+
+static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_itxfm(tx, sz)                                             \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;  \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon; \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon; \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
+
+#define init_idct(tx, nm)           \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
+
+        init_itxfm(TX_4X4, 4x4);
+        init_itxfm(TX_8X8, 8x8);
+        init_itxfm(TX_16X16, 16x16);
+        init_idct(TX_32X32, idct_idct_32x32);
+        init_idct(4, iwht_iwht_4x4);
+    }
+}
+
+#define define_loop_filter(dir, wd, len) \
+void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
+
+#define define_loop_filters(wd, len) \
+    define_loop_filter(h, wd, len);  \
+    define_loop_filter(v, wd, len)
+
+define_loop_filters(4, 8);
+define_loop_filters(8, 8);
+define_loop_filters(16, 8);
+
+define_loop_filters(16, 16);
+
+define_loop_filters(44, 16);
+define_loop_filters(48, 16);
+define_loop_filters(84, 16);
+define_loop_filters(88, 16);
+
+static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
+        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
+        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
+        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
+        dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
+        dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
+
+        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
+        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
+
+        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
+        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
+        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
+        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
+        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
+        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
+        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
+        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
+    }
+}
+
+av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
+{
+    if (bpp == 10) {
+        ff_vp9dsp_init_10bpp_aarch64(dsp);
+        return;
+    } else if (bpp == 12) {
+        ff_vp9dsp_init_12bpp_aarch64(dsp);
+        return;
+    } else if (bpp != 8)
+        return;
+
+    vp9dsp_mc_init_aarch64(dsp);
+    vp9dsp_loopfilter_init_aarch64(dsp);
+    vp9dsp_itxfm_init_aarch64(dsp);
+}
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_16bpp_neon.S
@@ -0,0 +1,2017 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+        .short  11585, 0, 6270, 15137
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
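+// The constants above are the VP9 Q14 fixed-point transform coefficients:
+// round(16384 * cos(n*pi/64)) for the idct/iadst8/iadst16 tables (e.g.
+// 11585 is round(16384 * cos(pi/4))), and the spec's sinpi-derived values
+// for iadst4.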
+
+.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
+        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
+        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
+        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
+        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
+        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
+        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
+        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
+        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
+.endm
+
+// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
+// over two registers.
+.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
+        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
+        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
+
+        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
+        // while swapping the two 4x4 matrices between each other
+
+        // First step of the 4x4 transpose of r1-r7, into t0-t3
+        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
+        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
+        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
+
+        // First step of the 4x4 transpose of r8-r14, into r1-r7
+        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
+        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
+        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
+        trn2            \r7\().4s,  \r12\().4s, \r14\().4s
+
+        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
+        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
+        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
+        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
+        trn2            \r14\().2d, \t1\().2d,  \t3\().2d
+
+        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
+        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
+        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
+        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
+        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
+
+        // Move the outputs of trn1 back in place
+        mov             \r1\().16b,  \t0\().16b
+        mov             \r3\().16b,  \t1\().16b
+.endm
+
+// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+// in/out are .4s registers; this can make do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+        neg             \tmp4\().4s, v0.4s
+.endif
+        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
+        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
+.if \neg > 0
+        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
+.else
+        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
+.endif
+.ifb \tmp5
+        rshrn           \out1\().2s, \tmp3\().2d, #14
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
+        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out2\().2s, \tmp3\().2d, #14
+        rshrn2          \out2\().4s, \tmp4\().2d, #14
+.else
+        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
+        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
+        rshrn           \out1\().2s, \tmp3\().2d, #14
+        rshrn2          \out1\().4s, \tmp4\().2d, #14
+        rshrn           \out2\().2s, \tmp5\().2d, #14
+        rshrn2          \out2\().4s, \tmp6\().2d, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().2d, \in1\().2s,  v0.s[0]
+        smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
+        rshrn           \out1\().2s, \tmp1\().2d, #14
+        rshrn2          \out1\().4s, \tmp2\().2d, #14
+        rshrn           \out2\().2s, \tmp1\().2d, #14
+        rshrn2          \out2\().4s, \tmp2\().2d, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .2d registers, in are 2 x .4s registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+        smull           \out1\().2d, \in1\().2s, \coef1
+        smull2          \out2\().2d, \in1\().4s, \coef1
+        smull           \out3\().2d, \in1\().2s, \coef2
+        smull2          \out4\().2d, \in1\().4s, \coef2
+        smlsl           \out1\().2d, \in2\().2s, \coef2
+        smlsl2          \out2\().2d, \in2\().4s, \coef2
+        smlal           \out3\().2d, \in2\().2s, \coef1
+        smlal2          \out4\().2d, \in2\().4s, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .4s registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+        neg             \tmp3\().2d, \tmp3\().2d
+        neg             \tmp4\().2d, \tmp4\().2d
+.endif
+        rshrn           \inout1\().2s, \tmp1\().2d,  #14
+        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
+        rshrn           \inout2\().2s, \tmp3\().2d,  #14
+        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().2d, \inout1\().2s, \coef1
+        smull2          \tmp2\().2d, \inout1\().4s, \coef1
+        smull           \tmp3\().2d, \inout1\().2s, \coef2
+        smull2          \tmp4\().2d, \inout1\().4s, \coef2
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().2d, \inout2\().2s, \coef2
+        smull2          \tmp2\().2d, \inout2\().4s, \coef2
+        smull           \tmp3\().2d, \inout2\().2s, \coef1
+        smull2          \tmp4\().2d, \inout2\().4s, \coef1
+        neg             \tmp1\().2d, \tmp1\().2d
+        neg             \tmp2\().2d, \tmp2\().2d
+        rshrn           \inout2\().2s, \tmp3\().2d, #14
+        rshrn2          \inout2\().4s, \tmp4\().2d, #14
+        rshrn           \inout1\().2s, \tmp1\().2d, #14
+        rshrn2          \inout1\().4s, \tmp2\().2d, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().2d, \in\().2s, \coef
+        smull2          \out2\().2d, \in\().4s, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().2s, \in1\().2d, \shift
+        rshrn2          \out\().4s, \in2\().2d, \shift
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_4s out1, out2, in1, in2
+        add             \out1\().4s, \in1\().4s, \in2\().4s
+        sub             \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_4s_r out1, out2, in1, in2
+        sub             \out1\().4s, \in1\().4s, \in2\().4s
+        add             \out2\().4s, \in1\().4s, \in2\().4s
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .4s registers, in are 4 x .2d registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        add             \tmp1\().2d, \in1\().2d, \in3\().2d
+        add             \tmp2\().2d, \in2\().2d, \in4\().2d
+        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
+        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
+        rshrn           \out1\().2s, \tmp1\().2d,  #14
+        rshrn2          \out1\().4s, \tmp2\().2d,  #14
+        rshrn           \out2\().2s, \tmp3\().2d,  #14
+        rshrn2          \out2\().4s, \tmp4\().2d,  #14
+.endm
+
+.macro iwht4_10 c0, c1, c2, c3
+        add             \c0\().4s, \c0\().4s, \c1\().4s
+        sub             v17.4s,    \c2\().4s, \c3\().4s
+        sub             v16.4s,    \c0\().4s, v17.4s
+        sshr            v16.4s,    v16.4s,    #1
+        sub             \c2\().4s, v16.4s,    \c1\().4s
+        sub             \c1\().4s, v16.4s,    \c3\().4s
+        add             \c3\().4s, v17.4s,    \c2\().4s
+        sub             \c0\().4s, \c0\().4s, \c1\().4s
+.endm
+
+.macro iwht4_12 c0, c1, c2, c3
+        iwht4_10        \c0, \c1, \c2, \c3
+.endm
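+// The 12-bit iwht reuses the 10-bit code: the Walsh-Hadamard transform
+// uses only additions and shifts, so no extra intermediate precision is
+// needed at the higher bit depth.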
+
+.macro idct4_10 c0, c1, c2, c3
+        mul             v22.4s,    \c1\().4s, v0.s[3]
+        mul             v20.4s,    \c1\().4s, v0.s[2]
+        add             v16.4s,    \c0\().4s, \c2\().4s
+        sub             v17.4s,    \c0\().4s, \c2\().4s
+        mla             v22.4s,    \c3\().4s, v0.s[2]
+        mul             v18.4s,    v16.4s,    v0.s[0]
+        mul             v24.4s,    v17.4s,    v0.s[0]
+        mls             v20.4s,    \c3\().4s, v0.s[3]
+        srshr           v22.4s,    v22.4s,    #14
+        srshr           v18.4s,    v18.4s,    #14
+        srshr           v24.4s,    v24.4s,    #14
+        srshr           v20.4s,    v20.4s,    #14
+        add             \c0\().4s, v18.4s,    v22.4s
+        sub             \c3\().4s, v18.4s,    v22.4s
+        add             \c1\().4s, v24.4s,    v20.4s
+        sub             \c2\().4s, v24.4s,    v20.4s
+.endm
+
+.macro idct4_12 c0, c1, c2, c3
+        smull           v22.2d,    \c1\().2s, v0.s[3]
+        smull2          v23.2d,    \c1\().4s, v0.s[3]
+        smull           v20.2d,    \c1\().2s, v0.s[2]
+        smull2          v21.2d,    \c1\().4s, v0.s[2]
+        add             v16.4s,    \c0\().4s, \c2\().4s
+        sub             v17.4s,    \c0\().4s, \c2\().4s
+        smlal           v22.2d,    \c3\().2s, v0.s[2]
+        smlal2          v23.2d,    \c3\().4s, v0.s[2]
+        smull           v18.2d,    v16.2s,    v0.s[0]
+        smull2          v19.2d,    v16.4s,    v0.s[0]
+        smull           v24.2d,    v17.2s,    v0.s[0]
+        smull2          v25.2d,    v17.4s,    v0.s[0]
+        smlsl           v20.2d,    \c3\().2s, v0.s[3]
+        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
+        rshrn           v22.2s,    v22.2d,    #14
+        rshrn2          v22.4s,    v23.2d,    #14
+        rshrn           v18.2s,    v18.2d,    #14
+        rshrn2          v18.4s,    v19.2d,    #14
+        rshrn           v24.2s,    v24.2d,    #14
+        rshrn2          v24.4s,    v25.2d,    #14
+        rshrn           v20.2s,    v20.2d,    #14
+        rshrn2          v20.4s,    v21.2d,    #14
+        add             \c0\().4s, v18.4s,    v22.4s
+        sub             \c3\().4s, v18.4s,    v22.4s
+        add             \c1\().4s, v24.4s,    v20.4s
+        sub             \c2\().4s, v24.4s,    v20.4s
+.endm
+
+.macro iadst4_10 c0, c1, c2, c3
+        mul             v16.4s,    \c0\().4s, v1.s[0]
+        mla             v16.4s,    \c2\().4s, v1.s[1]
+        mla             v16.4s,    \c3\().4s, v1.s[2]
+        mul             v18.4s,    \c0\().4s, v1.s[2]
+        mls             v18.4s,    \c2\().4s, v1.s[0]
+        sub             \c0\().4s, \c0\().4s, \c2\().4s
+        mls             v18.4s,    \c3\().4s, v1.s[1]
+        add             \c0\().4s, \c0\().4s, \c3\().4s
+        mul             v22.4s,    \c1\().4s, v1.s[3]
+        mul             v20.4s,    \c0\().4s, v1.s[3]
+        add             v24.4s,    v16.4s,    v22.4s
+        add             v26.4s,    v18.4s,    v22.4s
+        srshr           \c0\().4s, v24.4s,    #14
+        add             v16.4s,    v16.4s,    v18.4s
+        srshr           \c1\().4s, v26.4s,    #14
+        sub             v16.4s,    v16.4s,    v22.4s
+        srshr           \c2\().4s, v20.4s,    #14
+        srshr           \c3\().4s, v16.4s,    #14
+.endm
+
+.macro iadst4_12 c0, c1, c2, c3
+        smull           v16.2d,    \c0\().2s, v1.s[0]
+        smull2          v17.2d,    \c0\().4s, v1.s[0]
+        smlal           v16.2d,    \c2\().2s, v1.s[1]
+        smlal2          v17.2d,    \c2\().4s, v1.s[1]
+        smlal           v16.2d,    \c3\().2s, v1.s[2]
+        smlal2          v17.2d,    \c3\().4s, v1.s[2]
+        smull           v18.2d,    \c0\().2s, v1.s[2]
+        smull2          v19.2d,    \c0\().4s, v1.s[2]
+        smlsl           v18.2d,    \c2\().2s, v1.s[0]
+        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
+        sub             \c0\().4s, \c0\().4s, \c2\().4s
+        smlsl           v18.2d,    \c3\().2s, v1.s[1]
+        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
+        add             \c0\().4s, \c0\().4s, \c3\().4s
+        smull           v22.2d,    \c1\().2s, v1.s[3]
+        smull2          v23.2d,    \c1\().4s, v1.s[3]
+        smull           v20.2d,    \c0\().2s, v1.s[3]
+        smull2          v21.2d,    \c0\().4s, v1.s[3]
+        add             v24.2d,    v16.2d,    v22.2d
+        add             v25.2d,    v17.2d,    v23.2d
+        add             v26.2d,    v18.2d,    v22.2d
+        add             v27.2d,    v19.2d,    v23.2d
+        rshrn           \c0\().2s, v24.2d,    #14
+        rshrn2          \c0\().4s, v25.2d,    #14
+        add             v16.2d,    v16.2d,    v18.2d
+        add             v17.2d,    v17.2d,    v19.2d
+        rshrn           \c1\().2s, v26.2d,    #14
+        rshrn2          \c1\().4s, v27.2d,    #14
+        sub             v16.2d,    v16.2d,    v22.2d
+        sub             v17.2d,    v17.2d,    v23.2d
+        rshrn           \c2\().2s, v20.2d,    #14
+        rshrn2          \c2\().4s, v21.2d,    #14
+        rshrn           \c3\().2s, v16.2d,    #14
+        rshrn2          \c3\().4s, v17.2d,    #14
+.endm
+
+// The public functions in this file have the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
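+// Under the AAPCS64 calling convention that maps to:
+//   x0 = dst, x1 = stride, x2 = block (coefficients), w3 = eob.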
+
+.macro itxfm_func4x4 txfm1, txfm2, bpp
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+.endif
+.ifc \txfm1,iadst
+        movrel          x4,  iadst4_coeffs
+        ld1             {v0.d}[1], [x4]
+        sxtl2           v1.4s,  v0.8h
+.endif
+.else
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.8h}, [x4]
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+
+        movi            v30.4s, #0
+        movi            v31.4s, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.ne            1f
+        // DC-only for idct/idct
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s, v0.s[0]
+        rshrn           v2.2s,  v2.2d, #14
+        smull           v2.2d,  v2.2s, v0.s[0]
+        rshrn           v2.2s,  v2.2d, #14
+        st1             {v31.s}[0], [x2]
+        dup             v4.4s,  v2.s[0]
+        mov             v5.16b, v4.16b
+        mov             v6.16b, v4.16b
+        mov             v7.16b, v4.16b
+        b               2f
+.endif
+
+1:
+        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
+        st1             {v30.4s,v31.4s}, [x2], #32
+
+.ifc \txfm1,iwht
+        sshr            v4.4s,  v4.4s,  #2
+        sshr            v5.4s,  v5.4s,  #2
+        sshr            v6.4s,  v6.4s,  #2
+        sshr            v7.4s,  v7.4s,  #2
+.endif
+
+        \txfm1\()4_\bpp v4,  v5,  v6,  v7
+
+        st1             {v30.4s,v31.4s}, [x2], #32
+        // Transpose 4x4 with 32 bit elements
+        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
+
+        \txfm2\()4_\bpp v4,  v5,  v6,  v7
+2:
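+        // Build the per-bpp pixel maximum in v31: mvni inverts the
+        // immediate, giving 0x03ff for 10 bpp and 0x0fff for 12 bpp,
+        // which the umin instructions below clamp against.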
+        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
+        ld1             {v0.4h},   [x0], x1
+        ld1             {v1.4h},   [x0], x1
+.ifnc \txfm1,iwht
+        srshr           v4.4s,  v4.4s,  #4
+        srshr           v5.4s,  v5.4s,  #4
+        srshr           v6.4s,  v6.4s,  #4
+        srshr           v7.4s,  v7.4s,  #4
+.endif
+        uaddw           v4.4s,  v4.4s,  v0.4h
+        uaddw           v5.4s,  v5.4s,  v1.4h
+        ld1             {v2.4h},   [x0], x1
+        ld1             {v3.4h},   [x0], x1
+        sqxtun          v0.4h,  v4.4s
+        sqxtun2         v0.8h,  v5.4s
+        sub             x0,  x0,  x1, lsl #2
+
+        uaddw           v6.4s,  v6.4s,  v2.4h
+        umin            v0.8h,  v0.8h,  v31.8h
+        uaddw           v7.4s,  v7.4s,  v3.4h
+        st1             {v0.4h},   [x0], x1
+        sqxtun          v2.4h,  v6.4s
+        sqxtun2         v2.8h,  v7.4s
+        umin            v2.8h,  v2.8h,  v31.8h
+
+        st1             {v0.d}[1], [x0], x1
+        st1             {v2.4h},   [x0], x1
+        st1             {v2.d}[1], [x0], x1
+
+        ret
+endfunc
+.endm
+
+.macro itxfm_funcs4x4 bpp
+itxfm_func4x4 idct,  idct,  \bpp
+itxfm_func4x4 iadst, idct,  \bpp
+itxfm_func4x4 idct,  iadst, \bpp
+itxfm_func4x4 iadst, iadst, \bpp
+itxfm_func4x4 iwht,  iwht,  \bpp
+.endm
+
+itxfm_funcs4x4 10
+itxfm_funcs4x4 12
+
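+// Fast path for eob == 1: only the DC coefficient is set, so the 2-D
+// transform reduces to scaling that single value by cos(pi/4) twice (once
+// per dimension) and adding the rounded result to every output pixel.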
+function idct8x8_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+
+        movi            v1.4h,  #0
+        sxtl            v0.4s,  v0.4h
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2]
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v2.4s,  v2.4s,  #5
+
+        mov             x4,  #8
+        mov             x3,  x0
+        dup             v31.8h, w5
+1:
+        // Loop to add the constant from v2 into all 8x8 outputs
+        subs            x4,  x4,  #2
+        ld1             {v3.8h},  [x0], x1
+        ld1             {v4.8h},  [x0], x1
+        uaddw           v16.4s, v2.4s,  v3.4h
+        uaddw2          v17.4s, v2.4s,  v3.8h
+        uaddw           v18.4s, v2.4s,  v4.4h
+        uaddw2          v19.4s, v2.4s,  v4.8h
+        sqxtun          v3.4h,  v16.4s
+        sqxtun2         v3.8h,  v17.4s
+        sqxtun          v4.4h,  v18.4s
+        sqxtun2         v4.8h,  v19.4s
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h},  [x3], x1
+        st1             {v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
+        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
+        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
+        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
+
+        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
+        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
+        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
+        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
+
+        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
+
+        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
+        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
+        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
+        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
+.endm
+
+.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
+        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
+        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
+
+        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
+        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
+
+        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
+        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
+
+        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
+        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
+
+        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
+        neg             \r7\().4s, \r7\().4s // r7 = out[7]
+        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
+
+        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
+        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
+
+        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
+
+        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
+        neg             \r3\().4s, \r3\().4s  // r3 = out[3]
+
+        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
+        neg             \r1\().4s, \r1\().4s  // r1 = out[1]
+
+        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
+        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct8x8_dc_add_neon
+.endif
+        // The iadst also uses a few coefficients from
+        // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+        movrel          x4,  idct_coeffs
+.else
+        movrel          x4,  iadst8_coeffs
+        ld1             {v1.8h}, [x4], #16
+        stp             d8,  d9,  [sp, #-0x10]!
+        sxtl2           v3.4s,  v1.8h
+        sxtl            v2.4s,  v1.4h
+.endif
+        ld1             {v0.8h}, [x4]
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+
+        movi            v4.4s, #0
+        movi            v5.4s, #0
+        movi            v6.4s, #0
+        movi            v7.4s, #0
+
+1:
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
+        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
+        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
+        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
+        sub             x2,  x2,  #256
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
+        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
+.else
+        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
+        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
+.endif
+
+        // Transpose 8x8 with 32 bit elements
+        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
+        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
+.else
+        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
+        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
+.endif
+2:
+        mov             x3,  x0
+        // Add into the destination
+        ld1             {v0.8h},  [x0], x1
+        srshr           v16.4s, v16.4s, #5
+        srshr           v17.4s, v17.4s, #5
+        ld1             {v1.8h},  [x0], x1
+        srshr           v18.4s, v18.4s, #5
+        srshr           v19.4s, v19.4s, #5
+        ld1             {v2.8h},  [x0], x1
+        srshr           v20.4s, v20.4s, #5
+        srshr           v21.4s, v21.4s, #5
+        uaddw           v16.4s, v16.4s, v0.4h
+        uaddw2          v17.4s, v17.4s, v0.8h
+        ld1             {v3.8h},  [x0], x1
+        srshr           v22.4s, v22.4s, #5
+        srshr           v23.4s, v23.4s, #5
+        uaddw           v18.4s, v18.4s, v1.4h
+        uaddw2          v19.4s, v19.4s, v1.8h
+        ld1             {v4.8h},  [x0], x1
+        srshr           v24.4s, v24.4s, #5
+        srshr           v25.4s, v25.4s, #5
+        uaddw           v20.4s, v20.4s, v2.4h
+        uaddw2          v21.4s, v21.4s, v2.8h
+        sqxtun          v0.4h,  v16.4s
+        sqxtun2         v0.8h,  v17.4s
+        dup             v16.8h, w5
+        ld1             {v5.8h},  [x0], x1
+        srshr           v26.4s, v26.4s, #5
+        srshr           v27.4s, v27.4s, #5
+        uaddw           v22.4s, v22.4s, v3.4h
+        uaddw2          v23.4s, v23.4s, v3.8h
+        sqxtun          v1.4h,  v18.4s
+        sqxtun2         v1.8h,  v19.4s
+        umin            v0.8h,  v0.8h,  v16.8h
+        ld1             {v6.8h},  [x0], x1
+        srshr           v28.4s, v28.4s, #5
+        srshr           v29.4s, v29.4s, #5
+        uaddw           v24.4s, v24.4s, v4.4h
+        uaddw2          v25.4s, v25.4s, v4.8h
+        sqxtun          v2.4h,  v20.4s
+        sqxtun2         v2.8h,  v21.4s
+        umin            v1.8h,  v1.8h,  v16.8h
+        ld1             {v7.8h},  [x0], x1
+        srshr           v30.4s, v30.4s, #5
+        srshr           v31.4s, v31.4s, #5
+        uaddw           v26.4s, v26.4s, v5.4h
+        uaddw2          v27.4s, v27.4s, v5.8h
+        sqxtun          v3.4h,  v22.4s
+        sqxtun2         v3.8h,  v23.4s
+        umin            v2.8h,  v2.8h,  v16.8h
+
+        st1             {v0.8h},  [x3], x1
+        uaddw           v28.4s, v28.4s, v6.4h
+        uaddw2          v29.4s, v29.4s, v6.8h
+        st1             {v1.8h},  [x3], x1
+        sqxtun          v4.4h,  v24.4s
+        sqxtun2         v4.8h,  v25.4s
+        umin            v3.8h,  v3.8h,  v16.8h
+        st1             {v2.8h},  [x3], x1
+        uaddw           v30.4s, v30.4s, v7.4h
+        uaddw2          v31.4s, v31.4s, v7.8h
+        st1             {v3.8h},  [x3], x1
+        sqxtun          v5.4h,  v26.4s
+        sqxtun2         v5.8h,  v27.4s
+        umin            v4.8h,  v4.8h,  v16.8h
+        st1             {v4.8h},  [x3], x1
+        sqxtun          v6.4h,  v28.4s
+        sqxtun2         v6.8h,  v29.4s
+        umin            v5.8h,  v5.8h,  v16.8h
+        st1             {v5.8h},  [x3], x1
+        sqxtun          v7.4h,  v30.4s
+        sqxtun2         v7.8h,  v31.4s
+        umin            v6.8h,  v6.8h,  v16.8h
+
+        st1             {v6.8h},  [x3], x1
+        umin            v7.8h,  v7.8h,  v16.8h
+        st1             {v7.8h},  [x3], x1
+
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d8,  d9,  [sp], 0x10
+.endif
+        ret
+endfunc
+
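+// The exported 10 and 12 bpp entry points differ only in the pixel
+// maximum, which they pass to the shared bpp-agnostic core in x5.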
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
+        mov             x5,  #0x03ff
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
+        mov             x5,  #0x0fff
+        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+
+function idct16x16_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+
+        movi            v1.4h,  #0
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2]
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v0.4s,  v2.4s,  #6
+
+        mov             x3, x0
+        mov             x4, #16
+        dup             v31.8h, w13
+1:
+        // Loop to add the constant from v0 into all 16x16 outputs
+        subs            x4,  x4,  #2
+        ld1             {v1.8h,v2.8h},  [x0], x1
+        uaddw           v16.4s, v0.4s,  v1.4h
+        uaddw2          v17.4s, v0.4s,  v1.8h
+        ld1             {v3.8h,v4.8h},  [x0], x1
+        uaddw           v18.4s, v0.4s,  v2.4h
+        uaddw2          v19.4s, v0.4s,  v2.8h
+        uaddw           v20.4s, v0.4s,  v3.4h
+        uaddw2          v21.4s, v0.4s,  v3.8h
+        uaddw           v22.4s, v0.4s,  v4.4h
+        uaddw2          v23.4s, v0.4s,  v4.8h
+        sqxtun          v1.4h,  v16.4s
+        sqxtun2         v1.8h,  v17.4s
+        sqxtun          v2.4h,  v18.4s
+        sqxtun2         v2.8h,  v19.4s
+        sqxtun          v3.4h,  v20.4s
+        sqxtun2         v3.8h,  v21.4s
+        sqxtun          v4.4h,  v22.4s
+        sqxtun2         v4.8h,  v23.4s
+        umin            v1.8h,  v1.8h,  v31.8h
+        umin            v2.8h,  v2.8h,  v31.8h
+        st1             {v1.8h,v2.8h},  [x3], x1
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h,v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct16_end
+        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
+        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
+        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
+        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
+        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
+        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
+        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
+        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
+
+        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
+        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
+
+        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
+        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
+        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
+        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
+        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
+        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
+        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
+        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
+        ret
+.endm
+
+function idct16
+        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
+        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
+        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
+        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
+        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
+        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
+        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
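+// idct16_half and idct16_quarter below are specializations for the case
+// where only the first 8 (respectively 4) input rows can be nonzero; with
+// the upper inputs known to be zero, the first-stage butterflies collapse
+// into single multiplies (dmbutterfly_h1, dmbutterfly_h2, dsmull_h).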
+function idct16_half
+        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
+        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
+        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
+        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
+        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
+        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
+        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
+
+        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter
+        dsmull_h        v24, v25, v19, v3.s[3]
+        dsmull_h        v4,  v5,  v17, v2.s[0]
+        dsmull_h        v7,  v6,  v18, v1.s[1]
+        dsmull_h        v30, v31, v18, v1.s[0]
+        neg             v24.2d,  v24.2d
+        neg             v25.2d,  v25.2d
+        dsmull_h        v29, v28, v17, v2.s[1]
+        dsmull_h        v26, v27, v19, v3.s[2]
+        dsmull_h        v22, v23, v16, v0.s[0]
+        drshrn_h        v24, v24, v25, #14
+        drshrn_h        v16, v4,  v5,  #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v6,  v30, v31, #14
+        drshrn_h        v29, v29, v28, #14
+        drshrn_h        v17, v26, v27, #14
+        drshrn_h        v28, v22, v23, #14
+
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
+        neg             v22.2d,  v22.2d
+        neg             v23.2d,  v23.2d
+        drshrn_h        v27, v20, v21, #14
+        drshrn_h        v21, v22, v23, #14
+        drshrn_h        v23, v18, v19, #14
+        drshrn_h        v25, v30, v31, #14
+        mov             v4.16b,  v28.16b
+        mov             v5.16b,  v28.16b
+        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
+        mov             v20.16b, v28.16b
+        idct16_end
+endfunc
+
+function iadst16
+        ld1             {v0.8h,v1.8h}, [x11]
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
+        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
+        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
+        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
+        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
+        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
+        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
+        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
+
+        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
+        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
+        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
+        ld1             {v0.8h}, [x10]
+        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
+        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
+
+        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
+        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
+        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
+        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
+        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
+
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
+        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
+        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
+        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
+
+        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
+        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
+
+        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
+        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
+        neg             v29.4s, v29.4s                   // v29 = out[13]
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
+        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
+
+        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
+        butterfly_4s    v3,  v7,  v23, v21               // v3 = -out[1], v7 = t10
+
+        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
+        neg             v19.4s, v19.4s                   // v19 = out[3]
+        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
+
+        butterfly_4s    v5,  v8,  v20, v22               // v5 = -out[15], v8 = t3a
+        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],  v9 = t11
+
+        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
+        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
+
+        neg             v31.4s,  v5.4s                    // v31 = out[15]
+        neg             v17.4s,  v3.4s                    // v17 = out[1]
+
+        mov             v16.16b, v2.16b
+        mov             v30.16b, v4.16b
+        ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below.
+.macro load i, src, inc
+        ld1             {v\i\().4s},  [\src], \inc
+.endm
+.macro store i, dst, inc
+        st1             {v\i\().4s},  [\dst], \inc
+.endm
+.macro movi_v i, size, imm
+        movi            v\i\()\size,  \imm
+.endm
+.macro load_clear i, src, inc
+        ld1             {v\i\().4s}, [\src]
+        st1             {v4.4s},  [\src], \inc
+.endm
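+// For example,
+//   .irp i, 16, 17, 18, 19
+//           load            \i,  x2,  x9
+//   .endr
+// loads v16-v19 from [x2], advancing x2 by the stride in x9 each time.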
+
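+// Add the rounded coefficients in \coef0-\coef7 to eight rows of four
+// destination pixels (four rows each through x0 and x3): shift right with
+// rounding by 6, widen-add the 16-bit pixels, saturate back to 16 bit,
+// clamp against the bit-depth maximum kept in v8 and store. Loads,
+// arithmetic and stores are interleaved to help scheduling.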
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
+        srshr           \coef0, \coef0, #6
+        ld1             {v4.4h},   [x0], x1
+        srshr           \coef1, \coef1, #6
+        ld1             {v4.d}[1], [x3], x1
+        srshr           \coef2, \coef2, #6
+        ld1             {v5.4h},   [x0], x1
+        srshr           \coef3, \coef3, #6
+        uaddw           \coef0, \coef0, v4.4h
+        ld1             {v5.d}[1], [x3], x1
+        srshr           \coef4, \coef4, #6
+        uaddw2          \coef1, \coef1, v4.8h
+        ld1             {v6.4h},   [x0], x1
+        srshr           \coef5, \coef5, #6
+        uaddw           \coef2, \coef2, v5.4h
+        ld1             {v6.d}[1], [x3], x1
+        sqxtun          v4.4h,  \coef0
+        srshr           \coef6, \coef6, #6
+        uaddw2          \coef3, \coef3, v5.8h
+        ld1             {v7.4h},   [x0], x1
+        sqxtun2         v4.8h,  \coef1
+        srshr           \coef7, \coef7, #6
+        uaddw           \coef4, \coef4, v6.4h
+        ld1             {v7.d}[1], [x3], x1
+        umin            v4.8h,  v4.8h,  v8.8h
+        sub             x0,  x0,  x1, lsl #2
+        sub             x3,  x3,  x1, lsl #2
+        sqxtun          v5.4h,  \coef2
+        uaddw2          \coef5, \coef5, v6.8h
+        st1             {v4.4h},   [x0], x1
+        sqxtun2         v5.8h,  \coef3
+        uaddw           \coef6, \coef6, v7.4h
+        st1             {v4.d}[1], [x3], x1
+        umin            v5.8h,  v5.8h,  v8.8h
+        sqxtun          v6.4h,  \coef4
+        uaddw2          \coef7, \coef7, v7.8h
+        st1             {v5.4h},   [x0], x1
+        sqxtun2         v6.8h,  \coef5
+        st1             {v5.d}[1], [x3], x1
+        umin            v6.8h,  v6.8h,  v8.8h
+        sqxtun          v7.4h,  \coef6
+        st1             {v6.4h},   [x0], x1
+        sqxtun2         v7.8h,  \coef7
+        st1             {v6.d}[1], [x3], x1
+        umin            v7.8h,  v7.8h,  v8.8h
+        st1             {v7.4h},   [x0], x1
+        st1             {v7.d}[1], [x3], x1
+.endm
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x4 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
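+//
+// Together with the pass 2 function below, this implements the separable
+// 2-D inverse transform: pass 1 runs the 1-D transform down the columns,
+// storing the results transposed into a temp buffer, and pass 2 runs a
+// 1-D transform over what are then the rows, adding into the destination.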
+.macro itxfm16_1d_funcs txfm
+function \txfm\()16_1d_4x16_pass1_neon
+        mov             x14, x30
+
+        movi            v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              \txfm\()16
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        cmp             x1,  #12
+        b.eq            1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        store           \i,  x0,  #16
+.endr
+        br              x14
+1:
+        // Special case: For the last input column (x1 == 12),
+        // which would be stored as the last row in the temp buffer,
+        // don't store the first 4x4 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // last 4x4 block).
+        add             x0,  x0,  #16
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+
+        mov             v28.16b, v16.16b
+        mov             v29.16b, v17.16b
+        mov             v30.16b, v18.16b
+        mov             v31.16b, v19.16b
+        br              x14
+endfunc
+
+// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 4x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_4x16_pass2_neon
+        mov             x14, x30
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f
+.irp i, 28, 29, 30, 31
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              \txfm\()16
+
+        dup             v8.8h, w13
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+// This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+        .short  0, 10, 38, 89
+endconst
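+// If the eob (w3) is at or below the threshold for a given 4-column
+// slice, that slice and all later ones hold only zero coefficients, so
+// the first pass is skipped for them and the corresponding temp buffer
+// rows are zeroed instead (the 1:/2: labels below).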
+
+.macro itxfm_func16x16 txfm1, txfm2
+function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct16x16_dc_add_neon
+.endif
+        mov             x15, x30
+        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
+.ifnc \txfm1\()_\txfm2,idct_idct
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+.endif
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        sub             sp,  sp,  #1024
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        movrel          x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+        movrel          x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+        ld1             {v0.8h,v1.8h}, [x10]
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+        mov             x9,  #64
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #10
+        b.le            idct16x16_quarter_add_16_neon
+        cmp             w3,  #38
+        b.le            idct16x16_half_add_16_neon
+
+        movrel          x12, min_eob_idct_idct_16, 2
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  sp,  #(\i*64)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+        ldrh            w1,  [x12], #2
+        cmp             w3,  w1
+        mov             x1,  #(16 - \i)/4
+        b.le            1f
+.endif
+.endif
+        mov             x1,  #\i
+        add             x2,  x6,  #(\i*4)
+        bl              \txfm1\()16_1d_4x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+        ld1             {v0.8h,v1.8h}, [x10]
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        // Set v28-v31 to zero, for the in-register passthrough of
+        // coefficients to pass 2.
+        movi            v28.4s,  #0
+        movi            v29.4s,  #0
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
+.endr
+        b.ne            2b
+3:
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        mov             x3,  #\i
+        bl              \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+        add             sp,  sp,  #1024
+        ldp             d8,  d9,  [sp], 0x10
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+.endif
+        br              x15
+endfunc
+
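+// The 10 and 12 bit entry points differ only in the pixel clamp value
+// passed in x13 (0x03ff = 1023 and 0x0fff = 4095).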
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
+        mov             x13, #0x03ff
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
+        mov             x13, #0x0fff
+        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+
+function idct16_1d_4x16_pass1_quarter_neon
+        mov             x14, x30
+
+        movi            v4.4s, #0
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_quarter
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        // The first 4x4 block is kept in registers for the second pass;
+        // the rest are stored in the temp buffer.
+        add             x0,  x0,  #16
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+        mov             x14, x30
+
+        // Only load the top 4 lines, and only do it for the later slices.
+        // For the first slice, v16-v19 are kept in registers from the first pass.
+        cbz             x3,  1f
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_quarter
+
+        dup             v8.8h, w13
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+        mov             x14, x30
+
+        movi            v4.4s, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_half
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the transposed 4x4 blocks horizontally.
+        cmp             x1,  #4
+        b.eq            1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        store           \i,  x0,  #16
+.endr
+        br              x14
+1:
+        // Special case: For the second input column (x1 == 4),
+        // which would be stored as the second row in the temp buffer,
+        // don't store the first 4x4 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // second 4x4 block).
+        add             x0,  x0,  #16
+        st1             {v20.4s},  [x0], #16
+        st1             {v24.4s},  [x0], #16
+        st1             {v28.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v21.4s},  [x0], #16
+        st1             {v25.4s},  [x0], #16
+        st1             {v29.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v22.4s},  [x0], #16
+        st1             {v26.4s},  [x0], #16
+        st1             {v30.4s},  [x0], #16
+        add             x0,  x0,  #16
+        st1             {v23.4s},  [x0], #16
+        st1             {v27.4s},  [x0], #16
+        st1             {v31.4s},  [x0], #16
+
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v17.16b
+        mov             v22.16b, v18.16b
+        mov             v23.16b, v19.16b
+        br              x14
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+        mov             x14, x30
+
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f
+.irp i, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_half
+
+        dup             v8.8h, w13
+        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+
+        br              x14
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_16_neon
+        add             x0,  sp,  #(0*64)
+        mov             x1,  #0
+        add             x2,  x6,  #(0*4)
+        bl              idct16_1d_4x16_pass1_\size\()_neon
+.ifc \size,half
+        add             x0,  sp,  #(4*64)
+        mov             x1,  #4
+        add             x2,  x6,  #(4*4)
+        bl              idct16_1d_4x16_pass1_\size\()_neon
+.endif
+
+.irp i, 0, 4, 8, 12
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        mov             x3,  #\i
+        bl              idct16_1d_4x16_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #1024
+        ldp             d8,  d9,  [sp], 0x10
+        br              x15
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
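+// Special case for eob == 1, where only the DC coefficient is nonzero
+// and all 32x32 residuals are therefore equal. Roughly:
+//   dc  = round2(round2(in[0] * 11585, 14) * 11585, 14);
+//   res = round2(dc, 6);
+// with res then added to every pixel and clamped to the bit-depth
+// maximum in w13, where round2(x, n) = (x + (1 << (n - 1))) >> n.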
+function idct32x32_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+        sxtl            v0.4s,  v0.4h
+
+        movi            v1.4h,  #0
+
+        ld1             {v2.s}[0],  [x2]
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        smull           v2.2d,  v2.2s,  v0.s[0]
+        rshrn           v2.2s,  v2.2d,  #14
+        st1             {v1.s}[0],  [x2]
+        dup             v2.4s,  v2.s[0]
+
+        srshr           v0.4s,  v2.4s,  #6
+
+        mov             x3,  x0
+        mov             x4,  #32
+        sub             x1,  x1,  #32
+        dup             v31.8h, w13
+1:
+        // Loop to add the constant v0 into all 32x32 outputs
+        subs            x4,  x4,  #1
+        ld1             {v1.8h,v2.8h},  [x0], #32
+        uaddw           v16.4s, v0.4s,  v1.4h
+        uaddw2          v17.4s, v0.4s,  v1.8h
+        ld1             {v3.8h,v4.8h},  [x0], x1
+        uaddw           v18.4s, v0.4s,  v2.4h
+        uaddw2          v19.4s, v0.4s,  v2.8h
+        uaddw           v20.4s, v0.4s,  v3.4h
+        uaddw2          v21.4s, v0.4s,  v3.8h
+        uaddw           v22.4s, v0.4s,  v4.4h
+        uaddw2          v23.4s, v0.4s,  v4.8h
+        sqxtun          v1.4h,  v16.4s
+        sqxtun2         v1.8h,  v17.4s
+        sqxtun          v2.4h,  v18.4s
+        sqxtun2         v2.8h,  v19.4s
+        sqxtun          v3.4h,  v20.4s
+        sqxtun2         v3.8h,  v21.4s
+        sqxtun          v4.4h,  v22.4s
+        sqxtun2         v4.8h,  v23.4s
+        umin            v1.8h,  v1.8h,  v31.8h
+        umin            v2.8h,  v2.8h,  v31.8h
+        st1             {v1.8h,v2.8h},  [x3], #32
+        umin            v3.8h,  v3.8h,  v31.8h
+        umin            v4.8h,  v4.8h,  v31.8h
+        st1             {v3.8h,v4.8h},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct32_end
+        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
+        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
+        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
+        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
+        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
+        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
+        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
+        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
+
+        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
+        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
+        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
+        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
+        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
+        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
+        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
+        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
+        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
+
+        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
+        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
+        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
+        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
+        ret
+.endm
+
+function idct32_odd
+        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_half
+        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_quarter
+        dsmull_h        v4,  v5,  v16, v10.s[0]
+        dsmull_h        v28, v29, v19, v11.s[3]
+        dsmull_h        v30, v31, v16, v10.s[1]
+        dsmull_h        v22, v23, v17, v13.s[2]
+        dsmull_h        v7,  v6,  v17, v13.s[3]
+        dsmull_h        v26, v27, v19, v11.s[2]
+        dsmull_h        v20, v21, v18, v12.s[0]
+        dsmull_h        v24, v25, v18, v12.s[1]
+
+        neg             v28.2d, v28.2d
+        neg             v29.2d, v29.2d
+        neg             v7.2d,  v7.2d
+        neg             v6.2d,  v6.2d
+
+        drshrn_h        v4,  v4,  v5,  #14
+        drshrn_h        v5,  v28, v29, #14
+        drshrn_h        v29, v30, v31, #14
+        drshrn_h        v28, v22, v23, #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v31, v26, v27, #14
+        drshrn_h        v6,  v20, v21, #14
+        drshrn_h        v30, v24, v25, #14
+
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
+        drshrn_h        v23, v16, v17, #14
+        drshrn_h        v24, v18, v19, #14
+        neg             v20.2d, v20.2d
+        neg             v21.2d, v21.2d
+        drshrn_h        v27, v27, v26, #14
+        drshrn_h        v20, v20, v21, #14
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
+        drshrn_h        v21, v16, v17, #14
+        drshrn_h        v26, v18, v19, #14
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
+        drshrn_h        v25, v16, v17, #14
+        neg             v18.2d, v18.2d
+        neg             v19.2d, v19.2d
+        drshrn_h        v22, v18, v19, #14
+
+        idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs;
+// a normal IDCT16 with every other input component (the even ones, with
+// each output written twice), followed by a separate 16-point IDCT
+// of the odd inputs, added/subtracted onto the outputs of the first idct16.
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
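+//
+// In C-like pseudocode, the decomposition is roughly:
+//   idct16(in[0], in[2], ..., in[30])     -> e[0..15]  // even inputs
+//   idct16_odd(in[1], in[3], ..., in[31]) -> o[0..15]  // odd inputs
+//   for (i = 0; i < 16; i++) {
+//       out[i]      = e[i] + o[i];
+//       out[31 - i] = e[i] - o[i];
+//   }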
+function idct32_1d_4x32_pass1\suffix\()_neon
+        mov             x14, x30
+
+        movi            v4.4s,  #0
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct16\suffix
+
+        // Do four 4x4 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
+        // contain the four transposed 4x4 blocks.
+        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
+        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
+        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
+        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
+
+        // Store the registers a, b, c, d horizontally, followed by the
+        // same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+        // There's no rev128 instruction, but we reverse each 64-bit
+        // half, and then flip them using an ext with 8 bytes offset.
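+        // E.g. lanes {0,1,2,3} become {1,0,3,2} after the rev64 and
+        // {3,2,1,0} after the ext.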
+        rev64           v7.4s, \d
+        st1             {\a},  [x0], #16
+        ext             v7.16b, v7.16b, v7.16b, #8
+        st1             {\b},  [x0], #16
+        rev64           v6.4s, \c
+        st1             {\c},  [x0], #16
+        ext             v6.16b, v6.16b, v6.16b, #8
+        st1             {\d},  [x0], #16
+        rev64           v5.4s, \b
+        st1             {v7.4s},  [x0], #16
+        ext             v5.16b, v5.16b, v5.16b, #8
+        st1             {v6.4s},  [x0], #16
+        rev64           v4.4s, \a
+        st1             {v5.4s},  [x0], #16
+        ext             v4.16b, v4.16b, v4.16b, #8
+        st1             {v4.4s},  [x0], #16
+.endm
+        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
+        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
+        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
+        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
+        sub             x0,  x0,  #512
+.purgem store_rev
+
+        // Move x2 back to the start of the input, and move
+        // to the first odd row
+.ifb \suffix
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        add             x2,  x2,  #128
+
+        movi            v4.4s,  #0
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct32_odd\suffix
+
+        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
+        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
+        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
+        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
+
+        // Store the registers a, b, c, d horizontally,
+        // adding into the output first, and then the mirrored
+        // values, subtracted from the output.
+.macro store_rev a, b, c, d, a16b, b16b
+        ld1             {v4.4s},  [x0]
+        rev64           v9.4s, \d
+        add             v4.4s, v4.4s, \a
+        st1             {v4.4s},  [x0], #16
+        rev64           v8.4s, \c
+        ld1             {v4.4s},  [x0]
+        ext             v9.16b, v9.16b, v9.16b, #8
+        add             v4.4s, v4.4s, \b
+        st1             {v4.4s},  [x0], #16
+        ext             v8.16b, v8.16b, v8.16b, #8
+        ld1             {v4.4s},  [x0]
+        rev64           \b, \b
+        add             v4.4s, v4.4s, \c
+        st1             {v4.4s},  [x0], #16
+        rev64           \a, \a
+        ld1             {v4.4s},  [x0]
+        ext             \b16b, \b16b, \b16b, #8
+        add             v4.4s, v4.4s, \d
+        st1             {v4.4s},  [x0], #16
+        ext             \a16b, \a16b, \a16b, #8
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, v9.4s
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, v8.4s
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, \b
+        st1             {v4.4s},  [x0], #16
+        ld1             {v4.4s},  [x0]
+        sub             v4.4s, v4.4s, \a
+        st1             {v4.4s},  [x0], #16
+.endm
+
+        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
+        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
+        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
+        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
+.purgem store_rev
+        br              x14
+endfunc
+
+// This is mostly the same as 4x32_pass1, but without the transpose;
+// it uses the source as a temp buffer between the two idct passes,
+// and adds into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_4x32_pass2\suffix\()_neon
+        mov             x14, x30
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+
+        bl              idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        store           \i, x2, x9
+.endr
+
+        sub             x2,  x2,  x9, lsl #4
+        add             x2,  x2,  #128
+
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        sub             x2,  x2,  #128
+
+        bl              idct32_odd\suffix
+
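+// Accumulate the odd-IDCT outputs with the even results stored in the
+// temp buffer: added for the first half, and subtracted in mirrored
+// order for the second half via the negative stride in x7. Then round,
+// add the destination pixels, clamp to the bit-depth maximum in v15
+// and store.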
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+        ld1             {v4.4s},  [x2], x9
+        ld1             {v5.4s},  [x2], x9
+        add             v4.4s, v4.4s, \a
+        ld1             {v6.4s},  [x2], x9
+        add             v5.4s, v5.4s, \b
+        ld1             {v7.4s},  [x2], x9
+        add             v6.4s, v6.4s, \c
+        add             v7.4s, v7.4s, \d
+.else
+        ld1             {v4.4s},  [x2], x7
+        ld1             {v5.4s},  [x2], x7
+        sub             v4.4s, v4.4s, \a
+        ld1             {v6.4s},  [x2], x7
+        sub             v5.4s, v5.4s, \b
+        ld1             {v7.4s},  [x2], x7
+        sub             v6.4s, v6.4s, \c
+        sub             v7.4s, v7.4s, \d
+.endif
+        ld1             {v8.4h},   [x0], x1
+        ld1             {v8.d}[1], [x0], x1
+        srshr           v4.4s, v4.4s, #6
+        ld1             {v9.4h},   [x0], x1
+        srshr           v5.4s, v5.4s, #6
+        uaddw           v4.4s, v4.4s, v8.4h
+        ld1             {v9.d}[1], [x0], x1
+        srshr           v6.4s, v6.4s, #6
+        uaddw2          v5.4s, v5.4s, v8.8h
+        srshr           v7.4s, v7.4s, #6
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v6.4s, v6.4s, v9.4h
+        sqxtun          v4.4h, v4.4s
+        uaddw2          v7.4s, v7.4s, v9.8h
+        sqxtun2         v4.8h, v5.4s
+        umin            v4.8h, v4.8h, v15.8h
+        st1             {v4.4h},   [x0], x1
+        sqxtun          v5.4h, v6.4s
+        st1             {v4.d}[1], [x0], x1
+        sqxtun2         v5.8h, v7.4s
+        umin            v5.8h, v5.8h, v15.8h
+        st1             {v5.4h},   [x0], x1
+        st1             {v5.d}[1], [x0], x1
+.endm
+        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
+        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
+        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
+        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
+        sub             x2,  x2,  x9
+        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
+        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
+        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
+        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
+.purgem load_acc_store
+        br              x14
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
+const min_eob_idct_idct_32, align=4
+        .short  0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
+function vp9_idct_idct_32x32_add_16_neon
+        cmp             w3,  #1
+        b.eq            idct32x32_dc_add_neon
+
+        movrel          x10, idct_coeffs
+
+        mov             x15, x30
+        stp             d8,  d9,  [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d14, d15, [sp, #-0x10]!
+
+        sub             sp,  sp,  #4096
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #256
+        neg             x7,  x9
+
+        ld1             {v0.8h,v1.8h},   [x10], #32
+        sxtl            v2.4s,  v1.4h
+        sxtl2           v3.4s,  v1.8h
+        sxtl2           v1.4s,  v0.8h
+        sxtl            v0.4s,  v0.4h
+        ld1             {v10.8h,v11.8h}, [x10]
+        sxtl            v12.4s, v11.4h
+        sxtl2           v13.4s, v11.8h
+        sxtl2           v11.4s, v10.8h
+        sxtl            v10.4s, v10.4h
+
+        dup             v15.8h, w13
+
+        cmp             w3,  #34
+        b.le            idct32x32_quarter_add_16_neon
+        cmp             w3,  #135
+        b.le            idct32x32_half_add_16_neon
+
+        movrel          x12, min_eob_idct_idct_32, 2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  sp,  #(\i*128)
+.if \i > 0
+        ldrh            w1,  [x12], #2
+        cmp             w3,  w1
+        mov             x1,  #(32 - \i)/4
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*4)
+        bl              idct32_1d_4x32_pass1_neon
+.endr
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.4s,  #0
+        movi            v17.4s,  #0
+        movi            v18.4s,  #0
+        movi            v19.4s,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+.endr
+        b.ne            2b
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        bl              idct32_1d_4x32_pass2_neon
+.endr
+
+        add             sp,  sp,  #4096
+        ldp             d14, d15, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d8,  d9,  [sp], 0x10
+
+        br              x15
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_10_neon, export=1
+        mov             x13, #0x03ff
+        b               vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+function ff_vp9_idct_idct_32x32_add_12_neon, export=1
+        mov             x13, #0x0fff
+        b               vp9_idct_idct_32x32_add_16_neon
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_16_neon
+.irp i, 0, 4
+        add             x0,  sp,  #(\i*128)
+.ifc \size,quarter
+.if \i == 4
+        cmp             w3,  #9
+        b.le            1f
+.endif
+.endif
+        add             x2,  x6,  #(\i*4)
+        bl              idct32_1d_4x32_pass1_\size\()_neon
+.endr
+
+.ifc \size,half
+.irp i, 8, 12
+        add             x0,  sp,  #(\i*128)
+.if \i == 12
+        cmp             w3,  #70
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*4)
+        bl              idct32_1d_4x32_pass1_\size\()_neon
+.endr
+.endif
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.4s,  #0
+        movi            v17.4s,  #0
+        movi            v18.4s,  #0
+        movi            v19.4s,  #0
+
+.rept 4
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+        st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
+.endr
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x0,  x4,  #(\i*2)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*4)
+        bl              idct32_1d_4x32_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #4096
+        ldp             d14, d15, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d8,  d9,  [sp], 0x10
+
+        br              x15
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9itxfm_neon.S
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+        .short  11585, 0, 6270, 15137
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+idct_coeffs:
+        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
+        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
+endconst
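+// These tables hold the VP9 transform constants in Q14 fixed point,
+// approximately round(16384 * cos(k*pi/64)) for the DCT (the ADST
+// tables use related sine/cosine constants); e.g. 11585 ~ 16384 *
+// cos(pi/4).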
+
+// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
+// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
+// in/out are .8h registers; this can make do with 4 temp registers, but is
+// more efficient if 6 temp registers are available.
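+// With v0.h[0] holding 11585 ~ 16384/sqrt(2), this effectively computes
+// (in1 +/- in2) / sqrt(2) with rounding.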
+.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
+.if \neg > 0
+        neg             \tmp4\().4h, v0.4h
+.endif
+        add             \tmp1\().8h, \in1\().8h,  \in2\().8h
+        sub             \tmp2\().8h, \in1\().8h,  \in2\().8h
+.if \neg > 0
+        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
+        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
+.else
+        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
+        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
+.endif
+.ifb \tmp5
+        rshrn           \out1\().4h, \tmp3\().4s, #14
+        rshrn2          \out1\().8h, \tmp4\().4s, #14
+        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
+        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
+        rshrn           \out2\().4h, \tmp3\().4s, #14
+        rshrn2          \out2\().8h, \tmp4\().4s, #14
+.else
+        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
+        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
+        rshrn           \out1\().4h, \tmp3\().4s, #14
+        rshrn2          \out1\().8h, \tmp4\().4s, #14
+        rshrn           \out2\().4h, \tmp5\().4s, #14
+        rshrn2          \out2\().8h, \tmp6\().4s, #14
+.endif
+.endm
+
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
+        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
+        rshrn           \out1\().4h,  \tmp1\().4s, #14
+        rshrn2          \out1\().8h,  \tmp2\().4s, #14
+        rshrn           \out2\().4h,  \tmp1\().4s, #14
+        rshrn2          \out2\().8h,  \tmp2\().4s, #14
+.endm
+
+// out1,out2 = in1 * coef1 - in2 * coef2
+// out3,out4 = in1 * coef2 + in2 * coef1
+// out are 4 x .4s registers, in are 2 x .8h registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
+        smull           \out1\().4s, \in1\().4h, \coef1
+        smull2          \out2\().4s, \in1\().8h, \coef1
+        smull           \out3\().4s, \in1\().4h, \coef2
+        smull2          \out4\().4s, \in1\().8h, \coef2
+        smlsl           \out1\().4s, \in2\().4h, \coef2
+        smlsl2          \out2\().4s, \in2\().8h, \coef2
+        smlal           \out3\().4s, \in2\().4h, \coef1
+        smlal2          \out4\().4s, \in2\().8h, \coef1
+.endm
+
+// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+// inout are 2 x .8h registers
+.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
+.if \neg > 0
+        neg             \tmp3\().4s, \tmp3\().4s
+        neg             \tmp4\().4s, \tmp4\().4s
+.endif
+        rshrn           \inout1\().4h, \tmp1\().4s,  #14
+        rshrn2          \inout1\().8h, \tmp2\().4s,  #14
+        rshrn           \inout2\().4h, \tmp3\().4s,  #14
+        rshrn2          \inout2\().8h, \tmp4\().4s,  #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout1\().4h, \coef1
+        smull2          \tmp2\().4s, \inout1\().8h, \coef1
+        smull           \tmp3\().4s, \inout1\().4h, \coef2
+        smull2          \tmp4\().4s, \inout1\().8h, \coef2
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout2\().4h, \coef2
+        smull2          \tmp2\().4s, \inout2\().8h, \coef2
+        smull           \tmp3\().4s, \inout2\().4h, \coef1
+        smull2          \tmp4\().4s, \inout2\().8h, \coef1
+        neg             \tmp1\().4s, \tmp1\().4s
+        neg             \tmp2\().4s, \tmp2\().4s
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().4s, \in\().4h, \coef
+        smull2          \out2\().4s, \in\().8h, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().4h, \in1\().4s, \shift
+        rshrn2          \out\().8h, \in2\().4s, \shift
+.endm
+
+
+// out1 = in1 + in2
+// out2 = in1 - in2
+.macro butterfly_8h out1, out2, in1, in2
+        add             \out1\().8h, \in1\().8h, \in2\().8h
+        sub             \out2\().8h, \in1\().8h, \in2\().8h
+.endm
+
+// out1 = in1 - in2
+// out2 = in1 + in2
+.macro butterfly_8h_r out1, out2, in1, in2
+        sub             \out1\().8h, \in1\().8h, \in2\().8h
+        add             \out2\().8h, \in1\().8h, \in2\().8h
+.endm
+
+// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+// out are 2 x .8h registers, in are 4 x .4s registers
+.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        add             \tmp1\().4s, \in1\().4s, \in3\().4s
+        add             \tmp2\().4s, \in2\().4s, \in4\().4s
+        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
+        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
+        rshrn           \out1\().4h, \tmp1\().4s,  #14
+        rshrn2          \out1\().8h, \tmp2\().4s,  #14
+        rshrn           \out2\().4h, \tmp3\().4s,  #14
+        rshrn2          \out2\().8h, \tmp4\().4s,  #14
+.endm
+
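+// 4-point inverse Walsh-Hadamard transform, as used for VP9 lossless
+// blocks; it needs only adds, subtracts and a shift.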
+.macro iwht4 c0, c1, c2, c3
+        add             \c0\().4h, \c0\().4h, \c1\().4h
+        sub             v17.4h,    \c2\().4h, \c3\().4h
+        sub             v16.4h,    \c0\().4h, v17.4h
+        sshr            v16.4h,    v16.4h,    #1
+        sub             \c2\().4h, v16.4h,    \c1\().4h
+        sub             \c1\().4h, v16.4h,    \c3\().4h
+        add             \c3\().4h, v17.4h,    \c2\().4h
+        sub             \c0\().4h, \c0\().4h, \c1\().4h
+.endm
+
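+// 4-point IDCT: the even half scales (c0 +/- c2) by v0.h[0], the odd
+// half rotates (c1, c3) by the v0.h[2]/v0.h[3] coefficient pair, and
+// the outputs are the butterflies of the two halves.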
+.macro idct4 c0, c1, c2, c3
+        smull           v22.4s,    \c1\().4h, v0.h[3]
+        smull           v20.4s,    \c1\().4h, v0.h[2]
+        add             v16.4h,    \c0\().4h, \c2\().4h
+        sub             v17.4h,    \c0\().4h, \c2\().4h
+        smlal           v22.4s,    \c3\().4h, v0.h[2]
+        smull           v18.4s,    v16.4h,    v0.h[0]
+        smull           v19.4s,    v17.4h,    v0.h[0]
+        smlsl           v20.4s,    \c3\().4h, v0.h[3]
+        rshrn           v22.4h,    v22.4s,    #14
+        rshrn           v18.4h,    v18.4s,    #14
+        rshrn           v19.4h,    v19.4s,    #14
+        rshrn           v20.4h,    v20.4s,    #14
+        add             \c0\().4h, v18.4h,    v22.4h
+        sub             \c3\().4h, v18.4h,    v22.4h
+        add             \c1\().4h, v19.4h,    v20.4h
+        sub             \c2\().4h, v19.4h,    v20.4h
+.endm
+
+.macro iadst4 c0, c1, c2, c3
+        smull           v16.4s,    \c0\().4h, v0.h[4]
+        smlal           v16.4s,    \c2\().4h, v0.h[5]
+        smlal           v16.4s,    \c3\().4h, v0.h[6]
+        smull           v17.4s,    \c0\().4h, v0.h[6]
+        smlsl           v17.4s,    \c2\().4h, v0.h[4]
+        sub             \c0\().4h, \c0\().4h, \c2\().4h
+        smlsl           v17.4s,    \c3\().4h, v0.h[5]
+        add             \c0\().4h, \c0\().4h, \c3\().4h
+        smull           v19.4s,    \c1\().4h, v0.h[7]
+        smull           v18.4s,    \c0\().4h, v0.h[7]
+        add             v20.4s,    v16.4s,    v19.4s
+        add             v21.4s,    v17.4s,    v19.4s
+        rshrn           \c0\().4h, v20.4s,    #14
+        add             v16.4s,    v16.4s,    v17.4s
+        rshrn           \c1\().4h, v21.4s,    #14
+        sub             v16.4s,    v16.4s,    v19.4s
+        rshrn           \c2\().4h, v18.4s,    #14
+        rshrn           \c3\().4h, v16.4s,    #14
+.endm
+
+// The public functions in this file have the following signature:
+// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
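+// Under AAPCS64 these arrive as x0 = dst, x1 = stride, x2 = block and
+// w3 = eob; eob == 1 means only the DC coefficient is nonzero, which
+// the idct_idct functions special-case below.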
+
+.macro itxfm_func4x4 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.4h}, [x4]
+.endif
+.ifc \txfm1,iadst
+        movrel          x4,  iadst4_coeffs
+        ld1             {v0.d}[1], [x4]
+.endif
+.else
+        movrel          x4,  itxfm4_coeffs
+        ld1             {v0.8h}, [x4]
+.endif
+
+        movi            v31.8h, #0
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.ne            1f
+        // DC-only for idct/idct
+        ld1             {v2.h}[0], [x2]
+        smull           v2.4s,  v2.4h, v0.h[0]
+        rshrn           v2.4h,  v2.4s, #14
+        smull           v2.4s,  v2.4h, v0.h[0]
+        rshrn           v2.4h,  v2.4s, #14
+        st1             {v31.h}[0], [x2]
+        dup             v4.4h,  v2.h[0]
+        mov             v5.16b, v4.16b
+        mov             v6.16b, v4.16b
+        mov             v7.16b, v4.16b
+        b               2f
+.endif
+
+1:
+        ld1             {v4.4h,v5.4h,v6.4h,v7.4h},  [x2]
+        st1             {v31.8h}, [x2], #16
+
+.ifc \txfm1,iwht
+        sshr            v4.4h,  v4.4h,  #2
+        sshr            v5.4h,  v5.4h,  #2
+        sshr            v6.4h,  v6.4h,  #2
+        sshr            v7.4h,  v7.4h,  #2
+.endif
+
+        \txfm1\()4      v4,  v5,  v6,  v7
+
+        st1             {v31.8h}, [x2], #16
+        // Transpose 4x4 with 16 bit elements
+        transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19
+
+        \txfm2\()4      v4,  v5,  v6,  v7
+2:
+        ld1             {v0.s}[0],   [x0], x1
+        ld1             {v1.s}[0],   [x0], x1
+.ifnc \txfm1,iwht
+        srshr           v4.4h,  v4.4h,  #4
+        srshr           v5.4h,  v5.4h,  #4
+        srshr           v6.4h,  v6.4h,  #4
+        srshr           v7.4h,  v7.4h,  #4
+.endif
+        uaddw           v4.8h,  v4.8h,  v0.8b
+        uaddw           v5.8h,  v5.8h,  v1.8b
+        ld1             {v2.s}[0],   [x0], x1
+        ld1             {v3.s}[0],   [x0], x1
+        sqxtun          v0.8b,  v4.8h
+        sqxtun          v1.8b,  v5.8h
+        sub             x0,  x0,  x1, lsl #2
+
+        uaddw           v6.8h,  v6.8h,  v2.8b
+        uaddw           v7.8h,  v7.8h,  v3.8b
+        st1             {v0.s}[0],  [x0], x1
+        sqxtun          v2.8b,  v6.8h
+        sqxtun          v3.8b,  v7.8h
+
+        st1             {v1.s}[0],  [x0], x1
+        st1             {v2.s}[0],  [x0], x1
+        st1             {v3.s}[0],  [x0], x1
+
+        ret
+endfunc
+.endm
+
+itxfm_func4x4 idct,  idct
+itxfm_func4x4 iadst, idct
+itxfm_func4x4 idct,  iadst
+itxfm_func4x4 iadst, iadst
+itxfm_func4x4 iwht,  iwht
+
+
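+// In-register 8-point IDCT of an 8x8 slice held in v16-v23, using
+// v2-v7 and v24-v31 as scratch registers.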
+.macro idct8
+        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
+        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
+        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
+        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
+
+        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
+        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
+        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
+        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2
+
+        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
+
+        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
+        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
+        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
+        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
+.endm
+
+.macro iadst8
+        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0]   // v24,v25 = t1a, v26,v27 = t0a
+        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2]   // v28,v29 = t3a, v30,v31 = t2a
+        dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4]   // v2,v3   = t5a, v4,v5   = t4a
+        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6]   // v16,v18 = t7a, v21,v23 = t6a
+
+        dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7, v26, v27  // v4  = t0, v5  = t4
+        dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7, v26, v27  // v2  = t1, v3  = t5
+        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7, v26, v27  // v24 = t2, v25 = t6
+        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7, v26, v27  // v30 = t3, v31 = t7
+
+        butterfly_8h    v16, v6,  v4, v24 // v16 = out[0],  v6 = t2
+        butterfly_8h    v23, v7,  v2, v30 // v23 = -out[7], v7 = t3
+        neg             v23.8h,   v23.8h  // v23 = out[7]
+
+        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
+        neg             v19.8h,   v19.8h  // v19 = out[3]
+
+        dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
+        dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a
+
+        dbutterfly_n    v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
+        dbutterfly_n    v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
+        neg             v17.8h,   v17.8h  // v17 = out[1]
+
+        dmbutterfly0    v18, v21, v30, v31, v2,  v3,  v4,  v5,  v6,  v7  // v18 = out[2], v21 = -out[5]
+        neg             v21.8h,   v21.8h  // v21 = out[5]
+.endm
+
+
+.macro itxfm_func8x8 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
+        // The iadst also uses a few coefficients from
+        // idct, so those always need to be loaded.
+.ifc \txfm1\()_\txfm2,idct_idct
+        movrel          x4,  idct_coeffs
+.else
+        movrel          x4,  iadst8_coeffs
+        ld1             {v1.8h}, [x4], #16
+.endif
+        ld1             {v0.8h}, [x4]
+
+        movi            v2.8h, #0
+        movi            v3.8h, #0
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.ne            1f
+        // DC-only for idct/idct
+        ld1             {v2.h}[0],  [x2]
+        smull           v2.4s,  v2.4h, v0.h[0]
+        rshrn           v2.4h,  v2.4s, #14
+        smull           v2.4s,  v2.4h, v0.h[0]
+        rshrn           v2.4h,  v2.4s, #14
+        st1             {v3.h}[0],  [x2]
+        dup             v16.8h,  v2.h[0]
+        mov             v17.16b, v16.16b
+        mov             v18.16b, v16.16b
+        mov             v19.16b, v16.16b
+        mov             v20.16b, v16.16b
+        mov             v21.16b, v16.16b
+        mov             v22.16b, v16.16b
+        mov             v23.16b, v16.16b
+        b               2f
+.endif
+1:
+        ld1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x2], #64
+        ld1             {v20.8h,v21.8h,v22.8h,v23.8h},  [x2], #64
+        sub             x2,  x2,  #128
+        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
+        st1             {v2.8h,v3.8h,v4.8h,v5.8h},      [x2], #64
+
+        \txfm1\()8
+
+        // Transpose 8x8 with 16 bit elements
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+
+        \txfm2\()8
+2:
+        mov             x3,  x0
+        // Add into the destination
+        ld1             {v0.8b},  [x0], x1
+        srshr           v16.8h, v16.8h, #5
+        ld1             {v1.8b},  [x0], x1
+        srshr           v17.8h, v17.8h, #5
+        ld1             {v2.8b},  [x0], x1
+        srshr           v18.8h, v18.8h, #5
+        uaddw           v16.8h, v16.8h, v0.8b
+        ld1             {v3.8b},  [x0], x1
+        srshr           v19.8h, v19.8h, #5
+        uaddw           v17.8h, v17.8h, v1.8b
+        ld1             {v4.8b},  [x0], x1
+        srshr           v20.8h, v20.8h, #5
+        uaddw           v18.8h, v18.8h, v2.8b
+        sqxtun          v0.8b,  v16.8h
+        ld1             {v5.8b},  [x0], x1
+        srshr           v21.8h, v21.8h, #5
+        uaddw           v19.8h, v19.8h, v3.8b
+        sqxtun          v1.8b,  v17.8h
+        ld1             {v6.8b},  [x0], x1
+        srshr           v22.8h, v22.8h, #5
+        uaddw           v20.8h, v20.8h, v4.8b
+        sqxtun          v2.8b,  v18.8h
+        ld1             {v7.8b},  [x0], x1
+        srshr           v23.8h, v23.8h, #5
+        uaddw           v21.8h, v21.8h, v5.8b
+        sqxtun          v3.8b,  v19.8h
+
+        st1             {v0.8b},  [x3], x1
+        uaddw           v22.8h, v22.8h, v6.8b
+        st1             {v1.8b},  [x3], x1
+        sqxtun          v4.8b,  v20.8h
+        st1             {v2.8b},  [x3], x1
+        uaddw           v23.8h, v23.8h, v7.8b
+        st1             {v3.8b},  [x3], x1
+        sqxtun          v5.8b,  v21.8h
+        st1             {v4.8b},  [x3], x1
+        sqxtun          v6.8b,  v22.8h
+        st1             {v5.8b},  [x3], x1
+        sqxtun          v7.8b,  v23.8h
+
+        st1             {v6.8b},  [x3], x1
+        st1             {v7.8b},  [x3], x1
+
+        ret
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+
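+// Fast path for idct_idct with eob == 1: only the DC coefficient is
+// nonzero, so compute the single rounded output value and add it to
+// all 16x16 destination pixels.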
+function idct16x16_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+
+        movi            v1.4h,  #0
+
+        ld1             {v2.h}[0], [x2]
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        dup             v2.8h,  v2.h[0]
+        st1             {v1.h}[0], [x2]
+
+        srshr           v2.8h,  v2.8h,  #6
+
+        mov             x3,  x0
+        mov             x4,  #16
+1:
+        // Loop to add the constant from v2 into all 16x16 outputs
+        subs            x4,  x4,  #2
+        ld1             {v3.16b},  [x0], x1
+        ld1             {v4.16b},  [x0], x1
+        uaddw           v16.8h, v2.8h,  v3.8b
+        uaddw2          v17.8h, v2.8h,  v3.16b
+        uaddw           v18.8h, v2.8h,  v4.8b
+        uaddw2          v19.8h, v2.8h,  v4.16b
+        sqxtun          v3.8b,  v16.8h
+        sqxtun2         v3.16b, v17.8h
+        sqxtun          v4.8b,  v18.8h
+        sqxtun2         v4.16b, v19.8h
+        st1             {v3.16b},  [x3], x1
+        st1             {v4.16b},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct16_end
+        butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
+        butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
+        butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
+        butterfly_8h    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
+        butterfly_8h    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
+        butterfly_8h    v24, v21, v23, v21               // v24 = t9,   v21 = t10
+        butterfly_8h    v23, v27, v25, v27               // v23 = t14,  v27 = t13
+        butterfly_8h    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
+
+        dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
+        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
+
+        butterfly_8h    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
+        butterfly_8h    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
+        butterfly_8h_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
+        butterfly_8h    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
+        butterfly_8h    v18, v29, v4,  v2                // v18 = out[2], v29 = out[13]
+        butterfly_8h    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
+        butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
+        butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
+        ret
+.endm
+
+function idct16
+        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
+        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
+        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
+        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
+        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
+        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
+        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_half
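+        // Half variant of idct16, for the case where only the first 8
+        // input rows (v16-v23) are nonzero; the _h1/_h2 macros are
+        // reduced butterfly variants for this case.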
+        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
+        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
+        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
+        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
+        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
+        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
+        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter
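+        // Quarter variant of idct16, for the case where only the first 4
+        // input rows (v16-v19) are nonzero; most butterflies reduce to
+        // single multiplications (dsmull_h/drshrn_h) of the live inputs.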
+        dsmull_h        v24, v25, v19, v1.h[7]
+        dsmull_h        v4,  v5,  v17, v1.h[0]
+        dsmull_h        v7,  v6,  v18, v0.h[5]
+        dsmull_h        v30, v31, v18, v0.h[4]
+        neg             v24.4s,  v24.4s
+        neg             v25.4s,  v25.4s
+        dsmull_h        v29, v28, v17, v1.h[1]
+        dsmull_h        v26, v27, v19, v1.h[6]
+        dsmull_h        v22, v23, v16, v0.h[0]
+        drshrn_h        v24, v24, v25, #14
+        drshrn_h        v16, v4,  v5,  #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v6,  v30, v31, #14
+        drshrn_h        v29, v29, v28, #14
+        drshrn_h        v17, v26, v27, #14
+        drshrn_h        v28, v22, v23, #14
+
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
+        neg             v22.4s,  v22.4s
+        neg             v23.4s,  v23.4s
+        drshrn_h        v27, v20, v21, #14
+        drshrn_h        v21, v22, v23, #14
+        drshrn_h        v23, v18, v19, #14
+        drshrn_h        v25, v30, v31, #14
+        mov             v4.16b,  v28.16b
+        mov             v5.16b,  v28.16b
+        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
+        mov             v20.16b, v28.16b
+        idct16_end
+endfunc
+
+function iadst16
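+        // Expects x11 to point at iadst16_coeffs (loaded into v0/v1 here)
+        // and x10 at idct_coeffs (reloaded into v0 partway through).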
+        ld1             {v0.8h,v1.8h}, [x11]
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // v6,v7   = t1,   v4,v5   = t0
+        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4]   // v10,v11 = t9,   v8,v9   = t8
+        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
+        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3,   v12,v13 = t2
+        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
+
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6]   // v6,v7   = t11,  v4,v5   = t10
+        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
+        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0]   // v10,v11 = t5,   v8,v9   = t4
+        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
+
+        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13,  v12,v13 = t12
+        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
+        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2]   // v6,v7   = t7,   v4,v5   = t6
+        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6]   // v10,v11 = t15,  v8,v9   = t14
+        ld1             {v0.8h}, [x10]
+        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
+        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5]   // v14,v15 = t9,   v12,v13 = t8
+        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
+
+        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4]   // v4,v5   = t12,  v6,v7   = t13
+        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
+        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7]   // v10,v11 = t11,  v8,v9   = t10
+        butterfly_8h_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
+        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
+
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6]   // v12,v13 = t14,  v14,v15 = t15
+        butterfly_8h_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
+        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
+        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
+
+        butterfly_8h_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
+        butterfly_8h_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3]   // v10,v11 = t13,  v8,v9   = t12
+        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2]   // v12,v13 = t14,  v14,v15 = t15
+
+        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
+        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
+        neg             v29.8h, v29.8h                   // v29 = out[13]
+
+        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3]   // v10,v11 = t5a,  v8,v9   = t4a
+        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2]   // v12,v13 = t6a,  v14,v15 = t7a
+
+        butterfly_8h    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
+        butterfly_8h    v3,  v7,  v23, v21               // v3 = -out[1], v7 = t10
+
+        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
+        neg             v19.8h, v19.8h                   // v19 = out[3]
+        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
+
+        butterfly_8h    v5,  v8,  v20, v22               // v5 = -out[15], v8 = t3a
+        butterfly_8h    v4,  v9,  v24, v26               // v4 = out[14],  v9 = t11
+
+        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
+        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
+        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
+        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
+
+        neg             v31.8h,  v5.8h                    // v31 = out[15]
+        neg             v17.8h,  v3.8h                    // v17 = out[1]
+
+        mov             v16.16b, v2.16b
+        mov             v30.16b, v4.16b
+        ret
+endfunc
+
+// Helper macros; we can't use these expressions directly within
+// e.g. .irp due to the extra concatenation \(). Therefore wrap
+// them in macros to allow using .irp below.
+.macro load i, src, inc
+        ld1             {v\i\().8h},  [\src], \inc
+.endm
+.macro store i, dst, inc
+        st1             {v\i\().8h},  [\dst], \inc
+.endm
+.macro movi_v i, size, imm
+        movi            v\i\()\size,  \imm
+.endm
+.macro load_clear i, src, inc
+        ld1             {v\i\().8h}, [\src]
+        st1             {v2.8h},  [\src], \inc
+.endm
+
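+// Round eight rows of coefficients down by 6, add them into the
+// destination pixels at the interleaved row pointers x0 and x3, and
+// store the result; loads, stores and arithmetic are interleaved to
+// hide latencies.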
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
+        srshr           \coef0, \coef0, #6
+        ld1             {v2.8b},  [x0], x1
+        srshr           \coef1, \coef1, #6
+        ld1             {v3.8b},  [x3], x1
+        srshr           \coef2, \coef2, #6
+        ld1             {v4.8b},  [x0], x1
+        srshr           \coef3, \coef3, #6
+        uaddw           \coef0, \coef0, v2.8b
+        ld1             {v5.8b},  [x3], x1
+        uaddw           \coef1, \coef1, v3.8b
+        srshr           \coef4, \coef4, #6
+        ld1             {v6.8b},  [x0], x1
+        srshr           \coef5, \coef5, #6
+        ld1             {v7.8b},  [x3], x1
+        sqxtun          v2.8b,  \coef0
+        srshr           \coef6, \coef6, #6
+        sqxtun          v3.8b,  \coef1
+        srshr           \coef7, \coef7, #6
+        uaddw           \coef2, \coef2, v4.8b
+        ld1             {\tmp1},  [x0], x1
+        uaddw           \coef3, \coef3, v5.8b
+        ld1             {\tmp2},  [x3], x1
+        sqxtun          v4.8b,  \coef2
+        sub             x0,  x0,  x1, lsl #2
+        sub             x3,  x3,  x1, lsl #2
+        sqxtun          v5.8b,  \coef3
+        uaddw           \coef4, \coef4, v6.8b
+        st1             {v2.8b},  [x0], x1
+        uaddw           \coef5, \coef5, v7.8b
+        st1             {v3.8b},  [x3], x1
+        sqxtun          v6.8b,  \coef4
+        st1             {v4.8b},  [x0], x1
+        sqxtun          v7.8b,  \coef5
+        st1             {v5.8b},  [x3], x1
+        uaddw           \coef6, \coef6, \tmp1
+        st1             {v6.8b},  [x0], x1
+        uaddw           \coef7, \coef7, \tmp2
+        st1             {v7.8b},  [x3], x1
+        sqxtun          \tmp1,  \coef6
+        sqxtun          \tmp2,  \coef7
+        st1             {\tmp1},  [x0], x1
+        st1             {\tmp2},  [x3], x1
+.endm
+
+// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
+// transpose into a horizontal 16x8 slice and store.
+// x0 = dst (temp buffer)
+// x1 = slice offset
+// x2 = src
+// x9 = input stride
+.macro itxfm16_1d_funcs txfm
+function \txfm\()16_1d_8x16_pass1_neon
+        mov             x14, x30
+
+        movi            v2.8h, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              \txfm\()16
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        cmp             x1,  #8
+        b.eq            1f
+.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
+        store           \i,  x0,  #16
+.endr
+        br              x14
+1:
+        // Special case: For the last input column (x1 == 8),
+        // which would be stored as the last row in the temp buffer,
+        // don't store the first 8x8 block, but keep it in registers
+        // for the first slice of the second pass (where it is the
+        // last 8x8 block).
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        add             x0,  x0,  #16
+        store           \i,  x0,  #16
+.endr
+        mov             v24.16b, v16.16b
+        mov             v25.16b, v17.16b
+        mov             v26.16b, v18.16b
+        mov             v27.16b, v19.16b
+        mov             v28.16b, v20.16b
+        mov             v29.16b, v21.16b
+        mov             v30.16b, v22.16b
+        mov             v31.16b, v23.16b
+        br              x14
+endfunc
+
+// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
+// load the destination pixels (from a similar 8x16 slice), add and store back.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x3 = slice offset
+// x9 = temp buffer stride
+function \txfm\()16_1d_8x16_pass2_neon
+        mov             x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+        cbz             x3,  1f
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              \txfm\()16
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        br              x14
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+.macro itxfm_func16x16 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             w3,  #1
+        b.eq            idct16x16_dc_add_neon
+.endif
+        mov             x15, x30
+        // iadst16 clobbers the callee-saved registers v8-v15, so they need
+        // to be saved and restored around it; idct16 doesn't touch them.
+.ifnc \txfm1\()_\txfm2,idct_idct
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+.endif
+
+        sub             sp,  sp,  #512
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        movrel          x10, idct_coeffs
+.ifnc \txfm1\()_\txfm2,idct_idct
+        movrel          x11, iadst16_coeffs
+.endif
+.ifc \txfm1,idct
+        ld1             {v0.8h,v1.8h}, [x10]
+.endif
+        mov             x9,  #32
+
+.ifc \txfm1\()_\txfm2,idct_idct
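+        // Small eob values mean that the nonzero coefficients are confined
+        // to the top-left corner of the block (eob <= 10: 4x4, eob <= 38:
+        // 8x8, given the scan order), so reduced transforms suffice.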
+        cmp             w3,  #10
+        b.le            idct16x16_quarter_add_neon
+        cmp             w3,  #38
+        b.le            idct16x16_half_add_neon
+.endif
+
+.irp i, 0, 8
+        add             x0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 8
+        cmp             w3,  #38
+        b.le            1f
+.endif
+.endif
+        mov             x1,  #\i
+        add             x2,  x6,  #(\i*2)
+        bl              \txfm1\()16_1d_8x16_pass1_neon
+.endr
+.ifc \txfm1\()_\txfm2,iadst_idct
+        ld1             {v0.8h,v1.8h}, [x10]
+.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        // Set v24-v31 to zero, for the in-register passthrough of
+        // coefficients to pass 2. Since we only do two slices, this can
+        // only ever happen for the second slice. So we only need to store
+        // zeros to the temp buffer for the second half of the buffer.
+        // Move x0 to the second half, and use x9 == 32 as increment.
+        add             x0,  x0,  #16
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        movi_v          \i,  .16b, #0
+        st1             {v24.8h},  [x0], x9
+.endr
+3:
+.endif
+
+.irp i, 0, 8
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        mov             x3,  #\i
+        bl              \txfm2\()16_1d_8x16_pass2_neon
+.endr
+
+        add             sp,  sp,  #512
+.ifnc \txfm1\()_\txfm2,idct_idct
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+.endif
+        br              x15
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+
+function idct16_1d_8x16_pass1_quarter_neon
+        mov             x14, x30
+        movi            v2.8h, #0
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_quarter
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        // The first 8x8 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        // Since only a 4x4 part of the input was nonzero, this means that
+        // only 4 rows are nonzero after transposing, and the second pass
+        // only reads the topmost 4 rows. Therefore only store the topmost
+        // 4 rows.
+        add             x0,  x0,  #16
+.irp i, 24, 25, 26, 27
+        store           \i,  x0,  x9
+.endr
+        br              x14
+endfunc
+
+function idct16_1d_8x16_pass2_quarter_neon
+        mov             x14, x30
+        cbz             x3,  1f
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_quarter
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        br              x14
+endfunc
+
+function idct16_1d_8x16_pass1_half_neon
+        mov             x14, x30
+        movi            v2.8h, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_half
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        // The first 8x8 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        add             x0,  x0,  #16
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        store           \i,  x0,  x9
+.endr
+        br              x14
+endfunc
+
+function idct16_1d_8x16_pass2_half_neon
+        mov             x14, x30
+        cbz             x3,  1f
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_half
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        br              x14
+endfunc
+
+.macro idct16_partial size
+function idct16x16_\size\()_add_neon
+        add             x0,  sp,  #(0*32)
+        add             x2,  x6,  #(0*2)
+        bl              idct16_1d_8x16_pass1_\size\()_neon
+.irp i, 0, 8
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        mov             x3,  #\i
+        bl              idct16_1d_8x16_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+.endm
+
+idct16_partial quarter
+idct16_partial half
+
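+// The same DC-only fast path as idct16x16_dc_add_neon, for 32x32 blocks.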
+function idct32x32_dc_add_neon
+        movrel          x4,  idct_coeffs
+        ld1             {v0.4h}, [x4]
+
+        movi            v1.4h,  #0
+
+        ld1             {v2.h}[0], [x2]
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        dup             v2.8h,  v2.h[0]
+        st1             {v1.h}[0], [x2]
+
+        srshr           v0.8h,  v2.8h,  #6
+
+        mov             x3,  x0
+        mov             x4,  #32
+1:
+        // Loop to add the constant v0 into all 32x32 outputs
+        subs            x4,  x4,  #2
+        ld1             {v1.16b,v2.16b},  [x0], x1
+        uaddw           v16.8h, v0.8h,  v1.8b
+        uaddw2          v17.8h, v0.8h,  v1.16b
+        ld1             {v3.16b,v4.16b},  [x0], x1
+        uaddw           v18.8h, v0.8h,  v2.8b
+        uaddw2          v19.8h, v0.8h,  v2.16b
+        uaddw           v20.8h, v0.8h,  v3.8b
+        uaddw2          v21.8h, v0.8h,  v3.16b
+        uaddw           v22.8h, v0.8h,  v4.8b
+        uaddw2          v23.8h, v0.8h,  v4.16b
+        sqxtun          v1.8b,  v16.8h
+        sqxtun2         v1.16b, v17.8h
+        sqxtun          v2.8b,  v18.8h
+        sqxtun2         v2.16b, v19.8h
+        sqxtun          v3.8b,  v20.8h
+        sqxtun2         v3.16b, v21.8h
+        st1             {v1.16b,v2.16b},  [x3], x1
+        sqxtun          v4.8b,  v22.8h
+        sqxtun2         v4.16b, v23.8h
+        st1             {v3.16b,v4.16b},  [x3], x1
+        b.ne            1b
+
+        ret
+endfunc
+
+.macro idct32_end
+        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
+        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
+        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
+        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
+        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
+        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
+        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
+        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29
+
+        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
+        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
+        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
+        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
+
+        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
+        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
+        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
+        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
+        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
+        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
+        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
+        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20
+
+        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
+        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
+        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
+        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
+        ret
+.endm
+
+function idct32_odd
+        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_half
+        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_quarter
+        dsmull_h        v4,  v5,  v16, v8.h[0]
+        dsmull_h        v28, v29, v19, v8.h[7]
+        dsmull_h        v30, v31, v16, v8.h[1]
+        dsmull_h        v22, v23, v17, v9.h[6]
+        dsmull_h        v7,  v6,  v17, v9.h[7]
+        dsmull_h        v26, v27, v19, v8.h[6]
+        dsmull_h        v20, v21, v18, v9.h[0]
+        dsmull_h        v24, v25, v18, v9.h[1]
+
+        neg             v28.4s, v28.4s
+        neg             v29.4s, v29.4s
+        neg             v7.4s,  v7.4s
+        neg             v6.4s,  v6.4s
+
+        drshrn_h        v4,  v4,  v5,  #14
+        drshrn_h        v5,  v28, v29, #14
+        drshrn_h        v29, v30, v31, #14
+        drshrn_h        v28, v22, v23, #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v31, v26, v27, #14
+        drshrn_h        v6,  v20, v21, #14
+        drshrn_h        v30, v24, v25, #14
+
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[4], v0.h[5]
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[4], v0.h[5]
+        drshrn_h        v23, v16, v17, #14
+        drshrn_h        v24, v18, v19, #14
+        neg             v20.4s, v20.4s
+        neg             v21.4s, v21.4s
+        drshrn_h        v27, v27, v26, #14
+        drshrn_h        v20, v20, v21, #14
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[6], v0.h[7]
+        drshrn_h        v21, v16, v17, #14
+        drshrn_h        v26, v18, v19, #14
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[6], v0.h[7]
+        drshrn_h        v25, v16, v17, #14
+        neg             v18.4s, v18.4s
+        neg             v19.4s, v19.4s
+        drshrn_h        v22, v18, v19, #14
+
+        idct32_end
+endfunc
+
+.macro idct32_funcs suffix
+// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
+// The 32-point IDCT can be decomposed into two 16-point IDCTs:
+// a normal idct16 of the even input components (with each output
+// written twice), followed by a separate 16-point transform of the
+// odd inputs, whose outputs are added to/subtracted from the outputs
+// of the first idct16.
+// x0 = dst (temp buffer)
+// x1 = unused
+// x2 = src
+// x9 = double input stride
+function idct32_1d_8x32_pass1\suffix\()_neon
+        mov             x14, x30
+        movi            v2.8h,  #0
+
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct16\suffix
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
+        // two transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the registers a, b horizontally, followed by the same
+        // registers in reverse order (b, a) with the elements of each mirrored.
+.macro store_rev a, b
+        // There's no rev128 instruction, but we reverse each 64 bit
+        // half, and then flip them using an ext with 8 bytes offset.
+        rev64           v3.8h, \b
+        st1             {\a},  [x0], #16
+        rev64           v2.8h, \a
+        ext             v3.16b, v3.16b, v3.16b, #8
+        st1             {\b},  [x0], #16
+        ext             v2.16b, v2.16b, v2.16b, #8
+        st1             {v3.8h},  [x0], #16
+        st1             {v2.8h},  [x0], #16
+.endm
+        store_rev       v16.8h, v24.8h
+        store_rev       v17.8h, v25.8h
+        store_rev       v18.8h, v26.8h
+        store_rev       v19.8h, v27.8h
+        store_rev       v20.8h, v28.8h
+        store_rev       v21.8h, v29.8h
+        store_rev       v22.8h, v30.8h
+        store_rev       v23.8h, v31.8h
+        sub             x0,  x0,  #512
+.purgem store_rev
+
+        // Move x2 back to the start of the input, and move
+        // to the first odd row
+.ifb \suffix
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        add             x2,  x2,  #64
+
+        movi            v2.8h,  #0
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load_clear      \i, x2, x9
+.endr
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i, x2, x9
+.endr
+.endif
+
+        bl              idct32_odd\suffix
+
+        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
+        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
+
+        // Store the registers a, b horizontally, adding into the existing
+        // output first, and then the same registers in reverse order
+        // (b, a), mirrored and subtracted from the output.
+.macro store_rev a, b
+        ld1             {v4.8h},  [x0]
+        rev64           v3.8h, \b
+        add             v4.8h, v4.8h, \a
+        rev64           v2.8h, \a
+        st1             {v4.8h},  [x0], #16
+        ext             v3.16b, v3.16b, v3.16b, #8
+        ld1             {v5.8h},  [x0]
+        ext             v2.16b, v2.16b, v2.16b, #8
+        add             v5.8h, v5.8h, \b
+        st1             {v5.8h},  [x0], #16
+        ld1             {v6.8h},  [x0]
+        sub             v6.8h, v6.8h, v3.8h
+        st1             {v6.8h},  [x0], #16
+        ld1             {v7.8h},  [x0]
+        sub             v7.8h, v7.8h, v2.8h
+        st1             {v7.8h},  [x0], #16
+.endm
+
+        store_rev       v31.8h, v23.8h
+        store_rev       v30.8h, v22.8h
+        store_rev       v29.8h, v21.8h
+        store_rev       v28.8h, v20.8h
+        store_rev       v27.8h, v19.8h
+        store_rev       v26.8h, v18.8h
+        store_rev       v25.8h, v17.8h
+        store_rev       v24.8h, v16.8h
+.purgem store_rev
+        br              x14
+endfunc
+
+// This is mostly the same as 8x32_pass1, but without the transpose;
+// it uses the source as a temp buffer between the two idct passes,
+// and adds into the destination.
+// x0 = dst
+// x1 = dst stride
+// x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
+function idct32_1d_8x32_pass2\suffix\()_neon
+        mov             x14, x30
+        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+
+        bl              idct16\suffix
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        store           \i, x2, x9
+.endr
+
+        sub             x2,  x2,  x9, lsl #4
+        add             x2,  x2,  #64
+
+        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+.irp i, 16, 17, 18, 19
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i, x2, x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+.endif
+        sub             x2,  x2,  #64
+
+        bl              idct32_odd\suffix
+
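+        // Combine the odd-IDCT outputs (held in registers) with the
+        // even-IDCT results stored in the temp buffer: added for the first
+        // 16 rows, and subtracted, reading the temp buffer backwards via
+        // the negative stride x7, for the last 16; then round, add into
+        // the destination pixels and store.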
+.macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
+        ld1             {v4.8h},  [x2], x9
+        ld1             {v5.8h},  [x2], x9
+        add             v4.8h, v4.8h, \a
+        ld1             {v6.8h},  [x2], x9
+        add             v5.8h, v5.8h, \b
+        ld1             {v7.8h},  [x2], x9
+        add             v6.8h, v6.8h, \c
+        add             v7.8h, v7.8h, \d
+.else
+        ld1             {v4.8h},  [x2], x7
+        ld1             {v5.8h},  [x2], x7
+        sub             v4.8h, v4.8h, \a
+        ld1             {v6.8h},  [x2], x7
+        sub             v5.8h, v5.8h, \b
+        ld1             {v7.8h},  [x2], x7
+        sub             v6.8h, v6.8h, \c
+        sub             v7.8h, v7.8h, \d
+.endif
+        ld1             {v10.8b}, [x0], x1
+        ld1             {v11.8b}, [x0], x1
+        srshr           v4.8h, v4.8h, #6
+        ld1             {v2.8b}, [x0], x1
+        srshr           v5.8h, v5.8h, #6
+        uaddw           v4.8h, v4.8h, v10.8b
+        ld1             {v3.8b}, [x0], x1
+        srshr           v6.8h, v6.8h, #6
+        uaddw           v5.8h, v5.8h, v11.8b
+        srshr           v7.8h, v7.8h, #6
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v6.8h, v6.8h, v2.8b
+        sqxtun          v4.8b, v4.8h
+        uaddw           v7.8h, v7.8h, v3.8b
+        sqxtun          v5.8b, v5.8h
+        st1             {v4.8b}, [x0], x1
+        sqxtun          v6.8b, v6.8h
+        st1             {v5.8b}, [x0], x1
+        sqxtun          v7.8b, v7.8h
+        st1             {v6.8b}, [x0], x1
+        st1             {v7.8b}, [x0], x1
+.endm
+        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
+        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
+        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
+        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
+        sub             x2,  x2,  x9
+        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
+        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
+        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
+        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
+.purgem load_acc_store
+        br              x14
+endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
+
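+// Minimum eob values for which each successive 8-column slice of the
+// 32x32 coefficient matrix can contain nonzero coefficients, given the
+// scan order; used below to skip pass 1 for slices known to be zero.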
+const min_eob_idct_idct_32, align=4
+        .short  0, 34, 135, 336
+endconst
+
+function ff_vp9_idct_idct_32x32_add_neon, export=1
+        cmp             w3,  #1
+        b.eq            idct32x32_dc_add_neon
+
+        movrel          x10, idct_coeffs
+
+        mov             x15, x30
+
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        sub             sp,  sp,  #2048
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #128
+        neg             x7,  x9
+
+        ld1             {v0.8h,v1.8h}, [x10], #32
+        ld1             {v8.8h,v9.8h}, [x10]
+
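+        // As in the 16x16 case, small eob values confine the nonzero
+        // coefficients to the top-left corner (eob <= 34: 8x8,
+        // eob <= 135: 16x16), handled by the reduced transforms.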
+        cmp             w3,  #34
+        b.le            idct32x32_quarter_add_neon
+        cmp             w3,  #135
+        b.le            idct32x32_half_add_neon
+
+        movrel          x12, min_eob_idct_idct_32, 2
+
+.irp i, 0, 8, 16, 24
+        add             x0,  sp,  #(\i*64)
+.if \i > 0
+        ldrh            w1,  [x12], #2
+        cmp             w3,  w1
+        mov             x1,  #(32 - \i)/4
+        b.le            1f
+.endif
+        add             x2,  x6,  #(\i*2)
+        bl              idct32_1d_8x32_pass1_neon
+.endr
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v16.8h,v17.8h,v18.8h,v19.8h},  [x0], #64
+.endr
+        b.ne            2b
+3:
+.irp i, 0, 8, 16, 24
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        bl              idct32_1d_8x32_pass2_neon
+.endr
+
+        add             sp,  sp,  #2048
+
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+
+        br              x15
+endfunc
+
+.macro idct32_partial size
+function idct32x32_\size\()_add_neon
+        add             x0,  sp,  #(0*64)
+        add             x2,  x6,  #(0*2)
+        bl              idct32_1d_8x32_pass1_\size\()_neon
+.ifc \size,half
+        add             x0,  sp,  #(8*64)
+        add             x2,  x6,  #(8*2)
+        bl              idct32_1d_8x32_pass1_\size\()_neon
+.endif
+.irp i, 0, 8, 16, 24
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        bl              idct32_1d_8x32_pass2_\size\()_neon
+.endr
+
+        add             sp,  sp,  #2048
+
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+
+        br              x15
+endfunc
+.endm
+
+idct32_partial quarter
+idct32_partial half
new file mode 100644
--- /dev/null
+++ b/media/ffvpx/libavcodec/aarch64/vp9lpf_16bpp_neon.S
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2017 Google Inc.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+
+.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
+
+        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
+        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
+        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
+        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
+.endm
+
+// The input to and output from this macro is in the registers v16-v31,
+// and v0-v7 are used as scratch registers.
+// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
+// Depending on the width of the loop filter, we either use v16-v19
+// and v28-v31 as temp registers, or v8-v15.
+.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+        dup             v0.8h,  w2                   // E
+        dup             v2.8h,  w3                   // I
+        dup             v3.8h,  w4                   // H
+
+        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
+        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
+        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
+        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
+        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
+        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
+        umax            v4.8h,  v4.8h,  v5.8h
+        umax            v5.8h,  v6.8h,  v7.8h
+        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
+        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
+        umax            v4.8h,  v4.8h,  v5.8h
+        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
+        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
+        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
+        ushr            v5.8h,  v5.8h,  #1
+        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
+        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
+        cmhs            v6.8h,  v0.8h,  v6.8h
+        and             v4.16b, v4.16b, v6.16b       // fm
+
+        // If no pixels need filtering, just exit as soon as possible
+        mov             x11, v4.d[0]
+        mov             x12, v4.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        br              x10
+1:
+
+.if \wd >= 8
+        dup             v0.8h,  w5
+
+        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
+        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
+        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
+        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
+        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
+        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
+        umax            v6.8h,  v6.8h,  v2.8h
+        umax            v1.8h,  v1.8h,  \tmp1\().8h
+        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
+.if \wd == 16
+        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
+        umax            v6.8h,  v6.8h,  v1.8h
+        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
+        umax            v6.8h,  v6.8h,  \tmp2\().8h
+        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
+        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
+        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
+        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
+        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
+        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
+        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
+        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
+        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)
+
+        umax            v7.8h,  v7.8h,  v2.8h
+        umax            v1.8h,  v1.8h,  v8.8h
+        umax            v9.8h,  v9.8h,  v10.8h
+        umax            v11.8h, v11.8h, v12.8h
+        // The rest of the calculation of flat8out is interleaved below
+.else
+        // The rest of the calculation of flat8in is interleaved below
+.endif
+.endif
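+
+        // For wd >= 8, v6 is on its way to becoming the flat8in mask; as a
+        // hedged C sketch of the reference logic:
+        //   flat8in  = max(|p3-p0|, |p2-p0|, |p1-p0|,
+        //                  |q1-q0|, |q2-q0|, |q3-q0|) <= F
+        //   flat8out = the same test over p7..p4 and q4..q7 (wd == 16 only)
+        // where F is the flat threshold passed in w5, i.e. 1 << (BIT_DEPTH - 8).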
+
+        // Calculate the normal inner loop filter for 2 or 4 pixels
+        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
+.if \wd == 16
+        umax            v7.8h,  v7.8h,  v1.8h
+        umax            v9.8h,  v9.8h,  v11.8h
+.elseif \wd == 8
+        umax            v6.8h,  v6.8h,  v1.8h
+.endif
+        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
+.if \wd == 16
+        umax            v7.8h,  v7.8h,  v9.8h
+.elseif \wd == 8
+        umax            v6.8h,  v6.8h,  \tmp2\().8h
+.endif
+        dup             \tmp2\().8h,  w6                        // left shift for saturation
+        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
+        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
+        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
+        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
+        movi            \tmp5\().8h,  #3
+.if \wd == 8
+        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
+.endif
+        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
+.if \wd == 8
+        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
+.endif
+        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
+.if \wd == 16
+        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
+.elseif \wd == 8
+        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
+.endif
+        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
+.if \wd == 16
+        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
+.endif
+        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_intp2(p1 - q1, BIT_DEPTH - 1)
+
+        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
+        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int8 = 0
+        movi            v2.8h,  #4
+        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
+        movi            v3.8h,  #3
+        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
+        movi            \tmp5\().8h,  #0
+        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_intp2(3 * (q0 - p0) [+ av_clip_intp2(p1 - q1)], BIT_DEPTH - 1) = f
+        dup             \tmp6\().8h,  w7                        // max pixel value
+.if \wd == 16
+        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
+.endif
+
+        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1
+
+        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
+        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
+        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
+        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
+        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
+        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2
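+
+        // f, f1 and f2 now match the usual VP9 inner filter; a hedged C
+        // sketch, with BD = BIT_DEPTH:
+        //   f  = av_clip_intp2(3 * (q0 - p0)
+        //                      + (hev ? av_clip_intp2(p1 - q1, BD - 1) : 0), BD - 1);
+        //   f1 = FFMIN(f + 4, (1 << (BD - 1)) - 1) >> 3;
+        //   f2 = FFMIN(f + 3, (1 << (BD - 1)) - 1) >> 3;
+        // p0 + f2 and q0 - f1 are applied below.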
+
+        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
+        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
+        smin            v0.8h,   v0.8h,   \tmp6\().8h
+        smin            v2.8h,   v2.8h,   \tmp6\().8h
+        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
+        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
+        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
+        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
+        bit             v24.16b, v2.16b,  v4.16b
+
+        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
+        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
+.if \wd >= 8
+        mov             x11, v6.d[0]
+.endif
+        smin            v0.8h,  v0.8h,  \tmp6\().8h
+        smin            v2.8h,  v2.8h,  \tmp6\().8h
+.if \wd >= 8
+        mov             x12, v6.d[1]
+.endif
+        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
+        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
+.if \wd >= 8
+        adds            x11, x11, x12
+.endif
+        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
+        bit             v25.16b, v2.16b,  v5.16b
+
+        // If no pixels need flat8in, jump to flat8out
+        // (or to a writeout of the inner 4 pixels, for wd=8)
+.if \wd >= 8
+.if \wd == 16
+        b.eq            6f
+.else
+        b.ne            1f
+        br              x13
+1:
+.endif
+
+        // flat8in
+        add             \tmp1\().8h, v20.8h, v21.8h
+        add             \tmp3\().8h, v22.8h, v25.8h
+        add             \tmp5\().8h, v20.8h, v22.8h
+        add             \tmp7\().8h, v23.8h, v26.8h
+        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
+        add             v0.8h,  v0.8h,  v23.8h
+        add             v0.8h,  v0.8h,  v24.8h
+        add             v0.8h,  v0.8h,  \tmp5\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+        urshr           v2.8h,  v0.8h,  #3                      // out p2
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        add             \tmp1\().8h, v20.8h,  v23.8h
+        add             \tmp3\().8h, v24.8h,  v27.8h
+        urshr           v3.8h,  v0.8h,  #3                      // out p1
+
+        add             v0.8h,  v0.8h,  \tmp7\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        add             \tmp5\().8h, v21.8h,  v24.8h
+        add             \tmp7\().8h, v25.8h,  v27.8h
+        urshr           v4.8h,  v0.8h,  #3                      // out p0
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
+        add             \tmp1\().8h, v22.8h,  v25.8h
+        add             \tmp3\().8h, v26.8h,  v27.8h
+        urshr           v5.8h,  v0.8h,  #3                      // out q0
+
+        add             v0.8h,  v0.8h,  \tmp7\().8h
+        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
+        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1
+
+        add             v0.8h,  v0.8h,  \tmp3\().8h
+        // The output here is written back into the input registers. This doesn't
+        // matter for the flat8out part below, since we only update those pixels
+        // which won't be touched below.
+        bit             v21.16b, v2.16b,  v6.16b
+        bit             v22.16b, v3.16b,  v6.16b
+        bit             v23.16b, v4.16b,  v6.16b
+        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
+        bit             v24.16b, v5.16b,  v6.16b
+        bit             v25.16b, \tmp5\().16b,  v6.16b
+        bit             v26.16b, \tmp6\().16b,  v6.16b
+.endif
+.if \wd == 16
+6:
+        orr             v2.16b,  v6.16b,  v7.16b
+        mov             x11, v2.d[0]
+        mov             x12, v2.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        // If no pixels needed flat8in or flat8out, jump to a
+        // writeout of the inner 4 pixels
+        br              x14
+1:
+
+        mov             x11, v7.d[0]
+        mov             x12, v7.d[1]
+        adds            x11, x11, x12
+        b.ne            1f
+        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
+        br              x15
+
+1:
+        // flat8out
+        // This writes all outputs into v2-v17 (skipping v7 and v16).
+        // If this part is skipped, the output is read from v21-v26 (which is the input
+        // to this section).
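+        //
+        // The wide filter is computed as a running sum in v0: it starts as
+        // 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 (16 tap weights),
+        // each output is rounded with urshr as (v0 + 8) >> 4, and stepping
+        // to the next output only costs adding the difference between the
+        // incoming and outgoing taps, which is what the interleaved add/sub
+        // pairs feeding v0 compute.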
+        shl             v0.8h,   v16.8h,  #3     // 8 * v16
+        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
+        add             v0.8h,   v0.8h,   v17.8h
+        add             v8.8h,   v17.8h,  v18.8h
+        add             v10.8h,  v19.8h,  v20.8h
+        add             v0.8h,   v0.8h,   v8.8h
+        add             v8.8h,   v16.8h,  v17.8h
+        add             v12.8h,  v21.8h,  v22.8h
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v10.8h,  v18.8h,  v25.8h
+        add             v14.8h,  v23.8h,  v24.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        add             v0.8h,   v0.8h,   v12.8h
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v18.8h
+        add             v14.8h,  v19.8h,  v26.8h
+        urshr           v2.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v8.8h,   v16.8h,  v19.8h
+        add             v10.8h,  v20.8h,  v27.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        bif             v2.16b,  v17.16b, v7.16b
+        urshr           v3.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v20.8h
+        add             v14.8h,  v21.8h,  v28.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        bif             v3.16b,  v18.16b, v7.16b
+        urshr           v4.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v8.8h,   v16.8h,  v21.8h
+        add             v10.8h,  v22.8h,  v29.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        bif             v4.16b,  v19.16b, v7.16b
+        urshr           v5.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v12.8h,  v16.8h,  v22.8h
+        add             v14.8h,  v23.8h,  v30.8h
+        sub             v10.8h,  v10.8h,  v8.8h
+        bif             v5.16b,  v20.16b, v7.16b
+        urshr           v6.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        add             v10.8h,  v16.8h,  v23.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        add             v12.8h,  v24.8h,  v31.8h
+        bif             v6.16b,  v21.16b, v7.16b
+        urshr           v8.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        sub             v10.8h,  v12.8h,  v10.8h
+        add             v12.8h,  v17.8h,  v24.8h
+        add             v14.8h,  v25.8h,  v31.8h
+        bif             v8.16b,  v22.16b, v7.16b
+        urshr           v9.8h,   v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v10.8h
+        sub             v14.8h,  v14.8h,  v12.8h
+        add             v12.8h,  v26.8h,  v31.8h
+        bif             v9.16b,  v23.16b, v7.16b
+        urshr           v10.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v14.8h,  v18.8h,  v25.8h
+        add             v18.8h,  v19.8h,  v26.8h
+        sub             v12.8h,  v12.8h,  v14.8h
+        add             v14.8h,  v27.8h,  v31.8h
+        bif             v10.16b, v24.16b, v7.16b
+        urshr           v11.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v12.8h
+        add             v12.8h,  v20.8h,  v27.8h
+        sub             v14.8h,  v14.8h,  v18.8h
+        add             v18.8h,  v28.8h,  v31.8h
+        bif             v11.16b, v25.16b, v7.16b
+        sub             v18.8h,  v18.8h,  v12.8h
+        urshr           v12.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v14.8h
+        add             v14.8h,  v21.8h,  v28.8h
+        add             v20.8h,  v29.8h,  v31.8h
+        bif             v12.16b, v26.16b, v7.16b
+        urshr           v13.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v18.8h
+        sub             v20.8h,  v20.8h,  v14.8h
+        add             v18.8h,  v22.8h,  v29.8h
+        add             v22.8h,  v30.8h,  v31.8h
+        bif             v13.16b, v27.16b, v7.16b
+        urshr           v14.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v20.8h
+        sub             v22.8h,  v22.8h,  v18.8h
+        bif             v14.16b, v28.16b, v7.16b
+        urshr           v15.8h,  v0.8h,   #4
+
+        add             v0.8h,   v0.8h,   v22.8h
+        bif             v15.16b, v29.16b, v7.16b
+        urshr           v17.8h,  v0.8h,   #4
+        bif             v17.16b, v30.16b, v7.16b
+.endif
+.endm
+
+// For wd <= 8 we use v16-v19 and v28-v31 as temp registers, while for
+// wd = 16 those are needed for inputs/outputs, so we use v8-v15 as temp
+// registers there instead.
+function vp9_loop_filter_4
+        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
+        ret
+endfunc
+
+function vp9_loop_filter_8
+        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
+        ret
+endfunc
+
+function vp9_loop_filter_16
+        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
+        ret
+endfunc
+
+.macro loop_filter_4
+        bl              vp9_loop_filter_4
+.endm
+
+.macro loop_filter_8
+        // Calculate the alternative 'return' target
+        adr             x13, 6f
+        bl              vp9_loop_filter_8
+.endm
+
+.macro loop_filter_16
+        // Calculate alternative 'return' targets
+        adr             x14, 7f
+        adr             x15, 8f
+        bl              vp9_loop_filter_16
+.endm
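+
+// Calling convention for the shared cores above: each outer function saves
+// its return address in x10 (mov x10, x30), so the core, reached via bl,
+// can bail out of both the core and the outer function at once with
+// br x10 when no pixels need filtering. Likewise x13 (wd=8) and x14/x15
+// (wd=16) hold alternative writeout targets, set up with adr here.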
+
+
+// The public functions in this file have the following signature:
+// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
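+//
+// mb_lim, lim and hev_thr are the loop-filter thresholds usually written
+// E, I and H; for the 10/12 bpp functions, dst in practice points at
+// 16-bit pixels even though the shared prototype keeps uint8_t *. In the
+// mix2 variants below, each threshold argument packs two 8-bit values,
+// one per 8-pixel half of the 16-pixel edge.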
+
+.macro bpp_frontend func, bpp, push
+function ff_\func\()_\bpp\()_neon, export=1
+.if \push
+        mov             x16, x30
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+.endif
+        lsl             w2,  w2,  #\bpp - 8
+        lsl             w3,  w3,  #\bpp - 8
+        lsl             w4,  w4,  #\bpp - 8
+        mov             x5,  #1 << (\bpp - 8)
+        mov             x6,  #16 - \bpp
+        mov             x7,  #((1 << \bpp) - 1)
+.if \push
+        bl              \func\()_16_neon
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+        br              x16
+.else
+        b               \func\()_16_neon
+.endif
+endfunc
+.endm
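+
+// As a concrete example of the scaling above: at 12 bpp the E/I/H
+// thresholds are shifted left by 4 to match the wider pixel range,
+// x5 = 1 << 4 becomes the flat threshold, x6 = 4 is the saturation
+// shift used by the sqshl/sshl pairs in the filter, and
+// x7 = (1 << 12) - 1 = 4095 is the maximum pixel value for clamping.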
+
+.macro bpp_frontends func, push=0
+        bpp_frontend    \func, 10, \push
+        bpp_frontend    \func, 12, \push
+.endm
+
+.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
+function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
+        mov             x16, x30
+.if \push
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+.endif
+        lsl             w2,  w2,  #\bpp - 8
+        lsl             w3,  w3,  #\bpp - 8
+        lsl             w4,  w4,  #\bpp - 8
+        mov             x5,  #1 << (\bpp - 8)
+        mov             x6,  #16 - \bpp
+        mov             x7,  #((1 << \bpp) - 1)
+        bl              \func\()_\int_suffix\()_16_neon
+.ifc \dir,h
+        add             x0,  x0,  x1, lsl #3
+.else
+        add             x0,  x0,  #16
+.endif
+        bl              \func\()_\int_suffix\()_16_neon
+.if \push
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+.endif
+        br              x16
+endfunc
+.endm
+
+.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
+        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
+        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
+.endm
+
+.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
+function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
+        mov             x16, x30
+        lsr             w8,  w2,  #8
+        lsr             w14, w3,  #8
+        lsr             w15, w4,  #8
+        and             w2,  w2,  #0xff
+        and             w3,  w3,  #0xff
+        and             w4,  w4,  #0xff
+        lsl             w2,  w2,  #\bpp - 8
+        lsl             w3,  w3,  #\bpp - 8
+        lsl             w4,  w4,  #\bpp - 8
+        mov             x5,  #1 << (\bpp - 8)
+        mov             x6,  #16 - \bpp
+        mov             x7,  #((1 << \bpp) - 1)
+        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
+.ifc \dir,h
+        add             x0,  x0,  x1, lsl #3
+.else
+        add             x0,  x0,  #16
+.endif
+        lsl             w2,  w8,  #\bpp - 8
+        lsl             w3,  w14, #\bpp - 8
+        lsl             w4,  w15, #\bpp - 8
+        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
+        br              x16
+endfunc
+.endm
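+
+// The mix2 variants filter two adjacent 8-pixel edges, possibly with
+// different filter widths, in one call: the low byte of each packed
+// threshold applies to the first half and the high byte to the second,
+// and the destination pointer advances by 8 rows (dir=h) or
+// 8 pixels = 16 bytes (dir=v) between the halves.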
+
+.macro bpp_frontends_mix2 wd1, wd2
+        bpp_frontend_mix2 \wd1, \wd2, v, 10
+        bpp_frontend_mix2 \wd1, \wd2, v, 12
+        bpp_frontend_mix2 \wd1, \wd2, h, 10
+        bpp_frontend_mix2 \wd1, \wd2, h, 12
+.endm
+
+function vp9_loop_filter_v_4_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v27.8h}, [x0], x1 // q3
+        sub             x0,  x0,  x1, lsl #2
+        sub             x9,  x9,  x1, lsl #1
+
+        loop_filter_4
+
+        st1             {v22.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_4_8
+
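+// The horizontal variants filter a vertical edge: load 8 rows straddling
+// the edge, transpose so that each vector holds one pixel column, run
+// the same filter as the vertical case, then transpose back and store
+// only the columns that changed.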
+function vp9_loop_filter_h_4_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  #8
+        add             x0,  x9,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1
+        ld1             {v24.8h}, [x0], x1
+        ld1             {v21.8h}, [x9], x1
+        ld1             {v25.8h}, [x0], x1
+        ld1             {v22.8h}, [x9], x1
+        ld1             {v26.8h}, [x0], x1
+        ld1             {v23.8h}, [x9], x1
+        ld1             {v27.8h}, [x0], x1
+
+        sub             x9,  x9,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        loop_filter_4
+
+        // Move x9 forward by 2 pixels (4 bytes, since pixels are 16-bit);
+        // we don't need to rewrite the outermost 2 pixels since they aren't changed.
+        add             x9,  x9,  #4
+        add             x0,  x9,  x1, lsl #2
+
+        // We will only write back the middle 4 pixels; after the loop filter,
+        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
+        // We need to transpose them to columns, done with a 4x8 transpose
+        // (in practice two 4x4 transposes of the two 4x4 halves of the
+        // 8x4 pixels, giving 4x8 pixels).
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1
+        st1             {v22.d}[1], [x0], x1
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #4
+
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_4_8
+
+function vp9_loop_filter_v_8_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v27.8h}, [x0], x1 // q3
+        sub             x9,  x9,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #2
+        add             x9,  x9,  x1
+
+        loop_filter_8
+
+        st1             {v21.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v22.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v26.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+
+        br              x10
+6:
+        sub             x9,  x0,  x1, lsl #1
+        st1             {v22.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_8_8
+
+function vp9_loop_filter_h_8_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  #8
+        add             x0,  x9,  x1, lsl #2
+        ld1             {v20.8h}, [x9], x1
+        ld1             {v24.8h}, [x0], x1
+        ld1             {v21.8h}, [x9], x1
+        ld1             {v25.8h}, [x0], x1
+        ld1             {v22.8h}, [x9], x1
+        ld1             {v26.8h}, [x0], x1
+        ld1             {v23.8h}, [x9], x1
+        ld1             {v27.8h}, [x0], x1
+
+        sub             x9,  x9,  x1, lsl #2
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        loop_filter_8
+
+        add             x0,  x9,  x1, lsl #2
+
+        // Even though only 6 pixels per row have been changed, we write back
+        // the full 8-pixel registers.
+        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
+
+        st1             {v20.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v21.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        st1             {v22.8h}, [x9], x1
+        st1             {v26.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v27.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #8
+
+        br              x10
+6:
+        // If we didn't need to do the flat8in part, we use the same writeback
+        // as in loop_filter_h_4_8.
+        add             x9,  x9,  #4
+        add             x0,  x9,  x1, lsl #2
+        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
+        st1             {v22.d}[0], [x9], x1
+        st1             {v22.d}[1], [x0], x1
+        st1             {v23.d}[0], [x9], x1
+        st1             {v23.d}[1], [x0], x1
+        st1             {v24.d}[0], [x9], x1
+        st1             {v24.d}[1], [x0], x1
+        st1             {v25.d}[0], [x9], x1
+        st1             {v25.d}[1], [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #4
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_h_8_8
+
+bpp_frontends_mix2 4, 4
+bpp_frontends_mix2 4, 8
+bpp_frontends_mix2 8, 4
+bpp_frontends_mix2 8, 8
+
+function vp9_loop_filter_v_16_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  x1, lsl #3
+        ld1             {v16.8h}, [x9], x1 // p7
+        ld1             {v24.8h}, [x0], x1 // q0
+        ld1             {v17.8h}, [x9], x1 // p6
+        ld1             {v25.8h}, [x0], x1 // q1
+        ld1             {v18.8h}, [x9], x1 // p5
+        ld1             {v26.8h}, [x0], x1 // q2
+        ld1             {v19.8h}, [x9], x1 // p4
+        ld1             {v27.8h}, [x0], x1 // q3
+        ld1             {v20.8h}, [x9], x1 // p3
+        ld1             {v28.8h}, [x0], x1 // q4
+        ld1             {v21.8h}, [x9], x1 // p2
+        ld1             {v29.8h}, [x0], x1 // q5
+        ld1             {v22.8h}, [x9], x1 // p1
+        ld1             {v30.8h}, [x0], x1 // q6
+        ld1             {v23.8h}, [x9], x1 // p0
+        ld1             {v31.8h}, [x0], x1 // q7
+        sub             x9,  x9,  x1, lsl #3
+        sub             x0,  x0,  x1, lsl #3
+        add             x9,  x9,  x1
+
+        loop_filter_16
+
+        // If we did the flat8out part, we get the output in
+        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride;
+        // store v2-v9 there and v10-v17 at x0.
+        st1             {v2.8h},  [x9], x1
+        st1             {v10.8h}, [x0], x1
+        st1             {v3.8h},  [x9], x1
+        st1             {v11.8h}, [x0], x1
+        st1             {v4.8h},  [x9], x1
+        st1             {v12.8h}, [x0], x1
+        st1             {v5.8h},  [x9], x1
+        st1             {v13.8h}, [x0], x1
+        st1             {v6.8h},  [x9], x1
+        st1             {v14.8h}, [x0], x1
+        st1             {v8.8h},  [x9], x1
+        st1             {v15.8h}, [x0], x1
+        st1             {v9.8h},  [x9], x1
+        st1             {v17.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  x1
+
+        br              x10
+8:
+        add             x9,  x9,  x1, lsl #2
+        // If we didn't do the flat8out part, the output is left in the
+        // input registers.
+        st1             {v21.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v22.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v26.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        sub             x0,  x0,  x1
+        br              x10
+7:
+        sub             x9,  x0,  x1, lsl #1
+        st1             {v22.8h}, [x9], x1
+        st1             {v24.8h}, [x0], x1
+        st1             {v23.8h}, [x9], x1
+        st1             {v25.8h}, [x0], x1
+        sub             x0,  x0,  x1, lsl #1
+        br              x10
+endfunc
+
+bpp_frontends vp9_loop_filter_v_16_8, push=1
+bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
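+
+// push=1 for the wd=16 frontends because the 16-wide filter uses v8-v15
+// as temporaries; AAPCS64 requires the low 64 bits of v8-v15 to be
+// preserved across calls, hence the stp/ldp of d8-d15 in bpp_frontend.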
+
+function vp9_loop_filter_h_16_8_16_neon
+        mov             x10, x30
+        sub             x9,  x0,  #16
+        ld1             {v16.8h}, [x9], x1
+        ld1             {v24.8h}, [x0], x1
+        ld1             {v17.8h}, [x9], x1
+        ld1             {v25.8h}, [x0], x1
+        ld1             {v18.8h}, [x9], x1
+        ld1             {v26.8h}, [x0], x1
+        ld1             {v19.8h}, [x9], x1
+        ld1             {v27.8h}, [x0], x1