Backed out changeset 203314c3b2fa (bug 1738736) for causing AV1 video crashes (bug 1739910). a=backout
author: Marian-Vasile Laza <mlaza@mozilla.com>
date: Mon, 08 Nov 2021 11:52:27 +0200
changeset 672272: 21719d674fc449f2cfce4d93bf3c813834acdfd8
parent 672261: 5823cb0f699866c8672cce50cf8e1cbceadc43aa
child 672341: 01a19f2bf0c2c6ed95d0b36e55744ddf0a5451b7
push id: 2724
push user: ffxbld-merge
push date: Mon, 03 Jan 2022 22:10:59 +0000
treeherder: mozilla-release@ef2a8189e3dc
reviewers: backout
bugs: 1738736, 1739910
milestone: 96.0a1
backs out: 203314c3b2fa40a0404eb8db867767d0a7d792b6
first release with:
    nightly linux32: 21719d674fc4 / 96.0a1 / 20211108095312
    nightly linux64: 21719d674fc4 / 96.0a1 / 20211108095312
    nightly mac:     21719d674fc4 / 96.0a1 / 20211108095312
    nightly win32:   21719d674fc4 / 96.0a1 / 20211108095312
    nightly win64:   21719d674fc4 / 96.0a1 / 20211108095312
Backed out changeset 203314c3b2fa (bug 1738736) for causing AV1 video crashes (bug 1739910). a=backout
media/libdav1d/asm/moz.build
media/libdav1d/moz.yaml
media/libdav1d/vcs_version.h
third_party/dav1d/meson.build
third_party/dav1d/meson_options.txt
third_party/dav1d/src/arm/32/cdef.S
third_party/dav1d/src/arm/32/cdef16.S
third_party/dav1d/src/arm/64/cdef.S
third_party/dav1d/src/arm/64/cdef16.S
third_party/dav1d/src/arm/64/film_grain.S
third_party/dav1d/src/arm/64/film_grain16.S
third_party/dav1d/src/arm/64/ipred.S
third_party/dav1d/src/arm/64/ipred16.S
third_party/dav1d/src/arm/64/itx.S
third_party/dav1d/src/arm/64/itx16.S
third_party/dav1d/src/arm/64/loopfilter.S
third_party/dav1d/src/arm/64/loopfilter16.S
third_party/dav1d/src/arm/64/looprestoration.S
third_party/dav1d/src/arm/64/looprestoration16.S
third_party/dav1d/src/arm/64/mc.S
third_party/dav1d/src/arm/64/mc16.S
third_party/dav1d/src/arm/64/refmvs.S
third_party/dav1d/src/arm/asm.S
third_party/dav1d/src/arm/cdef_init_tmpl.c
third_party/dav1d/src/arm/looprestoration_init_tmpl.c
third_party/dav1d/src/cdef.h
third_party/dav1d/src/cdef_apply.h
third_party/dav1d/src/cdef_apply_tmpl.c
third_party/dav1d/src/cdef_tmpl.c
third_party/dav1d/src/cpu.c
third_party/dav1d/src/decode.c
third_party/dav1d/src/internal.h
third_party/dav1d/src/lf_apply.h
third_party/dav1d/src/lf_apply_tmpl.c
third_party/dav1d/src/lib.c
third_party/dav1d/src/looprestoration.h
third_party/dav1d/src/looprestoration_tmpl.c
third_party/dav1d/src/lr_apply.h
third_party/dav1d/src/lr_apply_tmpl.c
third_party/dav1d/src/meson.build
third_party/dav1d/src/ppc/cdef_init_tmpl.c
third_party/dav1d/src/ppc/looprestoration_init_tmpl.c
third_party/dav1d/src/recon.h
third_party/dav1d/src/recon_tmpl.c
third_party/dav1d/src/tables.c
third_party/dav1d/src/thread_task.c
third_party/dav1d/src/x86/cdef16_avx2.asm
third_party/dav1d/src/x86/cdef16_sse.asm
third_party/dav1d/src/x86/cdef_avx2.asm
third_party/dav1d/src/x86/cdef_avx512.asm
third_party/dav1d/src/x86/cdef_init_tmpl.c
third_party/dav1d/src/x86/cdef_sse.asm
third_party/dav1d/src/x86/ipred_avx512.asm
third_party/dav1d/src/x86/ipred_init_tmpl.c
third_party/dav1d/src/x86/itx16_avx2.asm
third_party/dav1d/src/x86/itx16_sse.asm
third_party/dav1d/src/x86/itx_avx2.asm
third_party/dav1d/src/x86/itx_avx512.asm
third_party/dav1d/src/x86/itx_init_tmpl.c
third_party/dav1d/src/x86/loopfilter_avx512.asm
third_party/dav1d/src/x86/loopfilter_init_tmpl.c
third_party/dav1d/src/x86/looprestoration16_avx2.asm
third_party/dav1d/src/x86/looprestoration16_sse.asm
third_party/dav1d/src/x86/looprestoration_avx2.asm
third_party/dav1d/src/x86/looprestoration_avx512.asm
third_party/dav1d/src/x86/looprestoration_init_tmpl.c
third_party/dav1d/src/x86/looprestoration_sse.asm
third_party/dav1d/src/x86/mc_avx2.asm
third_party/dav1d/src/x86/mc_avx512.asm
third_party/dav1d/src/x86/mc_init_tmpl.c
third_party/dav1d/src/x86/refmvs.asm
third_party/dav1d/src/x86/refmvs_init.c
third_party/dav1d/tests/checkasm/cdef.c
third_party/dav1d/tests/checkasm/looprestoration.c
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@@ -85,26 +85,22 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64
         # an error when it compiles empty files.
         SOURCES += [
             '../../../third_party/dav1d/src/x86/cdef16_avx2.asm', # moved from autovendored
             '../../../third_party/dav1d/src/x86/cdef_avx2.asm',
             '../../../third_party/dav1d/src/x86/cdef_avx512.asm',
             '../../../third_party/dav1d/src/x86/film_grain16_avx2.asm',
             '../../../third_party/dav1d/src/x86/film_grain_avx2.asm',
             '../../../third_party/dav1d/src/x86/ipred_avx2.asm',
-            '../../../third_party/dav1d/src/x86/ipred_avx512.asm',
             '../../../third_party/dav1d/src/x86/itx16_avx2.asm',
             '../../../third_party/dav1d/src/x86/itx_avx2.asm',
-            '../../../third_party/dav1d/src/x86/itx_avx512.asm',
             '../../../third_party/dav1d/src/x86/loopfilter16_avx2.asm',
             '../../../third_party/dav1d/src/x86/loopfilter_avx2.asm',
-            '../../../third_party/dav1d/src/x86/loopfilter_avx512.asm',
             '../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm',
             '../../../third_party/dav1d/src/x86/looprestoration_avx2.asm',
-            '../../../third_party/dav1d/src/x86/looprestoration_avx512.asm',
             '../../../third_party/dav1d/src/x86/mc16_avx2.asm',
             '../../../third_party/dav1d/src/x86/mc_avx2.asm',
             '../../../third_party/dav1d/src/x86/mc_avx512.asm',
         ]
 
     SOURCES += [
         '../../../third_party/dav1d/src/x86/cdef16_sse.asm',
         '../../../third_party/dav1d/src/x86/cdef_sse.asm',
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -15,21 +15,21 @@ origin:
   description: dav1d, a fast AV1 decoder
 
   # Full URL for the package's homepage/etc
   # Usually different from repository url
   url: https://code.videolan.org/videolan/dav1d
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit 3fd2ad938ace8d2d0ab86e4108d46a7722691073 (2021-11-01T15:14:21.000+01:00).
+  release: commit f52aee04fbd711cddab23d0aa9b196e9c963e7b8 (2021-10-04T21:58:36.000+00:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 3fd2ad938ace8d2d0ab86e4108d46a7722691073
+  revision: f52aee04fbd711cddab23d0aa9b196e9c963e7b8
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
   # Multiple licenses can be specified (as a YAML list)
   # A "LICENSE" file must exist containing the full license text
   license: BSD-2-Clause
 
   license-file: COPYING
@@ -58,9 +58,8 @@ vendoring:
   update-actions:
     - action: copy-file
       from: include/vcs_version.h.in
       to: '{yaml_dir}/vcs_version.h'
     - action: replace-in-file
       pattern: '@VCS_TAG@'
       with: '{revision}'
       file: '{yaml_dir}/vcs_version.h'
-
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "3fd2ad938ace8d2d0ab86e4108d46a7722691073"
+#define DAV1D_VERSION "f52aee04fbd711cddab23d0aa9b196e9c963e7b8"
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -198,21 +198,16 @@ endif
 if cc.check_header('unistd.h')
     cdata.set('HAVE_UNISTD_H', 1)
 endif
 
 if cc.check_header('io.h')
     cdata.set('HAVE_IO_H', 1)
 endif
 
-if cc.check_header('pthread_np.h')
-    cdata.set('HAVE_PTHREAD_NP_H', 1)
-    test_args += '-DHAVE_PTHREAD_NP_H'
-endif
-
 
 # Function checks
 
 if not cc.has_function('getopt_long', prefix : '#include <getopt.h>', args : test_args)
     getopt_dependency = declare_dependency(
         sources: files('tools/compat/getopt.c'),
         include_directories : include_directories('include/compat'),
     )
@@ -234,26 +229,16 @@ if (host_machine.cpu_family() == 'aarch6
     if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
         cdata.set('HAVE_GETAUXVAL', 1)
     endif
     if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
         cdata.set('HAVE_ELF_AUX_INFO', 1)
     endif
 endif
 
-pthread_np_prefix = '''
-#include <pthread.h>
-#ifdef HAVE_PTHREAD_NP_H
-#include <pthread_np.h>
-#endif
-'''
-if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args : test_args, dependencies : thread_dependency)
-    cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1)
-endif
-
 # Compiler flag tests
 
 if cc.has_argument('-fvisibility=hidden')
     add_project_arguments('-fvisibility=hidden', language: 'c')
 else
     warning('Compiler does not support -fvisibility=hidden, all symbols will be public!')
 endif
 
@@ -420,19 +405,24 @@ if is_asm_enabled and host_machine.cpu_f
         nasm_r = run_command(nasm, '-v')
 
         if nasm_r.returncode() != 0
             error('failed running nasm to obtain its version')
         endif
 
         out = nasm_r.stdout().strip().split()
         if out[1].to_lower() == 'version'
-            if out[2].version_compare('<2.14')
-                error('nasm 2.14 or later is required, found nasm @0@'.format(out[2]))
+            if out[2].version_compare('<2.13.02')
+                error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
+            elif out[2].version_compare('<2.14') and get_option('enable_avx512')
+                error('nasm 2.14 or later is required for AVX-512 asm.\n' +
+                       'AVX-512 asm can be disabled with \'-Denable_avx512=false\'')
             endif
+            cdata.set10('HAVE_AVX512ICL', get_option('enable_avx512'))
+            cdata_asm.set10('HAVE_AVX512ICL', get_option('enable_avx512'))
         else
             error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
         endif
     endif
 
     # Generate config.asm
     config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm)
 
--- a/third_party/dav1d/meson_options.txt
+++ b/third_party/dav1d/meson_options.txt
@@ -5,16 +5,21 @@ option('bitdepths',
     choices: ['8', '16'],
     description: 'Enable only specified bitdepths')
 
 option('enable_asm',
     type: 'boolean',
     value: true,
     description: 'Build asm files, if available')
 
+option('enable_avx512',
+    type: 'boolean',
+    value: true,
+    description: 'Build AVX-512 asm files, requires nasm 2.14')
+
 option('enable_tools',
     type: 'boolean',
     value: true,
     description: 'Build dav1d cli tools')
 
 option('enable_examples',
     type: 'boolean',
     value: false,
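For context, a minimal sketch (an assumption, not part of this patch) of how the restored option is consumed: the cdata.set10()/cdata_asm.set10() calls in meson.build above emit HAVE_AVX512ICL as 1 or 0 into the generated config.h and config.asm, and builds whose nasm is older than 2.14 can drop the AVX-512 objects with -Denable_avx512=false, as the restored error message suggests. On the C side, the x86 *_init_tmpl.c files touched by this changeset gate the AVX-512 (Ice Lake) entry points on that define, roughly:

    /* config.h, generated by meson; 1 or 0 depending on -Denable_avx512 */
    #define HAVE_AVX512ICL 1

    #if HAVE_AVX512ICL
        /* assign the *_avx512icl asm entry points to the DSP context here */
    #endif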
--- a/third_party/dav1d/src/arm/32/cdef.S
+++ b/third_party/dav1d/src/arm/32/cdef.S
@@ -29,20 +29,20 @@
 #include "util.S"
 #include "cdef_tmpl.S"
 
 // n1 = s0/d0
 // w1 = d0/q0
 // n2 = s4/d2
 // w2 = d2/q1
 .macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
-        tst             r7,  #1 // CDEF_HAVE_LEFT
+        tst             r6,  #1 // CDEF_HAVE_LEFT
         beq             2f
         // CDEF_HAVE_LEFT
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         ldrh            r12, [\s1, #-2]
         vldr            \n1, [\s1]
         vdup.16         d4,  r12
         ldrh            r12, [\s1, #\w]
         vmov.16         d4[1], r12
         ldrh            r12, [\s2, #-2]
@@ -56,17 +56,17 @@
         vstr            s8,  [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s9,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s10, [r0, #-4]
         vst1.16         {\w2}, [r0, :\align]
         vstr            s11, [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
         b               3f
 .endif
 
 1:
         // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         ldrh            r12, [\s1, #-2]
@@ -81,25 +81,25 @@ 1:
         vstr            s8,  [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s9,  [r0, #-4]
         vst1.16         {\w2}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
         b               3f
 .endif
 
 2:
         // !CDEF_HAVE_LEFT
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         vldr            \n1, [\s1]
         ldrh            r12, [\s1, #\w]
         vldr            \n2, [\s2]
         vdup.16         d4,  r12
         ldrh            r12, [\s2, #\w]
         vmovl.u8        q0,  d0
@@ -109,17 +109,17 @@ 2:
         vstr            s12, [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s8,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s12, [r0, #-4]
         vst1.16         {\w2}, [r0, :\align]
         vstr            s9,  [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
         b               3f
 .endif
 
 1:
         // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         vldr            \n1, [\s1]
@@ -129,203 +129,201 @@ 1:
         vstr            s12, [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s12, [r0, #-4]
         vst1.16         {\w2}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
 .endif
 3:
 .endm
 
 .macro load_n_incr dst, src, incr, w
 .if \w == 4
         vld1.32         {\dst\()[0]}, [\src, :32], \incr
 .else
         vld1.8          {\dst\()},    [\src, :64], \incr
 .endif
 .endm
 
 // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
 //                                    ptrdiff_t src_stride, const pixel (*left)[2],
-//                                    const pixel *const top,
-//                                    const pixel *const bottom, int h,
+//                                    const pixel *const top, int h,
 //                                    enum CdefEdgeFlags edges);
 
 // n1 = s0/d0
 // w1 = d0/q0
 // n2 = s4/d2
 // w2 = d2/q1
 .macro padding_func w, stride, n1, w1, n2, w2, align
 function cdef_padding\w\()_8bpc_neon, export=1
-        push            {r4-r8,lr}
-        ldrd            r4,  r5,  [sp, #24]
-        ldrd            r6,  r7,  [sp, #32]
-        cmp             r7,  #0xf // fully edged
+        push            {r4-r7,lr}
+        ldrd            r4,  r5,  [sp, #20]
+        ldr             r6,  [sp, #28]
+        cmp             r6,  #0xf // fully edged
         beq             cdef_padding\w\()_edged_8bpc_neon
         vmov.i16        q3,  #0x8000
-        tst             r7,  #4 // CDEF_HAVE_TOP
+        tst             r6,  #4 // CDEF_HAVE_TOP
         bne             1f
         // !CDEF_HAVE_TOP
         sub             r12, r0,  #2*(2*\stride+2)
         vmov.i16        q2,  #0x8000
         vst1.16         {q2,q3}, [r12]!
 .if \w == 8
         vst1.16         {q2,q3}, [r12]!
 .endif
         b               3f
 1:
         // CDEF_HAVE_TOP
-        add             r8,  r4,  r2
+        add             r7,  r4,  r2
         sub             r0,  r0,  #2*(2*\stride)
-        pad_top_bottom  r4,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0
+        pad_top_bottom  r4,  r7,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0
 
         // Middle section
 3:
-        tst             r7,  #1 // CDEF_HAVE_LEFT
+        tst             r6,  #1 // CDEF_HAVE_LEFT
         beq             2f
         // CDEF_HAVE_LEFT
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         vld1.16         {d2[]}, [r3, :16]!
         ldrh            r12, [r1, #\w]
         load_n_incr     d0,  r1,  r2,  \w
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vmov.16         d2[1], r12
         vmovl.u8        q0,  d0
         vmovl.u8        q1,  d2
         vstr            s4,  [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s5,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             0b
         b               3f
 1:
         // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         vld1.16         {d2[]}, [r3, :16]!
         load_n_incr     d0,  r1,  r2,  \w
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vmovl.u8        q0,  d0
         vmovl.u8        q1,  d2
         vstr            s4,  [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             1b
         b               3f
 2:
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         ldrh            r12, [r1, #\w]
         load_n_incr     d0,  r1,  r2,  \w
         vdup.16         d2,  r12
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vmovl.u8        q0,  d0
         vmovl.u8        q1,  d2
         vstr            s12, [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s4,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             0b
         b               3f
 1:
         // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         load_n_incr     d0,  r1,  r2,  \w
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vmovl.u8        q0,  d0
         vstr            s12, [r0, #-4]
         vst1.16         {\w1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             1b
 
 3:
-        tst             r7,  #8 // CDEF_HAVE_BOTTOM
+        tst             r6,  #8 // CDEF_HAVE_BOTTOM
         bne             1f
         // !CDEF_HAVE_BOTTOM
         sub             r12, r0,  #4
         vmov.i16        q2,  #0x8000
         vst1.16         {q2,q3}, [r12]!
 .if \w == 8
         vst1.16         {q2,q3}, [r12]!
 .endif
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 1:
         // CDEF_HAVE_BOTTOM
-        add             r8,  r5,  r2
-        pad_top_bottom  r5,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
+        add             r7,  r1,  r2
+        pad_top_bottom  r1,  r7,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
 endfunc
 .endm
 
 padding_func 8, 16, d0, q0, d2, q1, 128
 padding_func 4, 8,  s0, d0, s4, d2, 64
 
 // void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
 //                                    ptrdiff_t src_stride, const pixel (*left)[2],
-//                                    const pixel *const top,
-//                                    const pixel *const bottom, int h,
+//                                    const pixel *const top, int h,
 //                                    enum CdefEdgeFlags edges);
 
 .macro padding_func_edged w, stride, reg, align
 function cdef_padding\w\()_edged_8bpc_neon
         sub             r0,  r0,  #(2*\stride)
 
         ldrh            r12, [r4, #-2]
         vldr            \reg, [r4]
-        add             r8,  r4,  r2
+        add             r7,  r4,  r2
         strh            r12, [r0, #-2]
         ldrh            r12, [r4, #\w]
         vstr            \reg, [r0]
         strh            r12, [r0, #\w]
 
-        ldrh            r12, [r8, #-2]
-        vldr            \reg, [r8]
+        ldrh            r12, [r7, #-2]
+        vldr            \reg, [r7]
         strh            r12, [r0, #\stride-2]
-        ldrh            r12, [r8, #\w]
+        ldrh            r12, [r7, #\w]
         vstr            \reg, [r0, #\stride]
         strh            r12, [r0, #\stride+\w]
         add             r0,  r0,  #2*\stride
 
 0:
         ldrh            r12, [r3], #2
         vldr            \reg, [r1]
         str             r12, [r0, #-2]
         ldrh            r12, [r1, #\w]
         add             r1,  r1,  r2
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vstr            \reg, [r0]
         str             r12, [r0, #\w]
         add             r0,  r0,  #\stride
         bgt             0b
 
-        ldrh            r12, [r5, #-2]
-        vldr            \reg, [r5]
-        add             r8,  r5,  r2
+        ldrh            r12, [r1, #-2]
+        vldr            \reg, [r1]
+        add             r7,  r1,  r2
         strh            r12, [r0, #-2]
-        ldrh            r12, [r5, #\w]
+        ldrh            r12, [r1, #\w]
         vstr            \reg, [r0]
         strh            r12, [r0, #\w]
 
-        ldrh            r12, [r8, #-2]
-        vldr            \reg, [r8]
+        ldrh            r12, [r7, #-2]
+        vldr            \reg, [r7]
         strh            r12, [r0, #\stride-2]
-        ldrh            r12, [r8, #\w]
+        ldrh            r12, [r7, #\w]
         vstr            \reg, [r0, #\stride]
         strh            r12, [r0, #\stride+\w]
 
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 endfunc
 .endm
 
 padding_func_edged 8, 16, d0, 64
 padding_func_edged 4, 8,  s0, 32
 
 tables
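For orientation, a sketch (not part of the patch) of the C-level prototype restored by this backout, as documented in the signature comments above: the separate bottom pointer is dropped, so h and edges each shift one argument slot earlier, which is why the edge-flag tests in this file switch from r7 back to r6 (and from w7 to w6 in the AArch64 files that follow).

    /* restored prototype (post-backout) */
    void dav1d_cdef_padding8_8bpc_neon(uint16_t *tmp, const pixel *src,
                                       ptrdiff_t src_stride, const pixel (*left)[2],
                                       const pixel *const top, int h,
                                       enum CdefEdgeFlags edges);

    /* the backed-out version inserted `const pixel *const bottom` between
       top and h, pushing edges into r7/w7 */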
 
--- a/third_party/dav1d/src/arm/32/cdef16.S
+++ b/third_party/dav1d/src/arm/32/cdef16.S
@@ -27,37 +27,37 @@
 
 #include "src/arm/asm.S"
 #include "util.S"
 #include "cdef_tmpl.S"
 
 // r1 = d0/q0
 // r2 = d2/q1
 .macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret
-        tst             r7,  #1 // CDEF_HAVE_LEFT
+        tst             r6,  #1 // CDEF_HAVE_LEFT
         beq             2f
         // CDEF_HAVE_LEFT
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         vldr            s8,  [\s1, #-4]
         vld1.16         {\r1}, [\s1, :\align]
         vldr            s9,  [\s1, #2*\w]
         vldr            s10, [\s2, #-4]
         vld1.16         {\r2}, [\s2, :\align]
         vldr            s11, [\s2, #2*\w]
         vstr            s8,  [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s9,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s10, [r0, #-4]
         vst1.16         {\r2}, [r0, :\align]
         vstr            s11, [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
         b               3f
 .endif
 
 1:
         // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         vldr            s8,  [\s1, #-4]
@@ -67,40 +67,40 @@ 1:
         vstr            s8,  [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s9,  [r0, #-4]
         vst1.16         {\r2}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
         b               3f
 .endif
 
 2:
         // !CDEF_HAVE_LEFT
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         vld1.16         {\r1}, [\s1, :\align]
         vldr            s8,  [\s1, #2*\w]
         vld1.16         {\r2}, [\s2, :\align]
         vldr            s9,  [\s2, #2*\w]
         vstr            s12, [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s8,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s12, [r0, #-4]
         vst1.16         {\r2}, [r0, :\align]
         vstr            s9,  [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
         b               3f
 .endif
 
 1:
         // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         vld1.16         {\r1}, [\s1, :\align]
@@ -108,122 +108,121 @@ 1:
         vstr            s12, [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         vstr            s12, [r0, #-4]
         vst1.16         {\r2}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
 .if \ret
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 .else
         add             r0,  r0,  #2*\stride
 .endif
 3:
 .endm
 
 // void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
 //                                     ptrdiff_t src_stride, const pixel (*left)[2],
-//                                     const pixel *const top,
-//                                     const pixel *const bottom, int h,
+//                                     const pixel *const top, int h,
 //                                     enum CdefEdgeFlags edges);
 
 // r1 = d0/q0
 // r2 = d2/q1
 .macro padding_func_16 w, stride, r1, r2, align
 function cdef_padding\w\()_16bpc_neon, export=1
-        push            {r4-r8,lr}
-        ldrd            r4,  r5,  [sp, #24]
-        ldrd            r6,  r7,  [sp, #32]
+        push            {r4-r7,lr}
+        ldrd            r4,  r5,  [sp, #20]
+        ldr             r6,  [sp, #28]
         vmov.i16        q3,  #0x8000
-        tst             r7,  #4 // CDEF_HAVE_TOP
+        tst             r6,  #4 // CDEF_HAVE_TOP
         bne             1f
         // !CDEF_HAVE_TOP
         sub             r12, r0,  #2*(2*\stride+2)
         vmov.i16        q2,  #0x8000
         vst1.16         {q2,q3}, [r12]!
 .if \w == 8
         vst1.16         {q2,q3}, [r12]!
 .endif
         b               3f
 1:
         // CDEF_HAVE_TOP
-        add             r8,  r4,  r2
+        add             r7,  r4,  r2
         sub             r0,  r0,  #2*(2*\stride)
-        pad_top_bot_16  r4,  r8,  \w, \stride, \r1, \r2, \align, 0
+        pad_top_bot_16  r4,  r7,  \w, \stride, \r1, \r2, \align, 0
 
         // Middle section
 3:
-        tst             r7,  #1 // CDEF_HAVE_LEFT
+        tst             r6,  #1 // CDEF_HAVE_LEFT
         beq             2f
         // CDEF_HAVE_LEFT
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         vld1.32         {d2[]}, [r3, :32]!
         vldr            s5,  [r1, #2*\w]
         vld1.16         {\r1}, [r1, :\align], r2
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vstr            s4,  [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s5,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             0b
         b               3f
 1:
         // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         vld1.32         {d2[]}, [r3, :32]!
         vld1.16         {\r1}, [r1, :\align], r2
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vstr            s4,  [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             1b
         b               3f
 2:
-        tst             r7,  #2 // CDEF_HAVE_RIGHT
+        tst             r6,  #2 // CDEF_HAVE_RIGHT
         beq             1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         vldr            s4,  [r1, #2*\w]
         vld1.16         {\r1}, [r1, :\align], r2
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vstr            s12, [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s4,  [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             0b
         b               3f
 1:
         // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         vld1.16         {\r1}, [r1, :\align], r2
-        subs            r6,  r6,  #1
+        subs            r5,  r5,  #1
         vstr            s12, [r0, #-4]
         vst1.16         {\r1}, [r0, :\align]
         vstr            s12, [r0, #2*\w]
         add             r0,  r0,  #2*\stride
         bgt             1b
 
 3:
-        tst             r7,  #8 // CDEF_HAVE_BOTTOM
+        tst             r6,  #8 // CDEF_HAVE_BOTTOM
         bne             1f
         // !CDEF_HAVE_BOTTOM
         sub             r12, r0,  #4
         vmov.i16        q2,  #0x8000
         vst1.16         {q2,q3}, [r12]!
 .if \w == 8
         vst1.16         {q2,q3}, [r12]!
 .endif
-        pop             {r4-r8,pc}
+        pop             {r4-r7,pc}
 1:
         // CDEF_HAVE_BOTTOM
-        add             r8,  r5,  r2
-        pad_top_bot_16  r5,  r8,  \w, \stride, \r1, \r2, \align, 1
+        add             r7,  r1,  r2
+        pad_top_bot_16  r1,  r7,  \w, \stride, \r1, \r2, \align, 1
 endfunc
 .endm
 
 padding_func_16 8, 16, q0, q1, 128
 padding_func_16 4, 8,  d0, d2, 64
 
 tables
 
--- a/third_party/dav1d/src/arm/64/cdef.S
+++ b/third_party/dav1d/src/arm/64/cdef.S
@@ -25,22 +25,22 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
 #include "util.S"
 #include "cdef_tmpl.S"
 
 .macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
-        tst             w7,  #1 // CDEF_HAVE_LEFT
+        tst             w6,  #1 // CDEF_HAVE_LEFT
         b.eq            2f
         // CDEF_HAVE_LEFT
         sub             \s1,  \s1,  #2
         sub             \s2,  \s2,  #2
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         ldr             \rn\()0, [\s1]
         ldr             s1,      [\s1, #\w]
         ldr             \rn\()2, [\s2]
         ldr             s3,      [\s2, #\w]
         uxtl            v0.8h,   v0.8b
         uxtl            v1.8h,   v1.8b
@@ -79,17 +79,17 @@ 1:
         ret
 .else
         add             x0,  x0,  #2*\stride
         b               3f
 .endif
 
 2:
         // !CDEF_HAVE_LEFT
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         ldr             \rn\()0, [\s1]
         ldr             h1,      [\s1, #\w]
         ldr             \rn\()2, [\s2]
         ldr             h3,      [\s2, #\w]
         uxtl            v0.8h,  v0.8b
         uxtl            v1.8h,  v1.8b
@@ -135,131 +135,128 @@ 3:
         ld1             {\dst\().s}[0], [\src], \incr
 .else
         ld1             {\dst\().8b},   [\src], \incr
 .endif
 .endm
 
 // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
 //                                    ptrdiff_t src_stride, const pixel (*left)[2],
-//                                    const pixel *const top,
-//                                    const pixel *const bottom, int h,
+//                                    const pixel *const top, int h,
 //                                    enum CdefEdgeFlags edges);
 
 .macro padding_func w, stride, rn, rw
 function cdef_padding\w\()_8bpc_neon, export=1
-        cmp             w7,  #0xf // fully edged
+        cmp             w6,  #0xf // fully edged
         b.eq            cdef_padding\w\()_edged_8bpc_neon
         movi            v30.8h,  #0x80, lsl #8
         mov             v31.16b, v30.16b
         sub             x0,  x0,  #2*(2*\stride+2)
-        tst             w7,  #4 // CDEF_HAVE_TOP
+        tst             w6,  #4 // CDEF_HAVE_TOP
         b.ne            1f
         // !CDEF_HAVE_TOP
         st1             {v30.8h, v31.8h}, [x0], #32
 .if \w == 8
         st1             {v30.8h, v31.8h}, [x0], #32
 .endif
         b               3f
 1:
         // CDEF_HAVE_TOP
         add             x9,  x4,  x2
         pad_top_bottom  x4,  x9, \w, \stride, \rn, \rw, 0
 
         // Middle section
 3:
-        tst             w7,  #1 // CDEF_HAVE_LEFT
+        tst             w6,  #1 // CDEF_HAVE_LEFT
         b.eq            2f
         // CDEF_HAVE_LEFT
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         ld1             {v0.h}[0], [x3], #2
         ldr             h2,      [x1, #\w]
         load_n_incr     v1,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         uxtl            v0.8h,  v0.8b
         uxtl            v1.8h,  v1.8b
         uxtl            v2.8h,  v2.8b
         str             s0,      [x0]
         stur            \rw\()1, [x0, #4]
         str             s2,      [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            0b
         b               3f
 1:
         // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         ld1             {v0.h}[0], [x3], #2
         load_n_incr     v1,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         uxtl            v0.8h,  v0.8b
         uxtl            v1.8h,  v1.8b
         str             s0,      [x0]
         stur            \rw\()1, [x0, #4]
         str             s31,     [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            1b
         b               3f
 2:
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         ldr             h1,      [x1, #\w]
         load_n_incr     v0,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         uxtl            v0.8h,  v0.8b
         uxtl            v1.8h,  v1.8b
         str             s31,     [x0]
         stur            \rw\()0, [x0, #4]
         str             s1,      [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            0b
         b               3f
 1:
         // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         load_n_incr     v0,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         uxtl            v0.8h,  v0.8b
         str             s31,     [x0]
         stur            \rw\()0, [x0, #4]
         str             s31,     [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            1b
 
 3:
-        tst             w7,  #8 // CDEF_HAVE_BOTTOM
+        tst             w6,  #8 // CDEF_HAVE_BOTTOM
         b.ne            1f
         // !CDEF_HAVE_BOTTOM
         st1             {v30.8h, v31.8h}, [x0], #32
 .if \w == 8
         st1             {v30.8h, v31.8h}, [x0], #32
 .endif
         ret
 1:
         // CDEF_HAVE_BOTTOM
-        add             x9,  x5,  x2
-        pad_top_bottom  x5,  x9, \w, \stride, \rn, \rw, 1
+        add             x9,  x1,  x2
+        pad_top_bottom  x1,  x9, \w, \stride, \rn, \rw, 1
 endfunc
 .endm
 
 padding_func 8, 16, d, q
 padding_func 4, 8,  s, d
 
 // void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
 //                                    ptrdiff_t src_stride, const pixel (*left)[2],
-//                                    const pixel *const top,
-//                                    const pixel *const bottom, int h,
+//                                    const pixel *const top, int h,
 //                                    enum CdefEdgeFlags edges);
 
 .macro padding_func_edged w, stride, reg
 function cdef_padding\w\()_edged_8bpc_neon, export=1
         sub             x4,  x4,  #2
-        sub             x5,  x5,  #2
         sub             x0,  x0,  #(2*\stride+2)
 
 .if \w == 4
         ldr             d0, [x4]
         ldr             d1, [x4, x2]
         st1             {v0.8b, v1.8b}, [x0], #16
 .else
         add             x9,  x4,  x2
@@ -273,31 +270,32 @@ function cdef_padding\w\()_edged_8bpc_ne
         str             s3, [x0, #\stride+8]
         add             x0,  x0,  #2*\stride
 .endif
 
 0:
         ld1             {v0.h}[0], [x3], #2
         ldr             h2,      [x1, #\w]
         load_n_incr     v1,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         str             h0,      [x0]
         stur            \reg\()1, [x0, #2]
         str             h2,      [x0, #2+\w]
         add             x0,  x0,  #\stride
         b.gt            0b
 
+        sub             x1,  x1,  #2
 .if \w == 4
-        ldr             d0, [x5]
-        ldr             d1, [x5, x2]
+        ldr             d0, [x1]
+        ldr             d1, [x1, x2]
         st1             {v0.8b, v1.8b}, [x0], #16
 .else
-        add             x9,  x5,  x2
-        ldr             d0, [x5]
-        ldr             s1, [x5, #8]
+        add             x9,  x1,  x2
+        ldr             d0, [x1]
+        ldr             s1, [x1, #8]
         ldr             d2, [x9]
         ldr             s3, [x9, #8]
         str             d0, [x0]
         str             s1, [x0, #8]
         str             d2, [x0, #\stride]
         str             s3, [x0, #\stride+8]
 .endif
         ret
--- a/third_party/dav1d/src/arm/64/cdef16.S
+++ b/third_party/dav1d/src/arm/64/cdef16.S
@@ -25,22 +25,22 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
 #include "util.S"
 #include "cdef_tmpl.S"
 
 .macro pad_top_bot_16 s1, s2, w, stride, reg, ret
-        tst             w7,  #1 // CDEF_HAVE_LEFT
+        tst             w6,  #1 // CDEF_HAVE_LEFT
         b.eq            2f
         // CDEF_HAVE_LEFT
         sub             \s1,  \s1,  #4
         sub             \s2,  \s2,  #4
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         ldr             \reg\()0, [\s1]
         ldr             d1,       [\s1, #2*\w]
         ldr             \reg\()2, [\s2]
         ldr             d3,       [\s2, #2*\w]
         str             \reg\()0, [x0]
         str             d1,       [x0, #2*\w]
@@ -71,17 +71,17 @@ 1:
         ret
 .else
         add             x0,  x0,  #2*\stride
         b               3f
 .endif
 
 2:
         // !CDEF_HAVE_LEFT
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
         ldr             \reg\()0, [\s1]
         ldr             s1,       [\s1, #2*\w]
         ldr             \reg\()2, [\s2]
         ldr             s3,       [\s2, #2*\w]
         str             s31, [x0]
         stur            \reg\()0, [x0, #4]
@@ -121,105 +121,104 @@ 3:
         ld1             {\dst\().4h}, [\src], \incr
 .else
         ld1             {\dst\().8h}, [\src], \incr
 .endif
 .endm
 
 // void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src,
 //                                     ptrdiff_t src_stride, const pixel (*left)[2],
-//                                     const pixel *const top,
-//                                     const pixel *const bottom, int h,
+//                                     const pixel *const top, int h,
 //                                     enum CdefEdgeFlags edges);
 
 .macro padding_func_16 w, stride, reg
 function cdef_padding\w\()_16bpc_neon, export=1
         movi            v30.8h,  #0x80, lsl #8
         mov             v31.16b, v30.16b
         sub             x0,  x0,  #2*(2*\stride+2)
-        tst             w7,  #4 // CDEF_HAVE_TOP
+        tst             w6,  #4 // CDEF_HAVE_TOP
         b.ne            1f
         // !CDEF_HAVE_TOP
         st1             {v30.8h, v31.8h}, [x0], #32
 .if \w == 8
         st1             {v30.8h, v31.8h}, [x0], #32
 .endif
         b               3f
 1:
         // CDEF_HAVE_TOP
         add             x9,  x4,  x2
         pad_top_bot_16  x4,  x9, \w, \stride, \reg, 0
 
         // Middle section
 3:
-        tst             w7,  #1 // CDEF_HAVE_LEFT
+        tst             w6,  #1 // CDEF_HAVE_LEFT
         b.eq            2f
         // CDEF_HAVE_LEFT
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         ld1             {v0.s}[0], [x3], #4
         ldr             s2,       [x1, #2*\w]
         load_n_incr_16  v1,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         str             s0,       [x0]
         stur            \reg\()1, [x0, #4]
         str             s2,       [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            0b
         b               3f
 1:
         // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         ld1             {v0.s}[0], [x3], #4
         load_n_incr_16  v1,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         str             s0,       [x0]
         stur            \reg\()1, [x0, #4]
         str             s31,      [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            1b
         b               3f
 2:
-        tst             w7,  #2 // CDEF_HAVE_RIGHT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
         b.eq            1f
         // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
 0:
         ldr             s1,       [x1, #2*\w]
         load_n_incr_16  v0,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         str             s31,      [x0]
         stur            \reg\()0, [x0, #4]
         str             s1,       [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            0b
         b               3f
 1:
         // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
         load_n_incr_16  v0,  x1,  x2,  \w
-        subs            w6,  w6,  #1
+        subs            w5,  w5,  #1
         str             s31,      [x0]
         stur            \reg\()0, [x0, #4]
         str             s31,      [x0, #4+2*\w]
         add             x0,  x0,  #2*\stride
         b.gt            1b
 
 3:
-        tst             w7,  #8 // CDEF_HAVE_BOTTOM
+        tst             w6,  #8 // CDEF_HAVE_BOTTOM
         b.ne            1f
         // !CDEF_HAVE_BOTTOM
         st1             {v30.8h, v31.8h}, [x0], #32
 .if \w == 8
         st1             {v30.8h, v31.8h}, [x0], #32
 .endif
         ret
 1:
         // CDEF_HAVE_BOTTOM
-        add             x9,  x5,  x2
-        pad_top_bot_16  x5,  x9, \w, \stride, \reg, 1
+        add             x9,  x1,  x2
+        pad_top_bot_16  x1,  x9, \w, \stride, \reg, 1
 endfunc
 .endm
 
 padding_func_16 8, 16, q
 padding_func_16 4, 8,  d
 
 tables
 
--- a/third_party/dav1d/src/arm/64/film_grain.S
+++ b/third_party/dav1d/src/arm/64/film_grain.S
@@ -899,17 +899,16 @@ function generate_grain_\type\()_8bpc_ne
 
 .ifc \type, uv_444
         eor             w2,  w2,  w11
 .endif
 
         br              x16
 
 L(generate_grain_\type\()_lag0):
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, y
         mov             w1,  #GRAIN_HEIGHT
         bl              generate_grain_rows_neon
 .else
         dup             v28.8h,  w7
         ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
         movi            v0.16b,  #0
         movi            v1.16b,  #255
@@ -948,17 +947,16 @@ 1:
         subs            w1,  w1,  #1
         store_grain_row v16, v17, v18, v19, v20, v21
         b.gt            1b
 .endif
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag1):
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v27.16b}, [x4], #1 // ar_coeffs_y[0]
         ld1r            {v28.16b}, [x4], #1 // ar_coeffs_y[1]
         ld1r            {v29.16b}, [x4]     // ar_coeffs_y[2]
 .ifc \type, y
         ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
 .else
         add             x4,  x4,  #2
 .endif
@@ -988,17 +986,16 @@ 1:
         mov             v18.16b, v24.16b
         mov             v19.16b, v25.16b
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag2):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
 
         smov            w4,  v30.b[10]
         smov            w17, v30.b[11]
 
         mov             w1,  #3
         bl              generate_grain_rows_neon
 
@@ -1016,17 +1013,16 @@ 1:
 .endif
         st1             {v16.h}[0], [x0], #2
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag3):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
         stp             d8,  d9,  [sp, #16]
         stp             d10, d11, [sp, #32]
         stp             d12, d13, [sp, #48]
         stp             d14, d15, [sp, #64]
         stp             x20, x21, [sp, #80]
 
         smov            w4,  v30.b[5]
@@ -1124,17 +1120,16 @@ function generate_grain_\type\()_8bpc_ne
         mov             w5,  #127
         mov             w6,  #-128
 
         eor             w2,  w2,  w11
 
         br              x16
 
 L(generate_grain_\type\()_lag0):
-        AARCH64_VALID_JUMP_TARGET
         dup             v28.8h,  w7
         ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
         movi            v0.16b,  #0
         movi            v1.16b,  #255
         ext             v29.16b, v0.16b,  v1.16b,  #13
         ext             v30.16b, v1.16b,  v0.16b,  #7
         neg             v28.8h,  v28.8h
 
@@ -1162,17 +1157,16 @@ 1:
         increment_y_ptr x19, \type
         store_grain_row_44 v16, v17, v18
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag1):
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
         ld1r            {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
         ld1r            {v29.16b}, [x4]     // ar_coeffs_uv[2]
         add             x4,  x4,  #2
 
         mov             w1,  #3
         ld1r            {v30.16b}, [x4]     // ar_coeffs_u4[4]
         ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
@@ -1189,17 +1183,16 @@ 1:
         mov             v16.16b, v20.16b
         mov             v17.16b, v21.16b
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag2):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
 
         smov            w4,  v30.b[10]
         smov            w17, v30.b[11]
 
         mov             w1,  #3
         bl              generate_grain_rows_44_neon
 
@@ -1212,17 +1205,16 @@ 1:
         increment_y_ptr x19, \type
         add             x0,  x0,  #GRAIN_WIDTH-48
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag3):
-        AARCH64_VALID_JUMP_TARGET
         ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
         ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
         stp             d8,  d9,  [sp, #16]
         stp             d10, d11, [sp, #32]
         stp             d12, d13, [sp, #48]
         stp             d14, d15, [sp, #64]
         stp             x20, x21, [sp, #80]
 
@@ -1405,17 +1397,16 @@ 2:
         mov             w7,  #2
 1:
         br              x11
 endfunc
 
 function fgy_loop_neon
 .macro fgy ox, oy
 L(loop_\ox\oy):
-        AARCH64_VALID_JUMP_TARGET
 1:
         ld1             {v0.16b,  v1.16b},  [x1],  x2 // src
 .if \ox
         ld1             {v20.8b},           [x4],  x9 // grain_lut old
 .endif
 .if \oy
         ld1             {v22.16b, v23.16b}, [x6],  x9 // grain_lut top
 .endif
@@ -1653,17 +1644,16 @@ endfunc
 
 fguv 420, 1, 1
 fguv 422, 1, 0
 fguv 444, 0, 0
 
 function fguv_loop_sx0_neon
 .macro fguv_loop_sx0 csfl, ox, oy
 L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
-        AARCH64_VALID_JUMP_TARGET
 1:
         ld1             {v0.16b,  v1.16b},  [x6],  x7  // luma
         ld1             {v6.16b,  v7.16b},  [x1],  x2  // src
 .if \ox
         ld1             {v20.8b},           [x4],  x10 // grain_lut old
 .endif
 .if \oy
         ld1             {v22.16b, v23.16b}, [x8],  x10 // grain_lut top
@@ -1832,17 +1822,16 @@ L(fguv_loop_sx0_tbl):
         .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
         .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
         .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
 endfunc
 
 function fguv_loop_sx1_neon
 .macro fguv_loop_sx1 csfl, ox, oy
 L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
-        AARCH64_VALID_JUMP_TARGET
 1:
         ld1             {v0.16b, v1.16b},  [x6],  x7  // luma
         ld1             {v6.16b},          [x1],  x2  // src
 .if \ox
         ld1             {v20.8b},          [x4],  x10 // grain_lut old
 .endif
 .if \oy
         ld1             {v22.16b},         [x8],  x10 // grain_lut top
--- a/third_party/dav1d/src/arm/64/film_grain16.S
+++ b/third_party/dav1d/src/arm/64/film_grain16.S
@@ -756,17 +756,16 @@ function generate_grain_\type\()_16bpc_n
 
 .ifc \type, uv_444
         eor             w2,  w2,  w11
 .endif
 
         br              x16
 
 L(generate_grain_\type\()_lag0):
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, y
         mov             w1,  #GRAIN_HEIGHT
         bl              generate_grain_rows_neon
 .else
         dup             v28.4s,  w7
         ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
         movi            v0.16b,  #0
         movi            v1.16b,  #255
@@ -799,17 +798,16 @@ 1:
         add             x19, x19, #4
         st1             {v16.s}[0], [x0], #4
         b.gt            1b
 .endif
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag1):
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v27.8b}, [x4], #1  // ar_coeffs_y[0]
         ld1r            {v28.8b}, [x4], #1  // ar_coeffs_y[1]
         ld1r            {v29.8b}, [x4]      // ar_coeffs_y[2]
 .ifc \type, y
         ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
 .else
         add             x4,  x4,  #2
 .endif
@@ -846,17 +844,16 @@ 1:
 .endif
         st1             {v16.s}[0], [x0], #4
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag2):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
 
         smov            w4,  v30.b[10]
         smov            w17, v30.b[11]
 
         mov             w1,  #3
         bl              generate_grain_rows_neon
 
@@ -879,17 +876,16 @@ 1:
 .endif
         st1             {v16.s}[0], [x0], #4
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag3):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
         stp             d8,  d9,  [sp, #16]
         stp             d10, d11, [sp, #32]
         stp             d12, d13, [sp, #48]
         stp             d14, d15, [sp, #64]
         stp             x20, x21, [sp, #80]
 
         smov            w4,  v30.b[5]
@@ -998,17 +994,16 @@ function generate_grain_\type\()_16bpc_n
         neg             w6,  w5             // -(128 << bitpdeth_min_8)
         sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
 
         eor             w2,  w2,  w11
 
         br              x16
 
 L(generate_grain_\type\()_lag0):
-        AARCH64_VALID_JUMP_TARGET
         dup             v28.4s,  w7
         ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
         movi            v0.16b,  #0
         movi            v1.16b,  #255
         dup             v25.8h,  w5
         dup             v26.8h,  w6
         ext             v29.16b, v0.16b,  v1.16b,  #10
         ext             v30.16b, v1.16b,  v0.16b,  #14
@@ -1032,17 +1027,16 @@ 1:
         increment_y_ptr x19, \type
         add             x0,  x0,  #GRAIN_WIDTH*2-6*16
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag1):
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v27.8b}, [x4], #1  // ar_coeffs_uv[0]
         ld1r            {v28.8b}, [x4], #1  // ar_coeffs_uv[1]
         ld1r            {v29.8b}, [x4]      // ar_coeffs_uv[2]
         add             x4,  x4,  #2
 
         mov             w1,  #3
         ld1r            {v30.8b}, [x4]      // ar_coeffs_u4[4]
         ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
@@ -1064,17 +1058,16 @@ 1:
         increment_y_ptr x19, \type
         add             x0,  x0,  #GRAIN_WIDTH*2-6*16
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag2):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
 
         smov            w4,  v30.b[10]
         smov            w17, v30.b[11]
 
         mov             w1,  #3
         bl              generate_grain_rows_44_neon
 
@@ -1090,17 +1083,16 @@ 1:
         increment_y_ptr x19, \type
         add             x0,  x0,  #GRAIN_WIDTH*2-6*16
         b.gt            1b
 
         ldp             x30, x19, [sp], #96
         ret
 
 L(generate_grain_\type\()_lag3):
-        AARCH64_VALID_JUMP_TARGET
         ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
         ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
         stp             d8,  d9,  [sp, #16]
         stp             d10, d11, [sp, #32]
         stp             d12, d13, [sp, #48]
         stp             d14, d15, [sp, #64]
         stp             x20, x21, [sp, #80]
 
@@ -1302,17 +1294,16 @@ 2:
         mov             w7,  #2
 1:
         br              x11
 endfunc
 
 function fgy_loop_neon
 .macro fgy ox, oy
 L(loop_\ox\oy):
-        AARCH64_VALID_JUMP_TARGET
 1:
         ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x1],  x2 // src
 .if \ox
         ld1             {v20.4h},                         [x4],  x9 // grain_lut old
 .endif
 .if \oy
         ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6],  x9 // grain_lut top
 .endif
@@ -1594,17 +1585,16 @@ endfunc
 
 fguv 420, 1, 1
 fguv 422, 1, 0
 fguv 444, 0, 0
 
 function fguv_loop_sx0_neon
 .macro fguv_loop_sx0 csfl, ox, oy
 L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
-        AARCH64_VALID_JUMP_TARGET
 1:
 .if \ox
         ld1             {v4.4h}, [x4],  x10  // grain_lut old
 .endif
 .if \oy
         ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x8],  x10 // grain_lut top
 .endif
 .if \ox && \oy
@@ -1800,17 +1790,16 @@ L(fguv_loop_sx0_tbl):
         .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
         .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
         .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
 endfunc
 
 function fguv_loop_sx1_neon
 .macro fguv_loop_sx1 csfl, ox, oy
 L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
-        AARCH64_VALID_JUMP_TARGET
 1:
 .if \ox
         ld1             {v18.4h}, [x4],  x10  // grain_lut old
 .endif
 .if \oy
         ld1             {v20.8h, v21.8h},  [x8],  x10 // grain_lut top
 .endif
 .if \ox && \oy
--- a/third_party/dav1d/src/arm/64/ipred.S
+++ b/third_party/dav1d/src/arm/64/ipred.S
@@ -38,55 +38,50 @@ function ipred_dc_128_8bpc_neon, export=
         sub             w3,  w3,  #25
         ldrh            w3,  [x5, w3, uxtw #1]
         movi            v0.16b,  #128
         sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 4:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         movi            v1.16b,  #128
 32:
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         movi            v1.16b,  #128
         movi            v2.16b,  #128
         movi            v3.16b,  #128
 64:
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
@@ -112,61 +107,56 @@ function ipred_v_8bpc_neon, export=1
         sub             w3,  w3,  #25
         ldrh            w3,  [x5, w3, uxtw #1]
         add             x2,  x2,  #1
         sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.s}[0],  [x2]
 4:
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [x2]
 8:
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b}, [x2]
 16:
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b}, [x2]
 32:
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
 64:
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         b.gt            64b
@@ -191,61 +181,56 @@ function ipred_h_8bpc_neon, export=1
         ldrh            w3,  [x5, w3, uxtw #1]
         sub             x2,  x2,  #4
         sub             x5,  x5,  w3, uxtw
         mov             x7,  #-4
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
         st1             {v3.s}[0],  [x0], x1
         st1             {v2.s}[0],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v1.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
         st1             {v3.8b},  [x0], x1
         st1             {v2.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v1.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
         st1             {v3.16b}, [x0], x1
         st1             {v2.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v1.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
         st1             {v3.16b}, [x0], x1
         st1             {v2.16b}, [x6], x1
         subs            w4,  w4,  #4
         str             q1,  [x0, #16]
         str             q0,  [x6, #16]
         st1             {v1.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         b.gt            32b
         ret
 64:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
         stp             q3,  q3,  [x0, #32]
         stp             q2,  q2,  [x6, #32]
         st1             {v3.16b}, [x0], x1
         st1             {v2.16b}, [x6], x1
         subs            w4,  w4,  #4
@@ -276,76 +261,71 @@ function ipred_dc_top_8bpc_neon, export=
         sub             w3,  w3,  #25
         ldrh            w3,  [x5, w3, uxtw #1]
         add             x2,  x2,  #1
         sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
         rshrn           v0.8b,   v0.8h,   #3
         dup             v0.8b,   v0.b[0]
 4:
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
         rshrn           v0.8b,   v0.8h,   #3
         dup             v0.8b,   v0.b[0]
 8:
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
         rshrn           v0.8b,   v0.8h,   #4
         dup             v0.16b,  v0.b[0]
 16:
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b}, [x2]
         uaddlv          h0,      v0.16b
         uaddlv          h1,      v1.16b
         add             v2.4h,   v0.4h,   v1.4h
         rshrn           v2.8b,   v2.8h,   #5
         dup             v0.16b,  v2.b[0]
         dup             v1.16b,  v2.b[0]
 32:
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
         uaddlv          h0,      v0.16b
         uaddlv          h1,      v1.16b
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v4.4h,   v0.4h,   v1.4h
         add             v5.4h,   v2.4h,   v3.4h
         add             v4.4h,   v4.4h,   v5.4h
@@ -386,102 +366,92 @@ function ipred_dc_left_8bpc_neon, export
         ldrh            w7,  [x5, w7, uxtw #1]
         sub             x3,  x5,  w3, uxtw
         sub             x5,  x5,  w7, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 
 L(ipred_dc_left_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
         rshrn           v0.8b,   v0.8h,   #3
         dup             v0.16b,  v0.b[0]
         br              x3
 L(ipred_dc_left_w4):
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         b.gt            L(ipred_dc_left_w4)
         ret
 
 L(ipred_dc_left_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
         rshrn           v0.8b,   v0.8h,   #3
         dup             v0.16b,  v0.b[0]
         br              x3
 L(ipred_dc_left_w8):
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         b.gt            L(ipred_dc_left_w8)
         ret
 
 L(ipred_dc_left_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
         rshrn           v0.8b,   v0.8h,   #4
         dup             v0.16b,  v0.b[0]
         br              x3
 L(ipred_dc_left_w16):
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         b.gt            L(ipred_dc_left_w16)
         ret
 
 L(ipred_dc_left_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b}, [x2]
         uaddlv          h0,      v0.16b
         uaddlv          h1,      v1.16b
         add             v0.4h,   v0.4h,   v1.4h
         rshrn           v0.8b,   v0.8h,   #5
         dup             v0.16b,  v0.b[0]
         br              x3
 L(ipred_dc_left_w32):
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
 1:
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         b.gt            1b
         ret
 
 L(ipred_dc_left_h64):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
         uaddlv          h0,      v0.16b
         uaddlv          h1,      v1.16b
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v0.4h,   v0.4h,   v1.4h
         add             v2.4h,   v2.4h,   v3.4h
         add             v0.4h,   v0.4h,   v2.4h
         rshrn           v0.8b,   v0.8h,   #6
         dup             v0.16b,  v0.b[0]
         br              x3
 L(ipred_dc_left_w64):
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
         mov             v2.16b,  v0.16b
         mov             v3.16b,  v0.16b
 1:
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
@@ -524,24 +494,22 @@ function ipred_dc_8bpc_neon, export=1
         sub             x5,  x5,  w6, uxtw
         ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
         dup             v17.8h,  w7              // -ctz(width + height)
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 
 L(ipred_dc_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.s}[0],  [x2], #4
         ins             v0.s[1], wzr
         uaddlv          h0,      v0.8b
         add             x2,  x2,  #1
         br              x3
 L(ipred_dc_w4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.s}[0],  [x2]
         ins             v1.s[1], wzr
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h1,      v1.8b
         cmp             w4,  #4
         add             v0.4h,   v0.4h,   v1.4h
         ushl            v0.4h,   v0.4h,   v17.4h
         b.eq            1f
@@ -559,23 +527,21 @@ 2:
         st1             {v0.s}[0],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.s}[0],  [x0], x1
         st1             {v0.s}[0],  [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [x2], #8
         uaddlv          h0,      v0.8b
         add             x2,  x2,  #1
         br              x3
 L(ipred_dc_w8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.8b},  [x2]
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h1,      v1.8b
         cmp             w4,  #8
         add             v0.4h,   v0.4h,   v1.4h
         ushl            v0.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 4/16/32
@@ -592,23 +558,21 @@ 2:
         st1             {v0.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b}, [x2], #16
         uaddlv          h0,      v0.16b
         add             x2,  x2,  #1
         br              x3
 L(ipred_dc_w16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.16b}, [x2]
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h1,      v1.16b
         cmp             w4,  #16
         add             v0.4h,   v0.4h,   v1.4h
         ushl            v0.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 4/8/32/64
@@ -625,25 +589,23 @@ 2:
         st1             {v0.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b}, [x2], #32
         uaddlv          h0,      v0.16b
         uaddlv          h1,      v1.16b
         add             x2,  x2,  #1
         add             v0.4h,   v0.4h,   v1.4h
         br              x3
 L(ipred_dc_w32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.16b, v2.16b}, [x2]
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h1,      v1.16b
         uaddlv          h2,      v2.16b
         cmp             w4,  #32
         add             v0.4h,   v0.4h,   v1.4h
         add             v0.4h,   v0.4h,   v2.4h
         ushl            v4.4h,   v0.4h,   v17.4h
@@ -663,29 +625,27 @@ 2:
         st1             {v0.16b, v1.16b}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h64):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
         uaddlv          h0,      v0.16b
         uaddlv          h1,      v1.16b
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v0.4h,   v0.4h,   v1.4h
         add             v2.4h,   v2.4h,   v3.4h
         add             x2,  x2,  #1
         add             v0.4h,   v0.4h,   v2.4h
         br              x3
 L(ipred_dc_w64):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h1,      v1.16b
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         uaddlv          h4,      v4.16b
         add             v1.4h,   v1.4h,   v2.4h
         add             v3.4h,   v3.4h,   v4.4h
@@ -740,17 +700,16 @@ function ipred_paeth_8bpc_neon, export=1
         add             x8,  x2,  #1
         sub             x2,  x2,  #4
         sub             x5,  x5,  w9, uxtw
         mov             x7,  #-4
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v5.4s},  [x8]
         usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
 4:
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
         zip1            v0.2s,   v0.2s,   v1.2s
         zip1            v2.2s,   v2.2s,   v3.2s
         uaddw           v16.8h,  v6.8h,   v0.8b
         uaddw           v17.8h,  v6.8h,   v2.8b
@@ -768,17 +727,16 @@ 4:
         st1             {v20.s}[3], [x0], x1
         st1             {v20.s}[2], [x6], x1
         subs            w4,  w4,  #4
         st1             {v20.s}[1], [x0], x1
         st1             {v20.s}[0], [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v5.2d},  [x8]
         usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
 8:
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
         uaddw           v16.8h,  v6.8h,   v0.8b
         uaddw           v17.8h,  v6.8h,   v1.8b
         uaddw           v18.8h,  v6.8h,   v2.8b
         uaddw           v19.8h,  v6.8h,   v3.8b
@@ -809,17 +767,16 @@ 8:
         subs            w4,  w4,  #4
         st1             {v20.d}[1], [x0], x1
         st1             {v20.d}[0], [x6], x1
         b.gt            8b
         ret
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v5.16b},  [x8], #16
         mov             w9,  w3
         // Set up pointers for four rows in parallel; x0, x6, x5, x10
         add             x5,  x0,  x1
         add             x10, x6,  x1
         lsl             x1,  x1,  #1
         sub             x1,  x1,  w3, uxtw
 1:
@@ -922,17 +879,16 @@ function ipred_smooth_8bpc_neon, export=
         ldrh            w9,  [x5, w9, uxtw #1]
         ld1r            {v4.16b},  [x12] // bottom
         add             x8,  x2,  #1
         sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v6.2s}, [x8]             // top
         ld1r            {v7.2s}, [x10]            // weights_hor
         sub             x2,  x2,  #4
         mov             x7,  #-4
         dup             v5.16b,  v6.b[3]          // right
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
 4:
@@ -961,17 +917,16 @@ 4:
         st1             {v20.s}[0], [x0], x1
         st1             {v20.s}[1], [x6], x1
         subs            w4,  w4,  #4
         st1             {v21.s}[0], [x0], x1
         st1             {v21.s}[1], [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v6.8b}, [x8]             // top
         ld1             {v7.8b}, [x10]            // weights_hor
         sub             x2,  x2,  #4
         mov             x7,  #-4
         dup             v5.16b,  v6.b[7]          // right
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
 8:
@@ -1014,17 +969,16 @@ 8:
         subs            w4,  w4,  #4
         st1             {v22.8b}, [x0], x1
         st1             {v23.8b}, [x6], x1
         b.gt            8b
         ret
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         add             x12, x2,  w3, uxtw
         sub             x2,  x2,  #2
         mov             x7,  #-2
         ld1r            {v5.16b}, [x12]           // right
         sub             x1,  x1,  w3, uxtw
         mov             w9,  w3
 
 1:
@@ -1102,17 +1056,16 @@ function ipred_smooth_v_8bpc_neon, expor
         ldrh            w9,  [x5, w9, uxtw #1]
         ld1r            {v4.16b},  [x8] // bottom
         add             x2,  x2,  #1
         sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v6.2s}, [x2]             // top
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
 4:
         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
         shll            v22.8h,  v4.8b,   #8      // bottom*256
         shll            v23.8h,  v4.8b,   #8
         zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
         zip1            v18.2s,  v18.2s,  v19.2s
@@ -1125,17 +1078,16 @@ 4:
         st1             {v22.s}[0], [x0], x1
         st1             {v22.s}[1], [x6], x1
         subs            w4,  w4,  #4
         st1             {v23.s}[0], [x0], x1
         st1             {v23.s}[1], [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v6.8b}, [x2]             // top
         usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
 8:
         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
         shll            v24.8h,  v4.8b,   #8      // bottom*256
         shll            v25.8h,  v4.8b,   #8
         shll            v26.8h,  v4.8b,   #8
         shll            v27.8h,  v4.8b,   #8
@@ -1156,17 +1108,16 @@ 8:
         subs            w4,  w4,  #4
         st1             {v26.8b}, [x0], x1
         st1             {v27.8b}, [x6], x1
         b.gt            8b
         ret
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         // Set up pointers for four rows in parallel; x0, x6, x5, x8
         add             x5,  x0,  x1
         add             x8,  x6,  x1
         lsl             x1,  x1,  #1
         sub             x1,  x1,  w3, uxtw
         mov             w9,  w3
 
 1:
@@ -1242,17 +1193,16 @@ function ipred_smooth_h_8bpc_neon, expor
         sub             w9,  w9,  #25
         ldrh            w9,  [x5, w9, uxtw #1]
         ld1r            {v5.16b},  [x12] // right
         sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v7.2s}, [x8]             // weights_hor
         sub             x2,  x2,  #4
         mov             x7,  #-4
         uxtl            v7.8h,   v7.8b            // weights_hor
 4:
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
         shll            v20.8h,  v5.8b,   #8      // right*256
         shll            v21.8h,  v5.8b,   #8
@@ -1267,17 +1217,16 @@ 4:
         st1             {v20.s}[0], [x0], x1
         st1             {v20.s}[1], [x6], x1
         subs            w4,  w4,  #4
         st1             {v21.s}[0], [x0], x1
         st1             {v21.s}[1], [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v7.8b}, [x8]             // weights_hor
         sub             x2,  x2,  #4
         mov             x7,  #-4
         uxtl            v7.8h,   v7.8b            // weights_hor
 8:
         ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
         shll            v20.8h,  v5.8b,   #8      // right*256
         shll            v21.8h,  v5.8b,   #8
@@ -1300,17 +1249,16 @@ 8:
         subs            w4,  w4,  #4
         st1             {v22.8b}, [x0], x1
         st1             {v23.8b}, [x6], x1
         b.gt            8b
         ret
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         sub             x2,  x2,  #4
         mov             x7,  #-4
         // Set up pointers for four rows in parallel; x0, x6, x5, x10
         add             x5,  x0,  x1
         add             x10, x6,  x1
         lsl             x1,  x1,  #1
         sub             x1,  x1,  w3, uxtw
         mov             w9,  w3
@@ -1397,17 +1345,16 @@ function ipred_filter_8bpc_neon, export=
         sxtl            v19.8h,  v19.8b
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         sxtl            v20.8h,  v20.8b
         sxtl            v21.8h,  v21.8b
         sxtl            v22.8h,  v22.8b
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ldur            s0,  [x2, #1]             // top (0-3)
         sub             x2,  x2,  #2
         mov             x7,  #-2
         uxtl            v0.8h,   v0.8b            // top (0-3)
 4:
         ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
         mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
         mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
@@ -1421,17 +1368,16 @@ 4:
         subs            w4,  w4,  #2
         st1             {v2.s}[0], [x0], x1
         uxtl            v0.8h,   v2.8b
         st1             {v2.s}[1], [x6], x1
         ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ldur            d0,  [x2, #1]             // top (0-7)
         sub             x2,  x2,  #2
         mov             x7,  #-2
         uxtl            v0.8h,   v0.8b            // top (0-7)
 8:
         ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
         mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
         mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
@@ -1455,17 +1401,16 @@ 8:
         st2             {v2.s, v3.s}[0], [x0], x1
         zip2            v0.2s,   v2.2s,   v3.2s
         st2             {v2.s, v3.s}[1], [x6], x1
         uxtl            v0.8h,   v0.8b
         b.gt            8b
         ret
 160:
 320:
-        AARCH64_VALID_JUMP_TARGET
         add             x8,  x2,  #1
         sub             x2,  x2,  #2
         mov             x7,  #-2
         sub             x1,  x1,  w3, uxtw
         mov             w9,  w3
 
 1:
         ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
@@ -1550,54 +1495,50 @@ function pal_pred_8bpc_neon, export=1
         sub             w9,  w9,  #25
         ldrh            w9,  [x6, w9, uxtw #1]
         xtn             v0.8b,  v0.8h
         sub             x6,  x6,  w9, uxtw
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
         br              x6
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.16b}, [x3], #16
         subs            w5,  w5,  #4
         tbl             v1.16b, {v0.16b}, v1.16b
         st1             {v1.s}[0], [x0], x1
         st1             {v1.s}[1], [x2], x1
         st1             {v1.s}[2], [x0], x1
         st1             {v1.s}[3], [x2], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.16b, v2.16b}, [x3], #32
         subs            w5,  w5,  #4
         tbl             v1.16b, {v0.16b}, v1.16b
         st1             {v1.d}[0], [x0], x1
         tbl             v2.16b, {v0.16b}, v2.16b
         st1             {v1.d}[1], [x2], x1
         st1             {v2.d}[0], [x0], x1
         st1             {v2.d}[1], [x2], x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
         subs            w5,  w5,  #4
         tbl             v1.16b, {v0.16b}, v1.16b
         tbl             v2.16b, {v0.16b}, v2.16b
         st1             {v1.16b}, [x0], x1
         tbl             v3.16b, {v0.16b}, v3.16b
         st1             {v2.16b}, [x2], x1
         tbl             v4.16b, {v0.16b}, v4.16b
         st1             {v3.16b}, [x0], x1
         st1             {v4.16b}, [x2], x1
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
         subs            w5,  w5,  #4
         tbl             v16.16b, {v0.16b}, v16.16b
         tbl             v17.16b, {v0.16b}, v17.16b
         tbl             v18.16b, {v0.16b}, v18.16b
         tbl             v19.16b, {v0.16b}, v19.16b
         tbl             v20.16b, {v0.16b}, v20.16b
@@ -1606,17 +1547,16 @@ 32:
         st1             {v18.16b, v19.16b}, [x2], x1
         tbl             v22.16b, {v0.16b}, v22.16b
         st1             {v20.16b, v21.16b}, [x0], x1
         tbl             v23.16b, {v0.16b}, v23.16b
         st1             {v22.16b, v23.16b}, [x2], x1
         b.gt            32b
         ret
 64:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
         ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
         subs            w5,  w5,  #2
         tbl             v16.16b, {v0.16b}, v16.16b
         tbl             v17.16b, {v0.16b}, v17.16b
         tbl             v18.16b, {v0.16b}, v18.16b
         tbl             v19.16b, {v0.16b}, v19.16b
         st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
@@ -1647,17 +1587,16 @@ function ipred_cfl_128_8bpc_neon, export
         ldrh            w9,  [x7, w9, uxtw #1]
         movi            v0.8h,   #128 // dc
         dup             v1.8h,   w6   // alpha
         sub             x7,  x7,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x7
 L(ipred_cfl_splat_w4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h}, [x5], #32
         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
         mul             v3.8h,   v3.8h,   v1.8h
         sshr            v4.8h,   v2.8h,   #15    // sign = diff >> 15
         sshr            v5.8h,   v3.8h,   #15
         add             v2.8h,   v2.8h,   v4.8h  // diff + sign
         add             v3.8h,   v3.8h,   v5.8h
         srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
@@ -1669,17 +1608,16 @@ L(ipred_cfl_splat_w4):
         st1             {v2.s}[0],  [x0], x1
         st1             {v2.s}[1],  [x6], x1
         subs            w4,  w4,  #4
         st1             {v3.s}[0],  [x0], x1
         st1             {v3.s}[1],  [x6], x1
         b.gt            L(ipred_cfl_splat_w4)
         ret
 L(ipred_cfl_splat_w8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
         mul             v3.8h,   v3.8h,   v1.8h
         mul             v4.8h,   v4.8h,   v1.8h
         mul             v5.8h,   v5.8h,   v1.8h
         sshr            v16.8h,  v2.8h,   #15    // sign = diff >> 15
         sshr            v17.8h,  v3.8h,   #15
         sshr            v18.8h,  v4.8h,   #15
@@ -1703,17 +1641,16 @@ L(ipred_cfl_splat_w8):
         st1             {v2.8b},  [x0], x1
         st1             {v3.8b},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v4.8b},  [x0], x1
         st1             {v5.8b},  [x6], x1
         b.gt            L(ipred_cfl_splat_w8)
         ret
 L(ipred_cfl_splat_w16):
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x5,  w3, uxtw #1
         sub             x1,  x1,  w3, uxtw
         mov             w9,  w3
 1:
         ld1             {v2.8h, v3.8h}, [x5], #32
         ld1             {v4.8h, v5.8h}, [x7], #32
         mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
         mul             v3.8h,   v3.8h,   v1.8h
@@ -1771,38 +1708,34 @@ function ipred_cfl_top_8bpc_neon, export
         ldrh            w9,  [x7, w9, uxtw #1]
         dup             v1.8h,   w6   // alpha
         add             x2,  x2,  #1
         sub             x7,  x7,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x7
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w4)
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w8)
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
         urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
 32:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.16b, v3.16b}, [x2]
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v2.4h,   v2.4h,   v3.4h
         urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         b               L(ipred_cfl_splat_w16)
 
@@ -1830,41 +1763,37 @@ function ipred_cfl_left_8bpc_neon, expor
         dup             v1.8h,   w6   // alpha
         sub             x9,  x10, w9, uxtw
         sub             x7,  x7,  w8, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x7
 
 L(ipred_cfl_left_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
         urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.16b, v3.16b}, [x2]
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v2.4h,   v2.4h,   v3.4h
         urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         br              x9
 
@@ -1898,24 +1827,22 @@ function ipred_cfl_8bpc_neon, export=1
         sub             x7,  x7,  w6, uxtw
         ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
         dup             v17.8h,  w8              // -ctz(width + height)
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x7
 
 L(ipred_cfl_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.s}[0],  [x2], #4
         ins             v0.s[1], wzr
         add             x2,  x2,  #1
         uaddlv          h0,      v0.8b
         br              x9
 L(ipred_cfl_w4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.s}[0],  [x2]
         ins             v2.s[1], wzr
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h2,      v2.8b
         cmp             w4,  #4
         add             v0.4h,   v0.4h,   v2.4h
         ushl            v0.4h,   v0.4h,   v17.4h
         b.eq            1f
@@ -1926,23 +1853,21 @@ L(ipred_cfl_w4):
         lsr             w16, w16, w17
         dup             v16.4h,  w16
         sqdmulh         v0.4h,   v0.4h,   v16.4h
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w4)
 
 L(ipred_cfl_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [x2], #8
         uaddlv          h0,      v0.8b
         add             x2,  x2,  #1
         br              x9
 L(ipred_cfl_w8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8b},  [x2]
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h2,      v2.8b
         cmp             w4,  #8
         add             v0.4h,   v0.4h,   v2.4h
         ushl            v0.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 4/16/32
@@ -1952,23 +1877,21 @@ L(ipred_cfl_w8):
         csel            w16, w16, w17, eq
         dup             v16.4h,  w16
         sqdmulh         v0.4h,   v0.4h,   v16.4h
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w8)
 
 L(ipred_cfl_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b}, [x2], #16
         uaddlv          h0,      v0.16b
         add             x2,  x2,  #1
         br              x9
 L(ipred_cfl_w16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.16b}, [x2]
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h2,      v2.16b
         cmp             w4,  #16
         add             v0.4h,   v0.4h,   v2.4h
         ushl            v0.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 4/8/32
@@ -1978,25 +1901,23 @@ L(ipred_cfl_w16):
         csel            w16, w16, w17, eq
         dup             v16.4h,  w16
         sqdmulh         v0.4h,   v0.4h,   v16.4h
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
 
 L(ipred_cfl_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.16b, v3.16b}, [x2], #32
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             x2,  x2,  #1
         add             v0.4h,   v2.4h,   v3.4h
         br              x9
 L(ipred_cfl_w32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.16b, v3.16b}, [x2]
         add             v0.4h,   v0.4h,   v16.4h
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         cmp             w4,  #32
         add             v0.4h,   v0.4h,   v2.4h
         add             v0.4h,   v0.4h,   v3.4h
         ushl            v0.4h,   v0.4h,   v17.4h
@@ -2045,17 +1966,16 @@ function ipred_cfl_ac_420_8bpc_neon, exp
         add             w9,  w9,  w10        // log2sz
         add             x10, x1,  x2
         dup             v31.4s,  w9
         lsl             x2,  x2,  #1
         neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_420_w4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input
         ld1             {v0.8b},   [x1],  x2
         ld1             {v1.8b},   [x10], x2
         ld1             {v0.d}[1], [x1],  x2
         ld1             {v1.d}[1], [x10], x2
         uaddlp          v0.8h,   v0.16b
         uaddlp          v1.8h,   v1.16b
         add             v0.8h,   v0.8h,   v1.8h
@@ -2086,17 +2006,16 @@ 6:      // Subtract dc from ac
         subs            w6,  w6,  #4
         sub             v0.8h,   v0.8h,   v4.8h
         sub             v1.8h,   v1.8h,   v4.8h
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            6b
         ret
 
 L(ipred_cfl_ac_420_w8):
-        AARCH64_VALID_JUMP_TARGET
         cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
 1:      // Copy and subsample input, without padding
         ld1             {v0.16b}, [x1],  x2
         ld1             {v1.16b}, [x10], x2
         ld1             {v2.16b}, [x1],  x2
         uaddlp          v0.8h,   v0.16b
         ld1             {v3.16b}, [x10], x2
         uaddlp          v1.8h,   v1.16b
@@ -2169,24 +2088,22 @@ 6:      // Subtract dc from ac
         sub             v1.8h,   v1.8h,   v4.8h
         sub             v2.8h,   v2.8h,   v4.8h
         sub             v3.8h,   v3.8h,   v4.8h
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         b.gt            6b
         ret
 
 L(ipred_cfl_ac_420_w16):
-        AARCH64_VALID_JUMP_TARGET
         adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
         ldrh            w3,  [x7, w3, uxtw #1]
         sub             x7,  x7,  w3, uxtw
         br              x7
 
 L(ipred_cfl_ac_420_w16_wpad0):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, without padding
         ld1             {v0.16b, v1.16b}, [x1],  x2
         ld1             {v2.16b, v3.16b}, [x10], x2
         uaddlp          v0.8h,   v0.16b
         ld1             {v4.16b, v5.16b}, [x1],  x2
         uaddlp          v1.8h,   v1.16b
         ld1             {v6.16b, v7.16b}, [x10], x2
         uaddlp          v2.8h,   v2.16b
@@ -2210,17 +2127,16 @@ 1:      // Copy and subsample input, wit
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_wpad1):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 4
         ldr             d1,  [x1,  #16]
         ld1             {v0.16b}, [x1],  x2
         ldr             d3,  [x10, #16]
         ld1             {v2.16b}, [x10], x2
         uaddlp          v1.4h,   v1.8b
         ldr             d5,  [x1,  #16]
         uaddlp          v0.8h,   v0.16b
@@ -2252,17 +2168,16 @@ 1:      // Copy and subsample input, pad
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_wpad2):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 8
         ld1             {v0.16b}, [x1],  x2
         ld1             {v2.16b}, [x10], x2
         ld1             {v4.16b}, [x1],  x2
         uaddlp          v0.8h,   v0.16b
         ld1             {v6.16b}, [x10], x2
         uaddlp          v2.8h,   v2.16b
         uaddlp          v4.8h,   v4.16b
@@ -2280,17 +2195,16 @@ 1:      // Copy and subsample input, pad
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_wpad3):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 12
         ld1             {v0.8b}, [x1],  x2
         ld1             {v2.8b}, [x10], x2
         ld1             {v4.8b}, [x1],  x2
         uaddlp          v0.4h,   v0.8b
         ld1             {v6.8b}, [x10], x2
         uaddlp          v2.4h,   v2.8b
         uaddlp          v4.4h,   v4.8b
@@ -2369,17 +2283,16 @@ function ipred_cfl_ac_422_8bpc_neon, exp
         add             w9,  w9,  w10        // log2sz
         add             x10, x1,  x2
         dup             v31.4s,  w9
         lsl             x2,  x2,  #1
         neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_422_w4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input
         ld1             {v0.8b},   [x1],  x2
         ld1             {v0.d}[1], [x10], x2
         ld1             {v1.8b},   [x1],  x2
         ld1             {v1.d}[1], [x10], x2
         uaddlp          v0.8h,   v0.16b
         uaddlp          v1.8h,   v1.16b
         shl             v0.8h,   v0.8h,   #2
@@ -2389,17 +2302,16 @@ 1:      // Copy and subsample input
         add             v17.8h,  v17.8h,  v1.8h
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            1b
         trn2            v0.2d,   v1.2d,   v1.2d
         trn2            v1.2d,   v1.2d,   v1.2d
         b               L(ipred_cfl_ac_420_w4_hpad)
 
 L(ipred_cfl_ac_422_w8):
-        AARCH64_VALID_JUMP_TARGET
         cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
 1:      // Copy and subsample input, without padding
         ld1             {v0.16b}, [x1],  x2
         ld1             {v1.16b}, [x10], x2
         ld1             {v2.16b}, [x1],  x2
         uaddlp          v0.8h,   v0.16b
         ld1             {v3.16b}, [x10], x2
         uaddlp          v1.8h,   v1.16b
@@ -2445,24 +2357,22 @@ 1:      // Copy and subsample input, pad
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v3.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w8_hpad)
 
 L(ipred_cfl_ac_422_w16):
-        AARCH64_VALID_JUMP_TARGET
         adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
         ldrh            w3,  [x7, w3, uxtw #1]
         sub             x7,  x7,  w3, uxtw
         br              x7
 
 L(ipred_cfl_ac_422_w16_wpad0):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, without padding
         ld1             {v0.16b, v1.16b}, [x1],  x2
         ld1             {v2.16b, v3.16b}, [x10], x2
         uaddlp          v0.8h,   v0.16b
         uaddlp          v1.8h,   v1.16b
         uaddlp          v2.8h,   v2.16b
         uaddlp          v3.8h,   v3.16b
         shl             v0.8h,   v0.8h,   #2
@@ -2476,17 +2386,16 @@ 1:      // Copy and subsample input, wit
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_422_w16_wpad1):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 4
         ldr             d1,  [x1,  #16]
         ld1             {v0.16b}, [x1],  x2
         ldr             d3,  [x10, #16]
         ld1             {v2.16b}, [x10], x2
         uaddlp          v1.4h,   v1.8b
         uaddlp          v0.8h,   v0.16b
         uaddlp          v3.4h,   v3.8b
@@ -2506,17 +2415,16 @@ 1:      // Copy and subsample input, pad
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_422_w16_wpad2):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 8
         ld1             {v0.16b}, [x1],  x2
         ld1             {v2.16b}, [x10], x2
         uaddlp          v0.8h,   v0.16b
         uaddlp          v2.8h,   v2.16b
         shl             v0.8h,   v0.8h,   #2
         shl             v2.8h,   v2.8h,   #2
         dup             v1.8h,   v0.h[7]
@@ -2528,17 +2436,16 @@ 1:      // Copy and subsample input, pad
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_422_w16_wpad3):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 12
         ld1             {v0.8b}, [x1],  x2
         ld1             {v2.8b}, [x10], x2
         uaddlp          v0.4h,   v0.8b
         uaddlp          v2.4h,   v2.8b
         shl             v0.4h,   v0.4h,   #2
         shl             v2.4h,   v2.4h,   #2
         dup             v1.8h,   v0.h[3]
@@ -2591,17 +2498,16 @@ function ipred_cfl_ac_444_8bpc_neon, exp
         add             w9,  w9,  w10        // log2sz
         add             x10, x1,  x2
         dup             v31.4s,  w9
         lsl             x2,  x2,  #1
         neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_444_w4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input
         ld1             {v0.s}[0], [x1],  x2
         ld1             {v0.s}[1], [x10], x2
         ld1             {v1.s}[0], [x1],  x2
         ld1             {v1.s}[1], [x10], x2
         ushll           v0.8h,   v0.8b,   #3
         ushll           v1.8h,   v1.8b,   #3
         subs            w8,  w8,  #4
@@ -2609,17 +2515,16 @@ 1:      // Copy and expand input
         add             v17.8h,  v17.8h,  v1.8h
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            1b
         trn2            v0.2d,   v1.2d,   v1.2d
         trn2            v1.2d,   v1.2d,   v1.2d
         b               L(ipred_cfl_ac_420_w4_hpad)
 
 L(ipred_cfl_ac_444_w8):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input
         ld1             {v0.8b}, [x1],  x2
         ld1             {v1.8b}, [x10], x2
         ld1             {v2.8b}, [x1],  x2
         ushll           v0.8h,   v0.8b,   #3
         ld1             {v3.8b}, [x10], x2
         ushll           v1.8h,   v1.8b,   #3
         ushll           v2.8h,   v2.8b,   #3
@@ -2631,17 +2536,16 @@ 1:      // Copy and expand input
         add             v18.8h,  v18.8h,  v2.8h
         add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v3.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w8_hpad)
 
 L(ipred_cfl_ac_444_w16):
-        AARCH64_VALID_JUMP_TARGET
         cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
 1:      // Copy and expand input, without padding
         ld1             {v0.16b}, [x1],  x2
         ld1             {v2.16b}, [x10], x2
         ld1             {v4.16b}, [x1],  x2
         ushll2          v1.8h,   v0.16b,  #3
         ushll           v0.8h,   v0.8b,   #3
         ld1             {v6.16b}, [x10], x2
@@ -2697,24 +2601,22 @@ 1:      // Copy and expand input, paddin
         b.gt            1b
         mov             v0.16b,  v6.16b
         mov             v1.16b,  v7.16b
         mov             v2.16b,  v6.16b
         mov             v3.16b,  v7.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_444_w32):
-        AARCH64_VALID_JUMP_TARGET
         adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
         ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
         sub             x7,  x7,  w3, uxtw
         br              x7
 
 L(ipred_cfl_ac_444_w32_wpad0):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, without padding
         ld1             {v2.16b, v3.16b}, [x1],  x2
         ld1             {v6.16b, v7.16b}, [x10], x2
         ushll           v0.8h,   v2.8b,   #3
         ushll2          v1.8h,   v2.16b,  #3
         ushll           v2.8h,   v3.8b,   #3
         ushll2          v3.8h,   v3.16b,  #3
         ushll           v4.8h,   v6.8b,   #3
@@ -2731,17 +2633,16 @@ 1:      // Copy and expand input, withou
         add             v16.8h,  v16.8h,  v4.8h
         add             v17.8h,  v17.8h,  v5.8h
         add             v18.8h,  v18.8h,  v6.8h
         add             v19.8h,  v19.8h,  v7.8h
         b.gt            1b
         b               L(ipred_cfl_ac_444_w32_hpad)
 
 L(ipred_cfl_ac_444_w32_wpad2):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, padding 8
         ldr             d2,  [x1,  #16]
         ld1             {v1.16b}, [x1],  x2
         ldr             d6,  [x10, #16]
         ld1             {v5.16b}, [x10], x2
         ushll           v2.8h,   v2.8b,   #3
         ushll           v0.8h,   v1.8b,   #3
         ushll2          v1.8h,   v1.16b,  #3
@@ -2760,17 +2661,16 @@ 1:      // Copy and expand input, paddin
         add             v16.8h,  v16.8h,  v4.8h
         add             v17.8h,  v17.8h,  v5.8h
         add             v18.8h,  v18.8h,  v6.8h
         add             v19.8h,  v19.8h,  v7.8h
         b.gt            1b
         b               L(ipred_cfl_ac_444_w32_hpad)
 
 L(ipred_cfl_ac_444_w32_wpad4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, padding 16
         ld1             {v1.16b}, [x1],  x2
         ld1             {v5.16b}, [x10], x2
         ushll           v0.8h,   v1.8b,   #3
         ushll2          v1.8h,   v1.16b,  #3
         ushll           v4.8h,   v5.8b,   #3
         ushll2          v5.8h,   v5.16b,  #3
         dup             v2.8h,   v1.h[7]
@@ -2787,17 +2687,16 @@ 1:      // Copy and expand input, paddin
         add             v16.8h,  v16.8h,  v4.8h
         add             v17.8h,  v17.8h,  v5.8h
         add             v18.8h,  v18.8h,  v6.8h
         add             v19.8h,  v19.8h,  v7.8h
         b.gt            1b
         b               L(ipred_cfl_ac_444_w32_hpad)
 
 L(ipred_cfl_ac_444_w32_wpad6):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, padding 24
         ld1             {v0.8b}, [x1],  x2
         ld1             {v4.8b}, [x10], x2
         ushll           v0.8h,   v0.8b,   #3
         ushll           v4.8h,   v4.8b,   #3
         dup             v1.8h,   v0.h[7]
         dup             v2.8h,   v0.h[7]
         dup             v3.8h,   v0.h[7]
--- a/third_party/dav1d/src/arm/64/ipred16.S
+++ b/third_party/dav1d/src/arm/64/ipred16.S
@@ -41,59 +41,54 @@ function ipred_dc_128_16bpc_neon, export
         ldrh            w3,  [x5, w3, uxtw #1]
         dup             v0.8h,   w8
         sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         urshr           v0.8h,   v0.8h,  #1
         br              x5
 4:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
 16:
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
         mov             v2.16b,  v0.16b
         mov             v3.16b,  v0.16b
 32:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
         mov             v2.16b,  v0.16b
         mov             v3.16b,  v0.16b
         sub             x1,  x1,  #64
 64:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
@@ -124,61 +119,56 @@ function ipred_v_16bpc_neon, export=1
         sub             w3,  w3,  #25
         ldrh            w3,  [x5, w3, uxtw #1]
         add             x2,  x2,  #2
         sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h},  [x2]
 4:
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h},  [x2]
 8:
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h}, [x2]
 16:
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
 32:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
         sub             x1,  x1,  #64
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
 64:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
@@ -209,51 +199,47 @@ function ipred_h_16bpc_neon, export=1
         ldrh            w3,  [x5, w3, uxtw #1]
         sub             x2,  x2,  #8
         sub             x5,  x5,  w3, uxtw
         mov             x7,  #-8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         st1             {v3.4h},  [x0], x1
         st1             {v2.4h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v1.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         st1             {v3.8h},  [x0], x1
         st1             {v2.8h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v1.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
         st1             {v3.8h}, [x0], x1
         st1             {v2.8h}, [x6], x1
         subs            w4,  w4,  #4
         str             q1,  [x0, #16]
         str             q0,  [x6, #16]
         st1             {v1.8h}, [x0], x1
         st1             {v0.8h}, [x6], x1
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
         stp             q3,  q3,  [x0, #32]
         stp             q2,  q2,  [x6, #32]
         st1             {v3.8h}, [x0], x1
         st1             {v2.8h}, [x6], x1
         subs            w4,  w4,  #4
@@ -261,17 +247,16 @@ 32:
         str             q0,  [x6, #16]
         stp             q1,  q1,  [x0, #32]
         stp             q0,  q0,  [x6, #32]
         st1             {v1.8h}, [x0], x1
         st1             {v0.8h}, [x6], x1
         b.gt            32b
         ret
 64:
-        AARCH64_VALID_JUMP_TARGET
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
         str             q3,  [x0, #16]
         str             q2,  [x6, #16]
         stp             q3,  q3,  [x0, #32]
         stp             q2,  q2,  [x6, #32]
         stp             q3,  q3,  [x0, #64]
         stp             q2,  q2,  [x6, #64]
         stp             q3,  q3,  [x0, #96]
@@ -310,61 +295,57 @@ function ipred_dc_top_16bpc_neon, export
         sub             w3,  w3,  #25
         ldrh            w3,  [x5, w3, uxtw #1]
         add             x2,  x2,  #2
         sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h},  [x2]
         addv            h0,      v0.4h
         urshr           v0.4h,   v0.4h,   #2
         dup             v0.4h,   v0.h[0]
 4:
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h},  [x2]
         addv            h0,      v0.8h
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
 8:
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h}, [x2]
         addp            v0.8h,   v0.8h,   v1.8h
         addv            h0,      v0.8h
         urshr           v2.4h,   v0.4h,   #4
         dup             v0.8h,   v2.h[0]
         dup             v1.8h,   v2.h[0]
 16:
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v0.8h,   v0.8h,   v2.8h
         uaddlv          s0,      v0.8h
         rshrn           v4.4h,   v0.4s,   #5
         dup             v0.8h,   v4.h[0]
         dup             v1.8h,   v4.h[0]
@@ -374,17 +355,16 @@ 32:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
         addp            v0.8h,   v0.8h,   v1.8h
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v6.8h,   v6.8h,   v7.8h
         addp            v0.8h,   v0.8h,   v2.8h
         addp            v4.8h,   v4.8h,   v6.8h
@@ -432,112 +412,102 @@ function ipred_dc_left_16bpc_neon, expor
         ldrh            w7,  [x5, w7, uxtw #1]
         sub             x3,  x5,  w3, uxtw
         sub             x5,  x5,  w7, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 
 L(ipred_dc_left_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h},  [x2]
         addv            h0,      v0.4h
         urshr           v0.4h,   v0.4h,   #2
         dup             v0.8h,   v0.h[0]
         br              x3
 L(ipred_dc_left_w4):
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         b.gt            L(ipred_dc_left_w4)
         ret
 
 L(ipred_dc_left_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h},  [x2]
         addv            h0,      v0.8h
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x3
 L(ipred_dc_left_w8):
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         b.gt            L(ipred_dc_left_w8)
         ret
 
 L(ipred_dc_left_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h}, [x2]
         addp            v0.8h,   v0.8h,   v1.8h
         addv            h0,      v0.8h
         urshr           v2.4h,   v0.4h,   #4
         dup             v0.8h,   v2.h[0]
         dup             v1.8h,   v2.h[0]
         br              x3
 L(ipred_dc_left_w16):
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
 1:
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         b.gt            1b
         ret
 
 L(ipred_dc_left_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v0.8h,   v0.8h,   v2.8h
         uaddlp          v0.4s,   v0.8h
         addv            s0,      v0.4s
         rshrn           v4.4h,   v0.4s,   #5
         dup             v0.8h,   v4.h[0]
         br              x3
 L(ipred_dc_left_w32):
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
         mov             v2.16b,  v0.16b
         mov             v3.16b,  v0.16b
 1:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            1b
         ret
 
 L(ipred_dc_left_h64):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
         addp            v0.8h,   v0.8h,   v1.8h
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v6.8h,   v6.8h,   v7.8h
         addp            v0.8h,   v0.8h,   v2.8h
         addp            v4.8h,   v4.8h,   v6.8h
         addp            v0.8h,   v0.8h,   v4.8h
         uaddlv          s0,      v0.8h
         rshrn           v4.4h,   v0.4s,   #6
         dup             v0.8h,   v4.h[0]
         br              x3
 L(ipred_dc_left_w64):
-        AARCH64_VALID_JUMP_TARGET
         mov             v1.16b,  v0.16b
         mov             v2.16b,  v0.16b
         mov             v3.16b,  v0.16b
         sub             x1,  x1,  #64
 1:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
@@ -585,23 +555,21 @@ function ipred_dc_16bpc_neon, export=1
         sub             x5,  x5,  w6, uxtw
         ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
         dup             v17.4s,  w7              // -ctz(width + height)
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 
 L(ipred_dc_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h},  [x2], #8
         uaddlv          s0,      v0.4h
         add             x2,  x2,  #2
         br              x3
 L(ipred_dc_w4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.4h},  [x2]
         add             v0.2s,   v0.2s,   v16.2s
         uaddlv          s1,      v1.4h
         cmp             w4,  #4
         add             v0.2s,   v0.2s,   v1.2s
         ushl            v0.2s,   v0.2s,   v17.2s
         b.eq            1f
         // h = 8/16
@@ -619,23 +587,21 @@ 2:
         st1             {v0.4h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.4h},  [x0], x1
         st1             {v0.4h},  [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h},  [x2], #16
         uaddlv          s0,      v0.8h
         add             x2,  x2,  #2
         br              x3
 L(ipred_dc_w8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.8h},  [x2]
         add             v0.2s,   v0.2s,   v16.2s
         uaddlv          s1,      v1.8h
         cmp             w4,  #8
         add             v0.2s,   v0.2s,   v1.2s
         ushl            v0.2s,   v0.2s,   v17.2s
         b.eq            1f
         // h = 4/16/32
@@ -653,24 +619,22 @@ 2:
         st1             {v0.8h},  [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h},  [x0], x1
         st1             {v0.8h},  [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h}, [x2], #32
         addp            v0.8h,   v0.8h,   v1.8h
         add             x2,  x2,  #2
         uaddlv          s0,      v0.8h
         br              x3
 L(ipred_dc_w16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.8h, v2.8h}, [x2]
         add             v0.2s,   v0.2s,   v16.2s
         addp            v1.8h,   v1.8h,   v2.8h
         uaddlv          s1,      v1.8h
         cmp             w4,  #16
         add             v0.2s,   v0.2s,   v1.2s
         ushl            v4.2s,   v0.2s,   v17.2s
         b.eq            1f
@@ -690,26 +654,24 @@ 2:
         st1             {v0.8h, v1.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v0.8h, v1.8h}, [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v0.8h,   v0.8h,   v2.8h
         add             x2,  x2,  #2
         uaddlv          s0,      v0.8h
         br              x3
 L(ipred_dc_w32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
         add             v0.2s,   v0.2s,   v16.2s
         addp            v1.8h,   v1.8h,   v2.8h
         addp            v3.8h,   v3.8h,   v4.8h
         addp            v1.8h,   v1.8h,   v3.8h
         uaddlv          s1,      v1.8h
         cmp             w4,  #32
         add             v0.2s,   v0.2s,   v1.2s
@@ -733,31 +695,29 @@ 2:
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
         b.gt            2b
         ret
 
 L(ipred_dc_h64):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
         addp            v0.8h,   v0.8h,   v1.8h
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v6.8h,   v6.8h,   v7.8h
         addp            v0.8h,   v0.8h,   v2.8h
         addp            v4.8h,   v4.8h,   v6.8h
         addp            v0.8h,   v0.8h,   v4.8h
         add             x2,  x2,  #2
         uaddlv          s0,      v0.8h
         br              x3
 L(ipred_dc_w64):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
         add             v0.2s,   v0.2s,   v16.2s
         addp            v1.8h,   v1.8h,   v2.8h
         ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
         addp            v3.8h,   v3.8h,   v4.8h
         addp            v20.8h,  v20.8h,  v21.8h
         addp            v22.8h,  v22.8h,  v23.8h
         addp            v1.8h,   v1.8h,   v3.8h
@@ -821,17 +781,16 @@ function ipred_paeth_16bpc_neon, export=
         add             x8,  x2,  #2
         sub             x2,  x2,  #8
         sub             x5,  x5,  w9, uxtw
         mov             x7,  #-8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v5.2d},  [x8]
         sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
 4:
         ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
         zip1            v0.2d,   v0.2d,   v1.2d
         zip1            v2.2d,   v2.2d,   v3.2d
         add             v16.8h,  v6.8h,   v0.8h   // base
         add             v17.8h,  v6.8h,   v2.8h
@@ -857,17 +816,16 @@ 4:
         st1             {v20.d}[1], [x0], x1
         st1             {v20.d}[0], [x6], x1
         b.gt            4b
         ret
 80:
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v5.8h},  [x8], #16
         mov             w9,  w3
         // Set up pointers for four rows in parallel; x0, x6, x5, x10
         add             x5,  x0,  x1
         add             x10, x6,  x1
         lsl             x1,  x1,  #1
         sub             x1,  x1,  w3, uxtw #1
 1:
@@ -957,17 +915,16 @@ function ipred_smooth_16bpc_neon, export
         ldrh            w9,  [x5, w9, uxtw #1]
         ld1r            {v4.8h},  [x12] // bottom
         add             x8,  x2,  #2
         sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v6.2d}, [x8]             // top
         ld1r            {v7.2s}, [x10]            // weights_hor
         sub             x2,  x2,  #8
         mov             x7,  #-8
         dup             v5.8h,   v6.h[3]          // right
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
         add             v31.4h,  v4.4h,   v5.4h   // bottom+right
@@ -1001,17 +958,16 @@ 4:
         st1             {v20.4h}, [x0], x1
         st1             {v21.4h}, [x6], x1
         subs            w4,  w4,  #4
         st1             {v22.4h}, [x0], x1
         st1             {v23.4h}, [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v6.8h}, [x8]             // top
         ld1             {v7.8b}, [x10]            // weights_hor
         sub             x2,  x2,  #8
         mov             x7,  #-8
         dup             v5.8h,   v6.h[7]          // right
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
         uxtl            v7.8h,   v7.8b            // weights_hor
         add             v31.4h,  v4.4h,   v5.4h   // bottom+right
@@ -1063,17 +1019,16 @@ 8:
         subs            w4,  w4,  #4
         st1             {v22.8h}, [x0], x1
         st1             {v23.8h}, [x6], x1
         b.gt            8b
         ret
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         add             x12, x2,  w3, uxtw #1
         sub             x1,  x1,  w3, uxtw #1
         ld1r            {v5.8h}, [x12]            // right
         sub             x2,  x2,  #4
         mov             x7,  #-4
         mov             w9,  w3
         add             v31.4h,  v4.4h,   v5.4h   // bottom+right
 
@@ -1160,17 +1115,16 @@ function ipred_smooth_v_16bpc_neon, expo
         ldrh            w9,  [x5, w9, uxtw #1]
         ld1r            {v4.8h},  [x8] // bottom
         add             x2,  x2,  #2
         sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v6.2d}, [x2]             // top
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
 4:
         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
         zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
         zip1            v18.2s,  v18.2s,  v19.2s
         ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
         ushll           v18.8h,  v18.8b,  #7
@@ -1181,17 +1135,16 @@ 4:
         st1             {v20.d}[0], [x0], x1
         st1             {v20.d}[1], [x6], x1
         subs            w4,  w4,  #4
         st1             {v21.d}[0], [x0], x1
         st1             {v21.d}[1], [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v6.8h}, [x2]             // top
         sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
 8:
         ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
         ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
         ushll           v17.8h,  v17.8b,  #7
         ushll           v18.8h,  v18.8b,  #7
         ushll           v19.8h,  v19.8b,  #7
@@ -1208,17 +1161,16 @@ 8:
         subs            w4,  w4,  #4
         st1             {v22.8h}, [x0], x1
         st1             {v23.8h}, [x6], x1
         b.gt            8b
         ret
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         // Set up pointers for four rows in parallel; x0, x6, x5, x8
         add             x5,  x0,  x1
         add             x8,  x6,  x1
         lsl             x1,  x1,  #1
         sub             x1,  x1,  w3, uxtw #1
         mov             w9,  w3
 
 1:
@@ -1286,17 +1238,16 @@ function ipred_smooth_h_16bpc_neon, expo
         sub             w9,  w9,  #25
         ldrh            w9,  [x5, w9, uxtw #1]
         ld1r            {v5.8h},  [x12] // right
         sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v7.2s}, [x8]             // weights_hor
         sub             x2,  x2,  #8
         mov             x7,  #-8
         ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
 4:
         ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
         zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
         zip1            v0.2d,   v3.2d,   v2.2d
@@ -1309,17 +1260,16 @@ 4:
         st1             {v20.d}[0], [x0], x1
         st1             {v20.d}[1], [x6], x1
         subs            w4,  w4,  #4
         st1             {v21.d}[0], [x0], x1
         st1             {v21.d}[1], [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v7.8b}, [x8]             // weights_hor
         sub             x2,  x2,  #8
         mov             x7,  #-8
         ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
 8:
         ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
         sub             v3.8h,   v3.8h,   v5.8h   // left-right
         sub             v2.8h,   v2.8h,   v5.8h
@@ -1338,17 +1288,16 @@ 8:
         subs            w4,  w4,  #4
         st1             {v22.8h}, [x0], x1
         st1             {v23.8h}, [x6], x1
         b.gt            8b
         ret
 160:
 320:
 640:
-        AARCH64_VALID_JUMP_TARGET
         sub             x2,  x2,  #8
         mov             x7,  #-8
         // Set up pointers for four rows in parallel; x0, x6, x5, x10
         add             x5,  x0,  x1
         add             x10, x6,  x1
         lsl             x1,  x1,  #1
         sub             x1,  x1,  w3, uxtw #1
         mov             w9,  w3
@@ -1433,17 +1382,16 @@ function ipred_filter_\bpc\()bpc_neon
         sxtl            v21.8h,  v21.8b
         sxtl            v22.8h,  v22.8b
         dup             v31.8h,  w8
 .if \bpc == 10
         movi            v30.8h,  #0
 .endif
         br              x5
 40:
-        AARCH64_VALID_JUMP_TARGET
         ldur            d0,  [x2, #2]             // top (0-3)
         sub             x2,  x2,  #4
         mov             x7,  #-4
 4:
         ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
 .if \bpc == 10
         mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
         mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
@@ -1475,17 +1423,16 @@ 4:
         smin            v2.8h,   v2.8h,   v31.8h
         subs            w4,  w4,  #2
         st1             {v2.d}[0], [x0], x1
         ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
         st1             {v2.d}[1], [x6], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ldur            q0,  [x2, #2]             // top (0-7)
         sub             x2,  x2,  #4
         mov             x7,  #-4
 8:
         ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
 .if \bpc == 10
         mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
         mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
@@ -1545,17 +1492,16 @@ 8:
         subs            w4,  w4,  #2
         st2             {v2.d, v3.d}[0], [x0], x1
         zip2            v0.2d,   v2.2d,   v3.2d
         st2             {v2.d, v3.d}[1], [x6], x1
         b.gt            8b
         ret
 160:
 320:
-        AARCH64_VALID_JUMP_TARGET
         add             x8,  x2,  #2
         sub             x2,  x2,  #4
         mov             x7,  #-4
         sub             x1,  x1,  w3, uxtw #1
         mov             w9,  w3
 
 1:
         ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
@@ -1724,17 +1670,16 @@ function pal_pred_16bpc_neon, export=1
         clz             w9,  w4
         adr             x6,  L(pal_pred_tbl)
         sub             w9,  w9,  #25
         ldrh            w9,  [x6, w9, uxtw #1]
         movi            v31.8h,  #1, lsl #8
         sub             x6,  x6,  w9, uxtw
         br              x6
 40:
-        AARCH64_VALID_JUMP_TARGET
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 4:
         ld1             {v1.16b}, [x3], #16
         subs            w5,  w5,  #4
         // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
         add             v1.16b,  v1.16b,  v1.16b
         zip1            v0.16b,  v1.16b,  v1.16b
@@ -1745,17 +1690,16 @@ 4:
         st1             {v0.d}[0], [x0], x1
         tbl             v1.16b, {v30.16b}, v1.16b
         st1             {v0.d}[1], [x2], x1
         st1             {v1.d}[0], [x0], x1
         st1             {v1.d}[1], [x2], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 8:
         ld1             {v2.16b, v3.16b}, [x3], #32
         subs            w5,  w5,  #4
         add             v2.16b,  v2.16b,  v2.16b
         add             v3.16b,  v3.16b,  v3.16b
         zip1            v0.16b,  v2.16b,  v2.16b
@@ -1772,17 +1716,16 @@ 8:
         tbl             v2.16b, {v30.16b}, v2.16b
         st1             {v1.8h}, [x2], x1
         tbl             v3.16b, {v30.16b}, v3.16b
         st1             {v2.8h}, [x0], x1
         st1             {v3.8h}, [x2], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 16:
         ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
         subs            w5,  w5,  #4
         add             v4.16b,  v4.16b,  v4.16b
         add             v5.16b,  v5.16b,  v5.16b
         add             v6.16b,  v6.16b,  v6.16b
@@ -1813,17 +1756,16 @@ 16:
         tbl             v6.16b, {v30.16b}, v6.16b
         st1             {v2.8h, v3.8h}, [x2], x1
         tbl             v7.16b, {v30.16b}, v7.16b
         st1             {v4.8h, v5.8h}, [x0], x1
         st1             {v6.8h, v7.8h}, [x2], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 32:
         ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
         subs            w5,  w5,  #2
         add             v4.16b,  v4.16b,  v4.16b
         add             v5.16b,  v5.16b,  v5.16b
         add             v6.16b,  v6.16b,  v6.16b
@@ -1852,17 +1794,16 @@ 32:
         tbl             v5.16b, {v30.16b}, v5.16b
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
         tbl             v6.16b, {v30.16b}, v6.16b
         tbl             v7.16b, {v30.16b}, v7.16b
         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         add             x2,  x0,  #64
 64:
         ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
         subs            w5,  w5,  #1
         add             v4.16b,  v4.16b,  v4.16b
         add             v5.16b,  v5.16b,  v5.16b
         add             v6.16b,  v6.16b,  v6.16b
         add             v7.16b,  v7.16b,  v7.16b
@@ -1917,17 +1858,16 @@ function ipred_cfl_128_16bpc_neon, expor
         urshr           v0.8h,   v31.8h,  #1
         dup             v1.8h,   w6   // alpha
         sub             x7,  x7,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
         br              x7
 L(ipred_cfl_splat_w4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v4.8h, v5.8h}, [x5], #32
         subs            w4,  w4,  #4
         smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
         smull2          v3.4s,   v4.8h,   v1.8h
         smull           v4.4s,   v5.4h,   v1.4h
         smull2          v5.4s,   v5.8h,   v1.8h
         sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
         sshr            v17.4s,  v3.4s,   #31
@@ -1949,17 +1889,16 @@ L(ipred_cfl_splat_w4):
         smin            v3.8h,   v3.8h,   v31.8h
         st1             {v2.d}[0],  [x0], x1
         st1             {v2.d}[1],  [x6], x1
         st1             {v3.d}[0],  [x0], x1
         st1             {v3.d}[1],  [x6], x1
         b.gt            L(ipred_cfl_splat_w4)
         ret
 L(ipred_cfl_splat_w8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v4.8h, v5.8h}, [x5], #32
         subs            w4,  w4,  #2
         smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
         smull2          v3.4s,   v4.8h,   v1.8h
         smull           v4.4s,   v5.4h,   v1.4h
         smull2          v5.4s,   v5.8h,   v1.8h
         sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
         sshr            v17.4s,  v3.4s,   #31
@@ -1979,17 +1918,16 @@ L(ipred_cfl_splat_w8):
         smax            v3.8h,   v3.8h,   v30.8h
         smin            v2.8h,   v2.8h,   v31.8h
         smin            v3.8h,   v3.8h,   v31.8h
         st1             {v2.8h},  [x0], x1
         st1             {v3.8h},  [x6], x1
         b.gt            L(ipred_cfl_splat_w8)
         ret
 L(ipred_cfl_splat_w16):
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x5,  w3, uxtw #1
         sub             x1,  x1,  w3, uxtw #1
         mov             w9,  w3
 1:
         ld1             {v2.8h, v3.8h}, [x5], #32
         ld1             {v4.8h, v5.8h}, [x7], #32
         subs            w3,  w3,  #16
         smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
@@ -2070,39 +2008,35 @@ function ipred_cfl_top_16bpc_neon, expor
         dup             v1.8h,   w6   // alpha
         add             x2,  x2,  #2
         sub             x7,  x7,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
         br              x7
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h},  [x2]
         addv            h0,      v0.4h
         urshr           v0.4h,   v0.4h,   #2
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w4)
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h},  [x2]
         addv            h0,      v0.8h
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w8)
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h}, [x2]
         addp            v0.8h,   v2.8h,   v3.8h
         addv            h0,      v0.8h
         urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
 32:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v0.8h,   v2.8h,   v4.8h
         uaddlv          s0,      v0.8h
         rshrn           v0.4h,   v0.4s,   #5
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
@@ -2134,42 +2068,38 @@ function ipred_cfl_left_16bpc_neon, expo
         sub             x9,  x10, w9, uxtw
         sub             x7,  x7,  w8, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
         br              x7
 
 L(ipred_cfl_left_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h},  [x2]
         addv            h0,      v0.4h
         urshr           v0.4h,   v0.4h,   #2
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h},  [x2]
         addv            h0,      v0.8h
         urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h}, [x2]
         addp            v0.8h,   v2.8h,   v3.8h
         addv            h0,      v0.8h
         urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v0.8h,   v2.8h,   v4.8h
         uaddlv          s0,      v0.8h
         rshrn           v0.4h,   v0.4s,   #5
         dup             v0.8h,   v0.h[0]
         br              x9
@@ -2207,23 +2137,21 @@ function ipred_cfl_16bpc_neon, export=1
         ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
         dup             v17.4s,  w8              // -ctz(width + height)
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
         br              x7
 
 L(ipred_cfl_h4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h},  [x2], #8
         uaddlv          s0,      v0.4h
         add             x2,  x2,  #2
         br              x9
 L(ipred_cfl_w4):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.4h},  [x2]
         add             v0.2s,   v0.2s,   v16.2s
         uaddlv          s2,      v2.4h
         cmp             w4,  #4
         add             v0.2s,   v0.2s,   v2.2s
         ushl            v0.2s,   v0.2s,   v17.2s
         b.eq            1f
         // h = 8/16
@@ -2234,23 +2162,21 @@ L(ipred_cfl_w4):
         dup             v16.2s,  w16
         mul             v0.2s,   v0.2s,   v16.2s
         ushr            v0.2s,   v0.2s,   #17
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w4)
 
 L(ipred_cfl_h8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8h},  [x2], #16
         uaddlv          s0,      v0.8h
         add             x2,  x2,  #2
         br              x9
 L(ipred_cfl_w8):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h},  [x2]
         add             v0.2s,   v0.2s,   v16.2s
         uaddlv          s2,      v2.8h
         cmp             w4,  #8
         add             v0.2s,   v0.2s,   v2.2s
         ushl            v0.2s,   v0.2s,   v17.2s
         b.eq            1f
         // h = 4/16/32
@@ -2261,24 +2187,22 @@ L(ipred_cfl_w8):
         dup             v16.2s,  w16
         mul             v0.2s,   v0.2s,   v16.2s
         ushr            v0.2s,   v0.2s,   #17
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w8)
 
 L(ipred_cfl_h16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h}, [x2], #32
         addp            v0.8h,   v2.8h,   v3.8h
         add             x2,  x2,  #2
         uaddlv          s0,      v0.8h
         br              x9
 L(ipred_cfl_w16):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h}, [x2]
         add             v0.2s,   v0.2s,   v16.2s
         addp            v2.8h,   v2.8h,   v3.8h
         uaddlv          s2,      v2.8h
         cmp             w4,  #16
         add             v0.2s,   v0.2s,   v2.2s
         ushl            v0.2s,   v0.2s,   v17.2s
         b.eq            1f
@@ -2290,26 +2214,24 @@ L(ipred_cfl_w16):
         dup             v16.2s,  w16
         mul             v0.2s,   v0.2s,   v16.2s
         ushr            v0.2s,   v0.2s,   #17
 1:
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
 
 L(ipred_cfl_h32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v0.8h,   v2.8h,   v4.8h
         add             x2,  x2,  #2
         uaddlv          s0,      v0.8h
         br              x9
 L(ipred_cfl_w32):
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
         add             v0.4s,   v0.4s,   v16.4s
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v2.8h,   v2.8h,   v4.8h
         cmp             w4,  #32
         uaddlv          s2,      v2.8h
         add             v0.2s,   v0.2s,   v2.2s
@@ -2360,17 +2282,16 @@ function ipred_cfl_ac_420_16bpc_neon, ex
         add             w9,  w9,  w10        // log2sz
         add             x10, x1,  x2
         dup             v31.4s,  w9
         lsl             x2,  x2,  #1
         neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_420_w4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input
         ld1             {v0.8h}, [x1],  x2
         ld1             {v1.8h}, [x10], x2
         ld1             {v2.8h}, [x1],  x2
         ld1             {v3.8h}, [x10], x2
         addp            v0.8h,   v0.8h,   v2.8h
         addp            v1.8h,   v1.8h,   v3.8h
         add             v0.8h,   v0.8h,   v1.8h
@@ -2407,17 +2328,16 @@ 6:      // Subtract dc from ac
         subs            w6,  w6,  #4
         sub             v0.8h,   v0.8h,   v4.8h
         sub             v1.8h,   v1.8h,   v4.8h
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            6b
         ret
 
 L(ipred_cfl_ac_420_w8):
-        AARCH64_VALID_JUMP_TARGET
         cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
 1:      // Copy and subsample input, without padding
         ld1             {v0.8h, v1.8h}, [x1],  x2
         ld1             {v2.8h, v3.8h}, [x10], x2
         ld1             {v4.8h, v5.8h}, [x1],  x2
         addp            v0.8h,   v0.8h,   v1.8h
         ld1             {v6.8h, v7.8h}, [x10], x2
         addp            v2.8h,   v2.8h,   v3.8h
@@ -2477,24 +2397,22 @@ 2:      // Vertical padding (h_pad > 0)
         b.gt            2b
 3:
 
         // Double the height and reuse the w4 summing/subtracting
         lsl             w6,  w6,  #1
         b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
 
 L(ipred_cfl_ac_420_w16):
-        AARCH64_VALID_JUMP_TARGET
         adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
         ldrh            w3,  [x7, w3, uxtw #1]
         sub             x7,  x7,  w3, uxtw
         br              x7
 
 L(ipred_cfl_ac_420_w16_wpad0):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, without padding
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v6.8h,   v6.8h,   v7.8h
         ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
@@ -2522,17 +2440,16 @@ 1:      // Copy and subsample input, wit
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_wpad1):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 4
         ldr             q2,  [x1,  #32]
         ld1             {v0.8h, v1.8h}, [x1],  x2
         ldr             q5,  [x10, #32]
         ld1             {v3.8h, v4.8h}, [x10], x2
         addp            v2.8h,   v2.8h,   v2.8h
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v5.8h,   v5.8h,   v5.8h
@@ -2568,17 +2485,16 @@ 1:      // Copy and subsample input, pad
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_wpad2):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 8
         ld1             {v0.8h, v1.8h}, [x1],  x2
         ld1             {v2.8h, v3.8h}, [x10], x2
         ld1             {v4.8h, v5.8h}, [x1],  x2
         addp            v0.8h,   v0.8h,   v1.8h
         ld1             {v6.8h, v7.8h}, [x10], x2
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
@@ -2600,17 +2516,16 @@ 1:      // Copy and subsample input, pad
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_420_w16_wpad3):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 12
         ld1             {v0.8h}, [x1],  x2
         ld1             {v2.8h}, [x10], x2
         ld1             {v4.8h}, [x1],  x2
         ld1             {v6.8h}, [x10], x2
         addp            v0.8h,   v0.8h,   v4.8h
         addp            v2.8h,   v2.8h,   v6.8h
         add             v0.8h,   v0.8h,   v2.8h
@@ -2697,17 +2612,16 @@ function ipred_cfl_ac_422_16bpc_neon, ex
         add             w9,  w9,  w10        // log2sz
         add             x10, x1,  x2
         dup             v31.4s,  w9
         lsl             x2,  x2,  #1
         neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_422_w4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input
         ld1             {v0.8h}, [x1],  x2
         ld1             {v1.8h}, [x10], x2
         ld1             {v2.8h}, [x1],  x2
         ld1             {v3.8h}, [x10], x2
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v2.8h,   v2.8h,   v3.8h
         shl             v0.8h,   v0.8h,   #2
@@ -2719,17 +2633,16 @@ 1:      // Copy and subsample input
         uaddw           v26.4s,  v26.4s,  v1.4h
         uaddw2          v27.4s,  v27.4s,  v1.8h
         b.gt            1b
         trn2            v0.2d,   v1.2d,   v1.2d
         trn2            v1.2d,   v1.2d,   v1.2d
         b               L(ipred_cfl_ac_420_w4_hpad)
 
 L(ipred_cfl_ac_422_w8):
-        AARCH64_VALID_JUMP_TARGET
         cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
 1:      // Copy and subsample input, without padding
         ld1             {v0.8h, v1.8h}, [x1],  x2
         ld1             {v2.8h, v3.8h}, [x10], x2
         ld1             {v4.8h, v5.8h}, [x1],  x2
         addp            v0.8h,   v0.8h,   v1.8h
         ld1             {v6.8h, v7.8h}, [x10], x2
         addp            v2.8h,   v2.8h,   v3.8h
@@ -2783,24 +2696,22 @@ 1:      // Copy and subsample input, pad
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v3.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w8_hpad)
 
 L(ipred_cfl_ac_422_w16):
-        AARCH64_VALID_JUMP_TARGET
         adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
         ldrh            w3,  [x7, w3, uxtw #1]
         sub             x7,  x7,  w3, uxtw
         br              x7
 
 L(ipred_cfl_ac_422_w16_wpad0):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, without padding
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v2.8h,   v2.8h,   v3.8h
         addp            v4.8h,   v4.8h,   v5.8h
         addp            v6.8h,   v6.8h,   v7.8h
         shl             v0.8h,   v0.8h,   #2
@@ -2818,17 +2729,16 @@ 1:      // Copy and subsample input, wit
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_422_w16_wpad1):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 4
         ldr             q2,  [x1,  #32]
         ld1             {v0.8h, v1.8h}, [x1],  x2
         ldr             q6,  [x10, #32]
         ld1             {v4.8h, v5.8h}, [x10], x2
         addp            v2.8h,   v2.8h,   v2.8h
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v6.8h,   v6.8h,   v6.8h
@@ -2852,17 +2762,16 @@ 1:      // Copy and subsample input, pad
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_422_w16_wpad2):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 8
         ld1             {v0.8h, v1.8h}, [x1],  x2
         ld1             {v2.8h, v3.8h}, [x10], x2
         addp            v0.8h,   v0.8h,   v1.8h
         addp            v2.8h,   v2.8h,   v3.8h
         shl             v0.8h,   v0.8h,   #2
         shl             v2.8h,   v2.8h,   #2
         dup             v1.8h,   v0.h[7]
@@ -2878,17 +2787,16 @@ 1:      // Copy and subsample input, pad
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_422_w16_wpad3):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and subsample input, padding 12
         ld1             {v0.8h}, [x1],  x2
         ld1             {v2.8h}, [x10], x2
         addp            v0.8h,   v0.8h,   v0.8h
         addp            v2.8h,   v2.8h,   v2.8h
         shl             v0.4h,   v0.4h,   #2
         shl             v2.4h,   v2.4h,   #2
         dup             v1.8h,   v0.h[3]
@@ -2945,17 +2853,16 @@ function ipred_cfl_ac_444_16bpc_neon, ex
         add             w9,  w9,  w10        // log2sz
         add             x10, x1,  x2
         dup             v31.4s,  w9
         lsl             x2,  x2,  #1
         neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_444_w4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input
         ld1             {v0.4h},   [x1],  x2
         ld1             {v0.d}[1], [x10], x2
         ld1             {v1.4h},   [x1],  x2
         ld1             {v1.d}[1], [x10], x2
         shl             v0.8h,   v0.8h,   #3
         shl             v1.8h,   v1.8h,   #3
         subs            w8,  w8,  #4
@@ -2965,17 +2872,16 @@ 1:      // Copy and expand input
         uaddw           v26.4s,  v26.4s,  v1.4h
         uaddw2          v27.4s,  v27.4s,  v1.8h
         b.gt            1b
         trn2            v0.2d,   v1.2d,   v1.2d
         trn2            v1.2d,   v1.2d,   v1.2d
         b               L(ipred_cfl_ac_420_w4_hpad)
 
 L(ipred_cfl_ac_444_w8):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input
         ld1             {v0.8h}, [x1],  x2
         ld1             {v1.8h}, [x10], x2
         ld1             {v2.8h}, [x1],  x2
         shl             v0.8h,   v0.8h,   #3
         ld1             {v3.8h}, [x10], x2
         shl             v1.8h,   v1.8h,   #3
         shl             v2.8h,   v2.8h,   #3
@@ -2991,17 +2897,16 @@ 1:      // Copy and expand input
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v3.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w8_hpad)
 
 L(ipred_cfl_ac_444_w16):
-        AARCH64_VALID_JUMP_TARGET
         cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
 1:      // Copy and expand input, without padding
         ld1             {v0.8h, v1.8h}, [x1],  x2
         ld1             {v2.8h, v3.8h}, [x10], x2
         shl             v0.8h,   v0.8h,   #3
         shl             v1.8h,   v1.8h,   #3
         shl             v2.8h,   v2.8h,   #3
         shl             v3.8h,   v3.8h,   #3
@@ -3039,25 +2944,23 @@ 1:      // Copy and expand input, paddin
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)
 
 L(ipred_cfl_ac_444_w32):
-        AARCH64_VALID_JUMP_TARGET
         adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
         ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
         lsr             x2,  x2,  #1 // Restore the stride to one line increments
         sub             x7,  x7,  w3, uxtw
         br              x7
 
 L(ipred_cfl_ac_444_w32_wpad0):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, without padding
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
         shl             v0.8h,   v0.8h,   #3
         shl             v1.8h,   v1.8h,   #3
         shl             v2.8h,   v2.8h,   #3
         shl             v3.8h,   v3.8h,   #3
         subs            w8,  w8,  #1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
@@ -3068,17 +2971,16 @@ 1:      // Copy and expand input, withou
         uaddw           v24.4s,  v24.4s,  v2.4h
         uaddw2          v25.4s,  v25.4s,  v2.8h
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         b               L(ipred_cfl_ac_444_w32_hpad)
 
 L(ipred_cfl_ac_444_w32_wpad2):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, padding 8
         ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
         shl             v2.8h,   v2.8h,   #3
         shl             v0.8h,   v0.8h,   #3
         shl             v1.8h,   v1.8h,   #3
         dup             v3.8h,   v2.h[7]
         subs            w8,  w8,  #1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
@@ -3089,17 +2991,16 @@ 1:      // Copy and expand input, paddin
         uaddw           v24.4s,  v24.4s,  v2.4h
         uaddw2          v25.4s,  v25.4s,  v2.8h
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         b               L(ipred_cfl_ac_444_w32_hpad)
 
 L(ipred_cfl_ac_444_w32_wpad4):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, padding 16
         ld1             {v0.8h, v1.8h}, [x1],  x2
         shl             v1.8h,   v1.8h,   #3
         shl             v0.8h,   v0.8h,   #3
         dup             v2.8h,   v1.h[7]
         dup             v3.8h,   v1.h[7]
         subs            w8,  w8,  #1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
@@ -3110,17 +3011,16 @@ 1:      // Copy and expand input, paddin
         uaddw           v24.4s,  v24.4s,  v2.4h
         uaddw2          v25.4s,  v25.4s,  v2.8h
         uaddw           v26.4s,  v26.4s,  v3.4h
         uaddw2          v27.4s,  v27.4s,  v3.8h
         b.gt            1b
         b               L(ipred_cfl_ac_444_w32_hpad)
 
 L(ipred_cfl_ac_444_w32_wpad6):
-        AARCH64_VALID_JUMP_TARGET
 1:      // Copy and expand input, padding 24
         ld1             {v0.8h}, [x1],  x2
         shl             v0.8h,   v0.8h,   #3
         dup             v1.8h,   v0.h[7]
         dup             v2.8h,   v0.h[7]
         dup             v3.8h,   v0.h[7]
         subs            w8,  w8,  #1
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
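
Context for the itx.S hunks that follow: besides dropping the landing-pad macros, they revert return sequences from ret x15 / ret x14 back to br x15 / br x14. These helpers save the link register in x15 or x14 before making nested blr calls, and both instructions branch back through that register; ret, however, marks the branch as a function return, which feeds the return-address predictor and, under BTI, needs no landing pad at the return site in the caller the way a plain br to the same address would. A minimal sketch of the pattern, with a hypothetical function name:

function example_add_neon            // hypothetical helper mirroring the itx.S pattern
        mov             x15, x30     // save the return address; the nested blr clobbers x30
        blr             x5           // call the per-transform routine
        // Backed-out form:  ret     x15
        // Restored form:
        br              x15
endfunc
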
--- a/third_party/dav1d/src/arm/64/itx.S
+++ b/third_party/dav1d/src/arm/64/itx.S
@@ -655,17 +655,17 @@ L(itx_4x4_end):
         sqxtun          v0.8b,   v16.8h
         uaddw           v18.8h,  v18.8h,  v1.8b
         st1             {v0.s}[0], [x0], x1
         sqxtun          v1.8b,   v18.8h
         st1             {v0.s}[1], [x0], x1
         st1             {v1.s}[0], [x0], x1
         st1             {v1.s}[1], [x0], x1
 
-        ret             x15
+        br              x15
 endfunc
 
 .macro def_fn_4x4 txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
         cbnz            w3,  1f
@@ -892,17 +892,17 @@ function inv_txfm_\variant\()add_8x8_neo
         srshr           v23.8h,  v23.8h,  #1
 .endif
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 
         blr             x5
 
         load_add_store_8x8 x0, x7
-        ret             x15
+        br              x15
 endfunc
 .endm
 
 def_fn_8x8_base
 def_fn_8x8_base identity_
 
 .macro def_fn_8x8 txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -957,17 +957,17 @@ function inv_txfm_add_8x4_neon
         ins             v16.d[1], v20.d[0]
         ins             v17.d[1], v21.d[0]
         ins             v18.d[1], v22.d[0]
         ins             v19.d[1], v23.d[0]
 
         blr             x5
 
         load_add_store_8x4 x0, x7
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_4x8_neon
         movi            v28.8h,  #0
         movi            v29.8h,  #0
         movi            v30.8h,  #0
         movi            v31.8h,  #0
         mov             w16, #2896*8
@@ -983,17 +983,17 @@ function inv_txfm_add_4x8_neon
         ins             v20.d[0], v16.d[1]
         ins             v21.d[0], v17.d[1]
         ins             v22.d[0], v18.d[1]
         ins             v23.d[0], v19.d[1]
 
         blr             x5
 
         load_add_store_4x8 x0, x7
-        ret             x15
+        br              x15
 endfunc
 
 .macro def_fn_48 w, h, txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  0
@@ -1365,17 +1365,16 @@ endfunc
         sqrdmulh        v2.8h,   \i,      \c
         sqadd           \i,      \i,      \i
         sqadd           \i,      \i,      v2.8h
 .endr
 .endm
 
 .macro def_horz_16 scale=0, identity=0, shift=2, suffix
 function inv_txfm_horz\suffix\()_16x8_neon
-        AARCH64_VALID_CALL_TARGET
         mov             x14, x30
         movi            v7.8h,  #0
 .if \identity
         mov             w16, #2*(5793-4096)*8
         dup             v0.4h,   w16
 .elseif \scale
         mov             w16, #2896*8
         dup             v0.4h,   w16
@@ -1400,32 +1399,32 @@ function inv_txfm_horz\suffix\()_16x8_ne
 .endif
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
         transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
 
 .irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
         st1             {\i}, [x6], #16
 .endr
 
-        ret             x14
+        br              x14
 endfunc
 .endm
 
 def_horz_16 scale=0, identity=0, shift=2
 def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
 def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
 
 function inv_txfm_add_vert_8x16_neon
         mov             x14, x30
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x7], x8
 .endr
         blr             x5
         load_add_store_8x16 x6, x7
-        ret             x14
+        br              x14
 endfunc
 
 function inv_txfm_add_16x16_neon
         mov             x15, x30
         sub             sp,  sp,  #512
 .irp i, 0, 8
         add             x6,  sp,  #(\i*16*2)
 .if \i == 8
@@ -1449,17 +1448,17 @@ 2:
 .irp i, 0, 8
         add             x6,  x0,  #(\i)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #32
         bl              inv_txfm_add_vert_8x16_neon
 .endr
 
         add             sp,  sp,  #512
-        ret             x15
+        br              x15
 endfunc
 
 .macro def_fn_16x16 txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         16,  16,  2
 .endif
 .ifc \txfm1, identity
@@ -1549,17 +1548,17 @@ function inv_txfm_\variant\()add_16x4_ne
         srshr           v18.8h,  v26.8h,  #1
         srshr           v19.8h,  v27.8h,  #1
 .endif
         transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
         blr             x5
         add             x6,  x0,  #8
         load_add_store_8x4 x6, x7
 
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_\variant\()add_4x16_neon
         mov             x15, x30
         movi            v2.8h,   #0
 
         mov             x11, #32
         cmp             w3,  w13
@@ -1617,17 +1616,17 @@ 2:
         ins             v21.d[0], v17.d[1]
         ins             v22.d[0], v18.d[1]
         ins             v23.d[0], v19.d[1]
 
         blr             x5
 
         load_add_store_4x16 x0, x6
 
-        ret             x15
+        br              x15
 endfunc
 .endm
 
 def_fn_416_base
 def_fn_416_base identity_
 
 .macro def_fn_416 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1727,17 +1726,17 @@ function inv_txfm_\variant\()add_16x8_ne
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 
         blr             x5
 
         add             x0,  x0,  #8
         load_add_store_8x8 x0, x7
 
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_\variant\()add_8x16_neon
         mov             x15, x30
         movi            v4.8h,   #0
         mov             w16, #2896*8
         dup             v0.4h,   w16
         mov             x11, #32
@@ -1800,17 +1799,17 @@ 2:
 .endif
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 
         blr             x5
 
         load_add_store_8x16 x0, x6
 
-        ret             x15
+        br              x15
 endfunc
 .endm
 
 def_fn_816_base
 def_fn_816_base identity_
 
 .macro def_fn_816 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -2085,17 +2084,17 @@ function inv_txfm_horz\suffix\()_dct_32x
         store2          v30.8h,  v22.8h, \shift
         store2          v29.8h,  v21.8h, \shift
         store2          v28.8h,  v20.8h, \shift
         store2          v27.8h,  v19.8h, \shift
         store2          v26.8h,  v18.8h, \shift
         store2          v25.8h,  v17.8h, \shift
         store2          v24.8h,  v16.8h, \shift
 .purgem store2
-        ret             x14
+        br              x14
 endfunc
 .endm
 
 def_horz_32 scale=0, shift=2
 def_horz_32 scale=1, shift=1, suffix=_scale
 
 function inv_txfm_add_vert_dct_8x32_neon
         mov             x14, x30
@@ -2159,17 +2158,17 @@ function inv_txfm_add_vert_dct_8x32_neon
         combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
         sub             x7,  x7,  x8
         combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
         combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
         combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
         combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
 .purgem combine
 
-        ret             x14
+        br              x14
 endfunc
 
 const eob_32x32
         .short 36, 136, 300, 1024
 endconst
 
 const eob_16x32
         .short 36, 151, 279, 512
@@ -2370,17 +2369,17 @@ 3:
 .irp i, 0, 8, 16, 24
         add             x6,  x0,  #(\i)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #32*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 .endr
 
         add             sp,  sp,  #2048
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
         idct_dc         16,  32,  1
 
         mov             x15, x30
         sub             sp,  sp,  #1024
         movrel          x13, eob_16x32
@@ -2419,17 +2418,17 @@ 3:
 .irp i, 0, 8
         add             x6,  x0,  #(\i)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #16*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 .endr
 
         add             sp,  sp,  #1024
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
         idct_dc         32,  16,  1
 
         mov             x15, x30
         sub             sp,  sp,  #1024
 
@@ -2464,17 +2463,17 @@ 3:
 .irp i, 0, 8, 16, 24
         add             x6,  x0,  #(\i)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #32*2
         bl              inv_txfm_add_vert_8x16_neon
 .endr
 
         add             sp,  sp,  #1024
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
         idct_dc         8,   32, 2
 
         mov             x15, x30
         sub             sp,  sp,  #512
 
@@ -2521,17 +2520,17 @@ 2:
 
 3:
         mov             x6,  x0
         mov             x7,  sp
         mov             x8,  #8*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 
         add             sp,  sp,  #512
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
         idct_dc         32,  8,   2
 
         mov             x15, x30
         sub             sp,  sp,  #512
 
@@ -2555,17 +2554,17 @@ 1:
 
         cmp             w9,  #32
 
         load_add_store_8x8 x6, x7
 
         b.lt            1b
 
         add             sp,  sp,  #512
-        ret             x15
+        br              x15
 endfunc
 
 function inv_dct64_step1_neon
         // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
         // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
         // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
         // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
 
@@ -2882,17 +2881,17 @@ function inv_txfm_dct\suffix\()_8h_x64_n
         scale_if        \scale, v0.h[0], v16, v17, v18, v19
         bl              inv_dct64_step1_neon
 
         sub             x6,  x6,  #2*8*32
         add             x9,  x6,  #2*8*7
 
         bl              inv_dct64_step2_neon
 
-        ret             x14
+        br              x14
 endfunc
 .endm
 
 def_dct64_func
 def_dct64_func _clear, clear=1
 def_dct64_func _clear_scale, clear=1, scale=1
 
 
@@ -2939,17 +2938,17 @@ 1:
 .purgem store_addsub
         sub             x6,  x6,  x10, lsl #3
         sub             x9,  x9,  x10, lsl #3
         add             x6,  x6,  #16
         sub             x9,  x9,  #16
 
         cmp             x7,  x8
         b.lt            1b
-        ret             x14
+        br              x14
 endfunc
 
 function inv_txfm_add_vert_dct_8x64_neon
         mov             x14, x30
         lsl             x8,  x8,  #1
 
         mov             x7,  sp
         add             x8,  sp,  #2*8*(64 - 4)
@@ -2995,17 +2994,17 @@ 1:
         add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
         add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
         add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
         add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
 .purgem add_dest_addsub
         cmp             x7,  x8
         b.lt            1b
 
-        ret             x14
+        br              x14
 endfunc
 
 function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
         idct_dc         64,  64,  2
 
         mov             x15, x30
 
         sub_sp          64*32*2+64*8*2
@@ -3049,17 +3048,17 @@ 3:
         add             x7,  x5,  #(\i*2)
         mov             x8,  #64*2
         bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
 
         add             sp,  x5,  #64*32*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
         idct_dc         64,  32,  1
 
         mov             x15, x30
 
         sub_sp          64*32*2+64*8*2
@@ -3102,17 +3101,17 @@ 3:
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x6,  x0,  #(\i)
         add             x7,  x5,  #(\i*2)
         mov             x8,  #64*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 .endr
 
         add             sp,  x5,  #64*32*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
         idct_dc         32,  64,  1
 
         mov             x15, x30
 
         sub_sp          32*32*2+64*8*2
@@ -3154,17 +3153,17 @@ 3:
         add             x7,  x5,  #(\i*2)
         mov             x8,  #32*2
         bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
 
         add             sp,  x5,  #32*32*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
         idct_dc         64,  16,  2
 
         mov             x15, x30
 
         sub_sp          64*16*2+64*8*2
@@ -3208,17 +3207,17 @@ 3:
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x6,  x0,  #(\i)
         add             x7,  x4,  #(\i*2)
         mov             x8,  #64*2
         bl              inv_txfm_add_vert_8x16_neon
 .endr
 
         add             sp,  x4,  #64*16*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
         idct_dc         16,  64,  2
 
         mov             x15, x30
 
         sub_sp          16*32*2+64*8*2
@@ -3261,10 +3260,10 @@ 3:
         add             x7,  x5,  #(\i*2)
         mov             x8,  #16*2
         bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
 
         add             sp,  x5,  #16*32*2
-        ret             x15
+        br              x15
 endfunc
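
Every hunk in itx.S above reverts `ret x15` (and `ret x14`) back to `br x15`/`br x14`. Both forms branch to the address held in the register; `ret` with an explicit operand additionally hints the return-address predictor and is exempt from BTI landing-pad checks, which is presumably why the backed-out dav1d update preferred it. A sketch of the calling pattern these helpers rely on, with a hypothetical function name (illustrative only):

        // Illustrative only: the return address is parked in x15 because the
        // body makes nested "bl" calls that clobber x30.
function sketch_txfm_add_neon_example
        mov             x15, x30              // save the link register
        // ... bl inv_txfm_add_vert_8x16_neon, NEON work, stack restore ...
        br              x15                   // plain indirect branch back to the
                                              // caller ("ret x15" is the hinted form)
endfunc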
--- a/third_party/dav1d/src/arm/64/itx16.S
+++ b/third_party/dav1d/src/arm/64/itx16.S
@@ -411,17 +411,16 @@ endfunc
         srshr           v3.4s,  v3.4s,  #12
         sqadd           \r0\().4s,  v2.4s,   v6.4s
         sqsub           \r3\().4s,  v2.4s,   v6.4s
         sqadd           \r1\().4s,  v3.4s,   v7.4s
         sqsub           \r2\().4s,  v3.4s,   v7.4s
 .endm
 
 function inv_dct_4s_x4_neon
-        AARCH64_VALID_CALL_TARGET
         movrel          x16, idct_coeffs
         ld1             {v0.4s}, [x16]
         idct_4          v16, v17, v18, v19
         ret
 endfunc
 
 .macro iadst_4x4 o0, o1, o2, o3
         movrel          x16, iadst4_coeffs
@@ -445,29 +444,26 @@ endfunc
 
         srshr           \o0\().4s, \o0\().4s, #12
         srshr           \o2\().4s, \o2\().4s, #12
         srshr           \o1\().4s, \o1\().4s, #12
         srshr           \o3\().4s, \o3\().4s, #12
 .endm
 
 function inv_adst_4s_x4_neon
-        AARCH64_VALID_CALL_TARGET
         iadst_4x4       v16, v17, v18, v19
         ret
 endfunc
 
 function inv_flipadst_4s_x4_neon
-        AARCH64_VALID_CALL_TARGET
         iadst_4x4       v19, v18, v17, v16
         ret
 endfunc
 
 function inv_identity_4s_x4_neon
-        AARCH64_VALID_CALL_TARGET
         movz            w16, #(5793-4096)*8, lsl #16
         dup             v0.2s,   w16
         sqrdmulh        v4.4s,   v16.4s,  v0.s[0]
         sqrdmulh        v5.4s,   v17.4s,  v0.s[0]
         sqrdmulh        v6.4s,   v18.4s,  v0.s[0]
         sqrdmulh        v7.4s,   v19.4s,  v0.s[0]
         sqadd           v16.4s,  v16.4s,  v4.4s
         sqadd           v17.4s,  v17.4s,  v5.4s
@@ -540,17 +536,17 @@ L(itx_4x4_end):
         usqadd          v1.8h,   v18.8h
         smin            v0.8h,   v0.8h,   v31.8h
         st1             {v0.d}[0], [x0], x1
         smin            v1.8h,   v1.8h,   v31.8h
         st1             {v0.d}[1], [x0], x1
         st1             {v1.d}[0], [x0], x1
         st1             {v1.d}[1], [x0], x1
 
-        ret             x15
+        br              x15
 endfunc
 
 .macro def_fn_4x4 txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
         cbnz            w3,  1f
@@ -625,17 +621,16 @@ def_fn_4x4 identity, flipadst
         sqadd           \r2\().4s,  \r4\().4s,  v4.4s    // out2
         sqsub           \r5\().4s,  \r4\().4s,  v4.4s    // out5
         sqadd           \r3\().4s,  \r6\().4s,  v2.4s    // out3
         sqsub           \r4\().4s,  \r6\().4s,  v2.4s    // out4
         mov             \r6\().16b, v6.16b               // out6
 .endm
 
 function inv_dct_4s_x8_neon
-        AARCH64_VALID_CALL_TARGET
         movrel          x16, idct_coeffs
         ld1             {v0.4s, v1.4s}, [x16]
         idct_8          v16, v17, v18, v19, v20, v21, v22, v23
         ret
 endfunc
 
 .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
         movrel          x16, iadst8_coeffs
@@ -702,29 +697,26 @@ endfunc
         srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
         srshr           \o4\().4s, v6.4s,  #12 // out4 (v20 or v19)
 
         sqneg           \o3\().4s, v2.4s     // out3
         sqneg           \o5\().4s, v3.4s     // out5
 .endm
 
 function inv_adst_4s_x8_neon
-        AARCH64_VALID_CALL_TARGET
         iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
         ret
 endfunc
 
 function inv_flipadst_4s_x8_neon
-        AARCH64_VALID_CALL_TARGET
         iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
         ret
 endfunc
 
 function inv_identity_4s_x8_neon
-        AARCH64_VALID_CALL_TARGET
         sqshl           v16.4s,  v16.4s,  #1
         sqshl           v17.4s,  v17.4s,  #1
         sqshl           v18.4s,  v18.4s,  #1
         sqshl           v19.4s,  v19.4s,  #1
         sqshl           v20.4s,  v20.4s,  #1
         sqshl           v21.4s,  v21.4s,  #1
         sqshl           v22.4s,  v22.4s,  #1
         sqshl           v23.4s,  v23.4s,  #1
@@ -787,17 +779,17 @@ 2:
         mov             v20.16b, v24.16b
         mov             v21.16b, v25.16b
         mov             v22.16b, v26.16b
         mov             v23.16b, v27.16b
 
         blr             x5
 
         load_add_store_8x8 x0, x7
-        ret             x15
+        br              x15
 endfunc
 
 .macro def_fn_8x8 txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         8,   8,   1
@@ -856,17 +848,17 @@ function inv_txfm_add_8x4_neon
         ins             v16.d[1], v20.d[0]
         ins             v17.d[1], v21.d[0]
         ins             v18.d[1], v22.d[0]
         ins             v19.d[1], v23.d[0]
 
         blr             x5
 
         load_add_store_8x4 x0, x7
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_4x8_neon
         movz            w16, #2896*8, lsl #16
         movi            v31.4s,  #0
         dup             v30.2s,  w16
 
         cmp             w3,  w13
@@ -905,17 +897,17 @@ 2:
         sqxtn           v17.4h,  v17.4s
         sqxtn           v18.4h,  v18.4s
         sqxtn           v19.4h,  v19.4s
         transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
 
         blr             x5
 
         load_add_store_4x8 x0, x7
-        ret             x15
+        br              x15
 endfunc
 
 .macro def_fn_48 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  0
@@ -948,17 +940,16 @@ def_fn_48 \w, \h, identity, adst, 16
 def_fn_48 \w, \h, identity, flipadst, 16
 .endm
 
 def_fns_48 4, 8
 def_fns_48 8, 4
 
 
 function inv_dct_4s_x16_neon
-        AARCH64_VALID_CALL_TARGET
         movrel          x16, idct_coeffs
         ld1             {v0.4s, v1.4s}, [x16], #32
 
         idct_8          v16, v18, v20, v22, v24, v26, v28, v30
 
         ld1             {v0.4s, v1.4s}, [x16]
         sub             x16, x16, #32
 
@@ -1210,29 +1201,26 @@ endfunc
 
         sqneg           \o7\().4s,   v4.4s // out7
         sqneg           \o5\().4s,   v5.4s // out5
         sqneg           \o11\().4s,  v6.4s // out11
         sqneg           \o9\().4s,   v7.4s // out9
 .endm
 
 function inv_adst_4s_x16_neon
-        AARCH64_VALID_CALL_TARGET
         iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
         ret
 endfunc
 
 function inv_flipadst_4s_x16_neon
-        AARCH64_VALID_CALL_TARGET
         iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
         ret
 endfunc
 
 function inv_identity_4s_x16_neon
-        AARCH64_VALID_CALL_TARGET
         movz            w16, #2*(5793-4096)*8, lsl #16
         dup             v0.2s,   w16
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         sqrdmulh        v2.4s,      v\i\().4s,  v0.s[0]
         sqadd           v\i\().4s,  v\i\().4s,  v\i\().4s
         sqadd           v\i\().4s,  v\i\().4s,  v2.4s
 .endr
         ret
@@ -1289,31 +1277,31 @@ function inv_txfm_horz\suffix\()_16x4_ne
         sqrshrn2        v23.8h,  v31.4s,  #\shift
         transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
         transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7
 
 .irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
         st1             {\i}, [x6], #16
 .endr
 
-        ret             x14
+        br              x14
 endfunc
 .endm
 
 def_horz_16 scale=0, shift=2
 def_horz_16 scale=1, shift=1, suffix=_scale
 
 function inv_txfm_add_vert_8x16_neon
         mov             x14, x30
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x7], x8
 .endr
         blr             x5
         load_add_store_8x16 x6, x7
-        ret             x14
+        br              x14
 endfunc
 
 function inv_txfm_add_16x16_neon
         mov             x15, x30
         sub             sp,  sp,  #512
         ldrh            w12, [x13], #2
 .irp i, 0, 4, 8, 12
         add             x6,  sp,  #(\i*16*2)
@@ -1345,17 +1333,17 @@ 3:
 .irp i, 0, 8
         add             x6,  x0,  #(\i*2)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #32
         bl              inv_txfm_add_vert_8x16_neon
 .endr
 
         add             sp,  sp,  #512
-        ret             x15
+        br              x15
 endfunc
 
 const eob_16x16
         .short 10, 36, 78, 256
 endconst
 
 const eob_16x16_identity
         .short 4, 8, 12, 256
@@ -1430,17 +1418,17 @@ function inv_txfm_add_16x4_neon
         sqrshrn2        v17.8h,  v29.4s,  #1
         sqrshrn2        v18.8h,  v30.4s,  #1
         sqrshrn2        v19.8h,  v31.4s,  #1
         transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
         blr             x5
         add             x6,  x0,  #16
         load_add_store_8x4 x6, x7
 
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_4x16_neon
         ldrh            w12, [x13, #4]
         mov             x15, x30
 
         mov             x11, #64
 
@@ -1524,17 +1512,17 @@ 2:
         sqrshrn         v18.4h,  v18.4s,  #1
         sqrshrn         v19.4h,  v19.4s,  #1
         transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
 
         blr             x5
 
         load_add_store_4x16 x0, x6
 
-        ret             x15
+        br              x15
 endfunc
 
 const eob_4x16
         .short 13, 29, 45, 64
 endconst
 
 const eob_4x16_identity1
         .short 16, 32, 48, 64
@@ -1705,17 +1693,17 @@ 2:
 
         add             x0,  x0,  #16
         load_add_store_8x8 x0, x7
 
         ldp             d14, d15, [sp, #0x30]
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_8x16_neon
         mov             x15, x30
         stp             d8,  d9,  [sp, #-0x20]!
         stp             d10, d11, [sp, #0x10]
         ldrh            w12, [x13, #4]
 
@@ -1846,17 +1834,17 @@ 2:
 
         blr             x5
 
         load_add_store_8x16 x0, x6
 
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x20
 
-        ret             x15
+        br              x15
 endfunc
 
 const eob_8x16
         .short 10, 43, 75, 128
 endconst
 
 const eob_8x16_identity1
         .short 4, 64, 96, 128
@@ -2148,17 +2136,17 @@ function inv_txfm_horz\suffix\()_dct_32x
         st1             {v2.8h, v3.8h}, [x6], #32
 .endm
 
         store2          v31.4s,  v27.4s,  v23.4s,  v19.4s,  \shift
         store2          v30.4s,  v26.4s,  v22.4s,  v18.4s,  \shift
         store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
         store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
 .purgem store2
-        ret             x14
+        br              x14
 endfunc
 .endm
 
 def_horz_32 scale=0, shift=2
 def_horz_32 scale=1, shift=1, suffix=_scale
 
 function inv_txfm_add_vert_dct_8x32_neon
         mov             x14, x30
@@ -2223,17 +2211,17 @@ function inv_txfm_add_vert_dct_8x32_neon
         combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
         sub             x7,  x7,  x8
         combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
         combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
         combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
         combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
 .purgem combine
 
-        ret             x14
+        br              x14
 endfunc
 
 const eob_32x32
         .short 10, 36, 78, 136, 210, 300, 406, 1024
 endconst
 
 const eob_16x32
         .short 10, 36, 78, 151, 215, 279, 343, 512
@@ -2540,17 +2528,17 @@ 3:
 .irp i, 0, 8, 16, 24
         add             x6,  x0,  #(\i*2)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #32*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 .endr
 
         add             sp,  sp,  #2048
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
         idct_dc         16,  32,  1
 
         mov             x15, x30
         sub             sp,  sp,  #1024
         movrel          x13, eob_16x32
@@ -2589,17 +2577,17 @@ 3:
 .irp i, 0, 8
         add             x6,  x0,  #(\i*2)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #16*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 .endr
 
         add             sp,  sp,  #1024
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
         idct_dc         32,  16,  1
 
         mov             x15, x30
         sub             sp,  sp,  #1024
 
@@ -2639,17 +2627,17 @@ 3:
 .irp i, 0, 8, 16, 24
         add             x6,  x0,  #(\i*2)
         add             x7,  sp,  #(\i*2)
         mov             x8,  #32*2
         bl              inv_txfm_add_vert_8x16_neon
 .endr
 
         add             sp,  sp,  #1024
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
         idct_dc         8,   32, 2
 
         mov             x15, x30
         sub             sp,  sp,  #512
 
@@ -2699,17 +2687,17 @@ 2:
 
 3:
         mov             x6,  x0
         mov             x7,  sp
         mov             x8,  #8*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 
         add             sp,  sp,  #512
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
         idct_dc         32,  8,   2
 
         mov             x15, x30
         sub             sp,  sp,  #512
 
@@ -2750,17 +2738,17 @@ 1:
 
         cmp             w9,  #32
 
         load_add_store_8x8 x6, x7
 
         b.lt            1b
 
         add             sp,  sp,  #512
-        ret             x15
+        br              x15
 endfunc
 
 function inv_dct64_step1_neon
         // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
         // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
         // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
         // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
 
@@ -3077,17 +3065,17 @@ function inv_txfm_dct\suffix\()_4s_x64_n
         scale_if        \scale, v0.s[0], v16, v17, v18, v19
         bl              inv_dct64_step1_neon
 
         sub             x6,  x6,  #4*4*32
         add             x9,  x6,  #4*4*7
 
         bl              inv_dct64_step2_neon
 
-        ret             x14
+        br              x14
 endfunc
 .endm
 
 def_dct64_func _clear, clear=1
 def_dct64_func _clear_scale, clear=1, scale=1
 
 
 function inv_txfm_horz_dct_64x4_neon
@@ -3134,17 +3122,17 @@ 1:
 .purgem store_addsub
         sub             x6,  x6,  x10, lsl #2
         sub             x9,  x9,  x10, lsl #2
         add             x6,  x6,  #16
         sub             x9,  x9,  #16
 
         cmp             x7,  x8
         b.lt            1b
-        ret             x14
+        br              x14
 endfunc
 
 function inv_txfm_add_vert_dct_8x64_neon
         mov             x14, x30
         lsl             x8,  x8,  #1
 
         mov             x7,  sp
         add             x8,  sp,  #2*8*(64 - 4)
@@ -3191,17 +3179,17 @@ 1:
         add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
         add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
         add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
         add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
 .purgem add_dest_addsub
         cmp             x7,  x8
         b.lt            1b
 
-        ret             x14
+        br              x14
 endfunc
 
 function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
         idct_dc         64,  64,  2
 
         mov             x15, x30
 
         sub_sp          64*32*2+64*4*4
@@ -3245,17 +3233,17 @@ 3:
         add             x7,  x5,  #(\i*2)
         mov             x8,  #64*2
         bl              X(inv_txfm_dct_8h_x64_neon)
         add             x6,  x0,  #(\i*2)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
 
         add             sp,  x5,  #64*32*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
         idct_dc         64,  32,  1
 
         mov             x15, x30
 
         sub_sp          64*32*2+64*4*4
@@ -3298,17 +3286,17 @@ 3:
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x6,  x0,  #(\i*2)
         add             x7,  x5,  #(\i*2)
         mov             x8,  #64*2
         bl              inv_txfm_add_vert_dct_8x32_neon
 .endr
 
         add             sp,  x5,  #64*32*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
         idct_dc         32,  64,  1
 
         mov             x15, x30
 
         sub_sp          32*32*2+64*8*2
@@ -3348,17 +3336,17 @@ 3:
         add             x7,  x5,  #(\i*2)
         mov             x8,  #32*2
         bl              X(inv_txfm_dct_8h_x64_neon)
         add             x6,  x0,  #(\i*2)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
 
         add             sp,  x5,  #32*32*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
         idct_dc         64,  16,  2
 
         mov             x15, x30
 
         sub_sp          64*16*2+64*4*4
@@ -3402,17 +3390,17 @@ 3:
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x6,  x0,  #(\i*2)
         add             x7,  x4,  #(\i*2)
         mov             x8,  #64*2
         bl              inv_txfm_add_vert_8x16_neon
 .endr
 
         add             sp,  x4,  #64*16*2
-        ret             x15
+        br              x15
 endfunc
 
 function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
         idct_dc         16,  64,  2
 
         mov             x15, x30
 
         sub_sp          16*32*2+64*8*2
@@ -3455,10 +3443,10 @@ 3:
         add             x7,  x5,  #(\i*2)
         mov             x8,  #16*2
         bl              X(inv_txfm_dct_8h_x64_neon)
         add             x6,  x0,  #(\i*2)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
 
         add             sp,  x5,  #16*32*2
-        ret             x15
+        br              x15
 endfunc
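
The itx16.S hunks above likewise drop AARCH64_VALID_CALL_TARGET from the transform kernels that are invoked indirectly through `blr x5` (inv_dct_4s_x4_neon, inv_adst_4s_x16_neon, and so on). An indirectly called function uses the other flavour of landing pad; a rough sketch with a hypothetical kernel name (assumed semantics, not the dav1d macro itself):

        // Illustrative only: a kernel dispatched via "blr x5" would, with BTI
        // enabled, begin with a call landing pad rather than a jump landing pad.
sketch_inv_kernel_neon:
        hint            #34                   // "bti c": valid target for indirect
                                              // calls (blr)
        // ... transform body ...
        ret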
--- a/third_party/dav1d/src/arm/64/loopfilter.S
+++ b/third_party/dav1d/src/arm/64/loopfilter.S
@@ -473,26 +473,26 @@ 1:
         bif             v10.16b, v28.16b, v15.16b // out q4
         bif             v11.16b, v29.16b, v15.16b // out q5
 .endif
 
         ret
 .if \wd == 16
 7:
         // Return to a shorter epilogue, writing only the inner 6 pixels
-        ret             x13
+        br              x13
 .endif
 .if \wd >= 8
 8:
         // Return to a shorter epilogue, writing only the inner 4 pixels
-        ret             x14
+        br              x14
 .endif
 9:
         // Return directly without writing back any pixels
-        ret             x15
+        br              x15
 endfunc
 .endm
 
 loop_filter 16
 loop_filter 8
 loop_filter 6
 loop_filter 4
 
@@ -527,17 +527,17 @@ function lpf_v_4_16_neon
         lpf_16_wd4
 
         sub             x16, x0,  x1, lsl #1
         st1             {v22.16b}, [x16], x1 // p1
         st1             {v24.16b}, [x0],  x1 // q0
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_4_16_neon
         mov             x15, x30
         sub             x16, x0,  #2
         add             x0,  x16, x1, lsl #3
         ld1             {v22.s}[0], [x16], x1
         ld1             {v22.s}[2], [x0],  x1
@@ -578,17 +578,17 @@ function lpf_h_4_16_neon
         st1             {v22.s}[3], [x0],  x1
         st1             {v23.s}[1], [x16], x1
         st1             {v23.s}[3], [x0],  x1
         st1             {v24.s}[1], [x16], x1
         st1             {v24.s}[3], [x0],  x1
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_v_6_16_neon
         mov             x15, x30
         sub             x16, x0,  x1, lsl #1
         sub             x16, x16, x1
         ld1             {v21.16b}, [x16], x1 // p2
         ld1             {v24.16b}, [x0],  x1 // q0
@@ -602,17 +602,17 @@ function lpf_v_6_16_neon
         lpf_16_wd6
 
         sub             x16, x0,  x1, lsl #1
         st1             {v22.16b}, [x16], x1 // p1
         st1             {v24.16b}, [x0],  x1 // q0
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_6_16_neon
         mov             x15, x30
         sub             x16, x0,  #4
         add             x0,  x16, x1, lsl #3
         ld1             {v20.d}[0], [x16], x1
         ld1             {v20.d}[1], [x0],  x1
@@ -653,17 +653,17 @@ function lpf_h_6_16_neon
         st1             {v22.s}[3], [x0],  x1
         st1             {v23.s}[1], [x16], x1
         st1             {v23.s}[3], [x0],  x1
         st1             {v24.s}[1], [x16], x1
         st1             {v24.s}[3], [x0],  x1
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_v_8_16_neon
         mov             x15, x30
         sub             x16, x0,  x1, lsl #2
         ld1             {v20.16b}, [x16], x1 // p3
         ld1             {v24.16b}, [x0],  x1 // q0
         ld1             {v21.16b}, [x16], x1 // p2
@@ -681,26 +681,26 @@ function lpf_v_8_16_neon
         st1             {v21.16b}, [x16], x1 // p2
         st1             {v24.16b}, [x0],  x1 // q0
         st1             {v22.16b}, [x16], x1 // p1
         st1             {v25.16b}, [x0],  x1 // q1
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v26.16b}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        ret             x15
+        br              x15
 
 8:
         sub             x16, x0,  x1, lsl #1
         st1             {v22.16b}, [x16], x1 // p1
         st1             {v24.16b}, [x0],  x1 // q0
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_8_16_neon
         mov             x15, x30
         sub             x16, x0,  #4
         add             x0,  x16, x1, lsl #3
         ld1             {v20.d}[0], [x16], x1
         ld1             {v20.d}[1], [x0],  x1
@@ -741,17 +741,17 @@ function lpf_h_8_16_neon
         st1             {v24.d}[1], [x0],  x1
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         st1             {v26.d}[0], [x16], x1
         st1             {v26.d}[1], [x0],  x1
         st1             {v27.d}[0], [x16], x1
         st1             {v27.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        ret             x15
+        br              x15
 8:
         sub             x16, x0,  x1, lsl #4
         sub             x16, x16, #2
         transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
         add             x0,  x16, x1, lsl #3
 
         st1             {v22.s}[0], [x16], x1
         st1             {v22.s}[2], [x0],  x1
@@ -765,17 +765,17 @@ 8:
         st1             {v22.s}[3], [x0],  x1
         st1             {v23.s}[1], [x16], x1
         st1             {v23.s}[3], [x0],  x1
         st1             {v24.s}[1], [x16], x1
         st1             {v24.s}[3], [x0],  x1
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_v_16_16_neon
         mov             x15, x30
 
         sub             x16, x0,  x1, lsl #3
         add             x16, x16, x1
         ld1             {v17.16b}, [x16], x1 // p6
@@ -808,38 +808,38 @@ function lpf_v_16_16_neon
         st1             {v3.16b},  [x16], x1 // p2
         st1             {v9.16b},  [x0],  x1 // q3
         st1             {v4.16b},  [x16], x1 // p1
         st1             {v10.16b}, [x0],  x1 // q4
         st1             {v5.16b},  [x16], x1 // p0
         st1             {v11.16b}, [x0],  x1 // q5
         sub             x0,  x0,  x1, lsl #2
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 7:
         sub             x16, x0,  x1
         sub             x16, x16, x1, lsl #1
         st1             {v21.16b}, [x16], x1 // p2
         st1             {v24.16b}, [x0],  x1 // q0
         st1             {v22.16b}, [x16], x1 // p1
         st1             {v25.16b}, [x0],  x1 // q1
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v26.16b}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        ret             x15
+        br              x15
 
 8:
         sub             x16, x0,  x1, lsl #1
         st1             {v22.16b}, [x16], x1 // p1
         st1             {v24.16b}, [x0],  x1 // q0
         st1             {v23.16b}, [x16], x1 // p0
         st1             {v25.16b}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_16_16_neon
         mov             x15, x30
         sub             x16, x0,  #8
         ld1             {v16.d}[0], [x16], x1
         ld1             {v24.d}[0], [x0],  x1
         ld1             {v17.d}[0], [x16], x1
@@ -911,17 +911,17 @@ function lpf_h_16_16_neon
         st1             {v2.d}[1],  [x16], x1
         st1             {v10.d}[1], [x0],  x1
         st1             {v3.d}[1],  [x16], x1
         st1             {v11.d}[1], [x0],  x1
         st1             {v4.d}[1],  [x16], x1
         st1             {v30.d}[1], [x0],  x1
         st1             {v5.d}[1],  [x16], x1
         st1             {v31.d}[1], [x0],  x1
-        ret             x15
+        br              x15
 
 7:
         sub             x16, x0,  x1, lsl #4
         sub             x16, x16, #4
         transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
         add             x0,  x16, x1, lsl #3
 
         st1             {v20.d}[0], [x16], x1
@@ -936,17 +936,17 @@ 7:
         st1             {v24.d}[1], [x0],  x1
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         st1             {v26.d}[0], [x16], x1
         st1             {v26.d}[1], [x0],  x1
         st1             {v27.d}[0], [x16], x1
         st1             {v27.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        ret             x15
+        br              x15
 8:
         sub             x16, x0,  x1, lsl #4
         sub             x16, x16, #2
         transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
         add             x0,  x16, x1, lsl #3
 
         st1             {v22.s}[0], [x16], x1
         st1             {v22.s}[2], [x0],  x1
@@ -960,17 +960,17 @@ 8:
         st1             {v22.s}[3], [x0],  x1
         st1             {v23.s}[1], [x16], x1
         st1             {v23.s}[3], [x0],  x1
         st1             {v24.s}[1], [x16], x1
         st1             {v24.s}[3], [x0],  x1
         st1             {v25.s}[1], [x16], x1
         st1             {v25.s}[3], [x0],  x1
         add             x0,  x0,  #2
-        ret             x15
+        br              x15
 endfunc
 
 // void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                                 const uint32_t *const vmask,
 //                                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
 //                                 const Av1FilterLUT *lut, const int w)
 
 .macro lpf_func dir, type
@@ -1091,17 +1091,17 @@ 8:
         // For dir h, x0 is returned incremented
 .endif
         cbnz            w6,  1b
 
         ldp             d14, d15, [sp, #0x30]
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
-        ret             x11
+        br              x11
 endfunc
 .endm
 
 lpf_func v, y
 lpf_func h, y
 lpf_func v, uv
 lpf_func h, uv
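
The loopfilter.S changes above (mirrored in loopfilter16.S below) touch the same return idiom in a slightly richer form: the shared filter core keeps up to three continuation addresses so it can bail out to progressively shorter epilogues in its caller, writing six pixels, four pixels, or nothing at all. A schematic of that control flow with hypothetical condition flags in w2 (the actual register setup lives in the lpf_* wrapper macros, which are not part of these hunks):

        // Illustrative only: x13/x14 point at shorter write-back epilogues in
        // the caller, x15 holds the caller's own return address.
        cbz             w2,  9f               // nothing changed: write back nothing
        tbz             w2,  #0,  8f          // only the narrow filter applied
        tbz             w2,  #1,  7f          // only the medium filter applied
        ret                                   // full write-back: normal return
7:      br              x13                   // caller's 6-pixel epilogue
8:      br              x14                   // caller's 4-pixel epilogue
9:      br              x15                   // return straight to the caller's caller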
 
--- a/third_party/dav1d/src/arm/64/loopfilter16.S
+++ b/third_party/dav1d/src/arm/64/loopfilter16.S
@@ -359,26 +359,26 @@ 1:
         bif             v10.16b, v28.16b, v15.16b // out q4
         bif             v11.16b, v29.16b, v15.16b // out q5
 .endif
 
         ret
 .if \wd == 16
 7:
         // Return to a shorter epilogue, writing only the inner 6 pixels
-        ret             x13
+        br              x13
 .endif
 .if \wd >= 8
 8:
         // Return to a shorter epilogue, writing only the inner 4 pixels
-        ret             x14
+        br              x14
 .endif
 9:
         // Return directly without writing back any pixels
-        ret             x15
+        br              x15
 endfunc
 .endm
 
 loop_filter 16
 loop_filter 8
 loop_filter 6
 loop_filter 4
 
@@ -413,17 +413,17 @@ function lpf_v_4_8_neon
         lpf_8_wd4
 
         sub             x16, x0,  x1, lsl #1
         st1             {v22.8h}, [x16], x1 // p1
         st1             {v24.8h}, [x0],  x1 // q0
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_4_8_neon
         mov             x15, x30
         sub             x16, x0,  #4
         add             x0,  x16, x1, lsl #2
         ld1             {v22.d}[0], [x16], x1
         ld1             {v22.d}[1], [x0],  x1
@@ -448,17 +448,17 @@ function lpf_h_4_8_neon
         st1             {v22.d}[1], [x0],  x1
         st1             {v23.d}[0], [x16], x1
         st1             {v23.d}[1], [x0],  x1
         st1             {v24.d}[0], [x16], x1
         st1             {v24.d}[1], [x0],  x1
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_v_6_8_neon
         mov             x15, x30
         sub             x16, x0,  x1, lsl #1
         sub             x16, x16, x1
         ld1             {v21.8h}, [x16], x1 // p2
         ld1             {v24.8h}, [x0],  x1 // q0
@@ -472,17 +472,17 @@ function lpf_v_6_8_neon
         lpf_8_wd6
 
         sub             x16, x0,  x1, lsl #1
         st1             {v22.8h}, [x16], x1 // p1
         st1             {v24.8h}, [x0],  x1 // q0
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_6_8_neon
         mov             x15, x30
         sub             x16, x0,  #8
         add             x0,  x16, x1, lsl #2
         ld1             {v20.8h}, [x16], x1
         ld1             {v24.8h}, [x0],  x1
@@ -507,17 +507,17 @@ function lpf_h_6_8_neon
         st1             {v22.d}[1], [x0],  x1
         st1             {v23.d}[0], [x16], x1
         st1             {v23.d}[1], [x0],  x1
         st1             {v24.d}[0], [x16], x1
         st1             {v24.d}[1], [x0],  x1
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_v_8_8_neon
         mov             x15, x30
         sub             x16, x0,  x1, lsl #2
         ld1             {v20.8h}, [x16], x1 // p3
         ld1             {v24.8h}, [x0],  x1 // q0
         ld1             {v21.8h}, [x16], x1 // p2
@@ -535,26 +535,26 @@ function lpf_v_8_8_neon
         st1             {v21.8h}, [x16], x1 // p2
         st1             {v24.8h}, [x0],  x1 // q0
         st1             {v22.8h}, [x16], x1 // p1
         st1             {v25.8h}, [x0],  x1 // q1
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v26.8h}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        ret             x15
+        br              x15
 
 8:
         sub             x16, x0,  x1, lsl #1
         st1             {v22.8h}, [x16], x1 // p1
         st1             {v24.8h}, [x0],  x1 // q0
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_8_8_neon
         mov             x15, x30
         sub             x16, x0,  #8
         add             x0,  x16, x1, lsl #2
         ld1             {v20.8h}, [x16], x1
         ld1             {v24.8h}, [x0],  x1
@@ -579,33 +579,33 @@ function lpf_h_8_8_neon
         st1             {v24.8h}, [x0],  x1
         st1             {v21.8h}, [x16], x1
         st1             {v25.8h}, [x0],  x1
         st1             {v22.8h}, [x16], x1
         st1             {v26.8h}, [x0],  x1
         st1             {v23.8h}, [x16], x1
         st1             {v27.8h}, [x0],  x1
         add             x0,  x0,  #8
-        ret             x15
+        br              x15
 8:
         sub             x16, x0,  x1, lsl #3
         sub             x16, x16, #4
         transpose_4x8h  v22, v23, v24, v25, v26, v27, v28, v29
         add             x0,  x16, x1, lsl #2
 
         st1             {v22.d}[0], [x16], x1
         st1             {v22.d}[1], [x0],  x1
         st1             {v23.d}[0], [x16], x1
         st1             {v23.d}[1], [x0],  x1
         st1             {v24.d}[0], [x16], x1
         st1             {v24.d}[1], [x0],  x1
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_v_16_8_neon
         mov             x15, x30
 
         sub             x16, x0,  x1, lsl #3
         add             x16, x16, x1
         ld1             {v17.8h}, [x16], x1 // p6
@@ -638,38 +638,38 @@ function lpf_v_16_8_neon
         st1             {v3.8h},  [x16], x1 // p2
         st1             {v9.8h},  [x0],  x1 // q3
         st1             {v4.8h},  [x16], x1 // p1
         st1             {v10.8h}, [x0],  x1 // q4
         st1             {v5.8h},  [x16], x1 // p0
         st1             {v11.8h}, [x0],  x1 // q5
         sub             x0,  x0,  x1, lsl #2
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 7:
         sub             x16, x0,  x1
         sub             x16, x16, x1, lsl #1
         st1             {v21.8h}, [x16], x1 // p2
         st1             {v24.8h}, [x0],  x1 // q0
         st1             {v22.8h}, [x16], x1 // p1
         st1             {v25.8h}, [x0],  x1 // q1
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v26.8h}, [x0],  x1 // q2
         sub             x0,  x0,  x1, lsl #1
         sub             x0,  x0,  x1
-        ret             x15
+        br              x15
 
 8:
         sub             x16, x0,  x1, lsl #1
         st1             {v22.8h}, [x16], x1 // p1
         st1             {v24.8h}, [x0],  x1 // q0
         st1             {v23.8h}, [x16], x1 // p0
         st1             {v25.8h}, [x0],  x1 // q1
         sub             x0,  x0,  x1, lsl #1
-        ret             x15
+        br              x15
 endfunc
 
 function lpf_h_16_8_neon
         mov             x15, x30
         sub             x16, x0,  #16
         ld1             {v16.8h}, [x16], x1
         ld1             {v24.8h}, [x0],  x1
         ld1             {v17.8h}, [x16], x1
@@ -709,50 +709,50 @@ function lpf_h_16_8_neon
         st1             {v2.8h},  [x16], x1
         st1             {v10.8h}, [x0],  x1
         st1             {v3.8h},  [x16], x1
         st1             {v11.8h}, [x0],  x1
         st1             {v4.8h},  [x16], x1
         st1             {v30.8h}, [x0],  x1
         st1             {v5.8h},  [x16], x1
         st1             {v31.8h}, [x0],  x1
-        ret             x15
+        br              x15
 
 7:
         sub             x16, x0,  x1, lsl #3
         sub             x16, x16, #8
         transpose_8x8h  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
         add             x0,  x16, x1, lsl #2
 
         st1             {v20.8h}, [x16], x1
         st1             {v24.8h}, [x0],  x1
         st1             {v21.8h}, [x16], x1
         st1             {v25.8h}, [x0],  x1
         st1             {v22.8h}, [x16], x1
         st1             {v26.8h}, [x0],  x1
         st1             {v23.8h}, [x16], x1
         st1             {v27.8h}, [x0],  x1
         add             x0,  x0,  #8
-        ret             x15
+        br              x15
 8:
         sub             x16, x0,  x1, lsl #3
         sub             x16, x16, #4
         transpose_4x8h  v22, v23, v24, v25, v26, v27, v28, v29
         add             x0,  x16, x1, lsl #2
 
         st1             {v22.d}[0], [x16], x1
         st1             {v22.d}[1], [x0],  x1
         st1             {v23.d}[0], [x16], x1
         st1             {v23.d}[1], [x0],  x1
         st1             {v24.d}[0], [x16], x1
         st1             {v24.d}[1], [x0],  x1
         st1             {v25.d}[0], [x16], x1
         st1             {v25.d}[1], [x0],  x1
         add             x0,  x0,  #4
-        ret             x15
+        br              x15
 endfunc
 
 // void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                                  const uint32_t *const vmask,
 //                                  const uint8_t (*l)[4], ptrdiff_t b4_stride,
 //                                  const Av1FilterLUT *lut, const int w,
 //                                  const int bitdepth_max)
 
@@ -887,17 +887,17 @@ 8:
         // For dir h, x0 is returned incremented
 .endif
         cbnz            w6,  1b
 
         ldp             d14, d15, [sp, #0x30]
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
-        ret             x11
+        br              x11
 endfunc
 .endm
 
 lpf_func v, y
 lpf_func h, y
 lpf_func v, uv
 lpf_func h, uv
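
In the looprestoration.S hunks that follow (and their 16 bpc counterparts), the backout restores the older dav1d_wiener_filter7 signature with a separate lpf_stride parameter. That makes edges the ninth integer argument, and under AAPCS64 only the first eight integer arguments are passed in registers, so the prologue regains its stack load (`ldr w8, [sp]`) and the later operands shift by one register (h in w6 instead of w5, the filter table in x7 instead of x6, the edge flags in w8 instead of w7). A summary of the resulting register layout, taken from the hunk below (illustrative only):

        // Restored argument placement for dav1d_wiener_filter7_8bpc_neon:
        //   x0 = p           x1 = p_stride    x2 = left      x3 = lpf
        //   x4 = lpf_stride  w5 = w           w6 = h         x7 = filter
        // The ninth argument does not fit in x0-x7 and is read from the
        // caller's first stack slot:
        ldr             w8,  [sp]             // edges (LR_HAVE_TOP/LR_HAVE_BOTTOM/...)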
 
--- a/third_party/dav1d/src/arm/64/looprestoration.S
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -39,26 +39,28 @@ right_ext_mask:
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 endconst
 
-// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
-//                                     const pixel (*left)[4], const pixel *lpf,
+// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t p_stride,
+//                                     const pixel (*left)[4],
+//                                     const pixel *lpf, const ptrdiff_t lpf_stride,
 //                                     const int w, int h,
 //                                     const int16_t filter[2][8],
 //                                     const enum LrEdgeFlags edges);
 function wiener_filter7_8bpc_neon, export=1
+        ldr             w8,  [sp]
         stp             x29, x30, [sp, #-16]!
         mov             x29, sp
-        ld1             {v0.8h, v1.8h},  [x6]
-        tst             w7,  #4               // LR_HAVE_TOP
+        ld1             {v0.8h, v1.8h},  [x7]
+        tst             w8,  #4               // LR_HAVE_TOP
         sub_sp          384*2*6
 
         mov             w17, #(1 << 14) - (1 << 2)
         dup             v30.8h,  w17
         movi            v31.8h,  #8, lsl #8
 
         // x9  - t6
         // x10 - t5
@@ -68,113 +70,115 @@ function wiener_filter7_8bpc_neon, expor
         // x14 - t1
         // x15 - t0
         mov             x14, sp               // t1
         b.eq            L(no_top_7)
 
         mov             x16, x2               // backup left
         mov             x2,  #0
         bl              wiener_filter7_h_8bpc_neon
-        add             x3,  x3,  x1          // lpf += stride
+        add             x3,  x3,  x4          // lpf += lpf_stride
         mov             x9,  x14              // t6
         mov             x10, x14              // t5
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_8bpc_neon
-        add             x3,  x3,  x1,  lsl #2
-        add             x3,  x3,  x1          // lpf += stride*5
+        add             x3,  x3,  x4,  lsl #2
+        add             x3,  x3,  x4          // lpf += lpf_stride*5
         mov             x11, x14              // t4
         add             x14, x14, #384*2      // t1 += 384*2
         mov             x2,  x16              // left
         mov             x16, x3               // backup lpf
         mov             x3,  x0               // lpf = p
         bl              wiener_filter7_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x12, x14              // t3
         mov             x13, x14              // t2
         b.eq            L(v1_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_8bpc_neon
         mov             x13, x14              // t2
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v3_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
 
 L(main_7):
         add             x15, x14, #384*2      // t0 = t1 + 384*2
 L(main_loop_7):
         bl              wiener_filter7_hv_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_loop_7)
-        tst             w7,  #8 // LR_HAVE_BOTTOM
+        tst             w8,  #8 // LR_HAVE_BOTTOM
         b.eq            L(v3_7)
 
         mov             x3,  x16              // restore lpf
         mov             x2,  #0               // left = NULL
+        sub             x4,  x4,  x1          // lpf_stride - p_stride
         bl              wiener_filter7_hv_8bpc_neon
+        add             x3,  x3,  x4          // src += lpf_stride - p_stride
         bl              wiener_filter7_hv_8bpc_neon
 L(v1_7):
         bl              wiener_filter7_v_8bpc_neon
 
         mov             sp,  x29
         ldp             x29, x30, [sp], #16
         ret
 
 L(no_top_7):
-        add             x3,  x3,  x1,  lsl #2
-        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
+        add             x3,  x3,  x4,  lsl #2
+        add             x16, x3,  x4,  lsl #1 // lpf += lpf_stride*6, backup
         mov             x3,  x0               // lpf = p
 
         bl              wiener_filter7_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x9,  x14              // t6
         mov             x10, x14              // t5
         mov             x11, x14              // t4
         mov             x12, x14              // t3
         mov             x13, x14              // t2
         b.eq            L(v1_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x13, x14              // t2
         b.eq            L(v2_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v3_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x15, x14, #384*2      // t0 = t1 + 384*2
         bl              wiener_filter7_hv_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v3_7)
         add             x15, x15, #384*2*4    // t0 += 384*2*4
         bl              wiener_filter7_hv_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_7)
 L(v3_7):
         bl              wiener_filter7_v_8bpc_neon
 L(v2_7):
         bl              wiener_filter7_v_8bpc_neon
         b               L(v1_7)
 endfunc
 
 
 function wiener_filter7_h_8bpc_neon
-        stp             x3,  x4,  [sp, #-32]!
+        stp             x3,  x5,  [sp, #-32]!
         str             x14,      [sp, #16]
 
         // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #3
         ld1             {v3.16b}, [x3], #16
         b               2f
 
@@ -199,39 +203,39 @@ 1:
         ext             v3.16b,  v2.16b,  v3.16b, #13
 
 2:
         ld1             {v4.8b}, [x3], #8
         uxtl            v2.8h,   v3.8b
         uxtl2           v3.8h,   v3.16b
         uxtl            v4.8h,   v4.8b
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #19
+        cmp             w5,  #19
         b.ge            4f   // If w >= 19, all used input pixels are valid
 
         // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
         // this ends up called again; it's not strictly needed in those
         // cases (we pad enough here), but keeping the code as simple as possible.
 
         // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
-        sub             w17, w4,  #22
+        sub             w17, w5,  #22
         // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -6
+        movrel          x7,  right_ext_mask, -6
         ldr             b28, [x3,  w17, sxtw]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v28.8h,  v28.h[0]
-        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
+        ld1             {v25.16b, v26.16b, v27.16b}, [x7]
 
         bit             v2.16b,  v28.16b, v25.16b
         bit             v3.16b,  v28.16b, v26.16b
         bit             v4.16b,  v28.16b, v27.16b
 
 4:      // Loop horizontally
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
@@ -271,42 +275,42 @@ 4:      // Loop horizontally
         sub             v23.8h,  v23.8h,  v30.8h
         sqadd           v6.8h,   v6.8h,   v22.8h
         sqadd           v7.8h,   v7.8h,   v23.8h
         sshr            v6.8h,   v6.8h,   #3
         sshr            v7.8h,   v7.8h,   #3
         add             v6.8h,   v6.8h,   v31.8h
         add             v7.8h,   v7.8h,   v31.8h
 
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v6.8h, v7.8h}, [x14], #32
 
         b.le            0f
         mov             v2.16b,  v4.16b
         ld1             {v4.16b}, [x3], #16
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         uxtl            v3.8h,   v4.8b
         uxtl2           v4.8h,   v4.16b
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
         ldr             x14,      [sp, #16]
-        ldp             x3,  x4,  [sp], #32
+        ldp             x3,  x5,  [sp], #32
         ret
 endfunc
 
 function wiener_filter7_v_8bpc_neon
         // Backing up/restoring registers shifted, so that x9 gets the value
         // of x10, etc, afterwards.
         stp             x10, x11, [sp, #-64]!
         stp             x12, x13, [sp, #16]
         stp             x14, x14, [sp, #32]
-        stp             x0,  x4,  [sp, #48]
+        stp             x0,  x5,  [sp, #48]
 1:
         ld1             {v20.8h, v21.8h}, [x11], #32
         ld1             {v24.8h, v25.8h}, [x13], #32
 
         ld1             {v18.8h, v19.8h}, [x10], #32
         add             v24.8h,  v24.8h,  v20.8h
         ld1             {v26.8h, v27.8h}, [x14], #32
 
@@ -336,40 +340,40 @@ 1:
         smlal2          v5.4s,   v29.8h,  v1.h[5]
         smlal2          v5.4s,   v17.8h,  v1.h[6]
         sqrshrun        v2.4h,   v2.4s,   #11
         sqrshrun2       v2.8h,   v3.4s,   #11
         sqrshrun        v3.4h,   v4.4s,   #11
         sqrshrun2       v3.8h,   v5.4s,   #11
         sqxtun          v2.8b,   v2.8h
         sqxtun2         v2.16b,  v3.8h
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
         st1             {v2.16b}, [x0], #16
         b.gt            1b
 
-        ldp             x0,  x4,  [sp, #48]
+        ldp             x0,  x5,  [sp, #48]
         ldp             x13, x14, [sp, #32]
         ldp             x11, x12, [sp, #16]
         ldp             x9,  x10, [sp], #64
 
         add             x0,  x0,  x1
         ret
 endfunc
 
 function wiener_filter7_hv_8bpc_neon
         // Backing up/restoring registers shifted, so that x9 gets the value
         // of x10, etc, and x15==x9, afterwards.
         stp             x10, x11, [sp, #-80]!
         stp             x12, x13, [sp, #16]
         stp             x14, x15, [sp, #32]
         stp             x10, x0,  [sp, #48]
-        stp             x3,  x4,  [sp, #64]
+        stp             x3,  x5,  [sp, #64]
 
         // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #3
         ld1             {v3.16b}, [x3], #16
         b               2f
 
@@ -393,39 +397,39 @@ 1:
         ext             v3.16b,  v2.16b,  v3.16b, #13
 
 2:
         ld1             {v4.8b}, [x3], #8
         uxtl            v2.8h,   v3.8b
         uxtl2           v3.8h,   v3.16b
         uxtl            v4.8h,   v4.8b
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #19
+        cmp             w5,  #19
         b.ge            4f   // If w >= 19, all used input pixels are valid
 
         // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
         // this ends up called again; it's not strictly needed in those
         // cases (we pad enough here), but keeping the code as simple as possible.
 
         // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
-        sub             w17, w4,  #22
+        sub             w17, w5,  #22
         // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -6
+        movrel          x7,  right_ext_mask, -6
         ldr             b28, [x3,  w17, sxtw]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v28.8h,  v28.h[0]
-        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
+        ld1             {v25.16b, v26.16b, v27.16b}, [x7]
 
         bit             v2.16b,  v28.16b, v25.16b
         bit             v3.16b,  v28.16b, v26.16b
         bit             v4.16b,  v28.16b, v27.16b
 
 4:      // Loop horizontally
         ext             v17.16b, v2.16b,  v3.16b, #4
         ext             v19.16b, v2.16b,  v3.16b, #8
@@ -501,52 +505,54 @@ 4:      // Loop horizontally
         smlal2          v21.4s,  v17.8h,  v1.h[6]
         sqrshrun        v18.4h,  v18.4s,  #11
         sqrshrun2       v18.8h,  v19.4s,  #11
         sqrshrun        v19.4h,  v20.4s,  #11
         sqrshrun2       v19.8h,  v21.4s,  #11
         st1             {v6.8h, v7.8h}, [x15], #32
         sqxtun          v18.8b,  v18.8h
         sqxtun2         v18.16b, v19.8h
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v18.16b}, [x0], #16
 
         b.le            0f
         mov             v2.16b,  v4.16b
         ld1             {v4.16b}, [x3], #16
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         uxtl            v3.8h,   v4.8b
         uxtl2           v4.8h,   v4.16b
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
-        ldp             x3,  x4,  [sp, #64]
+        ldp             x3,  x5,  [sp, #64]
         ldp             x15, x0,  [sp, #48]
         ldp             x13, x14, [sp, #32]
         ldp             x11, x12, [sp, #16]
         ldp             x9,  x10, [sp], #80
 
         add             x3,  x3,  x1
         add             x0,  x0,  x1
 
         ret
 endfunc
 
-// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
-//                                     const pixel (*left)[4], const pixel *lpf,
+// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t p_stride,
+//                                     const pixel (*left)[4],
+//                                     const pixel *lpf, const ptrdiff_t lpf_stride,
 //                                     const int w, int h,
 //                                     const int16_t filter[2][8],
 //                                     const enum LrEdgeFlags edges);
 function wiener_filter5_8bpc_neon, export=1
+        ldr             w8,  [sp]
         stp             x29, x30, [sp, #-16]!
         mov             x29, sp
-        ld1             {v0.8h, v1.8h},  [x6]
-        tst             w7,  #4               // LR_HAVE_TOP
+        ld1             {v0.8h, v1.8h},  [x7]
+        tst             w8,  #4               // LR_HAVE_TOP
         sub_sp          384*2*4
 
         mov             w17, #(1 << 14) - (1 << 2)
         dup             v30.8h,  w17
         movi            v31.8h,  #8, lsl #8
 
         // x11 - t4
         // x12 - t3
@@ -554,100 +560,102 @@ function wiener_filter5_8bpc_neon, expor
         // x14 - t1
         // x15 - t0
         mov             x14, sp               // t1
         b.eq            L(no_top_5)
 
         mov             x16, x2               // backup left
         mov             x2,  #0
         bl              wiener_filter5_h_8bpc_neon
-        add             x3,  x3,  x1          // lpf += stride
+        add             x3,  x3,  x4          // lpf += lpf_stride
         mov             x11, x14              // t4
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter5_h_8bpc_neon
-        add             x3,  x3,  x1,  lsl #2
-        add             x3,  x3,  x1          // lpf += stride*5
+        add             x3,  x3,  x4,  lsl #2
+        add             x3,  x3,  x4          // lpf += lpf_stride*5
         mov             x12, x14              // t3
         add             x14, x14, #384*2      // t1 += 384*2
         mov             x2,  x16              // left
         mov             x16, x3               // backup lpf
         mov             x3,  x0               // lpf = p
         bl              wiener_filter5_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x13, x14              // t2
         b.eq            L(v1_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter5_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
 
 L(main_5):
         mov             x15, x11              // t0 = t4
 L(main_loop_5):
         bl              wiener_filter5_hv_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_loop_5)
-        tst             w7,  #8 // LR_HAVE_BOTTOM
+        tst             w8,  #8 // LR_HAVE_BOTTOM
         b.eq            L(v2_5)
 
         mov             x3,  x16              // restore lpf
         mov             x2,  #0               // left = NULL
+        sub             x4,  x4,  x1          // lpf_stride - p_stride
         bl              wiener_filter5_hv_8bpc_neon
+        add             x3,  x3,  x4          // src += lpf_stride - p_stride
         bl              wiener_filter5_hv_8bpc_neon
 L(end_5):
 
         mov             sp,  x29
         ldp             x29, x30, [sp], #16
         ret
 
 L(no_top_5):
-        add             x3,  x3,  x1,  lsl #2
-        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
+        add             x3,  x3,  x4,  lsl #2
+        add             x16, x3,  x4,  lsl #1 // lpf += lpf_stride*6, backup
         mov             x3,  x0               // lpf = p
 
         bl              wiener_filter5_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x11, x14              // t4
         mov             x12, x14              // t3
         mov             x13, x14              // t2
         b.eq            L(v1_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter5_h_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x15, x14, #384*2      // t0 = t1 + 384*2
         bl              wiener_filter5_hv_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_5)
         add             x15, x15, #384*2*3    // t0 += 384*2*3
         bl              wiener_filter5_hv_8bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_5)
 L(v2_5):
         bl              wiener_filter5_v_8bpc_neon
         add             x0,  x0,  x1
         mov             x11, x12
         mov             x12, x13
         mov             x13, x14
 L(v1_5):
         bl              wiener_filter5_v_8bpc_neon
         b               L(end_5)
 endfunc
 
 
 function wiener_filter5_h_8bpc_neon
-        stp             x3,  x4,  [sp, #-32]!
+        stp             x3,  x5,  [sp, #-32]!
         str             x14,      [sp, #16]
 
         // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #2
         ld1             {v3.16b}, [x3], #16
         b               2f
 
@@ -672,39 +680,39 @@ 1:
         ext             v3.16b,  v2.16b,  v3.16b, #14
 
 2:
         ld1             {v4.8b}, [x3], #8
         uxtl            v2.8h,   v3.8b
         uxtl2           v3.8h,   v3.16b
         uxtl            v4.8h,   v4.8b
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #18
+        cmp             w5,  #18
         b.ge            4f   // If w >= 18, all used input pixels are valid
 
         // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
         // this ends up called again; it's not strictly needed in those
         // cases (we pad enough here), but keeping the code as simple as possible.
 
         // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
-        sub             w17, w4,  #23
+        sub             w17, w5,  #23
         // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -4
+        movrel          x7,  right_ext_mask, -4
         ldr             b28, [x3,  w17, sxtw]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v28.8h,  v28.h[0]
-        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
+        ld1             {v25.16b, v26.16b, v27.16b}, [x7]
 
         bit             v2.16b,  v28.16b, v25.16b
         bit             v3.16b,  v28.16b, v26.16b
         bit             v4.16b,  v28.16b, v27.16b
 
 4:      // Loop horizontally
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
@@ -735,39 +743,39 @@ 4:      // Loop horizontally
         sub             v23.8h,  v23.8h,  v30.8h
         sqadd           v6.8h,   v6.8h,   v22.8h
         sqadd           v7.8h,   v7.8h,   v23.8h
         sshr            v6.8h,   v6.8h,   #3
         sshr            v7.8h,   v7.8h,   #3
         add             v6.8h,   v6.8h,   v31.8h
         add             v7.8h,   v7.8h,   v31.8h
 
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v6.8h, v7.8h}, [x14], #32
 
         b.le            0f
         mov             v2.16b,  v4.16b
         ld1             {v4.16b}, [x3], #16
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         uxtl            v3.8h,   v4.8b
         uxtl2           v4.8h,   v4.16b
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
         ldr             x14,      [sp, #16]
-        ldp             x3,  x4,  [sp], #32
+        ldp             x3,  x5,  [sp], #32
         ret
 endfunc
 
 function wiener_filter5_v_8bpc_neon
         stp             x11, x12, [sp, #-48]!
         stp             x13, x14, [sp, #16]
-        stp             x0,  x4,  [sp, #32]
+        stp             x0,  x5,  [sp, #32]
 1:
         ld1             {v18.8h, v19.8h}, [x12], #32
         ld1             {v22.8h, v23.8h}, [x14], #32
         ld1             {v16.8h, v17.8h}, [x11], #32
 
         add             v24.8h,  v22.8h,  v18.8h
         ld1             {v20.8h, v21.8h}, [x13], #32
         add             v16.8h,  v22.8h,  v16.8h
@@ -787,37 +795,37 @@ 1:
         smlal2          v5.4s,   v25.8h,  v1.h[4]
         smlal2          v5.4s,   v17.8h,  v1.h[5]
         sqrshrun        v2.4h,   v2.4s,   #11
         sqrshrun2       v2.8h,   v3.4s,   #11
         sqrshrun        v3.4h,   v4.4s,   #11
         sqrshrun2       v3.8h,   v5.4s,   #11
         sqxtun          v2.8b,   v2.8h
         sqxtun2         v2.16b,  v3.8h
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
         st1             {v2.16b}, [x0], #16
         b.gt            1b
 
-        ldp             x0,  x4,  [sp, #32]
+        ldp             x0,  x5,  [sp, #32]
         ldp             x13, x14, [sp, #16]
         ldp             x11, x12, [sp], #48
 
         ret
 endfunc
 
 function wiener_filter5_hv_8bpc_neon
         // Backing up/restoring registers shifted, so that x11 gets the value
         // of x12, etc, and x15==x11, afterwards.
         stp             x12, x13, [sp, #-64]!
         stp             x14, x15, [sp, #16]
         stp             x12, x0,  [sp, #32]
-        stp             x3,  x4,  [sp, #48]
+        stp             x3,  x5,  [sp, #48]
 
         // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #2
         ld1             {v3.16b}, [x3], #16
         b               2f
 
@@ -841,39 +849,39 @@ 1:
         ext             v3.16b, v2.16b, v3.16b, #14
 
 2:
         ld1             {v4.8b}, [x3], #8
         uxtl            v2.8h,  v3.8b
         uxtl2           v3.8h,  v3.16b
         uxtl            v4.8h,  v4.8b
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #18
+        cmp             w5,  #18
         b.ge            4f   // If w >= 18, all used input pixels are valid
 
         // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
         // this ends up called again; it's not strictly needed in those
         // cases (we pad enough here), but keeping the code as simple as possible.
 
         // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
-        sub             w17, w4,  #23
+        sub             w17, w5,  #23
         // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -4
+        movrel          x7,  right_ext_mask, -4
         ldr             b28, [x3,  w17, sxtw]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v28.8h,  v28.h[0]
-        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
+        ld1             {v25.16b, v26.16b, v27.16b}, [x7]
 
         bit             v2.16b,  v28.16b, v25.16b
         bit             v3.16b,  v28.16b, v26.16b
         bit             v4.16b,  v28.16b, v27.16b
 
 4:      // Loop horizontally
 
         ext             v16.16b, v2.16b,  v3.16b, #2
@@ -931,31 +939,31 @@ 4:      // Loop horizontally
         smlal2          v21.4s,  v17.8h,  v1.h[5]
         sqrshrun        v18.4h,  v18.4s,  #11
         sqrshrun2       v18.8h,  v19.4s,  #11
         sqrshrun        v19.4h,  v20.4s,  #11
         sqrshrun2       v19.8h,  v21.4s,  #11
         st1             {v6.8h, v7.8h}, [x15], #32
         sqxtun          v18.8b,  v18.8h
         sqxtun2         v18.16b, v19.8h
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v18.16b}, [x0], #16
 
         b.le            0f
         mov             v2.16b,  v4.16b
         ld1             {v4.16b}, [x3], #16
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         uxtl            v3.8h,   v4.8b
         uxtl2           v4.8h,   v4.16b
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
-        ldp             x3,  x4,  [sp, #48]
+        ldp             x3,  x5,  [sp, #48]
         ldp             x15, x0,  [sp, #32]
         ldp             x13, x14, [sp, #16]
         ldp             x11, x12, [sp], #64
 
         add             x3,  x3,  x1
         add             x0,  x0,  x1
 
         ret
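
For reference, the calling convention that this backout restores can be summarized in C. The following is an illustrative sketch only, not part of the changeset: the prototype is copied from the function comments above, the LrEdgeFlags values are inferred from the flag tests in the assembly (tst #1 / #2 / #4 / #8), and the register mapping follows AAPCS64, where the first eight integer arguments travel in x0-x7 and later arguments arrive on the stack. That is why the restored prologues load edges with "ldr w8, [sp]", and why the 16 bpc variants additionally load bitdepth_max from [sp, #8] (or [sp, #4] on Apple arm64, whose ABI packs stack arguments to their natural alignment rather than 8-byte slots).

    #include <stddef.h>
    #include <stdint.h>

    typedef uint8_t pixel;   /* 8 bpc build; the 16 bpc variants use uint16_t */

    /* Values inferred from the flag tests in the assembly above. */
    enum LrEdgeFlags {
        LR_HAVE_LEFT   = 1 << 0,
        LR_HAVE_RIGHT  = 1 << 1,
        LR_HAVE_TOP    = 1 << 2,
        LR_HAVE_BOTTOM = 1 << 3,
    };

    /* Restored signature: p/p_stride in x0/x1, left in x2, lpf/lpf_stride in
     * x3/x4, w/h in w5/w6, filter in x7; edges is the ninth argument and is
     * therefore read from the stack ("ldr w8, [sp]") in the prologue. */
    void dav1d_wiener_filter7_8bpc_neon(pixel *p, ptrdiff_t p_stride,
                                        const pixel (*left)[4],
                                        const pixel *lpf, ptrdiff_t lpf_stride,
                                        int w, int h,
                                        const int16_t filter[2][8],
                                        enum LrEdgeFlags edges);

The same argument order applies to the wiener_filter5 entry points and to the 16 bpc functions in looprestoration16.S below, with the extra bitdepth_max parameter appended after edges in the 16 bpc case.
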
--- a/third_party/dav1d/src/arm/64/looprestoration16.S
+++ b/third_party/dav1d/src/arm/64/looprestoration16.S
@@ -40,40 +40,46 @@ right_ext_mask:
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
         .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 endconst
 
 // void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
-//                                      const pixel (*left)[4], const pixel *lpf,
+//                                      const pixel (*left)[4],
+//                                      const pixel *lpf, const ptrdiff_t lpf_stride,
 //                                      const int w, int h,
 //                                      const int16_t filter[2][8],
 //                                      const enum LrEdgeFlags edges,
 //                                      const int bitdepth_max);
 function wiener_filter7_16bpc_neon, export=1
         ldr             w8,  [sp]
+#ifdef __APPLE__
+        ldr             w9,  [sp, #4]
+#else
+        ldr             w9,  [sp, #8]
+#endif
         stp             x29, x30, [sp, #-32]!
         stp             d8,  d9,  [sp, #16]
         mov             x29, sp
-        ld1             {v0.8h, v1.8h},  [x6]
-        tst             w7,  #4               // LR_HAVE_TOP
+        ld1             {v0.8h, v1.8h},  [x7]
+        tst             w8,  #4               // LR_HAVE_TOP
         sub_sp          384*2*6
 
-        dup             v28.8h,  w8           // bitdepth_max
-        clz             w8,  w8
+        dup             v28.8h,  w9           // bitdepth_max
+        clz             w9,  w9
         movi            v30.4s,  #1
-        sub             w10, w8,  #38         // -(bitdepth + 6)
-        sub             w11, w8,  #11         // round_bits_v
-        sub             w8,  w8,  #25         // -round_bits_h
+        sub             w10, w9,  #38         // -(bitdepth + 6)
+        sub             w11, w9,  #11         // round_bits_v
+        sub             w9,  w9,  #25         // -round_bits_h
         neg             w10, w10              // bitdepth + 6
         neg             w11, w11              // -round_bits_v
         dup             v2.4s,   w10
-        dup             v29.4s,  w8           // -round_bits_h
+        dup             v29.4s,  w9           // -round_bits_h
         dup             v27.4s,  w11          // -round_bits_v
         movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
         ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)
 
         zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
 
         // x9  - t6
         // x10 - t5
@@ -83,114 +89,116 @@ function wiener_filter7_16bpc_neon, expo
         // x14 - t1
         // x15 - t0
         mov             x14, sp               // t1
         b.eq            L(no_top_7)
 
         mov             x16, x2               // backup left
         mov             x2,  #0
         bl              wiener_filter7_h_16bpc_neon
-        add             x3,  x3,  x1          // lpf += stride
+        add             x3,  x3,  x4          // lpf += lpf_stride
         mov             x9,  x14              // t6
         mov             x10, x14              // t5
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_16bpc_neon
-        add             x3,  x3,  x1,  lsl #2
-        add             x3,  x3,  x1          // lpf += stride*5
+        add             x3,  x3,  x4,  lsl #2
+        add             x3,  x3,  x4          // lpf += lpf_stride*5
         mov             x11, x14              // t4
         add             x14, x14, #384*2      // t1 += 384*2
         mov             x2,  x16              // left
         mov             x16, x3               // backup lpf
         mov             x3,  x0               // lpf = p
         bl              wiener_filter7_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x12, x14              // t3
         mov             x13, x14              // t2
         b.eq            L(v1_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_16bpc_neon
         mov             x13, x14              // t2
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v3_7)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
 
 L(main_7):
         add             x15, x14, #384*2      // t0 = t1 + 384*2
 L(main_loop_7):
         bl              wiener_filter7_hv_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_loop_7)
-        tst             w7,  #8 // LR_HAVE_BOTTOM
+        tst             w8,  #8 // LR_HAVE_BOTTOM
         b.eq            L(v3_7)
 
         mov             x3,  x16              // restore lpf
         mov             x2,  #0               // left = NULL
+        sub             x4,  x4,  x1          // lpf_stride - p_stride
         bl              wiener_filter7_hv_16bpc_neon
+        add             x3,  x3,  x4          // src += lpf_stride - p_stride
         bl              wiener_filter7_hv_16bpc_neon
 L(v1_7):
         bl              wiener_filter7_v_16bpc_neon
 
         mov             sp,  x29
         ldp             d8,  d9,  [sp, #16]
         ldp             x29, x30, [sp], #32
         ret
 
 L(no_top_7):
-        add             x3,  x3,  x1,  lsl #2
-        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
+        add             x3,  x3,  x4,  lsl #2
+        add             x16, x3,  x4,  lsl #1 // lpf += lpf_stride*6, backup
         mov             x3,  x0               // lpf = p
 
         bl              wiener_filter7_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x9,  x14              // t6
         mov             x10, x14              // t5
         mov             x11, x14              // t4
         mov             x12, x14              // t3
         mov             x13, x14              // t2
         b.eq            L(v1_7)
         add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x13, x14              // t2
         b.eq            L(v2_7)
         add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter7_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v3_7)
         add             x3,  x3,  x1          // src += p_stride
         add             x15, x14, #384*2      // t0 = t1 + 384*2
         bl              wiener_filter7_hv_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v3_7)
         add             x15, x15, #384*2*4    // t0 += 384*2*4
         bl              wiener_filter7_hv_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_7)
 L(v3_7):
         bl              wiener_filter7_v_16bpc_neon
 L(v2_7):
         bl              wiener_filter7_v_16bpc_neon
         b               L(v1_7)
 endfunc
 
 
 function wiener_filter7_h_16bpc_neon
-        stp             x3,  x4,  [sp, #-32]!
+        stp             x3,  x5,  [sp, #-32]!
         str             x14,      [sp, #16]
 
         // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #6
         ld1             {v2.8h, v3.8h}, [x3], #32
         b               2f
 
@@ -214,39 +222,39 @@ 1:
         // which we shifted out.
         sub             x3,  x3,  #6
         ext             v3.16b,  v2.16b,  v3.16b,  #10
         ext             v2.16b,  v4.16b,  v2.16b,  #10
 
 2:
         ld1             {v4.8h}, [x3], #16
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #19
+        cmp             w5,  #19
         b.ge            4f   // If w >= 19, all used input pixels are valid
 
         // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
         // this ends up called again; it's not strictly needed in those
         // cases (we pad enough here), but keeping the code as simple as possible.
 
         // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
-        sub             w17, w4,  #22
+        sub             w17, w5,  #22
         // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -6
+        movrel          x7,  right_ext_mask, -6
         ldr             h26, [x3,  w17, sxtw #1]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v26.8h,  v26.h[0]
-        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
+        ld1             {v23.16b, v24.16b, v25.16b}, [x7]
 
         bit             v2.16b,  v26.16b, v23.16b
         bit             v3.16b,  v26.16b, v24.16b
         bit             v4.16b,  v26.16b, v25.16b
 
 4:      // Loop horizontally
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
@@ -301,40 +309,40 @@ 4:      // Loop horizontally
         sqxtun2         v6.8h,   v7.4s
         sqxtun          v7.4h,   v16.4s
         sqxtun2         v7.8h,   v17.4s
         umin            v6.8h,   v6.8h,   v24.8h
         umin            v7.8h,   v7.8h,   v24.8h
         sub             v6.8h,   v6.8h,   v31.8h
         sub             v7.8h,   v7.8h,   v31.8h
 
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v6.8h, v7.8h}, [x14], #32
 
         b.le            0f
         mov             v2.16b,  v4.16b
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         ld1             {v3.8h, v4.8h}, [x3], #32
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
         ldr             x14,      [sp, #16]
-        ldp             x3,  x4,  [sp], #32
+        ldp             x3,  x5,  [sp], #32
         ret
 endfunc
 
 function wiener_filter7_v_16bpc_neon
         // Backing up/restoring registers shifted, so that x9 gets the value
         // of x10, etc, afterwards.
         stp             x10, x11, [sp, #-64]!
         stp             x12, x13, [sp, #16]
         stp             x14, x14, [sp, #32]
-        stp             x0,  x4,  [sp, #48]
+        stp             x0,  x5,  [sp, #48]
 1:
         ld1             {v16.8h, v17.8h}, [x9],  #32
         ld1             {v18.8h, v19.8h}, [x10], #32
         ld1             {v20.8h, v21.8h}, [x11], #32
         ld1             {v22.8h, v23.8h}, [x12], #32
         ld1             {v24.8h, v25.8h}, [x13], #32
         ld1             {v6.8h,  v7.8h},  [x14], #32
 
@@ -371,40 +379,40 @@ 1:
         srshl           v4.4s,   v4.4s,   v27.4s
         srshl           v5.4s,   v5.4s,   v27.4s
         sqxtun          v2.4h,   v2.4s
         sqxtun2         v2.8h,   v3.4s
         sqxtun          v3.4h,   v4.4s
         sqxtun2         v3.8h,   v5.4s
         umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
         umin            v3.8h,   v3.8h,   v28.8h
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
         st1             {v2.8h, v3.8h}, [x0], #32
         b.gt            1b
 
-        ldp             x0,  x4,  [sp, #48]
+        ldp             x0,  x5,  [sp, #48]
         ldp             x13, x14, [sp, #32]
         ldp             x11, x12, [sp, #16]
         ldp             x9,  x10, [sp], #64
 
         add             x0,  x0,  x1
         ret
 endfunc
 
 function wiener_filter7_hv_16bpc_neon
         // Backing up/restoring registers shifted, so that x9 gets the value
         // of x10, etc, and x15==x9, afterwards.
         stp             x10, x11, [sp, #-80]!
         stp             x12, x13, [sp, #16]
         stp             x14, x15, [sp, #32]
         stp             x10, x0,  [sp, #48]
-        stp             x3,  x4,  [sp, #64]
+        stp             x3,  x5,  [sp, #64]
 
         // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #6
         ld1             {v2.8h, v3.8h}, [x3], #32
         b               2f
 
@@ -427,39 +435,39 @@ 1:
         // which we shifted out.
         sub             x3,  x3,  #6
         ext             v3.16b,  v2.16b,  v3.16b,  #10
         ext             v2.16b,  v4.16b,  v2.16b,  #10
 
 2:
         ld1             {v4.8h}, [x3], #16
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #19
+        cmp             w5,  #19
         b.ge            4f   // If w >= 19, all used input pixels are valid
 
         // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
         // this ends up called again; it's not strictly needed in those
         // cases (we pad enough here), but keeping the code as simple as possible.
 
         // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
-        sub             w17, w4,  #22
+        sub             w17, w5,  #22
         // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -6
+        movrel          x7,  right_ext_mask, -6
         ldr             h26, [x3,  w17, sxtw #1]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v26.8h,  v26.h[0]
-        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
+        ld1             {v23.16b, v24.16b, v25.16b}, [x7]
 
         bit             v2.16b,  v26.16b, v23.16b
         bit             v3.16b,  v26.16b, v24.16b
         bit             v4.16b,  v26.16b, v25.16b
 
 4:      // Loop horizontally
         ext             v17.16b, v2.16b,  v3.16b, #4
         ext             v19.16b, v2.16b,  v3.16b, #8
@@ -558,65 +566,71 @@ 4:      // Loop horizontally
         srshl           v16.4s,  v16.4s,  v27.4s
         sqxtun          v18.4h,  v1.4s
         sqxtun2         v18.8h,  v5.4s
         sqxtun          v19.4h,  v26.4s
         sqxtun2         v19.8h,  v16.4s
         st1             {v6.8h, v7.8h}, [x15], #32
         umin            v18.8h,  v18.8h,  v28.8h  // bitdepth_max
         umin            v19.8h,  v19.8h,  v28.8h
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v18.8h, v19.8h}, [x0], #32
 
         b.le            0f
         mov             v2.16b,  v4.16b
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         ld1             {v3.8h, v4.8h}, [x3], #32
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
-        ldp             x3,  x4,  [sp, #64]
+        ldp             x3,  x5,  [sp, #64]
         ldp             x15, x0,  [sp, #48]
         ldp             x13, x14, [sp, #32]
         ldp             x11, x12, [sp, #16]
         ldp             x9,  x10, [sp], #80
 
         add             x3,  x3,  x1
         add             x0,  x0,  x1
 
         ret
 endfunc
 
 // void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
-//                                      const pixel (*left)[4], const pixel *lpf,
+//                                      const pixel (*left)[4],
+//                                      const pixel *lpf, const ptrdiff_t lpf_stride,
 //                                      const int w, int h,
 //                                      const int16_t filter[2][8],
 //                                      const enum LrEdgeFlags edges,
 //                                      const int bitdepth_max);
 function wiener_filter5_16bpc_neon, export=1
         ldr             w8,  [sp]
+#ifdef __APPLE__
+        ldr             w9,  [sp, #4]
+#else
+        ldr             w9,  [sp, #8]
+#endif
         stp             x29, x30, [sp, #-32]!
         stp             d8,  d9,  [sp, #16]
         mov             x29, sp
-        ld1             {v0.8h, v1.8h},  [x6]
-        tst             w7,  #4               // LR_HAVE_TOP
+        ld1             {v0.8h, v1.8h},  [x7]
+        tst             w8,  #4               // LR_HAVE_TOP
         sub_sp          384*2*4
 
-        dup             v28.8h,  w8           // bitdepth_max
-        clz             w8,  w8
+        dup             v28.8h,  w9           // bitdepth_max
+        clz             w9,  w9
         movi            v30.4s,  #1
-        sub             w10, w8,  #38         // -(bitdepth + 6)
-        sub             w11, w8,  #11         // round_bits_v
-        sub             w8,  w8,  #25         // -round_bits_h
+        sub             w10, w9,  #38         // -(bitdepth + 6)
+        sub             w11, w9,  #11         // round_bits_v
+        sub             w9,  w9,  #25         // -round_bits_h
         neg             w10, w10              // bitdepth + 6
         neg             w11, w11              // -round_bits_v
         dup             v2.4s,   w10
-        dup             v29.4s,  w8           // -round_bits_h
+        dup             v29.4s,  w9           // -round_bits_h
         dup             v27.4s,  w11          // -round_bits_v
         movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
         ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)
 
         zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
 
         // x11 - t4
         // x12 - t3
@@ -624,101 +638,103 @@ function wiener_filter5_16bpc_neon, expo
         // x14 - t1
         // x15 - t0
         mov             x14, sp               // t1
         b.eq            L(no_top_5)
 
         mov             x16, x2               // backup left
         mov             x2,  #0
         bl              wiener_filter5_h_16bpc_neon
-        add             x3,  x3,  x1          // lpf += stride
+        add             x3,  x3,  x4          // lpf += lpf_stride
         mov             x11, x14              // t4
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter5_h_16bpc_neon
-        add             x3,  x3,  x1,  lsl #2
-        add             x3,  x3,  x1          // lpf += stride*5
+        add             x3,  x3,  x4,  lsl #2
+        add             x3,  x3,  x4          // lpf += lpf_stride*5
         mov             x12, x14              // t3
         add             x14, x14, #384*2      // t1 += 384*2
         mov             x2,  x16              // left
         mov             x16, x3               // backup lpf
         mov             x3,  x0               // lpf = p
         bl              wiener_filter5_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x13, x14              // t2
         b.eq            L(v1_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter5_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
 
 L(main_5):
         mov             x15, x11              // t0 = t4
 L(main_loop_5):
         bl              wiener_filter5_hv_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_loop_5)
-        tst             w7,  #8 // LR_HAVE_BOTTOM
+        tst             w8,  #8 // LR_HAVE_BOTTOM
         b.eq            L(v2_5)
 
         mov             x3,  x16              // restore lpf
         mov             x2,  #0               // left = NULL
+        sub             x4,  x4,  x1          // lpf_stride - p_stride
         bl              wiener_filter5_hv_16bpc_neon
+        add             x3,  x3,  x4          // src += lpf_stride - p_stride
         bl              wiener_filter5_hv_16bpc_neon
 L(end_5):
 
         mov             sp,  x29
         ldp             d8,  d9,  [sp, #16]
         ldp             x29, x30, [sp], #32
         ret
 
 L(no_top_5):
-        add             x3,  x3,  x1,  lsl #2
-        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
+        add             x3,  x3,  x4,  lsl #2
+        add             x16, x3,  x4,  lsl #1 // lpf += lpf_stride*6, backup
         mov             x3,  x0               // lpf = p
 
         bl              wiener_filter5_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         mov             x11, x14              // t4
         mov             x12, x14              // t3
         mov             x13, x14              // t2
         b.eq            L(v1_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x14, x14, #384*2      // t1 += 384*2
         bl              wiener_filter5_h_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_5)
-        add             x3,  x3,  x1          // src += stride
+        add             x3,  x3,  x1          // src += p_stride
         add             x15, x14, #384*2      // t0 = t1 + 384*2
         bl              wiener_filter5_hv_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.eq            L(v2_5)
         add             x15, x15, #384*2*3    // t0 += 384*2*3
         bl              wiener_filter5_hv_16bpc_neon
-        subs            w5,  w5,  #1          // h--
+        subs            w6,  w6,  #1          // h--
         b.ne            L(main_5)
 L(v2_5):
         bl              wiener_filter5_v_16bpc_neon
         add             x0,  x0,  x1
         mov             x11, x12
         mov             x12, x13
         mov             x13, x14
 L(v1_5):
         bl              wiener_filter5_v_16bpc_neon
         b               L(end_5)
 endfunc
 
 
 function wiener_filter5_h_16bpc_neon
-        stp             x3,  x4,  [sp, #-32]!
+        stp             x3,  x5,  [sp, #-32]!
         str             x14,      [sp, #16]
 
         // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #4
         ld1             {v2.8h, v3.8h}, [x3], #32
         b               2f
 
@@ -742,39 +758,39 @@ 1:
         // which we shifted out.
         sub             x3,  x3,  #4
         ext             v3.16b,  v2.16b,  v3.16b,  #12
         ext             v2.16b,  v4.16b,  v2.16b,  #12
 
 2:
         ld1             {v4.8h}, [x3], #16
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #18
+        cmp             w5,  #18
         b.ge            4f   // If w >= 18, all used input pixels are valid
 
         // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
         // this ends up called again; it's not strictly needed in those
         // cases (we pad enough here), but keeping the code as simple as possible.
 
         // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
-        sub             w17, w4,  #23
+        sub             w17, w5,  #23
         // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -4
+        movrel          x7,  right_ext_mask, -4
         ldr             h26, [x3,  w17, sxtw #1]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v26.8h,  v26.h[0]
-        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
+        ld1             {v23.16b, v24.16b, v25.16b}, [x7]
 
         bit             v2.16b,  v26.16b, v23.16b
         bit             v3.16b,  v26.16b, v24.16b
         bit             v4.16b,  v26.16b, v25.16b
 
 4:      // Loop horizontally
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
@@ -818,37 +834,37 @@ 4:      // Loop horizontally
         sqxtun2         v6.8h,   v7.4s
         sqxtun          v7.4h,   v16.4s
         sqxtun2         v7.8h,   v17.4s
         umin            v6.8h,   v6.8h,   v24.8h
         umin            v7.8h,   v7.8h,   v24.8h
         sub             v6.8h,   v6.8h,   v31.8h
         sub             v7.8h,   v7.8h,   v31.8h
 
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v6.8h, v7.8h}, [x14], #32
 
         b.le            0f
         mov             v2.16b,  v4.16b
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         ld1             {v3.8h, v4.8h}, [x3], #32
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
         ldr             x14,      [sp, #16]
-        ldp             x3,  x4,  [sp], #32
+        ldp             x3,  x5,  [sp], #32
         ret
 endfunc
 
 function wiener_filter5_v_16bpc_neon
         stp             x11, x12, [sp, #-48]!
         stp             x13, x14, [sp, #16]
-        stp             x0,  x4,  [sp, #32]
+        stp             x0,  x5,  [sp, #32]
 1:
         ld1             {v16.8h, v17.8h}, [x11], #32
         ld1             {v18.8h, v19.8h}, [x12], #32
         ld1             {v20.8h, v21.8h}, [x13], #32
         ld1             {v22.8h, v23.8h}, [x14], #32
 
         smull           v2.4s,   v16.4h,  v0.h[5]
         smlal           v2.4s,   v18.4h,  v0.h[6]
@@ -876,37 +892,37 @@ 1:
         srshl           v5.4s,   v5.4s,   v27.4s
         sqxtun          v2.4h,   v2.4s
         sqxtun2         v2.8h,   v3.4s
         sqxtun          v3.4h,   v4.4s
         sqxtun2         v3.8h,   v5.4s
         umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
         umin            v3.8h,   v3.8h,   v28.8h
 
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
         st1             {v2.8h, v3.8h}, [x0], #32
         b.gt            1b
 
-        ldp             x0,  x4,  [sp, #32]
+        ldp             x0,  x5,  [sp, #32]
         ldp             x13, x14, [sp, #16]
         ldp             x11, x12, [sp], #48
 
         ret
 endfunc
 
 function wiener_filter5_hv_16bpc_neon
        // Back up/restore the registers shifted by one slot, so that x11 gets
        // the value of x12, etc., and x15 == x11 afterwards.
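        // Concretely: the ldp sequence at the end of this function reloads the
        // slots one register down, so x11 <- old x12, x12 <- old x13,
        // x13 <- old x14, x14 <- old x15, and x15 <- old x12 (== new x11).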
         stp             x12, x13, [sp, #-64]!
         stp             x14, x15, [sp, #16]
         stp             x12, x0,  [sp, #32]
-        stp             x3,  x4,  [sp, #48]
+        stp             x3,  x5,  [sp, #48]
 
        // Set up the src pointers to include the left edge, for the LR_HAVE_LEFT and left == NULL cases
-        tst             w7,  #1 // LR_HAVE_LEFT
+        tst             w8,  #1 // LR_HAVE_LEFT
         b.eq            1f
         // LR_HAVE_LEFT
         cbnz            x2,  0f
         // left == NULL
         sub             x3,  x3,  #4
         ld1             {v2.8h, v3.8h}, [x3], #32
         b               2f
 
@@ -929,39 +945,39 @@ 1:
         // which we shifted out.
         sub             x3,  x3,  #4
         ext             v3.16b,  v2.16b,  v3.16b,  #12
         ext             v2.16b,  v4.16b,  v2.16b,  #12
 
 2:
         ld1             {v4.8h}, [x3], #16
 
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         b.ne            4f
 
 3:      // !LR_HAVE_RIGHT
 
         // Check whether we need to pad the right edge
-        cmp             w4,  #18
+        cmp             w5,  #18
         b.ge            4f   // If w >= 18, all used input pixels are valid
 
        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w >= 9 this branch
        // ends up being taken again; that isn't strictly needed in those
        // cases (we already pad enough here), but it keeps the code as
        // simple as possible.
 
         // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
         // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
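        // (Same indexing as in wiener_filter5_h above: x3[w-23] is the last
        // valid pixel, v2/3/4.h[w+1].)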
-        sub             w17, w4,  #23
+        sub             w17, w5,  #23
         // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
         // buffer pointer.
-        movrel          x6,  right_ext_mask, -4
+        movrel          x7,  right_ext_mask, -4
         ldr             h26, [x3,  w17, sxtw #1]
-        sub             x6,  x6,  w4,  uxtw #1
+        sub             x7,  x7,  w5,  uxtw #1
         dup             v26.8h,  v26.h[0]
-        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
+        ld1             {v23.16b, v24.16b, v25.16b}, [x7]
 
         bit             v2.16b,  v26.16b, v23.16b
         bit             v3.16b,  v26.16b, v24.16b
         bit             v4.16b,  v26.16b, v25.16b
 
 4:      // Loop horizontally
         ext             v16.16b, v2.16b,  v3.16b, #2
         ext             v18.16b, v2.16b,  v3.16b, #6
@@ -1038,29 +1054,29 @@ 4:      // Loop horizontally
         sqxtun          v8.4h,   v8.4s
         sqxtun2         v8.8h,   v9.4s
         sqxtun          v9.4h,   v1.4s
         sqxtun2         v9.8h,   v5.4s
         st1             {v6.8h, v7.8h}, [x15], #32
         umin            v8.8h,   v8.8h,   v28.8h  // bitdepth_max
         umin            v9.8h,   v9.8h,   v28.8h
 
-        subs            w4,  w4,  #16
+        subs            w5,  w5,  #16
 
         st1             {v8.8h, v9.8h}, [x0], #32
 
         b.le            0f
         mov             v2.16b,  v4.16b
-        tst             w7,  #2 // LR_HAVE_RIGHT
+        tst             w8,  #2 // LR_HAVE_RIGHT
         ld1             {v3.8h, v4.8h}, [x3], #32
         b.ne            4b // If we don't need to pad, just keep filtering.
         b               3b // If we need to pad, check how many pixels we have left.
 
 0:
-        ldp             x3,  x4,  [sp, #48]
+        ldp             x3,  x5,  [sp, #48]
         ldp             x15, x0,  [sp, #32]
         ldp             x13, x14, [sp, #16]
         ldp             x11, x12, [sp], #64
 
         add             x3,  x3,  x1
         add             x0,  x0,  x1
 
         ret
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -81,17 +81,16 @@ function \type\()_8bpc_neon, export=1
 .endif
         adr             x7,  L(\type\()_tbl)
         sub             w4,  w4,  #24
         ldrh            w4,  [x7, x4, lsl #1]
         \type           v4,  v0,  v1,  v2,  v3
         sub             x7,  x7,  w4, uxtw
         br              x7
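        // (Table dispatch: each halfword entry in L(\type\()_tbl) is the byte
        // distance from the table base back to its case label, so subtracting
        // it from x7 yields the branch target.)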
 40:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 4:
         cmp             w5,  #4
         st1             {v4.s}[0],  [x0], x1
         st1             {v4.s}[1],  [x7], x1
         st1             {v4.s}[2],  [x0], x1
         st1             {v4.s}[3],  [x7], x1
@@ -110,58 +109,54 @@ 4:
         st1             {v4.s}[2],  [x0], x1
         st1             {v4.s}[3],  [x7], x1
         st1             {v5.s}[0],  [x0], x1
         st1             {v5.s}[1],  [x7], x1
         st1             {v5.s}[2],  [x0], x1
         st1             {v5.s}[3],  [x7], x1
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 8:
         st1             {v4.d}[0],  [x0], x1
         \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.d}[1],  [x7], x1
         st1             {v5.d}[0],  [x0], x1
         subs            w5,  w5,  #4
         st1             {v5.d}[1],  [x7], x1
         b.le            0f
         \type           v4,  v0,  v1,  v2,  v3
         b               8b
 16:
-        AARCH64_VALID_JUMP_TARGET
         \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.16b}, [x0], x1
         \type           v6,  v0,  v1,  v2,  v3
         st1             {v5.16b}, [x0], x1
         \type           v7,  v0,  v1,  v2,  v3
         st1             {v6.16b}, [x0], x1
         subs            w5,  w5,  #4
         st1             {v7.16b}, [x0], x1
         b.le            0f
         \type           v4,  v0,  v1,  v2,  v3
         b               16b
 320:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 32:
         \type           v5,  v0,  v1,  v2,  v3
         \type           v6,  v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b}, [x0], x1
         \type           v7,  v0,  v1,  v2,  v3
         subs            w5,  w5,  #2
         st1             {v6.16b,v7.16b}, [x7], x1
         b.le            0f
         \type           v4,  v0,  v1,  v2,  v3
         b               32b
 640:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 64:
         \type           v5,  v0,  v1,  v2,  v3
         \type           v6,  v0,  v1,  v2,  v3
         \type           v7,  v0,  v1,  v2,  v3
         \type           v16, v0,  v1,  v2,  v3
         \type           v17, v0,  v1,  v2,  v3
@@ -169,17 +164,16 @@ 64:
         \type           v18, v0,  v1,  v2,  v3
         \type           v19, v0,  v1,  v2,  v3
         subs            w5,  w5,  #2
         st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
         b.le            0f
         \type           v4, v0,  v1,  v2,  v3
         b               64b
 1280:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  #64
 128:
         \type           v5,  v0,  v1,  v2,  v3
         \type           v6,  v0,  v1,  v2,  v3
         \type           v7,  v0,  v1,  v2,  v3
         \type           v16, v0,  v1,  v2,  v3
         \type           v17, v0,  v1,  v2,  v3
         st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
@@ -226,17 +220,16 @@ function w_mask_\type\()_8bpc_neon, expo
         dup             v2.8h,   w7
         movi            v3.8h,   #1, lsl #8
         sub             v3.8h,   v3.8h,   v2.8h
 .endif
         add             x12,  x0,  x1
         lsl             x1,   x1,  #1
         br              x9
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
         ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
         subs            w5,  w5,  #4
         sub             v16.8h,  v6.8h,   v4.8h
         sub             v17.8h,  v7.8h,   v5.8h
         sabd            v18.8h,  v4.8h,   v6.8h
         sabd            v19.8h,  v5.8h,   v7.8h
         uqsub           v18.8h,  v0.8h,   v18.8h
@@ -272,17 +265,16 @@ 4:
 .endif
         st1             {v22.s}[0],  [x0],  x1
         st1             {v22.s}[1],  [x12], x1
         st1             {v23.s}[0],  [x0],  x1
         st1             {v23.s}[1],  [x12], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v4.8h,   v5.8h},   [x2],  #32
         ld1             {v6.8h,   v7.8h},   [x3],  #32
         subs            w5,  w5,  #2
         sub             v16.8h,  v6.8h,   v4.8h
         sub             v17.8h,  v7.8h,   v5.8h
         sabd            v18.8h,  v4.8h,   v6.8h
         sabd            v19.8h,  v5.8h,   v7.8h
         uqsub           v18.8h,  v0.8h,   v18.8h
@@ -317,17 +309,16 @@ 8:
         st1             {v22.8b},  [x0],  x1
         st1             {v23.8b},  [x12], x1
         b.gt            8b
         ret
 1280:
 640:
 320:
 160:
-        AARCH64_VALID_JUMP_TARGET
         mov             w11, w4
         sub             x1,  x1,  w4,  uxtw
 .if \type == 444
         add             x10, x6,  w4,  uxtw
 .elseif \type == 422
         add             x10, x6,  x11, lsr #1
 .endif
         add             x9,  x3,  w4,  uxtw #1
@@ -438,32 +429,30 @@ function blend_8bpc_neon, export=1
         sub             w3,  w3,  #26
         ldrh            w3,  [x6,  x3,  lsl #1]
         sub             x6,  x6,  w3,  uxtw
         movi            v4.16b,  #64
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         br              x6
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.8b},     [x5],  #8
         ld1             {v1.d}[0],   [x2],  #8
         ld1             {v0.s}[0],   [x0]
         subs            w4,  w4,  #2
         ld1             {v0.s}[1],   [x8]
         sub             v3.8b,   v4.8b,   v2.8b
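        // The umull/umlal/rshrn below compute (v1*v2 + v0*(64 - v2) + 32) >> 6,
        // i.e. a rounded 6-bit weighted blend using the weights loaded into v2.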
         umull           v5.8h,   v1.8b,   v2.8b
         umlal           v5.8h,   v0.8b,   v3.8b
         rshrn           v6.8b,   v5.8h,   #6
         st1             {v6.s}[0],   [x0],  x1
         st1             {v6.s}[1],   [x8],  x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v2.16b},  [x5],  #16
         ld1             {v1.16b},  [x2],  #16
         ld1             {v0.d}[0],   [x0]
         ld1             {v0.d}[1],   [x8]
         sub             v3.16b,  v4.16b,  v2.16b
         subs            w4,  w4,  #2
         umull           v5.8h,   v1.8b,   v2.8b
         umlal           v5.8h,   v0.8b,   v3.8b
@@ -471,17 +460,16 @@ 8:
         umlal2          v6.8h,   v0.16b,  v3.16b
         rshrn           v7.8b,   v5.8h,   #6
         rshrn2          v7.16b,  v6.8h,   #6
         st1             {v7.d}[0],   [x0],  x1
         st1             {v7.d}[1],   [x8],  x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v1.16b,  v2.16b},  [x5],  #32
         ld1             {v5.16b,  v6.16b},  [x2],  #32
         ld1             {v0.16b},  [x0]
         subs            w4,  w4,  #2
         sub             v7.16b,  v4.16b,  v1.16b
         sub             v20.16b, v4.16b,  v2.16b
         ld1             {v3.16b},  [x8]
         umull           v16.8h,  v5.8b,   v1.8b
@@ -496,17 +484,16 @@ 16:
         rshrn2          v18.16b, v17.8h,  #6
         rshrn           v19.8b,  v21.8h,  #6
         rshrn2          v19.16b, v22.8h,  #6
         st1             {v18.16b}, [x0],  x1
         st1             {v19.16b}, [x8],  x1
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
         ld1             {v20.16b, v21.16b}, [x0]
         subs            w4,  w4,  #2
         ld1             {v22.16b, v23.16b}, [x8]
         sub             v5.16b,  v4.16b,  v0.16b
         sub             v6.16b,  v4.16b,  v1.16b
         sub             v30.16b, v4.16b,  v2.16b
@@ -555,49 +542,46 @@ function blend_h_8bpc_neon, export=1
         movi            v4.16b,  #64
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w7,  w7,  #24
         ldrh            w7,  [x6,  x7,  lsl #1]
         sub             x6,  x6,  w7, uxtw
         br              x6
 2:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.h}[0],   [x5],  #2
         ld1             {v1.s}[0],   [x2],  #4
         subs            w4,  w4,  #2
         ld1             {v2.h}[0],   [x0]
         zip1            v0.8b,   v0.8b,   v0.8b
         sub             v3.8b,   v4.8b,   v0.8b
         ld1             {v2.h}[1],   [x8]
         umull           v5.8h,   v1.8b,   v0.8b
         umlal           v5.8h,   v2.8b,   v3.8b
         rshrn           v5.8b,   v5.8h,   #6
         st1             {v5.h}[0],   [x0],  x1
         st1             {v5.h}[1],   [x8],  x1
         b.gt            2b
         ret
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld2r            {v0.8b,   v1.8b},   [x5],  #2
         ld1             {v2.8b},   [x2],  #8
         subs            w4,  w4,  #2
         ext             v0.8b,   v0.8b,   v1.8b,   #4
         ld1             {v3.s}[0],   [x0]
         sub             v5.8b,   v4.8b,   v0.8b
         ld1             {v3.s}[1],   [x8]
         umull           v6.8h,   v2.8b,   v0.8b
         umlal           v6.8h,   v3.8b,   v5.8b
         rshrn           v6.8b,   v6.8h,   #6
         st1             {v6.s}[0],   [x0],  x1
         st1             {v6.s}[1],   [x8],  x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld2r            {v0.16b,  v1.16b},  [x5],  #2
         ld1             {v2.16b},  [x2],  #16
         ld1             {v3.d}[0],   [x0]
         ext             v0.16b,  v0.16b,  v1.16b,  #8
         sub             v5.16b,  v4.16b,  v0.16b
         ld1             {v3.d}[1],   [x8]
         subs            w4,  w4,  #2
         umull           v6.8h,   v0.8b,   v2.8b
@@ -606,17 +590,16 @@ 8:
         umlal2          v7.8h,   v3.16b,  v5.16b
         rshrn           v16.8b,  v6.8h,   #6
         rshrn2          v16.16b, v7.8h,   #6
         st1             {v16.d}[0],  [x0],  x1
         st1             {v16.d}[1],  [x8],  x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld2r            {v0.16b,  v1.16b},  [x5],  #2
         ld1             {v2.16b,  v3.16b},  [x2],  #32
         ld1             {v5.16b},  [x0]
         sub             v7.16b,  v4.16b,  v0.16b
         sub             v16.16b, v4.16b,  v1.16b
         ld1             {v6.16b},  [x8]
         subs            w4,  w4,  #2
         umull           v17.8h,  v0.8b,   v2.8b
@@ -633,17 +616,16 @@ 16:
         rshrn2          v22.16b, v20.8h,  #6
         st1             {v21.16b}, [x0],  x1
         st1             {v22.16b}, [x8],  x1
         b.gt            16b
         ret
 1280:
 640:
 320:
-        AARCH64_VALID_JUMP_TARGET
         sub             x1,  x1,  w3,  uxtw
         add             x7,  x2,  w3,  uxtw
 321:
         ld2r            {v0.16b,  v1.16b},  [x5],  #2
         mov             w6,  w3
         sub             v20.16b, v4.16b,  v0.16b
         sub             v21.16b, v4.16b,  v1.16b
 32:
@@ -704,17 +686,16 @@ function blend_v_8bpc_neon, export=1
         movi            v4.16b,  #64
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w3,  w3,  #26
         ldrh            w3,  [x6,  x3,  lsl #1]
         sub             x6,  x6,  w3,  uxtw
         br              x6
 20:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.8b},   [x5]
         sub             v1.8b,   v4.8b,   v0.8b
 2:
         ld1             {v2.h}[0],   [x2],  #2
         ld1             {v3.b}[0],   [x0]
         subs            w4,  w4,  #2
         ld1             {v2.b}[1],   [x2]
         ld1             {v3.b}[1],   [x8]
@@ -722,17 +703,16 @@ 2:
         umlal           v5.8h,   v3.8b,   v1.8b
         rshrn           v5.8b,   v5.8h,   #6
         add             x2,  x2,  #2
         st1             {v5.b}[0],   [x0],  x1
         st1             {v5.b}[1],   [x8],  x1
         b.gt            2b
         ret
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.2s},   [x5]
         sub             x1,  x1,  #2
         sub             v1.8b,   v4.8b,   v0.8b
 4:
         ld1             {v2.8b},   [x2],  #8
         ld1             {v3.s}[0],   [x0]
         ld1             {v3.s}[1],   [x8]
         subs            w4,  w4,  #2
@@ -741,17 +721,16 @@ 4:
         rshrn           v5.8b,   v5.8h,   #6
         st1             {v5.h}[0],   [x0],  #2
         st1             {v5.h}[2],   [x8],  #2
         st1             {v5.b}[2],   [x0],  x1
         st1             {v5.b}[6],   [x8],  x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v0.2d},   [x5]
         sub             x1,  x1,  #4
         sub             v1.16b,  v4.16b,  v0.16b
 8:
         ld1             {v2.16b},  [x2],  #16
         ld1             {v3.d}[0],   [x0]
         ld1             {v3.d}[1],   [x8]
         subs            w4,  w4,  #2
@@ -763,17 +742,16 @@ 8:
         rshrn2          v7.16b, v6.8h,  #6
         st1             {v7.s}[0],   [x0],  #4
         st1             {v7.s}[2],   [x8],  #4
         st1             {v7.h}[2],   [x0],  x1
         st1             {v7.h}[6],   [x8],  x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b},  [x5]
         sub             x1,  x1,  #8
         sub             v2.16b,  v4.16b,  v0.16b
 16:
         ld1             {v5.16b,  v6.16b},  [x2],  #32
         ld1             {v7.16b},  [x0]
         subs            w4,  w4,  #2
         ld1             {v16.16b}, [x8]
@@ -791,17 +769,16 @@ 16:
         rshrn2          v22.16b, v21.8h,  #6
         st1             {v19.8b},  [x0],  #8
         st1             {v22.8b},  [x8],  #8
         st1             {v19.s}[2],  [x0],  x1
         st1             {v22.s}[2],  [x8],  x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.16b,  v1.16b},  [x5]
         sub             x1,  x1,  #16
         sub             v2.16b,  v4.16b,  v0.16b
         sub             v3.8b,   v4.8b,   v1.8b
 32:
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
         ld1             {v5.16b,  v6.16b},  [x0]
         subs            w4,  w4,  #2
@@ -843,84 +820,77 @@ endfunc
 // and assumes that x8 is set to (clz(w)-24).
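// For example, w == 16 gives clz(w) == 27 (32-bit clz), so x8 == 3 on entry.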
 function put_neon
         adr             x9,  L(put_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 2:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.h}[0], [x2], x3
         ld1             {v1.h}[0], [x2], x3
         subs            w5,  w5,  #2
         st1             {v0.h}[0], [x0], x1
         st1             {v1.h}[0], [x0], x1
         b.gt            2b
         ret
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.s}[0], [x2], x3
         ld1             {v1.s}[0], [x2], x3
         subs            w5,  w5,  #2
         st1             {v0.s}[0], [x0], x1
         st1             {v1.s}[0], [x0], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [x2], x3
         ld1             {v1.8b}, [x2], x3
         subs            w5,  w5,  #2
         st1             {v0.8b}, [x0], x1
         st1             {v1.8b}, [x0], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         add             x9,  x2,  x3
         lsl             x3,  x3,  #1
 16:
         ld1             {v0.16b}, [x2], x3
         ld1             {v1.16b}, [x9], x3
         subs            w5,  w5,  #2
         st1             {v0.16b}, [x0], x1
         st1             {v1.16b}, [x8], x1
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ldp             x6,  x7,  [x2]
         ldp             x8,  x9,  [x2, #16]
         stp             x6,  x7,  [x0]
         subs            w5,  w5,  #1
         stp             x8,  x9,  [x0, #16]
         add             x2,  x2,  x3
         add             x0,  x0,  x1
         b.gt            32b
         ret
 64:
-        AARCH64_VALID_JUMP_TARGET
         ldp             x6,  x7,  [x2]
         ldp             x8,  x9,  [x2, #16]
         stp             x6,  x7,  [x0]
         ldp             x10, x11, [x2, #32]
         stp             x8,  x9,  [x0, #16]
         subs            w5,  w5,  #1
         ldp             x12, x13, [x2, #48]
         stp             x10, x11, [x0, #32]
         stp             x12, x13, [x0, #48]
         add             x2,  x2,  x3
         add             x0,  x0,  x1
         b.gt            64b
         ret
 128:
-        AARCH64_VALID_JUMP_TARGET
         ldp             q0,  q1,  [x2]
         ldp             q2,  q3,  [x2, #32]
         stp             q0,  q1,  [x0]
         ldp             q4,  q5,  [x2, #64]
         stp             q2,  q3,  [x0, #32]
         ldp             q6,  q7,  [x2, #96]
         subs            w5,  w5,  #1
         stp             q4,  q5,  [x0, #64]
@@ -945,52 +915,48 @@ endfunc
 // and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
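// For example, w == 32 gives x8 == clz(32) - 24 == 2 and x7 == 64.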
 function prep_neon
         adr             x9,  L(prep_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.s}[0], [x1], x2
         ld1             {v1.s}[0], [x1], x2
         subs            w4,  w4,  #2
         ushll           v0.8h, v0.8b, #4
         ushll           v1.8h, v1.8b, #4
         st1             {v0.4h, v1.4h}, [x0], #16
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [x1], x2
         ld1             {v1.8b}, [x1], x2
         subs            w4,  w4,  #2
         ushll           v0.8h, v0.8b, #4
         ushll           v1.8h, v1.8b, #4
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         add             x9,  x1,  x2
         lsl             x2,  x2,  #1
 16:
         ld1             {v0.16b}, [x1], x2
         ld1             {v1.16b}, [x9], x2
         subs            w4,  w4,  #2
         ushll           v4.8h, v0.8b,  #4
         ushll2          v5.8h, v0.16b, #4
         ushll           v6.8h, v1.8b,  #4
         ushll2          v7.8h, v1.16b, #4
         st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         add             x8,  x0,  w3, uxtw
 32:
         ld1             {v0.16b, v1.16b},  [x1], x2
         subs            w4,  w4,  #2
         ushll           v4.8h,  v0.8b,  #4
         ushll2          v5.8h,  v0.16b, #4
         ld1             {v2.16b, v3.16b},  [x1], x2
         ushll           v6.8h,  v1.8b,  #4
@@ -1001,17 +967,16 @@ 32:
         st1             {v6.8h,  v7.8h},  [x8], x7
         ushll           v18.8h, v3.8b,  #4
         st1             {v16.8h, v17.8h}, [x0], x7
         ushll2          v19.8h, v3.16b, #4
         st1             {v18.8h, v19.8h}, [x8], x7
         b.gt            32b
         ret
 640:
-        AARCH64_VALID_JUMP_TARGET
         add             x8,  x0,  #32
         mov             x6,  #64
 64:
         ldp             q0,  q1,  [x1]
         subs            w4,  w4,  #1
         ushll           v4.8h,  v0.8b,  #4
         ushll2          v5.8h,  v0.16b, #4
         ldp             q2,  q3,  [x1, #32]
@@ -1024,17 +989,16 @@ 64:
         ushll           v18.8h, v3.8b,  #4
         st1             {v6.8h,  v7.8h},  [x8], x6
         ushll2          v19.8h, v3.16b, #4
         st1             {v16.8h, v17.8h}, [x0], x6
         st1             {v18.8h, v19.8h}, [x8], x6
         b.gt            64b
         ret
 1280:
-        AARCH64_VALID_JUMP_TARGET
         add             x8,  x0,  #64
         mov             x6,  #128
 128:
         ldp             q0,  q1,  [x1]
         ldp             q2,  q3,  [x1, #32]
         ushll           v16.8h,  v0.8b,  #4
         ushll2          v17.8h,  v0.16b, #4
         ushll           v18.8h,  v1.8b,  #4
@@ -1373,17 +1337,16 @@ 4:
         b.ne            L(\type\()_8tap_hv)
 
         adr             x9,  L(\type\()_8tap_h_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 20:     // 2xN h
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \xmx,  \xmx,  #2
         ld1             {v0.s}[0], [\xmx]
         sub             \src,  \src,  #1
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
@@ -1408,17 +1371,16 @@ 2:
         sqrshrun        v3.8b,  v3.8h,  #4
         st1             {v3.h}[0], [\dst], \d_strd
         st1             {v3.h}[1], [\ds2], \d_strd
         b.gt            2b
         ret
 .endif
 
 40:     // 4xN h
-        AARCH64_VALID_JUMP_TARGET
         add             \xmx,  \xmx,  #2
         ld1             {v0.s}[0], [\xmx]
         sub             \src,  \src,  #1
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
         sxtl            v0.8h,  v0.8b
@@ -1452,17 +1414,16 @@ 4:
 .else
         st1             {v16.4h}, [\dst], \d_strd
         st1             {v20.4h}, [\ds2], \d_strd
 .endif
         b.gt            4b
         ret
 
 80:     // 8xN h
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmx]
         sub             \src,  \src,  #3
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
         sxtl            v0.8h, v0.8b
 8:
@@ -1494,17 +1455,16 @@ 8:
         st1             {v22.8h}, [\ds2], \d_strd
 .endif
         b.gt            8b
         ret
 160:
 320:
 640:
 1280:   // 16xN, 32xN, ... h
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmx]
         sub             \src,  \src,  #3
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         sxtl            v0.8h, v0.8b
 
         sub             \s_strd,  \s_strd,  \w, uxtw
@@ -1598,17 +1558,16 @@ 4:
         add             \xmy, x10, \my, uxtw #3
 
         adr             x9,  L(\type\()_8tap_v_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 20:     // 2xN v
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         b.gt            28f
 
         cmp             \h,  #2
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src,  \src,  \s_strd
         add             \ds2,  \dst,  \d_strd
@@ -1674,17 +1633,16 @@ 216:
         mov             v6.16b,  v22.16b
         mov             v7.16b,  v23.16b
         b               216b
 0:
         ret
 .endif
 
 40:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            480f
 
         // 4x2, 4x4 v
         cmp             \h,  #2
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
@@ -1743,17 +1701,16 @@ 48:
         uxtl_b          v18, v19, v20, v21
         mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
         shift_store_4   \type, \d_strd, v1, v2
         b.gt            48b
 0:
         ret
 
 80:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            880f
 
         // 8x2, 8x4 v
         cmp             \h,  #2
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
@@ -1776,17 +1733,16 @@ 80:
 0:
         ret
 
 880:    // 8x6, 8x8, 8x16, 8x32 v
 1680:   // 16x8, 16x16, ...
 320:    // 32x8, 32x16, ...
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmy]
         sub             \src, \src, \s_strd
         sub             \src, \src, \s_strd, lsl #1
         sxtl            v0.8h, v0.8b
         mov             \my,  \h
 168:
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -1843,17 +1799,16 @@ 9:
 .else
         add             \dst, \dst, #16
 .endif
         b               168b
 0:
         ret
 
 160:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            1680b
 
         // 16x2, 16x4 v
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -1912,17 +1867,16 @@ 4:
         add             \xmy,  x10, \my, uxtw #3
 
         adr             x9,  L(\type\()_8tap_hv_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 20:
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \xmx,  \xmx,  #2
         ld1             {v0.s}[0],  [\xmx]
         b.gt            280f
         add             \xmy,  \xmy,  #2
         ld1             {v1.s}[0],  [\xmy]
 
         // 2x2, 2x4 hv
@@ -2020,17 +1974,17 @@ 28:
         mov             v17.8b, v19.8b
         mov             v18.8b, v20.8b
         mov             v19.8b, v21.8b
         mov             v20.8b, v22.8b
         mov             v21.8b, v28.8b
         b               28b
 
 0:
-        ret             x15
+        br              x15
 
 L(\type\()_8tap_filter_2):
         ld1             {v28.8b},  [\sr2], \s_strd
         ld1             {v30.8b},  [\src], \s_strd
         uxtl            v28.8h,  v28.8b
         uxtl            v30.8h,  v30.8b
         ext             v29.16b, v28.16b, v28.16b, #2
         ext             v31.16b, v30.16b, v30.16b, #2
@@ -2042,17 +1996,16 @@ L(\type\()_8tap_filter_2):
         mla             v27.4h,  v28.4h,  v0.h[1]
         mla             v27.4h,  v30.4h,  v0.h[2]
         mla             v27.4h,  v31.4h,  v0.h[3]
         srshr           v28.4h,  v27.4h,  #2
         ret
 .endif
 
 40:
-        AARCH64_VALID_JUMP_TARGET
         add             \xmx, \xmx, #2
         ld1             {v0.s}[0],  [\xmx]
         b.gt            480f
         add             \xmy, \xmy,  #2
         ld1             {v1.s}[0],  [\xmy]
         sub             \sr2, \src, #1
         sub             \src, \sr2, \s_strd
         add             \ds2, \dst, \d_strd
@@ -2177,17 +2130,17 @@ 48:
         mov             v17.8b,  v19.8b
         mov             v18.8b,  v20.8b
         mov             v19.8b,  v21.8b
         mov             v20.8b,  v22.8b
         mov             v21.8b,  v28.8b
         mov             v22.8b,  v29.8b
         b               48b
 0:
-        ret             x15
+        br              x15
 
 L(\type\()_8tap_filter_4):
         ld1             {v26.8b}, [\sr2], \s_strd
         ld1             {v27.8b}, [\src], \s_strd
         uxtl            v26.8h,  v26.8b
         uxtl            v27.8h,  v27.8b
         ext             v28.16b, v26.16b, v26.16b, #2
         ext             v29.16b, v26.16b, v26.16b, #4
@@ -2205,17 +2158,16 @@ L(\type\()_8tap_filter_4):
         mla             v27.4h,  v30.4h,  v0.h[3]
         srshr           v28.4h,  v31.4h,  #2
         srshr           v29.4h,  v27.4h,  #2
         ret
 
 80:
 160:
 320:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            880f
         add             \xmy,  \xmy,  #2
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.s}[0],  [\xmy]
         sub             \src,  \src,  #3
         sub             \src,  \src,  \s_strd
         sxtl            v0.8h,  v0.8b
         sxtl            v1.8h,  v1.8b
@@ -2285,17 +2237,16 @@ 9:
 .else
         add             \dst,  \dst,  #16
 .endif
         b               164b
 
 880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.8b},  [\xmy]
         sub             \src,  \src,  #3
         sub             \src,  \src,  \s_strd
         sub             \src,  \src,  \s_strd, lsl #1
         sxtl            v0.8h,  v0.8b
         sxtl            v1.8h,  v1.8b
         mov             x15, x30
@@ -2387,17 +2338,17 @@ 9:
         add             \src,  \src,  #8
 .ifc \type, put
         add             \dst,  \dst,  #8
 .else
         add             \dst,  \dst,  #16
 .endif
         b               168b
 0:
-        ret             x15
+        br              x15
 
 L(\type\()_8tap_filter_8_first):
         ld1             {v28.8b, v29.8b},  [\src], \s_strd
         uxtl            v28.8h,  v28.8b
         uxtl            v29.8h,  v29.8b
         mul             v16.8h,  v28.8h,  v0.h[0]
         ext             v24.16b, v28.16b, v29.16b, #(2*1)
         ext             v25.16b, v28.16b, v29.16b, #(2*2)
@@ -2470,17 +2421,16 @@ L(\type\()_bilin_h):
         cbnz            \my, L(\type\()_bilin_hv)
 
         adr             x9,  L(\type\()_bilin_h_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 20:     // 2xN h
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
 2:
         ld1             {v4.s}[0],  [\src], \s_strd
         ld1             {v6.s}[0],  [\sr2], \s_strd
@@ -2494,17 +2444,16 @@ 2:
         uqrshrn         v4.8b,  v4.8h,  #4
         st1             {v4.h}[0], [\dst], \d_strd
         st1             {v4.h}[1], [\ds2], \d_strd
         b.gt            2b
         ret
 .endif
 
 40:     // 4xN h
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
 4:
         ld1             {v4.8b}, [\src], \s_strd
         ld1             {v6.8b}, [\sr2], \s_strd
         ext             v5.8b,  v4.8b,  v4.8b, #1
@@ -2521,17 +2470,16 @@ 4:
 .else
         st1             {v4.d}[0], [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
 .endif
         b.gt            4b
         ret
 
 80:     // 8xN h
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
 8:
         ld1             {v4.16b}, [\src], \s_strd
         ld1             {v6.16b}, [\sr2], \s_strd
         ext             v5.16b, v4.16b, v4.16b, #1
@@ -2551,17 +2499,16 @@ 8:
         st1             {v6.8h}, [\ds2], \d_strd
 .endif
         b.gt            8b
         ret
 160:
 320:
 640:
 1280:   // 16xN, 32xN, ... h
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
 
         sub             \s_strd,  \s_strd,  \w, uxtw
         sub             \s_strd,  \s_strd,  #8
 .ifc \type, put
         lsl             \d_strd,  \d_strd,  #1
@@ -2629,17 +2576,16 @@ L(\type\()_bilin_h_tbl):
 L(\type\()_bilin_v):
         cmp             \h,  #4
         adr             x9,  L(\type\()_bilin_v_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 20:     // 2xN v
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         cmp             \h,  #2
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         lsl             \d_strd,  \d_strd,  #1
 
         // 2x2 v
@@ -2677,17 +2623,16 @@ 24:     // 2x4, 2x8, ... v
         b.le            0f
         mov             v16.8b, v20.8b
         b               24b
 0:
         ret
 .endif
 
 40:     // 4xN v
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         lsl             \d_strd,  \d_strd,  #1
         ld1             {v16.s}[0], [\src], \s_strd
 4:
         ld1             {v17.s}[0], [\sr2], \s_strd
         ld1             {v18.s}[0], [\src], \s_strd
@@ -2706,17 +2651,16 @@ 4:
 .endif
         b.le            0f
         mov             v16.8b, v18.8b
         b               4b
 0:
         ret
 
 80:     // 8xN v
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         lsl             \d_strd,  \d_strd,  #1
         ld1             {v16.8b}, [\src], \s_strd
 8:
         ld1             {v17.8b}, [\sr2], \s_strd
         ld1             {v18.8b}, [\src], \s_strd
@@ -2739,17 +2683,16 @@ 8:
         b               8b
 0:
         ret
 
 160:    // 16xN, 32xN, ...
 320:
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         mov             \my,  \h
 1:
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
         ld1             {v16.16b}, [\src], \s_strd
@@ -2812,17 +2755,16 @@ L(\type\()_bilin_hv):
         uxtl            v2.8h, v2.8b
         uxtl            v3.8h, v3.8b
         adr             x9,  L(\type\()_bilin_hv_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
 
 20:     // 2xN hv
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \sr2, \src, \s_strd
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
         ld1             {v28.s}[0],  [\src], \s_strd
         ext             v29.8b, v28.8b, v28.8b, #1
@@ -2850,17 +2792,16 @@ 2:
         b.le            0f
         trn2            v16.2s, v17.2s, v17.2s
         b               2b
 0:
         ret
 .endif
 
 40:     // 4xN hv
-        AARCH64_VALID_JUMP_TARGET
         add             \sr2, \src, \s_strd
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
         ld1             {v28.8b},  [\src], \s_strd
         ext             v29.8b, v28.8b, v28.8b, #1
         umull           v16.8h, v28.8b, v0.8b
@@ -2896,17 +2837,16 @@ 4:
 0:
         ret
 
 80:     // 8xN, 16xN, ... hv
 160:
 320:
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         mov             \my,  \h
 
 1:
         add             \sr2, \src, \s_strd
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
@@ -3127,17 +3067,17 @@ 1:
         st1             {v16.8h}, [x0], x1
 .else
         st1             {v16.8b}, [x0], x1
 .endif
 
         add             w6,  w6,  w4
         b.gt            1b
 
-        ret             x15
+        br              x15
 endfunc
 .endm
 
 warp  , 11
 warp t, 7
 
 // void dav1d_emu_edge_8bpc_neon(
 //         const intptr_t bw, const intptr_t bh,
--- a/third_party/dav1d/src/arm/64/mc16.S
+++ b/third_party/dav1d/src/arm/64/mc16.S
@@ -151,71 +151,65 @@ function \type\()_16bpc_neon, export=1
 .endif
         adr             x7,  L(\type\()_tbl)
         sub             w4,  w4,  #24
         \type           v4,  v5,  v0,  v1,  v2,  v3
         ldrh            w4,  [x7, x4, lsl #1]
         sub             x7,  x7,  w4, uxtw
         br              x7
 40:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 4:
         subs            w5,  w5,  #4
         st1             {v4.d}[0],  [x0], x1
         st1             {v4.d}[1],  [x7], x1
         st1             {v5.d}[0],  [x0], x1
         st1             {v5.d}[1],  [x7], x1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
         b               4b
 80:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 8:
         st1             {v4.8h},  [x0], x1
         subs            w5,  w5,  #2
         st1             {v5.8h},  [x7], x1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
         b               8b
 16:
-        AARCH64_VALID_JUMP_TARGET
         \type           v6,  v7,  v0,  v1,  v2,  v3
         st1             {v4.8h, v5.8h}, [x0], x1
         subs            w5,  w5,  #2
         st1             {v6.8h, v7.8h}, [x0], x1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
         b               16b
 32:
-        AARCH64_VALID_JUMP_TARGET
         \type           v6,  v7,  v0,  v1,  v2,  v3
         subs            w5,  w5,  #1
         st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
         b               32b
 640:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  #64
 64:
         \type           v6,  v7,  v0,  v1,  v2,  v3
         \type           v16, v17, v0,  v1,  v2,  v3
         st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
         \type           v18, v19, v0,  v1,  v2,  v3
         subs            w5,  w5,  #1
         st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
         b.le            0f
         \type           v4,  v5,  v0,  v1,  v2,  v3
         b               64b
 1280:
-        AARCH64_VALID_JUMP_TARGET
         add             x7,  x0,  #64
         mov             x8,  #128
         sub             x1,  x1,  #128
 128:
         \type           v6,  v7,  v0,  v1,  v2,  v3
         \type           v16, v17, v0,  v1,  v2,  v3
         st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
         \type           v18, v19, v0,  v1,  v2,  v3
@@ -274,17 +268,16 @@ function w_mask_\type\()_16bpc_neon, exp
         dup             v2.8h,   w7
         movi            v3.8h,   #1, lsl #8
         sub             v3.8h,   v3.8h,   v2.8h
 .endif
         add             x12,  x0,  x1
         lsl             x1,   x1,  #1
         br              x10
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
         ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
         subs            w5,  w5,  #4
         sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
         sabd            v21.8h,  v5.8h,   v7.8h
         ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
         ssubl2          v17.4s,  v6.8h,   v4.8h
         ssubl           v18.4s,  v7.4h,   v5.4h
@@ -340,17 +333,16 @@ 4:
 .endif
         st1             {v4.d}[0],  [x0],  x1
         st1             {v4.d}[1],  [x12], x1
         st1             {v5.d}[0],  [x0],  x1
         st1             {v5.d}[1],  [x12], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
         ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
         subs            w5,  w5,  #2
         sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
         sabd            v21.8h,  v5.8h,   v7.8h
         ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
         ssubl2          v17.4s,  v6.8h,   v4.8h
         ssubl           v18.4s,  v7.4h,   v5.4h
@@ -405,17 +397,16 @@ 8:
         st1             {v4.8h}, [x0],  x1
         st1             {v5.8h}, [x12], x1
         b.gt            8b
         ret
 1280:
 640:
 320:
 160:
-        AARCH64_VALID_JUMP_TARGET
         mov             w11, w4
         sub             x1,  x1,  w4,  uxtw #1
 .if \type == 444
         add             x10, x6,  w4,  uxtw
 .elseif \type == 422
         add             x10, x6,  x11, lsr #1
 .endif
         add             x9,  x3,  w4,  uxtw #1
@@ -568,17 +559,16 @@ function blend_16bpc_neon, export=1
         adr             x6,  L(blend_tbl)
         clz             w3,  w3
         sub             w3,  w3,  #26
         ldrh            w3,  [x6,  x3,  lsl #1]
         sub             x6,  x6,  w3,  uxtw
         add             x8,  x0,  x1
         br              x6
 40:
-        AARCH64_VALID_JUMP_TARGET
         lsl             x1,  x1,  #1
 4:
         ld1             {v2.8b},   [x5], #8
         ld1             {v1.8h},   [x2], #16
         ld1             {v0.d}[0], [x0]
         neg             v2.8b,   v2.8b            // -m
         subs            w4,  w4,  #2
         ld1             {v0.d}[1], [x8]
@@ -587,17 +577,16 @@ 4:
         sub             v1.8h,   v0.8h,   v1.8h   // a - b
         sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
         add             v0.8h,   v0.8h,   v1.8h
         st1             {v0.d}[0], [x0], x1
         st1             {v0.d}[1], [x8], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         lsl             x1,  x1,  #1
 8:
         ld1             {v4.16b},       [x5], #16
         ld1             {v2.8h, v3.8h}, [x2], #32
         neg             v5.16b,  v4.16b           // -m
         ld1             {v0.8h},   [x0]
         ld1             {v1.8h},   [x8]
         sxtl            v4.8h,   v5.8b
@@ -611,17 +600,16 @@ 8:
         sqrdmulh        v3.8h,   v3.8h,   v5.8h
         add             v0.8h,   v0.8h,   v2.8h
         add             v1.8h,   v1.8h,   v3.8h
         st1             {v0.8h}, [x0], x1
         st1             {v1.8h}, [x8], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         lsl             x1,  x1,  #1
 16:
         ld1             {v16.16b, v17.16b},           [x5], #32
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
         subs            w4,  w4,  #2
         neg             v18.16b, v16.16b          // -m
         neg             v19.16b, v17.16b
         ld1             {v0.8h, v1.8h}, [x0]
@@ -646,17 +634,16 @@ 16:
         add             v1.8h,   v1.8h,   v5.8h
         add             v2.8h,   v2.8h,   v6.8h
         add             v3.8h,   v3.8h,   v7.8h
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v2.8h, v3.8h}, [x8], x1
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v16.16b, v17.16b},           [x5], #32
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
         subs            w4,  w4,  #1
         neg             v18.16b, v16.16b          // -m
         neg             v19.16b, v17.16b
         sxtl            v16.8h,  v18.8b
         sxtl2           v17.8h,  v18.16b
         sxtl            v18.8h,  v19.8b
@@ -696,17 +683,16 @@ function blend_h_16bpc_neon, export=1
         clz             w7,  w3
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w7,  w7,  #24
         ldrh            w7,  [x6,  x7,  lsl #1]
         sub             x6,  x6,  w7, uxtw
         br              x6
 2:
-        AARCH64_VALID_JUMP_TARGET
         ld2r            {v2.8b, v3.8b}, [x5], #2
         ld1             {v1.4h},        [x2], #8
         ext             v2.8b,   v2.8b,   v3.8b,   #6
         subs            w4,  w4,  #2
         neg             v2.8b,   v2.8b            // -m
         ld1             {v0.s}[0], [x0]
         ld1             {v0.s}[1], [x8]
         sxtl            v2.8h,   v2.8b
@@ -714,17 +700,16 @@ 2:
         sub             v1.4h,   v0.4h,   v1.4h   // a - b
         sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
         add             v0.4h,   v0.4h,   v1.4h
         st1             {v0.s}[0], [x0], x1
         st1             {v0.s}[1], [x8], x1
         b.gt            2b
         ret
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld2r            {v2.8b, v3.8b}, [x5], #2
         ld1             {v1.8h},        [x2], #16
         ext             v2.8b,   v2.8b,   v3.8b,   #4
         subs            w4,  w4,  #2
         neg             v2.8b,   v2.8b            // -m
         ld1             {v0.d}[0],   [x0]
         ld1             {v0.d}[1],   [x8]
         sxtl            v2.8h,   v2.8b
@@ -732,17 +717,16 @@ 4:
         sub             v1.8h,   v0.8h,   v1.8h   // a - b
         sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
         add             v0.8h,   v0.8h,   v1.8h
         st1             {v0.d}[0], [x0], x1
         st1             {v0.d}[1], [x8], x1
         b.gt            4b
         ret
 8:
-        AARCH64_VALID_JUMP_TARGET
         ld2r            {v4.8b, v5.8b}, [x5], #2
         ld1             {v2.8h, v3.8h}, [x2], #32
         neg             v4.8b,   v4.8b            // -m
         neg             v5.8b,   v5.8b
         ld1             {v0.8h}, [x0]
         subs            w4,  w4,  #2
         sxtl            v4.8h,   v4.8b
         sxtl            v5.8h,   v5.8b
@@ -755,17 +739,16 @@ 8:
         sqrdmulh        v3.8h,   v3.8h,   v5.8h
         add             v0.8h,   v0.8h,   v2.8h
         add             v1.8h,   v1.8h,   v3.8h
         st1             {v0.8h}, [x0], x1
         st1             {v1.8h}, [x8], x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ld2r            {v16.8b, v17.8b}, [x5], #2
         ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
         neg             v16.8b,  v16.8b           // -m
         neg             v17.8b,  v17.8b
         ld1             {v0.8h, v1.8h},  [x0]
         ld1             {v2.8h, v3.8h},  [x8]
         subs            w4,  w4,  #2
         sxtl            v16.8h,  v16.8b
@@ -786,17 +769,16 @@ 16:
         add             v3.8h,   v3.8h,   v7.8h
         st1             {v0.8h, v1.8h}, [x0], x1
         st1             {v2.8h, v3.8h}, [x8], x1
         b.gt            16b
         ret
 1280:
 640:
 320:
-        AARCH64_VALID_JUMP_TARGET
         sub             x1,  x1,  w3,  uxtw #1
         add             x7,  x2,  w3,  uxtw #1
 321:
         ld2r            {v24.8b, v25.8b}, [x5], #2
         mov             w6,  w3
         neg             v24.8b,  v24.8b           // -m
         neg             v25.8b,  v25.8b
         sxtl            v24.8h,  v24.8b
@@ -860,17 +842,16 @@ function blend_v_16bpc_neon, export=1
         clz             w3,  w3
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         sub             w3,  w3,  #26
         ldrh            w3,  [x6,  x3,  lsl #1]
         sub             x6,  x6,  w3,  uxtw
         br              x6
 20:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v2.8b}, [x5]
         neg             v2.8b,   v2.8b            // -m
         sxtl            v2.8h,   v2.8b
         shl             v2.4h,   v2.4h,   #9      // -m << 9
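        // sqrdmulh(x, y) computes (2*x*y + (1 << 15)) >> 16, so with
        // y == -m << 9 it evaluates ((a-b)*-m + 32) >> 6 in one instruction.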
 2:
         ld1             {v1.s}[0], [x2], #4
         ld1             {v0.h}[0], [x0]
         subs            w4,  w4,  #2
@@ -880,17 +861,16 @@ 2:
         sub             v1.4h,   v0.4h,   v1.4h   // a - b
         sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
         add             v0.4h,   v0.4h,   v1.4h
         st1             {v0.h}[0], [x0],  x1
         st1             {v0.h}[1], [x8],  x1
         b.gt            2b
         ret
 40:
-        AARCH64_VALID_JUMP_TARGET
         ld1r            {v2.2s}, [x5]
         sub             x1,  x1,  #4
         neg             v2.8b,   v2.8b            // -m
         sxtl            v2.8h,   v2.8b
         shl             v2.8h,   v2.8h,   #9      // -m << 9
 4:
         ld1             {v1.8h},   [x2], #16
         ld1             {v0.d}[0], [x0]
@@ -901,17 +881,16 @@ 4:
         add             v0.8h,   v0.8h,   v1.8h
         st1             {v0.s}[0], [x0], #4
         st1             {v0.s}[2], [x8], #4
         st1             {v0.h}[2], [x0], x1
         st1             {v0.h}[6], [x8], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v4.8b}, [x5]
         sub             x1,  x1,  #8
         neg             v4.8b,   v4.8b            // -m
         sxtl            v4.8h,   v4.8b
         shl             v4.8h,   v4.8h,   #9      // -m << 9
 8:
         ld1             {v2.8h, v3.8h}, [x2], #32
         ld1             {v0.8h}, [x0]
@@ -925,17 +904,16 @@ 8:
         add             v1.8h,   v1.8h,   v3.8h
         st1             {v0.d}[0], [x0], #8
         st1             {v1.d}[0], [x8], #8
         st1             {v0.s}[2], [x0], x1
         st1             {v1.s}[2], [x8], x1
         b.gt            8b
         ret
 160:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v16.16b}, [x5]
         sub             x1,  x1,  #16
         neg             v17.16b, v16.16b          // -m
         sxtl            v16.8h,  v17.8b
         sxtl2           v17.8h,  v17.16b
         shl             v16.8h,  v16.8h,  #9      // -m << 9
         shl             v17.4h,  v17.4h,  #9
 16:
@@ -957,17 +935,16 @@ 16:
         add             v3.4h,   v3.4h,   v7.4h
         st1             {v0.8h}, [x0], #16
         st1             {v2.8h}, [x8], #16
         st1             {v1.4h}, [x0], x1
         st1             {v3.4h}, [x8], x1
         b.gt            16b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v24.16b, v25.16b},  [x5]
         neg             v26.16b, v24.16b          // -m
         neg             v27.8b,  v25.8b
         sxtl            v24.8h,  v26.8b
         sxtl2           v25.8h,  v26.16b
         sxtl            v26.8h,  v27.8b
         shl             v24.8h,  v24.8h,  #9      // -m << 9
         shl             v25.8h,  v25.8h,  #9
@@ -1013,90 +990,83 @@ endfunc
 // and assumes that x9 is set to (clz(w)-24).
 function put_neon
         adr             x10, L(put_tbl)
         ldrh            w9, [x10, x9, lsl #1]
         sub             x10, x10, w9, uxtw
         br              x10
 
 2:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.s}[0], [x2], x3
         ld1             {v1.s}[0], [x2], x3
         subs            w5,  w5,  #2
         st1             {v0.s}[0], [x0], x1
         st1             {v1.s}[0], [x0], x1
         b.gt            2b
         ret
 4:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.4h}, [x2], x3
         ld1             {v1.4h}, [x2], x3
         subs            w5,  w5,  #2
         st1             {v0.4h}, [x0], x1
         st1             {v1.4h}, [x0], x1
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         add             x8,  x0,  x1
         lsl             x1,  x1,  #1
         add             x9,  x2,  x3
         lsl             x3,  x3,  #1
 8:
         ld1             {v0.8h}, [x2], x3
         ld1             {v1.8h}, [x9], x3
         subs            w5,  w5,  #2
         st1             {v0.8h}, [x0], x1
         st1             {v1.8h}, [x8], x1
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ldp             x6,  x7,  [x2]
         ldp             x8,  x9,  [x2, #16]
         stp             x6,  x7,  [x0]
         subs            w5,  w5,  #1
         stp             x8,  x9,  [x0, #16]
         add             x2,  x2,  x3
         add             x0,  x0,  x1
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ldp             x6,  x7,  [x2]
         ldp             x8,  x9,  [x2, #16]
         stp             x6,  x7,  [x0]
         ldp             x10, x11, [x2, #32]
         stp             x8,  x9,  [x0, #16]
         subs            w5,  w5,  #1
         ldp             x12, x13, [x2, #48]
         stp             x10, x11, [x0, #32]
         stp             x12, x13, [x0, #48]
         add             x2,  x2,  x3
         add             x0,  x0,  x1
         b.gt            32b
         ret
 64:
-        AARCH64_VALID_JUMP_TARGET
         ldp             q0,  q1,  [x2]
         ldp             q2,  q3,  [x2, #32]
         stp             q0,  q1,  [x0]
         ldp             q4,  q5,  [x2, #64]
         stp             q2,  q3,  [x0, #32]
         ldp             q6,  q7,  [x2, #96]
         subs            w5,  w5,  #1
         stp             q4,  q5,  [x0, #64]
         stp             q6,  q7,  [x0, #96]
         add             x2,  x2,  x3
         add             x0,  x0,  x1
         b.gt            64b
         ret
 128:
-        AARCH64_VALID_JUMP_TARGET
         ldp             q0,  q1,  [x2]
         ldp             q2,  q3,  [x2, #32]
         stp             q0,  q1,  [x0]
         ldp             q4,  q5,  [x2, #64]
         stp             q2,  q3,  [x0, #32]
         ldp             q6,  q7,  [x2, #96]
         subs            w5,  w5,  #1
         stp             q4,  q5,  [x0, #64]
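
put_neon, prep_neon and the later splat_mv_neon all use the same dispatch idiom: a table of 16-bit offsets (`L(put_tbl)` and friends), indexed by clz(w)-24 so wider blocks get lower indices, with the branch target computed as table base minus offset and entered via `br`. A rough C analogue (a sketch; the function-pointer table and names are hypothetical, and `__builtin_clz` is the GCC/Clang intrinsic):

#include <stddef.h>
#include <stdint.h>

typedef void (*copy_rows_fn)(uint16_t *dst, ptrdiff_t dst_stride,
                             const uint16_t *src, ptrdiff_t src_stride, int h);

/* Index 0 handles w=128, 1 handles w=64, ..., 6 handles w=2, mirroring
 * "ldrh w9, [x10, x9, lsl #1]; sub x10, x10, w9, uxtw; br x10". */
static void put_dispatch(const copy_rows_fn tbl[7],
                         uint16_t *dst, ptrdiff_t dst_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         int w, int h)
{
    int idx = __builtin_clz((unsigned)w) - 24;   /* the "clz(w)-24" kept in x9 */
    tbl[idx](dst, dst_stride, src, src_stride, h);
}
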
@@ -1132,45 +1102,42 @@ function prep_neon
         adr             x10, L(prep_tbl)
         ldrh            w9, [x10, x9, lsl #1]
         dup             v31.8h,  w7   // intermediate_bits
         movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
         sub             x10, x10, w9, uxtw
         br              x10
 
 40:
-        AARCH64_VALID_JUMP_TARGET
         add             x9,  x1,  x2
         lsl             x2,  x2,  #1
 4:
         ld1             {v0.d}[0], [x1], x2
         ld1             {v0.d}[1], [x9], x2
         subs            w4,  w4,  #2
         sshl            v0.8h,   v0.8h,   v31.8h
         sub             v0.8h,   v0.8h,   v30.8h
         st1             {v0.8h}, [x0], #16
         b.gt            4b
         ret
 80:
-        AARCH64_VALID_JUMP_TARGET
         add             x9,  x1,  x2
         lsl             x2,  x2,  #1
 8:
         ld1             {v0.8h}, [x1], x2
         ld1             {v1.8h}, [x9], x2
         subs            w4,  w4,  #2
         sshl            v0.8h,   v0.8h,   v31.8h
         sshl            v1.8h,   v1.8h,   v31.8h
         sub             v0.8h,   v0.8h,   v30.8h
         sub             v1.8h,   v1.8h,   v30.8h
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            8b
         ret
 16:
-        AARCH64_VALID_JUMP_TARGET
         ldp             q0,  q1,  [x1]
         add             x1,  x1,  x2
         sshl            v0.8h,   v0.8h,   v31.8h
         ldp             q2,  q3,  [x1]
         add             x1,  x1,  x2
         subs            w4,  w4,  #2
         sshl            v1.8h,   v1.8h,   v31.8h
         sshl            v2.8h,   v2.8h,   v31.8h
@@ -1178,34 +1145,32 @@ 16:
         sub             v0.8h,   v0.8h,   v30.8h
         sub             v1.8h,   v1.8h,   v30.8h
         sub             v2.8h,   v2.8h,   v30.8h
         sub             v3.8h,   v3.8h,   v30.8h
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         b.gt            16b
         ret
 32:
-        AARCH64_VALID_JUMP_TARGET
         ldp             q0,  q1,  [x1]
         sshl            v0.8h,   v0.8h,   v31.8h
         ldp             q2,  q3,  [x1, #32]
         add             x1,  x1,  x2
         sshl            v1.8h,   v1.8h,   v31.8h
         sshl            v2.8h,   v2.8h,   v31.8h
         sshl            v3.8h,   v3.8h,   v31.8h
         subs            w4,  w4,  #1
         sub             v0.8h,   v0.8h,   v30.8h
         sub             v1.8h,   v1.8h,   v30.8h
         sub             v2.8h,   v2.8h,   v30.8h
         sub             v3.8h,   v3.8h,   v30.8h
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
         b.gt            32b
         ret
 64:
-        AARCH64_VALID_JUMP_TARGET
         ldp             q0,  q1,  [x1]
         subs            w4,  w4,  #1
         sshl            v0.8h,   v0.8h,   v31.8h
         ldp             q2,  q3,  [x1, #32]
         sshl            v1.8h,   v1.8h,   v31.8h
         ldp             q4,  q5,  [x1, #64]
         sshl            v2.8h,   v2.8h,   v31.8h
         sshl            v3.8h,   v3.8h,   v31.8h
@@ -1226,17 +1191,16 @@ 64:
         sub             v6.8h,   v6.8h,   v30.8h
         sub             v7.8h,   v7.8h,   v30.8h
         stp             q4,  q5,  [x0, #64]
         stp             q6,  q7,  [x0, #96]
         add             x0,  x0,  x8
         b.gt            64b
         ret
 128:
-        AARCH64_VALID_JUMP_TARGET
         ldp             q0,  q1,  [x1]
         subs            w4,  w4,  #1
         sshl            v0.8h,   v0.8h,   v31.8h
         ldp             q2,  q3,  [x1, #32]
         sshl            v1.8h,   v1.8h,   v31.8h
         ldp             q4,  q5,  [x1, #64]
         sshl            v2.8h,   v2.8h,   v31.8h
         sshl            v3.8h,   v3.8h,   v31.8h
@@ -1584,17 +1548,16 @@ 4:
 .endif
         sub             x10, x10, w9, uxtw
 .ifc \type, put
         neg             v29.8h,  v29.8h        // -intermediate_bits
 .endif
         br              x10
 
 20:     // 2xN h
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \xmx,  \xmx,  #2
         ld1             {v0.s}[0], [\xmx]
         sub             \src,  \src,  #2
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
@@ -1619,17 +1582,16 @@ 2:
         umin            v3.4h,   v3.4h,   v31.4h
         st1             {v3.s}[0], [\dst], \d_strd
         st1             {v3.s}[1], [\ds2], \d_strd
         b.gt            2b
         ret
 .endif
 
 40:     // 4xN h
-        AARCH64_VALID_JUMP_TARGET
         add             \xmx,  \xmx,  #2
         ld1             {v0.s}[0], [\xmx]
         sub             \src,  \src,  #2
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
         sxtl            v0.8h,   v0.8b
@@ -1668,17 +1630,16 @@ 4:
         b.gt            4b
         ret
 
 80:
 160:
 320:
 640:
 1280:   // 8xN, 16xN, 32xN, ... h
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmx]
         sub             \src,  \src,  #6
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         sxtl            v0.8h,   v0.8b
 
         sub             \s_strd,  \s_strd,  \w, uxtw #1
@@ -1775,17 +1736,16 @@ 4:
         ldrh            w9,  [x10, x9, lsl #1]
 .ifc \type, prep
         neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
 .endif
         sub             x10, x10, w9, uxtw
         br              x10
 
 20:     // 2xN v
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         b.gt            28f
 
         cmp             \h,  #2
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src,  \src,  \s_strd
         add             \ds2,  \dst,  \d_strd
@@ -1849,17 +1809,16 @@ 216:
         mov             v6.16b,  v22.16b
         mov             v7.16b,  v23.16b
         b               216b
 0:
         ret
 .endif
 
 40:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            480f
 
         // 4x2, 4x4 v
         cmp             \h,  #2
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
@@ -1907,17 +1866,16 @@ 48:
         mov             v20.8b,  v24.8b
         mov             v21.8b,  v25.8b
         mov             v22.8b,  v26.8b
         b               48b
 0:
         ret
 
 80:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            880f
 
         // 8x2, 8x4 v
         cmp             \h,  #2
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
@@ -1942,17 +1900,16 @@ 80:
 0:
         ret
 
 880:    // 8x6, 8x8, 8x16, 8x32 v
 1680:   // 16x8, 16x16, ...
 320:    // 32x8, 32x16, ...
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b}, [\xmy]
         sub             \src, \src, \s_strd
         sub             \src, \src, \s_strd, lsl #1
         sxtl            v0.8h,   v0.8b
         mov             \my,  \h
 168:
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
@@ -1997,17 +1954,16 @@ 9:
         mov             \h,  \my
         add             \src, \src, #16
         add             \dst, \dst, #16
         b               168b
 0:
         ret
 
 160:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            1680b
 
         // 16x2, 16x4 v
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src, \src, \s_strd
         sxtl            v0.8h,   v0.8b
 
@@ -2061,17 +2017,16 @@ 4:
 .endif
         sub             x10, x10, w9, uxtw
 .ifc \type, put
         neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
 .endif
         br              x10
 
 20:
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \xmx,  \xmx,  #2
         ld1             {v0.s}[0],  [\xmx]
         b.gt            280f
         add             \xmy,  \xmy,  #2
         ld1             {v1.s}[0],  [\xmy]
 
         // 2x2, 2x4 hv
@@ -2179,17 +2134,17 @@ 28:
         mov             v17.8b,  v19.8b
         mov             v18.8b,  v20.8b
         mov             v19.8b,  v21.8b
         mov             v20.8b,  v22.8b
         mov             v21.8b,  v24.8b
         b               28b
 
 0:
-        ret             x15
+        br              x15
 
 L(\type\()_8tap_filter_2):
         ld1             {v25.8h},  [\sr2], \s_strd
         ld1             {v27.8h},  [\src], \s_strd
         ext             v26.16b, v25.16b, v25.16b, #2
         ext             v28.16b, v27.16b, v27.16b, #2
         trn1            v24.2s,  v25.2s,  v27.2s
         trn2            v27.2s,  v25.2s,  v27.2s
@@ -2200,17 +2155,16 @@ L(\type\()_8tap_filter_2):
         smlal           v24.4s,  v27.4h,  v0.h[2]
         smlal           v24.4s,  v28.4h,  v0.h[3]
         srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
         xtn             v24.4h,  v24.4s
         ret
 .endif
 
 40:
-        AARCH64_VALID_JUMP_TARGET
         add             \xmx, \xmx, #2
         ld1             {v0.s}[0],  [\xmx]
         b.gt            480f
         add             \xmy, \xmy,  #2
         ld1             {v1.s}[0],  [\xmy]
         sub             \sr2, \src, #2
         sub             \src, \sr2, \s_strd
         add             \ds2, \dst, \d_strd
@@ -2345,17 +2299,17 @@ 48:
         mov             v17.8b,  v19.8b
         mov             v18.8b,  v20.8b
         mov             v19.8b,  v21.8b
         mov             v20.8b,  v22.8b
         mov             v21.8b,  v24.8b
         mov             v22.8b,  v25.8b
         b               48b
 0:
-        ret             x15
+        br              x15
 
 L(\type\()_8tap_filter_4):
         ld1             {v24.8h}, [\sr2], \s_strd
         ld1             {v25.8h}, [\src], \s_strd
         ext             v26.16b, v24.16b, v24.16b, #2
         ext             v27.16b, v24.16b, v24.16b, #4
         ext             v28.16b, v24.16b, v24.16b, #6
         smull           v24.4s,  v24.4h,  v0.h[0]
@@ -2373,17 +2327,16 @@ L(\type\()_8tap_filter_4):
         srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
         xtn             v24.4h,  v24.4s
         xtn             v25.4h,  v25.4s
         ret
 
 80:
 160:
 320:
-        AARCH64_VALID_JUMP_TARGET
         b.gt            880f
         add             \xmy,  \xmy,  #2
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.s}[0],  [\xmy]
         sub             \src,  \src,  #6
         sub             \src,  \src,  \s_strd
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
@@ -2474,17 +2427,16 @@ 9:
         mov             \h,  \my
         add             \src,  \src,  #16
         add             \dst,  \dst,  #16
         b               164b
 
 880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.8b},  [\xmy]
         sub             \src,  \src,  #6
         sub             \src,  \src,  \s_strd
         sub             \src,  \src,  \s_strd, lsl #1
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
         mov             x15, x30
@@ -2597,17 +2549,17 @@ 9:
         msub            \src,  \s_strd,  \xmy,  \src
         msub            \dst,  \d_strd,  \xmy,  \dst
         sub             \src,  \src,  \s_strd,  lsl #3
         mov             \h,  \my
         add             \src,  \src,  #16
         add             \dst,  \dst,  #16
         b               168b
 0:
-        ret             x15
+        br              x15
 
 L(\type\()_8tap_filter_8):
         ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
         ld1             {v6.8h, v7.8h},  [\src], \s_strd
         smull           v25.4s,  v4.4h,   v0.h[0]
         smull2          v26.4s,  v4.8h,   v0.h[0]
         smull           v27.4s,  v6.4h,   v0.h[0]
         smull2          v28.4s,  v6.8h,   v0.h[0]
@@ -2682,17 +2634,16 @@ L(\type\()_bilin_h):
 .endif
         sub             x10, x10, w9, uxtw
 .ifc \type, put
         neg             v30.8h,  v30.8h   // -intermediate_bits
 .endif
         br              x10
 
 20:     // 2xN h
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
 2:
         ld1             {v4.4h},  [\src], \s_strd
         ld1             {v6.4h},  [\sr2], \s_strd
@@ -2707,17 +2658,16 @@ 2:
         urshl           v4.4h,   v4.4h,   v30.4h
         st1             {v4.s}[0], [\dst], \d_strd
         st1             {v4.s}[1], [\ds2], \d_strd
         b.gt            2b
         ret
 .endif
 
 40:     // 4xN h
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
 4:
         ld1             {v4.8h}, [\src], \s_strd
         ld1             {v6.8h}, [\sr2], \s_strd
         ext             v5.16b,  v4.16b,  v4.16b,  #2
@@ -2734,17 +2684,16 @@ 4:
         sub             v4.8h,   v4.8h,   v29.8h
 .endif
         st1             {v4.d}[0], [\dst], \d_strd
         st1             {v4.d}[1], [\ds2], \d_strd
         b.gt            4b
         ret
 
 80:     // 8xN h
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \d_strd,  \d_strd,  #1
         lsl             \s_strd,  \s_strd,  #1
 8:
         ldr             h5,  [\src, #16]
         ldr             h7,  [\sr2, #16]
         ld1             {v4.8h}, [\src], \s_strd
@@ -2768,17 +2717,16 @@ 8:
         st1             {v4.8h}, [\dst], \d_strd
         st1             {v6.8h}, [\ds2], \d_strd
         b.gt            8b
         ret
 160:
 320:
 640:
 1280:   // 16xN, 32xN, ... h
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
 
         sub             \s_strd,  \s_strd,  \w, uxtw #1
         sub             \s_strd,  \s_strd,  #16
 .ifc \type, put
         lsl             \d_strd,  \d_strd,  #1
@@ -2859,17 +2807,16 @@ L(\type\()_bilin_v):
 .ifc \type, prep
         movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
         neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
 .endif
         sub             x10, x10, w9, uxtw
         br              x10
 
 20:     // 2xN v
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         cmp             \h,  #2
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         lsl             \d_strd,  \d_strd,  #1
 
         // 2x2 v
@@ -2907,17 +2854,16 @@ 24:     // 2x4, 2x8, ... v
         b.le            0f
         mov             v16.8b,  v20.8b
         b               24b
 0:
         ret
 .endif
 
 40:     // 4xN v
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         lsl             \d_strd,  \d_strd,  #1
         ld1             {v16.4h}, [\src], \s_strd
 4:
         ld1             {v17.4h}, [\sr2], \s_strd
         ld1             {v18.4h}, [\src], \s_strd
@@ -2936,17 +2882,16 @@ 4:
         st1             {v4.d}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.8b,  v18.8b
         b               4b
 0:
         ret
 
 80:     // 8xN v
-        AARCH64_VALID_JUMP_TARGET
         add             \ds2,  \dst,  \d_strd
         add             \sr2,  \src,  \s_strd
         lsl             \s_strd,  \s_strd,  #1
         lsl             \d_strd,  \d_strd,  #1
         ld1             {v16.8h}, [\src], \s_strd
 8:
         ld1             {v17.8h}, [\sr2], \s_strd
         ld1             {v18.8h}, [\src], \s_strd
@@ -2971,17 +2916,16 @@ 8:
         b               8b
 0:
         ret
 
 160:    // 16xN, 32xN, ...
 320:
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         mov             \my, \h
 1:
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
         ld1             {v16.8h, v17.8h}, [\src], \s_strd
@@ -3055,17 +2999,16 @@ L(\type\()_bilin_hv):
 .endif
         sub             x10, x10, w9, uxtw
 .ifc \type, put
         neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
 .endif
         br              x10
 
 20:     // 2xN hv
-        AARCH64_VALID_JUMP_TARGET
 .ifc \type, put
         add             \sr2, \src, \s_strd
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
         ld1             {v20.4h},  [\src], \s_strd
         ext             v21.8b,  v20.8b,  v20.8b,  #2
@@ -3096,17 +3039,16 @@ 2:
         b.le            0f
         trn2            v16.2s,  v17.2s,  v17.2s
         b               2b
 0:
         ret
 .endif
 
 40:     // 4xN hv
-        AARCH64_VALID_JUMP_TARGET
         add             \sr2, \src, \s_strd
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
         ld1             {v20.8h},  [\src], \s_strd
         ext             v21.16b, v20.16b, v20.16b, #2
         mul             v16.4h,  v20.4h,  v0.4h
@@ -3149,17 +3091,16 @@ 4:
 0:
         ret
 
 80:     // 8xN, 16xN, ... hv
 160:
 320:
 640:
 1280:
-        AARCH64_VALID_JUMP_TARGET
         mov             \my, \h
 
 1:
         add             \sr2, \src, \s_strd
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
 
@@ -3452,17 +3393,17 @@ 1:
         add             w6,  w6,  w4
         b.gt            1b
 
         ldp             d14, d15, [sp, #0x30]
         ldp             d12, d13, [sp, #0x20]
         ldp             d10, d11, [sp, #0x10]
         ldp             d8,  d9,  [sp], 0x40
 
-        ret             x15
+        br              x15
 endfunc
 .endm
 
 warp
 warp t
 
 // void dav1d_emu_edge_16bpc_neon(
 //         const intptr_t bw, const intptr_t bh,
--- a/third_party/dav1d/src/arm/64/refmvs.S
+++ b/third_party/dav1d/src/arm/64/refmvs.S
@@ -46,42 +46,36 @@ function splat_mv_neon, export=1
         ext             v2.16b,  v2.16b,  v3.16b,  #12
 1:
         ldr             x1,  [x0],  #8
         subs            w4,  w4,  #1
         add             x1,  x1,  x2
         br              x3
 
 10:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.8b}, [x1]
         str             s2,  [x1, #8]
         b.gt            1b
         ret
 20:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.16b}, [x1]
         str             d1,  [x1, #16]
         b.gt            1b
         ret
 320:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
         st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
         st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
         st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
 160:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
         st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
 80:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
 40:
-        AARCH64_VALID_JUMP_TARGET
         st1             {v0.16b, v1.16b, v2.16b}, [x1]
         b.gt            1b
         ret
 
 L(splat_tbl):
         .hword L(splat_tbl) -  320b
         .hword L(splat_tbl) -  160b
         .hword L(splat_tbl) -   80b
--- a/third_party/dav1d/src/arm/asm.S
+++ b/third_party/dav1d/src/arm/asm.S
@@ -25,52 +25,16 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef DAV1D_SRC_ARM_ASM_S
 #define DAV1D_SRC_ARM_ASM_S
 
 #include "config.h"
 
-#if ARCH_AARCH64
-#define x18 do_not_use_x18
-#define w18 do_not_use_w18
-
-/* Support macros for the Armv8.5-A Branch Target Identification feature which
- * requires emitting a .note.gnu.property section with the appropriate
- * architecture-dependent feature bits set.
- * Read more: "ELF for the Arm® 64-bit Architecture"
- */
-#if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1)
-#define GNU_PROPERTY_AARCH64_BTI (1 << 0)   // Has Branch Target Identification
-#define AARCH64_VALID_JUMP_CALL_TARGET hint #38  // BTI 'jc'
-#define AARCH64_VALID_CALL_TARGET      hint #34  // BTI 'c'
-#define AARCH64_VALID_JUMP_TARGET      hint #36  // BTI 'j'
-#else
-#define GNU_PROPERTY_AARCH64_BTI 0          // No Branch Target Identification
-#define AARCH64_VALID_JUMP_CALL_TARGET
-#define AARCH64_VALID_CALL_TARGET
-#define AARCH64_VALID_JUMP_TARGET
-#endif
-
-#if (GNU_PROPERTY_AARCH64_BTI != 0)
-        .pushsection .note.gnu.property, "a"
-        .balign 8
-        .long 4
-        .long 0x10
-        .long 0x5
-        .asciz "GNU"
-        .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
-        .long 4
-        .long GNU_PROPERTY_AARCH64_BTI
-        .long 0
-        .popsection
-#endif
-#endif
-
 #if ARCH_ARM
         .syntax unified
 #ifdef __ELF__
         .arch armv7-a
         .fpu neon
         .eabi_attribute 10, 0           // suppress Tag_FP_arch
         .eabi_attribute 12, 0           // suppress Tag_Advanced_SIMD_arch
         .section .note.GNU-stack,"",%progbits // Mark stack as non-executable
@@ -141,21 +105,16 @@ EXTERN\name:
 #ifdef __ELF__
         .type \name, %function
 #endif
 #if HAVE_AS_FUNC
         .func \name
 #endif
     .endif
 \name:
-#if ARCH_AARCH64
-    .if \export
-         AARCH64_VALID_CALL_TARGET
-    .endif
-#endif
 .endm
 
 .macro  const   name, export=0, align=2
     .macro endconst
 #ifdef __ELF__
         .size   \name, . - \name
 #endif
         .purgem endconst
@@ -183,10 +142,14 @@ EXTERN\name:
 #ifdef __APPLE__
 #define L(x) L ## x
 #else
 #define L(x) .L ## x
 #endif
 
 #define X(x) CONCAT(EXTERN, x)
 
+#if ARCH_AARCH64
+#define x18 do_not_use_x18
+#define w18 do_not_use_w18
+#endif
 
 #endif /* DAV1D_SRC_ARM_ASM_S */
--- a/third_party/dav1d/src/arm/cdef_init_tmpl.c
+++ b/third_party/dav1d/src/arm/cdef_init_tmpl.c
@@ -26,52 +26,48 @@
 
 #include "src/cpu.h"
 #include "src/cdef.h"
 
 decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, neon));
 
 void BF(dav1d_cdef_padding4, neon)(uint16_t *tmp, const pixel *src,
                                    ptrdiff_t src_stride, const pixel (*left)[2],
-                                   const pixel *const top,
-                                   const pixel *const bottom, int h,
+                                   const pixel *const top, int h,
                                    enum CdefEdgeFlags edges);
 void BF(dav1d_cdef_padding8, neon)(uint16_t *tmp, const pixel *src,
                                    ptrdiff_t src_stride, const pixel (*left)[2],
-                                   const pixel *const top,
-                                   const pixel *const bottom, int h,
+                                   const pixel *const top, int h,
                                    enum CdefEdgeFlags edges);
 
 // Passing edges to this function, to allow it to switch to a more
 // optimized version for fully edged cases. Using size_t for edges,
 // to avoid ABI differences for passing more than one argument on the stack.
 void BF(dav1d_cdef_filter4, neon)(pixel *dst, ptrdiff_t dst_stride,
                                   const uint16_t *tmp, int pri_strength,
                                   int sec_strength, int dir, int damping, int h,
                                   size_t edges HIGHBD_DECL_SUFFIX);
 void BF(dav1d_cdef_filter8, neon)(pixel *dst, ptrdiff_t dst_stride,
                                   const uint16_t *tmp, int pri_strength,
                                   int sec_strength, int dir, int damping, int h,
                                   size_t edges HIGHBD_DECL_SUFFIX);
 
 #define DEFINE_FILTER(w, h, tmp_stride)                                      \
 static void                                                                  \
-cdef_filter_##w##x##h##_neon(pixel *dst, const ptrdiff_t stride,             \
-                             const pixel (*left)[2],                         \
-                             const pixel *const top,                         \
-                             const pixel *const bottom,                      \
+cdef_filter_##w##x##h##_neon(pixel *dst,                                     \
+                             const ptrdiff_t stride,                         \
+                             const pixel (*left)[2], const pixel *const top, \
                              const int pri_strength, const int sec_strength, \
                              const int dir, const int damping,               \
                              const enum CdefEdgeFlags edges                  \
                              HIGHBD_DECL_SUFFIX)                             \
 {                                                                            \
     ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,);                   \
     uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8;                            \
-    BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride,                        \
-                                    left, top, bottom, h, edges);            \
+    BF(dav1d_cdef_padding##w, neon)(tmp, dst, stride, left, top, h, edges);  \
     BF(dav1d_cdef_filter##w, neon)(dst, stride, tmp, pri_strength,           \
                                    sec_strength, dir, damping, h, edges      \
                                    HIGHBD_TAIL_SUFFIX);                      \
 }
 
 DEFINE_FILTER(8, 8, 16)
 DEFINE_FILTER(4, 8, 8)
 DEFINE_FILTER(4, 4, 8)
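
For reference, DEFINE_FILTER(4, 4, 8) as restored above expands to roughly the following wrapper (a sketch: BF() resolves to the bitdepth-suffixed symbol, and the ALIGN_STK_16/HIGHBD_* macros are left unexpanded):

static void
cdef_filter_4x4_neon(pixel *dst,
                     const ptrdiff_t stride,
                     const pixel (*left)[2], const pixel *const top,
                     const int pri_strength, const int sec_strength,
                     const int dir, const int damping,
                     const enum CdefEdgeFlags edges
                     HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint16_t, tmp_buf, 12 * 8 + 8,);   /* tmp_stride == 8 */
    uint16_t *tmp = tmp_buf + 2 * 8 + 8;
    BF(dav1d_cdef_padding4, neon)(tmp, dst, stride, left, top, 4, edges);
    BF(dav1d_cdef_filter4, neon)(dst, stride, tmp, pri_strength,
                                 sec_strength, dir, damping, 4, edges
                                 HIGHBD_TAIL_SUFFIX);
}
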
--- a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c
+++ b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c
@@ -24,24 +24,26 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/cpu.h"
 #include "src/looprestoration.h"
 
 #if ARCH_AARCH64
-void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
-                                    const pixel (*left)[4], const pixel *lpf,
+void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t p_stride,
+                                    const pixel (*left)[4],
+                                    const pixel *lpf, const ptrdiff_t lpf_stride,
                                     const int w, int h,
                                     const LooprestorationParams *const params,
                                     const enum LrEdgeFlags edges
                                     HIGHBD_DECL_SUFFIX);
-void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
-                                    const pixel (*left)[4], const pixel *lpf,
+void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t p_stride,
+                                    const pixel (*left)[4],
+                                    const pixel *lpf, const ptrdiff_t lpf_stride,
                                     const int w, int h,
                                     const LooprestorationParams *const params,
                                     const enum LrEdgeFlags edges
                                     HIGHBD_DECL_SUFFIX);
 #else
 
 // The 8bpc version calculates things slightly differently than the reference
 // C version. That version calculates roughly this:
@@ -69,41 +71,42 @@ void BF(dav1d_wiener_filter_h, neon)(int
 //     sum += mid[idx] * fv[i];
 // sum = (sum + rounding_off_v) >> round_bits_v;
 // This function assumes that the width is a multiple of 8.
 void BF(dav1d_wiener_filter_v, neon)(pixel *dst, ptrdiff_t stride,
                                      const int16_t *mid, int w, int h,
                                      const int16_t fv[8], enum LrEdgeFlags edges,
                                      ptrdiff_t mid_stride HIGHBD_DECL_SUFFIX);
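
A scalar reading of the vertical pass the comment above sketches, for the 8 bpc case (a sketch; the AV1 Wiener filter is 7-tap, and the rounding parameters and clamp here are placeholders rather than dav1d's exact definitions):

#include <stddef.h>
#include <stdint.h>

static void wiener_v_scalar(uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *mid, ptrdiff_t mid_stride,
                            int w, int h, const int16_t fv[8],
                            int round_bits_v, int rounding_off_v)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int i = 0; i < 7; i++)          /* 7-tap vertical filter */
                sum += mid[(y + i) * mid_stride + x] * fv[i];
            sum = (sum + rounding_off_v) >> round_bits_v;
            dst[y * dst_stride + x] =
                (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
        }
    }
}
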
 
-static void wiener_filter_neon(pixel *const dst, const ptrdiff_t stride,
-                               const pixel (*const left)[4], const pixel *lpf,
+static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                               const pixel (*const left)[4],
+                               const pixel *lpf, const ptrdiff_t lpf_stride,
                                const int w, const int h,
                                const LooprestorationParams *const params,
                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     const int16_t (*const filter)[8] = params->filter;
     ALIGN_STK_16(int16_t, mid, 68 * 384,);
     int mid_stride = (w + 7) & ~7;
 
     // Horizontal filter
-    BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, stride,
+    BF(dav1d_wiener_filter_h, neon)(&mid[2 * mid_stride], left, dst, dst_stride,
                                     filter[0], w, h, edges HIGHBD_TAIL_SUFFIX);
     if (edges & LR_HAVE_TOP)
-        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, stride,
+        BF(dav1d_wiener_filter_h, neon)(mid, NULL, lpf, lpf_stride,
                                         filter[0], w, 2, edges
                                         HIGHBD_TAIL_SUFFIX);
     if (edges & LR_HAVE_BOTTOM)
         BF(dav1d_wiener_filter_h, neon)(&mid[(2 + h) * mid_stride], NULL,
-                                        lpf + 6 * PXSTRIDE(stride),
-                                        stride, filter[0], w, 2, edges
+                                        lpf + 6 * PXSTRIDE(lpf_stride),
+                                        lpf_stride, filter[0], w, 2, edges
                                         HIGHBD_TAIL_SUFFIX);
 
     // Vertical filter
-    BF(dav1d_wiener_filter_v, neon)(dst, stride, &mid[2*mid_stride],
+    BF(dav1d_wiener_filter_v, neon)(dst, dst_stride, &mid[2*mid_stride],
                                     w, h, filter[1], edges,
                                     mid_stride * sizeof(*mid)
                                     HIGHBD_TAIL_SUFFIX);
 }
 #endif
 
 void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
                                 const pixel (*left)[4],
@@ -119,35 +122,36 @@ void dav1d_sgr_calc_ab1_neon(int32_t *a,
 void BF(dav1d_sgr_finish_filter1, neon)(int16_t *tmp,
                                         const pixel *src, const ptrdiff_t stride,
                                         const int32_t *a, const int16_t *b,
                                         const int w, const int h);
 
 /* filter with a 3x3 box (radius=1) */
 static void dav1d_sgr_filter1_neon(int16_t *tmp,
                                    const pixel *src, const ptrdiff_t stride,
-                                   const pixel (*left)[4], const pixel *lpf,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
                                    const int w, const int h, const int strength,
                                    const enum LrEdgeFlags edges
                                    HIGHBD_DECL_SUFFIX)
 {
     ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
     int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
     ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
     int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
 
     BF(dav1d_sgr_box3_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
     if (edges & LR_HAVE_TOP)
         BF(dav1d_sgr_box3_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
-                                   NULL, lpf, stride, w, 2, edges);
+                                   NULL, lpf, lpf_stride, w, 2, edges);
 
     if (edges & LR_HAVE_BOTTOM)
         BF(dav1d_sgr_box3_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
-                                   NULL, lpf + 6 * PXSTRIDE(stride),
-                                   stride, w, 2, edges);
+                                   NULL, lpf + 6 * PXSTRIDE(lpf_stride),
+                                   lpf_stride, w, 2, edges);
 
     dav1d_sgr_box3_v_neon(sumsq, sum, w, h, edges);
     dav1d_sgr_calc_ab1_neon(a, b, w, h, strength, BITDEPTH_MAX);
     BF(dav1d_sgr_finish_filter1, neon)(tmp, src, stride, a, b, w, h);
 }
 
 void BF(dav1d_sgr_box5_h, neon)(int32_t *sumsq, int16_t *sum,
                                 const pixel (*left)[4],
@@ -163,91 +167,95 @@ void dav1d_sgr_calc_ab2_neon(int32_t *a,
 void BF(dav1d_sgr_finish_filter2, neon)(int16_t *tmp,
                                         const pixel *src, const ptrdiff_t stride,
                                         const int32_t *a, const int16_t *b,
                                         const int w, const int h);
 
 /* filter with a 5x5 box (radius=2) */
 static void dav1d_sgr_filter2_neon(int16_t *tmp,
                                    const pixel *src, const ptrdiff_t stride,
-                                   const pixel (*left)[4], const pixel *lpf,
+                                   const pixel (*left)[4],
+                                   const pixel *lpf, const ptrdiff_t lpf_stride,
                                    const int w, const int h, const int strength,
                                    const enum LrEdgeFlags edges
                                    HIGHBD_DECL_SUFFIX)
 {
     ALIGN_STK_16(int32_t, sumsq_mem, (384 + 16) * 68 + 8,);
     int32_t *const sumsq = &sumsq_mem[(384 + 16) * 2 + 8], *const a = sumsq;
     ALIGN_STK_16(int16_t, sum_mem, (384 + 16) * 68 + 16,);
     int16_t *const sum = &sum_mem[(384 + 16) * 2 + 16], *const b = sum;
 
     BF(dav1d_sgr_box5_h, neon)(sumsq, sum, left, src, stride, w, h, edges);
     if (edges & LR_HAVE_TOP)
         BF(dav1d_sgr_box5_h, neon)(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
-                                   NULL, lpf, stride, w, 2, edges);
+                                   NULL, lpf, lpf_stride, w, 2, edges);
 
     if (edges & LR_HAVE_BOTTOM)
         BF(dav1d_sgr_box5_h, neon)(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
-                                   NULL, lpf + 6 * PXSTRIDE(stride),
-                                   stride, w, 2, edges);
+                                   NULL, lpf + 6 * PXSTRIDE(lpf_stride),
+                                   lpf_stride, w, 2, edges);
 
     dav1d_sgr_box5_v_neon(sumsq, sum, w, h, edges);
     dav1d_sgr_calc_ab2_neon(a, b, w, h, strength, BITDEPTH_MAX);
     BF(dav1d_sgr_finish_filter2, neon)(tmp, src, stride, a, b, w, h);
 }
 
 void BF(dav1d_sgr_weighted1, neon)(pixel *dst, const ptrdiff_t dst_stride,
                                    const pixel *src, const ptrdiff_t src_stride,
                                    const int16_t *t1, const int w, const int h,
                                    const int wt HIGHBD_DECL_SUFFIX);
 void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
                                    const pixel *src, const ptrdiff_t src_stride,
                                    const int16_t *t1, const int16_t *t2,
                                    const int w, const int h,
                                    const int16_t wt[2] HIGHBD_DECL_SUFFIX);
 
-static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t stride,
-                                const pixel (*const left)[4], const pixel *lpf,
+static void sgr_filter_5x5_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                                const pixel (*const left)[4],
+                                const pixel *lpf, const ptrdiff_t lpf_stride,
                                 const int w, const int h,
                                 const LooprestorationParams *const params,
                                 const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     ALIGN_STK_16(int16_t, tmp, 64 * 384,);
-    dav1d_sgr_filter2_neon(tmp, dst, stride, left, lpf,
+    dav1d_sgr_filter2_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
                            w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
-    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+    BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
                                   tmp, w, h, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
 }
 
-static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t stride,
-                                const pixel (*const left)[4], const pixel *lpf,
+static void sgr_filter_3x3_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                                const pixel (*const left)[4],
+                                const pixel *lpf, const ptrdiff_t lpf_stride,
                                 const int w, const int h,
                                 const LooprestorationParams *const params,
                                 const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     ALIGN_STK_16(int16_t, tmp, 64 * 384,);
-    dav1d_sgr_filter1_neon(tmp, dst, stride, left, lpf,
+    dav1d_sgr_filter1_neon(tmp, dst, dst_stride, left, lpf, lpf_stride,
                            w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
-    BF(dav1d_sgr_weighted1, neon)(dst, stride, dst, stride,
+    BF(dav1d_sgr_weighted1, neon)(dst, dst_stride, dst, dst_stride,
                                   tmp, w, h, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
 }
 
-static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t stride,
-                                const pixel (*const left)[4], const pixel *lpf,
+static void sgr_filter_mix_neon(pixel *const dst, const ptrdiff_t dst_stride,
+                                const pixel (*const left)[4],
+                                const pixel *lpf, const ptrdiff_t lpf_stride,
                                 const int w, const int h,
                                 const LooprestorationParams *const params,
                                 const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     ALIGN_STK_16(int16_t, tmp1, 64 * 384,);
     ALIGN_STK_16(int16_t, tmp2, 64 * 384,);
-    dav1d_sgr_filter2_neon(tmp1, dst, stride, left, lpf,
+    dav1d_sgr_filter2_neon(tmp1, dst, dst_stride, left, lpf, lpf_stride,
                            w, h, params->sgr.s0, edges HIGHBD_TAIL_SUFFIX);
-    dav1d_sgr_filter1_neon(tmp2, dst, stride, left, lpf,
+    dav1d_sgr_filter1_neon(tmp2, dst, dst_stride, left, lpf, lpf_stride,
                            w, h, params->sgr.s1, edges HIGHBD_TAIL_SUFFIX);
     const int16_t wt[2] = { params->sgr.w0, params->sgr.w1 };
-    BF(dav1d_sgr_weighted2, neon)(dst, stride, dst, stride,
+    BF(dav1d_sgr_weighted2, neon)(dst, dst_stride, dst, dst_stride,
                                   tmp1, tmp2, w, h, wt HIGHBD_TAIL_SUFFIX);
 }
 
 COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c, int bpc) {
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
--- a/third_party/dav1d/src/cdef.h
+++ b/third_party/dav1d/src/cdef.h
@@ -47,18 +47,17 @@ typedef const void *const_left_pixel_row
 #endif
 
 // CDEF operates entirely on pre-filter data; if bottom/right edges are
 // present (according to $edges), then the pre-filter data is located in
 // $dst. However, the edge pixels above $dst may be post-filter, so in
 // order to get access to pre-filter top pixels, use $top.
 #define decl_cdef_fn(name) \
 void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
-            const pixel *top, const pixel *bottom, \
-            int pri_strength, int sec_strength, \
+            const pixel *top, int pri_strength, int sec_strength, \
             int dir, int damping, enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
 typedef decl_cdef_fn(*cdef_fn);
 
 #define decl_cdef_dir_fn(name) \
 int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var HIGHBD_DECL_SUFFIX)
 typedef decl_cdef_dir_fn(*cdef_dir_fn);
 
 typedef struct Dav1dCdefDSPContext {
--- a/third_party/dav1d/src/cdef_apply.h
+++ b/third_party/dav1d/src/cdef_apply.h
@@ -27,13 +27,12 @@
 
 #ifndef DAV1D_SRC_CDEF_APPLY_H
 #define DAV1D_SRC_CDEF_APPLY_H
 
 #include "common/bitdepth.h"
 
 #include "src/internal.h"
 
-void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *tc, pixel *const p[3],
-                             const Av1Filter *lflvl, int by_start, int by_end,
-                             int sbrow_start, int sby);
+void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *f, pixel *const p[3],
+                             const Av1Filter *lflvl, int by_start, int by_end);
 
 #endif /* DAV1D_SRC_CDEF_APPLY_H */
--- a/third_party/dav1d/src/cdef_apply_tmpl.c
+++ b/third_party/dav1d/src/cdef_apply_tmpl.c
@@ -28,16 +28,17 @@
 #include "config.h"
 
 #include <string.h>
 
 #include "common/intops.h"
 
 #include "src/cdef_apply.h"
 
+
 enum Backup2x8Flags {
     BACKUP_2X8_Y = 1 << 0,
     BACKUP_2X8_UV = 1 << 1,
 };
 
 static void backup2lines(pixel *const dst[3], /*const*/ pixel *const src[3],
                          const ptrdiff_t stride[2],
                          const enum Dav1dPixelLayout layout)
@@ -89,59 +90,43 @@ static void backup2x8(pixel dst[3][8][2]
 }
 
 static int adjust_strength(const int strength, const unsigned var) {
     if (!var) return 0;
     const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
     return (strength * (4 + i) + 8) >> 4;
 }
 
-void bytefn(dav1d_cdef_brow)(Dav1dTaskContext *const tc,
+void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
                              pixel *const p[3],
                              const Av1Filter *const lflvl,
-                             const int by_start, const int by_end,
-                             const int sbrow_start, const int sby)
+                             const int by_start, const int by_end)
 {
-    Dav1dFrameContext *const f = (Dav1dFrameContext *)tc->f;
     const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
     const Dav1dDSPContext *const dsp = f->dsp;
     enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
     pixel *ptrs[3] = { p[0], p[1], p[2] };
     const int sbsz = 16;
     const int sb64w = f->sb128w << 1;
     const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
     const enum Dav1dPixelLayout layout = f->cur.p.layout;
     const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
     const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
     static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
                                            { 7, 0, 2, 4, 5, 6, 6, 6 } };
     const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422];
-    const int have_tt = f->c->n_tc > 1;
-    const int sb128 = f->seq_hdr->sb128;
-    const int resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
-    const ptrdiff_t y_stride = PXSTRIDE(f->cur.stride[0]);
-    const ptrdiff_t uv_stride = PXSTRIDE(f->cur.stride[1]);
 
     for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
-        const int tf = tc->top_pre_cdef_toggle;
+        const int tf = f->lf.top_pre_cdef_toggle;
         const int by_idx = (by & 30) >> 1;
         if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
 
-        if ((!have_tt || sbrow_start || by + 2 < by_end) &&
-            edges & CDEF_HAVE_BOTTOM)
-        {
-            // backup pre-filter data for next iteration
-            pixel *const cdef_top_bak[3] = {
-                f->lf.cdef_line[!tf][0] + have_tt * sby * 4 * y_stride,
-                f->lf.cdef_line[!tf][1] + have_tt * sby * 8 * uv_stride,
-                f->lf.cdef_line[!tf][2] + have_tt * sby * 8 * uv_stride
-            };
-            backup2lines(cdef_top_bak, ptrs, f->cur.stride, layout);
-        }
+        if (edges & CDEF_HAVE_BOTTOM) // backup pre-filter data for next iteration
+            backup2lines(f->lf.cdef_line[!tf], ptrs, f->cur.stride, layout);
 
         ALIGN_STK_16(pixel, lr_bak, 2 /* idx */, [3 /* plane */][8 /* y */][2 /* x */]);
         pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
         edges &= ~CDEF_HAVE_LEFT;
         edges |= CDEF_HAVE_RIGHT;
         enum Backup2x8Flags prev_flag = 0;
         for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
             const int sb128x = sbx >> 1;
@@ -200,97 +185,39 @@ void bytefn(dav1d_cdef_brow)(Dav1dTaskCo
                 }
 
                 int dir;
                 unsigned variance;
                 if (y_pri_lvl || uv_pri_lvl)
                     dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
                                         &variance HIGHBD_CALL_SUFFIX);
 
-                const pixel *top, *bot;
-                ptrdiff_t offset;
-
-                if (!have_tt) goto st_y;
-                if (sbrow_start && by == by_start) {
-                    if (resize) {
-                        offset = (sby - 1) * 4 * y_stride + bx * 4;
-                        top = &f->lf.cdef_lpf_line[0][offset];
-                    } else {
-                        offset = (sby * (4 << sb128) - 4) * y_stride + bx * 4;
-                        top = &f->lf.lr_lpf_line[0][offset];
-                    }
-                    bot = bptrs[0] + 8 * y_stride;
-                } else if (!sbrow_start && by + 2 >= by_end) {
-                    top = &f->lf.cdef_line[tf][0][sby * 4 * y_stride + bx * 4];
-                    if (resize) {
-                        offset = (sby * 4 + 2) * y_stride + bx * 4;
-                        bot = &f->lf.cdef_lpf_line[0][offset];
-                    } else {
-                        const int line = sby * (4 << sb128) + 4 * sb128 + 2;
-                        offset = line * y_stride + bx * 4;
-                        bot = &f->lf.lr_lpf_line[0][offset];
-                    }
-                } else {
-            st_y:;
-                    offset = sby * 4 * y_stride;
-                    top = &f->lf.cdef_line[tf][0][have_tt * offset + bx * 4];
-                    bot = bptrs[0] + 8 * y_stride;
-                }
                 if (y_pri_lvl) {
                     const int adj_y_pri_lvl = adjust_strength(y_pri_lvl, variance);
                     if (adj_y_pri_lvl || y_sec_lvl)
                         dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
-                                        top, bot, adj_y_pri_lvl, y_sec_lvl,
-                                        dir, damping, edges HIGHBD_CALL_SUFFIX);
+                                        &f->lf.cdef_line[tf][0][bx * 4],
+                                        adj_y_pri_lvl, y_sec_lvl, dir,
+                                        damping, edges HIGHBD_CALL_SUFFIX);
                 } else if (y_sec_lvl)
                     dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
-                                    top, bot, 0, y_sec_lvl, 0, damping,
-                                    edges HIGHBD_CALL_SUFFIX);
-
-                if (!uv_lvl) goto skip_uv;
-                assert(layout != DAV1D_PIXEL_LAYOUT_I400);
-
-                const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
-                for (int pl = 1; pl <= 2; pl++) {
-                    if (!have_tt) goto st_uv;
-                    if (sbrow_start && by == by_start) {
-                        if (resize) {
-                            offset = (sby - 1) * 4 * uv_stride + (bx * 4 >> ss_hor);
-                            top = &f->lf.cdef_lpf_line[pl][offset];
-                        } else {
-                            const int line = sby * (4 << sb128) - 4;
-                            offset = line * uv_stride + (bx * 4 >> ss_hor);
-                            top = &f->lf.lr_lpf_line[pl][offset];
-                        }
-                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
-                    } else if (!sbrow_start && by + 2 >= by_end) {
-                        const ptrdiff_t top_offset = sby * 8 * uv_stride +
-                                                     (bx * 4 >> ss_hor);
-                        top = &f->lf.cdef_line[tf][pl][top_offset];
-                        if (resize) {
-                            offset = (sby * 4 + 2) * uv_stride + (bx * 4 >> ss_hor);
-                            bot = &f->lf.cdef_lpf_line[pl][offset];
-                        } else {
-                            const int line = sby * (4 << sb128) + 4 * sb128 + 2;
-                            offset = line * uv_stride + (bx * 4 >> ss_hor);
-                            bot = &f->lf.lr_lpf_line[pl][offset];
-                        }
-                    } else {
-                st_uv:;
-                        const ptrdiff_t offset = sby * 8 * uv_stride;
-                        top = &f->lf.cdef_line[tf][pl][have_tt * offset + (bx * 4 >> ss_hor)];
-                        bot = bptrs[pl] + (8 >> ss_ver) * uv_stride;
+                                    &f->lf.cdef_line[tf][0][bx * 4],
+                                    0, y_sec_lvl, 0,
+                                    damping, edges HIGHBD_CALL_SUFFIX);
+                if (uv_lvl) {
+                    assert(layout != DAV1D_PIXEL_LAYOUT_I400);
+                    const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
+                    for (int pl = 1; pl <= 2; pl++) {
+                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
+                                             &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
+                                             uv_pri_lvl, uv_sec_lvl, uvdir,
+                                             damping - 1, edges HIGHBD_CALL_SUFFIX);
                     }
-                    dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
-                                         lr_bak[bit][pl], top, bot,
-                                         uv_pri_lvl, uv_sec_lvl, uvdir,
-                                         damping - 1, edges HIGHBD_CALL_SUFFIX);
                 }
 
-            skip_uv:
                 bit ^= 1;
                 last_skip = 0;
 
             next_b:
                 bptrs[0] += 8;
                 bptrs[1] += 8 >> ss_hor;
                 bptrs[2] += 8 >> ss_hor;
             }
@@ -299,11 +226,11 @@ void bytefn(dav1d_cdef_brow)(Dav1dTaskCo
             iptrs[0] += sbsz * 4;
             iptrs[1] += sbsz * 4 >> ss_hor;
             iptrs[2] += sbsz * 4 >> ss_hor;
         }
 
         ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
         ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
         ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
-        tc->top_pre_cdef_toggle ^= 1;
+        f->lf.top_pre_cdef_toggle ^= 1;
     }
 }
--- a/third_party/dav1d/src/cdef_tmpl.c
+++ b/third_party/dav1d/src/cdef_tmpl.c
@@ -50,19 +50,19 @@ static inline void fill(int16_t *tmp, co
         for (int x = 0; x < w; x++)
             tmp[x] = INT16_MIN;
         tmp += stride;
     }
 }
 
 static void padding(int16_t *tmp, const ptrdiff_t tmp_stride,
                     const pixel *src, const ptrdiff_t src_stride,
-                    const pixel (*left)[2],
-                    const pixel *top, const pixel *bottom,
-                    const int w, const int h, const enum CdefEdgeFlags edges)
+                    const pixel (*left)[2], const pixel *top,
+                    const int w, const int h,
+                    const enum CdefEdgeFlags edges)
 {
     // fill extended input buffer
     int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
     if (!(edges & CDEF_HAVE_TOP)) {
         fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2);
         y_start = 0;
     }
     if (!(edges & CDEF_HAVE_BOTTOM)) {
@@ -81,45 +81,37 @@ static void padding(int16_t *tmp, const 
     for (int y = y_start; y < 0; y++) {
         for (int x = x_start; x < x_end; x++)
             tmp[x + y * tmp_stride] = top[x];
         top += PXSTRIDE(src_stride);
     }
     for (int y = 0; y < h; y++)
         for (int x = x_start; x < 0; x++)
             tmp[x + y * tmp_stride] = left[y][2 + x];
-    for (int y = 0; y < h; y++) {
+    for (int y = 0; y < y_end; y++) {
         for (int x = (y < h) ? 0 : x_start; x < x_end; x++)
             tmp[x] = src[x];
         src += PXSTRIDE(src_stride);
         tmp += tmp_stride;
     }
-    for (int y = h; y < y_end; y++) {
-        for (int x = x_start; x < x_end; x++)
-            tmp[x] = bottom[x];
-        bottom += PXSTRIDE(src_stride);
-        tmp += tmp_stride;
-    }
-
 }
 
 static NOINLINE void
 cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const pixel (*left)[2],
-                    const pixel *const top, const pixel *const bottom,
+                    const pixel (*left)[2], const pixel *const top,
                     const int pri_strength, const int sec_strength,
                     const int dir, const int damping, const int w, int h,
                     const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
 {
     const ptrdiff_t tmp_stride = 12;
     assert((w == 4 || w == 8) && (h == 4 || h == 8));
     int16_t tmp_buf[144]; // 12*12 is the maximum value of tmp_stride * (h + 4)
     int16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
 
-    padding(tmp, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+    padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
 
     if (pri_strength) {
         const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
         const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
         const int pri_shift = imax(0, damping - ulog2(pri_strength));
         if (sec_strength) {
             const int sec_shift = damping - ulog2(sec_strength);
             do {
@@ -214,26 +206,25 @@ cdef_filter_block_c(pixel *dst, const pt
     }
 }
 
 #define cdef_fn(w, h) \
 static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
                                             const ptrdiff_t stride, \
                                             const pixel (*left)[2], \
                                             const pixel *const top, \
-                                            const pixel *const bottom, \
                                             const int pri_strength, \
                                             const int sec_strength, \
                                             const int dir, \
                                             const int damping, \
                                             const enum CdefEdgeFlags edges \
                                             HIGHBD_DECL_SUFFIX) \
 { \
-    cdef_filter_block_c(dst, stride, left, top, bottom, \
-                        pri_strength, sec_strength, dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
+    cdef_filter_block_c(dst, stride, left, top, pri_strength, sec_strength, \
+                        dir, damping, w, h, edges HIGHBD_TAIL_SUFFIX); \
 }
 
 cdef_fn(4, 4);
 cdef_fn(4, 8);
 cdef_fn(8, 8);
 
 static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
                            unsigned *const var HIGHBD_DECL_SUFFIX)
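
For orientation on the padding() change above: the restored C fallback builds a (w+4) x (h+4) int16_t working area around each 4x4/8x8 block, marks samples that are unavailable at a frame or tile edge with INT16_MIN so the filter taps can skip them, and, having no separate bottom pointer, reads the two rows below the block directly from src. A rough layout for the 8x8 case (tmp_stride = 12, 144-entry buffer), purely illustrative:

/* Working-buffer layout for an 8x8 block, as filled by the restored padding():
 *
 *        x: -2 -1 | 0 .. 7 | 8  9
 *   y = -2  T  T  | T .. T | T  T    T = rows from the saved top line buffer
 *   y = -1  T  T  | T .. T | T  T        (INT16_MIN if CDEF_HAVE_TOP unset)
 *   ------------------------------
 *   y =  0  L  L  | S .. S | S  S    L = left[y][0..1], S = read from src
 *   ...     L  L  | S .. S | S  S        (right columns INT16_MIN if
 *   y =  7  L  L  | S .. S | S  S         CDEF_HAVE_RIGHT unset)
 *   ------------------------------
 *   y =  8  S  S  | S .. S | S  S    bottom rows copied straight from src
 *   y =  9  S  S  | S .. S | S  S        (INT16_MIN if CDEF_HAVE_BOTTOM unset)
 */
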
--- a/third_party/dav1d/src/cpu.c
+++ b/third_party/dav1d/src/cpu.c
@@ -28,36 +28,32 @@
 
 #include <stdint.h>
 
 #include "src/cpu.h"
 #include "src/log.h"
 
 #ifdef _WIN32
 #include <windows.h>
+#elif defined(__linux__)
+#include <sched.h>
+#include <unistd.h>
 #elif defined(__APPLE__)
 #include <sys/sysctl.h>
 #include <sys/types.h>
-#else
-#include <pthread.h>
-#include <unistd.h>
-#endif
-
-#ifdef HAVE_PTHREAD_NP_H
-#include <pthread_np.h>
-#endif
-#if defined(__FreeBSD__)
-#define cpu_set_t cpuset_t
 #endif
 
 static unsigned flags = 0;
 
 #if __has_feature(memory_sanitizer)
 // memory sanitizer is inherently incompatible with asm
 static unsigned flags_mask = 0;
+#elif ARCH_X86
+/* Disable AVX-512 by default for the time being */
+static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL;
 #else
 static unsigned flags_mask = -1;
 #endif
 
 COLD void dav1d_init_cpu(void) {
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
     flags = dav1d_get_cpu_flags_arm();
@@ -87,23 +83,25 @@ COLD int dav1d_num_logical_processors(Da
             num_processors++;
         return num_processors;
     }
 #else
     SYSTEM_INFO system_info;
     GetNativeSystemInfo(&system_info);
     return system_info.dwNumberOfProcessors;
 #endif
-#elif defined(HAVE_PTHREAD_GETAFFINITY_NP) && defined(CPU_COUNT)
+#elif defined(__linux__)
+#ifdef CPU_COUNT
     cpu_set_t affinity;
-    if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity))
+    if (!sched_getaffinity(0, sizeof(affinity), &affinity))
         return CPU_COUNT(&affinity);
+#else
+    return (int)sysconf(_SC_NPROCESSORS_ONLN);
+#endif
 #elif defined(__APPLE__)
     int num_processors;
     size_t length = sizeof(num_processors);
     if (!sysctlbyname("hw.logicalcpu", &num_processors, &length, NULL, 0))
         return num_processors;
-#elif defined(_SC_NPROCESSORS_ONLN)
-    return (int)sysconf(_SC_NPROCESSORS_ONLN);
 #endif
     dav1d_log(c, "Unable to detect thread count, defaulting to single-threaded mode\n");
     return 1;
 }
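
For context on the cpu.c hunk above: the restored thread-count logic prefers sched_getaffinity() + CPU_COUNT on Linux (so the count respects any affinity mask) and falls back to sysconf(_SC_NPROCESSORS_ONLN) when CPU_COUNT is unavailable, instead of routing through pthread_getaffinity_np(). A standalone sketch of that Linux path, assuming glibc; count_logical_cpus() is a hypothetical helper, not a dav1d API:

#define _GNU_SOURCE            /* glibc: expose sched_getaffinity/CPU_COUNT */
#include <sched.h>
#include <unistd.h>

int count_logical_cpus(void) {
#ifdef CPU_COUNT
    cpu_set_t affinity;
    /* pid 0 = calling thread; honours any affinity mask set on the process */
    if (!sched_getaffinity(0, sizeof(affinity), &affinity))
        return CPU_COUNT(&affinity);
#endif
    /* Fallback: CPUs currently online, ignoring affinity restrictions */
    return (int)sysconf(_SC_NPROCESSORS_ONLN);
}
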
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -2299,56 +2299,55 @@ static int checked_decode_b(Dav1dTaskCon
 #define decode_b checked_decode_b
 
 #endif /* defined(__has_feature) */
 
 static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl,
                      const EdgeNode *const node)
 {
     const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
     const int hsz = 16 >> bl;
     const int have_h_split = f->bw > t->bx + hsz;
     const int have_v_split = f->bh > t->by + hsz;
 
     if (!have_h_split && !have_v_split) {
         assert(bl < BL_8X8);
         return decode_sb(t, bl + 1, ((const EdgeBranch *) node)->split[0]);
     }
 
     uint16_t *pc;
     enum BlockPartition bp;
     int ctx, bx8, by8;
     if (t->frame_thread.pass != 2) {
         if (0 && bl == BL_64X64)
             printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
-                   f->frame_hdr->frame_offset, t->by, t->bx, bl, ts->msac.rng);
+                   f->frame_hdr->frame_offset, t->by, t->bx, bl, t->ts->msac.rng);
         bx8 = (t->bx & 31) >> 1;
         by8 = (t->by & 31) >> 1;
         ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8);
-        pc = ts->cdf.m.partition[bl][ctx];
+        pc = t->ts->cdf.m.partition[bl][ctx];
     }
 
     if (have_h_split && have_v_split) {
         if (t->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
         } else {
-            bp = dav1d_msac_decode_symbol_adapt16(&ts->msac, pc,
+            bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc,
                                                   dav1d_partition_type_count[bl]);
             if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
                 (bp == PARTITION_V || bp == PARTITION_V4 ||
                  bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
             {
                 return 1;
             }
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                        f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp,
-                       ts->msac.rng);
+                       t->ts->msac.rng);
         }