Bug 1533559 - Update libdav1d to 0.2.1 on beta. r=TD-Linux a=pascalc
author      Thomas Daede <tdaede@mozilla.com>
date        Tue, 02 Apr 2019 15:36:37 +0300
changeset   525872 8184308efa35edc61ed6b187a5da0d683c85df77
parent      525871 7a9123da87cc164d6962c7bebcae26630004310d
child       525873 704467f7c23219aa77151371afb2e9f5460310a2
push id     2032
push user   ffxbld-merge
push date   Mon, 13 May 2019 09:36:57 +0000
treeherder  mozilla-release@455c1065dcbe
reviewers   TD-Linux, pascalc
bugs        1533559
milestone   67.0
Bug 1533559 - Update libdav1d to 0.2.1 on beta. r=TD-Linux a=pascalc

Reviewers: TD-Linux
Reviewed By: TD-Linux
Subscribers: jya
Bug #: 1533559
Differential Revision: https://phabricator.services.mozilla.com/D24486
media/libdav1d/README_MOZILLA
media/libdav1d/asm/moz.build
media/libdav1d/dav1d.rc
media/libdav1d/moz.build
media/libdav1d/moz.yaml
media/libdav1d/vcs_version.h
media/libdav1d/version.h
python/mozbuild/mozbuild/vendor_dav1d.py
third_party/dav1d/.gitlab-ci.yml
third_party/dav1d/COPYING
third_party/dav1d/NEWS
third_party/dav1d/README.md
third_party/dav1d/THANKS.md
third_party/dav1d/include/common/attributes.h
third_party/dav1d/include/common/bitdepth.h
third_party/dav1d/include/common/dump.h
third_party/dav1d/include/common/intops.h
third_party/dav1d/include/common/mem.h
third_party/dav1d/include/common/validate.h
third_party/dav1d/include/dav1d/common.h
third_party/dav1d/include/dav1d/data.h
third_party/dav1d/include/dav1d/dav1d.h
third_party/dav1d/include/dav1d/headers.h
third_party/dav1d/include/dav1d/meson.build
third_party/dav1d/include/dav1d/picture.h
third_party/dav1d/include/dav1d/version.h.in
third_party/dav1d/include/meson.build
third_party/dav1d/include/vcs_version.h.in
third_party/dav1d/include/version.h.in
third_party/dav1d/meson.build
third_party/dav1d/meson_options.txt
third_party/dav1d/snap/snapcraft.yaml
third_party/dav1d/src/arm/32/looprestoration.S
third_party/dav1d/src/arm/32/mc.S
third_party/dav1d/src/arm/32/util.S
third_party/dav1d/src/arm/64/cdef.S
third_party/dav1d/src/arm/64/looprestoration.S
third_party/dav1d/src/arm/64/mc.S
third_party/dav1d/src/arm/64/util.S
third_party/dav1d/src/arm/asm.S
third_party/dav1d/src/arm/cdef_init_tmpl.c
third_party/dav1d/src/arm/cpu.c
third_party/dav1d/src/arm/cpu.h
third_party/dav1d/src/arm/looprestoration_init_tmpl.c
third_party/dav1d/src/arm/mc_init_tmpl.c
third_party/dav1d/src/cdef.h
third_party/dav1d/src/cdef_apply.h
third_party/dav1d/src/cdef_apply_tmpl.c
third_party/dav1d/src/cdef_tmpl.c
third_party/dav1d/src/cdf.h
third_party/dav1d/src/cpu.h
third_party/dav1d/src/ctx.h
third_party/dav1d/src/data.c
third_party/dav1d/src/data.h
third_party/dav1d/src/dav1d.rc.in
third_party/dav1d/src/decode.c
third_party/dav1d/src/decode.h
third_party/dav1d/src/dequant_tables.h
third_party/dav1d/src/env.h
third_party/dav1d/src/ext/x86/x86inc.asm
third_party/dav1d/src/film_grain.h
third_party/dav1d/src/getbits.c
third_party/dav1d/src/getbits.h
third_party/dav1d/src/internal.h
third_party/dav1d/src/intra_edge.h
third_party/dav1d/src/ipred.h
third_party/dav1d/src/ipred_prepare.h
third_party/dav1d/src/ipred_prepare_tmpl.c
third_party/dav1d/src/ipred_tmpl.c
third_party/dav1d/src/itx.h
third_party/dav1d/src/itx_tmpl.c
third_party/dav1d/src/levels.h
third_party/dav1d/src/lf_apply.h
third_party/dav1d/src/lf_mask.c
third_party/dav1d/src/lf_mask.h
third_party/dav1d/src/lib.c
third_party/dav1d/src/log.c
third_party/dav1d/src/log.h
third_party/dav1d/src/loopfilter.h
third_party/dav1d/src/looprestoration.h
third_party/dav1d/src/looprestoration_tmpl.c
third_party/dav1d/src/lr_apply.h
third_party/dav1d/src/lr_apply_tmpl.c
third_party/dav1d/src/mc.h
third_party/dav1d/src/mc_tmpl.c
third_party/dav1d/src/meson.build
third_party/dav1d/src/msac.c
third_party/dav1d/src/msac.h
third_party/dav1d/src/obu.c
third_party/dav1d/src/obu.h
third_party/dav1d/src/picture.c
third_party/dav1d/src/picture.h
third_party/dav1d/src/qm.h
third_party/dav1d/src/recon.h
third_party/dav1d/src/recon_tmpl.c
third_party/dav1d/src/ref.h
third_party/dav1d/src/ref_mvs.c
third_party/dav1d/src/ref_mvs.h
third_party/dav1d/src/scan.h
third_party/dav1d/src/tables.c
third_party/dav1d/src/tables.h
third_party/dav1d/src/thread.h
third_party/dav1d/src/thread_data.h
third_party/dav1d/src/thread_task.c
third_party/dav1d/src/thread_task.h
third_party/dav1d/src/warpmv.c
third_party/dav1d/src/warpmv.h
third_party/dav1d/src/wedge.h
third_party/dav1d/src/win32/thread.c
third_party/dav1d/src/x86/cdef.asm
third_party/dav1d/src/x86/cdef_init_tmpl.c
third_party/dav1d/src/x86/cdef_ssse3.asm
third_party/dav1d/src/x86/cpu.c
third_party/dav1d/src/x86/cpu.h
third_party/dav1d/src/x86/ipred.asm
third_party/dav1d/src/x86/ipred_init_tmpl.c
third_party/dav1d/src/x86/ipred_ssse3.asm
third_party/dav1d/src/x86/itx_init_tmpl.c
third_party/dav1d/src/x86/itx_ssse3.asm
third_party/dav1d/src/x86/looprestoration.asm
third_party/dav1d/src/x86/looprestoration_init_tmpl.c
third_party/dav1d/src/x86/looprestoration_ssse3.asm
third_party/dav1d/src/x86/mc.asm
third_party/dav1d/src/x86/mc_init_tmpl.c
third_party/dav1d/src/x86/mc_ssse3.asm
third_party/dav1d/tests/checkasm/cdef.c
third_party/dav1d/tests/checkasm/checkasm.c
third_party/dav1d/tests/checkasm/checkasm.h
third_party/dav1d/tests/checkasm/itx.c
third_party/dav1d/tests/checkasm/mc.c
third_party/dav1d/tests/libfuzzer/alloc_fail.c
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h
third_party/dav1d/tests/libfuzzer/main.c
third_party/dav1d/tests/meson.build
third_party/dav1d/tools/dav1d.c
third_party/dav1d/tools/dav1d_cli_parse.c
third_party/dav1d/tools/dav1d_cli_parse.h
third_party/dav1d/tools/input/annexb.c
third_party/dav1d/tools/input/demuxer.h
third_party/dav1d/tools/input/input.c
third_party/dav1d/tools/input/input.h
third_party/dav1d/tools/input/ivf.c
third_party/dav1d/tools/output/md5.c
third_party/dav1d/tools/output/muxer.h
third_party/dav1d/tools/output/output.c
third_party/dav1d/tools/output/output.h
--- a/media/libdav1d/README_MOZILLA
+++ b/media/libdav1d/README_MOZILLA
@@ -13,12 +13,8 @@ To update to a specific upstream git tag
 
   ./mach vendor dav1d -r <commit>
 
 The upstream git repository is https://aomedia.googlesource.com/aom
 
 To update to a fork, use
 
   ./mach vendor dav1d --repo <repository url> [-r <commit>]
-
-The last update was pulled from https://code.videolan.org/videolan/dav1d
-
-The git commit ID used was 197a19ad702d5e7472852efcde98feeb07f373e0 (2018-11-26T12:15:41.000Z).
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@@ -69,19 +69,21 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64
             '../../../third_party/dav1d/src/x86/ipred.asm',
             '../../../third_party/dav1d/src/x86/itx.asm',
             '../../../third_party/dav1d/src/x86/loopfilter.asm',
             '../../../third_party/dav1d/src/x86/looprestoration.asm',
             '../../../third_party/dav1d/src/x86/mc.asm',
         ]
 
     SOURCES += [
+        '../../../third_party/dav1d/src/x86/cdef_ssse3.asm',
         '../../../third_party/dav1d/src/x86/cpuid.asm',
         '../../../third_party/dav1d/src/x86/ipred_ssse3.asm',
         '../../../third_party/dav1d/src/x86/itx_ssse3.asm',
+        '../../../third_party/dav1d/src/x86/looprestoration_ssse3.asm',
         '../../../third_party/dav1d/src/x86/mc_ssse3.asm',
     ]
 
     # BITDEPTH
     relative_path = '../../../third_party/dav1d/src/x86/'
     bitdepth_basenames = [
         'cdef_init_tmpl.c',
         'ipred_init_tmpl.c',
--- a/media/libdav1d/dav1d.rc
+++ b/media/libdav1d/dav1d.rc
@@ -1,30 +1,32 @@
-#define VERSION_NUMBER 0,0,1,0
-#define VERSION_NUMBER_STR "0.0.1.0"
+#define API_VERSION_NUMBER 1,0,1,0
+#define API_VERSION_NUMBER_STR "1.0.1"
+#define PROJECT_VERSION_NUMBER 0,2,2,0
+#define PROJECT_VERSION_NUMBER_STR "0.2.2"
 
 #include <windows.h>
 
 1 VERSIONINFO
 FILETYPE VFT_DLL
 FILEOS VOS_NT_WINDOWS32
-PRODUCTVERSION VERSION_NUMBER
-FILEVERSION VERSION_NUMBER
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
 BEGIN
   BLOCK "StringFileInfo"
   BEGIN
     BLOCK "040904E4"
     BEGIN
       VALUE "CompanyName", "VideoLAN"
       VALUE "ProductName", "dav1d"
-      VALUE "ProductVersion", VERSION_NUMBER_STR
-      VALUE "FileVersion", VERSION_NUMBER_STR
-      VALUE "FileDescription", "dav1d AV1 decoder"
+      VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+      VALUE "FileVersion", API_VERSION_NUMBER_STR
+      VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
       VALUE "InternalName", "dav1d"
       VALUE "OriginalFilename", "libdav1d.dll"
-      VALUE "LegalCopyright", "Copyright \251 2018 VideoLAN and dav1d Authors"
+      VALUE "LegalCopyright", "Copyright \251 2019 VideoLAN and dav1d Authors"
     END
   END
   BLOCK "VarFileInfo"
   BEGIN
     VALUE "Translation", 0x409, 1252
   END
 END
--- a/media/libdav1d/moz.build
+++ b/media/libdav1d/moz.build
@@ -56,16 +56,17 @@ SOURCES += [
     '../../third_party/dav1d/src/cdf.c',
     '../../third_party/dav1d/src/cpu.c',
     '../../third_party/dav1d/src/data.c',
     '../../third_party/dav1d/src/decode.c',
     '../../third_party/dav1d/src/dequant_tables.c',
     '../../third_party/dav1d/src/getbits.c',
     '../../third_party/dav1d/src/intra_edge.c',
     '../../third_party/dav1d/src/lf_mask.c',
+    '../../third_party/dav1d/src/log.c',
     '../../third_party/dav1d/src/msac.c',
     '../../third_party/dav1d/src/obu.c',
     '../../third_party/dav1d/src/picture.c',
     '../../third_party/dav1d/src/qm.c',
     '../../third_party/dav1d/src/ref.c',
     '../../third_party/dav1d/src/ref_mvs.c',
     '../../third_party/dav1d/src/scan.c',
     '../../third_party/dav1d/src/tables.c',
@@ -80,16 +81,17 @@ EXPORTS.dav1d.src += [
     '../../third_party/dav1d/src/ctx.h',
     '../../third_party/dav1d/src/data.h',
     '../../third_party/dav1d/src/decode.h',
     '../../third_party/dav1d/src/dequant_tables.h',
     '../../third_party/dav1d/src/film_grain.h',
     '../../third_party/dav1d/src/getbits.h',
     '../../third_party/dav1d/src/intra_edge.h',
     '../../third_party/dav1d/src/lf_mask.h',
+    '../../third_party/dav1d/src/log.h',
     '../../third_party/dav1d/src/msac.h',
     '../../third_party/dav1d/src/obu.h',
     '../../third_party/dav1d/src/picture.h',
     '../../third_party/dav1d/src/qm.h',
     '../../third_party/dav1d/src/ref.h',
     '../../third_party/dav1d/src/ref_mvs.h',
     '../../third_party/dav1d/src/scan.h',
     '../../third_party/dav1d/src/tables.h',
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -15,15 +15,15 @@ origin:
   description: dav1d, a fast AV1 decoder
 
   # Full URL for the package's homepage/etc
   # Usually different from repository url
   url: https://code.videolan.org/videolan/dav1d
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit f813285c1d1a5421e0180efbb7cbdd377cd31c69 (2019-01-13T22:08:25.000Z).
+  release: commit 7350c59e7894cb7e487a0add9942d2b1b39f7161 (2019-03-16T23:17:05.000Z).
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
   # Multiple licenses can be specified (as a YAML list)
   # A "LICENSE" file must exist containing the full license text
   license: BSD-2-Clause
copy from media/libdav1d/version.h
copy to media/libdav1d/vcs_version.h
--- a/media/libdav1d/version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "0.1.1"
+#define DAV1D_VERSION "0.2.2"
--- a/media/libdav1d/version.h
+++ b/media/libdav1d/version.h
@@ -1,2 +1,34 @@
-/* auto-generated, do not edit */
-#define DAV1D_VERSION "0.1.1"
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_VERSION_H
+#define DAV1D_VERSION_H
+
+#define DAV1D_API_VERSION_MAJOR 1
+#define DAV1D_API_VERSION_MINOR 0
+#define DAV1D_API_VERSION_PATCH 1
+
+#endif /* DAV1D_VERSION_H */
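
Note: the file above pins DAV1D_API_VERSION_MAJOR/MINOR/PATCH to 1.0.1, matching upstream's post-0.2.0 API numbering. A minimal sketch (not part of this patch, assuming the installed `dav1d/version.h` include path) of how a consumer can check these macros at compile time:

```c
#include <stdio.h>
#include <dav1d/version.h>

#if DAV1D_API_VERSION_MAJOR < 1
#error "this code expects the dav1d 1.x API (post-0.2.0 API/ABI break)"
#endif

int main(void) {
    /* Prints 1.0.1 for the headers vendored in this update. */
    printf("built against dav1d API %d.%d.%d\n",
           DAV1D_API_VERSION_MAJOR, DAV1D_API_VERSION_MINOR,
           DAV1D_API_VERSION_PATCH);
    return 0;
}
```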
--- a/python/mozbuild/mozbuild/vendor_dav1d.py
+++ b/python/mozbuild/mozbuild/vendor_dav1d.py
@@ -100,16 +100,26 @@ Please set a repository url with --repo 
                                 yaml, flags=re.MULTILINE)
         else:
             new_yaml = '%s\n\n%s %s.' % (yaml, prefix, revision)
 
         if yaml != new_yaml:
             with open(filename, 'w') as f:
                 f.write(new_yaml)
 
+    def update_vcs_version(self, revision, vendor_dir, glue_dir):
+        src_filename = mozpath.join(vendor_dir, 'include/vcs_version.h.in')
+        dst_filename = mozpath.join(glue_dir, 'vcs_version.h')
+        with open(src_filename) as f:
+            vcs_version_in = f.read()
+        vcs_version = vcs_version_in.replace('@VCS_TAG@', revision)
+        with open(dst_filename, 'w') as f:
+            f.write(vcs_version)
+
+
     def clean_upstream(self, target):
         '''Remove files we don't want to import.'''
         mozfile.remove(mozpath.join(target, '.gitattributes'))
         mozfile.remove(mozpath.join(target, '.gitignore'))
         mozfile.remove(mozpath.join(target, 'build', '.gitattributes'))
         mozfile.remove(mozpath.join(target, 'build' ,'.gitignore'))
 
     def check_modified_files(self):
@@ -149,15 +159,17 @@ Please commit or stash these changes bef
         self.fetch_and_unpack(commit, vendor_dir)
         self.log(logging.INFO, 'clean_upstream', {},
                  '''Removing unnecessary files.''')
         self.clean_upstream(vendor_dir)
         glue_dir = mozpath.join(self.topsrcdir, 'media/libdav1d')
         self.log(logging.INFO, 'update_moz.yaml', {},
                  '''Updating moz.yaml.''')
         self.update_yaml(commit, timestamp, glue_dir)
-        self.repository.add_remove_files(vendor_dir)
+        self.log(logging.INFO, 'update_vcs_version', {},
+                 '''Updating vcs_version.h.''')
+        self.update_vcs_version(commit, vendor_dir, glue_dir)
         self.log(logging.INFO, 'add_remove_files', {},
                  '''Registering changes with version control.''')
         self.repository.add_remove_files(vendor_dir)
         self.repository.add_remove_files(glue_dir)
         self.log(logging.INFO, 'done', {'revision': revision},
                  '''Update to dav1d version '{revision}' ready to commit.''')
--- a/third_party/dav1d/.gitlab-ci.yml
+++ b/third_party/dav1d/.gitlab-ci.yml
@@ -1,36 +1,51 @@
 stages:
     - style
     - build
     - test
 
 style-check:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: style
     tags:
         - debian
         - amd64
     script:
-        - git grep -n -e $'\t' --or -e $'\r' -- . ':(exclude)*/compat/*' && exit 1
-        - /bin/true
+        - git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
+        - git grep -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
+        - git remote rm upstream 2> /dev/null || true
+        - git remote add upstream https://code.videolan.org/videolan/dav1d.git
+        - git fetch -q upstream master
+        - for i in $(git rev-list HEAD ^upstream/master); do
+              echo "Checking commit message of $i";
+              msg="$(git log --format=%B -n 1 $i)";
+              if [ -n "$(echo "$msg" | awk "NR==2")" ]; then
+                  echo "Malformed commit message in $i, second line must be empty";
+                  exit 1;
+              fi;
+              if echo "$msg" | head -1 | grep -q '\.$'; then
+                  echo "Malformed commit message in $i, trailing period in subject line";
+                  exit 1;
+              fi;
+          done
 
 build-debian:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: build
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release --werror
         - ninja -C build
         - cd build && meson test -v
 
 build-debian-static:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: build
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release --default-library static --werror
         - ninja -C build
         - cd build && meson test -v
@@ -44,46 +59,103 @@ build-debian32:
     script:
         - meson build --buildtype release
                       --werror
                       --cross-file /opt/crossfiles/linux32.meson
         - ninja -C build
         - cd build && meson test -v
 
 build-win32:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: build
     tags:
-        - win32
+        - debian
+        - amd64
     script:
+        - wineserver -p && wine wineboot
         - meson build --buildtype release
                       --werror
                       --libdir lib
                       --prefix "$(pwd)/build/dav1d_install"
                       --cross-file /opt/crossfiles/i686-w64-mingw32.meson
                       -Ddefault_library=both
         - ninja -C build
         - ninja -C build install
+        - cd build && meson test -v
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
 
+build-win32-unaligned-stack:
+    image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533
+    stage: build
+    tags:
+        - debian
+        - amd64
+    script:
+        - wineserver -p && wine wineboot
+        - meson build --buildtype release
+                      --werror
+                      --cross-file /opt/crossfiles/i686-w64-mingw32.meson
+                      -Dstack_alignment=4
+        - ninja -C build
+        - cd build && meson test -v
+
 build-win64:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: build
     tags:
-        - win64
+        - debian
+        - amd64
+    script:
+        - wineserver -p && wine wineboot
+        - meson build --buildtype release
+                      --werror
+                      --libdir lib
+                      --prefix "$(pwd)/build/dav1d_install"
+                      --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
+                      -Ddefault_library=both
+        - ninja -C build
+        - ninja -C build install
+        - cd build && meson test -v
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - build/dav1d_install/
+        expire_in: 1 week
+
+build-win-arm32:
+    image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533
+    stage: build
+    tags:
+        - debian
+        - amd64
     script:
         - meson build --buildtype release
                       --werror
                       --libdir lib
                       --prefix "$(pwd)/build/dav1d_install"
-                      --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
+                      --cross-file /opt/crossfiles/armv7-w64-mingw32.meson
+                      -Ddefault_library=both
+        - ninja -C build
+
+build-win-arm64:
+    image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533
+    stage: build
+    tags:
+        - debian
+        - amd64
+    script:
+        - meson build --buildtype release
+                      --werror
+                      --libdir lib
+                      --prefix "$(pwd)/build/dav1d_install"
+                      --cross-file /opt/crossfiles/aarch64-w64-mingw32.meson
                       -Ddefault_library=both
         - ninja -C build
         - ninja -C build install
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
@@ -124,93 +196,160 @@ build-debian-werror:
     stage: build
     tags:
         - aarch64
         - debian
     script:
         - env CC='clang-7' meson build --buildtype debug --werror
         - ninja -C build
 
+build-debian-armv7:
+    stage: build
+    image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732
+    tags:
+        - armv7
+        - debian
+    script:
+        - meson build --buildtype release --werror
+        - ninja -C build
+        - cd build && meson test -v
+
+build-debian-armv7-clang-5:
+    stage: build
+    image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732
+    tags:
+        - armv7
+        - debian
+    script:
+        - env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
+        - ninja -C build
+        - cd build && meson test -v
+
+build-ubuntu-snap:
+    stage: build
+    image: registry.videolan.org:5000/dav1d-ubuntu-bionic:20190221154127
+    tags:
+        - debian
+        - amd64
+    script:
+        - snapcraft snap
+        - |
+           if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then
+            echo $SNAP_LOGIN | base64 --decode | snapcraft login --with -
+            snapcraft push dav1d_*.snap --release edge
+            snapcraft logout
+           fi
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - dav1d_*.snap
+        expire_in: 1 week
+    allow_failure: true
+
 test-debian:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: test
     tags:
         - debian
         - amd64
     cache:
-        key: testdata.git
+        key: testdata.git-20190215
         paths:
             - cache/dav1d-test-data.git/
     script:
         - test -d cache || mkdir cache
         - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
         - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
         - git clone cache/dav1d-test-data.git tests/dav1d-test-data
-        - meson build --buildtype release -Dtestdata_tests=true
+        - meson build --buildtype release -Dtestdata_tests=true -Dlogging=false
         - ninja -C build
         - cd build && time meson test -v
     dependencies: []
 
 test-debian-asan:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: test
     tags:
         - debian
         - amd64
     cache:
-        key: testdata.git
+        key: testdata.git-20190215
         paths:
             - cache/dav1d-test-data.git/
     variables:
         ASAN_OPTIONS: 'detect_leaks=0'
     script:
         - test -d cache || mkdir cache
         - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
         - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
         - git clone cache/dav1d-test-data.git tests/dav1d-test-data
-        - meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=address -Dbuild_asm=false
+        - meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=address -Dbuild_asm=false
         - ninja -C build
         - cd build && time meson test -v --setup=sanitizer
     dependencies: []
 
 test-debian-msan:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: test
     tags:
         - debian
         - amd64
     cache:
-        key: testdata.git
+        key: testdata.git-20190215
         paths:
             - cache/dav1d-test-data.git/
     variables:
         MSAN_OPTIONS: 'exitcode=1'
     script:
         - test -d cache || mkdir cache
         - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
         - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
         - git clone cache/dav1d-test-data.git tests/dav1d-test-data
-        - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=memory -Db_lundef=false -Dbuild_asm=false
+        - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=memory -Db_lundef=false -Dbuild_asm=false
         - ninja -C build
         - cd build && time meson test -v --setup=sanitizer
     dependencies: []
 
 test-debian-ubsan:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: test
     tags:
         - debian
         - amd64
     cache:
-        key: testdata.git
+        key: testdata.git-20190215
         paths:
             - cache/dav1d-test-data.git/
     variables:
         UBSAN_OPTIONS: 'print_stacktrace=1:halt_on_error=1'
     script:
         - test -d cache || mkdir cache
         - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
         - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
         - git clone cache/dav1d-test-data.git tests/dav1d-test-data
-        - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=undefined -Db_lundef=false -Dbuild_asm=false
+        - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Dlogging=false -Db_sanitize=undefined -Db_lundef=false -Dbuild_asm=false
         - ninja -C build
         - cd build && time meson test -v --setup=sanitizer
     dependencies: []
+
+test-win64:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
+    stage: test
+    tags:
+        - debian
+        - amd64
+    cache:
+        key: testdata.git-20190215
+        paths:
+            - cache/dav1d-test-data.git/
+    script:
+        - test -d cache || mkdir cache
+        - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
+        - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
+        - git clone cache/dav1d-test-data.git tests/dav1d-test-data
+        - wineserver -p && wine wineboot
+        - meson build --buildtype release
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+                      --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
+        - ninja -C build
+        - cd build && time meson test -v
+    dependencies: []
--- a/third_party/dav1d/COPYING
+++ b/third_party/dav1d/COPYING
@@ -1,9 +1,9 @@
-Copyright © 2018, VideoLAN and dav1d authors
+Copyright © 2018-2019, VideoLAN and dav1d authors
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 
 1. Redistributions of source code must retain the above copyright notice, this
    list of conditions and the following disclaimer.
 
--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@@ -1,14 +1,32 @@
-Changes for 0.1.1 'Gazelle':
+Changes for 0.2.2 'Antelope':
+----------------------------
+
+
+Changes for 0.2.1 'Antelope':
 ----------------------------
 
+ - SSSE3 optimization for cdef_dir
+ - AVX-2 improvements of the existing CDEF optimizations
+ - NEON improvements of the existing CDEF and wiener optimizations
+ - Clarification about the numbering/versionning scheme
+
+
+Changes for 0.2.0 'Antelope':
+----------------------------
+
+ - ARM64 and ARM optimizations using NEON instructions
+ - SSSE3 optimizations for both 32 and 64bits
+ - More AVX-2 assembly, reaching almost completion
  - Fix installation of includes
  - Rewrite inverse transforms to avoid overflows
- - More AVX-2 assembly
+ - Snap packaging for Linux
+ - Updated API (ABI and API break)
+ - Fixes for un-decodable samples
 
 
 Changes for 0.1.0 'Gazelle':
 ----------------------------
 
 Initial release of dav1d, the fast and small AV1 decoder.
  - Support for all features of the AV1 bitstream
  - Support for all bitdepth, 8, 10 and 12bits
--- a/third_party/dav1d/README.md
+++ b/third_party/dav1d/README.md
@@ -30,17 +30,17 @@ 2. Provide a usable API,
 3. Port to most platforms,
 4. Make it fast on desktop, by writing asm for AVX-2 chips.
 
 ### On-going
 5. Make it fast on mobile, by writing asm for ARMv8 chips,
 6. Make it fast on older desktop, by writing asm for SSE chips.
 
 ### After
-7. Improve C code base with [various tweaks](wiki/task-list),
+7. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
 8. Accelerate for less common architectures,
 9. Use more GPU, when possible.
 
 # Contribute
 
 Currently, we are looking for help from:
 - C developers,
 - asm developers,
@@ -65,17 +65,17 @@ People will keep their copyright and the
 VideoLAN will only have the collective work rights.
 
 ## CoC
 
 The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this project.
 
 # Compile
 
-1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13 or higher)
+1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
 2. Run `meson build --buildtype release`
 3. Build with `ninja -C build`
 
 # Run tests
 
 1. During initial build dir setup or `meson configure` specify `-Dbuild_tests=true`
 2. In the build directory run `meson test` optionally with `-v` for more verbose output, especially useful
    for checkasm
@@ -83,16 +83,20 @@ 2. In the build directory run `meson tes
 # Run testdata based tests
 
 1. Checkout the test data repository
 
    ```
    git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
    ```
 2. During initial build dir setup or `meson configure` specify `-Dbuild_tests=true` and `-Dtestdata_tests=true`
+
+   ```
+   meson .test -Dbuild_tests=true -Dtestdata_tests=true
+   ```
 3. In the build directory run `meson test` optionally with `-v` for more verbose output
 
 # Support
 
 This project is partially funded by the *Alliance for Open Media*/**AOM** and is supported by TwoOrioles and VideoLabs.
 
 These companies can provide support and integration help, should you need it.
 
--- a/third_party/dav1d/THANKS.md
+++ b/third_party/dav1d/THANKS.md
@@ -11,9 +11,9 @@ The Alliance for Open Media (AOM) for fu
 * VideoLAN
 * FFmpeg
 * libplacebo
 
 ## Individual
 
 And all the dav1d Authors (git shortlog -sn), including:
 
-Janne Grunau, Ronald S. Bultje, James Almer, Marvin Scholz, Henrik Gramner, Martin Storsjö, Luc Trudeau, David Michael Barr, Hugo Beauzée-Luyssen, Steve Lhomme, Jean-Baptiste Kempf, Derek Buitenhuis, Nathan E. Egge, Raphaël Zumer, Francois Cartegnie, Niklas Haas, Konstantin Pavlov, Boyuan Xiao, Raphael Zumer and Michael Bradshaw.
+Janne Grunau, Ronald S. Bultje, Martin Storsjö, James Almer, Henrik Gramner, Marvin Scholz, Luc Trudeau, David Michael Barr, Jean-Baptiste Kempf, Hugo Beauzée-Luyssen, Steve Lhomme, Francois Cartegnie, Konstantin Pavlov, Nathan E. Egge, Victorien Le Couviour--Tuffet, Derek Buitenhuis, Liwei Wang, Raphaël Zumer, Michael Bradshaw, Niklas Haas, Xuefeng Jiang, Boyuan Xiao, Kyle Siefring, Matthias Dressel, Rupert Swarbrick, Thierry Foucu, Thomas Daede, Jan Beich, SmilingWolf, Tristan Laurent, Vittorio Giovara, Anisse Astier, Dale Curtis, Fred Barbier, Jean-Yves Avenard, Luca Barbato, Mark Shuttleworth, Nicolas Frattaroli, Rostislav Pehlivanov, Shiz, Steinar Midtskogen, Timo Gurr and skal.
--- a/third_party/dav1d/include/common/attributes.h
+++ b/third_party/dav1d/include/common/attributes.h
@@ -20,27 +20,29 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_ATTRIBUTES_H__
-#define __DAV1D_COMMON_ATTRIBUTES_H__
+#ifndef DAV1D_COMMON_ATTRIBUTES_H
+#define DAV1D_COMMON_ATTRIBUTES_H
 
 #include "config.h"
 
 #include <stddef.h>
 
 #ifdef __GNUC__
 #define ATTR_ALIAS __attribute__((may_alias))
+#define ATTR_FORMAT_PRINTF(fmt, attr) __attribute__((__format__(__printf__, fmt, attr)));
 #else
 #define ATTR_ALIAS
+#define ATTR_FORMAT_PRINTF(fmt, attr)
 #endif
 
 #if ARCH_X86_64
 /* x86-64 needs 32-byte alignment for AVX2. */
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
 #elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
 /* ARM doesn't benefit from anything more than 16-byte alignment. */
@@ -132,9 +134,9 @@ static inline int clz(const unsigned int
     return __builtin_clz(mask);
 }
 
 static inline int clzll(const unsigned long long mask) {
     return __builtin_clzll(mask);
 }
 #endif /* !_MSC_VER */
 
-#endif /* __DAV1D_COMMON_ATTRIBUTES_H__ */
+#endif /* DAV1D_COMMON_ATTRIBUTES_H */
--- a/third_party/dav1d/include/common/bitdepth.h
+++ b/third_party/dav1d/include/common/bitdepth.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_BITDEPTH_H__
-#define __DAV1D_COMMON_BITDEPTH_H__ 1
+#ifndef DAV1D_COMMON_BITDEPTH_H
+#define DAV1D_COMMON_BITDEPTH_H
 
 #include <stdint.h>
 #include <string.h>
 
 #if !defined(BITDEPTH)
 typedef void pixel;
 typedef void coef;
 #define HIGHBD_DECL_SUFFIX /* nothing */
@@ -72,9 +72,9 @@ static inline void pixel_set(pixel *cons
 #error invalid value for bitdepth
 #endif
 #define bytefn(x) bitfn(x)
 
 #define bitfn_decls(name, ...) \
 name##_8bpc(__VA_ARGS__); \
 name##_16bpc(__VA_ARGS__)
 
-#endif /* __DAV1D_COMMON_BITDEPTH_H__ */
+#endif /* DAV1D_COMMON_BITDEPTH_H */
--- a/third_party/dav1d/include/common/dump.h
+++ b/third_party/dav1d/include/common/dump.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_DUMP_H__
-#define __DAV1D_COMMON_DUMP_H__
+#ifndef DAV1D_COMMON_DUMP_H
+#define DAV1D_COMMON_DUMP_H
 
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
 
 #include "common/bitdepth.h"
 
 static inline void append_plane_to_file(const pixel *buf, ptrdiff_t stride,
@@ -78,9 +78,9 @@ static inline void ac_dump(const int16_t
     while (h--) {
         for (int x = 0; x < w; x++)
             printf(" %03d", buf[x]);
         buf += w;
         printf("\n");
     }
 }
 
-#endif /* __DAV1D_COMMON_DUMP_H__ */
+#endif /* DAV1D_COMMON_DUMP_H */
--- a/third_party/dav1d/include/common/intops.h
+++ b/third_party/dav1d/include/common/intops.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_INTOPS_H__
-#define __DAV1D_COMMON_INTOPS_H__
+#ifndef DAV1D_COMMON_INTOPS_H
+#define DAV1D_COMMON_INTOPS_H
 
 #include <stdint.h>
 
 #include "common/attributes.h"
 
 static inline int imax(const int a, const int b) {
     return a > b ? a : b;
 }
@@ -68,9 +68,9 @@ static inline unsigned inv_recenter(cons
     if (v > (r << 1))
         return v;
     else if ((v & 1) == 0)
         return (v >> 1) + r;
     else
         return r - ((v + 1) >> 1);
 }
 
-#endif /* __DAV1D_COMMON_INTOPS_H__ */
+#endif /* DAV1D_COMMON_INTOPS_H */
--- a/third_party/dav1d/include/common/mem.h
+++ b/third_party/dav1d/include/common/mem.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_MEM_H__
-#define __DAV1D_COMMON_MEM_H__
+#ifndef DAV1D_COMMON_MEM_H
+#define DAV1D_COMMON_MEM_H
 
 #include <assert.h>
 #include <stdlib.h>
 
 #if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
 #include <malloc.h>
 #endif
 
@@ -75,9 +75,9 @@ static inline void dav1d_freep_aligned(v
 static inline void freep(void *ptr) {
     void **mem = (void **) ptr;
     if (*mem) {
         free(*mem);
         *mem = NULL;
     }
 }
 
-#endif /* __DAV1D_COMMON_MEM_H__ */
+#endif /* DAV1D_COMMON_MEM_H */
--- a/third_party/dav1d/include/common/validate.h
+++ b/third_party/dav1d/include/common/validate.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_VALIDATE_H__
-#define __DAV1D_COMMON_VALIDATE_H__
+#ifndef DAV1D_COMMON_VALIDATE_H
+#define DAV1D_COMMON_VALIDATE_H
 
 #include <stdio.h>
 #include <stdlib.h>
 
 #if defined(NDEBUG)
 #define debug_abort()
 #else
 #define debug_abort abort
@@ -51,9 +51,9 @@
         fprintf(stderr, "Input validation check \'%s\' failed in %s!\n", \
                 #x, __func__); \
         debug_abort(); \
         return r; \
     }
 
 #define validate_input(x) validate_input_or_ret(x, )
 
-#endif /* __DAV1D_COMMON_VALIDATE_H__ */
+#endif /* DAV1D_COMMON_VALIDATE_H */
--- a/third_party/dav1d/include/dav1d/common.h
+++ b/third_party/dav1d/include/dav1d/common.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_H__
-#define __DAV1D_COMMON_H__
+#ifndef DAV1D_COMMON_H
+#define DAV1D_COMMON_H
 
 #include <stddef.h>
 #include <stdint.h>
 
 #ifndef DAV1D_API
     #if defined _WIN32
       #define DAV1D_API __declspec(dllexport)
     #else
@@ -62,9 +62,9 @@ typedef struct Dav1dUserData {
 typedef struct Dav1dDataProps {
     int64_t timestamp; ///< container timestamp of input data, INT64_MIN if unknown (default)
     int64_t duration; ///< container duration of input data, 0 if unknown (default)
     int64_t offset; ///< stream offset of input data, -1 if unknown (default)
     size_t size; ///< packet size, default Dav1dData.sz
     struct Dav1dUserData user_data; ///< user-configurable data, default NULL members
 } Dav1dDataProps;
 
-#endif // __DAV1D_COMMON_H__
+#endif /* DAV1D_COMMON_H */
--- a/third_party/dav1d/include/dav1d/data.h
+++ b/third_party/dav1d/include/dav1d/data.h
@@ -20,29 +20,29 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_DATA_H__
-#define __DAV1D_DATA_H__
+#ifndef DAV1D_DATA_H
+#define DAV1D_DATA_H
 
 #include <stddef.h>
 #include <stdint.h>
 
 #include "common.h"
 
 typedef struct Dav1dData {
     const uint8_t *data; ///< data pointer
     size_t sz; ///< data size
     struct Dav1dRef *ref; ///< allocation origin
-    Dav1dDataProps m;
+    Dav1dDataProps m; ///< user provided metadata passed to the output picture
 } Dav1dData;
 
 /**
  * Allocate data.
  *
  * @param data Input context.
  * @param   sz Size of the data that should be allocated.
  *
@@ -101,9 +101,9 @@ DAV1D_API int dav1d_data_wrap_user_data(
  * The reference count for data->m.user_data will be decremented (if it has been
  * initialized with dav1d_data_wrap_user_data). The $data object will be memset
  * to 0.
  *
  * @param data Input context.
  */
 DAV1D_API void dav1d_data_unref(Dav1dData *data);
 
-#endif /* __DAV1D_DATA_H__ */
+#endif /* DAV1D_DATA_H */
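
The new comment on `Dav1dData.m` documents that caller-provided metadata is carried through to the decoded picture. A hedged sketch of that timestamp round-trip (the helper below is illustrative, not part of the patch; EAGAIN handling and draining are omitted):

```c
#include <stdint.h>
#include <dav1d/dav1d.h>

/* Illustrative helper: feed one temporal unit whose m.timestamp the caller
 * has already set, and read the same timestamp back from the output picture. */
static int64_t decode_one_and_get_pts(Dav1dContext *ctx, Dav1dData *in) {
    int64_t pts = INT64_MIN;
    if (dav1d_send_data(ctx, in) < 0)
        return pts;
    Dav1dPicture pic = { 0 };
    if (dav1d_get_picture(ctx, &pic) == 0) {
        pts = pic.m.timestamp; /* same value the caller set on in->m.timestamp */
        dav1d_picture_unref(&pic);
    }
    return pts;
}
```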
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -20,42 +20,57 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_H__
-#define __DAV1D_H__
+#ifndef DAV1D_H
+#define DAV1D_H
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #include <errno.h>
+#include <stdarg.h>
 
 #include "common.h"
 #include "picture.h"
 #include "data.h"
+#include "version.h"
 
 typedef struct Dav1dContext Dav1dContext;
 typedef struct Dav1dRef Dav1dRef;
 
 #define DAV1D_MAX_FRAME_THREADS 256
 #define DAV1D_MAX_TILE_THREADS 64
 
+typedef struct Dav1dLogger {
+    void *cookie; ///< Custom data to pass to the callback.
+    /**
+     * Logger callback. Default prints to stderr. May be NULL to disable logging.
+     *
+     * @param cookie Custom pointer passed to all calls.
+     * @param format The vprintf compatible format string.
+     * @param     ap List of arguments referenced by the format string.
+     */
+    void (*callback)(void *cookie, const char *format, va_list ap);
+} Dav1dLogger;
+
 typedef struct Dav1dSettings {
     int n_frame_threads;
     int n_tile_threads;
-    Dav1dPicAllocator allocator;
     int apply_grain;
     int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
     int all_layers; ///< output all spatial layers of a scalable AV1 biststream
+    Dav1dPicAllocator allocator;
+    Dav1dLogger logger;
 } Dav1dSettings;
 
 /**
  * Get library version.
  */
 DAV1D_API const char *dav1d_version(void);
 
 /**
@@ -182,9 +197,9 @@ DAV1D_API void dav1d_close(Dav1dContext 
  *
  */
 DAV1D_API void dav1d_flush(Dav1dContext *c);
 
 # ifdef __cplusplus
 }
 # endif
 
-#endif /* __DAV1D_H__ */
+#endif /* DAV1D_H */
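
The new `Dav1dLogger` member of `Dav1dSettings` lets callers redirect library messages; the default prints to stderr and a NULL callback disables logging. A short sketch (illustrative, not part of the patch) of installing a custom callback through the existing `dav1d_default_settings()`/`dav1d_open()` entry points:

```c
#include <stdarg.h>
#include <stdio.h>
#include <dav1d/dav1d.h>

/* Illustrative logger: route dav1d messages to the FILE* passed as cookie. */
static void my_log(void *cookie, const char *format, va_list ap) {
    vfprintf((FILE *) cookie, format, ap);
}

static Dav1dContext *open_with_logger(FILE *logfile) {
    Dav1dSettings s;
    dav1d_default_settings(&s);
    s.logger.cookie = logfile;
    s.logger.callback = my_log; /* NULL would silence the library */
    Dav1dContext *ctx = NULL;
    return dav1d_open(&ctx, &s) == 0 ? ctx : NULL;
}
```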
--- a/third_party/dav1d/include/dav1d/headers.h
+++ b/third_party/dav1d/include/dav1d/headers.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_HEADERS_H__
-#define __DAV1D_HEADERS_H__
+#ifndef DAV1D_HEADERS_H
+#define DAV1D_HEADERS_H
 
 // Constants from Section 3. "Symbols and abbreviated terms"
 #define DAV1D_MAX_CDEF_STRENGTHS 8
 #define DAV1D_MAX_OPERATING_POINTS 32
 #define DAV1D_MAX_TILE_COLS 64
 #define DAV1D_MAX_TILE_ROWS 64
 #define DAV1D_MAX_SEGMENTS 8
 #define DAV1D_NUM_REF_FRAMES 8
@@ -155,16 +155,32 @@ enum Dav1dMatrixCoefficients {
 
 enum Dav1dChromaSamplePosition {
     DAV1D_CHR_UNKNOWN = 0,
     DAV1D_CHR_VERTICAL = 1,  ///< Horizontally co-located with luma(0, 0)
                            ///< sample, between two vertical samples
     DAV1D_CHR_COLOCATED = 2, ///< Co-located with luma(0, 0) sample
 };
 
+typedef struct Dav1dContentLightLevel {
+    int max_content_light_level;
+    int max_frame_average_light_level;
+} Dav1dContentLightLevel;
+
+typedef struct Dav1dMasteringDisplay {
+    ///< 0.16 fixed point
+    uint16_t primaries[3][2];
+    ///< 0.16 fixed point
+    uint16_t white_point[2];
+    ///< 24.8 fixed point
+    uint32_t max_luminance;
+    ///< 18.14 fixed point
+    uint32_t min_luminance;
+} Dav1dMasteringDisplay;
+
 typedef struct Dav1dSequenceHeader {
     /**
      * Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
      * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component,
      * or 12 bits/component at any chroma subsampling.
      */
     int profile;
     /**
@@ -174,31 +190,36 @@ typedef struct Dav1dSequenceHeader {
      */
     int max_width, max_height;
     enum Dav1dPixelLayout layout; ///< format of the picture
     enum Dav1dColorPrimaries pri; ///< color primaries (av1)
     enum Dav1dTransferCharacteristics trc; ///< transfer characteristics (av1)
     enum Dav1dMatrixCoefficients mtrx; ///< matrix coefficients (av1)
     enum Dav1dChromaSamplePosition chr; ///< chroma sample position (av1)
     /**
+     * 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not
+     * exactly the same as 'hbd' from the spec; the spec's hbd distinguishes
+     * between 8 (0) and 10-12 (1) bits/component, and another element
+     * (twelve_bit) to distinguish between 10 and 12 bits/component. To get
+     * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
+     */
+    int hbd;
+    /**
      * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
      * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
      */
     int color_range;
 
     int num_operating_points;
     struct Dav1dSequenceHeaderOperatingPoint {
         int major_level, minor_level;
         int initial_display_delay;
         int idc;
         int tier;
         int decoder_model_param_present;
-        int decoder_buffer_delay;
-        int encoder_buffer_delay;
-        int low_delay_mode;
         int display_model_param_present;
     } operating_points[DAV1D_MAX_OPERATING_POINTS];
 
     int still_picture;
     int reduced_still_picture_header;
     int timing_info_present;
     int num_units_in_tick;
     int time_scale;
@@ -225,28 +246,32 @@ typedef struct Dav1dSequenceHeader {
     int jnt_comp;
     int ref_frame_mvs;
     enum Dav1dAdaptiveBoolean screen_content_tools;
     enum Dav1dAdaptiveBoolean force_integer_mv;
     int order_hint_n_bits;
     int super_res;
     int cdef;
     int restoration;
-    /**
-     * 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not
-     * exactly the same as 'hbd' from the spec; the spec's hbd distinguishes
-     * between 8 (0) and 10-12 (1) bits/component, and another element
-     * (twelve_bit) to distinguish between 10 and 12 bits/component. To get
-     * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
-     */
-    int hbd;
     int ss_hor, ss_ver, monochrome;
     int color_description_present;
     int separate_uv_delta_q;
     int film_grain_present;
+
+    // Dav1dSequenceHeaders of the same sequence are required to be
+    // bit-identical until this offset. See 7.5 "Ordering of OBUs":
+    //   Within a particular coded video sequence, the contents of
+    //   sequence_header_obu must be bit-identical each time the
+    //   sequence header appears except for the contents of
+    //   operating_parameters_info.
+    struct Dav1dSequenceHeaderOperatingParameterInfo {
+        int decoder_buffer_delay;
+        int encoder_buffer_delay;
+        int low_delay_mode;
+    } operating_parameter_info[DAV1D_MAX_OPERATING_POINTS];
 } Dav1dSequenceHeader;
 
 typedef struct Dav1dSegmentationData {
     int delta_q;
     int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
     int ref;
     int skip;
     int globalmv;
@@ -377,9 +402,9 @@ typedef struct Dav1dFrameHeader {
     enum Dav1dTxfmMode txfm_mode;
     int switchable_comp_refs;
     int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
     int warp_motion;
     int reduced_txtp_set;
     Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME];
 } Dav1dFrameHeader;
 
-#endif /* __DAV1D_HEADERS_H__ */
+#endif /* DAV1D_HEADERS_H */
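
The `hbd` field, moved up in the struct here, keeps the 0/1/2 encoding described in its comment. A small sketch (illustrative, not part of the patch) of mapping it to bits per component and to the spec's `hbd`/`twelve_bit` flags:

```c
#include <dav1d/headers.h>

/* Illustrative mapping of Dav1dSequenceHeader.hbd, per the comment above. */
static void describe_bitdepth(const Dav1dSequenceHeader *seq_hdr,
                              int *bpc, int *spec_hbd, int *twelve_bit) {
    *bpc = 8 + 2 * seq_hdr->hbd;     /* 0 -> 8, 1 -> 10, 2 -> 12 bits */
    *spec_hbd = !!seq_hdr->hbd;      /* spec hbd: 8 vs 10-12 bits */
    *twelve_bit = seq_hdr->hbd == 2; /* spec twelve_bit: 10 vs 12 bits */
}
```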
copy from third_party/dav1d/include/meson.build
copy to third_party/dav1d/include/dav1d/meson.build
--- a/third_party/dav1d/include/meson.build
+++ b/third_party/dav1d/include/dav1d/meson.build
@@ -1,9 +1,9 @@
-# Copyright © 2018, VideoLAN and dav1d authors
+# Copyright © 2019, VideoLAN and dav1d authors
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # 1. Redistributions of source code must retain the above copyright notice, this
 #    list of conditions and the following disclaimer.
 #
@@ -17,21 +17,25 @@
 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# Revision file (version.h) generation
-dav1d_git_dir = join_paths(dav1d_src_root, '.git')
-rev_target = vcs_tag(command: [
-        'git', '--git-dir', dav1d_git_dir,
-        'describe', '--tags', '--long',
-        '--match', '?.*.*', '--always'
-    ],
-    input: 'version.h.in',
-    output: 'version.h'
-)
+# installed version.h header generation
+version_h_data = configuration_data()
+version_h_data.set('DAV1D_API_VERSION_MAJOR', dav1d_api_version_major)
+version_h_data.set('DAV1D_API_VERSION_MINOR', dav1d_api_version_minor)
+version_h_data.set('DAV1D_API_VERSION_PATCH', dav1d_api_version_revision)
+version_h_target = configure_file(input: 'version.h.in',
+                                  output: 'version.h',
+                                  configuration: version_h_data)
 
-# Install include/dav1d headers
-install_subdir('dav1d', install_dir: get_option('includedir'))
+# install headers
+install_headers('common.h',
+                'data.h',
+                'dav1d.h',
+                'headers.h',
+                'picture.h',
+                version_h_target,
+                subdir : 'dav1d')
--- a/third_party/dav1d/include/dav1d/picture.h
+++ b/third_party/dav1d/include/dav1d/picture.h
@@ -20,25 +20,30 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_PICTURE_H__
-#define __DAV1D_PICTURE_H__
+#ifndef DAV1D_PICTURE_H
+#define DAV1D_PICTURE_H
 
 #include <stddef.h>
 #include <stdint.h>
 
 #include "common.h"
 #include "headers.h"
 
+/* Number of bytes to align AND pad picture memory buffers by, so that SIMD
+ * implementations can over-read by a few bytes, and use aligned read/write
+ * instructions. */
+#define DAV1D_PICTURE_ALIGNMENT 32
+
 typedef struct Dav1dPictureParameters {
     int w; ///< width (in pixels)
     int h; ///< height (in pixels)
     enum Dav1dPixelLayout layout; ///< format of the picture
     int bpc; ///< bits per pixel component (8 or 10)
 } Dav1dPictureParameters;
 
 typedef struct Dav1dPicture {
@@ -56,42 +61,60 @@ typedef struct Dav1dPicture {
 
     /**
      * Number of bytes between 2 lines in data[] for luma [0] or chroma [1].
      */
     ptrdiff_t stride[2];
 
     Dav1dPictureParameters p;
     Dav1dDataProps m;
-    struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref, *ref; ///< allocation origins
+
+    /**
+     * High Dynamic Range Content Light Level metadata applying to this picture,
+     * as defined in section 5.8.3 and 6.7.3
+     */
+    Dav1dContentLightLevel *content_light;
+    /**
+     * High Dynamic Range Mastering Display Color Volume metadata applying to
+     * this picture, as defined in section 5.8.4 and 6.7.4
+     */
+    Dav1dMasteringDisplay *mastering_display;
+
+    struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref; ///< Frame parameter allocation origins
+    struct Dav1dRef *content_light_ref, *mastering_display_ref; ///< Metadata allocation origins
+    struct Dav1dRef *ref; ///< Frame data allocation origin
 
     void *allocator_data; ///< pointer managed by the allocator
 } Dav1dPicture;
 
 typedef struct Dav1dPicAllocator {
     void *cookie; ///< custom data to pass to the allocator callbacks.
     /**
      * Allocate the picture buffer based on the Dav1dPictureParameters.
      *
-     * The data[0], data[1] and data[2] must be 32 byte aligned and with a
-     * pixel width/height multiple of 128 pixels.
+     * The data[0], data[1] and data[2] must be DAV1D_PICTURE_ALIGNMENT-byte
+     * aligned, with a pixel width/height that is a multiple of 128 pixels.
+     * Any allocated memory area should also be padded by
+     * DAV1D_PICTURE_ALIGNMENT bytes.
      * data[1] and data[2] must share the same stride[1].
      *
      * This function will be called on the main thread (the thread which calls
      * dav1d_get_picture()).
      *
      * @param  pic The picture to allocate the buffer for. The callback needs to
      *             fill the picture data[0], data[1], data[2], stride[0] and
      *             stride[1].
      *             The allocator can fill the pic allocator_data pointer with
      *             a custom pointer that will be passed to
      *             release_picture_callback().
      * @param cookie Custom pointer passed to all calls.
-    *
-    * @return 0 on success. A negative errno value on error.
+     *
+     * @note No fields other than data, stride and allocator_data may be
+     *       filled by this callback.
+     * @return 0 on success. A negative errno value on error.
      */
     int (*alloc_picture_callback)(Dav1dPicture *pic, void *cookie);
     /**
      * Release the picture buffer.
      *
      * If frame threading is used, this function may be called by the main
      * thread (the thread which calls dav1d_get_picture()) or any of the frame
      * threads and thus must be thread-safe. If frame threading is not used,
@@ -103,9 +126,9 @@ typedef struct Dav1dPicAllocator {
     void (*release_picture_callback)(Dav1dPicture *pic, void *cookie);
 } Dav1dPicAllocator;
 
 /**
  * Release reference to a picture.
  */
 DAV1D_API void dav1d_picture_unref(Dav1dPicture *p);
 
-#endif /* __DAV1D_PICTURE_H__ */
+#endif /* DAV1D_PICTURE_H */
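A minimal sketch of how a client could satisfy the allocator contract documented above (illustrative code, not part of the patch): it assumes C11 aligned_alloc, 8 bpc content (one byte per component) and the Dav1dPixelLayout values from dav1d/headers.h; the function names are hypothetical, and a real allocator would also need to handle 10 bpc.

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#include <dav1d/picture.h>

/* Hypothetical allocator honoring the constraints documented above:
 * planes aligned to and padded by DAV1D_PICTURE_ALIGNMENT bytes,
 * width/height rounded up to a multiple of 128 pixels, and data[1]/data[2]
 * sharing stride[1]. 8 bpc only, for brevity. */
static int example_alloc_picture(Dav1dPicture *pic, void *cookie) {
    (void)cookie;
    const int aligned_w = (pic->p.w + 127) & ~127;
    const int aligned_h = (pic->p.h + 127) & ~127;
    const int has_chroma = pic->p.layout != DAV1D_PIXEL_LAYOUT_I400;
    const int ss_hor = pic->p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int ss_ver = pic->p.layout == DAV1D_PIXEL_LAYOUT_I420;

    pic->stride[0] = aligned_w;
    pic->stride[1] = has_chroma ? aligned_w >> ss_hor : 0;
    const size_t y_size  = (size_t)pic->stride[0] * aligned_h;
    const size_t uv_size = (size_t)pic->stride[1] * (aligned_h >> ss_ver);

    /* One block for all three planes, padded by DAV1D_PICTURE_ALIGNMENT bytes
     * so SIMD code may over-read, and rounded up to a multiple of the
     * alignment as aligned_alloc() requires. */
    size_t total = y_size + 2 * uv_size + DAV1D_PICTURE_ALIGNMENT;
    total = (total + DAV1D_PICTURE_ALIGNMENT - 1) &
            ~(size_t)(DAV1D_PICTURE_ALIGNMENT - 1);
    uint8_t *const buf = aligned_alloc(DAV1D_PICTURE_ALIGNMENT, total);
    if (!buf) return -ENOMEM;

    /* Only data, stride and allocator_data are filled, as required. */
    pic->data[0] = buf;
    pic->data[1] = has_chroma ? buf + y_size : NULL;
    pic->data[2] = has_chroma ? buf + y_size + uv_size : NULL;
    pic->allocator_data = buf; /* handed back to the release callback */
    return 0;
}

static void example_release_picture(Dav1dPicture *pic, void *cookie) {
    (void)cookie;
    free(pic->allocator_data);
}

Such a pair would typically be installed through the allocator member of Dav1dSettings (declared in dav1d/dav1d.h, outside the hunks shown here); per the comment above, the release callback must be thread-safe whenever frame threading is enabled.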
copy from third_party/dav1d/src/obu.h
copy to third_party/dav1d/include/dav1d/version.h.in
--- a/third_party/dav1d/src/obu.h
+++ b/third_party/dav1d/include/dav1d/version.h.in
@@ -1,11 +1,10 @@
 /*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
+ * Copyright © 2019, VideoLAN and dav1d authors
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice, this
  *    list of conditions and the following disclaimer.
  *
@@ -20,17 +19,16 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_OBU_H__
-#define __DAV1D_SRC_OBU_H__
+#ifndef DAV1D_VERSION_H
+#define DAV1D_VERSION_H
 
-#include "dav1d/data.h"
-#include "src/internal.h"
+#define DAV1D_API_VERSION_MAJOR @DAV1D_API_VERSION_MAJOR@
+#define DAV1D_API_VERSION_MINOR @DAV1D_API_VERSION_MINOR@
+#define DAV1D_API_VERSION_PATCH @DAV1D_API_VERSION_PATCH@
 
-int dav1d_parse_obus(Dav1dContext *c, Dav1dData *in, int global);
-
-#endif /* __DAV1D_SRC_OBU_H__ */
+#endif /* DAV1D_VERSION_H */
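The macros generated from this template give applications a compile-time view of the API version; a purely illustrative consumer-side guard, assuming the header is installed as dav1d/version.h as arranged in the meson hunks above, could look like:

#include <dav1d/version.h>

/* Illustrative compile-time check against the API version that this patch
 * starts exporting (1.0.x, per dav1d_soname_version later in the patch). */
#if DAV1D_API_VERSION_MAJOR < 1
#error "a dav1d build exporting API version 1.x or newer is required"
#endif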
--- a/third_party/dav1d/include/meson.build
+++ b/third_party/dav1d/include/meson.build
@@ -17,21 +17,20 @@
 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-# Revision file (version.h) generation
+# Revision file (vcs_version.h) generation
 dav1d_git_dir = join_paths(dav1d_src_root, '.git')
 rev_target = vcs_tag(command: [
         'git', '--git-dir', dav1d_git_dir,
         'describe', '--tags', '--long',
         '--match', '?.*.*', '--always'
     ],
-    input: 'version.h.in',
-    output: 'version.h'
+    input: 'vcs_version.h.in',
+    output: 'vcs_version.h'
 )
 
-# Install include/dav1d headers
-install_subdir('dav1d', install_dir: get_option('includedir'))
+subdir('dav1d')
rename from third_party/dav1d/include/version.h.in
rename to third_party/dav1d/include/vcs_version.h.in
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -1,9 +1,9 @@
-# Copyright © 2018, VideoLAN and dav1d authors
+# Copyright © 2018-2019, VideoLAN and dav1d authors
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 # 1. Redistributions of source code must retain the above copyright notice, this
 #    list of conditions and the following disclaimer.
 #
@@ -18,39 +18,40 @@
 # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '0.1.1',
+    version: '0.2.2',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
                       'b_ndebug=if-release'],
     meson_version: '>= 0.47.0')
 
-dav1d_version_array    = meson.project_version().split('.')
-dav1d_version_major    = dav1d_version_array[0]
-dav1d_version_minor    = dav1d_version_array[1]
-dav1d_version_revision = dav1d_version_array[2]
+dav1d_soname_version   = '1.0.1'
+dav1d_api_version_array    = dav1d_soname_version.split('.')
+dav1d_api_version_major    = dav1d_api_version_array[0]
+dav1d_api_version_minor    = dav1d_api_version_array[1]
+dav1d_api_version_revision = dav1d_api_version_array[2]
 
 dav1d_src_root = meson.current_source_dir()
 cc = meson.get_compiler('c')
 
 # Configuration data for config.h
 cdata = configuration_data()
 
 # Configuration data for config.asm
 cdata_asm = configuration_data()
 
 # Include directories
-dav1d_inc_dirs = include_directories(['.', 'include', 'include/dav1d'])
+dav1d_inc_dirs = include_directories(['.', 'include/dav1d', 'include'])
 
 
 
 #
 # Option handling
 #
 
 # Bitdepth option
@@ -65,16 +66,18 @@ is_asm_enabled = (get_option('build_asm'
      host_machine.cpu_family() == 'aarch64'      or
      host_machine.cpu_family().startswith('arm')))
 cdata.set10('HAVE_ASM', is_asm_enabled)
 
 if is_asm_enabled and get_option('b_sanitize') == 'memory'
     error('asm causes false positive with memory sanitizer. Use \'-Dbuild_asm=false\'.')
 endif
 
+# Logging option
+cdata.set10('CONFIG_LOG', get_option('logging'))
 
 #
 # OS/Compiler checks and defines
 #
 
 # Arguments in test_args will be used even on feature tests
 test_args = []
 
@@ -82,16 +85,22 @@ test_args = []
 test_args  += '-D_POSIX_C_SOURCE=200112L'
 add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
 
 if host_machine.system() == 'windows'
     cdata.set('_WIN32_WINNT',           '0x0601')
     cdata.set('UNICODE',                1) # Define to 1 for Unicode (Wide Chars) APIs
     cdata.set('_UNICODE',               1) # Define to 1 for Unicode (Wide Chars) APIs
     cdata.set('__USE_MINGW_ANSI_STDIO', 1) # Define to force use of MinGW printf
+    if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args)
+        cdata.set('_FILE_OFFSET_BITS', 64) # Not set by default by Meson on Windows
+    else
+        cdata.set('fseeko', '_fseeki64')
+        cdata.set('ftello', '_ftelli64')
+    endif
 endif
 
 # On Windows, we use a compatibility layer to emulate pthread
 if host_machine.system() == 'windows'
     thread_dependency = []
     thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
 else
     thread_dependency = dependency('threads')
@@ -117,16 +126,20 @@ if not cc.check_header('stdatomic.h')
         error('Atomics not supported')
     endif
 endif
 
 if cc.check_header('unistd.h')
     cdata.set('HAVE_UNISTD_H', 1)
 endif
 
+if cc.check_header('io.h')
+    cdata.set('HAVE_IO_H', 1)
+endif
+
 
 # Function checks
 
 if not cc.has_function('getopt_long', prefix : '#include <getopt.h>', args : test_args)
     getopt_dependency = declare_dependency(
         sources: files('tools/compat/getopt.c'),
         include_directories : include_directories('include/compat'),
     )
@@ -162,18 +175,20 @@ endif
 
 # Compiler flags that should be set,
 # but when the compiler does not support them
 # it is not an error and they are silently tolerated
 optional_arguments = [
   '-Wundef',
   '-Werror=vla',
   '-Wno-maybe-uninitialized',
+  '-Wno-missing-field-initializers',
   '-Wno-unused-parameter',
   '-Werror=missing-prototypes',
+  '-Wshorten-64-to-32',
 ]
 if cc.get_id() == 'msvc'
     optional_arguments += [
       '-wd4028', # parameter different from declaration
       '-wd4996'  # use of POSIX functions
     ]
 endif
 
@@ -194,18 +209,22 @@ if fuzzing_engine == 'libfuzzer'
     add_project_arguments(cc.first_supported_argument(fuzzer_args), language : 'c')
 endif
 
 # Stack alignments flags
 
 stackalign_flag = []
 stackrealign_flag = []
 
+cdata.set10('ENDIANNESS_BIG', host_machine.endian() == 'big')
+
 if host_machine.cpu_family().startswith('x86')
-    if host_machine.cpu_family() == 'x86_64'
+    if get_option('stack_alignment') > 0
+        stack_alignment = get_option('stack_alignment')
+    elif host_machine.cpu_family() == 'x86_64'
         if cc.has_argument('-mpreferred-stack-boundary=5')
             stackalign_flag = ['-mpreferred-stack-boundary=5']
             stackrealign_flag = ['-mincoming-stack-boundary=4']
             stack_alignment = 32
         elif cc.has_argument('-mstack-alignment=32')
             stackalign_flag = ['-mstack-alignment=32']
             stackrealign_flag = ['-mstackrealign']
             stack_alignment = 32
@@ -306,18 +325,18 @@ if is_asm_enabled and host_machine.cpu_f
 
     nasm = find_program('nasm')
 
     # check NASM version
     if nasm.found()
         nasm_r = run_command(nasm, '-v')
         out = nasm_r.stdout().strip().split()
         if out[1].to_lower() == 'version'
-            if out[2].version_compare('<2.13')
-                error('nasm 2.13 or later is required, found nasm @0@'.format(out[2]))
+            if out[2].version_compare('<2.13.02')
+                error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
             endif
         else
             error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
         endif
     endif
 
     if host_machine.system() == 'windows'
         nasm_format = 'win'
--- a/third_party/dav1d/meson_options.txt
+++ b/third_party/dav1d/meson_options.txt
@@ -15,18 +15,27 @@ option('build_tools',
     value: true,
     description: 'Build dav1d cli tools')
 
 option('build_tests',
     type: 'boolean',
     value: true,
     description: 'Build dav1d tests')
 
+option('logging',
+    type: 'boolean',
+    value: true,
+    description: 'Print error log messages using the provided callback function')
+
 option('testdata_tests',
     type: 'boolean',
     value: false,
     description: 'Run tests requiring the test data repository')
 
 option('fuzzing_engine',
     type: 'combo',
     choices : ['none', 'libfuzzer', 'oss-fuzz'],
     value: 'none',
     description: 'Select the fuzzing engine')
+
+option('stack_alignment',
+    type: 'integer',
+    value: 0)
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/snap/snapcraft.yaml
@@ -0,0 +1,24 @@
+name: dav1d
+base: core18
+version: git
+version-script: git describe HEAD --always
+summary: AV1 decoder from VideoLAN
+description: |
+  A small and fast AV1 decoder from the people who brought you VLC.
+
+grade: devel # must be 'stable' to release into candidate/stable channels
+confinement: strict # requires the right plugs and slots
+
+apps:
+  dav1d:
+    command: usr/bin/dav1d
+    plugs: [ 'home' ]
+
+parts:
+  dav1d:
+    plugin: meson
+    source: .
+    build-packages: [ 'nasm' ]
+    meson-parameters:
+      - --prefix=/usr
+      - --buildtype=release
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/src/arm/32/looprestoration.S
@@ -0,0 +1,685 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+
+// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
+//                                 const pixel *src, ptrdiff_t stride,
+//                                 const int16_t fh[7], const intptr_t w,
+//                                 int h, enum LrEdgeFlags edges);
+function wiener_filter_h_neon, export=1
+        push            {r4-r11,lr}
+        vpush           {q4}
+        ldrd            r4,  r5,  [sp, #52]
+        ldrd            r6,  r7,  [sp, #60]
+        mov             r8,  r5
+        vld1.16         {q0},  [r4]
+        movw            r9,  #(1 << 14) - (1 << 2)
+        vdup.16         q14,  r9
+        vmov.s16        q15,  #2048
+        // Calculate mid_stride
+        add             r10, r5,  #7
+        bic             r10, r10, #7
+        lsl             r10, r10, #1
+
+        // Clear the last unused element of q0, to allow filtering a single
+        // pixel with one plain vmul+vpadd.
+        mov             r12, #0
+        vmov.16         d1[3], r12
+
+        // Set up pointers for reading/writing alternate rows
+        add             r12, r0,  r10
+        lsl             r10, r10, #1
+        add             lr,  r2,  r3
+        lsl             r3,  r3,  #1
+
+        // Subtract the width from mid_stride
+        sub             r10, r10, r5, lsl #1
+
+        // For w >= 8, we read ((w+5)&~7)+8 pixels; for w < 8 we read 16 pixels.
+        cmp             r5,  #8
+        add             r11, r5,  #13
+        bic             r11, r11, #7
+        bge             1f
+        mov             r11, #16
+1:
+        sub             r3,  r3,  r11
+
+        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             2f
+        // LR_HAVE_LEFT
+        cmp             r1,  #0
+        bne             0f
+        // left == NULL
+        sub             r2,  r2,  #3
+        sub             lr,  lr,  #3
+        b               1f
+0:      // LR_HAVE_LEFT, left != NULL
+2:      // !LR_HAVE_LEFT, increase the stride.
+        // For this case we don't read the left 3 pixels from the src pointer,
+        // but shift it as if we had done that.
+        add             r3,  r3,  #3
+
+
+1:      // Loop vertically
+        vld1.8          {q2},  [r2]!
+        vld1.8          {q9},  [lr]!
+
+        tst             r7,  #1 // LR_HAVE_LEFT
+        beq             0f
+        cmp             r1,  #0
+        beq             2f
+        // LR_HAVE_LEFT, left != NULL
+        vld1.32         {d3[1]},  [r1]!
+        // Move r2/lr back to account for the last 3 bytes we loaded earlier,
+        // which we'll shift out.
+        sub             r2,  r2,  #3
+        sub             lr,  lr,  #3
+        vld1.32         {d17[1]},  [r1]!
+        vext.8          q2,  q1,  q2,  #13
+        vext.8          q9,  q8,  q9,  #13
+        b               2f
+0:
+        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
+        // and shift q2 to have 3x the first byte at the front.
+        vdup.8          q1, d4[0]
+        vdup.8          q8, d18[0]
+        // Move r2 back to account for the last 3 bytes we loaded before,
+        // which we shifted out.
+        sub             r2,  r2,  #3
+        sub             lr,  lr,  #3
+        vext.8          q2,  q1,  q2,  #13
+        vext.8          q9,  q8,  q9,  #13
+
+2:
+        vmovl.u8        q1,  d4
+        vmovl.u8        q2,  d5
+        vmovl.u8        q8,  d18
+        vmovl.u8        q9,  d19
+
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        bne             4f
+        // If we'll need to pad the right edge, load that byte to pad with
+        // here since we can find it pretty easily from here.
+        sub             r9,  r5, #14
+        ldrb            r11, [r2, r9]
+        ldrb            r9,  [lr, r9]
+        // Fill q12/q13 with the right padding pixel
+        vdup.8          d24, r11
+        vdup.8          d26, r9
+        vmovl.u8        q12, d24
+        vmovl.u8        q13, d26
+3:      // !LR_HAVE_RIGHT
+        // If we'll have to pad the right edge we need to quit early here.
+        cmp             r5,  #11
+        bge             4f   // If w >= 11, all used input pixels are valid
+        cmp             r5,  #7
+        bge             5f   // If w >= 7, we can filter 4 pixels
+        b               6f
+
+4:      // Loop horizontally
+.macro filter_8
+        // This is tuned as some sort of compromise between Cortex A7, A8,
+        // A9 and A53.
+        vmul.s16        q3,  q1,  d0[0]
+        vext.8          q10, q1,  q2,  #2
+        vext.8          q11, q1,  q2,  #4
+        vmla.s16        q3,  q10, d0[1]
+        vmla.s16        q3,  q11, d0[2]
+        vext.8          q10, q1,  q2,  #6
+        vext.8          q11, q1,  q2,  #8
+        vmla.s16        q3,  q10, d0[3]
+        vmla.s16        q3,  q11, d1[0]
+        vext.8          q10, q1,  q2,  #10
+        vext.8          q11, q1,  q2,  #12
+        vmla.s16        q3,  q10, d1[1]
+        vmla.s16        q3,  q11, d1[2]
+
+        vmul.s16        q10, q8,  d0[0]
+        vext.8          q11, q8,  q9,  #2
+        vext.8          q4,  q8,  q9,  #4
+        vmla.s16        q10, q11, d0[1]
+        vmla.s16        q10, q4,  d0[2]
+        vext.8          q11, q8,  q9,  #6
+        vext.8          q4,  q8,  q9,  #8
+        vmla.s16        q10, q11, d0[3]
+        vmla.s16        q10, q4,  d1[0]
+        vext.8          q11, q8,  q9,  #10
+        vext.8          q4,  q8,  q9,  #12
+        vmla.s16        q10, q11, d1[1]
+        vmla.s16        q10, q4,  d1[2]
+
+        vext.8          q1,  q1,  q2,  #6
+        vext.8          q8,  q8,  q9,  #6
+        vshl.s16        q1,  q1,  #7
+        vshl.s16        q8,  q8,  #7
+        vsub.s16        q1,  q1,  q14
+        vsub.s16        q8,  q8,  q14
+        vqadd.s16       q3,  q3,  q1
+        vqadd.s16       q10, q10, q8
+        vshr.s16        q3,  q3,  #3
+        vshr.s16        q10, q10, #3
+        vadd.s16        q3,  q3,  q15
+        vadd.s16        q10, q10, q15
+.endm
+        filter_8
+        vst1.16         {q3},  [r0,  :128]!
+        vst1.16         {q10}, [r12, :128]!
+
+        subs            r5,  r5,  #8
+        ble             9f
+        tst             r7,  #2 // LR_HAVE_RIGHT
+        vmov            q1,  q2
+        vmov            q8,  q9
+        vld1.8          {d4},  [r2]!
+        vld1.8          {d18}, [lr]!
+        vmovl.u8        q2,  d4
+        vmovl.u8        q9,  d18
+        bne             4b // If we don't need to pad, just keep filtering.
+        b               3b // If we need to pad, check how many pixels we have left.
+
+5:      // Filter 4 pixels, 7 <= w < 11
+.macro filter_4
+        vmul.s16        d6,  d2,  d0[0]
+        vext.8          q10, q1,  q2,  #2
+        vext.8          q11, q1,  q2,  #4
+        vmla.s16        d6,  d20, d0[1]
+        vmla.s16        d6,  d22, d0[2]
+        vext.8          q10, q1,  q2,  #6
+        vext.8          q11, q1,  q2,  #8
+        vmla.s16        d6,  d20, d0[3]
+        vmla.s16        d6,  d22, d1[0]
+        vext.8          q10, q1,  q2,  #10
+        vext.8          q11, q1,  q2,  #12
+        vmla.s16        d6,  d20, d1[1]
+        vmla.s16        d6,  d22, d1[2]
+
+        vmul.s16        d20, d16, d0[0]
+        vext.8          q11, q8,  q9,  #2
+        vext.8          q4,  q8,  q9,  #4
+        vmla.s16        d20, d22, d0[1]
+        vmla.s16        d20, d8,  d0[2]
+        vext.8          q11, q8,  q9,  #6
+        vext.8          q4,  q8,  q9,  #8
+        vmla.s16        d20, d22, d0[3]
+        vmla.s16        d20, d8,  d1[0]
+        vext.8          q11, q8,  q9,  #10
+        vext.8          q4,  q8,  q9,  #12
+        vmla.s16        d20, d22, d1[1]
+        vmla.s16        d20, d8,  d1[2]
+
+        vext.8          q11, q1,  q2,  #6
+        vshl.s16        d22, d22, #7
+        vsub.s16        d22, d22, d28
+        vqadd.s16       d6,  d6,  d22
+        vext.8          q11, q8,  q9,  #6
+        vshl.s16        d22, d22, #7
+        vsub.s16        d22, d22, d28
+        vqadd.s16       d20, d20, d22
+        vshr.s16        d6,  d6,  #3
+        vshr.s16        d20, d20, #3
+        vadd.s16        d6,  d6,  d30
+        vadd.s16        d20, d20, d30
+.endm
+        filter_4
+        vst1.16         {d6},  [r0,  :64]!
+        vst1.16         {d20}, [r12, :64]!
+
+        subs            r5,  r5,  #4 // 3 <= w < 7
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q2,  q2,  q2,  #8
+        vext.8          q8,  q8,  q9,  #8
+        vext.8          q9,  q9,  q9,  #8
+
+6:      // Pad the right edge and filter the last few pixels.
+        // w < 7, w+3 pixels valid in q1-q2
+        cmp             r5,  #5
+        blt             7f
+        bgt             8f
+        // w == 5, 8 pixels valid in q1, q2 invalid
+        vmov            q2,  q12
+        vmov            q9,  q13
+        b               88f
+
+7:      // 1 <= w < 5, 4-7 pixels valid in q1
+        sub             r9,  r5,  #1
+        // r9 = (pixels valid - 4)
+        adr             r11, L(variable_shift_tbl)
+        ldr             r9,  [r11, r9, lsl #2]
+        add             r11, r11, r9
+        vmov            q2,  q12
+        vmov            q9,  q13
+        bx              r11
+
+        .align 2
+L(variable_shift_tbl):
+        .word 44f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 55f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 66f - L(variable_shift_tbl) + CONFIG_THUMB
+        .word 77f - L(variable_shift_tbl) + CONFIG_THUMB
+
+44:     // 4 pixels valid in d2/d16, fill d3/d17 with padding.
+        vmov            d3,  d4
+        vmov            d17, d18
+        b               88f
+        // Shift q1 right, shifting out invalid pixels,
+        // shift q1 left to the original offset, shifting in padding pixels.
+55:     // 5 pixels valid
+        vext.8          q1,  q1,  q1,  #10
+        vext.8          q1,  q1,  q2,  #6
+        vext.8          q8,  q8,  q8,  #10
+        vext.8          q8,  q8,  q9,  #6
+        b               88f
+66:     // 6 pixels valid
+        vext.8          q1,  q1,  q1,  #12
+        vext.8          q1,  q1,  q2,  #4
+        vext.8          q8,  q8,  q8,  #12
+        vext.8          q8,  q8,  q9,  #4
+        b               88f
+77:     // 7 pixels valid
+        vext.8          q1,  q1,  q1,  #14
+        vext.8          q1,  q1,  q2,  #2
+        vext.8          q8,  q8,  q8,  #14
+        vext.8          q8,  q8,  q9,  #2
+        b               88f
+
+8:      // w > 5, w == 6, 9 pixels valid in q1-q2, 1 pixel valid in q2
+        vext.8          q2,  q2,  q2,  #2
+        vext.8          q2,  q2,  q12, #14
+        vext.8          q9,  q9,  q9,  #2
+        vext.8          q9,  q9,  q13, #14
+
+88:
+        // w < 7, q1-q2 padded properly
+        cmp             r5,  #4
+        blt             888f
+
+        // w >= 4, filter 4 pixels
+        filter_4
+        vst1.16         {d6},  [r0,  :64]!
+        vst1.16         {d20}, [r12, :64]!
+        subs            r5,  r5,  #4 // 0 <= w < 4
+        vext.8          q1,  q1,  q2,  #8
+        vext.8          q8,  q8,  q9,  #8
+        beq             9f
+888:    // 1 <= w < 4, filter 1 pixel at a time
+        vmul.s16        q3,  q1,  q0
+        vmul.s16        q10, q8,  q0
+        vpadd.s16       d6,  d6,  d7
+        vpadd.s16       d7,  d20, d21
+        vdup.16         d24, d2[3]
+        vpadd.s16       d6,  d6,  d7
+        vdup.16         d25, d16[3]
+        vpadd.s16       d6,  d6,  d6
+        vtrn.16         d24, d25
+        vshl.s16        d24, d24,  #7
+        vsub.s16        d24, d24,  d28
+        vqadd.s16       d6,  d6,   d24
+        vshr.s16        d6,  d6,   #3
+        vadd.s16        d6,  d6,   d30
+        vst1.s16        {d6[0]}, [r0,  :16]!
+        vst1.s16        {d6[1]}, [r12, :16]!
+        subs            r5,  r5,  #1
+        vext.8          q1,  q1,  q2,  #2
+        vext.8          q8,  q8,  q9,  #2
+        bgt             888b
+
+9:
+        subs            r6,  r6,  #2
+        ble             0f
+        // Jump to the next row and loop horizontally
+        add             r0,  r0,  r10
+        add             r12, r12, r10
+        add             r2,  r2,  r3
+        add             lr,  lr,  r3
+        mov             r5,  r8
+        b               1b
+0:
+        vpop            {q4}
+        pop             {r4-r11,pc}
+.purgem filter_8
+.purgem filter_4
+endfunc
+
+// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
+//                                 const int16_t *mid, int w, int h,
+//                                 const int16_t fv[7], enum LrEdgeFlags edges,
+//                                 ptrdiff_t mid_stride);
+function wiener_filter_v_neon, export=1
+        push            {r4-r7,lr}
+        ldrd            r4,  r5,  [sp, #20]
+        ldrd            r6,  r7,  [sp, #28]
+        mov             lr,  r4
+        vmov.s16        q1,  #0
+        mov             r12, #128
+        vld1.16         {q0},  [r5]
+        vmov.s16        d2[3], r12
+        vadd.s16        q0,  q0,  q1
+
+        // Calculate the number of rows to move back when looping vertically
+        mov             r12, r4
+        tst             r6,  #4 // LR_HAVE_TOP
+        beq             0f
+        sub             r2,  r2,  r7,  lsl #1
+        add             r12, r12, #2
+0:
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             1f
+        add             r12, r12, #2
+
+1:      // Start of horizontal loop; start one vertical filter slice.
+        // Load rows into q8-q11 and pad properly.
+        tst             r6,  #4 // LR_HAVE_TOP
+        vld1.16         {q8},  [r2, :128], r7
+        beq             2f
+        // LR_HAVE_TOP
+        vld1.16         {q10}, [r2, :128], r7
+        vmov            q9,  q8
+        vld1.16         {q11}, [r2, :128], r7
+        b               3f
+2:      // !LR_HAVE_TOP
+        vmov            q9,  q8
+        vmov            q10, q8
+        vmov            q11, q8
+
+3:
+        cmp             r4,  #4
+        blt             5f
+        // Start filtering normally; fill in q12-q14 with unique rows.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vld1.16         {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+        subs            r4,  r4,  #1
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, thus keeping mul/mla tightly
+        // chained like this.
+        vmull.s16       q2,  d16,  d0[0]
+        vmlal.s16       q2,  d18,  d0[1]
+        vmlal.s16       q2,  d20,  d0[2]
+        vmlal.s16       q2,  d22,  d0[3]
+        vmlal.s16       q2,  d24,  d1[0]
+        vmlal.s16       q2,  d26,  d1[1]
+        vmlal.s16       q2,  d28,  d1[2]
+        vmull.s16       q3,  d17,  d0[0]
+        vmlal.s16       q3,  d19,  d0[1]
+        vmlal.s16       q3,  d21,  d0[2]
+        vmlal.s16       q3,  d23,  d0[3]
+        vmlal.s16       q3,  d25,  d1[0]
+        vmlal.s16       q3,  d27,  d1[1]
+        vmlal.s16       q3,  d29,  d1[2]
+        vqrshrun.s32    d4,  q2,   #11
+        vqrshrun.s32    d5,  q3,   #11
+        vqmovun.s16     d4,  q2
+        vst1.8          {d4}, [r0], r1
+.if \compare
+        cmp             r4,  #4
+.else
+        ble             9f
+.endif
+        vmov            q8,  q9
+        vmov            q9,  q10
+        vmov            q10, q11
+        vmov            q11, q12
+        vmov            q12, q13
+        vmov            q13, q14
+.endm
+        filter          1
+        blt             7f
+        vld1.16         {q14}, [r2, :128], r7
+        b               4b
+
+5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             6f
+        // LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        // We load at least 2 rows in all cases.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        bgt             53f // 3 rows in total
+        beq             52f // 2 rows in total
+51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
+        vmov            q13, q12
+        b               8f
+52:     // 2 rows in total, q11 already loaded, load q12 with content data
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vmov            q15,  q14
+        b               8f
+53:
+        // 3 rows in total, q11 already loaded, load q12 and q13 with content
+        // and 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+
+6:
+        // !LR_HAVE_BOTTOM
+        cmp             r4,  #2
+        bgt             63f // 3 rows in total
+        beq             62f // 2 rows in total
+61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
+        vmov            q12, q11
+        vmov            q13, q11
+        vmov            q14, q11
+        b               8f
+62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+        vld1.16         {q12}, [r2, :128], r7
+        vmov            q13, q12
+        vmov            q14, q12
+        vmov            q15, q12
+        b               8f
+63:
+        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+        vld1.16         {q12}, [r2, :128], r7
+        vld1.16         {q13}, [r2, :128], r7
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+        b               8f
+
+7:
+        // All registers up to q13 are filled already, 3 valid rows left.
+        // < 4 valid rows left; fill in padding and filter the last
+        // few rows.
+        tst             r6,  #8 // LR_HAVE_BOTTOM
+        beq             71f
+        // LR_HAVE_BOTTOM; load 2 rows of edge.
+        vld1.16         {q14}, [r2, :128], r7
+        vld1.16         {q15}, [r2, :128], r7
+        vmov            q1,  q15
+        b               8f
+71:
+        // !LR_HAVE_BOTTOM, pad 3 rows
+        vmov            q14, q13
+        vmov            q15, q13
+        vmov            q1,  q13
+
+8:      // At this point, all registers up to q14-15,q1 are loaded with
+        // edge/padding (depending on how many rows are left).
+        filter          0 // This branches to 9f when done
+        vmov            q14, q15
+        vmov            q15, q1
+        b               8b
+
+9:      // End of one vertical slice.
+        subs            r3,  r3,  #8
+        ble             0f
+        // Move pointers back up to the top and loop horizontally.
+        mls             r0,  r1,  lr,  r0
+        mls             r2,  r7,  r12, r2
+        add             r0,  r0,  #8
+        add             r2,  r2,  #16
+        mov             r4,  lr
+        b               1b
+
+0:
+        pop             {r4-r7,pc}
+.purgem filter
+endfunc
+
+// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
+//                             const pixel *src, int w, int h);
+function copy_narrow_neon, export=1
+        push            {r4,lr}
+        ldr             r4, [sp, #8]
+        adr             r12, L(copy_narrow_tbl)
+        ldr             r3,  [r12, r3, lsl #2]
+        add             r12, r12, r3
+        bx              r12
+
+        .align 2
+L(copy_narrow_tbl):
+        .word 0
+        .word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
+        .word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
+
+10:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+18:
+        subs            r4,  r4,  #8
+        blt             110f
+        vld1.8          {d0}, [r2, :64]!
+        vst1.8          {d0[0]}, [r0], r1
+        vst1.8          {d0[1]}, [r3], r1
+        vst1.8          {d0[2]}, [r0], r1
+        vst1.8          {d0[3]}, [r3], r1
+        vst1.8          {d0[4]}, [r0], r1
+        vst1.8          {d0[5]}, [r3], r1
+        vst1.8          {d0[6]}, [r0], r1
+        vst1.8          {d0[7]}, [r3], r1
+        ble             0f
+        b               18b
+110:
+        add             r4,  r4,  #8
+        asr             r1,  r1,  #1
+11:
+        subs            r4,  r4,  #1
+        vld1.8          {d0[]},  [r2]!
+        vst1.8          {d0[0]}, [r0], r1
+        bgt             11b
+0:
+        pop             {r4,pc}
+
+20:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+24:
+        subs            r4,  r4,  #4
+        blt             210f
+        vld1.16         {d0}, [r2, :64]!
+        vst1.16         {d0[0]}, [r0, :16], r1
+        vst1.16         {d0[1]}, [r3, :16], r1
+        vst1.16         {d0[2]}, [r0, :16], r1
+        vst1.16         {d0[3]}, [r3, :16], r1
+        ble             0f
+        b               24b
+210:
+        add             r4,  r4,  #4
+        asr             r1,  r1,  #1
+22:
+        subs            r4,  r4,  #1
+        vld1.16         {d0[]},  [r2]!
+        vst1.16         {d0[0]}, [r0], r1
+        bgt             22b
+0:
+        pop             {r4,pc}
+
+30:
+        ldrh            r3,  [r2]
+        ldrb            r12, [r2, #2]
+        add             r2,  r2,  #3
+        subs            r4,  r4,  #1
+        strh            r3,  [r0]
+        strb            r12, [r0, #2]
+        add             r0,  r0,  r1
+        bgt             30b
+        pop             {r4,pc}
+
+40:
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+42:
+        subs            r4,  r4,  #2
+        blt             41f
+        vld1.8          {d0}, [r2, :64]!
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vst1.32         {d0[1]}, [r3, :32], r1
+        ble             0f
+        b               42b
+41:
+        vld1.32         {d0[]},  [r2]
+        vst1.32         {d0[0]}, [r0]
+0:
+        pop             {r4,pc}
+
+50:
+        ldr             r3,  [r2]
+        ldrb            r12, [r2, #4]
+        add             r2,  r2,  #5
+        subs            r4,  r4,  #1
+        str             r3,  [r0]
+        strb            r12, [r0, #4]
+        add             r0,  r0,  r1
+        bgt             50b
+        pop             {r4,pc}
+
+60:
+        ldr             r3,  [r2]
+        ldrh            r12, [r2, #4]
+        add             r2,  r2,  #6
+        subs            r4,  r4,  #1
+        str             r3,  [r0]
+        strh            r12, [r0, #4]
+        add             r0,  r0,  r1
+        bgt             60b
+        pop             {r4,pc}
+
+70:
+        ldr             r3,  [r2]
+        ldrh            r12, [r2, #4]
+        ldrb            lr,  [r2, #6]
+        add             r2,  r2,  #7
+        subs            r4,  r4,  #1
+        str             r3,  [r0]
+        strh            r12, [r0, #4]
+        strb            lr,  [r0, #6]
+        add             r0,  r0,  r1
+        bgt             70b
+        pop             {r4,pc}
+endfunc
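To make the NEON code above easier to follow: copy_narrow writes a tightly packed w×h block (w < 8, source row pitch of w bytes) back out to a stride-spaced destination. A scalar sketch of the same operation for 8-bit pixels, with a hypothetical name and not part of the patch, is:

#include <stddef.h>
#include <stdint.h>

/* Scalar reference for the narrow-copy above (8 bpc): src rows are packed
 * back to back with a pitch of w bytes, dst rows use the frame stride. */
static void copy_narrow_c(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *src, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = src[x];
        dst += stride;
        src += w;
    }
}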
--- a/third_party/dav1d/src/arm/32/mc.S
+++ b/third_party/dav1d/src/arm/32/mc.S
@@ -22,16 +22,17 @@
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
+#include "src/arm/32/util.S"
 
 .macro avg dst0, dst1, t0, t1, t2, t3
         vld1.16         {\t0,\t1},   [r2, :128]!
         vld1.16         {\t2,\t3},   [r3, :128]!
         vadd.i16        \t0,   \t0,  \t2
         vadd.i16        \t1,   \t1,  \t3
         vqrshrun.s16    \dst0, \t0,  #5
         vqrshrun.s16    \dst1, \t1,  #5
@@ -207,8 +208,2124 @@ 128:
 0:
         pop             {r4-r6,pc}
 endfunc
 .endm
 
 bidir_fn avg
 bidir_fn w_avg
 bidir_fn mask
+
+
+// This has the same signature as the put_8tap functions,
+// assumes that the caller has loaded the h argument into r5,
+// and assumes that r8 is set to (clz(w)-24).
+function put
+        adr             r9,  L(put_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(put_tbl):
+        .word 1280f - L(put_tbl) + CONFIG_THUMB
+        .word 640f  - L(put_tbl) + CONFIG_THUMB
+        .word 32f   - L(put_tbl) + CONFIG_THUMB
+        .word 160f  - L(put_tbl) + CONFIG_THUMB
+        .word 8f    - L(put_tbl) + CONFIG_THUMB
+        .word 4f    - L(put_tbl) + CONFIG_THUMB
+        .word 2f    - L(put_tbl) + CONFIG_THUMB
+
+2:
+        vld1.16         {d0[]}, [r2], r3
+        vld1.16         {d1[]}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.16         {d0[0]}, [r0, :16], r1
+        vst1.16         {d1[0]}, [r0, :16], r1
+        bgt             2b
+        pop             {r4-r11,pc}
+4:
+        vld1.32         {d0[]}, [r2], r3
+        vld1.32         {d1[]}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vst1.32         {d1[0]}, [r0, :32], r1
+        bgt             4b
+        pop             {r4-r11,pc}
+8:
+        vld1.8          {d0}, [r2], r3
+        vld1.8          {d1}, [r2], r3
+        subs            r5,  r5,  #2
+        vst1.8          {d0}, [r0, :64], r1
+        vst1.8          {d1}, [r0, :64], r1
+        bgt             8b
+        pop             {r4-r11,pc}
+160:
+        add             r8,  r0,  r1
+        lsl             r1,  r1,  #1
+        add             r9,  r2,  r3
+        lsl             r3,  r3,  #1
+16:
+        vld1.8          {q0}, [r2], r3
+        vld1.8          {q1}, [r9], r3
+        subs            r5,  r5,  #2
+        vst1.8          {q0}, [r0, :128], r1
+        vst1.8          {q1}, [r8, :128], r1
+        bgt             16b
+        pop             {r4-r11,pc}
+32:
+        vld1.8          {q0,  q1},  [r2], r3
+        subs            r5,  r5,  #1
+        vst1.8          {q0,  q1},  [r0, :128], r1
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r1,  r1,  #32
+        sub             r3,  r3,  #32
+64:
+        vld1.8          {q0,  q1},  [r2]!
+        vst1.8          {q0,  q1},  [r0, :128]!
+        vld1.8          {q2,  q3},  [r2], r3
+        subs            r5,  r5,  #1
+        vst1.8          {q2,  q3},  [r0, :128], r1
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r1,  r1,  #96
+        sub             r3,  r3,  #96
+128:
+        vld1.8          {q8,  q9},  [r2]!
+        vst1.8          {q8,  q9},  [r0, :128]!
+        vld1.8          {q10, q11}, [r2]!
+        vst1.8          {q10, q11}, [r0, :128]!
+        vld1.8          {q12, q13}, [r2]!
+        vst1.8          {q12, q13}, [r0, :128]!
+        vld1.8          {q14, q15}, [r2], r3
+        subs            r5,  r5,  #1
+        vst1.8          {q14, q15}, [r0, :128], r1
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
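For orientation (illustrative, not part of the patch): the r8 value computed by the 8tap entry points, clz(w)-24, maps each legal width onto the L(put_tbl) slots in exactly the order listed above; a small C sketch of that mapping, assuming the GCC/Clang __builtin_clz on a 32-bit int, is:

#include <assert.h>

/* Illustrates the put jump-table index used above: power-of-two widths
 * 128..2 map to slots 0..6 via clz(w)-24 on a 32-bit value. */
static int put_tbl_index(unsigned w) {
    return __builtin_clz(w) - 24; /* GCC/Clang builtin, assumed available */
}

static void check_put_tbl_index(void) {
    assert(put_tbl_index(128) == 0); /* 1280f */
    assert(put_tbl_index(64)  == 1); /* 640f  */
    assert(put_tbl_index(32)  == 2); /* 32f   */
    assert(put_tbl_index(16)  == 3); /* 160f  */
    assert(put_tbl_index(8)   == 4); /* 8f    */
    assert(put_tbl_index(4)   == 5); /* 4f    */
    assert(put_tbl_index(2)   == 6); /* 2f    */
}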
+
+
+// This has the same signature as the put_8tap functions,
+// assumes that the caller has loaded the h argument into r5,
+// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
+function prep
+        adr             r9,  L(prep_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(prep_tbl):
+        .word 1280f - L(prep_tbl) + CONFIG_THUMB
+        .word 640f  - L(prep_tbl) + CONFIG_THUMB
+        .word 320f  - L(prep_tbl) + CONFIG_THUMB
+        .word 160f  - L(prep_tbl) + CONFIG_THUMB
+        .word 8f    - L(prep_tbl) + CONFIG_THUMB
+        .word 4f    - L(prep_tbl) + CONFIG_THUMB
+
+4:
+        vld1.32         {d0[]}, [r1], r2
+        vld1.32         {d2[]}, [r1], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q0,  d0,  #4
+        vshll.u8        q1,  d2,  #4
+        vst1.16         {d1, d2}, [r0, :64]!
+        bgt             4b
+        pop             {r4-r11,pc}
+8:
+        vld1.8          {d0}, [r1], r2
+        vld1.8          {d2}, [r1], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q0,  d0,  #4
+        vshll.u8        q1,  d2,  #4
+        vst1.16         {q0, q1}, [r0, :128]!
+        bgt             8b
+        pop             {r4-r11,pc}
+160:
+        add             r9,  r1,  r2
+        lsl             r2,  r2,  #1
+        add             r8,  r0,  r7
+        lsl             r7,  r7,  #1
+16:
+        vld1.8          {q2}, [r1], r2
+        vld1.8          {q3}, [r9], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q0,  d4,  #4
+        vshll.u8        q1,  d5,  #4
+        vshll.u8        q2,  d6,  #4
+        vshll.u8        q3,  d7,  #4
+        vst1.16         {q0, q1}, [r0, :128], r7
+        vst1.16         {q2, q3}, [r8, :128], r7
+        bgt             16b
+        pop             {r4-r11,pc}
+320:
+        add             r8,  r0,  r3
+32:
+        vld1.8          {q0,  q1},  [r1], r2
+        subs            r4,  r4,  #2
+        vshll.u8        q8,  d0,  #4
+        vshll.u8        q9,  d1,  #4
+        vld1.8          {q2,  q3},  [r1], r2
+        vshll.u8        q10, d2,  #4
+        vshll.u8        q11, d3,  #4
+        vshll.u8        q12, d4,  #4
+        vst1.16         {q8,  q9},  [r0, :128], r7
+        vshll.u8        q13, d5,  #4
+        vst1.16         {q10, q11}, [r8, :128], r7
+        vshll.u8        q14, d6,  #4
+        vst1.16         {q12, q13}, [r0, :128], r7
+        vshll.u8        q15, d7,  #4
+        vst1.16         {q14, q15}, [r8, :128], r7
+        bgt             32b
+        pop             {r4-r11,pc}
+640:
+        sub             r2,  r2,  #32
+        add             r8,  r0,  #32
+        mov             r6,  #64
+64:
+        vld1.8          {q0,  q1},  [r1]!
+        subs            r4,  r4,  #1
+        vshll.u8        q8,  d0,  #4
+        vshll.u8        q9,  d1,  #4
+        vld1.8          {q2,  q3},  [r1], r2
+        vshll.u8        q10, d2,  #4
+        vshll.u8        q11, d3,  #4
+        vshll.u8        q12, d4,  #4
+        vst1.16         {q8,  q9},  [r0, :128], r6
+        vshll.u8        q13, d5,  #4
+        vshll.u8        q14, d6,  #4
+        vst1.16         {q10, q11}, [r8, :128], r6
+        vshll.u8        q15, d7,  #4
+        vst1.16         {q12, q13}, [r0, :128], r6
+        vst1.16         {q14, q15}, [r8, :128], r6
+        bgt             64b
+        pop             {r4-r11,pc}
+1280:
+        sub             r2,  r2,  #96
+        add             r8,  r0,  #32
+        mov             r6,  #64
+128:
+        vld1.8          {q0,  q1},  [r1]!
+        vld1.8          {q2,  q3},  [r1]!
+        vshll.u8        q10, d0,  #4
+        vshll.u8        q11, d1,  #4
+        vshll.u8        q12, d2,  #4
+        vshll.u8        q13, d3,  #4
+        vshll.u8        q14, d4,  #4
+        vshll.u8        q15, d5,  #4
+        vld1.8          {q8,  q9},  [r1]!
+        vst1.16         {q10, q11}, [r0, :128], r6
+        vst1.16         {q12, q13}, [r8, :128], r6
+        vshll.u8        q0,  d6,  #4
+        vshll.u8        q1,  d7,  #4
+        vshll.u8        q2,  d16, #4
+        vshll.u8        q3,  d17, #4
+        vshll.u8        q8,  d18, #4
+        vshll.u8        q9,  d19, #4
+        vld1.8          {q10, q11}, [r1], r2
+        vst1.16         {q14, q15}, [r0, :128], r6
+        vst1.16         {q0,  q1},  [r8, :128], r6
+        vshll.u8        q12, d20, #4
+        vshll.u8        q13, d21, #4
+        vshll.u8        q14, d22, #4
+        vshll.u8        q15, d23, #4
+        subs            r4,  r4,  #1
+        vst1.16         {q2,  q3},  [r0, :128], r6
+        vst1.16         {q8,  q9},  [r8, :128], r6
+        vst1.16         {q12, q13}, [r0, :128], r6
+        vst1.16         {q14, q15}, [r8, :128], r6
+        bgt             128b
+        pop             {r4-r11,pc}
+endfunc
+
+
+.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
+        vld1.\wd        {\d0[]}, [\s0], \strd
+        vld1.\wd        {\d1[]}, [\s1], \strd
+.ifnb \d2
+        vld1.\wd        {\d2[]}, [\s0], \strd
+        vld1.\wd        {\d3[]}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.\wd        {\d4[]}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.\wd        {\d5[]}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.\wd        {\d6[]}, [\s0], \strd
+.endif
+.endm
+.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        vld1.8          {\d0}, [\s0], \strd
+        vld1.8          {\d1}, [\s1], \strd
+.ifnb \d2
+        vld1.8          {\d2}, [\s0], \strd
+        vld1.8          {\d3}, [\s1], \strd
+.endif
+.ifnb \d4
+        vld1.8          {\d4}, [\s0], \strd
+.endif
+.ifnb \d5
+        vld1.8          {\d5}, [\s1], \strd
+.endif
+.ifnb \d6
+        vld1.8          {\d6}, [\s0], \strd
+.endif
+.endm
+.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
+        load_slice      \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
+.endm
+.macro interleave_1_16 r0, r1, r2, r3, r4
+        vext.8          \r0, \r0, \r1, #6
+        vext.8          \r1, \r1, \r2, #6
+.ifnb \r3
+        vext.8          \r2, \r2, \r3, #6
+        vext.8          \r3, \r3, \r4, #6
+.endif
+.endm
+.macro interleave_1_32 r0, r1, r2, r3, r4
+        vext.8          \r0, \r0, \r1, #4
+        vext.8          \r1, \r1, \r2, #4
+.ifnb \r3
+        vext.8          \r2, \r2, \r3, #4
+        vext.8          \r3, \r3, \r4, #4
+.endif
+.endm
+.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6
+        vmovl.u8        \q0, \d0
+        vmovl.u8        \q1, \d1
+.ifnb \q2
+        vmovl.u8        \q2, \d2
+        vmovl.u8        \q3, \d3
+.endif
+.ifnb \q4
+        vmovl.u8        \q4, \d4
+.endif
+.ifnb \q5
+        vmovl.u8        \q5, \d5
+.endif
+.ifnb \q6
+        vmovl.u8        \q6, \d6
+.endif
+.endm
+.macro mul_mla_4 d, s0, s1, s2, s3
+        vmul.s16        \d,  \s0,  d0[0]
+        vmla.s16        \d,  \s1,  d0[1]
+        vmla.s16        \d,  \s2,  d0[2]
+        vmla.s16        \d,  \s3,  d0[3]
+.endm
+.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+        vmul.s16        \d0, \s0, d0[0]
+        vmla.s16        \d0, \s1, d0[1]
+        vmla.s16        \d0, \s2, d0[2]
+        vmla.s16        \d0, \s3, d0[3]
+        vmla.s16        \d0, \s4, d1[0]
+        vmla.s16        \d0, \s5, d1[1]
+        vmla.s16        \d0, \s6, d1[2]
+        vmla.s16        \d0, \s7, d1[3]
+        vmul.s16        \d1, \s1, d0[0]
+        vmla.s16        \d1, \s2, d0[1]
+        vmla.s16        \d1, \s3, d0[2]
+        vmla.s16        \d1, \s4, d0[3]
+        vmla.s16        \d1, \s5, d1[0]
+        vmla.s16        \d1, \s6, d1[1]
+        vmla.s16        \d1, \s7, d1[2]
+        vmla.s16        \d1, \s8, d1[3]
+.endm
+.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+        vmul.s16        \d0, \s0, d0[0]
+        vmla.s16        \d0, \s1, d0[1]
+        vmla.s16        \d0, \s2, d0[2]
+        vmla.s16        \d0, \s3, d0[3]
+        vmla.s16        \d0, \s4, d1[0]
+        vmla.s16        \d0, \s5, d1[1]
+        vmla.s16        \d0, \s6, d1[2]
+        vmla.s16        \d0, \s7, d1[3]
+        vmul.s16        \d1, \s2, d0[0]
+        vmla.s16        \d1, \s3, d0[1]
+        vmla.s16        \d1, \s4, d0[2]
+        vmla.s16        \d1, \s5, d0[3]
+        vmla.s16        \d1, \s6, d1[0]
+        vmla.s16        \d1, \s7, d1[1]
+        vmla.s16        \d1, \s8, d1[2]
+        vmla.s16        \d1, \s9, d1[3]
+.endm
+.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
+        vmul.s16        \d0, \s0,  d0[0]
+        vmla.s16        \d0, \s1,  d0[1]
+        vmla.s16        \d0, \s2,  d0[2]
+        vmla.s16        \d0, \s3,  d0[3]
+        vmla.s16        \d0, \s4,  d1[0]
+        vmla.s16        \d0, \s5,  d1[1]
+        vmla.s16        \d0, \s6,  d1[2]
+        vmla.s16        \d0, \s7,  d1[3]
+        vmul.s16        \d1, \s4,  d0[0]
+        vmla.s16        \d1, \s5,  d0[1]
+        vmla.s16        \d1, \s6,  d0[2]
+        vmla.s16        \d1, \s7,  d0[3]
+        vmla.s16        \d1, \s8,  d1[0]
+        vmla.s16        \d1, \s9,  d1[1]
+        vmla.s16        \d1, \s10, d1[2]
+        vmla.s16        \d1, \s11, d1[3]
+.endm
+.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
+        vqrshrun.s16    \d0, \q0, #\shift
+.ifnb \q1
+        vqrshrun.s16    \d1, \q1, #\shift
+.endif
+.ifnb \q2
+        vqrshrun.s16    \d2, \q2, #\shift
+        vqrshrun.s16    \d3, \q3, #\shift
+.endif
+.endm
+.macro vrshr_s16 shift, r0, r1, r2, r3
+        vrshr.s16       \r0, \r0, #\shift
+.ifnb \r1
+        vrshr.s16       \r1, \r1, #\shift
+.endif
+.ifnb \r2
+        vrshr.s16       \r2, \r2, #\shift
+        vrshr.s16       \r3, \r3, #\shift
+.endif
+.endm
+.macro st_16 strd, reg, lanes
+        vst1.16         {\reg[0]}, [r0, :16], \strd
+        vst1.16         {\reg[1]}, [r8, :16], \strd
+.if \lanes > 2
+        vst1.16         {\reg[2]}, [r0, :16], \strd
+        vst1.16         {\reg[3]}, [r8, :16], \strd
+.endif
+.endm
+.macro st_32 strd, r0, r1
+        vst1.32         {\r0[0]}, [r0, :32], \strd
+        vst1.32         {\r0[1]}, [r8, :32], \strd
+.ifnb \r1
+        vst1.32         {\r1[0]}, [r0, :32], \strd
+        vst1.32         {\r1[1]}, [r8, :32], \strd
+.endif
+.endm
+.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
+        vst1.8          {\r0}, [r0, \align], \strd
+        vst1.8          {\r1}, [r8, \align], \strd
+.ifnb \r2
+        vst1.8          {\r2}, [r0, \align], \strd
+        vst1.8          {\r3}, [r8, \align], \strd
+.endif
+.ifnb \r4
+        vst1.8          {\r4}, [r0, \align], \strd
+        vst1.8          {\r5}, [r8, \align], \strd
+        vst1.8          {\r6}, [r0, \align], \strd
+        vst1.8          {\r7}, [r8, \align], \strd
+.endif
+.endm
+.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3
+.ifc \type, put
+        vqrshrun_s16    6,     \q0, \d0, \q1, \d2
+        st_32           \strd, \d0, \d2
+.else
+        vrshr_s16       2,          \q0, \q1
+        st_reg          \strd, :64, \d0, \d1, \d2, \d3
+.endif
+.endm
+.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3
+.ifc \type, put
+        vqrshrun_s16    6,          \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
+        st_reg          \strd, :64, \d0, \d1, \d2, \d3
+.else
+        vrshr_s16       2,          \q0, \q1, \q2, \q3
+        st_reg          \strd, :128,\q0, \q1, \q2, \q3
+.endif
+.endm
+.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3
+.ifc \type, put
+        vqrshrun.s16    \d0,   \q0, #6
+        vqrshrun.s16    \d1,   \q1, #6
+        vqrshrun.s16    \d4,   \q2, #6
+        vqrshrun.s16    \d5,   \q3, #6
+        st_reg          \strd, :128, \q0, \q2
+.else
+        vrshr_s16       2,     \q0, \q1, \q2, \q3
+        vst1.16         {\q0, \q1}, [r0, :128], \strd
+        vst1.16         {\q2, \q3}, [r8, :128], \strd
+.endif
+.endm
+
+.macro make_8tap_fn op, type, type_h, type_v
+function \op\()_8tap_\type\()_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        movw            r8,  \type_h
+        movw            r9,  \type_v
+        b               \op\()_8tap
+endfunc
+.endm
+
+// No spaces in these expressions, due to gas-preprocessor.
+#define REGULAR ((0*15<<7)|3*15)
+#define SMOOTH  ((1*15<<7)|4*15)
+#define SHARP   ((2*15<<7)|3*15)
+
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv
+make_8tap_fn \type, regular,        REGULAR, REGULAR
+make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
+make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
+make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
+make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
+make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
+make_8tap_fn \type, sharp,          SHARP,   SHARP
+make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
+make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
+
+function \type\()_8tap
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+        movw            r10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
+        mul             \mx,  \mx, r10
+        mul             \my,  \my, r10
+        add             \mx,  \mx, r8 // mx, 8tap_h, 4tap_h
+        add             \my,  \my, r9 // my, 8tap_v, 4tap_v
+.ifc \type, prep
+        lsl             \d_strd, \w, #1
+.endif
+
+        clz             r8,  \w
+        tst             \mx, #(0x7f << 14)
+        sub             r8,  r8,  #24
+        movrel          r10, X(mc_subpel_filters), -8
+        bne             L(\type\()_8tap_h)
+        tst             \my, #(0x7f << 14)
+        bne             L(\type\()_8tap_v)
+        b               \type
+
+L(\type\()_8tap_h):
+        cmp             \w,  #4
+        ubfx            r9,  \mx, #7, #7
+        and             \mx, \mx, #0x7f
+        it              gt
+        movgt           \mx,  r9
+        tst             \my,  #(0x7f << 14)
+        add             \mx, r10, \mx, lsl #3
+        bne             L(\type\()_8tap_hv)
+
+        adr             r9,  L(\type\()_8tap_h_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_8tap_h_tbl):
+        .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
+
+20:     // 2xN h
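+        // Only assembled for put (prep never uses 2-wide blocks). Two rows are
+        // filtered per iteration, interleaved with vtrn.32; the >>2
+        // intermediate shift plus the >>4 saturating narrow give the overall
+        // >>6 rounding.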
+.ifc \type, put
+        add             \mx,  \mx,  #2
+        vld1.32         {d0[]}, [\mx]
+        sub             \src,  \src,  #1
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+2:
+        vld1.8          {d4},  [\src], \s_strd
+        vld1.8          {d6},  [\sr2], \s_strd
+        vmovl.u8        q2,  d4
+        vmovl.u8        q3,  d6
+        vext.8          d5,  d4,  d5,  #2
+        vext.8          d7,  d6,  d7,  #2
+        subs            \h,  \h,  #2
+        vtrn.32         d4,  d6
+        vtrn.32         d5,  d7
+        vmul.s16        d2,  d4,  d0[0]
+        vmla.s16        d2,  d5,  d0[1]
+        vmla.s16        d2,  d6,  d0[2]
+        vmla.s16        d2,  d7,  d0[3]
+        vrshr.s16       d2,  d2,  #2
+        vqrshrun.s16    d2,  q1,  #4
+        vst1.16         {d2[0]}, [\dst, :16], \d_strd
+        vst1.16         {d2[1]}, [\ds2, :16], \d_strd
+        bgt             2b
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN h
+        add             \mx,  \mx,  #2
+        vld1.32         {d0[]}, [\mx]
+        sub             \src,  \src,  #1
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+4:
+        vld1.8          {d16}, [\src], \s_strd
+        vld1.8          {d24}, [\sr2], \s_strd
+        vmovl.u8        q8,  d16
+        vmovl.u8        q12, d24
+        vext.8          q9,  q8,  q8,  #2
+        vext.8          q10, q8,  q8,  #4
+        vext.8          q11, q8,  q8,  #6
+        vext.8          q13, q12, q12, #2
+        vext.8          q14, q12, q12, #4
+        vext.8          q15, q12, q12, #6
+        subs            \h,  \h,  #2
+        vmul.s16        d4,  d16, d0[0]
+        vmla.s16        d4,  d18, d0[1]
+        vmla.s16        d4,  d20, d0[2]
+        vmla.s16        d4,  d22, d0[3]
+        vmul.s16        d5,  d24, d0[0]
+        vmla.s16        d5,  d26, d0[1]
+        vmla.s16        d5,  d28, d0[2]
+        vmla.s16        d5,  d30, d0[3]
+        vrshr.s16       q2,  q2,  #2
+.ifc \type, put
+        vqrshrun.s16    d4,  q2,  #4
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d4[1]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d5}, [\ds2, :64], \d_strd
+.endif
+        bgt             4b
+        pop             {r4-r11,pc}
+
+80:     // 8xN h
+        vld1.8          {d0}, [\mx]
+        sub             \src,  \src,  #3
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+8:
+        vld1.8          {q8},  [\src], \s_strd
+        vld1.8          {q12}, [\sr2], \s_strd
+        vmovl.u8        q9,  d17
+        vmovl.u8        q8,  d16
+        vmovl.u8        q13, d25
+        vmovl.u8        q12, d24
+
+        vmul.s16        q10, q8,  d0[0]
+        vmul.s16        q14, q12, d0[0]
+.irpc i, 1234567
+        vext.8          q11, q8,  q9,  #(2*\i)
+        vext.8          q15, q12, q13, #(2*\i)
+.if \i < 4
+        vmla.s16        q10, q11, d0[\i]
+        vmla.s16        q14, q15, d0[\i]
+.else
+        vmla.s16        q10, q11, d1[\i-4]
+        vmla.s16        q14, q15, d1[\i-4]
+.endif
+.endr
+        subs            \h,  \h,  #2
+        vrshr.s16       q10, q10, #2
+        vrshr.s16       q14, q14, #2
+.ifc \type, put
+        vqrshrun.s16    d20, q10, #4
+        vqrshrun.s16    d28, q14, #4
+        vst1.8          {d20}, [\dst, :64], \d_strd
+        vst1.8          {d28}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q10}, [\dst, :128], \d_strd
+        vst1.16         {q14}, [\ds2, :128], \d_strd
+.endif
+        bgt             8b
+        pop             {r4-r11,pc}
+
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        // This could be done without touching q4-q6, by using only
+        // one temporary for vext in the loop. That's slower on A7 and A53
+        // (but, surprisingly, marginally faster on A8 and A73).
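+        // Each iteration filters 16 pixels on each of two rows; 24 input bytes
+        // stay live (widened to 16 bit in q8-q10/q12-q14) so vext can form all
+        // eight shifted windows. The strides were adjusted above so the final
+        // adds at 9: step both pointers to the next pair of rows.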
+        vpush           {q4-q6}
+        vld1.8          {d0}, [\mx]
+        sub             \src,  \src,  #3
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+
+        sub             \s_strd,  \s_strd,  \w
+        sub             \s_strd,  \s_strd,  #8
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w
+.endif
+161:
+        vld1.8          {d16, d17, d18},  [\src]!
+        vld1.8          {d24, d25, d26},  [\sr2]!
+        mov             \mx, \w
+        vmovl.u8        q10, d18
+        vmovl.u8        q9,  d17
+        vmovl.u8        q8,  d16
+        vmovl.u8        q14, d26
+        vmovl.u8        q13, d25
+        vmovl.u8        q12, d24
+
+16:
+        vmul.s16        q1,  q8,  d0[0]
+        vmul.s16        q2,  q9,  d0[0]
+        vmul.s16        q3,  q12, d0[0]
+        vmul.s16        q4,  q13, d0[0]
+.irpc i, 1234567
+        vext.8          q5,  q8,  q9,  #(2*\i)
+        vext.8          q6,  q9,  q10, #(2*\i)
+        vext.8          q11, q12, q13, #(2*\i)
+        vext.8          q15, q13, q14, #(2*\i)
+.if \i < 4
+        vmla.s16        q1,  q5,  d0[\i]
+        vmla.s16        q2,  q6,  d0[\i]
+        vmla.s16        q3,  q11, d0[\i]
+        vmla.s16        q4,  q15, d0[\i]
+.else
+        vmla.s16        q1,  q5,  d1[\i-4]
+        vmla.s16        q2,  q6,  d1[\i-4]
+        vmla.s16        q3,  q11, d1[\i-4]
+        vmla.s16        q4,  q15, d1[\i-4]
+.endif
+.endr
+        vrshr.s16       q1,  q1,  #2
+        vrshr.s16       q2,  q2,  #2
+        vrshr.s16       q3,  q3,  #2
+        vrshr.s16       q4,  q4,  #2
+        subs            \mx, \mx, #16
+.ifc \type, put
+        vqrshrun.s16    d2,  q1,  #4
+        vqrshrun.s16    d3,  q2,  #4
+        vqrshrun.s16    d4,  q3,  #4
+        vqrshrun.s16    d5,  q4,  #4
+        vst1.8          {q1}, [\dst, :128]!
+        vst1.8          {q2}, [\ds2, :128]!
+.else
+        vst1.16         {q1, q2}, [\dst, :128]!
+        vst1.16         {q3, q4}, [\ds2, :128]!
+.endif
+        ble             9f
+
+        vmov            q8,  q10
+        vmov            q12, q14
+        vld1.8          {d18, d19}, [\src]!
+        vld1.8          {d26, d27}, [\sr2]!
+        vmovl.u8        q10, d19
+        vmovl.u8        q9,  d18
+        vmovl.u8        q14, d27
+        vmovl.u8        q13, d26
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        bgt             161b
+        vpop            {q4-q6}
+        pop             {r4-r11,pc}
+
+L(\type\()_8tap_v):
+        cmp             \h,  #4
+        ubfx            r9,  \my, #7, #7
+        and             \my, \my, #0x7f
+        it              gt
+        movgt           \my, r9
+        add             \my, r10, \my, lsl #3
+
+        adr             r9,  L(\type\()_8tap_v_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_8tap_v_tbl):
+        .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
+
+20:     // 2xN v
+.ifc \type, put
+        bgt             28f
+
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src,  \src,  \s_strd
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vmovl.s8        q0,  d0
+
+        // 2x2 v
+        load_16         \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        interleave_1_16 d1, d2, d3, d4, d5
+        bgt             24f
+        vmovl_u8        q8, d1, q9, d2, q10, d3, q11, d4
+        mul_mla_4       d6, d16, d18, d20, d22
+        vqrshrun_s16    6,   q3,  d6
+        st_16           \d_strd, d6, 2
+        pop             {r4-r11,pc}
+
+24:     // 2x4 v
+        load_16         \sr2, \src, \s_strd, d6, d7
+        interleave_1_16 d5, d6, d7
+        vmovl_u8        q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6
+        vmov            d17, d20
+        vmov            d19, d22
+        vmov            d21, d24
+        vmov            d23, d26
+        mul_mla_4       q3, q8, q9, q10, q11
+        vqrshrun_s16    6,   q3,  d6
+        st_16           \d_strd, d6, 4
+        pop             {r4-r11,pc}
+
+28:     // 2x8, 2x16 v
+        vpush           {q4-q7}
+        vld1.8          {d0}, [\my]
+        sub             \sr2,  \src,  \s_strd, lsl #1
+        add             \ds2,  \dst,  \d_strd
+        sub             \src,  \sr2,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+        vmovl.s8        q0,  d0
+
+        load_16         \src, \sr2, \s_strd, d2,  d4,  d6,  d8,  d10, d12, d14
+        interleave_1_16 d2,  d4,  d6,  d8,  d10
+        interleave_1_16 d10, d12, d14
+        vmovl_u8        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  q5,  d10, q6,  d12
+        vmov            d3,  d6
+        vmov            d5,  d8
+        vmov            d7,  d10
+        vmov            d9,  d12
+216:
+        subs            \h,  \h,  #8
+        load_16         \sr2, \src, \s_strd, d16, d18, d20, d22
+        load_16         \sr2, \src, \s_strd, d24, d26, d28, d30
+        interleave_1_16 d14, d16, d18, d20, d22
+        interleave_1_16 d22, d24, d26, d28, d30
+        vmovl_u8        q7,  d14, q8,  d16, q9,  d18, q10, d20
+        vmovl_u8        q11, d22, q12, d24, q13, d26, q14, d28
+        vmov            d11, d14
+        vmov            d13, d16
+        vmov            d15, d18
+        vmov            d17, d20
+        vmov            d19, d22
+        vmov            d21, d24
+        vmov            d23, d26
+        vmov            d25, d28
+        mul_mla_8_4     q1,  q2,  q1,  q2,  q3,  q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12
+        vqrshrun_s16    6,   q1,  d2,  q2,  d4
+        st_16           \d_strd, d2, 4
+        st_16           \d_strd, d4, 4
+        ble             0f
+        vmov            q1,  q9
+        vmov            q2,  q10
+        vmov            q3,  q11
+        vmov            q4,  q12
+        vmov            q5,  q13
+        vmov            q6,  q14
+        vmov            d14, d30
+        b               216b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+.endif
+
+40:
+        bgt            480f
+
+        // 4x2, 4x4 v
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_32         \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        interleave_1_32 d1,  d2,  d3,  d4,  d5
+        vmovl_u8        q8,  d1,  q9,  d2,  q10, d3,  q11, d4
+        mul_mla_4       q3,  q8,  q9,  q10, q11
+        shift_store_4   \type, \d_strd, q3, d6, d7
+        ble             0f
+        load_32         \sr2, \src, \s_strd, d6, d7
+        interleave_1_32 d5,  d6,  d7
+        vmovl_u8        q12, d5,  q13, d6
+        mul_mla_4       q3,  q10, q11, q12, q13
+        shift_store_4   \type, \d_strd, q3, d6, d7
+0:
+        pop             {r4-r11,pc}
+
+480:    // 4x8, 4x16 v
+        vpush           {q4}
+        vld1.8          {d0}, [\my]
+        sub             \sr2, \src, \s_strd, lsl #1
+        add             \ds2, \dst, \d_strd
+        sub             \src, \sr2, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_32         \src, \sr2, \s_strd, d2,  d4,  d6,  d8,  d16, d18, d20
+        interleave_1_32 d2,  d4,  d6
+        interleave_1_32 d6,  d8,  d16, d18, d20
+        vmovl_u8        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  q8,  d16, q9,  d18
+
+48:
+        subs            \h,  \h,  #4
+        load_32         \sr2, \src, \s_strd, d22, d24, d26, d28
+        interleave_1_32 d20, d22, d24, d26, d28
+        vmovl_u8        q10, d20, q11, d22, q12, d24, q13, d26
+        mul_mla_8_2     q1,  q2,  q1,  q2,  q3,  q4,  q8,  q9,  q10, q11, q12, q13
+        shift_store_4   \type, \d_strd, q1,  d2,  d3,  q2,  d4,  d5
+        ble             0f
+        subs            \h,  \h,  #4
+        load_32         \sr2,  \src, \s_strd, d30, d2,  d4,  d6
+        interleave_1_32 d28, d30, d2,  d4,  d6
+        vmovl_u8        q14, d28, q15, d30, q1,  d2,  q2,  d4
+        mul_mla_8_2     q8,  q9,  q8,  q9,  q10, q11, q12, q13, q14, q15, q1,  q2
+        shift_store_4   \type, \d_strd, q8,  d16, d17, q9,  d18, d19
+        ble             0f
+        subs            \h,  \h,  #4
+        load_32         \sr2, \src, \s_strd, d8,  d16, d18, d20
+        interleave_1_32 d6,  d8,  d16, d18, d20
+        vmovl_u8        q3,  d6,  q4,  d8,  q8,  d16, q9, d18
+        mul_mla_8_2     q12, q13, q12, q13, q14, q15, q1,  q2,  q3,  q4,  q8,  q9
+        shift_store_4   \type, \d_strd, q12, d24, d25, q13, d26, d27
+        b               48b
+0:
+        vpop            {q4}
+        pop             {r4-r11,pc}
+
+80:
+        bgt             880f
+
+        // 8x2, 8x4 v
+        cmp             \h,  #2
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        load_reg        \src, \sr2, \s_strd, d1, d2, d3, d4, d5
+        vmovl_u8        q8,  d1,  q9,  d2,  q10, d3,  q11, d4,  q12, d5
+        mul_mla_4       q1,  q8,  q9,  q10, q11
+        mul_mla_4       q2,  q9,  q10, q11, q12
+        shift_store_8   \type, \d_strd, q1, d2, q2, d4
+        ble             0f
+        load_reg        \sr2, \src, \s_strd, d6, d7
+        vmovl_u8        q13, d6,  q14, d7
+        mul_mla_4       q1,  q10, q11, q12, q13
+        mul_mla_4       q2,  q11, q12, q13, q14
+        shift_store_8   \type, \d_strd, q1, d2, q2, d4
+0:
+        pop             {r4-r11,pc}
+
+880:    // 8x8, 8x16, 8x32 v
+1680:   // 16x8, 16x16, ...
+320:    // 32x8, 32x16, ...
+640:
+1280:
+        vpush           {q4}
+        vld1.8          {d0}, [\my]
+        sub             \src, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+        vmovl.s8        q0,  d0
+        mov             \my, \h
+168:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        load_reg        \src, \sr2, \s_strd, d2,  d4,  d6,  d8,  d16, d18, d20
+        vmovl_u8        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  q8,  d16, q9,  d18, q10, d20
+
+88:
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d22, d24
+        vmovl_u8        q11, d22, q12, d24
+        mul_mla_8_1     q1,  q2,  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12
+        shift_store_8   \type, \d_strd, q1,  d2,  q2,  d4
+        ble             9f
+        subs            \h,  \h,  #2
+        load_reg        \sr2, \src, \s_strd, d26, d28
+        vmovl_u8        q13, d26, q14, d28
+        mul_mla_8_1     q3,  q4,  q3,  q4,  q8,  q9,  q10, q11, q12, q13, q14
+        shift_store_8   \type, \d_strd, q3,  d6,  q4,  d8
+        ble             9f
+        subs            \h,  \h,  #4
+        load_reg        \sr2, \src, \s_strd, d30, d2,  d4,  d6
+        vmovl_u8        q15, d30, q1,  d2,  q2,  d4,  q3,  d6
+        mul_mla_8_1     q8,  q9,  q8,  q9,  q10, q11, q12, q13, q14, q15, q1
+        mul_mla_8_1     q10, q11, q10, q11, q12, q13, q14, q15, q1,  q2,  q3
+        shift_store_8   \type, \d_strd, q8,  d16, q9,  d18, q10, d20, q11, d22
+        ble             9f
+        subs            \h,  \h,  #4
+        load_reg        \sr2, \src, \s_strd, d8,  d16, d18, d20
+        vmovl_u8        q4,  d8,  q8,  d16, q9,  d18, q10, d20
+        mul_mla_8_1     q12, q13, q12, q13, q14, q15, q1,  q2,  q3,  q4,  q8
+        mul_mla_8_1     q14, q15, q14, q15, q1,  q2,  q3,  q4,  q8,  q9,  q10
+        shift_store_8   \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30
+        bgt             88b
+9:
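+        // Advance to the next 8-pixel wide column: restore the original
+        // strides, rewind dst by the \my (= saved h) rows written and src by
+        // those rows plus the extra lines loaded for the filter history, then
+        // step 8 pixels right (16 bytes for the 16-bit prep destination).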
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        mls             \src, \s_strd, \my, \src
+        mls             \dst, \d_strd, \my, \dst
+        sub             \src, \src, \s_strd, lsl #3
+        mov             \h,  \my
+        add             \src, \src, #8
+.ifc \type, put
+        add             \dst, \dst, #8
+.else
+        add             \dst, \dst, #16
+.endif
+        b               168b
+0:
+        vpop            {q4}
+        pop             {r4-r11,pc}
+
+160:
+        bgt             1680b
+
+        // 16x2, 16x4 v
+        add             \my, \my, #2
+        vld1.32         {d0[]}, [\my]
+        sub             \src, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+
+        cmp             \h,  #2
+        load_reg        \src, \sr2, \s_strd, q11, q12, q13, q14, q15
+        vmovl.u8        q1,  d22
+        vmovl.u8        q2,  d24
+        vmovl.u8        q3,  d26
+        vmovl.u8        q8,  d28
+        vmovl.u8        q9,  d30
+        vmovl.u8        q11, d23
+        vmovl.u8        q12, d25
+        vmovl.u8        q13, d27
+        vmovl.u8        q14, d29
+        vmovl.u8        q15, d31
+        mul_mla_4       q1,  q1,  q2,  q3,  q8
+        mul_mla_4       q10, q2,  q3,  q8,  q9
+        mul_mla_4       q2,  q11, q12, q13, q14
+        mul_mla_4       q11, q12, q13, q14, q15
+        shift_store_16  \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11
+        ble             0f
+        load_reg        \sr2, \src, \s_strd, q10, q11
+        vmovl.u8        q1,  d20
+        vmovl.u8        q10, d21
+        vmovl.u8        q12, d22
+        vmovl.u8        q11, d23
+        mul_mla_4       q2,  q3,  q8,  q9,  q1
+        mul_mla_4       q3,  q13, q14, q15, q10
+        mul_mla_4       q13, q8,  q9,  q1,  q12
+        mul_mla_4       q14, q14, q15, q10, q11
+        shift_store_16  \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14
+0:
+        pop             {r4-r11,pc}
+
+L(\type\()_8tap_hv):
+        cmp             \h,  #4
+        ubfx            r9,  \my, #7, #7
+        and             \my, \my, #0x7f
+        it              gt
+        movgt           \my, r9
+        add             \my,  r10, \my, lsl #3
+
+        adr             r9,  L(\type\()_8tap_hv_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_8tap_hv_tbl):
+        .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
+
+20:
+.ifc \type, put
+        add             \mx,  \mx,  #2
+        vld1.32         {d0[]},  [\mx]
+        bgt             280f
+        add             \my,  \my,  #2
+        vld1.32         {d2[]},  [\my]
+
+        // 2x2, 2x4 hv
+        sub             \sr2, \src, #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+
+        vld1.8          {d26}, [\src], \s_strd
+        vmovl.u8        q13, d26
+        vext.8          q14, q13, q13, #2
+        vmul.s16        d26, d26, d0
+        vmul.s16        d28, d28, d0
+        vpadd.s16       d26, d26, d28
+        vpadd.s16       d26, d26, d26
+        vrshr.s16       d16, d26, #2
+        bl              L(\type\()_8tap_filter_2)
+
+        vext.8          d16, d16, d16, #4
+        vmov            d17, d26
+        vext.8          d16, d16, d26, #4
+
+2:
+        bl              L(\type\()_8tap_filter_2)
+
+        vext.8          d18, d17, d26, #4
+        vmov            d19, d26
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d19, d2[3]
+
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqmovun.s16     d4,  q2
+        subs            \h,  \h,  #2
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d19
+        b               2b
+
+280:    // 2x8, 2x16, 2x32 hv
+        vld1.8          {d2},  [\my]
+        sub             \src, \src, #1
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        vld1.8          {d26}, [\src], \s_strd
+        vmovl.u8        q13, d26
+        vext.8          q14, q13, q13, #2
+        vmul.s16        d26, d26, d0
+        vmul.s16        d28, d28, d0
+        vpadd.s16       d26, d26, d28
+        vpadd.s16       d26, d26, d26
+        vrshr.s16       d16, d26, #2
+
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d16, d16, d16, #4
+        vmov            d17, d26
+        vext.8          d16, d16, d26, #4
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d18, d17, d26, #4
+        vmov            d19, d26
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d20, d19, d26, #4
+        vmov            d21, d26
+
+28:
+        bl              L(\type\()_8tap_filter_2)
+        vext.8          d22, d21, d26, #4
+        vmov            d23, d26
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d19, d2[3]
+        vmlal.s16       q2,  d20, d3[0]
+        vmlal.s16       q2,  d21, d3[1]
+        vmlal.s16       q2,  d22, d3[2]
+        vmlal.s16       q2,  d23, d3[3]
+
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqmovun.s16     d4,  q2
+        subs            \h,  \h,  #2
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d19
+        vmov            d18, d20
+        vmov            d19, d21
+        vmov            d20, d22
+        vmov            d21, d23
+        b               28b
+
+0:
+        pop             {r4-r11,pc}
+
+L(\type\()_8tap_filter_2):
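+        // Helper for the 2-wide h+v loops: loads two new source rows, applies
+        // the 4-tap horizontal filter at intermediate precision (>>2) and
+        // returns both filtered 2-pixel rows packed in d26 (d27 holds the same
+        // pair with the rows swapped).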
+        vld1.8          {d28},  [\sr2], \s_strd
+        vld1.8          {d30},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vext.8          d31, d30, d30, #1
+        vmovl.u8        q13, d28
+        vmovl.u8        q14, d29
+        vmov            d27, d28
+        vmovl.u8        q14, d30
+        vmovl.u8        q15, d31
+        vtrn.32         d26, d28
+        vtrn.32         d27, d30
+        vmul.s16        d26, d26, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d28, d0[2]
+        vmla.s16        d26, d30, d0[3]
+        vrshr.s16       d26, d26, #2
+        vext.8          d27, d26, d26, #4
+        bx              lr
+.endif
+
+40:
+        add             \mx, \mx, #2
+        vld1.32         {d0[]},  [\mx]
+        bgt             480f
+        add             \my, \my,  #2
+        vld1.32         {d2[]},  [\my]
+        sub             \sr2, \src, #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        // 4x2, 4x4 hv
+        vld1.8          {d30}, [\src], \s_strd
+        vmovl.u8        q14, d30
+        vext.8          d27, d28, d29, #2
+        vext.8          d30, d28, d29, #4
+        vext.8          d31, d28, d29, #6
+        vmul.s16        d26, d28, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d30, d0[2]
+        vmla.s16        d26, d31, d0[3]
+        vrshr.s16       d16, d26, #2
+
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d17, d26
+        vmov            d18, d27
+
+4:
+        bl              L(\type\()_8tap_filter_4)
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d26, d2[3]
+        vmull.s16       q3,  d17, d2[0]
+        vmlal.s16       q3,  d18, d2[1]
+        vmlal.s16       q3,  d26, d2[2]
+        vmlal.s16       q3,  d27, d2[3]
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqrshrn.s32     d6,  q3,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d6,  q3
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d6[0]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d6}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d26
+        vmov            d18, d27
+        b               4b
+
+480:    // 4x8, 4x16, 4x32 hv
+        vld1.8          {d2},  [\my]
+        sub             \src, \src, #1
+        sub             \sr2, \src, \s_strd, lsl #1
+        sub             \src, \sr2, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+
+        vld1.8          {d30}, [\src], \s_strd
+        vmovl.u8        q14, d30
+        vext.8          d27, d28, d29, #2
+        vext.8          d30, d28, d29, #4
+        vext.8          d31, d28, d29, #6
+        vmul.s16        d26, d28, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d30, d0[2]
+        vmla.s16        d26, d31, d0[3]
+        vrshr.s16       d16, d26, #2
+
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d17, d26
+        vmov            d18, d27
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d19, d26
+        vmov            d20, d27
+        bl              L(\type\()_8tap_filter_4)
+        vmov            d21, d26
+        vmov            d22, d27
+
+48:
+        bl              L(\type\()_8tap_filter_4)
+        vmull.s16       q2,  d16, d2[0]
+        vmlal.s16       q2,  d17, d2[1]
+        vmlal.s16       q2,  d18, d2[2]
+        vmlal.s16       q2,  d19, d2[3]
+        vmlal.s16       q2,  d20, d3[0]
+        vmlal.s16       q2,  d21, d3[1]
+        vmlal.s16       q2,  d22, d3[2]
+        vmlal.s16       q2,  d26, d3[3]
+        vmull.s16       q3,  d17, d2[0]
+        vmlal.s16       q3,  d18, d2[1]
+        vmlal.s16       q3,  d19, d2[2]
+        vmlal.s16       q3,  d20, d2[3]
+        vmlal.s16       q3,  d21, d3[0]
+        vmlal.s16       q3,  d22, d3[1]
+        vmlal.s16       q3,  d26, d3[2]
+        vmlal.s16       q3,  d27, d3[3]
+        vqrshrn.s32     d4,  q2,  #\shift_hv
+        vqrshrn.s32     d6,  q3,  #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d6,  q3
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d6[0]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d6}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d18
+        vmov            d17, d19
+        vmov            d18, d20
+        vmov            d19, d21
+        vmov            d20, d22
+        vmov            d21, d26
+        vmov            d22, d27
+        b               48b
+0:
+        pop             {r4-r11,pc}
+
+L(\type\()_8tap_filter_4):
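+        // Helper for the 4-wide h+v loops: horizontally filters one new row
+        // from each source pointer (4-tap, >>2); results in d26 (row from
+        // \sr2) and d27 (row from \src).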
+        vld1.8          {d30}, [\sr2], \s_strd
+        vld1.8          {d31}, [\src], \s_strd
+        vmovl.u8        q14, d30
+        vext.8          d27, d28, d29, #2
+        vext.8          d30, d28, d29, #4
+        vext.8          d1,  d28, d29, #6
+        vmul.s16        d26, d28, d0[0]
+        vmla.s16        d26, d27, d0[1]
+        vmla.s16        d26, d30, d0[2]
+        vmla.s16        d26, d1,  d0[3]
+
+        vmovl.u8        q14, d31
+        vext.8          d30, d28, d29, #2
+        vext.8          d31, d28, d29, #4
+        vext.8          d1,  d28, d29, #6
+        vmul.s16        d27, d28, d0[0]
+        vmla.s16        d27, d30, d0[1]
+        vmla.s16        d27, d31, d0[2]
+        vmla.s16        d27, d1,  d0[3]
+        vrshr.s16       d26, d26, #2
+        vrshr.s16       d27, d27, #2
+        bx              lr
+
+80:
+160:
+320:
+        bgt             880f
+        vpush           {q4-q7}
+        add             \my,  \my,  #2
+        vld1.8          {d0},  [\mx]
+        vld1.32         {d2[]},  [\my]
+        sub             \src,  \src,  #3
+        sub             \src,  \src,  \s_strd
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+        mov             \my, \h
+
+164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        vld1.8          {q14},  [\src], \s_strd
+        vmovl.u8        q12, d28
+        vmovl.u8        q13, d29
+        vmul.s16        q10, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d1[\i-4]
+.endr
+        vrshr.s16       q3,  q10, #2
+
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q4,  q10
+        vmov            q5,  q11
+
+8:
+        bl              L(\type\()_8tap_filter_8)
+        vmull.s16       q12, d6,  d2[0]
+        vmull.s16       q13, d7,  d2[0]
+        vmull.s16       q14, d8,  d2[0]
+        vmull.s16       q15, d9,  d2[0]
+        vmlal.s16       q12, d8,  d2[1]
+        vmlal.s16       q13, d9,  d2[1]
+        vmlal.s16       q14, d10, d2[1]
+        vmlal.s16       q15, d11, d2[1]
+        vmlal.s16       q12, d10, d2[2]
+        vmlal.s16       q13, d11, d2[2]
+        vmlal.s16       q14, d20, d2[2]
+        vmlal.s16       q15, d21, d2[2]
+        vmlal.s16       q12, d20, d2[3]
+        vmlal.s16       q13, d21, d2[3]
+        vmlal.s16       q14, d22, d2[3]
+        vmlal.s16       q15, d23, d2[3]
+        vqrshrn.s32     d24, q12, #\shift_hv
+        vqrshrn.s32     d25, q13, #\shift_hv
+        vqrshrn.s32     d28, q14, #\shift_hv
+        vqrshrn.s32     d29, q15, #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d28, q14
+        vst1.8          {d24}, [\dst, :64], \d_strd
+        vst1.8          {d28}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q12}, [\dst, :128], \d_strd
+        vst1.16         {q14}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q3,  q5
+        vmov            q4,  q10
+        vmov            q5,  q11
+        b               8b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #2
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               164b
+
+880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
+640:
+1280:
+        vpush           {q4-q7}
+        vld1.8          {d0},  [\mx]
+        vld1.8          {d2},  [\my]
+        sub             \src,  \src,  #3
+        sub             \src,  \src,  \s_strd
+        sub             \src,  \src,  \s_strd, lsl #1
+        vmovl.s8        q0,  d0
+        vmovl.s8        q1,  d2
+        mov             \my, \h
+
+168:
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd, \d_strd, #1
+        lsl             \s_strd, \s_strd, #1
+
+        vld1.8          {q14},  [\src], \s_strd
+        vmovl.u8        q12, d28
+        vmovl.u8        q13, d29
+        vmul.s16        q10, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d1[\i-4]
+.endr
+        vrshr.s16       q3,  q10, #2
+
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q4,  q10
+        vmov            q5,  q11
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q6,  q10
+        vmov            q7,  q11
+        bl              L(\type\()_8tap_filter_8)
+        vmov            q8,  q10
+        vmov            q9,  q11
+
+88:
+        bl              L(\type\()_8tap_filter_8)
+        vmull.s16       q12, d6,  d2[0]
+        vmull.s16       q13, d7,  d2[0]
+        vmull.s16       q14, d8,  d2[0]
+        vmull.s16       q15, d9,  d2[0]
+        vmlal.s16       q12, d8,  d2[1]
+        vmlal.s16       q13, d9,  d2[1]
+        vmlal.s16       q14, d10, d2[1]
+        vmlal.s16       q15, d11, d2[1]
+        vmlal.s16       q12, d10, d2[2]
+        vmlal.s16       q13, d11, d2[2]
+        vmlal.s16       q14, d12, d2[2]
+        vmlal.s16       q15, d13, d2[2]
+        vmlal.s16       q12, d12, d2[3]
+        vmlal.s16       q13, d13, d2[3]
+        vmlal.s16       q14, d14, d2[3]
+        vmlal.s16       q15, d15, d2[3]
+        vmlal.s16       q12, d14, d3[0]
+        vmlal.s16       q13, d15, d3[0]
+        vmlal.s16       q14, d16, d3[0]
+        vmlal.s16       q15, d17, d3[0]
+        vmlal.s16       q12, d16, d3[1]
+        vmlal.s16       q13, d17, d3[1]
+        vmlal.s16       q14, d18, d3[1]
+        vmlal.s16       q15, d19, d3[1]
+        vmlal.s16       q12, d18, d3[2]
+        vmlal.s16       q13, d19, d3[2]
+        vmlal.s16       q14, d20, d3[2]
+        vmlal.s16       q15, d21, d3[2]
+        vmlal.s16       q12, d20, d3[3]
+        vmlal.s16       q13, d21, d3[3]
+        vmlal.s16       q14, d22, d3[3]
+        vmlal.s16       q15, d23, d3[3]
+        vqrshrn.s32     d24, q12, #\shift_hv
+        vqrshrn.s32     d25, q13, #\shift_hv
+        vqrshrn.s32     d28, q14, #\shift_hv
+        vqrshrn.s32     d29, q15, #\shift_hv
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d28, q14
+        vst1.8          {d24}, [\dst, :64], \d_strd
+        vst1.8          {d28}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q12}, [\dst, :128], \d_strd
+        vst1.16         {q14}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q3,  q5
+        vmov            q4,  q6
+        vmov            q5,  q7
+        vmov            q6,  q8
+        vmov            q7,  q9
+        vmov            q8,  q10
+        vmov            q9,  q11
+        b               88b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #3
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               168b
+0:
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+
+L(\type\()_8tap_filter_8):
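+        // Helper for the 8-wide h+v loops: horizontally filters one new row
+        // from each source pointer with the full 8-tap kernel (>>2); results
+        // in q10 (row from \sr2) and q11 (row from \src).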
+        vld1.8          {q14},  [\sr2], \s_strd
+        vld1.8          {q15},  [\src], \s_strd
+        vmovl.u8        q12, d28
+        vmovl.u8        q13, d29
+        vmul.s16        q10, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q10, q14, d1[\i-4]
+.endr
+        vmovl.u8        q12, d30
+        vmovl.u8        q13, d31
+        vmul.s16        q11, q12, d0[0]
+.irpc i, 123
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q11, q14, d0[\i]
+.endr
+.irpc i, 4567
+        vext.8          q14, q12, q13, #(2*\i)
+        vmla.s16        q11, q14, d1[\i-4]
+.endr
+        vrshr.s16       q10, q10, #2
+        vrshr.s16       q11, q11, #2
+        bx              lr
+endfunc
+
+
+function \type\()_bilin_8bpc_neon, export=1
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36]
+        ldrd            r6,  r7,  [sp, #44]
+        vdup.8          d1,  \mx
+        vdup.8          d3,  \my
+        rsb             r8,  \mx, #16
+        rsb             r9,  \my, #16
+        vdup.8          d0,  r8
+        vdup.8          d2,  r9
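+        // Bilinear weights: d0/d1 = 16-mx/mx (horizontal) and d2/d3 = 16-my/my
+        // (vertical); each pair sums to 16, matching the >>4 normalization in
+        // the put paths.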
+.ifc \type, prep
+        lsl             \d_strd, \w, #1
+.endif
+        clz             r8,  \w
+        cmp             \mx, #0
+        sub             r8,  r8,  #24
+        bne             L(\type\()_bilin_h)
+        cmp             \my, #0
+        bne             L(\type\()_bilin_v)
+        b               \type
+
+L(\type\()_bilin_h):
+        cmp             \my, #0
+        bne             L(\type\()_bilin_hv)
+
+        adr             r9,  L(\type\()_bilin_h_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_bilin_h_tbl):
+        .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
+
+20:     // 2xN h
+.ifc \type, put
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+2:
+        vld1.32         {d4[]},  [\src], \s_strd
+        vld1.32         {d6[]},  [\sr2], \s_strd
+        vext.8          d5,  d4,  d4, #1
+        vext.8          d7,  d6,  d6, #1
+        vtrn.16         q2,  q3
+        subs            \h,  \h,  #2
+        vmull.u8        q3,  d4,  d0
+        vmlal.u8        q3,  d5,  d1
+        vqrshrn.u16     d4,  q3,  #4
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        bgt             2b
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+4:
+        vld1.8          {d4}, [\src], \s_strd
+        vld1.8          {d6}, [\sr2], \s_strd
+        vext.8          d5,  d4,  d4, #1
+        vext.8          d7,  d6,  d6, #1
+        vtrn.32         q2,  q3
+        subs            \h,  \h,  #2
+        vmull.u8        q3,  d4,  d0
+        vmlal.u8        q3,  d5,  d1
+.ifc \type, put
+        vqrshrn.u16     d4,  q3,  #4
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d4[1]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d6}, [\dst, :64], \d_strd
+        vst1.16         {d7}, [\ds2, :64], \d_strd
+.endif
+        bgt             4b
+        pop             {r4-r11,pc}
+
+80:     // 8xN h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \d_strd,  \d_strd,  #1
+        lsl             \s_strd,  \s_strd,  #1
+8:
+        vld1.8          {q8},  [\src], \s_strd
+        vld1.8          {q10}, [\sr2], \s_strd
+        vext.8          q9,  q8,  q8,  #1
+        vext.8          q11, q10, q10, #1
+        subs            \h,  \h,  #2
+        vmull.u8        q8,  d16, d0
+        vmull.u8        q10, d20, d0
+        vmlal.u8        q8,  d18, d1
+        vmlal.u8        q10, d22, d1
+.ifc \type, put
+        vqrshrn.u16     d16,  q8,  #4
+        vqrshrn.u16     d18,  q10, #4
+        vst1.8          {d16}, [\dst, :64], \d_strd
+        vst1.8          {d18}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q8},  [\dst, :128], \d_strd
+        vst1.16         {q10}, [\ds2, :128], \d_strd
+.endif
+        bgt             8b
+        pop             {r4-r11,pc}
+160:
+320:
+640:
+1280:   // 16xN, 32xN, ... h
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+
+        sub             \s_strd,  \s_strd,  \w
+        sub             \s_strd,  \s_strd,  #8
+.ifc \type, put
+        lsl             \d_strd,  \d_strd,  #1
+        sub             \d_strd,  \d_strd,  \w
+.endif
+161:
+        vld1.8          {d16},  [\src]!
+        vld1.8          {d22},  [\sr2]!
+        mov             \mx, \w
+
+16:
+        vld1.8          {d17,d18},  [\src]!
+        vld1.8          {d23,d24},  [\sr2]!
+        vext.8          q10, q8,  q9,  #1
+        vext.8          q13, q11, q12, #1
+        vmull.u8        q2,  d16, d0
+        vmull.u8        q3,  d17, d0
+        vmull.u8        q14, d22, d0
+        vmull.u8        q15, d23, d0
+        vmlal.u8        q2,  d20, d1
+        vmlal.u8        q3,  d21, d1
+        vmlal.u8        q14, d26, d1
+        vmlal.u8        q15, d27, d1
+        subs            \mx, \mx, #16
+.ifc \type, put
+        vqrshrn.u16     d4,  q2,  #4
+        vqrshrn.u16     d5,  q3,  #4
+        vqrshrn.u16     d28, q14, #4
+        vqrshrn.u16     d29, q15, #4
+        vst1.8          {q2},  [\dst, :128]!
+        vst1.8          {q14}, [\ds2, :128]!
+.else
+        vst1.16         {q2,  q3},  [\dst, :128]!
+        vst1.16         {q14, q15}, [\ds2, :128]!
+.endif
+        ble             9f
+
+        vmov            d16, d18
+        vmov            d22, d24
+        b               16b
+
+9:
+        add             \dst,  \dst,  \d_strd
+        add             \ds2,  \ds2,  \d_strd
+        add             \src,  \src,  \s_strd
+        add             \sr2,  \sr2,  \s_strd
+
+        subs            \h,  \h,  #2
+        bgt             161b
+        pop             {r4-r11,pc}
+
+L(\type\()_bilin_v):
+        cmp             \h,  #4
+        adr             r9,  L(\type\()_bilin_v_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_bilin_v_tbl):
+        .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
+
+20:     // 2xN v
+.ifc \type, put
+        cmp             \h,  #2
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+
+        // 2x2 v
+        vld1.16         {d16[]}, [\src], \s_strd
+        bgt             24f
+        vld1.16         {d17[]}, [\sr2], \s_strd
+        vld1.16         {d18[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #6
+        vext.8          d17, d17, d18, #6
+        vmull.u8        q2,  d16, d2
+        vmlal.u8        q2,  d17, d3
+        vqrshrn.u16     d4,  q2,  #4
+        vst1.16         {d4[0]}, [\dst, :16]
+        vst1.16         {d4[1]}, [\ds2, :16]
+        pop             {r4-r11,pc}
+24:     // 2x4, 2x8, ... v
+        vld1.16         {d17[]}, [\sr2], \s_strd
+        vld1.16         {d18[]}, [\src], \s_strd
+        vld1.16         {d19[]}, [\sr2], \s_strd
+        vld1.16         {d20[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #6
+        vext.8          d17, d17, d18, #6
+        vext.8          d18, d18, d19, #6
+        vext.8          d19, d19, d20, #6
+        vtrn.32         d16, d18
+        vtrn.32         d17, d19
+        vmull.u8        q2,  d16, d2
+        vmlal.u8        q2,  d17, d3
+        subs            \h,  \h,  #4
+        vqrshrn.u16     d4,  q2,  #4
+        vst1.16         {d4[0]}, [\dst, :16], \d_strd
+        vst1.16         {d4[1]}, [\ds2, :16], \d_strd
+        vst1.16         {d4[2]}, [\dst, :16], \d_strd
+        vst1.16         {d4[3]}, [\ds2, :16], \d_strd
+        ble             0f
+        vmov            d16, d20
+        b               24b
+0:
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vld1.32         {d16[]}, [\src], \s_strd
+4:
+        vld1.32         {d17[]}, [\sr2], \s_strd
+        vld1.32         {d18[]}, [\src], \s_strd
+        vext.8          d16, d16, d17, #4
+        vext.8          d17, d17, d18, #4
+        vmull.u8        q2,  d16, d2
+        vmlal.u8        q2,  d17, d3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d4,  q2,  #4
+        vst1.32         {d4[0]}, [\dst, :32], \d_strd
+        vst1.32         {d4[1]}, [\ds2, :32], \d_strd
+.else
+        vst1.16         {d4}, [\dst, :64], \d_strd
+        vst1.16         {d5}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16,  d18
+        b               4b
+0:
+        pop             {r4-r11,pc}
+
+80:     // 8xN v
+        add             \ds2,  \dst,  \d_strd
+        add             \sr2,  \src,  \s_strd
+        lsl             \s_strd,  \s_strd,  #1
+        lsl             \d_strd,  \d_strd,  #1
+        vld1.8          {d16}, [\src], \s_strd
+8:
+        vld1.8          {d17}, [\sr2], \s_strd
+        vld1.8          {d18}, [\src], \s_strd
+        vmull.u8        q2,  d16, d2
+        vmull.u8        q3,  d17, d2
+        vmlal.u8        q2,  d17, d3
+        vmlal.u8        q3,  d18, d3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d4,  q2,  #4
+        vqrshrn.u16     d6,  q3,  #4
+        vst1.8          {d4}, [\dst, :64], \d_strd
+        vst1.8          {d6}, [\ds2, :64], \d_strd
+.else
+        vst1.16         {q2}, [\dst, :128], \d_strd
+        vst1.16         {q3}, [\ds2, :128], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d18
+        b               8b
+0:
+        pop             {r4-r11,pc}
+
+160:    // 16xN, 32xN, ...
+320:
+640:
+1280:
+        mov             \my, \h
+1:
+        add             \ds2, \dst, \d_strd
+        add             \sr2, \src, \s_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.8          {q8},  [\src], \s_strd
+2:
+        vld1.8          {q9},  [\sr2], \s_strd
+        vld1.8          {q10}, [\src], \s_strd
+        vmull.u8        q12, d16, d2
+        vmull.u8        q13, d17, d2
+        vmull.u8        q14, d18, d2
+        vmull.u8        q15, d19, d2
+        vmlal.u8        q12, d18, d3
+        vmlal.u8        q13, d19, d3
+        vmlal.u8        q14, d20, d3
+        vmlal.u8        q15, d21, d3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d24, q12, #4
+        vqrshrn.u16     d25, q13, #4
+        vqrshrn.u16     d28, q14, #4
+        vqrshrn.u16     d29, q15, #4
+        vst1.8          {q12}, [\dst, :128], \d_strd
+        vst1.8          {q14}, [\ds2, :128], \d_strd
+.else
+        vst1.16         {q12, q13}, [\dst, :128], \d_strd
+        vst1.16         {q14, q15}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q8,  q10
+        b               2b
+9:
+        subs            \w,  \w,  #16
+        ble             0f
+        asr             \s_strd, \s_strd, #1
+        asr             \d_strd, \d_strd, #1
+        mls             \src, \s_strd, \my, \src
+        mls             \dst, \d_strd, \my, \dst
+        sub             \src, \src, \s_strd, lsl #1
+        mov             \h,  \my
+        add             \src, \src, #16
+.ifc \type, put
+        add             \dst, \dst, #16
+.else
+        add             \dst, \dst, #32
+.endif
+        b               1b
+0:
+        pop             {r4-r11,pc}
+
+L(\type\()_bilin_hv):
+        vmovl.u8        q2,  d2
+        vmovl.u8        q3,  d3
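+        // The second (vertical) pass of the hv path operates on 16-bit
+        // intermediates, so widen the vertical weights to 16 bit (q2/q3).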
+        adr             r9,  L(\type\()_bilin_hv_tbl)
+        ldr             r8,  [r9, r8, lsl #2]
+        add             r9,  r9,  r8
+        bx              r9
+
+        .align 2
+L(\type\()_bilin_hv_tbl):
+        .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 160f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 40f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+        .word 20f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
+
+20:     // 2xN hv
+.ifc \type, put
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.32         {d28[]},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vmull.u8        q8,  d28, d0
+        vmlal.u8        q8,  d29, d1
+
+2:
+        vld1.32         {d28[]},  [\sr2], \s_strd
+        vld1.32         {d30[]},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vext.8          d31, d30, d30, #1
+        vtrn.16         d28, d30
+        vtrn.16         d29, d31
+        vmull.u8        q9,  d28, d0
+        vmlal.u8        q9,  d29, d1
+
+        vtrn.32         d16, d18
+
+        vmul.u16        d20, d16, d4
+        vmla.u16        d20, d19, d6
+        vqrshrn.u16     d20, q10, #8
+        subs            \h,  \h,  #2
+        vst1.16         {d20[0]}, [\dst, :16], \d_strd
+        vst1.16         {d20[1]}, [\ds2, :16], \d_strd
+        ble             0f
+        vtrn.32         d19, d16
+        b               2b
+0:
+        pop             {r4-r11,pc}
+.endif
+
+40:     // 4xN hv
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.8          {d28},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vmull.u8        q8,  d28, d0
+        vmlal.u8        q8,  d29, d1
+
+4:
+        vld1.8          {d28},  [\sr2], \s_strd
+        vld1.8          {d30},  [\src], \s_strd
+        vext.8          d29, d28, d28, #1
+        vext.8          d31, d30, d30, #1
+        vtrn.32         d28, d30
+        vtrn.32         d29, d31
+        vmull.u8        q9,  d28, d0
+        vmlal.u8        q9,  d29, d1
+
+        vmov            d17, d18
+
+        vmul.u16        q10, q8, q2
+        vmla.u16        q10, q9, q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d20, q10, #8
+        vst1.32         {d20[0]}, [\dst, :32], \d_strd
+        vst1.32         {d20[1]}, [\ds2, :32], \d_strd
+.else
+        vrshr.u16       q10, q10, #4
+        vst1.16         {d20}, [\dst, :64], \d_strd
+        vst1.16         {d21}, [\ds2, :64], \d_strd
+.endif
+        ble             0f
+        vmov            d16, d19
+        b               4b
+0:
+        pop             {r4-r11,pc}
+
+80:     // 8xN, 16xN, ... hv
+160:
+320:
+640:
+1280:
+        mov             \my, \h
+
+1:
+        add             \sr2, \src, \s_strd
+        add             \ds2, \dst, \d_strd
+        lsl             \s_strd, \s_strd, #1
+        lsl             \d_strd, \d_strd, #1
+
+        vld1.8          {q12},  [\src], \s_strd
+        vext.8          q13, q12, q12, #1
+        vmull.u8        q8,  d24, d0
+        vmlal.u8        q8,  d26, d1
+
+2:
+        vld1.8          {q12},  [\sr2], \s_strd
+        vld1.8          {q14},  [\src], \s_strd
+        vext.8          q13, q12, q12, #1
+        vext.8          q15, q14, q14, #1
+        vmull.u8        q9,  d24, d0
+        vmlal.u8        q9,  d26, d1
+        vmull.u8        q10, d28, d0
+        vmlal.u8        q10, d30, d1
+
+        vmul.u16        q8,  q8,  q2
+        vmla.u16        q8,  q9,  q3
+        vmul.u16        q9,  q9,  q2
+        vmla.u16        q9,  q10, q3
+        subs            \h,  \h,  #2
+.ifc \type, put
+        vqrshrn.u16     d16, q8,  #8
+        vqrshrn.u16     d18, q9,  #8
+        vst1.8          {d16}, [\dst, :64], \d_strd
+        vst1.8          {d18}, [\ds2, :64], \d_strd
+.else
+        vrshr.u16       q8,  q8,  #4
+        vrshr.u16       q9,  q9,  #4
+        vst1.16         {q8}, [\dst, :128], \d_strd
+        vst1.16         {q9}, [\ds2, :128], \d_strd
+.endif
+        ble             9f
+        vmov            q8,  q10
+        b               2b
+9:
+        subs            \w,  \w,  #8
+        ble             0f
+        asr             \s_strd,  \s_strd,  #1
+        asr             \d_strd,  \d_strd,  #1
+        mls             \src,  \s_strd,  \my,  \src
+        mls             \dst,  \d_strd,  \my,  \dst
+        sub             \src,  \src,  \s_strd,  lsl #1
+        mov             \h,  \my
+        add             \src,  \src,  #8
+.ifc \type, put
+        add             \dst,  \dst,  #8
+.else
+        add             \dst,  \dst,  #16
+.endif
+        b               1b
+0:
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+filter_fn put,  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
+filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
--- a/third_party/dav1d/src/arm/32/util.S
+++ b/third_party/dav1d/src/arm/32/util.S
@@ -21,30 +21,46 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
 
-#ifndef __DAVID_SRC_ARM_32_UTIL_S__
-#define __DAVID_SRC_ARM_32_UTIL_S__
+#ifndef DAV1D_SRC_ARM_32_UTIL_S
+#define DAV1D_SRC_ARM_32_UTIL_S
 
 #include "config.h"
 #include "src/arm/asm.S"
 
-.macro movrel rd, val
-#if defined(PIC)
+.macro movrel rd, val, offset=0
+#if defined(PIC) && defined(__APPLE__)
     ldr         \rd,  1f
     b           2f
 1:
-@ FIXME: thumb
-    .word       \val - (2f + 8)
+    .word       3f - (2f + 8 - 4 * CONFIG_THUMB)
+2:
+    ldr         \rd,  [pc, \rd]
+.if \offset < 0
+    sub         \rd,  \rd,  #-(\offset)
+.elseif \offset > 0
+    add         \rd,  \rd,  #\offset
+.endif
+    .non_lazy_symbol_pointer
+3:
+    .indirect_symbol \val
+    .word       0
+    .text
+#elif defined(PIC)
+    ldr         \rd,  1f
+    b           2f
+1:
+    .word       \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
 2:
     add         \rd,  \rd,  pc
 #else
-    movw        \rd, #:lower16:\val
-    movt        \rd, #:upper16:\val
+    movw        \rd, #:lower16:\val+\offset
+    movt        \rd, #:upper16:\val+\offset
 #endif
 .endm
 
-#endif /* __DAVID_SRC_ARM_32_UTIL_S__ */
+#endif /* DAV1D_SRC_ARM_32_UTIL_S */
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/cdef.S
@@ -0,0 +1,603 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
+        tst             w6,  #1 // CDEF_HAVE_LEFT
+        b.eq            2f
+        // CDEF_HAVE_LEFT
+        sub             \s1,  \s1,  #2
+        sub             \s2,  \s2,  #2
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             s1,      [\s1, #\w]
+        ldr             \rn\()2, [\s2]
+        ldr             s3,      [\s2, #\w]
+        uxtl            v0.8h,   v0.8b
+        uxtl            v1.8h,   v1.8b
+        uxtl            v2.8h,   v2.8b
+        uxtl            v3.8h,   v3.8b
+        str             \rw\()0, [x0]
+        str             d1,      [x0, #2*\w]
+        add             x0,  x0,  #2*\stride
+        str             \rw\()2, [x0]
+        str             d3,      [x0, #2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             h1,      [\s1, #\w]
+        ldr             \rn\()2, [\s2]
+        ldr             h3,      [\s2, #\w]
+        uxtl            v0.8h,   v0.8b
+        uxtl            v1.8h,   v1.8b
+        uxtl            v2.8h,   v2.8b
+        uxtl            v3.8h,   v3.8b
+        str             \rw\()0, [x0]
+        str             s1,      [x0, #2*\w]
+        str             s31,     [x0, #2*\w+4]
+        add             x0,  x0,  #2*\stride
+        str             \rw\()2, [x0]
+        str             s3,      [x0, #2*\w]
+        str             s31,     [x0, #2*\w+4]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+2:
+        // !CDEF_HAVE_LEFT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             h1,      [\s1, #\w]
+        ldr             \rn\()2, [\s2]
+        ldr             h3,      [\s2, #\w]
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        uxtl            v2.8h,  v2.8b
+        uxtl            v3.8h,  v3.8b
+        str             s31, [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s1,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        str             s31, [x0]
+        stur            \rw\()2, [x0, #4]
+        str             s3,      [x0, #4+2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+        b               3f
+.endif
+
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ldr             \rn\()0, [\s1]
+        ldr             \rn\()1, [\s2]
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        str             s31,     [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        str             s31,     [x0]
+        stur            \rw\()1, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+.if \ret
+        ret
+.else
+        add             x0,  x0,  #2*\stride
+.endif
+3:
+.endm
+
+// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
+//                               ptrdiff_t src_stride, const pixel (*left)[2],
+//                               /*const*/ pixel *const top[2], int h,
+//                               enum CdefEdgeFlags edges);
+
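+// The destination is a 16-bit temporary with a 2-pixel border on each side
+// (row stride 16 for the 8-wide variant, 8 for the 4-wide one; see the
+// padding_func instantiations below). Border cells that have no source
+// pixels are filled with the sentinel value 0x8000 kept in v30/v31.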
+.macro padding_func w, stride, rn, rw
+function cdef_padding\w\()_neon, export=1
+        movi            v30.8h,  #0x80, lsl #8
+        mov             v31.16b, v30.16b
+        sub             x0,  x0,  #2*(2*\stride+2)
+        tst             w6,  #4 // CDEF_HAVE_TOP
+        b.ne            1f
+        // !CDEF_HAVE_TOP
+        st1             {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+        st1             {v30.8h, v31.8h}, [x0], #32
+.endif
+        b               3f
+1:
+        // CDEF_HAVE_TOP
+        ldr             x8,  [x4]
+        ldr             x9,  [x4, #8]
+        pad_top_bottom  x8,  x9, \w, \stride, \rn, \rw, 0
+
+        // Middle section
+3:
+        tst             w6,  #1 // CDEF_HAVE_LEFT
+        b.eq            2f
+        // CDEF_HAVE_LEFT
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ld1             {v0.h}[0], [x3], #2
+        ldr             \rn\()1, [x1]
+        ldr             h2,      [x1, #\w]
+        add             x1,  x1,  x2
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        uxtl            v2.8h,  v2.8b
+        str             s0,      [x0]
+        stur            \rw\()1, [x0, #4]
+        str             s2,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            0b
+        b               3f
+1:
+        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+        ld1             {v0.h}[0], [x3], #2
+.if \w == 8
+        ld1             {v1.8b},   [x1], x2
+.else
+        ld1             {v1.s}[0], [x1], x2
+.endif
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        str             s0,      [x0]
+        stur            \rw\()1, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            1b
+        b               3f
+2:
+        tst             w6,  #2 // CDEF_HAVE_RIGHT
+        b.eq            1f
+        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
+0:
+        ldr             \rn\()0, [x1]
+        ldr             h1,      [x1, #\w]
+        add             x1,  x1,  x2
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        uxtl            v1.8h,  v1.8b
+        str             s31,     [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s1,      [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            0b
+        b               3f
+1:
+        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
+.if \w == 8
+        ld1             {v0.8b},   [x1], x2
+.else
+        ld1             {v0.s}[0], [x1], x2
+.endif
+        subs            w5,  w5,  #1
+        uxtl            v0.8h,  v0.8b
+        str             s31,     [x0]
+        stur            \rw\()0, [x0, #4]
+        str             s31,     [x0, #4+2*\w]
+        add             x0,  x0,  #2*\stride
+        b.gt            1b
+
+3:
+        tst             w6,  #8 // CDEF_HAVE_BOTTOM
+        b.ne            1f
+        // !CDEF_HAVE_BOTTOM
+        st1             {v30.8h, v31.8h}, [x0], #32
+.if \w == 8
+        st1             {v30.8h, v31.8h}, [x0], #32
+.endif
+        ret
+1:
+        // CDEF_HAVE_BOTTOM
+        add             x9,  x1,  x2
+        pad_top_bottom  x1,  x9, \w, \stride, \rn, \rw, 1
+endfunc
+.endm
+
+padding_func 8, 16, d, q
+padding_func 4, 8,  s, d
+
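+// Per-direction tap offsets, expressed in tmp elements; each row holds the
+// two tap offsets of one direction, and load_px scales them by 2 (sxtb #1)
+// when addressing the 16-bit tmp buffer.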
+.macro dir_table w, stride
+const directions\w
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+        .byte            1 * \stride + 0,  2 * \stride + 0
+        .byte            1 * \stride + 0,  2 * \stride - 1
+// Repeated, to avoid masking the index with & 7 (off2/off3 are read at dir+2 and dir+6)
+        .byte           -1 * \stride + 1, -2 * \stride + 2
+        .byte            0 * \stride + 1, -1 * \stride + 2
+        .byte            0 * \stride + 1,  0 * \stride + 2
+        .byte            0 * \stride + 1,  1 * \stride + 2
+        .byte            1 * \stride + 1,  2 * \stride + 2
+        .byte            1 * \stride + 0,  2 * \stride + 1
+endconst
+.endm
+
+dir_table 8, 16
+dir_table 4, 8
+
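+// Primary taps: {4, 2} for even pri_strength, {3, 3} for odd; cdef_filterX
+// below selects the pair via (pri_strength & 1).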
+const pri_taps
+        .byte           4, 2, 3, 3
+endconst
+
+.macro load_px d1, d2, w
+.if \w == 8
+        add             x6,  x2,  w9, sxtb #1       // x + off
+        sub             x9,  x2,  w9, sxtb #1       // x - off
+        ld1             {\d1\().8h}, [x6]           // p0
+        ld1             {\d2\().8h}, [x9]           // p1
+.else
+        add             x6,  x2,  w9, sxtb #1       // x + off
+        sub             x9,  x2,  w9, sxtb #1       // x - off
+        ld1             {\d1\().4h}, [x6]           // p0
+        add             x6,  x6,  #2*8              // += stride
+        ld1             {\d2\().4h}, [x9]           // p1
+        add             x9,  x9,  #2*8              // += stride
+        ld1             {\d1\().d}[1], [x6]         // p0
+        ld1             {\d2\().d}[1], [x9]         // p1
+.endif
+.endm
+.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
+        umin            v2.8h,   v2.8h,  \s1\().8h
+        smax            v3.8h,   v3.8h,  \s1\().8h
+        umin            v2.8h,   v2.8h,  \s2\().8h
+        smax            v3.8h,   v3.8h,  \s2\().8h
+
+        cbz             \threshold, 3f
+        uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
+        uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
+        ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
+        ushl            v21.8h, v20.8h, \shift      // abs(diff) >> shift
+        uqsub           v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift))
+        uqsub           v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
+        cmhi            v18.8h, v0.8h,  \s1\().8h   // px > p0
+        cmhi            v22.8h, v0.8h,  \s2\().8h   // px > p1
+        umin            v17.8h, v17.8h, v16.8h      // imin(abs(diff), imax())
+        umin            v21.8h, v21.8h, v20.8h      // imin(abs(diff), imax())
+        dup             v19.8h, \tap                // taps[k]
+        neg             v16.8h, v17.8h              // -imin()
+        neg             v20.8h, v21.8h              // -imin()
+        bsl             v18.16b, v16.16b, v17.16b   // constrain() = apply_sign()
+        bsl             v22.16b, v20.16b, v21.16b   // constrain() = apply_sign()
+        mla             v1.8h,  v18.8h, v19.8h      // sum += taps[k] * constrain()
+        mla             v1.8h,  v22.8h, v19.8h      // sum += taps[k] * constrain()
+3:
+.endm
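+
+// For reference, handle_pixel mirrors the constrain() step spelled out in its
+// inline comments; an illustrative C sketch (not the project's reference
+// implementation) would be:
+//
+//     static inline int constrain(int diff, int threshold, int shift) {
+//         const int adiff = diff < 0 ? -diff : diff;   // abs(diff)
+//         int m = threshold - (adiff >> shift);
+//         if (m < 0) m = 0;                            // imax(0, ...)
+//         if (m > adiff) m = adiff;                    // imin(abs(diff), imax())
+//         return diff < 0 ? -m : m;                    // apply_sign()
+//     }
+//
+// Each tap then adds taps[k] * constrain(p - px, strength, shift) to the sum,
+// as in the mla lines above.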
+
+// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
+//                              const uint16_t *tmp, int pri_strength,
+//                              int sec_strength, int dir, int damping, int h);
+.macro filter w
+function cdef_filter\w\()_neon, export=1
+        movrel          x8,  pri_taps
+        and             w9,  w3,  #1
+        add             x8,  x8,  w9, uxtw #1
+        movrel          x9,  directions\w
+        add             x5,  x9,  w5, uxtw #1
+        movi            v30.8h,   #15
+        dup             v28.8h,   w6                // damping
+
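+        // ulog2(threshold) is computed as 15 - clz(threshold) on 16-bit lanes;
+        // the saturating uqsub then yields imax(0, damping - ulog2(threshold)).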
+        dup             v25.8h, w3                  // threshold
+        dup             v27.8h, w4                  // threshold
+        clz             v24.8h, v25.8h              // clz(threshold)
+        clz             v26.8h, v27.8h              // clz(threshold)
+        sub             v24.8h, v30.8h, v24.8h      // ulog2(threshold)
+        sub             v26.8h, v30.8h, v26.8h      // ulog2(threshold)
+        uqsub           v24.8h, v28.8h, v24.8h      // shift = imax(0, damping - ulog2(threshold))
+        uqsub           v26.8h, v28.8h, v26.8h      // shift = imax(0, damping - ulog2(threshold))
+        neg             v24.8h, v24.8h              // -shift
+        neg             v26.8h, v26.8h              // -shift
+
+1:
+.if \w == 8
+        ld1             {v0.8h}, [x2]               // px
+.else
+        add             x12, x2,  #2*8
+        ld1             {v0.4h},   [x2]             // px
+        ld1             {v0.d}[1], [x12]            // px
+.endif
+
+        movi            v1.8h,  #0                  // sum
+        mov             v2.16b, v0.16b              // min
+        mov             v3.16b, v0.16b              // max
+
+        // Instead of loading sec_taps 2, 1 from memory, just set it
+        // to 2 initially and decrease for the second round.
+        mov             w11, #2                     // sec_taps[0]
+
+2:
+        ldrb            w9,  [x5]                   // off1
+
+        load_px         v4,  v5, \w
+
+        add             x5,  x5,  #4                // +2*2
+        ldrb            w9,  [x5]                   // off2
+        load_px         v6,  v7,  \w
+
+        ldrb            w10, [x8]                   // *pri_taps
+
+        handle_pixel    v4,  v5,  w3,  v25.8h, v24.8h, w10
+
+        add             x5,  x5,  #8                // +2*4
+        ldrb            w9,  [x5]                   // off3
+        load_px         v4,  v5,  \w
+
+        handle_pixel    v6,  v7,  w4,  v27.8h, v26.8h, w11
+
+        handle_pixel    v4,  v5,  w4,  v27.8h, v26.8h, w11
+
+        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
+        subs            w11, w11, #1                // sec_tap-- (value)
+        add             x8,  x8,  #1                // pri_taps++ (pointer)
+        b.ne            2b
+
+        sshr            v4.8h,  v1.8h,  #15         // -(sum < 0)
+        add             v1.8h,  v1.8h,  v4.8h       // sum - (sum < 0)
+        srshr           v1.8h,  v1.8h,  #4          // (8 + sum - (sum < 0)) >> 4
+        add             v0.8h,  v0.8h,  v1.8h       // px + (8 + sum ...) >> 4
+        smin            v0.8h,  v0.8h,  v3.8h
+        smax            v0.8h,  v0.8h,  v2.8h       // iclip(px + .., min, max)
+        xtn             v0.8b,  v0.8h
+.if \w == 8
+        add             x2,  x2,  #2*16             // tmp += tmp_stride
+        subs            w7,  w7,  #1                // h--
+        st1             {v0.8b}, [x0], x1
+.else
+        st1             {v0.s}[0], [x0], x1
+        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
+        subs            w7,  w7,  #2                // h -= 2
+        st1             {v0.s}[1], [x0], x1
+.endif
+
+        // Reset the pri_taps and directions pointers back to the original point
+        sub             x5,  x5,  #2
+        sub             x8,  x8,  #2
+
+        b.gt            1b
+        ret
+endfunc
+.endm
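+
+// As the comments in the macro above indicate, the accumulated sum is folded
+// back into the pixel roughly as
+//     dst = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max)
+// where min/max track the smallest/largest of px and the sampled taps.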
+
+filter 8
+filter 4
+
+const div_table
+        .short         840, 420, 280, 210, 168, 140, 120, 105
+endconst
+
+const alt_fact
+        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
+endconst
+
+// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
+//                              unsigned *const var)
+function cdef_find_dir_neon, export=1
+        sub             sp,  sp,  #32 // cost
+        mov             w3,  #8
+        movi            v31.16b, #128
+        movi            v30.16b, #0
+        movi            v1.8h,   #0 // v0-v1 sum_diag[0]
+        movi            v3.8h,   #0 // v2-v3 sum_diag[1]
+        movi            v5.8h,   #0 // v4-v5 sum_hv[0-1]
+        movi            v7.8h,   #0 // v6-v7 sum_alt[0]
+        movi            v17.8h,  #0 // v16-v17 sum_alt[1]
+        movi            v18.8h,  #0 // v18-v19 sum_alt[2]
+        movi            v19.8h,  #0
+        movi            v21.8h,  #0 // v20-v21 sum_alt[3]
+
+.irpc i, 01234567
+        ld1             {v26.8b}, [x0], x1
+        usubl           v26.8h,  v26.8b, v31.8b
+
+        addv            h25,     v26.8h               // [y]
+        rev64           v27.8h,  v26.8h
+        addp            v28.8h,  v26.8h,  v30.8h      // [(x >> 1)]
+        add             v5.8h,   v5.8h,   v26.8h      // sum_hv[1]
+        ext             v27.16b, v27.16b, v27.16b, #8 // [-x]
+        rev64           v29.4h,  v28.4h               // [-(x >> 1)]
+        ins             v4.h[\i], v25.h[0]            // sum_hv[0]
+
+.if \i == 0
+        mov             v0.16b,  v26.16b              // sum_diag[0]
+        mov             v2.16b,  v27.16b              // sum_diag[1]
+        mov             v6.16b,  v28.16b              // sum_alt[0]
+        mov             v16.16b, v29.16b              // sum_alt[1]
+.else
+        ext             v22.16b, v30.16b, v26.16b, #(16-2*\i)
+        ext             v23.16b, v26.16b, v30.16b, #(16-2*\i)
+        ext             v24.16b, v30.16b, v27.16b, #(16-2*\i)
+        ext             v25.16b, v27.16b, v30.16b, #(16-2*\i)
+        add             v0.8h,   v0.8h,   v22.8h      // sum_diag[0]
+        add             v1.8h,   v1.8h,   v23.8h      // sum_diag[0]
+        add             v2.8h,   v2.8h,   v24.8h      // sum_diag[1]
+        add             v3.8h,   v3.8h,   v25.8h      // sum_diag[1]
+        ext             v22.16b, v30.16b, v28.16b, #(16-2*\i)
+        ext             v23.16b, v28.16b, v30.16b, #(16-2*\i)
+        ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
+        ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
+        add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
+        add             v7.8h,   v7.8h,   v23.8h      // sum_alt[0]
+        add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
+        add             v17.8h,  v17.8h,  v25.8h      // sum_alt[1]
+.endif
+.if \i < 6
+        ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
+        ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
+        add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
+        add             v19.8h,  v19.8h,  v23.8h      // sum_alt[2]
+.else
+        add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
+.endif
+.if \i == 0
+        mov             v20.16b, v26.16b              // sum_alt[3]
+.elseif \i == 1
+        add             v20.8h,  v20.8h,  v26.8h      // sum_alt[3]
+.else
+        ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
+        ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
+        add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
+        add             v21.8h,  v21.8h,  v25.8h      // sum_alt[3]
+.endif
+.endr
+
+        movi            v31.4s,  #105
+
+        smull           v26.4s,  v4.4h,   v4.4h       // sum_hv[0]*sum_hv[0]
+        smlal2          v26.4s,  v4.8h,   v4.8h
+        smull           v27.4s,  v5.4h,   v5.4h       // sum_hv[1]*sum_hv[1]
+        smlal2          v27.4s,  v5.8h,   v5.8h
+        mul             v26.4s,  v26.4s,  v31.4s      // cost[2] *= 105
+        mul             v27.4s,  v27.4s,  v31.4s      // cost[6] *= 105
+        addv            s4,  v26.4s                   // cost[2]
+        addv            s5,  v27.4s                   // cost[6]
+
+        rev64           v1.8h,   v1.8h
+        rev64           v3.8h,   v3.8h
+        ext             v1.16b,  v1.16b,  v1.16b, #8  // sum_diag[0][15-n]
+        ext             v3.16b,  v3.16b,  v3.16b, #8  // sum_diag[1][15-n]
+        ext             v1.16b,  v1.16b,  v1.16b, #2  // sum_diag[0][14-n]
+        ext             v3.16b,  v3.16b,  v3.16b, #2  // sum_diag[1][14-n]
+
+        str             s4,  [sp, #2*4]               // cost[2]
+        str             s5,  [sp, #6*4]               // cost[6]
+
+        movrel          x4,  div_table
+        ld1             {v31.8h}, [x4]
+
+        smull           v22.4s,  v0.4h,   v0.4h       // sum_diag[0]*sum_diag[0]
+        smull2          v23.4s,  v0.8h,   v0.8h
+        smlal           v22.4s,  v1.4h,   v1.4h
+        smlal2          v23.4s,  v1.8h,   v1.8h
+        smull           v24.4s,  v2.4h,   v2.4h       // sum_diag[1]*sum_diag[1]
+        smull2          v25.4s,  v2.8h,   v2.8h
+        smlal           v24.4s,  v3.4h,   v3.4h
+        smlal2          v25.4s,  v3.8h,   v3.8h
+        uxtl            v30.4s,  v31.4h               // div_table
+        uxtl2           v31.4s,  v31.8h
+        mul             v22.4s,  v22.4s,  v30.4s      // cost[0]
+        mla             v22.4s,  v23.4s,  v31.4s      // cost[0]
+        mul             v24.4s,  v24.4s,  v30.4s      // cost[4]
+        mla             v24.4s,  v25.4s,  v31.4s      // cost[4]
+        addv            s0,  v22.4s                   // cost[0]
+        addv            s2,  v24.4s                   // cost[4]
+
+        movrel          x5,  alt_fact
+        ld1             {v29.4h, v30.4h, v31.4h}, [x5]  // div_table[2*m+1] + 105
+
+        str             s0,  [sp, #0*4]               // cost[0]
+        str             s2,  [sp, #4*4]               // cost[4]
+
+        uxtl            v29.4s,  v29.4h               // div_table[2*m+1] + 105
+        uxtl            v30.4s,  v30.4h
+        uxtl            v31.4s,  v31.4h
+
+.macro cost_alt d1, d2, s1, s2, s3, s4
+        smull           v22.4s,  \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
+        smull2          v23.4s,  \s1\().8h, \s1\().8h
+        smull           v24.4s,  \s2\().4h, \s2\().4h
+        smull           v25.4s,  \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
+        smull2          v26.4s,  \s3\().8h, \s3\().8h
+        smull           v27.4s,  \s4\().4h, \s4\().4h
+        mul             v22.4s,  v22.4s,  v29.4s      // sum_alt[n]^2*fact
+        mla             v22.4s,  v23.4s,  v30.4s
+        mla             v22.4s,  v24.4s,  v31.4s
+        mul             v25.4s,  v25.4s,  v29.4s      // sum_alt[n]^2*fact
+        mla             v25.4s,  v26.4s,  v30.4s
+        mla             v25.4s,  v27.4s,  v31.4s
+        addv            \d1, v22.4s                   // *cost_ptr
+        addv            \d2, v25.4s                   // *cost_ptr
+.endm
+        cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
+        str             s6,  [sp, #1*4]               // cost[1]
+        str             s16, [sp, #3*4]               // cost[3]
+        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
+        str             s18, [sp, #5*4]               // cost[5]
+        str             s20, [sp, #7*4]               // cost[7]
+
+        mov             w0,  #0                       // best_dir
+        mov             w1,  v0.s[0]                  // best_cost
+        mov             w3,  #1                       // n
+
+        mov             w4,  v6.s[0]
+
+.macro find_best s1, s2, s3
+.ifnb \s2
+        mov             w5,  \s2\().s[0]
+.endif
+        cmp             w4,  w1                       // cost[n] > best_cost
+        csel            w0,  w3,  w0,  gt             // best_dir = n
+        csel            w1,  w4,  w1,  gt             // best_cost = cost[n]
+.ifnb \s2
+        add             w3,  w3,  #1                  // n++
+        cmp             w5,  w1                       // cost[n] > best_cost
+        mov             w4,  \s3\().s[0]
+        csel            w0,  w3,  w0,  gt             // best_dir = n
+        csel            w1,  w5,  w1,  gt             // best_cost = cost[n]
+        add             w3,  w3,  #1                  // n++
+.endif
+.endm
+        find_best       v6,  v4, v16
+        find_best       v16, v2, v18
+        find_best       v18, v5, v20
+        find_best       v20
+
+        eor             w3,  w0,  #4                  // best_dir ^4
+        ldr             w4,  [sp, w3, uxtw #2]
+        sub             w1,  w1,  w4                  // best_cost - cost[best_dir ^ 4]
+        lsr             w1,  w1,  #10
+        str             w1,  [x2]                     // *var
+
+        add             sp,  sp,  #32
+        ret
+endfunc
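+
+// In C terms, the direction/variance selection above amounts to roughly
+// (variable names illustrative):
+//
+//     int best_dir = 0, best_cost = cost[0];
+//     for (int n = 1; n < 8; n++)
+//         if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
+//     *var = (best_cost - cost[best_dir ^ 4]) >> 10;
+//     return best_dir;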
--- a/third_party/dav1d/src/arm/64/looprestoration.S
+++ b/third_party/dav1d/src/arm/64/looprestoration.S
@@ -47,17 +47,17 @@ function wiener_filter_h_neon, export=1
         ins             v0.h[7], wzr
 
         // Set up pointers for reading/writing alternate rows
         add             x12, x0,  x10
         lsl             w10, w10, #1
         add             x13, x2,  x3
         lsl             x3,  x3,  #1
 
-        // Subtract the width from mid_strid3
+        // Subtract the width from mid_stride
         sub             x10, x10, w5, uxtw #1
 
         // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
         cmp             w5,  #8
         add             w11, w5,  #13
         bic             w11, w11, #7
         b.ge            1f
         mov             w11, #16
@@ -219,41 +219,35 @@ 7:      // 1 <= w < 5, 4-7 pixels valid 
         sub             w9,  w5,  #1
         // w9 = (pixels valid - 4)
         adr             x11, L(variable_shift_tbl)
         ldrh            w9,  [x11, w9, uxtw #1]
         sub             x11, x11, w9, uxth
         mov             v3.16b,  v28.16b
         mov             v5.16b,  v29.16b
         br              x11
+44:     // 4 pixels valid in v2/v4, fill the high half with padding.
+        ins             v2.d[1], v3.d[0]
+        ins             v4.d[1], v5.d[0]
+        b               88f
         // Shift v2 right, shifting out invalid pixels,
         // shift v2 left to the original offset, shifting in padding pixels.
-44:     // 4 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #8
-        ext             v2.16b,  v2.16b,  v3.16b,  #8
-        ext             v4.16b,  v4.16b,  v4.16b,  #8
-        ext             v4.16b,  v4.16b,  v5.16b,  #8
-        b               88f
 55:     // 5 pixels valid
         ext             v2.16b,  v2.16b,  v2.16b,  #10
         ext             v2.16b,  v2.16b,  v3.16b,  #6
         ext             v4.16b,  v4.16b,  v4.16b,  #10
         ext             v4.16b,  v4.16b,  v5.16b,  #6
         b               88f
-66:     // 6 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #12
-        ext             v2.16b,  v2.16b,  v3.16b,  #4
-        ext             v4.16b,  v4.16b,  v4.16b,  #12
-        ext             v4.16b,  v4.16b,  v5.16b,  #4
+66:     // 6 pixels valid, fill the upper 2 pixels with padding.
+        ins             v2.s[3], v3.s[0]
+        ins             v4.s[3], v5.s[0]
         b               88f
-77:     // 7 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #14
-        ext             v2.16b,  v2.16b,  v3.16b,  #2
-        ext             v4.16b,  v4.16b,  v4.16b,  #14
-        ext             v4.16b,  v4.16b,  v5.16b,  #2
+77:     // 7 pixels valid, fill the last pixel with padding.
+        ins             v2.h[7], v3.h[0]
+        ins             v4.h[7], v5.h[0]
         b               88f
 
 L(variable_shift_tbl):
         .hword L(variable_shift_tbl) - 44b
         .hword L(variable_shift_tbl) - 55b
         .hword L(variable_shift_tbl) - 66b
         .hword L(variable_shift_tbl) - 77b
 
@@ -277,29 +271,25 @@ 88:
         ext             v4.16b,  v4.16b,  v5.16b, #8
         b.eq            9f
 888:    // 1 <= w < 4, filter 1 pixel at a time
         mul             v6.8h,   v2.8h,   v0.8h
         mul             v7.8h,   v4.8h,   v0.8h
         addv            h6,      v6.8h
         addv            h7,      v7.8h
         dup             v16.4h,  v2.h[3]
-        dup             v17.4h,  v4.h[3]
+        ins             v16.h[1], v4.h[3]
+        ins             v6.h[1], v7.h[0]
         shl             v16.4h,  v16.4h,  #7
-        shl             v17.4h,  v17.4h,  #7
         sub             v16.4h,  v16.4h,  v30.4h
-        sub             v17.4h,  v17.4h,  v30.4h
         sqadd           v6.4h,   v6.4h,   v16.4h
-        sqadd           v7.4h,   v7.4h,   v17.4h
         sshr            v6.4h,   v6.4h,   #3
-        sshr            v7.4h,   v7.4h,   #3
         add             v6.4h,   v6.4h,   v31.4h
-        add             v7.4h,   v7.4h,   v31.4h
         st1             {v6.h}[0], [x0],  #2
-        st1             {v7.h}[0], [x12], #2
+        st1             {v6.h}[1], [x12], #2
         subs            w5,  w5,  #1
         ext             v2.16b,  v2.16b,  v3.16b,  #2
         ext             v4.16b,  v4.16b,  v5.16b,  #2
         b.gt            888b
 
 9:
         subs            w6,  w6,  #2
         b.le            0f
@@ -317,18 +307,17 @@ endfunc
 
 // void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
 //                                 const int16_t *mid, int w, int h,
 //                                 const int16_t fv[7], enum LrEdgeFlags edges,
 //                                 ptrdiff_t mid_stride);
 function wiener_filter_v_neon, export=1
         mov             w8,  w4
         ld1             {v0.8h},  [x5]
-        mov             w9,  #128
-        dup             v1.8h, w9
+        movi            v1.8h, #128
         add             v1.8h,  v1.8h,  v0.8h
 
         // Calculate the number of rows to move back when looping vertically
         mov             w11, w4
         tst             w6,  #4 // LR_HAVE_TOP
         b.eq            0f
         sub             x2,  x2,  x7,  lsl #1
         add             w11, w11, #2
@@ -432,17 +421,17 @@ 6:
         cmp             w4,  #2
         b.gt            63f // 3 rows in total
         b.eq            62f // 2 rows in total
 61:     // 1 row in total, v19 already loaded, pad that into v20-v22.
         mov             v20.16b,  v19.16b
         mov             v21.16b,  v19.16b
         mov             v22.16b,  v19.16b
         b               8f
-62:     // 2 rows in total, v19 already loaded, load v20 and pad that into v20-v23.
+62:     // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23.
         ld1             {v20.8h}, [x2], x7
         mov             v21.16b,  v20.16b
         mov             v22.16b,  v20.16b
         mov             v23.16b,  v20.16b
         b               8f
 63:
         // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
         ld1             {v20.8h}, [x2], x7
@@ -498,61 +487,61 @@ function copy_narrow_neon, export=1
         adr             x5,  L(copy_narrow_tbl)
         ldrh            w6,  [x5, w3, uxtw #1]
         sub             x5,  x5,  w6, uxth
         br              x5
 10:
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 18:
-        cmp             w4,  #8
+        subs            w4,  w4,  #8
         b.lt            110f
-        subs            w4,  w4,  #8
         ld1             {v0.8b}, [x2], #8
         st1             {v0.b}[0], [x0], x1
         st1             {v0.b}[1], [x7], x1
         st1             {v0.b}[2], [x0], x1
         st1             {v0.b}[3], [x7], x1
         st1             {v0.b}[4], [x0], x1
         st1             {v0.b}[5], [x7], x1
         st1             {v0.b}[6], [x0], x1
         st1             {v0.b}[7], [x7], x1
         b.le            0f
         b               18b
 110:
+        add             w4,  w4,  #8
         asr             x1,  x1,  #1
 11:
         subs            w4,  w4,  #1
         ld1             {v0.b}[0], [x2], #1
         st1             {v0.b}[0], [x0], x1
-        b.ge            11b
+        b.gt            11b
 0:
         ret
 
 20:
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 24:
-        cmp             w4,  #4
+        subs            w4,  w4,  #4
         b.lt            210f
-        subs            w4,  w4,  #4
         ld1             {v0.4h}, [x2], #8
         st1             {v0.h}[0], [x0], x1
         st1             {v0.h}[1], [x7], x1
         st1             {v0.h}[2], [x0], x1
         st1             {v0.h}[3], [x7], x1
         b.le            0f
         b               24b
 210:
+        add             w4,  w4,  #4
         asr             x1,  x1,  #1
 22:
         subs            w4,  w4,  #1
         ld1             {v0.h}[0], [x2], #2
         st1             {v0.h}[0], [x0], x1
-        b.ge            22b
+        b.gt            22b
 0:
         ret
 
 30:
         ldrh            w5,  [x2]
         ldrb            w6,  [x2, #2]
         add             x2,  x2,  #3
         subs            w4,  w4,  #1
@@ -561,19 +550,18 @@ 30:
         add             x0,  x0,  x1
         b.gt            30b
         ret
 
 40:
         add             x7,  x0,  x1
         lsl             x1,  x1,  #1
 42:
-        cmp             w4,  #2
+        subs            w4,  w4,  #2
         b.lt            41f
-        subs            w4,  w4,  #2
         ld1             {v0.2s}, [x2], #8
         st1             {v0.s}[0], [x0], x1
         st1             {v0.s}[1], [x7], x1
         b.le            0f
         b               42b
 41:
         ld1             {v0.s}[0], [x2]
         st1             {v0.s}[0], [x0]
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -541,68 +541,71 @@ endfunc
 .endif
 .endm
 .macro mul_mla_4 d, s0, s1, s2, s3, wd
         mul             \d\wd,  \s0\wd,  v0.h[0]
         mla             \d\wd,  \s1\wd,  v0.h[1]
         mla             \d\wd,  \s2\wd,  v0.h[2]
         mla             \d\wd,  \s3\wd,  v0.h[3]
 .endm
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, thus keeping mul/mla tightly
+// chained like this.
 .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
         mul             \d0\().8h, \s0\().8h, v0.h[0]
+        mla             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mla             \d0\().8h, \s7\().8h, v0.h[7]
         mul             \d1\().8h, \s1\().8h, v0.h[0]
-        mla             \d0\().8h, \s1\().8h, v0.h[1]
         mla             \d1\().8h, \s2\().8h, v0.h[1]
-        mla             \d0\().8h, \s2\().8h, v0.h[2]
         mla             \d1\().8h, \s3\().8h, v0.h[2]
-        mla             \d0\().8h, \s3\().8h, v0.h[3]
         mla             \d1\().8h, \s4\().8h, v0.h[3]
-        mla             \d0\().8h, \s4\().8h, v0.h[4]
         mla             \d1\().8h, \s5\().8h, v0.h[4]
-        mla             \d0\().8h, \s5\().8h, v0.h[5]
         mla             \d1\().8h, \s6\().8h, v0.h[5]
-        mla             \d0\().8h, \s6\().8h, v0.h[6]
         mla             \d1\().8h, \s7\().8h, v0.h[6]
-        mla             \d0\().8h, \s7\().8h, v0.h[7]
         mla             \d1\().8h, \s8\().8h, v0.h[7]
 .endm
 .macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
         mul             \d0\().8h, \s0\().8h, v0.h[0]
+        mla             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mla             \d0\().8h, \s7\().8h, v0.h[7]
         mul             \d1\().8h, \s2\().8h, v0.h[0]
-        mla             \d0\().8h, \s1\().8h, v0.h[1]
         mla             \d1\().8h, \s3\().8h, v0.h[1]
-        mla             \d0\().8h, \s2\().8h, v0.h[2]
         mla             \d1\().8h, \s4\().8h, v0.h[2]
-        mla             \d0\().8h, \s3\().8h, v0.h[3]
         mla             \d1\().8h, \s5\().8h, v0.h[3]
-        mla             \d0\().8h, \s4\().8h, v0.h[4]
         mla             \d1\().8h, \s6\().8h, v0.h[4]
-        mla             \d0\().8h, \s5\().8h, v0.h[5]
         mla             \d1\().8h, \s7\().8h, v0.h[5]
-        mla             \d0\().8h, \s6\().8h, v0.h[6]
         mla             \d1\().8h, \s8\().8h, v0.h[6]
-        mla             \d0\().8h, \s7\().8h, v0.h[7]
         mla             \d1\().8h, \s9\().8h, v0.h[7]
 .endm
 .macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
         mul             \d0\().8h, \s0\().8h,  v0.h[0]
+        mla             \d0\().8h, \s1\().8h,  v0.h[1]
+        mla             \d0\().8h, \s2\().8h,  v0.h[2]
+        mla             \d0\().8h, \s3\().8h,  v0.h[3]
+        mla             \d0\().8h, \s4\().8h,  v0.h[4]
+        mla             \d0\().8h, \s5\().8h,  v0.h[5]
+        mla             \d0\().8h, \s6\().8h,  v0.h[6]
+        mla             \d0\().8h, \s7\().8h,  v0.h[7]
         mul             \d1\().8h, \s4\().8h,  v0.h[0]
-        mla             \d0\().8h, \s1\().8h,  v0.h[1]
         mla             \d1\().8h, \s5\().8h,  v0.h[1]
-        mla             \d0\().8h, \s2\().8h,  v0.h[2]
         mla             \d1\().8h, \s6\().8h,  v0.h[2]
-        mla             \d0\().8h, \s3\().8h,  v0.h[3]
         mla             \d1\().8h, \s7\().8h,  v0.h[3]
-        mla             \d0\().8h, \s4\().8h,  v0.h[4]
         mla             \d1\().8h, \s8\().8h,  v0.h[4]
-        mla             \d0\().8h, \s5\().8h,  v0.h[5]
         mla             \d1\().8h, \s9\().8h,  v0.h[5]
-        mla             \d0\().8h, \s6\().8h,  v0.h[6]
         mla             \d1\().8h, \s10\().8h, v0.h[6]
-        mla             \d0\().8h, \s7\().8h,  v0.h[7]
         mla             \d1\().8h, \s11\().8h, v0.h[7]
 .endm
 .macro sqrshrun_b shift, r0, r1, r2, r3
         sqrshrun        \r0\().8b, \r0\().8h,  #\shift
 .ifnb \r1
         sqrshrun        \r1\().8b, \r1\().8h,  #\shift
 .endif
 .ifnb \r2
@@ -623,39 +626,39 @@ endfunc
 .macro st_h strd, reg, lanes
         st1             {\reg\().h}[0], [x0], \strd
         st1             {\reg\().h}[1], [x8], \strd
 .if \lanes > 2
         st1             {\reg\().h}[2], [x0], \strd
         st1             {\reg\().h}[3], [x8], \strd
 .endif
 .endm
-.macro st_s strd, r0, r1, r2, r3
+.macro st_s strd, r0, r1
         st1             {\r0\().s}[0], [x0], \strd
         st1             {\r0\().s}[1], [x8], \strd
 .ifnb \r1
         st1             {\r1\().s}[0], [x0], \strd
         st1             {\r1\().s}[1], [x8], \strd
 .endif
 .endm
-.macro st_d strd, r0, r1, r2, r3
+.macro st_d strd, r0, r1
         st1             {\r0\().d}[0], [x0], \strd
         st1             {\r0\().d}[1], [x8], \strd
 .ifnb \r1
         st1             {\r1\().d}[0], [x0], \strd
         st1             {\r1\().d}[1], [x8], \strd
 .endif
 .endm
-.macro shift_store_4 type, strd, r0, r1, r2, r3
+.macro shift_store_4 type, strd, r0, r1
 .ifc \type, put
-        sqrshrun_b      6,     \r0, \r1, \r2, \r3
-        st_s            \strd, \r0, \r1, \r2, \r3
+        sqrshrun_b      6,     \r0, \r1
+        st_s            \strd, \r0, \r1
 .else
-        srshr_h         2,     \r0, \r1, \r2, \r3
-        st_d            \strd, \r0, \r1, \r2, \r3
+        srshr_h         2,     \r0, \r1
+        st_d            \strd, \r0, \r1
 .endif
 .endm
 .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
         st1             {\r0\wd}, [x0], \strd
         st1             {\r1\wd}, [x8], \strd
 .ifnb \r2
         st1             {\r2\wd}, [x0], \strd
         st1             {\r3\wd}, [x8], \strd
@@ -737,17 +740,17 @@ function \type\()_8tap
         movrel          x10, X(mc_subpel_filters), -8
         b.ne            L(\type\()_8tap_h)
         tst             \my, #(0x7f << 14)
         b.ne            L(\type\()_8tap_v)
         b               \type
 
 L(\type\()_8tap_h):
         cmp             \w,  #4
-        ubfm            w9,  \mx, #7, #13
+        ubfx            w9,  \mx, #7, #7
         and             \mx, \mx, #0x7f
         b.le            4f
         mov             \mx,  w9
 4:
         tst             \my,  #(0x7f << 14)
         add             \xmx, x10, \mx, uxtw #3
         b.ne            L(\type\()_8tap_hv)
 
@@ -960,17 +963,17 @@ L(\type\()_8tap_h_tbl):
         .hword L(\type\()_8tap_h_tbl) -   80b
         .hword L(\type\()_8tap_h_tbl) -   40b
         .hword L(\type\()_8tap_h_tbl) -   20b
         .hword 0
 
 
 L(\type\()_8tap_v):
         cmp             \h,  #4
-        ubfm            w9,  \my, #7, #13
+        ubfx            w9,  \my, #7, #7
         and             \my, \my, #0x7f
         b.le            4f
         mov             \my, w9
 4:
         add             \xmy, x10, \my, uxtw #3
 
         adr             x9,  L(\type\()_8tap_v_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
@@ -1211,17 +1214,17 @@ 9:
 .endif
         b               168b
 0:
         ret
 
 160:
         b.gt            1680b
 
-        // 16x4 v
+        // 16x2, 16x4 v
         add             \xmy, \xmy, #2
         ld1             {v0.s}[0], [\xmy]
         sub             \src, \src, \s_strd
         add             \ds2, \dst, \d_strd
         add             \sr2, \src, \s_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
         sxtl            v0.8h, v0.8b
@@ -1264,17 +1267,17 @@ L(\type\()_8tap_v_tbl):
         .hword L(\type\()_8tap_v_tbl) -  160b
         .hword L(\type\()_8tap_v_tbl) -   80b
         .hword L(\type\()_8tap_v_tbl) -   40b
         .hword L(\type\()_8tap_v_tbl) -   20b
         .hword 0
 
 L(\type\()_8tap_hv):
         cmp             \h,  #4
-        ubfm            w9,  \my, #7, #13
+        ubfx            w9,  \my, #7, #7
         and             \my, \my, #0x7f
         b.le            4f
         mov             \my,  w9
 4:
         add             \xmy,  x10, \my, uxtw #3
 
         adr             x9,  L(\type\()_8tap_hv_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
@@ -1299,45 +1302,42 @@ 20:
         sxtl            v1.8h,  v1.8b
         mov             x15, x30
 
         ld1             {v28.8b}, [\src], \s_strd
         uxtl            v28.8h,  v28.8b
         ext             v29.16b, v28.16b, v28.16b, #2
         mul             v28.4h,  v28.4h,  v0.4h
         mul             v29.4h,  v29.4h,  v0.4h
-        addv            h28, v28.4h
-        addv            h29, v29.4h
-        trn1            v16.4h, v28.4h, v29.4h
-        srshr           v16.4h, v16.4h, #2
+        addp            v28.4h,  v28.4h,  v29.4h
+        addp            v16.4h,  v28.4h,  v28.4h
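+        // v16.h[0]/v16.h[1] now hold the horizontal filter sums for the two
+        // output pixels of this row.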
+        srshr           v16.4h,  v16.4h,  #2
         bl              L(\type\()_8tap_filter_2)
 
         trn1            v16.2s, v16.2s, v28.2s
-        trn1            v17.2s, v28.2s, v30.2s
-        mov             v18.8b, v30.8b
+        mov             v17.8b, v28.8b
 
 2:
         bl              L(\type\()_8tap_filter_2)
 
-        trn1            v18.2s, v18.2s, v28.2s
-        trn1            v19.2s, v28.2s, v30.2s
+        ext             v18.8b, v17.8b, v28.8b, #4
+        mov             v19.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
         smlal           v2.4s,  v19.4h, v1.h[3]
 
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqxtun          v2.8b,  v2.8h
         subs            \h,  \h,  #2
         st1             {v2.h}[0], [\dst], \d_strd
         st1             {v2.h}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.8b, v18.8b
         mov             v17.8b, v19.8b
-        mov             v18.8b, v30.8b
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
         ld1             {v1.8b},  [\xmy]
         sub             \src, \src, #1
         sub             \sr2, \src, \s_strd, lsl #1
         sub             \src, \sr2, \s_strd
         add             \ds2, \dst, \d_strd
@@ -1347,38 +1347,34 @@ 280:    // 2x8, 2x16, 2x32 hv
         sxtl            v1.8h,  v1.8b
         mov             x15, x30
 
         ld1             {v28.8b}, [\src], \s_strd
         uxtl            v28.8h,  v28.8b
         ext             v29.16b, v28.16b, v28.16b, #2
         mul             v28.4h,  v28.4h,  v0.4h
         mul             v29.4h,  v29.4h,  v0.4h
-        addv            h28, v28.4h
-        addv            h29, v29.4h
-        trn1            v16.4h, v28.4h, v29.4h
-        srshr           v16.4h, v16.4h, #2
+        addp            v28.4h,  v28.4h,  v29.4h
+        addp            v16.4h,  v28.4h,  v28.4h
+        srshr           v16.4h,  v16.4h,  #2
 
         bl              L(\type\()_8tap_filter_2)
         trn1            v16.2s, v16.2s, v28.2s
-        trn1            v17.2s, v28.2s, v30.2s
-        mov             v18.8b, v30.8b
+        mov             v17.8b, v28.8b
         bl              L(\type\()_8tap_filter_2)
-        trn1            v18.2s, v18.2s, v28.2s
-        trn1            v19.2s, v28.2s, v30.2s
-        mov             v20.8b, v30.8b
+        ext             v18.8b, v17.8b, v28.8b, #4
+        mov             v19.8b, v28.8b
         bl              L(\type\()_8tap_filter_2)
-        trn1            v20.2s, v20.2s, v28.2s
-        trn1            v21.2s, v28.2s, v30.2s
-        mov             v22.8b, v30.8b
+        ext             v20.8b, v19.8b, v28.8b, #4
+        mov             v21.8b, v28.8b
 
 28:
         bl              L(\type\()_8tap_filter_2)
-        trn1            v22.2s, v22.2s, v28.2s
-        trn1            v23.2s, v28.2s, v30.2s
+        ext             v22.8b, v21.8b, v28.8b, #4
+        mov             v23.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
         smlal           v2.4s,  v19.4h, v1.h[3]
         smlal           v2.4s,  v20.4h, v1.h[4]
         smlal           v2.4s,  v21.4h, v1.h[5]
         smlal           v2.4s,  v22.4h, v1.h[6]
         smlal           v2.4s,  v23.4h, v1.h[7]
@@ -1390,17 +1386,16 @@ 28:
         st1             {v2.h}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.8b, v18.8b
         mov             v17.8b, v19.8b
         mov             v18.8b, v20.8b
         mov             v19.8b, v21.8b
         mov             v20.8b, v22.8b
         mov             v21.8b, v23.8b
-        mov             v22.8b, v30.8b
         b               28b
 
 0:
         br              x15
 
 L(\type\()_8tap_filter_2):
         ld1             {v28.8b},  [\sr2], \s_strd
         ld1             {v30.8b},  [\src], \s_strd
@@ -1412,17 +1407,16 @@ L(\type\()_8tap_filter_2):
         trn2            v30.2s,  v28.2s,  v30.2s
         trn1            v28.2s,  v29.2s,  v31.2s
         trn2            v31.2s,  v29.2s,  v31.2s
         mul             v27.4h,  v27.4h,  v0.h[0]
         mla             v27.4h,  v28.4h,  v0.h[1]
         mla             v27.4h,  v30.4h,  v0.h[2]
         mla             v27.4h,  v31.4h,  v0.h[3]
         srshr           v28.4h,  v27.4h,  #2
-        trn2            v30.2s,  v28.2s,  v28.2s
         ret
 .endif
 
 40:
         add             \xmx, \xmx, #2
         ld1             {v0.s}[0],  [\xmx]
         b.gt            480f
         add             \xmy, \xmy,  #2
@@ -1448,24 +1442,27 @@ 40:
         mla             v31.4h,  v30.4h,  v0.h[3]
         srshr           v16.4h,  v31.4h,  #2
 
         bl              L(\type\()_8tap_filter_4)
         mov             v17.8b, v28.8b
         mov             v18.8b, v29.8b
 
 4:
+        bl              L(\type\()_8tap_filter_4)
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, thus keeping mul/mla tightly
+        // chained like this.
         smull           v2.4s,  v16.4h, v1.h[0]
-        bl              L(\type\()_8tap_filter_4)
-        smull           v3.4s,  v17.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal           v2.4s,  v28.4h, v1.h[3]
+        smull           v3.4s,  v17.4h, v1.h[0]
         smlal           v3.4s,  v18.4h, v1.h[1]
-        smlal           v2.4s,  v18.4h, v1.h[2]
         smlal           v3.4s,  v28.4h, v1.h[2]
-        smlal           v2.4s,  v28.4h, v1.h[3]
         smlal           v3.4s,  v29.4h, v1.h[3]
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqrshrn         v3.4h,  v3.4s,  #\shift_hv
         subs            \h,  \h,  #2
 .ifc \type, put
         sqxtun          v2.8b,  v2.8h
         sqxtun          v3.8b,  v3.8h
         st1             {v2.s}[0], [\dst], \d_strd
@@ -1509,32 +1506,32 @@ 480:    // 4x8, 4x16, 4x32 hv
         bl              L(\type\()_8tap_filter_4)
         mov             v19.8b, v28.8b
         mov             v20.8b, v29.8b
         bl              L(\type\()_8tap_filter_4)
         mov             v21.8b, v28.8b
         mov             v22.8b, v29.8b
 
 48:
+        bl              L(\type\()_8tap_filter_4)
         smull           v2.4s,  v16.4h, v1.h[0]
-        bl              L(\type\()_8tap_filter_4)
-        smull           v3.4s,  v17.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal           v3.4s,  v18.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v3.4s,  v19.4h, v1.h[2]
         smlal           v2.4s,  v19.4h, v1.h[3]
-        smlal           v3.4s,  v20.4h, v1.h[3]
         smlal           v2.4s,  v20.4h, v1.h[4]
-        smlal           v3.4s,  v21.4h, v1.h[4]
         smlal           v2.4s,  v21.4h, v1.h[5]
+        smlal           v2.4s,  v22.4h, v1.h[6]
+        smlal           v2.4s,  v28.4h, v1.h[7]
+        smull           v3.4s,  v17.4h, v1.h[0]
+        smlal           v3.4s,  v18.4h, v1.h[1]
+        smlal           v3.4s,  v19.4h, v1.h[2]
+        smlal           v3.4s,  v20.4h, v1.h[3]
+        smlal           v3.4s,  v21.4h, v1.h[4]
         smlal           v3.4s,  v22.4h, v1.h[5]
-        smlal           v2.4s,  v22.4h, v1.h[6]
         smlal           v3.4s,  v28.4h, v1.h[6]
-        smlal           v2.4s,  v28.4h, v1.h[7]
         smlal           v3.4s,  v29.4h, v1.h[7]
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqrshrn         v3.4h,  v3.4s,  #\shift_hv
         subs            \h,  \h,  #2
 .ifc \type, put
         sqxtun          v2.8b,  v2.8h
         sqxtun          v3.8b,  v3.8h
         st1             {v2.s}[0], [\dst], \d_strd
@@ -2326,8 +2323,196 @@ L(\type\()_bilin_hv_tbl):
         .hword L(\type\()_bilin_hv_tbl) -   40b
         .hword L(\type\()_bilin_hv_tbl) -   20b
         .hword 0
 endfunc
 .endm
 
 filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
 filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
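+// Load one row of the warp filter: reads the eight int8 coefficients at
+// mc_warp_filter[64 + (\src >> 10)] (x11 is pre-biased by 64*8 below) and
+// then advances \src by \inc.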
+.macro load_filter_row dst, src, inc
+        asr             w13, \src, #10
+        ldr             \dst, [x11, w13, sxtw #3]
+        add             \src, \src, \inc
+.endm
+
+function warp_filter_horz
+        add             w12, w5,  #512
+
+        ld1             {v16.8b, v17.8b}, [x2], x3
+
+        load_filter_row d0, w12, w7
+        load_filter_row d1, w12, w7
+        load_filter_row d2, w12, w7
+        sxtl            v0.8h,   v0.8b
+        load_filter_row d3, w12, w7
+        sxtl            v1.8h,   v1.8b
+        load_filter_row d4, w12, w7
+        sxtl            v2.8h,   v2.8b
+        load_filter_row d5, w12, w7
+        sxtl            v3.8h,   v3.8b
+        load_filter_row d6, w12, w7
+        sxtl            v4.8h,   v4.8b
+        load_filter_row d7, w12, w7
+        sxtl            v5.8h,   v5.8b
+        sxtl            v6.8h,   v6.8b
+        sxtl            v7.8h,   v7.8b
+
+        uxtl            v16.8h,  v16.8b
+        uxtl            v17.8h,  v17.8b
+
+        ext             v18.16b, v16.16b, v17.16b, #2*1
+        mul             v23.8h,  v16.8h,  v0.8h
+        ext             v19.16b, v16.16b, v17.16b, #2*2
+        mul             v18.8h,  v18.8h,  v1.8h
+        ext             v20.16b, v16.16b, v17.16b, #2*3
+        mul             v19.8h,  v19.8h,  v2.8h
+        ext             v21.16b, v16.16b, v17.16b, #2*4
+        saddlp          v23.4s,  v23.8h
+        mul             v20.8h,  v20.8h,  v3.8h
+        ext             v22.16b, v16.16b, v17.16b, #2*5
+        saddlp          v18.4s,  v18.8h
+        mul             v21.8h,  v21.8h,  v4.8h
+        saddlp          v19.4s,  v19.8h
+        mul             v22.8h,  v22.8h,  v5.8h
+        saddlp          v20.4s,  v20.8h
+        addv            s23,     v23.4s
+        saddlp          v21.4s,  v21.8h
+        addv            s18,     v18.4s
+        saddlp          v22.4s,  v22.8h
+        addv            s19,     v19.4s
+        trn1            v18.2s,  v23.2s,  v18.2s
+        addv            s20,     v20.4s
+        ext             v23.16b, v16.16b, v17.16b, #2*6
+        trn1            v19.2s,  v19.2s,  v20.2s
+        addv            s21,     v21.4s
+        mul             v23.8h,  v23.8h,  v6.8h
+        ext             v20.16b, v16.16b, v17.16b, #2*7
+        addv            s22,     v22.4s
+        mul             v20.8h,  v20.8h,  v7.8h
+        saddlp          v23.4s,  v23.8h
+        trn1            v21.2s,  v21.2s,  v22.2s
+        saddlp          v20.4s,  v20.8h
+        addv            s23,     v23.4s
+        addv            s20,     v20.4s
+        trn1            v20.2s,  v23.2s,  v20.2s
+        trn1            v18.2d,  v18.2d,  v19.2d
+        trn1            v20.2d,  v21.2d,  v20.2d
+
+        add             w5,  w5,  w8
+
+        rshrn           v16.4h,  v18.4s,  #3
+        rshrn2          v16.8h,  v20.4s,  #3
+
+        ret
+endfunc
+
+// void dav1d_warp_affine_8x8_8bpc_neon(
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *src, const ptrdiff_t src_stride,
+//         const int16_t *const abcd, int mx, int my)
+.macro warp t, shift
+function warp_affine_8x8\t\()_8bpc_neon, export=1
+        ldr             x4,  [x4]
+        ubfx            x7,  x4, #0,  #16
+        ubfx            x8,  x4, #16, #16
+        ubfx            x9,  x4, #32, #16
+        ubfx            x4,  x4, #48, #16
+        sxth            w7,  w7
+        sxth            w8,  w8
+        sxth            w9,  w9
+        sxth            w4,  w4
+        mov             w10, #8
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        sub             x2,  x2,  #3
+        movrel          x11, X(mc_warp_filter), 64*8
+        mov             x15, x30
+.ifnb \t
+        lsl             x1,  x1,  #1
+.endif
+
+        bl              warp_filter_horz
+        mov             v24.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v25.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v26.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v27.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v28.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v29.16b, v16.16b
+        bl              warp_filter_horz
+        mov             v30.16b, v16.16b
+
+1:
+        add             w14, w6,  #512
+        bl              warp_filter_horz
+        mov             v31.16b, v16.16b
+
+        load_filter_row d0, w14, w9
+        load_filter_row d1, w14, w9
+        load_filter_row d2, w14, w9
+        load_filter_row d3, w14, w9
+        load_filter_row d4, w14, w9
+        load_filter_row d5, w14, w9
+        load_filter_row d6, w14, w9
+        load_filter_row d7, w14, w9
+        transpose_8x8b  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+        sxtl            v0.8h,   v0.8b
+        sxtl            v1.8h,   v1.8b
+        sxtl            v2.8h,   v2.8b
+        sxtl            v3.8h,   v3.8b
+        sxtl            v4.8h,   v4.8b
+        sxtl            v5.8h,   v5.8b
+        sxtl            v6.8h,   v6.8b
+        sxtl            v7.8h,   v7.8b
+
+        // This ordering of smull/smlal/smull2/smlal2 is highly
+        // beneficial for Cortex A53 here.
+        smull           v16.4s,  v24.4h,  v0.4h
+        smlal           v16.4s,  v25.4h,  v1.4h
+        smlal           v16.4s,  v26.4h,  v2.4h
+        smlal           v16.4s,  v27.4h,  v3.4h
+        smlal           v16.4s,  v28.4h,  v4.4h
+        smlal           v16.4s,  v29.4h,  v5.4h
+        smlal           v16.4s,  v30.4h,  v6.4h
+        smlal           v16.4s,  v31.4h,  v7.4h
+        smull2          v17.4s,  v24.8h,  v0.8h
+        smlal2          v17.4s,  v25.8h,  v1.8h
+        smlal2          v17.4s,  v26.8h,  v2.8h
+        smlal2          v17.4s,  v27.8h,  v3.8h
+        smlal2          v17.4s,  v28.8h,  v4.8h
+        smlal2          v17.4s,  v29.8h,  v5.8h
+        smlal2          v17.4s,  v30.8h,  v6.8h
+        smlal2          v17.4s,  v31.8h,  v7.8h
+
+        mov             v24.16b, v25.16b
+        mov             v25.16b, v26.16b
+        sqrshrn         v16.4h,  v16.4s,  #\shift
+        mov             v26.16b, v27.16b
+        sqrshrn2        v16.8h,  v17.4s,  #\shift
+        mov             v27.16b, v28.16b
+        mov             v28.16b, v29.16b
+.ifb \t
+        sqxtun          v16.8b,  v16.8h
+.endif
+        mov             v29.16b, v30.16b
+        mov             v30.16b, v31.16b
+        subs            w10, w10, #1
+.ifnb \t
+        st1             {v16.8h}, [x0], x1
+.else
+        st1             {v16.8b}, [x0], x1
+.endif
+
+        add             w6,  w6,  w4
+        b.gt            1b
+
+        br              x15
+endfunc
+.endm
+
+warp  , 11
+warp t, 7
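
For orientation, a hedged C sketch of the filter selection that warp_filter_horz and the load_filter_row macro above implement. It assumes dav1d's dav1d_mc_warp_filter layout (one 8-tap row per index) and only reproduces the index arithmetic visible in the asm: the table base is pre-biased by 64 rows (movrel x11, X(mc_warp_filter), 64*8), each tap row uses ((pos + 512) >> 10) as a signed offset, pos advances by abcd[0] per column and mx by abcd[1] per call; the multiply/accumulate and rounding steps are omitted. Function and variable names below are local to the sketch.

/* Sketch only: index arithmetic of the horizontal warp pass (the full
 * function runs this for 15 intermediate rows; 8 shown here).  The
 * ">> 10" assumes an arithmetic shift, matching the asm's asr. */
#include <stdio.h>
#include <stdint.h>

static void warp_horz_filter_rows(int mx, const int16_t abcd[4],
                                  int row_idx[8][8]) {
    for (int y = 0; y < 8; y++, mx += abcd[1]) {       /* "add w5, w5, w8" per call */
        int pos = mx + 512;                            /* "add w12, w5, #512" */
        for (int x = 0; x < 8; x++, pos += abcd[0])    /* load_filter_row's "add" */
            row_idx[y][x] = 64 + (pos >> 10);          /* asr #10; base biased by 64 rows */
    }
}

int main(void) {
    const int16_t abcd[4] = { 1024, 0, 0, 1024 };      /* placeholder example values */
    int idx[8][8];
    warp_horz_filter_rows(0, abcd, idx);
    printf("filter table row for (y=0, x=1): %d\n", idx[0][1]);
    return 0;
}
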
--- a/third_party/dav1d/src/arm/64/util.S
+++ b/third_party/dav1d/src/arm/64/util.S
@@ -21,18 +21,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
 
-#ifndef __DAVID_SRC_ARM_64_UTIL_S__
-#define __DAVID_SRC_ARM_64_UTIL_S__
+#ifndef DAV1D_SRC_ARM_64_UTIL_S
+#define DAV1D_SRC_ARM_64_UTIL_S
 
 #include "config.h"
 #include "src/arm/asm.S"
 
 .macro  movrel rd, val, offset=0
 #if defined(__APPLE__)
   .if \offset < 0
     adrp        \rd, \val@PAGE
@@ -54,9 +54,38 @@
 #elif defined(PIC)
     adrp        \rd, \val+(\offset)
     add         \rd, \rd, :lo12:\val+(\offset)
 #else
     ldr         \rd, =\val+\offset
 #endif
 .endm
 
-#endif /* __DAVID_SRC_ARM_64_UTIL_S__ */
+.macro transpose_8x8b r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+    trn1        \r8\().8b,  \r0\().8b,  \r1\().8b
+    trn2        \r9\().8b,  \r0\().8b,  \r1\().8b
+    trn1        \r1\().8b,  \r2\().8b,  \r3\().8b
+    trn2        \r3\().8b,  \r2\().8b,  \r3\().8b
+    trn1        \r0\().8b,  \r4\().8b,  \r5\().8b
+    trn2        \r5\().8b,  \r4\().8b,  \r5\().8b
+    trn1        \r2\().8b,  \r6\().8b,  \r7\().8b
+    trn2        \r7\().8b,  \r6\().8b,  \r7\().8b
+
+    trn1        \r4\().4h,  \r0\().4h,  \r2\().4h
+    trn2        \r2\().4h,  \r0\().4h,  \r2\().4h
+    trn1        \r6\().4h,  \r5\().4h,  \r7\().4h
+    trn2        \r7\().4h,  \r5\().4h,  \r7\().4h
+    trn1        \r5\().4h,  \r9\().4h,  \r3\().4h
+    trn2        \r9\().4h,  \r9\().4h,  \r3\().4h
+    trn1        \r3\().4h,  \r8\().4h,  \r1\().4h
+    trn2        \r8\().4h,  \r8\().4h,  \r1\().4h
+
+    trn1        \r0\().2s,  \r3\().2s,  \r4\().2s
+    trn2        \r4\().2s,  \r3\().2s,  \r4\().2s
+    trn1        \r1\().2s,  \r5\().2s,  \r6\().2s
+    trn2        \r5\().2s,  \r5\().2s,  \r6\().2s
+    trn2        \r6\().2s,  \r8\().2s,  \r2\().2s
+    trn1        \r2\().2s,  \r8\().2s,  \r2\().2s
+    trn1        \r3\().2s,  \r9\().2s,  \r7\().2s
+    trn2        \r7\().2s,  \r9\().2s,  \r7\().2s
+.endm
+
+#endif /* DAV1D_SRC_ARM_64_UTIL_S */
--- a/third_party/dav1d/src/arm/asm.S
+++ b/third_party/dav1d/src/arm/asm.S
@@ -20,24 +20,25 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_ARM_ASM_S__
-#define __DAV1D_SRC_ARM_ASM_S__
+#ifndef DAV1D_SRC_ARM_ASM_S
+#define DAV1D_SRC_ARM_ASM_S
 
 #include "config.h"
 
 #if ARCH_ARM
     .syntax unified
 #ifdef __ELF__
+    .arch armv7-a
     .fpu neon
     .eabi_attribute 10, 0           // suppress Tag_FP_arch
     .eabi_attribute 12, 0           // suppress Tag_Advanced_SIMD_arch
 #endif
 
 #ifdef _WIN32
 #define CONFIG_THUMB 1
 #else
@@ -109,26 +110,28 @@ EXTERN\name:
 
 .macro  const   name, align=2
     .macro endconst
 #ifdef __ELF__
         .size   \name, . - \name
 #endif
         .purgem endconst
     .endm
-#if !defined(__MACH__)
+#if defined(_WIN32)
+        .section        .rdata
+#elif !defined(__MACH__)
         .section        .rodata
 #else
         .const_data
 #endif
         .align          \align
 \name:
 .endm
 
 #ifdef __APPLE__
 #define L(x) L ## x
 #else
 #define L(x) .L ## x
 #endif
 
 #define X(x) CONCAT(EXTERN, x)
 
-#endif /* __DAV1D_SRC_ARM_ASM_S__ */
+#endif /* DAV1D_SRC_ARM_ASM_S */
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/src/arm/cdef_init_tmpl.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "common/attributes.h"
+#include "src/cpu.h"
+#include "src/cdef.h"
+
+#if BITDEPTH == 8 && ARCH_AARCH64
+decl_cdef_dir_fn(dav1d_cdef_find_dir_neon);
+
+void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
+                              ptrdiff_t src_stride, const pixel (*left)[2],
+                              /*const*/ pixel *const top[2], int h,
+                              enum CdefEdgeFlags edges);
+void dav1d_cdef_padding8_neon(uint16_t *tmp, const pixel *src,
+                              ptrdiff_t src_stride, const pixel (*left)[2],
+                              /*const*/ pixel *const top[2], int h,
+                              enum CdefEdgeFlags edges);
+
+void dav1d_cdef_filter4_neon(pixel *dst, ptrdiff_t dst_stride,
+                             const uint16_t *tmp, int pri_strength,
+                             int sec_strength, int dir, int damping, int h);
+void dav1d_cdef_filter8_neon(pixel *dst, ptrdiff_t dst_stride,
+                             const uint16_t *tmp, int pri_strength,
+                             int sec_strength, int dir, int damping, int h);
+
+#define DEFINE_FILTER(w, h, tmp_stride)                                      \
+static void                                                                  \
+cdef_filter_##w##x##h##_neon(pixel *dst,                                     \
+                             const ptrdiff_t stride,                         \
+                             const pixel (*left)[2],                         \
+                             /*const*/ pixel *const top[2],                  \
+                             const int pri_strength,                         \
+                             const int sec_strength,                         \
+                             const int dir,                                  \
+                             const int damping,                              \
+                             const enum CdefEdgeFlags edges)                 \
+{                                                                            \
+    ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride,);                         \
+    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;                            \
+    dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges);     \
+    dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength,              \
+                                sec_strength, dir, damping, h);              \
+}
+
+DEFINE_FILTER(8, 8, 16)
+DEFINE_FILTER(4, 8, 8)
+DEFINE_FILTER(4, 4, 8)
+#endif
+
+
+void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
+    const unsigned flags = dav1d_get_cpu_flags();
+
+    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
+
+#if BITDEPTH == 8 && ARCH_AARCH64
+    c->dir = dav1d_cdef_find_dir_neon;
+    c->fb[0] = cdef_filter_8x8_neon;
+    c->fb[1] = cdef_filter_4x8_neon;
+    c->fb[2] = cdef_filter_4x4_neon;
+#endif
+}
--- a/third_party/dav1d/src/arm/cpu.c
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -57,17 +57,17 @@ static unsigned parse_proc_cpuinfo(const
     while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
         if (strstr(line, flag)) {
             fclose(file);
             return 1;
         }
         // if line is incomplete seek back to avoid splitting the search
         // string into two buffers
         if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
-            if (fseek(file, -strlen(flag), SEEK_CUR))
+            if (fseeko(file, -strlen(flag), SEEK_CUR))
                 break;
         }
     }
 
     fclose(file);
 
     return 0;
 }
--- a/third_party/dav1d/src/arm/cpu.h
+++ b/third_party/dav1d/src/arm/cpu.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_ARM_CPU_H__
-#define __DAV1D_SRC_ARM_CPU_H__
+#ifndef DAV1D_SRC_ARM_CPU_H
+#define DAV1D_SRC_ARM_CPU_H
 
 enum CpuFlags {
     DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
 };
 
 unsigned dav1d_get_cpu_flags_arm(void);
 
-#endif /* __DAV1D_SRC_ARM_CPU_H__ */
+#endif /* DAV1D_SRC_ARM_CPU_H */
--- a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c
+++ b/third_party/dav1d/src/arm/looprestoration_init_tmpl.c
@@ -24,20 +24,18 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/cpu.h"
 #include "src/looprestoration.h"
 
 #include "common/attributes.h"
-#include "common/intops.h"
-#include "src/tables.h"
 
-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
 // This calculates things slightly differently than the reference C version.
 // This version calculates roughly this:
 // int16_t sum = 0;
 // for (int i = 0; i < 7; i++)
 //     sum += src[idx] * fh[i];
 // int16_t sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
 // sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
 // sum += 2048;
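
The pseudocode in the comment above can be written out as a small compilable helper. This is only a transcription for clarity: the src[x + i - 3] indexing is an assumption (the comment leaves src[idx] unspecified), and round_bits_h / rounding_off_h are passed in as parameters because their bitdepth-dependent values are not part of this excerpt.

/* Hedged transcription of the comment above (8 bpc case). */
#include <stdint.h>
#include <stdio.h>

#define BITDEPTH 8

static int iclip(int v, int min, int max) {
    return v < min ? min : v > max ? max : v;
}

static int wiener_h_px(const uint8_t *src, int x, const int16_t fh[7],
                       int round_bits_h, int rounding_off_h) {
    int sum = 0;
    for (int i = 0; i < 7; i++)
        sum += src[x + i - 3] * fh[i];   /* assumed centered 7-tap indexing */
    const int sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
    sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
    return sum + 2048;
}

int main(void) {
    const uint8_t src[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
    const int16_t fh[7] = { 0, 0, 0, 128, 0, 0, 0 };   /* illustrative taps */
    printf("%d\n", wiener_h_px(src, 3, fh, 3, 4));     /* 3 and 4 are illustrative values */
    return 0;
}
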
@@ -61,17 +59,17 @@ void dav1d_copy_narrow_neon(pixel *dst, 
                             const pixel *src, int w, int h);
 
 static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
                                const pixel (*const left)[4],
                                const pixel *lpf, const ptrdiff_t lpf_stride,
                                const int w, const int h, const int16_t fh[7],
                                const int16_t fv[7], const enum LrEdgeFlags edges)
 {
-    ALIGN_STK_32(int16_t, mid, 68 * 384,);
+    ALIGN_STK_16(int16_t, mid, 68 * 384,);
     int mid_stride = (w + 7) & ~7;
 
     // Horizontal filter
     dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
                                fh, w, h, edges);
     if (edges & LR_HAVE_TOP)
         dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
                                    fh, w, 2, edges);
@@ -95,12 +93,12 @@ static void wiener_filter_neon(pixel *co
 }
 #endif
 
 void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 && ARCH_AARCH64
+#if BITDEPTH == 8
     c->wiener = wiener_filter_neon;
 #endif
 }
--- a/third_party/dav1d/src/arm/mc_init_tmpl.c
+++ b/third_party/dav1d/src/arm/mc_init_tmpl.c
@@ -51,27 +51,29 @@ decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_n
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
 decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
 
 decl_avg_fn(dav1d_avg_8bpc_neon);
 decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
 decl_mask_fn(dav1d_mask_8bpc_neon);
 
+decl_warp8x8_fn(dav1d_warp_affine_8x8_8bpc_neon);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_8bpc_neon);
+
 void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = dav1d_put_##name##_8bpc_##suffix
 #define init_mct_fn(type, name, suffix) \
     c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
 #if BITDEPTH == 8
-#if ARCH_AARCH64
     init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
     init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
     init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
     init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
     init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
     init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
     init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
     init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
@@ -83,15 +85,18 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dM
     init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
     init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
     init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
     init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
     init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
-#endif
 
     c->avg = dav1d_avg_8bpc_neon;
     c->w_avg = dav1d_w_avg_8bpc_neon;
     c->mask = dav1d_mask_8bpc_neon;
+#if ARCH_AARCH64
+    c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
+    c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
+#endif
 #endif
 }
--- a/third_party/dav1d/src/cdef.h
+++ b/third_party/dav1d/src/cdef.h
@@ -20,29 +20,29 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_CDEF_H__
-#define __DAV1D_SRC_CDEF_H__
+#ifndef DAV1D_SRC_CDEF_H
+#define DAV1D_SRC_CDEF_H
 
 #include <stddef.h>
 #include <stdint.h>
 
 #include "common/bitdepth.h"
 
 enum CdefEdgeFlags {
-    HAVE_LEFT = 1 << 0,
-    HAVE_RIGHT = 1 << 1,
-    HAVE_TOP = 1 << 2,
-    HAVE_BOTTOM = 1 << 3,
+    CDEF_HAVE_LEFT = 1 << 0,
+    CDEF_HAVE_RIGHT = 1 << 1,
+    CDEF_HAVE_TOP = 1 << 2,
+    CDEF_HAVE_BOTTOM = 1 << 3,
 };
 
 #ifdef BITDEPTH
 typedef const pixel (*const_left_pixel_row_2px)[2];
 #else
 typedef const void *const_left_pixel_row_2px;
 #endif
 
@@ -61,11 +61,12 @@ int (name)(const pixel *dst, ptrdiff_t d
 typedef decl_cdef_dir_fn(*cdef_dir_fn);
 
 typedef struct Dav1dCdefDSPContext {
     cdef_dir_fn dir;
     cdef_fn fb[3 /* 444/luma, 422, 420 */];
 } Dav1dCdefDSPContext;
 
 bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
+bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c);
 bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c);
 
-#endif /* __DAV1D_SRC_CDEF_H__ */
+#endif /* DAV1D_SRC_CDEF_H */
--- a/third_party/dav1d/src/cdef_apply.h
+++ b/third_party/dav1d/src/cdef_apply.h
@@ -20,19 +20,19 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_CDEF_APPLY_H__
-#define __DAV1D_SRC_CDEF_APPLY_H__
+#ifndef DAV1D_SRC_CDEF_APPLY_H
+#define DAV1D_SRC_CDEF_APPLY_H
 
 #include "common/bitdepth.h"
 
 #include "src/internal.h"
 
 void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *f, pixel *const p[3],
                              const Av1Filter *lflvl, int by_start, int by_end);
 
-#endif /* __DAV1D_SRC_CDEF_APPLY_H__ */
+#endif /* DAV1D_SRC_CDEF_APPLY_H */
--- a/third_party/dav1d/src/cdef_apply_tmpl.c
+++ b/third_party/dav1d/src/cdef_apply_tmpl.c
@@ -53,25 +53,27 @@ static void backup2lines(pixel *const ds
     pixel_copy(dst[2][1], src[2] + (y_off - 1) * PXSTRIDE(src_stride[1]), w);
 }
 
 static void backup2x8(pixel dst[3][8][2],
                       /*const*/ pixel *const src[3],
                       const ptrdiff_t src_stride[2], int x_off,
                       const enum Dav1dPixelLayout layout)
 {
-    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
+    ptrdiff_t y_off = 0;
+    for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
         pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
 
     if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
     const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
 
     x_off >>= ss_hor;
-    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
+    y_off = 0;
+    for (int y = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
         pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
         pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
     }
 }
 
 static int adjust_strength(const int strength, const unsigned var) {
     if (!var) return 0;
     const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
@@ -80,83 +82,83 @@ static int adjust_strength(const int str
 
 void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
                              pixel *const p[3],
                              const Av1Filter *const lflvl,
                              const int by_start, const int by_end)
 {
     const int bitdepth_min_8 = BITDEPTH == 8 ? 0 : f->cur.p.bpc - 8;
     const Dav1dDSPContext *const dsp = f->dsp;
-    enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0);
+    enum CdefEdgeFlags edges = CDEF_HAVE_BOTTOM | (by_start > 0 ? CDEF_HAVE_TOP : 0);
     pixel *ptrs[3] = { p[0], p[1], p[2] };
     const int sbsz = 16;
     const int sb64w = f->sb128w << 1;
     const int damping = f->frame_hdr->cdef.damping + bitdepth_min_8;
     const enum Dav1dPixelLayout layout = f->cur.p.layout;
     const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
     const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
     const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
 
     // FIXME a design improvement that could be made here is to keep a set of
     // flags for each block position on whether the block was filtered; if not,
     // the backup of pre-filter data is empty, and the restore is therefore
     // unnecessary as well.
 
-    for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
+    for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
         const int tf = f->lf.top_pre_cdef_toggle;
-        if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
+        if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
 
-        if (edges & HAVE_BOTTOM) {
+        if (edges & CDEF_HAVE_BOTTOM) {
             // backup pre-filter data for next iteration
             backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.stride,
                          8, f->bw * 4, layout);
         }
 
         pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
         pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
-        edges &= ~HAVE_LEFT;
-        edges |= HAVE_RIGHT;
-        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) {
+        edges &= ~CDEF_HAVE_LEFT;
+        edges |= CDEF_HAVE_RIGHT;
+        for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
             const int sb128x = sbx >>1;
             const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
             const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
             if (cdef_idx == -1 ||
                 (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
                  !f->frame_hdr->cdef.uv_strength[cdef_idx]))
             {
                 last_skip = 1;
                 goto next_sb;
             }
 
             const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
             const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
             pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
             for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
-                 bx += 2, edges |= HAVE_LEFT)
+                 bx += 2, edges |= CDEF_HAVE_LEFT)
             {
-                if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT;
+                if (bx + 2 >= f->bw) edges &= ~CDEF_HAVE_RIGHT;
 
                 // check if this 8x8 block had any coded coefficients; if not,
                 // go to the next block
                 const unsigned bx_mask = 3U << (bx & 14);
                 const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
                 if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
                        lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
                 {
                     last_skip = 1;
                     goto next_b;
                 }
 
-                if (last_skip && edges & HAVE_LEFT) {
+                if (last_skip && edges & CDEF_HAVE_LEFT) {
                     // we didn't backup the prefilter data because it wasn't
                     // there, so do it here instead
                     backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout);
                 }
-                if (edges & HAVE_RIGHT) {
+                if (edges & CDEF_HAVE_RIGHT) {
                     // backup pre-filter data for next iteration
                     backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout);
                 }
 
                 // the actual filter
                 const int y_pri_lvl = (y_lvl >> 2) << bitdepth_min_8;
                 int y_sec_lvl = y_lvl & 3;
                 y_sec_lvl += y_sec_lvl == 3;
--- a/third_party/dav1d/src/cdef_tmpl.c
+++ b/third_party/dav1d/src/cdef_tmpl.c
@@ -56,29 +56,29 @@ static inline void fill(uint16_t *tmp, c
 static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
                     const pixel *src, const ptrdiff_t src_stride,
                     const pixel (*left)[2], pixel *const top[2],
                     const int w, const int h,
                     const enum CdefEdgeFlags edges)
 {
     // fill extended input buffer
     int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
-    if (!(edges & HAVE_TOP)) {
+    if (!(edges & CDEF_HAVE_TOP)) {
         fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2);
         y_start = 0;
     }
-    if (!(edges & HAVE_BOTTOM)) {
+    if (!(edges & CDEF_HAVE_BOTTOM)) {
         fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2);
         y_end -= 2;
     }
-    if (!(edges & HAVE_LEFT)) {
+    if (!(edges & CDEF_HAVE_LEFT)) {
         fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start);
         x_start = 0;
     }
-    if (!(edges & HAVE_RIGHT)) {
+    if (!(edges & CDEF_HAVE_RIGHT)) {
         fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start);
         x_end -= 2;
     }
 
     for (int y = y_start; y < 0; y++)
         for (int x = x_start; x < x_end; x++)
             tmp[x + y * tmp_stride] = top[y & 1][x];
     for (int y = 0; y < h; y++)
@@ -105,39 +105,40 @@ cdef_filter_block_c(pixel *dst, const pt
         {  0 * 12 + 1, -1 * 12 + 2 },
         {  0 * 12 + 1,  0 * 12 + 2 },
         {  0 * 12 + 1,  1 * 12 + 2 },
         {  1 * 12 + 1,  2 * 12 + 2 },
         {  1 * 12 + 0,  2 * 12 + 1 },
         {  1 * 12 + 0,  2 * 12 + 0 },
         {  1 * 12 + 0,  2 * 12 - 1 }
     };
-    static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
-    static const uint8_t sec_taps[2] = { 2, 1 };
     const ptrdiff_t tmp_stride = 12;
     assert((w == 4 || w == 8) && (h == 4 || h == 8));
     uint16_t tmp_buf[144];  // 12*12 is the maximum value of tmp_stride * (h + 4)
     uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
-    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> bitdepth_min_8) & 1];
+    const int pri_tap = 4 - ((pri_strength >> bitdepth_min_8) & 1);
 
     padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
 
     // run actual filter
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
             int sum = 0;
             const int px = dst[x];
             int max = px, min = px;
+            int pri_tap_k = pri_tap;
             for (int k = 0; k < 2; k++) {
                 const int off1 = cdef_directions[dir][k];
                 const int p0 = tmp[x + off1];
                 const int p1 = tmp[x - off1];
-                sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
-                sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
+                sum += pri_tap_k * constrain(p0 - px, pri_strength, damping);
+                sum += pri_tap_k * constrain(p1 - px, pri_strength, damping);
+                // pri_tap_k = 6 - pri_tap_k: if it was 4 it becomes 2, if 3 it stays 3
+                pri_tap_k -= (pri_tap_k << 1) - 6;
                 if (p0 != INT16_MAX) max = imax(p0, max);
                 if (p1 != INT16_MAX) max = imax(p1, max);
                 min = imin(p0, min);
                 min = imin(p1, min);
                 const int off2 = cdef_directions[(dir + 2) & 7][k];
                 const int s0 = tmp[x + off2];
                 const int s1 = tmp[x - off2];
                 const int off3 = cdef_directions[(dir + 6) & 7][k];
@@ -146,20 +147,22 @@ cdef_filter_block_c(pixel *dst, const pt
                 if (s0 != INT16_MAX) max = imax(s0, max);
                 if (s1 != INT16_MAX) max = imax(s1, max);
                 if (s2 != INT16_MAX) max = imax(s2, max);
                 if (s3 != INT16_MAX) max = imax(s3, max);
                 min = imin(s0, min);
                 min = imin(s1, min);
                 min = imin(s2, min);
                 min = imin(s3, min);
-                sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
-                sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
+                // sec_tap starts at 2 and becomes 1
+                const int sec_tap = 2 - k;
+                sum += sec_tap * constrain(s0 - px, sec_strength, damping);
+                sum += sec_tap * constrain(s1 - px, sec_strength, damping);
+                sum += sec_tap * constrain(s2 - px, sec_strength, damping);
+                sum += sec_tap * constrain(s3 - px, sec_strength, damping);
             }
             dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
         }
         dst += PXSTRIDE(dst_stride);
         tmp += tmp_stride;
     }
 }
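
The replacement of the cdef_pri_taps/sec_taps tables with inline arithmetic above can be sanity-checked in isolation; this standalone snippet (local to this note, not dav1d code) verifies that the new expressions reproduce the old {4, 2} / {3, 3} primary taps and {2, 1} secondary taps.

/* Standalone check of the tap arithmetic introduced above. */
#include <assert.h>
#include <stdio.h>

int main(void) {
    static const int expected_pri[2][2] = { { 4, 2 }, { 3, 3 } };
    static const int expected_sec[2] = { 2, 1 };
    for (int strength_bit = 0; strength_bit < 2; strength_bit++) {
        int pri_tap_k = 4 - strength_bit;        /* 4 - ((pri_strength >> bitdepth_min_8) & 1) */
        for (int k = 0; k < 2; k++) {
            assert(pri_tap_k == expected_pri[strength_bit][k]);
            assert(2 - k == expected_sec[k]);
            pri_tap_k -= (pri_tap_k << 1) - 6;   /* i.e. pri_tap_k = 6 - pri_tap_k */
        }
    }
    printf("tap arithmetic matches the old tables\n");
    return 0;
}
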
 
@@ -252,12 +255,16 @@ static int cdef_find_dir_c(const pixel *
 }
 
 void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
     c->dir = cdef_find_dir_c;
     c->fb[0] = cdef_filter_block_8x8_c;
     c->fb[1] = cdef_filter_block_4x8_c;
     c->fb[2] = cdef_filter_block_4x4_c;
 
-#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
+#if HAVE_ASM
+#if ARCH_AARCH64 || ARCH_ARM
+    bitfn(dav1d_cdef_dsp_init_arm)(c);
+#elif ARCH_X86
     bitfn(dav1d_cdef_dsp_init_x86)(c);
 #endif
+#endif
 }
--- a/third_party/dav1d/src/cdf.h
+++ b/third_party/dav1d/src/cdf.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __AV1_CDF_H__
-#define __AV1_CDF_H__
+#ifndef DAV1D_SRC_CDF_H
+#define DAV1D_SRC_CDF_H
 
 #include <stdint.h>
 
 #include "src/levels.h"
 #include "src/ref.h"
 #include "src/thread_data.h"
 
 typedef struct CdfModeContext {
@@ -143,9 +143,9 @@ void dav1d_cdf_thread_update(const Dav1d
                              const CdfContext *src);
 
 /*
  * These are binary signals (so a signal is either "done" or "not done").
  */
 void dav1d_cdf_thread_wait(CdfThreadContext *cdf);
 void dav1d_cdf_thread_signal(CdfThreadContext *cdf);
 
-#endif /* __AV1_CDF_H__ */
+#endif /* DAV1D_SRC_CDF_H */
--- a/third_party/dav1d/src/cpu.h
+++ b/third_party/dav1d/src/cpu.h
@@ -20,25 +20,25 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_CPU_H__
-#define __DAV1D_SRC_CPU_H__
+#ifndef DAV1D_SRC_CPU_H
+#define DAV1D_SRC_CPU_H
 
 #include "config.h"
 
 #include "dav1d/common.h"
 
 #if ARCH_AARCH64 || ARCH_ARM
 #include "src/arm/cpu.h"
 #elif ARCH_X86
 #include "src/x86/cpu.h"
 #endif
 
 unsigned dav1d_get_cpu_flags(void);
 DAV1D_API void dav1d_set_cpu_flags_mask(const unsigned mask);
 
-#endif /* __DAV1D_SRC_CPU_H__ */
+#endif /* DAV1D_SRC_CPU_H */
--- a/third_party/dav1d/src/ctx.h
+++ b/third_party/dav1d/src/ctx.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_CTX_H__
-#define __DAV1D_SRC_CTX_H__
+#ifndef DAV1D_SRC_CTX_H
+#define DAV1D_SRC_CTX_H
 
 #include <stdint.h>
 
 #include "common/attributes.h"
 
 union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
 union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
 union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
@@ -83,9 +83,9 @@ union alias8 { uint8_t u8; } ATTR_ALIAS;
     case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
     case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
     case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
     case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
     case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
     default: default_memset(dir, diridx, off, var); break; \
     }
 
-#endif /* __DAV1D_SRC_CTX_H__ */
+#endif /* DAV1D_SRC_CTX_H */
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -42,21 +42,17 @@
 
 uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
     validate_input_or_ret(buf != NULL, NULL);
 
     buf->ref = dav1d_ref_create(sz);
     if (!buf->ref) return NULL;
     buf->data = buf->ref->const_data;
     buf->sz = buf->m.size = sz;
-    buf->m.timestamp = INT64_MIN;
-    buf->m.duration = 0;
-    buf->m.offset = -1;
-    buf->m.user_data.data = NULL;
-    buf->m.user_data.ref = NULL;
+    dav1d_data_props_set_defaults(&buf->m);
 
     return buf->ref->data;
 }
 
 int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
                              const size_t sz,
                              void (*const free_callback)(const uint8_t *data,
                                                          void *cookie),
@@ -65,21 +61,17 @@ int dav1d_data_wrap_internal(Dav1dData *
     validate_input_or_ret(buf != NULL, -EINVAL);
     validate_input_or_ret(ptr != NULL, -EINVAL);
     validate_input_or_ret(free_callback != NULL, -EINVAL);
 
     buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
     if (!buf->ref) return -ENOMEM;
     buf->data = ptr;
     buf->sz = buf->m.size = sz;
-    buf->m.timestamp = INT64_MIN;
-    buf->m.duration = 0;
-    buf->m.offset = -1;
-    buf->m.user_data.data = NULL;
-    buf->m.user_data.ref = NULL;
+    dav1d_data_props_set_defaults(&buf->m);
 
     return 0;
 }
 
 int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
                                        const uint8_t *const user_data,
                                        void (*const free_callback)(const uint8_t *user_data,
                                                                    void *cookie),
@@ -127,16 +119,26 @@ void dav1d_data_props_copy(Dav1dDataProp
     assert(dst != NULL);
     assert(src != NULL);
 
     dav1d_ref_dec(&dst->user_data.ref);
     *dst = *src;
     if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
 }
 
+void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
+    assert(props != NULL);
+
+    props->timestamp = INT64_MIN;
+    props->duration = 0;
+    props->offset = -1;
+    props->user_data.data = NULL;
+    props->user_data.ref = NULL;
+}
+
 void dav1d_data_unref_internal(Dav1dData *const buf) {
     validate_input(buf != NULL);
 
     struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
     if (buf->ref) {
         validate_input(buf->data != NULL);
         dav1d_ref_dec(&buf->ref);
     }
--- a/third_party/dav1d/src/data.h
+++ b/third_party/dav1d/src/data.h
@@ -20,39 +20,41 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_DATA_H__
-#define __DAV1D_SRC_DATA_H__
+#ifndef DAV1D_SRC_DATA_H
+#define DAV1D_SRC_DATA_H
 
 #include "dav1d/data.h"
 
 void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
 
 /**
  * Move a data reference.
  */
 void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
 
 /**
  * Copy the source properties to the destination and increase the
  * user_data's reference count (if it's not NULL).
  */
 void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src);
 
+void dav1d_data_props_set_defaults(Dav1dDataProps *props);
+
 uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
 int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
                              void (*free_callback)(const uint8_t *data,
                                                    void *user_data),
                              void *user_data);
 int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
                                        const uint8_t *user_data,
                                        void (*free_callback)(const uint8_t *user_data,
                                                              void *cookie),
                                        void *cookie);
 void dav1d_data_unref_internal(Dav1dData *buf);
 
-#endif /* __DAV1D_SRC_DATA_H__ */
+#endif /* DAV1D_SRC_DATA_H */
--- a/third_party/dav1d/src/dav1d.rc.in
+++ b/third_party/dav1d/src/dav1d.rc.in
@@ -1,28 +1,30 @@
-#define VERSION_NUMBER @VERSION_MAJOR@,@VERSION_MINOR@,@VERSION_REVISION@,@VERSION_EXTRA@
-#define VERSION_NUMBER_STR "@VERSION_MAJOR@.@VERSION_MINOR@.@VERSION_REVISION@.@VERSION_EXTRA@"
+#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
 
 #include <windows.h>
 
 1 VERSIONINFO
 FILETYPE VFT_DLL
 FILEOS VOS_NT_WINDOWS32
-PRODUCTVERSION VERSION_NUMBER
-FILEVERSION VERSION_NUMBER
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
 BEGIN
   BLOCK "StringFileInfo"
   BEGIN
     BLOCK "040904E4"
     BEGIN
       VALUE "CompanyName", "VideoLAN"
       VALUE "ProductName", "dav1d"
-      VALUE "ProductVersion", VERSION_NUMBER_STR
-      VALUE "FileVersion", VERSION_NUMBER_STR
-      VALUE "FileDescription", "dav1d AV1 decoder"
+      VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+      VALUE "FileVersion", API_VERSION_NUMBER_STR
+      VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
       VALUE "InternalName", "dav1d"
       VALUE "OriginalFilename", "libdav1d.dll"
       VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
     END
   END
   BLOCK "VarFileInfo"
   BEGIN
     VALUE "Translation", 0x409, 1252
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -37,16 +37,17 @@
 
 #include "common/intops.h"
 #include "common/mem.h"
 
 #include "src/ctx.h"
 #include "src/decode.h"
 #include "src/dequant_tables.h"
 #include "src/env.h"
+#include "src/log.h"
 #include "src/qm.h"
 #include "src/recon.h"
 #include "src/ref.h"
 #include "src/tables.h"
 #include "src/thread_task.h"
 #include "src/warpmv.h"
 
 static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
@@ -73,51 +74,59 @@ static void init_quant_tables(const Dav1
 
 static int read_mv_component_diff(Dav1dTileContext *const t,
                                   CdfMvComponent *const mv_comp,
                                   const int have_fp)
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
     const int have_hp = f->frame_hdr->hp;
-    const int sign = msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
-    const int cl = msac_decode_symbol_adapt(&ts->msac, mv_comp->classes, 11);
+    const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
+    const int cl = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                                  mv_comp->classes, 11);
     int up, fp, hp;
 
     if (!cl) {
-        up = msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
+        up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
         if (have_fp) {
-            fp = msac_decode_symbol_adapt(&ts->msac, mv_comp->class0_fp[up], 4);
-            hp = have_hp ? msac_decode_bool_adapt(&ts->msac, mv_comp->class0_hp) : 1;
+            fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                                mv_comp->class0_fp[up], 4);
+            hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                        mv_comp->class0_hp) : 1;
         } else {
             fp = 3;
             hp = 1;
         }
     } else {
         up = 1 << cl;
         for (int n = 0; n < cl; n++)
-            up |= msac_decode_bool_adapt(&ts->msac, mv_comp->classN[n]) << n;
+            up |= dav1d_msac_decode_bool_adapt(&ts->msac,
+                                               mv_comp->classN[n]) << n;
         if (have_fp) {
-            fp = msac_decode_symbol_adapt(&ts->msac, mv_comp->classN_fp, 4);
-            hp = have_hp ? msac_decode_bool_adapt(&ts->msac, mv_comp->classN_hp) : 1;
+            fp = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                                mv_comp->classN_fp, 4);
+            hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                        mv_comp->classN_hp) : 1;
         } else {
             fp = 3;
             hp = 1;
         }
     }
 
     const int diff = ((up << 3) | (fp << 1) | hp) + 1;
 
     return sign ? -diff : diff;
 }
 
 static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
                              CdfMvContext *const mv_cdf, const int have_fp)
 {
-    switch (msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint, N_MV_JOINTS)) {
+    switch (dav1d_msac_decode_symbol_adapt(&t->ts->msac, t->ts->cdf.mv.joint,
+                                           N_MV_JOINTS))
+    {
     case MV_JOINT_HV:
         ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
         ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
         break;
     case MV_JOINT_H:
         ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
         break;
     case MV_JOINT_V:
@@ -139,17 +148,18 @@ static void read_tx_tree(Dav1dTileContex
     const int txw = t_dim->lw, txh = t_dim->lh;
     int is_split;
 
     if (depth < 2 && from > (int) TX_4X4) {
         const int cat = 2 * (TX_64X64 - t_dim->max) - depth;
         const int a = t->a->tx[bx4] < txw;
         const int l = t->l.tx[by4] < txh;
 
-        is_split = msac_decode_bool_adapt(&t->ts->msac, t->ts->cdf.m.txpart[cat][a + l]);
+        is_split = dav1d_msac_decode_bool_adapt(&t->ts->msac,
+                       t->ts->cdf.m.txpart[cat][a + l]);
         if (is_split)
             masks[depth] |= 1 << (y_off * 4 + x_off);
     } else {
         is_split = 0;
     }
 
     if (is_split && t_dim->max > TX_8X8) {
         const enum RectTxfmSize sub = t_dim->sub;
@@ -297,27 +307,27 @@ static void derive_warpmv(const Dav1dTil
     pts[np][1][1] = pts[np][0][1] + (rp)->mv[0].y; \
     np++; \
 } while (0)
 
     // use masks[] to find the projectable motion vectors in the edges
     if ((unsigned) masks[0] == 1 && !(masks[1] >> 32)) {
         const int off = t->bx & (bs(&r[-b4_stride])[0] - 1);
         add_sample(-off, 0, 1, -1, &r[-b4_stride]);
-    } else for (unsigned off = 0, xmask = masks[0]; np < 8 && xmask;) { // top
+    } else for (unsigned off = 0, xmask = (uint32_t) masks[0]; np < 8 && xmask;) { // top
         const int tz = ctz(xmask);
         off += tz;
         xmask >>= tz;
         add_sample(off, 0, 1, -1, &r[off - b4_stride]);
         xmask &= ~1;
     }
     if (np < 8 && masks[1] == 1) {
         const int off = t->by & (bs(&r[-1])[1] - 1);
         add_sample(0, -off, -1, 1, &r[-1 - off * b4_stride]);
-    } else for (unsigned off = 0, ymask = masks[1]; np < 8 && ymask;) { // left
+    } else for (unsigned off = 0, ymask = (uint32_t) masks[1]; np < 8 && ymask;) { // left
         const int tz = ctz(ymask);
         off += tz;
         ymask >>= tz;
         add_sample(0, off, -1, 1, &r[off * b4_stride - 1]);
         ymask &= ~1;
     }
     if (np < 8 && masks[1] >> 32) // top/left
         add_sample(0, 0, -1, -1, &r[-(1 + b4_stride)]);
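
The two else-for loops above walk the edge masks with ctz to locate each candidate sample position; the pattern is easier to see on its own. This standalone sketch uses __builtin_ctz as a stand-in for dav1d's ctz() helper and a made-up mask value.

/* Minimal sketch of the mask walk: jump to the lowest set bit, record
 * its absolute offset, clear it, repeat (capped at 8 samples). */
#include <stdio.h>
#include <stdint.h>

int main(void) {
    uint32_t xmask = 0x8221;                 /* arbitrary example mask */
    unsigned off = 0;
    int np = 0;
    while (np < 8 && xmask) {
        const int tz = __builtin_ctz(xmask); /* stand-in for ctz() */
        off += tz;                           /* absolute 4x4-unit offset of this bit */
        xmask >>= tz;
        printf("candidate at offset %u\n", off); /* add_sample(off, ...) in the real code */
        np++;
        xmask &= ~1u;                        /* consume this bit and keep scanning */
    }
    return 0;
}
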
@@ -325,18 +335,18 @@ static void derive_warpmv(const Dav1dTil
         add_sample(bw4, 0, 1, -1, &r[bw4 - b4_stride]);
     assert(np > 0 && np <= 8);
 #undef bs
 
     // select according to motion vector difference against a threshold
     int mvd[8], ret = 0;
     const int thresh = 4 * iclip(imax(bw4, bh4), 4, 28);
     for (int i = 0; i < np; i++) {
-        mvd[i] = labs(pts[i][1][0] - pts[i][0][0] - mv.x) +
-                 labs(pts[i][1][1] - pts[i][0][1] - mv.y);
+        mvd[i] = abs(pts[i][1][0] - pts[i][0][0] - mv.x) +
+                 abs(pts[i][1][1] - pts[i][0][1] - mv.y);
         if (mvd[i] > thresh)
             mvd[i] = -1;
         else
             ret++;
     }
     if (!ret) {
         ret = 1;
     } else for (int i = 0, j = np - 1, k = 0; k < np - ret; k++, i++, j--) {
@@ -364,18 +374,18 @@ static inline int findoddzero(const uint
 }
 
 static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
                            const int pl, const int sz_ctx,
                            const int bx4, const int by4)
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
-    const int pal_sz = b->pal_sz[pl] = 2 + msac_decode_symbol_adapt(&ts->msac,
-                                                 ts->cdf.m.pal_sz[pl][sz_ctx], 7);
+    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                           ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
     uint16_t cache[16], used_cache[8];
     int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
     int n_cache = 0;
     // don't reuse above palette outside SB64 boundaries
     int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
     const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl];
 
     // fill/sort cache
@@ -408,33 +418,33 @@ static void read_pal_plane(Dav1dTileCont
                 cache[n_cache++] = *a;
             a++;
         } while (--a_cache > 0);
     }
 
     // find reused cache entries
     int i = 0;
     for (int n = 0; n < n_cache && i < pal_sz; n++)
-        if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB))
+        if (dav1d_msac_decode_bool_equi(&ts->msac))
             used_cache[i++] = cache[n];
     const int n_used_cache = i;
 
     // parse new entries
     uint16_t *const pal = f->frame_thread.pass ?
         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                             ((t->bx >> 1) + (t->by & 1))][pl] : t->pal[pl];
     if (i < pal_sz) {
-        int prev = pal[i++] = msac_decode_bools(&ts->msac, f->cur.p.bpc);
+        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
 
         if (i < pal_sz) {
-            int bits = f->cur.p.bpc - 3 + msac_decode_bools(&ts->msac, 2);
+            int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
             const int max = (1 << f->cur.p.bpc) - 1;
 
             do {
-                const int delta = msac_decode_bools(&ts->msac, bits);
+                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
                 prev = pal[i++] = imin(prev + delta + !pl, max);
                 if (prev + !pl >= max) {
                     for (; i < pal_sz; i++)
                         pal[i] = max;
                     break;
                 }
                 bits = imin(bits, 1 + ulog2(max - prev - !pl));
             } while (i < pal_sz);
@@ -472,28 +482,29 @@ static void read_pal_uv(Dav1dTileContext
     read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
 
     // V pal coding
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
     uint16_t *const pal = f->frame_thread.pass ?
         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                             ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
-    if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) {
-        const int bits = f->cur.p.bpc - 4 + msac_decode_bools(&ts->msac, 2);
-        int prev = pal[0] = msac_decode_bools(&ts->msac, f->cur.p.bpc);
+    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
+        const int bits = f->cur.p.bpc - 4 +
+                         dav1d_msac_decode_bools(&ts->msac, 2);
+        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
         const int max = (1 << f->cur.p.bpc) - 1;
         for (int i = 1; i < b->pal_sz[1]; i++) {
-            int delta = msac_decode_bools(&ts->msac, bits);
-            if (delta && msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta = -delta;
+            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
+            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
             prev = pal[i] = (prev + delta) & max;
         }
     } else {
         for (int i = 0; i < b->pal_sz[1]; i++)
-            pal[i] = msac_decode_bools(&ts->msac, f->cur.p.bpc);
+            pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
     }
     if (DEBUG_BLOCK_INFO) {
         printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
         for (int n = 0; n < b->pal_sz[1]; n++)
             printf("%c%02x", n ? ' ' : '[', pal[n]);
         printf("]\n");
     }
 }
@@ -569,29 +580,28 @@ static void order_palette(const uint8_t 
 static void read_pal_indices(Dav1dTileContext *const t,
                              uint8_t *const pal_idx,
                              const Av1Block *const b, const int pl,
                              const int w4, const int h4,
                              const int bw4, const int bh4)
 {
     Dav1dTileState *const ts = t->ts;
     const ptrdiff_t stride = bw4 * 4;
-    pal_idx[0] = msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
+    pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
     uint16_t (*const color_map_cdf)[8 + 1] =
         ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
     for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
         // top/left-to-bottom/right diagonals ("wave-front")
         uint8_t order[64][8], ctx[64];
         const int first = imin(i, w4 * 4 - 1);
         const int last = imax(0, i - h4 * 4 + 1);
         order_palette(pal_idx, stride, i, first, last, order, ctx);
         for (int j = first, m = 0; j >= last; j--, m++) {
-            const int color_idx =
-                msac_decode_symbol_adapt(&ts->msac, color_map_cdf[ctx[m]],
-                                         b->pal_sz[pl]);
+            const int color_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                      color_map_cdf[ctx[m]], b->pal_sz[pl]);
             pal_idx[(i - j) * stride + j] = order[m][color_idx];
         }
     }
     // fill invisible edges
     if (bw4 > w4)
         for (int y = 0; y < 4 * h4; y++)
             memset(&pal_idx[y * stride + 4 * w4],
                    pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
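
The "wave-front" comment above describes the traversal order of the palette indices: after the first sample, indices are coded along anti-diagonals (row + col == i), from the top-right end of each diagonal down to the bottom-left. A standalone illustration (not dav1d code) that prints the order for a 4x4 block:

/* Prints the wave-front coding order for a 4x4 palette block
 * (w4 = h4 = 1), using the same first/last arithmetic as above. */
#include <stdio.h>

static int imin(int a, int b) { return a < b ? a : b; }
static int imax(int a, int b) { return a > b ? a : b; }

int main(void) {
    const int w4 = 1, h4 = 1;
    printf("(0,0)\n");                          /* pal_idx[0], decoded with decode_uniform above */
    for (int i = 1; i < 4 * (w4 + h4) - 1; i++) {
        const int first = imin(i, w4 * 4 - 1);
        const int last  = imax(0, i - h4 * 4 + 1);
        for (int j = first; j >= last; j--)
            printf("(%d,%d) ", i - j, j);       /* row = i - j, col = j */
        printf("\n");
    }
    return 0;
}
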
@@ -776,19 +786,19 @@ static int decode_b(Dav1dTileContext *co
                 if (seg_id >= 8) return -1;
                 b->seg_id = seg_id;
             } else {
                 b->seg_id = 0;
             }
             seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
         } else if (f->frame_hdr->segmentation.seg_data.preskip) {
             if (f->frame_hdr->segmentation.temporal &&
-                (seg_pred = msac_decode_bool_adapt(&ts->msac,
-                                       ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
-                                                          t->l.seg_pred[by4]])))
+                (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+                                t->l.seg_pred[by4]])))
             {
                 // temporal predicted seg_id
                 if (f->prev_segmap) {
                     unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
                                                            w4, h4,
                                                            f->prev_segmap,
                                                            f->b4_stride);
                     if (seg_id >= 8) return -1;
@@ -796,19 +806,19 @@ static int decode_b(Dav1dTileContext *co
                 } else {
                     b->seg_id = 0;
                 }
             } else {
                 int seg_ctx;
                 const unsigned pred_seg_id =
                     get_cur_frame_segid(t->by, t->bx, have_top, have_left,
                                         &seg_ctx, f->cur_segmap, f->b4_stride);
-                const unsigned diff = msac_decode_symbol_adapt(&ts->msac,
-                                                   ts->cdf.m.seg_id[seg_ctx],
-                                                   DAV1D_MAX_SEGMENTS);
+                const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                          ts->cdf.m.seg_id[seg_ctx],
+                                          DAV1D_MAX_SEGMENTS);
                 const unsigned last_active_seg_id =
                     f->frame_hdr->segmentation.seg_data.last_active_segid;
                 b->seg_id = neg_deinterleave(diff, pred_seg_id,
                                              last_active_seg_id + 1);
                 if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
                 if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
             }
 
@@ -822,43 +832,43 @@ static int decode_b(Dav1dTileContext *co
         b->seg_id = 0;
     }
 
     // skip_mode
     if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
         f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1)
     {
         const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
-        b->skip_mode = msac_decode_bool_adapt(&ts->msac,
-                                              ts->cdf.m.skip_mode[smctx]);
+        b->skip_mode = dav1d_msac_decode_bool_adapt(&ts->msac,
+                           ts->cdf.m.skip_mode[smctx]);
         if (DEBUG_BLOCK_INFO)
             printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
     } else {
         b->skip_mode = 0;
     }
 
     // skip
     if (b->skip_mode || (seg && seg->skip)) {
         b->skip = 1;
     } else {
         const int sctx = t->a->skip[bx4] + t->l.skip[by4];
-        b->skip = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
+        b->skip = dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
         if (DEBUG_BLOCK_INFO)
             printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
     }
 
     // segment_id
     if (f->frame_hdr->segmentation.enabled &&
         f->frame_hdr->segmentation.update_map &&
         !f->frame_hdr->segmentation.seg_data.preskip)
     {
         if (!b->skip && f->frame_hdr->segmentation.temporal &&
-            (seg_pred = msac_decode_bool_adapt(&ts->msac,
-                                   ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
-                                                      t->l.seg_pred[by4]])))
+            (seg_pred = dav1d_msac_decode_bool_adapt(&ts->msac,
+                            ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
+                            t->l.seg_pred[by4]])))
         {
             // temporal predicted seg_id
             if (f->prev_segmap) {
                 unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
                                                        f->prev_segmap,
                                                        f->b4_stride);
                 if (seg_id >= 8) return -1;
                 b->seg_id = seg_id;
@@ -868,19 +878,19 @@ static int decode_b(Dav1dTileContext *co
         } else {
             int seg_ctx;
             const unsigned pred_seg_id =
                 get_cur_frame_segid(t->by, t->bx, have_top, have_left,
                                     &seg_ctx, f->cur_segmap, f->b4_stride);
             if (b->skip) {
                 b->seg_id = pred_seg_id;
             } else {
-                const unsigned diff = msac_decode_symbol_adapt(&ts->msac,
-                                                   ts->cdf.m.seg_id[seg_ctx],
-                                                   DAV1D_MAX_SEGMENTS);
+                const unsigned diff = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                          ts->cdf.m.seg_id[seg_ctx],
+                                          DAV1D_MAX_SEGMENTS);
                 const unsigned last_active_seg_id =
                     f->frame_hdr->segmentation.seg_data.last_active_segid;
                 b->seg_id = neg_deinterleave(diff, pred_seg_id,
                                              last_active_seg_id + 1);
                 if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
             }
             if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
         }
@@ -892,17 +902,18 @@ static int decode_b(Dav1dTileContext *co
                    b->seg_id, ts->msac.rng);
     }
 
     // cdef index
     if (!b->skip) {
         const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) +
                                            ((t->by & 16) >> 3) : 0;
         if (t->cur_sb_cdef_idx_ptr[idx] == -1) {
-            const int v = msac_decode_bools(&ts->msac, f->frame_hdr->cdef.n_bits);
+            const int v = dav1d_msac_decode_bools(&ts->msac,
+                              f->frame_hdr->cdef.n_bits);
             t->cur_sb_cdef_idx_ptr[idx] = v;
             if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v;
             if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v;
             if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v;
 
             if (DEBUG_BLOCK_INFO)
                 printf("Post-cdef_idx[%d]: r=%d\n",
                         *t->cur_sb_cdef_idx_ptr, ts->msac.rng);
@@ -916,45 +927,46 @@ static int decode_b(Dav1dTileContext *co
         const int prev_qidx = ts->last_qidx;
         const int have_delta_q = f->frame_hdr->delta.q.present &&
             (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
 
         int8_t prev_delta_lf[4];
         memcpy(prev_delta_lf, ts->last_delta_lf, 4);
 
         if (have_delta_q) {
-            int delta_q = msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.delta_q, 4);
+            int delta_q = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                                         ts->cdf.m.delta_q, 4);
             if (delta_q == 3) {
-                const int n_bits = 1 + msac_decode_bools(&ts->msac, 3);
-                delta_q = msac_decode_bools(&ts->msac, n_bits) + 1 + (1 << n_bits);
+                const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+                delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
+                          1 + (1 << n_bits);
             }
             if (delta_q) {
-                if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta_q = -delta_q;
+                if (dav1d_msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q;
                 delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
             }
             ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
             if (have_delta_q && DEBUG_BLOCK_INFO)
                 printf("Post-delta_q[%d->%d]: r=%d\n",
                        delta_q, ts->last_qidx, ts->msac.rng);
 
             if (f->frame_hdr->delta.lf.present) {
                 const int n_lfs = f->frame_hdr->delta.lf.multi ?
                     f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
 
                 for (int i = 0; i < n_lfs; i++) {
-                    int delta_lf =
-                        msac_decode_symbol_adapt(&ts->msac,
+                    int delta_lf = dav1d_msac_decode_symbol_adapt(&ts->msac,
                         ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
                     if (delta_lf == 3) {
-                        const int n_bits = 1 + msac_decode_bools(&ts->msac, 3);
-                        delta_lf = msac_decode_bools(&ts->msac, n_bits) +
+                        const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
+                        delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
                                    1 + (1 << n_bits);
                     }
                     if (delta_lf) {
-                        if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB))
+                        if (dav1d_msac_decode_bool_equi(&ts->msac))
                             delta_lf = -delta_lf;
                         delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
                     }
                     ts->last_delta_lf[i] =
                         iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
                     if (have_delta_q && DEBUG_BLOCK_INFO)
                         printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
                                ts->msac.rng);
@@ -982,131 +994,132 @@ static int decode_b(Dav1dTileContext *co
     if (b->skip_mode) {
         b->intra = 0;
     } else if (f->frame_hdr->frame_type & 1) {
         if (seg && (seg->ref >= 0 || seg->globalmv)) {
             b->intra = !seg->ref;
         } else {
             const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
                                            have_top, have_left);
-            b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intra[ictx]);
+            b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac,
+                            ts->cdf.m.intra[ictx]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
         }
     } else if (f->frame_hdr->allow_intrabc) {
-        b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
+        b->intra = !dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
         if (DEBUG_BLOCK_INFO)
             printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
     } else {
         b->intra = 1;
     }
 
     // intra/inter-specific stuff
     if (b->intra) {
         uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ?
             ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
             ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
                         [dav1d_intra_mode_context[t->l.mode[by4]]];
-        b->y_mode = msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
-                                              N_INTRA_PRED_MODES);
+        b->y_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
+                                                   N_INTRA_PRED_MODES);
         if (DEBUG_BLOCK_INFO)
             printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
 
         // angle delta
         if (b_dim[2] + b_dim[3] >= 2 && b->y_mode >= VERT_PRED &&
             b->y_mode <= VERT_LEFT_PRED)
         {
             uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
-            const int angle = msac_decode_symbol_adapt(&ts->msac, acdf, 7);
+            const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
             b->y_angle = angle - 3;
         } else {
             b->y_angle = 0;
         }
 
         if (has_chroma) {
             const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
                 cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
             uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
-            b->uv_mode = msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
-                                         N_UV_INTRA_PRED_MODES - !cfl_allowed);
+            b->uv_mode = dav1d_msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
+                             N_UV_INTRA_PRED_MODES - !cfl_allowed);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
 
             if (b->uv_mode == CFL_PRED) {
 #define SIGN(a) (!!(a) + ((a) > 0))
-                const int sign =
-                    msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.cfl_sign, 8) + 1;
+                const int sign = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                     ts->cdf.m.cfl_sign, 8) + 1;
                 const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
                 assert(sign_u == sign / 3);
                 if (sign_u) {
                     const int ctx = (sign_u == 2) * 3 + sign_v;
-                    b->cfl_alpha[0] = msac_decode_symbol_adapt(&ts->msac,
-                                            ts->cdf.m.cfl_alpha[ctx], 16) + 1;
+                    b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                          ts->cdf.m.cfl_alpha[ctx], 16) + 1;
                     if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
                 } else {
                     b->cfl_alpha[0] = 0;
                 }
                 if (sign_v) {
                     const int ctx = (sign_v == 2) * 3 + sign_u;
-                    b->cfl_alpha[1] = msac_decode_symbol_adapt(&ts->msac,
-                                            ts->cdf.m.cfl_alpha[ctx], 16) + 1;
+                    b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                          ts->cdf.m.cfl_alpha[ctx], 16) + 1;
                     if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
                 } else {
                     b->cfl_alpha[1] = 0;
                 }
 #undef SIGN
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-uvalphas[%d/%d]: r=%d\n",
                            b->cfl_alpha[0], b->cfl_alpha[1], ts->msac.rng);
             } else if (b_dim[2] + b_dim[3] >= 2 && b->uv_mode >= VERT_PRED &&
                        b->uv_mode <= VERT_LEFT_PRED)
             {
                 uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
-                const int angle = msac_decode_symbol_adapt(&ts->msac, acdf, 7);
+                const int angle = dav1d_msac_decode_symbol_adapt(&ts->msac, acdf, 7);
                 b->uv_angle = angle - 3;
             } else {
                 b->uv_angle = 0;
             }
         }
 
         b->pal_sz[0] = b->pal_sz[1] = 0;
         if (f->frame_hdr->allow_screen_content_tools &&
             imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4)
         {
             const int sz_ctx = b_dim[2] + b_dim[3] - 2;
             if (b->y_mode == DC_PRED) {
                 const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0);
-                const int use_y_pal =
-                    msac_decode_bool_adapt(&ts->msac, ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
+                const int use_y_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
                 if (use_y_pal)
                     read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
             }
 
             if (has_chroma && b->uv_mode == DC_PRED) {
                 const int pal_ctx = b->pal_sz[0] > 0;
-                const int use_uv_pal =
-                    msac_decode_bool_adapt(&ts->msac, ts->cdf.m.pal_uv[pal_ctx]);
+                const int use_uv_pal = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                           ts->cdf.m.pal_uv[pal_ctx]);
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
                 if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
                     read_pal_uv(t, b, sz_ctx, bx4, by4);
             }
         }
 
         if (b->y_mode == DC_PRED && !b->pal_sz[0] &&
             imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra)
         {
-            const int is_filter = msac_decode_bool_adapt(&ts->msac,
-                                            ts->cdf.m.use_filter_intra[bs]);
+            const int is_filter = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.use_filter_intra[bs]);
             if (is_filter) {
                 b->y_mode = FILTER_PRED;
-                b->y_angle = msac_decode_symbol_adapt(&ts->msac,
-                                                  ts->cdf.m.filter_intra, 5);
+                b->y_angle = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                 ts->cdf.m.filter_intra, 5);
             }
             if (DEBUG_BLOCK_INFO)
                 printf("Post-filterintramode[%d/%d]: r=%d\n",
                        b->y_mode, b->y_angle, ts->msac.rng);
         }
 
         if (b->pal_sz[0]) {
             uint8_t *pal_idx;
@@ -1138,18 +1151,18 @@ static int decode_b(Dav1dTileContext *co
             t_dim = &dav1d_txfm_dimensions[TX_4X4];
         } else {
             b->tx = dav1d_max_txfm_size_for_bs[bs][0];
             b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
             t_dim = &dav1d_txfm_dimensions[b->tx];
             if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
                 const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
                 uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
-                int depth = msac_decode_symbol_adapt(&ts->msac, tx_cdf,
-                                                     imin(t_dim->max + 1, 3));
+                int depth = dav1d_msac_decode_symbol_adapt(&ts->msac, tx_cdf,
+                                imin(t_dim->max + 1, 3));
 
                 while (depth--) {
                     b->tx = t_dim->sub;
                     t_dim = &dav1d_txfm_dimensions[b->tx];
                 }
             }
             if (DEBUG_BLOCK_INFO)
                 printf("Post-tx[%d]: r=%d\n", b->tx, ts->msac.rng);
@@ -1351,17 +1364,18 @@ static int decode_b(Dav1dTileContext *co
 
         if (b->skip_mode) {
             is_comp = 1;
         } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
                    f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1)
         {
             const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
                                          have_top, have_left);
-            is_comp = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp[ctx]);
+            is_comp = dav1d_msac_decode_bool_adapt(&ts->msac,
+                          ts->cdf.m.comp[ctx]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
         } else {
             is_comp = 0;
         }
 
         if (b->skip_mode) {
             b->ref[0] = f->frame_hdr->skip_mode_refs[0];
@@ -1386,116 +1400,118 @@ static int decode_b(Dav1dTileContext *co
             fix_mv_precision(f->frame_hdr, &b->mv[1]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n",
                        b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
                        b->ref[0], b->ref[1]);
         } else if (is_comp) {
             const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4,
                                                  have_top, have_left);
-            if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp_dir[dir_ctx])) {
+            if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                    ts->cdf.m.comp_dir[dir_ctx]))
+            {
                 // bidir - first reference (fw)
                 const int ctx1 = av1_get_fwd_ref_ctx(t->a, &t->l, by4, bx4,
                                                      have_top, have_left);
-                if (msac_decode_bool_adapt(&ts->msac,
-                                           ts->cdf.m.comp_fwd_ref[0][ctx1]))
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                        ts->cdf.m.comp_fwd_ref[0][ctx1]))
                 {
                     const int ctx2 = av1_get_fwd_ref_2_ctx(t->a, &t->l, by4, bx4,
                                                            have_top, have_left);
-                    b->ref[0] = 2 + msac_decode_bool_adapt(&ts->msac,
-                                            ts->cdf.m.comp_fwd_ref[2][ctx2]);
+                    b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.comp_fwd_ref[2][ctx2]);
                 } else {
                     const int ctx2 = av1_get_fwd_ref_1_ctx(t->a, &t->l, by4, bx4,
                                                            have_top, have_left);
-                    b->ref[0] = msac_decode_bool_adapt(&ts->msac,
-                                            ts->cdf.m.comp_fwd_ref[1][ctx2]);
+                    b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                    ts->cdf.m.comp_fwd_ref[1][ctx2]);
                 }
 
                 // second reference (bw)
                 const int ctx3 = av1_get_bwd_ref_ctx(t->a, &t->l, by4, bx4,
                                                      have_top, have_left);
-                if (msac_decode_bool_adapt(&ts->msac,
-                                           ts->cdf.m.comp_bwd_ref[0][ctx3]))
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                        ts->cdf.m.comp_bwd_ref[0][ctx3]))
                 {
                     b->ref[1] = 6;
                 } else {
                     const int ctx4 = av1_get_bwd_ref_1_ctx(t->a, &t->l, by4, bx4,
                                                            have_top, have_left);
-                    b->ref[1] = 4 + msac_decode_bool_adapt(&ts->msac,
-                                           ts->cdf.m.comp_bwd_ref[1][ctx4]);
+                    b->ref[1] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.comp_bwd_ref[1][ctx4]);
                 }
             } else {
                 // unidir
                 const int uctx_p = av1_get_uni_p_ctx(t->a, &t->l, by4, bx4,
                                                      have_top, have_left);
-                if (msac_decode_bool_adapt(&ts->msac,
-                                           ts->cdf.m.comp_uni_ref[0][uctx_p]))
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                        ts->cdf.m.comp_uni_ref[0][uctx_p]))
                 {
                     b->ref[0] = 4;
                     b->ref[1] = 6;
                 } else {
                     const int uctx_p1 = av1_get_uni_p1_ctx(t->a, &t->l, by4, bx4,
                                                            have_top, have_left);
                     b->ref[0] = 0;
-                    b->ref[1] = 1 + msac_decode_bool_adapt(&ts->msac,
-                                           ts->cdf.m.comp_uni_ref[1][uctx_p1]);
+                    b->ref[1] = 1 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.comp_uni_ref[1][uctx_p1]);
                     if (b->ref[1] == 2) {
                         const int uctx_p2 = av1_get_uni_p2_ctx(t->a, &t->l, by4, bx4,
                                                                have_top, have_left);
-                        b->ref[1] += msac_decode_bool_adapt(&ts->msac,
-                                           ts->cdf.m.comp_uni_ref[2][uctx_p2]);
+                        b->ref[1] += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                         ts->cdf.m.comp_uni_ref[2][uctx_p2]);
                     }
                 }
             }
             if (DEBUG_BLOCK_INFO)
                 printf("Post-refs[%d/%d]: r=%d\n",
                        b->ref[0], b->ref[1], ts->msac.rng);
 
             candidate_mv mvstack[8];
             int n_mvs, ctx;
             mv mvlist[2][2];
             av1_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
                              (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
                              bs, bp, t->by, t->bx, ts->tiling.col_start,
                              ts->tiling.col_end, ts->tiling.row_start,
                              ts->tiling.row_end, f->libaom_cm);
 
-            b->inter_mode = msac_decode_symbol_adapt(&ts->msac,
-                                             ts->cdf.m.comp_inter_mode[ctx],
-                                             N_COMP_INTER_PRED_MODES);
+            b->inter_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                ts->cdf.m.comp_inter_mode[ctx],
+                                N_COMP_INTER_PRED_MODES);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
                        b->inter_mode, ctx, n_mvs, ts->msac.rng);
 
             const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
             b->drl_idx = 0;
             if (b->inter_mode == NEWMV_NEWMV) {
                 if (n_mvs > 1) {
                     const int drl_ctx_v1 = get_drl_context(mvstack, 0);
-                    b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                             ts->cdf.m.drl_bit[drl_ctx_v1]);
+                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
                     if (b->drl_idx == 1 && n_mvs > 2) {
                         const int drl_ctx_v2 = get_drl_context(mvstack, 1);
-                        b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                             ts->cdf.m.drl_bit[drl_ctx_v2]);
+                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
                     }
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
                                b->drl_idx, n_mvs, ts->msac.rng);
                 }
             } else if (im[0] == NEARMV || im[1] == NEARMV) {
                 b->drl_idx = 1;
                 if (n_mvs > 2) {
                     const int drl_ctx_v2 = get_drl_context(mvstack, 1);
-                    b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                             ts->cdf.m.drl_bit[drl_ctx_v2]);
+                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.drl_bit[drl_ctx_v2]);
                     if (b->drl_idx == 2 && n_mvs > 3) {
                         const int drl_ctx_v3 = get_drl_context(mvstack, 2);
-                        b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                             ts->cdf.m.drl_bit[drl_ctx_v3]);
+                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.drl_bit[drl_ctx_v3]);
                     }
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-drlidx[%d,n_mvs=%d]: r=%d\n",
                                b->drl_idx, n_mvs, ts->msac.rng);
                 }
             }
 
 #define assign_comp_mv(idx, pfx) \
@@ -1528,96 +1544,102 @@ static int decode_b(Dav1dTileContext *co
                        b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
                        ts->msac.rng);
 
             // jnt_comp vs. seg vs. wedge
             int is_segwedge = 0;
             if (f->seq_hdr->masked_compound) {
                 const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4);
 
-                is_segwedge = msac_decode_bool_adapt(&ts->msac,
-                                                 ts->cdf.m.mask_comp[mask_ctx]);
+                is_segwedge = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                  ts->cdf.m.mask_comp[mask_ctx]);
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n",
                            is_segwedge, mask_ctx, ts->msac.rng);
             }
 
             if (!is_segwedge) {
                 if (f->seq_hdr->jnt_comp) {
                     const int jnt_ctx =
                         get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits,
                                          f->cur.frame_hdr->frame_offset,
                                          f->refp[b->ref[0]].p.frame_hdr->frame_offset,
                                          f->refp[b->ref[1]].p.frame_hdr->frame_offset,
                                          t->a, &t->l, by4, bx4);
                     b->comp_type = COMP_INTER_WEIGHTED_AVG +
-                        msac_decode_bool_adapt(&ts->msac,
-                                               ts->cdf.m.jnt_comp[jnt_ctx]);
+                                   dav1d_msac_decode_bool_adapt(&ts->msac,
+                                       ts->cdf.m.jnt_comp[jnt_ctx]);
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n",
                                b->comp_type == COMP_INTER_AVG,
                                jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4],
                                t->l.comp_type[by4], t->l.ref[0][by4],
                                ts->msac.rng);
                 } else {
                     b->comp_type = COMP_INTER_AVG;
                 }
             } else {
                 if (wedge_allowed_mask & (1 << bs)) {
                     const int ctx = dav1d_wedge_ctx_lut[bs];
                     b->comp_type = COMP_INTER_WEDGE -
-                        msac_decode_bool_adapt(&ts->msac,
-                                               ts->cdf.m.wedge_comp[ctx]);
+                                   dav1d_msac_decode_bool_adapt(&ts->msac,
+                                       ts->cdf.m.wedge_comp[ctx]);
                     if (b->comp_type == COMP_INTER_WEDGE)
-                        b->wedge_idx = msac_decode_symbol_adapt(&ts->msac,
-                                                ts->cdf.m.wedge_idx[ctx], 16);
+                        b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                           ts->cdf.m.wedge_idx[ctx], 16);
                 } else {
                     b->comp_type = COMP_INTER_SEG;
                 }
-                b->mask_sign = msac_decode_bool(&ts->msac, EC_BOOL_EPROB);
+                b->mask_sign = dav1d_msac_decode_bool_equi(&ts->msac);
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
                            b->comp_type == COMP_INTER_WEDGE,
                            b->wedge_idx, b->mask_sign, ts->msac.rng);
             }
         } else {
             b->comp_type = COMP_INTER_NONE;
 
             // ref
             if (seg && seg->ref > 0) {
                 b->ref[0] = seg->ref - 1;
             } else if (seg && (seg->globalmv || seg->skip)) {
                 b->ref[0] = 0;
             } else {
                 const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
                                                  have_top, have_left);
-                if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[0][ctx1])) {
+                if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                 ts->cdf.m.ref[0][ctx1]))
+                {
                     const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
                                                        have_top, have_left);
-                    if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[1][ctx2])) {
+                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                     ts->cdf.m.ref[1][ctx2]))
+                    {
                         b->ref[0] = 6;
                     } else {
                         const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
                                                            have_top, have_left);
-                        b->ref[0] = 4 + msac_decode_bool_adapt(&ts->msac,
-                                                           ts->cdf.m.ref[5][ctx3]);
+                        b->ref[0] = 4 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                            ts->cdf.m.ref[5][ctx3]);
                     }
                 } else {
                     const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
                                                        have_top, have_left);
-                    if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[2][ctx2])) {
+                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                     ts->cdf.m.ref[2][ctx2]))
+                    {
                         const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
                                                            have_top, have_left);
-                        b->ref[0] = 2 + msac_decode_bool_adapt(&ts->msac,
-                                                           ts->cdf.m.ref[4][ctx3]);
+                        b->ref[0] = 2 + dav1d_msac_decode_bool_adapt(&ts->msac,
+                                            ts->cdf.m.ref[4][ctx3]);
                     } else {
                         const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
                                                            have_top, have_left);
-                        b->ref[0] = msac_decode_bool_adapt(&ts->msac,
-                                                           ts->cdf.m.ref[3][ctx3]);
+                        b->ref[0] = dav1d_msac_decode_bool_adapt(&ts->msac,
+                                        ts->cdf.m.ref[3][ctx3]);
                     }
                 }
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
             }
             b->ref[1] = -1;
 
             candidate_mv mvstack[8];
@@ -1626,44 +1648,45 @@ static int decode_b(Dav1dTileContext *co
             av1_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
                              (int[2]) { b->ref[0], -1 }, f->bw, f->bh, bs, bp,
                              t->by, t->bx, ts->tiling.col_start,
                              ts->tiling.col_end, ts->tiling.row_start,
                              ts->tiling.row_end, f->libaom_cm);
 
             // mode parsing and mv derivation from ref_mvs
             if ((seg && (seg->skip || seg->globalmv)) ||
-                msac_decode_bool_adapt(&ts->msac, ts->cdf.m.newmv_mode[ctx & 7]))
+                dav1d_msac_decode_bool_adapt(&ts->msac,
+                                             ts->cdf.m.newmv_mode[ctx & 7]))
             {
                 if ((seg && (seg->skip || seg->globalmv)) ||
-                    !msac_decode_bool_adapt(&ts->msac,
-                                        ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
+                    !dav1d_msac_decode_bool_adapt(&ts->msac,
+                         ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
                 {
                     b->inter_mode = GLOBALMV;
                     b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]],
                                           t->bx, t->by, bw4, bh4, f->frame_hdr);
                     fix_mv_precision(f->frame_hdr, &b->mv[0]);
                     has_subpel_filter = imin(bw4, bh4) == 1 ||
                         f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION;
                 } else {
                     has_subpel_filter = 1;
-                    if (msac_decode_bool_adapt(&ts->msac,
-                                       ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
+                    if (dav1d_msac_decode_bool_adapt(&ts->msac,
+                            ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
                     {
                         b->inter_mode = NEARMV;
                         b->drl_idx = 1;
                         if (n_mvs > 2) {
                             const int drl_ctx_v2 = get_drl_context(mvstack, 1);
-                            b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                                 ts->cdf.m.drl_bit[drl_ctx_v2]);
+                            b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                              ts->cdf.m.drl_bit[drl_ctx_v2]);
                             if (b->drl_idx == 2 && n_mvs > 3) {
                                 const int drl_ctx_v3 =
                                     get_drl_context(mvstack, 2);
-                                b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                                 ts->cdf.m.drl_bit[drl_ctx_v3]);
+                                b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                                  ts->cdf.m.drl_bit[drl_ctx_v3]);
                             }
                         }
                     } else {
                         b->inter_mode = NEARESTMV;
                         b->drl_idx = 0;
                     }
                     if (b->drl_idx >= 2) {
                         b->mv[0] = mvstack[b->drl_idx].this_mv;
@@ -1678,22 +1701,22 @@ static int decode_b(Dav1dTileContext *co
                            b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs,
                            ts->msac.rng);
             } else {
                 has_subpel_filter = 1;
                 b->inter_mode = NEWMV;
                 b->drl_idx = 0;
                 if (n_mvs > 1) {
                     const int drl_ctx_v1 = get_drl_context(mvstack, 0);
-                    b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                                 ts->cdf.m.drl_bit[drl_ctx_v1]);
+                    b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                      ts->cdf.m.drl_bit[drl_ctx_v1]);
                     if (b->drl_idx == 1 && n_mvs > 2) {
                         const int drl_ctx_v2 = get_drl_context(mvstack, 1);
-                        b->drl_idx += msac_decode_bool_adapt(&ts->msac,
-                                                 ts->cdf.m.drl_bit[drl_ctx_v2]);
+                        b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
+                                          ts->cdf.m.drl_bit[drl_ctx_v2]);
                     }
                 }
                 if (n_mvs > 1) {
                     b->mv[0] = mvstack[b->drl_idx].this_mv;
                 } else {
                     b->mv[0] = mvlist[0][0];
                     fix_mv_precision(f->frame_hdr, &b->mv[0]);
                 }
@@ -1706,28 +1729,29 @@ static int decode_b(Dav1dTileContext *co
                     printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
                            b->mv[0].y, b->mv[0].x, ts->msac.rng);
             }
 
             // interintra flags
             const int ii_sz_grp = dav1d_ymode_size_context[bs];
             if (f->seq_hdr->inter_intra &&
                 interintra_allowed_mask & (1 << bs) &&
-                msac_decode_bool_adapt(&ts->msac, ts->cdf.m.interintra[ii_sz_grp]))
+                dav1d_msac_decode_bool_adapt(&ts->msac,
+                                             ts->cdf.m.interintra[ii_sz_grp]))
             {
-                b->interintra_mode = msac_decode_symbol_adapt(&ts->msac,
-                                          ts->cdf.m.interintra_mode[ii_sz_grp],
-                                          N_INTER_INTRA_PRED_MODES);
+                b->interintra_mode = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                         ts->cdf.m.interintra_mode[ii_sz_grp],
+                                         N_INTER_INTRA_PRED_MODES);
                 const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
                 b->interintra_type = INTER_INTRA_BLEND +
-                    msac_decode_bool_adapt(&ts->msac,
-                                           ts->cdf.m.interintra_wedge[wedge_ctx]);
+                                     dav1d_msac_decode_bool_adapt(&ts->msac,
+                                         ts->cdf.m.interintra_wedge[wedge_ctx]);
                 if (b->interintra_type == INTER_INTRA_WEDGE)
-                    b->wedge_idx = msac_decode_symbol_adapt(&ts->msac,
-                                            ts->cdf.m.wedge_idx[wedge_ctx], 16);
+                    b->wedge_idx = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                       ts->cdf.m.wedge_idx[wedge_ctx], 16);
             } else {
                 b->interintra_type = INTER_INTRA_NONE;
             }
             if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra &&
                 interintra_allowed_mask & (1 << bs))
             {
                 printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n",
                        b->interintra_type, b->interintra_mode,
@@ -1749,18 +1773,19 @@ static int decode_b(Dav1dTileContext *co
                 uint64_t mask[2] = { 0, 0 };
                 find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
                                   have_left, have_top, b->ref[0], mask);
                 const int allow_warp = !f->svc[b->ref[0]][0].scale &&
                     !f->frame_hdr->force_integer_mv &&
                     f->frame_hdr->warp_motion && (mask[0] | mask[1]);
 
                 b->motion_mode = allow_warp ?
-                    msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.motion_mode[bs], 3) :
-                    msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
+                    dav1d_msac_decode_symbol_adapt(&ts->msac,
+                        ts->cdf.m.motion_mode[bs], 3) :
+                    dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
                 if (b->motion_mode == MM_WARP) {
                     has_subpel_filter = 0;
                     derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
 #define signabs(v) v < 0 ? '-' : ' ', abs(v)
                     if (DEBUG_BLOCK_INFO)
                         printf("[ %c%x %c%x %c%x\n  %c%x %c%x %c%x ]\n"
                                "alpha=%c%x, beta=%c%x, gamma=%c%x, delta=%c%x\n",
                                signabs(t->warpmv.matrix[0]),
@@ -1787,26 +1812,28 @@ static int decode_b(Dav1dTileContext *co
 
         // subpel filter
         enum Dav1dFilterMode filter[2];
         if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) {
             if (has_subpel_filter) {
                 const int comp = b->comp_type != COMP_INTER_NONE;
                 const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
                                                 by4, bx4);
-                filter[0] = msac_decode_symbol_adapt(&ts->msac,
-                    ts->cdf.m.filter[0][ctx1], DAV1D_N_SWITCHABLE_FILTERS);
+                filter[0] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                               ts->cdf.m.filter[0][ctx1],
+                               DAV1D_N_SWITCHABLE_FILTERS);
                 if (f->seq_hdr->dual_filter) {
                     const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
                                                     b->ref[0], by4, bx4);
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
                                filter[0], ctx1, ts->msac.rng);
-                    filter[1] = msac_decode_symbol_adapt(&ts->msac,
-                        ts->cdf.m.filter[1][ctx2], DAV1D_N_SWITCHABLE_FILTERS);
+                    filter[1] = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                                    ts->cdf.m.filter[1][ctx2],
+                                    DAV1D_N_SWITCHABLE_FILTERS);
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
                                filter[1], ctx2, ts->msac.rng);
                 } else {
                     filter[1] = filter[0];
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n",
                                filter[0], ctx1, ts->msac.rng);
@@ -1989,17 +2016,17 @@ static int decode_sb(Dav1dTileContext *c
 
     if (have_h_split && have_v_split) {
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
         } else {
             const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
                 bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
-            bp = msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
+            bp = dav1d_msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
             if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
                 (bp == PARTITION_V || bp == PARTITION_V4 ||
                  bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
             {
                 return 1;
             }
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
@@ -2160,17 +2187,18 @@ static int decode_sb(Dav1dTileContext *c
         default: assert(0);
         }
     } else if (have_h_split) {
         unsigned is_split;
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             is_split = b->bl != bl;
         } else {
-            is_split = msac_decode_bool(&t->ts->msac, gather_top_partition_prob(pc, bl) >> EC_PROB_SHIFT);
+            is_split = dav1d_msac_decode_bool(&t->ts->msac,
+                           gather_top_partition_prob(pc, bl));
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                        f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
                        is_split ? PARTITION_SPLIT : PARTITION_H, t->ts->msac.rng);
         }
 
         assert(bl < BL_8X8);
         if (is_split) {
@@ -2188,17 +2216,18 @@ static int decode_sb(Dav1dTileContext *c
         }
     } else {
         assert(have_v_split);
         unsigned is_split;
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             is_split = b->bl != bl;
         } else {
-            is_split = msac_decode_bool(&t->ts->msac, gather_left_partition_prob(pc, bl) >> EC_PROB_SHIFT);
+            is_split = dav1d_msac_decode_bool(&t->ts->msac,
+                           gather_left_partition_prob(pc, bl));
             if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
                 return 1;
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                        f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
                        is_split ? PARTITION_SPLIT : PARTITION_V, t->ts->msac.rng);
         }
 
@@ -2270,17 +2299,17 @@ static void setup_tile(Dav1dTileState *c
     const int sb_shift = f->sb_shift;
 
     ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];
     ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];
     dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
     ts->last_qidx = f->frame_hdr->quant.yac;
     memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
 
-    msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
+    dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
 
     ts->tiling.row = tile_row;
     ts->tiling.col = tile_col;
     ts->tiling.col_start = col_sb_start << sb_shift;
     ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
     ts->tiling.row_start = row_sb_start << sb_shift;
     ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);
 
@@ -2331,79 +2360,67 @@ static void setup_tile(Dav1dTileState *c
 static void read_restoration_info(Dav1dTileContext *const t,
                                   Av1RestorationUnit *const lr, const int p,
                                   const enum Dav1dRestorationType frame_type)
 {
     const Dav1dFrameContext *const f = t->f;
     Dav1dTileState *const ts = t->ts;
 
     if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
-        const int filter =
-            msac_decode_symbol_adapt(&ts->msac,
-                                     ts->cdf.m.restore_switchable, 3);
+        const int filter = dav1d_msac_decode_symbol_adapt(&ts->msac,
+                               ts->cdf.m.restore_switchable, 3);
         lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
                                           DAV1D_RESTORATION_WIENER :
-                            DAV1D_RESTORATION_NONE;
+                                          DAV1D_RESTORATION_NONE;
     } else {
         const unsigned type =
-            msac_decode_bool_adapt(&ts->msac,
-                                   frame_type == DAV1D_RESTORATION_WIENER ?
-                                       ts->cdf.m.restore_wiener :
-                                       ts->cdf.m.restore_sgrproj);
+            dav1d_msac_decode_bool_adapt(&ts->msac,
+                frame_type == DAV1D_RESTORATION_WIENER ?
+                ts->cdf.m.restore_wiener : ts->cdf.m.restore_sgrproj);
         lr->type = type ? frame_type : DAV1D_RESTORATION_NONE;
     }
 
     if (lr->type == DAV1D_RESTORATION_WIENER) {
-        lr->filter_v[0] =
-            !p ? msac_decode_subexp(&ts->msac,
-                                    ts->lr_ref[p]->filter_v[0] + 5, 16,
-                                    1) - 5:
-                 0;
+        lr->filter_v[0] = p ? 0 :
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_v[0] + 5, 16, 1) - 5;
         lr->filter_v[1] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_v[1] + 23, 32,
-                               2) - 23;
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_v[1] + 23, 32, 2) - 23;
         lr->filter_v[2] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_v[2] + 17, 64,
-                               3) - 17;
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_v[2] + 17, 64, 3) - 17;
 
-        lr->filter_h[0] =
-            !p ? msac_decode_subexp(&ts->msac,
-                                    ts->lr_ref[p]->filter_h[0] + 5, 16,
-                                    1) - 5:
-                0;
+        lr->filter_h[0] = p ? 0 :
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_h[0] + 5, 16, 1) - 5;
         lr->filter_h[1] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_h[1] + 23, 32,
-                               2) - 23;
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_h[1] + 23, 32, 2) - 23;
         lr->filter_h[2] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_h[2] + 17, 64,
-                               3) - 17;
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->filter_h[2] + 17, 64, 3) - 17;
         memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
         ts->lr_ref[p] = lr;
         if (DEBUG_BLOCK_INFO)
             printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
                    p, lr->filter_v[0], lr->filter_v[1],
                    lr->filter_v[2], lr->filter_h[0],
                    lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
     } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
-        const unsigned idx = msac_decode_bools(&ts->msac, 4);
+        const unsigned idx = dav1d_msac_decode_bools(&ts->msac, 4);
         lr->sgr_idx = idx;
         lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->sgr_weights[0] + 96, 128,
-                               4) - 96 :
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->sgr_weights[0] + 96, 128, 4) - 96 :
             0;
         lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->sgr_weights[1] + 32, 128,
-                               4) - 32 :
-            iclip(128 - lr->sgr_weights[0], -32, 95);
+            dav1d_msac_decode_subexp(&ts->msac,
+                ts->lr_ref[p]->sgr_weights[1] + 32, 128, 4) - 32 :
+            95;
         memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
         memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
         ts->lr_ref[p] = lr;
         if (DEBUG_BLOCK_INFO)
             printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
                    p, lr->sgr_idx, lr->sgr_weights[0],
                    lr->sgr_weights[1], ts->msac.rng);
     }
@@ -2588,18 +2605,22 @@ int dav1d_decode_frame(Dav1dFrameContext
     if (f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows > f->n_ts) {
         f->ts = realloc(f->ts, f->frame_hdr->tiling.cols *
                                f->frame_hdr->tiling.rows * sizeof(*f->ts));
         if (!f->ts) goto error;
         for (int n = f->n_ts;
              n < f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; n++)
         {
             Dav1dTileState *const ts = &f->ts[n];
-            pthread_mutex_init(&ts->tile_thread.lock, NULL);
-            pthread_cond_init(&ts->tile_thread.cond, NULL);
+            if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error;
+            if (pthread_cond_init(&ts->tile_thread.cond, NULL)) {
+                pthread_mutex_destroy(&ts->tile_thread.lock);
+                goto error;
+            }
+            f->n_ts = n + 1;
         }
         if (c->n_fc > 1) {
             freep(&f->frame_thread.tile_start_off);
             f->frame_thread.tile_start_off =
                 malloc(sizeof(*f->frame_thread.tile_start_off) *
                        f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows);
             if (!f->frame_thread.tile_start_off) goto error;
         }
@@ -2657,38 +2678,39 @@ int dav1d_decode_frame(Dav1dFrameContext
 
         if (!lr_ptr) goto error;
 
         for (int pl = 0; pl <= 2; pl++) {
             f->lf.lr_lpf_line_ptr[pl] = lr_ptr;
             lr_ptr += lr_stride * 12;
         }
 
-        f->lf.lr_line_sz = lr_stride;
+        f->lf.lr_line_sz = (int) lr_stride;
     }
 
     // update allocation for loopfilter masks
     if (f->sb128w * f->sb128h > f->lf.mask_sz) {
         freep(&f->lf.mask);
         freep(&f->lf.level);
         freep(&f->frame_thread.b);
         f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));
         f->lf.level = malloc(f->sb128w * f->sb128h * 32 * 32 *
                              sizeof(*f->lf.level));
         if (!f->lf.mask || !f->lf.level) goto error;
         if (c->n_fc > 1) {
             freep(&f->frame_thread.b);
             freep(&f->frame_thread.cbi);
             dav1d_freep_aligned(&f->frame_thread.cf);
             dav1d_freep_aligned(&f->frame_thread.pal_idx);
-            freep(&f->frame_thread.pal);
+            dav1d_freep_aligned(&f->frame_thread.pal);
             f->frame_thread.b = malloc(sizeof(*f->frame_thread.b) *
                                        f->sb128w * f->sb128h * 32 * 32);
-            f->frame_thread.pal = malloc(sizeof(*f->frame_thread.pal) *
-                                         f->sb128w * f->sb128h * 16 * 16);
+            f->frame_thread.pal =
+                dav1d_alloc_aligned(sizeof(*f->frame_thread.pal) *
+                                    f->sb128w * f->sb128h * 16 * 16, 32);
             f->frame_thread.pal_idx =
                 dav1d_alloc_aligned(sizeof(*f->frame_thread.pal_idx) *
                                     f->sb128w * f->sb128h * 128 * 128 * 2, 32);
             f->frame_thread.cbi = malloc(sizeof(*f->frame_thread.cbi) *
                                          f->sb128w * f->sb128h * 32 * 32);
             f->frame_thread.cf =
                 dav1d_alloc_aligned(sizeof(int32_t) * 3 *
                                     f->sb128w * f->sb128h * 128 * 128, 32);
@@ -3084,17 +3106,17 @@ int dav1d_submit_frame(Dav1dContext *con
 #endif
 #if CONFIG_16BPC
         case 10:
         case 12:
             assign_bitdepth_case(16);
 #endif
 #undef assign_bitdepth_case
         default:
-            fprintf(stderr, "Compiled without support for %d-bit decoding\n",
+            dav1d_log(c, "Compiled without support for %d-bit decoding\n",
                     8 + 2 * f->seq_hdr->hbd);
             res = -ENOPROTOOPT;
             goto error;
         }
     }
 
 #define assign_bitdepth_case(bd) \
         f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
@@ -3167,33 +3189,34 @@ int dav1d_submit_frame(Dav1dContext *con
         dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
     }
     if (f->frame_hdr->refresh_context) {
         res = dav1d_cdf_thread_alloc(&f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
         if (res < 0) goto error;
     }
 
     // FIXME qsort so tiles are in order (for frame threading)
+    if (f->n_tile_data_alloc < c->n_tile_data) {
+        freep(&f->tile);
+        assert(c->n_tile_data < INT_MAX / (int)sizeof(*f->tile));
+        f->tile = malloc(c->n_tile_data * sizeof(*f->tile));
+        if (!f->tile) goto error;
+        f->n_tile_data_alloc = c->n_tile_data;
+    }
     memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
     memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
     f->n_tile_data = c->n_tile_data;
     c->n_tile_data = 0;
 
     // allocate frame
-    res = dav1d_thread_picture_alloc(&f->sr_cur, f->frame_hdr->width[1],
-                                     f->frame_hdr->height,
-                                     f->seq_hdr, f->seq_hdr_ref,
-                                     f->frame_hdr, f->frame_hdr_ref,
-                                     bpc, &f->tile[0].data.m,
-                                     c->n_fc > 1 ? &f->frame_thread.td : NULL,
-                                     f->frame_hdr->show_frame, &c->allocator);
+    res = dav1d_thread_picture_alloc(c, f, bpc);
     if (res < 0) goto error;
 
     if (f->frame_hdr->super_res.enabled) {
-        res = dav1d_picture_alloc_copy(&f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
+        res = dav1d_picture_alloc_copy(c, &f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
         if (res < 0) goto error;
     } else {
         dav1d_picture_ref(&f->cur, &f->sr_cur.p);
     }
 
     if (f->frame_hdr->super_res.enabled) {
         f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
         const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
--- a/third_party/dav1d/src/decode.h
+++ b/third_party/dav1d/src/decode.h
@@ -20,16 +20,16 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_DECODE_H__
-#define __DAV1D_SRC_DECODE_H__
+#ifndef DAV1D_SRC_DECODE_H
+#define DAV1D_SRC_DECODE_H
 
 #include "src/internal.h"
 
 int dav1d_submit_frame(Dav1dContext *c);
 
-#endif /* __DAV1D_SRC_DECODE_H__ */
+#endif /* DAV1D_SRC_DECODE_H */
--- a/third_party/dav1d/src/dequant_tables.h
+++ b/third_party/dav1d/src/dequant_tables.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_DEQUANT_TABLES_H__
-#define __DAV1D_SRC_DEQUANT_TABLES_H__
+#ifndef DAV1D_SRC_DEQUANT_TABLES_H
+#define DAV1D_SRC_DEQUANT_TABLES_H
 
 #include <stdint.h>
 
 #include "src/levels.h"
 
 extern const uint16_t dav1d_dq_tbl[][QINDEX_RANGE][2];
 
-#endif /* __DAV1D_SRC_DEQUANT_TABLES_H__ */
+#endif /* DAV1D_SRC_DEQUANT_TABLES_H */
--- a/third_party/dav1d/src/env.h
+++ b/third_party/dav1d/src/env.h
@@ -20,18 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_SRC_ENV_H__
-#define __DAV1D_SRC_ENV_H__
+#ifndef DAV1D_SRC_ENV_H
+#define DAV1D_SRC_ENV_H
 
 #include <assert.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 
 #include "src/levels.h"
 #include "src/ref_mvs.h"
@@ -597,18 +597,18 @@ static inline int get_coef_skip_ctx(cons
         case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
         case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
         case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
         case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32);
         case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64);
         }
 #undef MERGE_CTX
 
-        const int max = imin(la | ll, 4);
-        const int min = imin(imin(la, ll), 4);
+        const int max = imin((int) (la | ll), 4);
+        const int min = imin(imin((int) la, (int) ll), 4);
 
         return skip_contexts[min][max];
     }
 }
 
 static inline int get_coef_nz_ctx(uint8_t *const levels, const int scan_idx,
                                   const int rc, const int is_eob,
                                   const enum RectTxfmSize tx,
@@ -749,9 +749,9 @@ static inline mv get_gmv_2d(const Dav1dW
             .y = gmv->matrix[0] >> 13,
             .x = gmv->matrix[1] >> 13,
         };
     case DAV1D_WM_TYPE_IDENTITY:
         return (mv) { .x = 0, .y = 0 };
     }
 }
 
-#endif /* __DAV1D_SRC_ENV_H__ */
+#endif /* DAV1D_SRC_ENV_H */
--- a/third_party/dav1d/src/ext/x86/x86inc.asm
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -1,12 +1,12 @@
 ;*****************************************************************************
 ;* x86inc.asm: x264asm abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2018 x264 project
+;* Copyright (C) 2005-2019 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
 ;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
 ;* purpose with or without fee is hereby granted, provided that the above
@@ -60,22 +60,29 @@
     %elifidn __OUTPUT_FORMAT__,x64
         %define WIN64  1
     %else
         %define UNIX64 1
     %endif
 %endif
 
 %define FORMAT_ELF 0
+%define FORMAT_MACHO 0
 %ifidn __OUTPUT_FORMAT__,elf
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf32
     %define FORMAT_ELF 1
 %elifidn __OUTPUT_FORMAT__,elf64
     %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,macho
+    %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho32
+    %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+    %define FORMAT_MACHO 1
 %endif
 
 %ifdef PREFIX
     %define mangle(x) _ %+ x
 %else
     %define mangle(x) x
 %endif
 
@@ -93,18 +100,22 @@
     %define PIC 1 ; always use PIC on x86-64
     default rel
 %elifidn __OUTPUT_FORMAT__,win32
     %define PIC 0 ; PIC isn't used on 32-bit Windows
 %elifndef PIC
     %define PIC 0
 %endif
 
+%define HAVE_PRIVATE_EXTERN 1
 %ifdef __NASM_VER__
     %use smartalign
+    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
+        %define HAVE_PRIVATE_EXTERN 0
+    %endif
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
 ; covers most of x264's asm.
 
 ; PROLOGUE:
@@ -707,32 +718,35 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, 
 %macro cglobal 1-2+ "" ; name, [PROLOGUE args]
     cglobal_internal 1, %1 %+ SUFFIX, %2
 %endmacro
 %macro cvisible 1-2+ "" ; name, [PROLOGUE args]
     cglobal_internal 0, %1 %+ SUFFIX, %2
 %endmacro
 %macro cglobal_internal 2-3+
     annotate_function_size
-    %if %1
-        %xdefine %%FUNCTION_PREFIX private_prefix
-        %xdefine %%VISIBILITY hidden
-    %else
-        %xdefine %%FUNCTION_PREFIX public_prefix
-        %xdefine %%VISIBILITY
-    %endif
     %ifndef cglobaled_%2
-        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+        %if %1
+            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
+        %else
+            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
+        %endif
         %xdefine %2.skip_prologue %2 %+ .skip_prologue
         CAT_XDEFINE cglobaled_, %2, 1
     %endif
     %xdefine current_function %2
     %xdefine current_function_section __SECT__
     %if FORMAT_ELF
-        global %2:function %%VISIBILITY
+        %if %1
+            global %2:function hidden
+        %else
+            global %2:function
+        %endif
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
+        global %2:private_extern
     %else
         global %2
     %endif
     align function_align
     %2:
     RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
     %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
     %assign stack_offset 0      ; stack pointer offset relative to the return address
@@ -743,16 +757,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, 
         PROLOGUE %3
     %endif
 %endmacro
 
 ; Create a global symbol from a local label with the correct name mangling and type
 %macro cglobal_label 1
     %if FORMAT_ELF
         global current_function %+ %1:function hidden
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global current_function %+ %1:private_extern
     %else
         global current_function %+ %1
     %endif
     %1:
 %endmacro
 
 %macro cextern 1
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
@@ -768,16 +784,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, 
     CAT_XDEFINE cglobaled_, %1, 1
     extern %1
 %endmacro
 
 %macro const 1-2+
     %xdefine %1 mangle(private_prefix %+ _ %+ %1)
     %if FORMAT_ELF
         global %1:data hidden
+    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
+        global %1:private_extern
     %else
         global %1
     %endif
     %1: %2
 %endmacro
 
 ; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
 %if FORMAT_ELF
@@ -812,29 +830,30 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, 
 %assign cpuflags_sse2     (1<<5) | cpuflags_sse
 %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
 %assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
 %assign cpuflags_sse3     (1<<8) | cpuflags_sse2
 %assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
 %assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
 %assign cpuflags_sse42    (1<<11)| cpuflags_sse4
 %assign cpuflags_aesni    (1<<12)| cpuflags_sse42
-%assign cpuflags_avx      (1<<13)| cpuflags_sse42
-%assign cpuflags_xop      (1<<14)| cpuflags_avx
-%assign cpuflags_fma4     (1<<15)| cpuflags_avx
-%assign cpuflags_fma3     (1<<16)| cpuflags_avx
-%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
-%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
+%assign cpuflags_avx      (1<<14)| cpuflags_sse42
+%assign cpuflags_xop      (1<<15)| cpuflags_avx
+%assign cpuflags_fma4     (1<<16)| cpuflags_avx
+%assign cpuflags_fma3     (1<<17)| cpuflags_avx
+%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
+%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL
 
-%assign cpuflags_cache32  (1<<21)
-%assign cpuflags_cache64  (1<<22)
-%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
-%assign cpuflags_atom     (1<<24)
+%assign cpuflags_cache32  (1<<22)
+%assign cpuflags_cache64  (1<<23)
+%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<25)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
 %define notcpuflag(x) (cpuflag(x) ^ 1)
 
 ; Takes an arbitrary number of cpuflags from the above list.
 ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
 ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
@@ -1216,18 +1235,26 @@ INIT_XMM
         %if %0 >= 8+%4
             %assign __emulate_avx 1
         %endif
     %endif
     %ifnidn %2, fnord
         %ifdef cpuname
             %if notcpuflag(%2)
                 %error use of ``%1'' %2 instruction in cpuname function: current_function
-            %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+            %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2)
                 %error use of ``%1'' sse2 instruction in cpuname function: current_function
+            %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2)
+                %error use of ``%1'' avx2 instruction in cpuname function: current_function
+            %elifidn %1, pextrw ; special case because the base instruction is mmx2,
+                %ifnid %6       ; but sse4 is required for memory operands
+                    %if notcpuflag(sse4)
+                        %error use of ``%1'' sse4 instruction in cpuname function: current_function
+                    %endif
+                %endif
             %endif
         %endif
     %endif
 
     %if __emulate_avx
         %xdefine __src1 %7
         %xdefine __src2 %8
         %if %5 && %4 == 0
@@ -1379,82 +1406,85 @@ AVX_INSTR cmpordss, sse 1, 0, 0
 AVX_INSTR cmppd, sse2, 1, 1, 0
 AVX_INSTR cmpps, sse, 1, 1, 0
 AVX_INSTR cmpsd, sse2, 1, 1, 0
 AVX_INSTR cmpss, sse, 1, 1, 0
 AVX_INSTR cmpunordpd, sse2, 1, 0, 1
 AVX_INSTR cmpunordps, sse, 1, 0, 1
 AVX_INSTR cmpunordsd, sse2, 1, 0, 0
 AVX_INSTR cmpunordss, sse, 1, 0, 0
-AVX_INSTR comisd, sse2
-AVX_INSTR comiss, sse
-AVX_INSTR cvtdq2pd, sse2
-AVX_INSTR cvtdq2ps, sse2
-AVX_INSTR cvtpd2dq, sse2
-AVX_INSTR cvtpd2ps, sse2
-AVX_INSTR cvtps2dq, sse2
-AVX_INSTR cvtps2pd, sse2
-AVX_INSTR cvtsd2si, sse2
+AVX_INSTR comisd, sse2, 1
+AVX_INSTR comiss, sse, 1
+AVX_INSTR cvtdq2pd, sse2, 1
+AVX_INSTR cvtdq2ps, sse2, 1
+AVX_INSTR cvtpd2dq, sse2, 1
+AVX_INSTR cvtpd2ps, sse2, 1
+AVX_INSTR cvtps2dq, sse2, 1
+AVX_INSTR cvtps2pd, sse2, 1
+AVX_INSTR cvtsd2si, sse2, 1
 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0
 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0
 AVX_INSTR cvtsi2ss, sse, 1, 0, 0
 AVX_INSTR cvtss2sd, sse2, 1, 0, 0
-AVX_INSTR cvtss2si, sse
-AVX_INSTR cvttpd2dq, sse2
-AVX_INSTR cvttps2dq, sse2
-AVX_INSTR cvttsd2si, sse2
-AVX_INSTR cvttss2si, sse
+AVX_INSTR cvtss2si, sse, 1
+AVX_INSTR cvttpd2dq, sse2, 1
+AVX_INSTR cvttps2dq, sse2, 1
+AVX_INSTR cvttsd2si, sse2, 1
+AVX_INSTR cvttss2si, sse, 1
 AVX_INSTR divpd, sse2, 1, 0, 0
 AVX_INSTR divps, sse, 1, 0, 0
 AVX_INSTR divsd, sse2, 1, 0, 0
 AVX_INSTR divss, sse, 1, 0, 0
 AVX_INSTR dppd, sse4, 1, 1, 0
 AVX_INSTR dpps, sse4, 1, 1, 0
-AVX_INSTR extractps, sse4
+AVX_INSTR extractps, sse4, 1
+AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0
+AVX_INSTR gf2p8mulb, gfni, 0, 0, 0
 AVX_INSTR haddpd, sse3, 1, 0, 0
 AVX_INSTR haddps, sse3, 1, 0, 0
 AVX_INSTR hsubpd, sse3, 1, 0, 0
 AVX_INSTR hsubps, sse3, 1, 0, 0
 AVX_INSTR insertps, sse4, 1, 1,