Bug 1549915 - Import dav1d from upstream to a713643. r=TD-Linux
author Alex Chronopoulos <achronop@gmail.com>
Fri, 10 May 2019 05:20:18 +0000
changeset 532164 dd4958dfdea0a3d5bd2181500ed6d8958061e723
parent 532162 7c1fce459b7aa1c21e92331ce7ddbf05a7a49f58
child 532165 adb37249163ee7243b14417e5771faa8564c985e
push id 11265
push user ffxbld-merge
push date Mon, 13 May 2019 10:53:39 +0000
treeherder mozilla-beta@77e0fe8dbdd3 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers TD-Linux
bugs 1549915
milestone 68.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1549915 - Import dav1d from upstream to a713643. r=TD-Linux

Differential Revision: https://phabricator.services.mozilla.com/D30507
media/libdav1d/asm/moz.build
media/libdav1d/dav1d.rc
media/libdav1d/moz.yaml
media/libdav1d/vcs_version.h
media/libdav1d/version.h
third_party/dav1d/.gitlab-ci.yml
third_party/dav1d/NEWS
third_party/dav1d/include/dav1d/common.h
third_party/dav1d/include/dav1d/data.h
third_party/dav1d/include/dav1d/dav1d.h
third_party/dav1d/include/dav1d/picture.h
third_party/dav1d/meson.build
third_party/dav1d/src/arm/32/mc.S
third_party/dav1d/src/arm/64/mc.S
third_party/dav1d/src/arm/64/msac.S
third_party/dav1d/src/cdf.c
third_party/dav1d/src/data.c
third_party/dav1d/src/decode.c
third_party/dav1d/src/itx_tmpl.c
third_party/dav1d/src/lib.c
third_party/dav1d/src/meson.build
third_party/dav1d/src/msac.h
third_party/dav1d/src/obu.c
third_party/dav1d/src/picture.c
third_party/dav1d/src/picture.h
third_party/dav1d/src/recon_tmpl.c
third_party/dav1d/src/ref_mvs.c
third_party/dav1d/src/ref_mvs.h
third_party/dav1d/src/thread.h
third_party/dav1d/src/win32/thread.c
third_party/dav1d/src/x86/ipred_init_tmpl.c
third_party/dav1d/src/x86/ipred_ssse3.asm
third_party/dav1d/src/x86/itx_ssse3.asm
third_party/dav1d/tests/checkasm/itx.c
third_party/dav1d/tests/checkasm/msac.c
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c
third_party/dav1d/tests/meson.build
third_party/dav1d/tools/dav1d.c
third_party/dav1d/tools/input/input.c
third_party/dav1d/tools/output/output.c
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@@ -183,16 +183,17 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONF
 
     # BITDEPTH .S files
     if CONFIG['CPU_ARCH'] == 'aarch64':
         SOURCES += [
             '../../../third_party/dav1d/src/arm/64/cdef.S',
             '../../../third_party/dav1d/src/arm/64/loopfilter.S',
             '../../../third_party/dav1d/src/arm/64/looprestoration.S',
             '../../../third_party/dav1d/src/arm/64/mc.S',
+            '../../../third_party/dav1d/src/arm/64/msac.S',
         ]
     elif CONFIG['CPU_ARCH'] == 'arm':
         SOURCES += [
             '../../../third_party/dav1d/src/arm/32/looprestoration.S',
             '../../../third_party/dav1d/src/arm/32/mc.S',
         ]
 
 if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
--- a/media/libdav1d/dav1d.rc
+++ b/media/libdav1d/dav1d.rc
@@ -1,12 +1,12 @@
-#define API_VERSION_NUMBER 1,0,1,0
-#define API_VERSION_NUMBER_STR "1.0.1"
-#define PROJECT_VERSION_NUMBER 0,2,2,0
-#define PROJECT_VERSION_NUMBER_STR "0.2.2"
+#define API_VERSION_NUMBER 1,1,0,0
+#define API_VERSION_NUMBER_STR "1.1.0"
+#define PROJECT_VERSION_NUMBER 0,3,0,0
+#define PROJECT_VERSION_NUMBER_STR "0.3.0"
 
 #include <windows.h>
 
 1 VERSIONINFO
 FILETYPE VFT_DLL
 FILEOS VOS_NT_WINDOWS32
 PRODUCTVERSION PROJECT_VERSION_NUMBER
 FILEVERSION API_VERSION_NUMBER
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -15,15 +15,15 @@ origin:
   description: dav1d, a fast AV1 decoder
 
   # Full URL for the package's homepage/etc
   # Usually different from repository url
   url: https://code.videolan.org/videolan/dav1d
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit f8cac8c56b3e8afec0e356b297c373a352746a1b (2019-04-22T14:37:04.000Z).
+  release: commit a713643eadcf50c9f7fd2ea22a598127c959a723 (2019-05-09T07:52:54.000Z).
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
   # Multiple licenses can be specified (as a YAML list)
   # A "LICENSE" file must exist containing the full license text
   license: BSD-2-Clause
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "0.2.2-1-gf8cac8c"
+#define DAV1D_VERSION "0.3.0-13-ga713643"
--- a/media/libdav1d/version.h
+++ b/media/libdav1d/version.h
@@ -23,12 +23,12 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H
 
 #define DAV1D_API_VERSION_MAJOR 1
-#define DAV1D_API_VERSION_MINOR 0
-#define DAV1D_API_VERSION_PATCH 1
+#define DAV1D_API_VERSION_MINOR 1
+#define DAV1D_API_VERSION_PATCH 0
 
 #endif /* DAV1D_VERSION_H */
--- a/third_party/dav1d/.gitlab-ci.yml
+++ b/third_party/dav1d/.gitlab-ci.yml
@@ -5,19 +5,19 @@ stages:
 
 style-check:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: style
     tags:
         - debian
         - amd64
     script:
-        - git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
-        - git grep -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
-        - for i in $(git ls-files -- . ':(exclude)*/compat/*'); do
+        - git grep -I -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
+        - git grep -I -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
+        - git grep -I -l -z "" -- . ':(exclude)*/compat/*' | while IFS= read -r -d '' i; do
               if [ -n "$(tail -c 1 "$i")" ]; then
                   echo "No newline at end of $i";
                   exit 1;
               fi;
           done
         - git remote rm upstream 2> /dev/null || true
         - git remote add upstream https://code.videolan.org/videolan/dav1d.git
         - git fetch -q upstream master
@@ -50,16 +50,17 @@ build-debian-static:
     stage: build
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release --default-library static --werror
         - ninja -C build
         - cd build && meson test -v
+        - nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! grep -v " _*dav1d_")
 
 build-debian32:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181218135732
     stage: build
     tags:
         - debian
         - amd64
     script:
@@ -81,16 +82,17 @@ build-win32:
                       --werror
                       --libdir lib
                       --prefix "$(pwd)/build/dav1d_install"
                       --cross-file /opt/crossfiles/i686-w64-mingw32.meson
                       -Ddefault_library=both
         - ninja -C build
         - ninja -C build install
         - cd build && meson test -v
+        - i686-w64-mingw32-nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
 
 build-win32-unaligned-stack:
     image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533
@@ -119,16 +121,17 @@ build-win64:
                       --werror
                       --libdir lib
                       --prefix "$(pwd)/build/dav1d_install"
                       --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
                       -Ddefault_library=both
         - ninja -C build
         - ninja -C build install
         - cd build && meson test -v
+        - x86_64-w64-mingw32-nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
 
 build-win-arm32:
     image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533
@@ -139,32 +142,34 @@ build-win-arm32:
     script:
         - meson build --buildtype release
                       --werror
                       --libdir lib
                       --prefix "$(pwd)/build/dav1d_install"
                       --cross-file /opt/crossfiles/armv7-w64-mingw32.meson
                       -Ddefault_library=both
         - ninja -C build
+        - armv7-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
 
 build-win-arm64:
     image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190218133533
     stage: build
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release
                       --werror
                       --libdir lib
                       --prefix "$(pwd)/build/dav1d_install"
                       --cross-file /opt/crossfiles/aarch64-w64-mingw32.meson
                       -Ddefault_library=both
         - ninja -C build
         - ninja -C build install
+        - aarch64-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
 
 build-debian-aarch64:
     stage: build
@@ -265,16 +270,36 @@ test-debian:
         - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
         - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
         - git clone cache/dav1d-test-data.git tests/dav1d-test-data
         - meson build --buildtype release -Dtestdata_tests=true -Dlogging=false
         - ninja -C build
         - cd build && time meson test -v
     dependencies: []
 
+test-debian-unaligned-stack:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
+    stage: test
+    tags:
+        - debian
+        - amd64
+    cache:
+        key: testdata.git-20190215
+        paths:
+            - cache/dav1d-test-data.git/
+    script:
+        - test -d cache || mkdir cache
+        - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
+        - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
+        - git clone cache/dav1d-test-data.git tests/dav1d-test-data
+        - meson build --buildtype release -Dtestdata_tests=true -Dlogging=false -Dstack_alignment=16
+        - ninja -C build
+        - cd build && time meson test -v
+    dependencies: []
+
 test-debian-asan:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
     stage: test
     tags:
         - debian
         - amd64
     cache:
         key: testdata.git-20190215
--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@@ -1,10 +1,18 @@
-Changes for 0.2.2 'Antelope':
-----------------------------
+Changes for 0.3.0 'Sailfish':
+------------------------------
+
+This is the final release for the numerous speed improvements of 0.3.0-rc.
+It mostly:
+ - Fixes an annoying crash on SSSE3 that happened in the itx functions
+
+
+Changes for 0.2.2 (0.3.0-rc) 'Antelope':
+-----------------------------
 
  - Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
    The impact is important on SSSE3, SSE4 and AVX-2 cpus
  - SSSE3 optimizations for all blocks size in itx
- - SSSE3 optimizations for ipred_paeth and ipref_cfl (420, 422 and 444)
+ - SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
  - Speed improvements on CDEF for SSE4 CPUs
  - NEON optimizations for SGR and loop filter
  - Minor crashes, improvements and build changes
--- a/third_party/dav1d/include/dav1d/common.h
+++ b/third_party/dav1d/include/dav1d/common.h
@@ -23,16 +23,17 @@
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef DAV1D_COMMON_H
 #define DAV1D_COMMON_H
 
+#include <errno.h>
 #include <stddef.h>
 #include <stdint.h>
 
 #ifndef DAV1D_API
     #if defined _WIN32
       #if defined DAV1D_BUILDING_DLL
         #define DAV1D_API __declspec(dllexport)
       #else
@@ -42,16 +43,22 @@
       #if __GNUC__ >= 4
         #define DAV1D_API __attribute__ ((visibility ("default")))
       #else
         #define DAV1D_API
       #endif
     #endif
 #endif
 
+#if EPERM > 0
+#define DAV1D_ERR(e) (-(e)) ///< Negate POSIX error code.
+#else
+#define DAV1D_ERR(e) (e)
+#endif
+
 /**
  * A reference-counted object wrapper for a user-configurable pointer.
  */
 typedef struct Dav1dUserData {
     const uint8_t *data; ///< data pointer
     struct Dav1dRef *ref; ///< allocation origin
 } Dav1dUserData;
 
--- a/third_party/dav1d/include/dav1d/data.h
+++ b/third_party/dav1d/include/dav1d/data.h
@@ -57,17 +57,17 @@ DAV1D_API uint8_t * dav1d_data_create(Da
  * @param           buf The data to be wrapped.
  * @param            sz Size of the data.
  * @param free_callback Function to be called when we release our last
  *                      reference to this data. In this callback, $buf will be
  *                      the $buf argument to this function, and $cookie will
  *                      be the $cookie input argument to this function.
  * @param        cookie Opaque parameter passed to free_callback().
  *
- * @return 0 on success. A negative errno value on error.
+ * @return 0 on success. A negative DAV1D_ERR value on error.
  */
 DAV1D_API int dav1d_data_wrap(Dav1dData *data, const uint8_t *buf, size_t sz,
                               void (*free_callback)(const uint8_t *buf, void *cookie),
                               void *cookie);
 
 /**
  * Wrap a user-provided data pointer into a reference counted object.
  *
@@ -82,17 +82,17 @@ DAV1D_API int dav1d_data_wrap(Dav1dData 
  * @param     user_data The user data to be wrapped.
  * @param free_callback Function to be called when we release our last
  *                      reference to this data. In this callback, $user_data
  *                      will be the $user_data argument to this function, and
  *                      $cookie will be the $cookie input argument to this
  *                      function.
  * @param        cookie Opaque parameter passed to $free_callback.
  *
- * @return 0 on success. A negative errno value on error.
+ * @return 0 on success. A negative DAV1D_ERR value on error.
  */
 DAV1D_API int dav1d_data_wrap_user_data(Dav1dData *data,
                                         const uint8_t *user_data,
                                         void (*free_callback)(const uint8_t *user_data,
                                                               void *cookie),
                                         void *cookie);
 
 /**
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -85,28 +85,28 @@ DAV1D_API void dav1d_default_settings(Da
  *
  * @param c_out The decoder instance to open. *c_out will be set to the
  *              allocated context.
  * @param     s Input settings context.
  *
  * @note The context must be freed using dav1d_close() when decoding is
  *       finished.
  *
- * @return 0 on success, or < 0 (a negative errno code) on error.
+ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
  */
 DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
 
 /**
  * Parse a Sequence Header OBU from bitstream data.
  *
  * @param out Output Sequence Header.
- * @param buf The data to be parser.
+ * @param buf The data to be parsed.
  * @param sz  Size of the data.
  *
- * @return 0 on success, or < 0 (a negative errno code) on error.
+ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
  *
  * @note It is safe to feed this function data containing other OBUs than a
  *       Sequence Header, as they will simply be ignored. If there is more than
  *       one Sequence Header OBU present, only the last will be returned.
  */
 DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
                                           const uint8_t *buf, const size_t sz);
 
@@ -114,37 +114,37 @@ DAV1D_API int dav1d_parse_sequence_heade
  * Feed bitstream data to the decoder.
  *
  * @param   c Input decoder instance.
  * @param  in Input bitstream data. On success, ownership of the reference is
  *            passed to the library.
  *
  * @return
  *         0: Success, and the data was consumed.
- *   -EAGAIN: The data can't be consumed. dav1d_get_picture() should be called
- *            to get one or more frames before the function can consume new
- *            data.
- *   other negative errno codes: Error during decoding or because of invalid
- *                               passed-in arguments.
+ *  DAV1D_ERR(EAGAIN): The data can't be consumed. dav1d_get_picture() should
+ *                     be called to get one or more frames before the function
+ *                     can consume new data.
+ *  other negative DAV1D_ERR codes: Error during decoding or because of invalid
+ *                                  passed-in arguments.
  */
 DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
 
 /**
  * Return a decoded picture.
  *
  * @param   c Input decoder instance.
  * @param out Output frame. The caller assumes ownership of the returned
  *            reference.
  *
  * @return
  *         0: Success, and a frame is returned.
- *   -EAGAIN: Not enough data to output a frame. dav1d_send_data() should be
- *            called with new input.
- *   other negative errno codes: Error during decoding or because of invalid
- *                               passed-in arguments.
+ *  DAV1D_ERR(EAGAIN): Not enough data to output a frame. dav1d_send_data()
+ *                     should be called with new input.
+ *  other negative DAV1D_ERR codes: Error during decoding or because of invalid
+ *                                  passed-in arguments.
  *
  * @note To drain buffered frames from the decoder (i.e. on end of stream),
- *       call this function until it returns -EAGAIN.
+ *       call this function until it returns DAV1D_ERR(EAGAIN).
  *
  * @code{.c}
  *  Dav1dData data = { 0 };
  *  Dav1dPicture p = { 0 };
  *  int res;
--- a/third_party/dav1d/include/dav1d/picture.h
+++ b/third_party/dav1d/include/dav1d/picture.h
@@ -104,17 +104,17 @@ typedef struct Dav1dPicAllocator {
      *             stride[1].
      *             The allocator can fill the pic allocator_data pointer with
      *             a custom pointer that will be passed to
      *             release_picture_callback().
      * @param cookie Custom pointer passed to all calls.
      *
      * @note No fields other than data, stride and allocator_data must be filled
      *       by this callback.
-     * @return 0 on success. A negative errno value on error.
+     * @return 0 on success. A negative DAV1D_ERR value on error.
      */
     int (*alloc_picture_callback)(Dav1dPicture *pic, void *cookie);
     /**
      * Release the picture buffer.
      *
      * If frame threading is used, this function may be called by the main
      * thread (the thread which calls dav1d_get_picture()) or any of the frame
      * threads and thus must be thread-safe. If frame threading is not used,
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -18,24 +18,24 @@
 # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '0.2.2',
+    version: '0.3.0',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
                       'b_ndebug=if-release'],
     meson_version: '>= 0.47.0')
 
-dav1d_soname_version   = '1.0.1'
+dav1d_soname_version   = '1.1.0'
 dav1d_api_version_array    = dav1d_soname_version.split('.')
 dav1d_api_version_major    = dav1d_api_version_array[0]
 dav1d_api_version_minor    = dav1d_api_version_array[1]
 dav1d_api_version_revision = dav1d_api_version_array[2]
 
 dav1d_src_root = meson.current_source_dir()
 cc = meson.get_compiler('c')
 
--- a/third_party/dav1d/src/arm/32/mc.S
+++ b/third_party/dav1d/src/arm/32/mc.S
@@ -22,17 +22,17 @@
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
-#include "src/arm/32/util.S"
+#include "util.S"
 
 .macro avg dst0, dst1, t0, t1, t2, t3
         vld1.16         {\t0,\t1},   [r2, :128]!
         vld1.16         {\t2,\t3},   [r3, :128]!
         vadd.i16        \t0,   \t0,  \t2
         vadd.i16        \t1,   \t1,  \t3
         vqrshrun.s16    \dst0, \t0,  #5
         vqrshrun.s16    \dst1, \t1,  #5
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -22,17 +22,17 @@
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
-#include "src/arm/64/util.S"
+#include "util.S"
 
 .macro avg dst, t0, t1
         ld1             {\t0\().8h},   [x2],  16
         ld1             {\t1\().8h},   [x3],  16
         add             \t0\().8h,   \t0\().8h,   \t1\().8h
         sqrshrun        \dst\().8b,  \t0\().8h,   #5
 .endm
 
@@ -698,17 +698,17 @@ endfunc
         st1             {\r2\().8h, \r3\().8h}, [x8], \strd
 .endif
 .endm
 
 .macro make_8tap_fn op, type, type_h, type_v
 function \op\()_8tap_\type\()_8bpc_neon, export=1
         mov             x8,  \type_h
         mov             x9,  \type_v
-        b               \op\()_8tap\()_neon
+        b               \op\()_8tap_neon
 endfunc
 .endm
 
 // No spaces in these expressions, due to gas-preprocessor.
 #define REGULAR ((0*15<<7)|3*15)
 #define SMOOTH  ((1*15<<7)|4*15)
 #define SHARP   ((2*15<<7)|3*15)
 
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -0,0 +1,280 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0
+#define BUF_END 8
+#define DIF 16
+#define RNG 24
+#define CNT 28
+#define ALLOW_UPDATE_CDF 32
+
+const coeffs
+        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
+        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
+endconst
+
+const bits
+        .short   0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80
+        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
+endconst
+
+.macro ld1_n d0, d1, src, sz, n
+.if \n <= 8
+        ld1             {\d0\sz},  [\src]
+.else
+        ld1             {\d0\sz, \d1\sz},  [\src]
+.endif
+.endm
+
+.macro st1_n s0, s1, dst, sz, n
+.if \n <= 8
+        st1             {\s0\sz},  [\dst]
+.else
+        st1             {\s0\sz, \s1\sz},  [\dst]
+.endif
+.endm
+
+.macro ushr_n d0, d1, s0, s1, shift, sz, n
+        ushr            \d0\sz,  \s0\sz,  \shift
+.if \n == 16
+        ushr            \d1\sz,  \s1\sz,  \shift
+.endif
+.endm
+
+.macro add_n d0, d1, s0, s1, s2, s3, sz, n
+        add             \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        add             \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
+        sub             \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        sub             \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro and_n d0, d1, s0, s1, s2, s3, sz, n
+        and             \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        and             \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
+        cmhs            \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        cmhs            \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
+        urhadd          \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        urhadd          \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
+        sshl            \d0\sz,  \s0\sz,  \s2\sz
+.if \n == 16
+        sshl            \d1\sz,  \s1\sz,  \s3\sz
+.endif
+.endm
+
+.macro umull_n d0, d1, d2, d3, s0, s1, s2, s3, n
+        umull           \d0\().4s, \s0\().4h,  \s2\().4h
+.if \n >= 8
+        umull2          \d1\().4s, \s0\().8h,  \s2\().8h
+.endif
+.if \n == 16
+        umull           \d2\().4s, \s1\().4h,  \s3\().4h
+        umull2          \d3\().4s, \s1\().8h,  \s3\().8h
+.endif
+.endm
+
+.macro shrn_n d0, d1, s0, s1, s2, s3, shift, n
+        shrn            \d0\().4h,  \s0\().4s, \shift
+.if \n >= 8
+        shrn2           \d0\().8h,  \s1\().4s, \shift
+.endif
+.if \n == 16
+        shrn            \d1\().4h,  \s2\().4s, \shift
+        shrn2           \d1\().8h,  \s3\().4s, \shift
+.endif
+.endm
+
+.macro str_n            idx0, idx1, dstreg, dstoff, n
+        str             q\idx0,  [\dstreg, \dstoff]
+.if \n == 16
+        str             q\idx1,  [\dstreg, \dstoff + 16]
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+//                                               size_t n_symbols);
+
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update sz, szb, n
+        sub             sp,  sp,  #48
+        add             x8,  x0,  #RNG
+        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
+        ld1r            {v4\sz},  [x8]                            // rng
+        movrel          x9,  coeffs, 32
+        sub             x9,  x9,  x2, lsl #1
+        ushr_n          v2,  v3,  v0,  v1,  #6, \sz, \n           // cdf >> EC_PROB_SHIFT
+        str             h4,  [sp, #14]                            // store original u = s->rng
+        ushr            v4\sz,  v4\sz,  #8                        // r = rng >> 8
+
+        umull_n         v16, v17, v18, v19, v4,  v4,  v2,  v3, \n // r * (cdf >> EC_PROB_SHIFT)
+        ld1_n           v4,  v5,  x9,  \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
+        shrn_n          v2,  v3,  v16, v17, v18, v19, #1, \n      // v >>= 7 - EC_PROB_SHIFT
+        add             x8,  x0,  #DIF + 6
+
+        add_n           v4,  v5,  v2,  v3,  v4,  v5, \sz, \n      // v += EC_MIN_PROB * (n_symbols - ret)
+
+        ld1r            {v6.8h},  [x8]                            // dif >> (EC_WIN_SIZE - 16)
+        movrel          x8,  bits
+        str_n           4,   5,  sp, #16, \n                      // store v values to allow indexed access
+
+        ld1_n           v16, v17, x8,  .8h, \n
+
+        cmhs_n          v2,  v3,  v6,  v6,  v4,  v5,  .8h,  \n    // c >= v
+
+        and_n           v6,  v7,  v2,  v3,  v16, v17, .16b, \n    // One bit per halfword set in the mask
+.if \n == 16
+        add             v6.8h,  v6.8h,  v7.8h
+.endif
+        addv            h6,  v6.8h                                // Aggregate mask bits
+        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
+        umov            w3,  v6.h[0]
+        rbit            w3,  w3
+        clz             w15, w3                                   // ret
+
+        cbz             w4,  L(renorm)
+        // update_cdf
+        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
+        movi            v5\szb, #0xff
+        cmp             x2,  #4                                   // set C if n_symbols >= 4 (n_symbols > 3)
+        mov             w14, #4
+        lsr             w4,  w3,  #4                              // count >> 4
+        urhadd_n        v4,  v5,  v5,  v5,  v2,  v3,  \sz, \n     // i >= val ? -1 : 32768
+        adc             w4,  w4,  w14                             // (count >> 4) + (n_symbols > 3) + 4
+        neg             w4,  w4                                   // -rate
+        sub_n           v4,  v5,  v4,  v5,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
+        dup             v6.8h,    w4                              // -rate
+
+        sub             w3,  w3,  w3, lsr #5                      // count - (count >= 32)
+        sub_n           v0,  v1,  v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
+        sshl_n          v4,  v5,  v4,  v5,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
+        add             w3,  w3,  #1                              // count + (count < 32)
+        add_n           v0,  v1,  v0,  v1,  v4,  v5,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
+        st1_n           v0,  v1,  x1,  \sz, \n
+        strh            w3,  [x1, x2, lsl #1]
+.endm
+
+        decode_update   .4h, .8b, 4
+
+L(renorm):
+        add             x8,  sp,  #16
+        add             x8,  x8,  w15, uxtw #1
+        ldrh            w3,  [x8]              // v
+        ldurh           w4,  [x8, #-2]         // u
+        ldr             w6,  [x0, #CNT]
+        ldr             x7,  [x0, #DIF]
+        sub             w4,  w4,  w3           // rng = u - v
+        clz             w5,  w4                // clz(rng)
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        mvn             x7,  x7                // ~dif
+        add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
+        lsl             w4,  w4,  w5           // rng << d
+        subs            w6,  w6,  w5           // cnt -= d
+        lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
+        str             w4,  [x0, #RNG]
+        mvn             x7,  x7                // ~dif
+        b.ge            9f
+
+        // refill
+        ldr             x3,  [x0, #BUF_POS]
+        ldr             x4,  [x0, #BUF_END]
+        add             x5,  x3,  #8
+        cmp             x5,  x4
+        b.gt            2f
+
+        ldr             x3,  [x3]              // next_bits
+        add             w8,  w6,  #23          // shift_bits = cnt + 23
+        add             w6,  w6,  #16          // cnt += 16
+        rev             x3,  x3                // next_bits = bswap(next_bits)
+        sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             w8,  w8,  #24          // shift_bits &= 24
+        lsr             x3,  x3,  x8           // next_bits >>= shift_bits
+        sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
+        str             x5,  [x0, #BUF_POS]
+        lsl             x3,  x3,  x8           // next_bits <<= shift_bits
+        mov             w4,  #48
+        sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
+        eor             x7,  x7,  x3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        mov             w14, #40
+        sub             w5,  w14, w6           // c = 40 - cnt
+3:
+        cmp             x3,  x4
+        b.ge            4f
+        ldrb            w8,  [x3], #1
+        lsl             x8,  x8,  x5
+        eor             x7,  x7,  x8
+        subs            w5,  w5,  #8
+        b.ge            3b
+
+4:      // refill_eob_end
+        str             x3,  [x0, #BUF_POS]
+        sub             w6,  w14, w5           // cnt = 40 - c
+
+9:
+        str             w6,  [x0, #CNT]
+        str             x7,  [x0, #DIF]
+
+        mov             w0,  w15
+        add             sp,  sp,  #48
+        ret
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+        decode_update   .8h, .16b, 8
+        b               L(renorm)
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+        decode_update   .8h, .16b, 16
+        b               L(renorm)
+endfunc
--- a/third_party/dav1d/src/cdf.c
+++ b/third_party/dav1d/src/cdf.c
@@ -4176,17 +4176,17 @@ void dav1d_cdf_thread_copy(CdfContext *c
     }
 }
 
 int dav1d_cdf_thread_alloc(CdfThreadContext *const cdf,
                             struct thread_data *const t)
 {
     cdf->ref = dav1d_ref_create(sizeof(CdfContext) +
                                 (t != NULL) * sizeof(atomic_uint));
-    if (!cdf->ref) return -ENOMEM;
+    if (!cdf->ref) return DAV1D_ERR(ENOMEM); /* dav1d_ref_create() returned NULL */
     cdf->data.cdf = cdf->ref->data;
     if (t) {
         cdf->progress = (atomic_uint *) &cdf->data.cdf[1];
         atomic_init(cdf->progress, 0);
         cdf->t = t;
     }
     return 0;
 }
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -53,40 +53,40 @@ uint8_t *dav1d_data_create_internal(Dav1
 }
 
 int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
                              const size_t sz,
                              void (*const free_callback)(const uint8_t *data,
                                                          void *cookie),
                              void *const cookie)
 {
-    validate_input_or_ret(buf != NULL, -EINVAL);
-    validate_input_or_ret(ptr != NULL, -EINVAL);
-    validate_input_or_ret(free_callback != NULL, -EINVAL);
+    validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(ptr != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
 
     buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
-    if (!buf->ref) return -ENOMEM;
+    if (!buf->ref) return DAV1D_ERR(ENOMEM); /* dav1d_ref_wrap() returned NULL; buf->data/sz are left unset */
     buf->data = ptr;
     buf->sz = buf->m.size = sz;
     dav1d_data_props_set_defaults(&buf->m);
 
     return 0;
 }
 
 int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
                                        const uint8_t *const user_data,
                                        void (*const free_callback)(const uint8_t *user_data,
                                                                    void *cookie),
                                        void *const cookie)
 {
-    validate_input_or_ret(buf != NULL, -EINVAL);
-    validate_input_or_ret(free_callback != NULL, -EINVAL);
+    validate_input_or_ret(buf != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(free_callback != NULL, DAV1D_ERR(EINVAL));
 
     buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie);
-    if (!buf->m.user_data.ref) return -ENOMEM;
+    if (!buf->m.user_data.ref) return DAV1D_ERR(ENOMEM); /* dav1d_ref_wrap() returned NULL; user_data.data is left unset */
     buf->m.user_data.data = user_data;
 
     return 0;
 }
 
 
 void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
     validate_input(dst != NULL);
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -226,17 +226,17 @@ static void find_matching_ref(const Dav1
     const ptrdiff_t b4_stride = f->b4_stride;
     const refmvs *const r = &f->mvs[t->by * b4_stride + t->bx];
     int count = 0;
     int have_topleft = have_top && have_left;
     int have_topright = imax(bw4, bh4) < 32 &&
                         have_top && t->bx + bw4 < t->ts->tiling.col_end &&
                         (intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT);
 
-#define bs(rp) dav1d_block_dimensions[sbtype_to_bs[(rp)->sb_type]]
+#define bs(rp) dav1d_block_dimensions[dav1d_sbtype_to_bs[(rp)->sb_type]]
 #define matches(rp) ((rp)->ref[0] == ref + 1 && (rp)->ref[1] == -1)
 
     if (have_top) {
         const refmvs *r2 = &r[-b4_stride];
         if (matches(r2)) {
             masks[0] |= 1;
             count = 1;
         }
@@ -1238,21 +1238,21 @@ static int decode_b(Dav1dTileContext *co
             splat_intraref(f->mvs, f->b4_stride, t->by, t->bx, bs,
                            y_mode_nofilt);
         }
     } else if (!(f->frame_hdr->frame_type & 1)) {
         // intra block copy
         candidate_mv mvstack[8];
         int n_mvs;
         mv mvlist[2][2];
-        av1_find_ref_mvs(mvstack, &n_mvs, mvlist, NULL,
-                         (int[2]) { -1, -1 }, f->bw, f->bh,
-                         bs, bp, t->by, t->bx, ts->tiling.col_start,
-                         ts->tiling.col_end, ts->tiling.row_start,
-                         ts->tiling.row_end, f->libaom_cm);
+        dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, NULL,
+                           (int[2]) { -1, -1 }, f->bw, f->bh,
+                           bs, bp, t->by, t->bx, ts->tiling.col_start,
+                           ts->tiling.col_end, ts->tiling.row_start,
+                           ts->tiling.row_end, f->libaom_cm);
 
         if (mvlist[0][0].y | mvlist[0][0].x)
             b->mv[0] = mvlist[0][0];
         else if (mvlist[0][1].y | mvlist[0][1].x)
             b->mv[0] = mvlist[0][1];
         else {
             if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
                 b->mv[0].y = 0;
@@ -1383,21 +1383,21 @@ static int decode_b(Dav1dTileContext *co
             b->comp_type = COMP_INTER_AVG;
             b->inter_mode = NEARESTMV_NEARESTMV;
             b->drl_idx = 0;
             has_subpel_filter = 0;
 
             candidate_mv mvstack[8];
             int n_mvs, ctx;
             mv mvlist[2][2];
-            av1_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
-                             (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
-                             bs, bp, t->by, t->bx, ts->tiling.col_start,
-                             ts->tiling.col_end, ts->tiling.row_start,
-                             ts->tiling.row_end, f->libaom_cm);
+            dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
+                               (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
+                               bs, bp, t->by, t->bx, ts->tiling.col_start,
+                               ts->tiling.col_end, ts->tiling.row_start,
+                               ts->tiling.row_end, f->libaom_cm);
 
             b->mv[0] = mvstack[0].this_mv;
             b->mv[1] = mvstack[0].comp_mv;
             fix_mv_precision(f->frame_hdr, &b->mv[0]);
             fix_mv_precision(f->frame_hdr, &b->mv[1]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n",
                        b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
@@ -1463,21 +1463,21 @@ static int decode_b(Dav1dTileContext *co
             }
             if (DEBUG_BLOCK_INFO)
                 printf("Post-refs[%d/%d]: r=%d\n",
                        b->ref[0], b->ref[1], ts->msac.rng);
 
             candidate_mv mvstack[8];
             int n_mvs, ctx;
             mv mvlist[2][2];
-            av1_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
-                             (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
-                             bs, bp, t->by, t->bx, ts->tiling.col_start,
-                             ts->tiling.col_end, ts->tiling.row_start,
-                             ts->tiling.row_end, f->libaom_cm);
+            dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
+                               (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
+                               bs, bp, t->by, t->bx, ts->tiling.col_start,
+                               ts->tiling.col_end, ts->tiling.row_start,
+                               ts->tiling.row_end, f->libaom_cm);
 
             b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                 ts->cdf.m.comp_inter_mode[ctx],
                                 N_COMP_INTER_PRED_MODES);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
                        b->inter_mode, ctx, n_mvs, ts->msac.rng);
 
@@ -1640,21 +1640,21 @@ static int decode_b(Dav1dTileContext *co
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
             }
             b->ref[1] = -1;
 
             candidate_mv mvstack[8];
             int n_mvs, ctx;
             mv mvlist[2][2];
-            av1_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
-                             (int[2]) { b->ref[0], -1 }, f->bw, f->bh, bs, bp,
-                             t->by, t->bx, ts->tiling.col_start,
-                             ts->tiling.col_end, ts->tiling.row_start,
-                             ts->tiling.row_end, f->libaom_cm);
+            dav1d_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
+                               (int[2]) { b->ref[0], -1 }, f->bw, f->bh, bs, bp,
+                               t->by, t->bx, ts->tiling.col_start,
+                               ts->tiling.col_end, ts->tiling.row_start,
+                               ts->tiling.row_end, f->libaom_cm);
 
             // mode parsing and mv derivation from ref_mvs
             if ((seg && (seg->skip || seg->globalmv)) ||
                 dav1d_msac_decode_bool_adapt(&ts->msac,
                                              ts->cdf.m.newmv_mode[ctx & 7]))
             {
                 if ((seg && (seg->skip || seg->globalmv)) ||
                     !dav1d_msac_decode_bool_adapt(&ts->msac,
@@ -2458,19 +2458,19 @@ int dav1d_decode_tile_sbrow(Dav1dTileCon
 
     if (c->n_fc > 1 && f->frame_hdr->use_ref_frame_mvs) {
         for (int n = 0; n < 7; n++)
             if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step),
                                           PLANE_TYPE_BLOCK))
             {
                 return 1;
             }
-        av1_init_ref_mv_tile_row(f->libaom_cm,
-                                 ts->tiling.col_start, ts->tiling.col_end,
-                                 t->by, imin(t->by + sb_step, f->bh));
+        dav1d_init_ref_mv_tile_row(f->libaom_cm,
+                                   ts->tiling.col_start, ts->tiling.col_end,
+                                   t->by, imin(t->by + sb_step, f->bh));
     }
     memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
     const int sb128y = t->by >> 5;
     for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
          t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
          t->bx < ts->tiling.col_end; t->bx += sb_step)
     {
         if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
@@ -2561,17 +2561,17 @@ int dav1d_decode_tile_sbrow(Dav1dTileCon
     memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
            &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
 
     return 0;
 }
 
 int dav1d_decode_frame(Dav1dFrameContext *const f) {
     const Dav1dContext *const c = f->c;
-    int retval = -ENOMEM;
+    int retval = DAV1D_ERR(ENOMEM);
 
     if (f->n_tc > 1) {
         if (f->frame_hdr->tiling.cols * f->sbh > f->tile_thread.titsati_sz) {
             freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
             f->tile_thread.task_idx_to_sby_and_tile_idx =
                 malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) *
                        f->frame_hdr->tiling.cols * f->sbh);
             if (!f->tile_thread.task_idx_to_sby_and_tile_idx) goto error;
@@ -2760,30 +2760,30 @@ int dav1d_decode_frame(Dav1dFrameContext
                                      f->sb128h * 32 * f->frame_hdr->tiling.cols;
         f->lf.re_sz = f->sb128h * f->frame_hdr->tiling.cols;
     }
 
     // init ref mvs
     if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
         f->mvs = f->mvs_ref->data;
         const int order_hint_n_bits = f->seq_hdr->order_hint * f->seq_hdr->order_hint_n_bits;
-        const int ret = av1_init_ref_mv_common(f->libaom_cm, f->bw >> 1, f->bh >> 1,
-                                               f->b4_stride, f->seq_hdr->sb128,
-                                               f->mvs, f->ref_mvs,
-                                               f->cur.frame_hdr->frame_offset,
-                                               f->refpoc,
-                                               f->refrefpoc, f->frame_hdr->gmv,
-                                               f->frame_hdr->hp, f->frame_hdr->force_integer_mv,
-                                               f->frame_hdr->use_ref_frame_mvs,
-                                               order_hint_n_bits);
+        const int ret = dav1d_init_ref_mv_common(f->libaom_cm, f->bw >> 1, f->bh >> 1,
+                                                 f->b4_stride, f->seq_hdr->sb128,
+                                                 f->mvs, f->ref_mvs,
+                                                 f->cur.frame_hdr->frame_offset,
+                                                 f->refpoc,
+                                                 f->refrefpoc, f->frame_hdr->gmv,
+                                                 f->frame_hdr->hp, f->frame_hdr->force_integer_mv,
+                                                 f->frame_hdr->use_ref_frame_mvs,
+                                                 order_hint_n_bits);
         if (ret < 0) goto error;
         if (c->n_fc == 1 && f->frame_hdr->use_ref_frame_mvs)
-            av1_init_ref_mv_tile_row(f->libaom_cm, 0, f->bw, 0, f->bh);
+            dav1d_init_ref_mv_tile_row(f->libaom_cm, 0, f->bw, 0, f->bh);
     }
-    retval = -EINVAL;
+    retval = DAV1D_ERR(EINVAL);
 
     // setup dequant tables
     init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
     if (f->frame_hdr->quant.qm)
         for (int j = 0; j < N_RECT_TX_SIZES; j++) {
             f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j];
             f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j];
             f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j];
@@ -3110,17 +3110,17 @@ int dav1d_submit_frame(Dav1dContext *con
         case 10:
         case 12:
             assign_bitdepth_case(16);
 #endif
 #undef assign_bitdepth_case
         default:
             dav1d_log(c, "Compiled without support for %d-bit decoding\n",
                     8 + 2 * f->seq_hdr->hbd);
-            res = -ENOPROTOOPT;
+            res = DAV1D_ERR(ENOPROTOOPT);
             goto error;
         }
     }
 
 #define assign_bitdepth_case(bd) \
         f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
         f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
         f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
@@ -3137,33 +3137,33 @@ int dav1d_submit_frame(Dav1dContext *con
     }
 #undef assign_bitdepth_case
 
     int ref_coded_width[7];
     if (f->frame_hdr->frame_type & 1) {
         if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
             const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
             if (!c->refs[pri_ref].p.p.data[0]) {
-                res = -EINVAL;
+                res = DAV1D_ERR(EINVAL);
                 goto error;
             }
         }
         for (int i = 0; i < 7; i++) {
             const int refidx = f->frame_hdr->refidx[i];
             if (!c->refs[refidx].p.p.data[0] ||
                 f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w ||
                 f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h ||
                 f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 ||
                 f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 ||
                 f->seq_hdr->layout != c->refs[refidx].p.p.p.layout ||
                 bpc != c->refs[refidx].p.p.p.bpc)
             {
                 for (int j = 0; j < i; j++)
                     dav1d_thread_picture_unref(&f->refp[j]);
-                res = -EINVAL;
+                res = DAV1D_ERR(EINVAL);
                 goto error;
             }
             dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
             ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0];
             if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w ||
                 f->frame_hdr->height != c->refs[refidx].p.p.p.h)
             {
 #define scale_fac(ref_sz, this_sz) \
@@ -3250,17 +3250,17 @@ int dav1d_submit_frame(Dav1dContext *con
     f->b4_stride = (f->bw + 31) & ~31;
     f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
 
     // ref_mvs
     if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
         f->mvs_ref = dav1d_ref_create(f->sb128h * 32 * f->b4_stride *
                                       sizeof(*f->mvs));
         if (!f->mvs_ref) {
-            res = -ENOMEM;
+            res = DAV1D_ERR(ENOMEM);
             goto error;
         }
         f->mvs = f->mvs_ref->data;
         if (!f->frame_hdr->allow_intrabc) {
             for (int i = 0; i < 7; i++)
                 f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset;
         } else {
             memset(f->refpoc, 0, sizeof(f->refpoc));
@@ -3313,31 +3313,31 @@ int dav1d_submit_frame(Dav1dContext *con
         }
 
         if (f->frame_hdr->segmentation.update_map) {
             // We're updating an existing map, but need somewhere to
             // put the new values. Allocate them here (the data
             // actually gets set elsewhere)
             f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
             if (!f->cur_segmap_ref) {
-                res = -ENOMEM;
+                res = DAV1D_ERR(ENOMEM);
                 goto error;
             }
             f->cur_segmap = f->cur_segmap_ref->data;
         } else if (f->prev_segmap_ref) {
             // We're not updating an existing map, and we have a valid
             // reference. Use that.
             f->cur_segmap_ref = f->prev_segmap_ref;
             dav1d_ref_inc(f->cur_segmap_ref);
             f->cur_segmap = f->prev_segmap_ref->data;
         } else {
             // We need to make a new map. Allocate one here and zero it out.
             f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
             if (!f->cur_segmap_ref) {
-                res = -ENOMEM;
+                res = DAV1D_ERR(ENOMEM);
                 goto error;
             }
             f->cur_segmap = f->cur_segmap_ref->data;
             memset(f->cur_segmap_ref->data, 0, f->b4_stride * 32 * f->sb128h);
         }
     } else {
         f->cur_segmap = NULL;
         f->cur_segmap_ref = NULL;
--- a/third_party/dav1d/src/itx_tmpl.c
+++ b/third_party/dav1d/src/itx_tmpl.c
@@ -45,17 +45,17 @@ typedef void (*itx_1d_fn)(const coef *in
 static void NOINLINE
 inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
                coef *const coeff, const int eob,
                const int w, const int h, const int shift,
                const itx_1d_fn first_1d_fn, const itx_1d_fn second_1d_fn,
                const int has_dconly HIGHBD_DECL_SUFFIX)
 {
     int i, j;
-    assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
+    assert((h >= 4 && h <= 64) && (w >= 4 && w <= 64));
     const int is_rect2 = w * 2 == h || h * 2 == w;
     const int bitdepth = bitdepth_from_max(bitdepth_max);
     const int rnd = (1 << shift) >> 1;
 
     if (has_dconly && eob == 0) {
         int dc = coeff[0];
         coeff[0] = 0;
         if (is_rect2)
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@@ -56,44 +56,48 @@ const char *dav1d_version(void) {
     return DAV1D_VERSION;
 }
 
 void dav1d_default_settings(Dav1dSettings *const s) {
     s->n_frame_threads = 1;
     s->n_tile_threads = 1;
     s->apply_grain = 1;
     s->allocator.cookie = NULL;
-    s->allocator.alloc_picture_callback = default_picture_allocator;
-    s->allocator.release_picture_callback = default_picture_release;
+    s->allocator.alloc_picture_callback = dav1d_default_picture_alloc; /* dav1d_open() checks this is non-NULL */
+    s->allocator.release_picture_callback = dav1d_default_picture_release; /* dav1d_open() checks this is non-NULL */
     s->logger.cookie = NULL;
     s->logger.callback = dav1d_log_default_callback;
     s->operating_point = 0;
     s->all_layers = 1; // just until the tests are adjusted
 }
 
 static void close_internal(Dav1dContext **const c_out, int flush);
 
 int dav1d_open(Dav1dContext **const c_out,
                const Dav1dSettings *const s)
 {
     static pthread_once_t initted = PTHREAD_ONCE_INIT;
     pthread_once(&initted, init_internal);
 
-    validate_input_or_ret(c_out != NULL, -EINVAL);
-    validate_input_or_ret(s != NULL, -EINVAL);
+    validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
     validate_input_or_ret(s->n_tile_threads >= 1 &&
-                          s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, -EINVAL);
+                          s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
     validate_input_or_ret(s->n_frame_threads >= 1 &&
-                          s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, -EINVAL);
+                          s->n_frame_threads <= DAV1D_MAX_FRAME_THREADS, DAV1D_ERR(EINVAL));
     validate_input_or_ret(s->allocator.alloc_picture_callback != NULL,
-                          -EINVAL);
+                          DAV1D_ERR(EINVAL));
     validate_input_or_ret(s->allocator.release_picture_callback != NULL,
-                          -EINVAL);
+                          DAV1D_ERR(EINVAL));
     validate_input_or_ret(s->operating_point >= 0 &&
-                          s->operating_point <= 31, -EINVAL);
+                          s->operating_point <= 31, DAV1D_ERR(EINVAL));
+
+    pthread_attr_t thread_attr;
+    if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
+    pthread_attr_setstacksize(&thread_attr, 512 * 1024);
 
     Dav1dContext *const c = *c_out = dav1d_alloc_aligned(sizeof(*c), 32);
     if (!c) goto error;
     memset(c, 0, sizeof(*c));
 
     c->allocator = s->allocator;
     c->logger = s->logger;
     c->apply_grain = s->apply_grain;
@@ -146,65 +150,68 @@ int dav1d_open(Dav1dContext **const c_ou
             if (!t->emu_edge) goto error;
             if (f->n_tc > 1) {
                 if (pthread_mutex_init(&t->tile_thread.td.lock, NULL)) goto error;
                 if (pthread_cond_init(&t->tile_thread.td.cond, NULL)) {
                     pthread_mutex_destroy(&t->tile_thread.td.lock);
                     goto error;
                 }
                 t->tile_thread.fttd = &f->tile_thread;
-                if (pthread_create(&t->tile_thread.td.thread, NULL, dav1d_tile_task, t)) {
+                if (pthread_create(&t->tile_thread.td.thread, &thread_attr, dav1d_tile_task, t)) {
                     pthread_cond_destroy(&t->tile_thread.td.cond);
                     pthread_mutex_destroy(&t->tile_thread.td.lock);
                     goto error;
                 }
                 t->tile_thread.td.inited = 1;
             }
         }
-        f->libaom_cm = av1_alloc_ref_mv_common();
+        f->libaom_cm = dav1d_alloc_ref_mv_common();
         if (!f->libaom_cm) goto error;
         if (c->n_fc > 1) {
             if (pthread_mutex_init(&f->frame_thread.td.lock, NULL)) goto error;
             if (pthread_cond_init(&f->frame_thread.td.cond, NULL)) {
                 pthread_mutex_destroy(&f->frame_thread.td.lock);
                 goto error;
             }
-            if (pthread_create(&f->frame_thread.td.thread, NULL, dav1d_frame_task, f)) {
+            if (pthread_create(&f->frame_thread.td.thread, &thread_attr, dav1d_frame_task, f)) {
                 pthread_cond_destroy(&f->frame_thread.td.cond);
                 pthread_mutex_destroy(&f->frame_thread.td.lock);
                 goto error;
             }
             f->frame_thread.td.inited = 1;
         }
     }
 
     // intra edge tree
     c->intra_edge.root[BL_128X128] = &c->intra_edge.branch_sb128[0].node;
     dav1d_init_mode_tree(c->intra_edge.root[BL_128X128], c->intra_edge.tip_sb128, 1);
     c->intra_edge.root[BL_64X64] = &c->intra_edge.branch_sb64[0].node;
     dav1d_init_mode_tree(c->intra_edge.root[BL_64X64], c->intra_edge.tip_sb64, 0);
 
+    pthread_attr_destroy(&thread_attr);
+
     return 0;
 
 error:
     if (c) close_internal(c_out, 0);
-    return -ENOMEM;
+    pthread_attr_destroy(&thread_attr);
+    return DAV1D_ERR(ENOMEM);
 }
 
 static void dummy_free(const uint8_t *const data, void *const user_data) {
     assert(data && !user_data);
 }
 
 int dav1d_parse_sequence_header(Dav1dSequenceHeader *const out,
                                 const uint8_t *const ptr, const size_t sz)
 {
     Dav1dData buf = { 0 };
     int res;
 
-    validate_input_or_ret(out != NULL, -EINVAL);
+    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
 
     Dav1dSettings s;
     dav1d_default_settings(&s);
     s.logger.callback = NULL;
 
     Dav1dContext *c;
     res = dav1d_open(&c, &s);
     if (res < 0) return res;
@@ -219,40 +226,40 @@ int dav1d_parse_sequence_header(Dav1dSeq
         if (res < 0) goto error;
 
         assert((size_t)res <= buf.sz);
         buf.sz -= res;
         buf.data += res;
     }
 
     if (!c->seq_hdr) {
-        res = -EINVAL;
+        res = DAV1D_ERR(EINVAL);
         goto error;
     }
 
     memcpy(out, c->seq_hdr, sizeof(*out));
 
     res = 0;
 error:
     dav1d_data_unref_internal(&buf);
     dav1d_close(&c);
 
     return res;
 }
 
 int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
 {
-    validate_input_or_ret(c != NULL, -EINVAL);
-    validate_input_or_ret(in != NULL, -EINVAL);
-    validate_input_or_ret(in->data == NULL || in->sz, -EINVAL);
+    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(in != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(in->data == NULL || in->sz, DAV1D_ERR(EINVAL));
 
     c->drain = 0;
 
     if (c->in.data)
-        return -EAGAIN;
+        return DAV1D_ERR(EAGAIN); /* c->in still holds data; `in` is not moved, but c->drain was already reset */
     dav1d_data_move_ref(&c->in, in);
 
     return 0;
 }
 
 static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
                         Dav1dPicture *const in)
 {
@@ -331,32 +338,32 @@ static int drain_picture(Dav1dContext *c
             if (out_delayed->visible && progress != FRAME_ERROR)
                 dav1d_picture_ref(&c->out, &out_delayed->p);
             dav1d_thread_picture_unref(out_delayed);
             if (output_picture_ready(c))
                 return output_image(c, out, &c->out);
         }
     } while (++drain_count < c->n_fc);
 
-    return -EAGAIN;
+    return DAV1D_ERR(EAGAIN);
 }
 
 int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
 {
     int res;
 
-    validate_input_or_ret(c != NULL, -EINVAL);
-    validate_input_or_ret(out != NULL, -EINVAL);
+    validate_input_or_ret(c != NULL, DAV1D_ERR(EINVAL));
+    validate_input_or_ret(out != NULL, DAV1D_ERR(EINVAL));
 
     const int drain = c->drain;
     c->drain = 1;
 
     Dav1dData *const in = &c->in;
     if (!in->data) {
-        if (c->n_fc == 1) return -EAGAIN;
+        if (c->n_fc == 1) return DAV1D_ERR(EAGAIN);
         return drain_picture(c, out);
     }
 
     while (in->sz > 0) {
         res = dav1d_parse_obus(c, in, 0);
         if (res < 0) {
             dav1d_data_unref_internal(in);
         } else {
@@ -372,17 +379,17 @@ int dav1d_get_picture(Dav1dContext *cons
     }
 
     if (output_picture_ready(c))
         return output_image(c, out, &c->out);
 
     if (c->n_fc > 1 && drain)
         return drain_picture(c, out);
 
-    return -EAGAIN;
+    return DAV1D_ERR(EAGAIN);
 }
 
 void dav1d_flush(Dav1dContext *const c) {
     dav1d_data_unref_internal(&c->in);
     c->drain = 0;
 
     for (int i = 0; i < 8; i++) {
         if (c->refs[i].p.p.data[0])
@@ -497,17 +504,17 @@ static void close_internal(Dav1dContext 
         dav1d_free_aligned(f->tc);
         dav1d_free_aligned(f->ipred_edge[0]);
         free(f->a);
         free(f->tile);
         free(f->lf.mask);
         free(f->lf.lr_mask);
         free(f->lf.level);
         free(f->lf.tx_lpf_right_edge[0]);
-        if (f->libaom_cm) av1_free_ref_mv_common(f->libaom_cm);
+        if (f->libaom_cm) dav1d_free_ref_mv_common(f->libaom_cm);
         dav1d_free_aligned(f->lf.cdef_line);
         dav1d_free_aligned(f->lf.lr_lpf_line);
     }
     dav1d_free_aligned(c->fc);
     dav1d_data_unref_internal(&c->in);
     if (c->n_fc > 1 && c->frame_thread.out_delayed) {
         for (unsigned n = 0; n < c->n_fc; n++)
             if (c->frame_thread.out_delayed[n].p.data[0])
--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@@ -91,16 +91,17 @@ if is_asm_enabled
             'arm/mc_init_tmpl.c',
         )
         if host_machine.cpu_family() == 'aarch64'
             libdav1d_sources += files(
                 'arm/64/cdef.S',
                 'arm/64/loopfilter.S',
                 'arm/64/looprestoration.S',
                 'arm/64/mc.S',
+                'arm/64/msac.S',
             )
         elif host_machine.cpu_family().startswith('arm')
             libdav1d_sources += files(
                 'arm/32/looprestoration.S',
                 'arm/32/mc.S',
             )
         endif
     elif host_machine.cpu_family().startswith('x86')
--- a/third_party/dav1d/src/msac.h
+++ b/third_party/dav1d/src/msac.h
@@ -50,17 +50,27 @@ unsigned dav1d_msac_decode_symbol_adapt_
 unsigned dav1d_msac_decode_bool_equi(MsacContext *s);
 unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f);
 unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
 unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n);
 int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
 int dav1d_msac_decode_uniform(MsacContext *s, unsigned n);
 
 /* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
-#if ARCH_X86_64 && HAVE_ASM
+#if ARCH_AARCH64 && HAVE_ASM
+unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
+                                              size_t n_symbols);
+unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
+                                               size_t n_symbols);
+#define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_neon
+#define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_neon
+#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#elif ARCH_X86_64 && HAVE_ASM
 unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
                                               size_t n_symbols);
 unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
                                               size_t n_symbols);
 unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
                                                size_t n_symbols);
 #define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_sse2
 #define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_sse2
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@@ -279,17 +279,17 @@ static int parse_seq_hdr(Dav1dContext *c
     // We needn't bother flushing the OBU here: we'll check we didn't
     // overrun in the caller and will then discard gb, so there's no
     // point in setting its position properly.
 
     return 0;
 
 error:
     dav1d_log(c, "Error parsing sequence header\n");
-    return -EINVAL;
+    return DAV1D_ERR(EINVAL);
 }
 
 static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
                            const int use_ref)
 {
     const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
     Dav1dFrameHeader *const hdr = c->frame_hdr;
 
@@ -751,17 +751,17 @@ static int parse_frame_hdr(Dav1dContext 
                     hdr->segmentation.seg_data.preskip = 1;
                 }
             }
         } else {
             // segmentation.update_data was false so we should copy
             // segmentation data from the reference frame.
             assert(hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
             const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
-            if (!c->refs[pri_ref].p.p.frame_hdr) return -EINVAL;
+            if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
             hdr->segmentation.seg_data =
                 c->refs[pri_ref].p.p.frame_hdr->segmentation.seg_data;
         }
     } else {
         memset(&hdr->segmentation.seg_data, 0, sizeof(Dav1dSegmentationDataSet));
         for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++)
             hdr->segmentation.seg_data.d[i].ref = -1;
     }
@@ -813,17 +813,17 @@ static int parse_frame_hdr(Dav1dContext 
             hdr->loopfilter.level_v = dav1d_get_bits(gb, 6);
         }
         hdr->loopfilter.sharpness = dav1d_get_bits(gb, 3);
 
         if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
             hdr->loopfilter.mode_ref_deltas = default_mode_ref_deltas;
         } else {
             const int ref = hdr->refidx[hdr->primary_ref_frame];
-            if (!c->refs[ref].p.p.frame_hdr) return -EINVAL;
+            if (!c->refs[ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
             hdr->loopfilter.mode_ref_deltas =
                 c->refs[ref].p.p.frame_hdr->loopfilter.mode_ref_deltas;
         }
         hdr->loopfilter.mode_ref_delta_enabled = dav1d_get_bits(gb, 1);
         if (hdr->loopfilter.mode_ref_delta_enabled) {
             hdr->loopfilter.mode_ref_delta_update = dav1d_get_bits(gb, 1);
             if (hdr->loopfilter.mode_ref_delta_update) {
                 for (int i = 0; i < 8; i++)
@@ -917,17 +917,17 @@ static int parse_frame_hdr(Dav1dContext 
     hdr->skip_mode_allowed = 0;
     if (hdr->switchable_comp_refs && hdr->frame_type & 1 && seqhdr->order_hint) {
         const unsigned poc = hdr->frame_offset;
         unsigned off_before[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
         int off_after = -1;
         int off_before_idx[2], off_after_idx;
         off_before_idx[0] = 0;
         for (int i = 0; i < 7; i++) {
-            if (!c->refs[hdr->refidx[i]].p.p.data[0]) return -EINVAL;
+            if (!c->refs[hdr->refidx[i]].p.p.data[0]) return DAV1D_ERR(EINVAL);
             const unsigned refpoc = c->refs[hdr->refidx[i]].p.p.frame_hdr->frame_offset;
 
             const int diff = get_poc_diff(seqhdr->order_hint_n_bits, refpoc, poc);
             if (diff > 0) {
                 if (off_after == -1 || get_poc_diff(seqhdr->order_hint_n_bits,
                                                     off_after, refpoc) > 0)
                 {
                     off_after = refpoc;
@@ -994,17 +994,17 @@ static int parse_frame_hdr(Dav1dContext 
 
             if (hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY) continue;
 
             const Dav1dWarpedMotionParams *ref_gmv;
             if (hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
                 ref_gmv = &dav1d_default_wm_params;
             } else {
                 const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
-                if (!c->refs[pri_ref].p.p.frame_hdr) return -EINVAL;
+                if (!c->refs[pri_ref].p.p.frame_hdr) return DAV1D_ERR(EINVAL);
                 ref_gmv = &c->refs[pri_ref].p.p.frame_hdr->gmv[i];
             }
             int32_t *const mat = hdr->gmv[i].matrix;
             const int32_t *const ref_mat = ref_gmv->matrix;
             int bits, shift;
 
             if (hdr->gmv[i].type >= DAV1D_WM_TYPE_ROT_ZOOM) {
                 mat[2] = (1 << 16) + 2 *
@@ -1117,17 +1117,17 @@ static int parse_frame_hdr(Dav1dContext 
     printf("HDR: post-filmgrain: off=%ld\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
     return 0;
 
 error:
     dav1d_log(c, "Error parsing frame header\n");
-    return -EINVAL;
+    return DAV1D_ERR(EINVAL);
 }
 
 static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
     int have_tile_pos = 0;
     const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
     if (n_tiles > 1)
         have_tile_pos = dav1d_get_bits(gb, 1);
 
@@ -1222,26 +1222,26 @@ int dav1d_parse_obus(Dav1dContext *const
         const int in_spatial_layer = (c->operating_point_idc >> (spatial_id + 8)) & 1;
         if (!in_temporal_layer || !in_spatial_layer)
             return len + init_byte_pos;
     }
 
     switch (type) {
     case OBU_SEQ_HDR: {
         Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader));
-        if (!ref) return -ENOMEM;
+        if (!ref) return DAV1D_ERR(ENOMEM);
         Dav1dSequenceHeader *seq_hdr = ref->data;
         memset(seq_hdr, 0, sizeof(*seq_hdr));
         if ((res = parse_seq_hdr(c, &gb, seq_hdr)) < 0) {
             dav1d_ref_dec(&ref);
             return res;
         }
         if (check_for_overrun(c, &gb, init_bit_pos, len)) {
             dav1d_ref_dec(&ref);
-            return -EINVAL;
+            return DAV1D_ERR(EINVAL);
         }
         // If we have read a sequence header which is different from
         // the old one, this is a new video sequence and can't use any
         // previous state. Free that state.
         if (!c->seq_hdr)
             c->frame_hdr = NULL;
         // see 7.5, operating_parameter_info is allowed to change in
         // sequence headers of a single sequence
@@ -1268,17 +1268,17 @@ int dav1d_parse_obus(Dav1dContext *const
         if (c->frame_hdr) break;
         // fall-through
     case OBU_FRAME:
     case OBU_FRAME_HDR:
         if (global) break;
         if (!c->seq_hdr) goto error;
         if (!c->frame_hdr_ref) {
             c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
-            if (!c->frame_hdr_ref) return -ENOMEM;
+            if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
         }
         // ensure that the reference is writable
         assert(dav1d_ref_is_writable(c->frame_hdr_ref));
         c->frame_hdr = c->frame_hdr_ref->data;
         memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
         c->frame_hdr->temporal_id = temporal_id;
         c->frame_hdr->spatial_id = spatial_id;
         if ((res = parse_frame_hdr(c, &gb)) < 0) {
@@ -1290,17 +1290,17 @@ int dav1d_parse_obus(Dav1dContext *const
         c->n_tile_data = 0;
         c->n_tiles = 0;
         if (type != OBU_FRAME) {
             // This is actually a frame header OBU so read the
             // trailing bit and check for overrun.
             dav1d_get_bits(&gb, 1);
             if (check_for_overrun(c, &gb, init_bit_pos, len)) {
                 c->frame_hdr = NULL;
-                return -EINVAL;
+                return DAV1D_ERR(EINVAL);
             }
 
             break;
         }
         // OBU_FRAMEs shouldn't be signalled with show_existing_frame
         if (c->frame_hdr->show_existing_frame) {
             c->frame_hdr = NULL;
             goto error;
@@ -1321,17 +1321,17 @@ int dav1d_parse_obus(Dav1dContext *const
             c->tile = tile;
             memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
             c->n_tile_data_alloc = c->n_tile_data + 1;
         }
         parse_tile_hdr(c, &gb);
         // Align to the next byte boundary and check for overrun.
         dav1d_bytealign_get_bits(&gb);
         if (check_for_overrun(c, &gb, init_bit_pos, len))
-            return -EINVAL;
+            return DAV1D_ERR(EINVAL);
         // The current bit position is a multiple of 8 (because we
         // just aligned it) and less than 8*pkt_bytelen because
         // otherwise the overrun check would have fired.
         const unsigned bit_pos = dav1d_get_bits_pos(&gb);
         assert((bit_pos & 7) == 0);
         assert(pkt_bytelen >= (bit_pos >> 3));
         dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
         c->tile[c->n_tile_data].data.data += bit_pos >> 3;
@@ -1357,17 +1357,17 @@ int dav1d_parse_obus(Dav1dContext *const
         if (gb.error) goto error;
         Dav1dRef *ref;
         Dav1dContentLightLevel *content_light;
         Dav1dMasteringDisplay *mastering_display;
 
         switch (meta_type) {
         case OBU_META_HDR_CLL:
             ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
-            if (!ref) return -ENOMEM;
+            if (!ref) return DAV1D_ERR(ENOMEM);
             content_light = ref->data;
             memset(content_light, 0, sizeof(*content_light));
 
             content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
             content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
 
             // Skip the trailing bit, align to the next byte boundary and check for overrun.
             dav1d_get_bits(&gb, 1);
@@ -1378,17 +1378,17 @@ int dav1d_parse_obus(Dav1dContext *const
             }
 
             dav1d_ref_dec(&c->content_light_ref);
             c->content_light = content_light;
             c->content_light_ref = ref;
             break;
         case OBU_META_HDR_MDCV: {
             ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
-            if (!ref) return -ENOMEM;
+            if (!ref) return DAV1D_ERR(ENOMEM);
             mastering_display = ref->data;
             memset(mastering_display, 0, sizeof(*mastering_display));
 
             for (int i = 0; i < 3; i++) {
                 mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
                 mastering_display->primaries[i][1] = dav1d_get_bits(&gb, 16);
             }
             mastering_display->white_point[0] = dav1d_get_bits(&gb, 16);
@@ -1423,22 +1423,22 @@ int dav1d_parse_obus(Dav1dContext *const
         break;
     }
     case OBU_PADDING:
     case OBU_TD:
         // ignore OBUs we don't care about
         break;
     default:
         dav1d_log(c, "Unknown OBU type %d of size %u\n", type, len);
-        return -EINVAL;
+        return DAV1D_ERR(EINVAL);
     }
 
     if (c->seq_hdr && c->frame_hdr) {
         if (c->frame_hdr->show_existing_frame) {
-            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return -EINVAL;
+            if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return DAV1D_ERR(EINVAL);
             if (c->n_fc == 1) {
                 dav1d_picture_ref(&c->out,
                                   &c->refs[c->frame_hdr->existing_frame_idx].p.p);
                 dav1d_data_props_copy(&c->out.m, &in->m);
             } else {
                 // need to append this to the frame output queue
                 const unsigned next = c->frame_thread.next++;
                 if (c->frame_thread.next == c->n_fc)
@@ -1481,23 +1481,23 @@ int dav1d_parse_obus(Dav1dContext *const
                     if (c->refs[r].segmap)
                         dav1d_ref_inc(c->refs[r].segmap);
                     dav1d_ref_dec(&c->refs[i].refmvs);
                 }
             }
             c->frame_hdr = NULL;
         } else if (c->n_tiles == c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows) {
             if (!c->n_tile_data)
-                return -EINVAL;
+                return DAV1D_ERR(EINVAL);
             if ((res = dav1d_submit_frame(c)) < 0)
                 return res;
             assert(!c->n_tile_data);
             c->frame_hdr = NULL;
             c->n_tiles = 0;
         }
     }
 
     return len + init_byte_pos;
 
 error:
     dav1d_log(c, "Error parsing OBU data\n");
-    return -EINVAL;
+    return DAV1D_ERR(EINVAL);
 }
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@@ -40,48 +40,48 @@
 
 #include "src/internal.h"
 #include "src/log.h"
 #include "src/picture.h"
 #include "src/ref.h"
 #include "src/thread.h"
 #include "src/thread_task.h"
 
-int default_picture_allocator(Dav1dPicture *const p, void *cookie) {
+int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
     assert(cookie == NULL);
     const int hbd = p->p.bpc > 8;
     const int aligned_w = (p->p.w + 127) & ~127;
     const int aligned_h = (p->p.h + 127) & ~127;
     const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
     const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
     p->stride[0] = aligned_w << hbd;
     p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
     const size_t y_sz = p->stride[0] * aligned_h;
     const size_t uv_sz = p->stride[1] * (aligned_h >> ss_ver);
     const size_t pic_size = y_sz + 2 * uv_sz;
 
     uint8_t *data = dav1d_alloc_aligned(pic_size + DAV1D_PICTURE_ALIGNMENT,
                                         DAV1D_PICTURE_ALIGNMENT);
     if (data == NULL) {
-        return -ENOMEM;
+        return DAV1D_ERR(ENOMEM);
     }
 
     p->data[0] = data;
     p->data[1] = has_chroma ? data + y_sz : NULL;
     p->data[2] = has_chroma ? data + y_sz + uv_sz : NULL;
 
 #ifndef NDEBUG /* safety check */
     p->allocator_data = data;
 #endif
 
     return 0;
 }
 
-void default_picture_release(Dav1dPicture *const p, void *cookie) {
+void dav1d_default_picture_release(Dav1dPicture *const p, void *const cookie) {
     assert(cookie == NULL);
 #ifndef NDEBUG /* safety check */
     assert(p->allocator_data == p->data[0]);
 #endif
     dav1d_free_aligned(p->data[0]);
 }
 
 struct pic_ctx_context {
@@ -111,17 +111,17 @@ static int picture_alloc_with_edges(Dav1
     if (p->data[0]) {
         dav1d_log(c, "Picture already allocated!\n");
         return -1;
     }
     assert(bpc > 0 && bpc <= 16);
 
     struct pic_ctx_context *pic_ctx = malloc(extra + sizeof(struct pic_ctx_context));
     if (pic_ctx == NULL) {
-        return -ENOMEM;
+        return DAV1D_ERR(ENOMEM);
     }
 
     p->p.w = w;
     p->p.h = h;
     p->seq_hdr = seq_hdr;
     p->frame_hdr = frame_hdr;
     p->content_light = content_light;
     p->mastering_display = mastering_display;
@@ -136,17 +136,17 @@ static int picture_alloc_with_edges(Dav1
 
     pic_ctx->allocator = *p_allocator;
     pic_ctx->pic = *p;
 
     if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
         p_allocator->release_picture_callback(p, p_allocator->cookie);
         free(pic_ctx);
         dav1d_log(c, "Failed to wrap picture: %s\n", strerror(errno));
-        return -ENOMEM;
+        return DAV1D_ERR(ENOMEM);
     }
 
     p->seq_hdr_ref = seq_hdr_ref;
     if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
 
     p->frame_hdr_ref = frame_hdr_ref;
     if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
 
--- a/third_party/dav1d/src/picture.h
+++ b/third_party/dav1d/src/picture.h
@@ -100,13 +100,13 @@ int dav1d_thread_picture_wait(const Dav1
  * error to frames using this frame as reference frame.
  * plane_type denotes whether we have completed block data (pass 1;
  * PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no
  * 2-pass decoding; PLANE_TYPE_ALL).
  */
 void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
                                  enum PlaneType plane_type);
 
-int default_picture_allocator(Dav1dPicture *, void *cookie);
-void default_picture_release(Dav1dPicture *, void *cookie);
+int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
+void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
 void dav1d_picture_unref_internal(Dav1dPicture *p);
 
 #endif /* DAV1D_SRC_PICTURE_H */
--- a/third_party/dav1d/src/recon_tmpl.c
+++ b/third_party/dav1d/src/recon_tmpl.c
@@ -634,17 +634,17 @@ static int obmc(Dav1dTileContext *const 
 
     if (t->by > t->ts->tiling.row_start &&
         (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
     {
         for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
             // only odd blocks are considered for overlap handling, hence +1
             const refmvs *const a_r = &r[x - f->b4_stride + 1];
             const uint8_t *const a_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
+                dav1d_block_dimensions[dav1d_sbtype_to_bs[a_r->sb_type]];
 
             if (a_r->ref[0] > 0) {
                 const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
                 const int oh4 = imin(b_dim[1], 16) >> 1;
                 res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, oh4,
                          t->bx + x, t->by, pl, a_r->mv[0],
                          &f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1,
                          dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
@@ -657,17 +657,17 @@ static int obmc(Dav1dTileContext *const 
         }
     }
 
     if (t->bx > t->ts->tiling.col_start)
         for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
             // only odd blocks are considered for overlap handling, hence +1
             const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
             const uint8_t *const l_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
+                dav1d_block_dimensions[dav1d_sbtype_to_bs[l_r->sb_type]];
 
             if (l_r->ref[0] > 0) {
                 const int ow4 = imin(b_dim[0], 16) >> 1;
                 const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
                 res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
                          t->bx, t->by + y, pl, l_r->mv[0],
                          &f->refp[l_r->ref[0] - 1], l_r->ref[0] - 1,
                          dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
--- a/third_party/dav1d/src/ref_mvs.c
+++ b/third_party/dav1d/src/ref_mvs.c
@@ -50,16 +50,18 @@
 #include <assert.h>
 #include <errno.h>
 #include <limits.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
+#include "dav1d/common.h"
+
 #include "common/intops.h"
 
 #define av1_zero(a) memset(a, 0, sizeof(a))
 
 #define ATTRIBUTE_PACKED
 #define INLINE inline
 #define IMPLIES(a, b) (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
 
@@ -761,35 +763,16 @@ static INLINE int16_t av1_mode_context_a
   const int16_t refmv_ctx =
       (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
 
   const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
       newmv_ctx, COMP_NEWMV_CTXS - 1)];
   return comp_ctx;
 }
 
-static void av1_setup_frame_buf_refs(AV1_COMMON *cm);
-void av1_setup_frame_sign_bias(AV1_COMMON *cm);
-void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
-
-void av1_copy_frame_mvs(const AV1_COMMON *const cm, MB_MODE_INFO *mi,
-                        int mi_row, int mi_col, int x_mis, int y_mis);
-
-static void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
-                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
-                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
-                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
-                      int_mv *global_mvs, int mi_row, int mi_col,
-                      int16_t *mode_context);
-
-int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize);
-int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
-                int *pts, int *pts_inref);
-
 #define INTRABC_DELAY_PIXELS 256  //  Delay of 256 pixels
 #define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
 #define USE_WAVE_FRONT 1  // Use only top left area of frame for reference.
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
@@ -1845,17 +1828,17 @@ enum BlockSize {
     BS_8x8,
     BS_8x4,
     BS_4x16,
     BS_4x8,
     BS_4x4,
     N_BS_SIZES,
 };
 extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
-const uint8_t bs_to_sbtype[N_BS_SIZES] = {
+const uint8_t dav1d_bs_to_sbtype[N_BS_SIZES] = {
     [BS_128x128] = BLOCK_128X128,
     [BS_128x64] = BLOCK_128X64,
     [BS_64x128] = BLOCK_64X128,
     [BS_64x64] = BLOCK_64X64,
     [BS_64x32] = BLOCK_64X32,
     [BS_64x16] = BLOCK_64X16,
     [BS_32x64] = BLOCK_32X64,
     [BS_32x32] = BLOCK_32X32,
@@ -1869,17 +1852,17 @@ const uint8_t bs_to_sbtype[N_BS_SIZES] =
     [BS_8x32] = BLOCK_8X32,
     [BS_8x16] = BLOCK_8X16,
     [BS_8x8] = BLOCK_8X8,
     [BS_8x4] = BLOCK_8X4,
     [BS_4x16] = BLOCK_4X16,
     [BS_4x8] = BLOCK_4X8,
     [BS_4x4] = BLOCK_4X4,
 };
-const uint8_t sbtype_to_bs[BLOCK_SIZES_ALL] = {
+const uint8_t dav1d_sbtype_to_bs[BLOCK_SIZES_ALL] = {
     [BLOCK_128X128] = BS_128x128,
     [BLOCK_128X64] = BS_128x64,
     [BLOCK_64X128] = BS_64x128,
     [BLOCK_64X64] = BS_64x64,
     [BLOCK_64X32] = BS_64x32,
     [BLOCK_64X16] = BS_64x16,
     [BLOCK_32X64] = BS_32x64,
     [BLOCK_32X32] = BS_32x32,
@@ -1896,28 +1879,28 @@ const uint8_t sbtype_to_bs[BLOCK_SIZES_A
     [BLOCK_8X4] = BS_8x4,
     [BLOCK_4X16] = BS_4x16,
     [BLOCK_4X8] = BS_4x8,
     [BLOCK_4X4] = BS_4x4,
 };
 
 #include <stdio.h>
 
-void av1_find_ref_mvs(CANDIDATE_MV *mvstack, int *cnt, int_mv (*mvlist)[2],
-                      int *ctx, int refidx_dav1d[2],
-                      int w4, int h4, int bs, int bp, int by4, int bx4,
-                      int tile_col_start4, int tile_col_end4,
-                      int tile_row_start4, int tile_row_end4,
-                      AV1_COMMON *cm);
-void av1_find_ref_mvs(CANDIDATE_MV *mvstack, int *cnt, int_mv (*mvlist)[2],
-                      int *ctx, int refidx_dav1d[2],
-                      int w4, int h4, int bs, int bp, int by4, int bx4,
-                      int tile_col_start4, int tile_col_end4,
-                      int tile_row_start4, int tile_row_end4,
-                      AV1_COMMON *cm)
+void dav1d_find_ref_mvs(CANDIDATE_MV *mvstack, int *cnt, int_mv (*mvlist)[2],
+                        int *ctx, int refidx_dav1d[2],
+                        int w4, int h4, int bs, int bp, int by4, int bx4,
+                        int tile_col_start4, int tile_col_end4,
+                        int tile_row_start4, int tile_row_end4,
+                        AV1_COMMON *cm);
+void dav1d_find_ref_mvs(CANDIDATE_MV *mvstack, int *cnt, int_mv (*mvlist)[2],
+                        int *ctx, int refidx_dav1d[2],
+                        int w4, int h4, int bs, int bp, int by4, int bx4,
+                        int tile_col_start4, int tile_col_end4,
+                        int tile_row_start4, int tile_row_end4,
+                        AV1_COMMON *cm)
 {
     const int bw4 = dav1d_block_dimensions[bs][0];
     const int bh4 = dav1d_block_dimensions[bs][1];
     int stride = (int) cm->cur_frame.mv_stride;
     MACROBLOCKD xd = (MACROBLOCKD) {
         .n8_w = bw4,
         .n8_h = bh4,
         .mi_stride = stride,
@@ -1935,17 +1918,17 @@ void av1_find_ref_mvs(CANDIDATE_MV *mvst
         .mb_to_left_edge = -bx4 * 32,
         .mb_to_right_edge = (w4 - bw4 - bx4) * 32,
         .mb_to_top_edge = -by4 * 32,
         .is_sec_rect = 0,
         .cur_mi = {
             .partition = bp,
         },
     };
-    xd.mi->sb_type = bs_to_sbtype[bs];
+    xd.mi->sb_type = dav1d_bs_to_sbtype[bs];
     if (xd.n8_w < xd.n8_h) {
         // Only mark is_sec_rect as 1 for the last block.
         // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
         // For other partitions, it would be (0, 1).
         if (!((bx4 + xd.n8_w) & (xd.n8_h - 1))) xd.is_sec_rect = 1;
     }
 
     if (xd.n8_w > xd.n8_h)
@@ -1975,50 +1958,42 @@ void av1_find_ref_mvs(CANDIDATE_MV *mvst
     if (ctx) {
         if (refidx_dav1d[1] == -1)
             *ctx = single_context[refidx_dav1d[0] + 1];
         else
             *ctx = av1_mode_context_analyzer(single_context, rf);
     }
 }
 
-int av1_init_ref_mv_common(AV1_COMMON *cm,
-                           const int w8, const int h8,
-                           const ptrdiff_t stride,
-                           const int allow_sb128,
-                           MV_REF *cur,
-                           MV_REF *ref_mvs[7],
-                           const unsigned cur_poc,
-                           const unsigned ref_poc[7],
-                           const unsigned ref_ref_poc[7][7],
-                           const Dav1dWarpedMotionParams gmv[7],
-                           const int allow_hp,
-                           const int force_int_mv,
-                           const int allow_ref_frame_mvs,
-                           const int order_hint);
-int av1_init_ref_mv_common(AV1_COMMON *cm,
-                           const int w8, const int h8,
-                           const ptrdiff_t stride,
-                           const int allow_sb128,
-                           MV_REF *cur,
-                           MV_REF *ref_mvs[7],
-                           const unsigned cur_poc,
-                           const unsigned ref_poc[7],
-                           const unsigned ref_ref_poc[7][7],
-                           const Dav1dWarpedMotionParams gmv[7],
-                           const int allow_hp,
-                           const int force_int_mv,
-                           const int allow_ref_frame_mvs,
-                           const int order_hint)
+int dav1d_init_ref_mv_common(AV1_COMMON *cm, const int w8, const int h8,
+                             const ptrdiff_t stride, const int allow_sb128,
+                             MV_REF *cur, MV_REF *ref_mvs[7],
+                             const unsigned cur_poc,
+                             const unsigned ref_poc[7],
+                             const unsigned ref_ref_poc[7][7],
+                             const Dav1dWarpedMotionParams gmv[7],
+                             const int allow_hp, const int force_int_mv,
+                             const int allow_ref_frame_mvs,
+                             const int order_hint);
+int dav1d_init_ref_mv_common(AV1_COMMON *cm, const int w8, const int h8,
+                             const ptrdiff_t stride, const int allow_sb128,
+                             MV_REF *cur, MV_REF *ref_mvs[7],
+                             const unsigned cur_poc,
+                             const unsigned ref_poc[7],
+                             const unsigned ref_ref_poc[7][7],
+                             const Dav1dWarpedMotionParams gmv[7],
+                             const int allow_hp, const int force_int_mv,
+                             const int allow_ref_frame_mvs,
+                             const int order_hint)
 {
     if (cm->mi_cols != (w8 << 1) || cm->mi_rows != (h8 << 1)) {
         const int align_h = (h8 + 15) & ~15;
         if (cm->tpl_mvs) free(cm->tpl_mvs);
         cm->tpl_mvs = malloc(sizeof(*cm->tpl_mvs) * (stride >> 1) * align_h);
-        if (!cm->tpl_mvs) return -ENOMEM;
+        if (!cm->tpl_mvs) return DAV1D_ERR(ENOMEM);
         for (int i = 0; i < 7; i++)
             cm->frame_refs[i].idx = i;
         cm->mi_cols = w8 << 1;
         cm->mi_rows = h8 << 1;
         cm->mi_stride = (int) stride;
         for (int i = 0; i < 7; i++) {
             cm->buffer_pool.frame_bufs[i].mi_rows = cm->mi_rows;
             cm->buffer_pool.frame_bufs[i].mi_cols = cm->mi_cols;
@@ -2056,22 +2031,22 @@ int av1_init_ref_mv_common(AV1_COMMON *c
     }
     if (allow_ref_frame_mvs) {
         av1_setup_motion_field(cm);
     }
 
     return 0;
 }
 
-void av1_init_ref_mv_tile_row(AV1_COMMON *cm,
-                              int tile_col_start4, int tile_col_end4,
-                              int row_start4, int row_end4);
-void av1_init_ref_mv_tile_row(AV1_COMMON *cm,
-                              int tile_col_start4, int tile_col_end4,
-                              int row_start4, int row_end4)
+void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,
+                                int tile_col_start4, int tile_col_end4,
+                                int row_start4, int row_end4);
+void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,
+                                int tile_col_start4, int tile_col_end4,
+                                int row_start4, int row_end4)
 {
   RefCntBuffer *const frame_bufs = cm->buffer_pool.frame_bufs;
   const int cur_order_hint = cm->cur_frame.cur_frame_offset;
   int *const ref_buf_idx = cm->ref_buf_idx;
   int *const ref_order_hint = cm->ref_order_hint;
 
   int ref_stamp = MFMV_STACK_SIZE - 1;
 
@@ -2110,21 +2085,21 @@ void av1_init_ref_mv_tile_row(AV1_COMMON
                                   row_start4, row_end4)) --ref_stamp;
 
   if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0)
       if (motion_field_projection(cm, LAST2_FRAME, 2,
                                   tile_col_start4, tile_col_end4,
                                   row_start4, row_end4)) --ref_stamp;
 }
 
-AV1_COMMON *av1_alloc_ref_mv_common(void);
-AV1_COMMON *av1_alloc_ref_mv_common(void) {
+AV1_COMMON *dav1d_alloc_ref_mv_common(void);
+AV1_COMMON *dav1d_alloc_ref_mv_common(void) {
     AV1_COMMON *cm = malloc(sizeof(*cm));
     if (!cm) return NULL;
     memset(cm, 0, sizeof(*cm));
     return cm;
 }
 
-void av1_free_ref_mv_common(AV1_COMMON *cm);
-void av1_free_ref_mv_common(AV1_COMMON *cm) {
+void dav1d_free_ref_mv_common(AV1_COMMON *cm);
+void dav1d_free_ref_mv_common(AV1_COMMON *cm) {
     if (cm->tpl_mvs) free(cm->tpl_mvs);
     free(cm);
 }
--- a/third_party/dav1d/src/ref_mvs.h
+++ b/third_party/dav1d/src/ref_mvs.h
@@ -27,63 +27,60 @@ typedef struct candidate_mv {
     mv this_mv;
     mv comp_mv;
     int weight;
 } candidate_mv;
 
 typedef struct AV1_COMMON AV1_COMMON;
 
 // call once per frame thread
-AV1_COMMON *av1_alloc_ref_mv_common(void);
-void av1_free_ref_mv_common(AV1_COMMON *cm);
+AV1_COMMON *dav1d_alloc_ref_mv_common(void);
+void dav1d_free_ref_mv_common(AV1_COMMON *cm);
 
 // call once per frame
-int av1_init_ref_mv_common(AV1_COMMON *cm,
-                           int w8, int h8,
-                           ptrdiff_t stride,
-                           int allow_sb128,
-                           refmvs *cur,
-                           refmvs *ref_mvs[7],
-                           unsigned cur_poc,
-                           const unsigned ref_poc[7],
-                           const unsigned ref_ref_poc[7][7],
-                           const Dav1dWarpedMotionParams gmv[7],
-                           int allow_hp, int force_int_mv,
-                           int allow_ref_frame_mvs, int order_hint);
+int dav1d_init_ref_mv_common(AV1_COMMON *cm, int w8, int h8,
+                             ptrdiff_t stride, int allow_sb128,
+                             refmvs *cur, refmvs *ref_mvs[7],
+                             unsigned cur_poc,
+                             const unsigned ref_poc[7],
+                             const unsigned ref_ref_poc[7][7],
+                             const Dav1dWarpedMotionParams gmv[7],
+                             int allow_hp, int force_int_mv,
+                             int allow_ref_frame_mvs, int order_hint);
 
 // call for start of each sbrow per tile
-void av1_init_ref_mv_tile_row(AV1_COMMON *cm,
-                              int tile_col_start4, int tile_col_end4,
-                              int row_start4, int row_end4);
+void dav1d_init_ref_mv_tile_row(AV1_COMMON *cm,
+                                int tile_col_start4, int tile_col_end4,
+                                int row_start4, int row_end4);
 
 // call for each block
-void av1_find_ref_mvs(candidate_mv *mvstack, int *cnt, mv (*mvlist)[2],
-                      int *ctx, int refidx[2], int w4, int h4,
-                      enum BlockSize bs, enum BlockPartition bp,
-                      int by4, int bx4, int tile_col_start4,
-                      int tile_col_end4, int tile_row_start4,
-                      int tile_row_end4, AV1_COMMON *cm);
+void dav1d_find_ref_mvs(candidate_mv *mvstack, int *cnt, mv (*mvlist)[2],
+                        int *ctx, int refidx[2], int w4, int h4,
+                        enum BlockSize bs, enum BlockPartition bp,
+                        int by4, int bx4, int tile_col_start4,
+                        int tile_col_end4, int tile_row_start4,
+                        int tile_row_end4, AV1_COMMON *cm);
 
-extern const uint8_t bs_to_sbtype[];
-extern const uint8_t sbtype_to_bs[];
+extern const uint8_t dav1d_bs_to_sbtype[];
+extern const uint8_t dav1d_sbtype_to_bs[];
 static inline void splat_oneref_mv(refmvs *r, const ptrdiff_t stride,
                                    const int by4, const int bx4,
                                    const enum BlockSize bs,
                                    const enum InterPredMode mode,
                                    const int ref, const mv mv,
                                    const int is_interintra)
 {
     const int bw4 = dav1d_block_dimensions[bs][0];
     int bh4 = dav1d_block_dimensions[bs][1];
 
     r += by4 * stride + bx4;
     const refmvs tmpl = (refmvs) {
         .ref = { ref + 1, is_interintra ? 0 : -1 },
         .mv = { mv },
-        .sb_type = bs_to_sbtype[bs],
+        .sb_type = dav1d_bs_to_sbtype[bs],
         .mode = N_INTRA_PRED_MODES + mode,
     };
     do {
         for (int x = 0; x < bw4; x++)
             r[x] = tmpl;
         r += stride;
     } while (--bh4);
 }
@@ -94,17 +91,17 @@ static inline void splat_intrabc_mv(refm
 {
     const int bw4 = dav1d_block_dimensions[bs][0];
     int bh4 = dav1d_block_dimensions[bs][1];
 
     r += by4 * stride + bx4;
     const refmvs tmpl = (refmvs) {
         .ref = { 0, -1 },
         .mv = { mv },
-        .sb_type = bs_to_sbtype[bs],
+        .sb_type = dav1d_bs_to_sbtype[bs],
         .mode = DC_PRED,
     };
     do {
         for (int x = 0; x < bw4; x++)
             r[x] = tmpl;
         r += stride;
     } while (--bh4);
 }
@@ -118,17 +115,17 @@ static inline void splat_tworef_mv(refmv
 {
     const int bw4 = dav1d_block_dimensions[bs][0];
     int bh4 = dav1d_block_dimensions[bs][1];
 
     r += by4 * stride + bx4;
     const refmvs tmpl = (refmvs) {
         .ref = { ref1 + 1, ref2 + 1 },
         .mv = { mv1, mv2 },
-        .sb_type = bs_to_sbtype[bs],
+        .sb_type = dav1d_bs_to_sbtype[bs],
         .mode = N_INTRA_PRED_MODES + N_INTER_PRED_MODES + mode,
     };
     do {
         for (int x = 0; x < bw4; x++)
             r[x] = tmpl;
         r += stride;
     } while (--bh4);
 }
@@ -144,17 +141,17 @@ static inline void splat_intraref(refmvs
     r += by4 * stride + bx4;
     do {
         int x;
 
         for (x = 0; x < bw4; x++)
             r[x] = (refmvs) {
                 .ref = { 0, -1 },
                 .mv = { [0] = { .y = -0x8000, .x = -0x8000 }, },
-                .sb_type = bs_to_sbtype[bs],
+                .sb_type = dav1d_bs_to_sbtype[bs],
                 .mode = mode,
             };
         r += stride;
     } while (--bh4);
 }
 
 static inline void fix_mv_precision(const Dav1dFrameHeader *const hdr,
                                     mv *const mv)
--- a/third_party/dav1d/src/thread.h
+++ b/third_party/dav1d/src/thread.h
@@ -35,30 +35,50 @@
 #define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
 
 typedef struct {
     HANDLE h;
     void *(*func)(void*);
     void *arg;
 } pthread_t;
 
+typedef struct {
+    unsigned stack_size;
+} pthread_attr_t;
+
 typedef SRWLOCK pthread_mutex_t;
 typedef CONDITION_VARIABLE pthread_cond_t;
 typedef INIT_ONCE pthread_once_t;
 
-int dav1d_pthread_create(pthread_t *thread, const void *attr,
+int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                          void *(*func)(void*), void *arg);
 int dav1d_pthread_join(pthread_t *thread, void **res);
 int dav1d_pthread_once(pthread_once_t *once_control,
                        void (*init_routine)(void));
 
 #define pthread_create dav1d_pthread_create
 #define pthread_join(thread, res) dav1d_pthread_join(&(thread), res)
 #define pthread_once   dav1d_pthread_once
 
+static inline int pthread_attr_init(pthread_attr_t *const attr) {
+    attr->stack_size = 0;
+    return 0;
+}
+
+static inline int pthread_attr_destroy(pthread_attr_t *const attr) {
+    return 0;
+}
+
+static inline int pthread_attr_setstacksize(pthread_attr_t *const attr,
+                                            const unsigned stack_size)
+{
+    attr->stack_size = stack_size;
+    return 0;
+}
+
 static inline int pthread_mutex_init(pthread_mutex_t *const mutex,
                                      const void *const attr)
 {
     InitializeSRWLock(mutex);
     return 0;
 }
 
 static inline int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
--- a/third_party/dav1d/src/win32/thread.c
+++ b/third_party/dav1d/src/win32/thread.c
@@ -36,23 +36,25 @@
 #include "src/thread.h"
 
 static unsigned __stdcall thread_entrypoint(void *const data) {
     pthread_t *const t = data;
     t->arg = t->func(t->arg);
     return 0;
 }
 
-int dav1d_pthread_create(pthread_t *const thread, const void *const attr,
+int dav1d_pthread_create(pthread_t *const thread,
+                         const pthread_attr_t *const attr,
                          void *(*const func)(void*), void *const arg)
 {
+    const unsigned stack_size = attr ? attr->stack_size : 0;
     thread->func = func;
     thread->arg = arg;
-    thread->h = (HANDLE)_beginthreadex(NULL, 0, thread_entrypoint,
-                                       thread, 0, NULL);
+    thread->h = (HANDLE)_beginthreadex(NULL, stack_size, thread_entrypoint, thread,
+                                       STACK_SIZE_PARAM_IS_A_RESERVATION, NULL);
     return !thread->h;
 }
 
 int dav1d_pthread_join(pthread_t *const thread, void **const res) {
     if (WaitForSingleObject(thread->h, INFINITE))
         return 1;
 
     if (res)
--- a/third_party/dav1d/src/x86/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c
@@ -57,16 +57,17 @@ decl_angular_ipred_fn(dav1d_ipred_dc_128
 decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_filter_ssse3);
 
 decl_cfl_pred_fn(dav1d_ipred_cfl_ssse3);
 decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
 decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
 decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
 
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
@@ -85,16 +86,17 @@ void bitfn(dav1d_intra_pred_dsp_init_x86
     c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_ssse3;
     c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_ssse3;
     c->intra_pred[HOR_PRED]      = dav1d_ipred_h_ssse3;
     c->intra_pred[VERT_PRED]     = dav1d_ipred_v_ssse3;
     c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_ssse3;
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_ssse3;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
+    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_ssse3;
 
     c->cfl_pred[DC_PRED]         = dav1d_ipred_cfl_ssse3;
     c->cfl_pred[DC_128_PRED]     = dav1d_ipred_cfl_128_ssse3;
     c->cfl_pred[TOP_DC_PRED]     = dav1d_ipred_cfl_top_ssse3;
     c->cfl_pred[LEFT_DC_PRED]    = dav1d_ipred_cfl_left_ssse3;
 
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
--- a/third_party/dav1d/src/x86/ipred_ssse3.asm
+++ b/third_party/dav1d/src/x86/ipred_ssse3.asm
@@ -54,17 +54,20 @@ smooth_weights: SMOOTH_WEIGHT_TABLE     
      65,  61,  57,  54,  50,  47,  44,  41, \
      38,  35,  32,  29,  27,  25,  22,  20, \
      18,  16,  15,  13,  12,  10,   9,   8, \
       7,   6,   6,   5,   5,   4,   4,   4
 
 ipred_v_shuf      : db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
 ipred_h_shuf      : db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
 ipred_paeth_shuf  : db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
+filter_shuf1      : db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
+filter_shuf2      : db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
 
+pw_8        : times 8  dw 8
 pb_3        : times 16 db 3
 pb_128      : times 8  db 128
 pw_128      : times 4  dw 128
 pw_255      : times 4  dw 255
 pb_2        : times 8  db 2
 pb_4        : times 8  db 4
 pb_127_m127 : times 4  db 127, -127
 pd_32768    : times 1  dd 32768
@@ -90,16 +93,19 @@ JMP_TABLE ipred_dc_left,    ssse3, h4, h
 JMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_paeth,      ssse3, w4, w8, w16, w32, w64
 JMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_cfl,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
 JMP_TABLE ipred_cfl_left,   ssse3, h4, h8, h16, h32
+JMP_TABLE ipred_filter,     ssse3, w4, w8, w16, w32
+
+cextern filter_intra_taps
 
 
 SECTION .text
 
 ;---------------------------------------------------------------------------------------
 ;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
 ;                                    const int width, const int height, const int a);
 ;---------------------------------------------------------------------------------------
@@ -2904,8 +2910,201 @@ ALIGN function_align
     mova                 m6, [rsp+96]
     PAETH                 6, 7
     mova          [dstq+48], m1
     add                dstq, strideq
     dec                  hd
     jg .w64_loop
     RET
 
+
+%macro FILTER 4  ;dst, src, tmp, shuf
+%ifnum %4
+    pshufb               m%2, m%4
+%else
+    pshufb               m%2, %4
+%endif
+    pshufd               m%1, m%2, q0000           ;p0 p1
+    pmaddubsw            m%1, m2
+    pshufd               m%3, m%2, q1111           ;p2 p3
+    pmaddubsw            m%3, m3
+    paddw                m%1, [base+pw_8]
+    paddw                m%1, m%3
+    pshufd               m%3, m%2, q2222           ;p4 p5
+    pmaddubsw            m%3, m4
+    paddw                m%1, m%3
+    pshufd               m%3, m%2, q3333           ;p6 __
+    pmaddubsw            m%3, m5
+    paddw                m%1, m%3
+    psraw                m%1, 4
+    packuswb             m%1, m%1
+%endmacro
+
+cglobal ipred_filter, 3, 7, 8, dst, stride, tl, w, h, filter
+%define base r6-$$
+    LEA                   r6, $$
+    tzcnt                 wd, wm
+%ifidn filterd, filterm
+    movzx            filterd, filterb
+%else
+    movzx            filterd, byte filterm
+%endif
+    shl              filterd, 6
+    lea              filterq, [base+filter_intra_taps+filterq]
+    movq                  m0, [tlq-3]                     ;_ 6 5 0 1 2 3 4
+    movsxd                wq, [base+ipred_filter_ssse3_table+wq*4]
+    mova                  m2, [filterq+16*0]
+    mova                  m3, [filterq+16*1]
+    mova                  m4, [filterq+16*2]
+    mova                  m5, [filterq+16*3]
+    lea                   wq, [base+ipred_filter_ssse3_table+wq]
+    mov                   hd, hm
+    jmp                   wq
+.w4:
+    mova                  m1, [base+filter_shuf1]
+    sub                  tlq, 3
+    sub                  tlq, hq
+    jmp .w4_loop_start
+.w4_loop:
+    movd                  m0, [tlq+hq]
+    punpckldq             m0, m6
+    lea                 dstq, [dstq+strideq*2]
+.w4_loop_start:
+    FILTER                 6, 0, 7, 1
+    movd    [dstq+strideq*0], m6
+    pshuflw               m6, m6, q1032
+    movd    [dstq+strideq*1], m6
+    sub                   hd, 2
+    jg .w4_loop
+    RET
+
+ALIGN function_align
+.w8:
+    movq                  m6, [tlq+1]                   ;_ _ _ 0 1 2 3 4
+    sub                  tlq, 5
+    sub                  tlq, hq
+
+.w8_loop:
+    FILTER                 7, 0, 1, [base+filter_shuf1]
+    punpcklqdq            m6, m7                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    FILTER                 0, 6, 1, [base+filter_shuf2]
+
+    punpckldq             m6, m7, m0
+    movq    [dstq+strideq*0], m6
+    punpckhqdq            m6, m6
+    movq    [dstq+strideq*1], m6
+
+    movd                  m0, [tlq+hq]                  ;_ 6 5 0
+    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
+
+    lea                 dstq, [dstq+strideq*2]
+    sub                   hd, 2
+    jg .w8_loop
+    RET
+
+ALIGN function_align
+.w16:
+    movu                  m6, [tlq+1]                   ;top row
+    sub                  tlq, 5
+    sub                  tlq, hq
+
+.w16_loop:
+    FILTER                 7, 0, 1, [base+filter_shuf1]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd    [dstq+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+4+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+
+    FILTER                 7, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+8+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    movd [dstq+12+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+    mova    [dstq+strideq*1], m6
+
+    movd                  m0, [tlq+hq]                  ;_ 6 5 0
+    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
+
+    lea                 dstq, [dstq+strideq*2]
+    sub                   hd, 2
+    jg .w16_loop
+    RET
+
+ALIGN function_align
+.w32:
+    movu                  m6, [tlq+1]                   ;top row
+    lea              filterq, [tlq+17]
+    sub                  tlq, 5
+    sub                  tlq, hq
+
+.w32_loop:
+    FILTER                 7, 0, 1, [base+filter_shuf1]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd    [dstq+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+4+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+
+    FILTER                 7, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd  [dstq+8+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    movu                  m1, [filterq]
+    punpckldq             m0, m7, m1                    ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _
+    punpcklqdq            m0, m6                        ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+12+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+    mova    [dstq+strideq*1], m6
+
+    mova                  m6, m1
+
+    FILTER                 7, 0, 6, [base+filter_shuf2]
+    punpcklqdq            m0, m1, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+16+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m1, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m7, m6                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+20+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+
+    FILTER                 7, 0, 1, [base+filter_shuf2]
+    punpcklqdq            m0, m6, m7                    ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6
+    movd [dstq+24+strideq*0], m7
+    psrlq                 m7, 32
+    palignr               m7, m6, 4
+
+    FILTER                 6, 0, 1, [base+filter_shuf2]
+    movd [dstq+28+strideq*0], m6
+    psrlq                 m6, 32
+    palignr               m6, m7, 4
+    mova [dstq+16+strideq*1], m6
+
+    mova                  m6, [dstq+strideq*1]
+    movd                  m0, [tlq+hq]                  ;_ 6 5 0
+    punpckldq             m0, m6                        ;_ 6 5 0 1 2 3 4
+    lea              filterq, [dstq+16+strideq*1]
+    lea                 dstq, [dstq+strideq*2]
+    sub                   hd, 2
+    jg .w32_loop
+    RET
--- a/third_party/dav1d/src/x86/itx_ssse3.asm
+++ b/third_party/dav1d/src/x86/itx_ssse3.asm
@@ -6092,17 +6092,17 @@ ALIGN function_align
     mova [rsp+gprsize*2+16*50], m2                        ;out47
     mova [rsp+gprsize*2+16*19], m0                        ;out16
     mova [rsp+gprsize*2+16*51], m3                        ;out48
     mova [rsp+gprsize*2+16*18], m1                        ;out15
     ret
 
 
 
-cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
     test                  eobd, eobd
     jz .dconly
 
     call m(idct_64x16_internal)
     RET
@@ -6181,17 +6181,19 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 6
 %macro LOAD_4ROWS_H 2 ;src, stride
     mova                 m4, [%1+%2*0]
     mova                 m5, [%1+%2*1]
     mova                 m6, [%1+%2*2]
     mova                 m7, [%1+%2*3]
 %endmacro
 
 cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
-    mov                     r3, 2
+    mov                    r3d, 2
+    mov  [rsp+gprsize*2+16*67], dstq
+    lea                   dstq, [rsp+gprsize+16*68]
 
 .pass1_loop:
     LOAD_4ROWS     coeffq+32*0, 32*8
     pxor                    m4, m4
     REPX          {mova x, m4}, m5, m6, m7
     call  m(idct_8x8_internal).main
     SAVE_7ROWS    rsp+gprsize+16*3, 16
 
@@ -6272,54 +6274,54 @@ cglobal idct_64x16_internal, 0, 0, 0, ds
     SAVE_8ROWS    coeffq+32*24, 32
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
     lea                   tx2q, [o(m(idct_64x16_internal).pass1_end4)]
     jmp   m(idct_8x8_internal).pass1_end1
 
 .pass1_end4:
-    SAVE_8ROWS    coeffq+32*32, 32
+    SAVE_8ROWS       dstq+32*0, 32
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
     lea                   tx2q, [o(m(idct_64x16_internal).pass1_end5)]
     jmp   m(idct_8x8_internal).pass1_end1
 
 .pass1_end5:
-    SAVE_8ROWS    coeffq+32*40, 32
+    SAVE_8ROWS       dstq+32*8, 32
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
     lea                   tx2q, [o(m(idct_64x16_internal).pass1_end6)]
     jmp   m(idct_8x8_internal).pass1_end1
 
 .pass1_end6:
-    SAVE_8ROWS    coeffq+32*48, 32
+    SAVE_8ROWS      dstq+32*16, 32
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
     lea                   tx2q, [o(m(idct_64x16_internal).pass1_end7)]
     jmp   m(idct_8x8_internal).pass1_end1
 
 .pass1_end7:
-    SAVE_8ROWS    coeffq+32*56, 32
+    SAVE_8ROWS      dstq+32*24, 32
 
     add                 coeffq, 16
-    dec                     r3
+    add                   dstq, 16
+    dec                    r3d
     jg .pass1_loop
 
 .pass2:
+    mov                   dstq, [rsp+gprsize*2+16*67]
     sub                 coeffq, 32
-    mov                     r3, 8
-    lea                     r4, [dstq+8]
-    mov  [rsp+gprsize*2+16*67], r4
+    mov                    r3d, 4
 
 .pass2_loop:
-    mov  [rsp+gprsize*1+16*67], r3
+    mov  [rsp+gprsize*1+16*67], r3d
 
     LOAD_4ROWS     coeffq+16*0, 32*2
     LOAD_4ROWS_H   coeffq+16*1, 32*2
     call  m(idct_8x8_internal).main
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_4ROWS     coeffq+16*2, 32*2
     LOAD_4ROWS_H   coeffq+16*3, 32*2
     call m(idct_16x8_internal).main
@@ -6336,23 +6338,57 @@ cglobal idct_64x16_internal, 0, 0, 0, ds
     mov                  dstq, r3
     jmp  m(idct_8x8_internal).end
 
 .end1:
     pxor                   m7, m7
     REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
 
     add                 coeffq, 16*16
-    mov                     r3, [rsp+gprsize*1+16*67]
+    mov                    r3d, [rsp+gprsize*1+16*67]
     mov                   dstq, [rsp+gprsize*2+16*67]
-    lea                     r4, [dstq+8]
-    mov  [rsp+gprsize*2+16*67], r4
-
-    dec                     r3
+    add                   dstq, 8
+    mov  [rsp+gprsize*2+16*67], dstq
+    dec                    r3d
     jg .pass2_loop
+
+    mov                    r3d, 4
+    lea                 coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+    mov  [rsp+gprsize*1+16*67], r3d
+
+    LOAD_4ROWS     coeffq+16*0, 32*2
+    LOAD_4ROWS_H   coeffq+16*1, 32*2
+    call  m(idct_8x8_internal).main
+    SAVE_7ROWS    rsp+gprsize+16*3, 16
+    LOAD_4ROWS     coeffq+16*2, 32*2
+    LOAD_4ROWS_H   coeffq+16*3, 32*2
+    call m(idct_16x8_internal).main
+
+    mov                    r3, dstq
+    lea                  tx2q, [o(m(idct_64x16_internal).end2)]
+    lea                  dstq, [dstq+strideq*8]
+    jmp  m(idct_8x8_internal).end
+
+.end2:
+    LOAD_8ROWS   rsp+gprsize+16*3, 16
+    mova   [rsp+gprsize+16*0], m7
+    lea                  tx2q, [o(m(idct_64x16_internal).end3)]
+    mov                  dstq, r3
+    jmp  m(idct_8x8_internal).end
+
+.end3:
+
+    add                 coeffq, 16*16
+    mov                    r3d, [rsp+gprsize*1+16*67]
+    mov                   dstq, [rsp+gprsize*2+16*67]
+    add                   dstq, 8
+    mov  [rsp+gprsize*2+16*67], dstq
+    dec                    r3d
+    jg .pass2_loop2
     ret
 
 
 cglobal inv_txfm_add_dct_dct_32x64, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
 %if ARCH_X86_32
     LEA                     r5, $$
 %endif
     test                  eobd, eobd
--- a/third_party/dav1d/tests/checkasm/itx.c
+++ b/third_party/dav1d/tests/checkasm/itx.c
@@ -153,16 +153,18 @@ static int copy_subcoefs(coef *coeff,
         } else if (!eob && (rcx > sub_low || rcy > sub_low))
             eob = n; /* lower boundary */
     }
 
     if (eob)
         eob += rnd() % (n - eob - 1);
     for (n = eob + 1; n < sw * sh; n++)
         coeff[scan[n]] = 0;
+    for (; n < 32 * 32; n++)
+        coeff[n] = rnd();
     return eob;
 }
 
 static int ftx(coef *const buf, const enum RectTxfmSize tx,
                const enum TxfmType txtp, const int w, const int h,
                const int subsh, const int bitdepth_max)
 {
     double out[64 * 64], temp[64 * 64];
@@ -219,17 +221,17 @@ static int ftx(coef *const buf, const en
 
     return copy_subcoefs(buf, tx, txtp, sw, sh, subsh);
 }
 
 void bitfn(checkasm_check_itx)(void) {
     Dav1dInvTxfmDSPContext c;
     bitfn(dav1d_itx_dsp_init)(&c);
 
-    ALIGN_STK_32(coef, coeff, 3, [32 * 32]);
+    ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
     ALIGN_STK_32(pixel, c_dst, 64 * 64,);
     ALIGN_STK_32(pixel, a_dst, 64 * 64,);
 
     static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
         TX_4X4,   RTX_4X8,  RTX_4X16,
         RTX_8X4,  TX_8X8,   RTX_8X16,  RTX_8X32,
         RTX_16X4, RTX_16X8, TX_16X16,  RTX_16X32, RTX_16X64,
                   RTX_32X8, RTX_32X16, TX_32X32,  RTX_32X64,
@@ -240,17 +242,16 @@ void bitfn(checkasm_check_itx)(void) {
 
     declare_func(void, pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob
                  HIGHBD_DECL_SUFFIX);
 
     for (int i = 0; i < N_RECT_TX_SIZES; i++) {
         const enum RectTxfmSize tx = txfm_size_order[i];
         const int w = dav1d_txfm_dimensions[tx].w * 4;
         const int h = dav1d_txfm_dimensions[tx].h * 4;
-        const int sw = imin(w, 32), sh = imin(h, 32);
         const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
                                                dav1d_txfm_dimensions[tx].lh)];
 
         for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++)
             for (int subsh = 0; subsh < subsh_max; subsh++)
                 if (check_func(c.itxfm_add[tx][txtp],
                                "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc",
                                w, h, itx_1d_names[itx_1d_types[txtp][0]],
@@ -258,31 +259,29 @@ void bitfn(checkasm_check_itx)(void) {
                                BITDEPTH))
                 {
 #if BITDEPTH == 16
                     const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
 #else
                     const int bitdepth_max = 0xff;
 #endif
                     const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+                    memcpy(coeff[1], coeff[0], sizeof(*coeff));
 
                     for (int j = 0; j < w * h; j++)
                         c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
 
-                    memcpy(coeff[1], coeff[0], sw * sh * sizeof(**coeff));
-                    memcpy(coeff[2], coeff[0], sw * sh * sizeof(**coeff));
-
                     call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
                              HIGHBD_TAIL_SUFFIX);
                     call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
                              HIGHBD_TAIL_SUFFIX);
                     if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)) ||
-                        memcmp(coeff[0], coeff[1], sw * sh * sizeof(**coeff)))
+                        memcmp(coeff[0], coeff[1], sizeof(*coeff)))
                     {
                         fail();
                     }
 
-                    bench_new(a_dst, w * sizeof(*c_dst), coeff[2], eob
+                    bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
                               HIGHBD_TAIL_SUFFIX);
                 }
         report("add_%dx%d", w, h);
     }
 }
--- a/third_party/dav1d/tests/checkasm/msac.c
+++ b/third_party/dav1d/tests/checkasm/msac.c
@@ -98,17 +98,23 @@ static void check_decode_symbol_adapt(Ms
 }
 
 void checkasm_check_msac(void) {
     MsacDSPContext c;
     c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt_c;
     c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt_c;
     c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
 
-#if ARCH_X86_64 && HAVE_ASM
+#if ARCH_AARCH64 && HAVE_ASM
+    if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
+        c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_neon;
+        c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_neon;
+        c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon;
+    }
+#elif ARCH_X86_64 && HAVE_ASM
     if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
         c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_sse2;
         c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_sse2;
         c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_sse2;
     }
 #endif
 
     check_decode_symbol_adapt(&c);
--- a/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c
+++ b/third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c
@@ -56,17 +56,17 @@ static unsigned r32le(const uint8_t *con
 
 #define DAV1D_FUZZ_MAX_SIZE 4096
 
 #if defined(DAV1D_FUZZ_MAX_SIZE)
 static int (*default_picture_allocator)(Dav1dPicture *, void *);
 
 static int fuzz_picture_allocator(Dav1dPicture *pic, void *cookie) {
     if (pic->p.w > DAV1D_FUZZ_MAX_SIZE || pic->p.h > DAV1D_FUZZ_MAX_SIZE)
-        return -EINVAL;
+        return DAV1D_ERR(EINVAL);
 
     return default_picture_allocator(pic, cookie);
 }
 #endif
 
 // expects ivf input
 
 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
@@ -144,37 +144,37 @@ int LLVMFuzzerTestOneInput(const uint8_t
         // copy frame data to a new buffer to catch reads past the end of input
         p = dav1d_data_create(&buf, frame_size);
         if (!p) goto cleanup;
         memcpy(p, ptr, frame_size);
         ptr += frame_size;
 
         do {
             if ((err = dav1d_send_data(ctx, &buf)) < 0) {
-                if (err != -EAGAIN)
+                if (err != DAV1D_ERR(EAGAIN))
                     break;
             }
             memset(&pic, 0, sizeof(pic));
             err = dav1d_get_picture(ctx, &pic);
             if (err == 0) {
                 dav1d_picture_unref(&pic);
-            } else if (err != -EAGAIN) {
+            } else if (err != DAV1D_ERR(EAGAIN)) {
                 break;
             }
         } while (buf.sz > 0);
 
         if (buf.sz > 0)
             dav1d_data_unref(&buf);
     }
 
     do {
         memset(&pic, 0, sizeof(pic));
         err = dav1d_get_picture(ctx, &pic);
         if (err == 0)
             dav1d_picture_unref(&pic);
-    } while (err != -EAGAIN);
+    } while (err != DAV1D_ERR(EAGAIN));
 
 cleanup:
     dav1d_flush(ctx);
     dav1d_close(&ctx);
 end:
     return 0;
 }
--- a/third_party/dav1d/tests/meson.build
+++ b/third_party/dav1d/tests/meson.build
@@ -99,17 +99,17 @@ dav1d_fuzzer_sources =  files('libfuzzer
 fuzzer_ldflags = []
 
 if fuzzing_engine == 'none'
     dav1d_fuzzer_sources += files('libfuzzer/main.c')
 elif fuzzing_engine == 'libfuzzer'
     fuzzer_ldflags += ['-fsanitize=fuzzer']
 elif fuzzing_engine == 'oss-fuzz'
     # libFuzzingEngine needs libc++
-    fuzzer_ldflags += ['-lFuzzingEngine', '-lc++']
+    fuzzer_ldflags += ['-fsanitize=fuzzer', '-lFuzzingEngine', '-lc++']
 endif
 
 dav1d_fuzzer = executable('dav1d_fuzzer',
     dav1d_fuzzer_sources,
     include_directories: dav1d_inc_dirs,
     c_args: [stackalign_flag, stackrealign_flag],
     link_args: fuzzer_ldflags,
     link_with : libdav1d,
--- a/third_party/dav1d/tools/dav1d.c
+++ b/third_party/dav1d/tools/dav1d.c
@@ -124,25 +124,25 @@ int main(const int argc, char *const *co
         total = cli_settings.limit;
 
     if ((res = dav1d_open(&c, &lib_settings)))
         return res;
 
     do {
         memset(&p, 0, sizeof(p));
         if ((res = dav1d_send_data(c, &data)) < 0) {
-            if (res != -EAGAIN) {
+            if (res != DAV1D_ERR(EAGAIN)) {
                 fprintf(stderr, "Error decoding frame: %s\n",
                         strerror(-res));
                 break;
             }
         }
 
         if ((res = dav1d_get_picture(c, &p)) < 0) {
-            if (res != -EAGAIN) {
+            if (res != DAV1D_ERR(EAGAIN)) {
                 fprintf(stderr, "Error decoding frame: %s\n",
                         strerror(-res));
                 break;
             }
             res = 0;
         } else {
             if (!n_out) {
                 if ((res = output_open(&out, cli_settings.muxer,
@@ -163,17 +163,17 @@ int main(const int argc, char *const *co
             break;
     } while (data.sz > 0 || !input_read(in, &data));
 
     if (data.sz > 0) dav1d_data_unref(&data);
 
     // flush
     if (res == 0) while (!cli_settings.limit || n_out < cli_settings.limit) {
         if ((res = dav1d_get_picture(c, &p)) < 0) {
-            if (res != -EAGAIN) {
+            if (res != DAV1D_ERR(EAGAIN)) {
                 fprintf(stderr, "Error decoding frame: %s\n",
                         strerror(-res));
             } else {
                 res = 0;
                 break;
             }
         } else {
             if (!n_out) {
--- a/third_party/dav1d/tools/input/input.c
+++ b/third_party/dav1d/tools/input/input.c
@@ -85,17 +85,17 @@ int input_open(DemuxerContext **const c_
         for (i = 0; i < num_demuxers; i++) {
             if (!strcmp(demuxers[i]->name, name)) {
                 impl = demuxers[i];
                 break;
             }
         }
         if (i == num_demuxers) {
             fprintf(stderr, "Failed to find demuxer named \"%s\"\n", name);
-            return -ENOPROTOOPT;
+            return DAV1D_ERR(ENOPROTOOPT);
         }
     } else {
         const char *const ext = find_extension(filename);
         if (!ext) {
             fprintf(stderr, "No extension found for file %s\n", filename);
             return -1;
         }
 
@@ -104,23 +104,23 @@ int input_open(DemuxerContext **const c_
                 impl = demuxers[i];
                 break;
             }
         }
         if (i == num_demuxers) {
             fprintf(stderr,
                     "Failed to find demuxer for file %s (\"%s\")\n",
                     filename, ext);
-            return -ENOPROTOOPT;
+            return DAV1D_ERR(ENOPROTOOPT);
         }
     }
 
     if (!(c = malloc(sizeof(DemuxerContext) + impl->priv_data_size))) {
         fprintf(stderr, "Failed to allocate memory\n");
-        return -ENOMEM;
+        return DAV1D_ERR(ENOMEM);
     }
     memset(c, 0, sizeof(DemuxerContext) + impl->priv_data_size);
     c->impl = impl;
     c->data = (DemuxerPriv *) &c[1];
     if ((res = impl->open(c->data, filename, fps, num_frames)) < 0) {
         free(c);
         return res;
     }
--- a/third_party/dav1d/tools/output/output.c
+++ b/third_party/dav1d/tools/output/output.c
@@ -88,39 +88,39 @@ int output_open(MuxerContext **const c_o
         for (i = 0; i < num_muxers; i++) {
             if (!strcmp(muxers[i]->name, name)) {
                 impl = muxers[i];
                 break;
             }
         }
         if (i == num_muxers) {
             fprintf(stderr, "Failed to find muxer named \"%s\"\n", name);
-            return -ENOPROTOOPT;
+            return DAV1D_ERR(ENOPROTOOPT);
         }
     } else {
         const char *ext = find_extension(filename);
         if (!ext) {
             fprintf(stderr, "No extension found for file %s\n", filename);
             return -1;
         }
         for (i = 0; i < num_muxers; i++) {
             if (!strcmp(muxers[i]->extension, ext)) {
                 impl = muxers[i];
                 break;
             }
         }
         if (i == num_muxers) {
             fprintf(stderr, "Failed to find muxer for extension \"%s\"\n", ext);
-            return -ENOPROTOOPT;
+            return DAV1D_ERR(ENOPROTOOPT);
         }
     }
 
     if (!(c = malloc(sizeof(MuxerContext) + impl->priv_data_size))) {
         fprintf(stderr, "Failed to allocate memory\n");
-        return -ENOMEM;
+        return DAV1D_ERR(ENOMEM);
     }
     c->impl = impl;
     c->data = (MuxerPriv *) &c[1];
     if (impl->write_header && (res = impl->write_header(c->data, filename, p, fps)) < 0) {
         free(c);
         return res;
     }
     *c_out = c;