Bug 1493400 - Update dav1d from upstream to d27598e. r=TD-Linux
author Alex Chronopoulos <achronop@gmail.com>
Tue, 27 Nov 2018 14:04:37 +0000
changeset 507514 94a729d77bc5451e8372c28238f1d3b58856dd6f
parent 507513 ddfa91686df0498021c8edc383f032707c237ce9
child 507515 5c4bf474ddb3d0f47c493bca5e8a2e9b01ce38e5
push id 1905
push user ffxbld-merge
push date Mon, 21 Jan 2019 12:33:13 +0000
treeherder mozilla-release@c2fca1944d8c [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers TD-Linux
bugs 1493400
milestone 65.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1493400 - Update dav1d from upstream to d27598e. r=TD-Linux Depends on D9607 Differential Revision: https://phabricator.services.mozilla.com/D12162
media/libdav1d/README_MOZILLA
third_party/dav1d/.gitlab-ci.yml
third_party/dav1d/README.md
third_party/dav1d/include/common/attributes.h
third_party/dav1d/include/common/intops.h
third_party/dav1d/include/common/mem.h
third_party/dav1d/include/compat/gcc/stdatomic.h
third_party/dav1d/include/compat/msvc/stdatomic.h
third_party/dav1d/include/compat/stdatomic.h
third_party/dav1d/include/dav1d/data.h
third_party/dav1d/include/dav1d/dav1d.h
third_party/dav1d/include/dav1d/picture.h
third_party/dav1d/meson.build
third_party/dav1d/meson_options.txt
third_party/dav1d/src/arm/32/mc.S
third_party/dav1d/src/arm/64/mc.S
third_party/dav1d/src/arm/cpu.c
third_party/dav1d/src/arm/mc_init.c
third_party/dav1d/src/arm/mc_init_tmpl.c
third_party/dav1d/src/cdef.h
third_party/dav1d/src/cdef_apply_tmpl.c
third_party/dav1d/src/cdef_tmpl.c
third_party/dav1d/src/cdf.c
third_party/dav1d/src/cpu.c
third_party/dav1d/src/ctx.h
third_party/dav1d/src/data.c
third_party/dav1d/src/data.h
third_party/dav1d/src/decode.c
third_party/dav1d/src/env.h
third_party/dav1d/src/getbits.c
third_party/dav1d/src/getbits.h
third_party/dav1d/src/internal.h
third_party/dav1d/src/ipred.h
third_party/dav1d/src/ipred_prepare.h
third_party/dav1d/src/ipred_tmpl.c
third_party/dav1d/src/levels.h
third_party/dav1d/src/lf_apply_tmpl.c
third_party/dav1d/src/lf_mask.c
third_party/dav1d/src/lib.c
third_party/dav1d/src/loopfilter_tmpl.c
third_party/dav1d/src/looprestoration_tmpl.c
third_party/dav1d/src/lr_apply_tmpl.c
third_party/dav1d/src/mc.h
third_party/dav1d/src/mc_tmpl.c
third_party/dav1d/src/meson.build
third_party/dav1d/src/msac.c
third_party/dav1d/src/msac.h
third_party/dav1d/src/obu.c
third_party/dav1d/src/picture.c
third_party/dav1d/src/picture.h
third_party/dav1d/src/recon.h
third_party/dav1d/src/recon_tmpl.c
third_party/dav1d/src/ref.c
third_party/dav1d/src/ref.h
third_party/dav1d/src/ref_mvs.c
third_party/dav1d/src/ref_mvs.h
third_party/dav1d/src/tables.c
third_party/dav1d/src/tables.h
third_party/dav1d/src/thread_task.c
third_party/dav1d/src/thread_task.h
third_party/dav1d/src/warpmv.c
third_party/dav1d/src/wedge.c
third_party/dav1d/src/win32/thread.c
third_party/dav1d/src/x86/cdef.asm
third_party/dav1d/src/x86/cdef_init_tmpl.c
third_party/dav1d/src/x86/ipred_init.c
third_party/dav1d/src/x86/ipred_init_tmpl.c
third_party/dav1d/src/x86/itx_init.c
third_party/dav1d/src/x86/itx_init_tmpl.c
third_party/dav1d/src/x86/loopfilter_init.c
third_party/dav1d/src/x86/loopfilter_init_tmpl.c
third_party/dav1d/src/x86/looprestoration.asm
third_party/dav1d/src/x86/looprestoration_init.c
third_party/dav1d/src/x86/looprestoration_init_tmpl.c
third_party/dav1d/src/x86/mc.asm
third_party/dav1d/src/x86/mc_init.c
third_party/dav1d/src/x86/mc_init_tmpl.c
third_party/dav1d/tests/checkasm/cdef.c
third_party/dav1d/tests/checkasm/checkasm.c
third_party/dav1d/tests/checkasm/checkasm.h
third_party/dav1d/tests/checkasm/ipred.c
third_party/dav1d/tests/checkasm/looprestoration.c
third_party/dav1d/tests/checkasm/mc.c
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c
third_party/dav1d/tests/libfuzzer/main.c
third_party/dav1d/tests/meson.build
third_party/dav1d/tools/dav1d.c
third_party/dav1d/tools/dav1d_cli_parse.c
third_party/dav1d/tools/dav1d_cli_parse.h
third_party/dav1d/tools/input/ivf.c
third_party/dav1d/tools/meson.build
third_party/dav1d/tools/output/md5.c
third_party/dav1d/tools/output/muxer.h
third_party/dav1d/tools/output/null.c
third_party/dav1d/tools/output/output.c
third_party/dav1d/tools/output/output.h
new file mode 100644
--- /dev/null
+++ b/media/libdav1d/README_MOZILLA
@@ -0,0 +1,24 @@
+This directory contains build files for dav1d. The actual library
+source is in $TOPSRCDIR/third_party/dav1d/
+
+Any patches or additional configuration to be applied to the
+upstream source should be kept here in the media/libdav1d
+directory.
+
+To update the library source and build config files, execute
+
+  ./mach vendor dav1d
+
+To update to a specific upstream git tag or commit, use
+
+  ./mach vendor dav1d -r <commit>
+
+The upstream git repository is https://code.videolan.org/videolan/dav1d
+
+To update to a fork, use
+
+  ./mach vendor dav1d --repo <repository url> [-r <commit>]
+
+The last update was pulled from https://code.videolan.org/videolan/dav1d
+
+The git commit ID used was 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0 (2018-10-25T16:51:31.000Z).
--- a/third_party/dav1d/.gitlab-ci.yml
+++ b/third_party/dav1d/.gitlab-ci.yml
@@ -1,35 +1,36 @@
 stages:
     - build
+    - test
 
 build-debian:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20180928151533
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - debian
         - amd64
     script:
         - env CFLAGS='-Werror' meson build --buildtype release
         - ninja -C build
         - cd build && meson test -v
 
 build-debian-static:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20180928151533
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - debian
         - amd64
     script:
         - env CFLAGS='-Werror' meson build --buildtype release --default-library static
         - ninja -C build
         - cd build && meson test -v
 
 build-win32:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20180928151533
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - win32
     script:
         - env CFLAGS='-Werror'
             meson build --buildtype release
                         --libdir lib
                         --prefix "$(pwd)/build/dav1d_install"
@@ -39,17 +40,17 @@ build-win32:
         - ninja -C build install
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
 
 build-win64:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20180928151533
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - win64
     script:
         - env CFLAGS='-Werror'
             meson build --buildtype release
                         --libdir lib
                         --prefix "$(pwd)/build/dav1d_install"
@@ -97,8 +98,27 @@ build-debian-werror:
     image: dav1d-debian-aarch64:201810240631
     stage: build
     tags:
         - aarch64
         - debian
     script:
         - env CC='clang-7' CFLAGS='-Werror' meson build -Dbuild_tests=false
         - ninja -C build
+
+test-debian:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    stage: test
+    tags:
+        - debian
+        - amd64
+    cache:
+        key: testdata.git
+        paths:
+            - cache/dav1d-test-data.git/
+    script:
+        - test -d cache || mkdir cache
+        - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
+        - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
+        - git clone cache/dav1d-test-data.git tests/dav1d-test-data
+        - meson build --buildtype release -Dtestdata_tests=true
+        - ninja -C build
+        - cd build && time meson test -v
--- a/third_party/dav1d/README.md
+++ b/third_party/dav1d/README.md
@@ -60,16 +60,32 @@ VideoLAN will only have the collective w
 The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this project.
 
 # Compile
 
 1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13 or higher)
 2. Run `meson build --buildtype release`
 3. Build with `ninja -C build`
 
+# Run tests
+
+1. During initial build dir setup or `meson configure` specify `-Dbuild_tests=true`
+2. In the build directory run `meson test` optionally with `-v` for more verbose output, especially useful
+   for checkasm
+
+# Run testdata based tests
+
+1. Check out the test data repository
+   
+   ```
+   git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
+   ```
+2. During initial build dir setup or `meson configure` specify `-Dbuild_tests=true` and `-Dtestdata_tests=true`
+3. In the build directory run `meson test` optionally with `-v` for more verbose output
+
 # Support
 
 This project is partially funded by the *Alliance for Open Media*/**AOM** and is supported by TwoOrioles and VideoLabs.
 
 These companies can provide support and integration help, should you need it.
 
 
 # FAQ
--- a/third_party/dav1d/include/common/attributes.h
+++ b/third_party/dav1d/include/common/attributes.h
@@ -27,16 +27,22 @@
 
 #ifndef __DAV1D_COMMON_ATTRIBUTES_H__
 #define __DAV1D_COMMON_ATTRIBUTES_H__
 
 #include "config.h"
 
 #include <stddef.h>
 
+#ifdef __GNUC__
+#define ATTR_ALIAS __attribute__((may_alias))
+#else
+#define ATTR_ALIAS
+#endif
+
 #if ARCH_X86
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
 #elif ARCH_ARM || ARCH_AARCH64
 // ARM doesn't benefit from anything more than 16 byte alignment.
 #define ALIGN_32_VAL 16
 #define ALIGN_16_VAL 16
 #else
--- a/third_party/dav1d/include/common/intops.h
+++ b/third_party/dav1d/include/common/intops.h
@@ -47,32 +47,28 @@ static inline int iclip(const int v, con
 static inline int iclip_u8(const int v) {
     return iclip(v, 0, 255);
 }
 
 static inline int apply_sign(const int v, const int s) {
     return s < 0 ? -v : v;
 }
 
+static inline int apply_sign64(const int v, const int64_t s) {
+    return s < 0 ? -v : v;
+}
+
 static inline int ulog2(const unsigned v) {
     return 31 - clz(v);
 }
 
 static inline int u64log2(const uint64_t v) {
     return 63 - clzll(v);
 }
 
-static inline unsigned rl16(const uint8_t *const ptr) {
-    return (ptr[1] << 8) | ptr[0];
-}
-
-static inline unsigned rl32(const uint8_t *const ptr) {
-    return (rl16(&ptr[2]) << 16) | rl16(ptr);
-}
-
 static inline unsigned inv_recenter(const unsigned r, const unsigned v) {
     if (v > (r << 1))
         return v;
     else if ((v & 1) == 0)
         return (v >> 1) + r;
     else
         return r - ((v + 1) >> 1);
 }
--- a/third_party/dav1d/include/common/mem.h
+++ b/third_party/dav1d/include/common/mem.h
@@ -26,42 +26,46 @@
  */
 
 #ifndef __DAV1D_COMMON_MEM_H__
 #define __DAV1D_COMMON_MEM_H__
 
 #include <assert.h>
 #include <stdlib.h>
 
-#ifdef HAVE_ALIGNED_MALLOC
+#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
 #include <malloc.h>
 #endif
 
 /*
  * Allocate 32-byte aligned memory. The return value can be released
  * by calling the standard free() function.
  */
 static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
 #ifdef HAVE_POSIX_MEMALIGN
     void *ptr;
     assert(!(align & (align - 1)));
     if (posix_memalign(&ptr, align, sz)) return NULL;
     return ptr;
 #elif defined(HAVE_ALIGNED_MALLOC)
     return _aligned_malloc(sz, align);
+#elif defined(HAVE_MEMALIGN)
+    return memalign(align, sz);
 #else
 #error Missing aligned alloc implementation
 #endif
 }
 
 static inline void dav1d_free_aligned(void* ptr) {
 #ifdef HAVE_POSIX_MEMALIGN
     free(ptr);
 #elif defined(HAVE_ALIGNED_MALLOC)
     _aligned_free(ptr);
+#elif defined(HAVE_MEMALIGN)
+    free(ptr);
 #endif
 }
 
 static inline void dav1d_freep_aligned(void* ptr) {
     void **mem = (void **) ptr;
     if (*mem) {
         dav1d_free_aligned(*mem);
         *mem = NULL;
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/include/compat/gcc/stdatomic.h
@@ -0,0 +1,47 @@
+/*
+* Copyright © 2018, VideoLAN and dav1d authors
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*
+* 1. Redistributions of source code must retain the above copyright notice, this
+*    list of conditions and the following disclaimer.
+*
+* 2. Redistributions in binary form must reproduce the above copyright notice,
+*    this list of conditions and the following disclaimer in the documentation
+*    and/or other materials provided with the distribution.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef GCCVER_STDATOMIC_H_
+#define GCCVER_STDATOMIC_H_
+
+#if !defined(__cplusplus)
+
+typedef int atomic_int;
+typedef unsigned int atomic_uint;
+
+#define memory_order_relaxed __ATOMIC_RELAXED
+#define memory_order_acquire __ATOMIC_ACQUIRE
+
+#define atomic_init(p_a, v)           do { *(p_a) = (v); } while(0)
+#define atomic_store(p_a, v)          __atomic_store_n(p_a, v, __ATOMIC_SEQ_CST)
+#define atomic_load(p_a)              __atomic_load_n(p_a, __ATOMIC_SEQ_CST)
+#define atomic_load_explicit(p_a, mo) __atomic_load_n(p_a, mo)
+#define atomic_fetch_add(p_a, inc)    __atomic_fetch_add(p_a, inc, __ATOMIC_SEQ_CST)
+#define atomic_fetch_sub(p_a, dec)    __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST)
+
+#endif /* !defined(__cplusplus) */
+
+#endif /* GCCVER_STDATOMIC_H_ */
rename from third_party/dav1d/include/compat/stdatomic.h
rename to third_party/dav1d/include/compat/msvc/stdatomic.h
--- a/third_party/dav1d/include/dav1d/data.h
+++ b/third_party/dav1d/include/dav1d/data.h
@@ -29,47 +29,47 @@
 #define __DAV1D_DATA_H__
 
 #include <stddef.h>
 #include <stdint.h>
 
 #include "common.h"
 
 typedef struct Dav1dData {
-    uint8_t *data; ///< data pointer
+    const uint8_t *data; ///< data pointer
     size_t sz; ///< data size
     struct Dav1dRef *ref; ///< allocation origin
 } Dav1dData;
 
 /**
  * Allocate data.
  *
  * @param data Input context.
  * @param   sz Size of the data that should be allocated.
  *
- * @return 0 on success. A negative errno value on error.
+ * @return Pointer to the allocated buffer on success. NULL on error.
  */
-DAV1D_API int dav1d_data_create(Dav1dData *data, size_t sz);
+DAV1D_API uint8_t * dav1d_data_create(Dav1dData *data, size_t sz);
 
 /**
  * Wrap an existing data array.
  *
  * @param          data Input context.
  * @param           buf The data to be wrapped.
  * @param            sz Size of the data.
  * @param free_callback Function to be called when we release our last
  *                      reference to this data. In this callback, $buf will be
  *                      the $buf argument to this function, and $user_data
  *                      will be the $user_data input argument to this function.
  * @param     user_data Opaque parameter passed to free_callback().
  *
  * @return 0 on success. A negative errno value on error.
  */
-DAV1D_API int dav1d_data_wrap(Dav1dData *data, uint8_t *buf, size_t sz,
-                              void (*free_callback)(uint8_t *buf, void *user_data),
+DAV1D_API int dav1d_data_wrap(Dav1dData *data, const uint8_t *buf, size_t sz,
+                              void (*free_callback)(const uint8_t *buf, void *user_data),
                               void *user_data);
 
 /**
  * Free the data reference.
  *
  * @param data Input context.
  */
 DAV1D_API void dav1d_data_unref(Dav1dData *data);
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -23,16 +23,22 @@
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_H__
 #define __DAV1D_H__
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <errno.h>
+
 #include "common.h"
 #include "picture.h"
 #include "data.h"
 
 typedef struct Dav1dContext Dav1dContext;
 typedef struct Dav1dRef Dav1dRef;
 
 typedef struct Dav1dSettings {
@@ -51,55 +57,74 @@ DAV1D_API const char *dav1d_version(void
  *
  * @param s Input settings context.
  */
 DAV1D_API void dav1d_default_settings(Dav1dSettings *s);
 
 /**
  * Allocate and open a decoder instance.
  *
- * @param c_out The decoder instance to open. To be used in iterative calls to
- *              dav1d_decode(). *c_out will be set to the allocated context.
+ * @param c_out The decoder instance to open. *c_out will be set to the
+ *              allocated context.
  * @param     s Input settings context.
  *
  * @note The context must be freed using dav1d_close() when decoding is
  *       finished.
  *
  * @return 0 on success, or < 0 (a negative errno code) on error.
  */
 DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
 
 /**
- * Decode one frame.
+ * Feed bitstream data to the decoder.
  *
  * @param   c Input decoder instance.
- * @param  in Input bitstream data. On success, the caller retains ownership of
- *            the input reference if the data was not fully consumed.
+ * @param  in Input bitstream data. On success, ownership of the reference is
+ *            passed to the library.
+ *
+ * @return
+ *         0: Success, and the data was consumed.
+ *   -EAGAIN: The data can't be consumed. dav1d_get_picture() should be called
+ *            to get one or more frames before the function can consume new
+ *            data.
+ *   other negative errno codes: Error during decoding or because of invalid
+ *                               passed-in arguments.
+ */
+DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
+
+/**
+ * Return a decoded picture.
+ *
+ * @param   c Input decoder instance.
  * @param out Output frame. The caller assumes ownership of the returned
  *            reference.
  *
  * @return
  *         0: Success, and a frame is returned.
- *   -EAGAIN: Not enough data to output a frame. The fuction should be called
- *            again with new input.
+ *   -EAGAIN: Not enough data to output a frame. dav1d_send_data() should be
+ *            called with new input.
  *   other negative errno codes: Error during decoding or because of invalid
  *                               passed-in arguments.
  *
- * @note To flush the decoder (i.e. all input is finished), feed it NULL input
- *       data until it returns -EAGAIN.
+ * @note To drain buffered frames from the decoder (i.e. on end of stream),
+ *       call this function until it returns -EAGAIN.
  */
-DAV1D_API int dav1d_decode(Dav1dContext *c, Dav1dData *in, Dav1dPicture *out);
+DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out);
 
 /**
  * Close a decoder instance and free all associated memory.
  *
  * @param c_out The decoder instance to close. *c_out will be set to NULL.
  */
 DAV1D_API void dav1d_close(Dav1dContext **c_out);
 
 /**
  * Flush all delayed frames in decoder, to be used when seeking.
  *
  * @param c Input decoder instance.
  */
 DAV1D_API void dav1d_flush(Dav1dContext *c);
 
+# ifdef __cplusplus
+}
+# endif
+
 #endif /* __DAV1D_H__ */
--- a/third_party/dav1d/include/dav1d/picture.h
+++ b/third_party/dav1d/include/dav1d/picture.h
@@ -120,16 +120,18 @@ typedef struct Dav1dPictureParameters {
     /**
      * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
      * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
      */
     int fullrange;
 } Dav1dPictureParameters;
 
 typedef struct Dav1dPicture {
+    int poc; ///< frame number
+
     /**
      * Pointers to planar image data (Y is [0], U is [1], V is [2]). The data
      * should be bytes (for 8 bpc) or words (for 10 bpc). In case of words
      * containing 10 bpc image data, the pixels should be located in the LSB
      * bits, so that values range between [0, 1023]; the upper bits should be
      * zero'ed out.
      */
     void *data[3];
@@ -137,18 +139,16 @@ typedef struct Dav1dPicture {
 
     /**
      * Number of bytes between 2 lines in data[] for luma [0] or chroma [1].
      */
     ptrdiff_t stride[2];
 
     Dav1dPictureParameters p;
 
-    int poc; ///< frame number
-
     void *allocator_data; ///< pointer managed by the allocator
 } Dav1dPicture;
 
 typedef struct Dav1dPicAllocator {
     void *cookie; ///< custom data to pass to the allocator callbacks.
     /**
      * Allocate the picture buffer based on the Dav1dPictureParameters.
      *
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -56,21 +56,24 @@ dav1d_inc_dirs = include_directories(['.
 # Bitdepth option
 dav1d_bitdepths = get_option('bitdepths')
 foreach bitdepth : ['8', '10']
     cdata.set10('CONFIG_@0@BPC'.format(bitdepth), dav1d_bitdepths.contains(bitdepth))
 endforeach
 
 # ASM option
 is_asm_enabled = (get_option('build_asm') == true and
-    (host_machine.cpu_family().startswith('x86')) or
-     host_machine.cpu_family() == 'aarch64'       or
-     host_machine.cpu_family().startswith('arm'))
+    (host_machine.cpu_family().startswith('x86') or
+     host_machine.cpu_family() == 'aarch64'      or
+     host_machine.cpu_family().startswith('arm')))
 cdata.set10('HAVE_ASM', is_asm_enabled)
 
+if is_asm_enabled and get_option('b_sanitize') == 'memory'
+    error('asm causes false positive with memory sanitizer. Use \'-Dbuild_asm=false\'.')
+endif
 
 
 #
 # OS/Compiler checks and defines
 #
 
 # Arguments in test_args will be used even on feature tests
 test_args = []
@@ -98,17 +101,22 @@ endif
 
 # Header checks
 
 stdatomic_dependency = []
 if not cc.check_header('stdatomic.h')
     if cc.get_id() == 'msvc'
         # we have a custom replacement for MSVC
         stdatomic_dependency = declare_dependency(
-            include_directories : include_directories('include/compat'),
+            include_directories : include_directories('include/compat/msvc'),
+        )
+    elif cc.compiles('''int main() { int v = 0; return __atomic_fetch_add(&v, 1, __ATOMIC_SEQ_CST); }''',
+                     name : 'GCC-style atomics', args : test_args)
+        stdatomic_dependency = declare_dependency(
+            include_directories : include_directories('include/compat/gcc'),
         )
     else
         error('Atomics not supported')
     endif
 endif
 
 if cc.check_header('unistd.h')
     cdata.set('HAVE_UNISTD_H', 1)
@@ -125,18 +133,26 @@ if not cc.has_function('getopt_long', pr
 else
     getopt_dependency = []
 endif
 
 if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
     cdata.set('HAVE_POSIX_MEMALIGN', 1)
 elif cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
     cdata.set('HAVE_ALIGNED_MALLOC', 1)
+elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
+    cdata.set('HAVE_MEMALIGN', 1)
 endif
 
+if (host_machine.cpu_family() == 'aarch64' or
+    host_machine.cpu_family().startswith('arm'))
+    if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
+        cdata.set('HAVE_GETAUXVAL', 1)
+    endif
+endif
 
 # Compiler flag tests
 
 if cc.has_argument('-fvisibility=hidden')
     add_project_arguments('-fvisibility=hidden', language: 'c')
 else
     warning('Compiler does not support -fvisibility=hidden, all symbols will be public!')
 endif
--- a/third_party/dav1d/meson_options.txt
+++ b/third_party/dav1d/meson_options.txt
@@ -15,13 +15,18 @@ option('build_tools',
     value: true,
     description: 'Build dav1d cli tools')
 
 option('build_tests',
     type: 'boolean',
     value: true,
     description: 'Build dav1d tests')
 
+option('testdata_tests',
+    type: 'boolean',
+    value: false,
+    description: 'Run tests requiring the test data repository')
+
 option('fuzzing_engine',
     type: 'combo',
     choices : ['none', 'libfuzzer', 'oss-fuzz'],
     value: 'none',
     description: 'Select the fuzzing engine')
--- a/third_party/dav1d/src/arm/32/mc.S
+++ b/third_party/dav1d/src/arm/32/mc.S
@@ -23,18 +23,16 @@
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
 
-#if BITDEPTH == 8
-
 .macro avg dst0, dst1, t0, t1, t2, t3
         vld1.16         {\t0,\t1},   [r2, :128]!
         vld1.16         {\t2,\t3},   [r3, :128]!
         vadd.i16        \t0,   \t0,  \t2
         vadd.i16        \t1,   \t1,  \t3
         vqrshrun.s16    \dst0, \t0,  #5
         vqrshrun.s16    \dst1, \t1,  #5
 .endm
@@ -69,43 +67,42 @@
         vqrshrun.s16    \dst1, \t1,  #4
 .endm
 
 .macro bidir_fn type
 function \type\()_8bpc_neon, export=1
         push            {r4-r6,lr}
         ldr             r4, [sp, #16]
         ldr             r5, [sp, #20]
+        clz             r4,  r4
 .ifnc \type, avg
         ldr             lr, [sp, #24]
 .endif
 .ifc \type, w_avg
         vdup.s16        q15, lr
         vneg.s16        q15, q15
         vshl.i16        q15, q15, #11
 .endif
 .ifc \type, mask
         vmov.i8         q15, #256-2
 .endif
-        rbit            r4,  r4
         adr             r12, L(\type\()_tbl)
-        clz             r4,  r4
+        sub             r4,  r4,  #24
         ldr             r4,  [r12, r4, lsl #2]
         \type           d16, d17, q0,  q1,  q2,  q3
         add             r12, r12, r4
         bx              r12
         .align 2
 L(\type\()_tbl):
-        .word 0, 0
-        .word 4f    - L(\type\()_tbl) + CONFIG_THUMB
-        .word 80f   - L(\type\()_tbl) + CONFIG_THUMB
+        .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+        .word 640f  - L(\type\()_tbl) + CONFIG_THUMB
+        .word 320f  - L(\type\()_tbl) + CONFIG_THUMB
         .word 160f  - L(\type\()_tbl) + CONFIG_THUMB
-        .word 320f  - L(\type\()_tbl) + CONFIG_THUMB
-        .word 640f  - L(\type\()_tbl) + CONFIG_THUMB
-        .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
+        .word 80f   - L(\type\()_tbl) + CONFIG_THUMB
+        .word 4f    - L(\type\()_tbl) + CONFIG_THUMB
 4:
         add             r6,  r0,  r1
         lsl             r1,  r1,  #1
         cmp             r5,  #4
         vst1.32         {d16[0]},  [r0, :32], r1
         vst1.32         {d16[1]},  [r6, :32], r1
         vst1.32         {d17[0]},  [r0, :32], r1
         vst1.32         {d17[1]},  [r6, :32], r1
@@ -210,10 +207,8 @@ 128:
 0:
         pop             {r4-r6,pc}
 endfunc
 .endm
 
 bidir_fn avg
 bidir_fn w_avg
 bidir_fn mask
-
-#endif /* BITDEPTH == 8 */
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -22,18 +22,16 @@
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
 
-#if BITDEPTH == 8
-
 .macro avg dst, t0, t1
         ld1             {\t0\().8h},   [x2],  16
         ld1             {\t1\().8h},   [x3],  16
         add             \t0\().8h,   \t0\().8h,   \t1\().8h
         sqrshrun        \dst\().8b,  \t0\().8h,   #5
 .endm
 
 .macro avg16 dst, t0, t1, t2, t3
@@ -93,27 +91,27 @@
         add             \t0\().8h,   \t2\().8h,   \t0\().8h
         add             \t1\().8h,   \t3\().8h,   \t1\().8h
         sqrshrun        \dst\().8b,  \t0\().8h,   #4
         sqrshrun2       \dst\().16b, \t1\().8h,   #4
 .endm
 
 .macro bidir_fn type
 function \type\()_8bpc_neon, export=1
+        clz             w4,  w4
 .ifc \type, w_avg
         dup             v30.8h, w6
         neg             v30.8h, v30.8h
         shl             v30.8h, v30.8h, #11
 .endif
 .ifc \type, mask
         movi            v31.16b, #256-2
 .endif
-        rbit            w4,  w4
         adr             x7,  L(\type\()_tbl)
-        clz             w4,  w4
+        sub             w4,  w4,  #24
         \type           v4,  v0,  v1
         ldrh            w4,  [x7, x4, lsl #1]
         \type           v5,  v2,  v3
         sub             x7,  x7,  w4, uxtw
         br              x7
 4:
         cmp             w5,  #4
         st1             {v4.s}[0],  [x0], x1
@@ -215,23 +213,20 @@ 128:
         subs            w5,  w5,  #1
         st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
         b.le            0f
         \type\()16      v4, v0, v1, v2, v3
         b               128b
 0:
         ret
 L(\type\()_tbl):
-        .hword 0, 0
-        .hword L(\type\()_tbl) -    4b
-        .hword L(\type\()_tbl) -    8b
+        .hword L(\type\()_tbl) - 1280b
+        .hword L(\type\()_tbl) -  640b
+        .hword L(\type\()_tbl) -  320b
         .hword L(\type\()_tbl) -  160b
-        .hword L(\type\()_tbl) -  320b
-        .hword L(\type\()_tbl) -  640b
-        .hword L(\type\()_tbl) - 1280b
+        .hword L(\type\()_tbl) -    8b
+        .hword L(\type\()_tbl) -    4b
 endfunc
 .endm
 
 bidir_fn avg
 bidir_fn w_avg
 bidir_fn mask
-
-#endif /* BITDEPTH == 8 */
--- a/third_party/dav1d/src/arm/cpu.c
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -20,13 +20,68 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "config.h"
+
 #include "src/arm/cpu.h"
 
+#if defined(HAVE_GETAUXVAL) && ARCH_ARM
+#include <sys/auxv.h>
+
+#ifndef HWCAP_ARM_NEON
+#define HWCAP_ARM_NEON (1 << 12)
+#endif
+#define NEON_HWCAP HWCAP_ARM_NEON
+
+#elif defined(__ANDROID__)
+#include <stdio.h>
+#include <string.h>
+
+static unsigned parse_proc_cpuinfo(const char *flag) {
+    FILE *file = fopen("/proc/cpuinfo", "r");
+    if (!file)
+        return 0;
+
+    char line_buffer[120];
+    const char *line;
+
+    while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
+        if (strstr(line, flag)) {
+            fclose(file);
+            return 1;
+        }
+        // if line is incomplete seek back to avoid splitting the search
+        // string into two buffers
+        if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+            if (fseek(file, -strlen(flag), SEEK_CUR))
+                break;
+        }
+    }
+
+    fclose(file);
+
+    return 0;
+}
+#endif
+
 unsigned dav1d_get_cpu_flags_arm(void) {
-    return DAV1D_ARM_CPU_FLAG_NEON;
+    unsigned flags = 0;
+#if ARCH_AARCH64
+    flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+    flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(__ANDROID__)
+    flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(__APPLE__)
+    flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(_WIN32)
+    flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#endif
+
+    return flags;
 }
rename from third_party/dav1d/src/arm/mc_init.c
rename to third_party/dav1d/src/arm/mc_init_tmpl.c
--- a/third_party/dav1d/src/cdef.h
+++ b/third_party/dav1d/src/cdef.h
@@ -35,28 +35,40 @@
 
 enum CdefEdgeFlags {
     HAVE_LEFT = 1 << 0,
     HAVE_RIGHT = 1 << 1,
     HAVE_TOP = 1 << 2,
     HAVE_BOTTOM = 1 << 3,
 };
 
+#ifdef BITDEPTH
+typedef const pixel (*const_left_pixel_row_2px)[2];
+#else
+typedef const void *const_left_pixel_row_2px;
+#endif
+
 // CDEF operates entirely on pre-filter data; if bottom/right edges are
 // present (according to $edges), then the pre-filter data is located in
 // $dst. However, the edge pixels above $dst may be post-filter, so in
 // order to get access to pre-filter top pixels, use $top.
-typedef void (*cdef_fn)(pixel *dst, ptrdiff_t stride,
-                        /*const*/ pixel *const top[2],
-                        int pri_strength, int sec_strength,
-                        int dir, int damping, enum CdefEdgeFlags edges);
-typedef int (*cdef_dir_fn)(const pixel *dst, ptrdiff_t stride,
-                           unsigned *var);
+#define decl_cdef_fn(name) \
+void (name)(pixel *dst, ptrdiff_t stride, const_left_pixel_row_2px left, \
+            /*const*/ pixel *const top[2], int pri_strength, int sec_strength, \
+            int dir, int damping, enum CdefEdgeFlags edges)
+typedef decl_cdef_fn(*cdef_fn);
+
+#define decl_cdef_dir_fn(name) \
+int (name)(const pixel *dst, ptrdiff_t dst_stride, unsigned *var)
+typedef decl_cdef_dir_fn(*cdef_dir_fn);
 
 typedef struct Dav1dCdefDSPContext {
     cdef_dir_fn dir;
     cdef_fn fb[3 /* 444/luma, 422, 420 */];
 } Dav1dCdefDSPContext;
 
 void dav1d_cdef_dsp_init_8bpc(Dav1dCdefDSPContext *c);
 void dav1d_cdef_dsp_init_10bpc(Dav1dCdefDSPContext *c);
 
+void dav1d_cdef_dsp_init_x86_8bpc(Dav1dCdefDSPContext *c);
+void dav1d_cdef_dsp_init_x86_10bpc(Dav1dCdefDSPContext *c);
+
 #endif /* __DAV1D_SRC_CDEF_H__ */
--- a/third_party/dav1d/src/cdef_apply_tmpl.c
+++ b/third_party/dav1d/src/cdef_apply_tmpl.c
@@ -67,32 +67,16 @@ static void backup2x8(pixel dst[3][8][2]
 
     x_off >>= ss_hor;
     for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(src_stride[1])) {
         pixel_copy(dst[1][y], &src[1][y_off + x_off - 2], 2);
         pixel_copy(dst[2][y], &src[2][y_off + x_off - 2], 2);
     }
 }
 
-static void restore2x8(pixel *const dst[3],
-                       const ptrdiff_t dst_stride[2],
-                       const pixel src[3][8][2], const enum Dav1dPixelLayout layout)
-{
-    for (int y = 0, y_off = 0; y < 8; y++, y_off += PXSTRIDE(dst_stride[0]))
-        pixel_copy(&dst[0][y_off - 2], src[0][y], 2);
-
-    if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
-    const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
-
-    for (int y = 0, y_off = 0; y < (8 >> ss_ver); y++, y_off += PXSTRIDE(dst_stride[1])) {
-        pixel_copy(&dst[1][y_off - 2], src[1][y], 2);
-        pixel_copy(&dst[2][y_off - 2], src[2][y], 2);
-    }
-}
-
 static int adjust_strength(const int strength, const unsigned var) {
     if (!var) return 0;
     const int i = var >> 6 ? imin(ulog2(var >> 6), 12) : 0;
     return (strength * (4 + i) + 8) >> 4;
 }
 
 void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
                              pixel *const p[3],
@@ -111,17 +95,17 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameC
     const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
 
     // FIXME a design improvement that could be made here is to keep a set of
     // flags for each block position on whether the block was filtered; if not,
     // the backup of pre-filter data is empty, and the restore is therefore
     // unnecessary as well.
 
-    for (int by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
+    for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
         const int tf = f->lf.top_pre_cdef_toggle;
         if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
 
         if (edges & HAVE_BOTTOM) {
             // backup pre-filter data for next iteration
             backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
                          8, f->bw * 4, layout);
         }
@@ -156,70 +140,66 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameC
                 const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
                 if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
                        lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
                 {
                     last_skip = 1;
                     goto next_b;
                 }
 
-                if (!last_skip) {
-                    // backup post-filter data (will be restored at the end)
-                    backup2x8(lr_bak[1], bptrs, f->cur.p.stride, 0, layout);
-
-                    // restore pre-filter data from last iteration
-                    restore2x8(bptrs, f->cur.p.stride, lr_bak[0], layout);
+                if (last_skip && edges & HAVE_LEFT) {
+                    // we didn't backup the prefilter data because it wasn't
+                    // there, so do it here instead
+                    backup2x8(lr_bak[bit], bptrs, f->cur.p.stride, 0, layout);
                 }
                 if (edges & HAVE_RIGHT) {
                     // backup pre-filter data for next iteration
-                    backup2x8(lr_bak[0], bptrs, f->cur.p.stride, 8, layout);
+                    backup2x8(lr_bak[!bit], bptrs, f->cur.p.stride, 8, layout);
                 }
 
                 // the actual filter
                 const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
                 int y_sec_lvl = y_lvl & 3;
                 y_sec_lvl += y_sec_lvl == 3;
                 y_sec_lvl <<= BITDEPTH - 8;
                 const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
                 int uv_sec_lvl = uv_lvl & 3;
                 uv_sec_lvl += uv_sec_lvl == 3;
                 uv_sec_lvl <<= BITDEPTH - 8;
                 unsigned variance;
                 const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
                                               &variance);
                 if (y_lvl) {
-                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0],
+                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0], lr_bak[bit][0],
                                     (pixel *const [2]) {
                                         &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
                                         &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
                                     },
                                     adjust_strength(y_pri_lvl, variance),
                                     y_sec_lvl, y_pri_lvl ? dir : 0,
                                     damping, edges);
                 }
                 if (uv_lvl && has_chroma) {
                     const int uvdir =
                         f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
                         ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
                     for (int pl = 1; pl <= 2; pl++) {
                         dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
+                                             lr_bak[bit][pl],
                                              (pixel *const [2]) {
                                                  &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
                                                  &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
                                              },
                                              uv_pri_lvl, uv_sec_lvl,
                                              uv_pri_lvl ? uvdir : 0,
                                              damping - 1, edges);
                     }
                 }
 
-                if (!last_skip) {
-                    // restore post-filter data from the beginning of this loop
-                    restore2x8(bptrs, f->cur.p.stride, lr_bak[1], layout);
-                }
+                bit ^= 1;
                 last_skip = 0;
 
             next_b:
                 bptrs[0] += 8;
                 bptrs[1] += 8 >> ss_hor;
                 bptrs[2] += 8 >> ss_hor;
             }
 
--- a/third_party/dav1d/src/cdef_tmpl.c
+++ b/third_party/dav1d/src/cdef_tmpl.c
@@ -20,279 +20,240 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
 #include "config.h"
 
 #include <assert.h>
 #include <stdlib.h>
 
 #include "common/intops.h"
 
 #include "src/cdef.h"
 
-static const int8_t cdef_directions4[8 /* dir */][2 /* pass */] = {
-    { -1 * 8 + 1, -2 * 8 + 2 },
-    {  0 * 8 + 1, -1 * 8 + 2 },
-    {  0 * 8 + 1,  0 * 8 + 2 },
-    {  0 * 8 + 1,  1 * 8 + 2 },
-    {  1 * 8 + 1,  2 * 8 + 2 },
-    {  1 * 8 + 0,  2 * 8 + 1 },
-    {  1 * 8 + 0,  2 * 8 + 0 },
-    {  1 * 8 + 0,  2 * 8 - 1 }
-};
-
-static const int8_t cdef_directions8[8 /* dir */][2 /* pass */] = {
-    { -1 * 16 + 1, -2 * 16 + 2 },
-    {  0 * 16 + 1, -1 * 16 + 2 },
-    {  0 * 16 + 1,  0 * 16 + 2 },
-    {  0 * 16 + 1,  1 * 16 + 2 },
-    {  1 * 16 + 1,  2 * 16 + 2 },
-    {  1 * 16 + 0,  2 * 16 + 1 },
-    {  1 * 16 + 0,  2 * 16 + 0 },
-    {  1 * 16 + 0,  2 * 16 - 1 }
-};
-static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
-static const uint8_t cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
-
 static inline int constrain(const int diff, const int threshold,
                             const int damping)
 {
     if (!threshold) return 0;
     const int shift = imax(0, damping - ulog2(threshold));
     return apply_sign(imin(abs(diff), imax(0, threshold - (abs(diff) >> shift))),
                       diff);
 }
 
-/*
- * <code partially copied from libaom>
- */
-
-#define CDEF_VERY_LARGE (30000)
-
-static void fill(uint16_t *tmp, const ptrdiff_t stride,
-                 const int w, const int h)
+static inline void fill(uint16_t *tmp, const ptrdiff_t stride,
+                        const int w, const int h)
 {
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++)
-            tmp[x] = CDEF_VERY_LARGE;
+            tmp[x] = INT16_MAX;
         tmp += stride;
     }
 }
 
-/* Smooth in the direction detected. */
-static void cdef_filter_block_c(pixel *const dst, const ptrdiff_t dst_stride,
-                                /*const*/ pixel *const top[2],
-                                const int w, const int h, const int pri_strength,
-                                const int sec_strength, const int dir,
-                                const int damping, const enum CdefEdgeFlags edges)
+static void padding(uint16_t *tmp, const ptrdiff_t tmp_stride,
+                    const pixel *src, const ptrdiff_t src_stride,
+                    const pixel (*left)[2], pixel *const top[2],
+                    const int w, const int h,
+                    const enum CdefEdgeFlags edges)
 {
-    const ptrdiff_t tmp_stride = 16 >> (w == 4);
-    assert((w == 4 || w == 8) && (h == 4 || h == 8));
-    uint16_t tmp[192];  // 16*12 is the maximum value of tmp_stride * (h + 4)
-    uint16_t *tmp2 = tmp + 2 * tmp_stride + 2;
-    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
-    const uint8_t *const sec_taps = cdef_sec_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
-    const int8_t (*cdef_directions)[2];
-
-    assert(w == 4 || w == 8);
-    cdef_directions = w == 4 ? cdef_directions4 : cdef_directions8;
-
     // fill extended input buffer
     int x_start = -2, x_end = w + 2, y_start = -2, y_end = h + 2;
     if (!(edges & HAVE_TOP)) {
-        fill(tmp, tmp_stride, w + 4, 2);
+        fill(tmp - 2 - 2 * tmp_stride, tmp_stride, w + 4, 2);
         y_start = 0;
     }
     if (!(edges & HAVE_BOTTOM)) {
-        fill(tmp + (h + 2) * tmp_stride, tmp_stride, w + 4, 2);
+        fill(tmp + h * tmp_stride - 2, tmp_stride, w + 4, 2);
         y_end -= 2;
     }
     if (!(edges & HAVE_LEFT)) {
-        fill(tmp + (2 + y_start) * tmp_stride, tmp_stride, 2, y_end - y_start);
+        fill(tmp + y_start * tmp_stride - 2, tmp_stride, 2, y_end - y_start);
         x_start = 0;
     }
     if (!(edges & HAVE_RIGHT)) {
-        fill(tmp + (2 + y_start) * tmp_stride + w + 2, tmp_stride,
-             2, y_end - y_start);
+        fill(tmp + y_start * tmp_stride + w, tmp_stride, 2, y_end - y_start);
         x_end -= 2;
     }
+
     for (int y = y_start; y < 0; y++)
         for (int x = x_start; x < x_end; x++)
-            tmp2[y * tmp_stride + x] = top[y & 1][x];
-    for (int y = 0; y < y_end; y++)
-        for (int x = x_start; x < x_end; x++)
-            tmp2[y * tmp_stride + x] = dst[y * PXSTRIDE(dst_stride) + x];
+            tmp[x + y * tmp_stride] = top[y & 1][x];
+    for (int y = 0; y < h; y++)
+        for (int x = x_start; x < 0; x++)
+            tmp[x + y * tmp_stride] = left[y][2 + x];
+    for (int y = 0; y < y_end; y++) {
+        for (int x = (y < h) ? 0 : x_start; x < x_end; x++)
+            tmp[x] = src[x];
+        src += PXSTRIDE(src_stride);
+        tmp += tmp_stride;
+    }
+}
+
+static NOINLINE void
+cdef_filter_block_c(pixel *dst, const ptrdiff_t dst_stride,
+                    const pixel (*left)[2], /*const*/ pixel *const top[2],
+                    const int w, const int h, const int pri_strength,
+                    const int sec_strength, const int dir,
+                    const int damping, const enum CdefEdgeFlags edges)
+{
+    static const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
+        { -1 * 12 + 1, -2 * 12 + 2 },
+        {  0 * 12 + 1, -1 * 12 + 2 },
+        {  0 * 12 + 1,  0 * 12 + 2 },
+        {  0 * 12 + 1,  1 * 12 + 2 },
+        {  1 * 12 + 1,  2 * 12 + 2 },
+        {  1 * 12 + 0,  2 * 12 + 1 },
+        {  1 * 12 + 0,  2 * 12 + 0 },
+        {  1 * 12 + 0,  2 * 12 - 1 }
+    };
+    static const uint8_t cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
+    static const uint8_t sec_taps[2] = { 2, 1 };
+    const ptrdiff_t tmp_stride = 12;
+    assert((w == 4 || w == 8) && (h == 4 || h == 8));
+    uint16_t tmp_buf[144];  // 12*12 is the maximum value of tmp_stride * (h + 4)
+    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2;
+    const uint8_t *const pri_taps = cdef_pri_taps[(pri_strength >> (BITDEPTH - 8)) & 1];
+
+    padding(tmp, tmp_stride, dst, dst_stride, left, top, w, h, edges);
 
     // run actual filter
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++) {
             int sum = 0;
-            const int px = dst[y * PXSTRIDE(dst_stride) + x];
+            const int px = dst[x];
             int max = px, min = px;
             for (int k = 0; k < 2; k++) {
-                const int8_t off1 = cdef_directions[dir][k];
-                const int p0 = tmp2[y * tmp_stride + x + off1];
-                const int p1 = tmp2[y * tmp_stride + x - off1];
+                const int off1 = cdef_directions[dir][k];
+                const int p0 = tmp[x + off1];
+                const int p1 = tmp[x - off1];
                 sum += pri_taps[k] * constrain(p0 - px, pri_strength, damping);
                 sum += pri_taps[k] * constrain(p1 - px, pri_strength, damping);
-                if (p0 != CDEF_VERY_LARGE) max = imax(p0, max);
-                if (p1 != CDEF_VERY_LARGE) max = imax(p1, max);
+                if (p0 != INT16_MAX) max = imax(p0, max);
+                if (p1 != INT16_MAX) max = imax(p1, max);
                 min = imin(p0, min);
                 min = imin(p1, min);
-                const int8_t off2 = cdef_directions[(dir + 2) & 7][k];
-                const int s0 = tmp2[y * tmp_stride + x + off2];
-                const int s1 = tmp2[y * tmp_stride + x - off2];
-                const int8_t off3 = cdef_directions[(dir + 6) & 7][k];
-                const int s2 = tmp2[y * tmp_stride + x + off3];
-                const int s3 = tmp2[y * tmp_stride + x - off3];
-                if (s0 != CDEF_VERY_LARGE) max = imax(s0, max);
-                if (s1 != CDEF_VERY_LARGE) max = imax(s1, max);
-                if (s2 != CDEF_VERY_LARGE) max = imax(s2, max);
-                if (s3 != CDEF_VERY_LARGE) max = imax(s3, max);
+                const int off2 = cdef_directions[(dir + 2) & 7][k];
+                const int s0 = tmp[x + off2];
+                const int s1 = tmp[x - off2];
+                const int off3 = cdef_directions[(dir + 6) & 7][k];
+                const int s2 = tmp[x + off3];
+                const int s3 = tmp[x - off3];
+                if (s0 != INT16_MAX) max = imax(s0, max);
+                if (s1 != INT16_MAX) max = imax(s1, max);
+                if (s2 != INT16_MAX) max = imax(s2, max);
+                if (s3 != INT16_MAX) max = imax(s3, max);
                 min = imin(s0, min);
                 min = imin(s1, min);
                 min = imin(s2, min);
                 min = imin(s3, min);
                 sum += sec_taps[k] * constrain(s0 - px, sec_strength, damping);
                 sum += sec_taps[k] * constrain(s1 - px, sec_strength, damping);
                 sum += sec_taps[k] * constrain(s2 - px, sec_strength, damping);
                 sum += sec_taps[k] * constrain(s3 - px, sec_strength, damping);
             }
-            dst[y * PXSTRIDE(dst_stride) + x] =
-                iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
+            dst[x] = iclip(px + ((8 + sum - (sum < 0)) >> 4), min, max);
         }
+        dst += PXSTRIDE(dst_stride);
+        tmp += tmp_stride;
     }
 }
 
-/*
- * </code partially copied from libaom>
- */
-
 #define cdef_fn(w, h) \
 static void cdef_filter_block_##w##x##h##_c(pixel *const dst, \
                                             const ptrdiff_t stride, \
+                                            const pixel (*left)[2], \
                                             /*const*/ pixel *const top[2], \
                                             const int pri_strength, \
                                             const int sec_strength, \
                                             const int dir, \
                                             const int damping, \
                                             const enum CdefEdgeFlags edges) \
 { \
-    cdef_filter_block_c(dst, stride, top, w, h, pri_strength, sec_strength, \
+    cdef_filter_block_c(dst, stride, left, top, w, h, pri_strength, sec_strength, \
                         dir, damping, edges); \
 }
 
 cdef_fn(4, 4);
 cdef_fn(4, 8);
 cdef_fn(8, 8);
 
-/*
- * <code copied from libaom>
- */
-
-/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
-   The search minimizes the weighted variance along all the lines in a
-   particular direction, i.e. the squared error between the input and a
-   "predicted" block where each pixel is replaced by the average along a line
-   in a particular direction. Since each direction have the same sum(x^2) term,
-   that term is never computed. See Section 2, step 2, of:
-   http://jmvalin.ca/notes/intra_paint.pdf */
-static const uint16_t div_table[] = {
-    0, 840, 420, 280, 210, 168, 140, 120, 105
-};
 static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
                            unsigned *const var)
 {
-    int i;
-    int32_t cost[8] = { 0 };
-    int partial[8][15] = { { 0 } };
-    int32_t best_cost = 0;
-    int best_dir = 0;
-    /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
-     The output is then 840 times larger, but we don't care for finding
-     the max. */
-    for (i = 0; i < 8; i++) {
-        int j;
-        for (j = 0; j < 8; j++) {
-            int x;
-            /* We subtract 128 here to reduce the maximum range of the squared
-             partial sums. */
-            x = (img[i * PXSTRIDE(stride) + j] >> (BITDEPTH - 8)) - 128;
-            partial[0][i + j] += x;
-            partial[1][i + j / 2] += x;
-            partial[2][i] += x;
-            partial[3][3 + i - j / 2] += x;
-            partial[4][7 + i - j] += x;
-            partial[5][3 - i / 2 + j] += x;
-            partial[6][j] += x;
-            partial[7][i / 2 + j] += x;
+    int partial_sum_hv[2][8] = { { 0 } };
+    int partial_sum_diag[2][15] = { { 0 } };
+    int partial_sum_alt[4][11] = { { 0 } };
+
+    for (int y = 0; y < 8; y++) {
+        for (int x = 0; x < 8; x++) {
+            const int px = (img[x] >> (BITDEPTH - 8)) - 128;
+
+            partial_sum_diag[0][     y       +  x      ] += px;
+            partial_sum_alt [0][     y       + (x >> 1)] += px;
+            partial_sum_hv  [0][     y                 ] += px;
+            partial_sum_alt [1][3 +  y       - (x >> 1)] += px;
+            partial_sum_diag[1][7 +  y       -  x      ] += px;
+            partial_sum_alt [2][3 - (y >> 1) +  x      ] += px;
+            partial_sum_hv  [1][                x      ] += px;
+            partial_sum_alt [3][    (y >> 1) +  x      ] += px;
+        }
+        img += PXSTRIDE(stride);
+    }
+
+    unsigned cost[8] = { 0 };
+    for (int n = 0; n < 8; n++) {
+        cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
+        cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
+    }
+    cost[2] *= 105;
+    cost[6] *= 105;
+
+    static const uint16_t div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
+    for (int n = 0; n < 7; n++) {
+        const int d = div_table[n];
+        cost[0] += (partial_sum_diag[0][n]      * partial_sum_diag[0][n] +
+                    partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
+        cost[4] += (partial_sum_diag[1][n]      * partial_sum_diag[1][n] +
+                    partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
+    }
+    cost[0] += partial_sum_diag[0][7] * partial_sum_diag[0][7] * 105;
+    cost[4] += partial_sum_diag[1][7] * partial_sum_diag[1][7] * 105;
+
+    for (int n = 0; n < 4; n++) {
+        unsigned *const cost_ptr = &cost[n * 2 + 1];
+        for (int m = 0; m < 5; m++)
+            *cost_ptr += partial_sum_alt[n][3 + m] * partial_sum_alt[n][3 + m];
+        *cost_ptr *= 105;
+        for (int m = 0; m < 3; m++) {
+            const int d = div_table[2 * m + 1];
+            *cost_ptr += (partial_sum_alt[n][m]      * partial_sum_alt[n][m] +
+                          partial_sum_alt[n][10 - m] * partial_sum_alt[n][10 - m]) * d;
         }
     }
-    for (i = 0; i < 8; i++) {
-        cost[2] += partial[2][i] * partial[2][i];
-        cost[6] += partial[6][i] * partial[6][i];
-    }
-    cost[2] *= div_table[8];
-    cost[6] *= div_table[8];
-    for (i = 0; i < 7; i++) {
-        cost[0] += (partial[0][i] * partial[0][i] +
-                    partial[0][14 - i] * partial[0][14 - i]) *
-                   div_table[i + 1];
-        cost[4] += (partial[4][i] * partial[4][i] +
-                    partial[4][14 - i] * partial[4][14 - i]) *
-                   div_table[i + 1];
-    }
-    cost[0] += partial[0][7] * partial[0][7] * div_table[8];
-    cost[4] += partial[4][7] * partial[4][7] * div_table[8];
-    for (i = 1; i < 8; i += 2) {
-        int j;
-        for (j = 0; j < 4 + 1; j++) {
-            cost[i] += partial[i][3 + j] * partial[i][3 + j];
-        }
-        cost[i] *= div_table[8];
-        for (j = 0; j < 4 - 1; j++) {
-            cost[i] += (partial[i][j] * partial[i][j] +
-                        partial[i][10 - j] * partial[i][10 - j]) *
-                       div_table[2 * j + 2];
+
+    int best_dir = 0;
+    unsigned best_cost = cost[0];
+    for (int n = 1; n < 8; n++) {
+        if (cost[n] > best_cost) {
+            best_cost = cost[n];
+            best_dir = n;
         }
     }
-    for (i = 0; i < 8; i++) {
-        if (cost[i] > best_cost) {
-            best_cost = cost[i];
-            best_dir = i;
-        }
-    }
-    /* Difference between the optimal variance and the variance along the
-     orthogonal direction. Again, the sum(x^2) terms cancel out. */
-    *var = best_cost - cost[(best_dir + 4) & 7];
-    /* We'd normally divide by 840, but dividing by 1024 is close enough
-     for what we're going to do with this. */
-    *var >>= 10;
+
+    *var = (best_cost - (cost[best_dir ^ 4])) >> 10;
     return best_dir;
 }
 
-/*
- * </code copied from libaom>
- */
-
 void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
     c->dir = cdef_find_dir_c;
     c->fb[0] = cdef_filter_block_8x8_c;
     c->fb[1] = cdef_filter_block_4x8_c;
     c->fb[2] = cdef_filter_block_4x4_c;
+
+#if HAVE_ASM && ARCH_X86 && BITDEPTH == 8
+    bitfn(dav1d_cdef_dsp_init_x86)(c);
+#endif
 }
--- a/third_party/dav1d/src/cdf.c
+++ b/third_party/dav1d/src/cdf.c
@@ -4227,17 +4227,17 @@ void dav1d_cdf_thread_alloc(CdfThreadCon
 void dav1d_cdf_thread_ref(CdfThreadContext *const dst,
                           CdfThreadContext *const src)
 {
     dav1d_ref_inc(src->ref);
     *dst = *src;
 }
 
 void dav1d_cdf_thread_unref(CdfThreadContext *const cdf) {
-    dav1d_ref_dec(cdf->ref);
+    dav1d_ref_dec(&cdf->ref);
     memset(cdf, 0, sizeof(*cdf));
 }
 
 void dav1d_cdf_thread_wait(CdfThreadContext *const cdf) {
     if (!cdf->t) return;
 
     if (atomic_load(cdf->progress)) return;
     pthread_mutex_lock(&cdf->t->lock);
--- a/third_party/dav1d/src/cpu.c
+++ b/third_party/dav1d/src/cpu.c
@@ -32,17 +32,17 @@
 
 static unsigned flags_mask = -1;
 
 unsigned dav1d_get_cpu_flags(void) {
     static unsigned flags;
     static uint8_t checked = 0;
 
     if (!checked) {
-#if ARCH_AARCH64 || ARCH_ARM
+#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
         flags = dav1d_get_cpu_flags_arm();
 #elif ARCH_X86 && HAVE_ASM
         flags = dav1d_get_cpu_flags_x86();
 #else
         flags = 0;
 #endif
         checked = 1;
     }
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/src/ctx.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DAV1D_SRC_CTX_H__
+#define __DAV1D_SRC_CTX_H__
+
+#include <stdint.h>
+
+#include "common/attributes.h"
+
+union alias64 { uint64_t u64; uint8_t u8[8]; } ATTR_ALIAS;
+union alias32 { uint32_t u32; uint8_t u8[4]; } ATTR_ALIAS;
+union alias16 { uint16_t u16; uint8_t u8[2]; } ATTR_ALIAS;
+union alias8 { uint8_t u8; } ATTR_ALIAS;
+
+#define set_ctx_rep4(type, var, off, val) do { \
+        const uint64_t const_val = val; \
+        ((union alias64 *) &var[off +  0])->u64 = const_val; \
+        ((union alias64 *) &var[off +  8])->u64 = const_val; \
+        ((union alias64 *) &var[off + 16])->u64 = const_val; \
+        ((union alias64 *) &var[off + 24])->u64 = const_val; \
+    } while (0)
+#define set_ctx_rep2(type, var, off, val) do { \
+        const uint64_t const_val = val; \
+        ((union alias64 *) &var[off + 0])->u64 = const_val; \
+        ((union alias64 *) &var[off + 8])->u64 = const_val; \
+    } while (0)
+#define set_ctx_rep1(typesz, var, off, val) \
+    ((union alias##typesz *) &var[off])->u##typesz = val
+#define case_set(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+    }
+#define case_set_upto16(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    }
+#define case_set_upto32_with_default(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    case 32: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep4); break; \
+    default: default_memset(dir, diridx, off, var); break; \
+    }
+#define case_set_upto16_with_default(var, dir, diridx, off) \
+    switch (var) { \
+    case  1: set_ctx( 8, dir, diridx, off, 0x01, set_ctx_rep1); break; \
+    case  2: set_ctx(16, dir, diridx, off, 0x0101, set_ctx_rep1); break; \
+    case  4: set_ctx(32, dir, diridx, off, 0x01010101U, set_ctx_rep1); break; \
+    case  8: set_ctx(64, dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep1); break; \
+    case 16: set_ctx(  , dir, diridx, off, 0x0101010101010101ULL, set_ctx_rep2); break; \
+    default: default_memset(dir, diridx, off, var); break; \
+    }
+
+#endif /* __DAV1D_SRC_CTX_H__ */
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -30,46 +30,59 @@
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "dav1d/data.h"
 
 #include "common/validate.h"
 
+#include "src/data.h"
 #include "src/ref.h"
 
-int dav1d_data_create(Dav1dData *const buf, const size_t sz) {
-    validate_input_or_ret(buf != NULL, -EINVAL);
+uint8_t * dav1d_data_create(Dav1dData *const buf, const size_t sz) { /* now returns a data pointer, or NULL on any failure, instead of an int status */
+    validate_input_or_ret(buf != NULL, NULL); /* invalid argument is reported as NULL (was -EINVAL) */
 
     buf->ref = dav1d_ref_create(sz);
-    if (!buf->ref) return -ENOMEM;
-    buf->data = buf->ref->data;
+    if (!buf->ref) return NULL; /* ref creation failed (was -ENOMEM); callers can no longer tell the two failures apart */
+    buf->data = buf->ref->const_data; /* buf->data is filled from the ref's const_data member */
     buf->sz = sz;
 
-    return 0;
+    return buf->ref->data; /* hand the caller the ref's non-const data member */
 }
 
-int dav1d_data_wrap(Dav1dData *const buf, uint8_t *const ptr, const size_t sz,
-                    void (*free_callback)(uint8_t *data, void *user_data),
+int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr, const size_t sz,
+                    void (*free_callback)(const uint8_t *data, void *user_data), /* required: a NULL callback is rejected with -EINVAL below */
                     void *user_data)
 {
     validate_input_or_ret(buf != NULL, -EINVAL);
     validate_input_or_ret(ptr != NULL, -EINVAL);
     validate_input_or_ret(free_callback != NULL, -EINVAL);
 
     buf->ref = dav1d_ref_wrap(ptr, free_callback, user_data);
     if (!buf->ref) return -ENOMEM;
     buf->data = ptr;
     buf->sz = sz;
 
     return 0;
 }
 
+void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) { /* transfer src's contents into dst and leave src zeroed */
+    validate_input(dst != NULL);
+    validate_input(dst->data == NULL); /* dst must not already point at data */
+    validate_input(src != NULL);
+
+    if (src->ref)
+        validate_input(src->data != NULL); /* a src that holds a ref must also have a data pointer */
+
+    *dst = *src; /* plain struct copy: dst receives every field of src, including ref */
+    memset(src, 0, sizeof(*src)); /* zero src so its ref/data fields no longer point at the buffer */
+}
+
 void dav1d_data_unref(Dav1dData *const buf) {
     validate_input(buf != NULL);
 
     if (buf->ref) {
         validate_input(buf->data != NULL);
-        dav1d_ref_dec(buf->ref);
+        dav1d_ref_dec(&buf->ref); /* now passes the address of the ref field rather than the ref itself */
     }
     memset(buf, 0, sizeof(*buf));
 }
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/src/data.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DAV1D_SRC_DATA_H__
+#define __DAV1D_SRC_DATA_H__
+
+#include "dav1d/data.h"
+
+/**
+ * Move a data reference: dst (whose data must be NULL) takes over the contents of *src, which is then zeroed.
+ */
+void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
+
+#endif /* __DAV1D_SRC_DATA_H__ */
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -33,16 +33,17 @@
 #include <stdio.h>
 #include <inttypes.h>
 
 #include "dav1d/data.h"
 
 #include "common/intops.h"
 #include "common/mem.h"
 
+#include "src/ctx.h"
 #include "src/decode.h"
 #include "src/dequant_tables.h"
 #include "src/env.h"
 #include "src/qm.h"
 #include "src/recon.h"
 #include "src/ref.h"
 #include "src/tables.h"
 #include "src/thread_task.h"
@@ -166,18 +167,24 @@ static void read_tx_tree(Dav1dTileContex
             t->bx += txsw;
             if (txw >= txh && t->bx < f->bw)
                 read_tx_tree(t, sub, depth + 1, masks,
                              x_off * 2 + 1, y_off * 2 + 1);
             t->bx -= txsw;
         }
         t->by -= txsh;
     } else {
-        memset(&t->a->tx[bx4], is_split ? TX_4X4 : txw, t_dim->w);
-        memset(&t->l.tx[by4], is_split ? TX_4X4 : txh, t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txh)
+        case_set_upto16(t_dim->h, l., 1, by4);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx, off, is_split ? TX_4X4 : mul * txw)
+        case_set_upto16(t_dim->w, a->, 0, bx4);
+#undef set_ctx
     }
 }
 
 static int neg_deinterleave(int diff, int ref, int max) {
     if (!ref) return diff;
     if (ref >= (max - 1)) return max - diff - 1;
     if (2 * ref < max) {
         if (diff <= 2 * ref) {
@@ -401,17 +408,17 @@ static void read_pal_plane(Dav1dTileCont
                 cache[n_cache++] = *a;
             a++;
         } while (--a_cache > 0);
     }
 
     // find reused cache entries
     int i = 0;
     for (int n = 0; n < n_cache && i < pal_sz; n++)
-        if (msac_decode_bool(&ts->msac, 128 << 7))
+        if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB))
             used_cache[i++] = cache[n];
     const int n_used_cache = i;
 
     // parse new entries
     uint16_t *const pal = f->frame_thread.pass ?
         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                             ((t->bx >> 1) + (t->by & 1))][pl] : t->pal[pl];
     if (i < pal_sz) {
@@ -465,23 +472,23 @@ static void read_pal_uv(Dav1dTileContext
     read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
 
     // V pal coding
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
     uint16_t *const pal = f->frame_thread.pass ?
         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                             ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
-    if (msac_decode_bool(&ts->msac, 128 << 7)) {
+    if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) {
         const int bits = f->cur.p.p.bpc - 4 + msac_decode_bools(&ts->msac, 2);
         int prev = pal[0] = msac_decode_bools(&ts->msac, f->cur.p.p.bpc);
         const int max = (1 << f->cur.p.p.bpc) - 1;
         for (int i = 1; i < b->pal_sz[1]; i++) {
             int delta = msac_decode_bools(&ts->msac, bits);
-            if (delta && msac_decode_bool(&ts->msac, 128 << 7)) delta = -delta;
+            if (delta && msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta = -delta;
             prev = pal[i] = (prev + delta) & max;
         }
     } else {
         for (int i = 0; i < b->pal_sz[1]; i++)
             pal[i] = msac_decode_bools(&ts->msac, f->cur.p.p.bpc);
     }
     if (DEBUG_BLOCK_INFO) {
         printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
@@ -606,23 +613,29 @@ static void read_vartx_tree(Dav1dTileCon
     // var-tx tree coding
     b->tx_split[0] = b->tx_split[1] = 0;
     b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
     if (f->frame_hdr.segmentation.lossless[b->seg_id] ||
         b->max_ytx == TX_4X4)
     {
         b->max_ytx = b->uvtx = TX_4X4;
         if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
-            memset(&t->a->tx[bx4], TX_4X4, bw4);
-            memset(&t->l.tx[by4], TX_4X4, bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, TX_4X4)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         }
     } else if (f->frame_hdr.txfm_mode != TX_SWITCHABLE || b->skip) {
         if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
-            memset(&t->a->tx[bx4], b_dim[2], bw4);
-            memset(&t->l.tx[by4], b_dim[3], bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         } else {
             assert(f->frame_hdr.txfm_mode == TX_LARGEST);
         }
         b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.p.layout];
     } else {
         assert(imin(bw4, bh4) <= 16 || b->max_ytx == TX_64X64);
         int y, x, y_off, x_off;
         const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
@@ -647,18 +660,21 @@ static inline unsigned get_prev_frame_se
                                             const int by, const int bx,
                                             const int w4, int h4,
                                             const uint8_t *ref_seg_map,
                                             const ptrdiff_t stride)
 {
     unsigned seg_id = 8;
 
     assert(f->frame_hdr.primary_ref_frame != PRIMARY_REF_NONE);
-    dav1d_thread_picture_wait(&f->refp[f->frame_hdr.primary_ref_frame],
-                              (by + h4) * 4, PLANE_TYPE_BLOCK);
+    if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr.primary_ref_frame],
+                                  (by + h4) * 4, PLANE_TYPE_BLOCK))
+    {
+        return 8;
+    }
 
     ref_seg_map += by * stride + bx;
     do {
         for (int x = 0; x < w4; x++)
             seg_id = imin(seg_id, ref_seg_map[x]);
         ref_seg_map += stride;
     } while (--h4 > 0);
     assert(seg_id < 8);
@@ -689,82 +705,102 @@ static int decode_b(Dav1dTileContext *co
     const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                            (bw4 > ss_hor || t->bx & 1) &&
                            (bh4 > ss_ver || t->by & 1);
 
     if (f->frame_thread.pass == 2) {
         if (b->intra) {
             f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
 
-            if (has_chroma) {
-                memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
-                memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
-            }
             const enum IntraPredMode y_mode_nofilt =
                 b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
-            memset(&t->l.mode[by4], y_mode_nofilt, bh4);
-            memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+            rep_macro(type, t->dir intra, off, mul)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
+            if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
+            }
         } else {
-            if (b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP) {
+            if (f->frame_hdr.frame_type & 1 /* not intrabc */ &&
+                b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
+            {
                 uint64_t mask[2] = { 0, 0 };
                 find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
                                   have_left, have_top, b->ref[0], mask);
                 derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
             }
-            f->bd_fn.recon_b_inter(t, bs, b);
+            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
 
             const uint8_t *const filter = dav1d_filter_dir[b->filter2d];
-            memset(&t->l.filter[0][by4], filter[0], bh4);
-            memset(&t->a->filter[0][bx4], filter[0], bw4);
-            memset(&t->l.filter[1][by4], filter[1], bh4);
-            memset(&t->a->filter[1][bx4], filter[1], bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+            rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+            rep_macro(type, t->dir intra, off, 0)
+            case_set(bh4, l., 1, by4);
+            case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
             if (has_chroma) {
-                memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
-                memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
             }
         }
-        memset(&t->l.intra[by4], b->intra, bh4);
-        memset(&t->a->intra[bx4], b->intra, bw4);
         return 0;
     }
 
     const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
 
     b->bl = bl;
     b->bp = bp;
     b->bs = bs;
 
-    // skip_mode
-    if (f->frame_hdr.skip_mode_enabled && imin(bw4, bh4) > 1) {
-        const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
-        b->skip_mode = msac_decode_bool_adapt(&ts->msac,
-                                              ts->cdf.m.skip_mode[smctx]);
-        if (DEBUG_BLOCK_INFO)
-            printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
-    } else {
-        b->skip_mode = 0;
-    }
+    const Av1SegmentationData *seg = NULL;
 
     // segment_id (if seg_feature for skip/ref/gmv is enabled)
     int seg_pred = 0;
     if (f->frame_hdr.segmentation.enabled) {
         if (!f->frame_hdr.segmentation.update_map) {
-            b->seg_id = f->prev_segmap ?
-                        get_prev_frame_segid(f, t->by, t->bx, w4, h4,
-                                             f->prev_segmap, f->b4_stride) : 0;
+            if (f->prev_segmap) {
+                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+                                                       f->prev_segmap,
+                                                       f->b4_stride);
+                if (seg_id >= 8) return -1;
+                b->seg_id = seg_id;
+            } else {
+                b->seg_id = 0;
+            }
+            seg = &f->frame_hdr.segmentation.seg_data.d[b->seg_id];
         } else if (f->frame_hdr.segmentation.seg_data.preskip) {
             if (f->frame_hdr.segmentation.temporal &&
                 (seg_pred = msac_decode_bool_adapt(&ts->msac,
                                        ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
                                                           t->l.seg_pred[by4]])))
             {
                 // temporal predicted seg_id
-                b->seg_id = f->prev_segmap ?
-                            get_prev_frame_segid(f, t->by, t->bx, w4, h4,
-                                                 f->prev_segmap, f->b4_stride) : 0;
+                if (f->prev_segmap) {
+                    unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
+                                                           w4, h4,
+                                                           f->prev_segmap,
+                                                           f->b4_stride);
+                    if (seg_id >= 8) return -1;
+                    b->seg_id = seg_id;
+                } else {
+                    b->seg_id = 0;
+                }
             } else {
                 int seg_ctx;
                 const unsigned pred_seg_id =
                     get_cur_frame_segid(t->by, t->bx, have_top, have_left,
                                         &seg_ctx, f->cur_segmap, f->b4_stride);
                 const unsigned diff = msac_decode_symbol_adapt(&ts->msac,
                                                    ts->cdf.m.seg_id[seg_ctx],
                                                    NUM_SEGMENTS);
@@ -774,42 +810,66 @@ static int decode_b(Dav1dTileContext *co
                                              last_active_seg_id + 1);
                 if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
                 if (b->seg_id >= NUM_SEGMENTS) b->seg_id = 0; // error?
             }
 
             if (DEBUG_BLOCK_INFO)
                 printf("Post-segid[preskip;%d]: r=%d\n",
                        b->seg_id, ts->msac.rng);
+
+            seg = &f->frame_hdr.segmentation.seg_data.d[b->seg_id];
         }
     } else {
         b->seg_id = 0;
     }
 
+    // skip_mode
+    if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
+        f->frame_hdr.skip_mode_enabled && imin(bw4, bh4) > 1)
+    {
+        const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
+        b->skip_mode = msac_decode_bool_adapt(&ts->msac,
+                                              ts->cdf.m.skip_mode[smctx]);
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
+    } else {
+        b->skip_mode = 0;
+    }
+
     // skip
-    const int sctx = t->a->skip[bx4] + t->l.skip[by4];
-    b->skip = b->skip_mode ? 1 :
-              msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
-    if (DEBUG_BLOCK_INFO)
-        printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
+    if (b->skip_mode || (seg && seg->skip)) {
+        b->skip = 1;
+    } else {
+        const int sctx = t->a->skip[bx4] + t->l.skip[by4];
+        b->skip = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
+        if (DEBUG_BLOCK_INFO)
+            printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
+    }
 
     // segment_id
     if (f->frame_hdr.segmentation.enabled &&
         f->frame_hdr.segmentation.update_map &&
         !f->frame_hdr.segmentation.seg_data.preskip)
     {
         if (!b->skip && f->frame_hdr.segmentation.temporal &&
             (seg_pred = msac_decode_bool_adapt(&ts->msac,
                                    ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
                                                       t->l.seg_pred[by4]])))
         {
             // temporal predicted seg_id
-            b->seg_id = f->prev_segmap ?
-                        get_prev_frame_segid(f, t->by, t->bx, w4, h4,
-                                             f->prev_segmap, f->b4_stride) : 0;
+            if (f->prev_segmap) {
+                unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
+                                                       f->prev_segmap,
+                                                       f->b4_stride);
+                if (seg_id >= 8) return -1;
+                b->seg_id = seg_id;
+            } else {
+                b->seg_id = 0;
+            }
         } else {
             int seg_ctx;
             const unsigned pred_seg_id =
                 get_cur_frame_segid(t->by, t->bx, have_top, have_left,
                                     &seg_ctx, f->cur_segmap, f->b4_stride);
             if (b->skip) {
                 b->seg_id = pred_seg_id;
             } else {
@@ -820,16 +880,18 @@ static int decode_b(Dav1dTileContext *co
                     f->frame_hdr.segmentation.seg_data.last_active_segid;
                 b->seg_id = neg_deinterleave(diff, pred_seg_id,
                                              last_active_seg_id + 1);
                 if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
             }
             if (b->seg_id >= NUM_SEGMENTS) b->seg_id = 0; // error?
         }
 
+        seg = &f->frame_hdr.segmentation.seg_data.d[b->seg_id];
+
         if (DEBUG_BLOCK_INFO)
             printf("Post-segid[postskip;%d]: r=%d\n",
                    b->seg_id, ts->msac.rng);
     }
 
     // cdef index
     if (!b->skip) {
         const int idx = f->seq_hdr.sb128 ? ((t->bx & 16) >> 4) +
@@ -860,17 +922,17 @@ static int decode_b(Dav1dTileContext *co
 
         if (have_delta_q) {
             int delta_q = msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.delta_q, 4);
             if (delta_q == 3) {
                 const int n_bits = 1 + msac_decode_bools(&ts->msac, 3);
                 delta_q = msac_decode_bools(&ts->msac, n_bits) + 1 + (1 << n_bits);
             }
             if (delta_q) {
-                if (msac_decode_bool(&ts->msac, 128 << 7)) delta_q = -delta_q;
+                if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta_q = -delta_q;
                 delta_q *= 1 << f->frame_hdr.delta.q.res_log2;
             }
             ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
             if (have_delta_q && DEBUG_BLOCK_INFO)
                 printf("Post-delta_q[%d->%d]: r=%d\n",
                        delta_q, ts->last_qidx, ts->msac.rng);
 
             if (f->frame_hdr.delta.lf.present) {
@@ -882,17 +944,17 @@ static int decode_b(Dav1dTileContext *co
                         msac_decode_symbol_adapt(&ts->msac,
                         ts->cdf.m.delta_lf[i + f->frame_hdr.delta.lf.multi], 4);
                     if (delta_lf == 3) {
                         const int n_bits = 1 + msac_decode_bools(&ts->msac, 3);
                         delta_lf = msac_decode_bools(&ts->msac, n_bits) +
                                    1 + (1 << n_bits);
                     }
                     if (delta_lf) {
-                        if (msac_decode_bool(&ts->msac, 128 << 7))
+                        if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB))
                             delta_lf = -delta_lf;
                         delta_lf *= 1 << f->frame_hdr.delta.lf.res_log2;
                     }
                     ts->last_delta_lf[i] =
                         iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
                     if (have_delta_q && DEBUG_BLOCK_INFO)
                         printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
                                ts->msac.rng);
@@ -915,21 +977,25 @@ static int decode_b(Dav1dTileContext *co
             dav1d_calc_lf_values(ts->lflvlmem, &f->frame_hdr, ts->last_delta_lf);
             ts->lflvl = ts->lflvlmem;
         }
     }
 
     if (b->skip_mode) {
         b->intra = 0;
     } else if (f->frame_hdr.frame_type & 1) {
-        const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
-                                       have_top, have_left);
-        b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intra[ictx]);
-        if (DEBUG_BLOCK_INFO)
-            printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
+        if (seg && (seg->ref >= 0 || seg->globalmv)) {
+            b->intra = !seg->ref;
+        } else {
+            const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
+                                           have_top, have_left);
+            b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intra[ictx]);
+            if (DEBUG_BLOCK_INFO)
+                printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
+        }
     } else if (f->frame_hdr.allow_intrabc) {
         b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
         if (DEBUG_BLOCK_INFO)
             printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
     } else {
         b->intra = 1;
     }
 
@@ -1094,79 +1160,76 @@ static int decode_b(Dav1dTileContext *co
             f->bd_fn.read_coef_blocks(t, bs, b);
         } else {
             f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
         }
 
         dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
                                    &f->frame_hdr, (const uint8_t (*)[8][2])
                                    &ts->lflvl[b->seg_id][0][0][0],
-                                   t->bx, t->by, (f->cur.p.p.w + 3) >> 2,
-                                   (f->cur.p.p.h + 3) >> 2, bs,
+                                   t->bx, t->by, f->w4, f->h4, bs,
                                    b->tx, b->uvtx, f->cur.p.p.layout,
                                    &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
                                    has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
                                    has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
 
         // update contexts
-        memset(&t->a->tx_intra[bx4], t_dim->lw, bw4);
-        memset(&t->l.tx_intra[by4], t_dim->lh, bh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+        rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
+        rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
+        rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, 0); \
+        rep_macro(type, t->dir intra, off, mul); \
+        rep_macro(type, t->dir skip, off, mul * b->skip); \
+        /* see aomedia bug 2183 for why we use luma coordinates here */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
+        if (f->frame_hdr.frame_type & 1) { \
+            rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
+            rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
+            rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
+            rep_macro(type, t->dir filter[0], off, mul * N_SWITCHABLE_FILTERS); \
+            rep_macro(type, t->dir filter[1], off, mul * N_SWITCHABLE_FILTERS); \
+        }
         const enum IntraPredMode y_mode_nofilt =
             b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
-        memset(&t->l.mode[by4], y_mode_nofilt, bh4);
-        memset(&t->a->mode[bx4], y_mode_nofilt, bw4);
-        memset(&t->l.pal_sz[by4], b->pal_sz[0], bh4);
-        memset(&t->a->pal_sz[bx4], b->pal_sz[0], bw4);
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         if (b->pal_sz[0]) {
             uint16_t *const pal = f->frame_thread.pass ?
                 f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                     ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
             for (int x = 0; x < bw4; x++)
                 memcpy(t->al_pal[0][bx4 + x][0], pal, 16);
             for (int y = 0; y < bh4; y++)
                 memcpy(t->al_pal[1][by4 + y][0], pal, 16);
         }
         if (has_chroma) {
-            memset(&t->l.uvmode[cby4], b->uv_mode, cbh4);
-            memset(&t->a->uvmode[cbx4], b->uv_mode, cbw4);
-            // see aomedia bug 2183 for why we use luma coordinates here
-            memset(&t->pal_sz_uv[1][by4], b->pal_sz[1], bh4);
-            memset(&t->pal_sz_uv[0][bx4], b->pal_sz[1], bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
+                case_set(cbh4, l., 1, cby4);
+                case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
             if (b->pal_sz[1]) for (int pl = 1; pl < 3; pl++) {
                 uint16_t *const pal = f->frame_thread.pass ?
                     f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                         ((t->bx >> 1) + (t->by & 1))][pl] : t->pal[pl];
                 // see aomedia bug 2183 for why we use luma coordinates here
                 for (int x = 0; x < bw4; x++)
                     memcpy(t->al_pal[0][bx4 + x][pl], pal, 16);
                 for (int y = 0; y < bh4; y++)
                     memcpy(t->al_pal[1][by4 + y][pl], pal, 16);
             }
-        } else { // see aomedia bug 2183 for why we reset this
-            memset(&t->pal_sz_uv[1][by4], 0, bh4);
-            memset(&t->pal_sz_uv[0][bx4], 0, bw4);
         }
         if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
-            memset(&t->a->tx[bx4], t_dim->lw, bw4);
-            memset(&t->l.tx[by4], t_dim->lh, bh4);
             splat_intraref(f->mvs, f->b4_stride, t->by, t->bx, bs,
                            y_mode_nofilt);
         }
-        if (f->frame_hdr.frame_type & 1) {
-            memset(&t->l.comp_type[by4], COMP_INTER_NONE, bh4);
-            memset(&t->a->comp_type[bx4], COMP_INTER_NONE, bw4);
-            memset(&t->l.ref[0][by4], -1, bh4);
-            memset(&t->a->ref[0][bx4], -1, bw4);
-            memset(&t->l.ref[1][by4], -1, bh4);
-            memset(&t->a->ref[1][bx4], -1, bw4);
-            memset(&t->l.filter[0][by4], N_SWITCHABLE_FILTERS, bh4);
-            memset(&t->a->filter[0][bx4], N_SWITCHABLE_FILTERS, bw4);
-            memset(&t->l.filter[1][by4], N_SWITCHABLE_FILTERS, bh4);
-            memset(&t->a->filter[1][bx4], N_SWITCHABLE_FILTERS, bw4);
-        }
     } else if (!(f->frame_hdr.frame_type & 1)) {
         // intra block copy
         candidate_mv mvstack[8];
         int n_mvs;
         mv mvlist[2][2];
         av1_find_ref_mvs(mvstack, &n_mvs, mvlist, NULL,
                          (int[2]) { -1, -1 }, f->bw, f->bh,
                          bs, bp, t->by, t->bx, ts->tiling.col_start,
@@ -1249,42 +1312,52 @@ static int decode_b(Dav1dTileContext *co
             printf("Post-dmv[%d/%d,ref=%d/%d|%d/%d]: r=%d\n",
                    b->mv[0].y, b->mv[0].x, ref.y, ref.x,
                    mvlist[0][0].y, mvlist[0][0].x, ts->msac.rng);
         read_vartx_tree(t, b, bs, bx4, by4);
 
         // reconstruction
         if (f->frame_thread.pass == 1) {
             f->bd_fn.read_coef_blocks(t, bs, b);
+            b->filter2d = FILTER_2D_BILINEAR;
         } else {
-            f->bd_fn.recon_b_inter(t, bs, b);
+            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
         }
 
         splat_intrabc_mv(f->mvs, f->b4_stride, t->by, t->bx, bs, b->mv[0]);
 
-        memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
-        memset(&t->l.tx_intra[by4], b_dim[3], bh4);
-        memset(&t->l.mode[by4], DC_PRED, bh4);
-        memset(&t->a->mode[bx4], DC_PRED, bw4);
-        memset(&t->l.pal_sz[by4], 0, bh4);
-        memset(&t->a->pal_sz[bx4], 0, bw4);
-        // see aomedia bug 2183 for why this is outside if (has_chroma)
-        memset(&t->pal_sz_uv[1][by4], 0, bh4);
-        memset(&t->pal_sz_uv[0][bx4], 0, bw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+        rep_macro(type, t->dir mode, off, mul * DC_PRED); \
+        rep_macro(type, t->dir pal_sz, off, 0); \
+        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, 0); \
+        rep_macro(type, t->dir intra, off, 0); \
+        rep_macro(type, t->dir skip, off, mul * b->skip)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         if (has_chroma) {
-            memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
-            memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
     } else {
         // inter-specific mode/mv coding
         int is_comp, has_subpel_filter;
 
         if (b->skip_mode) {
             is_comp = 1;
-        } else if (f->frame_hdr.switchable_comp_refs && imin(bw4, bh4) > 1) {
+        } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
+                   f->frame_hdr.switchable_comp_refs && imin(bw4, bh4) > 1)
+        {
             const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
                                          have_top, have_left);
             is_comp = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp[ctx]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
         } else {
             is_comp = 0;
         }
@@ -1492,70 +1565,79 @@ static int decode_b(Dav1dTileContext *co
                         msac_decode_bool_adapt(&ts->msac,
                                                ts->cdf.m.wedge_comp[ctx]);
                     if (b->comp_type == COMP_INTER_WEDGE)
                         b->wedge_idx = msac_decode_symbol_adapt(&ts->msac,
                                                 ts->cdf.m.wedge_idx[ctx], 16);
                 } else {
                     b->comp_type = COMP_INTER_SEG;
                 }
-                b->mask_sign = msac_decode_bool(&ts->msac, 128 << 7);
+                b->mask_sign = msac_decode_bool(&ts->msac, EC_BOOL_EPROB);
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
                            b->comp_type == COMP_INTER_WEDGE,
                            b->wedge_idx, b->mask_sign, ts->msac.rng);
             }
         } else {
             b->comp_type = COMP_INTER_NONE;
 
             // ref
-            const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
-                                             have_top, have_left);
-            if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[0][ctx1])) {
-                const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
-                                                   have_top, have_left);
-                if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[1][ctx2])) {
-                    b->ref[0] = 6;
-                } else {
-                    const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
-                                                       have_top, have_left);
-                    b->ref[0] = 4 + msac_decode_bool_adapt(&ts->msac,
-                                                       ts->cdf.m.ref[5][ctx3]);
-                }
+            if (seg && seg->ref > 0) {
+                b->ref[0] = seg->ref - 1;
+            } else if (seg && (seg->globalmv || seg->skip)) {
+                b->ref[0] = 0;
             } else {
-                const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
-                                                   have_top, have_left);
-                if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[2][ctx2])) {
-                    const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
+                const int ctx1 = av1_get_ref_ctx(t->a, &t->l, by4, bx4,
+                                                 have_top, have_left);
+                if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[0][ctx1])) {
+                    const int ctx2 = av1_get_ref_2_ctx(t->a, &t->l, by4, bx4,
                                                        have_top, have_left);
-                    b->ref[0] = 2 + msac_decode_bool_adapt(&ts->msac,
-                                                       ts->cdf.m.ref[4][ctx3]);
+                    if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[1][ctx2])) {
+                        b->ref[0] = 6;
+                    } else {
+                        const int ctx3 = av1_get_ref_6_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                        b->ref[0] = 4 + msac_decode_bool_adapt(&ts->msac,
+                                                           ts->cdf.m.ref[5][ctx3]);
+                    }
                 } else {
-                    const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
+                    const int ctx2 = av1_get_ref_3_ctx(t->a, &t->l, by4, bx4,
                                                        have_top, have_left);
-                    b->ref[0] = msac_decode_bool_adapt(&ts->msac,
-                                                       ts->cdf.m.ref[3][ctx3]);
+                    if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.ref[2][ctx2])) {
+                        const int ctx3 = av1_get_ref_5_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                        b->ref[0] = 2 + msac_decode_bool_adapt(&ts->msac,
+                                                           ts->cdf.m.ref[4][ctx3]);
+                    } else {
+                        const int ctx3 = av1_get_ref_4_ctx(t->a, &t->l, by4, bx4,
+                                                           have_top, have_left);
+                        b->ref[0] = msac_decode_bool_adapt(&ts->msac,
+                                                           ts->cdf.m.ref[3][ctx3]);
+                    }
                 }
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
             }
             b->ref[1] = -1;
-            if (DEBUG_BLOCK_INFO)
-                printf("Post-ref[%d]: r=%d\n", b->ref[0], ts->msac.rng);
 
             candidate_mv mvstack[8];
             int n_mvs, ctx;
             mv mvlist[2][2];
             av1_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
                              (int[2]) { b->ref[0], -1 }, f->bw, f->bh, bs, bp,
                              t->by, t->bx, ts->tiling.col_start,
                              ts->tiling.col_end, ts->tiling.row_start,
                              ts->tiling.row_end, f->libaom_cm);
 
             // mode parsing and mv derivation from ref_mvs
-            if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.newmv_mode[ctx & 7])) {
-                if (!msac_decode_bool_adapt(&ts->msac,
+            if ((seg && (seg->skip || seg->globalmv)) ||
+                msac_decode_bool_adapt(&ts->msac, ts->cdf.m.newmv_mode[ctx & 7]))
+            {
+                if ((seg && (seg->skip || seg->globalmv)) ||
+                    !msac_decode_bool_adapt(&ts->msac,
                                         ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
                 {
                     b->inter_mode = GLOBALMV;
                     b->mv[0] = get_gmv_2d(&f->frame_hdr.gmv[b->ref[0]],
                                           t->bx, t->by, bw4, bh4, &f->frame_hdr);
                     fix_mv_precision(&f->frame_hdr, &b->mv[0]);
                     has_subpel_filter = imin(bw4, bh4) == 1 ||
                         f->frame_hdr.gmv[b->ref[0]].type == WM_TYPE_TRANSLATION;
@@ -1660,17 +1742,18 @@ static int decode_b(Dav1dTileContext *co
                 ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) ||
                  (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1))))
             {
                 // reaching here means the block allows obmc - check warp by
                 // finding matching-ref blocks in top/left edges
                 uint64_t mask[2] = { 0, 0 };
                 find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
                                   have_left, have_top, b->ref[0], mask);
-                const int allow_warp = !f->frame_hdr.force_integer_mv &&
+                const int allow_warp = !f->svc[b->ref[0]][0].scale &&
+                    !f->frame_hdr.force_integer_mv &&
                     f->frame_hdr.warp_motion && (mask[0] | mask[1]);
 
                 b->motion_mode = allow_warp ?
                     msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.motion_mode[bs], 3) :
                     msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
                 if (b->motion_mode == MM_WARP) {
                     has_subpel_filter = 0;
                     derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
@@ -1735,86 +1818,83 @@ static int decode_b(Dav1dTileContext *co
         b->filter2d = dav1d_filter_2d[filter[1]][filter[0]];
 
         read_vartx_tree(t, b, bs, bx4, by4);
 
         // reconstruction
         if (f->frame_thread.pass == 1) {
             f->bd_fn.read_coef_blocks(t, bs, b);
         } else {
-            f->bd_fn.recon_b_inter(t, bs, b);
+            if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
         }
 
         const int is_globalmv =
             b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
         const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
             &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
         dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
                                    &f->frame_hdr, lf_lvls, t->bx, t->by,
-                                   (f->cur.p.p.w + 3) >> 2,
-                                   (f->cur.p.p.h + 3) >> 2,
-                                   b->skip, bs, b->tx_split,
+                                   f->w4, f->h4, b->skip, bs, b->tx_split,
                                    b->uvtx, f->cur.p.p.layout,
                                    &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
                                    has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
                                    has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
 
         // context updates
         if (is_comp) {
             splat_tworef_mv(f->mvs, f->b4_stride, t->by, t->bx, bs,
                             b->inter_mode, b->ref[0], b->ref[1],
                             b->mv[0], b->mv[1]);
         } else {
             splat_oneref_mv(f->mvs, f->b4_stride, t->by, t->bx, bs,
                             b->inter_mode, b->ref[0], b->mv[0],
                             b->interintra_type);
         }
-        memset(&t->l.pal_sz[by4], 0, bh4);
-        memset(&t->a->pal_sz[bx4], 0, bw4);
-        // see aomedia bug 2183 for why this is outside if (has_chroma)
-        memset(&t->pal_sz_uv[1][by4], 0, bh4);
-        memset(&t->pal_sz_uv[0][bx4], 0, bw4);
+
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
+        rep_macro(type, t->dir skip_mode, off, mul * b->skip_mode); \
+        rep_macro(type, t->dir intra, off, 0); \
+        rep_macro(type, t->dir skip, off, mul * b->skip); \
+        rep_macro(type, t->dir pal_sz, off, 0); \
+        /* see aomedia bug 2183 for why this is outside if (has_chroma) */ \
+        rep_macro(type, t->pal_sz_uv[diridx], off, 0); \
+        rep_macro(type, t->dir tx_intra, off, mul * b_dim[2 + diridx]); \
+        rep_macro(type, t->dir comp_type, off, mul * b->comp_type); \
+        rep_macro(type, t->dir filter[0], off, mul * filter[0]); \
+        rep_macro(type, t->dir filter[1], off, mul * filter[1]); \
+        rep_macro(type, t->dir mode, off, mul * b->inter_mode); \
+        rep_macro(type, t->dir ref[0], off, mul * b->ref[0]); \
+        rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) b->ref[1]))
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+
         if (has_chroma) {
-            memset(&t->l.uvmode[cby4], DC_PRED, cbh4);
-            memset(&t->a->uvmode[cbx4], DC_PRED, cbw4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
-        memset(&t->a->tx_intra[bx4], b_dim[2], bw4);
-        memset(&t->l.tx_intra[by4], b_dim[3], bh4);
-        memset(&t->l.comp_type[by4], b->comp_type, bh4);
-        memset(&t->a->comp_type[bx4], b->comp_type, bw4);
-        memset(&t->l.filter[0][by4], filter[0], bh4);
-        memset(&t->a->filter[0][bx4], filter[0], bw4);
-        memset(&t->l.filter[1][by4], filter[1], bh4);
-        memset(&t->a->filter[1][bx4], filter[1], bw4);
-        memset(&t->l.mode[by4], b->inter_mode, bh4);
-        memset(&t->a->mode[bx4], b->inter_mode, bw4);
-        memset(&t->l.ref[0][by4], b->ref[0], bh4);
-        memset(&t->a->ref[0][bx4], b->ref[0], bw4);
-        memset(&t->l.ref[1][by4], b->ref[1], bh4);
-        memset(&t->a->ref[1][bx4], b->ref[1], bw4);
     }
 
     // update contexts
     if (f->frame_hdr.segmentation.enabled &&
         f->frame_hdr.segmentation.update_map)
     {
         uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
-        for (int y = 0; y < bh4; y++) {
-            memset(seg_ptr, b->seg_id, bw4);
-            seg_ptr += f->b4_stride;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        for (int y = 0; y < bh4; y++) { \
+            rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
+            seg_ptr += f->b4_stride; \
         }
+        case_set(bw4, NULL, 0, 0);
+#undef set_ctx
     }
-    memset(&t->l.seg_pred[by4], seg_pred, bh4);
-    memset(&t->a->seg_pred[bx4], seg_pred, bw4);
-    memset(&t->l.skip_mode[by4], b->skip_mode, bh4);
-    memset(&t->a->skip_mode[bx4], b->skip_mode, bw4);
-    memset(&t->l.intra[by4], b->intra, bh4);
-    memset(&t->a->intra[bx4], b->intra, bw4);
-    memset(&t->l.skip[by4], b->skip, bh4);
-    memset(&t->a->skip[bx4], b->skip, bw4);
     if (!b->skip) {
         uint16_t (*noskip_mask)[2] = &t->lf_mask->noskip_mask[by4];
         const unsigned mask = (~0U >> (32 - bw4)) << (bx4 & 15);
         const int bx_idx = (bx4 & 16) >> 4;
         for (int y = 0; y < bh4; y++, noskip_mask++) {
             (*noskip_mask)[bx_idx] |= mask;
             if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
                 (*noskip_mask)[1] |= mask;
@@ -2023,18 +2103,18 @@ static int decode_sb(Dav1dTileContext *c
         default: assert(0);
         }
     } else if (have_h_split) {
         unsigned is_split;
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             is_split = b->bl != bl;
         } else {
-            const unsigned p = gather_top_partition_prob(pc, bl);
-            is_split = msac_decode_bool(&t->ts->msac, p);
+            const uint16_t cdf[2] = { gather_top_partition_prob(pc, bl), 0 };
+            is_split = msac_decode_symbol(&t->ts->msac, cdf, 2);
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                        f->frame_hdr.frame_offset, t->by, t->bx, bl, ctx,
                        is_split ? PARTITION_SPLIT : PARTITION_H, t->ts->msac.rng);
         }
 
         assert(bl < BL_8X8);
         if (is_split) {
@@ -2052,18 +2132,18 @@ static int decode_sb(Dav1dTileContext *c
         }
     } else {
         assert(have_v_split);
         unsigned is_split;
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             is_split = b->bl != bl;
         } else {
-            const unsigned p = gather_left_partition_prob(pc, bl);
-            is_split = msac_decode_bool(&t->ts->msac, p);
+            uint16_t cdf[2] = { gather_left_partition_prob(pc, bl), 0 };
+            is_split = msac_decode_symbol(&t->ts->msac, cdf, 2);
             if (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
                 return 1;
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
                        f->frame_hdr.frame_offset, t->by, t->bx, bl, ctx,
                        is_split ? PARTITION_SPLIT : PARTITION_V, t->ts->msac.rng);
         }
 
@@ -2079,18 +2159,21 @@ static int decode_sb(Dav1dTileContext *c
             bp = PARTITION_V;
             if (decode_b(t, bl, dav1d_block_sizes[bl][PARTITION_V][0],
                          PARTITION_V, node->v[0]))
                 return -1;
         }
     }
 
     if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
-        memset(&t->a->partition[bx8], dav1d_al_part_ctx[0][bl][bp], hsz);
-        memset(&t->l.partition[by8], dav1d_al_part_ctx[1][bl][bp], hsz);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
+        rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
+        case_set_upto16(hsz,,,);
+#undef set_ctx
     }
 
     return 0;
 }
 
 static void reset_context(BlockContext *const ctx, const int keyframe, const int pass) {
     memset(ctx->intra, keyframe, sizeof(ctx->intra));
     memset(ctx->uvmode, DC_PRED, sizeof(ctx->uvmode));
@@ -2132,17 +2215,17 @@ static void setup_tile(Dav1dTileState *c
     const int sb_shift = f->sb_shift;
 
     ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];
     ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];
     ts->cdf = *f->in_cdf.cdf;
     ts->last_qidx = f->frame_hdr.quant.yac;
     memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
 
-    msac_init(&ts->msac, data, sz);
+    msac_init(&ts->msac, data, sz, f->frame_hdr.disable_cdf_update);
 
     ts->tiling.row = tile_row;
     ts->tiling.col = tile_col;
     ts->tiling.col_start = col_sb_start << sb_shift;
     ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
     ts->tiling.row_start = row_sb_start << sb_shift;
     ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);
 
@@ -2192,18 +2275,21 @@ int dav1d_decode_tile_sbrow(Dav1dTileCon
         return 0;
     }
 
     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 
     if (c->n_fc > 1 && f->frame_hdr.use_ref_frame_mvs) {
         for (int n = 0; n < 7; n++)
-            dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step),
-                                      PLANE_TYPE_BLOCK);
+            if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step),
+                                          PLANE_TYPE_BLOCK))
+            {
+                return 1;
+            }
         av1_init_ref_mv_tile_row(f->libaom_cm,
                                  ts->tiling.col_start, ts->tiling.col_end,
                                  t->by, imin(t->by + sb_step, f->bh));
     }
     memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
     const int sb128y = t->by >> 5;
     for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
          t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
@@ -2332,19 +2418,20 @@ int dav1d_decode_tile_sbrow(Dav1dTileCon
     if (f->frame_thread.pass != 1)
         f->bd_fn.backup_ipred_edge(t);
 
     // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
     // up the initial value in neighbour tiles when running the loopfilter
     int align_h = (f->bh + 31) & ~31;
     memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by],
            &t->l.tx_lpf_y[t->by & 16], sb_step);
-    align_h >>= 1;
-    memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> 1)],
-           &t->l.tx_lpf_uv[(t->by & 16) >> 1], sb_step >> 1);
+    align_h >>= ss_ver;
+
+    memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
+           &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
 
     return 0;
 }
 
 int dav1d_decode_frame(Dav1dFrameContext *const f) {
     const Dav1dContext *const c = f->c;
     int retval = -EINVAL;
 
@@ -2698,26 +2785,37 @@ int dav1d_decode_frame(Dav1dFrameContext
             // loopfilter + cdef + restoration
             for (int tile_row = 0; tile_row < f->frame_hdr.tiling.rows; tile_row++) {
                 for (int sby = f->frame_hdr.tiling.row_start_sb[tile_row];
                      sby < f->frame_hdr.tiling.row_start_sb[tile_row + 1]; sby++)
                 {
                     for (int tile_col = 0; tile_col < f->frame_hdr.tiling.cols;
                          tile_col++)
                     {
+                        int progress;
                         Dav1dTileState *const ts =
                             &f->ts[tile_row * f->frame_hdr.tiling.cols + tile_col];
 
-                        if (atomic_load(&ts->progress) <= sby) {
+                        if ((progress = atomic_load(&ts->progress)) <= sby) {
                             pthread_mutex_lock(&ts->tile_thread.lock);
-                            while (atomic_load(&ts->progress) <= sby)
+                            while ((progress = atomic_load(&ts->progress)) <= sby)
                                 pthread_cond_wait(&ts->tile_thread.cond,
                                                   &ts->tile_thread.lock);
                             pthread_mutex_unlock(&ts->tile_thread.lock);
                         }
+                        if (progress == TILE_ERROR) {
+                            dav1d_thread_picture_signal(&f->cur, FRAME_ERROR,
+                                                        progress_plane_type);
+                            const uint64_t all_mask = ~0ULL >> (64 - f->n_tc);
+                            pthread_mutex_lock(&f->tile_thread.lock);
+                            while (f->tile_thread.available != all_mask)
+                                pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock);
+                            pthread_mutex_unlock(&f->tile_thread.lock);
+                            goto error;
+                        }
                     }
 
                     // loopfilter + cdef + restoration
                     if (f->frame_thread.pass != 1)
                         f->bd_fn.filter_sbrow(f, sby);
                     dav1d_thread_picture_signal(&f->cur, (sby + 1) * f->sb_step * 4,
                                                 progress_plane_type);
                 }
@@ -2742,53 +2840,53 @@ int dav1d_decode_frame(Dav1dFrameContext
             for (int tile_idx = 0;
                  tile_idx < f->frame_hdr.tiling.rows * f->frame_hdr.tiling.cols;
                  tile_idx++)
             {
                 Dav1dTileState *const ts = &f->ts[tile_idx];
                 const int tile_start_off = f->frame_thread.tile_start_off[tile_idx];
                 ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];
                 ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];
-                if (f->n_tc > 0)
-                    atomic_init(&ts->progress, 0);
+                if (f->n_tc > 0) {
+                    unsigned row_sb_start = f->frame_hdr.tiling.row_start_sb[ts->tiling.row];
+                    atomic_init(&ts->progress, row_sb_start);
+                }
             }
         }
     }
 
-    dav1d_thread_picture_signal(&f->cur, UINT_MAX, PLANE_TYPE_ALL);
-
     retval = 0;
 error:
+    dav1d_thread_picture_signal(&f->cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
+                                PLANE_TYPE_ALL);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
-        if (f->ref_mvs_ref[i])
-            dav1d_ref_dec(f->ref_mvs_ref[i]);
+        dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
 
     dav1d_thread_picture_unref(&f->cur);
     dav1d_cdf_thread_unref(&f->in_cdf);
-    if (f->frame_hdr.refresh_context)
-            dav1d_cdf_thread_unref(&f->out_cdf);
-    if (f->cur_segmap_ref)
-        dav1d_ref_dec(f->cur_segmap_ref);
-    if (f->prev_segmap_ref)
-        dav1d_ref_dec(f->prev_segmap_ref);
-    if (f->mvs_ref)
-        dav1d_ref_dec(f->mvs_ref);
+    if (f->frame_hdr.refresh_context) {
+        dav1d_cdf_thread_signal(&f->out_cdf);
+        dav1d_cdf_thread_unref(&f->out_cdf);
+    }
+    dav1d_ref_dec(&f->cur_segmap_ref);
+    dav1d_ref_dec(&f->prev_segmap_ref);
+    dav1d_ref_dec(&f->mvs_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
         dav1d_data_unref(&f->tile[i].data);
 
     return retval;
 }
 
 int dav1d_submit_frame(Dav1dContext *const c) {
     Dav1dFrameContext *f;
-    int res;
+    int res = -1;
 
     // wait for c->out_delayed[next] and move into c->out if visible
     Dav1dThreadPicture *out_delayed;
     if (c->n_fc > 1) {
         const unsigned next = c->frame_thread.next++;
         if (c->frame_thread.next == c->n_fc)
             c->frame_thread.next = 0;
 
@@ -2830,17 +2928,18 @@ int dav1d_submit_frame(Dav1dContext *con
 #endif
 #if CONFIG_10BPC
         assign_bitdepth_case(10);
 #endif
 #undef assign_bitdepth_case
         default:
             fprintf(stderr, "Compiled without support for %d-bit decoding\n",
                     f->seq_hdr.bpc);
-            return -ENOPROTOOPT;
+            res = -ENOPROTOOPT;
+            goto error;
         }
     }
 
 #define assign_bitdepth_case(bd) \
         f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
         f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
         f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
         f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
@@ -2854,61 +2953,81 @@ int dav1d_submit_frame(Dav1dContext *con
         assign_bitdepth_case(16);
 #endif
     }
 #undef assign_bitdepth_case
 
     if (f->frame_hdr.frame_type & 1) {
         if (f->frame_hdr.primary_ref_frame != PRIMARY_REF_NONE) {
             const int pri_ref = f->frame_hdr.refidx[f->frame_hdr.primary_ref_frame];
-            if (!c->refs[pri_ref].p.p.data[0])
-                return -EINVAL;
+            if (!c->refs[pri_ref].p.p.data[0]) {
+                res = -EINVAL;
+                goto error;
+            }
         }
         for (int i = 0; i < 7; i++) {
             const int refidx = f->frame_hdr.refidx[i];
             if (!c->refs[refidx].p.p.data[0] ||
-                f->frame_hdr.width  != c->refs[refidx].p.p.p.w ||
-                f->frame_hdr.height != c->refs[refidx].p.p.p.h ||
-                f->seq_hdr.layout != c->refs[refidx].p.p.p.layout)
+                f->frame_hdr.width * 2 < c->refs[refidx].p.p.p.w ||
+                f->frame_hdr.height * 2 < c->refs[refidx].p.p.p.h ||
+                f->frame_hdr.width > c->refs[refidx].p.p.p.w * 16 ||
+                f->frame_hdr.height > c->refs[refidx].p.p.p.h * 16 ||
+                f->seq_hdr.layout != c->refs[refidx].p.p.p.layout ||
+                f->seq_hdr.bpc != c->refs[refidx].p.p.p.bpc)
             {
                 for (int j = 0; j < i; j++)
                     dav1d_thread_picture_unref(&f->refp[j]);
-                return -EINVAL;
+                res = -EINVAL;
+                goto error;
             }
             dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
+            if (f->frame_hdr.width  != c->refs[refidx].p.p.p.w ||
+                f->frame_hdr.height != c->refs[refidx].p.p.p.h)
+            {
+#define scale_fac(ref_sz, this_sz) \
+    (((ref_sz << 14) + (this_sz >> 1)) / this_sz)
+                f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,
+                                               f->frame_hdr.width);
+                f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,
+                                               f->frame_hdr.height);
+#undef scale_fac
+                f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
+                f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
+            } else {
+                f->svc[i][0].scale = 0;
+            }
         }
     }
 
     // setup entropy
     if (f->frame_hdr.primary_ref_frame == PRIMARY_REF_NONE) {
         dav1d_init_states(&f->in_cdf, f->frame_hdr.quant.yac);
     } else {
         const int pri_ref = f->frame_hdr.refidx[f->frame_hdr.primary_ref_frame];
         dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
     }
     if (f->frame_hdr.refresh_context) {
         dav1d_cdf_thread_alloc(&f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
     }
 
     // FIXME qsort so tiles are in order (for frame threading)
     memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
+    memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
     f->n_tile_data = c->n_tile_data;
     c->n_tile_data = 0;
 
     // allocate frame
     if ((res = dav1d_thread_picture_alloc(&f->cur, f->frame_hdr.width,
                                           f->frame_hdr.height,
                                           f->seq_hdr.layout, f->seq_hdr.bpc,
                                           c->n_fc > 1 ? &f->frame_thread.td : NULL,
                                           f->frame_hdr.show_frame,
                                           &c->allocator)) < 0)
     {
-        if (f->frame_hdr.refresh_context)
-            dav1d_cdf_thread_unref(&f->out_cdf);
-        return res;
+        goto error;
     }
 
     f->cur.p.poc = f->frame_hdr.frame_offset;
     f->cur.p.p.type = f->frame_hdr.frame_type;
     f->cur.p.p.pri = f->seq_hdr.pri;
     f->cur.p.p.trc = f->seq_hdr.trc;
     f->cur.p.p.mtrx = f->seq_hdr.mtrx;
     f->cur.p.p.chr = f->seq_hdr.chr;
@@ -2917,32 +3036,38 @@ int dav1d_submit_frame(Dav1dContext *con
     // move f->cur into output queue
     if (c->n_fc == 1) {
         if (f->frame_hdr.show_frame)
             dav1d_picture_ref(&c->out, &f->cur.p);
     } else {
         dav1d_thread_picture_ref(out_delayed, &f->cur);
     }
 
+    f->w4 = (f->frame_hdr.width + 3) >> 2;
+    f->h4 = (f->frame_hdr.height + 3) >> 2;
     f->bw = ((f->frame_hdr.width + 7) >> 3) << 1;
     f->bh = ((f->frame_hdr.height + 7) >> 3) << 1;
     f->sb128w = (f->bw + 31) >> 5;
     f->sb128h = (f->bh + 31) >> 5;
     f->sb_shift = 4 + f->seq_hdr.sb128;
     f->sb_step = 16 << f->seq_hdr.sb128;
     f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
     f->b4_stride = (f->bw + 31) & ~31;
 
     // ref_mvs
     if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
         f->mvs_ref = dav1d_ref_create(f->sb128h * 32 * f->b4_stride *
                                       sizeof(*f->mvs));
         f->mvs = f->mvs_ref->data;
-        for (int i = 0; i < 7; i++)
-            f->refpoc[i] = f->refp[i].p.poc;
+        if (!f->frame_hdr.allow_intrabc) {
+            for (int i = 0; i < 7; i++)
+                f->refpoc[i] = f->refp[i].p.poc;
+        } else {
+            memset(f->refpoc, 0, sizeof(f->refpoc));
+        }
         if (f->frame_hdr.use_ref_frame_mvs) {
             for (int i = 0; i < 7; i++) {
                 const int refidx = f->frame_hdr.refidx[i];
                 if (c->refs[refidx].refmvs != NULL &&
                     f->refp[i].p.p.w == f->cur.p.p.w &&
                     f->refp[i].p.p.h == f->cur.p.p.h)
                 {
                     f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
@@ -2960,43 +3085,61 @@ int dav1d_submit_frame(Dav1dContext *con
         }
     } else {
         f->mvs_ref = NULL;
         memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
     }
 
     // segmap
     if (f->frame_hdr.segmentation.enabled) {
-        if (f->frame_hdr.segmentation.temporal) {
+
+        // By default, the previous segmentation map is not initialised.
+        f->prev_segmap_ref = NULL;
+        f->prev_segmap = NULL;
+
+        // We might need a previous frame's segmentation map. This
+        // happens if there is either no update or a temporal update.
+        if (f->frame_hdr.segmentation.temporal || !f->frame_hdr.segmentation.update_map) {
             const int pri_ref = f->frame_hdr.primary_ref_frame;
             assert(pri_ref != PRIMARY_REF_NONE);
-            const int ref_w = (f->refp[pri_ref].p.p.w + 3) >> 2;
-            const int ref_h = (f->refp[pri_ref].p.p.h + 3) >> 2;
+            const int ref_w = ((f->refp[pri_ref].p.p.w + 7) >> 3) << 1;
+            const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1;
             if (ref_w == f->bw && ref_h == f->bh) {
                 f->prev_segmap_ref = c->refs[f->frame_hdr.refidx[pri_ref]].segmap;
-                if (f->prev_segmap_ref == NULL) goto error;
-                dav1d_ref_inc(f->prev_segmap_ref);
-                f->prev_segmap = f->prev_segmap_ref->data;
-            } else {
-                f->prev_segmap_ref = NULL;
-                f->prev_segmap = NULL;
+                if (f->prev_segmap_ref) {
+                    dav1d_ref_inc(f->prev_segmap_ref);
+                    f->prev_segmap = f->prev_segmap_ref->data;
+                }
             }
-        } else {
-            f->prev_segmap_ref = NULL;
-            f->prev_segmap = NULL;
+            // It is an error to signal a temporal update if the
+            // previous frame was the wrong size or had no
+            // segmentation data.
+            if (f->frame_hdr.segmentation.temporal && !f->prev_segmap_ref) {
+                res = -EINVAL;
+                goto error;
+            }
         }
+
         if (f->frame_hdr.segmentation.update_map) {
+            // We're updating an existing map, but need somewhere to
+            // put the new values. Allocate the buffer here (the data
+            // actually gets set elsewhere).
             f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
             f->cur_segmap = f->cur_segmap_ref->data;
         } else if (f->prev_segmap_ref) {
+            // We're not updating an existing map, and we have a valid
+            // reference. Use that.
             f->cur_segmap_ref = f->prev_segmap_ref;
             dav1d_ref_inc(f->cur_segmap_ref);
             f->cur_segmap = f->prev_segmap_ref->data;
         } else {
-            goto error;
+            // We need to make a new map. Allocate one here and zero it out.
+            f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
+            f->cur_segmap = f->cur_segmap_ref->data;
+            memset(f->cur_segmap_ref->data, 0, f->b4_stride * 32 * f->sb128h);
         }
     } else {
         f->cur_segmap = NULL;
         f->cur_segmap_ref = NULL;
         f->prev_segmap_ref = NULL;
     }
 
     // update references etc.
@@ -3013,58 +3156,68 @@ int dav1d_submit_frame(Dav1dContext *con
                 dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf);
             }
             c->refs[i].lf_mode_ref_deltas =
                 f->frame_hdr.loopfilter.mode_ref_deltas;
             c->refs[i].seg_data = f->frame_hdr.segmentation.seg_data;
             memcpy(c->refs[i].gmv, f->frame_hdr.gmv, sizeof(c->refs[i].gmv));
             c->refs[i].film_grain = f->frame_hdr.film_grain.data;
 
-            if (c->refs[i].segmap)
-                dav1d_ref_dec(c->refs[i].segmap);
+            dav1d_ref_dec(&c->refs[i].segmap);
             c->refs[i].segmap = f->cur_segmap_ref;
             if (f->cur_segmap_ref)
                 dav1d_ref_inc(f->cur_segmap_ref);
-            if (c->refs[i].refmvs)
-                dav1d_ref_dec(c->refs[i].refmvs);
-            if (f->frame_hdr.allow_intrabc) {
-                c->refs[i].refmvs = NULL;
-            } else {
+            dav1d_ref_dec(&c->refs[i].refmvs);
+            if (!f->frame_hdr.allow_intrabc) {
                 c->refs[i].refmvs = f->mvs_ref;
                 if (f->mvs_ref)
                     dav1d_ref_inc(f->mvs_ref);
             }
             memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
             c->refs[i].qidx = f->frame_hdr.quant.yac;
         }
     }
 
     if (c->n_fc == 1) {
         if ((res = dav1d_decode_frame(f)) < 0) {
             dav1d_picture_unref(&c->out);
+            for (int i = 0; i < 8; i++) {
+                if (f->frame_hdr.refresh_frame_flags & (1 << i)) {
+                    if (c->refs[i].p.p.data[0])
+                        dav1d_thread_picture_unref(&c->refs[i].p);
+                    if (c->cdf[i].cdf)
+                        dav1d_cdf_thread_unref(&c->cdf[i]);
+                    dav1d_ref_dec(&c->refs[i].segmap);
+                    dav1d_ref_dec(&c->refs[i].refmvs);
+                }
+            }
             return res;
         }
     } else {
         pthread_cond_signal(&f->frame_thread.td.cond);
         pthread_mutex_unlock(&f->frame_thread.td.lock);
     }
 
     return 0;
 error:
     dav1d_cdf_thread_unref(&f->in_cdf);
     if (f->frame_hdr.refresh_context)
         dav1d_cdf_thread_unref(&f->out_cdf);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
-        if (f->ref_mvs_ref[i])
-            dav1d_ref_dec(f->ref_mvs_ref[i]);
+        dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
     dav1d_picture_unref(&c->out);
     dav1d_thread_picture_unref(&f->cur);
-    if (f->mvs_ref)
-        dav1d_ref_dec(f->mvs_ref);
+    dav1d_ref_dec(&f->mvs_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
         dav1d_data_unref(&f->tile[i].data);
+    f->n_tile_data = 0;
 
-    return -1;
+    if (c->n_fc > 1) {
+        pthread_cond_signal(&f->frame_thread.td.cond);
+        pthread_mutex_unlock(&f->frame_thread.td.lock);
+    }
+
+    return res;
 }
--- a/third_party/dav1d/src/env.h
+++ b/third_party/dav1d/src/env.h
@@ -33,33 +33,33 @@
 #include <stdint.h>
 #include <stdlib.h>
 
 #include "src/levels.h"
 #include "src/ref_mvs.h"
 #include "src/tables.h"
 
 typedef struct BlockContext {
-    uint8_t mode[32];
-    uint8_t lcoef[32];
-    uint8_t ccoef[2][32];
-    uint8_t seg_pred[32];
-    uint8_t skip[32];
-    uint8_t skip_mode[32];
-    uint8_t intra[32];
-    uint8_t comp_type[32];
-    int8_t ref[2][32]; // -1 means intra
-    uint8_t filter[2][32]; // 3 means unset
-    int8_t tx_intra[32];
-    int8_t tx[32];
-    uint8_t tx_lpf_y[32];
-    uint8_t tx_lpf_uv[32];
-    uint8_t partition[16];
-    uint8_t uvmode[32];
-    uint8_t pal_sz[32];
+    uint8_t ALIGN(mode[32], 8);
+    uint8_t ALIGN(lcoef[32], 8);
+    uint8_t ALIGN(ccoef[2][32], 8);
+    uint8_t ALIGN(seg_pred[32], 8);
+    uint8_t ALIGN(skip[32], 8);
+    uint8_t ALIGN(skip_mode[32], 8);
+    uint8_t ALIGN(intra[32], 8);
+    uint8_t ALIGN(comp_type[32], 8);
+    int8_t ALIGN(ref[2][32], 8); // -1 means intra
+    uint8_t ALIGN(filter[2][32], 8); // 3 means unset
+    int8_t ALIGN(tx_intra[32], 8);
+    int8_t ALIGN(tx[32], 8);
+    uint8_t ALIGN(tx_lpf_y[32], 8);
+    uint8_t ALIGN(tx_lpf_uv[32], 8);
+    uint8_t ALIGN(partition[16], 8);
+    uint8_t ALIGN(uvmode[32], 8);
+    uint8_t ALIGN(pal_sz[32], 8);
 } BlockContext;
 
 static inline int get_intra_ctx(const BlockContext *const a,
                                 const BlockContext *const l,
                                 const int yb4, const int xb4,
                                 const int have_top, const int have_left)
 {
     if (have_left) {
--- a/third_party/dav1d/src/getbits.c
+++ b/third_party/dav1d/src/getbits.c
@@ -90,17 +90,17 @@ unsigned dav1d_get_uniform(GetBits *cons
     return v < m ? v : (v << 1) - m + dav1d_get_bits(c, 1);
 }
 
 unsigned dav1d_get_vlc(GetBits *const c) {
     int n_bits = 0;
     while (!dav1d_get_bits(c, 1))
         if (++n_bits == 32)
             return 0xFFFFFFFFU;
-    return n_bits ? ((1 << n_bits) - 1) + dav1d_get_bits(c, n_bits) : 0;
+    return n_bits ? ((1U << n_bits) - 1) + dav1d_get_bits(c, n_bits) : 0;
 }
 
 static unsigned get_bits_subexp_u(GetBits *const c, const unsigned ref,
                                   const unsigned n)
 {
     unsigned v = 0;
 
     for (int i = 0;; i++) {
@@ -121,13 +121,21 @@ static unsigned get_bits_subexp_u(GetBit
 
     return ref * 2 <= n ? inv_recenter(ref, v) : n - inv_recenter(n - ref, v);
 }
 
 int dav1d_get_bits_subexp(GetBits *const c, const int ref, const unsigned n) {
     return (int) get_bits_subexp_u(c, ref + (1 << n), 2 << n) - (1 << n);
 }
 
-const uint8_t *dav1d_flush_get_bits(GetBits *c) {
+void dav1d_bytealign_get_bits(GetBits *c) {
+    // bits_left is never more than 7, because it is only incremented
+    // by refill(), which is called by dav1d_get_bits() and never reads
+    // more than 7 bits beyond what it needs.
+    //
+    // If this weren't true, we would need to work out how many bits to
+    // discard (bits_left % 8), subtract that from bits_left and then
+    // shift state right by that amount.
+    assert(c->bits_left <= 7);
+
     c->bits_left = 0;
     c->state = 0;
-    return c->ptr;
 }
--- a/third_party/dav1d/src/getbits.h
+++ b/third_party/dav1d/src/getbits.h
@@ -41,11 +41,18 @@ typedef struct GetBits {
 void dav1d_init_get_bits(GetBits *c, const uint8_t *data, size_t sz);
 unsigned dav1d_get_bits(GetBits *c, unsigned n);
 int dav1d_get_sbits(GetBits *c, unsigned n);
 
 // Output in range 0..max-1
 unsigned dav1d_get_uniform(GetBits *c, unsigned max);
 unsigned dav1d_get_vlc(GetBits *c);
 int dav1d_get_bits_subexp(GetBits *c, int ref, unsigned n);
-const uint8_t *dav1d_flush_get_bits(GetBits *c);
+
+// Discard bits from the buffer until we're next byte-aligned.
+void dav1d_bytealign_get_bits(GetBits *c);
+
+// Return the current bit position relative to the start of the buffer.
+static inline unsigned dav1d_get_bits_pos(const GetBits *c) {
+    return (c->ptr - c->ptr_start) * 8 - c->bits_left;
+}
 
 #endif /* __DAV1D_SRC_GETBITS_H__ */
--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@@ -35,16 +35,17 @@
 typedef struct Dav1dFrameContext Dav1dFrameContext;
 typedef struct Dav1dTileState Dav1dTileState;
 typedef struct Dav1dTileContext Dav1dTileContext;
 
 #include "common/attributes.h"
 
 #include "src/cdef.h"
 #include "src/cdf.h"
+#include "src/data.h"
 #include "src/env.h"
 #include "src/intra_edge.h"
 #include "src/ipred.h"
 #include "src/itx.h"
 #include "src/levels.h"
 #include "src/lf_mask.h"
 #include "src/loopfilter.h"
 #include "src/looprestoration.h"
@@ -75,16 +76,17 @@ struct Dav1dContext {
         int start, end;
     } tile[256];
     int n_tile_data, have_seq_hdr, have_frame_hdr;
     int n_tiles;
     Av1SequenceHeader seq_hdr; // FIXME make ref?
     Av1FrameHeader frame_hdr; // FIXME make ref?
 
     // decoded output picture queue
+    Dav1dData in;
     Dav1dPicture out;
     struct {
         Dav1dThreadPicture *out_delayed;
         unsigned next;
     } frame_thread;
 
     // reference/entropy state
     struct {
@@ -127,16 +129,22 @@ struct Dav1dFrameContext {
     unsigned refpoc[7], refrefpoc[7][7];
     CdfThreadContext in_cdf, out_cdf;
     struct {
         Dav1dData data;
         int start, end;
     } tile[256];
     int n_tile_data;
 
+    // for scalable references
+    struct ScalableMotionParams {
+        int scale; // if no scaling, this is 0
+        int step;
+    } svc[7][2 /* x, y */];
+
     const Dav1dContext *c;
     Dav1dTileContext *tc;
     int n_tc;
     Dav1dTileState *ts;
     int n_ts;
     const Dav1dDSPContext *dsp;
     struct {
         recon_b_intra_fn recon_b_intra;
@@ -144,17 +152,17 @@ struct Dav1dFrameContext {
         filter_sbrow_fn filter_sbrow;
         backup_ipred_edge_fn backup_ipred_edge;
         read_coef_blocks_fn read_coef_blocks;
     } bd_fn;
 
     int ipred_edge_sz;
     pixel *ipred_edge[3];
     ptrdiff_t b4_stride;
-    int bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step;
+    int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step;
     uint16_t dq[NUM_SEGMENTS][3 /* plane */][2 /* dc/ac */];
     const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
     BlockContext *a;
     int a_sz /* w*tile_rows */;
     AV1_COMMON *libaom_cm; // FIXME
     uint8_t jnt_weights[7][7];
 
     struct {
@@ -212,17 +220,17 @@ struct Dav1dTileState {
     struct {
         int col_start, col_end, row_start, row_end; // in 4px units
         int col, row; // in tile units
     } tiling;
 
     CdfContext cdf;
     MsacContext msac;
 
-    atomic_int progress; // in sby units
+    atomic_int progress; // in sby units, TILE_ERROR after a decoding error
     struct {
         pthread_mutex_t lock;
         pthread_cond_t cond;
     } tile_thread;
     struct {
         uint8_t *pal_idx;
         coef *cf;
     } frame_thread;
@@ -239,17 +247,17 @@ struct Dav1dTileState {
 };
 
 struct Dav1dTileContext {
     const Dav1dFrameContext *f;
     Dav1dTileState *ts;
     int bx, by;
     BlockContext l, *a;
     coef *cf;
-    pixel *emu_edge; // stride=160
+    pixel *emu_edge; // stride=192 for non-SVC, or 320 for SVC
     // FIXME types can be changed to pixel (and dynamically allocated)
     // which would make copy/assign operations slightly faster?
     uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
     uint16_t pal[3 /* plane */][8 /* palette_idx */];
     uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
     uint8_t txtp_map[32 * 32]; // inter-only
     WarpedMotionParams warpmv;
     union {
--- a/third_party/dav1d/src/ipred.h
+++ b/third_party/dav1d/src/ipred.h
@@ -36,28 +36,28 @@
  * Intra prediction.
  * - a is the angle (in degrees) for directional intra predictors. For other
  *   modes, it is ignored;
  * - topleft is the same as the argument given to dav1d_prepare_intra_edges(),
  *   see ipred_prepare.h for more detailed documentation.
  */
 #define decl_angular_ipred_fn(name) \
 void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
-            int width, int height, int angle)
+            int width, int height, int angle, int max_width, int max_height)
 typedef decl_angular_ipred_fn(*angular_ipred_fn);
 
 /*
  * Create a subsampled Y plane with the DC subtracted.
  * - w/h_pad is the edge of the width/height that extends outside the visible
  *   portion of the frame in 4px units;
  * - ac has a stride of 16.
  */
 #define decl_cfl_ac_fn(name) \
 void (name)(int16_t *ac, const pixel *y, ptrdiff_t stride, \
-            int w_pad, int h_pad)
+            int w_pad, int h_pad, int cw, int ch)
 typedef decl_cfl_ac_fn(*cfl_ac_fn);
 
 /*
  * dst[x,y] += alpha * ac[x,y]
  * - alpha contains a q3 scalar in [-16,16] range;
  */
 #define decl_cfl_pred_fn(name) \
 void (name)(pixel *dst, ptrdiff_t stride, const pixel *topleft, \
@@ -72,17 +72,17 @@ typedef decl_cfl_pred_fn(*cfl_pred_fn);
 void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
             const uint8_t *idx, int w, int h)
 typedef decl_pal_pred_fn(*pal_pred_fn);
 
 typedef struct Dav1dIntraPredDSPContext {
     angular_ipred_fn intra_pred[N_IMPL_INTRA_PRED_MODES];
 
     // chroma-from-luma
-    cfl_ac_fn cfl_ac[3 /* 420, 422, 444 */][N_RECT_TX_SIZES /* chroma tx size */];
+    cfl_ac_fn cfl_ac[3 /* 420, 422, 444 */];
     cfl_pred_fn cfl_pred[DC_128_PRED + 1];
 
     // palette
     pal_pred_fn pal_pred;
 } Dav1dIntraPredDSPContext;
 
 void dav1d_intra_pred_dsp_init_8bpc(Dav1dIntraPredDSPContext *c);
 void dav1d_intra_pred_dsp_init_10bpc(Dav1dIntraPredDSPContext *c);
--- a/third_party/dav1d/src/ipred_prepare.h
+++ b/third_party/dav1d/src/ipred_prepare.h
@@ -78,19 +78,24 @@
 enum IntraPredMode
     bytefn(dav1d_prepare_intra_edges)(int x, int have_left, int y, int have_top,
                                       int w, int h, enum EdgeFlags edge_flags,
                                       const pixel *dst, ptrdiff_t stride,
                                       const pixel *prefilter_toplevel_sb_edge,
                                       enum IntraPredMode mode, int *angle,
                                       int tw, int th, pixel *topleft_out);
 
-// is or'ed with the angle argument into intra predictors to signal that edges
-// are smooth and should use reduced filter strength
-#define ANGLE_SMOOTH_EDGE_FLAG 512
+// These flags are OR'd with the angle argument into intra predictors.
+// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
+// with a filter before using them to predict values in a block.
+// ANGLE_SMOOTH_EDGE_FLAG means that edges are smooth and should use
+// reduced filter strength.
+#define ANGLE_USE_EDGE_FILTER_FLAG 1024
+#define ANGLE_SMOOTH_EDGE_FLAG      512
+
 static inline int sm_flag(const BlockContext *const b, const int idx) {
     if (!b->intra[idx]) return 0;
     const enum IntraPredMode m = b->mode[idx];
     return (m == SMOOTH_PRED || m == SMOOTH_H_PRED ||
             m == SMOOTH_V_PRED) ? ANGLE_SMOOTH_EDGE_FLAG : 0;
 }
 
 static inline int sm_uv_flag(const BlockContext *const b, const int idx) {
--- a/third_party/dav1d/src/ipred_tmpl.c
+++ b/third_party/dav1d/src/ipred_tmpl.c
@@ -78,50 +78,50 @@ cfl_pred(pixel *dst, const ptrdiff_t str
             const int diff = alpha * ac[x];
             dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
         }
         ac += width;
         dst += PXSTRIDE(stride);
     }
 }
 
-static unsigned dc_gen_top(const pixel *const topleft, const int width)
-{
+static unsigned dc_gen_top(const pixel *const topleft, const int width) {
     unsigned dc = width >> 1;
     for (int i = 0; i < width; i++)
        dc += topleft[1 + i];
     return dc >> ctz(width);
 }
 
 static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
-                           const int width, const int height, const int a)
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height)
 {
     splat_dc(dst, stride, width, height, dc_gen_top(topleft, width));
 }
 
 static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height,
                             const int16_t *ac, const int alpha)
 {
     cfl_pred(dst, stride, width, height, dc_gen_top(topleft, width), ac, alpha);
 }
 
-static unsigned dc_gen_left(const pixel *const topleft, const int height)
-{
+static unsigned dc_gen_left(const pixel *const topleft, const int height) {
     unsigned dc = height >> 1;
     for (int i = 0; i < height; i++)
        dc += topleft[-(1 + i)];
     return dc >> ctz(height);
 }
 
 static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
-                            const int width, const int height, const int a)
+                            const int width, const int height, const int a,
+                            const int max_width, const int max_height)
 {
     splat_dc(dst, stride, width, height, dc_gen_left(topleft, height));
 }
 
 static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
                              const pixel *const topleft,
                              const int width, const int height,
                              const int16_t *ac, const int alpha)
@@ -135,18 +135,18 @@ static void ipred_cfl_left_c(pixel *dst,
 #define MULTIPLIER_1x4 0x3334
 #define BASE_SHIFT 16
 #else
 #define MULTIPLIER_1x2 0xAAAB
 #define MULTIPLIER_1x4 0x6667
 #define BASE_SHIFT 17
 #endif
 
-static unsigned
-dc_gen(const pixel *const topleft, const int width, const int height)
+static unsigned dc_gen(const pixel *const topleft,
+                       const int width, const int height)
 {
     unsigned dc = (width + height) >> 1;
     for (int i = 0; i < width; i++)
        dc += topleft[i + 1];
     for (int i = 0; i < height; i++)
        dc += topleft[-(i + 1)];
     dc >>= ctz(width + height);
 
@@ -155,17 +155,18 @@ dc_gen(const pixel *const topleft, const
                                                            MULTIPLIER_1x2;
         dc >>= BASE_SHIFT;
     }
     return dc;
 }
 
 static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft,
-                       const int width, const int height, const int a)
+                       const int width, const int height, const int a,
+                       const int max_width, const int max_height)
 {
     splat_dc(dst, stride, width, height, dc_gen(topleft, width, height));
 }
 
 static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
                         const pixel *const topleft,
                         const int width, const int height,
                         const int16_t *ac, const int alpha)
@@ -175,52 +176,56 @@ static void ipred_cfl_c(pixel *dst, cons
 }
 
 #undef MULTIPLIER_1x2
 #undef MULTIPLIER_1x4
 #undef BASE_SHIFT
 
 static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
-                           const int width, const int height, const int a)
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height)
 {
     splat_dc(dst, stride, width, height, 1 << (BITDEPTH - 1));
 }
 
 static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height,
                             const int16_t *ac, const int alpha)
 {
     cfl_pred(dst, stride, width, height, 1 << (BITDEPTH - 1), ac, alpha);
 }
 
 static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft,
-                      const int width, const int height, const int a)
+                      const int width, const int height, const int a,
+                      const int max_width, const int max_height)
 {
     for (int y = 0; y < height; y++) {
         pixel_copy(dst, topleft + 1, width);
         dst += PXSTRIDE(stride);
     }
 }
 
 static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft,
-                      const int width, const int height, const int a)
+                      const int width, const int height, const int a,
+                      const int max_width, const int max_height)
 {
     for (int y = 0; y < height; y++) {
         pixel_set(dst, topleft[-(1 + y)], width);
         dst += PXSTRIDE(stride);
     }
 }
 
 static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const tl_ptr,
-                          const int width, const int height, const int a)
+                          const int width, const int height, const int a,
+                          const int max_width, const int max_height)
 {
     const int topleft = tl_ptr[0];
     for (int y = 0; y < height; y++) {
         const int left = tl_ptr[-(y + 1)];
         for (int x = 0; x < width; x++) {
             const int top = tl_ptr[1 + x];
             const int base = left + top - topleft;
             const int ldiff = abs(left - base);
@@ -231,17 +236,18 @@ static void ipred_paeth_c(pixel *dst, co
                      tdiff <= tldiff ? top : topleft;
         }
         dst += PXSTRIDE(stride);
     }
 }
 
 static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
-                           const int width, const int height, const int a)
+                           const int width, const int height, const int a,
+                           const int max_width, const int max_height)
 {
     const uint8_t *const weights_hor = &dav1d_sm_weights[width];
     const uint8_t *const weights_ver = &dav1d_sm_weights[height];
     const int right = topleft[width], bottom = topleft[-height];
 
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             const int pred = weights_ver[y]  * topleft[1 + x] +
@@ -251,34 +257,36 @@ static void ipred_smooth_c(pixel *dst, c
             dst[x] = (pred + 256) >> 9;
         }
         dst += PXSTRIDE(stride);
     }
 }
 
 static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
                              const pixel *const topleft,
-                             const int width, const int height, const int a)
+                             const int width, const int height, const int a,
+                             const int max_width, const int max_height)
 {
     const uint8_t *const weights_ver = &dav1d_sm_weights[height];
     const int bottom = topleft[-height];
 
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             const int pred = weights_ver[y]  * topleft[1 + x] +
                       (256 - weights_ver[y]) * bottom;
             dst[x] = (pred + 128) >> 8;
         }
         dst += PXSTRIDE(stride);
     }
 }
 
 static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
                              const pixel *const topleft,
-                             const int width, const int height, const int a)
+                             const int width, const int height, const int a,
+                             const int max_width, const int max_height)
 {
     const uint8_t *const weights_hor = &dav1d_sm_weights[width];
     const int right = topleft[width];
 
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             const int pred = weights_hor[x]  * topleft[-(y + 1)] +
                       (256 - weights_hor[x]) * right;
@@ -323,32 +331,39 @@ static int get_filter_strength(const uns
         } else {
             if (d >= 1) strength = 3;
         }
     }
 
     return strength;
 }
 
-static void filter_edge(pixel *const out, const int sz, const pixel *const in,
+static void filter_edge(pixel *const out, const int sz,
+                        const int lim_from, const int lim_to,
+                        const pixel *const in,
                         const int from, const int to, const unsigned strength)
 {
     static const uint8_t kernel[3][5] = {
         { 0, 4, 8, 4, 0 },
         { 0, 5, 6, 5, 0 },
         { 2, 4, 4, 4, 2 }
     };
 
     assert(strength > 0);
-    for (int i = 0; i < sz; i++) {
+    int i = 0;
+    for (; i < imin(sz, lim_from); i++)
+        out[i] = in[iclip(i, from, to - 1)];
+    for (; i < imin(lim_to, sz); i++) {
         int s = 0;
         for (int j = 0; j < 5; j++)
             s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
         out[i] = (s + 8) >> 4;
     }
+    for (; i < sz; i++)
+        out[i] = in[iclip(i, from, to - 1)];
 }
 
 static int get_upsample(const int blk_wh, const unsigned d, const int type) {
     if (d >= 40) return 0;
     return type ? (blk_wh <= 8) : (blk_wh <= 16);
 }
 
 static void upsample_edge(pixel *const out, const int hsz,
@@ -364,37 +379,39 @@ static void upsample_edge(pixel *const o
             s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
         out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
     }
     out[i * 2] = in[iclip(i, from, to - 1)];
 }
 
 static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft_in,
-                       const int width, const int height, int angle)
+                       const int width, const int height, int angle,
+                       const int max_width, const int max_height)
 {
-    const int is_sm = angle >> 9;
+    const int is_sm = (angle >> 9) & 0x1;
+    const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle < 90);
     const int dx = dav1d_dr_intra_derivative[angle];
     pixel top_out[(64 + 64) * 2];
     const pixel *top;
     int max_base_x;
-    const int upsample_above = get_upsample(width + height, 90 - angle, is_sm);
+    const int upsample_above = enable_intra_edge_filter ?
+        get_upsample(width + height, 90 - angle, is_sm) : 0;
     if (upsample_above) {
         upsample_edge(top_out, width + height,
                       &topleft_in[1], -1, width + imin(width, height));
         top = top_out;
         max_base_x = 2 * (width + height) - 2;
     } else {
-        const int filter_strength =
-            get_filter_strength(width + height, 90 - angle, is_sm);
-
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, 90 - angle, is_sm) : 0;
         if (filter_strength) {
-            filter_edge(top_out, width + height,
+            filter_edge(top_out, width + height, 0, width + height,
                         &topleft_in[1], -1, width + imin(width, height),
                         filter_strength);
             top = top_out;
             max_base_x = width + height - 1;
         } else {
             top = &topleft_in[1];
             max_base_x = width + imin(width, height) - 1;
         }
@@ -416,49 +433,55 @@ static void ipred_z1_c(pixel *dst, const
                 break;
             }
         }
     }
 }
 
 static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft_in,
-                       const int width, const int height, int angle)
+                       const int width, const int height, int angle,
+                       const int max_width, const int max_height)
 {
-    const int is_sm = angle >> 9;
+    const int is_sm = (angle >> 9) & 0x1;
+    const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle > 90 && angle < 180);
     const int dy = dav1d_dr_intra_derivative[angle - 90];
     const int dx = dav1d_dr_intra_derivative[180 - angle];
-    const int upsample_left = get_upsample(width + height, 180 - angle, is_sm);
-    const int upsample_above = get_upsample(width + height, angle - 90, is_sm);
+    const int upsample_left = enable_intra_edge_filter ?
+        get_upsample(width + height, 180 - angle, is_sm) : 0;
+    const int upsample_above = enable_intra_edge_filter ?
+        get_upsample(width + height, angle - 90, is_sm) : 0;
     pixel edge[64 * 2 + 64 * 2 + 1];
     pixel *const topleft = &edge[height * 2];
 
     if (upsample_above) {
         upsample_edge(topleft, width + 1, topleft_in, 0, width + 1);
     } else {
-        const int filter_strength =
-            get_filter_strength(width + height, angle - 90, is_sm);
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, angle - 90, is_sm) : 0;
 
         if (filter_strength) {
-            filter_edge(&topleft[1], width, &topleft_in[1], -1, width,
+            filter_edge(&topleft[1], width, 0, max_width,
+                        &topleft_in[1], -1, width,
                         filter_strength);
         } else {
             pixel_copy(&topleft[1], &topleft_in[1], width);
         }
     }
     if (upsample_left) {
         upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1);
     } else {
-        const int filter_strength =
-            get_filter_strength(width + height, 180 - angle, is_sm);
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, 180 - angle, is_sm) : 0;
 
         if (filter_strength) {
-            filter_edge(&topleft[-height], height, &topleft_in[-height],
+            filter_edge(&topleft[-height], height, height - max_height, height,
+                        &topleft_in[-height],
                         0, height + 1, filter_strength);
         } else {
             pixel_copy(&topleft[-height], &topleft_in[-height], height);
         }
     }
     *topleft = *topleft_in;
 
     const int min_base_x = -(1 << upsample_above);
@@ -487,38 +510,41 @@ static void ipred_z2_c(pixel *dst, const
             }
             dst[x] = iclip_pixel((v + 16) >> 5);
         }
     }
 }
 
 static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft_in,
-                       const int width, const int height, int angle)
+                       const int width, const int height, int angle,
+                       const int max_width, const int max_height)
 {
-    const int is_sm = angle >> 9;
+    const int is_sm = (angle >> 9) & 0x1;
+    const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle > 180);
     const int dy = dav1d_dr_intra_derivative[270 - angle];
     pixel left_out[(64 + 64) * 2];
     const pixel *left;
     int max_base_y;
-    const int upsample_left = get_upsample(width + height, angle - 180, is_sm);
+    const int upsample_left = enable_intra_edge_filter ?
+        get_upsample(width + height, angle - 180, is_sm) : 0;
     if (upsample_left) {
         upsample_edge(left_out, width + height,
                       &topleft_in[-(width + height)],
                       imax(width - height, 0), width + height + 1);
         left = &left_out[2 * (width + height) - 2];
         max_base_y = 2 * (width + height) - 2;
     } else {
-        const int filter_strength =
-            get_filter_strength(width + height, angle - 180, is_sm);
+        const int filter_strength = enable_intra_edge_filter ?
+            get_filter_strength(width + height, angle - 180, is_sm) : 0;
 
         if (filter_strength) {
-            filter_edge(left_out, width + height,
+            filter_edge(left_out, width + height, 0, width + height,
                         &topleft_in[-(width + height)],
                         imax(width - height, 0), width + height + 1,
                         filter_strength);
             left = &left_out[width + height - 1];
             max_base_y = width + height - 1;
         } else {
             left = &topleft_in[-1];
             max_base_y = height + imin(width, height) - 1;
@@ -543,17 +569,18 @@ static void ipred_z3_c(pixel *dst, const
             }
         }
     }
 }
 
 /* Up to 32x32 only */
 static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft_in,
-                           const int width, const int height, int filt_idx)
+                           const int width, const int height, int filt_idx,
+                           const int max_width, const int max_height)
 {
     filt_idx &= 511;
     assert(filt_idx < 5);
 
     const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
     int x, y;
     ptrdiff_t left_stride;
     const pixel *left, *topleft, *top;
@@ -588,17 +615,17 @@ static void ipred_filter_c(pixel *dst, c
         top = &dst[PXSTRIDE(stride)];
         dst = &dst[PXSTRIDE(stride) * 2];
     }
 }
 
 static NOINLINE void
 cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
          const int w_pad, const int h_pad, const int width, const int height,
-         const int ss_hor, const int ss_ver, const int log2sz)
+         const int ss_hor, const int ss_ver)
 {
     int y, x;
     int16_t *const ac_orig = ac;
 
     assert(w_pad >= 0 && w_pad * 4 < width);
     assert(h_pad >= 0 && h_pad * 4 < height);
 
     for (y = 0; y < height - 4 * h_pad; y++) {
@@ -616,75 +643,44 @@ cfl_ac_c(int16_t *ac, const pixel *ypx, 
         ac += width;
         ypx += PXSTRIDE(stride) << ss_ver;
     }
     for (; y < height; y++) {
         memcpy(ac, &ac[-width], width * sizeof(*ac));
         ac += width;
     }
 
+    const int log2sz = ctz(width) + ctz(height);
     int sum = (1 << log2sz) >> 1;
     for (ac = ac_orig, y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             sum += ac[x];
         ac += width;
     }
     sum >>= log2sz;
 
     // subtract DC
     for (ac = ac_orig, y = 0; y < height; y++) {
         for (x = 0; x < width; x++)
             ac[x] -= sum;
         ac += width;
     }
 }
 
-#define cfl_ac_fn(lw, lh, cw, ch, ss_hor, ss_ver, log2sz) \
-static void cfl_ac_##lw##x##lh##_to_##cw##x##ch##_c(int16_t *const ac, \
-                                                    const pixel *const ypx, \
-                                                    const ptrdiff_t stride, \
-                                                    const int w_pad, \
-                                                    const int h_pad) \
+#define cfl_ac_fn(fmt, ss_hor, ss_ver) \
+static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
+                             const ptrdiff_t stride, const int w_pad, \
+                             const int h_pad, const int cw, const int ch) \
 { \
-    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver, log2sz); \
+    cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
 }
 
-cfl_ac_fn( 8,  8,  4,  4, 1, 1, 4)
-cfl_ac_fn( 8, 16,  4,  8, 1, 1, 5)
-cfl_ac_fn( 8, 32,  4, 16, 1, 1, 6)
-cfl_ac_fn(16,  8,  8,  4, 1, 1, 5)
-cfl_ac_fn(16, 16,  8,  8, 1, 1, 6)
-cfl_ac_fn(16, 32,  8, 16, 1, 1, 7)
-cfl_ac_fn(32,  8, 16,  4, 1, 1, 6)
-cfl_ac_fn(32, 16, 16,  8, 1, 1, 7)
-cfl_ac_fn(32, 32, 16, 16, 1, 1, 8)
-
-cfl_ac_fn( 8,  4,  4,  4, 1, 0, 4)
-cfl_ac_fn( 8,  8,  4,  8, 1, 0, 5)
-cfl_ac_fn(16,  4,  8,  4, 1, 0, 5)
-cfl_ac_fn(16,  8,  8,  8, 1, 0, 6)
-cfl_ac_fn(16, 16,  8, 16, 1, 0, 7)
-cfl_ac_fn(32,  8, 16,  8, 1, 0, 7)
-cfl_ac_fn(32, 16, 16, 16, 1, 0, 8)
-cfl_ac_fn(32, 32, 16, 32, 1, 0, 9)
-
-cfl_ac_fn( 4,  4,  4,  4, 0, 0, 4)
-cfl_ac_fn( 4,  8,  4,  8, 0, 0, 5)
-cfl_ac_fn( 4, 16,  4, 16, 0, 0, 6)
-cfl_ac_fn( 8,  4,  8,  4, 0, 0, 5)
-cfl_ac_fn( 8,  8,  8,  8, 0, 0, 6)
-cfl_ac_fn( 8, 16,  8, 16, 0, 0, 7)
-cfl_ac_fn( 8, 32,  8, 32, 0, 0, 8)
-cfl_ac_fn(16,  4, 16,  4, 0, 0, 6)
-cfl_ac_fn(16,  8, 16,  8, 0, 0, 7)
-cfl_ac_fn(16, 16, 16, 16, 0, 0, 8)
-cfl_ac_fn(16, 32, 16, 32, 0, 0, 9)
-cfl_ac_fn(32,  8, 32,  8, 0, 0, 8)
-cfl_ac_fn(32, 16, 32, 16, 0, 0, 9)
-cfl_ac_fn(32, 32, 32, 32, 0, 0, 10)
+cfl_ac_fn(420, 1, 1)
+cfl_ac_fn(422, 1, 0)
+cfl_ac_fn(444, 0, 0)
 
 static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
                        const uint16_t *const pal, const uint8_t *idx,
                        const int w, const int h)
 {
     for (int y = 0; y < h; y++) {
         for (int x = 0; x < w; x++)
             dst[x] = pal[idx[x]];
@@ -704,50 +700,19 @@ void bitfn(dav1d_intra_pred_dsp_init)(Da
     c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
     c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
     c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
     c->intra_pred[Z1_PRED      ] = ipred_z1_c;
     c->intra_pred[Z2_PRED      ] = ipred_z2_c;
     c->intra_pred[Z3_PRED      ] = ipred_z3_c;
     c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
 
-    // cfl functions are split per chroma subsampling type
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_4X4  ] = cfl_ac_8x8_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X8  ] = cfl_ac_8x16_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_4X16 ] = cfl_ac_8x32_to_4x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X4  ] = cfl_ac_16x8_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_8X8  ] = cfl_ac_16x16_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_8X16 ] = cfl_ac_16x32_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X4 ] = cfl_ac_32x8_to_16x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][RTX_16X8 ] = cfl_ac_32x16_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1][ TX_16X16] = cfl_ac_32x32_to_16x16_c;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_4X4  ] = cfl_ac_8x4_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_4X8  ] = cfl_ac_8x8_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X4  ] = cfl_ac_16x4_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_8X8  ] = cfl_ac_16x8_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_8X16 ] = cfl_ac_16x16_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X8 ] = cfl_ac_32x8_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][ TX_16X16] = cfl_ac_32x16_to_16x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1][RTX_16X32] = cfl_ac_32x32_to_16x32_c;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_4X4  ] = cfl_ac_4x4_to_4x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X8  ] = cfl_ac_4x8_to_4x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_4X16 ] = cfl_ac_4x16_to_4x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X4  ] = cfl_ac_8x4_to_8x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_8X8  ] = cfl_ac_8x8_to_8x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X16 ] = cfl_ac_8x16_to_8x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_8X32 ] = cfl_ac_8x32_to_8x32_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X4 ] = cfl_ac_16x4_to_16x4_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X8 ] = cfl_ac_16x8_to_16x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_16X16] = cfl_ac_16x16_to_16x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_16X32] = cfl_ac_16x32_to_16x32_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X8 ] = cfl_ac_32x8_to_32x8_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][RTX_32X16] = cfl_ac_32x16_to_32x16_c;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1][ TX_32X32] = cfl_ac_32x32_to_32x32_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;
 
     c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
     c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
     c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
     c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
 
     c->pal_pred = pal_pred_c;
 
--- a/third_party/dav1d/src/levels.h
+++ b/third_party/dav1d/src/levels.h
@@ -36,16 +36,20 @@ enum ObuType {
     OBU_FRAME_HDR = 3,
     OBU_TILE_GRP  = 4,
     OBU_METADATA  = 5,
     OBU_FRAME     = 6,
     OBU_REDUNDANT_FRAME_HDR = 7,
     OBU_PADDING   = 15,
 };
 
+// Constants from Section 3. "Symbols and abbreviated terms"
+#define MAX_TILE_COLS 64
+#define MAX_TILE_ROWS 64
+
 enum TxfmSize {
     TX_4X4,
     TX_8X8,
     TX_16X16,
     TX_32X32,
     TX_64X64,
     N_TX_SIZES,
 };
@@ -438,19 +442,19 @@ typedef struct Av1FrameHeader {
     enum FilterMode subpel_filter_mode;
     int switchable_motion_mode;
     int use_ref_frame_mvs;
     int refresh_context;
     struct {
         int uniform;
         unsigned n_bytes;
         int min_log2_cols, max_log2_cols, log2_cols, cols;
-        int col_start_sb[1025];
         int min_log2_rows, max_log2_rows, log2_rows, rows;
-        int row_start_sb[1025];
+        uint16_t col_start_sb[MAX_TILE_COLS + 1];
+        uint16_t row_start_sb[MAX_TILE_ROWS + 1];
         int update;
     } tiling;
     struct {
         int yac;
         int ydc_delta;
         int udc_delta, uac_delta, vdc_delta, vac_delta;
         int qm, qm_y, qm_u, qm_v;
     } quant;
--- a/third_party/dav1d/src/lf_apply_tmpl.c
+++ b/third_party/dav1d/src/lf_apply_tmpl.c
@@ -49,19 +49,19 @@ static inline void filter_plane_cols_y(c
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
         uint32_t hmask[4];
         if (!starty4) {
             hmask[0] = mask[x][0][0];
             hmask[1] = mask[x][1][0];
             hmask[2] = mask[x][2][0];
             if (endy4 > 16) {
-                hmask[0] |= mask[x][0][1] << 16;
-                hmask[1] |= mask[x][1][1] << 16;
-                hmask[2] |= mask[x][2][1] << 16;
+                hmask[0] |= (unsigned) mask[x][0][1] << 16;
+                hmask[1] |= (unsigned) mask[x][1][1] << 16;
+                hmask[2] |= (unsigned) mask[x][2][1] << 16;
             }
         } else {
             hmask[0] = mask[x][0][1];
             hmask[1] = mask[x][1][1];
             hmask[2] = mask[x][2][1];
         }
         hmask[3] = 0;
         dsp->lf.loop_filter_sb[0][0](&dst[x * 4], ls, hmask,
@@ -84,19 +84,19 @@ static inline void filter_plane_rows_y(c
     //                                 block1
     // filter edges between rows (e.g. ------)
     //                                 block2
     for (int y = starty4; y < endy4;
          y++, dst += 4 * PXSTRIDE(ls), lvl += b4_stride)
     {
         if (!have_top && !y) continue;
         const uint32_t vmask[4] = {
-            mask[y][0][0] | (mask[y][0][1] << 16),
-            mask[y][1][0] | (mask[y][1][1] << 16),
-            mask[y][2][0] | (mask[y][2][1] << 16),
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << 16),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << 16),
+            mask[y][2][0] | ((unsigned) mask[y][2][1] << 16),
             0,
         };
         dsp->lf.loop_filter_sb[0][1](dst, ls, vmask,
                                      (const uint8_t(*)[4]) &lvl[0][1], b4_stride,
                                      &f->lf.lim_lut, w);
     }
 }
 
@@ -115,18 +115,18 @@ static inline void filter_plane_cols_uv(
     // filter edges between columns (e.g. block1 | block2)
     for (int x = 0; x < w; x++) {
         if (!have_left && !x) continue;
         uint32_t hmask[3];
         if (!starty4) {
             hmask[0] = mask[x][0][0];
             hmask[1] = mask[x][1][0];
             if (endy4 > (16 >> ss_ver)) {
-                hmask[0] |= mask[x][0][1] << (16 >> ss_ver);
-                hmask[1] |= mask[x][1][1] << (16 >> ss_ver);
+                hmask[0] |= (unsigned) mask[x][0][1] << (16 >> ss_ver);
+                hmask[1] |= (unsigned) mask[x][1][1] << (16 >> ss_ver);
             }
         } else {
             hmask[0] = mask[x][0][1];
             hmask[1] = mask[x][1][1];
         }
         hmask[2] = 0;
         dsp->lf.loop_filter_sb[1][0](&u[x * 4], ls, hmask,
                                      (const uint8_t(*)[4]) &lvl[x][2], b4_stride,
@@ -153,18 +153,18 @@ static inline void filter_plane_rows_uv(
     //                                 block1
     // filter edges between rows (e.g. ------)
     //                                 block2
     for (int y = starty4; y < endy4;
          y++, off_l += 4 * PXSTRIDE(ls), lvl += b4_stride)
     {
         if (!have_top && !y) continue;
         const uint32_t vmask[3] = {
-            mask[y][0][0] | (mask[y][0][1] << (16 >> ss_hor)),
-            mask[y][1][0] | (mask[y][1][1] << (16 >> ss_hor)),
+            mask[y][0][0] | ((unsigned) mask[y][0][1] << (16 >> ss_hor)),
+            mask[y][1][0] | ((unsigned) mask[y][1][1] << (16 >> ss_hor)),
             0,
         };
         dsp->lf.loop_filter_sb[1][1](&u[off_l], ls, vmask,
                                      (const uint8_t(*)[4]) &lvl[0][2], b4_stride,
                                      &f->lf.lim_lut, w);
         dsp->lf.loop_filter_sb[1][1](&v[off_l], ls, vmask,
                                      (const uint8_t(*)[4]) &lvl[0][3], b4_stride,
                                      &f->lf.lim_lut, w);
@@ -172,42 +172,41 @@ static inline void filter_plane_rows_uv(
 }
 
 void bytefn(dav1d_loopfilter_sbrow)(const Dav1dFrameContext *const f,
                                     pixel *const p[3], Av1Filter *const lflvl,
                                     int sby, const int start_of_tile_row)
 {
     int x, have_left;
     // Don't filter outside the frame
-    const int hy4 = (f->cur.p.p.h + 3) >> 2;
     const int have_top = sby > 0;
     const int is_sb64 = !f->seq_hdr.sb128;
     const int starty4 = (sby & is_sb64) << 4;
     const int sbsz = 32 >> is_sb64;
     const int sbl2 = 5 - is_sb64;
     const int halign = (f->bh + 31) & ~31;
     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
     const int vmask = 16 >> ss_ver, hmask = 16 >> ss_hor;
-    const unsigned vmax = 1 << vmask, hmax = 1 << hmask;
-    const unsigned endy4 = starty4 + imin(hy4 - sby * sbsz, sbsz);
+    const unsigned vmax = 1U << vmask, hmax = 1U << hmask;
+    const unsigned endy4 = starty4 + imin(f->h4 - sby * sbsz, sbsz);
     const unsigned uv_endy4 = (endy4 + ss_ver) >> ss_ver;
 
     // fix lpf strength at tile col boundaries
     const uint8_t *lpf_y = &f->lf.tx_lpf_right_edge[0][sby << sbl2];
     const uint8_t *lpf_uv = &f->lf.tx_lpf_right_edge[1][sby << (sbl2 - ss_ver)];
     for (int tile_col = 1;; tile_col++) {
         x = f->frame_hdr.tiling.col_start_sb[tile_col];
         if ((x << sbl2) >= f->bw) break;
         const int bx4 = x & is_sb64 ? 16 : 0, cbx4 = bx4 >> ss_hor;
         x >>= is_sb64;
 
         uint16_t (*const y_hmask)[2] = lflvl[x].filter_y[0][bx4];
         for (unsigned y = starty4, mask = 1 << y; y < endy4; y++, mask <<= 1) {
-            const int sidx = mask >= 0x10000;
+            const int sidx = mask >= 0x10000U;
             const unsigned smask = mask >> (sidx << 4);
             const int idx = 2 * !!(y_hmask[2][sidx] & smask) +
                                 !!(y_hmask[1][sidx] & smask);
             y_hmask[2][sidx] &= ~smask;
             y_hmask[1][sidx] &= ~smask;
             y_hmask[0][sidx] &= ~smask;
             y_hmask[imin(idx, lpf_y[y - starty4])][sidx] |= smask;
         }
@@ -231,30 +230,32 @@ void bytefn(dav1d_loopfilter_sbrow)(cons
 
     // fix lpf strength at tile row boundaries
     if (start_of_tile_row) {
         const BlockContext *a;
         for (x = 0, a = &f->a[f->sb128w * (start_of_tile_row - 1)];
              x < f->sb128w; x++, a++)
         {
             uint16_t (*const y_vmask)[2] = lflvl[x].filter_y[1][starty4];
-            for (unsigned mask = 1, i = 0; i < 32; mask <<= 1, i++) {
-                const int sidx = mask >= 0x10000;
+            const unsigned w = imin(32, f->w4 - (x << 5));
+            for (unsigned mask = 1, i = 0; i < w; mask <<= 1, i++) {
+                const int sidx = mask >= 0x10000U;
                 const unsigned smask = mask >> (sidx << 4);
                 const int idx = 2 * !!(y_vmask[2][sidx] & smask) +
                                     !!(y_vmask[1][sidx] & smask);
                 y_vmask[2][sidx] &= ~smask;
                 y_vmask[1][sidx] &= ~smask;
                 y_vmask[0][sidx] &= ~smask;
                 y_vmask[imin(idx, a->tx_lpf_y[i])][sidx] |= smask;
             }
 
             if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
+                const unsigned cw = (w + ss_hor) >> ss_hor;
                 uint16_t (*const uv_vmask)[2] = lflvl[x].filter_uv[1][starty4 >> ss_ver];
-                for (unsigned uv_mask = 1, i = 0; i < (32U >> ss_hor); uv_mask <<= 1, i++) {
+                for (unsigned uv_mask = 1, i = 0; i < cw; uv_mask <<= 1, i++) {
                     const int sidx = uv_mask >= hmax;
                     const unsigned smask = uv_mask >> (sidx << (4 - ss_hor));
                     const int idx = !!(uv_vmask[1][sidx] & smask);
                     uv_vmask[1][sidx] &= ~smask;
                     uv_vmask[0][sidx] &= ~smask;
                     uv_vmask[imin(idx, a->tx_lpf_uv[i])][sidx] |= smask;
                 }
             }
@@ -263,44 +264,44 @@ void bytefn(dav1d_loopfilter_sbrow)(cons
 
     pixel *ptr;
     uint8_t (*level_ptr)[4] = f->lf.level + f->b4_stride * sby * sbsz;
     for (ptr = p[0], have_left = 0, x = 0; x < f->sb128w;
          x++, have_left = 1, ptr += 128, level_ptr += 32)
     {
         filter_plane_cols_y(f, have_left, level_ptr, f->b4_stride,
                             lflvl[x].filter_y[0], ptr, f->cur.p.stride[0],
-                            imin(32, f->bw - x * 32), starty4, endy4);
+                            imin(32, f->w4 - x * 32), starty4, endy4);
     }
 
     level_ptr = f->lf.level + f->b4_stride * sby * sbsz;
     for (ptr = p[0], x = 0; x < f->sb128w; x++, ptr += 128, level_ptr += 32) {
         filter_plane_rows_y(f, have_top, level_ptr, f->b4_stride,
                             lflvl[x].filter_y[1], ptr, f->cur.p.stride[0],
-                            imin(32, f->bw - x * 32), starty4, endy4);
+                            imin(32, f->w4 - x * 32), starty4, endy4);
     }
 
     if (!f->frame_hdr.loopfilter.level_u && !f->frame_hdr.loopfilter.level_v)
         return;
 
     ptrdiff_t uv_off;
     level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
     for (uv_off = 0, have_left = 0, x = 0; x < f->sb128w;
          x++, have_left = 1, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
     {
         filter_plane_cols_uv(f, have_left, level_ptr, f->b4_stride,
                              lflvl[x].filter_uv[0],
                              &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
-                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
+                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
                              starty4 >> ss_ver, uv_endy4, ss_ver);
     }
 
     level_ptr = f->lf.level + f->b4_stride * (sby * sbsz >> ss_ver);
     for (uv_off = 0, x = 0; x < f->sb128w;
          x++, uv_off += 128 >> ss_hor, level_ptr += 32 >> ss_hor)
     {
         filter_plane_rows_uv(f, have_top, level_ptr, f->b4_stride,
                              lflvl[x].filter_uv[1],
                              &p[1][uv_off], &p[2][uv_off], f->cur.p.stride[1],
-                             (imin(32, f->bw - x * 32) + ss_hor) >> ss_hor,
+                             (imin(32, f->w4 - x * 32) + ss_hor) >> ss_hor,
                              starty4 >> ss_ver, uv_endy4, ss_hor);
     }
 }
--- a/third_party/dav1d/src/lf_mask.c
+++ b/third_party/dav1d/src/lf_mask.c
@@ -27,16 +27,17 @@
 
 #include "config.h"
 
 #include <assert.h>
 #include <string.h>
 
 #include "common/intops.h"
 
+#include "src/ctx.h"
 #include "src/levels.h"
 #include "src/lf_mask.h"
 #include "src/tables.h"
 
 static void decomp_tx(uint8_t (*const txa)[2 /* txsz, step */][32 /* y */][32 /* x */],
                       const enum RectTxfmSize from,
                       const int depth,
                       const int y_off, const int x_off,
@@ -59,22 +60,28 @@ static void decomp_tx(uint8_t (*const tx
                       sub, depth + 1, y_off * 2 + 1, x_off * 2 + 0, tx_masks);
             if (t_dim->w >= t_dim->h)
                 decomp_tx((uint8_t(*)[2][32][32]) &txa[0][0][hth4][htw4],
                           sub, depth + 1, y_off * 2 + 1, x_off * 2 + 1, tx_masks);
         }
     } else {
         const int lw = imin(2, t_dim->lw), lh = imin(2, t_dim->lh);
 
-        for (int y = 0; y < t_dim->h; y++) {
-            memset(txa[0][0][y], lw, t_dim->w);
-            memset(txa[1][0][y], lh, t_dim->w);
-            txa[0][1][y][0] = t_dim->w;
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        for (int y = 0; y < t_dim->h; y++) { \
+            rep_macro(type, txa[0][0][y], off, mul * lw); \
+            rep_macro(type, txa[1][0][y], off, mul * lh); \
+            txa[0][1][y][0] = t_dim->w; \
         }
-        memset(txa[1][1][0], t_dim->h, t_dim->w);
+        case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, txa[1][1][0], off, mul * t_dim->h)
+        case_set_upto16(t_dim->w,,, 0);
+#undef set_ctx
     }
 }
 
 static inline void mask_edges_inter(uint16_t (*const masks)[32][3][2],
                                     const int by4, const int bx4,
                                     const int w4, const int h4, const int skip,
                                     const enum RectTxfmSize max_tx,
                                     const uint16_t *const tx_masks,
@@ -185,18 +192,30 @@ static inline void mask_edges_intra(uint
     inner = (((uint64_t) t) << w4) - t;
     inner1 = inner & 0xffff;
     inner2 = inner >> 16;
     for (y = vstep; y < h4; y += vstep) {
         if (inner1) masks[1][by4 + y][thl4c][0] |= inner1;
         if (inner2) masks[1][by4 + y][thl4c][1] |= inner2;
     }
 
-    memset(a, thl4c, w4);
-    memset(l, twl4c, h4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(a, thl4c, var)
+    case_set_upto32_with_default(w4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(l, twl4c, var)
+    case_set_upto32_with_default(h4,,, 0);
+#undef default_memset
+#undef set_ctx
 }
 
 static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
                                      const int cby4, const int cbx4,
                                      const int cw4, const int ch4,
                                      const int skip_inter,
                                      const enum RectTxfmSize tx,
                                      uint8_t *const a, uint8_t *const l,
@@ -244,18 +263,30 @@ static inline void mask_edges_chroma(uin
         inner = (((uint64_t) t) << cw4) - t;
         inner1 = inner & ((1 << hmask) - 1), inner2 = inner >> hmask;
         for (y = vstep; y < ch4; y += vstep) {
             if (inner1) masks[1][cby4 + y][thl4c][0] |= inner1;
             if (inner2) masks[1][cby4 + y][thl4c][1] |= inner2;
         }
     }
 
-    memset(a, thl4c, cw4);
-    memset(l, twl4c, ch4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, a, off, mul * thl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(a, thl4c, var)
+    case_set_upto32_with_default(cw4,,, 0);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+    rep_macro(type, l, off, mul * twl4c)
+#define default_memset(dir, diridx, off, var) \
+    memset(l, twl4c, var)
+    case_set_upto32_with_default(ch4,,, 0);
+#undef default_memset
+#undef set_ctx
 }
 
 void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
                                 uint8_t (*const level_cache)[4],
                                 const ptrdiff_t b4_stride,
                                 const Av1FrameHeader *const hdr,
                                 const uint8_t (*filter_level)[8][2],
                                 const int bx, const int by,
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@@ -112,17 +112,17 @@ int dav1d_open(Dav1dContext **const c_ou
             Dav1dTileContext *const t = &f->tc[m];
             t->f = f;
             t->cf = dav1d_alloc_aligned(32 * 32 * sizeof(int32_t), 32);
             if (!t->cf) goto error;
             t->scratch.mem = dav1d_alloc_aligned(128 * 128 * 8, 32);
             if (!t->scratch.mem) goto error;
             memset(t->cf, 0, 32 * 32 * sizeof(int32_t));
             t->emu_edge =
-                dav1d_alloc_aligned(160 * (128 + 7) * sizeof(uint16_t), 32);
+                dav1d_alloc_aligned(320 * (256 + 7) * sizeof(uint16_t), 32);
             if (!t->emu_edge) goto error;
             if (f->n_tc > 1) {
                 pthread_mutex_init(&t->tile_thread.td.lock, NULL);
                 pthread_cond_init(&t->tile_thread.td.cond, NULL);
                 t->tile_thread.fttd = &f->tile_thread;
                 pthread_create(&t->tile_thread.td.thread, NULL, dav1d_tile_task, t);
             }
         }
@@ -152,25 +152,38 @@ error:
             dav1d_free_aligned(c->fc);
         }
         dav1d_freep_aligned(c_out);
     }
     fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
     return -ENOMEM;
 }
 
-int dav1d_decode(Dav1dContext *const c,
-                 Dav1dData *const in, Dav1dPicture *const out)
+int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
+{
+    validate_input_or_ret(c != NULL, -EINVAL);
+    validate_input_or_ret(in != NULL, -EINVAL);
+    validate_input_or_ret(in->data == NULL || in->sz, -EINVAL);
+
+    if (c->in.data)
+        return -EAGAIN;
+    dav1d_data_move_ref(&c->in, in);
+
+    return 0;
+}
+
+int dav1d_get_picture(Dav1dContext *const c, Dav1dPicture *const out)
 {
     int res;
 
     validate_input_or_ret(c != NULL, -EINVAL);
     validate_input_or_ret(out != NULL, -EINVAL);
 
-    if (!in) {
+    Dav1dData *const in = &c->in;
+    if (!in->data) {
         if (c->n_fc == 1) return -EAGAIN;
 
         // flush
         unsigned flush_count = 0;
         do {
             const unsigned next = c->frame_thread.next;
             Dav1dFrameContext *const f = &c->fc[next];
 
@@ -193,40 +206,42 @@ int dav1d_decode(Dav1dContext *const c,
                 }
                 // else continue
             }
         } while (++flush_count < c->n_fc);
         return -EAGAIN;
     }
 
     while (in->sz > 0) {
-        if ((res = dav1d_parse_obus(c, in)) < 0)
+        if ((res = dav1d_parse_obus(c, in)) < 0) {
+            dav1d_data_unref(in);
             return res;
+        }
 
         assert((size_t)res <= in->sz);
         in->sz -= res;
         in->data += res;
         if (!in->sz) dav1d_data_unref(in);
         if (c->out.data[0]) {
-            dav1d_picture_ref(out, &c->out);
-            dav1d_picture_unref(&c->out);
+            dav1d_picture_move_ref(out, &c->out);
             return 0;
         }
     }
 
     if (c->out.data[0]) {
-        dav1d_picture_ref(out, &c->out);
-        dav1d_picture_unref(&c->out);
+        dav1d_picture_move_ref(out, &c->out);
         return 0;
     }
 
     return -EAGAIN;
 }
 
 void dav1d_flush(Dav1dContext *const c) {
+    dav1d_data_unref(&c->in);
+
     if (c->n_fc == 1) return;
 
     for (unsigned n = 0; n < c->n_fc; n++)
         c->frame_thread.out_delayed[n].flushed = 1;
 }
 
 void dav1d_close(Dav1dContext **const c_out) {
     validate_input(c_out != NULL);
@@ -239,33 +254,49 @@ void dav1d_close(Dav1dContext **const c_
 
         // clean-up threading stuff
         if (c->n_fc > 1) {
             pthread_mutex_lock(&f->frame_thread.td.lock);
             f->frame_thread.die = 1;
             pthread_cond_signal(&f->frame_thread.td.cond);
             pthread_mutex_unlock(&f->frame_thread.td.lock);
             pthread_join(f->frame_thread.td.thread, NULL);
+            // free references from dav1d_submit_frame() usually freed by
+            // dav1d_decode_frame
+            for (int i = 0; i < 7; i++) {
+                if (f->refp[i].p.data[0])
+                    dav1d_thread_picture_unref(&f->refp[i]);
+                dav1d_ref_dec(&f->ref_mvs_ref[i]);
+            }
+            dav1d_thread_picture_unref(&f->cur);
+            dav1d_cdf_thread_unref(&f->in_cdf);
+            if (f->frame_hdr.refresh_context)
+                dav1d_cdf_thread_unref(&f->out_cdf);
+            dav1d_ref_dec(&f->cur_segmap_ref);
+            dav1d_ref_dec(&f->prev_segmap_ref);
+            dav1d_ref_dec(&f->mvs_ref);
+            for (int i = 0; i < f->n_tile_data; i++)
+                dav1d_data_unref(&f->tile[i].data);
             freep(&f->frame_thread.b);
             dav1d_freep_aligned(&f->frame_thread.pal_idx);
             dav1d_freep_aligned(&f->frame_thread.cf);
             freep(&f->frame_thread.tile_start_off);
             freep(&f->frame_thread.pal);
             freep(&f->frame_thread.cbi);
             pthread_mutex_destroy(&f->frame_thread.td.lock);
             pthread_cond_destroy(&f->frame_thread.td.cond);
         }
         if (f->n_tc > 1) {
             pthread_mutex_lock(&f->tile_thread.lock);
             for (int m = 0; m < f->n_tc; m++) {
                 Dav1dTileContext *const t = &f->tc[m];
                 t->tile_thread.die = 1;
             }
             pthread_cond_broadcast(&f->tile_thread.cond);
-            while (f->tile_thread.available != (1U << f->n_tc) - 1)
+            while (f->tile_thread.available != ~0ULL >> (64 - f->n_tc))
                 pthread_cond_wait(&f->tile_thread.icond,
                                   &f->tile_thread.lock);
             pthread_mutex_unlock(&f->tile_thread.lock);
             for (int m = 0; m < f->n_tc; m++) {
                 Dav1dTileContext *const t = &f->tc[m];
                 if (f->n_tc > 1) {
                     pthread_join(t->tile_thread.td.thread, NULL);
                     pthread_mutex_destroy(&t->tile_thread.td.lock);
@@ -295,28 +326,27 @@ void dav1d_close(Dav1dContext **const c_
         free(f->lf.mask);
         free(f->lf.level);
         free(f->lf.tx_lpf_right_edge[0]);
         av1_free_ref_mv_common(f->libaom_cm);
         dav1d_free_aligned(f->lf.cdef_line);
         dav1d_free_aligned(f->lf.lr_lpf_line);
     }
     dav1d_free_aligned(c->fc);
+    dav1d_data_unref(&c->in);
     if (c->n_fc > 1) {
         for (unsigned n = 0; n < c->n_fc; n++)
             if (c->frame_thread.out_delayed[n].p.data[0])
                 dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
         free(c->frame_thread.out_delayed);
     }
     for (int n = 0; n < c->n_tile_data; n++)
         dav1d_data_unref(&c->tile[n].data);
     for (int n = 0; n < 8; n++) {
         if (c->cdf[n].cdf)
             dav1d_cdf_thread_unref(&c->cdf[n]);
         if (c->refs[n].p.p.data[0])
             dav1d_thread_picture_unref(&c->refs[n].p);
-        if (c->refs[n].refmvs)
-            dav1d_ref_dec(c->refs[n].refmvs);
-        if (c->refs[n].segmap)
-            dav1d_ref_dec(c->refs[n].segmap);
+        dav1d_ref_dec(&c->refs[n].refmvs);
+        dav1d_ref_dec(&c->refs[n].segmap);
     }
     dav1d_freep_aligned(c_out);
 }
--- a/third_party/dav1d/src/loopfilter_tmpl.c
+++ b/third_party/dav1d/src/loopfilter_tmpl.c
@@ -158,17 +158,17 @@ loop_filter(pixel *dst, int E, int I, in
     }
 }
 
 static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
                                    const uint32_t *const vmask,
                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
                                    const Av1FilterLUT *lut, const int h)
 {
-    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
     for (unsigned y = 1; vm & ~(y - 1);
          y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
     {
         if (vm & y) {
             const int L = l[0][0] ? l[0][0] : l[-1][0];
             if (!L) continue;
             const int H = L >> 4;
             const int E = lut->e[L], I = lut->i[L];
@@ -196,17 +196,17 @@ static void loop_filter_v_sb128y_c(pixel
     }
 }
 
 static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
                                     const uint32_t *const vmask,
                                     const uint8_t (*l)[4], ptrdiff_t b4_stride,
                                     const Av1FilterLUT *lut, const int h)
 {
-    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
+    const unsigned vm = vmask[0] | vmask[1];
     for (unsigned y = 1; vm & ~(y - 1);
          y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
     {
         if (vm & y) {
             const int L = l[0][0] ? l[0][0] : l[-1][0];
             if (!L) continue;
             const int H = L >> 4;
             const int E = lut->e[L], I = lut->i[L];
--- a/third_party/dav1d/src/looprestoration_tmpl.c
+++ b/third_party/dav1d/src/looprestoration_tmpl.c
@@ -319,17 +319,17 @@ static void boxsum5(coef *dst, const pix
 // See boxsum3 function comments for details on row and column skipping
 static void boxsum3sqr(int32_t *dst, const pixel *src, const int w, const int h) {
     // We skip the first row, as it is never used
     src += REST_UNIT_STRIDE;
     dst += REST_UNIT_STRIDE;
 
     // We skip the first and last columns, as they are never used
     for (int x = 1; x < w - 1; x++) {
-        int *ds = dst + x;
+        int32_t *ds = dst + x;
         const pixel *s = src + x;
         int a = s[0] * s[0];
         int b = s[REST_UNIT_STRIDE] * s[REST_UNIT_STRIDE];
 
         // We skip the first row, as it is skipped in the next loop and
         // we don't need the last row as it is skipped in the next loop
         for (int y = 2; y < h - 2; y++) {
             s += REST_UNIT_STRIDE;
@@ -362,17 +362,17 @@ static void boxsum3sqr(int32_t *dst, con
 // See boxsum5 function comments for details on row and column skipping
 static void boxsum5sqr(int32_t *dst, const pixel *const src, const int w,
                        const int h)
 {
     // We skip the first row, as it is never used
     dst += REST_UNIT_STRIDE;
 
     for (int x = 0; x < w; x++) {
-        int *ds = dst + x;
+        int32_t *ds = dst + x;
         const pixel *s = src + 3 * REST_UNIT_STRIDE + x;
         int a = s[-3 * REST_UNIT_STRIDE] * s[-3 * REST_UNIT_STRIDE];
         int b = s[-2 * REST_UNIT_STRIDE] * s[-2 * REST_UNIT_STRIDE];
         int c = s[-1 * REST_UNIT_STRIDE] * s[-1 * REST_UNIT_STRIDE];
         int d = s[0] * s[0];
 
         // We skip the first 2 rows, as they are skipped in the next loop and
         // we don't need the last 2 row as it is skipped in the next loop
@@ -403,20 +403,22 @@ static void boxsum5sqr(int32_t *dst, con
             b = c;
             c = d;
             d = e;
         }
         dst += REST_UNIT_STRIDE;
     }
 }
 
-static void selfguided_filter(int32_t *dst, const pixel *src,
+static void selfguided_filter(int16_t *dst, const pixel *src,
                               const ptrdiff_t src_stride, const int w,
                               const int h, const int n, const int s)
 {
+    const int sgr_one_by_x = n == 25 ? 164 : 455;
+
     // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
     // of padding above and below
     int32_t A_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
     int32_t *A = A_ + 3 * REST_UNIT_STRIDE + 3;
     // By inverting A and B after the boxsums, B can be of size coef instead
     // of int32_t
     coef B_[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
     coef *B = B_ + 3 * REST_UNIT_STRIDE + 3;
@@ -434,72 +436,72 @@ static void selfguided_filter(int32_t *d
     coef *BB = B - REST_UNIT_STRIDE;
     for (int j = -1; j < h + 1; j+= step) {
         for (int i = -1; i < w + 1; i++) {
             const int a =
                 (AA[i] + (1 << (2 * (BITDEPTH - 8)) >> 1)) >> (2 * (BITDEPTH - 8));
             const int b =
                 (BB[i] + (1 << (BITDEPTH - 8) >> 1)) >> (BITDEPTH - 8);
 
-            const uint32_t p = (a * n >= b * b) * (a * n - b * b);
-            const uint32_t z = (p * s + (1 << 19)) >> 20;
+            const unsigned p = imax(a * n - b * b, 0);
+            const unsigned z = (p * s + (1 << 19)) >> 20;
 
             const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
             // This is where we invert A and B, so that B is of size coef.
-            AA[i] = (((1 << 8) - x) * BB[i] * dav1d_sgr_one_by_x[n - 1] + (1 << 11)) >> 12;
+            AA[i] = (((1 << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
             BB[i] = x;
         }
         AA += step * REST_UNIT_STRIDE;
         BB += step * REST_UNIT_STRIDE;
     }
 
     src += 3 * REST_UNIT_STRIDE + 3;
     if (n == 25) {
         int j = 0;
 #define SIX_NEIGHBORS(P, i)\
     ((P[i - REST_UNIT_STRIDE]     + P[i + REST_UNIT_STRIDE]) * 6 +   \
      (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +    \
       P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
         for (; j < h - 1; j+=2) {
             for (int i = 0; i < w; i++) {
-                const int32_t a = SIX_NEIGHBORS(B, i);
-                const int32_t b = SIX_NEIGHBORS(A, i);
+                const int a = SIX_NEIGHBORS(B, i);
+                const int b = SIX_NEIGHBORS(A, i);
                 dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
             }
             dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
             src += REST_UNIT_STRIDE;
             B += REST_UNIT_STRIDE;
             A += REST_UNIT_STRIDE;
             for (int i = 0; i < w; i++) {
-                const int32_t a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
-                const int32_t b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
+                const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
+                const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
                 dst[i] = (a * src[i] + b + (1 << 7)) >> 8;
             }
             dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
             src += REST_UNIT_STRIDE;
             B += REST_UNIT_STRIDE;
             A += REST_UNIT_STRIDE;
         }
         if (j + 1 == h) { // Last row, when number of rows is odd
             for (int i = 0; i < w; i++) {
-                const int32_t a = SIX_NEIGHBORS(B, i);
-                const int32_t b = SIX_NEIGHBORS(A, i);
+                const int a = SIX_NEIGHBORS(B, i);
+                const int b = SIX_NEIGHBORS(A, i);
                 dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
             }
         }
 #undef SIX_NEIGHBORS
     } else {
 #define EIGHT_NEIGHBORS(P, i)\
     ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
      (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] +                           \
       P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
         for (int j = 0; j < h; j++) {
             for (int i = 0; i < w; i++) {
-                const int32_t a = EIGHT_NEIGHBORS(B, i);
-                const int32_t b = EIGHT_NEIGHBORS(A, i);
+                const int a = EIGHT_NEIGHBORS(B, i);
+                const int b = EIGHT_NEIGHBORS(A, i);
                 dst[i] = (a * src[i] + b + (1 << 8)) >> 9;
             }
             dst += 384;
             src += REST_UNIT_STRIDE;
             B += REST_UNIT_STRIDE;
             A += REST_UNIT_STRIDE;
         }
     }
@@ -515,56 +517,56 @@ static void selfguided_c(pixel *p, const
     // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
     // of padding above and below
     pixel tmp[70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE];
 
     padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
 
     // Selfguided filter outputs to a maximum stripe height of 64 and a
     // maximum restoration width of 384 (256 * 1.5)
-    int32_t dst[64 * 384];
+    int16_t dst[64 * 384];
 
     // both r1 and r0 can't be zero
     if (!dav1d_sgr_params[sgr_idx][0]) {
         const int s1 = dav1d_sgr_params[sgr_idx][3];
         selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
         const int w1 = (1 << 7) - sgr_w[1];
         for (int j = 0; j < h; j++) {
             for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w1 * (dst[j * 384 + i] - u);
+                const int u = (p[i] << 4);
+                const int v = (u << 7) + w1 * (dst[j * 384 + i] - u);
                 p[i] = iclip_pixel((v + (1 << 10)) >> 11);
             }
             p += PXSTRIDE(p_stride);
         }
     } else if (!dav1d_sgr_params[sgr_idx][1]) {
         const int s0 = dav1d_sgr_params[sgr_idx][2];
         selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
         const int w0 = sgr_w[0];
         for (int j = 0; j < h; j++) {
             for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u);
+                const int u = (p[i] << 4);
+                const int v = (u << 7) + w0 * (dst[j * 384 + i] - u);
                 p[i] = iclip_pixel((v + (1 << 10)) >> 11);
             }
             p += PXSTRIDE(p_stride);
         }
     } else {
-        int32_t dst1[64 * 384];
+        int16_t dst1[64 * 384];
         const int s0 = dav1d_sgr_params[sgr_idx][2];
         const int s1 = dav1d_sgr_params[sgr_idx][3];
         const int w0 = sgr_w[0];
         const int w1 = (1 << 7) - w0 - sgr_w[1];
         selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25, s0);
         selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9, s1);
         for (int j = 0; j < h; j++) {
             for (int i = 0; i < w; i++) {
-                const int32_t u = (p[i] << 4);
-                const int32_t v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
-                                  w1 * (dst1[j * 384 + i] - u);
+                const int u = (p[i] << 4);
+                const int v = (u << 7) + w0 * (dst[j * 384 + i] - u) +
+                              w1 * (dst1[j * 384 + i] - u);
                 p[i] = iclip_pixel((v + (1 << 10)) >> 11);
             }
             p += PXSTRIDE(p_stride);
         }
     }
 }
 
 void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c) {
--- a/third_party/dav1d/src/lr_apply_tmpl.c
+++ b/third_party/dav1d/src/lr_apply_tmpl.c
@@ -213,20 +213,16 @@ static void lr_sbrow(const Dav1dFrameCon
     // TODO Support chroma subsampling.
     const int shift_ver = 7 - ss_ver;
     const int shift_hor = 7 - ss_hor;
 
     int ruy = (row_y >> unit_size_log2);
     // Merge last restoration unit if its height is < half_unit_size
     if (ruy > 0) ruy -= (ruy << unit_size_log2) + half_unit_size > h;
 
-    // The first stripe of the frame is shorter by 8 luma pixel rows.
-    const int filter_h =
-        imin(((1 << (6 + f->seq_hdr.sb128)) - 8 * !y) >> ss_ver, h - y);
-
     pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
 
     int unit_w = unit_size, bit = 0;
 
     enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) |
                              (row_h < h ? LR_HAVE_BOTTOM : 0);
 
     for (int x = 0, rux = 0; x < w; x+= unit_w, rux++, edges |= LR_HAVE_LEFT, bit ^= 1) {
@@ -243,17 +239,17 @@ static void lr_sbrow(const Dav1dFrameCon
         const int unit_idx = ((ruy & 16) >> 3) + ((rux & 16) >> 4);
         const Av1RestorationUnit *const lr =
             &f->lf.mask[(((ruy << (unit_size_log2)) >> shift_ver) * f->sb128w) +
                         (x >> shift_hor)].lr[plane][unit_idx];
 
         // FIXME Don't backup if the next restoration unit is RESTORE_NONE
         // This also requires not restoring in the same conditions.
         if (edges & LR_HAVE_RIGHT) {
-            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, filter_h);
+            backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, row_h - y);
         }
         if (lr->type != RESTORATION_NONE) {
             lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
         }
         p += unit_w;
     }
 }
 
--- a/third_party/dav1d/src/mc.h
+++ b/third_party/dav1d/src/mc.h
@@ -36,27 +36,38 @@
 #include "src/levels.h"
 
 #define decl_mc_fn(name) \
 void (name)(pixel *dst, ptrdiff_t dst_stride, \
             const pixel *src, ptrdiff_t src_stride, \
             int w, int h, int mx, int my)
 typedef decl_mc_fn(*mc_fn);
 
+#define decl_mc_scaled_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, \
+            const pixel *src, ptrdiff_t src_stride, \
+            int w, int h, int mx, int my, int dx, int dy)
+typedef decl_mc_scaled_fn(*mc_scaled_fn); // function-pointer type: decl_mc_fn's parameters plus the dx, dy steps
+
 #define decl_warp8x8_fn(name) \
 void (name)(pixel *dst, ptrdiff_t dst_stride, \
             const pixel *src, ptrdiff_t src_stride, \
             const int16_t *abcd, int mx, int my)
 typedef decl_warp8x8_fn(*warp8x8_fn);
 
 #define decl_mct_fn(name) \
 void (name)(coef *tmp, const pixel *src, ptrdiff_t src_stride, \
             int w, int h, int mx, int my)
 typedef decl_mct_fn(*mct_fn);
 
+#define decl_mct_scaled_fn(name) \
+void (name)(coef *tmp, const pixel *src, ptrdiff_t src_stride, \
+            int w, int h, int mx, int my, int dx, int dy)
+typedef decl_mct_scaled_fn(*mct_scaled_fn); // function-pointer type: decl_mct_fn's parameters plus the dx, dy steps
+
 #define decl_warp8x8t_fn(name) \
 void (name)(coef *tmp, const ptrdiff_t tmp_stride, \
             const pixel *src, ptrdiff_t src_stride, \
             const int16_t *abcd, int mx, int my)
 typedef decl_warp8x8t_fn(*warp8x8t_fn);
 
 #define decl_avg_fn(name) \
 void (name)(pixel *dst, ptrdiff_t dst_stride, \
@@ -76,31 +87,44 @@ typedef decl_mask_fn(*mask_fn);
 
 #define decl_w_mask_fn(name) \
 void (name)(pixel *dst, ptrdiff_t dst_stride, \
             const coef *tmp1, const coef *tmp2, int w, int h, \
             uint8_t *mask, int sign)
 typedef decl_w_mask_fn(*w_mask_fn);
 
 #define decl_blend_fn(name) \
-void (name)(pixel *dst, ptrdiff_t dst_stride, \
-            const pixel *tmp, ptrdiff_t tmp_stride, int w, int h, \
-            const uint8_t *mask, ptrdiff_t mstride)
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \
+            int w, int h, const uint8_t *mask)
 typedef decl_blend_fn(*blend_fn);
 
+#define decl_blend_dir_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h)
+typedef decl_blend_dir_fn(*blend_dir_fn); // blend signature without a mask argument; type of the blend_v and blend_h members
+
+#define decl_emu_edge_fn(name) \
+void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \
+            pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
+typedef decl_emu_edge_fn(*emu_edge_fn); // NOTE(review): presumably bw/bh = block size, iw/ih = image size, x/y = block position - confirm against the implementation
+
 typedef struct Dav1dMCDSPContext {
     mc_fn mc[N_2D_FILTERS];
+    mc_scaled_fn mc_scaled[N_2D_FILTERS]; // scaled counterparts of mc[], one per 2D filter
     mct_fn mct[N_2D_FILTERS];
+    mct_scaled_fn mct_scaled[N_2D_FILTERS]; // scaled counterparts of mct[], one per 2D filter
     avg_fn avg;
     w_avg_fn w_avg;
     mask_fn mask;
     w_mask_fn w_mask[3 /* 444, 422, 420 */];
     blend_fn blend;
+    blend_dir_fn blend_v; // blend variant that takes no mask argument
+    blend_dir_fn blend_h; // blend variant that takes no mask argument
     warp8x8_fn warp8x8;
     warp8x8t_fn warp8x8t;
+    emu_edge_fn emu_edge; // NOTE(review): presumably edge emulation for out-of-frame references - confirm
 } Dav1dMCDSPContext;
 
 void dav1d_mc_dsp_init_8bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_10bpc(Dav1dMCDSPContext *c);
 
 void dav1d_mc_dsp_init_arm_8bpc(Dav1dMCDSPContext *c);
 void dav1d_mc_dsp_init_arm_10bpc(Dav1dMCDSPContext *c);
 
--- a/third_party/dav1d/src/mc_tmpl.c
+++ b/third_party/dav1d/src/mc_tmpl.c
@@ -22,16 +22,17 @@
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "common/attributes.h"
 #include "common/intops.h"
 
 #include "src/mc.h"
 #include "src/tables.h"
@@ -72,23 +73,29 @@ prep_c(coef *tmp, const pixel *src, cons
      F[7] * src[x + +4 * stride])
 
 #define FILTER_8TAP_RND(src, x, F, stride, sh) \
     ((FILTER_8TAP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
 
 #define FILTER_8TAP_CLIP(src, x, F, stride, sh) \
     iclip_pixel(FILTER_8TAP_RND(src, x, F, stride, sh))
 
+#define GET_H_FILTER(mx) \
+    const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
+        dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
+        dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1] // declares fh (NULL when (mx) is 0); reads w and filter_type from the enclosing scope
+
+#define GET_V_FILTER(my) \
+    const int8_t *const fv = !(my) ? NULL : h > 4 ? \
+        dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \
+        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1] // declares fv (NULL when (my) is 0); reads h and filter_type from the enclosing scope
+
 #define GET_FILTERS() \
-    const int8_t *const fh = !mx ? NULL : w > 4 ? \
-        dav1d_mc_subpel_filters[filter_type & 3][mx - 1] : \
-        dav1d_mc_subpel_filters[3 + (filter_type & 1)][mx - 1]; \
-    const int8_t *const fv = !my ? NULL : h > 4 ? \
-        dav1d_mc_subpel_filters[filter_type >> 2][my - 1] : \
-        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][my - 1]; \
+    GET_H_FILTER(mx); \
+    GET_V_FILTER(my) // declares both fh and fv from the mx and my in scope; no trailing semicolon, the caller supplies it
 
 static NOINLINE void
 put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
            const pixel *src, ptrdiff_t src_stride,
            const int w, int h, const int mx, const int my,
            const int filter_type)
 {
     GET_FILTERS();
@@ -136,16 +143,59 @@ put_8tap_c(pixel *dst, ptrdiff_t dst_str
             dst += dst_stride;
             src += src_stride;
         } while (--h);
     } else
         put_c(dst, dst_stride, src, src_stride, w, h);
 }
 
 static NOINLINE void
+put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
+                  const pixel *src, ptrdiff_t src_stride,
+                  const int w, int h, const int mx, int my,
+                  const int dx, const int dy, const int filter_type)
+{ // two-pass scaled 8-tap put: horizontal pass from src into mid, then vertical pass from mid into dst
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 8; // rows of mid to fill: whole-row offset of the last output row, plus 8
+    coef mid[128 * (256 + 7)], *mid_ptr = mid; // intermediate rows, stored 128 entries apart
+    src_stride = PXSTRIDE(src_stride);
+
+    src -= src_stride * 3; // start 3 rows above the block
+    do {
+        int x;
+        int imx = mx, ioff = 0; // imx: fractional x position (10 bits kept), ioff: whole-pixel offset into src
+
+        for (x = 0; x < w; x++) {
+            GET_H_FILTER(imx >> 6); // declares fh for this pixel; NULL when imx >> 6 is 0
+            mid_ptr[x] = fh ? FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4; // no filter: plain sample scaled by 16
+            imx += dx; // advance by the horizontal step
+            ioff += imx >> 10; // carry whole pixels into the integer offset
+            imx &= 0x3ff; // keep only the fractional part
+        }
+
+        mid_ptr += 128;
+        src += src_stride;
+    } while (--tmp_h);
+
+    mid_ptr = mid + 128 * 3; // skip the 3 leading rows produced by the src rewind above
+    for (int y = 0; y < h; y++) {
+        int x;
+        GET_V_FILTER(my >> 6); // declares fv for this row; NULL when my >> 6 is 0
+
+        for (x = 0; x < w; x++)
+            dst[x] = fv ? FILTER_8TAP_CLIP(mid_ptr, x, fv, 128, 10) :
+                          iclip_pixel((mid_ptr[x] + 8) >> 4); // no filter: rounded shift by 4, then clip
+
+        my += dy; // advance by the vertical step
+        mid_ptr += (my >> 10) * 128; // move down by the whole rows accumulated in my
+        my &= 0x3ff; // keep only the fractional part
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
+static NOINLINE void
 prep_8tap_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
             const int w, int h, const int mx, const int my,
             const int filter_type)
 {
     GET_FILTERS();
     src_stride = PXSTRIDE(src_stride);
 
     if (fh) {
@@ -186,49 +236,111 @@ prep_8tap_c(coef *tmp, const pixel *src,
 
             tmp += w;
             src += src_stride;
         } while (--h);
     } else
         prep_c(tmp, src, src_stride, w, h);
 }
 
+static NOINLINE void
+prep_8tap_scaled_c(coef *tmp, const pixel *src, ptrdiff_t src_stride,
+                   const int w, int h, const int mx, int my,
+                   const int dx, const int dy, const int filter_type)
+{ // two-pass scaled 8-tap prep: horizontal pass from src into mid, then vertical pass from mid into tmp
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 8; // rows of mid to fill: whole-row offset of the last output row, plus 8
+    coef mid[128 * (256 + 7)], *mid_ptr = mid; // intermediate rows, stored 128 entries apart
+    src_stride = PXSTRIDE(src_stride);
+
+    src -= src_stride * 3; // start 3 rows above the block
+    do {
+        int x;
+        int imx = mx, ioff = 0; // imx: fractional x position (10 bits kept), ioff: whole-pixel offset into src
+
+        for (x = 0; x < w; x++) {
+            GET_H_FILTER(imx >> 6); // declares fh for this pixel; NULL when imx >> 6 is 0
+            mid_ptr[x] = fh ? FILTER_8TAP_RND(src, ioff, fh, 1, 2) : src[ioff] << 4; // no filter: plain sample scaled by 16
+            imx += dx; // advance by the horizontal step
+            ioff += imx >> 10; // carry whole pixels into the integer offset
+            imx &= 0x3ff; // keep only the fractional part
+        }
+
+        mid_ptr += 128;
+        src += src_stride;
+    } while (--tmp_h);
+
+    mid_ptr = mid + 128 * 3; // skip the 3 leading rows produced by the src rewind above
+    for (int y = 0; y < h; y++) {
+        int x;
+        GET_V_FILTER(my >> 6); // declares fv for this row; NULL when my >> 6 is 0
+
+        for (x = 0; x < w; x++)
+            tmp[x] = fv ? FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) : mid_ptr[x]; // no filter: the mid value is stored unchanged
+
+        my += dy; // advance by the vertical step
+        mid_ptr += (my >> 10) * 128; // move down by the whole rows accumulated in my
+        my &= 0x3ff; // keep only the fractional part
+        tmp += w; // output rows are packed w entries apart
+    }
+}
+
 #define filter_fns(type, type_h, type_v) \
 static void put_8tap_##type##_c(pixel *const dst, \
                                 const ptrdiff_t dst_stride, \
                                 const pixel *const src, \
                                 const ptrdiff_t src_stride, \
                                 const int w, const int h, \
                                 const int mx, const int my) \
 { \
     put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
                type_h | (type_v << 2)); \
 } \
+static void put_8tap_##type##_scaled_c(pixel *const dst, \
+                                       const ptrdiff_t dst_stride, \
+                                       const pixel *const src, \
+                                       const ptrdiff_t src_stride, \
+                                       const int w, const int h, \
+                                       const int mx, const int my, \
+                                       const int dx, const int dy) \
+{ \
+    put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
+                      type_h | (type_v << 2)); \
+} \
 static void prep_8tap_##type##_c(coef *const tmp, \
                                  const pixel *const src, \
                                  const ptrdiff_t src_stride, \
                                  const int w, const int h, \
                                  const int mx, const int my) \
 { \
     prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
                 type_h | (type_v << 2)); \
+} \
+static void prep_8tap_##type##_scaled_c(coef *const tmp, \
+                                        const pixel *const src, \
+                                        const ptrdiff_t src_stride, \
+                                        const int w, const int h, \
+                                        const int mx, const int my, \
+                                        const int dx, const int dy) \
+{ \
+    prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \
+                       type_h | (type_v << 2)); \
 }
 
 filter_fns(regular,        FILTER_8TAP_REGULAR, FILTER_8TAP_REGULAR)
 filter_fns(regular_sharp,  FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP)
 filter_fns(regular_smooth, FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH)
 filter_fns(smooth,         FILTER_8TAP_SMOOTH,  FILTER_8TAP_SMOOTH)
 filter_fns(smooth_regular, FILTER_8TAP_SMOOTH,  FILTER_8TAP_REGULAR)
 filter_fns(smooth_sharp,   FILTER_8TAP_SMOOTH,  FILTER_8TAP_SHARP)
 filter_fns(sharp,          FILTER_8TAP_SHARP,   FILTER_8TAP_SHARP)
 filter_fns(sharp_regular,  FILTER_8TAP_SHARP,   FILTER_8TAP_REGULAR)
 filter_fns(sharp_smooth,   FILTER_8TAP_SHARP,   FILTER_8TAP_SMOOTH)
 
 #define FILTER_BILIN(src, x, mxy, stride) \
-    (16 * src[x] + (mxy * (src[x + stride] - src[x])))
+    (16 * src[x] + ((mxy) * (src[x + stride] - src[x])))
 
 #define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
     ((FILTER_BILIN(src, x, mxy, stride) + ((1 << sh) >> 1)) >> sh)
 
 #define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
     iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
 
 static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
@@ -275,16 +387,53 @@ static void put_bilin_c(pixel *dst, ptrd
 
             dst += dst_stride;
             src += src_stride;
         } while (--h);
     } else
         put_c(dst, dst_stride, src, src_stride, w, h);
 }
 
+static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
+                               const pixel *src, ptrdiff_t src_stride,
+                               const int w, int h, const int mx, int my,
+                               const int dx, const int dy)
+{ // two-pass scaled bilinear put: horizontal pass from src into mid, then vertical pass from mid into dst
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 2; // rows of mid to fill: whole-row offset of the last output row, plus 2
+    coef mid[128 * (256 + 1)], *mid_ptr = mid; // intermediate rows, stored 128 entries apart
+
+    do {
+        int x;
+        int imx = mx, ioff = 0; // imx: fractional x position (10 bits kept), ioff: whole-pixel offset into src
+
+        for (x = 0; x < w; x++) {
+            mid_ptr[x] = FILTER_BILIN(src, ioff, imx >> 6, 1); // blends src[ioff] and src[ioff + 1] with weight imx >> 6
+            imx += dx; // advance by the horizontal step
+            ioff += imx >> 10; // carry whole pixels into the integer offset
+            imx &= 0x3ff; // keep only the fractional part
+        }
+
+        mid_ptr += 128;
+        src += PXSTRIDE(src_stride);
+    } while (--tmp_h);
+
+    mid_ptr = mid; // rewind to the first intermediate row
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my >> 6, 128, 8); // blends this mid row with the next one, rounds by 8 bits, clips
+
+        my += dy; // advance by the vertical step
+        mid_ptr += (my >> 10) * 128; // move down by the whole rows accumulated in my
+        my &= 0x3ff; // keep only the fractional part
+        dst += PXSTRIDE(dst_stride);
+    } while (--h);
+}
+
 static void prep_bilin_c(coef *tmp,
                          const pixel *src, ptrdiff_t src_stride,
                          const int w, int h, const int mx, const int my)
 {
     src_stride = PXSTRIDE(src_stride);
 
     if (mx) {
         if (my) {
@@ -323,16 +472,53 @@ static void prep_bilin_c(coef *tmp,
 
             tmp += w;
             src += src_stride;
         } while (--h);
     } else
         prep_c(tmp, src, src_stride, w, h);
 }
 
+static void prep_bilin_scaled_c(coef *tmp,
+                                const pixel *src, ptrdiff_t src_stride,
+                                const int w, int h, const int mx, int my,
+                                const int dx, const int dy)
+{ // two-pass scaled bilinear prep: horizontal pass from src into mid, then vertical pass from mid into tmp
+    int tmp_h = (((h - 1) * dy + my) >> 10) + 2; // rows of mid to fill: whole-row offset of the last output row, plus 2
+    coef mid[128 * (256 + 1)], *mid_ptr = mid; // intermediate rows, stored 128 entries apart
+
+    do {
+        int x;
+        int imx = mx, ioff = 0; // imx: fractional x position (10 bits kept), ioff: whole-pixel offset into src
+
+        for (x = 0; x < w; x++) {
+            mid_ptr[x] = FILTER_BILIN(src, ioff, imx >> 6, 1); // blends src[ioff] and src[ioff + 1] with weight imx >> 6
+            imx += dx; // advance by the horizontal step
+            ioff += imx >> 10; // carry whole pixels into the integer offset
+            imx &= 0x3ff; // keep only the fractional part
+        }
+
+        mid_ptr += 128;
+        src += PXSTRIDE(src_stride);
+    } while (--tmp_h);
+
+    mid_ptr = mid; // rewind to the first intermediate row
+    do {
+        int x;
+
+        for (x = 0; x < w; x++)
+            tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my >> 6, 128, 4); // blends this mid row with the next one, rounds by 4 bits, no clip
+
+        my += dy; // advance by the vertical step
+        mid_ptr += (my >> 10) * 128; // move down by the whole rows accumulated in my
+        my &= 0x3ff; // keep only the fractional part
+        tmp += w; // output rows are packed w entries apart
+    } while (--h);
+}
+
 static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
                   const coef *tmp1, const coef *tmp2, const int w, int h)
 {
     do {
         for (int x = 0; x < w; x++)
             dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + 16) >> 5);
 
         tmp1 += w;
@@ -367,30 +553,56 @@ static void mask_c(pixel *dst, const ptr
 
         tmp1 += w;
         tmp2 += w;
         mask += w;
         dst += PXSTRIDE(dst_stride);
     } while (--h);
 }
 
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const pixel *tmp, const ptrdiff_t tmp_stride,
-                    const int w, const int h,
-                    const uint8_t *mask, const ptrdiff_t m_stride)
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6) // mix of a and b with weight m out of 64, rounded; arguments are not parenthesized, so pass simple expressions
+static NOINLINE void
+blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                 const int w, int h, const uint8_t *mask,
+                 const ptrdiff_t mask_stride)
 {
-    for (int y = 0; y < h; y++) {
+    do {
         for (int x = 0; x < w; x++) {
-#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
-            dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
+            dst[x] = blend_px(dst[x], tmp[x], mask[x]); // blends tmp into dst in place using the per-column mask entry
         }
         dst += PXSTRIDE(dst_stride);
-        tmp += PXSTRIDE(tmp_stride);
-        mask += m_stride;
-    }
+        tmp += w; // tmp rows are packed w pixels apart
+        mask += mask_stride; // a mask_stride of 0 reuses the same mask row for every line
+    } while (--h); // do-while: the body runs at least once, so h is expected to be >= 1
+}
+
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                    const int w, const int h, const uint8_t *mask)
+{ // blends tmp into dst with a caller-supplied mask
+    blend_internal_c(dst, dst_stride, tmp, w, h, mask, w); // mask stride w: one mask entry per pixel, rows w entries apart
+}
+
+static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, const int h)
+{ // blends tmp into dst with weights that vary per column only
+    blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0); // mask starts at dav1d_obmc_masks[w]; stride 0 reuses it for every row
+}
+
+static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, int h)
+{ // blends tmp into dst with one weight per row
+    const uint8_t *mask = &dav1d_obmc_masks[h]; // per-row weights start at index h of the table
+    do {
+        const int m = *mask++; // weight for this row; the next row uses the next table entry
+        for (int x = 0; x < w; x++) {
+            dst[x] = blend_px(dst[x], tmp[x], m);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += w; // tmp rows are packed w pixels apart
+    } while (--h); // do-while: the body runs at least once, so h is expected to be >= 1
 }
 
 static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
                      const coef *tmp1, const coef *tmp2, const int w, int h,
                      uint8_t *mask, const int sign,
                      const int ss_hor, const int ss_ver)
 {
     // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
@@ -438,41 +650,57 @@ static void w_mask_##ssn##_c(pixel *cons
 }
 
 w_mask_fns(444, 0, 0);
 w_mask_fns(422, 1, 0);
 w_mask_fns(420, 1, 1);
 
 #undef w_mask_fns
 
+#define FILTER_WARP(src, x, F, stride) \
+    (F[0] * src[x + -3 * stride] + \
+     F[4] * src[x + -2 * stride] + \
+     F[1] * src[x + -1 * stride] + \
+     F[5] * src[x + +0 * stride] + \
+     F[2] * src[x + +1 * stride] + \
+     F[6] * src[x + +2 * stride] + \
+     F[3] * src[x + +3 * stride] + \
+     F[7] * src[x + +4 * stride])
+
+#define FILTER_WARP_RND(src, x, F, stride, sh) \
+    ((FILTER_WARP(src, x, F, stride) + ((1 << sh) >> 1)) >> sh)
+
+#define FILTER_WARP_CLIP(src, x, F, stride, sh) \
+    iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))
+
 static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
                               const pixel *src, const ptrdiff_t src_stride,
                               const int16_t *const abcd, int mx, int my)
 {
     coef mid[15 * 8], *mid_ptr = mid;
 
     src -= 3 * PXSTRIDE(src_stride);
     for (int y = 0; y < 15; y++, mx += abcd[1]) {
         for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
             const int8_t *const filter =
                 dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
 
-            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
+            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, 3);
         }
         src += PXSTRIDE(src_stride);
         mid_ptr += 8;
     }
 
     mid_ptr = &mid[3 * 8];
     for (int y = 0; y < 8; y++, my += abcd[3]) {
         for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
             const int8_t *const filter =
                 dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
 
-            dst[x] = FILTER_8TAP_CLIP(mid_ptr, x, filter, 8, 11);
+            dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8, 11);
         }
         mid_ptr += 8;
         dst += PXSTRIDE(dst_stride);
     }
 }
 
 static void warp_affine_8x8t_c(coef *tmp, const ptrdiff_t tmp_stride,
                                const pixel *src, const ptrdiff_t src_stride,
@@ -481,39 +709,90 @@ static void warp_affine_8x8t_c(coef *tmp
     coef mid[15 * 8], *mid_ptr = mid;
 
     src -= 3 * PXSTRIDE(src_stride);
     for (int y = 0; y < 15; y++, mx += abcd[1]) {
         for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
             const int8_t *const filter =
                 dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
 
-            mid_ptr[x] = FILTER_8TAP_RND(src, x, filter, 1, 3);
+            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1, 3);
         }
         src += PXSTRIDE(src_stride);
         mid_ptr += 8;
     }
 
     mid_ptr = &mid[3 * 8];
     for (int y = 0; y < 8; y++, my += abcd[3]) {
         for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
             const int8_t *const filter =
                 dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
 
-            tmp[x] = FILTER_8TAP_RND(mid_ptr, x, filter, 8, 7);
+            tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7);
         }
         mid_ptr += 8;
         tmp += tmp_stride;
     }
 }
 
+static void emu_edge_c(const intptr_t bw, const intptr_t bh,
+                       const intptr_t iw, const intptr_t ih,
+                       const intptr_t x, const intptr_t y,
+                       pixel *dst, const ptrdiff_t dst_stride,
+                       const pixel *ref, const ptrdiff_t ref_stride)
+{
+    // find offset in reference of visible block to copy
+    ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
+
+    // number of pixels to extend (left, right, top, bottom)
+    const int left_ext = iclip(-x, 0, bw - 1);
+    const int right_ext = iclip(x + bw - iw, 0, bw - 1);
+    assert(left_ext + right_ext < bw);
+    const int top_ext = iclip(-y, 0, bh - 1);
+    const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
+    assert(top_ext + bottom_ext < bh);
+
+    // copy visible portion first
+    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
+    const int center_w = bw - left_ext - right_ext;
+    const int center_h = bh - top_ext - bottom_ext;
+    for (int y = 0; y < center_h; y++) {
+        pixel_copy(blk + left_ext, ref, center_w);
+        // extend left edge for this line
+        if (left_ext)
+            pixel_set(blk, blk[left_ext], left_ext);
+        // extend right edge for this line
+        if (right_ext)
+            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
+                      right_ext);
+        ref += PXSTRIDE(ref_stride);
+        blk += PXSTRIDE(dst_stride);
+    }
+
+    // copy top
+    blk = dst + top_ext * PXSTRIDE(dst_stride);
+    for (int y = 0; y < top_ext; y++) {
+        pixel_copy(dst, blk, bw);
+        dst += PXSTRIDE(dst_stride);
+    }
+
+    // copy bottom
+    dst += center_h * PXSTRIDE(dst_stride);
+    for (int y = 0; y < bottom_ext; y++) {
+        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
+        dst += PXSTRIDE(dst_stride);
+    }
+}
+
 void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
 #define init_mc_fns(type, name) do { \
-    c->mc [type] = put_##name##_c; \
-    c->mct[type] = prep_##name##_c; \
+    c->mc        [type] = put_##name##_c; \
+    c->mc_scaled [type] = put_##name##_scaled_c; \
+    c->mct       [type] = prep_##name##_c; \
+    c->mct_scaled[type] = prep_##name##_scaled_c; \
 } while (0)
 
     init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
     init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
     init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
     init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
     init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
     init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
@@ -521,21 +800,24 @@ void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSP
     init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
     init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
     init_mc_fns(FILTER_2D_BILINEAR,            bilin);
 
     c->avg      = avg_c;
     c->w_avg    = w_avg_c;
     c->mask     = mask_c;
     c->blend    = blend_c;
+    c->blend_v  = blend_v_c;
+    c->blend_h  = blend_h_c;
     c->w_mask[0] = w_mask_444_c;
     c->w_mask[1] = w_mask_422_c;
     c->w_mask[2] = w_mask_420_c;
     c->warp8x8  = warp_affine_8x8_c;
     c->warp8x8t = warp_affine_8x8t_c;
+    c->emu_edge = emu_edge_c;
 
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
     bitfn(dav1d_mc_dsp_init_arm)(c);
 #elif ARCH_X86
     bitfn(dav1d_mc_dsp_init_x86)(c);
 #endif
 #endif
--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@@ -78,43 +78,45 @@ libdav1d_nasm_objs = []
 if is_asm_enabled
     if (host_machine.cpu_family() == 'aarch64' or
         host_machine.cpu_family().startswith('arm'))
 
         libdav1d_sources += files(
             'arm/cpu.c',
         )
         libdav1d_tmpl_sources += files(
-            'arm/mc_init.c',
+            'arm/mc_init_tmpl.c',
         )
         if host_machine.cpu_family() == 'aarch64'
-            libdav1d_tmpl_sources += files(
+            libdav1d_sources += files(
                 'arm/64/mc.S',
             )
         elif host_machine.cpu_family().startswith('arm')
-            libdav1d_tmpl_sources += files(
+            libdav1d_sources += files(
                 'arm/32/mc.S',
             )
         endif
     elif host_machine.cpu_family().startswith('x86')
 
         libdav1d_sources += files(
             'x86/cpu.c',
         )
 
         libdav1d_tmpl_sources += files(
-            'x86/ipred_init.c',
-            'x86/itx_init.c',
-            'x86/loopfilter_init.c',
-            'x86/looprestoration_init.c',
-            'x86/mc_init.c',
+            'x86/cdef_init_tmpl.c',
+            'x86/ipred_init_tmpl.c',
+            'x86/itx_init_tmpl.c',
+            'x86/loopfilter_init_tmpl.c',
+            'x86/looprestoration_init_tmpl.c',
+            'x86/mc_init_tmpl.c',
         )
 
         # NASM source files
         libdav1d_sources_asm = files(
+            'x86/cdef.asm',
             'x86/cpuid.asm',
             'x86/ipred.asm',
             'x86/itx.asm',
             'x86/loopfilter.asm',
             'x86/looprestoration.asm',
             'x86/mc.asm',
         )
 
@@ -124,17 +126,17 @@ if is_asm_enabled
 endif
 
 
 
 #
 # Windows .rc file
 #
 
-if host_machine.system() == 'windows'
+if host_machine.system() == 'windows' and get_option('default_library') != 'static'
     winmod = import('windows')
     rc_data = configuration_data()
     rc_data.set('VERSION_MAJOR', dav1d_version_major)
     rc_data.set('VERSION_MINOR', dav1d_version_minor)
     rc_data.set('VERSION_REVISION', dav1d_version_revision)
     rc_data.set('VERSION_EXTRA', '0')
     rc_data.set('COPYRIGHT_YEARS', '2018')
 
--- a/third_party/dav1d/src/msac.c
+++ b/third_party/dav1d/src/msac.c
@@ -1,318 +1,193 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
 #include <assert.h>
 #include <limits.h>
 
 #include "common/intops.h"
 
 #include "src/msac.h"
 
-typedef MsacContext od_ec_dec;
-
-//#define CDF_SIZE(x) ((x) + 1)
-#define CDF_PROB_BITS 15
-#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
-//#define CDF_INIT_TOP 32768
-#define CDF_SHIFT (15 - CDF_PROB_BITS)
-
-#define OD_CLZ0 (1)
-#define OD_CLZ(x) (-get_msb(x))
-#define OD_ILOG_NZ(x) (OD_CLZ0 - OD_CLZ(x))
-
-static inline int get_msb(unsigned int n) {
-    assert(n != 0);
-    return 31 ^ clz(n);
-}
-
-#define EC_PROB_SHIFT 6
 #define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
 
-/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
- on a larger type, you can speed up the decoder by using it here.*/
-typedef uint32_t od_ec_window;
-
-#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
-
-/*The resolution of fractional-precision bit usage measurements, i.e.,
- 3 => 1/8th bits.*/
-#define OD_BITRES (3)
-
-#define OD_ICDF AOM_ICDF
-
-#define AOM_ICDF(a) (32768-(a))
-
-/*A range decoder.
-  This is an entropy decoder based upon \cite{Mar79}, which is itself a
-   rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
-  It is very similar to arithmetic encoding, except that encoding is done with
-   digits in any base, instead of with bits, and so it is faster when using
-   larger bases (i.e.: a byte).
-  The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
-   is the base, longer than the theoretical optimum, but to my knowledge there
-   is no published justification for this claim.
-  This only seems true when using near-infinite precision arithmetic so that
-   the process is carried out with no rounding errors.
-
-  An excellent description of implementation details is available at
-   http://www.arturocampos.com/ac_range.html
-  A recent work \cite{MNW98} which proposes several changes to arithmetic
-   encoding for efficiency actually re-discovers many of the principles
-   behind range encoding, and presents a good theoretical analysis of them.
+#define EC_WIN_SIZE (sizeof(ec_win) << 3)
 
-  End of stream is handled by writing out the smallest number of bits that
-   ensures that the stream will be correctly decoded regardless of the value of
-   any subsequent bits.
-  od_ec_dec_tell() can be used to determine how many bits were needed to decode
-   all the symbols thus far; other data can be packed in the remaining bits of
-   the input buffer.
-  @PHDTHESIS{Pas76,
-    author="Richard Clark Pasco",
-    title="Source coding algorithms for fast data compression",
-    school="Dept. of Electrical Engineering, Stanford University",
-    address="Stanford, CA",
-    month=May,
-    year=1976,
-    URL="http://www.richpasco.org/scaffdc.pdf"
-  }
-  @INPROCEEDINGS{Mar79,
-   author="Martin, G.N.N.",
-   title="Range encoding: an algorithm for removing redundancy from a digitised
-    message",
-   booktitle="Video & Data Recording Conference",
-   year=1979,
-   address="Southampton",
-   month=Jul,
-   URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
-  }
-  @ARTICLE{MNW98,
-   author="Alistair Moffat and Radford Neal and Ian H. Witten",
-   title="Arithmetic Coding Revisited",
-   journal="{ACM} Transactions on Information Systems",
-   year=1998,
-   volume=16,
-   number=3,
-   pages="256--294",
-   month=Jul,
-   URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
-  }*/
+static inline void ctx_refill(MsacContext *s) {
+    const uint8_t *buf_pos = s->buf_pos;
+    const uint8_t *buf_end = s->buf_end;
+    int c = EC_WIN_SIZE - s->cnt - 24;
+    ec_win dif = s->dif;
+    while (c >= 0 && buf_pos < buf_end) {
+        dif ^= ((ec_win)*buf_pos++) << c;
+        c -= 8;
+    }
+    s->dif = dif;
+    s->cnt = EC_WIN_SIZE - c - 24;
+    s->buf_pos = buf_pos;
+}
 
-/*This is meant to be a large, positive constant that can still be efficiently
-   loaded as an immediate (on platforms like ARM, for example).
-  Even relatively modest values like 100 would work fine.*/
-#define OD_EC_LOTS_OF_BITS (0x4000)
-
-static void od_ec_dec_refill(od_ec_dec *dec) {
-  int s;
-  od_ec_window dif;
-  int16_t cnt;
-  const unsigned char *bptr;
-  const unsigned char *end;
-  dif = dec->dif;
-  cnt = dec->cnt;
-  bptr = dec->bptr;
-  end = dec->end;
-  s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
-  for (; s >= 0 && bptr < end; s -= 8, bptr++) {
-    assert(s <= OD_EC_WINDOW_SIZE - 8);
-    dif ^= (od_ec_window)bptr[0] << s;
-    cnt += 8;
-  }
-  if (bptr >= end) {
-    dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
-    cnt = OD_EC_LOTS_OF_BITS;
-  }
-  dec->dif = dif;
-  dec->cnt = cnt;
-  dec->bptr = bptr;
+/* Takes updated dif and range values, renormalizes them so that
+ * 32768 <= rng < 65536 (reading more bytes from the stream into dif if
+ * necessary), and stores them back in the decoder context.
+ * dif: The new value of dif.
+ * rng: The new value of the range. */
+static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
+    const uint16_t d = 15 - (31 ^ clz(rng));
+    assert(rng <= 65535U);
+    s->cnt -= d;
+    s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
+    s->rng = rng << d;
+    if (s->cnt < 0)
+        ctx_refill(s);
 }
 
-/*Takes updated dif and range values, renormalizes them so that
-   32768 <= rng < 65536 (reading more bytes from the stream into dif if
-   necessary), and stores them back in the decoder context.
-  dif: The new value of dif.
-  rng: The new value of the range.
-  ret: The value to return.
-  Return: ret.
-          This allows the compiler to jump to this function via a tail-call.*/
-static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
-                               int ret) {
-  int d;
-  assert(rng <= 65535U);
-  d = 16 - OD_ILOG_NZ(rng);
-  dec->cnt -= d;
-  /*This is equivalent to shifting in 1's instead of 0's.*/
-  dec->dif = ((dif + 1) << d) - 1;
-  dec->rng = rng << d;
-  if (dec->cnt < 0) od_ec_dec_refill(dec);
-  return ret;
-}
+/* Decodes a symbol given an inverse cumulative distribution function (CDF)
+ * table in Q15. */
+unsigned msac_decode_symbol(MsacContext *const s, const uint16_t *const cdf,
+                            const unsigned n_symbols)
+{
+    ec_win u, v = s->rng, r = s->rng >> 8;
+    const ec_win c = s->dif >> (EC_WIN_SIZE - 16);
+    unsigned ret = 0;
+
+    assert(!cdf[n_symbols - 1]);
 
-/*Initializes the decoder.
-  buf: The input buffer to use.
-  Return: 0 on success, or a negative value on error.*/
-static void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
-                    uint32_t storage) {
-  dec->buf = buf;
-  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
-  dec->end = buf + storage;
-  dec->bptr = buf;
-  dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
-  dec->rng = 0x8000;
-  dec->cnt = -15;
-  dec->error = 0;
-  od_ec_dec_refill(dec);
+    do {
+        u = v;
+        v = r * (cdf[ret++] >> EC_PROB_SHIFT);
+        v >>= 7 - EC_PROB_SHIFT;
+        v += EC_MIN_PROB * (n_symbols - ret);
+    } while (c < v);
+
+    assert(u <= s->rng);
+
+    ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), u - v);
+    return ret - 1;
 }
 
-/*Decode a single binary value.
-  f: The probability that the bit is one, scaled by 32768.
-  Return: The value decoded (0 or 1).*/
-static int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
-  od_ec_window dif;
-  od_ec_window vw;
-  unsigned r;
-  unsigned r_new;
-  unsigned v;
-  int ret;
-  assert(0 < f);
-  assert(f < 32768U);
-  dif = dec->dif;
-  r = dec->rng;
-  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  assert(32768U <= r);
-  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
-  v += EC_MIN_PROB;
-  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
-  ret = 1;
-  r_new = v;
-  if (dif >= vw) {
-    r_new = r - v;
-    dif -= vw;
-    ret = 0;
-  }
-  return od_ec_dec_normalize(dec, dif, r_new, ret);
-}
-
-/*Decodes a symbol given an inverse cumulative distribution function (CDF)
-   table in Q15.
-  icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
-         [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
-        The values must be monotonically non-increasing, and icdf[nsyms - 1]
-         must be 0.
-  nsyms: The number of symbols in the alphabet.
-         This should be at most 16.
-  Return: The decoded symbol s.*/
-static int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
-  od_ec_window dif;
-  unsigned r;
-  unsigned c;
-  unsigned u;
-  unsigned v;
-  int ret;
-  (void)nsyms;
-  dif = dec->dif;
-  r = dec->rng;
-  const int N = nsyms - 1;
-
-  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
-  assert(32768U <= r);
-  assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
-  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
-  v = r;
-  ret = -1;
-  do {
-    u = v;
-    v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
-         (7 - EC_PROB_SHIFT - CDF_SHIFT));
-    v += EC_MIN_PROB * (N - ret);
-  } while (c < v);
-  assert(v < u);
-  assert(u <= r);
-  r = u - v;
-  dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
-  return od_ec_dec_normalize(dec, dif, r, ret);
-}
-
-void msac_init(MsacContext *const c,
-               const uint8_t *const data, const size_t sz)
-{
-    od_ec_dec_init(c, data, sz);
-}
-
-unsigned msac_decode_symbol(MsacContext *const c, const uint16_t *const cdf,
-                            const unsigned n_symbols)
-{
-    return od_ec_decode_cdf_q15(c, cdf, n_symbols);
-}
-
-unsigned msac_decode_bool(MsacContext *const c, const unsigned cdf) {
-    return od_ec_decode_bool_q15(c, cdf);
+/* Decode a single binary value.
+ * f: The probability that the bit is one
+ * Return: The value decoded (0 or 1). */
+unsigned msac_decode_bool(MsacContext *const s, const unsigned f) {
+    ec_win v, vw, dif = s->dif;
+    uint16_t r = s->rng;
+    unsigned ret;
+    assert((dif >> (EC_WIN_SIZE - 16)) < r);
+    v = ((r >> 8) * f >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
+    vw   = v << (EC_WIN_SIZE - 16);
+    ret  = dif >= vw;
+    dif -= ret*vw;
+    v   += ret*(r - 2*v);
+    ctx_norm(s, dif, v);
+    return !ret;
 }
 
 unsigned msac_decode_bools(MsacContext *const c, const unsigned l) {
     int v = 0;
     for (int n = (int) l - 1; n >= 0; n--)
-        v = (v << 1) | msac_decode_bool(c, 128 << 7);
+        v = (v << 1) | msac_decode_bool(c, EC_BOOL_EPROB);
     return v;
 }
 
 int msac_decode_subexp(MsacContext *const c, const int ref,
-                       const unsigned n, const unsigned k)
+                       const int n, const unsigned k)
 {
     int i = 0;
     int a = 0;
     int b = k;
-    while ((2U << b) < n) {
-        if (!msac_decode_bool(c, 128 << 7)) break;
+    while ((2 << b) < n) {
+        if (!msac_decode_bool(c, EC_BOOL_EPROB)) break;
         b = k + i++;
         a = (1 << b);
     }
     const unsigned v = msac_decode_bools(c, b) + a;
-    return ref * 2U <= n ? inv_recenter(ref, v) :
-                           n - 1 - inv_recenter(n - 1 - ref, v);
+    return ref * 2 <= n ? inv_recenter(ref, v) :
+                          n - 1 - inv_recenter(n - 1 - ref, v);
 }
 
 int msac_decode_uniform(MsacContext *const c, const unsigned n) {
     assert(n > 0);
     const int l = ulog2(n) + 1;
     assert(l > 1);
-    const unsigned m = (1U << l) - n;
+    const unsigned m = (1 << l) - n;
     const unsigned v = msac_decode_bools(c, l - 1);
-    return v < m ? v : (v << 1) - m + msac_decode_bool(c, 128 << 7);
+    return v < m ? v : (v << 1) - m + msac_decode_bool(c, EC_BOOL_EPROB);
+}
+
+static void update_cdf(uint16_t *const cdf, const unsigned val,
+                       const unsigned n_symbols)
+{
+    const unsigned count = cdf[n_symbols];
+    const int rate = ((count >> 4) | 4) + (n_symbols > 3);
+    unsigned i;
+    for (i = 0; i < val; i++)
+        cdf[i] += (32768 - cdf[i]) >> rate;
+    for (; i < n_symbols - 1; i++)
+        cdf[i] -= cdf[i] >> rate;
+    cdf[n_symbols] = count + (count < 32);
+}
+
+unsigned msac_decode_symbol_adapt(MsacContext *const c,
+                                  uint16_t *const cdf, const unsigned n_symbols)
+{
+    const unsigned val = msac_decode_symbol(c, cdf, n_symbols);
+    if(c->allow_update_cdf)
+        update_cdf(cdf, val, n_symbols);
+    return val;
 }
 
-void update_cdf(uint16_t *cdf, unsigned val, unsigned nsymbs) {
-    int rate;
-    unsigned i, tmp;
+unsigned msac_decode_bool_adapt(MsacContext *const c, uint16_t *const cdf) {
+    const unsigned bit = msac_decode_bool(c, *cdf >> EC_PROB_SHIFT);
 
-    static const int nsymbs2speed[17] = {
-        0, 0, 1, 1, 2, 2, 2, 2, 2,
-        2, 2, 2, 2, 2, 2, 2, 2
-    };
-    assert(nsymbs < 17);
-    rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) + nsymbs2speed[nsymbs];
-    tmp = 32768U;
-
-    // Single loop (faster)
-    for (i = 0; i < nsymbs - 1; ++i) {
-        tmp = (i == val) ? 0 : tmp;
-        if (tmp < cdf[i]) {
-            cdf[i] -= ((cdf[i] - tmp) >> rate);
+    if(c->allow_update_cdf){
+        // update_cdf() specialized for boolean CDFs
+        const unsigned count = cdf[1];
+        const int rate = (count >> 4) | 4;
+        if (bit) {
+            cdf[0] += (32768 - cdf[0]) >> rate;
         } else {
-            cdf[i] += ((tmp - cdf[i]) >> rate);
+            cdf[0] -= cdf[0] >> rate;
         }
+        cdf[1] = count + (count < 32);
     }
 
-    cdf[nsymbs] += (cdf[nsymbs] < 32);
+    return bit;
 }
+
+void msac_init(MsacContext *const s, const uint8_t *const data,
+               const size_t sz, const int disable_cdf_update_flag)
+{
+    s->buf_pos = data;
+    s->buf_end = data + sz;
+    s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1;
+    s->rng = 0x8000;
+    s->cnt = -15;
+    s->allow_update_cdf = !disable_cdf_update_flag;
+    ctx_refill(s);
+}
--- a/third_party/dav1d/src/msac.h
+++ b/third_party/dav1d/src/msac.h
@@ -1,56 +1,60 @@
 /*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
  *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_SRC_MSAC_H__
 #define __DAV1D_SRC_MSAC_H__
 
 #include <stdint.h>
 #include <stdlib.h>
 
+/* Using uint32_t should be faster on 32 bit systems, in theory, maybe */
+typedef uint64_t ec_win;
+
 typedef struct MsacContext {
-    const uint8_t *buf, *end, *bptr;
-    int32_t tell_offs;
-    uint32_t dif;
+    const uint8_t *buf_pos;
+    const uint8_t *buf_end;
+    ec_win dif;
     uint16_t rng;
-    int16_t cnt;
-    int error;
+    int cnt;
+    int allow_update_cdf;
 } MsacContext;
 
-void msac_init(MsacContext *c, const uint8_t *data, size_t sz);
-unsigned msac_decode_symbol(MsacContext *c, const uint16_t *cdf,
-                            const unsigned n_symbols);
-unsigned msac_decode_bool(MsacContext *c, unsigned cdf);
-unsigned msac_decode_bools(MsacContext *c, unsigned l);
-int msac_decode_subexp(MsacContext *c, int ref, unsigned n, unsigned k);
-int msac_decode_uniform(MsacContext *c, unsigned n);
-void update_cdf(uint16_t *cdf, unsigned val, unsigned nsymbs);
+#define EC_PROB_SHIFT 6
+#define EC_BOOL_EPROB 256
 
-static inline unsigned msac_decode_symbol_adapt(MsacContext *const c,
-                                                uint16_t *const cdf,
-                                                const unsigned n_symbols)
-{
-    const unsigned val = msac_decode_symbol(c, cdf, n_symbols);
-    update_cdf(cdf, val, n_symbols);
-    return val;
-}
-
-static inline unsigned msac_decode_bool_adapt(MsacContext *const c,
-                                              uint16_t *const cdf)
-{
-    const unsigned bit = msac_decode_bool(c, *cdf);
-    uint16_t bak_cdf[3] = { cdf[0], 0, cdf[1] };
-    update_cdf(bak_cdf, bit, 2);
-    cdf[0] = bak_cdf[0];
-    cdf[1] = bak_cdf[2];
-    return bit;
-}
+void msac_init(MsacContext *c, const uint8_t *data, size_t sz, int disable_cdf_update_flag);
+unsigned msac_decode_symbol(MsacContext *s, const uint16_t *cdf,
+                            const unsigned n_symbols);
+unsigned msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
+                                  const unsigned n_symbols);
+unsigned msac_decode_bool(MsacContext *s, unsigned f);
+unsigned msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
+unsigned msac_decode_bools(MsacContext *c, unsigned l);
+int msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k);
+int msac_decode_uniform(MsacContext *c, unsigned n);
 
 #endif /* __DAV1D_SRC_MSAC_H__ */
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@@ -38,35 +38,38 @@
 
 #include "src/decode.h"
 #include "src/getbits.h"
 #include "src/levels.h"
 #include "src/obu.h"
 #include "src/ref.h"
 #include "src/warpmv.h"
 
-static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb) {
-    const uint8_t *const init_ptr = gb->ptr;
-    Av1SequenceHeader *const hdr = &c->seq_hdr;
+static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
+                         Av1SequenceHeader *const hdr)
+{
+#define DEBUG_SEQ_HDR 0
 
-#define DEBUG_SEQ_HDR 0
+#if DEBUG_SEQ_HDR
+    const unsigned init_bit_pos = dav1d_get_bits_pos(gb);
+#endif
 
     hdr->profile = dav1d_get_bits(gb, 3);
     if (hdr->profile > 2) goto error;
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-profile: off=%ld\n",
-           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+           dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     hdr->still_picture = dav1d_get_bits(gb, 1);
     hdr->reduced_still_picture_header = dav1d_get_bits(gb, 1);
     if (hdr->reduced_still_picture_header && !hdr->still_picture) goto error;
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-stillpicture_flags: off=%ld\n",
-           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+           dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     if (hdr->reduced_still_picture_header) {
         hdr->timing_info_present = 0;
         hdr->decoder_model_info_present = 0;
         hdr->display_model_info_present = 0;
         hdr->num_operating_points = 1;
         hdr->operating_points[0].idc = 0;
@@ -91,22 +94,22 @@ static int parse_seq_hdr(Dav1dContext *c
                 hdr->buffer_removal_delay_length = dav1d_get_bits(gb, 5) + 1;
                 hdr->frame_presentation_delay_length = dav1d_get_bits(gb, 5) + 1;
             }
         } else {
             hdr->decoder_model_info_present = 0;
         }
 #if DEBUG_SEQ_HDR
         printf("SEQHDR: post-timinginfo: off=%ld\n",
-               (gb->ptr - init_ptr) * 8 - gb->bits_left);
+               dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
         hdr->display_model_info_present = dav1d_get_bits(gb, 1);
         hdr->num_operating_points = dav1d_get_bits(gb, 5) + 1;
-        for (int i = 0; i < c->seq_hdr.num_operating_points; i++) {
+        for (int i = 0; i < hdr->num_operating_points; i++) {
             struct Av1SequenceHeaderOperatingPoint *const op =
                 &hdr->operating_points[i];
             op->idc = dav1d_get_bits(gb, 12);
             op->major_level = 2 + dav1d_get_bits(gb, 3);
             op->minor_level = dav1d_get_bits(gb, 2);
             op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
             op->decoder_model_param_present =
                 hdr->decoder_model_info_present && dav1d_get_bits(gb, 1);
@@ -120,37 +123,37 @@ static int parse_seq_hdr(Dav1dContext *c
             op->display_model_param_present =
                 hdr->display_model_info_present && dav1d_get_bits(gb, 1);
             if (op->display_model_param_present) {
                 op->initial_display_delay = dav1d_get_bits(gb, 4) + 1;
             }
         }
 #if DEBUG_SEQ_HDR
         printf("SEQHDR: post-operating-points: off=%ld\n",
-               (gb->ptr - init_ptr) * 8 - gb->bits_left);
+               dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
     }
 
     hdr->width_n_bits = dav1d_get_bits(gb, 4) + 1;
     hdr->height_n_bits = dav1d_get_bits(gb, 4) + 1;
     hdr->max_width = dav1d_get_bits(gb, hdr->width_n_bits) + 1;
     hdr->max_height = dav1d_get_bits(gb, hdr->height_n_bits) + 1;
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-size: off=%ld\n",
-           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+           dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
     hdr->frame_id_numbers_present =
         hdr->reduced_still_picture_header ? 0 : dav1d_get_bits(gb, 1);
     if (hdr->frame_id_numbers_present) {
         hdr->delta_frame_id_n_bits = dav1d_get_bits(gb, 4) + 2;
         hdr->frame_id_n_bits = dav1d_get_bits(gb, 3) + hdr->delta_frame_id_n_bits + 1;
     }
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-frame-id-numbers-present: off=%ld\n",
-           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+           dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     hdr->sb128 = dav1d_get_bits(gb, 1);
     hdr->filter_intra = dav1d_get_bits(gb, 1);
     hdr->intra_edge_filter = dav1d_get_bits(gb, 1);
     if (hdr->reduced_still_picture_header) {
         hdr->inter_intra = 0;
         hdr->masked_compound = 0;
@@ -174,29 +177,29 @@ static int parse_seq_hdr(Dav1dContext *c
         } else {
             hdr->jnt_comp = 0;
             hdr->ref_frame_mvs = 0;
             hdr->order_hint_n_bits = 0;
         }
         hdr->screen_content_tools = dav1d_get_bits(gb, 1) ? ADAPTIVE : dav1d_get_bits(gb, 1);
     #if DEBUG_SEQ_HDR
         printf("SEQHDR: post-screentools: off=%ld\n",
-               (gb->ptr - init_ptr) * 8 - gb->bits_left);
+               dav1d_get_bits_pos(gb) - init_bit_pos);
     #endif
         hdr->force_integer_mv = hdr->screen_content_tools ?
                                 dav1d_get_bits(gb, 1) ? ADAPTIVE : dav1d_get_bits(gb, 1) : 2;
         if (hdr->order_hint)
             hdr->order_hint_n_bits = dav1d_get_bits(gb, 3) + 1;
     }
     hdr->super_res = dav1d_get_bits(gb, 1);
     hdr->cdef = dav1d_get_bits(gb, 1);
     hdr->restoration = dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-featurebits: off=%ld\n",
-           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+           dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     const int hbd = dav1d_get_bits(gb, 1);
     hdr->bpc = hdr->profile == 2 && hbd ? 10U + 2 * dav1d_get_bits(gb, 1) : 8U + 2 * hbd;
     hdr->hbd = hdr->bpc > 8;
     const int monochrome = hdr->profile != 1 ? dav1d_get_bits(gb, 1) : 0;
     hdr->color_description_present = dav1d_get_bits(gb, 1);
     if (hdr->color_description_present) {
@@ -237,28 +240,32 @@ static int parse_seq_hdr(Dav1dContext *c
             break;
         }
         if (hdr->layout == DAV1D_PIXEL_LAYOUT_I420)
             hdr->chr = dav1d_get_bits(gb, 2);
         hdr->separate_uv_delta_q = dav1d_get_bits(gb, 1);
     }
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-colorinfo: off=%ld\n",
-           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+           dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
-    c->seq_hdr.film_grain_present = dav1d_get_bits(gb, 1);
+    hdr->film_grain_present = dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-filmgrain: off=%ld\n",
-           (gb->ptr - init_ptr) * 8 - gb->bits_left);
+           dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     dav1d_get_bits(gb, 1); // dummy bit
 
-    return dav1d_flush_get_bits(gb) - init_ptr;
+    // We needn't bother flushing the OBU here: we'll check we didn't
+    // overrun in the caller and will then discard gb, so there's no
+    // point in setting its position properly.
+
+    return 0;
 
 error:
     fprintf(stderr, "Error parsing sequence header\n");
     return -EINVAL;
 }
 
 static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
                            const int use_ref)
@@ -287,18 +294,18 @@ static int read_frame_size(Dav1dContext 
     } else {
         hdr->width = seqhdr->max_width;
         hdr->height = seqhdr->max_height;
     }
     hdr->super_res = seqhdr->super_res && dav1d_get_bits(gb, 1);
     if (hdr->super_res) return -1; // FIXME
     hdr->have_render_size = dav1d_get_bits(gb, 1);
     if (hdr->have_render_size) {
-        hdr->render_width = dav1d_get_bits(gb, seqhdr->width_n_bits) + 1;
-        hdr->render_height = dav1d_get_bits(gb, seqhdr->height_n_bits) + 1;
+        hdr->render_width = dav1d_get_bits(gb, 16) + 1;
+        hdr->render_height = dav1d_get_bits(gb, 16) + 1;
     } else {
         hdr->render_width = hdr->width;
         hdr->render_height = hdr->height;
     }
     return 0;
 }
 
 static inline int tile_log2(int sz, int tgt) {
@@ -307,39 +314,39 @@ static inline int tile_log2(int sz, int 
     return k;
 }
 
 static const Av1LoopfilterModeRefDeltas default_mode_ref_deltas = {
     .mode_delta = { 0, 0 },
     .ref_delta = { 1, 0, 0, 0, -1, 0, -1, -1 },
 };
 
-static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb,
-                           const int have_trailing_bit)
-{
+static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
+#define DEBUG_FRAME_HDR 0
+
+#if DEBUG_FRAME_HDR
     const uint8_t *const init_ptr = gb->ptr;
+#endif
     const Av1SequenceHeader *const seqhdr = &c->seq_hdr;
     Av1FrameHeader *const hdr = &c->frame_hdr;
     int res;
 
-#define DEBUG_FRAME_HDR 0
-
     hdr->show_existing_frame =
         !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1);
 #if DEBUG_FRAME_HDR
     printf("HDR: post-show_existing_frame: off=%ld\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
     if (hdr->show_existing_frame) {
         hdr->existing_frame_idx = dav1d_get_bits(gb, 3);
         if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
             hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
         if (seqhdr->frame_id_numbers_present)
             hdr->frame_id = dav1d_get_bits(gb, seqhdr->frame_id_n_bits);
-        goto end;
+        return 0;
     }
 
     hdr->frame_type = seqhdr->reduced_still_picture_header ? DAV1D_FRAME_TYPE_KEY : dav1d_get_bits(gb, 2);
     hdr->show_frame = seqhdr->reduced_still_picture_header || dav1d_get_bits(gb, 1);
     if (hdr->show_frame) {
         if (seqhdr->decoder_model_info_present && !seqhdr->equal_picture_interval)
             hdr->frame_presentation_delay = dav1d_get_bits(gb, seqhdr->frame_presentation_delay_length);
     } else
@@ -445,60 +452,59 @@ static int parse_frame_hdr(Dav1dContext 
 #endif
 
     // tile data
     hdr->tiling.uniform = dav1d_get_bits(gb, 1);
     const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
     int sbsz_log2 = 6 + seqhdr->sb128;
     int sbw = (hdr->width + sbsz_min1) >> sbsz_log2;
     int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
-    int max_tile_width_sb = 4096 >> sbsz_log2, max_tile_height_sb;
+    int max_tile_width_sb = 4096 >> sbsz_log2;
     int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
     hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw);
-    hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, 1024));
-    hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, 1024));
+    hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, MAX_TILE_COLS));
+    hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, MAX_TILE_ROWS));
     int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
                               hdr->tiling.min_log2_cols);
     if (hdr->tiling.uniform) {
         for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols;
              hdr->tiling.log2_cols < hdr->tiling.max_log2_cols && dav1d_get_bits(gb, 1);
              hdr->tiling.log2_cols++) ;
         const int tile_w = 1 + ((sbw - 1) >> hdr->tiling.log2_cols);
         hdr->tiling.cols = 0;
         for (int sbx = 0; sbx < sbw; sbx += tile_w, hdr->tiling.cols++)
             hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
         hdr->tiling.min_log2_rows =
             imax(min_log2_tiles - hdr->tiling.log2_cols, 0);
-        max_tile_height_sb = sbh >> hdr->tiling.min_log2_rows;
 
         for (hdr->tiling.log2_rows = hdr->tiling.min_log2_rows;
              hdr->tiling.log2_rows < hdr->tiling.max_log2_rows && dav1d_get_bits(gb, 1);
              hdr->tiling.log2_rows++) ;
         const int tile_h = 1 + ((sbh - 1) >> hdr->tiling.log2_rows);
         hdr->tiling.rows = 0;
         for (int sby = 0; sby < sbh; sby += tile_h, hdr->tiling.rows++)
             hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
     } else {
         hdr->tiling.cols = 0;
         int widest_tile = 0, max_tile_area_sb = sbw * sbh;
-        for (int sbx = 0; sbx < sbw; hdr->tiling.cols++) {
+        for (int sbx = 0; sbx < sbw && hdr->tiling.cols < MAX_TILE_COLS; hdr->tiling.cols++) {
             const int tile_width_sb = imin(sbw - sbx, max_tile_width_sb);
             const int tile_w = (tile_width_sb > 1) ?
                                    1 + dav1d_get_uniform(gb, tile_width_sb) :
                                    1;
             hdr->tiling.col_start_sb[hdr->tiling.cols] = sbx;
             sbx += tile_w;
             widest_tile = imax(widest_tile, tile_w);
         }
         hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols);
         if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1;
-        max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
+        int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
 
         hdr->tiling.rows = 0;
-        for (int sby = 0; sby < sbh; hdr->tiling.rows++) {
+        for (int sby = 0; sby < sbh && hdr->tiling.rows < MAX_TILE_ROWS; hdr->tiling.rows++) {
             const int tile_height_sb = imin(sbh - sby, max_tile_height_sb);
             const int tile_h = (tile_height_sb > 1) ?
                                    1 + dav1d_get_uniform(gb, tile_height_sb) :
                                    1;
             hdr->tiling.row_start_sb[hdr->tiling.rows] = sby;
             sby += tile_h;
         }
         hdr->tiling.log2_rows = tile_log2(1, hdr->tiling.rows);
@@ -518,19 +524,23 @@ static int parse_frame_hdr(Dav1dContext 
     printf("HDR: post-tiling: off=%ld\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
     // quant data
     hdr->quant.yac = dav1d_get_bits(gb, 8);
     hdr->quant.ydc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
     if (seqhdr->layout != DAV1D_PIXEL_LAYOUT_I400) {
+        // If the sequence header says that delta_q might be different
+        // for U, V, we must check whether it actually is for this
+        // frame.
+        int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0;
         hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
         hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
-        if (seqhdr->separate_uv_delta_q) {
+        if (diff_uv_delta) {
             hdr->quant.vdc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
             hdr->quant.vac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
         } else {
             hdr->quant.vdc_delta = hdr->quant.udc_delta;
             hdr->quant.vac_delta = hdr->quant.uac_delta;
         }
     }
 #if DEBUG_FRAME_HDR
@@ -611,27 +621,27 @@ static int parse_frame_hdr(Dav1dContext 
                     hdr->segmentation.seg_data.last_active_segid = i;
                     hdr->segmentation.seg_data.preskip = 1;
                 }
                 if ((seg->globalmv = dav1d_get_bits(gb, 1))) {
                     hdr->segmentation.seg_data.last_active_segid = i;
                     hdr->segmentation.seg_data.preskip = 1;
                 }
             }
-        } else if (hdr->primary_ref_frame == PRIMARY_REF_NONE) {
-            memset(&hdr->segmentation.seg_data, 0, sizeof(Av1SegmentationDataSet));
         } else {
+            // segmentation.update_data was false so we should copy
+            // segmentation data from the reference frame.
+            assert(hdr->primary_ref_frame != PRIMARY_REF_NONE);
             const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
             hdr->segmentation.seg_data = c->refs[pri_ref].seg_data;
         }
-    } else if (hdr->primary_ref_frame == PRIMARY_REF_NONE) {
+    } else {
         memset(&hdr->segmentation.seg_data, 0, sizeof(Av1SegmentationDataSet));
-    } else {
-        const int pri_ref = hdr->refidx[hdr->primary_ref_frame];
-        hdr->segmentation.seg_data = c->refs[pri_ref].seg_data;
+        for (int i = 0; i < NUM_SEGMENTS; i++)
+            hdr->segmentation.seg_data.d[i].ref = -1;
     }
 #if DEBUG_FRAME_HDR
     printf("HDR: post-segmentation: off=%ld\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
     // delta q
     hdr->delta.q.present = hdr->quant.yac ? dav1d_get_bits(gb, 1) : 0;
@@ -878,17 +888,17 @@ static int parse_frame_hdr(Dav1dContext 
                 mat[4] = -mat[3];
                 mat[5] = mat[2];
             }
 
             mat[0] = dav1d_get_bits_subexp(gb, ref_mat[0] >> shift, bits) * (1 << shift);
             mat[1] = dav1d_get_bits_subexp(gb, ref_mat[1] >> shift, bits) * (1 << shift);
 
             if (dav1d_get_shear_params(&hdr->gmv[i]))
-                goto error;
+                hdr->gmv[i].type = WM_TYPE_TRANSLATION;
         }
     }
 #if DEBUG_FRAME_HDR
     printf("HDR: post-gmv: off=%ld\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
     hdr->film_grain.present = seqhdr->film_grain_present &&
@@ -960,143 +970,214 @@ static int parse_frame_hdr(Dav1dContext 
                     fgd->uv_mult[pl] = dav1d_get_bits(gb, 8);
                     fgd->uv_luma_mult[pl] = dav1d_get_bits(gb, 8);
                     fgd->uv_offset[pl] = dav1d_get_bits(gb, 9);
                 }
             fgd->overlap_flag = dav1d_get_bits(gb, 1);
             fgd->clip_to_restricted_range = dav1d_get_bits(gb, 1);
         }
     } else {
-        memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain));
+        memset(&hdr->film_grain.data, 0, sizeof(hdr->film_grain.data));
     }
 #if DEBUG_FRAME_HDR
     printf("HDR: post-filmgrain: off=%ld\n",
            (gb->ptr - init_ptr) * 8 - gb->bits_left);
 #endif
 
-end:
-
-    if (have_trailing_bit)
-        dav1d_get_bits(gb, 1); // dummy bit
-
-    return dav1d_flush_get_bits(gb) - init_ptr;
+    return 0;
 
 error:
     fprintf(stderr, "Error parsing frame header\n");
     return -EINVAL;
 }
 
-static int parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
-    const uint8_t *const init_ptr = gb->ptr;
-
+static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
     int have_tile_pos = 0;
     const int n_tiles = c->frame_hdr.tiling.cols * c->frame_hdr.tiling.rows;
     if (n_tiles > 1)
         have_tile_pos = dav1d_get_bits(gb, 1);
 
     if (have_tile_pos) {
         const int n_bits = c->frame_hdr.tiling.log2_cols +
                            c->frame_hdr.tiling.log2_rows;
         c->tile[c->n_tile_data].start = dav1d_get_bits(gb, n_bits);
         c->tile[c->n_tile_data].end = dav1d_get_bits(gb, n_bits);
     } else {
         c->tile[c->n_tile_data].start = 0;
         c->tile[c->n_tile_data].end = n_tiles - 1;
     }
+}
 
-    return dav1d_flush_get_bits(gb) - init_ptr;
+// Check that we haven't read more than obu_len bytes from the buffer
+// since init_bit_pos.
+static int
+check_for_overrun(GetBits *const gb, unsigned init_bit_pos, unsigned obu_len)
+{
+    // Make sure we haven't actually read past the end of the gb buffer
+    if (gb->error) {
+        fprintf(stderr, "Overrun in OBU bit buffer\n");
+        return 1;
+    }
+
+    unsigned pos = dav1d_get_bits_pos(gb);
+
+    // We assume that init_bit_pos was the bit position of the buffer
+    // at some point in the past, so cannot be larger than pos.
+    assert (init_bit_pos <= pos);
+
+    if (pos - init_bit_pos > 8 * obu_len) {
+        fprintf(stderr, "Overrun in OBU bit buffer into next OBU\n");
+        return 1;
+    }
+
+    return 0;
 }
 
 int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in) {
     GetBits gb;
     int res;
 
     dav1d_init_get_bits(&gb, in->data, in->sz);
 
     // obu header
     dav1d_get_bits(&gb, 1); // obu_forbidden_bit
     const enum ObuType type = dav1d_get_bits(&gb, 4);
     const int has_extension = dav1d_get_bits(&gb, 1);
     const int has_length_field = dav1d_get_bits(&gb, 1);
-    if (!has_length_field) goto error;
     dav1d_get_bits(&gb, 1); // reserved
     if (has_extension) {
         dav1d_get_bits(&gb, 3); // temporal_layer_id
         dav1d_get_bits(&gb, 2); // enhancement_layer_id
         dav1d_get_bits(&gb, 3); // reserved
     }
 
     // obu length field
     unsigned len = 0, more, i = 0;
-    do {
-        more = dav1d_get_bits(&gb, 1);
-        unsigned bits = dav1d_get_bits(&gb, 7);
-        if (i <= 3 || (i == 4 && bits < (1 << 4)))
-            len |= bits << (i * 7);
-        else if (bits)
-            goto error;
-        if (more && ++i == 8) goto error;
-    } while (more);
+    if (has_length_field)
+        do {
+            more = dav1d_get_bits(&gb, 1);
+            unsigned bits = dav1d_get_bits(&gb, 7);
+            if (i <= 3 || (i == 4 && bits < (1 << 4)))
+                len |= bits << (i * 7);
+            else if (bits)
+                goto error;
+            if (more && ++i == 8) goto error;
+        } while (more);
+    else
+        len = in->sz - 1 - has_extension;
     if (gb.error) goto error;
 
-    unsigned off = dav1d_flush_get_bits(&gb) - in->data;
-    const unsigned init_off = off;
-    if (len > in->sz - off) goto error;
+    const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
+    const unsigned init_byte_pos = init_bit_pos >> 3;
+    const unsigned pkt_bytelen = init_byte_pos + len;
+
+    // We must have read a whole number of bytes at this point (1 or 2
+    // bytes for the header and whole bytes at a time when reading the
+    // leb128 length field).
+    assert((init_bit_pos & 7) == 0);
+
+    // We also know that we haven't tried to read more than in->sz
+    // bytes yet (otherwise the error flag would have been set by the
+    // code in getbits.c)
+    assert(in->sz >= init_byte_pos);
+
+    // Make sure that there are enough bits left in the buffer for the
+    // rest of the OBU.
+    if (len > in->sz - init_byte_pos) goto error;
 
     switch (type) {
-    case OBU_SEQ_HDR:
-        if ((res = parse_seq_hdr(c, &gb)) < 0)
+    case OBU_SEQ_HDR: {
+        Av1SequenceHeader hdr, *const hdr_ptr = c->have_seq_hdr ? &hdr : &c->seq_hdr;
+        memset(hdr_ptr, 0, sizeof(*hdr_ptr));
+        c->have_frame_hdr = 0;
+        if ((res = parse_seq_hdr(c, &gb, hdr_ptr)) < 0)
             return res;
-        if ((unsigned)res != len) goto error;
+        if (check_for_overrun(&gb, init_bit_pos, len))
+            return -EINVAL;
+        // If we have read a sequence header which is different from
+        // the old one, this is a new video sequence and can't use any
+        // previous state. Free that state.
+        if (c->have_seq_hdr && memcmp(&hdr, &c->seq_hdr, sizeof(hdr))) {
+            for (int i = 0; i < 8; i++) {
+                if (c->refs[i].p.p.data[0])
+                    dav1d_thread_picture_unref(&c->refs[i].p);
+                dav1d_ref_dec(&c->refs[i].segmap);
+                dav1d_ref_dec(&c->refs[i].refmvs);
+                if (c->cdf[i].cdf)
+                    dav1d_cdf_thread_unref(&c->cdf[i]);
+            }
+            c->seq_hdr = hdr;
+        }
         c->have_seq_hdr = 1;
-        c->have_frame_hdr = 0;
         break;
+    }
     case OBU_REDUNDANT_FRAME_HDR:
         if (c->have_frame_hdr) break;
         // fall-through
     case OBU_FRAME:
     case OBU_FRAME_HDR:
+        c->have_frame_hdr = 0;
         if (!c->have_seq_hdr) goto error;
-        if ((res = parse_frame_hdr(c, &gb, type != OBU_FRAME)) < 0)
+        if ((res = parse_frame_hdr(c, &gb)) < 0)
             return res;
         c->have_frame_hdr = 1;
         for (int n = 0; n < c->n_tile_data; n++)
             dav1d_data_unref(&c->tile[n].data);
         c->n_tile_data = 0;
         c->n_tiles = 0;
-        if (type != OBU_FRAME) break;
+        if (type != OBU_FRAME) {
+            // This is actually a frame header OBU so read the
+            // trailing bit and check for overrun.
+            dav1d_get_bits(&gb, 1);
+            if (check_for_overrun(&gb, init_bit_pos, len))
+                return -EINVAL;
+
+            break;
+        }
+        // OBU_FRAMEs shouldn't be signalled with show_existing_frame
         if (c->frame_hdr.show_existing_frame) goto error;
-        off += res;
+
+        // This is the frame header at the start of a frame OBU.
+        // There's no trailing bit at the end to skip, but we do need
+        // to align to the next byte.
+        dav1d_bytealign_get_bits(&gb);
         // fall-through
-    case OBU_TILE_GRP:
+    case OBU_TILE_GRP: {
         if (!c->have_frame_hdr) goto error;
         if (c->n_tile_data >= 256) goto error;
-        if ((res = parse_tile_hdr(c, &gb)) < 0)
-            return res;
-        off += res;
-        if (off > len + init_off)
-            goto error;
+        parse_tile_hdr(c, &gb);
+        // Align to the next byte boundary and check for overrun.
+        dav1d_bytealign_get_bits(&gb);
+        if (check_for_overrun(&gb, init_bit_pos, len))
+            return -EINVAL;
+        // The current bit position is a multiple of 8 (because we
+        // just aligned it) and at most 8*pkt_bytelen because
+        // otherwise the overrun check would have fired.
+        const unsigned bit_pos = dav1d_get_bits_pos(&gb);
+        assert((bit_pos & 7) == 0);
+        assert(pkt_bytelen >= (bit_pos >> 3));
         dav1d_ref_inc(in->ref);
         c->tile[c->n_tile_data].data.ref = in->ref;
-        c->tile[c->n_tile_data].data.data = in->data + off;
-        c->tile[c->n_tile_data].data.sz = len + init_off - off;
+        c->tile[c->n_tile_data].data.data = in->data + (bit_pos >> 3);
+        c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
         // ensure tile groups are in order and sane, see 6.10.1
         if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
             c->tile[c->n_tile_data].start != c->n_tiles)
         {
             for (int i = 0; i <= c->n_tile_data; i++)
                 dav1d_data_unref(&c->tile[i].data);
             c->n_tile_data = 0;
             c->n_tiles = 0;
             goto error;
         }
         c->n_tiles += 1 + c->tile[c->n_tile_data].end -
                           c->tile[c->n_tile_data].start;
         c->n_tile_data++;
         break;
+    }
     case OBU_PADDING:
     case OBU_TD:
     case OBU_METADATA:
         // ignore OBUs we don't care about
         break;
     default:
         fprintf(stderr, "Unknown OBU type %d of size %u\n", type, len);
         return -EINVAL;
@@ -1156,27 +1237,24 @@ int dav1d_parse_obus(Dav1dContext *const
                 dav1d_init_states(&c->cdf[i], c->refs[r].qidx);
 
                 c->refs[i].lf_mode_ref_deltas = c->refs[r].lf_mode_ref_deltas;
                 c->refs[i].seg_data = c->refs[r].seg_data;
                 for (int j = 0; j < 7; j++)
                     c->refs[i].gmv[j] = dav1d_default_wm_params;
                 c->refs[i].film_grain = c->refs[r].film_grain;
 
-                if (c->refs[i].segmap)
-                    dav1d_ref_dec(c->refs[i].segmap);
+                dav1d_ref_dec(&c->refs[i].segmap);
                 c->refs[i].segmap = c->refs[r].segmap;
                 if (c->refs[r].segmap)
                     dav1d_ref_inc(c->refs[r].segmap);
-                if (c->refs[i].refmvs)
-                    dav1d_ref_dec(c->refs[i].refmvs);
-                c->refs[i].refmvs = NULL;
+                dav1d_ref_dec(&c->refs[i].refmvs);
                 c->refs[i].qidx = c->refs[r].qidx;
             }
         }
     }
 
-    return len + init_off;
+    return len + init_byte_pos;
 
 error:
     fprintf(stderr, "Error parsing OBU data\n");
     return -EINVAL;
 }
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@@ -35,16 +35,17 @@
 
 #include "common/intops.h"
 #include "common/mem.h"
 #include "common/validate.h"
 
 #include "src/picture.h"
 #include "src/ref.h"
 #include "src/thread.h"
+#include "src/thread_task.h"
 
 int default_picture_allocator(Dav1dPicture *const p, void *cookie) {
     assert(cookie == NULL);
     const int hbd = p->p.bpc > 8;
     const int aligned_w = (p->p.w + 127) & ~127;
     const int aligned_h = (p->p.h + 127) & ~127;
     const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
     const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
@@ -81,24 +82,25 @@ void default_picture_release(uint8_t *co
     assert(allocator_data == data);
 #endif
     dav1d_free_aligned(data);
 }
 
 struct pic_ctx_context {
     Dav1dPicAllocator allocator;
     void *allocator_data;
+    uint8_t *data;
     void *extra_ptr; /* MUST BE AT THE END */
 };
 
-static void free_buffer(uint8_t *data, void *user_data)
+static void free_buffer(const uint8_t *data, void *user_data)
 {
     struct pic_ctx_context *pic_ctx = user_data;
 
-    pic_ctx->allocator.release_picture_callback(data,
+    pic_ctx->allocator.release_picture_callback(pic_ctx->data,
                                                 pic_ctx->allocator_data,
                                                 pic_ctx->allocator.cookie);
     free(pic_ctx);
 }
 
 static int picture_alloc_with_edges(Dav1dPicture *const p,
                                     const int w, const int h,
                                     const enum Dav1dPixelLayout layout,
@@ -128,16 +130,17 @@ static int picture_alloc_with_edges(Dav1
     int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
     if (res < 0) {
         free(pic_ctx);
         return -ENOMEM;
     }
 
     pic_ctx->allocator = *p_allocator;
     pic_ctx->allocator_data = p->allocator_data;
+    pic_ctx->data = p->data[0];
 
     if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
         p_allocator->release_picture_callback(p->data[0], p->allocator_data,
                                               p_allocator->cookie);
         fprintf(stderr, "Failed to wrap picture: %s\n", strerror(errno));
         return -ENOMEM;
     }
 
@@ -154,16 +157,17 @@ int dav1d_thread_picture_alloc(Dav1dThre
                                Dav1dPicAllocator *const p_allocator)
 {
     p->t = t;
 
     const int res =
         picture_alloc_with_edges(&p->p, w, h, layout, bpc, p_allocator,
                                  t != NULL ? sizeof(atomic_int) * 2 : 0,
                                  (void **) &p->progress);
+    if (res) return res;
 
     p->visible = visible;
     p->flushed = 0;
     if (t) {
         atomic_init(&p->progress[0], 0);
         atomic_init(&p->progress[1], 0);
     }
     return res;
@@ -176,65 +180,79 @@ void dav1d_picture_ref(Dav1dPicture *con
 
     if (src->ref) {
         validate_input(src->data[0] != NULL);
         dav1d_ref_inc(src->ref);
     }
     *dst = *src;
 }
 
+void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
+    validate_input(dst != NULL);
+    validate_input(dst->data[0] == NULL);
+    validate_input(src != NULL);
+
+    if (src->ref)
+        validate_input(src->data[0] != NULL);
+
+    *dst = *src;
+    memset(src, 0, sizeof(*src));
+}
+
 void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
                               const Dav1dThreadPicture *src)
 {
     dav1d_picture_ref(&dst->p, &src->p);
     dst->t = src->t;
     dst->visible = src->visible;
     dst->progress = src->progress;
     dst->flushed = src->flushed;
 }
 
 void dav1d_picture_unref(Dav1dPicture *const p) {
     validate_input(p != NULL);
 
     if (p->ref) {
         validate_input(p->data[0] != NULL);
-        dav1d_ref_dec(p->ref);
+        dav1d_ref_dec(&p->ref);
     }
     memset(p, 0, sizeof(*p));
 }
 
 void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
     dav1d_picture_unref(&p->p);
 
     p->t = NULL;
     p->progress = NULL;
 }
 
-void dav1d_thread_picture_wait(const Dav1dThreadPicture *const p,
-                               int y_unclipped, const enum PlaneType plane_type)
+int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p,
+                              int y_unclipped, const enum PlaneType plane_type)
 {
     assert(plane_type != PLANE_TYPE_ALL);
 
     if (!p->t)
-        return;
+        return 0;
 
     // convert to luma units; include plane delay from loopfilters; clip
     const int ss_ver = p->p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     y_unclipped *= 1 << (plane_type & ss_ver); // we rely here on PLANE_TYPE_UV being 1
     y_unclipped += (plane_type != PLANE_TYPE_BLOCK) * 8; // delay imposed by loopfilter
     const unsigned y = iclip(y_unclipped, 1, p->p.p.h);
     atomic_uint *const progress = &p->progress[plane_type != PLANE_TYPE_BLOCK];
+    unsigned state;
 
-    if (atomic_load_explicit(progress, memory_order_acquire) >= y)
-        return;
+    if ((state = atomic_load_explicit(progress, memory_order_acquire)) >= y)
+        return state == FRAME_ERROR;
 
     pthread_mutex_lock(&p->t->lock);
-    while (atomic_load_explicit(progress, memory_order_relaxed) < y)
+    while ((state = atomic_load_explicit(progress, memory_order_relaxed)) < y)
         pthread_cond_wait(&p->t->cond, &p->t->lock);
     pthread_mutex_unlock(&p->t->lock);
+    return state == FRAME_ERROR;
 }
 
 void dav1d_thread_picture_signal(const Dav1dThreadPicture *const p,
                                  const int y, // in pixel units
                                  const enum PlaneType plane_type)
 {
     assert(plane_type != PLANE_TYPE_UV);
 
--- a/third_party/dav1d/src/picture.h
+++ b/third_party/dav1d/src/picture.h
@@ -63,30 +63,38 @@ int dav1d_thread_picture_alloc(Dav1dThre
  * Create a copy of a picture.
  */
 void dav1d_picture_ref(Dav1dPicture *dst, const Dav1dPicture *src);
 void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
                               const Dav1dThreadPicture *src);
 void dav1d_thread_picture_unref(Dav1dThreadPicture *p);
 
 /**
+ * Move a picture reference.
+ */
+void dav1d_picture_move_ref(Dav1dPicture *dst, Dav1dPicture *src);
+
+/**
  * Wait for picture to reach a certain stage.
  *
  * y is in full-pixel units. If pt is not UV, this is in luma
  * units, else it is in chroma units.
  * plane_type is used to determine how many pixels delay are
  * introduced by loopfilter processes.
+ *
+ * Returns 0 on success, and 1 if there was an error while decoding p
  */
-void dav1d_thread_picture_wait(const Dav1dThreadPicture *p, int y,
+int dav1d_thread_picture_wait(const Dav1dThreadPicture *p, int y,
                                enum PlaneType plane_type);
 
 /**
  * Signal decoding progress.
  *
- * y is in full-pixel luma units.
+ * y is in full-pixel luma units. FRAME_ERROR is used to signal a decoding
+ * error to frames using this frame as reference frame.
  * plane_type denotes whether we have completed block data (pass 1;
  * PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no
  * 2-pass decoding; PLANE_TYPE_ALL).
  */
 void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
                                  enum PlaneType plane_type);
 
 int default_picture_allocator(Dav1dPicture *, void *cookie);
--- a/third_party/dav1d/src/recon.h
+++ b/third_party/dav1d/src/recon.h
@@ -37,17 +37,17 @@
 #define DEBUG_B_PIXELS 0
 
 #define decl_recon_b_intra_fn(name) \
 void (name)(Dav1dTileContext *t, enum BlockSize bs, \
             enum EdgeFlags intra_edge_flags, const Av1Block *b)
 typedef decl_recon_b_intra_fn(*recon_b_intra_fn);
 
 #define decl_recon_b_inter_fn(name) \
-void (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
+int (name)(Dav1dTileContext *t, enum BlockSize bs, const Av1Block *b)
 typedef decl_recon_b_inter_fn(*recon_b_inter_fn);
 
 #define decl_filter_sbrow_fn(name) \
 void (name)(Dav1dFrameContext *f, int sby)
 typedef decl_filter_sbrow_fn(*filter_sbrow_fn);
 
 #define decl_backup_ipred_edge_fn(name) \
 void (name)(Dav1dTileContext *t)
--- a/third_party/dav1d/src/recon_tmpl.c
+++ b/third_party/dav1d/src/recon_tmpl.c
@@ -32,30 +32,31 @@
 
 #include "common/attributes.h"
 #include "common/bitdepth.h"
 #include "common/dump.h"
 #include "common/intops.h"
 #include "common/mem.h"
 
 #include "src/cdef_apply.h"
+#include "src/ctx.h"
 #include "src/ipred_prepare.h"
 #include "src/lf_apply.h"
 #include "src/lr_apply.h"
 #include "src/recon.h"
 #include "src/scan.h"
 #include "src/tables.h"
 #include "src/wedge.h"
 
 static unsigned read_golomb(MsacContext *const msac) {
     int len = 0;
     unsigned val = 1;
 
-    while (!msac_decode_bool(msac, 128 << 7) && len < 32) len++;
-    while (len--) val = (val << 1) | msac_decode_bool(msac, 128 << 7);
+    while (!msac_decode_bool(msac, EC_BOOL_EPROB) && len < 32) len++; // prefix: count 0-bools until a 1 is decoded; len stops growing at 32
+    while (len--) val = (val << 1) | msac_decode_bool(msac, EC_BOOL_EPROB); // suffix: shift len further bools in below the initial 1 bit of val
 
     return val - 1;
 }
 
 static int decode_coefs(Dav1dTileContext *const t,
                         uint8_t *const a, uint8_t *const l,
                         const enum RectTxfmSize tx, const enum BlockSize bs,
                         const Av1Block *const b, const int intra,
@@ -96,25 +97,25 @@ static int decode_coefs(Dav1dTileContext
         const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
                                                       &f->frame_hdr, b->seg_id);
         const unsigned set_cnt = dav1d_tx_type_count[set];
         unsigned idx;
         if (set_cnt == 1) {
             idx = 0;
         } else {
             const int set_idx = dav1d_tx_type_set_index[!intra][set];
-            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
-                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
+            const enum IntraPredMode y_mode_nofilt = intra ? b->y_mode == FILTER_PRED ?
+                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode : 0;
             uint16_t *const txtp_cdf = intra ?
                        ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
                        ts->cdf.m.txtp_inter[set_idx][t_dim->min];
             idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
             if (dbg)
             printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
-                   set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
+                   set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
                    idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
         }
         *txtp = dav1d_tx_types_per_set[set][idx];
     }
 
     // find end-of-block (eob)
     int eob_bin;
     const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
@@ -146,17 +147,17 @@ static int decode_coefs(Dav1dTileContext
             ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
         const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
         if (dbg)
         printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
                t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
         unsigned mask = eob >> 1;
         if (eob_hi_bit) eob |= mask;
         for (mask >>= 1; mask; mask >>= 1) {
-            const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
+            const int eob_bit = msac_decode_bool(&ts->msac, EC_BOOL_EPROB);
             if (eob_bit) eob |= mask;
         }
         if (dbg)
         printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
     } else {
         eob = eob_bin;
     }
 
@@ -221,35 +222,39 @@ static int decode_coefs(Dav1dTileContext
                 ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
             sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
             if (dbg)
             printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
                    chroma, dc_sign_ctx, sign, ts->msac.rng);
             dc_sign = sign ? 0 : 2;
             dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
         } else {
-            sign = msac_decode_bool(&ts->msac, 128 << 7);
+            sign = msac_decode_bool(&ts->msac, EC_BOOL_EPROB);
             if (dbg)
             printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
             dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
         }
 
         // residual
         if (tok == 15) {
             tok += read_golomb(&ts->msac);
             if (dbg)
             printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
                    i, rc, tok - 15, tok, ts->msac.rng);
         }
 
-        // dequant
+        // coefficient parsing, see 5.11.39
+        tok &= 0xfffff;
+
+        // dequant, see 7.12.3
         cul_level += tok;
-        tok *= dq;
-        tok >>= dq_shift;
-        cf[rc] = sign ? -tok : tok;
+        tok = (((int64_t)dq * tok) & 0xffffff) >> dq_shift;
+        cf[rc] = iclip(sign ? -tok : tok,
+                       -(1 << (7 + BITDEPTH)),
+                       (1 << (7 + BITDEPTH)) - 1);
     }
 
     // context
     *res_ctx = imin(cul_level, 63) | (dc_sign << 6);
 
     return eob;
 }
 
@@ -260,17 +265,21 @@ static void read_coef_tree(Dav1dTileCont
                            const int x_off, const int y_off, pixel *dst)
 {
     const Dav1dFrameContext *const f = t->f;
     Dav1dTileState *const ts = t->ts;
     const Dav1dDSPContext *const dsp = f->dsp;
     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
     const int txw = t_dim->w, txh = t_dim->h;
 
-    if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) {
+    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
+     * be split. Avoids an undefined left shift. */
+    if (depth < 2 && tx_split[depth] &&
+        tx_split[depth] & (1 << (y_off * 4 + x_off)))
+    {
         const enum RectTxfmSize sub = t_dim->sub;
         const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
         const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
 
         read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
                        x_off * 2 + 0, y_off * 2 + 0, dst);
         t->bx += txsw;
         if (txw >= txh && t->bx < f->bw)
@@ -306,20 +315,32 @@ static void read_coef_tree(Dav1dTileCont
             cf = t->cf;
         }
         if (f->frame_thread.pass != 2) {
             eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
                                ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                        ytx, txtp, eob, ts->msac.rng);
-            memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
-            memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
-            for (int y = 0; y < txh; y++)
-                memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+            memset(&t->dir lcoef[off], cf_ctx, sz)
+            case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
+            case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
+#undef default_memset
+#undef set_ctx
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            for (int y = 0; y < txh; y++) { \
+                rep_macro(type, txtp_map, 0, mul * txtp); \
+                txtp_map += 32; \
+            }
+            uint8_t *txtp_map = &t->txtp_map[by4 * 32 + bx4];
+            case_set_upto16(txw,,,);
+#undef set_ctx
             if (f->frame_thread.pass == 1) {
                 cbi->eob[0] = eob;
                 cbi->txtp[0] = txtp;
             }
         } else {
             eob = cbi->eob[0];
             txtp = cbi->txtp[0];
         }
@@ -347,21 +368,28 @@ void bytefn(dav1d_read_coef_blocks)(Dav1
     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
     const int bw4 = b_dim[0], bh4 = b_dim[1];
     const int cbw4 = (bw4 + 1) >> ss_hor, cbh4 = (bh4 + 1) >> ss_ver;
     const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                            (bw4 > ss_hor || t->bx & 1) &&
                            (bh4 > ss_ver || t->by & 1);
 
     if (b->skip) {
-        memset(&t->a->lcoef[bx4], 0x40, bw4);
-        memset(&t->l.lcoef[by4], 0x40, bh4);
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
-            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
+        if (has_chroma) {
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
         return;
     }
 
     Dav1dTileState *const ts = t->ts;
     const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
     const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
     assert(f->frame_thread.pass == 1);
@@ -393,20 +421,26 @@ void bytefn(dav1d_read_coef_blocks)(Dav1
                             decode_coefs(t, &t->a->lcoef[bx4 + x],
                                          &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
                                          0, ts->frame_thread.cf, &txtp, &cf_ctx);
                         if (DEBUG_BLOCK_INFO)
                             printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                    b->tx, txtp, eob, ts->msac.rng);
                         cbi[t->bx].txtp[0] = txtp;
                         ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                               imin(t_dim->w, f->bw - t->bx));
-                        memset(&t->l.lcoef[by4 + y], cf_ctx,
-                               imin(t_dim->h, f->bh - t->by));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                        memset(&t->dir lcoef[off], cf_ctx, sz)
+                        case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
+                                                     l., 1, by4 + y);
+                        case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
+                                                     a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
                     }
                 }
                 t->bx -= x;
             }
             t->by -= y;
 
             if (!has_chroma) continue;
 
@@ -432,221 +466,225 @@ void bytefn(dav1d_read_coef_blocks)(Dav1
                                          b, b->intra, 1 + pl, ts->frame_thread.cf,
                                          &txtp, &cf_ctx);
                         if (DEBUG_BLOCK_INFO)
                             printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                    "txtp=%d,eob=%d]: r=%d\n",
                                    pl, b->uvtx, txtp, eob, ts->msac.rng);
                         cbi[t->bx].txtp[1 + pl] = txtp;
                         ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                        memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                        case_set_upto16_with_default( \
+                                 imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                 l., 1, cby4 + y);
+                        case_set_upto16_with_default( \
+                                 imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                 a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
                     }
                     t->bx -= x << ss_hor;
                 }
                 t->by -= y << ss_ver;
             }
         }
     }
 }
 
-static void emu_edge(pixel *dst, const ptrdiff_t dst_stride,
-                     const pixel *ref, const ptrdiff_t ref_stride,
-                     const int bw, const int bh,
-                     const int iw, const int ih,
-                     const int x, const int y)
-{
-    // find offset in reference of visible block to copy
-    ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
-
-    // number of pixels to extend (left, right, top, bottom)
-    const int left_ext = iclip(-x, 0, bw - 1);
-    const int right_ext = iclip(x + bw - iw, 0, bw - 1);
-    assert(left_ext + right_ext < bw);
-    const int top_ext = iclip(-y, 0, bh - 1);
-    const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
-    assert(top_ext + bottom_ext < bh);
-
-    // copy visible portion first
-    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
-    const int center_w = bw - left_ext - right_ext;
-    const int center_h = bh - top_ext - bottom_ext;
-    for (int y = 0; y < center_h; y++) {
-        pixel_copy(blk + left_ext, ref, center_w);
-        // extend left edge for this line
-        if (left_ext)
-            pixel_set(blk, blk[left_ext], left_ext);
-        // extend right edge for this line
-        if (right_ext)
-            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
-                      right_ext);
-        ref += PXSTRIDE(ref_stride);
-        blk += PXSTRIDE(dst_stride);
-    }
-
-    // copy top
-    blk = dst + top_ext * PXSTRIDE(dst_stride);
-    for (int y = 0; y < top_ext; y++) {
-        pixel_copy(dst, blk, bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-
-    // copy bottom
-    dst += center_h * PXSTRIDE(dst_stride);
-    for (int y = 0; y < bottom_ext; y++) {
-        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-}
-
-static void mc(Dav1dTileContext *const t,
-               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
-               const int bw4, const int bh4,
-               const int bx, const int by, const int pl,
-               const mv mv, const Dav1dThreadPicture *const refp,
-               const enum Filter2d filter_2d)
+static int mc(Dav1dTileContext *const t, // returns 0 on success, -1 when waiting on the reference picture reports an error
+              pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
+              const int bw4, const int bh4,
+              const int bx, const int by, const int pl,
+              const mv mv, const Dav1dThreadPicture *const refp, const int refidx, // refidx selects f->svc[refidx]; only read on the scaled path
+              const enum Filter2d filter_2d)
 {
     assert((dst8 != NULL) ^ (dst16 != NULL));
     const Dav1dFrameContext *const f = t->f;
     const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
     const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
     const int mvx = mv.x, mvy = mv.y;
     const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
-    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
-    const int dy = by * v_mul + (mvy >> (3 + ss_ver));
     ptrdiff_t ref_stride = refp->p.stride[!!pl];
     const pixel *ref;
-    int w, h;
+
+    if (refp->p.p.w == f->cur.p.p.w && refp->p.p.h == f->cur.p.p.h) { // reference has the current frame's dimensions: unscaled mc/mct path
+        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
+        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
+        int w, h;
 
-    if (refp != &f->cur) { // i.e. not for intrabc
-        dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
-                                  PLANE_TYPE_Y + !!pl);
-        w = (f->cur.p.p.w + ss_hor) >> ss_hor;
-        h = (f->cur.p.p.h + ss_ver) >> ss_ver;
+        if (refp != &f->cur) { // i.e. not for intrabc
+            if (dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
+                                          PLANE_TYPE_Y + !!pl))
+            {
+                return -1; // the wait returned non-zero (error while decoding refp): give up on this block
+            }
+            w = (f->cur.p.p.w + ss_hor) >> ss_hor;
+            h = (f->cur.p.p.h + ss_ver) >> ss_ver;
+        } else {
+            w = f->bw * 4 >> ss_hor;
+            h = f->bh * 4 >> ss_ver;
+        }
+        if (dx < !!mx * 3 || dy < !!my * 3 ||
+            dx + bw4 * h_mul + !!mx * 4 > w ||
+            dy + bh4 * v_mul + !!my * 4 > h)
+        {
+            f->dsp->mc.emu_edge(bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, // read area leaves the w x h plane: source block is (presumably edge-extended) built in t->emu_edge
+                                w, h, dx - !!mx * 3, dy - !!my * 3,
+                                t->emu_edge, 192 * sizeof(pixel),
+                                refp->p.data[pl], ref_stride);
+            ref = &t->emu_edge[192 * !!my * 3 + !!mx * 3]; // step past the 3 leading rows/columns requested above so ref is the block origin
+            ref_stride = 192 * sizeof(pixel);
+        } else {
+            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+        }
+
+        if (dst8 != NULL) {
+            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
+                                     bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
+        } else {
+            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
+                                      bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
+        }
     } else {
-        w = f->bw * 4 >> ss_hor;
-        h = f->bh * 4 >> ss_ver;
-    }
-    if (dx < !!mx * 3 || dy < !!my * 3 ||
-        dx + bw4 * h_mul + !!mx * 4 > w ||
-        dy + bh4 * v_mul + !!my * 4 > h)
-    {
-        emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], ref_stride,
-                 bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, w, h,
-                 dx - !!mx * 3, dy - !!my * 3);
-        ref = &t->emu_edge[160 * !!my * 3 + !!mx * 3];
-        ref_stride = 160 * sizeof(pixel);
-    } else {
-        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
+        assert(refp != &f->cur); // scaled path: never intrabc, where refp == &f->cur and the sizes match by definition
+
+        int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver); // pixel position << 4 plus the MV: position in 1/16-pixel units
+        int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
+#define scale_mv(res, val, scale) do { \
+            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
+            res = (int)apply_sign64((llabs(tmp) + 128) >> 8, tmp) + 32; \
+        } while (0)
+        int pos_y, pos_x; // positions in the reference after scaling; used below as (pos >> 10) pixels plus a 10-bit remainder
+        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
+        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
+#undef scale_mv
+        const int left = pos_x >> 10;
+        const int top = pos_y >> 10;
+        const int right =
+            ((pos_x + (bw4 * h_mul - 1) * f->svc[refidx][0].step) >> 10) + 1;
+        const int bottom =
+            ((pos_y + (bh4 * v_mul - 1) * f->svc[refidx][1].step) >> 10) + 1;
+
+        if (dav1d_thread_picture_wait(refp, bottom, PLANE_TYPE_Y + !!pl)) // wait for the reference to reach row 'bottom'; non-zero result aborts
+            return -1;
+
+        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
+        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
+        if (left < 3 || top < 3 || right + 4 > w || bottom + 4 > h) { // needs 3 pixels before and 4 after the sampled area inside the reference plane
+            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
+                                w, h, left - 3, top - 3,
+                                t->emu_edge, 320 * sizeof(pixel), // scaled path uses a 320-pixel scratch stride (192 on the unscaled path)
+                                refp->p.data[pl], ref_stride);
+            ref = &t->emu_edge[320 * 3 + 3];
+            ref_stride = 320 * sizeof(pixel);
+        } else {
+            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
+        }
+
+        if (dst8 != NULL) {
+            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
+                                            bw4 * h_mul, bh4 * v_mul,
+                                            pos_x & 0x3ff, pos_y & 0x3ff, // low 10 bits of the scaled position, i.e. what (pos >> 10) dropped
+                                            f->svc[refidx][0].step,
+                                            f->svc[refidx][1].step);
+        } else {
+            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
+                                             bw4 * h_mul, bh4 * v_mul,
+                                             pos_x & 0x3ff, pos_y & 0x3ff,
+                                             f->svc[refidx][0].step,
+                                             f->svc[refidx][1].step);
+        }
     }
 
-    if (dst8 != NULL) {
-        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
-                                 bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    } else {
-        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
-                                  bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    }
+    return 0; // both paths end here once the prediction has been written
 }
 
-static void obmc(Dav1dTileContext *const t,
-                 pixel *const dst, const ptrdiff_t dst_stride,
-                 const uint8_t *const b_dim, const int pl,
-                 const int bx4, const int by4, const int w4, const int h4)
+static int obmc(Dav1dTileContext *const t, // returns 0 on success, or mc()'s non-zero result as soon as one neighbour prediction fails
+                pixel *const dst, const ptrdiff_t dst_stride,
+                const uint8_t *const b_dim, const int pl,
+                const int bx4, const int by4, const int w4, const int h4)
 {
     assert(!(t->bx & 1) && !(t->by & 1));
     const Dav1dFrameContext *const f = t->f;
     const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
     pixel *const lap = t->scratch.lap;
-    static const uint8_t obmc_mask_2[2] = { 19,  0 };
-    static const uint8_t obmc_mask_4[4] = { 25, 14,  5,  0 };
-    static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11,  7,  3,  0,  0 };
-    static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
-                                               8,  6,  4,  3,  0,  0,  0,  0 };
-    static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
-                                              19, 17, 16, 14, 13, 12, 11,  9,
-                                               8,  7,  6,  5,  4,  4,  3,  2,
-                                               0,  0,  0,  0,  0,  0,  0,  0 };
-    static const uint8_t *const obmc_masks[] = {
-        obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
-    };
     const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
     const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+    int res; // result of each mc() call; propagated to the caller when non-zero
 
     if (t->by > t->ts->tiling.row_start &&
         (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
     {
         for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
             // only odd blocks are considered for overlap handling, hence +1
             const refmvs *const a_r = &r[x - f->b4_stride + 1];
             const uint8_t *const a_b_dim =
                 dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
 
             if (a_r->ref[0] > 0) {
-                mc(t, lap, NULL, 128 * sizeof(pixel),
-                   iclip(a_b_dim[0], 2, b_dim[0]), imin(b_dim[1], 16) >> 1,
-                   t->bx + x, t->by, pl, a_r->mv[0],
-                   &f->refp[a_r->ref[0] - 1],
-                   dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
-                f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
-                                 lap, 128 * sizeof(pixel),
-                                 h_mul * iclip(a_b_dim[0], 2, b_dim[0]),
-                                 v_mul * imin(b_dim[1], 16) >> 1,
-                                 obmc_masks[imin(b_dim[3], 4) - ss_ver], 1);
+                const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]); // overlap width: above neighbour's width clamped to [2, b_dim[0]]
+                const int oh4 = imin(b_dim[1], 16) >> 1; // overlap height: half of min(b_dim[1], 16)
+                res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, oh4, // lap is written packed: its stride equals the predicted width
+                         t->bx + x, t->by, pl, a_r->mv[0],
+                         &f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1,
+                         dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
+                if (res) return res;
+                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap, // blend the above-neighbour prediction held in lap into dst
+                                   h_mul * ow4, v_mul * oh4);
                 i++;
             }
             x += imax(a_b_dim[0], 2);
         }
     }
 
     if (t->bx > t->ts->tiling.col_start)
         for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
             // only odd blocks are considered for overlap handling, hence +1
             const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
             const uint8_t *const l_b_dim =
                 dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
 
             if (l_r->ref[0] > 0) {
-                mc(t, lap, NULL, 32 * sizeof(pixel),
-                   imin(b_dim[0], 16) >> 1,
-                   iclip(l_b_dim[1], 2, b_dim[1]),
-                   t->bx, t->by + y, pl, l_r->mv[0],
-                   &f->refp[l_r->ref[0] - 1],
-                   dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
-                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
-                                 lap, 32 * sizeof(pixel),
-                                 h_mul * imin(b_dim[0], 16) >> 1,
-                                 v_mul * iclip(l_b_dim[1], 2, b_dim[1]),
-                                 obmc_masks[imin(b_dim[2], 4) - ss_hor], 0);
+                const int ow4 = imin(b_dim[0], 16) >> 1; // overlap width: half of min(b_dim[0], 16)
+                const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]); // overlap height: left neighbour's height clamped to [2, b_dim[1]]
+                res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
+                         t->bx, t->by + y, pl, l_r->mv[0],
+                         &f->refp[l_r->ref[0] - 1], l_r->ref[0] - 1,
+                         dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
+                if (res) return res;
+                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)], // blend the left-neighbour prediction held in lap into dst
+                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                 i++;
             }
             y += imax(l_b_dim[1], 2);
         }
+    return 0; // every neighbour prediction succeeded (or none was needed)
 }
 
-static void warp_affine(Dav1dTileContext *const t,
-                        pixel *dst8, coef *dst16, const ptrdiff_t dstride,
-                        const uint8_t *const b_dim, const int pl,
-                        const Dav1dThreadPicture *const refp,
-                        const WarpedMotionParams *const wmp)
+static int warp_affine(Dav1dTileContext *const t,
+                       pixel *dst8, coef *dst16, const ptrdiff_t dstride,
+                       const uint8_t *const b_dim, const int pl,
+                       const Dav1dThreadPicture *const refp,
+                       const WarpedMotionParams *const wmp)
 {
     assert((dst8 != NULL) ^ (dst16 != NULL));
     const Dav1dFrameContext *const f = t->f;
     const Dav1dDSPContext *const dsp = f->dsp;
     const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
     const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
     assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
     const int32_t *const mat = wmp->matrix;
-    const int width = (f->cur.p.p.w + ss_hor) >> ss_hor;
-    const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
+    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
+    const int height = (refp->p.p.h + ss_ver) >> ss_ver;
 
     for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
         for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
             // calculate transformation relative to center of 8x8 block in
             // luma pixel units
             const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
             const int src_y = t->by * 4 + ((y + 4) << ss_ver);
             const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
@@ -657,36 +695,41 @@ static void warp_affine(Dav1dTileContext
                                              wmp->beta  * 7) & ~0x3f;
             const int dy = (mvy >> 16) - 4;
             const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
                                              wmp->delta * 4) & ~0x3f;
 
             const pixel *ref_ptr;
             ptrdiff_t ref_stride = refp->p.stride[!!pl];
 
-            dav1d_thread_picture_wait(refp, dy + 4 + 8,
-                                      PLANE_TYPE_Y + !!pl);
+            if (dav1d_thread_picture_wait(refp, dy + 4 + 8,
+                                          PLANE_TYPE_Y + !!pl))
+            {
+                return -1;
+            }
             if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
-                emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
-                         ref_stride, 15, 15, width, height, dx - 3, dy - 3);
-                ref_ptr = &t->emu_edge[160 * 3 + 3];
-                ref_stride = 160 * sizeof(pixel);
+                f->dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
+                                    t->emu_edge, 192 * sizeof(pixel),
+                                    refp->p.data[pl], ref_stride);
+                ref_ptr = &t->emu_edge[192 * 3 + 3];
+                ref_stride = 192 * sizeof(pixel);
             } else {
                 ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
             }
             if (dst16 != NULL)
                 dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
                                  wmp->abcd, mx, my);
             else
                 dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
                                 wmp->abcd, mx, my);
         }
         if (dst8) dst8  += 8 * PXSTRIDE(dstride);
         else      dst16 += 8 * dstride;
     }
+    return 0;
 }
 
 void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
                                  const enum EdgeFlags intra_edge_flags,
                                  const Av1Block *const b)
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
@@ -705,16 +748,18 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTi
     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
     const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
 
     // coefficient coding
     ALIGN_STK_32(pixel, edge_buf, 257,);
     pixel *const edge = edge_buf + 128;
     const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
 
+    const int intra_edge_filter_flag = f->seq_hdr.intra_edge_filter << 10;
+
     for (int init_y = 0; init_y < h4; init_y += 16) {
         for (int init_x = 0; init_x < w4; init_x += 16) {
             if (b->pal_sz[0]) {
                 pixel *dst = ((pixel *) f->cur.p.data[0]) +
                              4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
                 const uint8_t *pal_idx;
                 if (f->frame_thread.pass) {
                     pal_idx = ts->frame_thread.pal_idx;
@@ -727,17 +772,19 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTi
                                         ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
                 f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
                                        pal_idx, bw4 * 4, bh4 * 4);
                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                     hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
                              bw4 * 4, bh4 * 4, "y-pal-pred");
             }
 
-            const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
+            const int intra_flags = (sm_flag(t->a, bx4) |
+                                     sm_flag(&t->l, by4) |
+                                     intra_edge_filter_flag);
             const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
                               intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
             const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
                               intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
             int y, x;
             const int sub_h4 = imin(h4, 16 + init_y);
             const int sub_w4 = imin(w4, init_x + 16);
             for (y = init_y, t->by += init_y; y < sub_h4;
@@ -771,17 +818,19 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTi
                                                           ts->tiling.col_end,
                                                           ts->tiling.row_end,
                                                           edge_flags, dst,
                                                           f->cur.p.stride[0], top_sb_edge,
                                                           b->y_mode, &angle,
                                                           t_dim->w, t_dim->h, edge);
                     dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
                                              t_dim->w * 4, t_dim->h * 4,
-                                             angle | sm_fl);
+                                             angle | intra_flags,
+                                             4 * f->bw - 4 * t->bx,
+                                             4 * f->bh - 4 * t->by);
 
                     if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                         hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
                                  t_dim->h * 4, 2, "l");
                         hex_dump(edge, 0, 1, 1, "tl");
                         hex_dump(edge + 1, t_dim->w * 4,
                                  t_dim->w * 4, 2, "t");
                         hex_dump(dst, f->cur.p.stride[0],
@@ -804,36 +853,45 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTi
                             uint8_t cf_ctx;
                             cf = t->cf;
                             eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
                                                &t->l.lcoef[by4 + y], b->tx, bs,
                                                b, 1, 0, cf, &txtp, &cf_ctx);
                             if (DEBUG_BLOCK_INFO)
                                 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                                        b->tx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                                   imin(t_dim->w, f->bw - t->bx));
-                            memset(&t->l.lcoef[by4 + y], cf_ctx,
-                                   imin(t_dim->h, f->bh - t->by));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                            memset(&t->dir lcoef[off], cf_ctx, sz)
+                            case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
+                                                         l., 1, by4 + y);
+                            case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
+                                                         a->, 0, bx4 + x);
+#undef default_memset
+#undef set_ctx
                         }
                         if (eob >= 0) {
                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                 coef_dump(cf, imin(t_dim->h, 8) * 4,
                                           imin(t_dim->w, 8) * 4, 3, "dq");
                             dsp->itx.itxfm_add[b->tx]
                                               [txtp](dst,
                                                      f->cur.p.stride[0],
                                                      cf, eob);
                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                 hex_dump(dst, f->cur.p.stride[0],
                                          t_dim->w * 4, t_dim->h * 4, "recon");
                         }
                     } else if (!f->frame_thread.pass) {
-                        memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
-                        memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+                        case_set_upto16(t_dim->h, l., 1, by4 + y);
+                        case_set_upto16(t_dim->w, a->, 0, bx4 + x);
+#undef set_ctx
                     }
                     dst += 4 * t_dim->w;
                 }
                 t->bx -= x;
             }
             t->by -= y;
 
             if (!has_chroma) continue;
@@ -850,20 +908,20 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTi
                                               (t->by >> ss_ver) * PXSTRIDE(stride));
                 pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
                                            ((pixel *) f->cur.p.data[2]) + uv_off };
 
                 const int furthest_r =
                     ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
                 const int furthest_b =
                     ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
-                dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
-                                 [b->uvtx](ac, y_src, f->cur.p.stride[0],
-                                           cbw4 - (furthest_r >> ss_hor),
-                                           cbh4 - (furthest_b >> ss_ver));
+                dsp->ipred.cfl_ac[f->cur.p.p.layout - 1](ac, y_src, f->cur.p.stride[0],
+                                                         cbw4 - (furthest_r >> ss_hor),
+                                                         cbh4 - (furthest_b >> ss_ver),
+                                                         cbw4 * 4, cbh4 * 4);
                 for (int pl = 0; pl < 2; pl++) {
                     if (!b->cfl_alpha[pl]) continue;
                     int angle = 0;
                     const pixel *top_sb_edge = NULL;
                     if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
                         top_sb_edge = f->ipred_edge[pl + 1];
                         const int sby = t->by >> f->sb_shift;
                         top_sb_edge += f->sb128w * 128 * (sby - 1);
@@ -974,20 +1032,25 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTi
                             bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
                                                               ypos, ypos > ystart,
                                                               ts->tiling.col_end >> ss_hor,
                                                               ts->tiling.row_end >> ss_ver,
                                                               edge_flags, dst, stride,
                                                               top_sb_edge, uv_mode,
                                                               &angle, uv_t_dim->w,
                                                               uv_t_dim->h, edge);
+                        angle |= intra_edge_filter_flag;
                         dsp->ipred.intra_pred[m](dst, stride, edge,
                                                  uv_t_dim->w * 4,
                                                  uv_t_dim->h * 4,
-                                                 angle | sm_uv_fl);
+                                                 angle | sm_uv_fl,
+                                                 (4 * f->bw + ss_hor -
+                                                  4 * (t->bx & ~ss_hor)) >> ss_hor,
+                                                 (4 * f->bh + ss_ver -
+                                                  4 * (t->by & ~ss_ver)) >> ss_ver);
                         if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
                             hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
                                      uv_t_dim->h * 4, 2, "l");
                             hex_dump(edge, 0, 1, 1, "tl");
                             hex_dump(edge + 1, uv_t_dim->w * 4,
                                      uv_t_dim->w * 4, 2, "t");
                             hex_dump(dst, stride, uv_t_dim->w * 4,
                                      uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
@@ -1011,47 +1074,58 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTi
                                 eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                                    &t->l.ccoef[pl][cby4 + y],
                                                    b->uvtx, bs, b, 1, 1 + pl, cf,
                                                    &txtp, &cf_ctx);
                                 if (DEBUG_BLOCK_INFO)
                                     printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                            "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
                                            pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
-                                memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                       imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                                memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                       imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                                rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                                memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                                case_set_upto16_with_default( \
+                                         imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                         l., 1, cby4 + y);
+                                case_set_upto16_with_default( \
+                                         imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                         a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
                             }
                             if (eob >= 0) {
                                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                     coef_dump(cf, uv_t_dim->h * 4,
                                               uv_t_dim->w * 4, 3, "dq");
                                 dsp->itx.itxfm_add[b->uvtx]
                                                   [txtp](dst, stride,
                                                          cf, eob);
                                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                     hex_dump(dst, stride, uv_t_dim->w * 4,
                                              uv_t_dim->h * 4, "recon");
                             }
                         } else if (!f->frame_thread.pass) {
-                            memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
-                            memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
+                            case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
+                            case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
+#undef set_ctx
                         }
                         dst += uv_t_dim->w * 4;
                     }
                     t->bx -= x << ss_hor;
                 }
                 t->by -= y << ss_ver;
             }
         }
     }
 }
 
-void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
+int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
                                  const Av1Block *const b)
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
     const Dav1dDSPContext *const dsp = f->dsp;
     const int bx4 = t->bx & 31, by4 = t->by & 31;
     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
@@ -1059,50 +1133,58 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
     const int bw4 = b_dim[0], bh4 = b_dim[1];
     const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
     const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                            (bw4 > ss_hor || t->bx & 1) &&
                            (bh4 > ss_ver || t->by & 1);
     const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
                                DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
+    int res;
 
     // prediction
     const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
     pixel *dst = ((pixel *) f->cur.p.data[0]) +
         4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
     const ptrdiff_t uvdstoff =
         4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
     if (!(f->frame_hdr.frame_type & 1)) {
         // intrabc
-        mc(t, dst, NULL, f->cur.p.stride[0],
-           bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
-        if (has_chroma) for (int pl = 1; pl < 3; pl++)
-            mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
-               bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-               t->bx & ~ss_hor, t->by & ~ss_ver,
-               pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
+        res = mc(t, dst, NULL, f->cur.p.stride[0],
+                 bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, -1, FILTER_2D_BILINEAR);
+        if (res) return res;
+        if (has_chroma) for (int pl = 1; pl < 3; pl++) {
+            res = mc(t, ((pixel *)f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
+                     bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+                     t->bx & ~ss_hor, t->by & ~ss_ver,
+                     pl, b->mv[0], &f->cur, -1, FILTER_2D_BILINEAR);
+            if (res) return res;
+        }
     } else if (b->comp_type == COMP_INTER_NONE) {
         const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
         const enum Filter2d filter_2d = b->filter2d;
 
         if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
             ((b->inter_mode == GLOBALMV &&
               f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
              (b->motion_mode == MM_WARP &&
               t->warpmv.type > WM_TYPE_TRANSLATION)))
         {
-            warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
-                        b->motion_mode == MM_WARP ? &t->warpmv :
-                            &f->frame_hdr.gmv[b->ref[0]]);
+            res = warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
+                              b->motion_mode == MM_WARP ? &t->warpmv :
+                                  &f->frame_hdr.gmv[b->ref[0]]);
+            if (res) return res;
         } else {
-            mc(t, dst, NULL, f->cur.p.stride[0],
-               bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
-            if (b->motion_mode == MM_OBMC)
-                obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
+            res = mc(t, dst, NULL, f->cur.p.stride[0],
+                     bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
+            if (res) return res;
+            if (b->motion_mode == MM_OBMC) {
+                res = obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
+                if (res) return res;
+            }
         }
         if (b->interintra_type) {
             ALIGN_STK_32(pixel, tl_edge_buf, 65,);
             pixel *const tl_edge = tl_edge_buf + 32;
             enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
                                    SMOOTH_PRED : b->interintra_mode;
             pixel *const tmp = t->scratch.interintra;
             int angle = 0;
@@ -1113,23 +1195,23 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
                 top_sb_edge += f->sb128w * 128 * (sby - 1);
             }
             m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
                                                   t->by, t->by > ts->tiling.row_start,
                                                   ts->tiling.col_end, ts->tiling.row_end,
                                                   0, dst, f->cur.p.stride[0], top_sb_edge,
                                                   m, &angle, bw4, bh4, tl_edge);
             dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
-                                     tl_edge, bw4 * 4, bh4 * 4, 0);
+                                     tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0);
             const uint8_t *const ii_mask =
                 b->interintra_type == INTER_INTRA_BLEND ?
                      dav1d_ii_masks[bs][0][b->interintra_mode] :
                      dav1d_wedge_masks[bs][0][0][b->wedge_idx];
-            dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
-                          bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
+            dsp->mc.blend(dst, f->cur.p.stride[0], tmp,
+                          bw4 * 4, bh4 * 4, ii_mask);
         }
 
         if (!has_chroma) goto skip_inter_chroma_pred;
 
         // sub8x8 derivation
         int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
         refmvs *r;
         if (is_sub8x8) {
@@ -1141,75 +1223,92 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
                 is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
         }
 
         // chroma prediction
         if (is_sub8x8) {
             assert(ss_hor == 1);
             int h_off = 0, v_off = 0;
             if (bw4 == 1 && bh4 == ss_ver) {
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
-                       r[-(f->b4_stride + 1)].mv[0],
-                       &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? t->tl_4x4_filter :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+                for (int pl = 0; pl < 2; pl++) {
+                    res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                             NULL, f->cur.p.stride[1],
+                             bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
+                             r[-(f->b4_stride + 1)].mv[0],
+                             &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
+                             r[-(f->b4_stride + 1)].ref[0] - 1,
+                             f->frame_thread.pass != 2 ? t->tl_4x4_filter :
+                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
+                    if (res) return res;
+                }
                 v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
                 h_off = 2;
             }
             if (bw4 == 1) {
                 const enum Filter2d left_filter_2d =
                     dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx - 1,
-                       t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? left_filter_2d :
-                           f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+                for (int pl = 0; pl < 2; pl++) {
+                    res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
+                             f->cur.p.stride[1], bw4, bh4, t->bx - 1,
+                             t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
+                             r[-1].ref[0] - 1,
+                             f->frame_thread.pass != 2 ? left_filter_2d :
+                                 f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
+                    if (res) return res;
+                }
                 h_off = 2;
             }
             if (bh4 == ss_ver) {
                 const enum Filter2d top_filter_2d =
                     dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
-                       1 + pl, r[-f->b4_stride].mv[0],
-                       &f->refp[r[-f->b4_stride].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? top_filter_2d :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+                for (int pl = 0; pl < 2; pl++) {
+                    res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
+                             f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
+                             1 + pl, r[-f->b4_stride].mv[0],
+                             &f->refp[r[-f->b4_stride].ref[0] - 1],
+                             r[-f->b4_stride].ref[0] - 1,
+                             f->frame_thread.pass != 2 ? top_filter_2d :
+                                 f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
+                    if (res) return res;
+                }
                 v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
             }
-            for (int pl = 0; pl < 2; pl++)
-                mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
-                   bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
+            for (int pl = 0; pl < 2; pl++) {
+                res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
+                         bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
+                         refp, b->ref[0], filter_2d);
+                if (res) return res;
+            }
         } else {
             if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
                 ((b->inter_mode == GLOBALMV &&
                   f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
                  (b->motion_mode == MM_WARP &&
                   t->warpmv.type > WM_TYPE_TRANSLATION)))
             {
-                for (int pl = 0; pl < 2; pl++)
-                    warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
-                                f->cur.p.stride[1], b_dim, 1 + pl, refp,
-                                b->motion_mode == MM_WARP ? &t->warpmv :
-                                    &f->frame_hdr.gmv[b->ref[0]]);
+                for (int pl = 0; pl < 2; pl++) {
+                    res = warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
+                                      f->cur.p.stride[1], b_dim, 1 + pl, refp,
+                                      b->motion_mode == MM_WARP ? &t->warpmv :
+                                          &f->frame_hdr.gmv[b->ref[0]]);
+                    if (res) return res;
+                }
             } else {
                 for (int pl = 0; pl < 2; pl++) {
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-                       t->bx & ~ss_hor, t->by & ~ss_ver,
-                       1 + pl, b->mv[0], refp, filter_2d);
-                    if (b->motion_mode == MM_OBMC)
-                        obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                             f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+                    res = mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                             NULL, f->cur.p.stride[1],
+                             bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
+                             t->bx & ~ss_hor, t->by & ~ss_ver,
+                             1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
+                    if (res) return res;
+                    if (b->motion_mode == MM_OBMC) {
+                        res = obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
+                                   f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
+                        if (res) return res;
+                    }
                 }
             }
             if (b->interintra_type) {
                 // FIXME for 8x32 with 4:2:2 subsampling, this probably does
                 // the wrong thing since it will select 4x16, not 4x32, as a
                 // transform size...
                 const uint8_t *const ii_mask =
                     b->interintra_type == INTER_INTRA_BLEND ?
@@ -1237,19 +1336,19 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
                                                           (t->by >> ss_ver) >
                                                               (ts->tiling.row_start >> ss_ver),
                                                           ts->tiling.col_end >> ss_hor,
                                                           ts->tiling.row_end >> ss_ver,
                                                           0, uvdst, f->cur.p.stride[1],
                                                           top_sb_edge, m,
                                                           &angle, cbw4, cbh4, tl_edge);
                     dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
-                                             tl_edge, cbw4 * 4, cbh4 * 4, 0);
-                    dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
-                                  cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
+                                             tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0);
+                    dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp,
+                                  cbw4 * 4, cbh4 * 4, ii_mask);
                 }
             }
         }
 
     skip_inter_chroma_pred: {}
         t->tl_4x4_filter = filter_2d;
     } else {
         const enum Filter2d filter_2d = b->filter2d;
@@ -1260,21 +1359,23 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
         const uint8_t *mask;
 
         for (int i = 0; i < 2; i++) {
             const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
 
             if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
                 f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
             {
-                warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
-                            &f->frame_hdr.gmv[b->ref[i]]);
+                res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
+                                  &f->frame_hdr.gmv[b->ref[i]]);
+                if (res) return res;
             } else {
-                mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
-                   b->mv[i], refp, filter_2d);
+                res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
+                         b->mv[i], refp, b->ref[i], filter_2d);
+                if (res) return res;
             }
         }
         switch (b->comp_type) {
         case COMP_INTER_AVG:
             dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
                         bw4 * 4, bh4 * 4);
             break;
         case COMP_INTER_WEIGHTED_AVG:
@@ -1301,21 +1402,23 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
         // chroma
         if (has_chroma) for (int pl = 0; pl < 2; pl++) {
             for (int i = 0; i < 2; i++) {
                 const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
                 if (b->inter_mode == GLOBALMV_GLOBALMV &&
                     imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
                     f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
                 {
-                    warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
-                                refp, &f->frame_hdr.gmv[b->ref[i]]);
+                    res = warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
+                                      refp, &f->frame_hdr.gmv[b->ref[i]]);
+                    if (res) return res;
                 } else {
-                    mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
-                       1 + pl, b->mv[i], refp, filter_2d);
+                    res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
+                             1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
+                    if (res) return res;
                 }
             }
             pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
             switch (b->comp_type) {
             case COMP_INTER_AVG:
                 dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
                 break;
@@ -1342,25 +1445,30 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
                      cbw4 * 4, cbh4 * 4, "v-pred");
         }
     }
 
     const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
 
     if (b->skip) {
         // reset coef contexts
-        memset(&t->a->lcoef[bx4], 0x40, w4);
-        memset(&t->l.lcoef[by4], 0x40, h4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+        rep_macro(type, t->dir lcoef, off, mul * 0x40)
+        case_set(bh4, l., 1, by4);
+        case_set(bw4, a->, 0, bx4);
+#undef set_ctx
         if (has_chroma) {
-            memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[0][cby4], 0x40, ch4);
-            memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[1][cby4], 0x40, ch4);
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+            rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
+            rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
+            case_set(cbh4, l., 1, cby4);
+            case_set(cbw4, a->, 0, cbx4);
+#undef set_ctx
         }
-        return;
+        return 0;
     }
 
     const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
     const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
 
     for (int init_y = 0; init_y < bh4; init_y += 16) {
         for (int init_x = 0; init_x < bw4; init_x += 16) {
             // coefficient coding & inverse transforms
@@ -1413,20 +1521,28 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
                             eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
                                                &t->l.ccoef[pl][cby4 + y],
                                                b->uvtx, bs, b, 0, 1 + pl,
                                                cf, &txtp, &cf_ctx);
                             if (DEBUG_BLOCK_INFO)
                                 printf("Post-uv-cf-blk[pl=%d,tx=%d,"
                                        "txtp=%d,eob=%d]: r=%d\n",
                                        pl, b->uvtx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                   imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                            memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                   imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
+#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
+                            rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
+#define default_memset(dir, diridx, off, sz) \
+                            memset(&t->dir ccoef[pl][off], cf_ctx, sz)
+                            case_set_upto16_with_default( \
+                                     imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
+                                     l., 1, cby4 + y);
+                            case_set_upto16_with_default( \
+                                     imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
+                                     a->, 0, cbx4 + x);
+#undef default_memset
+#undef set_ctx
                         }
                         if (eob >= 0) {
                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
                                 coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
                             dsp->itx.itxfm_add[b->uvtx]
                                               [txtp](&uvdst[4 * x],
                                                      f->cur.p.stride[1],
                                                      cf, eob);
@@ -1439,16 +1555,17 @@ void bytefn(dav1d_recon_b_inter)(Dav1dTi
                     uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
                     t->bx -= x << ss_hor;
                     t->by += uvtx->h << ss_ver;
                 }
                 t->by -= y << ss_ver;
             }
         }
     }
+    return 0;
 }
 
 void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
     const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     const int sbsz = f->sb_step, sbh = f->sbh;
 
     if (f->frame_hdr.loopfilter.level_y[0] ||
         f->frame_hdr.loopfilter.level_y[1])
--- a/third_party/dav1d/src/ref.c
+++ b/third_party/dav1d/src/ref.c
@@ -26,52 +26,61 @@
  */
 
 #include "config.h"
 
 #include "common/mem.h"
 
 #include "src/ref.h"
 
-static void default_free_callback(uint8_t *const data, void *const user_data) {
-    dav1d_free_aligned(data);
+static void default_free_callback(const uint8_t *const data, void *const user_data) {
+    assert(data == user_data);
+    dav1d_free_aligned(user_data);
 }
 
 Dav1dRef *dav1d_ref_create(const size_t size) {
     Dav1dRef *res;
     void *data = dav1d_alloc_aligned(size, 32);
     if (!data) {
         return NULL;
     }
 
-    res = dav1d_ref_wrap(data, default_free_callback, NULL);
+    res = dav1d_ref_wrap(data, default_free_callback, data);
     if (!res) {
         free(data);
     }
 
     return res;
 }
 
-Dav1dRef *dav1d_ref_wrap(uint8_t *const ptr,
-                         void (*free_callback)(uint8_t *data, void *user_data),
+Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
+                         void (*free_callback)(const uint8_t *data, void *user_data),
                          void *user_data)
 {
     Dav1dRef *res = malloc(sizeof(Dav1dRef));
     if (!res) return NULL;
 
-    res->data = ptr;
+    if (ptr == user_data)
+        res->data = user_data;
+    res->const_data = ptr;
     atomic_init(&res->ref_cnt, 1);
     res->free_callback = free_callback;
     res->user_data = user_data;
 
     return res;
 }
 
 void dav1d_ref_inc(Dav1dRef *const ref) {
     atomic_fetch_add(&ref->ref_cnt, 1);
 }
 
-void dav1d_ref_dec(Dav1dRef *const ref) {
+void dav1d_ref_dec(Dav1dRef **const pref) {
+    assert(pref != NULL);
+
+    Dav1dRef *const ref = *pref;
+    if (!ref) return;
+
     if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
-        ref->free_callback(ref->data, ref->user_data);
+        ref->free_callback(ref->const_data, ref->user_data);
         free(ref);
     }
+    *pref = NULL;
 }
--- a/third_party/dav1d/src/ref.h
+++ b/third_party/dav1d/src/ref.h
@@ -30,21 +30,22 @@
 
 #include "dav1d/dav1d.h"
 
 #include <stdatomic.h>
 #include <stddef.h>
 
 struct Dav1dRef {
     void *data;
+    const void *const_data;
     atomic_int ref_cnt;
-    void (*free_callback)(uint8_t *data, void *user_data);
+    void (*free_callback)(const uint8_t *data, void *user_data);
     void *user_data;
 };
 
 Dav1dRef *dav1d_ref_create(size_t size);
-Dav1dRef *dav1d_ref_wrap(uint8_t *ptr,
-                         void (*free_callback)(uint8_t *data, void *user_data),
+Dav1dRef *dav1d_ref_wrap(const uint8_t *ptr,
+                         void (*free_callback)(const uint8_t *data, void *user_data),
                          void *user_data);
 void dav1d_ref_inc(Dav1dRef *ref);
-void dav1d_ref_dec(Dav1dRef *ref);
+void dav1d_ref_dec(Dav1dRef **ref);
 
 #endif /* __DAV1D_SRC_REF_H__ */
--- a/third_party/dav1d/src/ref_mvs.c
+++ b/third_party/dav1d/src/ref_mvs.c
@@ -1949,20 +1949,19 @@ void av1_find_ref_mvs(CANDIDATE_MV *mvst
     }
 
     if (xd.n8_w > xd.n8_h)
         if (by4 & (xd.n8_w - 1)) xd.is_sec_rect = 1;
 
     MV_REFERENCE_FRAME rf[2] = { refidx_dav1d[0] + 1, refidx_dav1d[1] + 1 };
     const int refidx = av1_ref_frame_type(rf);
     int16_t single_context[MODE_CTX_REF_FRAMES];
-    uint8_t mv_cnt[MODE_CTX_REF_FRAMES] = { 0 };
+    uint8_t mv_cnt[MODE_CTX_REF_FRAMES];
     CANDIDATE_MV mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
-    memset(mv_stack, 0, sizeof(mv_stack));
-    int_mv mv_list[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
+    int_mv mv_list[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
     int_mv gmvs[MODE_CTX_REF_FRAMES];
     av1_find_mv_refs(cm, &xd, xd.mi, refidx, mv_cnt,
                      mv_stack, mv_list, gmvs, by4, bx4,
                      single_context);
     for (int i = 0; i < mv_cnt[refidx]; i++)
         mvstack[i] = mv_stack[refidx][i];
     *cnt = mv_cnt[refidx];
 
@@ -1974,81 +1973,16 @@ void av1_find_ref_mvs(CANDIDATE_MV *mvst
     }
 
     if (ctx) {
         if (refidx_dav1d[1] == -1)
             *ctx = single_context[refidx_dav1d[0] + 1];
         else
             *ctx = av1_mode_context_analyzer(single_context, rf);
     }
-
-    if (0 && bx4 == 38 && by4 == 15 && cm->frame_offset == 3 &&
-        refidx_dav1d[1] == -1 && refidx_dav1d[0] == 4 &&
-        bw4 == 1 && bh4 == 1 && bp == 3)
-    {
-        MV_REF *l = bx4 ? &cm->cur_frame.mvs[by4*stride+bx4-1] : NULL;
-        MV_REF *a = by4 ? &cm->cur_frame.mvs[by4*stride+bx4-stride] : NULL;
-        printf("Input: left=[0]y:%d,x:%d,r:%d,[1]y:%d,x:%d,r:%d,mode=%d, "
-               "above=[0]y:%d,x:%d,r:%d,[1]y:%d,x:%d,r:%d,mode=%d, "
-               "temp=y:%d,x:%d,r:%d [use_ref=%d]\n",
-               l ? l->mv[0].as_mv.row : -1,
-               l ? l->mv[0].as_mv.col : -1,
-               l ? l->ref_frame[0]: -1,
-               l ? l->mv[1].as_mv.row : -1,
-               l ? l->mv[1].as_mv.col : -1,
-               l ? l->ref_frame[1]: -1,
-               l ? l->mode : -1,
-               a ? a->mv[0].as_mv.row: -1,
-               a ? a->mv[0].as_mv.col : -1,
-               a ? a->ref_frame[0] : -1,
-               a ? a->mv[1].as_mv.row: -1,
-               a ? a->mv[1].as_mv.col : -1,
-               a ? a->ref_frame[1] : -1,
-               a ? a->mode : -1,
-               cm->tpl_mvs[(by4 >> 1) * (cm->mi_stride >> 1) + (bx4 >> 1)].mfmv0.as_mv.row,
-               cm->tpl_mvs[(by4 >> 1) * (cm->mi_stride >> 1) + (bx4 >> 1)].mfmv0.as_mv.col,
-               cm->tpl_mvs[(by4 >> 1) * (cm->mi_stride >> 1) +
-                           (bx4 >> 1)].ref_frame_offset,
-               cm->allow_ref_frame_mvs);
-        printf("Edges: l=%d,t=%d,r=%d,b=%d,w=%d,h=%d,border=%d\n",
-               xd.mb_to_left_edge,
-               xd.mb_to_top_edge,
-               xd.mb_to_right_edge,
-               xd.mb_to_bottom_edge,
-               xd.n8_w << MI_SIZE_LOG2,
-               xd.n8_h << MI_SIZE_LOG2,
-               MV_BORDER);
-        printf("bp=%d, x=%d, y=%d, refs=%d/%d, n_mvs: %d, "
-               "first mv: y=%d,x=%d | y=%d,x=%d, "
-               "first comp mv: y=%d,x=%d | y=%d,x=%d, "
-               "second mv: y=%d, x=%d | y=%d, x=%d, "
-               "second comp mv: y=%d, x=%d | y=%d, x=%d, "
-               "third mv: y=%d, x=%d, "
-               "ctx=%d\n",
-               bp, bx4, by4, refidx_dav1d[0], refidx_dav1d[1], mv_cnt[refidx],
-               mv_stack[refidx][0].this_mv.as_mv.row,
-               mv_stack[refidx][0].this_mv.as_mv.col,
-               mv_list[refidx_dav1d[0] + 1][0].as_mv.row,
-               mv_list[refidx_dav1d[0] + 1][0].as_mv.col,
-               mv_stack[refidx][0].comp_mv.as_mv.row,
-               mv_stack[refidx][0].comp_mv.as_mv.col,
-               mv_list[refidx_dav1d[1] + 1][0].as_mv.row,
-               mv_list[refidx_dav1d[1] + 1][0].as_mv.col,
-               mv_stack[refidx][1].this_mv.as_mv.row,
-               mv_stack[refidx][1].this_mv.as_mv.col,
-               mv_list[refidx_dav1d[0] + 1][1].as_mv.row,
-               mv_list[refidx_dav1d[0] + 1][1].as_mv.col,
-               mv_stack[refidx][1].comp_mv.as_mv.row,
-               mv_stack[refidx][1].comp_mv.as_mv.col,
-               mv_list[refidx_dav1d[1] + 1][1].as_mv.row,
-               mv_list[refidx_dav1d[1] + 1][1].as_mv.col,
-               mv_stack[refidx][2].this_mv.as_mv.row,
-               mv_stack[refidx][2].this_mv.as_mv.col,
-               *ctx);
-    }
 }
 
 int av1_init_ref_mv_common(AV1_COMMON *cm,
                            const int w8, const int h8,
                            const ptrdiff_t stride,
                            const int allow_sb128,
                            MV_REF *cur,
                            MV_REF *ref_mvs[7],
@@ -2115,17 +2049,19 @@ int av1_init_ref_mv_common(AV1_COMMON *c
             cm->buffer_pool.frame_bufs[i].ref_frame_offset[j] =
                 ref_ref_poc[i][j];
     }
     av1_setup_frame_buf_refs(cm);
     for (int i = 0; i < 7; i++) {
         const int ref_poc = cm->buffer_pool.frame_bufs[i].cur_frame_offset;
         cm->ref_frame_sign_bias[1 + i] = get_relative_dist(cm, ref_poc, cur_poc) > 0;
     }
-    av1_setup_motion_field(cm);
+    if (allow_ref_frame_mvs) {
+        av1_setup_motion_field(cm);
+    }
 
     return 0;
 }
 
 void av1_init_ref_mv_tile_row(AV1_COMMON *cm,
                               int tile_col_start4, int tile_col_end4,
                               int row_start4, int row_end4);
 void av1_init_ref_mv_tile_row(AV1_COMMON *cm,
--- a/third_party/dav1d/src/ref_mvs.h
+++ b/third_party/dav1d/src/ref_mvs.h
@@ -173,23 +173,9 @@ static inline void fix_mv_precision(cons
         }
         if (mv->y & 1) {
             if (mv->y < 0) mv->y++;
             else           mv->y--;
         }
     }
 }
 
-static inline mv av1_clamp_mv(const mv mv,
-                              const int bx4, const int by4,
-                              const int bw4, const int bh4,
-                              const int iw4, const int ih4)
-{
-    const int left = -(bx4 + bw4 + 4) * 4 * 8;
-    const int right = (iw4 - bx4 + 0 * bw4 + 4) * 4 * 8;
-    const int top = -(by4 + bh4 + 4) * 4 * 8;
-    const int bottom = (ih4 - by4 + 0 * bh4 + 4) * 4 * 8;
-
-    return (struct mv) { .x = iclip(mv.x, left, right),
-                         .y = iclip(mv.y, top, bottom) };
-}
-
 #endif /* __DAV1D_SRC_REF_MVS_H__ */
--- a/third_party/dav1d/src/tables.c
+++ b/third_party/dav1d/src/tables.c
@@ -441,17 +441,17 @@ const uint8_t /* enum FilterMode */ dav1
     [FILTER_2D_8TAP_REGULAR_SMOOTH] = { FILTER_8TAP_SMOOTH,  FILTER_8TAP_REGULAR },
     [FILTER_2D_8TAP_REGULAR_SHARP]  = { FILTER_8TAP_SHARP,   FILTER_8TAP_REGULAR },
     [FILTER_2D_8TAP_SHARP_REGULAR]  = { FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP   },
     [FILTER_2D_8TAP_SHARP_SMOOTH]   = { FILTER_8TAP_SMOOTH,  FILTER_8TAP_SHARP   },
     [FILTER_2D_8TAP_SHARP]          = { FILTER_8TAP_SHARP,   FILTER_8TAP_SHARP   },
     [FILTER_2D_8TAP_SMOOTH_REGULAR] = { FILTER_8TAP_REGULAR, FILTER_8TAP_SMOOTH  },
     [FILTER_2D_8TAP_SMOOTH]         = { FILTER_8TAP_SMOOTH,  FILTER_8TAP_SMOOTH  },
     [FILTER_2D_8TAP_SMOOTH_SHARP]   = { FILTER_8TAP_SHARP,   FILTER_8TAP_SMOOTH  },
-    [FILTER_2D_BILINEAR]            = { FILTER_2D_BILINEAR,  FILTER_2D_BILINEAR  },
+    [FILTER_2D_BILINEAR]            = { FILTER_BILINEAR,     FILTER_BILINEAR     },
 };
 
 const uint8_t dav1d_filter_mode_to_y_mode[5] = {
     DC_PRED, VERT_PRED, HOR_PRED, HOR_DOWN_PRED, DC_PRED
 };
 
 const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES] = {
     [DC_PRED]              = 0,
@@ -497,17 +497,17 @@ const int16_t dav1d_sgr_params[16][4] = 
     { 2, 1, 140, 3236 }, { 2, 1, 112, 2158 }, { 2, 1, 93, 1618 },
     { 2, 1,  80, 1438 }, { 2, 1,  70, 1295 }, { 2, 1, 58, 1177 },
     { 2, 1,  47, 1079 }, { 2, 1,  37,  996 }, { 2, 1, 30,  925 },
     { 2, 1,  25,  863 }, { 0, 1,  -1, 2589 }, { 0, 1, -1, 1618 },
     { 0, 1,  -1, 1177 }, { 0, 1,  -1,  925 }, { 2, 0, 56,   -1 },
     { 2, 0,  22,   -1 },
 };
 
-const int16_t dav1d_sgr_x_by_xplus1[256] = {
+const int dav1d_sgr_x_by_xplus1[256] = {
   1,   128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
   240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
   248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
   250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
   252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
   253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
   253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
   254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
@@ -518,21 +518,16 @@ const int16_t dav1d_sgr_x_by_xplus1[256]
   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
   255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
   256,
 };
 
-const int16_t dav1d_sgr_one_by_x[25] = {
-  4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
-  293,  273,  256,  241,  228, 216, 205, 195, 186, 178, 171, 164,
-};
-
 const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
     [FILTER_8TAP_REGULAR] = {
         {   0,   1,  -3,  63,   4,  -1,   0,   0 },
         {   0,   1,  -5,  61,   9,  -2,   0,   0 },
         {   0,   1,  -6,  58,  14,  -4,   1,   0 },
         {   0,   1,  -7,  55,  19,  -5,   1,   0 },
         {   0,   1,  -7,  51,  24,  -6,   1,   0 },
         {   0,   1,  -8,  47,  29,  -6,   1,   0 },
@@ -608,121 +603,118 @@ const int8_t ALIGN(dav1d_mc_subpel_filte
         {   0,   0,   4,  22,  29,   9,   0,   0 },
         {   0,   0,   3,  21,  30,  10,   0,   0 },
         {   0,   0,   2,  20,  31,  11,   0,   0 },
         {   0,   0,   2,  18,  31,  13,   0,   0 },
         {   0,   0,   1,  17,  31,  15,   0,   0 }
     }
 };
 
-const int8_t dav1d_mc_warp_filter[][8] = {
-    // [-1, 0)
-    { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
-    { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
-    { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
-    { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
-    { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
-    { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
-    { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
-    { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
-    { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
-    { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
-    { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
-    { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
-    { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
-    { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
-    { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
-    { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
-    { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
-    { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
-    { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
-    { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
-    { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
-    { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
-    { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
-    { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
-    { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
-    { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
-    { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
-    { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
-    { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
-    { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
-    { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
-    { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },
-
+const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = {
+   // [-1, 0)
+    { 0, 127,   0, 0,   0,   1, 0, 0 }, { 0, 127,   0, 0,  -1,   2, 0, 0 },
+    { 1, 127,  -1, 0,  -3,   4, 0, 0 }, { 1, 126,  -2, 0,  -4,   6, 1, 0 },
+    { 1, 126,  -3, 0,  -5,   8, 1, 0 }, { 1, 125,  -4, 0,  -6,  11, 1, 0 },
+    { 1, 124,  -4, 0,  -7,  13, 1, 0 }, { 2, 123,  -5, 0,  -8,  15, 1, 0 },
+    { 2, 122,  -6, 0,  -9,  18, 1, 0 }, { 2, 121,  -6, 0, -10,  20, 1, 0 },
+    { 2, 120,  -7, 0, -11,  22, 2, 0 }, { 2, 119,  -8, 0, -12,  25, 2, 0 },
+    { 3, 117,  -8, 0, -13,  27, 2, 0 }, { 3, 116,  -9, 0, -13,  29, 2, 0 },
+    { 3, 114, -10, 0, -14,  32, 3, 0 }, { 3, 113, -10, 0, -15,  35, 2, 0 },
+    { 3, 111, -11, 0, -15,  37, 3, 0 }, { 3, 109, -11, 0, -16,  40, 3, 0 },
+    { 3, 108, -12, 0, -16,  42, 3, 0 }, { 4, 106, -13, 0, -17,  45, 3, 0 },
+    { 4, 104, -13, 0, -17,  47, 3, 0 }, { 4, 102, -14, 0, -17,  50, 3, 0 },
+    { 4, 100, -14, 0, -17,  52, 3, 0 }, { 4,  98, -15, 0, -18,  55, 4, 0 },
+    { 4,  96, -15, 0, -18,  58, 3, 0 }, { 4,  94, -16, 0, -18,  60, 4, 0 },
+    { 4,  91, -16, 0, -18,  63, 4, 0 }, { 4,  89, -16, 0, -18,  65, 4, 0 },
+    { 4,  87, -17, 0, -18,  68, 4, 0 }, { 4,  85, -17, 0, -18,  70, 4, 0 },
+    { 4,  82, -17, 0, -18,  73, 4, 0 }, { 4,  80, -17, 0, -18,  75, 4, 0 },
+    { 4,  78, -18, 0, -18,  78, 4, 0 }, { 4,  75, -18, 0, -17,  80, 4, 0 },
+    { 4,  73, -18, 0, -17,  82, 4, 0 }, { 4,  70, -18, 0, -17,  85, 4, 0 },
+    { 4,  68, -18, 0, -17,  87, 4, 0 }, { 4,  65, -18, 0, -16,  89, 4, 0 },
+    { 4,  63, -18, 0, -16,  91, 4, 0 }, { 4,  60, -18, 0, -16,  94, 4, 0 },
+    { 3,  58, -18, 0, -15,  96, 4, 0 }, { 4,  55, -18, 0, -15,  98, 4, 0 },
+    { 3,  52, -17, 0, -14, 100, 4, 0 }, { 3,  50, -17, 0, -14, 102, 4, 0 },
+    { 3,  47, -17, 0, -13, 104, 4, 0 }, { 3,  45, -17, 0, -13, 106, 4, 0 },
+    { 3,  42, -16, 0, -12, 108, 3, 0 }, { 3,  40, -16, 0, -11, 109, 3, 0 },
+    { 3,  37, -15, 0, -11, 111, 3, 0 }, { 2,  35, -15, 0, -10, 113, 3, 0 },
+    { 3,  32, -14, 0, -10, 114, 3, 0 }, { 2,  29, -13, 0,  -9, 116, 3, 0 },
+    { 2,  27, -13, 0,  -8, 117, 3, 0 }, { 2,  25, -12, 0,  -8, 119, 2, 0 },
+    { 2,  22, -11, 0,  -7, 120, 2, 0 }, { 1,  20, -10, 0,  -6, 121, 2, 0 },
+    { 1,  18,  -9, 0,  -6, 122, 2, 0 }, { 1,  15,  -8, 0,  -5, 123, 2, 0 },
+    { 1,  13,  -7, 0,  -4, 124, 1, 0 }, { 1,  11,  -6, 0,  -4, 125, 1, 0 },
+    { 1,   8,  -5, 0,  -3, 126, 1, 0 }, { 1,   6,  -4, 0,  -2, 126, 1, 0 },
+    { 0,   4,  -3, 0,  -1, 127, 1, 0 }, { 0,   2,  -1, 0,   0, 127, 0, 0 },
     // [0, 1)
-    { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
-    { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
-    { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
-    {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
-    {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
-    {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
-    {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
-    {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
-    {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
-    {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
-    {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
-    {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
-    {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
-    {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
-    {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
-    {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
-    {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
-    {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
-    {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
-    {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
-    {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
-    {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
-    {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
-    {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
-    {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
-    {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
-    {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
-    {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
-    {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
-    {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
-    { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
-    { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},
-
+    {  0,   0,   1, 0, 0, 127,   0,  0 }, {  0,  -1,   2, 0, 0, 127,   0,  0 },
+    {  0,  -3,   4, 1, 1, 127,  -2,  0 }, {  0,  -5,   6, 1, 1, 127,  -2,  0 },
+    {  0,  -6,   8, 1, 2, 126,  -3,  0 }, { -1,  -7,  11, 2, 2, 126,  -4, -1 },
+    { -1,  -8,  13, 2, 3, 125,  -5, -1 }, { -1, -10,  16, 3, 3, 124,  -6, -1 },
+    { -1, -11,  18, 3, 4, 123,  -7, -1 }, { -1, -12,  20, 3, 4, 122,  -7, -1 },
+    { -1, -13,  23, 3, 4, 121,  -8, -1 }, { -2, -14,  25, 4, 5, 120,  -9, -1 },
+    { -1, -15,  27, 4, 5, 119, -10, -1 }, { -1, -16,  30, 4, 5, 118, -11, -1 },
+    { -2, -17,  33, 5, 6, 116, -12, -1 }, { -2, -17,  35, 5, 6, 114, -12, -1 },
+    { -2, -18,  38, 5, 6, 113, -13, -1 }, { -2, -19,  41, 6, 7, 111, -14, -2 },
+    { -2, -19,  43, 6, 7, 110, -15, -2 }, { -2, -20,  46, 6, 7, 108, -15, -2 },
+    { -2, -20,  49, 6, 7, 106, -16, -2 }, { -2, -21,  51, 7, 7, 104, -16, -2 },
+    { -2, -21,  54, 7, 7, 102, -17, -2 }, { -2, -21,  56, 7, 8, 100, -18, -2 },
+    { -2, -22,  59, 7, 8,  98, -18, -2 }, { -2, -22,  62, 7, 8,  96, -19, -2 },
+    { -2, -22,  64, 7, 8,  94, -19, -2 }, { -2, -22,  67, 8, 8,  91, -20, -2 },
+    { -2, -22,  69, 8, 8,  89, -20, -2 }, { -2, -22,  72, 8, 8,  87, -21, -2 },
+    { -2, -21,  74, 8, 8,  84, -21, -2 }, { -2, -22,  77, 8, 8,  82, -21, -2 },
+    { -2, -21,  79, 8, 8,  79, -21, -2 }, { -2, -21,  82, 8, 8,  77, -22, -2 },
+    { -2, -21,  84, 8, 8,  74, -21, -2 }, { -2, -21,  87, 8, 8,  72, -22, -2 },
+    { -2, -20,  89, 8, 8,  69, -22, -2 }, { -2, -20,  91, 8, 8,  67, -22, -2 },
+    { -2, -19,  94, 8, 7,  64, -22, -2 }, { -2, -19,  96, 8, 7,  62, -22, -2 },
+    { -2, -18,  98, 8, 7,  59, -22, -2 }, { -2, -18, 100, 8, 7,  56, -21, -2 },
+    { -2, -17, 102, 7, 7,  54, -21, -2 }, { -2, -16, 104, 7, 7,  51, -21, -2 },
+    { -2, -16, 106, 7, 6,  49, -20, -2 }, { -2, -15, 108, 7, 6,  46, -20, -2 },
+    { -2, -15, 110, 7, 6,  43, -19, -2 }, { -2, -14, 111, 7, 6,  41, -19, -2 },
+    { -1, -13, 113, 6, 5,  38, -18, -2 }, { -1, -12, 114, 6, 5,  35, -17, -2 },
+    { -1, -12, 116, 6, 5,  33, -17, -2 }, { -1, -11, 118, 5, 4,  30, -16, -1 },
+    { -1, -10, 119, 5, 4,  27, -15, -1 }, { -1,  -9, 120, 5, 4,  25, -14, -2 },
+    { -1,  -8, 121, 4, 3,  23, -13, -1 }, { -1,  -7, 122, 4, 3,  20, -12, -1 },
+    { -1,  -7, 123, 4, 3,  18, -11, -1 }, { -1,  -6, 124, 3, 3,  16, -10, -1 },
+    { -1,  -5, 125, 3, 2,  13,  -8, -1 }, { -1,  -4, 126, 2, 2,  11,  -7, -1 },
+    {  0,  -3, 126, 2, 1,   8,  -6,  0 }, {  0,  -2, 127, 1, 1,   6,  -5,  0 },
+    {  0,  -2, 127, 1, 1,   4,  -3,  0 }, {  0,   0, 127, 0, 0,   2,  -1,  0 },
     // [1, 2)
-    { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
-    { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
-    { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
-    { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
-    { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
-    { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
-    { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
-    { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
-    { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
-    { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
-    { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
-    { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
-    { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
-    { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
-    { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
-    { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
-    { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
-    { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
-    { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
-    { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
-    { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
-    { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
-    { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
-    { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
-    { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
-    { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
-    { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
-    { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
-    { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
-    { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
-    { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
-    { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
-
+    { 0, 0, 127,   0, 0,   1,   0, 0 }, { 0, 0, 127,   0, 0,  -1,   2, 0 },
+    { 0, 1, 127,  -1, 0,  -3,   4, 0 }, { 0, 1, 126,  -2, 0,  -4,   6, 1 },
+    { 0, 1, 126,  -3, 0,  -5,   8, 1 }, { 0, 1, 125,  -4, 0,  -6,  11, 1 },
+    { 0, 1, 124,  -4, 0,  -7,  13, 1 }, { 0, 2, 123,  -5, 0,  -8,  15, 1 },
+    { 0, 2, 122,  -6, 0,  -9,  18, 1 }, { 0, 2, 121,  -6, 0, -10,  20, 1 },
+    { 0, 2, 120,  -7, 0, -11,  22, 2 }, { 0, 2, 119,  -8, 0, -12,  25, 2 },
+    { 0, 3, 117,  -8, 0, -13,  27, 2 }, { 0, 3, 116,  -9, 0, -13,  29, 2 },
+    { 0, 3, 114, -10, 0, -14,  32, 3 }, { 0, 3, 113, -10, 0, -15,  35, 2 },
+    { 0, 3, 111, -11, 0, -15,  37, 3 }, { 0, 3, 109, -11, 0, -16,  40, 3 },
+    { 0, 3, 108, -12, 0, -16,  42, 3 }, { 0, 4, 106, -13, 0, -17,  45, 3 },
+    { 0, 4, 104, -13, 0, -17,  47, 3 }, { 0, 4, 102, -14, 0, -17,  50, 3 },
+    { 0, 4, 100, -14, 0, -17,  52, 3 }, { 0, 4,  98, -15, 0, -18,  55, 4 },
+    { 0, 4,  96, -15, 0, -18,  58, 3 }, { 0, 4,  94, -16, 0, -18,  60, 4 },
+    { 0, 4,  91, -16, 0, -18,  63, 4 }, { 0, 4,  89, -16, 0, -18,  65, 4 },
+    { 0, 4,  87, -17, 0, -18,  68, 4 }, { 0, 4,  85, -17, 0, -18,  70, 4 },
+    { 0, 4,  82, -17, 0, -18,  73, 4 }, { 0, 4,  80, -17, 0, -18,  75, 4 },
+    { 0, 4,  78, -18, 0, -18,  78, 4 }, { 0, 4,  75, -18, 0, -17,  80, 4 },
+    { 0, 4,  73, -18, 0, -17,  82, 4 }, { 0, 4,  70, -18, 0, -17,  85, 4 },
+    { 0, 4,  68, -18, 0, -17,  87, 4 }, { 0, 4,  65, -18, 0, -16,  89, 4 },
+    { 0, 4,  63, -18, 0, -16,  91, 4 }, { 0, 4,  60, -18, 0, -16,  94, 4 },
+    { 0, 3,  58, -18, 0, -15,  96, 4 }, { 0, 4,  55, -18, 0, -15,  98, 4 },
+    { 0, 3,  52, -17, 0, -14, 100, 4 }, { 0, 3,  50, -17, 0, -14, 102, 4 },
+    { 0, 3,  47, -17, 0, -13, 104, 4 }, { 0, 3,  45, -17, 0, -13, 106, 4 },
+    { 0, 3,  42, -16, 0, -12, 108, 3 }, { 0, 3,  40, -16, 0, -11, 109, 3 },
+    { 0, 3,  37, -15, 0, -11, 111, 3 }, { 0, 2,  35, -15, 0, -10, 113, 3 },
+    { 0, 3,  32, -14, 0, -10, 114, 3 }, { 0, 2,  29, -13, 0,  -9, 116, 3 },
+    { 0, 2,  27, -13, 0,  -8, 117, 3 }, { 0, 2,  25, -12, 0,  -8, 119, 2 },
+    { 0, 2,  22, -11, 0,  -7, 120, 2 }, { 0, 1,  20, -10, 0,  -6, 121, 2 },
+    { 0, 1,  18,  -9, 0,  -6, 122, 2 }, { 0, 1,  15,  -8, 0,  -5, 123, 2 },
+    { 0, 1,  13,  -7, 0,  -4, 124, 1 }, { 0, 1,  11,  -6, 0,  -4, 125, 1 },
+    { 0, 1,   8,  -5, 0,  -3, 126, 1 }, { 0, 1,   6,  -4, 0,  -2, 126, 1 },
+    { 0, 0,   4,  -3, 0,  -1, 127, 1 }, { 0, 0,   2,  -1, 0,   0, 127, 0 },
     // dummy (replicate row index 191)
-    { 0, 0, 0,   0,   2, 127, - 1, 0 },
+    { 0, 0,   2,  -1, 0,   0, 127, 0 }
 };
 
 const uint8_t dav1d_sm_weights[128] = {
     // Unused, because we always offset by bs, which is at least 2.
       0,   0,
     // bs = 2
     255, 128,
     // bs = 4
@@ -824,8 +816,24 @@ const int8_t ALIGN(dav1d_filter_intra_ta
           0,   0,  14,   0,   0,  14,   0,   0,
           0,   0,  12,   0,   0,  12,   0,   1,
           0,  14,   0,  12,   0,  11,  14,  10,
           0,   0,   0,   0,   0,   1,  12,   1,
           0,   0,   0,   0,   0,   0,   0,   0,
          14,   0,  12,   0,  11,   0,   9,   0
     }
 };
+
+const uint8_t ALIGN(dav1d_obmc_masks[64], 32) = {
+    /* Unused */
+     0,  0,
+    /* 2 */
+    19,  0,
+    /* 4 */
+    25, 14,  5,  0,
+    /* 8 */
+    28, 22, 16, 11,  7,  3,  0,  0,
+    /* 16 */
+    30, 27, 24, 21, 18, 15, 12, 10,  8,  6,  4,  3,  0,  0,  0,  0,
+    /* 32 */
+    31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11,  9,
+     8,  7,  6,  5,  4,  4,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,
+};
--- a/third_party/dav1d/src/tables.h
+++ b/third_party/dav1d/src/tables.h
@@ -102,19 +102,20 @@ static const unsigned interintra_allowed
     (1 << BS_16x16) |
     (1 << BS_16x8) |
     (1 << BS_8x16) |
     (1 << BS_8x8);
 
 extern const WarpedMotionParams dav1d_default_wm_params;
 
 extern const int16_t dav1d_sgr_params[16][4];
-extern const int16_t dav1d_sgr_x_by_xplus1[256];
-extern const int16_t dav1d_sgr_one_by_x[25];
+extern const int dav1d_sgr_x_by_xplus1[256];
 
 extern const int8_t dav1d_mc_subpel_filters[5][15][8];
-extern const int8_t dav1d_mc_warp_filter[][8];
+extern const int8_t dav1d_mc_warp_filter[193][8];
 
 extern const uint8_t dav1d_sm_weights[128];
 extern const int16_t dav1d_dr_intra_derivative[90];
 extern const int8_t dav1d_filter_intra_taps[5][64];
 
+extern const uint8_t dav1d_obmc_masks[64];
+
 #endif /* __DAV1D_SRC_TABLES_H__ */
--- a/third_party/dav1d/src/thread_task.c
+++ b/third_party/dav1d/src/thread_task.c
@@ -66,66 +66,72 @@ void *dav1d_tile_task(void *const data) 
         while (!fttd->tasks_left && !t->tile_thread.die) {
             if (!did_signal) {
                 did_signal = 1;
                 pthread_cond_signal(&fttd->icond);
             }
             pthread_cond_wait(&fttd->cond, &fttd->lock);
         }
         if (t->tile_thread.die) {
+            pthread_cond_signal(&fttd->icond);
             pthread_mutex_unlock(&fttd->lock);
             break;
         }
         fttd->available &= ~mask;
         const int task_idx = fttd->num_tasks - fttd->tasks_left--;
         pthread_mutex_unlock(&fttd->lock);
 
         if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr.tiling.cols) {
             // we can (or in fact, if >, we need to) do full tile decoding.
             // loopfilter happens in the main thread
             Dav1dTileState *const ts = t->ts = &f->ts[task_idx];
             for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end;
                  t->by += f->sb_step)
             {
-                dav1d_decode_tile_sbrow(t);
+                int error = dav1d_decode_tile_sbrow(t);
+                int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift);
 
                 // signal progress
                 pthread_mutex_lock(&ts->tile_thread.lock);
-                atomic_store(&ts->progress, 1 + (t->by >> f->sb_shift));
+                atomic_store(&ts->progress, progress);
                 pthread_cond_signal(&ts->tile_thread.cond);
                 pthread_mutex_unlock(&ts->tile_thread.lock);
+                if (error) break;
             }
         } else {
             const int sby = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0];
             const int tile_idx = f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1];
             Dav1dTileState *const ts = &f->ts[tile_idx];
+            int progress;
 
             // the interleaved decoding can sometimes cause dependency issues
             // if one part of the frame decodes signifcantly faster than others.
             // Ideally, we'd "skip" tile_sbrows where dependencies are missing,
             // and resume them later as dependencies are met. This also would
             // solve the broadcast() below and allow us to use signal(). However,
             // for now, we use linear dependency tracking because it's simpler.
-            if (atomic_load(&ts->progress) < sby) {
+            if ((progress = atomic_load(&ts->progress)) < sby) {
                 pthread_mutex_lock(&ts->tile_thread.lock);
-                while (atomic_load(&ts->progress) < sby)
+                while ((progress = atomic_load(&ts->progress)) < sby)
                     pthread_cond_wait(&ts->tile_thread.cond,
                                       &ts->tile_thread.lock);
                 pthread_mutex_unlock(&ts->tile_thread.lock);
             }
+            if (progress == TILE_ERROR) continue;
 
             // we need to interleave sbrow decoding for all tile cols in a
             // tile row, since otherwise subsequent threads will be blocked
             // waiting for the post-filter to complete
             t->ts = ts;
             t->by = sby << f->sb_shift;
-            dav1d_decode_tile_sbrow(t);
+            int error = dav1d_decode_tile_sbrow(t);
+            progress = error ? TILE_ERROR : 1 + sby;
 
             // signal progress
             pthread_mutex_lock(&ts->tile_thread.lock);
-            atomic_store(&ts->progress, 1 + sby);
+            atomic_store(&ts->progress, progress);
             pthread_cond_broadcast(&ts->tile_thread.cond);
             pthread_mutex_unlock(&ts->tile_thread.lock);
         }
     }
 
     return NULL;
 }
--- a/third_party/dav1d/src/thread_task.h
+++ b/third_party/dav1d/src/thread_task.h
@@ -23,17 +23,22 @@
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_SRC_THREAD_TASK_H__
 #define __DAV1D_SRC_THREAD_TASK_H__
 
+#include <limits.h>
+
 #include "src/internal.h"
 
+#define FRAME_ERROR (UINT_MAX - 1) /* NOTE(review): not used in thread_task.c's tile task; presumably a frame-level progress error sentinel - confirm */
+#define TILE_ERROR (INT_MAX - 1) /* stored in a tile state's progress when dav1d_decode_tile_sbrow() returns non-zero; tile tasks that read it skip their sbrow */
+
 int dav1d_decode_frame(Dav1dFrameContext *f);
 void *dav1d_frame_task(void *data);
 
 int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
 void *dav1d_tile_task(void *data);
 
 #endif /* __DAV1D_SRC_THREAD_TASK_H__ */
--- a/third_party/dav1d/src/warpmv.c
+++ b/third_party/dav1d/src/warpmv.c
@@ -62,20 +62,16 @@ static const uint16_t div_lut[257] = {
 };
 
 static inline int iclip_wmp(const int v) {
     const int cv = iclip(v, INT16_MIN, INT16_MAX);
 
     return apply_sign((abs(cv) + 32) >> 6, cv) * (1 << 6);
 }
 
-static inline int apply_sign64(const int v, const int64_t s) {
-    return s < 0 ? -v : v;
-}
-
 static inline int resolve_divisor_32(const unsigned d, int *const shift) {
     *shift = ulog2(d);
     const int e = d - (1 << *shift);
     const int f = *shift > 8 ? (e + (1 << (*shift - 9))) >> (*shift - 8) :
                                e << (8 - *shift);
     assert(f <= 256);
     *shift += 14;
     // Use f as lookup into the precomputed table of multipliers
--- a/third_party/dav1d/src/wedge.c
+++ b/third_party/dav1d/src/wedge.c
@@ -267,26 +267,28 @@ void dav1d_init_wedge_masks(void) {
     fill(16, 16,  8x16,  8x8,  heqw, 0x7bfb);
     fill(16,  8,  8x8,   8x4,  hltw, 0x7beb);
     fill( 8, 32,  4x32,  4x16, hgtw, 0x7aeb);
     fill( 8, 16,  4x16,  4x8,  hgtw, 0x7beb);
     fill( 8,  8,  4x8,   4x4,  heqw, 0x7bfb);
 #undef fill
 }
 
-static uint8_t ii_dc_mask[32 * 32];
-static uint8_t ii_nondc_mask_32x32[N_INTER_INTRA_PRED_MODES - 1][32 * 32];
-static uint8_t ii_nondc_mask_16x32[N_INTER_INTRA_PRED_MODES - 1][16 * 32];
-static uint8_t ii_nondc_mask_16x16[N_INTER_INTRA_PRED_MODES - 1][16 * 16];
-static uint8_t ii_nondc_mask_8x32[N_INTER_INTRA_PRED_MODES - 1][8 * 32];
-static uint8_t ii_nondc_mask_8x16[N_INTER_INTRA_PRED_MODES - 1][8 * 16];
-static uint8_t ii_nondc_mask_8x8[N_INTER_INTRA_PRED_MODES - 1][8 * 8];
-static uint8_t ii_nondc_mask_4x16[N_INTER_INTRA_PRED_MODES - 1][4 * 16];
-static uint8_t ii_nondc_mask_4x8[N_INTER_INTRA_PRED_MODES - 1][4 * 8];
-static uint8_t ii_nondc_mask_4x4[N_INTER_INTRA_PRED_MODES - 1][4 * 4];
+#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1) /* one slot per non-DC mode: II_DC_PRED uses ii_dc_mask, other modes index [mode - 1] */
+static uint8_t ALIGN(ii_dc_mask[32 * 32], 32);
+static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 32);
+static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 32);
+static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 32);
+static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 32);
+static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 32);
+static uint8_t ALIGN(ii_nondc_mask_8x8  [N_II_PRED_MODES][ 8 *  8], 32);
+static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 32);
+static uint8_t ALIGN(ii_nondc_mask_4x8  [N_II_PRED_MODES][ 4 *  8], 32);
+static uint8_t ALIGN(ii_nondc_mask_4x4  [N_II_PRED_MODES][ 4 *  4], 32);
+#undef N_II_PRED_MODES
 
 #define set1(sz) \
     [II_DC_PRED] = ii_dc_mask, \
     [II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \
     [II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \
     [II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1]
 #define set(sz_444, sz_422, sz_420) \
     { { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } }
--- a/third_party/dav1d/src/win32/thread.c
+++ b/third_party/dav1d/src/win32/thread.c
@@ -24,19 +24,20 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
 #if defined(_WIN32)
 
-#include <windows.h>
+#include <errno.h>
 #include <process.h>
-#include <errno.h>
+#include <stdlib.h>
+#include <windows.h>
 
 #include "config.h"
 #include "src/thread.h"
 
 typedef struct dav1d_win32_thread_t {
     HANDLE h;
     void* param;
     void*(*proc)(void*);
new file mode 100644
--- /dev/null
+++ b/third_party/dav1d/src/x86/cdef.asm
@@ -0,0 +1,672 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC