Bug 1520174 - Update dav1d from upstream to f813285. r=TD-Linux
author: Alex Chronopoulos <achronop@gmail.com>
Wed, 16 Jan 2019 00:05:25 +0000
changeset 514046 a0588068456ca1002798e5d358dfade5d7879bc3
parent 514045 2fc4ce319e749d9d8b672bfdbc9d3fe1fb7276a6
child 514047 e56368888bd4d684992e58ba4c67b4c75b262712
push id: 1953
push user: ffxbld-merge
push date: Mon, 11 Mar 2019 12:10:20 +0000
treeherder: mozilla-release@9c35dcbaa899
reviewers: TD-Linux
bugs: 1520174
milestone: 66.0a1
Bug 1520174 - Update dav1d from upstream to f813285. r=TD-Linux Differential Revision: https://phabricator.services.mozilla.com/D16562
media/libdav1d/moz.yaml
third_party/dav1d/.gitlab-ci.yml
third_party/dav1d/include/common/attributes.h
third_party/dav1d/include/compat/msvc/stdatomic.h
third_party/dav1d/include/dav1d/common.h
third_party/dav1d/include/dav1d/data.h
third_party/dav1d/meson.build
third_party/dav1d/src/arm/cpu.c
third_party/dav1d/src/data.c
third_party/dav1d/src/data.h
third_party/dav1d/src/decode.c
third_party/dav1d/src/ext/x86/x86inc.asm
third_party/dav1d/src/ipred_tmpl.c
third_party/dav1d/src/itx_tmpl.c
third_party/dav1d/src/lib.c
third_party/dav1d/src/lr_apply_tmpl.c
third_party/dav1d/src/obu.c
third_party/dav1d/src/picture.c
third_party/dav1d/src/picture.h
third_party/dav1d/src/recon_tmpl.c
third_party/dav1d/src/ref.c
third_party/dav1d/src/tables.c
third_party/dav1d/src/tables.h
third_party/dav1d/src/x86/cpu.c
third_party/dav1d/src/x86/ipred.asm
third_party/dav1d/src/x86/ipred_init_tmpl.c
third_party/dav1d/src/x86/ipred_ssse3.asm
third_party/dav1d/src/x86/itx.asm
third_party/dav1d/src/x86/itx_init_tmpl.c
third_party/dav1d/src/x86/itx_ssse3.asm
third_party/dav1d/src/x86/mc_init_tmpl.c
third_party/dav1d/src/x86/mc_ssse3.asm
third_party/dav1d/tests/checkasm/x86/checkasm.asm
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -15,15 +15,15 @@ origin:
   description: dav1d, a fast AV1 decoder
 
   # Full URL for the package's homepage/etc
   # Usually different from repository url
   url: https://code.videolan.org/videolan/dav1d
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit b53a99b97f93d0eb15d1f532739ca062fe44b4ca
+  release: commit f813285c1d1a5421e0180efbb7cbdd377cd31c69 (2019-01-13T22:08:25.000Z).
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
   # Multiple licenses can be specified (as a YAML list)
   # A "LICENSE" file must exist containing the full license text
   license: BSD-2-Clause
--- a/third_party/dav1d/.gitlab-ci.yml
+++ b/third_party/dav1d/.gitlab-ci.yml
@@ -1,12 +1,23 @@
 stages:
+    - style
     - build
     - test
 
+style-check:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    stage: style
+    tags:
+        - debian
+        - amd64
+    script:
+        - git grep -n -e $'\t' --or -e $'\r' -- . ':(exclude)*/compat/*' && exit 1
+        - /bin/true
+
 build-debian:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release --werror
@@ -19,16 +30,29 @@ build-debian-static:
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release --default-library static --werror
         - ninja -C build
         - cd build && meson test -v
 
+build-debian32:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181218135732
+    stage: build
+    tags:
+        - debian
+        - amd64
+    script:
+        - meson build --buildtype release
+                      --werror
+                      --cross-file /opt/crossfiles/linux32.meson
+        - ninja -C build
+        - cd build && meson test -v
+
 build-win32:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - win32
     script:
         - meson build --buildtype release
                       --werror
--- a/third_party/dav1d/include/common/attributes.h
+++ b/third_party/dav1d/include/common/attributes.h
@@ -33,25 +33,26 @@
 #include <stddef.h>
 
 #ifdef __GNUC__
 #define ATTR_ALIAS __attribute__((may_alias))
 #else
 #define ATTR_ALIAS
 #endif
 
-#if ARCH_X86
+#if ARCH_X86_64
+/* x86-64 needs 32-byte alignment for AVX2. */
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
-#elif ARCH_ARM || ARCH_AARCH64
-// ARM doesn't benefit from anything more than 16 byte alignment.
+#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
+/* ARM doesn't benefit from anything more than 16-byte alignment. */
 #define ALIGN_32_VAL 16
 #define ALIGN_16_VAL 16
 #else
-// No need for extra alignment on platforms without assembly.
+/* No need for extra alignment on platforms without assembly. */
 #define ALIGN_32_VAL 8
 #define ALIGN_16_VAL 8
 #endif
 
 /*
  * API for variables, struct members (ALIGN()) like:
  * uint8_t var[1][2][3][4]
  * becomes:
--- a/third_party/dav1d/include/compat/msvc/stdatomic.h
+++ b/third_party/dav1d/include/compat/msvc/stdatomic.h
@@ -18,53 +18,53 @@
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-
-#ifndef MSCVER_STDATOMIC_H_
-#define MSCVER_STDATOMIC_H_
-
+
+#ifndef MSCVER_STDATOMIC_H_
+#define MSCVER_STDATOMIC_H_
+
 #if !defined(__cplusplus) && defined(_MSC_VER)
 
 #pragma warning(push)
 #pragma warning(disable:4067)    /* newline for __has_include_next */
 
 #if defined(__clang__) && __has_include_next(<stdatomic.h>)
   /* use the clang stdatomic.h with clang-cl */
 #  include_next <stdatomic.h>
 #else /* ! stdatomic.h */
 
-#include <windows.h>
-
-#include "common/attributes.h"
-
-typedef volatile LONG  __declspec(align(32)) atomic_int;
-typedef volatile ULONG __declspec(align(32)) atomic_uint;
-
-typedef enum {
-    memory_order_relaxed,
-    memory_order_acquire
-} msvc_atomic_memory_order;
-
-#define atomic_init(p_a, v)           do { *(p_a) = (v); } while(0)
-#define atomic_store(p_a, v)          InterlockedExchange((LONG*)p_a, v)
-#define atomic_load(p_a)              InterlockedCompareExchange((LONG*)p_a, 0, 0)
-#define atomic_load_explicit(p_a, mo) atomic_load(p_a)
-
-/*
- * TODO use a special call to increment/decrement
- * using InterlockedIncrement/InterlockedDecrement
- */
-#define atomic_fetch_add(p_a, inc)    InterlockedExchangeAdd(p_a, inc)
-#define atomic_fetch_sub(p_a, dec)    InterlockedExchangeAdd(p_a, -(dec))
-
-#endif /* ! stdatomic.h */
-
-#pragma warning(pop)
-
-#endif /* !defined(__cplusplus) && defined(_MSC_VER) */
-
-#endif /* MSCVER_STDATOMIC_H_ */
+#include <windows.h>
+
+#include "common/attributes.h"
+
+typedef volatile LONG  __declspec(align(32)) atomic_int;
+typedef volatile ULONG __declspec(align(32)) atomic_uint;
+
+typedef enum {
+    memory_order_relaxed,
+    memory_order_acquire
+} msvc_atomic_memory_order;
+
+#define atomic_init(p_a, v)           do { *(p_a) = (v); } while(0)
+#define atomic_store(p_a, v)          InterlockedExchange((LONG*)p_a, v)
+#define atomic_load(p_a)              InterlockedCompareExchange((LONG*)p_a, 0, 0)
+#define atomic_load_explicit(p_a, mo) atomic_load(p_a)
+
+/*
+ * TODO use a special call to increment/decrement
+ * using InterlockedIncrement/InterlockedDecrement
+ */
+#define atomic_fetch_add(p_a, inc)    InterlockedExchangeAdd(p_a, inc)
+#define atomic_fetch_sub(p_a, dec)    InterlockedExchangeAdd(p_a, -(dec))
+
+#endif /* ! stdatomic.h */
+
+#pragma warning(pop)
+
+#endif /* !defined(__cplusplus) && defined(_MSC_VER) */
+
+#endif /* MSCVER_STDATOMIC_H_ */
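
For context, a minimal sketch of how code consumes these atomics; demo() is a hypothetical function, not dav1d code. On MSVC without clang the macros above expand to Interlocked* intrinsics; everywhere else the real <stdatomic.h> is used.

    #include <stdatomic.h>  /* or this compat header on MSVC */

    static atomic_int refcount;

    static void demo(void) {
        atomic_init(&refcount, 1);                  /* plain, non-atomic init */
        atomic_fetch_add(&refcount, 1);             /* take a reference */
        int prev = atomic_fetch_sub(&refcount, 1);  /* returns the prior value */
        int cur  = atomic_load(&refcount);          /* CAS-based read under the shim */
        (void)prev; (void)cur;
    }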
--- a/third_party/dav1d/include/dav1d/common.h
+++ b/third_party/dav1d/include/dav1d/common.h
@@ -39,23 +39,32 @@
         #define DAV1D_API __attribute__ ((visibility ("default")))
       #else
         #define DAV1D_API
       #endif
     #endif
 #endif
 
 /**
+ * A reference-counted object wrapper for a user-configurable pointer.
+ */
+typedef struct Dav1dUserData {
+    const uint8_t *data; ///< data pointer
+    struct Dav1dRef *ref; ///< allocation origin
+} Dav1dUserData;
+
+/**
  * Input packet metadata which are copied from the input data used to
  * decode each image into the matching structure of the output image
  * returned back to the user. Since these are metadata fields, they
  * can be used for other purposes than the documented ones, they will
  * still be passed from input data to output picture without being
  * used internally.
  */
 typedef struct Dav1dDataProps {
     int64_t timestamp; ///< container timestamp of input data, INT64_MIN if unknown (default)
     int64_t duration; ///< container duration of input data, 0 if unknown (default)
     int64_t offset; ///< stream offset of input data, -1 if unknown (default)
     size_t size; ///< packet size, default Dav1dData.sz
+    struct Dav1dUserData user_data; ///< user-configurable data, default NULL members
 } Dav1dDataProps;
 
 #endif // __DAV1D_COMMON_H__
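
A hedged sketch of how the new field reaches applications: per the comment above, packet metadata (now including user_data) is carried through to the matching output picture's m member. The struct my_ctx type is a hypothetical application type.

    #include <dav1d/dav1d.h>

    static void fetch_one(Dav1dContext *ctx) {
        Dav1dPicture pic = { 0 };
        if (dav1d_get_picture(ctx, &pic) == 0) {
            const Dav1dUserData *ud = &pic.m.user_data;
            if (ud->data) {
                /* per-packet context attached via dav1d_data_wrap_user_data() */
                const struct my_ctx *priv = (const void *) ud->data;
                (void) priv;
            }
            dav1d_picture_unref(&pic);  /* releases the picture and its props */
        }
    }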
--- a/third_party/dav1d/include/dav1d/data.h
+++ b/third_party/dav1d/include/dav1d/data.h
@@ -53,26 +53,57 @@ DAV1D_API uint8_t * dav1d_data_create(Da
 /**
  * Wrap an existing data array.
  *
  * @param          data Input context.
  * @param           buf The data to be wrapped.
  * @param            sz Size of the data.
  * @param free_callback Function to be called when we release our last
  *                      reference to this data. In this callback, $buf will be
- *                      the $buf argument to this function, and $user_data
- *                      will be the $user_data input argument to this function.
- * @param     user_data Opaque parameter passed to free_callback().
+ *                      the $buf argument to this function, and $cookie will
+ *                      be the $cookie input argument to this function.
+ * @param        cookie Opaque parameter passed to free_callback().
  *
  * @return 0 on success. A negative errno value on error.
  */
 DAV1D_API int dav1d_data_wrap(Dav1dData *data, const uint8_t *buf, size_t sz,
-                              void (*free_callback)(const uint8_t *buf, void *user_data),
-                              void *user_data);
+                              void (*free_callback)(const uint8_t *buf, void *cookie),
+                              void *cookie);
+
+/**
+ * Wrap a user-provided data pointer into a reference counted object.
+ *
+ * data->m.user_data field will be initialized to wrap the provided $user_data
+ * pointer.
+ *
+ * $free_callback will be called on the same thread that released the last
+ * reference. If frame threading is used, make sure $free_callback is
+ * thread-safe.
+ *
+ * @param          data Input context.
+ * @param     user_data The user data to be wrapped.
+ * @param free_callback Function to be called when we release our last
+ *                      reference to this data. In this callback, $user_data
+ *                      will be the $user_data argument to this function, and
+ *                      $cookie will be the $cookie input argument to this
+ *                      function.
+ * @param        cookie Opaque parameter passed to $free_callback.
+ *
+ * @return 0 on success. A negative errno value on error.
+ */
+DAV1D_API int dav1d_data_wrap_user_data(Dav1dData *data,
+                                        const uint8_t *user_data,
+                                        void (*free_callback)(const uint8_t *user_data,
+                                                              void *cookie),
+                                        void *cookie);
 
 /**
  * Free the data reference.
  *
+ * The reference count for data->m.user_data will be decremented (if it has been
+ * initialized with dav1d_data_wrap_user_data). The $data object will be memset
+ * to 0.
+ *
  * @param data Input context.
  */
 DAV1D_API void dav1d_data_unref(Dav1dData *data);
 
 #endif /* __DAV1D_DATA_H__ */
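
A minimal usage sketch for the new entry point, assuming a hypothetical struct my_meta and helper free_meta(); error handling is trimmed. Per the documentation above, with frame threading the callback may run on a worker thread, so it must be thread-safe.

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>
    #include <dav1d/dav1d.h>

    static void free_meta(const uint8_t *user_data, void *cookie) {
        (void) cookie;
        free((void *) user_data);  /* we allocated it, so the cast is safe */
    }

    static int send_packet(Dav1dContext *c, const uint8_t *pkt, size_t sz) {
        Dav1dData data = { 0 };
        uint8_t *buf = dav1d_data_create(&data, sz);
        if (!buf) return -ENOMEM;
        memcpy(buf, pkt, sz);

        struct my_meta *meta = malloc(sizeof(*meta));
        if (!meta || dav1d_data_wrap_user_data(&data, (const uint8_t *) meta,
                                               free_meta, NULL) < 0) {
            free(meta);  /* free(NULL) is a no-op */
            dav1d_data_unref(&data);
            return -ENOMEM;
        }
        return dav1d_send_data(c, &data);
    }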
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -142,16 +142,19 @@ elif cc.has_function('memalign', prefix 
     cdata.set('HAVE_MEMALIGN', 1)
 endif
 
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm'))
     if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
         cdata.set('HAVE_GETAUXVAL', 1)
     endif
+    if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
+        cdata.set('HAVE_ELF_AUX_INFO', 1)
+    endif
 endif
 
 # Compiler flag tests
 
 if cc.has_argument('-fvisibility=hidden')
     add_project_arguments('-fvisibility=hidden', language: 'c')
 else
     warning('Compiler does not support -fvisibility=hidden, all symbols will be public!')
@@ -162,16 +165,22 @@ endif
 # it is not an error and silently tolerated
 optional_arguments = [
   '-Wundef',
   '-Werror=vla',
   '-Wno-maybe-uninitialized',
   '-Wno-unused-parameter',
   '-Werror=missing-prototypes',
 ]
+if cc.get_id() == 'msvc'
+    optional_arguments += [
+      '-wd4028', # parameter different from declaration
+      '-wd4996'  # use of POSIX functions
+    ]
+endif
 
 if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
     optional_arguments += '-fomit-frame-pointer'
     optional_arguments += '-ffast-math'
 endif
 
 add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
 
@@ -186,40 +195,45 @@ if fuzzing_engine == 'libfuzzer'
 endif
 
 # Stack alignments flags
 
 stackalign_flag = []
 stackrealign_flag = []
 
 if host_machine.cpu_family().startswith('x86')
-    if cc.has_argument('-mpreferred-stack-boundary=5')
-        stackalign_flag = ['-mpreferred-stack-boundary=5']
-        stackrealign_flag = ['-mincoming-stack-boundary=4']
-        cdata_asm.set('STACK_ALIGNMENT', 32)
-        cdata.set('STACK_ALIGNMENT', 32)
-    elif cc.has_argument('-mpreferred-stack-boundary=4')
-        stackalign_flag = ['-mpreferred-stack-boundary=4']
-        stackrealign_flag = ['-mincoming-stack-boundary=4']
-        cdata_asm.set('STACK_ALIGNMENT', 16)
-        cdata.set('STACK_ALIGNMENT', 16)
-    elif cc.has_argument('-mstack-alignment=32')
-        stackalign_flag = ['-mstack-alignment=32']
-        stackrealign_flag = ['-mstackrealign']
-        cdata_asm.set('STACK_ALIGNMENT', 32)
-        cdata.set('STACK_ALIGNMENT', 32)
+    if host_machine.cpu_family() == 'x86_64'
+        if cc.has_argument('-mpreferred-stack-boundary=5')
+            stackalign_flag = ['-mpreferred-stack-boundary=5']
+            stackrealign_flag = ['-mincoming-stack-boundary=4']
+            stack_alignment = 32
+        elif cc.has_argument('-mstack-alignment=32')
+            stackalign_flag = ['-mstack-alignment=32']
+            stackrealign_flag = ['-mstackrealign']
+            stack_alignment = 32
+        else
+            stack_alignment = 16
+        endif
     else
-        if host_machine.cpu_family() == 'x86_64'
-            cdata_asm.set('STACK_ALIGNMENT', 16)
-            cdata.set('STACK_ALIGNMENT', 16)
+        if host_machine.system() == 'linux' or host_machine.system() == 'darwin'
+            stack_alignment = 16
+        elif cc.has_argument('-mpreferred-stack-boundary=4')
+            stackalign_flag = ['-mpreferred-stack-boundary=4']
+            stackrealign_flag = ['-mincoming-stack-boundary=2']
+            stack_alignment = 16
+        elif cc.has_argument('-mstack-alignment=16')
+            stackalign_flag = ['-mstack-alignment=16']
+            stackrealign_flag = ['-mstackrealign']
+            stack_alignment = 16
         else
-            cdata_asm.set('STACK_ALIGNMENT', 4)
-            cdata.set('STACK_ALIGNMENT', 4)
+            stack_alignment = 4
         endif
     endif
+    cdata_asm.set('STACK_ALIGNMENT', stack_alignment)
+    cdata.set('STACK_ALIGNMENT', stack_alignment)
 endif
 
 cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64')
 cdata.set10('ARCH_ARM',     host_machine.cpu_family().startswith('arm'))
 if (is_asm_enabled and
     (host_machine.cpu_family() == 'aarch64' or
      host_machine.cpu_family().startswith('arm')))
 
@@ -251,23 +265,22 @@ endif
 
 if host_machine.cpu_family().startswith('x86')
     cdata.set10('ARCH_X86', true)
     if host_machine.cpu_family() == 'x86_64'
         cdata_asm.set10('ARCH_X86_64', true)
         cdata.set10('ARCH_X86_64', true)
         cdata_asm.set10('ARCH_X86_32', false)
         cdata.set10('ARCH_X86_32', false)
-
-        cdata_asm.set10('PIC', true)
     else
         cdata_asm.set10('ARCH_X86_64', false)
         cdata.set10('ARCH_X86_64', false)
         cdata_asm.set10('ARCH_X86_32', true)
         cdata.set10('ARCH_X86_32', true)
+        cdata_asm.set10('PIC', true)
     endif
 else
     cdata.set10('ARCH_X86', false)
     cdata.set10('ARCH_X86_64', false)
     cdata.set10('ARCH_X86_32', false)
 endif
 
 if cc.symbols_have_underscore_prefix()
--- a/third_party/dav1d/src/arm/cpu.c
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -32,16 +32,21 @@
 #if defined(HAVE_GETAUXVAL) && ARCH_ARM
 #include <sys/auxv.h>
 
 #ifndef HWCAP_ARM_NEON
 #define HWCAP_ARM_NEON (1 << 12)
 #endif
 #define NEON_HWCAP HWCAP_ARM_NEON
 
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+#include <sys/auxv.h>
+
+#define NEON_HWCAP HWCAP_NEON
+
 #elif defined(__ANDROID__)
 #include <stdio.h>
 #include <string.h>
 
 static unsigned parse_proc_cpuinfo(const char *flag) {
     FILE *file = fopen("/proc/cpuinfo", "r");
     if (!file)
         return 0;
@@ -67,19 +72,25 @@ static unsigned parse_proc_cpuinfo(const
     return 0;
 }
 #endif
 
 unsigned dav1d_get_cpu_flags_arm(void) {
     unsigned flags = 0;
 #if ARCH_AARCH64
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(__ARM_NEON)
+    flags |= DAV1D_ARM_CPU_FLAG_NEON;
 #elif defined(HAVE_GETAUXVAL) && ARCH_ARM
     unsigned long hw_cap = getauxval(AT_HWCAP);
     flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+    unsigned long hw_cap = 0;
+    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+    flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
 #elif defined(__ANDROID__)
     flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
 #elif defined(__APPLE__)
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
 #elif defined(_WIN32)
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
 #endif
 
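For comparison, a hedged standalone sketch of the two auxv flavors handled above: Linux's getauxval() returns the value directly, while FreeBSD's elf_aux_info() fills a caller-provided variable and returns an errno-style status.

    #include <sys/auxv.h>

    static unsigned long read_hwcap(void) {
        unsigned long hw_cap = 0;
    #if defined(HAVE_GETAUXVAL)
        hw_cap = getauxval(AT_HWCAP);
    #elif defined(HAVE_ELF_AUX_INFO)
        if (elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap)) != 0)
            hw_cap = 0;  /* treat failure as "no flags" */
    #endif
        return hw_cap;
    }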
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -22,74 +22,124 @@
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
+#include <assert.h>
 #include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "dav1d/data.h"
 
 #include "common/validate.h"
 
 #include "src/data.h"
 #include "src/ref.h"
 
-uint8_t * dav1d_data_create(Dav1dData *const buf, const size_t sz) {
+uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
     validate_input_or_ret(buf != NULL, NULL);
 
     buf->ref = dav1d_ref_create(sz);
     if (!buf->ref) return NULL;
     buf->data = buf->ref->const_data;
     buf->sz = buf->m.size = sz;
     buf->m.timestamp = INT64_MIN;
     buf->m.duration = 0;
     buf->m.offset = -1;
+    buf->m.user_data.data = NULL;
+    buf->m.user_data.ref = NULL;
 
     return buf->ref->data;
 }
 
-int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr, const size_t sz,
-                    void (*free_callback)(const uint8_t *data, void *user_data),
-                    void *user_data)
+int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
+                             const size_t sz,
+                             void (*const free_callback)(const uint8_t *data,
+                                                         void *cookie),
+                             void *const cookie)
 {
     validate_input_or_ret(buf != NULL, -EINVAL);
     validate_input_or_ret(ptr != NULL, -EINVAL);
     validate_input_or_ret(free_callback != NULL, -EINVAL);
 
-    buf->ref = dav1d_ref_wrap(ptr, free_callback, user_data);
+    buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
     if (!buf->ref) return -ENOMEM;
     buf->data = ptr;
     buf->sz = buf->m.size = sz;
     buf->m.timestamp = INT64_MIN;
     buf->m.duration = 0;
     buf->m.offset = -1;
+    buf->m.user_data.data = NULL;
+    buf->m.user_data.ref = NULL;
 
     return 0;
 }
 
+int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
+                                       const uint8_t *const user_data,
+                                       void (*const free_callback)(const uint8_t *user_data,
+                                                                   void *cookie),
+                                       void *const cookie)
+{
+    validate_input_or_ret(buf != NULL, -EINVAL);
+    validate_input_or_ret(free_callback != NULL, -EINVAL);
+
+    buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie);
+    if (!buf->m.user_data.ref) return -ENOMEM;
+    buf->m.user_data.data = user_data;
+
+    return 0;
+}
+
+
+void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
+    validate_input(dst != NULL);
+    validate_input(dst->data == NULL);
+    validate_input(src != NULL);
+
+    if (src->ref) {
+        validate_input(src->data != NULL);
+        dav1d_ref_inc(src->ref);
+    }
+    if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+    *dst = *src;
+}
+
 void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data == NULL);
     validate_input(src != NULL);
 
     if (src->ref)
         validate_input(src->data != NULL);
 
     *dst = *src;
     memset(src, 0, sizeof(*src));
 }
 
-void dav1d_data_unref(Dav1dData *const buf) {
+void dav1d_data_props_copy(Dav1dDataProps *const dst,
+                           const Dav1dDataProps *const src)
+{
+    assert(dst != NULL);
+    assert(src != NULL);
+
+    dav1d_ref_dec(&dst->user_data.ref);
+    *dst = *src;
+    if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
+}
+
+void dav1d_data_unref_internal(Dav1dData *const buf) {
     validate_input(buf != NULL);
 
+    struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
     if (buf->ref) {
         validate_input(buf->data != NULL);
         dav1d_ref_dec(&buf->ref);
     }
     memset(buf, 0, sizeof(*buf));
+    dav1d_ref_dec(&user_data_ref);
 }
--- a/third_party/dav1d/src/data.h
+++ b/third_party/dav1d/src/data.h
@@ -25,14 +25,34 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_SRC_DATA_H__
 #define __DAV1D_SRC_DATA_H__
 
 #include "dav1d/data.h"
 
+void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
+
 /**
  * Move a data reference.
  */
 void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
 
+/**
+ * Copy the source properties to the destination and increase the
+ * user_data's reference count (if it's not NULL).
+ */
+void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src);
+
+uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
+int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
+                             void (*free_callback)(const uint8_t *data,
+                                                   void *user_data),
+                             void *user_data);
+int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
+                                       const uint8_t *user_data,
+                                       void (*free_callback)(const uint8_t *user_data,
+                                                             void *cookie),
+                                       void *cookie);
+void dav1d_data_unref_internal(Dav1dData *buf);
+
 #endif /* __DAV1D_SRC_DATA_H__ */
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -1259,26 +1259,27 @@ static int decode_b(Dav1dTileContext *co
         if (has_chroma) {
             if (bw4 < 2 &&  ss_hor)
                 border_left += 4;
             if (bh4 < 2 &&  ss_ver)
                 border_top  += 4;
         }
         int src_left   = t->bx * 4 + (b->mv[0].x >> 3);
         int src_top    = t->by * 4 + (b->mv[0].y >> 3);
-        int src_right  = src_left + w4 * 4;
-        int src_bottom = src_top  + h4 * 4;
+        int src_right  = src_left + bw4 * 4;
+        int src_bottom = src_top  + bh4 * 4;
+        const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
 
         // check against left or right tile boundary and adjust if necessary
         if (src_left < border_left) {
             src_right += border_left - src_left;
             src_left  += border_left - src_left;
-        } else if (src_right > ts->tiling.col_end * 4) {
-            src_left  -= src_right - ts->tiling.col_end * 4;
-            src_right -= src_right - ts->tiling.col_end * 4;
+        } else if (src_right > border_right) {
+            src_left  -= src_right - border_right;
+            src_right -= src_right - border_right;
         }
         // check against top tile boundary and adjust if necessary
         if (src_top < border_top) {
             src_bottom += border_top - src_top;
             src_top    += border_top - src_top;
         }
 
         const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
@@ -1900,16 +1901,71 @@ static int decode_b(Dav1dTileContext *co
             if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
                 (*noskip_mask)[1] |= mask;
         }
     }
 
     return 0;
 }
 
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+
+#include <sanitizer/msan_interface.h>
+
+static int checked_decode_b(Dav1dTileContext *const t,
+                            const enum BlockLevel bl,
+                            const enum BlockSize bs,
+                            const enum BlockPartition bp,
+                            const enum EdgeFlags intra_edge_flags)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int err = decode_b(t, bl, bs, bp, intra_edge_flags);
+
+    if (err == 0 && !(f->frame_thread.pass & 1)) {
+        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+        const int bw4 = b_dim[0], bh4 = b_dim[1];
+        const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+        const int has_chroma = f->seq_hdr->layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                               (bw4 > ss_hor || t->bx & 1) &&
+                               (bh4 > ss_ver || t->by & 1);
+
+        for (int p = 0; p < 1 + 2 * has_chroma; p++) {
+            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+            const int stride = f->cur.stride[!!p];
+            const int bx = t->bx & ~ss_hor;
+            const int by = t->by & ~ss_ver;
+            const int width  = w4 << (2 - ss_hor + (bw4 == ss_hor));
+            const int height = h4 << (2 - ss_ver + (bh4 == ss_ver));
+
+            const uint8_t *data = f->cur.data[p] + (by << (2 - ss_ver)) * stride +
+                                  (bx << (2 - ss_hor + !!f->seq_hdr->hbd));
+
+            for (int y = 0; y < height; data += stride, y++) {
+                const size_t line_sz = width << !!f->seq_hdr->hbd;
+                if (__msan_test_shadow(data, line_sz) != -1) {
+                    fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n",
+                            p, bx, by, w4, h4, y);
+                    __msan_check_mem_is_initialized(data, line_sz);
+                }
+            }
+        }
+    }
+
+    return err;
+}
+
+#define decode_b checked_decode_b
+
+#endif /* __has_feature(memory_sanitizer) */
+#endif /* defined(__has_feature) */
+
 static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
                      const EdgeNode *const node)
 {
     const Dav1dFrameContext *const f = t->f;
     const int hsz = 16 >> bl;
     const int have_h_split = f->bw > t->bx + hsz;
     const int have_v_split = f->bh > t->by + hsz;
 
@@ -2941,31 +2997,31 @@ error:
     dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
                                 PLANE_TYPE_ALL);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
         dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
 
-    dav1d_picture_unref(&f->cur);
+    dav1d_picture_unref_internal(&f->cur);
     dav1d_thread_picture_unref(&f->sr_cur);
     dav1d_cdf_thread_unref(&f->in_cdf);
     if (f->frame_hdr->refresh_context) {
         dav1d_cdf_thread_signal(&f->out_cdf);
         dav1d_cdf_thread_unref(&f->out_cdf);
     }
     dav1d_ref_dec(&f->cur_segmap_ref);
     dav1d_ref_dec(&f->prev_segmap_ref);
     dav1d_ref_dec(&f->mvs_ref);
     dav1d_ref_dec(&f->seq_hdr_ref);
     dav1d_ref_dec(&f->frame_hdr_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
-        dav1d_data_unref(&f->tile[i].data);
+        dav1d_data_unref_internal(&f->tile[i].data);
 
     return retval;
 }
 
 static int get_upscale_x0(const int in_w, const int out_w, const int step) {
     const int err = out_w * step - (in_w << 14);
     const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
     return x0 & 0x3fff;
@@ -3119,29 +3175,23 @@ int dav1d_submit_frame(Dav1dContext *con
     memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
     memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
     f->n_tile_data = c->n_tile_data;
     c->n_tile_data = 0;
 
     // allocate frame
     res = dav1d_thread_picture_alloc(&f->sr_cur, f->frame_hdr->width[1],
                                      f->frame_hdr->height,
-                                     f->seq_hdr->layout, bpc,
+                                     f->seq_hdr, f->seq_hdr_ref,
+                                     f->frame_hdr, f->frame_hdr_ref,
+                                     bpc, &f->tile[0].data.m,
                                      c->n_fc > 1 ? &f->frame_thread.td : NULL,
                                      f->frame_hdr->show_frame, &c->allocator);
     if (res < 0) goto error;
 
-    f->sr_cur.p.m = f->tile[0].data.m;
-    f->sr_cur.p.frame_hdr = f->frame_hdr;
-    f->sr_cur.p.frame_hdr_ref = f->frame_hdr_ref;
-    dav1d_ref_inc(f->frame_hdr_ref);
-    f->sr_cur.p.seq_hdr = f->seq_hdr;
-    f->sr_cur.p.seq_hdr_ref = f->seq_hdr_ref;
-    dav1d_ref_inc(f->seq_hdr_ref);
-
     if (f->frame_hdr->super_res.enabled) {
         res = dav1d_picture_alloc_copy(&f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
         if (res < 0) goto error;
     } else {
         dav1d_picture_ref(&f->cur, &f->sr_cur.p);
     }
 
     if (f->frame_hdr->super_res.enabled) {
@@ -3295,17 +3345,17 @@ int dav1d_submit_frame(Dav1dContext *con
             }
             memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
         }
     }
 
     if (c->n_fc == 1) {
         const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
         if ((res = dav1d_decode_frame(f)) < 0) {
-            dav1d_picture_unref(&c->out);
+            dav1d_picture_unref_internal(&c->out);
             for (int i = 0; i < 8; i++) {
                 if (refresh_frame_flags & (1 << i)) {
                     if (c->refs[i].p.p.data[0])
                         dav1d_thread_picture_unref(&c->refs[i].p);
                     dav1d_cdf_thread_unref(&c->cdf[i]);
                     dav1d_ref_dec(&c->refs[i].segmap);
                     dav1d_ref_dec(&c->refs[i].refmvs);
                 }
@@ -3323,27 +3373,27 @@ error:
     if (f->frame_hdr->refresh_context)
         dav1d_cdf_thread_unref(&f->out_cdf);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
         dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
     if (c->n_fc == 1)
-        dav1d_picture_unref(&c->out);
+        dav1d_picture_unref_internal(&c->out);
     else
         dav1d_thread_picture_unref(out_delayed);
-    dav1d_picture_unref(&f->cur);
+    dav1d_picture_unref_internal(&f->cur);
     dav1d_thread_picture_unref(&f->sr_cur);
     dav1d_ref_dec(&f->mvs_ref);
     dav1d_ref_dec(&f->seq_hdr_ref);
     dav1d_ref_dec(&f->frame_hdr_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
-        dav1d_data_unref(&f->tile[i].data);
+        dav1d_data_unref_internal(&f->tile[i].data);
     f->n_tile_data = 0;
 
     if (c->n_fc > 1) {
         pthread_cond_signal(&f->frame_thread.td.cond);
         pthread_mutex_unlock(&f->frame_thread.td.lock);
     }
 
     return res;
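
The checked_decode_b() wrapper added above leans on two MemorySanitizer interface calls; a hedged summary of their contracts, per the sanitizer headers:

    #include <stddef.h>
    #include <sanitizer/msan_interface.h>

    static void assert_fully_initialized(const void *p, size_t n) {
        /* __msan_test_shadow() returns the offset of the first poisoned
         * (uninitialized) byte in [p, p+n), or -1 if the range is clean. */
        if (__msan_test_shadow(p, n) != -1) {
            /* Emits a detailed MSan report for the offending bytes, as the
             * wrapper does after printing the block coordinates. */
            __msan_check_mem_is_initialized(p, n);
        }
    }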
--- a/third_party/dav1d/src/ext/x86/x86inc.asm
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -84,26 +84,23 @@
         SECTION .rdata align=%1
     %elif WIN64
         SECTION .rdata align=%1
     %else
         SECTION .rodata align=%1
     %endif
 %endmacro
 
-%if WIN64
-    %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-    %undef PIC
-%endif
-%ifdef PIC
+%if ARCH_X86_64
+    %define PIC 1 ; always use PIC on x86-64
     default rel
+%elifidn __OUTPUT_FORMAT__,win32
+    %define PIC 0 ; PIC isn't used on 32-bit Windows
+%elifndef PIC
+    %define PIC 0
 %endif
 
 %ifdef __NASM_VER__
     %use smartalign
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
@@ -215,16 +212,28 @@ DECLARE_REG_SIZE bp, bpl, null
 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 
 %if ARCH_X86_64
     %define gprsize 8
 %else
     %define gprsize 4
 %endif
 
+%macro LEA 2
+%if ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPUs
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
 %macro PUSH 1
     push %1
     %ifidn rstk, rsp
         %assign stack_offset stack_offset+gprsize
     %endif
 %endmacro
 
 %macro POP 1
@@ -668,17 +677,17 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
             %endif
         %endmacro
         %rotate 1
     %endrep
 %endmacro
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
         RET
     %elif %2
         jmp %1
     %endif
     annotate_function_size
 %endmacro
--- a/third_party/dav1d/src/ipred_tmpl.c
+++ b/third_party/dav1d/src/ipred_tmpl.c
@@ -417,17 +417,17 @@ static void ipred_z1_c(pixel *dst, const
                        const int width, const int height, int angle,
                        const int max_width, const int max_height
                        HIGHBD_DECL_SUFFIX)
 {
     const int is_sm = (angle >> 9) & 0x1;
     const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle < 90);
-    int dx = dav1d_dr_intra_derivative[angle];
+    int dx = dav1d_dr_intra_derivative[angle >> 1];
     pixel top_out[(64 + 64) * 2];
     const pixel *top;
     int max_base_x;
     const int upsample_above = enable_intra_edge_filter ?
         get_upsample(width + height, 90 - angle, is_sm) : 0;
     if (upsample_above) {
         upsample_edge(top_out, width + height, &topleft_in[1], -1,
                       width + imin(width, height) HIGHBD_TAIL_SUFFIX);
@@ -471,18 +471,18 @@ static void ipred_z2_c(pixel *dst, const
                        const int width, const int height, int angle,
                        const int max_width, const int max_height
                        HIGHBD_DECL_SUFFIX)
 {
     const int is_sm = (angle >> 9) & 0x1;
     const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle > 90 && angle < 180);
-    int dy = dav1d_dr_intra_derivative[angle - 90];
-    int dx = dav1d_dr_intra_derivative[180 - angle];
+    int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
+    int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
     const int upsample_left = enable_intra_edge_filter ?
         get_upsample(width + height, 180 - angle, is_sm) : 0;
     const int upsample_above = enable_intra_edge_filter ?
         get_upsample(width + height, angle - 90, is_sm) : 0;
     pixel edge[64 * 2 + 64 * 2 + 1];
     pixel *const topleft = &edge[height * 2];
 
     if (upsample_above) {
@@ -552,17 +552,17 @@ static void ipred_z3_c(pixel *dst, const
                        const int width, const int height, int angle,
                        const int max_width, const int max_height
                        HIGHBD_DECL_SUFFIX)
 {
     const int is_sm = (angle >> 9) & 0x1;
     const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle > 180);
-    int dy = dav1d_dr_intra_derivative[270 - angle];
+    int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
     pixel left_out[(64 + 64) * 2];
     const pixel *left;
     int max_base_y;
     const int upsample_left = enable_intra_edge_filter ?
         get_upsample(width + height, angle - 180, is_sm) : 0;
     if (upsample_left) {
         upsample_edge(left_out, width + height,
                       &topleft_in[-(width + height)],
--- a/third_party/dav1d/src/itx_tmpl.c
+++ b/third_party/dav1d/src/itx_tmpl.c
@@ -53,34 +53,37 @@ inv_txfm_add_c(pixel *dst, const ptrdiff
     const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
     assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
     // Maximum value for h and w is 64
     coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
     const int is_rect2 = w * 2 == h || h * 2 == w;
     const int bitdepth = bitdepth_from_max(bitdepth_max);
     const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;
     const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
-    const int col_clip_min = -col_clip_max - 1;
 
     if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
     const int rnd1 = (1 << shift1) >> 1;
     for (i = 0; i < sh; i++) {
         if (w != sw || is_rect2) {
             for (j = 0; j < sw; j++) {
                 in_mem[j] = coeff[i + j * sh];
                 if (is_rect2)
                     in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
             }
             first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
         } else {
             first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
         }
         for (j = 0; j < w; j++)
+#if BITDEPTH == 8
+            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
+#else
             tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
-                                   col_clip_min, col_clip_max);
+                                   -col_clip_max - 1, col_clip_max);
+#endif
     }
 
     if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
     const int rnd2 = (1 << shift2) >> 1;
     for (i = 0; i < w; i++) {
         second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
         for (j = 0; j < h; j++)
             dst[i + j * PXSTRIDE(stride)] =
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@@ -189,21 +189,21 @@ int dav1d_parse_sequence_header(Dav1dSeq
     int res;
 
     validate_input_or_ret(out != NULL, -EINVAL);
 
     Dav1dSettings s;
     dav1d_default_settings(&s);
 
     Dav1dContext *c;
-    res	= dav1d_open(&c, &s);
+    res = dav1d_open(&c, &s);
     if (res < 0) return res;
 
     if (ptr) {
-        res = dav1d_data_wrap(&buf, ptr, sz, dummy_free, NULL);
+        res = dav1d_data_wrap_internal(&buf, ptr, sz, dummy_free, NULL);
         if (res < 0) goto error;
     }
 
     while (buf.sz > 0) {
         res = dav1d_parse_obus(c, &buf, 1);
         if (res < 0) goto error;
 
         assert((size_t)res <= buf.sz);
@@ -215,17 +215,17 @@ int dav1d_parse_sequence_header(Dav1dSeq
         res = -EINVAL;
         goto error;
     }
 
     memcpy(out, c->seq_hdr, sizeof(*out));
 
     res = 0;
 error:
-    dav1d_data_unref(&buf);
+    dav1d_data_unref_internal(&buf);
     dav1d_close(&c);
 
     return res;
 }
 
 int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
 {
     validate_input_or_ret(c != NULL, -EINVAL);
@@ -252,18 +252,18 @@ static int output_image(Dav1dContext *co
     if (!c->apply_grain || !has_grain) {
         dav1d_picture_move_ref(out, in);
         return 0;
     }
 
     // Apply film grain to a new copy of the image to avoid corrupting refs
     int res = dav1d_picture_alloc_copy(out, in->p.w, in);
     if (res < 0) {
-        dav1d_picture_unref(in);
-        dav1d_picture_unref(out);
+        dav1d_picture_unref_internal(in);
+        dav1d_picture_unref_internal(out);
         return res;
     }
 
     switch (out->p.bpc) {
 #if CONFIG_8BPC
     case 8:
         dav1d_apply_grain_8bpc(out, in);
         break;
@@ -273,29 +273,29 @@ static int output_image(Dav1dContext *co
     case 12:
         dav1d_apply_grain_16bpc(out, in);
         break;
 #endif
     default:
         assert(0);
     }
 
-    dav1d_picture_unref(in);
+    dav1d_picture_unref_internal(in);
     return 0;
 }
 
 static int output_picture_ready(Dav1dContext *const c) {
 
     if (!c->out.data[0]) return 0;
 
     // skip lower spatial layers
     if (c->operating_point_idc && !c->all_layers) {
         const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
         if (max_spatial_id > c->out.frame_hdr->spatial_id) {
-            dav1d_picture_unref(&c->out);
+            dav1d_picture_unref_internal(&c->out);
             return 0;
         }
     }
 
     return 1;
 }
 
 static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
@@ -341,22 +341,22 @@ int dav1d_get_picture(Dav1dContext *cons
     if (!in->data) {
         if (c->n_fc == 1) return -EAGAIN;
         return drain_picture(c, out);
     }
 
     while (in->sz > 0) {
         res = dav1d_parse_obus(c, in, 0);
         if (res < 0) {
-            dav1d_data_unref(in);
+            dav1d_data_unref_internal(in);
         } else {
             assert((size_t)res <= in->sz);
             in->sz -= res;
             in->data += res;
-            if (!in->sz) dav1d_data_unref(in);
+            if (!in->sz) dav1d_data_unref_internal(in);
         }
         if (output_picture_ready(c))
             break;
         if (res < 0)
             return res;
     }
 
     if (output_picture_ready(c))
@@ -364,17 +364,17 @@ int dav1d_get_picture(Dav1dContext *cons
 
     if (c->n_fc > 1 && drain)
         return drain_picture(c, out);
 
     return -EAGAIN;
 }
 
 void dav1d_flush(Dav1dContext *const c) {
-    dav1d_data_unref(&c->in);
+    dav1d_data_unref_internal(&c->in);
     c->drain = 0;
 
     if (c->n_fc == 1) return;
 
     // mark each currently-running frame as flushing, so that we
     // exit out as quickly as the running thread checks this flag
     atomic_store(c->frame_thread.flush, 1);
     for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
@@ -477,29 +477,62 @@ void dav1d_close(Dav1dContext **const c_
         free(f->lf.lr_mask);
         free(f->lf.level);
         free(f->lf.tx_lpf_right_edge[0]);
         av1_free_ref_mv_common(f->libaom_cm);
         dav1d_free_aligned(f->lf.cdef_line);
         dav1d_free_aligned(f->lf.lr_lpf_line);
     }
     dav1d_free_aligned(c->fc);
-    dav1d_data_unref(&c->in);
+    dav1d_data_unref_internal(&c->in);
     if (c->n_fc > 1) {
         for (unsigned n = 0; n < c->n_fc; n++)
             if (c->frame_thread.out_delayed[n].p.data[0])
                 dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
         free(c->frame_thread.out_delayed);
     }
     for (int n = 0; n < c->n_tile_data; n++)
-        dav1d_data_unref(&c->tile[n].data);
+        dav1d_data_unref_internal(&c->tile[n].data);
     for (int n = 0; n < 8; n++) {
         dav1d_cdf_thread_unref(&c->cdf[n]);
         if (c->refs[n].p.p.data[0])
             dav1d_thread_picture_unref(&c->refs[n].p);
         dav1d_ref_dec(&c->refs[n].refmvs);
         dav1d_ref_dec(&c->refs[n].segmap);
     }
     dav1d_ref_dec(&c->seq_hdr_ref);
     dav1d_ref_dec(&c->frame_hdr_ref);
 
     dav1d_freep_aligned(c_out);
 }
+
+void dav1d_picture_unref(Dav1dPicture *const p) {
+    dav1d_picture_unref_internal(p);
+}
+
+uint8_t *dav1d_data_create(Dav1dData *const buf, const size_t sz) {
+    return dav1d_data_create_internal(buf, sz);
+}
+
+int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr,
+                    const size_t sz,
+                    void (*const free_callback)(const uint8_t *data,
+                                                void *user_data),
+                    void *const user_data)
+{
+    return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
+}
+
+int dav1d_data_wrap_user_data(Dav1dData *const buf,
+                              const uint8_t *const user_data,
+                              void (*const free_callback)(const uint8_t *user_data,
+                                                          void *cookie),
+                              void *const cookie)
+{
+    return dav1d_data_wrap_user_data_internal(buf,
+                                              user_data,
+                                              free_callback,
+                                              cookie);
+}
+
+void dav1d_data_unref(Dav1dData *const buf) {
+    dav1d_data_unref_internal(buf);
+}
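
For reference, a hedged sketch of the public decode loop these wrappers keep stable; context setup via dav1d_open() is assumed and error handling is trimmed.

    #include <errno.h>
    #include <dav1d/dav1d.h>

    static void decode_packet(Dav1dContext *c, Dav1dData *data) {
        Dav1dPicture pic = { 0 };
        do {
            int res = dav1d_send_data(c, data);  /* may consume data partially */
            if (res < 0 && res != -EAGAIN)
                break;
            res = dav1d_get_picture(c, &pic);
            if (res == 0)
                dav1d_picture_unref(&pic);       /* consume, then release */
            else if (res != -EAGAIN)
                break;
        } while (data->sz > 0);
    }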
--- a/third_party/dav1d/src/lr_apply_tmpl.c
+++ b/third_party/dav1d/src/lr_apply_tmpl.c
@@ -42,17 +42,18 @@ enum LrRestorePlanes {
 // The loop filter buffer stores 12 rows of pixels. A superblock block will
 // contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above
 // and 2 below) the final 4 rows are used to swap the bottom of the last
 // stripe with the top of the next super block row.
 static void backup_lpf(const Dav1dFrameContext *const f,
                        pixel *dst, const ptrdiff_t dst_stride,
                        const pixel *src, const ptrdiff_t src_stride,
                        const int ss_ver, const int sb128,
-                       int row, const int row_h, const int src_w, const int ss_hor)
+                       int row, const int row_h, const int src_w,
+                       const int h, const int ss_hor)
 {
     const int dst_w = f->frame_hdr->super_res.enabled ?
                       (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
 
     // The first stripe of the frame is shorter by 8 luma pixel rows.
     int stripe_h = (64 - 8 * !row) >> ss_ver;
 
     if (row) {
@@ -69,28 +70,35 @@ static void backup_lpf(const Dav1dFrameC
                    &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
     }
 
     dst += 4 * PXSTRIDE(dst_stride);
     src += (stripe_h - 2) * PXSTRIDE(src_stride);
 
     if (f->frame_hdr->super_res.enabled) {
         while (row + stripe_h <= row_h) {
+            const int n_lines = 4 - (row + stripe_h + 1 == h);
             f->dsp->mc.resize(dst, dst_stride, src, src_stride,
-                              dst_w, src_w, 4, f->resize_step[ss_hor],
+                              dst_w, src_w, n_lines, f->resize_step[ss_hor],
                               f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
             row += stripe_h; // unmodified stripe_h for the 1st stripe
             stripe_h = 64 >> ss_ver;
             src += stripe_h * PXSTRIDE(src_stride);
-            dst += 4 * PXSTRIDE(dst_stride);
+            dst += n_lines * PXSTRIDE(dst_stride);
+            if (n_lines == 3) {
+                pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
+                dst += PXSTRIDE(dst_stride);
+            }
         }
     } else {
         while (row + stripe_h <= row_h) {
+            const int n_lines = 4 - (row + stripe_h + 1 == h);
             for (int i = 0; i < 4; i++) {
-                pixel_copy(dst, src, src_w);
+                pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
+                                               src, src_w);
                 dst += PXSTRIDE(dst_stride);
                 src += PXSTRIDE(src_stride);
             }
             row += stripe_h; // unmodified stripe_h for the 1st stripe
             stripe_h = 64 >> ss_ver;
             src += (stripe_h - 4) * PXSTRIDE(src_stride);
         }
     }
@@ -105,43 +113,43 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFram
 
     // TODO Also check block level restore type to reduce copying.
     const int restore_planes =
         ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
         ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
         ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
 
     if (restore_planes & LR_RESTORE_Y) {
-        const int h = f->bh << 2;
+        const int h = f->cur.p.h;
         const int w = f->bw << 2;
-        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 4);
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
         const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
         backup_lpf(f, f->lf.lr_lpf_line_ptr[0], lr_stride,
                    src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
-                   0, f->seq_hdr->sb128, y_stripe, row_h, w, 0);
+                   0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
     }
     if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
         const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
         const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int h = f->bh << (2 - ss_ver);
+        const int h = (f->cur.p.h + ss_ver) >> ss_ver;
         const int w = f->bw << (2 - ss_hor);
-        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 4);
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
         const ptrdiff_t offset_uv = offset >> ss_ver;
         const int y_stripe =
             (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
 
         if (restore_planes & LR_RESTORE_U) {
             backup_lpf(f, f->lf.lr_lpf_line_ptr[1], lr_stride,
                        src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
+                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
         }
         if (restore_planes & LR_RESTORE_V) {
             backup_lpf(f, f->lf.lr_lpf_line_ptr[2], lr_stride,
                        src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
+                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
         }
     }
 }
 
 static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
                       const pixel (*left)[4], int x, int y,
                       const int plane, const int unit_w, const int row_h,
                       const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@@ -216,25 +216,25 @@ static int parse_seq_hdr(Dav1dContext *c
     } else {
         hdr->pri = DAV1D_COLOR_PRI_UNKNOWN;
         hdr->trc = DAV1D_TRC_UNKNOWN;
         hdr->mtrx = DAV1D_MC_UNKNOWN;
     }
     if (hdr->monochrome) {
         hdr->color_range = dav1d_get_bits(gb, 1);
         hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
-        hdr->ss_hor = hdr->ss_ver = 0;
+        hdr->ss_hor = hdr->ss_ver = 1;
         hdr->chr = DAV1D_CHR_UNKNOWN;
         hdr->separate_uv_delta_q = 0;
     } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
                hdr->trc == DAV1D_TRC_SRGB &&
                hdr->mtrx == DAV1D_MC_IDENTITY)
     {
         hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
-        hdr->ss_hor = hdr->ss_ver = 1;
+        hdr->ss_hor = hdr->ss_ver = 0;
         hdr->color_range = 1;
         if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
             goto error;
     } else {
         hdr->color_range = dav1d_get_bits(gb, 1);
         switch (hdr->profile) {
         case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
                 hdr->ss_hor = hdr->ss_ver = 1;
@@ -253,18 +253,18 @@ static int parse_seq_hdr(Dav1dContext *c
             hdr->layout = hdr->ss_hor ?
                           hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
                                         DAV1D_PIXEL_LAYOUT_I422 :
                                         DAV1D_PIXEL_LAYOUT_I444;
             break;
         }
         hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
                    dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
-        hdr->separate_uv_delta_q = dav1d_get_bits(gb, 1);
     }
+    hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-colorinfo: off=%ld\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     hdr->film_grain_present = dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-filmgrain: off=%ld\n",
@@ -1278,17 +1278,17 @@ int dav1d_parse_obus(Dav1dContext *const
         memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
         c->frame_hdr->temporal_id = temporal_id;
         c->frame_hdr->spatial_id = spatial_id;
         if ((res = parse_frame_hdr(c, &gb)) < 0) {
             c->frame_hdr = NULL;
             return res;
         }
         for (int n = 0; n < c->n_tile_data; n++)
-            dav1d_data_unref(&c->tile[n].data);
+            dav1d_data_unref_internal(&c->tile[n].data);
         c->n_tile_data = 0;
         c->n_tiles = 0;
         if (type != OBU_FRAME) {
             // This is actually a frame header OBU so read the
             // trailing bit and check for overrun.
             dav1d_get_bits(&gb, 1);
             if (check_for_overrun(&gb, init_bit_pos, len)) {
                 c->frame_hdr = NULL;
@@ -1318,27 +1318,25 @@ int dav1d_parse_obus(Dav1dContext *const
         if (check_for_overrun(&gb, init_bit_pos, len))
             return -EINVAL;
         // The current bit position is a multiple of 8 (because we
         // just aligned it) and less than 8*pkt_bytelen because
         // otherwise the overrun check would have fired.
         const unsigned bit_pos = dav1d_get_bits_pos(&gb);
         assert((bit_pos & 7) == 0);
         assert(pkt_bytelen >= (bit_pos >> 3));
-        dav1d_ref_inc(in->ref);
-        c->tile[c->n_tile_data].data.ref = in->ref;
-        c->tile[c->n_tile_data].data.m = in->m;
-        c->tile[c->n_tile_data].data.data = in->data + (bit_pos >> 3);
+        dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
+        c->tile[c->n_tile_data].data.data += bit_pos >> 3;
         c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
         // ensure tile groups are in order and sane, see 6.10.1
         if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
             c->tile[c->n_tile_data].start != c->n_tiles)
         {
             for (int i = 0; i <= c->n_tile_data; i++)
-                dav1d_data_unref(&c->tile[i].data);
+                dav1d_data_unref_internal(&c->tile[i].data);
             c->n_tile_data = 0;
             c->n_tiles = 0;
             goto error;
         }
         c->n_tiles += 1 + c->tile[c->n_tile_data].end -
                           c->tile[c->n_tile_data].start;
         c->n_tile_data++;
         break;
@@ -1354,17 +1352,17 @@ int dav1d_parse_obus(Dav1dContext *const
     }
 
     if (c->seq_hdr && c->frame_hdr) {
         if (c->frame_hdr->show_existing_frame) {
             if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return -EINVAL;
             if (c->n_fc == 1) {
                 dav1d_picture_ref(&c->out,
                                   &c->refs[c->frame_hdr->existing_frame_idx].p.p);
-                c->out.m = in->m;
+                dav1d_data_props_copy(&c->out.m, &in->m);
             } else {
                 // need to append this to the frame output queue
                 const unsigned next = c->frame_thread.next++;
                 if (c->frame_thread.next == c->n_fc)
                     c->frame_thread.next = 0;
 
                 Dav1dFrameContext *const f = &c->fc[next];
                 pthread_mutex_lock(&f->frame_thread.td.lock);
@@ -1378,17 +1376,17 @@ int dav1d_parse_obus(Dav1dContext *const
                                                                    memory_order_relaxed);
                     if (out_delayed->visible && progress != FRAME_ERROR)
                         dav1d_picture_ref(&c->out, &out_delayed->p);
                     dav1d_thread_picture_unref(out_delayed);
                 }
                 dav1d_thread_picture_ref(out_delayed,
                                          &c->refs[c->frame_hdr->existing_frame_idx].p);
                 out_delayed->visible = 1;
-                out_delayed->p.m = in->m;
+                dav1d_data_props_copy(&out_delayed->p.m, &in->m);
                 pthread_mutex_unlock(&f->frame_thread.td.lock);
             }
             if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
                 const int r = c->frame_hdr->existing_frame_idx;
                 for (int i = 0; i < 8; i++) {
                     if (i == r) continue;
 
                     if (c->refs[i].p.p.data[0])
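The obu.c hunks above replace hand-rolled reference handling (dav1d_ref_inc plus per-field copies) with dav1d_data_ref() and dav1d_data_props_copy(). The reason is visible in the picture.c hunks below: Dav1dDataProps now carries a user_data reference, so a plain struct assignment such as the old "c->out.m = in->m" would alias that reference without accounting for the new owner. A minimal sketch of the invariant, using simplified stand-in types rather than the real dav1d structs:

    #include <stdatomic.h>

    typedef struct Ref { atomic_int ref_cnt; } Ref;

    static void ref_inc(Ref *const ref) { atomic_fetch_add(&ref->ref_cnt, 1); }

    typedef struct DataProps {
        long long timestamp;
        Ref *user_data_ref; /* may be NULL */
    } DataProps;

    /* Copying props must add a reference to the attached user data; a
     * bare struct assignment (the old code) silently aliases it. */
    static void data_props_copy(DataProps *const dst, const DataProps *const src) {
        *dst = *src;
        if (dst->user_data_ref) ref_inc(dst->user_data_ref);
    }
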
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@@ -94,18 +94,19 @@ static void free_buffer(const uint8_t *c
 
     pic_ctx->allocator.release_picture_callback(&pic_ctx->pic,
                                                 pic_ctx->allocator.cookie);
     free(pic_ctx);
 }
 
 static int picture_alloc_with_edges(Dav1dPicture *const p,
                                     const int w, const int h,
-                                    const enum Dav1dPixelLayout layout,
-                                    const int bpc,
+                                    Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
+                                    Dav1dFrameHeader *frame_hdr,  Dav1dRef *frame_hdr_ref,
+                                    const int bpc, const Dav1dDataProps *props,
                                     Dav1dPicAllocator *const p_allocator,
                                     const size_t extra, void **const extra_ptr)
 {
     if (p->data[0]) {
         fprintf(stderr, "Picture already allocated!\n");
         return -1;
     }
     assert(bpc > 0 && bpc <= 16);
@@ -115,17 +116,21 @@ static int picture_alloc_with_edges(Dav1
         return -ENOMEM;
     }
 
     p->p.w = w;
     p->p.h = h;
     p->m.timestamp = INT64_MIN;
     p->m.duration = 0;
     p->m.offset = -1;
-    p->p.layout = layout;
+    p->m.user_data.data = NULL;
+    p->m.user_data.ref = NULL;
+    p->seq_hdr = seq_hdr;
+    p->frame_hdr = frame_hdr;
+    p->p.layout = seq_hdr->layout;
     p->p.bpc = bpc;
     int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
     if (res < 0) {
         free(pic_ctx);
         return -ENOMEM;
     }
 
     pic_ctx->allocator = *p_allocator;
@@ -133,77 +138,80 @@ static int picture_alloc_with_edges(Dav1
 
     if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
         p_allocator->release_picture_callback(p, p_allocator->cookie);
         free(pic_ctx);
         fprintf(stderr, "Failed to wrap picture: %s\n", strerror(errno));
         return -ENOMEM;
     }
 
+    p->seq_hdr_ref = seq_hdr_ref;
+    if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
+
+    p->frame_hdr_ref = frame_hdr_ref;
+    if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
+
+    dav1d_data_props_copy(&p->m, props);
+
     if (extra && extra_ptr)
         *extra_ptr = &pic_ctx->extra_ptr;
 
     return 0;
 }
 
 int dav1d_thread_picture_alloc(Dav1dThreadPicture *const p,
                                const int w, const int h,
-                               const enum Dav1dPixelLayout layout, const int bpc,
+                               Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
+                               Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
+                               const int bpc, const Dav1dDataProps *props,
                                struct thread_data *const t, const int visible,
                                Dav1dPicAllocator *const p_allocator)
 {
     p->t = t;
 
     const int res =
-        picture_alloc_with_edges(&p->p, w, h, layout, bpc, p_allocator,
+        picture_alloc_with_edges(&p->p, w, h,
+                                 seq_hdr, seq_hdr_ref,
+                                 frame_hdr, frame_hdr_ref,
+                                 bpc, props, p_allocator,
                                  t != NULL ? sizeof(atomic_int) * 2 : 0,
                                  (void **) &p->progress);
     if (res) return res;
 
     p->visible = visible;
     if (t) {
         atomic_init(&p->progress[0], 0);
         atomic_init(&p->progress[1], 0);
     }
     return res;
 }
 
 int dav1d_picture_alloc_copy(Dav1dPicture *const dst, const int w,
                              const Dav1dPicture *const src)
 {
     struct pic_ctx_context *const pic_ctx = src->ref->user_data;
-    const int res = picture_alloc_with_edges(dst, w, src->p.h, src->p.layout,
-                                             src->p.bpc, &pic_ctx->allocator,
+    const int res = picture_alloc_with_edges(dst, w, src->p.h,
+                                             src->seq_hdr, src->seq_hdr_ref,
+                                             src->frame_hdr, src->frame_hdr_ref,
+                                             src->p.bpc, &src->m, &pic_ctx->allocator,
                                              0, NULL);
-
-    if (!res) {
-        dst->p = src->p;
-        dst->m = src->m;
-        dst->p.w = w;
-        dst->frame_hdr = src->frame_hdr;
-        dst->frame_hdr_ref = src->frame_hdr_ref;
-        if (dst->frame_hdr_ref) dav1d_ref_inc(dst->frame_hdr_ref);
-        dst->seq_hdr = src->seq_hdr;
-        dst->seq_hdr_ref = src->seq_hdr_ref;
-        if (dst->seq_hdr_ref) dav1d_ref_inc(dst->seq_hdr_ref);
-    }
-
     return res;
 }
 
 void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data[0] == NULL);
     validate_input(src != NULL);
 
     if (src->ref) {
         validate_input(src->data[0] != NULL);
         dav1d_ref_inc(src->ref);
         if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
         if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
+        if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
     }
     *dst = *src;
 }
 
 void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data[0] == NULL);
     validate_input(src != NULL);
@@ -219,30 +227,31 @@ void dav1d_thread_picture_ref(Dav1dThrea
                               const Dav1dThreadPicture *src)
 {
     dav1d_picture_ref(&dst->p, &src->p);
     dst->t = src->t;
     dst->visible = src->visible;
     dst->progress = src->progress;
 }
 
-void dav1d_picture_unref(Dav1dPicture *const p) {
+void dav1d_picture_unref_internal(Dav1dPicture *const p) {
     validate_input(p != NULL);
 
     if (p->ref) {
         validate_input(p->data[0] != NULL);
         dav1d_ref_dec(&p->ref);
         dav1d_ref_dec(&p->seq_hdr_ref);
         dav1d_ref_dec(&p->frame_hdr_ref);
+        dav1d_ref_dec(&p->m.user_data.ref);
     }
     memset(p, 0, sizeof(*p));
 }
 
 void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
-    dav1d_picture_unref(&p->p);
+    dav1d_picture_unref_internal(&p->p);
 
     p->t = NULL;
     p->progress = NULL;
 }
 
 int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p,
                               int y_unclipped, const enum PlaneType plane_type)
 {
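A pattern worth noting across the picture.c hunks: a picture now pins up to four reference-counted objects at once (pixel buffer, sequence header, frame header, user data), and every ref/unref path has to move all of those counts together. A hedged sketch of that symmetry, with illustrative types rather than the real ones:

    #include <stdatomic.h>

    typedef struct Ref { atomic_int ref_cnt; } Ref;
    static void ref_inc(Ref *const r) { atomic_fetch_add(&r->ref_cnt, 1); }

    typedef struct Pic {
        void *data;             /* non-NULL once allocated */
        Ref *ref, *seq_hdr_ref, *frame_hdr_ref, *user_data_ref;
    } Pic;

    static void pic_ref(Pic *const dst, const Pic *const src) {
        if (src->ref) {         /* every attached ref gains an owner */
            ref_inc(src->ref);
            if (src->seq_hdr_ref)   ref_inc(src->seq_hdr_ref);
            if (src->frame_hdr_ref) ref_inc(src->frame_hdr_ref);
            if (src->user_data_ref) ref_inc(src->user_data_ref);
        }
        *dst = *src;
    }
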
--- a/third_party/dav1d/src/picture.h
+++ b/third_party/dav1d/src/picture.h
@@ -29,16 +29,17 @@
 #define __DAV1D_SRC_PICTURE_H__
 
 #include <stdatomic.h>
 
 #include "src/thread.h"
 #include "dav1d/picture.h"
 
 #include "src/thread_data.h"
+#include "src/ref.h"
 
 enum PlaneType {
     PLANE_TYPE_Y,
     PLANE_TYPE_UV,
     PLANE_TYPE_BLOCK,
     PLANE_TYPE_ALL,
 };
 
@@ -50,17 +51,19 @@ typedef struct Dav1dThreadPicture {
     // [1] pixel data
     atomic_uint *progress;
 } Dav1dThreadPicture;
 
 /*
  * Allocate a picture with custom border size.
  */
 int dav1d_thread_picture_alloc(Dav1dThreadPicture *p, int w, int h,
-                               enum Dav1dPixelLayout layout, int bpc,
+                               Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
+                               Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
+                               int bpc, const Dav1dDataProps *props,
                                struct thread_data *t, int visible,
                                Dav1dPicAllocator *);
 
 /**
  * Allocate a picture with identical metadata to an existing picture.
  * The width is a separate argument so this function can be used for
  * super-res, where the width changes, but everything else is the same.
  * For the more typical use case of allocating a new image of the same
@@ -104,10 +107,11 @@ int dav1d_thread_picture_wait(const Dav1
  * PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no
  * 2-pass decoding; PLANE_TYPE_ALL).
  */
 void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
                                  enum PlaneType plane_type);
 
 int default_picture_allocator(Dav1dPicture *, void *cookie);
 void default_picture_release(Dav1dPicture *, void *cookie);
+void dav1d_picture_unref_internal(Dav1dPicture *p);
 
 #endif /* __DAV1D_SRC_PICTURE_H__ */
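The dav1d_picture_unref_internal declaration replaces the public name for in-library callers, the same pattern the obu.c hunk follows with dav1d_data_unref_internal; presumably the exported dav1d_picture_unref becomes a thin wrapper elsewhere. A hypothetical sketch of that split, not the actual lib.c code:

    typedef struct Pic { void *data[3]; } Pic;

    /* Library-internal worker (stubbed here for illustration). */
    static void picture_unref_internal(Pic *const p) { (void) p; }

    /* Exported entry point: validate at the API boundary, then forward. */
    void api_picture_unref(Pic *const p) {
        if (!p) return;
        picture_unref_internal(p);
    }
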
--- a/third_party/dav1d/src/recon_tmpl.c
+++ b/third_party/dav1d/src/recon_tmpl.c
@@ -200,18 +200,19 @@ static int decode_coefs(Dav1dTileContext
             } while (tok < 15);
         }
 
         levels[x * stride + y] = cf[rc] = tok;
     }
 
     // residual and sign
     int dc_sign = 1;
+    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
     const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
-    const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
+    const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
     const int dq_shift = imax(0, t_dim->ctx - 2);
     const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
     const int cf_min = -(1 << (7 + bitdepth));
     const int cf_max = (1 << (7 + bitdepth)) - 1;
     for (int i = 0; i <= eob; i++) {
         const int rc = scan[i];
         int tok = cf[rc];
         if (!tok) continue;
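The recon_tmpl.c change makes lossless segments select the same quantizer-matrix table set as 1-D and identity transforms; in other words, the coded QM must never scale coefficients inside a lossless segment. A sketch of the selection with the two-way table index spelled out (names simplified; the assumption that index 1 holds the "no QM scaling" set follows from the lookup in the hunk):

    #include <stdint.h>

    /* qm[0] = coded quantizer matrices, qm[1] = the set used when QM
     * scaling must not apply. */
    static const uint8_t *select_qm(const uint8_t *const qm[2],
                                    const int lossless, const int is_1d,
                                    const int is_idtx)
    {
        return qm[lossless || is_1d || is_idtx];
    }
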
--- a/third_party/dav1d/src/ref.c
+++ b/third_party/dav1d/src/ref.c
@@ -40,31 +40,32 @@ Dav1dRef *dav1d_ref_create(const size_t 
     Dav1dRef *res;
     void *data = dav1d_alloc_aligned(size, 32);
     if (!data) {
         return NULL;
     }
 
     res = dav1d_ref_wrap(data, default_free_callback, data);
     if (!res) {
-        free(data);
+        dav1d_free_aligned(data);
+    } else {
+        res->data = data;
     }
 
     return res;
 }
 
 Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
                          void (*free_callback)(const uint8_t *data, void *user_data),
                          void *user_data)
 {
     Dav1dRef *res = malloc(sizeof(Dav1dRef));
     if (!res) return NULL;
 
-    if (ptr == user_data)
-        res->data = user_data;
+    res->data = NULL;
     res->const_data = ptr;
     atomic_init(&res->ref_cnt, 1);
     res->free_callback = free_callback;
     res->user_data = user_data;
 
     return res;
 }
 
@@ -81,10 +82,10 @@ void dav1d_ref_dec(Dav1dRef **const pref
     if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
         ref->free_callback(ref->const_data, ref->user_data);
         free(ref);
     }
     *pref = NULL;
 }
 
 int dav1d_ref_is_writable(Dav1dRef *const ref) {
-    return atomic_load(&ref->ref_cnt) == 1;
+    return atomic_load(&ref->ref_cnt) == 1 && ref->data;
 }
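The ref.c hunks establish an ownership convention: ref->data is non-NULL only for buffers the library allocated itself via dav1d_ref_create(), never for wrapped caller-owned memory, and writability now requires both a unique holder and ownership. The same hunk also fixes a mismatched deallocator in the old error path (plain free() on an aligned allocation). A condensed restatement of the rule, with simplified types:

    #include <stdatomic.h>
    #include <stdint.h>

    typedef struct Ref {
        void *data;                 /* owned buffer; NULL when wrapped */
        const uint8_t *const_data;  /* what readers actually see */
        atomic_int ref_cnt;
    } Ref;

    /* Writable in place only if nobody else holds it AND we own it. */
    static int ref_is_writable(Ref *const ref) {
        return atomic_load(&ref->ref_cnt) == 1 && ref->data;
    }
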
--- a/third_party/dav1d/src/tables.c
+++ b/third_party/dav1d/src/tables.c
@@ -770,47 +770,46 @@ const uint8_t dav1d_sm_weights[128] = {
     144, 138, 133, 127, 121, 116, 111, 106,
     101,  96,  91,  86,  82,  77,  73,  69,
      65,  61,  57,  54,  50,  47,  44,  41,
      38,  35,  32,  29,  27,  25,  22,  20,
      18,  16,  15,  13,  12,  10,   9,   8,
       7,   6,   6,   5,   5,   4,   4,   4
 };
 
-const int16_t dav1d_dr_intra_derivative[90] = {
-    // More evenly spread out angles and limited to 10-bit
+const uint16_t dav1d_dr_intra_derivative[44] = {
     // Values that are 0 will never be used
-       0, 0, 0,       // Approx angle
-    1023, 0, 0,       // 3, ...
-     547, 0, 0,       // 6, ...
-     372, 0, 0, 0, 0, // 9, ...
-     273, 0, 0,       // 14, ...
-     215, 0, 0,       // 17, ...
-     178, 0, 0,       // 20, ...
-     151, 0, 0,       // 23, ... (113 & 203 are base angles)
-     132, 0, 0,       // 26, ...
-     116, 0, 0,       // 29, ...
-     102, 0, 0, 0,    // 32, ...
-      90, 0, 0,       // 36, ...
-      80, 0, 0,       // 39, ...
-      71, 0, 0,       // 42, ...
-      64, 0, 0,       // 45, ... (45 & 135 are base angles)
-      57, 0, 0,       // 48, ...
-      51, 0, 0,       // 51, ...
-      45, 0, 0, 0,    // 54, ...
-      40, 0, 0,       // 58, ...
-      35, 0, 0,       // 61, ...
-      31, 0, 0,       // 64, ...
-      27, 0, 0,       // 67, ... (67 & 157 are base angles)
-      23, 0, 0,       // 70, ...
-      19, 0, 0,       // 73, ...
-      15, 0, 0, 0, 0, // 76, ...
-      11, 0, 0,       // 81, ...
-       7, 0, 0,       // 84, ...
-       3, 0, 0,       // 87, ...
+          0,    // Angles:
+    1023, 0,    //  3,  93, 183
+     547,       //  6,  96, 186
+     372, 0, 0, //  9,  99, 189
+     273,       // 14, 104, 194
+     215, 0,    // 17, 107, 197
+     178,       // 20, 110, 200
+     151, 0,    // 23, 113, 203 (113 & 203 are base angles)
+     132,       // 26, 116, 206
+     116, 0,    // 29, 119, 209
+     102, 0,    // 32, 122, 212
+      90,       // 36, 126, 216
+      80, 0,    // 39, 129, 219
+      71,       // 42, 132, 222
+      64, 0,    // 45, 135, 225 (45 & 135 are base angles)
+      57,       // 48, 138, 228
+      51, 0,    // 51, 141, 231
+      45, 0,    // 54, 144, 234
+      40,       // 58, 148, 238
+      35, 0,    // 61, 151, 241
+      31,       // 64, 154, 244
+      27, 0,    // 67, 157, 247 (67 & 157 are base angles)
+      23,       // 70, 160, 250
+      19, 0,    // 73, 163, 253
+      15, 0,    // 76, 166, 256
+      11, 0,    // 81, 171, 261
+       7,       // 84, 174, 264
+       3        // 87, 177, 267
 };
 
 const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
     {
          -6,  10,  -5,   2,  -3,   1,  -3,   1,
          -4,   6,  -3,   2,  -3,   2,  -3,   1,
           0,   0,  10,   0,   1,  10,   1,   2,
           0,   0,   6,   0,   2,   6,   2,   2,
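dav1d_dr_intra_derivative shrinks from 90 int16_t entries indexed by angle to 44 uint16_t entries indexed by angle >> 1; the per-row comments show that every valid angle lands in a distinct slot. The new asm below exploits this directly: "and dxd, 0x7e" yields 2*(angle >> 1), which is already the byte offset of the word entry. An equivalent C lookup, as a sketch:

    #include <stdint.h>

    extern const uint16_t dav1d_dr_intra_derivative[44];

    /* angle is one of the valid AV1 prediction angles (3, 6, ..., 87);
     * angle >> 1 indexes the compacted table without collisions. */
    static uint16_t dr_intra_derivative(const int angle) {
        return dav1d_dr_intra_derivative[angle >> 1];
    }
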
--- a/third_party/dav1d/src/tables.h
+++ b/third_party/dav1d/src/tables.h
@@ -109,16 +109,16 @@ extern const Dav1dWarpedMotionParams dav
 extern const int16_t dav1d_sgr_params[16][4];
 extern const int dav1d_sgr_x_by_xplus1[256];
 
 extern const int8_t dav1d_mc_subpel_filters[5][15][8];
 extern const int8_t dav1d_mc_warp_filter[193][8];
 extern const int16_t dav1d_resize_filter[64][8];
 
 extern const uint8_t dav1d_sm_weights[128];
-extern const int16_t dav1d_dr_intra_derivative[90];
+extern const uint16_t dav1d_dr_intra_derivative[44];
 extern const int8_t dav1d_filter_intra_taps[5][64];
 
 extern const uint8_t dav1d_obmc_masks[64];
 
 extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs
 
 #endif /* __DAV1D_SRC_TABLES_H__ */
--- a/third_party/dav1d/src/x86/cpu.c
+++ b/third_party/dav1d/src/x86/cpu.c
@@ -20,16 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "config.h"
+
 #include <stdint.h>
 
 #include "src/x86/cpu.h"
 
 void dav1d_cpu_cpuid(uint32_t *info, int leaf);
 uint64_t dav1d_cpu_xgetbv(int xcr);
 
 unsigned dav1d_get_cpu_flags_x86(void) {
@@ -42,26 +44,29 @@ unsigned dav1d_get_cpu_flags_x86(void) {
     if (n_ids >= 1) {
         dav1d_cpu_cpuid(info, 1);
         if (info[3] & (1 << 25)) flags |= DAV1D_X86_CPU_FLAG_SSE;
         if (info[3] & (1 << 26)) flags |= DAV1D_X86_CPU_FLAG_SSE2;
         if (info[2] & (1 <<  0)) flags |= DAV1D_X86_CPU_FLAG_SSE3;
         if (info[2] & (1 <<  9)) flags |= DAV1D_X86_CPU_FLAG_SSSE3;
         if (info[2] & (1 << 19)) flags |= DAV1D_X86_CPU_FLAG_SSE41;
         if (info[2] & (1 << 20)) flags |= DAV1D_X86_CPU_FLAG_SSE42;
+#if ARCH_X86_64
+        /* We only support >128-bit SIMD on x86-64. */
         if (info[2] & (1 << 27)) /* OSXSAVE */ {
             uint64_t xcr = dav1d_cpu_xgetbv(0);
             if ((xcr & 0x00000006) == 0x00000006) /* XMM/YMM */ {
                 if (info[2] & (1 << 28)) flags |= DAV1D_X86_CPU_FLAG_AVX;
                 if (n_ids >= 7) {
                     dav1d_cpu_cpuid(info, 7);
                     if (info[1] & (1 <<  5)) flags |= DAV1D_X86_CPU_FLAG_AVX2;
                     if ((xcr & 0x000000e0) == 0x000000e0) /* ZMM/OPMASK */ {
                         if ((info[1] & 0xd0030000) == 0xd0030000)
                             flags |= DAV1D_X86_CPU_FLAG_AVX512;
                     }
                 }
             }
         }
+#endif
     }
 
     return flags;
 }
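Two things happen in the cpu.c hunk: config.h is included so that ARCH_X86_64 is defined, and AVX/AVX2/AVX-512 detection is compiled out on 32-bit x86 since the wide-SIMD code paths are x86-64 only. The XCR0 masks follow the standard XSAVE state-component bits; spelled out for reference (standard Intel SDM assignments, not dav1d code):

    #include <stdint.h>

    enum {
        XCR0_X87       = 1 << 0,
        XCR0_SSE       = 1 << 1,   /* XMM registers */
        XCR0_AVX       = 1 << 2,   /* upper halves of YMM */
        XCR0_OPMASK    = 1 << 5,   /* AVX-512 k0-k7 */
        XCR0_ZMM_HI256 = 1 << 6,   /* upper halves of ZMM0-15 */
        XCR0_HI16_ZMM  = 1 << 7    /* ZMM16-31 */
    };

    /* (xcr & 0x06) == 0x06 in the hunk: OS saves XMM and YMM state. */
    static int os_saves_ymm(const uint64_t xcr0) {
        return (xcr0 & (XCR0_SSE | XCR0_AVX)) == (XCR0_SSE | XCR0_AVX);
    }

    /* (xcr & 0xe0) == 0xe0: OS additionally saves full AVX-512 state. */
    static int os_saves_zmm(const uint64_t xcr0) {
        const uint64_t m = XCR0_OPMASK | XCR0_ZMM_HI256 | XCR0_HI16_ZMM;
        return (xcr0 & m) == m;
    }
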
--- a/third_party/dav1d/src/x86/ipred.asm
+++ b/third_party/dav1d/src/x86/ipred.asm
@@ -23,17 +23,17 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
 
-SECTION_RODATA 32
+SECTION_RODATA 64
 
 %macro SMOOTH_WEIGHT_TABLE 1-*
     %rep %0
         db %1-128, 127-%1
         %rotate 1
     %endrep
 %endmacro
 
@@ -52,73 +52,82 @@ smooth_weights: SMOOTH_WEIGHT_TABLE     
     196, 189, 182, 176, 169, 163, 156, 150, \
     144, 138, 133, 127, 121, 116, 111, 106, \
     101,  96,  91,  86,  82,  77,  73,  69, \
      65,  61,  57,  54,  50,  47,  44,  41, \
      38,  35,  32,  29,  27,  25,  22,  20, \
      18,  16,  15,  13,  12,  10,   9,   8, \
       7,   6,   6,   5,   5,   4,   4,   4
 
-; Note that the order of (some of) the following z constants matter
 z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
 z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
               db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
               db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
 z_filter_s:   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
               db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
+pb_12:        times 4 db 12 ; these are just placed here for alignment.
+pb_14:        times 4 db 14
+z3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
 z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
 z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
-z_upsample:   db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
-z_shuf_w4:    db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+z_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
+z_upsample3:  db  0,  0,  0,  0,  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5
+z1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
+z_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
 z_base_inc:   dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
               dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
 
 ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
 filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
               db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
 filter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
-filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
+filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
+pb_127_m127:  times 2 db 127, -127
 ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
               db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
 ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
-              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4,  0,  0,  0,  0
+              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
+pw_64:        times 2 dw 64
 
-pb_0to15:
 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
                         db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                         ; w=8, w_pad=1 as well as second half of previous one
 cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
                         times 5 db 6, 7
                         ; w=16,w_pad=2
                         db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                         times 8 db 14, 15
                         ; w=16,w_pad=3
                         db 0, 1, 2, 3, 4, 5
                         times 13 db 6, 7
+pb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-pb_1:   times 4 db 1
-pb_2:   times 4 db 2
-pb_4:   times 4 db 4
-pb_8:   times 4 db 8
-pb_12:  times 4 db 12
-pb_14:  times 4 db 14
-pb_15   times 4 db 15
-pb_31:  times 4 db 31
-pb_128: times 4 db 128
-pw_1:   times 2 dw 1
-pw_8:   times 2 dw 8
-pw_62:  times 2 dw 62
-pw_64:  times 2 dw 64
-pw_128: times 2 dw 128
-pw_255: times 2 dw 255
-pw_512: times 2 dw 512
+%define pb_0to15 cfl_ac_w16_pad_shuffle
+%define pb_1  (ipred_h_shuf+12)
+%define pb_2  (ipred_h_shuf+20)
+%define pb_3  (ipred_h_shuf+ 4)
+%define pb_4  (ipred_h_shuf+24)
+%define pb_7  (ipred_h_shuf+ 0)
+%define pb_8  (z_upsample2 +12)
+%define pb_15 (z_filter_s  +32)
+%define pw_8  (z_filter_k  +32)
 
-pb_36_m4:    times 2 db  36,   -4
-pb_127_m127: times 2 db 127, -127
+pb_27:    times 4 db 27
+pb_31:    times 4 db 31
+pb_128:   times 4 db 128
+pw_1:     times 2 dw 1
+pw_62:    times 2 dw 62
+pw_128:   times 2 dw 128
+pw_255:   times 2 dw 255
+pw_512:   times 2 dw 512
+pb_36_m4: times 2 db 36, -4
 
 %macro JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - 2*4)
     %xdefine %%base mangle(private_prefix %+ _%1_%2)
     %%table:
     %rep %0 - 2
         dd %%base %+ .%3 - (%%table - 2*4)
         %rotate 1
@@ -133,16 +142,17 @@ JMP_TABLE ipred_smooth_v, avx2, w4, w8, 
 JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_paeth,    avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_filter,   avx2, w4, w8, w16, w32
 JMP_TABLE ipred_dc,       avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
 JMP_TABLE ipred_dc_left,  avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_h,        avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_z1,       avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3,       avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_cfl,      avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
 JMP_TABLE pal_pred,       avx2, w4, w8, w16, w32, w64
 
 cextern dr_intra_derivative
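The rodata reshuffle above is largely deduplication: several pb_*/pw_* constants stop existing as separate data and become %define aliases into byte patterns that other tables already contain (pb_1 is the 1,1,1,1 run at ipred_h_shuf+12; pb_8 is the 8,8,8,8 tail of z_upsample2), and the remaining literals are placed so hot constants share cache lines. The same trick in C, as a toy illustration:

    #include <stdint.h>

    /* An existing shuffle table whose bytes 12..15 happen to be 1,1,1,1. */
    static const uint8_t ipred_h_shuf[16] = {
        7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1,
    };

    /* Alias the needed 4-byte pattern instead of storing it again. */
    #define pb_1 (ipred_h_shuf + 12)
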
@@ -1293,37 +1303,36 @@ cglobal ipred_z1, 3, 8, 0, dst, stride, 
     lea                  r6, [ipred_z1_avx2_table]
     tzcnt                wd, wm
     movifnidn        angled, anglem
     movifnidn            hd, hm
     lea                  r7, [dr_intra_derivative]
     inc                 tlq
     movsxd               wq, [r6+wq*4]
     add                  wq, r6
-    movzx               dxd, angleb
+    mov                 dxd, angled
+    and                 dxd, 0x7e
     add              angled, 165 ; ~90
-    movzx               dxd, word [r7+dxq*2]
+    movzx               dxd, word [r7+dxq]
     xor              angled, 0x4ff ; d = 90 - angle
     vpbroadcastd         m3, [pw_512]
     vpbroadcastd         m4, [pw_62]
     vpbroadcastd         m5, [pw_64]
     jmp                  wq
 .w4:
     cmp              angleb, 40
     jae .w4_no_upsample
     lea                 r3d, [angleq-1024]
     sar                 r3d, 7
     add                 r3d, hd
     jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
     ALLOC_STACK         -32, 8
     mova                xm1, [tlq-1]
-    pshufb              xm0, xm1, [z_upsample]
-    vpbroadcastd        xm2, [pb_8]
-    pminub              xm2, [z_filter_s+6]
-    pshufb              xm1, xm2
+    pshufb              xm0, xm1, [z_upsample1]
+    pshufb              xm1, [z_upsample2]
     vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
     add                 dxd, dxd        ; pw_512 (which is already in m3)
     pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
     pextrd         [rsp+16], xm1, 3 ; top[max_base_x]
     pmaddubsw           xm1, xm2
     movd                xm7, dxd
     mov                 r3d, dxd ; xpos
     vpbroadcastw         m7, xm7
@@ -1370,47 +1379,47 @@ cglobal ipred_z1, 3, 8, 0, dst, stride, 
     lea                dstq, [dstq+strideq*4]
     sub                  hd, 4
     jg .w4_upsample_loop
     RET
 ALIGN function_align
 .filter_strength: ; w4/w8/w16
     ; The C version uses a lot of branches, but we can do all the comparisons
     ; in parallel and use popcnt to get the final filter strength value.
+%define base r3-z_filter_t0
+    lea                  r3, [z_filter_t0]
     movd                xm0, maxbased
     movd                xm2, angled
-    lea                  r3, [z_filter_t0]
     shr              angled, 8 ; is_sm << 1
     vpbroadcastb         m0, xm0
     vpbroadcastb         m2, xm2
-    pcmpeqb              m1, m0, [r3-z_filter_t0+z_filter_wh]
+    pcmpeqb              m1, m0, [base+z_filter_wh]
     pand                 m1, m2
     mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
     pcmpgtb              m1, m2
     pmovmskb            r5d, m1
     popcnt              r5d, r5d ; sets ZF which can be used by caller
     ret
 .w4_no_upsample:
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -16, 11
     mov            maxbased, 7
     test             angled, 0x400 ; !enable_intra_edge_filter
     jnz .w4_main
     lea            maxbased, [hq+3]
     call .filter_strength
     mov            maxbased, 7
     jz .w4_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
-    vpbroadcastd         m7, [pb_8]
+    vpbroadcastd         m7, [base+pb_8]
     vbroadcasti128       m2, [tlq-1]
-    pminub               m1, m7, [r3-z_filter_k+z_filter_s+4]
-    vpbroadcastd         m8, [r3+r5*4+12*0]
-    pminub               m7, [r3-z_filter_k+z_filter_s+12]
-    vpbroadcastd         m9, [r3+r5*4+12*1]
-    vpbroadcastd        m10, [r3+r5*4+12*2]
+    pminub               m1, m7, [base+z_filter_s]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
+    pminub               m7, [base+z_filter_s+8]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
     pshufb               m0, m2, m1
     shufps               m1, m7, q2121
     pmaddubsw            m0, m8
     pshufb               m1, m2, m1
     pmaddubsw            m1, m9
     pshufb               m2, m7
     pmaddubsw            m2, m10
     paddw                m0, m1
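The .filter_strength comment above is the interesting part of this hunk: rather than the C code's chain of per-size branches, all threshold comparisons run in one SIMD register and a popcnt of the resulting byte mask produces the strength value directly. A conceptual sketch of the idea with AVX2 intrinsics (GCC/Clang, compile with -mavx2; not a line-for-line port of the asm, which also folds in the block-size match and is_sm masking):

    #include <immintrin.h>

    /* Count, in parallel, how many packed thresholds the value exceeds;
     * the lookup tables are laid out so this count maps to the strength. */
    static int filter_strength(const __m256i thresholds, const int value) {
        const __m256i v  = _mm256_set1_epi8((char)value);
        const __m256i gt = _mm256_cmpgt_epi8(v, thresholds);
        return __builtin_popcount((unsigned)_mm256_movemask_epi8(gt));
    }
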
@@ -1427,17 +1436,17 @@ ALIGN function_align
     movd                xm6, dxd
     vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
     vpbroadcastb         m7, [tlq+maxbaseq]
     shl            maxbased, 6
     vpbroadcastw         m6, xm6
     mov                 r3d, dxd ; xpos
     movd                xm9, maxbased
     vpbroadcastw         m9, xm9
-    vbroadcasti128       m8, [z_shuf_w4]
+    vbroadcasti128       m8, [z1_shuf_w4]
     psrlw                m7, 8  ; top[max_base_x]
     paddw               m10, m6, m6
     psubw                m9, m0 ; max_base_x
     vpblendd             m6, m10, 0xcc
     mova                xm0, xm10
     paddw                m6, m0 ; xpos2 xpos3 xpos0 xpos1
     paddw               m10, m10
 .w4_loop:
@@ -1497,17 +1506,17 @@ ALIGN function_align
     ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -32, 8
     movu                xm2, [z_filter_s+6]
     mova                xm0, [tlq-1]
     movd                xm6, hd
     vinserti128          m0, [tlq+7], 1
     vpbroadcastb        xm6, xm6
-    vbroadcasti128       m1, [z_upsample]
+    vbroadcasti128       m1, [z_upsample1]
     pminub              xm6, xm2
     vpbroadcastd         m7, [pb_36_m4]
     vinserti128          m2, xm6, 1
     add                 dxd, dxd
     pshufb               m1, m0, m1
     pshufb               m2, m0, m2
     movd                xm6, dxd
     pmaddubsw            m1, m7
@@ -1556,73 +1565,67 @@ ALIGN function_align
     movhps [dstq+strideq*2], xm0
     movq   [dstq+strideq*1], xm1
     movhps [dstq+r2       ], xm1
     lea                dstq, [dstq+strideq*4]
     sub                  hd, 4
     jg .w8_upsample_loop
     RET
 .w8_no_intra_edge_filter:
-    mov                 r3d, 15
-    cmp                  hd, 8
-    cmova          maxbased, r3d
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(h+7, 15)
     jmp .w8_main
 .w8_no_upsample:
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -32, 10
     lea            maxbased, [hq+7]
     test             angled, 0x400
     jnz .w8_no_intra_edge_filter
     call .filter_strength
-    vpbroadcastd        xm6, [pb_15]
-    pminub              xm6, xm0 ; imin(h, 8) + 7
-    movd           maxbased, xm6
-    movzx          maxbased, maxbaseb
     jz .w8_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
     movu                xm2, [tlq]
-    pminub              xm1, xm6, [r3-z_filter_k+z_filter_s+18]
+    pminub              xm1, xm0, [base+z_filter_s+14]
     vinserti128          m2, [tlq-1], 1
-    vinserti128          m1, [r3-z_filter_k+z_filter_s+ 4], 1
-    vpbroadcastd         m7, [r3+r5*4+12*0]
-    pminub              xm6, [r3-z_filter_k+z_filter_s+26]
-    vinserti128          m6, [r3-z_filter_k+z_filter_s+12], 1
-    pshufb               m0, m2, m1
-    pmaddubsw            m0, m7
-    vpbroadcastd         m7, [r3+r5*4+12*1]
+    vinserti128          m1, [base+z_filter_s+ 0], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
+    pminub              xm0, [base+z_filter_s+22]
+    vinserti128          m0, [base+z_filter_s+ 8], 1
+    pshufb               m6, m2, m1
+    pmaddubsw            m6, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
     movzx               r3d, byte [tlq+15]
-    shufps               m1, m6, q2121
+    shufps               m1, m0, q2121
     pshufb               m1, m2, m1
     pmaddubsw            m1, m7
-    paddw                m0, m1
+    paddw                m1, m6
     sub                 r5d, 3
     jnz .w8_3tap
     ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
     ; which also results in an awkward edge case where out[w*2] is
     ; slightly different from out[max_base_x] when h > w.
     vpbroadcastd         m7, [z_filter_k+4*8]
     movzx               r2d, byte [tlq+14]
-    pshufb               m2, m6
+    pshufb               m2, m0
     pmaddubsw            m2, m7
     sub                 r2d, r3d
     lea                 r2d, [r2+r3*8+4]
     shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
     mov            [rsp+16], r2b
-    paddw                m0, m2
+    paddw                m1, m2
 .w8_3tap:
-    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     sar                 r5d, 1
     mov                 tlq, rsp
     add                 r5d, 17 ; w*2 + (filter_strength == 3)
     cmp                  hd, 8
     cmova          maxbased, r5d
     mov            [tlq+r5], r3b
-    vextracti128        xm1, m0, 1
-    packuswb            xm1, xm0
-    mova              [tlq], xm1
+    vextracti128        xm0, m1, 1
+    packuswb            xm0, xm1
+    mova              [tlq], xm0
 .w8_main:
     movd                xm2, dxd
     vbroadcasti128       m0, [z_base_inc]
     vpbroadcastw         m2, xm2
     vpbroadcastb         m7, [tlq+maxbaseq]
     shl            maxbased, 6
     movd                xm9, maxbased
     vbroadcasti128       m8, [z_filter_s+2]
@@ -1663,58 +1666,50 @@ ALIGN function_align
     movq   [dstq+strideq*0], xm7
     movq   [dstq+strideq*1], xm7
     lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
     jg .w8_end_loop
 .w8_end:
     RET
 .w16_no_intra_edge_filter:
-    mov                 r3d, 31
-    cmp                  hd, 16
-    cmova          maxbased, r3d
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(h+15, 31)
     jmp .w16_main
 ALIGN function_align
 .w16:
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -64, 12
     lea            maxbased, [hq+15]
     test             angled, 0x400
     jnz .w16_no_intra_edge_filter
     call .filter_strength
-    vpbroadcastd         m1, [pb_31]
-    pminub               m0, m1 ; imin(h, 16) + 15
-    movd           maxbased, xm0
-    movzx          maxbased, maxbaseb
     jz .w16_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
-    vpbroadcastd         m1, [pb_12]
-    vpbroadcastd        m11, [pb_15]
-    vbroadcasti128       m6, [r3-z_filter_k+z_filter_s+12]
-    vinserti128          m2, m6, [r3-z_filter_k+z_filter_s+4], 0
-    vinserti128          m6, [r3-z_filter_k+z_filter_s+20], 1
+    vpbroadcastd         m1, [base+pb_12]
+    vbroadcasti128       m6, [base+z_filter_s+8]
+    vinserti128          m2, m6, [base+z_filter_s], 0
+    vinserti128          m6, [base+z_filter_s+16], 1
     mova               xm10, [tlq-1]
     vinserti128         m10, [tlq+3], 1
-    vpbroadcastd         m9, [r3+r5*4+12*0]
-    vbroadcasti128       m7, [r3-z_filter_k+z_filter_s+18]
-    vinserti128          m8, m7, [r3-z_filter_k+z_filter_s+10], 0
-    vinserti128          m7, [r3-z_filter_k+z_filter_s+26], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
+    vbroadcasti128       m7, [base+z_filter_s+14]
+    vinserti128          m8, m7, [base+z_filter_s+6], 0
+    vinserti128          m7, [base+z_filter_s+22], 1
     psubw                m0, m1
-    pminub               m0, m11 ; imin(h+3, 15)
     movu               xm11, [tlq+12]
     vinserti128         m11, [tlq+16], 1
     pminub               m8, m0
     pminub               m7, m0
     pshufb               m0, m10, m2
     shufps               m2, m6, q2121
     pmaddubsw            m0, m9
     pshufb               m1, m11, m8
     shufps               m8, m7, q2121
     pmaddubsw            m1, m9
-    vpbroadcastd         m9, [r3+r5*4+12*1]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
     movzx               r3d, byte [tlq+31]
     pshufb               m2, m10, m2
     pmaddubsw            m2, m9
     pshufb               m8, m11, m8
     pmaddubsw            m8, m9
     paddw                m0, m2
     paddw                m1, m8
     sub                 r5d, 3
@@ -2126,16 +2121,1180 @@ ALIGN function_align
     mova          [dstq+ 0], m7
     mova          [dstq+32], m7
     add                dstq, strideq
     dec                  hd
     jg .w64_end_loop
 .w64_end:
     RET
 
+cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
+    %assign org_stack_offset stack_offset
+    lea                  r6, [ipred_z3_avx2_table]
+    tzcnt                hd, hm
+    movifnidn        angled, anglem
+    lea                  r7, [dr_intra_derivative+45*2-1]
+    dec                 tlq
+    movsxd               hq, [r6+hq*4]
+    sub              angled, 180
+    add                  hq, r6
+    mov                 dyd, angled
+    neg                 dyd
+    xor              angled, 0x400
+    or                  dyq, ~0x7e
+    movzx               dyd, word [r7+dyq]
+    vpbroadcastd         m3, [pw_512]
+    vpbroadcastd         m4, [pw_62]
+    vpbroadcastd         m5, [pw_64]
+    mov              org_wd, wd
+    jmp                  hq
+.h4:
+    lea                  r7, [strideq*3]
+    cmp              angleb, 40
+    jae .h4_no_upsample
+    lea                 r4d, [angleq-1024]
+    sar                 r4d, 7
+    add                 r4d, wd
+    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+    ALLOC_STACK         -32, 9
+    movu                xm8, [tlq-7]
+    pshufb              xm0, xm8, [z_upsample3]
+    vpbroadcastb        xm2, xm8
+    pshufb              xm1, xm8, [z_filter_s+2]
+    mova           [rsp+16], xm2 ; left[max_base_y]
+    vpbroadcastd        xm2, [pb_36_m4]
+    add                 dyd, dyd
+    pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm2
+    movd                xm7, dyd
+    mov                 r2d, dyd
+    vpbroadcastw         m7, xm7
+    paddw               xm1, xm0
+    pmulhrsw            xm1, xm3
+    pslldq               m6, m7, 8
+    paddw               xm2, xm7, xm7
+    paddw                m6, m7
+    packuswb            xm1, xm1
+    paddw                m6, m2
+    punpcklbw           xm1, xm8
+    mova                xm8, [z_transpose4]
+    psllw                m7, 2
+    pshufb              xm1, [pb_15to0]
+    mova              [rsp], xm1
+.h4_upsample_loop:
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6
+    vpbroadcastq         m1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6
+    vpbroadcastq         m2, [rsp+r4]
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6
+    movq                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6
+    movhps              xm0, [rsp+r4]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2
+    psllw                m2, 8
+    por                  m1, m2
+    pmaddubsw            m0, m1
+    paddw                m6, m7
+    pmulhrsw             m0, m3
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm8
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h4_upsample_loop
+    RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+    lea                  r4, [z_filter_t0]
+    movd                xm0, maxbased
+    movd                xm2, angled
+    shr              angled, 8 ; is_sm << 1
+    vpbroadcastb         m0, xm0
+    vpbroadcastb         m2, xm2
+    pcmpeqb              m1, m0, [base+z_filter_wh]
+    pand                 m1, m2
+    mova                xm2, [r4+angleq*8]
+    pcmpgtb              m1, m2
+    pmovmskb            r5d, m1
+    popcnt              r5d, r5d
+    ret
+.h4_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -16, 12
+    mov            maxbased, 7
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h4_main
+    lea            maxbased, [wq+3]
+    call .filter_strength
+    mov            maxbased, 7
+    jz .h4_main ; filter_strength == 0
+    vpbroadcastd         m7, [base+pb_7]
+    vbroadcasti128       m2, [tlq-14]
+    pmaxub               m1, m7, [base+z_filter_s-4]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
+    pmaxub               m7, [base+z_filter_s+4]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
+    pshufb               m0, m2, m1
+    shufps               m1, m7, q2121
+    pmaddubsw            m0, m8
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m9
+    pshufb               m2, m7
+    pmaddubsw            m2, m10
+    paddw                m0, m1
+    paddw                m0, m2
+    pmulhrsw             m0, m3
+    mov                 r4d, 9
+    lea                 tlq, [rsp+15]
+    cmp                  wd, 4
+    cmova          maxbased, r4d
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova              [rsp], xm0
+.h4_main:
+    movd                xm6, dyd
+    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
+    mov                  r4, tlq
+    sub                 tlq, 4
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63] ; ypos
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf_w4]
+    neg            maxbaseq
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8  ; left[max_base_y]
+    paddw               m10, m6, m6
+    psubw                m9, m0 ; max_base_y
+    vpblendd             m6, m10, 0xcc
+    mova                xm0, xm10
+    paddw                m6, m0 ; ypos2 ypos3 ypos0 ypos1
+    paddw               m10, m10
+    mova               xm11, [z_transpose4]
+.h4_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base0
+    vpbroadcastq         m1, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base1
+    vpbroadcastq         m2, [tlq+r5]
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base2
+    movq                xm0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base3
+    movhps              xm0, [tlq+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac << 1
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; (32 - frac) << 1
+    psllw                m2, 8
+    pshufb               m0, m8
+    por                  m1, m2     ; (32-frac, frac) << 1
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m6 ; base < max_base_y
+    pmulhrsw             m0, m3
+    paddsw               m6, m10    ; ypos += dy
+    vpblendvb            m0, m7, m0, m1
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm11   ; transpose
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jz .h4_end
+    cmp                 r4d, maxbased
+    jg .h4_loop
+    packuswb            xm7, xm7
+.h4_end_loop:
+    movd   [dstq+strideq*0], xm7
+    movd   [dstq+strideq*1], xm7
+    movd   [dstq+strideq*2], xm7
+    movd   [dstq+r7       ], xm7
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h4_end_loop
+.h4_end:
+    RET
+ALIGN function_align
+.h8:
+    lea                 r4d, [angleq+216]
+    mov                 r4b, wb
+    cmp                 r4d, 8
+    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 8
+    and                 r4d, 4
+    mova                xm0, [tlq-15]
+    vinserti128          m0, [tlq- 9], 1
+    movd                xm1, r4d
+    movu                xm2, [z_filter_s+2]
+    vinserti128          m2, [z_filter_s+6], 1
+    vpbroadcastb        xm1, xm1 ; w & 4
+    vpbroadcastd         m7, [pb_36_m4]
+    pmaxub              xm1, [z_upsample3] ; clip 4x8
+    vinserti128          m1, [z_upsample1], 1
+    add                 dyd, dyd
+    pshufb               m1, m0, m1
+    pshufb               m2, m0, m2
+    vinserti128          m0, [tlq-7], 1
+    movd                xm6, dyd
+    pmaddubsw            m1, m7
+    pmaddubsw            m2, m7
+    vpbroadcastw         m6, xm6
+    mov                 r2d, dyd
+    lea                  r5, [strideq*3]
+    paddw                m7, m6, m6
+    paddw                m1, m2
+    vpblendd             m6, m7, 0xf0
+    pmulhrsw             m1, m3
+    pslldq               m2, m7, 8
+    paddw                m7, m7
+    paddw                m6, m2
+    vbroadcasti128       m2, [pb_15to0]
+    packuswb             m1, m1
+    punpcklbw            m1, m0
+    pshufb               m1, m2
+    vextracti128   [rsp+ 0], m1, 1
+    mova           [rsp+16], xm1
+.h8_upsample_loop:
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base0
+    movu                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base1
+    vinserti128          m0, [rsp+r4], 1
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base2
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    punpcklqdq           m1, m2, m2 ; frac0 frac1
+    pmaddubsw            m0, m1
+    movu                xm1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base3
+    vinserti128          m1, [rsp+r4], 1
+    punpckhqdq           m2, m2 ; frac2 frac3
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    paddw                m6, m7
+    pmulhrsw             m1, m3
+    lea                  r4, [dstq+strideq*4]
+    psllw                m1, 8
+    por                  m0, m1
+    vextracti128        xm1, m0, 1
+    punpcklbw           xm2, xm0, xm1
+    punpckhbw           xm0, xm1
+    movd   [dstq+strideq*0], xm2
+    pextrd [dstq+strideq*1], xm2, 1
+    pextrd [dstq+strideq*2], xm2, 2
+    pextrd [dstq+r5       ], xm2, 3
+    movd   [r4  +strideq*0], xm0
+    pextrd [r4  +strideq*1], xm0, 1
+    pextrd [r4  +strideq*2], xm0, 2
+    pextrd [r4  +r5       ], xm0, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h8_upsample_loop
+    RET
+.h8_no_intra_edge_filter:
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(w+7, 15)
+    jmp .h8_main
+.h8_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 10
+    lea            maxbased, [wq+7]
+    test             angled, 0x400
+    jnz .h8_no_intra_edge_filter
+    call .filter_strength
+    jz .h8_main ; filter_strength == 0
+    vpbroadcastd        xm6, [base+pb_15]
+    pcmpeqb             xm1, xm1
+    psubusb             xm6, xm0
+    psubb               xm6, xm1 ; w == 4 ? 5 : 1
+    movu                xm2, [tlq-16]
+    pmaxub              xm1, xm6, [base+z_filter_s]
+    vinserti128          m2, [tlq-14], 1
+    vinserti128          m1, [base+z_filter_s+12], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
+    pmaxub              xm6, [base+z_filter_s+ 8]
+    vinserti128          m6, [base+z_filter_s+20], 1
+    pshufb               m0, m2, m1
+    pmaddubsw            m0, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
+    movzx               r4d, byte [tlq-15]
+    shufps               m1, m6, q2121
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m7
+    paddw                m0, m1
+    sub                 r5d, 3
+    jnz .h8_3tap
+    vpbroadcastd         m7, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq-14]
+    pshufb               m2, m6
+    pmaddubsw            m2, m7
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4]
+    shr                 r2d, 3
+    mov            [rsp+15], r2b
+    paddw                m0, m2
+.h8_3tap:
+    pmulhrsw             m0, m3
+    sar                 r5d, 1
+    lea                 tlq, [rsp+31]
+    add                 r5d, 17
+    cmp                  wd, 8
+    cmova          maxbased, r5d
+    neg                  r5
+    mov            [tlq+r5], r4b
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova           [tlq-15], xm0
+.h8_main:
+    movd                xm2, dyd
+    vbroadcasti128       m0, [z_base_inc]
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq
+    vpbroadcastw         m2, xm2
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8
+    psubw                m9, m0
+    paddw                m6, m2, m2
+    vpblendd             m2, m6, 0x0f
+.h8_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6
+    pand                 m0, m4, m2
+    psubw                m1, m5, m0
+    psllw                m0, 8
+    por                  m1, m0
+    vbroadcasti128       m0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6
+    vinserti128          m0, [tlq+r5], 0
+    sub                 rsp, 8*2
+    pshufb               m0, m8
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m2
+    paddsw               m2, m6
+    pmulhrsw             m0, m3
+    vpblendvb            m0, m7, m0, m1
+    vextracti128        xm1, m0, 1
+    psllw               xm0, 8
+    por                 xm0, xm1 ; interleave rows (partial transpose)
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jz .h8_transpose
+    cmp                 r4d, maxbased
+    jg .h8_loop
+    packuswb            xm0, xm7, xm7
+.h8_end_loop:
+    sub                 rsp, 8*2
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jg .h8_end_loop
+.h8_transpose:
+    mova                xm2, [rsp+16*1]
+    sub              org_wd, 8
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovg              dstq, r6
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+    lea                  r6, [dstq+strideq*4]
+    jge .h8_w8
+    add                 rsp, 16*2
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r2       ], xm1, 3
+    movd   [r6  +strideq*0], xm2
+    pextrd [r6  +strideq*1], xm2, 1
+    pextrd [r6  +strideq*2], xm2, 2
+    pextrd [r6  +r2       ], xm2, 3
+    jmp .h8_end
+.h8_w8_loop:
+    mova                xm0, [rsp+16*0]
+    mova                xm2, [rsp+16*1]
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+.h8_w8: ; w8/w16/w32
+    mova                xm0, [rsp+16*2]
+    mova                xm4, [rsp+16*3]
+    add                 rsp, 16*4
+    punpcklwd           xm3, xm4, xm0
+    punpckhwd           xm4, xm0
+    punpckldq           xm0, xm3, xm1
+    punpckhdq           xm3, xm1
+    punpckldq           xm1, xm4, xm2
+    punpckhdq           xm4, xm2
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm3
+    movhps [dstq+r2       ], xm3
+    movq   [r6  +strideq*0], xm1
+    movhps [r6  +strideq*1], xm1
+    movq   [r6  +strideq*2], xm4
+    movhps [r6  +r2       ], xm4
+    sub                dstq, 8
+    sub                  r6, 8
+    sub              org_wd, 8
+    jge .h8_w8_loop
+.h8_end:
+    RET
+.h16_no_intra_edge_filter:
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(w+15, 31)
+    jmp .h16_main
+ALIGN function_align
+.h16:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -64, 12
+    lea            maxbased, [wq+15]
+    test             angled, 0x400
+    jnz .h16_no_intra_edge_filter
+    call .filter_strength
+    jz .h16_main ; filter_strength == 0
+    vpbroadcastd        m11, [base+pb_27]
+    vpbroadcastd         m1, [base+pb_1]
+    vbroadcasti128       m6, [base+z_filter_s+12]
+    vinserti128          m2, m6, [base+z_filter_s+4], 0
+    vinserti128          m6, [base+z_filter_s+20], 1
+    movu               xm10, [tlq-18]
+    vinserti128         m10, [tlq-14], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
+    vbroadcasti128       m7, [base+z_filter_s+8]
+    vinserti128          m8, m7, [base+z_filter_s+0], 0
+    vinserti128          m7, [base+z_filter_s+16], 1
+    psubusb             m11, m0
+    por                  m1, m11
+    movu               xm11, [tlq-32]
+    vinserti128         m11, [tlq-28], 1
+    pmaxub               m8, m1
+    pmaxub               m7, m1
+    pshufb               m0, m10, m2
+    shufps               m2, m6, q2121
+    pmaddubsw            m0, m9
+    pshufb               m1, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw            m1, m9
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    movzx               r4d, byte [tlq-31]
+    pshufb               m2, m10, m2
+    pmaddubsw            m2, m9
+    pshufb               m8, m11, m8
+    pmaddubsw            m8, m9
+    paddw                m0, m2
+    paddw                m1, m8
+    sub                 r5d, 3
+    jnz .h16_3tap
+    vpbroadcastd         m9, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq-30]
+    pshufb              m10, m6
+    pmaddubsw           m10, m9
+    pshufb              m11, m7
+    pmaddubsw           m11, m9
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4]
+    shr                 r2d, 3
+    mov            [rsp+31], r2b
+    paddw                m0, m10
+    paddw                m1, m11
+.h16_3tap:
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    sar                 r5d, 1
+    lea                 tlq, [rsp+63]
+    add                 r5d, 33
+    cmp                  wd, 16
+    cmova          maxbased, r5d
+    neg                  r5
+    mov            [tlq+r5], r4b
+    packuswb             m0, m1
+    vpermq               m0, m0, q2031
+    mova           [tlq-31], m0
+.h16_main:
+    movd                xm6, dyd
+    vbroadcasti128       m0, [z_base_inc]
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq
+    vpbroadcastw         m9, xm9
+    psubw                m9, m0
+    paddw               m11, m6, m6
+    psubw               m10, m9, m3 ; 64*8
+    vpblendd             m6, m11, 0xf0
+.h16_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                xm0, [tlq+r4-0]
+    movu                xm1, [tlq+r4-8]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6
+    vinserti128          m0, [tlq+r5-0], 1
+    vinserti128          m1, [tlq+r5-8], 1
+    sub                 rsp, 32
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddsw               m6, m11
+    vpblendvb            m0, m7, m0, m1
+    vpermq               m0, m0, q3120
+    mova              [rsp], m0
+    sub                  wd, 2
+    jz .h16_transpose
+    cmp                 r4d, maxbased
+    jg .h16_loop
+    mova                 m0, m7
+.h16_end_loop:
+    sub                 rsp, 32
+    mova              [rsp], m7
+    sub                  wd, 2
+    jg .h16_end_loop
+.h16_transpose:
+    mova                 m2, [rsp+32*1]
+    sub              org_wd, 8
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovg              dstq, r6
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    lea                  r3, [strideq*5]
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    lea                  r4, [strideq+r2*2] ; stride*7
+    jge .h16_w8
+    add                 rsp, 32*2
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    vextracti128        xm0, m0, 1
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    lea                dstq, [dstq+strideq*8]
+    vextracti128        xm1, m1, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    jmp .h16_end
+.h16_w8_loop:
+    mova                 m0, [rsp+32*0]
+    mova                 m2, [rsp+32*1]
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+.h16_w8:
+    mova                 m2, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    lea                  r6, [dstq+strideq*8]
+    add                 rsp, 32*4
+    punpcklbw            m3, m4, m2
+    punpckhbw            m4, m2
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
+    punpckldq            m4, m2, m0
+    punpckhdq            m2, m0
+    punpckldq            m0, m3, m1
+    punpckhdq            m3, m1
+    movq   [dstq+strideq*0], xm4
+    movhps [dstq+strideq*1], xm4
+    vextracti128        xm4, m4, 1
+    movq   [dstq+strideq*2], xm2
+    movhps [dstq+r2       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+strideq*4], xm0
+    movhps [dstq+r3       ], xm0
+    vextracti128        xm0, m0, 1
+    movq   [dstq+r2*2     ], xm3
+    movhps [dstq+r4       ], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*0], xm4
+    movhps   [r6+strideq*1], xm4
+    movq     [r6+strideq*2], xm2
+    movhps   [r6+r2       ], xm2
+    movq     [r6+strideq*4], xm0
+    movhps   [r6+r3       ], xm0
+    movq     [r6+r2*2     ], xm3
+    movhps   [r6+r4       ], xm3
+    sub                dstq, 8
+    sub              org_wd, 8
+    jge .h16_w8_loop
+.h16_end:
+    RET
+ALIGN function_align
+.h32:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -96, 15
+    lea            maxbased, [wq+31]
+    and            maxbased, 31
+    or             maxbased, 32 ; imin(w+31, 63)
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h32_main
+    vbroadcasti128       m0, [pb_0to15]
+    mov                 r4d, 21
+    mov                 r5d, 3
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    sub                 r4d, wd ; 21-w
+    cmovg               r5d, r4d
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    sub                 r4d, 8 ; 13-w
+    movd                xm1, r5d
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movd                xm2, r4d
+    vpbroadcastb         m1, xm1
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    vpbroadcastb         m2, xm2
+    pmaxsb               m1, m0 ; clip 16x32 and (32|64)x32
+    movu                 m7, [z_filter_s+4]
+    pshufb              m11, m1
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vinserti128          m7, [z_filter_s+16], 0
+    pmaxsb               m2, m0 ; clip 8x32
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m12, m2
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    shufps               m8, m7, q1021
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb              m10, m11, m8
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m8
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m8
+    pmaddubsw           m10, m9
+    shufps               m8, m7, q2121
+    paddw                m1, m10
+    pshufb              m10, m14, m8
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    movzx               r4d, byte [tlq-63]
+    movzx               r2d, byte [tlq-62]
+    paddw                m0, m11
+    paddw                m2, m12
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m7
+    pmaddubsw           m14, m9
+    paddw                m1, m13
+    paddw                m6, m14
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4] ; edge case for 64x32
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    shr                 r2d, 3
+    mov            [rsp+31], r2b
+    lea                 tlq, [rsp+95]
+    mov            [tlq-65], r4b
+    mov                 r4d, 65
+    cmp                  wd, 32
+    cmova          maxbased, r4d
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0
+    mova           [tlq-31], m1
+.h32_main:
+    movd                xm6, dyd
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq
+    vpbroadcastw         m9, xm9
+    psubw                m9, [z_base_inc]
+    mova                m11, m6
+    psubw               m10, m9, m3 ; 64*8
+.h32_loop:
+    mov                  r5, r4
+    sar                  r5, 6
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                xm0, [tlq+r5- 0]
+    vinserti128          m0, [tlq+r5-16], 1
+    movu                xm1, [tlq+r5- 8]
+    vinserti128          m1, [tlq+r5-24], 1
+    sub                 rsp, 32
+    add                  r4, dyq
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddsw               m6, m11
+    vpblendvb            m0, m7, m0, m1
+    mova              [rsp], m0
+    dec                  wd
+    jz .h32_transpose
+    cmp                 r4d, maxbased
+    jg .h32_loop
+.h32_end_loop:
+    sub                 rsp, 32
+    mova              [rsp], m7
+    dec                  wd
+    jg .h32_end_loop
+.h32_transpose:
+    lea                dstq, [dstq+org_wq-8]
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h32_w8_loop:
+    mova                 m7, [rsp+32*0]
+    mova                 m6, [rsp+32*1]
+    mova                 m5, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    mova                 m3, [rsp+32*4]
+    mova                 m2, [rsp+32*5]
+    mova                 m1, [rsp+32*6]
+    mova                 m0, [rsp+32*7]
+    lea                  r6, [dstq+strideq*8]
+    add                 rsp, 32*8
+    punpcklbw            m8, m0, m1
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    movq   [dstq+strideq*0], xm6
+    movhps [dstq+strideq*1], xm6
+    vextracti128        xm6, m6, 1
+    movq   [dstq+strideq*2], xm7
+    movhps [dstq+r2       ], xm7
+    vextracti128        xm7, m7, 1
+    movq   [dstq+strideq*4], xm2
+    movhps [dstq+r3       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+r2*2     ], xm8
+    movhps [dstq+r4       ], xm8
+    vextracti128        xm8, m8, 1
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    vextracti128        xm1, m1, 1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    vextracti128        xm5, m5, 1
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    lea                  r6, [r6+strideq*8]
+    vextracti128        xm0, m0, 1
+    movq     [r6+strideq*0], xm6
+    movhps   [r6+strideq*1], xm6
+    movq     [r6+strideq*2], xm7
+    movhps   [r6+r2       ], xm7
+    movq     [r6+strideq*4], xm2
+    movhps   [r6+r3       ], xm2
+    movq     [r6+r2*2     ], xm8
+    movhps   [r6+r4       ], xm8
+    lea                  r6, [r6+strideq*8]
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    sub                dstq, 8
+    sub              org_wd, 8
+    jg .h32_w8_loop
+    RET
+ALIGN function_align
+.h64:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK        -128, 16
+    lea            maxbased, [wq+63]
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h64_main
+    mov                 r4d, 21
+    vpbroadcastb       xm11, [tlq-127]
+    vpblendd           xm11, [tlq-130], 0x0e ; 120-127
+    sub                 r4d, wd ; 21-w
+    mov                 r5d, 3
+    vinserti128         m11, [tlq-116], 1    ; 104-111
+    movu                 m7, [z_filter_s+4]
+    cmp                  wd, 32
+    cmove               r4d, r5d
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vbroadcasti128       m6, [pb_0to15]
+    movd                xm1, r4d
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    movu               xm12, [tlq-122]       ; 112-119
+    vinserti128         m12, [tlq-108], 1    ;  96-103
+    vpbroadcastb         m1, xm1
+    movu               xm13, [tlq- 98]       ;  88- 95
+    vinserti128         m13, [tlq- 84], 1    ;  72- 79
+    movu               xm14, [tlq- 90]       ;  80- 87
+    vinserti128         m14, [tlq- 76], 1    ;  64- 71
+    vinserti128          m7, [z_filter_s+16], 0
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pmaxsb               m1, m6 ; clip (16|32)x64
+    pshufb              m13, m1
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    shufps              m15, m8, m7, q1021
+    pshufb              m10, m11, m15
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
+    shufps              m10, m8, m7, q2132
+    pshufb              m11, m10
+    pmaddubsw           m11, m9
+    pshufb              m12, m10
+    pmaddubsw           m12, m9
+    pshufb              m13, m10
+    pmaddubsw           m13, m9
+    pshufb              m14, m10
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    lea                 tlq, [rsp+127]
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova          [tlq-127], m0
+    mova          [tlq- 95], m1
+    pshufb               m0, m11, m10
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m10
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m10
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m7
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb               m7, m11, m15
+    pmaddubsw            m7, m9
+    paddw                m0, m7
+    pshufb               m7, m12, m15
+    pmaddubsw            m7, m9
+    paddw                m2, m7
+    pshufb               m7, m13, m15
+    pmaddubsw            m7, m9
+    paddw                m1, m7
+    pshufb               m7, m14, m10
+    pmaddubsw            m7, m9
+    paddw                m6, m7
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m15
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0
+    mova           [tlq-31], m1
+.h64_main:
+    movd                xm6, dyd
+    mov                  r4, tlq
+    sub                 tlq, 24
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd               xm10, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq
+    mova                xm1, [z_base_inc+16]
+    vinserti128          m1, [z_base_inc], 1
+    vpbroadcastw        m10, xm10
+    psllw                m0, m3, 2   ; 64*32
+    psubw               m10, m1
+    mova                m14, m6
+    psubw               m11, m10, m3 ; 64*8
+    psubw               m12, m10, m0
+    psubw               m13, m11, m0
+.h64_loop:
+    mov                  r5, r4
+    sar                  r5, 6
+    movu                 m0, [tlq+r5-0]
+    movu                 m1, [tlq+r5-8]
+    pand                 m2, m4, m6
+    psubw                m9, m5, m2
+    psllw                m2, 8
+    por                  m9, m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m10, m6
+    pcmpgtw              m2, m11, m6
+    packsswb             m1, m2
+    vpblendvb            m2, m7, m0, m1
+    movu                 m0, [tlq+r5-32]
+    movu                 m1, [tlq+r5-40]
+    add                  r4, dyq
+    sub                 rsp, 64
+    mova           [rsp+32], m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pcmpgtw              m9, m12, m6
+    pcmpgtw              m2, m13, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    paddsw               m6, m14
+    packsswb             m9, m2
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m9
+    mova              [rsp], m0
+    dec                  wd
+    jz .h64_transpose
+    cmp                 r4d, maxbased
+    jg .h64_loop
+.h64_end_loop:
+    sub                 rsp, 64
+    mova           [rsp+32], m7
+    mova           [rsp+ 0], m7
+    dec                  wd
+    jg .h64_end_loop
+.h64_transpose:
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    imul                 r5, strideq, -8
+    lea                dstq, [dstq+org_wq-16]
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0:
+    lea                  r6, [rsp+16*3]
+.h64_transpose_loop:
+    mova                xm0, [r6+64*15]
+    vinserti128          m0, [r6+64* 7], 1
+    mova                xm1, [r6+64*14]
+    vinserti128          m1, [r6+64* 6], 1
+    mova                xm2, [r6+64*13]
+    vinserti128          m2, [r6+64* 5], 1
+    mova                xm3, [r6+64*12]
+    vinserti128          m3, [r6+64* 4], 1
+    mova                xm4, [r6+64*11]
+    vinserti128          m4, [r6+64* 3], 1
+    mova                xm5, [r6+64*10]
+    vinserti128          m5, [r6+64* 2], 1
+    mova                xm6, [r6+64* 9]
+    vinserti128          m6, [r6+64* 1], 1
+    mova                xm7, [r6+64* 8]
+    vinserti128          m7, [r6+64* 0], 1
+    sub                  r6, 16
+    punpcklbw            m8, m0, m1
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    vpermq               m6, m6, q3120
+    vpermq               m7, m7, q3120
+    vpermq               m2, m2, q3120
+    vpermq               m8, m8, q3120
+    vpermq               m3, m3, q3120
+    vpermq               m1, m1, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm6
+    vextracti128 [dstq+strideq*1], m6, 1
+    mova         [dstq+strideq*2], xm7
+    vextracti128 [dstq+r2       ], m7, 1
+    mova         [dstq+strideq*4], xm2
+    vextracti128 [dstq+r3       ], m2, 1
+    mova         [dstq+r2*2     ], xm8
+    vextracti128 [dstq+r4       ], m8, 1
+    sub               dstq, r5
+    mova         [dstq+strideq*0], xm3
+    vextracti128 [dstq+strideq*1], m3, 1
+    mova         [dstq+strideq*2], xm1
+    vextracti128 [dstq+r2       ], m1, 1
+    mova         [dstq+strideq*4], xm5
+    vextracti128 [dstq+r3       ], m5, 1
+    mova         [dstq+r2*2     ], xm0
+    vextracti128 [dstq+r4       ], m0, 1
+    sub                dstq, r5
+    cmp                  r6, rsp
+    jae .h64_transpose_loop
+    add                 rsp, 64*16
+    lea                dstq, [dstq+r5*8-16]
+    sub              org_wd, 16
+    jg .h64_transpose_loop0
+.h64_end:
+    RET
+
 %macro FILTER_XMM 4 ; dst, src, tmp, shuf
 %ifnum %4
     pshufb             xm%2, xm%4
 %else
     pshufb             xm%2, %4
 %endif
     pshufd             xm%1, xm%2, q0000 ; p0 p1
     pmaddubsw          xm%1, xm2
@@ -2163,17 +3322,17 @@ ALIGN function_align
     paddw               m%1, m%3
     pshufd              m%3, m%2, q2222
     pmaddubsw           m%3, m4
     paddw               m%1, m%3
     pshufd              m%3, m%2, q3333
     pmaddubsw           m%3, m5
     paddw               m%1, m%3
     psraw               m%1, 4
-    vperm2i128          m%3, m%1, m%1, 0x01
+    vpermq              m%3, m%1, q1032
     packuswb            m%1, m%3
 %endmacro
 
 ; The ipred_filter SIMD processes 4x2 blocks in the following order which
 ; increases parallelism compared to doing things row by row. One redundant
 ; block is calculated for w8 and w16, two for w32.
 ;     w4     w8       w16             w32
 ;     1     1 2     1 2 3 5     1 2 3 5 b c d f
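; (Editor's note, not upstream text: each 4x2 block uses reconstructed
; pixels from its top and left neighbours as filter inputs, so the
; interleaved numbering above keeps several independent blocks in flight
; where a strict row-by-row walk would serialize them.)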
@@ -2244,20 +3403,22 @@ ALIGN function_align
     movq   [dstq+strideq*0], xm6
     movhps [dstq+strideq*1], xm6
     lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
     jg .w8_loop
     RET
 ALIGN function_align
 .w16:
+%if WIN64
     %assign stack_offset stack_offset - stack_size_padded
     %assign xmm_regs_used 15
     %assign stack_size_padded 0x98
     SUB                 rsp, stack_size_padded
+%endif
     sub                  hd, 2
     TAIL_CALL .w16_main, 0
 .w16_main:
 %if WIN64
     movaps       [rsp+0xa8], xmm6
     movaps       [rsp+0xb8], xmm7
     movaps       [rsp+0x28], xmm8
     movaps       [rsp+0x38], xmm9
--- a/third_party/dav1d/src/x86/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c
@@ -34,53 +34,65 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top
 decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
 decl_angular_ipred_fn(dav1d_ipred_h_avx2);
 decl_angular_ipred_fn(dav1d_ipred_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_paeth_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
 decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
 decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
 
 decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
 decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2);
 decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
 decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
 
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2);
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2);
 
 decl_pal_pred_fn(dav1d_pal_pred_avx2);
 
+decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
 
 void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH == 8
-    c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_ssse3;
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_ssse3;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_ssse3;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_ssse3;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_ssse3;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
 #if BITDEPTH == 8 && ARCH_X86_64
     c->intra_pred[DC_PRED]       = dav1d_ipred_dc_avx2;
     c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_avx2;
     c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_avx2;
     c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_avx2;
     c->intra_pred[HOR_PRED]      = dav1d_ipred_h_avx2;
     c->intra_pred[VERT_PRED]     = dav1d_ipred_v_avx2;
     c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_avx2;
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_avx2;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
     c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2;
+    c->intra_pred[Z3_PRED]       = dav1d_ipred_z3_avx2;
     c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;
 
     c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;
     c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_avx2;
     c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_avx2;
     c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
 
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2;
--- a/third_party/dav1d/src/x86/ipred_ssse3.asm
+++ b/third_party/dav1d/src/x86/ipred_ssse3.asm
@@ -24,31 +24,42 @@
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 16
 
+pb_128   : times 8 db 128
+pd_32768 : times 1 dd 32768
+
 %macro JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - 2*4)
     %xdefine %%base mangle(private_prefix %+ _%1_%2)
     %%table:
     %rep %0 - 2
         dd %%base %+ .%3 - (%%table - 2*4)
         %rotate 1
     %endrep
 %endmacro
 
-JMP_TABLE      ipred_h,  ssse3, w4, w8, w16, w32, w64
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+
+JMP_TABLE ipred_h,       ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc,      ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+                                s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
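; Editor's note (annotation, not upstream text): the aliasing above works
; because the ipred_dc table packs three groups of five entries:
;   entries  0- 4  .h4 .. .h64  indexed via tzcnt(height)
;   entries  5- 9  .w4 .. .w64  indexed via tzcnt(width) plus the +20 in
;                               the dispatch code
;   entries 10-14  .s4 .. .s64  the store/splat loops
; ipred_dc_splat_ssse3_table points 10*4 bytes into the table so ipred_v
; and ipred_dc_128 can dispatch straight to the .s* loops, and the s*-10*4
; entry values cancel the shifted base so base + entry still resolves to
; the target label.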
 
 SECTION .text
 
-
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+%macro IPRED_SET   3                                          ; width, stride offset, pshuflw imm8
     pshuflw                      m1, m0, %3                   ; extend 8 byte for 2 pos
     punpcklqdq                   m1, m1
     mova           [dstq +      %2], m1
 %if %1 > 16
     mova           [dstq + 16 + %2], m1
 %endif
 %if %1 > 32
@@ -88,25 +99,374 @@ SECTION .text
     lea                        dstq, [dstq+strideq*4]
     sub                          hd, 4
     jg .w%1
     RET
 %endmacro
 
 INIT_XMM ssse3
 cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
-    lea                          r5, [ipred_h_ssse3_table]
+    LEA                          r5, ipred_h_ssse3_table
     tzcnt                        wd, wm
     movifnidn                    hd, hm
     movsxd                       wq, [r5+wq*4]
     add                          wq, r5
     lea                    stride3q, [strideq*3]
     jmp                          wq
 .w4:
     IPRED_H                       4
 .w8:
     IPRED_H                       8
 .w16:
     IPRED_H                      16
 .w32:
     IPRED_H                      32
 .w64:
     IPRED_H                      64
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
+    LEA                  r5, ipred_dc_splat_ssse3_table
+    tzcnt                wd, wm
+    movu                 m0, [tlq+ 1]
+    movu                 m1, [tlq+17]
+    movu                 m2, [tlq+33]
+    movu                 m3, [tlq+49]
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
+    movifnidn                    hd, hm
+    movifnidn                    wd, wm
+    tzcnt                       r6d, hd
+    lea                         r5d, [wq+hq]
+    movd                         m4, r5d
+    tzcnt                       r5d, r5d
+    movd                         m5, r5d
+    LEA                          r5, ipred_dc_ssse3_table
+    tzcnt                        wd, wd
+    movsxd                       r6, [r5+r6*4]
+    movsxd                       wq, [r5+wq*4+20]
+    pcmpeqd                      m3, m3
+    psrlw                        m4, 1                             ; dc = (width + height) >> 1;
+    add                          r6, r5
+    add                          wq, r5
+    lea                    stride3q, [strideq*3]
+    jmp r6
+.h4:
+    movd                         m0, [tlq-4]
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w4:
+    movd                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    psubw                        m0, m4
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    cmp                          hd, 4
+    jg .w4_mul
+    psrlw                        m0, 3                             ; dc >>= ctz(width + height);
+    jmp .w4_end
+.w4_mul:
+    punpckhqdq                   m1, m0, m0
+    paddw                        m0, m1
+    psrlq                        m1, m0, 32
+    paddw                        m0, m1
+    psrlw                        m0, 2
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 8
+    cmovz                       r6d, r2d
+    movd                         m5, r6d
+    pmulhuw                      m0, m5
+.w4_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+.s4:
+    movd           [dstq+strideq*0], m0
+    movd           [dstq+strideq*1], m0
+    movd           [dstq+strideq*2], m0
+    movd           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s4
+    RET
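; Editor's sketch (annotation, not dav1d code): the magic constants used by
; the .w*_mul paths implement division by 3 and 5 via multiply-high, since
; 0x5556 = ceil(2^16/3), 0x3334 = ceil(2^16/5), and pmulhuw computes
; (x*c) >> 16, which is exact for the small sums that remain after the
; preceding shift. E.g. for w+h = 12:
;     psrlw    m0, 2         ; sum / 4 (the power-of-two factor of 12)
;     mov     r6d, 0x5556    ; ceil(2^16/3)
;     movd     m5, r6d
;     pmulhuw  m0, m5        ; word 0 = (sum/4 * 0x5556) >> 16 = sum/12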
+ALIGN function_align
+.h8:
+    movq                         m0, [tlq-8]
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w8:
+    movq                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    paddw                        m0, m1
+    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 8
+    je .w8_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    cmp                          hd, 32
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w8_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+.s8:
+    movq           [dstq+strideq*0], m0
+    movq           [dstq+strideq*1], m0
+    movq           [dstq+strideq*2], m0
+    movq           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s8
+    RET
+ALIGN function_align
+.h16:
+    mova                         m0, [tlq-16]
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w16:
+    movu                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 16
+    je .w16_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 8|32
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w16_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+.s16:
+    mova           [dstq+strideq*0], m0
+    mova           [dstq+strideq*1], m0
+    mova           [dstq+strideq*2], m0
+    mova           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s16
+    RET
+ALIGN function_align
+.h32:
+    mova                         m0, [tlq-32]
+    pmaddubsw                    m0, m3
+    mova                         m2, [tlq-16]
+    pmaddubsw                    m2, m3
+    paddw                        m0, m2
+    jmp wq
+.w32:
+    movu                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    movu                         m2, [tlq+17]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    paddw                        m0, m1
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 32
+    je .w32_end
+    lea                         r2d, [hq*2]
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 64|16
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w32_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+    mova                         m1, m0
+.s32:
+    mova                     [dstq], m0
+    mova                  [dstq+16], m1
+    mova             [dstq+strideq], m0
+    mova          [dstq+strideq+16], m1
+    mova           [dstq+strideq*2], m0
+    mova        [dstq+strideq*2+16], m1
+    mova            [dstq+stride3q], m0
+    mova         [dstq+stride3q+16], m1
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s32
+    RET
+ALIGN function_align
+.h64:
+    mova                         m0, [tlq-64]
+    mova                         m1, [tlq-48]
+    pmaddubsw                    m0, m3
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    mova                         m1, [tlq-32]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    mova                         m1, [tlq-16]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    jmp wq
+.w64:
+    movu                         m1, [tlq+ 1]
+    movu                         m2, [tlq+17]
+    pmaddubsw                    m1, m3
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    movu                         m2, [tlq+33]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    movu                         m2, [tlq+49]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    paddw                        m0, m1
+    psubw                        m4, m0
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4
+    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3
+    psrlw                        m0, m5
+    cmp                          hd, 64
+    je .w64_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 32
+    cmovz                       r6d, r2d
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w64_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1
+    mova                         m1, m0
+    mova                         m2, m0
+    mova                         m3, m0
+.s64:
+    mova                     [dstq], m0
+    mova                  [dstq+16], m1
+    mova                  [dstq+32], m2
+    mova                  [dstq+48], m3
+    mova             [dstq+strideq], m0
+    mova          [dstq+strideq+16], m1
+    mova          [dstq+strideq+32], m2
+    mova          [dstq+strideq+48], m3
+    lea                        dstq, [dstq+strideq*2]
+    sub                          hd, 2
+    jg .s64
+    RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3
+    LEA                  r5, ipred_dc_left_ssse3_table
+    mov                  hd, hm                ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+    movd                 m2, r6d
+    psrld                m3, m2
+    movsxd               r6, [r5+r6*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+.h64:
+    movu                 m1, [tlq+48]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    movu                 m1, [tlq+32]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h32:
+    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h16:
+    pshufd               m1, m0, q3232                          ; psrldq              m1, m0, 8
+    paddw                m0, m1
+.h8:
+    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
+    paddw                m0, m1
+.h4:
+    pmaddwd              m0, m2
+    pmulhrsw             m0, m3
+    lea            stride3q, [strideq*3]
+    pxor                 m1, m1
+    pshufb               m0, m1
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    jmp                  wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3
+    LEA                  r5, ipred_dc_splat_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    movddup              m0, [r5-ipred_dc_splat_ssse3_table+pb_128]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    add                  wq, r5
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+
+;---------------------------------------------------------------------------------------
+;void dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h
+    LEA                  r5, ipred_dc_left_ssse3_table
+    tzcnt                wd, wm
+    inc                 tlq
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768]
+    movd                 m2, wd
+    psrld                m3, m2
+    movsxd               r6, [r5+wq*4]
+    pcmpeqd              m2, m2
+    pmaddubsw            m0, m2
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5
+    jmp                  r6
+
--- a/third_party/dav1d/src/x86/itx.asm
+++ b/third_party/dav1d/src/x86/itx.asm
@@ -108,16 +108,25 @@ pw_2440x8:  COEF_X8  2440
 pw_m601x8:  COEF_X8  -601
 pw_4052x8:  COEF_X8  4052
 
 idct64_mul: COEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
             COEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
             COEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
             COEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301
 
+pw_201_4091x8:   dw   201*8, 4091*8
+pw_m601_4052x8:  dw  -601*8, 4052*8
+pw_995_3973x8:   dw   995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8:  dw  1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8:  dw  2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
 %define o_idct64_offset idct64_mul - (o_base) - 8
 
 SECTION .text
 
 ; Code size reduction trickery: Instead of using rip-relative loads with
 ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
 ; single rip-relative lea and then address things relative from that with
 ; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
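; Editor's sketch (annotation; rX stands for whichever register holds the
; base): each function does one rip-relative
;     lea            rX, [o_base]
; and every constant load then goes through the o() wrapper, i.e. roughly
;     vpbroadcastd   m0, [rX-o_base+pw_2896x8]   ; 1-byte displacement
; instead of a rip-relative [pw_2896x8] load carrying a mandatory 4-byte
; displacement at every use site.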
@@ -210,92 +219,86 @@ SECTION .text
 %endif
     paddd               m%3, m%5
     paddd               m%2, m%5
     psrad               m%3, 12
     psrad               m%2, 12
     packssdw            m%2, m%3
 %endmacro
 
-%macro ITX_MULHRSW_SHL3 4 ; dst/src, tmp, coef[1-2]
-    vpbroadcastd        m%2, [pw_%3_%4]
-    psllw               m%2, 3
-    pmulhrsw            m%1, m%2
-%endmacro
-
 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
     ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
     vpbroadcastd        m%6, [o(pw_2896x8)]
     paddw               m%5, m%1, m%3
     psubw               m%1, m%3
     pmulhrsw            m%1, m%6 ; t1
     pmulhrsw            m%5, m%6 ; t0
-    psubw               m%3, m%1, m%2
-    paddw               m%2, m%1
-    paddw               m%1, m%5, m%4
-    psubw               m%4, m%5, m%4
+    psubsw              m%3, m%1, m%2
+    paddsw              m%2, m%1
+    paddsw              m%1, m%5, m%4
+    psubsw              m%4, m%5, m%4
 %endmacro
 
 %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
     ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
     ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
     ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
-    paddw               m%9, m%2, m%6  ; t4
-    psubw               m%2, m%6       ; t5a
-    paddw              m%10, m%8, m%4  ; t7
-    psubw               m%8, m%4       ; t6a
+    paddsw              m%9, m%2, m%6  ; t4
+    psubsw              m%2, m%6       ; t5a
+    paddsw             m%10, m%8, m%4  ; t7
+    psubsw              m%8, m%4       ; t6a
     vpbroadcastd        m%4, [o(pw_2896x8)]
     psubw               m%6, m%1, m%5
     paddw               m%1, m%5
     psubw               m%5, m%8, m%2
     paddw               m%8, m%2
     pmulhrsw            m%1, m%4       ; t0
     pmulhrsw            m%6, m%4       ; t1
     pmulhrsw            m%8, m%4       ; t6
     pmulhrsw            m%5, m%4       ; t5
-    psubw               m%4, m%1, m%7  ; dct4 out3
-    paddw               m%1, m%7       ; dct4 out0
-    paddw               m%7, m%6, m%3  ; dct4 out1
-    psubw               m%6, m%3       ; dct4 out2
-    paddw               m%2, m%7, m%8  ; out1
-    psubw               m%7, m%8       ; out6
-    psubw               m%8, m%1, m%10 ; out7
-    paddw               m%1, m%10      ; out0
-    paddw               m%3, m%6, m%5  ; out2
-    psubw               m%6, m%5       ; out5
-    psubw               m%5, m%4, m%9  ; out4
-    paddw               m%4, m%9       ; out3
+    psubsw              m%4, m%1, m%7  ; dct4 out3
+    paddsw              m%1, m%7       ; dct4 out0
+    paddsw              m%7, m%6, m%3  ; dct4 out1
+    psubsw              m%6, m%3       ; dct4 out2
+    paddsw              m%2, m%7, m%8  ; out1
+    psubsw              m%7, m%8       ; out6
+    psubsw              m%8, m%1, m%10 ; out7
+    paddsw              m%1, m%10      ; out0
+    paddsw              m%3, m%6, m%5  ; out2
+    psubsw              m%6, m%5       ; out5
+    psubsw              m%5, m%4, m%9  ; out4
+    paddsw              m%4, m%9       ; out3
 %endmacro
 
 ; in1 = %1, in3  = %2, in5  = %3, in7  = %4
 ; in9 = %5, in11 = %6, in13 = %7, in15 = %8
 %macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
     ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
     ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
     ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
     ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
-    psubw               m%9, m%2, m%6 ; t13
-    paddw               m%6, m%2      ; t12
-    psubw               m%2, m%8, m%4 ; t14
-    paddw               m%8, m%4      ; t15
-    psubw               m%4, m%7, m%3 ; t10
-    paddw               m%3, m%7      ; t11
-    psubw               m%7, m%1, m%5 ; t9
-    paddw               m%1, m%5      ; t8
+    psubsw              m%9, m%2, m%6 ; t13
+    paddsw              m%6, m%2      ; t12
+    psubsw              m%2, m%8, m%4 ; t14
+    paddsw              m%8, m%4      ; t15
+    psubsw              m%4, m%7, m%3 ; t10
+    paddsw              m%3, m%7      ; t11
+    psubsw              m%7, m%1, m%5 ; t9
+    paddsw              m%1, m%5      ; t8
     ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
     ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
     vpbroadcastd       m%10, [o(pw_2896x8)]
-    psubw               m%5, m%2, m%9 ; t10
-    paddw               m%2, m%9      ; t9
-    psubw               m%9, m%1, m%3 ; t11a
-    paddw               m%1, m%3      ; t8a
-    psubw               m%3, m%7, m%4 ; t13
-    paddw               m%7, m%4      ; t14
-    psubw               m%4, m%8, m%6 ; t12a
-    paddw               m%8, m%6      ; t15a
+    psubsw              m%5, m%2, m%9 ; t10
+    paddsw              m%2, m%9      ; t9
+    psubsw              m%9, m%1, m%3 ; t11a
+    paddsw              m%1, m%3      ; t8a
+    psubsw              m%3, m%7, m%4 ; t13
+    paddsw              m%7, m%4      ; t14
+    psubsw              m%4, m%8, m%6 ; t12a
+    paddsw              m%8, m%6      ; t15a
     paddw               m%6, m%3, m%5 ; t13a
     psubw               m%3, m%5      ; t10a
     paddw               m%5, m%4, m%9 ; t12
     psubw               m%4, m%9      ; t11
     REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
 %endmacro
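; Editor's note (not part of the patch): the psubw/paddw -> psubsw/paddsw
; substitutions in these macros switch the 16-bit butterflies from wrapping
; to saturating arithmetic, e.g. paddw 0x7fff + 0x0001 wraps to 0x8000
; (-32768) while paddsw clamps it to 0x7fff, so overflowing intermediate
; sums pin at INT16_MAX/INT16_MIN instead of changing sign.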
 
 %macro WRAP_XMM 1+
@@ -450,18 +453,18 @@ ALIGN function_align
     punpcklqdq           m0, m3
     ITX_MUL2X_PACK        2, 1, 3, 4, 1567, 3784
 %if %0 == 1
     pmulhrsw             m0, m%1
 %else
     vpbroadcastd         m4, [o(pw_2896x8)]
     pmulhrsw             m0, m4     ; t0 t1
 %endif
-    psubw                m1, m0, m2 ; out3 out2
-    paddw                m0, m2     ; out0 out1
+    psubsw               m1, m0, m2 ; out3 out2
+    paddsw               m0, m2     ; out0 out1
 %endmacro
 
 %macro IADST4_1D_PACKED 0
     punpcklwd            m2, m1, m0
     punpckhwd            m3, m1, m0
     psubw                m0, m1
     punpckhqdq           m1, m1
     paddw                m1, m0 ; in0 - in2 + in3
@@ -685,57 +688,57 @@ cglobal iidentity_4x4_internal, 0, 5, 6,
     punpcklwd            m3, m1     ; in2 in6
     psubw                m1, m0, m2
     paddw                m0, m2
     punpcklqdq           m0, m1     ; in0+in4 in0-in4
     ITX_MUL2X_PACK        5, 1, 2, 6,  799, 4017, 1 ; t4a t7a
     ITX_MUL2X_PACK        4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
     ITX_MUL2X_PACK        3, 1, 2, 6, 1567, 3784    ; t3 t2
     vpbroadcastd         m6, [o(pw_2896x8)]
-    psubw                m2, m5, m4 ; t4 t7
-    paddw                m5, m4     ; t5a t6a
+    psubsw               m2, m5, m4 ; t4 t7
+    paddsw               m5, m4     ; t5a t6a
     pshufd               m4, m2, q1032
     psubw                m1, m2, m4
     paddw                m4, m2
     vpblendd             m4, m4, m1, 0xcc
     pmulhrsw             m0, m6     ; t0 t1
     pmulhrsw             m4, m6     ; t6 t5
-    psubw                m1, m0, m3 ; tmp3 tmp2
-    paddw                m0, m3     ; tmp0 tmp1
+    psubsw               m1, m0, m3 ; tmp3 tmp2
+    paddsw               m0, m3     ; tmp0 tmp1
     shufps               m2, m5, m4, q1032 ; t7 t6
     vpblendd             m5, m5, m4, 0xcc  ; t4 t5
-    psubw                m3, m0, m2 ; out7 out6
-    paddw                m0, m2     ; out0 out1
-    psubw                m2, m1, m5 ; out4 out5
-    paddw                m1, m5     ; out3 out2
+    psubsw               m3, m0, m2 ; out7 out6
+    paddsw               m0, m2     ; out0 out1
+    psubsw               m2, m1, m5 ; out4 out5
+    paddsw               m1, m5     ; out3 out2
 %endmacro
 
 %macro IADST8_1D_PACKED 0
     vpbroadcastd         m6, [o(pd_2048)]
     punpckhwd            m0, m4, m3 ; 0 7
     punpckhwd            m1, m5, m2 ; 2 5
     punpcklwd            m2, m5     ; 4 3
     punpcklwd            m3, m4     ; 6 1
     ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
     ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
     ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
     ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
-    psubw                m4, m0, m2 ; t4 t5
-    paddw                m0, m2     ; t0 t1
-    psubw                m5, m1, m3 ; t6 t7
-    paddw                m1, m3     ; t2 t3
+    psubsw               m4, m0, m2 ; t4 t5
+    paddsw               m0, m2     ; t0 t1
+    psubsw               m5, m1, m3 ; t6 t7
+    paddsw               m1, m3     ; t2 t3
     shufps               m2, m5, m4, q1032
     punpckhwd            m4, m2
     punpcklwd            m5, m2
     ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
     ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
-    psubw                m2, m0, m1        ; t2 t3
-    paddw                m0, m1            ; out0 -out7
-    psubw                m1, m4, m5        ; t7 t6
-    paddw                m4, m5            ; out6 -out1
+    psubsw               m2, m0, m1        ; t2 t3
+    paddsw               m0, m1            ; out0 -out7
+    psubsw               m1, m4, m5        ; t7 t6
+    paddsw               m4, m5            ; out6 -out1
     vpbroadcastd         m5, [o(pw_2896x8)]
     vpblendd             m3, m0, m4, 0x33  ; out6 -out7
     vpblendd             m0, m0, m4, 0xcc  ; out0 -out1
     shufps               m4, m2, m1, q1032 ; t3 t7
     vpblendd             m1, m2, m1, 0xcc  ; t2 t6
     psubw                m2, m1, m4        ; t2-t3 t6-t7
     paddw                m1, m4            ; t2+t3 t6+t7
     pmulhrsw             m2, m5            ; out4 -out5
@@ -973,37 +976,37 @@ cglobal iidentity_4x8_internal, 0, 5, 7,
     punpcklwd            m6, m2     ; dct4  in3  in1
     ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
     ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
     ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
     ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
     ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 1 ; t4a  t7a
     ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 1 ; t5a  t6a
     ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
-    psubw                m2, m8, m0 ; t9  t14
-    paddw                m8, m0     ; t8  t15
-    psubw                m0, m1, m5 ; t10 t13
-    paddw                m1, m5     ; t11 t12
+    psubsw               m2, m8, m0 ; t9  t14
+    paddsw               m8, m0     ; t8  t15
+    psubsw               m0, m1, m5 ; t10 t13
+    paddsw               m1, m5     ; t11 t12
 %if mmsize > 16
     vbroadcasti128       m5, [o(deint_shuf)]
 %else
     mova                 m5, [o(deint_shuf)]
 %endif
     pshufb               m8, m5
     pshufb               m1, m5
     vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
     ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 4   ; t9a  t14a
     vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
     ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 4   ; t10a t13a
-    psubw                m5, m7, m3 ; t5a t6a
-    paddw                m7, m3     ; t4  t7
-    psubw                m4, m8, m1 ; t11a t12a
-    paddw                m8, m1     ; t8a  t15a
-    paddw                m1, m2, m0 ; t9   t14
-    psubw                m2, m0     ; t10  t13
+    psubsw               m5, m7, m3 ; t5a t6a
+    paddsw               m7, m3     ; t4  t7
+    psubsw               m4, m8, m1 ; t11a t12a
+    paddsw               m8, m1     ; t8a  t15a
+    paddsw               m1, m2, m0 ; t9   t14
+    psubsw               m2, m0     ; t10  t13
     punpckhqdq           m0, m8, m1 ; t15a t14
     punpcklqdq           m8, m1     ; t8a  t9
     pshufd               m3, m5, q1032
     psubw                m1, m5, m3
     paddw                m3, m5
     vpblendd             m3, m3, m1, 0xcc ; t6 t5
     vpbroadcastd         m1, [o(pw_2896x8)]
     punpckhqdq           m5, m4, m2 ; t12a t13
@@ -1011,30 +1014,30 @@ cglobal iidentity_4x8_internal, 0, 5, 7,
     psubw                m4, m5, m2
     paddw                m5, m2
     pmulhrsw             m9, m1     ; t0   t1
     pmulhrsw             m3, m1     ; t6   t5
     pmulhrsw             m4, m1     ; t11  t10a
     pmulhrsw             m5, m1     ; t12  t13a
     shufps               m2, m7, m3, q1032 ; t7 t6
     vpblendd             m7, m7, m3, 0xcc  ; t4 t5
-    psubw                m1, m9, m6 ; dct4 out3 out2
-    paddw                m9, m6     ; dct4 out0 out1
-    psubw                m3, m9, m2 ; dct8 out7 out6
-    paddw                m9, m2     ; dct8 out0 out1
-    psubw                m2, m1, m7 ; dct8 out4 out5
-    paddw                m1, m7     ; dct8 out3 out2
-    psubw                m7, m9, m0 ; out15 out14
-    paddw                m0, m9     ; out0  out1
-    psubw                m6, m1, m5 ; out12 out13
-    paddw                m1, m5     ; out3  out2
-    psubw                m5, m2, m4 ; out11 out10
-    paddw                m2, m4     ; out4  out5
-    psubw                m4, m3, m8 ; out8  out9
-    paddw                m3, m8     ; out7  out6
+    psubsw               m1, m9, m6 ; dct4 out3 out2
+    paddsw               m9, m6     ; dct4 out0 out1
+    psubsw               m3, m9, m2 ; dct8 out7 out6
+    paddsw               m9, m2     ; dct8 out0 out1
+    psubsw               m2, m1, m7 ; dct8 out4 out5
+    paddsw               m1, m7     ; dct8 out3 out2
+    psubsw               m7, m9, m0 ; out15 out14
+    paddsw               m0, m9     ; out0  out1
+    psubsw               m6, m1, m5 ; out12 out13
+    paddsw               m1, m5     ; out3  out2
+    psubsw               m5, m2, m4 ; out11 out10
+    paddsw               m2, m4     ; out4  out5
+    psubsw               m4, m3, m8 ; out8  out9
+    paddsw               m3, m8     ; out7  out6
 %endmacro
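
The change running through all of these hunks is mechanical: every butterfly add/subtract on packed 16-bit words moves from the wrapping forms paddw/psubw to the saturating forms paddsw/psubsw, so intermediate transform values clamp at the int16 limits instead of wrapping on overflow. A minimal C model of the per-lane difference (an illustrative sketch, not part of the patch):

    #include <stdint.h>

    /* one lane of paddw: wraps modulo 2^16 */
    static int16_t add_wrap(int16_t a, int16_t b)
    {
        return (int16_t)((uint16_t)a + (uint16_t)b);
    }

    /* one lane of paddsw: clamps to [INT16_MIN, INT16_MAX] */
    static int16_t add_sat(int16_t a, int16_t b)
    {
        int32_t s = (int32_t)a + b;
        return (int16_t)(s > INT16_MAX ? INT16_MAX :
                         s < INT16_MIN ? INT16_MIN : s);
    }

    /* add_wrap(30000, 10000) == -25536; add_sat(30000, 10000) == 32767 */

Note that not every paddw/psubw is converted: the constant-negation idiom (a psubw from a zeroed register to flip a coefficient pair) and the sumsubs that feed straight into a pmulhrsw by pw_2896x8 keep the wrapping forms.
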
 
 INV_TXFM_4X16_FN dct, dct,      0
 INV_TXFM_4X16_FN dct, identity, 15
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst
 
 cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
@@ -1145,46 +1148,46 @@ ALIGN function_align
     punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
     punpcklwd            m0, m3     ; in0  in15 in2  in13
     punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
     punpcklwd            m1, m2     ; in4  in11 in6  in9
     ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
     ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
     ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
     ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
-    psubw                m2, m0, m3 ; t9a  t8a  t11a t10a
-    paddw                m0, m3     ; t1a  t0a  t3a  t2a
-    psubw                m3, m1, m4 ; t13a t12a t15a t14a
-    paddw                m1, m4     ; t5a  t4a  t7a  t6a
+    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
+    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
+    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
+    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
     ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
     psubw                m6, m7, m5
     ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
     vpbroadcastd         m6, [o(pw_m3784_1567)]
     vpbroadcastd         m5, [o(pw_1567_3784)]
-    psubw                m4, m0, m1 ; t5   t4   t7   t6
-    paddw                m0, m1     ; t1   t0   t3   t2
-    psubw                m1, m2, m3 ; t13a t12a t15a t14a
-    paddw                m2, m3     ; t9a  t8a  t11a t10a
-    psubw                m3, m7, m6
+    psubsw               m4, m0, m1 ; t5   t4   t7   t6
+    paddsw               m0, m1     ; t1   t0   t3   t2
+    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
+    paddsw               m2, m3     ; t9a  t8a  t11a t10a
+    psubw                m3, m7, m6 ; pw_3784_m1567
     vpblendd             m6, m6, m3, 0xf0
     ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
     ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
     vbroadcasti128       m5, [o(deint_shuf)]
     pshufb               m0, m5
     pshufb               m2, m5
     vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
     vinserti128          m0, m0, xm2, 1    ; t1   t0   t9a  t8a
     vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
     vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
     vpbroadcastd         m5, [o(pw_2896x8)]
     pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
-    psubw                m1, m0, m3        ; t3a t2a t11 t10
-    paddw                m0, m3     ; -out15  out0   out14 -out1
-    paddw                m3, m4, m2 ; -out3   out12  out2  -out13
-    psubw                m4, m2            ; t6 t7 t14a t15a
+    psubsw               m1, m0, m3        ; t3a t2a t11 t10
+    paddsw               m0, m3     ; -out15  out0   out14 -out1
+    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
+    psubsw               m4, m2            ; t6 t7 t14a t15a
     shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
     vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
     paddw                m1, m2, m4
     psubw                m2, m4
     pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
     pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
     ret
 
@@ -1894,63 +1897,63 @@ ALIGN function_align
     ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
     ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
     ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
     ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
     ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
     ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
     ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
     ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
-    psubw                m4, m0, m5 ; t9a  t8a
-    paddw                m0, m5     ; t1a  t0a
-    psubw                m5, m1, m6 ; t11a t10a
-    paddw                m1, m6     ; t3a  t2a
-    psubw                m6, m2, m7 ; t13a t12a
-    paddw                m2, m7     ; t5a  t4a
-    psubw                m7, m3, m8 ; t15a t14a
-    paddw                m3, m8     ; t7a  t6a
+    psubsw               m4, m0, m5 ; t9a  t8a
+    paddsw               m0, m5     ; t1a  t0a
+    psubsw               m5, m1, m6 ; t11a t10a
+    paddsw               m1, m6     ; t3a  t2a
+    psubsw               m6, m2, m7 ; t13a t12a
+    paddsw               m2, m7     ; t5a  t4a
+    psubsw               m7, m3, m8 ; t15a t14a
+    paddsw               m3, m8     ; t7a  t6a
     vpbroadcastd        m11, [o(pw_m4017_799)]
     vpbroadcastd        m12, [o(pw_799_4017)]
     pxor                 m9, m9
     ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
-    psubw                m8, m9, m11
+    psubw                m8, m9, m11 ; pw_4017_m799
     ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
     vpbroadcastd        m11, [o(pw_m2276_3406)]
     vpbroadcastd        m12, [o(pw_3406_2276)]
     ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
-    psubw                m8, m9, m11
+    psubw                m8, m9, m11 ; pw_2276_m3406
     ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
-    psubw                m8, m1, m3 ; t7   t6
-    paddw                m1, m3     ; t3   t2
-    psubw                m3, m0, m2 ; t5   t4
-    paddw                m0, m2     ; t1   t0
-    psubw                m2, m5, m7 ; t14a t15a
-    paddw                m7, m5     ; t10a t11a
-    psubw                m5, m4, m6 ; t12a t13a
-    paddw                m4, m6     ; t8a  t9a
+    psubsw               m8, m1, m3 ; t7   t6
+    paddsw               m1, m3     ; t3   t2
+    psubsw               m3, m0, m2 ; t5   t4
+    paddsw               m0, m2     ; t1   t0
+    psubsw               m2, m5, m7 ; t14a t15a
+    paddsw               m7, m5     ; t10a t11a
+    psubsw               m5, m4, m6 ; t12a t13a
+    paddsw               m4, m6     ; t8a  t9a
     vpbroadcastd        m11, [o(pw_m3784_1567)]
     vpbroadcastd        m12, [o(pw_1567_3784)]
     ITX_MUL2X_PACK        3, 6, _, 10, 11, 12, 4 ; t4a t5a
-    psubw                m6, m9, m11
+    psubw                m6, m9, m11 ; pw_3784_m1567
     ITX_MUL2X_PACK        8, 12, _, 10, 12, 6, 4 ; t6a t7a
     vpbroadcastd        m11, [o(pw_m1567_3784)]
     vpbroadcastd        m12, [o(pw_3784_1567)]
     ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 4 ; t15 t14
-    psubw                m6, m9, m11
+    psubw                m6, m9, m11 ; pw_1567_m3784
     ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 4 ; t13 t12
     vbroadcasti128      m11, [o(deint_shuf)]
     vpbroadcastd        m12, [o(pw_2896x8)]
-    psubw                m6, m0, m1        ;  t3a    t2a
-    paddw                m0, m1            ; -out15  out0
-    paddw                m1, m2, m5        ; -out13  out2
-    psubw                m5, m2            ;  t15a   t14a
-    paddw                m2, m4, m7        ; -out1  out14
-    psubw                m4, m7            ;  t10    t11
-    psubw                m7, m3, m8        ;  t6     t7
-    paddw                m8, m3            ; -out3   out12
+    psubsw               m6, m0, m1        ;  t3a    t2a
+    paddsw               m0, m1            ; -out15  out0
+    paddsw               m1, m2, m5        ; -out13  out2
+    psubsw               m5, m2            ;  t15a   t14a
+    paddsw               m2, m4, m7        ; -out1  out14
+    psubsw               m4, m7            ;  t10    t11
+    psubsw               m7, m3, m8        ;  t6     t7
+    paddsw               m8, m3            ; -out3   out12
     REPX    {pshufb x, m11}, m6, m4, m0, m2
     vpblendd             m3, m6, m4, 0xcc  ;  t3a    t11
     shufps               m6, m6, m4, q1032 ;  t2a    t10
     vpblendd             m4, m5, m7, 0xcc  ;  t15a   t7
     shufps               m5, m5, m7, q1032 ;  t14a   t6
     shufps               m7, m2, m0, q1032 ;  out14 -out15
     vpblendd             m0, m0, m2, 0x33  ; -out1   out0
     paddw                m2, m5, m4        ; -out5   out4
@@ -2572,35 +2575,35 @@ ALIGN function_align
     jmp m(idct_16x8_internal).end2
 ALIGN function_align
 .main:
     vpbroadcastd        m10, [o(pd_2048)]
     ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
     ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
     ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
     ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
-    psubw                m8, m2, m6 ; t6
-    paddw                m2, m6     ; t2
-    psubw                m6, m0, m4 ; t4
-    paddw                m0, m4     ; t0
-    psubw                m4, m5, m1 ; t7
-    paddw                m5, m1     ; t3
-    psubw                m1, m7, m3 ; t5
-    paddw                m7, m3     ; t1
+    psubsw               m8, m2, m6 ; t6
+    paddsw               m2, m6     ; t2
+    psubsw               m6, m0, m4 ; t4
+    paddsw               m0, m4     ; t0
+    psubsw               m4, m5, m1 ; t7
+    paddsw               m5, m1     ; t3
+    psubsw               m1, m7, m3 ; t5
+    paddsw               m7, m3     ; t1
     ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
     ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
-    psubw                m9, m6, m8 ;  t7
-    paddw                m6, m8     ;  out6
+    psubsw               m9, m6, m8 ;  t7
+    paddsw               m6, m8     ;  out6
     vpbroadcastd         m8, [o(pw_2896x8)]
-    psubw                m3, m7, m5 ;  t3
-    paddw                m7, m5     ; -out7
-    psubw                m5, m0, m2 ;  t2
-    paddw                m0, m2     ;  out0
-    psubw                m2, m1, m4 ;  t6
-    paddw                m1, m4     ; -out1
+    psubsw               m3, m7, m5 ;  t3
+    paddsw               m7, m5     ; -out7
+    psubsw               m5, m0, m2 ;  t2
+    paddsw               m0, m2     ;  out0
+    psubsw               m2, m1, m4 ;  t6
+    paddsw               m1, m4     ; -out1
     psubw                m4, m5, m3
     paddw                m3, m5
     psubw                m5, m2, m9
     paddw                m2, m9
     pmulhrsw             m2, m8     ;  out2
     pmulhrsw             m3, m8     ; -out3
     pmulhrsw             m4, m8     ;  out4
     pmulhrsw             m5, m8     ; -out5
@@ -2951,35 +2954,35 @@ ALIGN function_align
     mova                 m1, [rsp+gprsize+32*2] ; in9
     mova [rsp+gprsize+32*2], m14 ; tmp7
     mova                 m9, [rsp+gprsize+32*1] ; in1
     mova [rsp+gprsize+32*1], m10 ; tmp5
     mova                m14, [rsp+gprsize+32*0] ; in15
     mova [rsp+gprsize+32*0], m6  ; tmp3
     IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
     mova                 m6, [rsp+gprsize+32*1] ; tmp5
-    psubw               m15, m0, m14  ; out15
-    paddw                m0, m14      ; out0
-    psubw               m14, m2, m13  ; out14
-    paddw                m2, m13      ; out1
+    psubsw              m15, m0, m14  ; out15
+    paddsw               m0, m14      ; out0
+    psubsw              m14, m2, m13  ; out14
+    paddsw               m2, m13      ; out1
     mova [rsp+gprsize+32*1], m2
-    psubw               m13, m4, m11  ; out13
-    paddw                m2, m4, m11  ; out2
-    psubw               m11, m8, m7   ; out11
-    paddw                m4, m8, m7   ; out4
+    psubsw              m13, m4, m11  ; out13
+    paddsw               m2, m4, m11  ; out2
+    psubsw              m11, m8, m7   ; out11
+    paddsw               m4, m8, m7   ; out4
     mova                 m7, [rsp+gprsize+32*2] ; tmp7
-    psubw               m10, m6, m5   ; out10
-    paddw                m5, m6       ; out5
-    psubw                m8, m7, m9   ; out8
-    paddw                m7, m9       ; out7
-    psubw                m9, m12, m3  ; out9
-    paddw                m6, m12, m3  ; out6
+    psubsw              m10, m6, m5   ; out10
+    paddsw               m5, m6       ; out5
+    psubsw               m8, m7, m9   ; out8
+    paddsw               m7, m9       ; out7
+    psubsw               m9, m12, m3  ; out9
+    paddsw               m6, m12, m3  ; out6
     mova                 m3, [rsp+gprsize+32*0] ; tmp3
-    psubw               m12, m3, m1   ; out12
-    paddw                m3, m1       ; out3
+    psubsw              m12, m3, m1   ; out12
+    paddsw               m3, m1       ; out3
     ret
 
 INV_TXFM_16X16_FN adst, dct
 INV_TXFM_16X16_FN adst, adst
 INV_TXFM_16X16_FN adst, flipadst
 
 cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
@@ -3004,86 +3007,86 @@ ALIGN function_align
 .main:
     vpbroadcastd        m15, [o(pd_2048)]
     mova [rsp+gprsize+32*1], m0
     mova [rsp+gprsize+32*2], m4
     ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
     ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
     ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
     ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
-    psubw                m0, m2, m10  ; t10a
-    paddw                m2, m10      ; t2a
-    psubw               m10, m13, m5  ; t11a
-    paddw               m13, m5       ; t3a
-    psubw                m5, m6, m14  ; t14a
-    paddw                m6, m14      ; t6a
-    psubw               m14, m9, m1   ; t15a
-    paddw                m9, m1       ; t7a
+    psubsw               m0, m2, m10  ; t10a
+    paddsw               m2, m10      ; t2a
+    psubsw              m10, m13, m5  ; t11a
+    paddsw              m13, m5       ; t3a
+    psubsw               m5, m6, m14  ; t14a
+    paddsw               m6, m14      ; t6a
+    psubsw              m14, m9, m1   ; t15a
+    paddsw               m9, m1       ; t7a
     ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
     ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
-    psubw                m1, m10, m14 ; t14a
-    paddw               m10, m14      ; t10a
-    psubw               m14, m0, m5   ; t15a
-    paddw                m0, m5       ; t11a
-    psubw                m5, m2, m6   ; t6
-    paddw                m2, m6       ; t2
-    psubw                m6, m13, m9  ; t7
-    paddw               m13, m9       ; t3
+    psubsw               m1, m10, m14 ; t14a
+    paddsw              m10, m14      ; t10a
+    psubsw              m14, m0, m5   ; t15a
+    paddsw               m0, m5       ; t11a
+    psubsw               m5, m2, m6   ; t6
+    paddsw               m2, m6       ; t2
+    psubsw               m6, m13, m9  ; t7
+    paddsw              m13, m9       ; t3
     ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
     ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
     mova                 m9, [rsp+gprsize+32*0] ; in15
     mova [rsp+gprsize+32*0], m10 ; t10a
     mova                 m4, [rsp+gprsize+32*1] ; in0
     mova [rsp+gprsize+32*1], m6  ; t6a
     mova                 m6, [rsp+gprsize+32*2] ; in4
     mova [rsp+gprsize+32*2], m2  ; t2
     ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
     ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
     ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
     ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
-    psubw               m10, m4, m8  ; t8a
-    paddw                m8, m4      ; t0a
-    psubw                m4, m9, m7  ; t9a
-    paddw                m9, m7      ; t1a
-    psubw                m7, m6, m12 ; t12a
-    paddw                m6, m12     ; t4a
-    psubw               m12, m11, m3 ; t13a
-    paddw               m11, m3      ; t5a
+    psubsw              m10, m4, m8  ; t8a
+    paddsw               m8, m4      ; t0a
+    psubsw               m4, m9, m7  ; t9a
+    paddsw               m9, m7      ; t1a
+    psubsw               m7, m6, m12 ; t12a
+    paddsw               m6, m12     ; t4a
+    psubsw              m12, m11, m3 ; t13a
+    paddsw              m11, m3      ; t5a
     ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
     ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
-    psubw                m3, m9, m11 ; t5
-    paddw                m9, m11     ; t1
-    psubw               m11, m4, m12 ; t12a
-    paddw                m4, m12     ; t8a
-    paddw               m12, m8, m6  ; t0
-    psubw                m8, m6      ; t4
-    paddw                m6, m10, m7 ; t9a
-    psubw               m10, m7      ; t13a
+    psubsw               m3, m9, m11 ; t5
+    paddsw               m9, m11     ; t1
+    psubsw              m11, m4, m12 ; t12a
+    paddsw               m4, m12     ; t8a
+    paddsw              m12, m8, m6  ; t0
+    psubsw               m8, m6      ; t4
+    paddsw               m6, m10, m7 ; t9a
+    psubsw              m10, m7      ; t13a
     ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
     ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
     mova                 m7, [rsp+gprsize+32*0] ; t10a
     mova                 m2, [rsp+gprsize+32*1] ; t6a
-    paddw               m15, m9, m13  ; -out15
-    psubw                m9, m13      ;  t3a
-    paddw               m13, m11, m1  ; -out13
-    psubw               m11, m1       ;  t15a
-    psubw                m1, m4, m7   ;  t10
-    paddw                m7, m4       ; -out1
-    psubw                m4, m3, m2   ;  t6
-    paddw                m3, m2       ; -out3
-    paddw                m2, m10, m14 ;  out2
-    psubw               m10, m14      ;  t14a
-    paddw               m14, m6, m0   ;  out14
-    psubw                m6, m0       ;  t11
+    paddsw              m15, m9, m13  ; -out15
+    psubsw               m9, m13      ;  t3a
+    paddsw              m13, m11, m1  ; -out13
+    psubsw              m11, m1       ;  t15a
+    psubsw               m1, m4, m7   ;  t10
+    paddsw               m7, m4       ; -out1
+    psubsw               m4, m3, m2   ;  t6
+    paddsw               m3, m2       ; -out3
+    paddsw               m2, m10, m14 ;  out2
+    psubsw              m10, m14      ;  t14a
+    paddsw              m14, m6, m0   ;  out14
+    psubsw               m6, m0       ;  t11
     mova                 m0, [rsp+gprsize+32*2] ; t2
     mova [rsp+gprsize+32*1], m7
-    psubw                m7, m12, m0  ;  t2a
-    paddw                m0, m12      ;  out0
-    paddw               m12, m8, m5   ;  out12
-    psubw                m8, m5       ;  t7
+    psubsw               m7, m12, m0  ;  t2a
+    paddsw               m0, m12      ;  out0
+    paddsw              m12, m8, m5   ;  out12
+    psubsw               m8, m5       ;  t7
     paddw                m5, m10, m11 ; -out5
     psubw               m10, m11      ;  out10
     psubw               m11, m4, m8   ; -out11
     paddw                m4, m8       ;  out4
     psubw                m8, m7, m9   ;  out8
     paddw                m7, m9       ; -out7
     psubw                m9, m1, m6   ; -out9
     paddw                m6, m1       ;  out6
@@ -3272,16 +3275,25 @@ ALIGN function_align
     mova                m11, [%1+%2*3]
     mova                m12, [%1+%2*4]
     mova                m13, [%1+%2*5]
     mova                m14, [%1+%2*6]
     mova                m15, [%1+%2*7]
 %endif
 %endmacro
 
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
+    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
+    punpcklwd           m%1, m%2, m%2
+    pmulhrsw            m%1, m%3
+    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
+    punpckhwd           m%2, m%2
+    pmulhrsw            m%2, m%3
+%endmacro
+
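
ITX_UNPACK_MULHRSW leans on the usual dav1d constant layout: the pw_*x8 rows hold interleaved coefficient pairs pre-multiplied by 8, which makes a single pmulhrsw an exact rounded Q12 multiply. Per lane, a sketch under that assumption:

    /* pmulhrsw computes (a*b + (1 << 14)) >> 15; with b = 8*c this is
     * exactly (a*c + (1 << 11)) >> 12, i.e. a round-to-nearest 12-bit
     * fixed-point multiply by c in one instruction (assumes |c| < 4096) */
    static int16_t mulhrs_x8(int16_t a, int16_t c)
    {
        return (int16_t)(((int32_t)a * (8 * (int32_t)c) + (1 << 14)) >> 15);
    }

The punpcklwd/punpckhwd pair duplicates each source word so that one input row yields both output pairs (e.g. t16a/t31a and t23a/t24a) from two multiplies, which is what lets the four ITX_UNPACK_MULHRSW calls below replace the eight ITX_MULHRSW_SHL3 invocations and eight unpacks being deleted.
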
 cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
     lea                 rax, [o_base]
     test               eobd, eobd
     jz .dconly
     PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
     %undef cmp
     cmp                eobd, 106
     jle .fast
@@ -3426,32 +3438,21 @@ ALIGN function_align
 .main_fast: ; bottom half is zero
     call m(idct_8x16_internal).main
     mova                 m8, [rsp+gprsize+0*32]
     mova [rsp+gprsize+0*32], m0
     mova                 m9, [rsp+gprsize+1*32]
     mova [rsp+gprsize+1*32], m1
     mova                 m0, [rsp+gprsize+2*32]
     mova [rsp+gprsize+2*32], m6
-    punpcklwd            m1, m8, m8
-    punpckhwd            m8, m8
-    punpcklwd           m15, m9, m9
-    punpckhwd            m9, m9
-    punpcklwd           m14, m0, m0
-    punpckhwd            m0, m0
-    punpcklwd           m13, m11, m11
-    punpckhwd           m11, m11
-    ITX_MULHRSW_SHL3      1,  6,   201, 4091 ; t16a, t31a
-    ITX_MULHRSW_SHL3      8,  6,  m601, 4052 ; t23a, t24a
-    ITX_MULHRSW_SHL3     15,  6,   995, 3973 ; t20a, t27a
-    ITX_MULHRSW_SHL3      9,  6, m1380, 3857 ; t19a, t28a
-    ITX_MULHRSW_SHL3     14,  6,  1751, 3703 ; t18a, t29a
-    ITX_MULHRSW_SHL3      0,  6, m2106, 3513 ; t21a, t26a
-    ITX_MULHRSW_SHL3     13,  6,  2440, 3290 ; t22a, t25a
-    ITX_MULHRSW_SHL3     11,  6, m2751, 3035 ; t17a, t30a
+    lea                  r5, [rax-(o_base)+pw_201_4091x8]
+    ITX_UNPACK_MULHRSW    1,  8,  6,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
+    ITX_UNPACK_MULHRSW   15,  9,  6,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+    ITX_UNPACK_MULHRSW   14,  0,  6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+    ITX_UNPACK_MULHRSW   13, 11,  6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
     jmp .main2
 ALIGN function_align
 .main:
     call m(idct_8x16_internal).main
     mova                 m8, [rsp+gprsize+0*32]
     mova [rsp+gprsize+0*32], m0
     mova                 m9, [rsp+gprsize+1*32]
     mova [rsp+gprsize+1*32], m1
@@ -3469,84 +3470,84 @@ ALIGN function_align
     ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
     ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
     ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
     ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
     ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
     ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
     ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
 .main2:
-    psubw                m6, m1, m11  ; t17 t30
-    paddw                m1, m11      ; t16 t31
-    psubw               m11, m9, m14  ; t18 t29
-    paddw                m9, m14      ; t19 t28
-    psubw               m14, m15, m0  ; t21 t26
-    paddw               m15, m0       ; t20 t27
-    psubw                m0, m8, m13  ; t22 t25
-    paddw                m8, m13      ; t23 t24
+    psubsw               m6, m1, m11  ; t17 t30
+    paddsw               m1, m11      ; t16 t31
+    psubsw              m11, m9, m14  ; t18 t29
+    paddsw               m9, m14      ; t19 t28
+    psubsw              m14, m15, m0  ; t21 t26
+    paddsw              m15, m0       ; t20 t27
+    psubsw               m0, m8, m13  ; t22 t25
+    paddsw               m8, m13      ; t23 t24
     ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
     ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
     ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
     ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
-    psubw               m13, m1, m9   ; t19a t28a
-    paddw                m1, m9       ; t16a t31a
-    psubw                m9, m8, m15  ; t20a t27a
-    paddw                m8, m15      ; t23a t24a
-    psubw               m15, m6, m11  ; t18  t29
-    paddw                m6, m11      ; t17  t30
-    psubw               m11, m0, m14  ; t21  t26
-    paddw                m0, m14      ; t22  t25
+    psubsw              m13, m1, m9   ; t19a t28a
+    paddsw               m1, m9       ; t16a t31a
+    psubsw               m9, m8, m15  ; t20a t27a
+    paddsw               m8, m15      ; t23a t24a
+    psubsw              m15, m6, m11  ; t18  t29
+    paddsw               m6, m11      ; t17  t30
+    psubsw              m11, m0, m14  ; t21  t26
+    paddsw               m0, m14      ; t22  t25
     ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 1 ; t18a t29a
     ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 1 ; t19  t28
     ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 1 ; t20  t27
     ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
     vbroadcasti128      m12, [o(deint_shuf)]
     REPX    {pshufb x, m12}, m0, m1, m6, m8
-    psubw               m14, m1, m8   ; t23  t24
-    paddw                m1, m8       ; t16  t31
-    psubw                m8, m6, m0   ; t22a t25a
-    paddw                m6, m0       ; t17a t30a
-    psubw                m0, m15, m11 ; t21  t26
-    paddw               m15, m11      ; t18  t29
-    psubw               m11, m13, m9  ; t20a t27a
-    paddw               m13, m9       ; t19a t28a
+    psubsw              m14, m1, m8   ; t23  t24
+    paddsw               m1, m8       ; t16  t31
+    psubsw               m8, m6, m0   ; t22a t25a
+    paddsw               m6, m0       ; t17a t30a
+    psubsw               m0, m15, m11 ; t21  t26
+    paddsw              m15, m11      ; t18  t29
+    psubsw              m11, m13, m9  ; t20a t27a
+    paddsw              m13, m9       ; t19a t28a
     vpbroadcastd        m12, [o(pw_2896x8)]
-    punpcklqdq            m9, m11, m0 ; t20a t21
-    punpckhqdq           m11, m0      ; t27a t26
-    punpcklqdq            m0, m14, m8 ; t23  t22a
-    punpckhqdq           m14, m8      ; t24  t25a
-    psubw                 m8, m11, m9 ; t20  t21a
-    paddw                m11, m9      ; t27  t26a
-    psubw                 m9, m14, m0 ; t23a t22
-    paddw                m14, m0      ; t24a t25
-    REPX   {pmulhrsw x, m12}, m8, m9, m14, m11
+    punpcklqdq           m9, m11, m0  ; t20a t21
+    punpckhqdq          m11, m0       ; t27a t26
+    punpcklqdq           m0, m14, m8  ; t23  t22a
+    punpckhqdq          m14, m8       ; t24  t25a
+    psubw                m8, m11, m9  ; t20  t21a
+    paddw               m11, m9       ; t27  t26a
+    psubw                m9, m14, m0  ; t23a t22
+    paddw               m14, m0       ; t24a t25
+    REPX  {pmulhrsw x, m12}, m8, m9, m14, m11
     punpcklqdq           m0, m1, m6   ; t16  t17a
     punpckhqdq           m1, m6       ; t31  t30a
-    psubw               m10, m5, m8   ; out20 out21
-    paddw                m5, m8       ; out11 out10
-    psubw                m6, m3, m14  ; out24 out25
-    paddw                m3, m14      ; out7  out6
-    psubw                m8, m7, m0   ; out16 out17
-    paddw                m7, m0       ; out15 out14
+    psubsw              m10, m5, m8   ; out20 out21
+    paddsw               m5, m8       ; out11 out10
+    psubsw               m6, m3, m14  ; out24 out25
+    paddsw               m3, m14      ; out7  out6
+    psubsw               m8, m7, m0   ; out16 out17
+    paddsw               m7, m0       ; out15 out14
     mova                 m0, [rsp+gprsize+0*32]
     punpcklqdq          m12, m13, m15 ; t19a t18
     punpckhqdq          m13, m15      ; t28a t29
-    psubw               m15, m0, m1   ; out31 out30
-    paddw                m0, m1       ; out0  out1
+    psubsw              m15, m0, m1   ; out31 out30
+    paddsw               m0, m1       ; out0  out1
     mova                 m1, [rsp+gprsize+1*32]
     mova [rsp+gprsize+0*32], m6
     mova                 m6, [rsp+gprsize+2*32]
-    psubw               m14, m1, m13  ; out28 out29
-    paddw                m1, m13      ; out3  out2
-    psubw               m13, m2, m11  ; out27 out26
-    paddw                m2, m11      ; out4  out5
-    psubw               m11, m4, m9   ; out23 out22
-    paddw                m4, m9       ; out8  out9
-    psubw                m9, m6, m12  ; out19 out18
-    paddw                m6, m12      ; out12 out13
+    psubsw              m14, m1, m13  ; out28 out29
+    paddsw               m1, m13      ; out3  out2
+    psubsw              m13, m2, m11  ; out27 out26
+    paddsw               m2, m11      ; out4  out5
+    psubsw              m11, m4, m9   ; out23 out22
+    paddsw               m4, m9       ; out8  out9
+    psubsw               m9, m6, m12  ; out19 out18
+    paddsw               m6, m12      ; out12 out13
     ret
 
 %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
     vbroadcasti128      m%1, [cq+16*%3]
     vbroadcasti128      m%2, [cq+16*%4]
     shufpd              m%1, m%1, m%2, 0x0c
 %endmacro
 
@@ -3867,18 +3868,18 @@ cglobal inv_txfm_add_identity_identity_3
     pxor                m15, m15
     REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
                                 8,  9, 10, 11, 12, 13, 14, 15
 %endif
 %endmacro
 
 %macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
     mova                m%4, [%2]
-    paddw               m%3, m%1, m%4
-    psubw               m%1, m%4
+    paddsw              m%3, m%1, m%4
+    psubsw              m%1, m%4
     pmovzxbw            m%4, [dstq+%6]
     pmulhrsw            m%3, m%5
     pmulhrsw            m%1, m%5
     paddw               m%3, m%4
     pmovzxbw            m%4, [r2+%7]
     paddw               m%1, m%4
     packuswb            m%3, m%1
     vpermq              m%3, m%3, q3120
@@ -4051,89 +4052,89 @@ ALIGN function_align
     vpbroadcastd        m15, [o(pd_2048)]
     ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
     ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
     ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
     ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
     ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
     ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
 .main2:
-    psubw                m7, m12, m4  ; t18
-    paddw               m12, m4       ; t19
-    psubw                m4, m2, m10  ; t21
-    paddw                m2, m10      ; t20
-    psubw               m10, m14, m6  ; t22
-    paddw               m14, m6       ; t23
-    psubw                m6, m1, m9   ; t25
-    paddw                m1, m9       ; t24
-    psubw                m9, m13, m5  ; t26
-    paddw               m13, m5       ; t27
-    psubw                m5, m3, m11  ; t29
-    paddw                m3, m11      ; t28
+    psubsw               m7, m12, m4  ; t18
+    paddsw              m12, m4       ; t19
+    psubsw               m4, m2, m10  ; t21
+    paddsw               m2, m10      ; t20
+    psubsw              m10, m14, m6  ; t22
+    paddsw              m14, m6       ; t23
+    psubsw               m6, m1, m9   ; t25
+    paddsw               m1, m9       ; t24
+    psubsw               m9, m13, m5  ; t26
+    paddsw              m13, m5       ; t27
+    psubsw               m5, m3, m11  ; t29
+    paddsw               m3, m11      ; t28
     ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
     ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
     ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
-    psubw                m8, m14, m2  ; t20a
-    paddw               m14, m2       ; t23a
-    psubw                m2, m1, m13  ; t27a
-    paddw                m1, m13      ; t24a
-    psubw               m13, m6, m9   ; t21
-    paddw                m6, m9       ; t22
-    psubw                m9, m10, m4  ; t26
-    paddw               m10, m4       ; t25
+    psubsw               m8, m14, m2  ; t20a
+    paddsw              m14, m2       ; t23a
+    psubsw               m2, m1, m13  ; t27a
+    paddsw               m1, m13      ; t24a
+    psubsw              m13, m6, m9   ; t21
+    paddsw               m6, m9       ; t22
+    psubsw               m9, m10, m4  ; t26
+    paddsw              m10, m4       ; t25
     ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
     ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
     mova                 m4, [rsp+gprsize+32*0] ; in31
     mova [rsp+gprsize+32*0], m6  ; t22
     mova                 m6, [rsp+gprsize+32*1] ; in15
     mova [rsp+gprsize+32*1], m14 ; t23a
     mova                m14, [rsp+gprsize+32*2] ; in17
     mova [rsp+gprsize+32*2], m1  ; t24a
     ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
     ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
-    psubw                m1, m0, m14  ; t17
-    paddw                m0, m14      ; t16
-    psubw               m14, m4, m6   ; t30
-    paddw                m4, m6       ; t31
+    psubsw               m1, m0, m14  ; t17
+    paddsw               m0, m14      ; t16
+    psubsw              m14, m4, m6   ; t30
+    paddsw               m4, m6       ; t31
     ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
-    psubw                m6, m0, m12  ; t19a
-    paddw                m0, m12      ; t16a
-    psubw               m12, m4, m3   ; t28a
-    paddw                m4, m3       ; t31a
-    psubw                m3, m14, m5  ; t18
-    paddw               m14, m5       ; t17
-    psubw                m5, m1, m7   ; t29
-    paddw                m1, m7       ; t30
+    psubsw               m6, m0, m12  ; t19a
+    paddsw               m0, m12      ; t16a
+    psubsw              m12, m4, m3   ; t28a
+    paddsw               m4, m3       ; t31a
+    psubsw               m3, m14, m5  ; t18
+    paddsw              m14, m5       ; t17
+    psubsw               m5, m1, m7   ; t29
+    paddsw               m1, m7       ; t30
     ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
     ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
-    psubw                m7, m1, m10  ; t25a
-    paddw                m1, m10      ; t30a
-    psubw               m10, m5, m9   ; t21
-    paddw                m5, m9       ; t18
-    psubw                m9, m12, m2  ; t20a
-    paddw               m12, m2       ; t19a
-    psubw                m2, m3, m13  ; t26
-    paddw                m3, m13      ; t29
-    psubw               m13, m6, m8   ; t27a
-    paddw                m6, m8       ; t28a
+    psubsw               m7, m1, m10  ; t25a
+    paddsw               m1, m10      ; t30a
+    psubsw              m10, m5, m9   ; t21
+    paddsw               m5, m9       ; t18
+    psubsw               m9, m12, m2  ; t20a
+    paddsw              m12, m2       ; t19a
+    psubsw               m2, m3, m13  ; t26
+    paddsw               m3, m13      ; t29
+    psubsw              m13, m6, m8   ; t27a
+    paddsw               m6, m8       ; t28a
     mova       [tmp1q-32*2], m5
     mova       [tmp1q-32*1], m12
     mova       [tmp2q+32*0], m6
     mova       [tmp2q+32*1], m3
     mova       [tmp2q+32*2], m1
     mova                 m5, [rsp+gprsize+32*0] ; t22
     mova                 m6, [rsp+gprsize+32*1] ; t23
     mova                 m3, [rsp+gprsize+32*2] ; t24a
     vpbroadcastd         m8, [o(pw_2896x8)]
-    psubw                m1, m14, m5  ; t22a
-    paddw               m14, m5       ; t17a
-    psubw                m5, m0, m6   ; t23
-    paddw                m0, m6       ; t16
-    psubw                m6, m4, m3   ; t24
-    paddw                m4, m3       ; t31
+    psubsw               m1, m14, m5  ; t22a
+    paddsw              m14, m5       ; t17a
+    psubsw               m5, m0, m6   ; t23
+    paddsw               m0, m6       ; t16
+    psubsw               m6, m4, m3   ; t24
+    paddsw               m4, m3       ; t31
     mova       [tmp1q-32*4], m0
     mova       [tmp1q-32*3], m14
     mova       [tmp2q+32*3], m4
     psubw                m3, m13, m9  ; t20
     paddw               m13, m9       ; t27
     psubw                m9, m2, m10  ; t21a
     paddw                m2, m10      ; t26a
     psubw               m10, m7, m1   ; t22
@@ -4236,23 +4237,23 @@ ALIGN function_align
     IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
     IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
     IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
     ret
 
 ; Perform the final sumsub step and YMM lane shuffling
 %macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
     mova                m%3, [tmp2q+32*( 3-%1)]
-    psubw               m%4, m%1, m%3
-    paddw               m%1, m%3
+    psubsw              m%4, m%1, m%3
+    paddsw              m%1, m%3
     mova                m%3, [tmp1q+32*(11-%2)]
     mova         [tmp1q+32*(11-%2)+16], xm%4
     vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
-    paddw               m%4, m%2, m%3
-    psubw               m%2, m%3
+    paddsw              m%4, m%2, m%3
+    psubsw              m%2, m%3
     mova         [tmp1q+32*(11-%2)], xm%2
     vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
     vperm2i128          m%2, m%1, m%4, 0x31
     vinserti128         m%1, m%1, xm%4, 1
 %endmacro
 
 cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
     lea                 rax, [o_base]
@@ -4703,22 +4704,22 @@ cglobal inv_txfm_add_identity_identity_3
 %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
 %if %1 & 1
     mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
     mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
 %else
     mova                m%5, [tmp1q-32*(45-%1)]
     mova                m%4, [tmp2q-32*(20+%1)]
 %endif
-    psubw               m%6, m%5, m%4 ; idct32 out31-n
-    paddw               m%5, m%4      ; idct32 out 0+n
-    psubw               m%4, m%6, m%3 ; out32+n
-    paddw               m%6, m%3      ; out31-n
-    psubw               m%3, m%5, m%2 ; out63-n
-    paddw               m%5, m%2      ; out 0+n
+    psubsw              m%6, m%5, m%4 ; idct32 out31-n
+    paddsw              m%5, m%4      ; idct32 out 0+n
+    psubsw              m%4, m%6, m%3 ; out32+n
+    paddsw              m%6, m%3      ; out31-n
+    psubsw              m%3, m%5, m%2 ; out63-n
+    paddsw              m%5, m%2      ; out 0+n
 %if %0 == 6 ; pass 1
 %if %1 & 1
     mova [tmp2q-32*(19-%1)], m%4
     mova [tmp1q-32*(14+%1)], m%6
     mova [tmp1q+32*(18-%1)], m%3
     mova [tmp2q-32*(51-%1)], m%5
 %else
     mova [tmp1q-32*(13-%1)], m%4
@@ -4943,35 +4944,35 @@ ALIGN function_align
     vpbroadcastd         m9, [o(idct64_mul+4* 8)]
     vpbroadcastd        m13, [o(idct64_mul+4* 9)]
     vpbroadcastd         m8, [o(idct64_mul+4*12)]
     vpbroadcastd        m12, [o(idct64_mul+4*13)]
     pmulhrsw             m9, m2  ; t61a
     pmulhrsw             m2, m13 ; t34a
     pmulhrsw             m8, m3  ; t60a
     pmulhrsw             m3, m12 ; t35a
-    psubw               m12, m0, m1   ; t33
-    paddw                m0, m1       ; t32
-    psubw                m1, m3, m2   ; t34
-    paddw                m3, m2       ; t35
-    psubw                m2, m8, m9   ; t61
-    paddw                m8, m9       ; t60
-    psubw                m9, m11, m10 ; t62
-    paddw               m11, m10      ; t63
+    psubsw              m12, m0, m1   ; t33
+    paddsw               m0, m1       ; t32
+    psubsw               m1, m3, m2   ; t34
+    paddsw               m3, m2       ; t35
+    psubsw               m2, m8, m9   ; t61
+    paddsw               m8, m9       ; t60
+    psubsw               m9, m11, m10 ; t62
+    paddsw              m11, m10      ; t63
     ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
     vpbroadcastd        m14, [o(pw_401_4076)]
     ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
-    psubw               m10, m0, m3  ; t35a
-    paddw                m0, m3      ; t32a
-    psubw                m3, m11, m8 ; t60a
-    paddw               m11, m8      ; t63a
-    psubw                m8, m9, m2  ; t34
-    paddw                m9, m2      ; t33
-    psubw                m2, m12, m1 ; t61
-    paddw               m12, m1      ; t62
+    psubsw              m10, m0, m3  ; t35a
+    paddsw               m0, m3      ; t32a
+    psubsw               m3, m11, m8 ; t60a
+    paddsw              m11, m8      ; t63a
+    psubsw               m8, m9, m2  ; t34
+    paddsw               m9, m2      ; t33
+    psubsw               m2, m12, m1 ; t61
+    paddsw              m12, m1      ; t62
     mova       [tmp1q-32*4], m0
     mova       [tmp1q-32*3], m9
     mova       [tmp2q+32*2], m12
     mova       [tmp2q+32*3], m11
     vpbroadcastd        m13, [o(pw_m4017_799)]
     vpbroadcastd        m14, [o(pw_799_4017)]
     ITX_MULSUB_2W         2,  8,  0,  1, 15, 14, 13 ; t34a, t61a
     ITX_MULSUB_2W         3, 10,  0,  1, 15, 14, 13 ; t35,  t60
@@ -4990,35 +4991,35 @@ ALIGN function_align
     pmulhrsw             m3, m4  ; t59a
     pmulhrsw             m4, m11 ; t36a
     pmulhrsw             m2, m5  ; t58a
     pmulhrsw             m5, m10 ; t37a
     pmulhrsw             m1, m6  ; t57a
     pmulhrsw             m6, m9  ; t38a
     pmulhrsw             m0, m7  ; t56a
     pmulhrsw             m7, m8  ; t39a
-    psubw                m8, m4, m5 ; t37
-    paddw                m4, m5     ; t36
-    psubw                m5, m7, m6 ; t38
-    paddw                m7, m6     ; t39
-    psubw                m6, m0, m1 ; t57
-    paddw                m0, m1     ; t56
-    psubw                m1, m3, m2 ; t58
-    paddw                m3, m2     ; t59
+    psubsw               m8, m4, m5 ; t37
+    paddsw               m4, m5     ; t36
+    psubsw               m5, m7, m6 ; t38
+    paddsw               m7, m6     ; t39
+    psubsw               m6, m0, m1 ; t57
+    paddsw               m0, m1     ; t56
+    psubsw               m1, m3, m2 ; t58
+    paddsw               m3, m2     ; t59
     ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
     vpbroadcastd        m10, [o(pw_3166_2598)]
     ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
-    psubw                m2, m7, m4 ; t36a
-    paddw                m7, m4     ; t39a
-    psubw                m4, m0, m3 ; t59a
-    paddw                m0, m3     ; t56a
-    psubw                m3, m6, m1 ; t37
-    paddw                m6, m1     ; t38
-    psubw                m1, m5, m8 ; t58
-    paddw                m5, m8     ; t57
+    psubsw               m2, m7, m4 ; t36a
+    paddsw               m7, m4     ; t39a
+    psubsw               m4, m0, m3 ; t59a
+    paddsw               m0, m3     ; t56a
+    psubsw               m3, m6, m1 ; t37
+    paddsw               m6, m1     ; t38
+    psubsw               m1, m5, m8 ; t58
+    paddsw               m5, m8     ; t57
     mova       [tmp1q+32*2], m6
     mova       [tmp1q+32*3], m7
     mova       [tmp2q-32*4], m0
     mova       [tmp2q-32*3], m5
     vpbroadcastd         m6, [o(pw_m799_m4017)]
     vpbroadcastd         m7, [o(pw_m4017_799)]
     ITX_MULSUB_2W         4,  2,  0,  5, 15,  7,  6 ; t36,  t59
     ITX_MULSUB_2W         1,  3,  0,  5, 15,  7,  6 ; t37a, t58a
@@ -5050,34 +5051,34 @@ ALIGN function_align
     mova                 m2, [tmp1q-32* 4] ; t40a
     mova                 m5, [tmp2q+32* 3] ; t55a
     add               tmp1q, 32
     sub               tmp2q, 32
     mova                 m4, [tmp1q+32* 3] ; t48a
     mova                 m3, [tmp2q-32* 4] ; t47a
     mova                 m6, [tmp1q+32*11] ; t56a
     mova                 m7, [tmp2q+32*12] ; t63a
-    psubw                m8, m0, m1 ; t39
-    paddw                m0, m1     ; t32
-    psubw                m1, m3, m2 ; t40
-    paddw                m3, m2     ; t47
-    psubw                m2, m4, m5 ; t55
-    paddw                m4, m5     ; t48
-    psubw                m5, m7, m6 ; t56
-    paddw                m7, m6     ; t63
+    psubsw               m8, m0, m1 ; t39
+    paddsw               m0, m1     ; t32
+    psubsw               m1, m3, m2 ; t40
+    paddsw               m3, m2     ; t47
+    psubsw               m2, m4, m5 ; t55
+    paddsw               m4, m5     ; t48
+    psubsw               m5, m7, m6 ; t56
+    paddsw               m7, m6     ; t63
     ITX_MULSUB_2W         5,  8,  6,  9, 15, 11, 12 ; t39a, t56a
     ITX_MULSUB_2W         2,  1,  6,  9, 15, 12, 13 ; t40a, t55a
-    psubw                m6, m0, m3 ; t47a
-    paddw                m0, m3     ; t32a
-    psubw                m3, m7, m4 ; t48a
-    paddw                m7, m4     ; t63a
-    psubw                m4, m5, m2 ; t40
-    paddw                m5, m2     ; t39
-    psubw                m2, m8, m1 ; t55
-    paddw                m8, m1     ; t56
+    psubsw               m6, m0, m3 ; t47a
+    paddsw               m0, m3     ; t32a
+    psubsw               m3, m7, m4 ; t48a
+    paddsw               m7, m4     ; t63a
+    psubsw               m4, m5, m2 ; t40
+    paddsw               m5, m2     ; t39
+    psubsw               m2, m8, m1 ; t55
+    paddsw               m8, m1     ; t56
     psubw                m1, m2, m4 ; t40a
     paddw                m2, m4     ; t55a
     psubw                m4, m3, m6 ; t47
     paddw                m3, m6     ; t48
     ret
 .main_part2_pass2:
     sub                 rax, o_idct64_offset + 8
     vpbroadcastd        m11, [o(pw_1567_3784)]
--- a/third_party/dav1d/src/x86/itx_init_tmpl.c
+++ b/third_party/dav1d/src/x86/itx_init_tmpl.c
@@ -73,16 +73,19 @@ decl_itx2_fns (32, 32, avx2);
 
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_avx2);
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_avx2);
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2);
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
 decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
 
 decl_itx17_fns(4, 4, ssse3);
+decl_itx16_fns(4, 8, ssse3);
+decl_itx16_fns(8, 4, ssse3);
+decl_itx16_fns(8, 8, ssse3);
 
 void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
     c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
         dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
 
 #define assign_itx1_fn(pfx, w, h, ext) \
     assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
@@ -116,17 +119,20 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1d
     assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
 
 
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH == 8
-    assign_itx17_fn(, 4, 4, ssse3);
+    assign_itx17_fn(,  4, 4, ssse3);
+    assign_itx16_fn(R, 4, 8, ssse3);
+    assign_itx16_fn(R, 8, 4, ssse3);
+    assign_itx16_fn(,  8, 8, ssse3);
 #endif
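
(The R argument threads through the pfx##TX_ token paste in assign_itx_fn above, routing the rectangular 4x8/8x4 sizes to the RTX_* enum slots while the square 8x8 case uses the plain TX_8X8 slot; the three new SSSE3 rows thus mirror the AVX2 block below.)
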
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
 #if BITDEPTH == 8 && ARCH_X86_64
     assign_itx17_fn( ,  4,  4, avx2);
     assign_itx16_fn(R,  4,  8, avx2);
     assign_itx16_fn(R,  4, 16, avx2);
--- a/third_party/dav1d/src/x86/itx_ssse3.asm
+++ b/third_party/dav1d/src/x86/itx_ssse3.asm
@@ -24,255 +24,313 @@
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 
 SECTION_RODATA 16
 
-deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
+deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
+
+deint_shuf1: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
+deint_shuf2: db  8,  9,  0,  1, 10, 11,  2,  3, 12, 13,  4,  5, 14, 15,  6,  7
+
+%macro COEF_PAIR 2
+pw_%1_m%2:  times 4 dw   %1, -%2
+pw_%2_%1:   times 4 dw   %2,  %1
+%endmacro
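
So, for example, COEF_PAIR 1567, 3784 expands to the rows pw_1567_m3784 (times 4 dw 1567, -3784) and pw_3784_1567 (times 4 dw 3784, 1567): the interleaved coefficient pairs consumed by the two pmaddwd's in ITX_MUL2X_PACK below, generated here instead of the hand-written qw_* tables being removed underneath.
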
+
+;adst4
+pw_1321_3803:   times 4 dw  1321,  3803
+pw_2482_m1321:  times 4 dw  2482, -1321
+pw_3344_2482:   times 4 dw  3344,  2482
+pw_3344_m3803:  times 4 dw  3344, -3803
+pw_m6688_m3803: times 4 dw -6688, -3803
 
-qw_2896x8:      times 8 dw  2896*8
-qw_1567_m3784:  times 4 dw  1567, -3784
-qw_3784_1567:   times 4 dw  3784,  1567
-
-qw_1321_3803:   times 4 dw  1321,  3803
-qw_2482_m1321:  times 4 dw  2482, -1321
-qw_3344_2482:   times 4 dw  3344,  2482
-qw_3344_m3803:  times 4 dw  3344, -3803
-qw_m6688_m3803: times 4 dw -6688, -3803
-qw_3344x8:      times 8 dw  3344*8
-qw_5793x4:      times 8 dw  5793*4
-
-pd_2048:        times 4 dd  2048
-qw_2048:        times 8 dw  2048
-
+COEF_PAIR 1567, 3784
+COEF_PAIR  799, 4017
+COEF_PAIR 3406, 2276
+COEF_PAIR  401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 3784, 1567
+
+pd_2048:        times 4 dd  2048
+pw_2048:        times 8 dw  2048
+pw_4096:        times 8 dw  4096
+pw_16384:       times 8 dw  16384
+pw_m16384:      times 8 dw  -16384
+pw_2896x8:      times 8 dw  2896*8
+pw_3344x8:      times 8 dw  3344*8
+pw_5793x4:      times 8 dw  5793*4
+
 iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
-iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
+iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
 iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
-iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
-
-SECTION .text
-
-%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
-
-%macro ITX4_END 4-5 2048 ; row[1-4], rnd
-%if %5
-    mova                 m2, [qw_%5]
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m2
-%endif
-    lea                  r2, [dstq+strideq*2]
-%assign %%i 1
-%rep 4
-    %if %1 & 2
-        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
-    %else
-        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
-    %endif
-    %assign %%i %%i + 1
-    %rotate 1
-%endrep
-
-    movd                 m2, [%%row_adr1]       ;dst0
-    movd                 m4, [%%row_adr2]       ;dst1
-    punpckldq            m2, m4                 ;high: dst1 :low: dst0
-    movd                 m3, [%%row_adr3]       ;dst2
-    movd                 m4, [%%row_adr4]       ;dst3
-    punpckldq            m3, m4                 ;high: dst3 :low: dst2
-
-    pxor                 m4, m4
-    punpcklbw            m2, m4                 ;extend byte to word
-    punpcklbw            m3, m4                 ;extend byte to word
-
-    paddw                m0, m2                 ;high: dst1 + out1 ;low: dst0 + out0
-    paddw                m1, m3                 ;high: dst3 + out3 ;low: dst2 + out2
-
-    packuswb             m0, m1                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
-
-    movd       [%%row_adr1], m0                 ;store dst0 + out0
-    pshuflw              m1, m0, q1032
-    movd       [%%row_adr2], m1                 ;store dst1 + out1
-    punpckhqdq           m0, m0
-    movd       [%%row_adr3], m0                 ;store dst2 + out2
-    psrlq                m0, 32
-    movd       [%%row_adr4], m0                 ;store dst3 + out3
-
-    ret
-%endmacro
-
-
+iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
+
+SECTION .text
+
+%macro REPX 2-*
+    %xdefine %%f(x) %1
+%rep %0 - 1
+    %rotate 1
+    %%f(%1)
+%endrep
+%endmacro
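
REPX stamps its instruction template once per trailing argument, with x bound to each register in turn; REPX {pmulhrsw x, m12}, m8, m9, m14, m11, as used in the AVX2 code above, expands to four pmulhrsw instructions.
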
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r5-$$+x ; PIC
+%endif
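
(This is the 32-bit PIC counterpart of the AVX2 file's o_base scheme: on x86-64 o() is the identity, while on x86-32 every constant load is rebased off r5, which INV_TXFM_FN seeds with LEA r5, $$ so constant addressing stays position-independent.)
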
+
+%macro WRITE_4X4 9  ;src[1-2], tmp[1-3], row[1-4]
+    lea                  r2, [dstq+strideq*2]
+%assign %%i 1
+%rotate 5
+%rep 4
+    %if %1 & 2
+        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
+    %else
+        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+    %endif
+    %assign %%i %%i + 1
+    %rotate 1
+%endrep
+
+    movd                 m%3, [%%row_adr1]        ;dst0
+    movd                 m%5, [%%row_adr2]        ;dst1
+    punpckldq            m%3, m%5                 ;high: dst1 :low: dst0
+    movd                 m%4, [%%row_adr3]        ;dst2
+    movd                 m%5, [%%row_adr4]        ;dst3
+    punpckldq            m%4, m%5                 ;high: dst3 :low: dst2
+
+    pxor                 m%5, m%5
+    punpcklbw            m%3, m%5                 ;extend byte to word
+    punpcklbw            m%4, m%5                 ;extend byte to word
+
+    paddw                m%1, m%3                 ;high: dst1 + out1 ;low: dst0 + out0
+    paddw                m%2, m%4                 ;high: dst3 + out3 ;low: dst2 + out2
+
+    packuswb             m%1, m%2                 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+
+    movd       [%%row_adr1], m%1                  ;store dst0 + out0
+    pshuflw              m%2, m%1, q1032
+    movd       [%%row_adr2], m%2                  ;store dst1 + out1
+    punpckhqdq           m%1, m%1
+    movd       [%%row_adr3], m%1                  ;store dst2 + out2
+    psrlq                m%1, 32
+    movd       [%%row_adr4], m%1                  ;store dst3 + out3
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+    mova                 m2, [o(pw_%5)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m1, m2
+%endif
+
+    WRITE_4X4            0, 1, 2, 3, 4, %1, %2, %3, %4
+    ret
+%endmacro
+
+
 ; flags: 1 = swap, 2 = coef_regs
 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
 %if %6 & 2
     pmaddwd              m%2, m%4, m%1
     pmaddwd              m%1, m%5
 %elif %6 & 1
-    pmaddwd              m%2, m%1, [qw_%5_%4]
-    pmaddwd              m%1, [qw_%4_m%5]
+    pmaddwd              m%2, m%1, [o(pw_%5_%4)]
+    pmaddwd              m%1, [o(pw_%4_m%5)]
 %else
-    pmaddwd              m%2, m%1, [qw_%4_m%5]
-    pmaddwd              m%1, [qw_%5_%4]
+    pmaddwd              m%2, m%1, [o(pw_%4_m%5)]
+    pmaddwd              m%1, [o(pw_%5_%4)]
 %endif
     paddd                m%2, m%3
     paddd                m%1, m%3
     psrad                m%2, 12
     psrad                m%1, 12
     packssdw             m%1, m%2
-%endmacro
-
-%macro IDCT4_1D_PACKED 0-1   ;qw_2896x8
-    punpckhwd            m2, m0, m1           ;unpacked in1 in3
-    psubw                m3, m0, m1
-    paddw                m0, m1
-    punpcklqdq           m0, m3               ;high: in0-in2 ;low: in0+in2
-
-    mova                 m3, [pd_2048]
-    ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
-
-%if %0 == 1
-    pmulhrsw             m0, m%1
-%else
-    pmulhrsw             m0, [qw_2896x8]     ;high: t1 ;low: t0
-%endif
-
-    psubw                m1, m0, m2          ;high: out2 ;low: out3
-    paddw                m0, m2              ;high: out1 ;low: out0
-%endmacro
-
+%endmacro
+
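
As in the AVX2 version earlier in the patch, ITX_MUL2X_PACK is the Q12 butterfly rotation: each dword lane pmaddwd's an interleaved (a, b) word pair against one (c0, -c1) row and one (c1, c0) row, rounds via pd_2048, shifts right by 12, and packssdw saturates both results back to words. Roughly, per pair (a sketch; the saturating pack is omitted):

    /* rotate one (a, b) pair by (c0, c1); 2048 = 1 << 11 rounding bias */
    static void rot_q12(int16_t a, int16_t b, int c0, int c1,
                        int16_t *out0, int16_t *out1)
    {
        *out0 = (int16_t)((a * c0 - b * c1 + 2048) >> 12); /* pw_c0_mc1 row */
        *out1 = (int16_t)((a * c1 + b * c0 + 2048) >> 12); /* pw_c1_c0 row  */
    }
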
+%macro IDCT4_1D_PACKED 0-1   ;pw_2896x8
+    punpckhwd            m2, m0, m1            ;unpacked in1 in3
+    psubw                m3, m0, m1
+    paddw                m0, m1
+    punpcklqdq           m0, m3                ;high: in0-in2 ;low: in0+in2
+
+    mova                 m3, [o(pd_2048)]
+    ITX_MUL2X_PACK        2, 1, 3, 1567, 3784
+
+%if %0 == 1
+    pmulhrsw             m0, m%1
+%else
+    pmulhrsw             m0, [o(pw_2896x8)]    ;high: t1 ;low: t0
+%endif
+
+    psubsw               m1, m0, m2            ;high: out2 ;low: out3
+    paddsw               m0, m2                ;high: out1 ;low: out0
+%endmacro
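+; note: paddsw/psubsw saturate, so butterfly overflow clamps instead of wrapping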
+
+
 %macro IADST4_1D_PACKED 0
     punpcklwd            m2, m0, m1                ;unpacked in0 in2
     punpckhwd            m3, m0, m1                ;unpacked in1 in3
     psubw                m0, m1
     punpckhqdq           m1, m1                    ;low: in3 high: in3
     paddw                m1, m0                    ;low: in0 - in2 + in3
 
-    pmaddwd              m0, m2, [qw_1321_3803]    ;1321 * in0 + 3803 * in2
-    pmaddwd              m2, [qw_2482_m1321]       ;2482 * in0 - 1321 * in2
-    pmaddwd              m4, m3, [qw_3344_2482]    ;3344 * in1 + 2482 * in3
-    pmaddwd              m5, m3, [qw_3344_m3803]   ;3344 * in1 - 3803 * in3
+    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
+    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
     paddd                m4, m0                    ;t0 + t3
-
-    pmaddwd              m3, [qw_m6688_m3803]      ;-2 * 3344 * in1 - 3803 * in3
-    pmulhrsw             m1, [qw_3344x8]           ;low: out2
-    mova                 m0, [pd_2048]
+    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
+    pmulhrsw             m1, [o(pw_3344x8)]        ;low: out2
+    mova                 m0, [o(pd_2048)]
     paddd                m2, m0
     paddd                m0, m4                    ;t0 + t3 + 2048
     paddd                m5, m2                    ;t1 + t3 + 2048
     paddd                m2, m4
     paddd                m2, m3                    ;t0 + t1 - t3 + 2048
 
     psrad                m0, 12                    ;out0
     psrad                m5, 12                    ;out1
     psrad                m2, 12                    ;out3
     packssdw             m0, m5                    ;high: out1 ;low: out0
     packssdw             m2, m2                    ;high: out3 ;low: out3
-%endmacro
-
-%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-    lea tx2q, [m(i%2_%4_internal).pass2]
-%if %3 > 0
-    cmp                  eobd, %3
-    jle %%end
-%elif %3 == 0
-    test                 eobd, eobd
-    jz %%end
-%endif
-    call i%1_%4_internal
-    RET
-ALIGN function_align
-%%end:
-%endmacro
-
-%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x4
+%endmacro
+
+%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
+    %undef cmp
+    %define %%p1 m(i%1_%4_internal)
+%if ARCH_X86_32
+    LEA                    r5, $$
+%endif
+%if has_epilogue
+%if %3 > 0
+    cmp                  eobd, %3
+    jle %%end
+%elif %3 == 0
+    test                 eobd, eobd
+    jz %%end
+%endif
+    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
+    call %%p1
+    RET
+%%end:
+%else
+    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
+%if %3 > 0
+    cmp                  eobd, %3
+    jg %%p1
+%elif %3 == 0
+    test                 eobd, eobd
+    jnz %%p1
+%else
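+    ; emit "jmp %%p1" only when the internal function does not directly
+    ; follow this stub; if it does, %%end coincides with %%p1 and execution
+    ; simply falls through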
+    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 4x4, 6
 %ifidn %1_%2, dct_identity
-    mova                 m0, [qw_2896x8]
+    mova                 m0, [o(pw_2896x8)]
     pmulhrsw             m0, [coeffq]
     paddw                m0, m0
-    pmulhrsw             m0, [qw_5793x4]
+    pmulhrsw             m0, [o(pw_5793x4)]
     punpcklwd            m0, m0
     punpckhdq            m1, m0, m0
     punpckldq            m0, m0
-    call m(iadst_4x4_internal).end
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end
 %elifidn %1_%2, identity_dct
     mova                 m1, [coeffq+16*0]
     mova                 m2, [coeffq+16*1]
     punpcklwd            m0, m1, m2
     punpckhwd            m1, m2
     punpcklwd            m0, m1
     punpcklqdq           m0, m0
     paddw                m0, m0
-    pmulhrsw             m0, [qw_5793x4]
-    pmulhrsw             m0, [qw_2896x8]
+    pmulhrsw             m0, [o(pw_5793x4)]
+    pmulhrsw             m0, [o(pw_2896x8)]
     mova                 m1, m0
-    call m(iadst_4x4_internal).end
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end
 %elif %3 >= 0
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
 %ifidn %1, dct
-    mova                 m1, [qw_2896x8]
+    mova                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
 %elifidn %1, adst
-    pmulhrsw             m0, [iadst4_dconly1a]
+    pmulhrsw             m0, [o(iadst4_dconly1a)]
 %elifidn %1, flipadst
-    pmulhrsw             m0, [iadst4_dconly1b]
+    pmulhrsw             m0, [o(iadst4_dconly1b)]
 %endif
     mov            [coeffq], eobd                ;0
 %ifidn %2, dct
 %ifnidn %1, dct
-    pmulhrsw             m0, [qw_2896x8]
+    pmulhrsw             m0, [o(pw_2896x8)]
 %else
     pmulhrsw             m0, m1
 %endif
     mova                 m1, m0
-    call m(iadst_4x4_internal).end2
-    RET
+    TAIL_CALL m(iadst_4x4_internal).end2
 %else ; adst / flipadst
-    pmulhrsw             m1, m0, [iadst4_dconly2b]
-    pmulhrsw             m0, [iadst4_dconly2a]
-    call m(i%2_4x4_internal).end2
-    RET
+    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
+    pmulhrsw             m0, [o(iadst4_dconly2a)]
+    TAIL_CALL m(i%2_4x4_internal).end2
 %endif
 %endif
-%endmacro
-
-
-INIT_XMM ssse3
-
-cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
-    mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
-    mova                 m1, [coeffq+16*1]      ;high: in3 ;low in2
-
-    IDCT4_1D_PACKED
-
-    mova                 m2, [deint_shuf]
-    shufps               m3, m0, m1, q1331
-    shufps               m0, m1, q0220
-    pshufb               m0, m2                 ;high: in1 ;low: in0
-    pshufb               m1, m3, m2             ;high: in3 ;low :in2
+%endmacro
+
+INIT_XMM ssse3
+
+INV_TXFM_4X4_FN dct, dct,      0
+INV_TXFM_4X4_FN dct, adst,     0
+INV_TXFM_4X4_FN dct, flipadst, 0
+INV_TXFM_4X4_FN dct, identity, 3
+
+cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
+    mova                 m1, [coeffq+16*1]      ;high: in3 ;low: in2
+
+    IDCT4_1D_PACKED
+
+    mova                 m2, [o(deint_shuf)]
+    shufps               m3, m0, m1, q1331
+    shufps               m0, m1, q0220
+    pshufb               m0, m2                 ;high: in1 ;low: in0
+    pshufb               m1, m3, m2             ;high: in3 ;low: in2
     jmp                tx2q
 
-.pass2:
-    IDCT4_1D_PACKED
-
-    pxor                 m2, m2
-    mova      [coeffq+16*0], m2
-    mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
-
-    ITX4_END     0, 1, 3, 2
-
-INV_TXFM_4X4_FN dct, dct, 0
-
-cglobal iadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+.pass2:
+    IDCT4_1D_PACKED
+
+    pxor                 m2, m2
+    mova      [coeffq+16*0], m2
+    mova      [coeffq+16*1], m2                 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
+
+    ITX4_END     0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct,      0
+INV_TXFM_4X4_FN adst, adst,     0
+INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
     call .main
     punpckhwd            m3, m0, m2
     punpcklwd            m0, m1
     punpckhwd            m1, m0, m3       ;high: in3 ;low: in2
     punpcklwd            m0, m3           ;high: in1 ;low: in0
     jmp                tx2q
@@ -289,21 +347,22 @@ cglobal iadst_4x4_internal, 0, 0, 6, dst
 .end2:
     ITX4_END              0, 1, 2, 3
 
 ALIGN function_align
 .main:
     IADST4_1D_PACKED
     ret
 
-INV_TXFM_4X4_FN adst, adst, 0
-INV_TXFM_4X4_FN dct,  adst, 0
-INV_TXFM_4X4_FN adst, dct,  0
+INV_TXFM_4X4_FN flipadst, dct,      0
+INV_TXFM_4X4_FN flipadst, adst,     0
+INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, identity
 
-cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
     call m(iadst_4x4_internal).main
     punpcklwd            m1, m0
     punpckhwd            m2, m0
     punpcklwd            m0, m2, m1            ;high: in3 ;low: in2
     punpckhwd            m2, m1                ;high: in1 ;low: in0
     mova                 m1, m2
@@ -316,53 +375,44 @@ cglobal iflipadst_4x4_internal, 0, 0, 6,
 .end:
     pxor                 m2, m2
     mova      [coeffq+16*0], m2
     mova      [coeffq+16*1], m2
 
 .end2:
     ITX4_END              3, 2, 1, 0
 
-INV_TXFM_4X4_FN flipadst, flipadst, 0
-INV_TXFM_4X4_FN flipadst, dct,      0
-INV_TXFM_4X4_FN flipadst, adst,     0
-INV_TXFM_4X4_FN dct,      flipadst, 0
-INV_TXFM_4X4_FN adst,     flipadst, 0
+INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
 
-cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
+cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
-    mova                 m2, [qw_5793x4]
+    mova                 m2, [o(pw_5793x4)]
     paddw                m0, m0
     paddw                m1, m1
     pmulhrsw             m0, m2
     pmulhrsw             m1, m2
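+    ; 2*x * 5793*4/32768 = x*sqrt(2), the 4-point identity transform scale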
 
     punpckhwd            m2, m0, m1
     punpcklwd            m0, m1
     punpckhwd            m1, m0, m2            ;high: in3 ;low: in2
     punpcklwd            m0, m2                ;high: in1 ;low: in0
     jmp                tx2q
 
 .pass2:
-    mova                 m2, [qw_5793x4]
+    mova                 m2, [o(pw_5793x4)]
     paddw                m0, m0
     paddw                m1, m1
     pmulhrsw             m0, m2
     pmulhrsw             m1, m2
     jmp m(iadst_4x4_internal).end
 
-INV_TXFM_4X4_FN identity, identity
-INV_TXFM_4X4_FN identity, dct,      3
-INV_TXFM_4X4_FN identity, adst
-INV_TXFM_4X4_FN identity, flipadst
-INV_TXFM_4X4_FN dct,      identity, 3
-INV_TXFM_4X4_FN adst,     identity
-INV_TXFM_4X4_FN flipadst, identity
-
 %macro IWHT4_1D_PACKED 0
     punpckhqdq           m3, m0, m1            ;low: in1 high: in3
     punpcklqdq           m0, m1                ;low: in0 high: in2
     psubw                m2, m0, m3            ;low: in0 - in1 high: in2 - in3
     paddw                m0, m3                ;low: in0 + in1 high: in2 + in3
     punpckhqdq           m2, m2                ;t2 t2
     punpcklqdq           m0, m0                ;t0 t0
     psubw                m1, m0, m2
@@ -386,9 +436,1043 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 
     punpckhwd            m0, m1
     punpcklwd            m3, m1, m2
     punpckhdq            m1, m0, m3
     punpckldq            m0, m3
 
     IWHT4_1D_PACKED
 
     shufpd               m0, m2, 0x01
-    ITX4_END              0, 3, 2, 1, 0
+    ITX4_END              0, 3, 2, 1, 0
+
+
+%macro IDCT8_1D_PACKED 0
+    mova                 m6, [o(pd_2048)]
+    punpckhwd            m5, m0, m3                 ;unpacked in1 in7
+    punpckhwd            m4, m2, m1                 ;unpacked in5 in3
+    punpcklwd            m1, m3                     ;unpacked in2 in6
+    psubw                m3, m0, m2
+    paddw                m0, m2
+    punpcklqdq           m0, m3                     ;low: in0+in4 high: in0-in4
+    ITX_MUL2X_PACK        5, 2, 6,  799, 4017, 1    ;low: t4a high: t7a
+    ITX_MUL2X_PACK        4, 2, 6, 3406, 2276, 1    ;low: t5a high: t6a
+    ITX_MUL2X_PACK        1, 2, 6, 1567, 3784       ;low: t3  high: t2
+    mova                 m6, [o(pw_2896x8)]
+    psubsw               m2, m5, m4                 ;low: t5a high: t6a
+    paddsw               m5, m4                     ;low: t4  high: t7
+    punpckhqdq           m4, m2, m2                 ;low: t6a high: t6a
+    psubw                m3, m4, m2                 ;low: t6a - t5a
+    paddw                m4, m2                     ;low: t6a + t5a
+    punpcklqdq           m4, m3                     ;low: t6a + t5a high: t6a - t5a
+    pmulhrsw             m0, m6                     ;low: t0   high: t1
+    pmulhrsw             m4, m6                     ;low: t6   high: t5
+    shufps               m2, m5, m4, q1032          ;low: t7   high: t6
+    shufps               m5, m4, q3210              ;low: t4   high: t5
+    psubsw               m4, m0, m1                 ;low: tmp3 high: tmp2
+    paddsw               m0, m1                     ;low: tmp0 high: tmp1
+    psubsw               m3, m0, m2                 ;low: out7 high: out6
+    paddsw               m0, m2                     ;low: out0 high: out1
+    psubsw               m2, m4, m5                 ;low: out4 high: out5
+    paddsw               m1, m4, m5                 ;low: out3 high: out2
+%endmacro
+
+;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
+    punpckhwd           m%3, m%1, m%2
+    punpcklwd           m%1, m%2
+%if %7 < 8
+    pmaddwd             m%2, m%7, m%1
+    pmaddwd             m%4, m%7, m%3
+%else
+    mova                m%2, [o(pw_%7_%6)]
+    pmaddwd             m%4, m%3, m%2
+    pmaddwd             m%2, m%1
+%endif
+    paddd               m%4, m%5
+    paddd               m%2, m%5
+    psrad               m%4, 12
+    psrad               m%2, 12
+    packssdw            m%2, m%4                 ;dst2
+%if %7 < 8
+    pmaddwd             m%3, m%6
+    pmaddwd             m%1, m%6
+%else
+    mova                m%4, [o(pw_%6_m%7)]
+    pmaddwd             m%3, m%4
+    pmaddwd             m%1, m%4
+%endif
+    paddd               m%3, m%5
+    paddd               m%1, m%5
+    psrad               m%3, 12
+    psrad               m%1, 12
+    packssdw            m%1, m%3                 ;dst1
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784   ;t2, t3
+    mova                m%6, [o(pw_2896x8)]
+    paddw               m%5, m%1, m%3
+    psubw               m%1, m%3
+    pmulhrsw            m%1, m%6                          ;t1
+    pmulhrsw            m%5, m%6                          ;t0
+    psubsw              m%3, m%1, m%2                     ;out2
+    paddsw              m%2, m%1                          ;out1
+    paddsw              m%1, m%5, m%4                     ;out0
+    psubsw              m%5, m%4                          ;out3
+    mova                m%4, m%5
+%endmacro
+
+%macro IADST4_1D 0
+    mova                 m4, m2
+    psubw                m2, m0, m4
+    paddw                m2, m3                        ;low: in0 - in2 + in3
+
+    punpckhwd            m6, m0, m4                    ;unpacked in0 in2
+    punpckhwd            m7, m1, m3                    ;unpacked in1 in3
+    punpcklwd            m0, m4                        ;unpacked in0 in2
+    punpcklwd            m1, m3                        ;unpacked in1 in3
+
+    pmaddwd              m4, m0, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m0, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m3, m1, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m1, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m3, m4                        ;t0 + t3
+
+    pmaddwd              m1, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+    pmulhrsw             m2, [o(pw_3344x8)]            ;out2
+    mova                 m4, [o(pd_2048)]
+    paddd                m0, m4
+    paddd                m4, m3                        ;t0 + t3 + 2048
+    paddd                m5, m0                        ;t1 + t3 + 2048
+    paddd                m3, m0
+    paddd                m3, m1                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m3, 12                        ;out3
+    packssdw             m0, m4, m5                    ;low: out0  high: out1
+
+    pmaddwd              m4, m6, [o(pw_1321_3803)]     ;1321 * in0 + 3803 * in2
+    pmaddwd              m6, [o(pw_2482_m1321)]        ;2482 * in0 - 1321 * in2
+    pmaddwd              m1, m7, [o(pw_3344_2482)]     ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m7, [o(pw_3344_m3803)]    ;3344 * in1 - 3803 * in3
+    paddd                m1, m4                        ;t0 + t3
+    pmaddwd              m7, [o(pw_m6688_m3803)]       ;-2 * 3344 * in1 - 3803 * in3
+
+    mova                 m4, [o(pd_2048)]
+    paddd                m6, m4
+    paddd                m4, m1                        ;t0 + t3 + 2048
+    paddd                m5, m6                        ;t1 + t3 + 2048
+    paddd                m1, m6
+    paddd                m1, m7                        ;t0 + t1 - t3 + 2048
+
+    psrad                m4, 12                        ;out0
+    psrad                m5, 12                        ;out1
+    psrad                m1, 12                        ;out3
+    packssdw             m3, m1                        ;out3
+    packssdw             m4, m5                        ;low: out0  high: out1
+
+    punpckhqdq           m1, m0, m4                    ;out1
+    punpcklqdq           m0, m4                        ;out0
+%endmacro
+
+%macro IADST8_1D_PACKED 0
+    mova                 m6, [o(pd_2048)]
+    punpckhwd            m4, m3, m0                ;unpacked in7 in0
+    punpckhwd            m5, m2, m1                ;unpacked in5 in2
+    punpcklwd            m1, m2                    ;unpacked in3 in4
+    punpcklwd            m0, m3                    ;unpacked in1 in6
+    ITX_MUL2X_PACK        4, 2, 6,  401, 4076      ;low:  t0a   high:  t1a
+    ITX_MUL2X_PACK        5, 2, 6, 1931, 3612      ;low:  t2a   high:  t3a
+    ITX_MUL2X_PACK        1, 2, 6, 3166, 2598      ;low:  t4a   high:  t5a
+    ITX_MUL2X_PACK        0, 2, 6, 3920, 1189      ;low:  t6a   high:  t7a
+
+    psubsw               m3, m4, m1                ;low:  t4    high:  t5
+    paddsw               m4, m1                    ;low:  t0    high:  t1
+    psubsw               m2, m5, m0                ;low:  t6    high:  t7
+    paddsw               m5, m0                    ;low:  t2    high:  t3
+
+    shufps               m1, m3, m2, q1032
+    punpckhwd            m2, m1
+    punpcklwd            m3, m1
+    ITX_MUL2X_PACK        3, 0, 6, 1567, 3784, 1   ;low:  t5a   high:  t4a
+    ITX_MUL2X_PACK        2, 0, 6, 3784, 1567      ;low:  t7a   high:  t6a
+
+    psubsw               m1, m4, m5                ;low:  t2    high:  t3
+    paddsw               m4, m5                    ;low:  out0  high: -out7
+    psubsw               m5, m3, m2                ;low:  t7    high:  t6
+    paddsw               m3, m2                    ;low:  out6  high: -out1
+    shufps               m0, m4, m3, q3210         ;low:  out0  high: -out1
+    shufps               m3, m4, q3210             ;low:  out6  high: -out7
+
+    shufps               m4, m1, m5, q1032         ;low:  t3    high:  t7
+    shufps               m1, m5, q3210             ;low:  t2    high:  t6
+    mova                 m5, [o(pw_2896x8)]
+    psubw                m2, m1, m4                ;low:  t2-t3 high:  t6-t7
+    paddw                m1, m4                    ;low:  t2+t3 high:  t6+t7
+    pmulhrsw             m2, m5                    ;low:  out4  high: -out5
+    shufps               m1, m1, q1032
+    pmulhrsw             m1, m5                    ;low:  out2  high: -out3
+%endmacro
+
+%macro WRITE_4X8 4 ;row[1-4]
+    WRITE_4X4             0, 1, 4, 5, 6, %1, %2, %3, %4
+    lea                dstq, [dstq+strideq*4]
+    WRITE_4X4             2, 3, 4, 5, 6, %1, %2, %3, %4
+%endmacro
+
+%macro INV_4X8 0
+    punpckhwd            m4, m2, m3
+    punpcklwd            m2, m3
+    punpckhwd            m3, m0, m1
+    punpcklwd            m0, m1
+    punpckhdq            m1, m0, m2                  ;low: in2 high: in3
+    punpckldq            m0, m2                      ;low: in0 high: in1
+    punpckldq            m2, m3, m4                  ;low: in4 high: in5
+    punpckhdq            m3, m4                      ;low: in6 high: in7
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 4x8, 8
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1, [coeffq]
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, [o(pw_4096)]
+    punpckhwd            m2, m0, m0
+    punpcklwd            m0, m0
+    punpckhdq            m1, m0, m0
+    punpckldq            m0, m0
+    punpckhdq            m3, m2, m2
+    punpckldq            m2, m2
+    TAIL_CALL m(iadst_4x8_internal).end3
+%elifidn %1_%2, identity_dct
+    movd                 m0, [coeffq+16*0]
+    punpcklwd            m0, [coeffq+16*1]
+    movd                 m1, [coeffq+16*2]
+    punpcklwd            m1, [coeffq+16*3]
+    mova                 m2, [o(pw_2896x8)]
+    punpckldq            m0, m1
+    pmulhrsw             m0, m2
+    paddw                m0, m0
+    pmulhrsw             m0, [o(pw_5793x4)]
+    pmulhrsw             m0, m2
+    pmulhrsw             m0, [o(pw_2048)]
+    punpcklqdq           m0, m0
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    TAIL_CALL m(iadst_4x8_internal).end3
+%elifidn %1_%2, dct_dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklqdq           m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mov           [coeffq], eobd
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, [o(pw_2048)]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    TAIL_CALL m(iadst_4x8_internal).end4
+%else ; adst_dct / flipadst_dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklqdq           m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+%ifidn %1, adst
+    pmulhrsw             m0, [o(iadst4_dconly1a)]
+%else ; flipadst
+    pmulhrsw             m0, [o(iadst4_dconly1b)]
+%endif
+    mov            [coeffq], eobd
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, [o(pw_2048)]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+    TAIL_CALL m(iadst_4x8_internal).end4
+%endif
+%endif
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct,      0
+INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(idct_8x4_internal).main
+    call m(iadst_4x8_internal).inversion
+    jmp                tx2q
+
+.pass2:
+    call .main
+    shufps               m1, m1, q1032
+    shufps               m3, m3, q1032
+    mova                 m4, [o(pw_2048)]
+    jmp m(iadst_4x8_internal).end2
+
+ALIGN function_align
+.main:
+    IDCT8_1D_PACKED
+    ret
+
+
+INV_TXFM_4X8_FN adst, dct,      0
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(iadst_8x4_internal).main
+    call .inversion
+    jmp                tx2q
+
+.pass2:
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call .main
+    mova                 m4, [o(pw_2048)]
+    pxor                 m5, m5
+    psubw                m5, m4
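+    ; iadst outputs alternate sign (out0, -out1, ...), so +2048 and -2048 are
+    ; paired per qword: one pmulhrsw rounds and restores the signs at once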
+
+.end:
+    punpcklqdq           m4, m5
+
+.end2:
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+
+.end3:
+    pxor                 m5, m5
+    mova      [coeffq+16*0], m5
+    mova      [coeffq+16*1], m5
+    mova      [coeffq+16*2], m5
+    mova      [coeffq+16*3], m5
+
+.end4:
+    WRITE_4X8             0, 1, 2, 3
+    RET
+
+ALIGN function_align
+.main:
+    IADST8_1D_PACKED
+    ret
+
+ALIGN function_align
+.inversion:
+    INV_4X8
+    ret
+
+INV_TXFM_4X8_FN flipadst, dct,      0
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(iadst_8x4_internal).main
+
+    punpcklwd            m4, m3, m2
+    punpckhwd            m3, m2
+    punpcklwd            m5, m1, m0
+    punpckhwd            m1, m0
+    punpckldq            m2, m3, m1                  ;low: in4 high: in5
+    punpckhdq            m3, m1                      ;low: in6 high: in7
+    punpckldq            m0, m4, m5                  ;low: in0 high: in1
+    punpckhdq            m1, m4, m5                  ;low: in2 high: in3
+    jmp                tx2q
+
+.pass2:
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main
+
+    mova                 m4, m0
+    mova                 m5, m1
+    pshufd               m0, m3, q1032
+    pshufd               m1, m2, q1032
+    pshufd               m2, m5, q1032
+    pshufd               m3, m4, q1032
+    mova                 m5, [o(pw_2048)]
+    pxor                 m4, m4
+    psubw                m4, m5
+    jmp m(iadst_4x8_internal).end
+
+INV_TXFM_4X8_FN identity, dct,      3
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    mova                 m5, [o(pw_5793x4)]
+    paddw                m0, m0
+    paddw                m1, m1
+    paddw                m2, m2
+    paddw                m3, m3
+    pmulhrsw             m0, m5
+    pmulhrsw             m1, m5
+    pmulhrsw             m2, m5
+    pmulhrsw             m3, m5
+
+    call m(iadst_4x8_internal).inversion
+    jmp                tx2q
+
+.pass2:
+    mova                 m4, [o(pw_4096)]
+    jmp m(iadst_4x8_internal).end2
+
+
+%macro WRITE_8X2 5       ;coefs[1-2], tmp[1-3]
+    movq                 m%3, [dstq        ]
+    movq                 m%4, [dstq+strideq]
+    pxor                 m%5, m%5
+    punpcklbw            m%3, m%5                 ;extend byte to word
+    punpcklbw            m%4, m%5                 ;extend byte to word
+%ifnum %1
+    paddw                m%3, m%1
+%else
+    paddw                m%3, %1
+%endif
+%ifnum %2
+    paddw                m%4, m%2
+%else
+    paddw                m%4, %2
+%endif
+    packuswb             m%3, m%4
+    movq      [dstq        ], m%3
+    punpckhqdq           m%3, m%3
+    movq      [dstq+strideq], m%3
+%endmacro
+
+%macro WRITE_8X4 7      ;coefs[1-4], tmp[1-3]
+    WRITE_8X2             %1, %2, %5, %6, %7
+    lea                dstq, [dstq+strideq*2]
+    WRITE_8X2             %3, %4, %5, %6, %7
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 8x4, 8
+%if %3 >= 0
+%ifidn %1_%2, dct_identity
+    mova                 m0, [o(pw_2896x8)]
+    pmulhrsw             m1, m0, [coeffq]
+    pmulhrsw             m1, m0
+    paddw                m1, m1
+    pmulhrsw             m1, [o(pw_5793x4)]
+    pmulhrsw             m1, [o(pw_2048)]
+    punpcklwd            m1, m1
+    punpckhdq            m2, m1, m1
+    punpckldq            m1, m1
+    punpckhdq            m3, m2, m2
+    punpckldq            m2, m2
+    punpckldq            m0, m1, m1
+    punpckhdq            m1, m1
+%elifidn %1_%2, identity_dct
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m5, m2, m3
+    punpcklwd            m2, m3
+    punpcklwd            m0, m4
+    punpcklwd            m2, m5
+    punpcklqdq           m0, m2
+    mova                 m4, [o(pw_2896x8)]
+    pmulhrsw             m0, m4
+    paddw                m0, m0
+    pmulhrsw             m0, m4
+    pmulhrsw             m0, [o(pw_2048)]
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+%else
+    pshuflw              m0, [coeffq], q0000
+    punpcklqdq           m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m1
+%ifidn %2, dct
+    mova                 m2, [o(pw_2048)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+    mova                 m1, m0
+    mova                 m2, m0
+    mova                 m3, m0
+%else ; adst / flipadst
+    pmulhrsw             m2, m0, [o(iadst4_dconly2b)]
+    pmulhrsw             m0, [o(iadst4_dconly2a)]
+    mova                 m1, [o(pw_2048)]
+    pmulhrsw             m0, m1
+    pmulhrsw             m2, m1
+%ifidn %2, adst
+    punpckhqdq           m1, m0, m0
+    punpcklqdq           m0, m0
+    punpckhqdq           m3, m2, m2
+    punpcklqdq           m2, m2
+%else ; flipadst
+    mova                 m3, m0
+    punpckhqdq           m0, m2, m2
+    punpcklqdq           m1, m2, m2
+    punpckhqdq           m2, m3, m3
+    punpcklqdq           m3, m3
+%endif
+%endif
+%endif
+    TAIL_CALL m(iadst_8x4_internal).end2
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct,      0
+INV_TXFM_8X4_FN dct, adst,     0
+INV_TXFM_8X4_FN dct, flipadst, 0
+INV_TXFM_8X4_FN dct, identity, 3
+
+cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    call m(idct_4x8_internal).main
+
+    mova                 m4, [o(deint_shuf1)]
+    mova                 m5, [o(deint_shuf2)]
+    pshufb               m0, m4
+    pshufb               m1, m5
+    pshufb               m2, m4
+    pshufb               m3, m5
+    punpckhdq            m4, m0, m1
+    punpckldq            m0, m1
+    punpckhdq            m5, m2, m3
+    punpckldq            m2, m3
+    punpckhqdq           m1, m0, m2                      ;in1
+    punpcklqdq           m0, m2                          ;in0
+    punpckhqdq           m3, m4, m5                      ;in3
+    punpcklqdq           m2, m4, m5                      ;in2
+    jmp                tx2q
+
+.pass2:
+    call .main
+    jmp m(iadst_8x4_internal).end
+
+ALIGN function_align
+.main:
+    mova                 m6, [o(pd_2048)]
+    IDCT4_1D             0, 1, 2, 3, 4, 5, 6
+    ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main
+
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    pxor                 m5, m5
+    psubw                m3, m5, m1
+    psubw                m5, m4
+    punpckhdq            m4, m5, m3
+    punpckldq            m5, m3
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckhwd            m1, m0, m5      ;in1
+    punpcklwd            m0, m5          ;in0
+    punpcklwd            m2, m3, m4      ;in2
+    punpckhwd            m3, m4          ;in3
+    jmp              tx2q
+
+.pass2:
+    call .main
+
+.end:
+    mova                 m4, [o(pw_2048)]
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+
+.end2:
+    pxor                 m6, m6
+    mova      [coeffq+16*0], m6
+    mova      [coeffq+16*1], m6
+    mova      [coeffq+16*2], m6
+    mova      [coeffq+16*3], m6
+.end3:
+    WRITE_8X4             0, 1, 2, 3, 4, 5, 6
+    RET
+
+ALIGN function_align
+.main:
+    IADST4_1D
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+
+    shufps               m0, m0, q1032
+    shufps               m1, m1, q1032
+    call m(iadst_4x8_internal).main
+
+    punpckhwd            m5, m3, m2
+    punpcklwd            m3, m2
+    punpckhwd            m2, m1, m0
+    punpcklwd            m1, m0
+
+    pxor                 m0, m0
+    psubw                m4, m0, m2
+    psubw                m0, m5
+    punpckhdq            m2, m0, m4
+    punpckldq            m0, m4
+    punpckhdq            m4, m3, m1
+    punpckldq            m3, m1
+    punpckhwd            m1, m0, m3      ;in1
+    punpcklwd            m0, m3          ;in0
+    punpckhwd            m3, m2, m4      ;in3
+    punpcklwd            m2, m4          ;in2
+    jmp                  tx2q
+
+.pass2:
+    call m(iadst_8x4_internal).main
+    mova                 m4, m0
+    mova                 m5, m1
+    mova                 m0, m3
+    mova                 m1, m2
+    mova                 m2, m5
+    mova                 m3, m4
+    jmp m(iadst_8x4_internal).end
+
+INV_TXFM_8X4_FN identity, dct,      7
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m3, [o(pw_2896x8)]
+    pmulhrsw             m0, m3, [coeffq+16*0]
+    pmulhrsw             m1, m3, [coeffq+16*1]
+    pmulhrsw             m2, m3, [coeffq+16*2]
+    pmulhrsw             m3,     [coeffq+16*3]
+    paddw                m0, m0
+    paddw                m1, m1
+    paddw                m2, m2
+    paddw                m3, m3
+
+    punpckhwd            m4, m0, m1
+    punpcklwd            m0, m1
+    punpckhwd            m1, m2, m3
+    punpcklwd            m2, m3
+    punpckhdq            m5, m4, m1
+    punpckldq            m4, m1
+    punpckhdq            m3, m0, m2
+    punpckldq            m0, m2
+    punpckhwd            m1, m0, m4      ;in1
+    punpcklwd            m0, m4          ;in0
+    punpcklwd            m2, m3, m5      ;in2
+    punpckhwd            m3, m5          ;in3
+    jmp                tx2q
+
+.pass2:
+    mova                 m4, [o(pw_5793x4)]
+    paddw                m0, m0
+    paddw                m1, m1
+    paddw                m2, m2
+    paddw                m3, m3
+    pmulhrsw             m0, m4
+    pmulhrsw             m1, m4
+    pmulhrsw             m2, m4
+    pmulhrsw             m3, m4
+    jmp m(iadst_8x4_internal).end
+
+%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
+    INV_TXFM_FN          %1, %2, %3, 8x8, 8
+%ifidn %1_%2, dct_identity
+    mova                 m0, [o(pw_2896x8)]
+    pmulhrsw             m0, [coeffq]
+    mova                 m1, [o(pw_16384)]
+    pmulhrsw             m0, m1
+    psrlw                m1, 2
+    pmulhrsw             m0, m1
+    punpckhwd            m7, m0, m0
+    punpcklwd            m0, m0
+    pshufd               m3, m0, q3333
+    pshufd               m2, m0, q2222
+    pshufd               m1, m0, q1111
+    pshufd               m0, m0, q0000
+    call m(iadst_8x4_internal).end2
+    pshufd               m3, m7, q3333
+    pshufd               m2, m7, q2222
+    pshufd               m1, m7, q1111
+    pshufd               m0, m7, q0000
+    lea                dstq, [dstq+strideq*2]
+    TAIL_CALL m(iadst_8x4_internal).end3
+%elif %3 >= 0
+%ifidn %1, dct
+    pshuflw              m0, [coeffq], q0000
+    punpcklwd            m0, m0
+    mova                 m1, [o(pw_2896x8)]
+    pmulhrsw             m0, m1
+    mova                 m2, [o(pw_16384)]
+    mov            [coeffq], eobd
+    pmulhrsw             m0, m2
+    psrlw                m2, 3
+    pmulhrsw             m0, m1
+    pmulhrsw             m0, m2
+.end:
+    mov                 r2d, 2
+.end2:
+    lea                  r3, [strideq*3]
+.loop:
+    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
+    lea                dstq, [dstq+strideq*2]
+    dec                 r2d
+    jg .loop
+    RET
+%else ; identity
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    punpcklwd            m0, [coeffq+16*4]
+    punpcklwd            m1, [coeffq+16*5]
+    punpcklwd            m2, [coeffq+16*6]
+    punpcklwd            m3, [coeffq+16*7]
+    punpcklwd            m0, m2
+    punpcklwd            m1, m3
+    punpcklwd            m0, m1
+    pmulhrsw             m0, [o(pw_2896x8)]
+    pmulhrsw             m0, [o(pw_2048)]
+    pxor                 m4, m4
+    REPX {mova [coeffq+16*x], m4}, 0,  1,  2,  3,  4,  5,  6,  7
+    jmp m(inv_txfm_add_dct_dct_8x8).end
+%endif
+%endif
+%endmacro
+
+%macro ITX_8X8_LOAD_COEFS 0
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    mova                 m4, [coeffq+16*4]
+    mova                 m5, [coeffq+16*5]
+    mova                 m6, [coeffq+16*6]
+%endmacro
+
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
+    ITX_MULSUB_2W        %1, %4, %5, %6, %7,  799, 4017   ;t4a, t7a
+    ITX_MULSUB_2W        %3, %2, %5, %6, %7, 3406, 2276   ;t5a, t6a
+    psubsw               m%5, m%1, m%3                    ;t5a
+    paddsw               m%1, m%3                         ;t4
+    psubsw               m%6, m%4, m%2                    ;t6a
+    paddsw               m%4, m%2                         ;t7
+    mova                 m%3, [o(pw_2896x8)]
+    psubw                m%2, m%6, m%5                    ;t6a - t5a
+    paddw                m%6, m%5                         ;t6a + t5a
+    pmulhrsw             m%2, m%3                         ;t5
+    pmulhrsw             m%3, m%6                         ;t6
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct,      0
+INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_8X8_LOAD_COEFS
+    call .main
+
+.pass1_end:
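+    ; pw_16384 halves the row-pass results ((x + 1) >> 1), the intermediate
+    ; downshift between the two 8x8 passes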
+    mova                  m7, [o(pw_16384)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6
+
+.pass1_end2:
+    REPX    {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw              m7, [coeffq+16*7]
+
+.pass1_end3:
+    punpcklwd             m6, m1, m5             ;10 50 11 51 12 52 13 53
+    punpckhwd             m1, m5                 ;14 54 15 55 16 56 17 57
+    punpckhwd             m5, m0, m4             ;04 44 05 45 06 46 07 47
+    punpcklwd             m0, m4                 ;00 40 01 41 02 42 03 43
+    punpckhwd             m4, m3, m7             ;34 74 35 75 36 76 37 77
+    punpcklwd             m3, m7                 ;30 70 31 71 32 72 33 73
+    punpckhwd             m7, m1, m4             ;16 36 56 76 17 37 57 77
+    punpcklwd             m1, m4                 ;14 34 54 74 15 35 55 75
+    punpckhwd             m4, m6, m3             ;12 32 52 72 13 33 53 73
+    punpcklwd             m6, m3                 ;10 30 50 70 11 31 51 71
+    mova       [coeffq+16*5], m6
+    mova                  m6, [coeffq+16*6]
+    punpckhwd             m3, m2, m6             ;24 64 25 65 26 66 27 67
+    punpcklwd             m2, m6                 ;20 60 21 61 22 62 23 63
+    punpckhwd             m6, m5, m3             ;06 26 46 66 07 27 47 67
+    punpcklwd             m5, m3                 ;04 24 44 64 05 25 45 65
+    punpckhwd             m3, m0, m2             ;02 22 42 62 03 23 43 63
+    punpcklwd             m0, m2                 ;00 20 40 60 01 21 41 61
+
+    punpckhwd             m2, m6, m7             ;07 17 27 37 47 57 67 77
+    punpcklwd             m6, m7                 ;06 16 26 36 46 56 66 76
+    mova       [coeffq+16*7], m2
+    punpcklwd             m2, m3, m4             ;02 12 22 32 42 52 62 72
+    punpckhwd             m3, m4                 ;03 13 23 33 43 53 63 73
+    punpcklwd             m4, m5, m1             ;04 14 24 34 44 54 64 74
+    punpckhwd             m5, m1                 ;05 15 25 35 45 55 65 75
+    mova                  m7, [coeffq+16*5]
+    punpckhwd             m1, m0, m7             ;01 11 21 31 41 51 61 71
+    punpcklwd             m0, m7                 ;00 10 20 30 40 50 60 70
+    jmp                tx2q
+
+.pass2:
+    call .main
+
+.end:
+    mova                  m7, [o(pw_2048)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6
+
+.end2:
+    REPX    {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw              m7, [coeffq+16*7]
+    mova       [coeffq+16*5], m5
+    mova       [coeffq+16*7], m7
+
+.end3:
+    WRITE_8X4             0, 1, 2, 3, 5, 6, 7
+    lea                dstq, [dstq+strideq*2]
+    WRITE_8X4             4, [coeffq+16*5], [coeffq+16*6], [coeffq+16*7], 5, 6, 7
+
+    pxor                 m7, m7
+    REPX {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
+    ret
+
+ALIGN function_align
+.main:
+    mova       [coeffq+16*6], m3
+    mova       [coeffq+16*5], m1
+    mova                  m7, [o(pd_2048)]
+    IDCT4_1D               0, 2, 4, 6, 1, 3, 7
+    mova                  m3, [coeffq+16*5]
+    mova       [coeffq+16*5], m2
+    mova                  m2, [coeffq+16*6]
+    mova       [coeffq+16*6], m4
+    mova                  m4, [coeffq+16*7]
+    mova       [coeffq+16*7], m6
+    IDCT8_1D_ODDHALF       3, 2, 5, 4, 1, 6, 7
+    mova                  m6, [coeffq+16*7]
+    psubsw                m7, m0, m4                    ;out7
+    paddsw                m0, m4                        ;out0
+    mova       [coeffq+16*7], m7
+    mova                  m1, [coeffq+16*5]
+    psubsw                m4, m6, m3                    ;out4
+    paddsw                m3, m6                        ;out3
+    mova                  m7, [coeffq+16*6]
+    psubsw                m6, m1, m5                    ;out6
+    paddsw                m1, m5                        ;out1
+    psubsw                m5, m7, m2                    ;out5
+    paddsw                m2, m7                        ;out2
+    ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_8X8_LOAD_COEFS
+    call .main
+    mova                  m7, [o(pw_16384)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6
+    pxor                  m6, m6
+    psubw                 m6, m7
+    mova                  m7, m6
+    jmp m(idct_8x8_internal).pass1_end2
+
+ALIGN function_align
+.pass2:
+    call .main
+    mova                  m7, [o(pw_2048)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*6], m6
+    pxor                  m6, m6
+    psubw                 m6, m7
+    mova                  m7, m6
+    jmp m(idct_8x8_internal).end2
+
+ALIGN function_align
+.main:
+    mova       [coeffq+16*6], m3
+    mova       [coeffq+16*5], m4
+    mova                  m7, [o(pd_2048)]
+    ITX_MULSUB_2W          5, 2, 3, 4, 7, 1931, 3612    ;t3a, t2a
+    ITX_MULSUB_2W          1, 6, 3, 4, 7, 3920, 1189    ;t7a, t6a
+    paddsw                m3, m2, m6                    ;t2
+    psubsw                m2, m6                        ;t6
+    paddsw                m4, m5, m1                    ;t3
+    psubsw                m5, m1                        ;t7
+    ITX_MULSUB_2W          5, 2, 1, 6, 7, 3784, 1567    ;t6a, t7a
+
+    mova                  m6, [coeffq+16*5]
+    mova       [coeffq+16*5], m5
+    mova                  m1, [coeffq+16*6]
+    mova       [coeffq+16*6], m2
+    mova                  m5, [coeffq+16*7]
+    mova       [coeffq+16*7], m3
+    ITX_MULSUB_2W          5, 0, 2, 3, 7,  401, 4076    ;t1a, t0a
+    ITX_MULSUB_2W          1, 6, 2, 3, 7, 3166, 2598    ;t5a, t4a
+    psubsw                m2, m0, m6                    ;t4
+    paddsw                m0, m6                        ;t0
+    paddsw                m3, m5, m1                    ;t1
+    psubsw                m5, m1                        ;t5
+    ITX_MULSUB_2W          2, 5, 1, 6, 7, 1567, 3784    ;t5a, t4a
+
+    mova                  m7, [coeffq+16*7]
+    paddsw                m1, m3, m4                    ;-out7
+    psubsw                m3, m4                        ;t3
+    mova       [coeffq+16*7], m1
+    psubsw                m4, m0, m7                    ;t2
+    paddsw                m0, m7                        ;out0
+    mova                  m6, [coeffq+16*5]
+    mova                  m7, [coeffq+16*6]
+    paddsw                m1, m5, m6                    ;-out1
+    psubsw                m5, m6                        ;t6
+    paddsw                m6, m2, m7                    ;out6
+    psubsw                m2, m7                        ;t7
+    paddw                 m7, m4, m3                    ;t2 + t3
+    psubw                 m4, m3                        ;t2 - t3
+    paddw                 m3, m5, m2                    ;t6 + t7
+    psubw                 m5, m2                        ;t6 - t7
+    mova                  m2, [o(pw_2896x8)]
+    pmulhrsw              m4, m2                        ;out4
+    pmulhrsw              m5, m2                        ;-out5
+    pmulhrsw              m7, m2                        ;-out3
+    pmulhrsw              m2, m3                        ;out2
+    mova                  m3, m7
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    ITX_8X8_LOAD_COEFS
+    call m(iadst_8x8_internal).main
+    mova                  m7, [o(pw_m16384)]
+    pmulhrsw              m1, m7
+    mova       [coeffq+16*6], m1
+    mova                  m1, m6
+    mova                  m6, m2
+    pmulhrsw              m2, m5, m7
+    mova                  m5, m6
+    mova                  m6, m4
+    pmulhrsw              m4, m3, m7
+    mova                  m3, m6
+    mova                  m6, m0
+    mova                  m0, m7
+    pxor                  m7, m7
+    psubw                 m7, m0
+    pmulhrsw              m0, [coeffq+16*7]
+    REPX    {pmulhrsw x, m7}, m1, m3, m5
+    pmulhrsw              m7, m6
+    jmp m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+    call m(iadst_8x8_internal).main
+    mova                  m7, [o(pw_2048)]
+    REPX    {pmulhrsw x, m7}, m0, m2, m4, m6
+    mova       [coeffq+16*5], m2
+    mova                  m2, m0
+    pxor                  m0, m0
+    psubw                 m0, m7
+    mova                  m7, m2
+    pmulhrsw              m1, m0
+    pmulhrsw              m2, m5, m0
+    mova       [coeffq+16*6], m1
+    mova                  m5, m4
+    mova                  m1, m6
+    pmulhrsw              m4, m3, m0
+    pmulhrsw              m0, [coeffq+16*7]
+    mova                  m3, m5
+    mova       [coeffq+16*7], m7
+    jmp m(idct_8x8_internal).end3
+
+INV_TXFM_8X8_FN identity, dct,      7
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
+    mova                 m0, [coeffq+16*0]
+    mova                 m1, [coeffq+16*1]
+    mova                 m2, [coeffq+16*2]
+    mova                 m3, [coeffq+16*3]
+    mova                 m4, [coeffq+16*4]
+    mova                 m5, [coeffq+16*5]
+    mova                 m7, [coeffq+16*7]
+    jmp m(idct_8x8_internal).pass1_end3
+
+ALIGN function_align
+.pass2:
+    mova                  m7, [o(pw_4096)]
+    REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulhrsw              m7, [coeffq+16*7]
+    mova       [coeffq+16*5], m5
+    mova       [coeffq+16*6], m6
+    mova       [coeffq+16*7], m7
+    jmp m(idct_8x8_internal).end3
--- a/third_party/dav1d/src/x86/mc_init_tmpl.c
+++ b/third_party/dav1d/src/x86/mc_init_tmpl.c
@@ -33,16 +33,17 @@ decl_mc_fn(dav1d_put_8tap_regular_smooth
 decl_mc_fn(dav1d_put_8tap_regular_sharp_avx2);
 decl_mc_fn(dav1d_put_8tap_smooth_avx2);
 decl_mc_fn(dav1d_put_8tap_smooth_regular_avx2);
 decl_mc_fn(dav1d_put_8tap_smooth_sharp_avx2);
 decl_mc_fn(dav1d_put_8tap_sharp_avx2);
 decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
 decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
 decl_mc_fn(dav1d_put_bilin_avx2);
+decl_mc_fn(dav1d_put_bilin_ssse3);
 
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
@@ -64,36 +65,40 @@ decl_blend_dir_fn(dav1d_blend_v_avx2);
 decl_blend_dir_fn(dav1d_blend_v_ssse3);
 decl_blend_dir_fn(dav1d_blend_h_avx2);
 decl_blend_dir_fn(dav1d_blend_h_ssse3);
 
 decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
 decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
 
 decl_emu_edge_fn(dav1d_emu_edge_avx2);
+decl_emu_edge_fn(dav1d_emu_edge_ssse3);
 
 void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = dav1d_put_##name##_##suffix
 #define init_mct_fn(type, name, suffix) \
     c->mct[type] = dav1d_prep_##name##_##suffix
     const unsigned flags = dav1d_get_cpu_flags();
 
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
         return;
 
 #if BITDEPTH == 8
+    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               ssse3);
+
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
     c->mask = dav1d_mask_ssse3;
     c->w_mask[2] = dav1d_w_mask_420_ssse3;
     c->blend = dav1d_blend_ssse3;
     c->blend_v = dav1d_blend_v_ssse3;
     c->blend_h = dav1d_blend_h_ssse3;
+    c->emu_edge = dav1d_emu_edge_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
         return;
 
 #if BITDEPTH == 8 && ARCH_X86_64
     init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
     init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
--- a/third_party/dav1d/src/x86/mc_ssse3.asm
+++ b/third_party/dav1d/src/x86/mc_ssse3.asm
@@ -40,17 +40,20 @@ obmc_masks: db  0,  0,  0,  0
             ; 16 @32
             db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
             db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
             ; 32 @64
             db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
             db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
             db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
             db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
-blend_shuf: db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
+
+bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
+bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
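+; bilin_h_shuf pairs src[x+1] with src[x] so that pmaddubsw with the packed
+; ((16-mx) << 8 | mx) filter computes (16-mx)*src[x] + mx*src[x+1] per pixel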
 
 pb_64:   times 16 db 64
 pw_8:    times 8 dw 8
 pw_26:   times 8 dw 26
 pw_258:  times 8 dw 258
 pw_512:  times 8 dw 512
 pw_1024: times 8 dw 1024
 pw_2048: times 8 dw 2048
@@ -71,20 +74,660 @@ pw_2048: times 8 dw 2048
 BIDIR_JMP_TABLE avg_ssse3,        4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_avg_ssse3,      4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE mask_ssse3,       4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
 BIDIR_JMP_TABLE blend_ssse3,      4, 8, 16, 32
 BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
 BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
 
+%macro BASE_JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - %3)
+    %xdefine %%base %1_%2
+    %%table:
+    %rep %0 - 2
+        dw %%base %+ _w%3 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
+%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+
+BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
+
+%macro HV_JMP_TABLE 5-*
+    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+    %xdefine %%base %1_%3
+    %assign %%types %4
+    %if %%types & 1
+        %xdefine %1_%2_h_%3_table  (%%h  - %5)
+        %%h:
+        %rep %0 - 4
+            dw %%prefix %+ .h_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 2
+        %xdefine %1_%2_v_%3_table  (%%v  - %5)
+        %%v:
+        %rep %0 - 4
+            dw %%prefix %+ .v_w%5 - %%base
+            %rotate 1
+        %endrep
+        %rotate 4
+    %endif
+    %if %%types & 4
+        %xdefine %1_%2_hv_%3_table (%%hv - %5)
+        %%hv:
+        %rep %0 - 4
+            dw %%prefix %+ .hv_w%5 - %%base
+            %rotate 1
+        %endrep
+    %endif
+%endmacro
+
+HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
 SECTION .text
 
 INIT_XMM ssse3
 
+%if ARCH_X86_32
+DECLARE_REG_TMP 1
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
+%define base t0-put_ssse3
+%else
+DECLARE_REG_TMP 7
+%define base 0
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+%endif
+;
+%macro RESTORE_DSQ_32 1
+ %if ARCH_X86_32
+   mov                  %1, dsm ; restore dsq
+ %endif
+%endmacro
+;
+    movifnidn          mxyd, r6m ; mx
+    LEA                  t0, put_ssse3
+    tzcnt                wd, wm
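+    ; tzcnt encodes as rep bsf, so this also runs on pre-BMI CPUs (w is
+    ; always a nonzero power of two here)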
+    mov                  hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r7m ; my
+    test               mxyd, mxyd
+    jnz .v
+.put:
+    movzx                wd, word [t0+wq*2+table_offset(put,)]
+    add                  wq, t0
+    lea                  r6, [ssq*3]
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.put_w2:
+    movzx               r4d, word [srcq+ssq*0]
+    movzx               r6d, word [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], r4w
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w2
+    RET
+.put_w4:
+    mov                 r4d, [srcq+ssq*0]
+    mov                 r6d, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mov        [dstq+dsq*0], r4d
+    mov        [dstq+dsq*1], r6d
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w4
+    RET
+.put_w8:
+    movq                 m0, [srcq+ssq*0]
+    movq                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    movq       [dstq+dsq*0], m0
+    movq       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w8
+    RET
+.put_w16:
+    lea                  r4, [dsq*3]
+.put_w16_in:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    movu                 m2, [srcq+ssq*2]
+    movu                 m3, [srcq+r6   ]
+    lea                srcq, [srcq+ssq*4]
+    mova       [dstq+dsq*0], m0
+    mova       [dstq+dsq*1], m1
+    mova       [dstq+dsq*2], m2
+    mova       [dstq+r4   ], m3
+    lea                dstq, [dstq+dsq*4]
+    sub                  hd, 4
+    jg .put_w16_in
+    RET
+.put_w32:
+    movu                 m0, [srcq+ssq*0+16*0]
+    movu                 m1, [srcq+ssq*0+16*1]
+    movu                 m2, [srcq+ssq*1+16*0]
+    movu                 m3, [srcq+ssq*1+16*1]
+    lea                srcq, [srcq+ssq*2]
+    mova  [dstq+dsq*0+16*0], m0
+    mova  [dstq+dsq*0+16*1], m1
+    mova  [dstq+dsq*1+16*0], m2
+    mova  [dstq+dsq*1+16*1], m3
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .put_w32
+    RET
+.put_w64:
+    movu                 m0, [srcq+16*0]
+    movu                 m1, [srcq+16*1]
+    movu                 m2, [srcq+16*2]
+    movu                 m3, [srcq+16*3]
+    add                srcq, ssq
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m1
+    mova        [dstq+16*2], m2
+    mova        [dstq+16*3], m3
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w64
+    RET
+.put_w128:
+    movu                 m0, [srcq+16*0]
+    movu                 m1, [srcq+16*1]
+    movu                 m2, [srcq+16*2]
+    movu                 m3, [srcq+16*3]
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m1
+    mova        [dstq+16*2], m2
+    mova        [dstq+16*3], m3
+    movu                 m0, [srcq+16*4]
+    movu                 m1, [srcq+16*5]
+    movu                 m2, [srcq+16*6]
+    movu                 m3, [srcq+16*7]
+    mova        [dstq+16*4], m0
+    mova        [dstq+16*5], m1
+    mova        [dstq+16*6], m2
+    mova        [dstq+16*7], m3
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .put_w128
+    RET
+.h:
+    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
+    imul               mxyd, 0xff01
+    mova                 m4, [base+bilin_h_shuf8]
+    mova                 m0, [base+bilin_h_shuf4]
+    WIN64_SPILL_XMM       7
+    add                mxyd, 16 << 8
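+    ; the imul/add above pack both weights into each word: e.g. mx=4
+    ; gives 4*0xff01 + (16<<8) = 0x40c04, whose low word 0x0c04 is the
+    ; byte pair {mx, 16-mx} = {4, 12}, matching the {src[x+1], src[x]}
+    ; byte pairs produced by the bilin_h shuffle masks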
+    movd                 m5, mxyd
+    mov                mxyd, r7m ; my
+    pshuflw              m5, m5, q0000
+    punpcklqdq           m5, m5
+    test               mxyd, mxyd
+    jnz .hv
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)]
+    mova                 m6, [base+pw_2048]
+    add                  wq, t0
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.h_w2:
+    pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
+.h_w2_loop:
+    movd                 m0, [srcq+ssq*0]
+    movd                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m0, m1
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+    pmulhrsw             m0, m6
+    packuswb             m0, m0
+    movd                r6d, m0
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w2_loop
+    RET
+.h_w4:
+    movq                 m4, [srcq+ssq*0]
+    movhps               m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m4, m0
+    pmaddubsw            m4, m5
+    pmulhrsw             m4, m6
+    packuswb             m4, m4
+    movd       [dstq+dsq*0], m4
+    pshufd               m4, m4, q0101
+    movd       [dstq+dsq*1], m4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w4
+    RET
+.h_w8:
+    movu                 m0, [srcq+ssq*0]
+    movu                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    movq       [dstq+dsq*0], m0
+    movhps     [dstq+dsq*1], m0
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .h_w8
+    RET
+.h_w16:
+    movu                 m0, [srcq+8*0]
+    movu                 m1, [srcq+8*1]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova             [dstq], m0
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w16
+    RET
+.h_w32:
+    movu                 m0, [srcq+mmsize*0+8*0]
+    movu                 m1, [srcq+mmsize*0+8*1]
+    movu                 m2, [srcq+mmsize*1+8*0]
+    movu                 m3, [srcq+mmsize*1+8*1]
+    add                srcq, ssq
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    pmulhrsw             m2, m6
+    pmulhrsw             m3, m6
+    packuswb             m0, m1
+    packuswb             m2, m3
+    mova        [dstq+16*0], m0
+    mova        [dstq+16*1], m2
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w32
+    RET
+.h_w64:
+    mov                  r6, -16*3
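+    ; r6 runs from -16*3 up to 0 in steps of 16 so a single jle closes
+    ; the inner loop; the +16*3 bias in the addressing cancels it out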
+.h_w64_loop:
+    movu                 m0, [srcq+r6+16*3+8*0]
+    movu                 m1, [srcq+r6+16*3+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova     [dstq+r6+16*3], m0
+    add                  r6, 16
+    jle .h_w64_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w64
+    RET
+.h_w128:
+    mov                  r6, -16*7
+.h_w128_loop:
+    movu                 m0, [srcq+r6+16*7+8*0]
+    movu                 m1, [srcq+r6+16*7+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmulhrsw             m0, m6
+    pmulhrsw             m1, m6
+    packuswb             m0, m1
+    mova     [dstq+r6+16*7], m0
+    add                  r6, 16
+    jle .h_w128_loop
+    add                srcq, ssq
+    add                dstq, dsq
+    dec                  hd
+    jg .h_w128
+    RET
+.v:
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
+    %assign stack_offset stack_offset - stack_size_padded
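+    ; resets x86inc's stack accounting so the WIN64_SPILL_XMM below can
+    ; be used again on this second entry path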
+    WIN64_SPILL_XMM       8
+    imul               mxyd, 0xff01
+    mova                 m7, [base+pw_2048]
+    add                mxyd, 16 << 8
+    add                  wq, t0
+    movd                 m6, mxyd
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+    RESTORE_DSQ_32       t0
+    jmp                  wq
+.v_w2:
+    movd                 m0, [srcq+ssq*0]
+.v_w2_loop:
+    pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
+    lea                srcq, [srcq+ssq*2]
+    pshuflw              m2, m0, q2301
+    pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
+    punpcklbw            m1, m0, m2
+    pmaddubsw            m1, m6
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    movd                r6d, m1
+    mov        [dstq+dsq*1], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*0], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w2_loop
+    RET
+.v_w4:
+    movd                 m0, [srcq+ssq*0]
+.v_w4_loop:
+    movd                 m1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpckldq            m2, m0, m1 ; 0 1
+    movd                 m0, [srcq+ssq*0]
+    punpckldq            m1, m0  ; 1 2
+    punpcklbw            m1, m2
+    pmaddubsw            m1, m6
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    ;
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                 m0, [srcq+ssq*0]
+.v_w8_loop:
+    movddup              m2, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklqdq           m3, m0, m2 ; 0 1 m2qh:m0ql
+    movddup              m0, [srcq+ssq*0]
+    punpcklqdq           m4, m2, m0 ; 1 2 m0qh:m2ql
+    punpcklbw            m1, m4, m3
+    punpckhbw            m4, m3
+    pmaddubsw            m1, m6
+    pmaddubsw            m4, m6
+    pmulhrsw             m1, m7
+    pmulhrsw             m4, m7
+    packuswb             m1, m4
+    movq         [dstq+dsq*0], m1
+    movhps       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .v_w8_loop
+    RET
+    ;
+%macro PUT_BILIN_V_W16 0
+    movu                 m0, [srcq+ssq*0]
+%%loop:
+    movu                 m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    punpcklbw            m1, m4, m0
+    punpckhbw            m3, m4, m0
+    movu                 m0, [srcq+ssq*0]
+    punpcklbw            m2, m0, m4
+    pmaddubsw            m1, m6
+    pmaddubsw            m3, m6
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    packuswb             m1, m3
+    mova       [dstq+dsq*0], m1
+    punpckhbw            m3, m0, m4
+    pmaddubsw            m2, m6
+    pmaddubsw            m3, m6
+    pmulhrsw             m2, m7
+    pmulhrsw             m3, m7
+    packuswb             m2, m3
+    mova       [dstq+dsq*1], m2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg %%loop
+%endmacro
+    ;
+.v_w16:
+    PUT_BILIN_V_W16
+    RET
+.v_w16gt:
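+    ; t0d packs two counters: the height in its low word and the number
+    ; of remaining 16-pixel column strips minus one in its high word
+    ; (set up by .v_w32/.v_w64/.v_w128 below)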
+    mov                  r4, dstq
+    mov                  r6, srcq
+.v_w16gt_loop:
+%if ARCH_X86_32
+    mov                bakm, t0q
+    RESTORE_DSQ_32       t0
+    PUT_BILIN_V_W16
+    mov                 t0q, bakm
+%else
+    PUT_BILIN_V_W16
+%endif
+    mov                  hw, t0w
+    add                  r4, mmsize
+    add                  r6, mmsize
+    mov                dstq, r4
+    mov                srcq, r6
+    sub                 t0d, 1<<16
+    jg .v_w16gt
+    RET
+.v_w32:
+    lea                 t0d, [hq+(1<<16)]
+    jmp .v_w16gt
+.v_w64:
+    lea                 t0d, [hq+(3<<16)]
+    jmp .v_w16gt
+.v_w128:
+    lea                 t0d, [hq+(7<<16)]
+    jmp .v_w16gt
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
+    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       8
+    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
+    mova                 m7, [base+pw_2048]
+    movd                 m6, mxyd
+    add                  wq, t0
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+    jmp                  wq
+.hv_w2:
+    RESTORE_DSQ_32       t0
+    movd                 m0, [srcq+ssq*0]
+    pshufd               m0, m0, q0000      ; src[x - src_stride]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w2_loop:
+    movd                 m1, [srcq+ssq*1]   ; src[x]
+    lea                srcq, [srcq+ssq*2]
+    movhps               m1, [srcq+ssq*0]   ; src[x + src_stride]
+    pshufd               m1, m1, q3120
+    pshufb               m1, m4
+    pmaddubsw            m1, m5             ; 1 _ 2 _
+    shufps               m2, m0, m1, q1032  ; 0 _ 1 _
+    mova                 m0, m1
+    psubw                m1, m2   ; src[x + src_stride] - src[x]
+    paddw                m1, m1
+    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x])) >> 4
+    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4)
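+    ; the >> 4 falls out of the pmulhw scaling: m6 holds my << 11, and
+    ; together with the paddw doubling above this computes
+    ; (2 * diff * (my << 11)) >> 16 == (my * diff) >> 4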
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    pshuflw              m1, m1, q2020
+    movd                r6d, m1
+    mov        [dstq+dsq*0], r6w
+    shr                 r6d, 16
+    mov        [dstq+dsq*1], r6w
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w2_loop
+    RET
+.hv_w4:
+    mova                 m4, [base+bilin_h_shuf4]
+    RESTORE_DSQ_32       t0
+    movddup             xm0, [srcq+ssq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w4_loop:
+    movq                 m1,     [srcq+ssq*1]
+    lea                srcq,     [srcq+ssq*2]
+    movhps               m1,     [srcq+ssq*0]
+    pshufb               m1, m4
+    pmaddubsw            m1, m5           ; 1 2
+    shufps               m2, m0, m1, q1032 ; 0 1
+    mova                 m0, m1
+    psubw                m1, m2
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m2
+    pmulhrsw             m1, m7
+    packuswb             m1, m1
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32
+    movd       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    RESTORE_DSQ_32       t0
+    movu                 m0,     [srcq+ssq*0+8*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w8_loop:
+    movu                 m2,     [srcq+ssq*1+8*0]
+    lea                srcq,     [srcq+ssq*2]
+    movu                 m3,     [srcq+ssq*0+8*0]
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m2, m5
+    psubw                m1, m2, m0
+    paddw                m1, m1
+    pmulhw               m1, m6
+    paddw                m1, m0
+    pmaddubsw            m0, m3, m5
+    psubw                m3, m0, m2
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m2
+    pmulhrsw             m1, m7
+    pmulhrsw             m3, m7
+    packuswb             m1, m3
+    movq         [dstq+dsq*0], m1
+    movhps       [dstq+dsq*1], m1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .hv_w8_loop
+    RET
+    ;
+    ; 32bit has ssq, dsq free
+%macro PUT_BILIN_HV_W16 0
+    movu                 m0,     [srcq+8*0]
+    movu                 m1,     [srcq+8*1]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+ %if WIN64
+    movaps              r4m, xmm8
+ %endif
+%%loop:
+%if ARCH_X86_32
+ %define m3back [dstq]
+ %define dsqval dsm
+%else
+ %define m3back m8
+ %define dsqval dsq
+%endif
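+    ; on 32-bit no xmm register is free, so the intermediate row is
+    ; parked in [dstq] (overwritten at the end of the loop body) and the
+    ; stride is reloaded from the stack via dsm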
+    add                srcq, ssq
+    movu                 m2,     [srcq+8*1]
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m1
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m1
+    mova                 m1, m2
+    pmulhrsw             m3, m7
+    mova             m3back, m3
+    movu                 m2,     [srcq+8*0]
+    pshufb               m2, m4
+    pmaddubsw            m2, m5
+    psubw                m3, m2, m0
+    paddw                m3, m3
+    pmulhw               m3, m6
+    paddw                m3, m0
+    mova                 m0, m2
+    pmulhrsw             m3, m7
+    packuswb             m3, m3back
+    mova             [dstq], m3
+    add                dstq, dsqval
+    dec                  hd
+    jg %%loop
+ %if WIN64
+    movaps             xmm8, r4m
+ %endif
+ %undef m3back
+ %undef dsqval
+%endmacro
+    ;
+.hv_w16:
+    PUT_BILIN_HV_W16
+    RET
+.hv_w16gt:
+    mov                  r4, dstq
+    mov                  r6, srcq
+.hv_w16gt_loop:
+    PUT_BILIN_HV_W16
+    mov                  hw, t0w
+    add                  r4, mmsize
+    add                  r6, mmsize
+    mov                dstq, r4
+    mov                srcq, r6
+    sub                 t0d, 1<<16
+    jg .hv_w16gt_loop
+    RET
+.hv_w32:
+    lea                 t0d, [hq+(1<<16)]
+    jmp .hv_w16gt
+.hv_w64:
+    lea                 t0d, [hq+(3<<16)]
+    jmp .hv_w16gt
+.hv_w128:
+    lea                 t0d, [hq+(7<<16)]
+    jmp .hv_w16gt
+
 %if WIN64
 DECLARE_REG_TMP 6, 4
 %else
 DECLARE_REG_TMP 6, 7
 %endif
 
 %macro BIDIR_FN 1 ; op
     %1                    0
@@ -181,17 +824,17 @@ DECLARE_REG_TMP 6, 7
 %endmacro
 
 %macro AVG_INC_PTR 1
     add               tmp1q, %1*mmsize
     add               tmp2q, %1*mmsize
 %endmacro
 
 cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
-    lea                  r6, [avg_ssse3_table]
+    LEA                  r6, avg_ssse3_table
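+    ; x86inc's LEA macro, unlike a bare lea, loads the table address in
+    ; a position-independent way on 32-bit PIC builds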
     tzcnt                wd, wm ; trailing zeros, i.e. log2 of the width
     movifnidn            hd, hm ; load h from the stack if it is not already in that register
     movsxd               wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this width
     mova                 m2, [pw_1024+r6-avg_ssse3_table] ; rounding constant for pmulhrsw
     add                  wq, r6
     BIDIR_FN            AVG
 
 %macro W_AVG 1 ; src_offset
@@ -211,17 +854,17 @@ cglobal avg, 4, 7, 3, dst, stride, tmp1,
     pmulhrsw             m0, m5
     pmulhrsw             m1, m5
     packuswb             m0, m1
 %endmacro
 
 %define W_AVG_INC_PTR AVG_INC_PTR
 
 cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
-    lea                  r6, [w_avg_ssse3_table]
+    LEA                  r6, w_avg_ssse3_table
     tzcnt                wd, wm
     movifnidn            hd, hm
     movd                 m0, r6m
     pshuflw              m0, m0, q0000
     punpcklqdq           m0, m0
     movsxd               wq, dword [r6+wq*4]
     pxor                 m4, m4
     psllw                m0, 11 ; can't shift by 12, sign bit must be preserved
@@ -264,34 +907,35 @@ cglobal w_avg, 4, 7, 6, dst, stride, tmp
 
 %if ARCH_X86_64
 cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
     movifnidn            hd, hm
 %else
 cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
 %define hd dword r5m
 %endif
-    lea                  r6, [mask_ssse3_table]
+%define base r6-mask_ssse3_table
+    LEA                  r6, mask_ssse3_table
     tzcnt                wd, wm
     movsxd               wq, dword [r6+wq*4]
     pxor                 m4, m4
-    mova                 m5, [pw_2048+r6-mask_ssse3_table]
+    mova                 m5, [base+pw_2048]
     add                  wq, r6
     mov               maskq, r6m
     BIDIR_FN           MASK
 %undef hd
 
 %if ARCH_X86_64
  %define reg_pw_8         m8
  %define reg_pw_27        m9
  %define reg_pw_2048      m10
 %else
- %define reg_pw_8         [pw_8]
- %define reg_pw_27        [pw_26] ; 64 - 38
- %define reg_pw_2048      [pw_2048]
+ %define reg_pw_8         [base+pw_8]
+ %define reg_pw_27        [base+pw_26] ; 64 - 38
+ %define reg_pw_2048      [base+pw_2048]
 %endif
 
 %macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
     ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
     mova                 m0, [tmp1q+(%1)]
     mova                 m1, [tmp2q+(%1)]
     psubw                m1, m0 ; tmp1 - tmp2
     pabsw                m3, m1 ; abs(tmp1 - tmp2)
@@ -318,73 +962,70 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1
     pmulhrsw             m1, reg_pw_2048 ; round/scale 2048
     packuswb             m0, m1 ; concat m0 = u8.dst[15..0]
 %endmacro
 
 %macro W_MASK_420 2
     W_MASK_420_B (%1*16), %2
 %endmacro
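+; the 2x2-downsampled mask written to maskq equals (sum of the four
+; full-res mask values + 2 - sign) >> 2; the code reaches it as
+; (64*4 + 2 - sign) minus two per-line sums, then >> 2 (assuming the
+; summed vectors carry the 64-m complements, as the pw_258 bias implies)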
 
+%define base r6-w_mask_420_ssse3_table
 %if ARCH_X86_64
 ; args: dst, stride, tmp1, tmp2, w, h, mask, sign
-cglobal w_mask_420, 4, 9, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
-    lea                  r7, [w_mask_420_ssse3_table]
+cglobal w_mask_420, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask
+    lea                  r6, [w_mask_420_ssse3_table]
     mov                  wd, wm
-    tzcnt               r8d, wd
+    tzcnt               r7d, wd
     movifnidn            hd, hm
-    mov               maskq, maskmp
     movd                 m0, r7m
     pshuflw              m0, m0, q0000 ; sign
     punpcklqdq           m0, m0
-    movsxd               r8, dword [r7+r8*4]
-    mova           reg_pw_8, [pw_8]
-    mova          reg_pw_27, [pw_26] ; 64 - 38
-    mova        reg_pw_2048, [pw_2048]
-    mova                 m6, [pw_258] ; 64 * 4 + 2
+    movsxd               r7, [r6+r7*4]
+    mova           reg_pw_8, [base+pw_8]
+    mova          reg_pw_27, [base+pw_26] ; 64 - 38
+    mova        reg_pw_2048, [base+pw_2048]
+    mova                 m6, [base+pw_258] ; 64 * 4 + 2
+    add                  r7, r6
+    mov               maskq, maskmp
     psubw                m6, m0
-    add                  r8, r7
     W_MASK_420            0, 4
-    lea            stride3q, [strideq*3]
-    jmp                  r8
-    %define dst_bak      r8
-    %define loop_w       r7
-    %define orig_w       wq
+    jmp                  r7
+    %define loop_w      r7d
 %else
-cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
-    tzcnt               r6d, r4m
-    mov                  wd, w_mask_420_ssse3_table
-    add                  wd, [wq+r6*4]
+cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+    tzcnt                wd, wm
+    LEA                  r6, w_mask_420_ssse3_table
+    mov                  wd, [r6+wq*4]
     mov               maskq, r6mp
     movd                 m0, r7m
     pshuflw              m0, m0, q0000 ; sign
     punpcklqdq           m0, m0
-    mova                 m6, [pw_258] ; 64 * 4 + 2
+    mova                 m6, [base+pw_258] ; 64 * 4 + 2
+    add                  wq, r6
     psubw                m6, m0
     W_MASK_420            0, 4
-    lea            stride3q, [strideq*3]
     jmp                  wd
-    %define dst_bak     r0m
-    %define loop_w      r6q
-    %define orig_w      r4m
-    %define hd    dword r5m
+    %define loop_w dword r0m
+    %define hd     dword r5m
 %endif
 .w4_loop:
     add               tmp1q, 2*16
     add               tmp2q, 2*16
     W_MASK_420            0, 4
-    lea                dstq, [dstq+strideq*4]
+    lea                dstq, [dstq+strideq*2]
     add               maskq, 4
 .w4:
     movd   [dstq          ], m0 ; copy m0[0]
     pshuflw              m1, m0, q1032
     movd   [dstq+strideq*1], m1 ; copy m0[1]
+    lea                dstq, [dstq+strideq*2]
     punpckhqdq           m0, m0
-    movd   [dstq+strideq*2], m0 ; copy m0[2]
+    movd   [dstq+strideq*0], m0 ; copy m0[2]
     psrlq                m0, 32
-    movd   [dstq+stride3q ], m0 ; copy m0[3]
+    movd   [dstq+strideq*1], m0 ; copy m0[3]
     pshufd               m5, m4, q3131; DBDB even lines repeated
     pshufd               m4, m4, q2020; CACA odd lines repeated
     psubw                m1, m6, m4   ; m6 == 64 * 4 + 2
     psubw                m1, m5       ; C-D A-B C-D A-B
     psrlw                m1, 2        ; >> 2
     packuswb             m1, m1
     movd            [maskq], m1
     sub                  hd, 4
@@ -404,46 +1045,44 @@ cglobal w_mask_420, 4, 7, 8, dst, stride
     psubw                m0, m1
     psrlw                m0, 2
     packuswb             m0, m0
     movd            [maskq], m0
     sub                  hd, 2
     jg .w8_loop
     RET
 .w16: ; w32/64/128
-    mov             dst_bak, dstq
-    mov              loop_w, orig_w ; use width as counter
 %if ARCH_X86_32
-    mov                  wq, orig_w ; because we altered it in 32bit setup
+    mov                  wd, wm     ; because we altered it in 32bit setup
 %endif
+    mov              loop_w, wd     ; use width as counter
     jmp .w16ge_inner_loop_first
 .w16ge_loop:
     lea               tmp1q, [tmp1q+wq*2] ; skip even line pixels
     lea               tmp2q, [tmp2q+wq*2] ; skip even line pixels
+    sub                dstq, wq
+    mov              loop_w, wd
     lea                dstq, [dstq+strideq*2]
-    mov             dst_bak, dstq
-    mov              loop_w, orig_w
 .w16ge_inner_loop:
-    W_MASK_420_B           0, 4
+    W_MASK_420_B          0, 4
 .w16ge_inner_loop_first:
     mova   [dstq          ], m0
     W_MASK_420_B       wq*2, 5  ; load matching even line (offset = widthpx * (16+16))
     mova   [dstq+strideq*1], m0
     psubw                m1, m6, m4 ; m6 == 64 * 4 + 2
     psubw                m1, m5     ; - odd line mask
     psrlw                m1, 2      ; >> 2
     packuswb             m1, m1
     movq            [maskq], m1
     add               tmp1q, 2*16
     add               tmp2q, 2*16
     add               maskq, 8
     add                dstq, 16
     sub              loop_w, 16
     jg .w16ge_inner_loop
-    mov                dstq, dst_bak
     sub                  hd, 2
     jg .w16ge_loop
     RET
 
 %undef reg_pw_8
 %undef reg_pw_27
 %undef reg_pw_2048
 %undef dst_bak
@@ -465,17 +1104,17 @@ cglobal w_mask_420, 4, 7, 8, dst, stride
     psubb                m3, m4, m0 ; m3 = (64 - m)
     punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
     punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
     BLEND_64M            %1, %2, m2, m3
 %endmacro
 
 cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
 %define base r6-blend_ssse3_table
-    lea                  r6, [blend_ssse3_table]
+    LEA                  r6, blend_ssse3_table
     tzcnt                wd, wm
     movifnidn            hd, hm
     movifnidn         maskq, maskmp
     movsxd               wq, dword [r6+wq*4]
     mova                 m4, [base+pb_64]
     mova                 m5, [base+pw_512]
     add                  wq, r6
     lea                  r6, [dsq*3]
@@ -541,17 +1180,17 @@ cglobal blend, 3, 7, 7, dst, ds, tmp, w,
     add                tmpq, 32
     add                dstq, dsq ; dst_stride
     dec                  hd
     jg .w32
     RET
 
 cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask
 %define base r5-blend_v_ssse3_table
-    lea                  r5, [blend_v_ssse3_table]
+    LEA                  r5, blend_v_ssse3_table
     tzcnt                wd, wm
     movifnidn            hd, hm
     movsxd               wq, dword [r5+wq*4]
     mova                 m5, [base+pw_512]
     add                  wq, r5
     add               maskq, obmc_masks-blend_v_ssse3_table
     jmp                  wq
 .w2:
@@ -641,25 +1280,31 @@ cglobal blend_v, 3, 6, 8, dst, ds, tmp, 
     BLEND_64M            m1, m2, m6, m7
     mova        [dstq+16*1], m0
     add                tmpq, 32
     add                dstq, dsq
     dec                  hd
     jg .w32_loop
     RET
 
-cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
-%define base r5-blend_h_ssse3_table
-    lea                  r5, [blend_h_ssse3_table]
+cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base t0-blend_h_ssse3_table
+%if ARCH_X86_32
+    ; We need to keep the PIC pointer for w4, reload wd from stack instead
+    DECLARE_REG_TMP 6
+%else
+    DECLARE_REG_TMP 5
     mov                 r6d, wd
-    tzcnt                wd, wd
+%endif
+    LEA                  t0, blend_h_ssse3_table
+    tzcnt                wd, wm
     mov                  hd, hm
-    movsxd               wq, dword [r5+wq*4]
+    movsxd               wq, dword [t0+wq*4]
     mova                 m5, [base+pw_512]
-    add                  wq, r5
+    add                  wq, t0
     lea               maskq, [base+obmc_masks+hq*4]
     neg                  hq
     jmp                  wq
 .w2:
     movd                 m0, [dstq+dsq*0]
     pinsrw               m0, [dstq+dsq*1], 1
     movd                 m2, [maskq+hq*2]
     movd                 m1, [tmpq]
@@ -673,17 +1318,21 @@ cglobal blend_h, 4, 7, 6, dst, ds, tmp, 
     shr                 r3d, 16
     mov        [dstq+dsq*1], r3w
     lea                dstq, [dstq+dsq*2]
     add                tmpq, 2*2
     add                  hq, 2
     jl .w2
     RET
 .w4:
+%if ARCH_X86_32
+    mova                 m3, [base+blend_shuf]
+%else
     mova                 m3, [blend_shuf]
+%endif
 .w4_loop:
     movd                 m0, [dstq+dsq*0]
     movd                 m2, [dstq+dsq*1]
     punpckldq            m0, m2 ; a
     movq                 m1, [tmpq] ; b
     movq                 m2, [maskq+hq*2] ; m
     pshufb               m2, m3
     punpcklbw            m0, m1
@@ -711,16 +1360,19 @@ cglobal blend_h, 4, 7, 6, dst, ds, tmp, 
     movhps     [dstq+dsq*1], m0
     lea                dstq, [dstq+dsq*2]
     add                tmpq, 8*2
     add                  hq, 2
     jl .w8
     RET
 ; w16/w32/w64/w128
 .w16:
+%if ARCH_X86_32
+    mov                 r6d, wm
+%endif
     sub                 dsq, r6
 .w16_loop0:
     movd                 m3, [maskq+hq*2]
     pshuflw              m3, m3, q0000
     punpcklqdq           m3, m3
     mov                  wd, r6d
 .w16_loop:
     mova                 m1, [dstq] ; a
@@ -730,8 +1382,378 @@ cglobal blend_h, 4, 7, 6, dst, ds, tmp, 
     add                dstq, 16
     add                tmpq, 16
     sub                  wd, 16
     jg .w16_loop
     add                dstq, dsq
     inc                  hq
     jl .w16_loop0
     RET
+
+; emu_edge args:
+; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
+; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
+; const pixel *ref, const ptrdiff_t ref_stride
+;
+; bw, bh total filled size
+; iw, ih, copied block -> fill bottom, right
+; x, y, offset in bw/bh -> fill top, left
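+;
+; e.g. a 64x64 block read at x = -8, y = -8 from a 32x32 reference gives
+; left_ext = top_ext = 8, right_ext = bottom_ext = 24 and a 32x32 center
+; copy, with the borders filled by edge replication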
+cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
+                             y, dst, dstride, src, sstride, \
+                             bottomext, rightext, blk
+    ; we assume that the buffer (stride) is larger than width, so we can
+    ; safely overwrite by a few bytes
+    pxor                 m1, m1
+
+%if ARCH_X86_64
+ %define reg_zero       r12q
+ %define reg_tmp        r10
+ %define reg_src        srcq
+ %define reg_bottomext  bottomextq
+ %define reg_rightext   rightextq
+ %define reg_blkm       r9m
+%else
+ %define reg_zero       r6
+ %define reg_tmp        r0
+ %define reg_src        r1
+ %define reg_bottomext  r0
+ %define reg_rightext   r1
+ %define reg_blkm       r2m
+%endif
+    ;
+    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+    xor            reg_zero, reg_zero
+    lea             reg_tmp, [ihq-1]
+    cmp                  yq, ihq
+    cmovl           reg_tmp, yq
+    test                 yq, yq
+    cmovl           reg_tmp, reg_zero
+%if ARCH_X86_64
+    imul            reg_tmp, sstrideq
+    add                srcq, reg_tmp
+%else
+    imul            reg_tmp, sstridem
+    mov             reg_src, srcm
+    add             reg_src, reg_tmp
+%endif
+    ;
+    ; ref += iclip(x, 0, iw - 1)
+    lea             reg_tmp, [iwq-1]
+    cmp                  xq, iwq
+    cmovl           reg_tmp, xq
+    test                 xq, xq
+    cmovl           reg_tmp, reg_zero
+    add             reg_src, reg_tmp
+%if ARCH_X86_32
+    mov                srcm, reg_src
+%endif
+    ;
+    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+%if ARCH_X86_32
+    mov                  r1, r1m ; restore bh
+%endif
+    lea       reg_bottomext, [yq+bhq]
+    sub       reg_bottomext, ihq
+    lea                  r3, [bhq-1]
+    cmovl     reg_bottomext, reg_zero
+    ;
+
+    DEFINE_ARGS bw, bh, iw, ih, x, \
+                topext, dst, dstride, src, sstride, \
+                bottomext, rightext, blk
+
+    ; top_ext = iclip(-y, 0, bh - 1)
+    neg             topextq
+    cmovl           topextq, reg_zero
+    cmp       reg_bottomext, bhq
+    cmovge    reg_bottomext, r3
+    cmp             topextq, bhq
+    cmovg           topextq, r3
+ %if ARCH_X86_32
+    mov                 r4m, reg_bottomext
+    ;
+    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+    mov                  r0, r0m ; restore bw
+ %endif
+    lea        reg_rightext, [xq+bwq]
+    sub        reg_rightext, iwq
+    lea                  r2, [bwq-1]
+    cmovl      reg_rightext, reg_zero
+
+    DEFINE_ARGS bw, bh, iw, ih, leftext, \
+                topext, dst, dstride, src, sstride, \
+                bottomext, rightext, blk
+
+    ; left_ext = iclip(-x, 0, bw - 1)
+    neg            leftextq
+    cmovl          leftextq, reg_zero
+    cmp        reg_rightext, bwq
+    cmovge     reg_rightext, r2
+ %if ARCH_X86_32
+    mov                 r3m, r1
+ %endif
+    cmp            leftextq, bwq
+    cmovge         leftextq, r2
+
+%undef reg_zero
+%undef reg_tmp
+%undef reg_src
+%undef reg_bottomext
+%undef reg_rightext
+
+    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
+                topext, dst, dstride, src, sstride, \
+                bottomext, rightext, blk
+
+    ; center_h = bh - top_ext - bottom_ext
+%if ARCH_X86_64
+    lea                  r3, [bottomextq+topextq]
+    sub            centerhq, r3
+%else
+    mov                  r1, centerhm ; restore centerh (r1) from the stack
+    sub            centerhq, topextq
+    sub            centerhq, r4m      ; r4m holds bottom_ext on x86_32
+    mov                 r1m, centerhq
+%endif
+    ;
+    ; blk += top_ext * PXSTRIDE(dst_stride)
+    mov                  r2, topextq
+%if ARCH_X86_64
+    imul                 r2, dstrideq
+%else
+    mov                  r6, r6m ; restore dstq
+    imul                 r2, dstridem
+%endif
+    add                dstq, r2
+    mov            reg_blkm, dstq ; save pointer for ext
+    ;
+    ; center_w = bw - left_ext - right_ext
+    mov            centerwq, bwq
+%if ARCH_X86_64
+    lea                  r3, [rightextq+leftextq]
+    sub            centerwq, r3
+%else
+    sub            centerwq, r3m
+    sub            centerwq, leftextq
+%endif
+
+; vloop Macro
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+  %if ARCH_X86_64
+    %define reg_tmp        r12
+  %else
+    %define reg_tmp        r0
+  %endif
+.v_loop_%3:
+  %if ARCH_X86_32
+    mov                  r0, r0m
+    mov                  r1, r1m
+  %endif
+%if %1
+    test           leftextq, leftextq
+    jz .body_%3
+    ; left extension
+  %if ARCH_X86_64
+    movd                 m0, [srcq]
+  %else
+    mov                  r3, srcm
+    movd                 m0, [r3]
+  %endif
+    pshufb               m0, m1
+    xor                  r3, r3
+.left_loop_%3:
+    mova          [dstq+r3], m0
+    add                  r3, mmsize
+    cmp                  r3, leftextq
+    jl .left_loop_%3
+    ; body
+.body_%3:
+    lea             reg_tmp, [dstq+leftextq]
+%endif
+    xor                  r3, r3
+.body_loop_%3:
+  %if ARCH_X86_64
+    movu                 m0, [srcq+r3]
+  %else
+    mov                  r1, srcm
+    movu                 m0, [r1+r3]
+  %endif
+%if %1
+    movu       [reg_tmp+r3], m0
+%else
+    movu          [dstq+r3], m0
+%endif
+    add                  r3, mmsize
+    cmp                  r3, centerwq
+    jl .body_loop_%3
+%if %2
+    ; right extension
+  %if ARCH_X86_64
+    test          rightextq, rightextq
+  %else
+    mov                  r1, r3m
+    test                 r1, r1
+  %endif
+    jz .body_loop_end_%3
+%if %1
+    add             reg_tmp, centerwq
+%else
+    lea             reg_tmp, [dstq+centerwq]
+%endif
+  %if ARCH_X86_64
+    movd                 m0, [srcq+centerwq-1]
+  %else
+    mov                  r3, srcm
+    movd                 m0, [r3+centerwq-1]
+  %endif
+    pshufb               m0, m1
+    xor                  r3, r3
+.right_loop_%3:
+    movu       [reg_tmp+r3], m0
+    add                  r3, mmsize
+  %if ARCH_X86_64
+    cmp                  r3, rightextq
+  %else
+    cmp                  r3, r3m
+  %endif
+    jl .right_loop_%3
+.body_loop_end_%3:
+%endif
+  %if ARCH_X86_64
+    add                dstq, dstrideq
+    add                srcq, sstrideq
+    dec            centerhq
+    jg .v_loop_%3
+  %else
+    add                dstq, dstridem
+    mov                  r0, sstridem
+    add                srcm, r0
+    sub       dword centerhm, 1
+    jg .v_loop_%3
+    mov                  r0, r0m ; restore r0
+  %endif
+%endmacro ; vloop MACRO
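+; the four v_loop variants below specialize on (need_left_ext,
+; need_right_ext): (0,0) suffix 0, (1,0) suffix 1, (1,1) suffix 2,
+; (0,1) suffix 3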
+
+    test           leftextq, leftextq
+    jnz .need_left_ext
+ %if ARCH_X86_64
+    test          rightextq, rightextq
+    jnz .need_right_ext
+ %else
+    cmp            leftextq, r3m ; leftextq == 0
+    jne .need_right_ext
+ %endif
+    v_loop                0, 0, 0
+    jmp .body_done
+
+    ; left/right extensions
+.need_left_ext:
+ %if ARCH_X86_64
+    test          rightextq, rightextq
+ %else
+    mov                  r3, r3m
+    test                 r3, r3
+ %endif
+    jnz .need_left_right_ext
+    v_loop                1, 0, 1
+    jmp .body_done
+
+.need_left_right_ext:
+    v_loop                1, 1, 2
+    jmp .body_done
+
+.need_right_ext:
+    v_loop                0, 1, 3
+
+.body_done:
+; register roles from here on:
+; r0 = bw
+; r1 = x loop counter
+; r4 = y loop counter
+; r5 = topextq
+; r6 = dstq
+; r7 = dstrideq
+; r8 = srcq
+%if ARCH_X86_64
+ %define reg_dstride    dstrideq
+%else
+ %define reg_dstride    r2
+%endif
+    ;
+    ; bottom edge extension
+ %if ARCH_X86_64
+    test         bottomextq, bottomextq
+    jz .top
+ %else
+    xor                  r1, r1
+    cmp                  r1, r4m
+    je .top
+ %endif
+    ;
+ %if ARCH_X86_64
+    mov                srcq, dstq
+    sub                srcq, dstrideq
+    xor                  r1, r1
+ %else
+    mov                  r3, dstq
+    mov         reg_dstride, dstridem
+    sub                  r3, reg_dstride
+    mov                srcm, r3
+ %endif
+    ;
+.bottom_x_loop:
+ %if ARCH_X86_64
+    mova                 m0, [srcq+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, bottomextq
+ %else
+    mov                  r3, srcm
+    mova                 m0, [r3+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, r4m
+ %endif
+    ;
+.bottom_y_loop:
+    mova               [r3], m0
+    add                  r3, reg_dstride
+    dec                  r4
+    jg .bottom_y_loop
+    add                  r1, mmsize
+    cmp                  r1, bwq
+    jl .bottom_x_loop
+
+.top:
+    ; top edge extension
+    test            topextq, topextq
+    jz .end
+%if ARCH_X86_64
+    mov                srcq, reg_blkm
+%else
+    mov                  r3, reg_blkm
+    mov         reg_dstride, dstridem
+%endif
+    mov                dstq, dstm
+    xor                  r1, r1
+    ;
+.top_x_loop:
+%if ARCH_X86_64
+    mova                 m0, [srcq+r1]
+%else
+    mov                  r3, reg_blkm
+    mova                 m0, [r3+r1]
+%endif
+    lea                  r3, [dstq+r1]
+    mov                  r4, topextq
+    ;
+.top_y_loop:
+    mova               [r3], m0
+    add                  r3, reg_dstride
+    dec                  r4
+    jg .top_y_loop
+    add                  r1, mmsize
+    cmp                  r1, bwq
+    jl .top_x_loop
+
+.end:
+    RET
+
+%undef reg_dstride
+%undef reg_blkm
+%undef reg_tmp
--- a/third_party/dav1d/tests/checkasm/x86/checkasm.asm
+++ b/third_party/dav1d/tests/checkasm/x86/checkasm.asm
@@ -195,17 +195,17 @@ cglobal checked_call, 1,7
     xor  r5, n5
     xor  r6, n6
     or   r3, r4
     or   r5, r6
     or   r3, r5
     jz .ok
     mov  r3, eax
     mov  r4, edx
-    lea  r0, [error_message]
+    LEA  r0, error_message
     mov [esp], r0
     call fail_func
     mov  edx, r4
     mov  eax, r3
 .ok:
     add  esp, max_args*4
     RET