Backed out 3 changesets (bug 1509327) for build bustages on a CLOSED TREE
author: Andreea Pavel <apavel@mozilla.com>
date: Fri, 30 Nov 2018 05:16:08 +0200
changeset: 505345:1b5ec7fefbbf8959db1bb438b6491d925074b066
parent: 505344:095c92e4b6b569c2af6e24990a36fac1c4d2e214
child: 505346:d8084f695ac4ffd9673201b8fab65fc86cf19793
push id: 10290
push user: ffxbld-merge
push date: Mon, 03 Dec 2018 16:23:23 +0000
treeherder: mozilla-beta@700bed2445e6
bugs: 1509327
milestone: 65.0a1
backs out: aeb00b8974a472794d513c5884c38a96397334da
           ea27440a2126bc93ca919be0dcd56880b8803bd7
           9881fd46fde2d15b213c7c4b36fdc6d39df9ec34
Backed out 3 changesets (bug 1509327) for build bustages on a CLOSED TREE

Backed out changeset aeb00b8974a4 (bug 1509327)
Backed out changeset ea27440a2126 (bug 1509327)
Backed out changeset 9881fd46fde2 (bug 1509327)

files:
dom/media/platforms/agnostic/DAV1DDecoder.cpp
dom/media/platforms/agnostic/DAV1DDecoder.h
media/libdav1d/README_MOZILLA
media/libdav1d/moz.build
media/libdav1d/moz.yaml
third_party/dav1d/.gitlab-ci.yml
third_party/dav1d/doc/Doxyfile.in
third_party/dav1d/doc/meson.build
third_party/dav1d/include/dav1d/common.h
third_party/dav1d/include/dav1d/data.h
third_party/dav1d/include/dav1d/dav1d.h
third_party/dav1d/include/dav1d/headers.h
third_party/dav1d/include/dav1d/picture.h
third_party/dav1d/meson.build
third_party/dav1d/src/arm/64/looprestoration.S
third_party/dav1d/src/arm/64/mc.S
third_party/dav1d/src/arm/asm.S
third_party/dav1d/src/arm/looprestoration_init_tmpl.c
third_party/dav1d/src/arm/mc_init_tmpl.c
third_party/dav1d/src/cdef_apply_tmpl.c
third_party/dav1d/src/cdf.c
third_party/dav1d/src/cdf.h
third_party/dav1d/src/cpu.h
third_party/dav1d/src/data.c
third_party/dav1d/src/decode.c
third_party/dav1d/src/env.h
third_party/dav1d/src/film_grain.h
third_party/dav1d/src/film_grain_tmpl.c
third_party/dav1d/src/internal.h
third_party/dav1d/src/intra_edge.c
third_party/dav1d/src/ipred.h
third_party/dav1d/src/itx.h
third_party/dav1d/src/itx_1d.c
third_party/dav1d/src/itx_tmpl.c
third_party/dav1d/src/levels.h
third_party/dav1d/src/lf_apply_tmpl.c
third_party/dav1d/src/lf_mask.c
third_party/dav1d/src/lf_mask.h
third_party/dav1d/src/lib.c
third_party/dav1d/src/looprestoration.h
third_party/dav1d/src/looprestoration_tmpl.c
third_party/dav1d/src/lr_apply_tmpl.c
third_party/dav1d/src/mc.h
third_party/dav1d/src/mc_tmpl.c
third_party/dav1d/src/meson.build
third_party/dav1d/src/obu.c
third_party/dav1d/src/obu.h
third_party/dav1d/src/picture.c
third_party/dav1d/src/picture.h
third_party/dav1d/src/recon.h
third_party/dav1d/src/recon_tmpl.c
third_party/dav1d/src/ref.c
third_party/dav1d/src/ref.h
third_party/dav1d/src/ref_mvs.c
third_party/dav1d/src/ref_mvs.h
third_party/dav1d/src/tables.c
third_party/dav1d/src/tables.h
third_party/dav1d/src/thread_task.c
third_party/dav1d/src/warpmv.c
third_party/dav1d/src/warpmv.h
third_party/dav1d/src/x86/mc_init_tmpl.c
third_party/dav1d/src/x86/mc_ssse3.asm
third_party/dav1d/tests/checkasm/checkasm.c
third_party/dav1d/tests/checkasm/ipred.c
third_party/dav1d/tests/checkasm/loopfilter.c
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c
third_party/dav1d/tests/meson.build
third_party/dav1d/tools/dav1d.c
third_party/dav1d/tools/dav1d_cli_parse.c
third_party/dav1d/tools/input/ivf.c
third_party/dav1d/tools/output/md5.c
third_party/dav1d/tools/output/y4m2.c
--- a/dom/media/platforms/agnostic/DAV1DDecoder.cpp
+++ b/dom/media/platforms/agnostic/DAV1DDecoder.cpp
@@ -13,16 +13,18 @@
 
 namespace mozilla {
 
 DAV1DDecoder::DAV1DDecoder(const CreateDecoderParams& aParams)
     : mInfo(aParams.VideoConfig()),
       mTaskQueue(aParams.mTaskQueue),
       mImageContainer(aParams.mImageContainer) {}
 
+DAV1DDecoder::~DAV1DDecoder() {}
+
 RefPtr<MediaDataDecoder::InitPromise> DAV1DDecoder::Init() {
   Dav1dSettings settings;
   dav1d_default_settings(&settings);
   int decoder_threads = 2;
   if (mInfo.mDisplay.width >= 2048) {
     decoder_threads = 8;
   } else if (mInfo.mDisplay.width >= 1024) {
     decoder_threads = 4;
@@ -70,32 +72,32 @@ void DAV1DDecoder::ReleaseDataBuffer(con
   Unused << rv;
 }
 
 RefPtr<MediaDataDecoder::DecodePromise> DAV1DDecoder::InvokeDecode(
     MediaRawData* aSample) {
   MOZ_ASSERT(mTaskQueue->IsCurrentThreadIn());
   MOZ_ASSERT(aSample);
 
+  // Save the last timing values to use in drain.
+  mLastTimecode = aSample->mTimecode;
+  mLastDuration = aSample->mDuration;
+  mLastOffset = aSample->mOffset;
   // Add the buffer to the hashtable in order to increase
   // the ref counter and keep it alive. When dav1d does not
   // need it any more will call it's release callback. Remove
   // the buffer, in there, to reduce the ref counter and eventually
   // free it. We need a hashtable and not an array because the
   // release callback are not coming in the same order that the
   // buffers have been added in the decoder (threading ordering
   // inside decoder)
   mDecodingBuffers.Put(aSample->Data(), aSample);
   Dav1dData data;
   int res = dav1d_data_wrap(&data, aSample->Data(), aSample->Size(),
                             ReleaseDataBuffer_s, this);
-  data.m.timestamp = aSample->mTimecode.ToMicroseconds();
-  data.m.duration = aSample->mDuration.ToMicroseconds();
-  data.m.offset = aSample->mOffset;
-
   if (res < 0) {
     LOG("Create decoder data error.");
     return DecodePromise::CreateAndReject(
         MediaResult(NS_ERROR_OUT_OF_MEMORY, __func__), __func__);
   }
   DecodedData results;
   do {
     res = dav1d_send_data(mContext, &data);
@@ -105,33 +107,34 @@ RefPtr<MediaDataDecoder::DecodePromise> 
           MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR, __func__), __func__);
     }
     // Alway consume the whole buffer on success.
     // At this point only -EAGAIN error is expected.
     MOZ_ASSERT((res == 0 && !data.sz) ||
                (res == -EAGAIN && data.sz == aSample->Size()));
 
     MediaResult rs(NS_OK);
-    res = GetPicture(results, rs);
+    res = GetPicture(aSample, results, rs);
     if (res < 0) {
       if (res == -EAGAIN) {
         // No frames ready to return. This is not an
         // error, in some circumstances, we need to
         // feed it with a certain amount of frames
         // before we get a picture.
         continue;
       }
       return DecodePromise::CreateAndReject(rs, __func__);
     }
   } while (data.sz > 0);
 
   return DecodePromise::CreateAndResolve(std::move(results), __func__);
 }
 
-int DAV1DDecoder::GetPicture(DecodedData& aData, MediaResult& aResult) {
+int DAV1DDecoder::GetPicture(const MediaRawData* aSample, DecodedData& aData,
+                             MediaResult& aResult) {
   class Dav1dPictureWrapper {
    public:
     Dav1dPicture* operator&() { return &p; }
     const Dav1dPicture& operator*() const { return p; }
     ~Dav1dPictureWrapper() { dav1d_picture_unref(&p); }
 
    private:
     Dav1dPicture p = Dav1dPicture();
@@ -144,102 +147,93 @@ int DAV1DDecoder::GetPicture(DecodedData
     aResult = MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR, __func__);
     return res;
   }
 
   if ((*picture).p.layout == DAV1D_PIXEL_LAYOUT_I400) {
     return 0;
   }
 
-  RefPtr<VideoData> v = ConstructImage(*picture);
+  RefPtr<VideoData> v = ConstructImage(aSample, *picture);
   if (!v) {
     LOG("Image allocation error: %ux%u"
         " display %ux%u picture %ux%u",
         (*picture).p.w, (*picture).p.h, mInfo.mDisplay.width,
         mInfo.mDisplay.height, mInfo.mImage.width, mInfo.mImage.height);
     aResult = MediaResult(NS_ERROR_OUT_OF_MEMORY, __func__);
     return -1;
   }
   aData.AppendElement(std::move(v));
   return 0;
 }
 
 already_AddRefed<VideoData> DAV1DDecoder::ConstructImage(
-    const Dav1dPicture& aPicture) {
+    const MediaRawData* aSample, const Dav1dPicture& picture) {
   VideoData::YCbCrBuffer b;
-  if (aPicture.p.bpc == 10) {
+  if (picture.p.bpc == 10) {
     b.mColorDepth = ColorDepth::COLOR_10;
-  } else if (aPicture.p.bpc == 12) {
+  } else if (picture.p.bpc == 12) {
     b.mColorDepth = ColorDepth::COLOR_12;
   }
-
-  // On every other case use the default (BT601).
-  if (aPicture.seq_hdr->color_description_present) {
-    if (aPicture.seq_hdr->pri == DAV1D_COLOR_PRI_BT709) {
-      b.mYUVColorSpace = YUVColorSpace::BT709;
-    }
-  } else if (aPicture.p.h >= 720) {
-    b.mYUVColorSpace = YUVColorSpace::BT709;
-  }
-
-  b.mPlanes[0].mData = static_cast<uint8_t*>(aPicture.data[0]);
-  b.mPlanes[0].mStride = aPicture.stride[0];
-  b.mPlanes[0].mHeight = aPicture.p.h;
-  b.mPlanes[0].mWidth = aPicture.p.w;
+  b.mPlanes[0].mData = static_cast<uint8_t*>(picture.data[0]);
+  b.mPlanes[0].mStride = picture.stride[0];
+  b.mPlanes[0].mHeight = picture.p.h;
+  b.mPlanes[0].mWidth = picture.p.w;
   b.mPlanes[0].mOffset = 0;
   b.mPlanes[0].mSkip = 0;
 
-  b.mPlanes[1].mData = static_cast<uint8_t*>(aPicture.data[1]);
-  b.mPlanes[1].mStride = aPicture.stride[1];
+  b.mPlanes[1].mData = static_cast<uint8_t*>(picture.data[1]);
+  b.mPlanes[1].mStride = picture.stride[1];
   b.mPlanes[1].mOffset = 0;
   b.mPlanes[1].mSkip = 0;
 
-  b.mPlanes[2].mData = static_cast<uint8_t*>(aPicture.data[2]);
-  b.mPlanes[2].mStride = aPicture.stride[1];
+  b.mPlanes[2].mData = static_cast<uint8_t*>(picture.data[2]);
+  b.mPlanes[2].mStride = picture.stride[1];
   b.mPlanes[2].mOffset = 0;
   b.mPlanes[2].mSkip = 0;
 
   // https://code.videolan.org/videolan/dav1d/blob/master/tools/output/yuv.c#L67
-  const int ss_ver = aPicture.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-  const int ss_hor = aPicture.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+  const int ss_ver = picture.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+  const int ss_hor = picture.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 
-  b.mPlanes[1].mHeight = (aPicture.p.h + ss_ver) >> ss_ver;
-  b.mPlanes[1].mWidth = (aPicture.p.w + ss_hor) >> ss_hor;
+  b.mPlanes[1].mHeight = (picture.p.h + ss_ver) >> ss_ver;
+  b.mPlanes[1].mWidth = (picture.p.w + ss_hor) >> ss_hor;
 
-  b.mPlanes[2].mHeight = (aPicture.p.h + ss_ver) >> ss_ver;
-  b.mPlanes[2].mWidth = (aPicture.p.w + ss_hor) >> ss_hor;
+  b.mPlanes[2].mHeight = (picture.p.h + ss_ver) >> ss_ver;
+  b.mPlanes[2].mWidth = (picture.p.w + ss_hor) >> ss_hor;
 
   // Timestamp, duration and offset used here are wrong.
   // We need to take those values from the decoder. Latest
   // dav1d version allows for that.
-  media::TimeUnit timecode =
-      media::TimeUnit::FromMicroseconds(aPicture.m.timestamp);
-  media::TimeUnit duration =
-      media::TimeUnit::FromMicroseconds(aPicture.m.duration);
-  int64_t offset = aPicture.m.offset;
-  bool keyframe = aPicture.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY;
-
   return VideoData::CreateAndCopyData(
-      mInfo, mImageContainer, offset, timecode, duration, b, keyframe, timecode,
-      mInfo.ScaledImageRect(aPicture.p.w, aPicture.p.h));
+      mInfo, mImageContainer, aSample->mOffset, aSample->mTime,
+      aSample->mDuration, b, aSample->mKeyframe, aSample->mTimecode,
+      mInfo.ScaledImageRect(picture.p.w, picture.p.h));
 }
 
 RefPtr<MediaDataDecoder::DecodePromise> DAV1DDecoder::Drain() {
   RefPtr<DAV1DDecoder> self = this;
   return InvokeAsync(mTaskQueue, __func__, [self, this] {
     int res = 0;
     DecodedData results;
     do {
+      RefPtr<MediaRawData> empty(new MediaRawData());
+      // Update last timecode in case we loop over.
+      empty->mTimecode = empty->mTime = mLastTimecode =
+          mLastTimecode + mLastDuration;
+      empty->mDuration = mLastDuration;
+      empty->mOffset = mLastOffset;
+
       MediaResult rs(NS_OK);
-      res = GetPicture(results, rs);
+      res = GetPicture(empty, results, rs);
       if (res < 0 && res != -EAGAIN) {
         return DecodePromise::CreateAndReject(rs, __func__);
       }
     } while (res != -EAGAIN);
-    return DecodePromise::CreateAndResolve(std::move(results), __func__);
+    return DecodePromise::CreateAndResolve(results, __func__);
   });
 }
 
 RefPtr<MediaDataDecoder::FlushPromise> DAV1DDecoder::Flush() {
   RefPtr<DAV1DDecoder> self = this;
   return InvokeAsync(mTaskQueue, __func__, [self]() {
     dav1d_flush(self->mContext);
     return FlushPromise::CreateAndResolve(true, __func__);
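Editor's note: the Drain() path restored above has no per-frame timing from dav1d (the backed-out patches carried it via Dav1dDataProps), so the decoder replays the timing of the last real sample, advancing the timecode by the last known duration for every synthetic empty sample it feeds to GetPicture(). A minimal standalone sketch of that bookkeeping, using plain microsecond counters instead of Gecko's media::TimeUnit and a stubbed picture source (both are this sketch's assumptions, not part of the patch):

    #include <cerrno>
    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the members the restored decoder keeps between Decode() calls.
    struct DrainState {
      int64_t lastTimecodeUs = 0;  // mLastTimecode
      int64_t lastDurationUs = 0;  // mLastDuration
      int64_t lastOffset = 0;      // mLastOffset
    };

    // Hypothetical stand-in for GetPicture(): report 0 while frames are still
    // buffered in the decoder, then -EAGAIN once it has been drained.
    static int GetBufferedPicture(int& framesLeft) {
      return framesLeft-- > 0 ? 0 : -EAGAIN;
    }

    int main() {
      DrainState s{90'000, 33'333, 4096};  // timing of the last real sample
      int framesLeft = 3;                  // pretend the decoder still holds 3 frames
      int res = 0;
      do {
        // Mirror of the restored Drain() loop: each synthetic "empty sample"
        // advances the timecode by the last known duration.
        s.lastTimecodeUs += s.lastDurationUs;
        const int64_t emptyTimecodeUs = s.lastTimecodeUs;
        res = GetBufferedPicture(framesLeft);
        if (res == 0) {
          std::printf("drained frame at %lld us (offset %lld)\n",
                      (long long)emptyTimecodeUs, (long long)s.lastOffset);
        }
      } while (res != -EAGAIN);
      return 0;
    }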
--- a/dom/media/platforms/agnostic/DAV1DDecoder.h
+++ b/dom/media/platforms/agnostic/DAV1DDecoder.h
@@ -28,27 +28,35 @@ class DAV1DDecoder : public MediaDataDec
   RefPtr<ShutdownPromise> Shutdown() override;
   nsCString GetDescriptionName() const override {
     return NS_LITERAL_CSTRING("av1 libdav1d video decoder");
   }
 
   void ReleaseDataBuffer(const uint8_t* buf);
 
  private:
-  ~DAV1DDecoder() = default;
+  ~DAV1DDecoder();
   RefPtr<DecodePromise> InvokeDecode(MediaRawData* aSample);
-  int GetPicture(DecodedData& aData, MediaResult& aResult);
-  already_AddRefed<VideoData> ConstructImage(const Dav1dPicture& aPicture);
+  int GetPicture(const MediaRawData* aSample, DecodedData& aData,
+                 MediaResult& aResult);
+  already_AddRefed<VideoData> ConstructImage(const MediaRawData* aSample,
+                                             const Dav1dPicture&);
 
   Dav1dContext* mContext;
 
   const VideoInfo& mInfo;
   const RefPtr<TaskQueue> mTaskQueue;
   const RefPtr<layers::ImageContainer> mImageContainer;
 
   // Keep the buffers alive until dav1d
   // does not need them any more.
   MediaRawDataHashtable mDecodingBuffers;
+
+  // Store the last timing values to use
+  // them during drain.
+  media::TimeUnit mLastTimecode;
+  media::TimeUnit mLastDuration;
+  int64_t mLastOffset = 0;
 };
 
 }  // namespace mozilla
 
 #endif  // DAV1DDecoder_h_
--- a/media/libdav1d/README_MOZILLA
+++ b/media/libdav1d/README_MOZILLA
@@ -16,9 +16,9 @@ To update to a specific upstream git tag
 The upstream git repository is https://aomedia.googlesource.com/aom
 
 To update to a fork, use
 
   ./mach vendor dav1d --repo <repository url> [-r <commit>]
 
 The last update was pulled from https://code.videolan.org/videolan/dav1d
 
-The git commit ID used was 197a19ad702d5e7472852efcde98feeb07f373e0 (2018-11-26T12:15:41.000Z).
+The git commit ID used was 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0 (2018-10-25T16:51:31.000Z).
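Editor's note: following the README's fork/commit form, re-vendoring the exact revision recorded above would look like `./mach vendor dav1d --repo https://code.videolan.org/videolan/dav1d -r 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0`.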
--- a/media/libdav1d/moz.build
+++ b/media/libdav1d/moz.build
@@ -5,17 +5,16 @@
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 Library('dav1d')
 
 LOCAL_INCLUDES += [
     '/third_party/dav1d',
     '/third_party/dav1d/include',
     '/third_party/dav1d/include/dav1d',
-    '/third_party/dav1d/src',
 ]
 
 if CONFIG['CPU_ARCH'] == 'x86':
     if CONFIG['OS_TARGET'] == 'Android':
         LOCAL_INCLUDES += ['/media/libdav1d/config/x86_32/android/']
         EXPORTS.dav1d += ['config/x86_32/android/config.h']
     else:
         LOCAL_INCLUDES += ['/media/libdav1d/config/x86_32/']
@@ -69,17 +68,16 @@ SOURCES += [
 # includes src
 EXPORTS.dav1d.src += [
     '../../third_party/dav1d/src/cdf.h',
     '../../third_party/dav1d/src/cpu.h',
     '../../third_party/dav1d/src/ctx.h',
     '../../third_party/dav1d/src/data.h',
     '../../third_party/dav1d/src/decode.h',
     '../../third_party/dav1d/src/dequant_tables.h',
-    '../../third_party/dav1d/src/film_grain.h',
     '../../third_party/dav1d/src/getbits.h',
     '../../third_party/dav1d/src/intra_edge.h',
     '../../third_party/dav1d/src/lf_mask.h',
     '../../third_party/dav1d/src/msac.h',
     '../../third_party/dav1d/src/obu.h',
     '../../third_party/dav1d/src/picture.h',
     '../../third_party/dav1d/src/qm.h',
     '../../third_party/dav1d/src/ref.h',
@@ -91,17 +89,16 @@ EXPORTS.dav1d.src += [
     '../../third_party/dav1d/src/wedge.h',
 ]
 
 # common BITDEPTH 8, 10
 relative_path = '../../third_party/dav1d/src/'
 bitdepth_basenames = [
     'cdef_apply_tmpl.c',
     'cdef_tmpl.c',
-    'film_grain_tmpl.c',
     'ipred_prepare_tmpl.c',
     'ipred_tmpl.c',
     'itx_tmpl.c',
     'lf_apply_tmpl.c',
     'loopfilter_tmpl.c',
     'looprestoration_tmpl.c',
     'lr_apply_tmpl.c',
     'mc_tmpl.c',
@@ -156,17 +153,16 @@ EXPORTS.dav1d += [
     '../../third_party/dav1d/include/common/validate.h',
 ]
 
 # include/dav1d
 EXPORTS.dav1d += [
    '../../third_party/dav1d/include/dav1d/common.h',
    '../../third_party/dav1d/include/dav1d/data.h',
    '../../third_party/dav1d/include/dav1d/dav1d.h',
-   '../../third_party/dav1d/include/dav1d/headers.h',
    '../../third_party/dav1d/include/dav1d/picture.h',
 ]
 
 if CONFIG['OS_TARGET'] == 'WINNT':
     RCFILE = 'dav1d.rc'
     SOURCES += [
         '../../third_party/dav1d/src/win32/thread.c'
     ]
@@ -176,9 +172,10 @@ if CONFIG['CC_TYPE'] == 'msvc':
     EXPORTS.dav1d += ['../../third_party/dav1d/include/compat/msvc/stdatomic.h']
 
 if CONFIG['CC_TYPE'] == 'gcc':
     LOCAL_INCLUDES += ['../../third_party/dav1d/include/compat/gcc/']
     EXPORTS.dav1d += ['../../third_party/dav1d/include/compat/gcc/stdatomic.h']
 
 FINAL_LIBRARY = 'gkmedias'
 
-DisableCompilerWarnings();
+# We allow warnings for third-party code that can be updated from upstream.
+AllowCompilerWarnings()
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -15,15 +15,15 @@ origin:
   description: dav1d, a fast AV1 decoder
 
   # Full URL for the package's homepage/etc
   # Usually different from repository url
   url: https://code.videolan.org/videolan/dav1d
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit 9e08ac7112b6a4fef6e1dde6152ceef1117aa6f4
+  release: commit 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
   # Multiple licenses can be specified (as a YAML list)
   # A "LICENSE" file must exist containing the full license text
   license: BSD-2-Clause
--- a/third_party/dav1d/.gitlab-ci.yml
+++ b/third_party/dav1d/.gitlab-ci.yml
@@ -4,110 +4,109 @@ stages:
 
 build-debian:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - debian
         - amd64
     script:
-        - meson build --buildtype release --werror
+        - env CFLAGS='-Werror' meson build --buildtype release
         - ninja -C build
         - cd build && meson test -v
 
 build-debian-static:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - debian
         - amd64
     script:
-        - meson build --buildtype release --default-library static --werror
+        - env CFLAGS='-Werror' meson build --buildtype release --default-library static
         - ninja -C build
         - cd build && meson test -v
 
 build-win32:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - win32
     script:
-        - meson build --buildtype release
-                      --werror
-                      --libdir lib
-                      --prefix "$(pwd)/build/dav1d_install"
-                      --cross-file /opt/crossfiles/i686-w64-mingw32.meson
-                      -Ddefault_library=both
+        - env CFLAGS='-Werror'
+            meson build --buildtype release
+                        --libdir lib
+                        --prefix "$(pwd)/build/dav1d_install"
+                        --cross-file /opt/crossfiles/i686-w64-mingw32.meson
+                        -Ddefault_library=both
         - ninja -C build
         - ninja -C build install
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
 
 build-win64:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - win64
     script:
-        - meson build --buildtype release
-                      --werror
-                      --libdir lib
-                      --prefix "$(pwd)/build/dav1d_install"
-                      --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
-                      -Ddefault_library=both
+        - env CFLAGS='-Werror'
+            meson build --buildtype release
+                        --libdir lib
+                        --prefix "$(pwd)/build/dav1d_install"
+                        --cross-file /opt/crossfiles/x86_64-w64-mingw32.meson
+                        -Ddefault_library=both
         - ninja -C build
         - ninja -C build install
     artifacts:
         name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
         paths:
             - build/dav1d_install/
         expire_in: 1 week
 
 build-debian-aarch64:
     stage: build
-    image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
     tags:
         - aarch64
         - debian
     script:
-        - meson build --buildtype release --werror
+        - env CFLAGS='-Werror' meson build --buildtype release
         - ninja -C build
         - cd build && meson test -v
 
 build-debian-aarch64-clang-5:
     stage: build
-    image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
     tags:
         - aarch64
+        - clang5
         - debian
     script:
         - env CC=clang-5.0 CFLAGS='-integrated-as' meson build --buildtype release
         - ninja -C build
         - cd build && meson test -v
 
 build-macos:
     stage: build
     tags:
         - macos
     script:
-        - meson build --buildtype release -Ddefault_library=both --werror
+        - env CFLAGS='-Werror' meson build --buildtype release -Ddefault_library=both
         - ninja -C build
         - cd build && meson test -v
 
 build-debian-werror:
-    image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
+    image: dav1d-debian-aarch64:201810240631
     stage: build
     tags:
         - aarch64
         - debian
     script:
-        - env CC='clang-7' meson build --buildtype debug --werror
+        - env CC='clang-7' CFLAGS='-Werror' meson build -Dbuild_tests=false
         - ninja -C build
 
 test-debian:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: test
     tags:
         - debian
         - amd64
@@ -118,71 +117,8 @@ test-debian:
     script:
         - test -d cache || mkdir cache
         - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
         - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
         - git clone cache/dav1d-test-data.git tests/dav1d-test-data
         - meson build --buildtype release -Dtestdata_tests=true
         - ninja -C build
         - cd build && time meson test -v
-
-test-debian-asan:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
-    stage: test
-    tags:
-        - debian
-        - amd64
-    cache:
-        key: testdata.git
-        paths:
-            - cache/dav1d-test-data.git/
-    variables:
-        ASAN_OPTIONS: 'detect_leaks=0'
-    script:
-        - test -d cache || mkdir cache
-        - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
-        - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
-        - git clone cache/dav1d-test-data.git tests/dav1d-test-data
-        - meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=address -Dbuild_asm=false
-        - ninja -C build
-        - cd build && time meson test -v --setup=sanitizer
-
-test-debian-msan:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
-    stage: test
-    tags:
-        - debian
-        - amd64
-    cache:
-        key: testdata.git
-        paths:
-            - cache/dav1d-test-data.git/
-    variables:
-        MSAN_OPTIONS: 'exitcode=1'
-    script:
-        - test -d cache || mkdir cache
-        - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
-        - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
-        - git clone cache/dav1d-test-data.git tests/dav1d-test-data
-        - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=memory -Db_lundef=false -Dbuild_asm=false
-        - ninja -C build
-        - cd build && time meson test -v --setup=sanitizer
-
-test-debian-ubsan:
-    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
-    stage: test
-    tags:
-        - debian
-        - amd64
-    cache:
-        key: testdata.git
-        paths:
-            - cache/dav1d-test-data.git/
-    variables:
-        UBSAN_OPTIONS: 'print_stacktrace=1:halt_on_error=1'
-    script:
-        - test -d cache || mkdir cache
-        - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
-        - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
-        - git clone cache/dav1d-test-data.git tests/dav1d-test-data
-        - env CC=clang meson build --buildtype debugoptimized -Dtestdata_tests=true -Db_sanitize=undefined -Db_lundef=false -Dbuild_asm=false
-        - ninja -C build
-        - cd build && time meson test -v --setup=sanitizer
deleted file mode 100644
--- a/third_party/dav1d/doc/Doxyfile.in
+++ /dev/null
@@ -1,19 +0,0 @@
-PROJECT_NAME            = dav1d
-OUTPUT_DIRECTORY        = @DOXYGEN_OUTPUT@
-STRIP_FROM_PATH         = @DOXYGEN_STRIP@
-OUTPUT_LANGUAGE         = English
-TAB_SIZE                = 4
-EXTRACT_ALL             = YES
-OPTIMIZE_OUTPUT_FOR_C   = YES
-DOXYFILE_ENCODING       = UTF-8
-TYPEDEF_HIDES_STRUCT    = YES
-
-QUIET                   = YES
-WARNINGS                = YES
-WARN_IF_UNDOCUMENTED    = YES
-
-INPUT                   = @DOXYGEN_INPUT@
-FILE_PATTERNS           = *.h
-
-GENERATE_HTML           = YES
-GENERATE_LATEX          = NO
deleted file mode 100644
--- a/third_party/dav1d/doc/meson.build
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright © 2018, VideoLAN and dav1d authors
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# 1. Redistributions of source code must retain the above copyright notice, this
-#    list of conditions and the following disclaimer.
-#
-# 2. Redistributions in binary form must reproduce the above copyright notice,
-#    this list of conditions and the following disclaimer in the documentation
-#    and/or other materials provided with the distribution.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-doxygen = find_program('doxygen', required: false)
-
-if doxygen.found()
-    conf_data = configuration_data()
-    conf_data.set('DOXYGEN_INPUT', join_paths(meson.source_root(), 'include/dav1d'))
-    conf_data.set('DOXYGEN_STRIP', join_paths(meson.source_root(), 'include'))
-    conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir())
-    doxyfile = configure_file(input: 'Doxyfile.in',
-                              output: 'Doxyfile',
-                              configuration: conf_data)
-
-    custom_target('doc',
-                  build_by_default: false,
-                  command: [doxygen, doxyfile],
-                  output: ['html']
-    )
-endif
-
--- a/third_party/dav1d/include/dav1d/common.h
+++ b/third_party/dav1d/include/dav1d/common.h
@@ -20,42 +20,24 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef __DAV1D_COMMON_H__
-#define __DAV1D_COMMON_H__
-
-#include <stddef.h>
-#include <stdint.h>
+#ifndef __COMMON_H__
+#define __COMMON_H__
 
 #ifndef DAV1D_API
     #if defined _WIN32
       #define DAV1D_API __declspec(dllexport)
     #else
       #if __GNUC__ >= 4
         #define DAV1D_API __attribute__ ((visibility ("default")))
       #else
         #define DAV1D_API
       #endif
     #endif
 #endif
 
-/**
- * Input packet metadata which are copied from the input data used to
- * decode each image into the matching structure of the output image
- * returned back to the user. Since these are metadata fields, they
- * can be used for other purposes than the documented ones, they will
- * still be passed from input data to output picture without being
- * used internally.
- */
-typedef struct Dav1dDataProps {
-    int64_t timestamp; ///< container timestamp of input data, default INT64_MIN
-    uint64_t duration; ///< container duration of input data, default -1
-    int64_t offset; ///< stream offset of input data, default INT64_MIN
-    size_t size; ///< packet size, default Dav1dData.sz
-} Dav1dDataProps;
-
-#endif // __DAV1D_COMMON_H__
+#endif // __COMMON_H__
--- a/third_party/dav1d/include/dav1d/data.h
+++ b/third_party/dav1d/include/dav1d/data.h
@@ -32,26 +32,25 @@
 #include <stdint.h>
 
 #include "common.h"
 
 typedef struct Dav1dData {
     const uint8_t *data; ///< data pointer
     size_t sz; ///< data size
     struct Dav1dRef *ref; ///< allocation origin
-    Dav1dDataProps m;
 } Dav1dData;
 
 /**
  * Allocate data.
  *
  * @param data Input context.
  * @param   sz Size of the data that should be allocated.
  *
- * @return Pointer to the allocated buffer on success. NULL on error.
+ * @return Pointer to the allocated bufferon success. NULL on error.
  */
 DAV1D_API uint8_t * dav1d_data_create(Dav1dData *data, size_t sz);
 
 /**
  * Wrap an existing data array.
  *
  * @param          data Input context.
  * @param           buf The data to be wrapped.
--- a/third_party/dav1d/include/dav1d/dav1d.h
+++ b/third_party/dav1d/include/dav1d/dav1d.h
@@ -36,26 +36,20 @@ extern "C" {
 
 #include "common.h"
 #include "picture.h"
 #include "data.h"
 
 typedef struct Dav1dContext Dav1dContext;
 typedef struct Dav1dRef Dav1dRef;
 
-#define DAV1D_MAX_FRAME_THREADS 256
-#define DAV1D_MAX_TILE_THREADS 64
-
 typedef struct Dav1dSettings {
     int n_frame_threads;
     int n_tile_threads;
     Dav1dPicAllocator allocator;
-    int apply_grain;
-    int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
-    int all_layers; ///< output all spatial layers of a scalable AV1 biststream
 } Dav1dSettings;
 
 /**
  * Get library version.
  */
 DAV1D_API const char *dav1d_version(void);
 
 /**
@@ -75,32 +69,16 @@ DAV1D_API void dav1d_default_settings(Da
  * @note The context must be freed using dav1d_close() when decoding is
  *       finished.
  *
  * @return 0 on success, or < 0 (a negative errno code) on error.
  */
 DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
 
 /**
- * Parse a Sequence Header OBU from bitstream data.
- *
- * @param out Output Sequence Header.
- * @param buf The data to be parser.
- * @param sz  Size of the data.
- *
- * @return 0 on success, or < 0 (a negative errno code) on error.
- *
- * @note It is safe to feed this function data containing other OBUs than a
- *       Sequence Header, as they will simply be ignored. If there is more than
- *       one Sequence Header OBU present, only the last will be returned.
- */
-DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
-                                          const uint8_t *buf, const size_t sz);
-
-/**
  * Feed bitstream data to the decoder.
  *
  * @param   c Input decoder instance.
  * @param  in Input bitstream data. On success, ownership of the reference is
  *            passed to the library.
  *
  * @return
  *         0: Success, and the data was consumed.
@@ -123,68 +101,30 @@ DAV1D_API int dav1d_send_data(Dav1dConte
  *         0: Success, and a frame is returned.
  *   -EAGAIN: Not enough data to output a frame. dav1d_send_data() should be
  *            called with new input.
  *   other negative errno codes: Error during decoding or because of invalid
  *                               passed-in arguments.
  *
  * @note To drain buffered frames from the decoder (i.e. on end of stream),
  *       call this function until it returns -EAGAIN.
- *
- * @code{.c}
- *  Dav1dData data = { 0 };
- *  Dav1dPicture p = { 0 };
- *  int res;
- *
- *  read_data(&data);
- *  do {
- *      res = dav1d_send_data(c, &data);
- *      // Keep going even if the function can't consume the current data
- *         packet. It eventually will after one or more frames have been
- *         returned in this loop.
- *      if (res < 0 && res != -EAGAIN)
- *          free_and_abort();
- *      res = dav1d_get_picture(c, &p);
- *      if (res < 0) {
- *          if (res != -EAGAIN)
- *              free_and_abort();
- *      } else
- *          output_and_unref_picture(&p);
- *  // Stay in the loop as long as there's data to consume.
- *  } while (data.sz || read_data(&data) == SUCCESS);
- *
- *  // Handle EOS by draining all buffered frames.
- *  do {
- *      res = dav1d_get_picture(c, &p);
- *      if (res < 0) {
- *          if (res != -EAGAIN)
- *              free_and_abort();
- *      } else
- *          output_and_unref_picture(&p);
- *  } while (res == 0);
- * @endcode
  */
 DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out);
 
 /**
  * Close a decoder instance and free all associated memory.
  *
  * @param c_out The decoder instance to close. *c_out will be set to NULL.
  */
 DAV1D_API void dav1d_close(Dav1dContext **c_out);
 
 /**
- * Flush all delayed frames in decoder and clear internal decoder state,
- * to be used when seeking.
+ * Flush all delayed frames in decoder, to be used when seeking.
  *
  * @param c Input decoder instance.
- *
- * @note Decoding will start only after a valid sequence header OBU is
- *       delivered to dav1d_send_data().
- *
  */
 DAV1D_API void dav1d_flush(Dav1dContext *c);
 
 # ifdef __cplusplus
 }
 # endif
 
 #endif /* __DAV1D_H__ */
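Editor's note: the @code example removed above showed the full decode loop, but the drain contract it relied on is still documented in the restored header (keep calling dav1d_get_picture() until it returns -EAGAIN). A condensed sketch of that loop against the restored API follows; read_more_data() and consume_picture() are hypothetical application-side hooks, and the <dav1d/dav1d.h> include path assumes an installed copy of the headers as restored by this backout:

    #include <cerrno>
    #include <dav1d/dav1d.h>

    // Hypothetical application hooks, not part of dav1d: read_more_data() fills
    // `data` (e.g. via dav1d_data_create()/dav1d_data_wrap()) and returns false
    // at end of stream; consume_picture() hands the frame to the caller.
    bool read_more_data(Dav1dData* data);
    void consume_picture(const Dav1dPicture& pic);

    void decode_stream(Dav1dContext* c) {
      Dav1dData data = {};
      Dav1dPicture pic = {};
      int res;
      while (read_more_data(&data)) {
        do {
          res = dav1d_send_data(c, &data);        // 0 or -EAGAIN per the docs above
          if (res < 0 && res != -EAGAIN) return;  // real decode error
          res = dav1d_get_picture(c, &pic);
          if (res == 0) {
            consume_picture(pic);
            dav1d_picture_unref(&pic);
          } else if (res != -EAGAIN) {
            return;                               // real decode error
          }
        } while (data.sz > 0);                    // loop until the packet is consumed
      }
      // End of stream: drain buffered frames until the decoder reports -EAGAIN.
      do {
        res = dav1d_get_picture(c, &pic);
        if (res == 0) {
          consume_picture(pic);
          dav1d_picture_unref(&pic);
        }
      } while (res == 0);
    }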
deleted file mode 100644
--- a/third_party/dav1d/include/dav1d/headers.h
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __DAV1D_HEADERS_H__
-#define __DAV1D_HEADERS_H__
-
-// Constants from Section 3. "Symbols and abbreviated terms"
-#define DAV1D_MAX_CDEF_STRENGTHS 8
-#define DAV1D_MAX_OPERATING_POINTS 32
-#define DAV1D_MAX_TILE_COLS 64
-#define DAV1D_MAX_TILE_ROWS 64
-#define DAV1D_MAX_SEGMENTS 8
-#define DAV1D_NUM_REF_FRAMES 8
-#define DAV1D_PRIMARY_REF_NONE 7
-#define DAV1D_REFS_PER_FRAME 7
-#define DAV1D_TOTAL_REFS_PER_FRAME (DAV1D_REFS_PER_FRAME + 1)
-
-enum Dav1dTxfmMode {
-    DAV1D_TX_4X4_ONLY,
-    DAV1D_TX_LARGEST,
-    DAV1D_TX_SWITCHABLE,
-    DAV1D_N_TX_MODES,
-};
-
-enum Dav1dFilterMode {
-    DAV1D_FILTER_8TAP_REGULAR,
-    DAV1D_FILTER_8TAP_SMOOTH,
-    DAV1D_FILTER_8TAP_SHARP,
-    DAV1D_N_SWITCHABLE_FILTERS,
-    DAV1D_FILTER_BILINEAR = DAV1D_N_SWITCHABLE_FILTERS,
-    DAV1D_N_FILTERS,
-    DAV1D_FILTER_SWITCHABLE = DAV1D_N_FILTERS,
-};
-
-enum Dav1dAdaptiveBoolean {
-    DAV1D_OFF = 0,
-    DAV1D_ON = 1,
-    DAV1D_ADAPTIVE = 2,
-};
-
-enum Dav1dRestorationType {
-    DAV1D_RESTORATION_NONE,
-    DAV1D_RESTORATION_SWITCHABLE,
-    DAV1D_RESTORATION_WIENER,
-    DAV1D_RESTORATION_SGRPROJ,
-};
-
-enum Dav1dWarpedMotionType {
-    DAV1D_WM_TYPE_IDENTITY,
-    DAV1D_WM_TYPE_TRANSLATION,
-    DAV1D_WM_TYPE_ROT_ZOOM,
-    DAV1D_WM_TYPE_AFFINE,
-};
-
-typedef struct Dav1dWarpedMotionParams {
-    enum Dav1dWarpedMotionType type;
-    int32_t matrix[6];
-    union {
-        struct {
-            int16_t alpha, beta, gamma, delta;
-        };
-        int16_t abcd[4];
-    };
-} Dav1dWarpedMotionParams;
-
-enum Dav1dPixelLayout {
-    DAV1D_PIXEL_LAYOUT_I400, ///< monochrome
-    DAV1D_PIXEL_LAYOUT_I420, ///< 4:2:0 planar
-    DAV1D_PIXEL_LAYOUT_I422, ///< 4:2:2 planar
-    DAV1D_PIXEL_LAYOUT_I444, ///< 4:4:4 planar
-};
-
-enum Dav1dFrameType {
-    DAV1D_FRAME_TYPE_KEY = 0,    ///< Key Intra frame
-    DAV1D_FRAME_TYPE_INTER = 1,  ///< Inter frame
-    DAV1D_FRAME_TYPE_INTRA = 2,  ///< Non key Intra frame
-    DAV1D_FRAME_TYPE_SWITCH = 3, ///< Switch Inter frame
-};
-
-enum Dav1dColorPrimaries {
-    DAV1D_COLOR_PRI_BT709 = 1,
-    DAV1D_COLOR_PRI_UNKNOWN = 2,
-    DAV1D_COLOR_PRI_BT470M = 4,
-    DAV1D_COLOR_PRI_BT470BG = 5,
-    DAV1D_COLOR_PRI_BT601 = 6,
-    DAV1D_COLOR_PRI_SMPTE240 = 7,
-    DAV1D_COLOR_PRI_FILM = 8,
-    DAV1D_COLOR_PRI_BT2020 = 9,
-    DAV1D_COLOR_PRI_XYZ = 10,
-    DAV1D_COLOR_PRI_SMPTE431 = 11,
-    DAV1D_COLOR_PRI_SMPTE432 = 12,
-    DAV1D_COLOR_PRI_EBU3213 = 22,
-};
-
-enum Dav1dTransferCharacteristics {
-    DAV1D_TRC_BT709 = 1,
-    DAV1D_TRC_UNKNOWN = 2,
-    DAV1D_TRC_BT470M = 4,
-    DAV1D_TRC_BT470BG = 5,
-    DAV1D_TRC_BT601 = 6,
-    DAV1D_TRC_SMPTE240 = 7,
-    DAV1D_TRC_LINEAR = 8,
-    DAV1D_TRC_LOG100 = 9,         ///< logarithmic (100:1 range)
-    DAV1D_TRC_LOG100_SQRT10 = 10, ///< lograithmic (100*sqrt(10):1 range)
-    DAV1D_TRC_IEC61966 = 11,
-    DAV1D_TRC_BT1361 = 12,
-    DAV1D_TRC_SRGB = 13,
-    DAV1D_TRC_BT2020_10BIT = 14,
-    DAV1D_TRC_BT2020_12BIT = 15,
-    DAV1D_TRC_SMPTE2084 = 16,     ///< PQ
-    DAV1D_TRC_SMPTE428 = 17,
-    DAV1D_TRC_HLG = 18,           ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
-};
-
-enum Dav1dMatrixCoefficients {
-    DAV1D_MC_IDENTITY = 0,
-    DAV1D_MC_BT709 = 1,
-    DAV1D_MC_UNKNOWN = 2,
-    DAV1D_MC_FCC = 4,
-    DAV1D_MC_BT470BG = 5,
-    DAV1D_MC_BT601 = 6,
-    DAV1D_MC_SMPTE240 = 7,
-    DAV1D_MC_SMPTE_YCGCO = 8,
-    DAV1D_MC_BT2020_NCL = 9,
-    DAV1D_MC_BT2020_CL = 10,
-    DAV1D_MC_SMPTE2085 = 11,
-    DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
-    DAV1D_MC_CHROMAT_CL = 13,
-    DAV1D_MC_ICTCP = 14,
-};
-
-enum Dav1dChromaSamplePosition {
-    DAV1D_CHR_UNKNOWN = 0,
-    DAV1D_CHR_VERTICAL = 1,  ///< Horizontally co-located with luma(0, 0)
-                           ///< sample, between two vertical samples
-    DAV1D_CHR_COLOCATED = 2, ///< Co-located with luma(0, 0) sample
-};
-
-typedef struct Dav1dSequenceHeader {
-    /**
-     * Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
-     * 1 for 8-10 bits/component 4:4:4; 2 for 4:2:2 at any bits/component,
-     * or 12 bits/component at any chroma subsampling.
-     */
-    int profile;
-    /**
-     * Maximum dimensions for this stream. In non-scalable streams, these
-     * are often the actual dimensions of the stream, although that is not
-     * a normative requirement.
-     */
-    int max_width, max_height;
-    enum Dav1dPixelLayout layout; ///< format of the picture
-    enum Dav1dColorPrimaries pri; ///< color primaries (av1)
-    enum Dav1dTransferCharacteristics trc; ///< transfer characteristics (av1)
-    enum Dav1dMatrixCoefficients mtrx; ///< matrix coefficients (av1)
-    enum Dav1dChromaSamplePosition chr; ///< chroma sample position (av1)
-    /**
-     * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
-     * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
-     */
-    int color_range;
-
-    int num_operating_points;
-    struct Dav1dSequenceHeaderOperatingPoint {
-        int major_level, minor_level;
-        int initial_display_delay;
-        int idc;
-        int tier;
-        int decoder_model_param_present;
-        int decoder_buffer_delay;
-        int encoder_buffer_delay;
-        int low_delay_mode;
-        int display_model_param_present;
-    } operating_points[DAV1D_MAX_OPERATING_POINTS];
-
-    int still_picture;
-    int reduced_still_picture_header;
-    int timing_info_present;
-    int num_units_in_tick;
-    int time_scale;
-    int equal_picture_interval;
-    int num_ticks_per_picture;
-    int decoder_model_info_present;
-    int encoder_decoder_buffer_delay_length;
-    int num_units_in_decoding_tick;
-    int buffer_removal_delay_length;
-    int frame_presentation_delay_length;
-    int display_model_info_present;
-    int width_n_bits, height_n_bits;
-    int frame_id_numbers_present;
-    int delta_frame_id_n_bits;
-    int frame_id_n_bits;
-    int sb128;
-    int filter_intra;
-    int intra_edge_filter;
-    int inter_intra;
-    int masked_compound;
-    int warped_motion;
-    int dual_filter;
-    int order_hint;
-    int jnt_comp;
-    int ref_frame_mvs;
-    enum Dav1dAdaptiveBoolean screen_content_tools;
-    enum Dav1dAdaptiveBoolean force_integer_mv;
-    int order_hint_n_bits;
-    int super_res;
-    int cdef;
-    int restoration;
-    /**
-     * 0, 1 and 2 mean 8, 10 or 12 bits/component, respectively. This is not
-     * exactly the same as 'hbd' from the spec; the spec's hbd distinguishes
-     * between 8 (0) and 10-12 (1) bits/component, and another element
-     * (twelve_bit) to distinguish between 10 and 12 bits/component. To get
-     * the spec's hbd, use !!our_hbd, and to get twelve_bit, use hbd == 2.
-     */
-    int hbd;
-    int ss_hor, ss_ver, monochrome;
-    int color_description_present;
-    int separate_uv_delta_q;
-    int film_grain_present;
-} Dav1dSequenceHeader;
-
-typedef struct Dav1dSegmentationData {
-    int delta_q;
-    int delta_lf_y_v, delta_lf_y_h, delta_lf_u, delta_lf_v;
-    int ref;
-    int skip;
-    int globalmv;
-} Dav1dSegmentationData;
-
-typedef struct Dav1dSegmentationDataSet {
-    Dav1dSegmentationData d[DAV1D_MAX_SEGMENTS];
-    int preskip;
-    int last_active_segid;
-} Dav1dSegmentationDataSet;
-
-typedef struct Dav1dLoopfilterModeRefDeltas {
-    int mode_delta[2 /* is_zeromv */];
-    int ref_delta[DAV1D_TOTAL_REFS_PER_FRAME];
-} Dav1dLoopfilterModeRefDeltas;
-
-typedef struct Dav1dFilmGrainData {
-    uint16_t seed;
-    int num_y_points;
-    uint8_t y_points[14][2 /* value, scaling */];
-    int chroma_scaling_from_luma;
-    int num_uv_points[2];
-    uint8_t uv_points[2][10][2 /* value, scaling */];
-    int scaling_shift;
-    int ar_coeff_lag;
-    int8_t ar_coeffs_y[24];
-    int8_t ar_coeffs_uv[2][25];
-    int ar_coeff_shift;
-    int grain_scale_shift;
-    int uv_mult[2];
-    int uv_luma_mult[2];
-    int uv_offset[2];
-    int overlap_flag;
-    int clip_to_restricted_range;
-} Dav1dFilmGrainData;
-
-typedef struct Dav1dFrameHeader {
-    enum Dav1dFrameType frame_type; ///< type of the picture
-    int width[2 /* { coded_width, superresolution_upscaled_width } */], height;
-    int frame_offset; ///< frame number
-    struct {
-        int present, update;
-        Dav1dFilmGrainData data;
-    } film_grain; ///< film grain parameters
-    int temporal_id, spatial_id; ///< spatial and temporal id of the frame for SVC
-
-    int show_existing_frame;
-    int existing_frame_idx;
-    int frame_id;
-    int frame_presentation_delay;
-    int show_frame;
-    int showable_frame;
-    int error_resilient_mode;
-    int disable_cdf_update;
-    int allow_screen_content_tools;
-    int force_integer_mv;
-    int frame_size_override;
-    int primary_ref_frame;
-    int buffer_removal_time_present;
-    struct Dav1dFrameHeaderOperatingPoint {
-        int buffer_removal_time;
-    } operating_points[DAV1D_MAX_OPERATING_POINTS];
-    int refresh_frame_flags;
-    int render_width, render_height;
-    struct {
-        int width_scale_denominator;
-        int enabled;
-    } super_res;
-    int have_render_size;
-    int allow_intrabc;
-    int frame_ref_short_signaling;
-    int refidx[DAV1D_REFS_PER_FRAME];
-    int hp;
-    enum Dav1dFilterMode subpel_filter_mode;
-    int switchable_motion_mode;
-    int use_ref_frame_mvs;
-    int refresh_context;
-    struct {
-        int uniform;
-        unsigned n_bytes;
-        int min_log2_cols, max_log2_cols, log2_cols, cols;
-        int min_log2_rows, max_log2_rows, log2_rows, rows;
-        uint16_t col_start_sb[DAV1D_MAX_TILE_COLS + 1];
-        uint16_t row_start_sb[DAV1D_MAX_TILE_ROWS + 1];
-        int update;
-    } tiling;
-    struct {
-        int yac;
-        int ydc_delta;
-        int udc_delta, uac_delta, vdc_delta, vac_delta;
-        int qm, qm_y, qm_u, qm_v;
-    } quant;
-    struct {
-        int enabled, update_map, temporal, update_data;
-        Dav1dSegmentationDataSet seg_data;
-        int lossless[DAV1D_MAX_SEGMENTS], qidx[DAV1D_MAX_SEGMENTS];
-    } segmentation;
-    struct {
-        struct {
-            int present;
-            int res_log2;
-        } q;
-        struct {
-            int present;
-            int res_log2;
-            int multi;
-        } lf;
-    } delta;
-    int all_lossless;
-    struct {
-        int level_y[2 /* dir */];
-        int level_u, level_v;
-        int mode_ref_delta_enabled;
-        int mode_ref_delta_update;
-        Dav1dLoopfilterModeRefDeltas mode_ref_deltas;
-        int sharpness;
-    } loopfilter;
-    struct {
-        int damping;
-        int n_bits;
-        int y_strength[DAV1D_MAX_CDEF_STRENGTHS];
-        int uv_strength[DAV1D_MAX_CDEF_STRENGTHS];
-    } cdef;
-    struct {
-        enum Dav1dRestorationType type[3 /* plane */];
-        int unit_size[2 /* y, uv */];
-    } restoration;
-    enum Dav1dTxfmMode txfm_mode;
-    int switchable_comp_refs;
-    int skip_mode_allowed, skip_mode_enabled, skip_mode_refs[2];
-    int warp_motion;
-    int reduced_txtp_set;
-    Dav1dWarpedMotionParams gmv[DAV1D_REFS_PER_FRAME];
-} Dav1dFrameHeader;
-
-#endif /* __DAV1D_HEADERS_H__ */
--- a/third_party/dav1d/include/dav1d/picture.h
+++ b/third_party/dav1d/include/dav1d/picture.h
@@ -27,56 +27,137 @@
 
 #ifndef __DAV1D_PICTURE_H__
 #define __DAV1D_PICTURE_H__
 
 #include <stddef.h>
 #include <stdint.h>
 
 #include "common.h"
-#include "headers.h"
+
+enum Dav1dPixelLayout {
+    DAV1D_PIXEL_LAYOUT_I400, ///< monochrome
+    DAV1D_PIXEL_LAYOUT_I420, ///< 4:2:0 planar
+    DAV1D_PIXEL_LAYOUT_I422, ///< 4:2:2 planar
+    DAV1D_PIXEL_LAYOUT_I444, ///< 4:4:4 planar
+};
+
+enum Dav1dFrameType {
+    DAV1D_FRAME_TYPE_KEY = 0,    ///< Key Intra frame
+    DAV1D_FRAME_TYPE_INTER = 1,  ///< Inter frame
+    DAV1D_FRAME_TYPE_INTRA = 2,  ///< Non key Intra frame
+    DAV1D_FRAME_TYPE_SWITCH = 3, ///< Switch Inter frame
+};
+
+enum Dav1dColorPrimaries {
+    DAV1D_COLOR_PRI_BT709 = 1,
+    DAV1D_COLOR_PRI_UNKNOWN = 2,
+    DAV1D_COLOR_PRI_BT470M = 4,
+    DAV1D_COLOR_PRI_BT470BG = 5,
+    DAV1D_COLOR_PRI_BT601 = 6,
+    DAV1D_COLOR_PRI_SMPTE240 = 7,
+    DAV1D_COLOR_PRI_FILM = 8,
+    DAV1D_COLOR_PRI_BT2020 = 9,
+    DAV1D_COLOR_PRI_XYZ = 10,
+    DAV1D_COLOR_PRI_SMPTE431 = 11,
+    DAV1D_COLOR_PRI_SMPTE432 = 12,
+    DAV1D_COLOR_PRI_EBU3213 = 22,
+};
+
+enum Dav1dTransferCharacteristics {
+    DAV1D_TRC_BT709 = 1,
+    DAV1D_TRC_UNKNOWN = 2,
+    DAV1D_TRC_BT470M = 4,
+    DAV1D_TRC_BT470BG = 5,
+    DAV1D_TRC_BT601 = 6,
+    DAV1D_TRC_SMPTE240 = 7,
+    DAV1D_TRC_LINEAR = 8,
+    DAV1D_TRC_LOG100 = 9,         ///< logarithmic (100:1 range)
+    DAV1D_TRC_LOG100_SQRT10 = 10, ///< lograithmic (100*sqrt(10):1 range)
+    DAV1D_TRC_IEC61966 = 11,
+    DAV1D_TRC_BT1361 = 12,
+    DAV1D_TRC_SRGB = 13,
+    DAV1D_TRC_BT2020_10BIT = 14,
+    DAV1D_TRC_BT2020_12BIT = 15,
+    DAV1D_TRC_SMPTE2084 = 16,     ///< PQ
+    DAV1D_TRC_SMPTE428 = 17,
+    DAV1D_TRC_HLG = 18,           ///< hybrid log/gamma (BT.2100 / ARIB STD-B67)
+};
+
+enum Dav1dMatrixCoefficients {
+    DAV1D_MC_IDENTITY = 0,
+    DAV1D_MC_BT709 = 1,
+    DAV1D_MC_UNKNOWN = 2,
+    DAV1D_MC_FCC = 4,
+    DAV1D_MC_BT470BG = 5,
+    DAV1D_MC_BT601 = 6,
+    DAV1D_MC_SMPTE240 = 7,
+    DAV1D_MC_SMPTE_YCGCO = 8,
+    DAV1D_MC_BT2020_NCL = 9,
+    DAV1D_MC_BT2020_CL = 10,
+    DAV1D_MC_SMPTE2085 = 11,
+    DAV1D_MC_CHROMAT_NCL = 12, ///< Chromaticity-derived
+    DAV1D_MC_CHROMAT_CL = 13,
+    DAV1D_MC_ICTCP = 14,
+};
+
+enum Dav1dChromaSamplePosition {
+    DAV1D_CHR_UNKNOWN = 0,
+    DAV1D_CHR_VERTICAL = 1,  ///< Horizontally co-located with luma(0, 0)
+                           ///< sample, between two vertical samples
+    DAV1D_CHR_COLOCATED = 2, ///< Co-located with luma(0, 0) sample
+};
 
 typedef struct Dav1dPictureParameters {
     int w; ///< width (in pixels)
     int h; ///< height (in pixels)
     enum Dav1dPixelLayout layout; ///< format of the picture
+    enum Dav1dFrameType type; ///< type of the picture
     int bpc; ///< bits per pixel component (8 or 10)
+
+    enum Dav1dColorPrimaries pri; ///< color primaries (av1)
+    enum Dav1dTransferCharacteristics trc; ///< transfer characteristics (av1)
+    enum Dav1dMatrixCoefficients mtrx; ///< matrix coefficients (av1)
+    enum Dav1dChromaSamplePosition chr; ///< chroma sample position (av1)
+    /**
+     * Pixel data uses JPEG pixel range ([0,255] for 8bits) instead of
+     * MPEG pixel range ([16,235] for 8bits luma, [16,240] for 8bits chroma).
+     */
+    int fullrange;
 } Dav1dPictureParameters;
 
 typedef struct Dav1dPicture {
-    Dav1dSequenceHeader *seq_hdr;
-    Dav1dFrameHeader *frame_hdr;
+    int poc; ///< frame number
 
     /**
      * Pointers to planar image data (Y is [0], U is [1], V is [2]). The data
      * should be bytes (for 8 bpc) or words (for 10 bpc). In case of words
      * containing 10 bpc image data, the pixels should be located in the LSB
      * bits, so that values range between [0, 1023]; the upper bits should be
      * zero'ed out.
      */
     void *data[3];
+    struct Dav1dRef *ref; ///< allocation origin
 
     /**
      * Number of bytes between 2 lines in data[] for luma [0] or chroma [1].
      */
     ptrdiff_t stride[2];
 
     Dav1dPictureParameters p;
-    Dav1dDataProps m;
-    struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref, *ref; ///< allocation origins
 
     void *allocator_data; ///< pointer managed by the allocator
 } Dav1dPicture;
 
 typedef struct Dav1dPicAllocator {
     void *cookie; ///< custom data to pass to the allocator callbacks.
     /**
      * Allocate the picture buffer based on the Dav1dPictureParameters.
      *
-     * The data[0], data[1] and data[2] must be 32 byte aligned and with a
+     * The data[0], data[1] and data[2] must be 32 bits aligned and with a
      * pixel width/height multiple of 128 pixels.
      * data[1] and data[2] must share the same stride[1].
      *
      * @param  pic The picture to allocate the buffer for. The callback needs to
      *             fill the picture data[0], data[1], data[2], stride[0] and
      *             stride[1].
      *             The allocator can fill the pic allocator_data pointer with
      *             a custom pointer that will be passed to
@@ -84,20 +165,24 @@ typedef struct Dav1dPicAllocator {
      * @param cookie Custom pointer passed to all calls.
     *
     * @return 0 on success. A negative errno value on error.
      */
     int (*alloc_picture_callback)(Dav1dPicture *pic, void *cookie);
     /**
      * Release the picture buffer.
      *
-     * @param pic    The picture that was filled by alloc_picture_callback().
-     * @param cookie Custom pointer passed to all calls.
+     * @param buf           The buffer that was returned by 
+     *                                   alloc_picture_callback().
+     * @param allocator_tag The Dav1dPicture.allocator_data that was filled by
+     *                      alloc_picture_callback()
+     * @param cookie        Custom pointer passed to all calls.
      */
-    void (*release_picture_callback)(Dav1dPicture *pic, void *cookie);
+    void (*release_picture_callback)(uint8_t *buf, void *allocator_data,
+                                     void *cookie);
 } Dav1dPicAllocator;
 
 /**
  * Release reference to a picture.
  */
 DAV1D_API void dav1d_picture_unref(Dav1dPicture *p);
 
 #endif /* __DAV1D_PICTURE_H__ */
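Editor's note: the allocator interface restored above splits allocation and release across two differently-shaped callbacks (the backed-out revision unified both around Dav1dPicture*). Below is a minimal sketch of a custom allocator against the restored signatures. The 128-pixel padding and the shared chroma stride come from the header comment; the 64-byte alignment, the use of std::aligned_alloc, and the plane-size arithmetic are this sketch's own choices, not requirements spelled out by the header:

    #include <cerrno>
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <dav1d/picture.h>  // assumes the headers as restored by this backout

    static int alloc_picture(Dav1dPicture* pic, void* /*cookie*/) {
      const Dav1dPictureParameters& p = pic->p;
      // Pad dimensions to a multiple of 128 pixels, per the header comment.
      const int aw = (p.w + 127) & ~127;
      const int ah = (p.h + 127) & ~127;
      const int bytes = p.bpc > 8 ? 2 : 1;  // 8 bpc -> bytes, 10 bpc -> words
      // Same subsampling derivation used in DAV1DDecoder::ConstructImage above.
      const int ss_hor = p.layout != DAV1D_PIXEL_LAYOUT_I444;
      const int ss_ver = p.layout == DAV1D_PIXEL_LAYOUT_I420;
      const bool chroma = p.layout != DAV1D_PIXEL_LAYOUT_I400;

      pic->stride[0] = static_cast<ptrdiff_t>(aw) * bytes;
      pic->stride[1] = chroma ? static_cast<ptrdiff_t>(aw >> ss_hor) * bytes : 0;
      const size_t y_sz = static_cast<size_t>(pic->stride[0]) * ah;
      const size_t uv_sz = chroma ? static_cast<size_t>(pic->stride[1]) * (ah >> ss_ver) : 0;

      size_t total = y_sz + 2 * uv_sz;
      total = (total + 63) & ~static_cast<size_t>(63);  // aligned_alloc wants a multiple of the alignment
      uint8_t* base = static_cast<uint8_t*>(std::aligned_alloc(64, total));
      if (!base) return -ENOMEM;  // "a negative errno value on error"

      pic->data[0] = base;
      pic->data[1] = chroma ? base + y_sz : nullptr;
      pic->data[2] = chroma ? base + y_sz + uv_sz : nullptr;
      pic->allocator_data = base;  // handed back verbatim to the release callback
      return 0;
    }

    static void release_picture(uint8_t* /*buf*/, void* allocator_data, void* /*cookie*/) {
      std::free(allocator_data);  // the base pointer stored in allocator_data above
    }

    // Wiring it up before dav1d_open():
    //   Dav1dSettings s;
    //   dav1d_default_settings(&s);
    //   s.allocator = Dav1dPicAllocator{ /*cookie=*/nullptr, alloc_picture, release_picture };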
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -319,15 +319,13 @@ endif
 
 
 #
 # Include subdir meson.build files
 # The order is important!
 
 subdir('include')
 
-subdir('doc')
-
 subdir('src')
 
 subdir('tools')
 
 subdir('tests')
deleted file mode 100644
--- a/third_party/dav1d/src/arm/64/looprestoration.S
+++ /dev/null
@@ -1,627 +0,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Martin Storsjo
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "src/arm/asm.S"
-
-// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
-//                                 const pixel *src, ptrdiff_t stride,
-//                                 const int16_t fh[7], const intptr_t w,
-//                                 int h, enum LrEdgeFlags edges);
-function wiener_filter_h_neon, export=1
-        mov             w8,  w5
-        ld1             {v0.8h},  [x4]
-        mov             w9,  #(1 << 14) - (1 << 2)
-        dup             v30.8h,  w9
-        movi            v31.8h,  #8, lsl #8
-        // Calculate mid_stride
-        add             w10, w5,  #7
-        bic             w10, w10, #7
-        lsl             w10, w10, #1
-
-        // Clear the last unused element of v0, to allow filtering a single
-        // pixel with one plain mul+addv.
-        ins             v0.h[7], wzr
-
-        // Set up pointers for reading/writing alternate rows
-        add             x12, x0,  x10
-        lsl             w10, w10, #1
-        add             x13, x2,  x3
-        lsl             x3,  x3,  #1
-
-        // Subtract the width from mid_stride
-        sub             x10, x10, w5, uxtw #1
-
-        // For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
-        cmp             w5,  #8
-        add             w11, w5,  #13
-        bic             w11, w11, #7
-        b.ge            1f
-        mov             w11, #16
-1:
-        sub             x3,  x3,  w11, uxtw
-
-        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
-        tst             w7,  #1 // LR_HAVE_LEFT
-        b.eq            2f
-        // LR_HAVE_LEFT
-        cbnz            x1,  0f
-        // left == NULL
-        sub             x2,  x2,  #3
-        sub             x13, x13, #3
-        b               1f
-0:      // LR_HAVE_LEFT, left != NULL
-2:      // !LR_HAVE_LEFT, increase the stride.
-        // For this case we don't read the left 3 pixels from the src pointer,
-        // but shift it as if we had done that.
-        add             x3,  x3,  #3
-
-
-1:      // Loop vertically
-        ld1             {v3.16b},  [x2],  #16
-        ld1             {v5.16b},  [x13], #16
-
-        tst             w7,  #1 // LR_HAVE_LEFT
-        b.eq            0f
-        cbz             x1,  2f
-        // LR_HAVE_LEFT, left != NULL
-        ld1             {v2.s}[3],  [x1], #4
-        // Move x2/x13 back to account for the last 3 bytes we loaded earlier,
-        // which we'll shift out.
-        sub             x2,  x2,  #3
-        sub             x13, x13, #3
-        ld1             {v4.s}[3],  [x1], #4
-        ext             v3.16b, v2.16b, v3.16b, #13
-        ext             v5.16b, v4.16b, v5.16b, #13
-        b               2f
-0:
-        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
-        // and shift v3 to have 3x the first byte at the front.
-        dup             v2.16b, v3.b[0]
-        dup             v4.16b, v5.b[0]
-        // Move x2 back to account for the last 3 bytes we loaded before,
-        // which we shifted out.
-        sub             x2,  x2,  #3
-        sub             x13, x13, #3
-        ext             v3.16b, v2.16b, v3.16b, #13
-        ext             v5.16b, v4.16b, v5.16b, #13
-
-2:
-        uxtl            v2.8h,  v3.8b
-        uxtl2           v3.8h,  v3.16b
-        uxtl            v4.8h,  v5.8b
-        uxtl2           v5.8h,  v5.16b
-
-        tst             w7,  #2 // LR_HAVE_RIGHT
-        b.ne            4f
-        // If we'll need to pad the right edge, load that byte to pad with
-        // here since we can find it pretty easily from here.
-        sub             w9,  w5, #14
-        ldr             b28, [x2,  w9, sxtw]
-        ldr             b29, [x13, w9, sxtw]
-        // Fill v28/v29 with the right padding pixel
-        dup             v28.8b,  v28.b[0]
-        dup             v29.8b,  v29.b[0]
-        uxtl            v28.8h,  v28.8b
-        uxtl            v29.8h,  v29.8b
-3:      // !LR_HAVE_RIGHT
-        // If we'll have to pad the right edge we need to quit early here.
-        cmp             w5,  #11
-        b.ge            4f   // If w >= 11, all used input pixels are valid
-        cmp             w5,  #7
-        b.ge            5f   // If w >= 7, we can filter 4 pixels
-        b               6f
-
-4:      // Loop horizontally
-.macro filter wd
-        // Interleaving the mul/mla chains actually hurts performance
-        // significantly on Cortex A53, thus keeping mul/mla tightly
-        // chained like this.
-        ext             v16.16b, v2.16b,  v3.16b, #2
-        ext             v17.16b, v2.16b,  v3.16b, #4
-        ext             v18.16b, v2.16b,  v3.16b, #6
-        ext             v19.16b, v2.16b,  v3.16b, #8
-        ext             v20.16b, v2.16b,  v3.16b, #10
-        ext             v21.16b, v2.16b,  v3.16b, #12
-        mul             v6\wd,   v2\wd,   v0.h[0]
-        mla             v6\wd,   v16\wd,  v0.h[1]
-        mla             v6\wd,   v17\wd,  v0.h[2]
-        mla             v6\wd,   v18\wd,  v0.h[3]
-        mla             v6\wd,   v19\wd,  v0.h[4]
-        mla             v6\wd,   v20\wd,  v0.h[5]
-        mla             v6\wd,   v21\wd,  v0.h[6]
-        ext             v22.16b, v4.16b,  v5.16b, #2
-        ext             v23.16b, v4.16b,  v5.16b, #4
-        ext             v24.16b, v4.16b,  v5.16b, #6
-        ext             v25.16b, v4.16b,  v5.16b, #8
-        ext             v26.16b, v4.16b,  v5.16b, #10
-        ext             v27.16b, v4.16b,  v5.16b, #12
-        mul             v7\wd,   v4\wd,   v0.h[0]
-        mla             v7\wd,   v22\wd,  v0.h[1]
-        mla             v7\wd,   v23\wd,  v0.h[2]
-        mla             v7\wd,   v24\wd,  v0.h[3]
-        mla             v7\wd,   v25\wd,  v0.h[4]
-        mla             v7\wd,   v26\wd,  v0.h[5]
-        mla             v7\wd,   v27\wd,  v0.h[6]
-
-        shl             v18\wd,  v18\wd,  #7
-        shl             v24\wd,  v24\wd,  #7
-        sub             v18\wd,  v18\wd,  v30\wd
-        sub             v24\wd,  v24\wd,  v30\wd
-        sqadd           v6\wd,   v6\wd,   v18\wd
-        sqadd           v7\wd,   v7\wd,   v24\wd
-        sshr            v6\wd,   v6\wd,   #3
-        sshr            v7\wd,   v7\wd,   #3
-        add             v6\wd,   v6\wd,   v31\wd
-        add             v7\wd,   v7\wd,   v31\wd
-.endm
-        filter          .8h
-        st1             {v6.8h},  [x0],  #16
-        st1             {v7.8h},  [x12], #16
-
-        subs            w5,  w5,  #8
-        b.le            9f
-        tst             w7,  #2 // LR_HAVE_RIGHT
-        mov             v2.16b,  v3.16b
-        mov             v4.16b,  v5.16b
-        ld1             {v3.8b},  [x2],  #8
-        ld1             {v5.8b},  [x13], #8
-        uxtl            v3.8h,   v3.8b
-        uxtl            v5.8h,   v5.8b
-        b.ne            4b // If we don't need to pad, just keep filtering.
-        b               3b // If we need to pad, check how many pixels we have left.
-
-5:      // Filter 4 pixels, 7 <= w < 11
-        filter          .4h
-        st1             {v6.4h},  [x0],  #8
-        st1             {v7.4h},  [x12], #8
-
-        subs            w5,  w5,  #4 // 3 <= w < 7
-        ext             v2.16b,  v2.16b,  v3.16b, #8
-        ext             v3.16b,  v3.16b,  v3.16b, #8
-        ext             v4.16b,  v4.16b,  v5.16b, #8
-        ext             v5.16b,  v5.16b,  v5.16b, #8
-
-6:      // Pad the right edge and filter the last few pixels.
-        // w < 7, w+3 pixels valid in v2-v3
-        cmp             w5,  #5
-        b.lt            7f
-        b.gt            8f
-        // w == 5, 8 pixels valid in v2, v3 invalid
-        mov             v3.16b,  v28.16b
-        mov             v5.16b,  v29.16b
-        b               88f
-
-7:      // 1 <= w < 5, 4-7 pixels valid in v2
-        sub             w9,  w5,  #1
-        // w9 = (pixels valid - 4)
-        adr             x11, L(variable_shift_tbl)
-        ldrh            w9,  [x11, w9, uxtw #1]
-        sub             x11, x11, w9, uxth
-        mov             v3.16b,  v28.16b
-        mov             v5.16b,  v29.16b
-        br              x11
-        // Shift v2 right, shifting out invalid pixels,
-        // shift v2 left to the original offset, shifting in padding pixels.
-44:     // 4 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #8
-        ext             v2.16b,  v2.16b,  v3.16b,  #8
-        ext             v4.16b,  v4.16b,  v4.16b,  #8
-        ext             v4.16b,  v4.16b,  v5.16b,  #8
-        b               88f
-55:     // 5 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #10
-        ext             v2.16b,  v2.16b,  v3.16b,  #6
-        ext             v4.16b,  v4.16b,  v4.16b,  #10
-        ext             v4.16b,  v4.16b,  v5.16b,  #6
-        b               88f
-66:     // 6 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #12
-        ext             v2.16b,  v2.16b,  v3.16b,  #4
-        ext             v4.16b,  v4.16b,  v4.16b,  #12
-        ext             v4.16b,  v4.16b,  v5.16b,  #4
-        b               88f
-77:     // 7 pixels valid
-        ext             v2.16b,  v2.16b,  v2.16b,  #14
-        ext             v2.16b,  v2.16b,  v3.16b,  #2
-        ext             v4.16b,  v4.16b,  v4.16b,  #14
-        ext             v4.16b,  v4.16b,  v5.16b,  #2
-        b               88f
-
-L(variable_shift_tbl):
-        .hword L(variable_shift_tbl) - 44b
-        .hword L(variable_shift_tbl) - 55b
-        .hword L(variable_shift_tbl) - 66b
-        .hword L(variable_shift_tbl) - 77b
-
-8:      // w > 5, w == 6, 9 pixels valid in v2-v3, 1 pixel valid in v3
-        ins             v28.h[0],  v3.h[0]
-        ins             v29.h[0],  v5.h[0]
-        mov             v3.16b,  v28.16b
-        mov             v5.16b,  v29.16b
-
-88:
-        // w < 7, v2-v3 padded properly
-        cmp             w5,  #4
-        b.lt            888f
-
-        // w >= 4, filter 4 pixels
-        filter          .4h
-        st1             {v6.4h},  [x0],  #8
-        st1             {v7.4h},  [x12], #8
-        subs            w5,  w5,  #4 // 0 <= w < 4
-        ext             v2.16b,  v2.16b,  v3.16b, #8
-        ext             v4.16b,  v4.16b,  v5.16b, #8
-        b.eq            9f
-888:    // 1 <= w < 4, filter 1 pixel at a time
-        mul             v6.8h,   v2.8h,   v0.8h
-        mul             v7.8h,   v4.8h,   v0.8h
-        addv            h6,      v6.8h
-        addv            h7,      v7.8h
-        dup             v16.4h,  v2.h[3]
-        dup             v17.4h,  v4.h[3]
-        shl             v16.4h,  v16.4h,  #7
-        shl             v17.4h,  v17.4h,  #7
-        sub             v16.4h,  v16.4h,  v30.4h
-        sub             v17.4h,  v17.4h,  v30.4h
-        sqadd           v6.4h,   v6.4h,   v16.4h
-        sqadd           v7.4h,   v7.4h,   v17.4h
-        sshr            v6.4h,   v6.4h,   #3
-        sshr            v7.4h,   v7.4h,   #3
-        add             v6.4h,   v6.4h,   v31.4h
-        add             v7.4h,   v7.4h,   v31.4h
-        st1             {v6.h}[0], [x0],  #2
-        st1             {v7.h}[0], [x12], #2
-        subs            w5,  w5,  #1
-        ext             v2.16b,  v2.16b,  v3.16b,  #2
-        ext             v4.16b,  v4.16b,  v5.16b,  #2
-        b.gt            888b
-
-9:
-        subs            w6,  w6,  #2
-        b.le            0f
-        // Jump to the next row and loop horizontally
-        add             x0,  x0,  x10
-        add             x12, x12, x10
-        add             x2,  x2,  x3
-        add             x13, x13, x3
-        mov             w5,  w8
-        b               1b
-0:
-        ret
-.purgem filter
-endfunc
-
-// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
-//                                 const int16_t *mid, int w, int h,
-//                                 const int16_t fv[7], enum LrEdgeFlags edges,
-//                                 ptrdiff_t mid_stride);
-function wiener_filter_v_neon, export=1
-        mov             w8,  w4
-        ld1             {v0.8h},  [x5]
-        mov             w9,  #128
-        dup             v1.8h, w9
-        add             v1.8h,  v1.8h,  v0.8h
-
-        // Calculate the number of rows to move back when looping vertically
-        mov             w11, w4
-        tst             w6,  #4 // LR_HAVE_TOP
-        b.eq            0f
-        sub             x2,  x2,  x7,  lsl #1
-        add             w11, w11, #2
-0:
-        tst             w6,  #8 // LR_HAVE_BOTTOM
-        b.eq            1f
-        add             w11, w11, #2
-
-1:      // Start of horizontal loop; start one vertical filter slice.
-        // Load rows into v16-v19 and pad properly.
-        tst             w6,  #4 // LR_HAVE_TOP
-        ld1             {v16.8h}, [x2], x7
-        b.eq            2f
-        // LR_HAVE_TOP
-        ld1             {v18.8h}, [x2], x7
-        mov             v17.16b, v16.16b
-        ld1             {v19.8h}, [x2], x7
-        b               3f
-2:      // !LR_HAVE_TOP
-        mov             v17.16b, v16.16b
-        mov             v18.16b, v16.16b
-        mov             v19.16b, v16.16b
-
-3:
-        cmp             w4,  #4
-        b.lt            5f
-        // Start filtering normally; fill in v20-v22 with unique rows.
-        ld1             {v20.8h}, [x2], x7
-        ld1             {v21.8h}, [x2], x7
-        ld1             {v22.8h}, [x2], x7
-
-4:
-.macro filter compare
-        subs            w4,  w4,  #1
-        // Interleaving the mul/mla chains actually hurts performance
-        // significantly on Cortex A53, thus keeping mul/mla tightly
-        // chained like this.
-        smull           v2.4s,  v16.4h,  v0.h[0]
-        smlal           v2.4s,  v17.4h,  v0.h[1]
-        smlal           v2.4s,  v18.4h,  v0.h[2]
-        smlal           v2.4s,  v19.4h,  v1.h[3]
-        smlal           v2.4s,  v20.4h,  v0.h[4]
-        smlal           v2.4s,  v21.4h,  v0.h[5]
-        smlal           v2.4s,  v22.4h,  v0.h[6]
-        smull2          v3.4s,  v16.8h,  v0.h[0]
-        smlal2          v3.4s,  v17.8h,  v0.h[1]
-        smlal2          v3.4s,  v18.8h,  v0.h[2]
-        smlal2          v3.4s,  v19.8h,  v1.h[3]
-        smlal2          v3.4s,  v20.8h,  v0.h[4]
-        smlal2          v3.4s,  v21.8h,  v0.h[5]
-        smlal2          v3.4s,  v22.8h,  v0.h[6]
-        sqrshrun        v2.4h,  v2.4s,   #11
-        sqrshrun2       v2.8h,  v3.4s,   #11
-        sqxtun          v2.8b,  v2.8h
-        st1             {v2.8b}, [x0], x1
-.if \compare
-        cmp             w4,  #4
-.else
-        b.le            9f
-.endif
-        mov             v16.16b,  v17.16b
-        mov             v17.16b,  v18.16b
-        mov             v18.16b,  v19.16b
-        mov             v19.16b,  v20.16b
-        mov             v20.16b,  v21.16b
-        mov             v21.16b,  v22.16b
-.endm
-        filter          1
-        b.lt            7f
-        ld1             {v22.8h}, [x2], x7
-        b               4b
-
-5:      // Less than 4 rows in total; not all of v20-v21 are filled yet.
-        tst             w6,  #8 // LR_HAVE_BOTTOM
-        b.eq            6f
-        // LR_HAVE_BOTTOM
-        cmp             w4,  #2
-        // We load at least 2 rows in all cases.
-        ld1             {v20.8h}, [x2], x7
-        ld1             {v21.8h}, [x2], x7
-        b.gt            53f // 3 rows in total
-        b.eq            52f // 2 rows in total
-51:     // 1 row in total, v19 already loaded, load edge into v20-v22.
-        mov             v22.16b,  v21.16b
-        b               8f
-52:     // 2 rows in total, v19 already loaded, load v20 with content data
-        // and 2 rows of edge.
-        ld1             {v22.8h}, [x2], x7
-        mov             v23.16b,  v22.16b
-        b               8f
-53:
-        // 3 rows in total, v19 already loaded, load v20 and v21 with content
-        // and 2 rows of edge.
-        ld1             {v22.8h}, [x2], x7
-        ld1             {v23.8h}, [x2], x7
-        mov             v24.16b,  v23.16b
-        b               8f
-
-6:
-        // !LR_HAVE_BOTTOM
-        cmp             w4,  #2
-        b.gt            63f // 3 rows in total
-        b.eq            62f // 2 rows in total
-61:     // 1 row in total, v19 already loaded, pad that into v20-v22.
-        mov             v20.16b,  v19.16b
-        mov             v21.16b,  v19.16b
-        mov             v22.16b,  v19.16b
-        b               8f
-62:     // 2 rows in total, v19 already loaded, load v20 and pad that into v20-v23.
-        ld1             {v20.8h}, [x2], x7
-        mov             v21.16b,  v20.16b
-        mov             v22.16b,  v20.16b
-        mov             v23.16b,  v20.16b
-        b               8f
-63:
-        // 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
-        ld1             {v20.8h}, [x2], x7
-        ld1             {v21.8h}, [x2], x7
-        mov             v22.16b,  v21.16b
-        mov             v23.16b,  v21.16b
-        mov             v24.16b,  v21.16b
-        b               8f
-
-7:
-        // All registers up to v21 are filled already, 3 valid rows left.
-        // < 4 valid rows left; fill in padding and filter the last
-        // few rows.
-        tst             w6,  #8 // LR_HAVE_BOTTOM
-        b.eq            71f
-        // LR_HAVE_BOTTOM; load 2 rows of edge.
-        ld1             {v22.8h}, [x2], x7
-        ld1             {v23.8h}, [x2], x7
-        mov             v24.16b,  v23.16b
-        b               8f
-71:
-        // !LR_HAVE_BOTTOM, pad 3 rows
-        mov             v22.16b,  v21.16b
-        mov             v23.16b,  v21.16b
-        mov             v24.16b,  v21.16b
-
-8:      // At this point, all registers up to v22-v24 are loaded with
-        // edge/padding (depending on how many rows are left).
-        filter          0 // This branches to 9f when done
-        mov             v22.16b,  v23.16b
-        mov             v23.16b,  v24.16b
-        b               8b
-
-9:      // End of one vertical slice.
-        subs            w3,  w3,  #8
-        b.le            0f
-        // Move pointers back up to the top and loop horizontally.
-        msub            x0,  x1,  x8,  x0
-        msub            x2,  x7,  x11, x2
-        add             x0,  x0,  #8
-        add             x2,  x2,  #16
-        mov             w4,  w8
-        b               1b
-
-0:
-        ret
-.purgem filter
-endfunc
-
-// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
-//                             const pixel *src, int w, int h);
-function copy_narrow_neon, export=1
-        adr             x5,  L(copy_narrow_tbl)
-        ldrh            w6,  [x5, w3, uxtw #1]
-        sub             x5,  x5,  w6, uxth
-        br              x5
-10:
-        add             x7,  x0,  x1
-        lsl             x1,  x1,  #1
-18:
-        cmp             w4,  #8
-        b.lt            110f
-        subs            w4,  w4,  #8
-        ld1             {v0.8b}, [x2], #8
-        st1             {v0.b}[0], [x0], x1
-        st1             {v0.b}[1], [x7], x1
-        st1             {v0.b}[2], [x0], x1
-        st1             {v0.b}[3], [x7], x1
-        st1             {v0.b}[4], [x0], x1
-        st1             {v0.b}[5], [x7], x1
-        st1             {v0.b}[6], [x0], x1
-        st1             {v0.b}[7], [x7], x1
-        b.le            0f
-        b               18b
-110:
-        asr             x1,  x1,  #1
-11:
-        subs            w4,  w4,  #1
-        ld1             {v0.b}[0], [x2], #1
-        st1             {v0.b}[0], [x0], x1
-        b.ge            11b
-0:
-        ret
-
-20:
-        add             x7,  x0,  x1
-        lsl             x1,  x1,  #1
-24:
-        cmp             w4,  #4
-        b.lt            210f
-        subs            w4,  w4,  #4
-        ld1             {v0.4h}, [x2], #8
-        st1             {v0.h}[0], [x0], x1
-        st1             {v0.h}[1], [x7], x1
-        st1             {v0.h}[2], [x0], x1
-        st1             {v0.h}[3], [x7], x1
-        b.le            0f
-        b               24b
-210:
-        asr             x1,  x1,  #1
-22:
-        subs            w4,  w4,  #1
-        ld1             {v0.h}[0], [x2], #2
-        st1             {v0.h}[0], [x0], x1
-        b.ge            22b
-0:
-        ret
-
-30:
-        ldrh            w5,  [x2]
-        ldrb            w6,  [x2, #2]
-        add             x2,  x2,  #3
-        subs            w4,  w4,  #1
-        strh            w5,  [x0]
-        strb            w6,  [x0, #2]
-        add             x0,  x0,  x1
-        b.gt            30b
-        ret
-
-40:
-        add             x7,  x0,  x1
-        lsl             x1,  x1,  #1
-42:
-        cmp             w4,  #2
-        b.lt            41f
-        subs            w4,  w4,  #2
-        ld1             {v0.2s}, [x2], #8
-        st1             {v0.s}[0], [x0], x1
-        st1             {v0.s}[1], [x7], x1
-        b.le            0f
-        b               42b
-41:
-        ld1             {v0.s}[0], [x2]
-        st1             {v0.s}[0], [x0]
-0:
-        ret
-
-50:
-        ldr             w5,  [x2]
-        ldrb            w6,  [x2, #4]
-        add             x2,  x2,  #5
-        subs            w4,  w4,  #1
-        str             w5,  [x0]
-        strb            w6,  [x0, #4]
-        add             x0,  x0,  x1
-        b.gt            50b
-        ret
-
-60:
-        ldr             w5,  [x2]
-        ldrh            w6,  [x2, #4]
-        add             x2,  x2,  #6
-        subs            w4,  w4,  #1
-        str             w5,  [x0]
-        strh            w6,  [x0, #4]
-        add             x0,  x0,  x1
-        b.gt            60b
-        ret
-
-70:
-        ldr             w5,  [x2]
-        ldrh            w6,  [x2, #4]
-        ldrb            w7,  [x2, #6]
-        add             x2,  x2,  #7
-        subs            w4,  w4,  #1
-        str             w5,  [x0]
-        strh            w6,  [x0, #4]
-        strb            w7,  [x0, #6]
-        add             x0,  x0,  x1
-        b.gt            70b
-        ret
-
-L(copy_narrow_tbl):
-        .hword 0
-        .hword L(copy_narrow_tbl) - 10b
-        .hword L(copy_narrow_tbl) - 20b
-        .hword L(copy_narrow_tbl) - 30b
-        .hword L(copy_narrow_tbl) - 40b
-        .hword L(copy_narrow_tbl) - 50b
-        .hword L(copy_narrow_tbl) - 60b
-        .hword L(copy_narrow_tbl) - 70b
-endfunc
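
The file removed above also contained dav1d_copy_narrow_neon. As a rough scalar
picture of what that routine does (my reading of the deleted assembly, not code
from this patch): it copies an h-row block of w < 8 pixels per row from a
tightly packed temporary buffer into a strided destination, presumably the
leftover narrow columns that the wide NEON stores above cannot cover.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of copy_narrow: source rows are packed back to back (stride
 * w), the destination uses the given stride, and w is 1..7. */
static void copy_narrow_c(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *src, int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = src[x];   /* copy the w pixels of one row */
        src += w;              /* packed source: next row follows immediately */
        dst += stride;         /* strided destination */
    }
}
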
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -1,12 +1,11 @@
 /*
  * Copyright © 2018, VideoLAN and dav1d authors
  * Copyright © 2018, Janne Grunau
- * Copyright © 2018, Martin Storsjo
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * 1. Redistributions of source code must retain the above copyright notice, this
  *    list of conditions and the following disclaimer.
  *
@@ -22,17 +21,16 @@
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "src/arm/asm.S"
-#include "src/arm/64/util.S"
 
 .macro avg dst, t0, t1
         ld1             {\t0\().8h},   [x2],  16
         ld1             {\t1\().8h},   [x3],  16
         add             \t0\().8h,   \t0\().8h,   \t1\().8h
         sqrshrun        \dst\().8b,  \t0\().8h,   #5
 .endm
 
@@ -227,2107 +225,8 @@ L(\type\()_tbl):
         .hword L(\type\()_tbl) -    8b
         .hword L(\type\()_tbl) -    4b
 endfunc
 .endm
 
 bidir_fn avg
 bidir_fn w_avg
 bidir_fn mask
-
-
-// This has got the same signature as the put_8tap functions,
-// and assumes that x8 is set to (24-clz(w)).
-function put
-        adr             x9,  L(put_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-2:
-        ld1             {v0.h}[0], [x2], x3
-        ld1             {v1.h}[0], [x2], x3
-        subs            w5,  w5,  #2
-        st1             {v0.h}[0], [x0], x1
-        st1             {v1.h}[0], [x0], x1
-        b.gt            2b
-        ret
-4:
-        ld1             {v0.s}[0], [x2], x3
-        ld1             {v1.s}[0], [x2], x3
-        subs            w5,  w5,  #2
-        st1             {v0.s}[0], [x0], x1
-        st1             {v1.s}[0], [x0], x1
-        b.gt            4b
-        ret
-8:
-        ld1             {v0.8b}, [x2], x3
-        ld1             {v1.8b}, [x2], x3
-        subs            w5,  w5,  #2
-        st1             {v0.8b}, [x0], x1
-        st1             {v1.8b}, [x0], x1
-        b.gt            8b
-        ret
-160:
-        add             x8,  x0,  x1
-        lsl             x1,  x1,  #1
-        add             x9,  x2,  x3
-        lsl             x3,  x3,  #1
-16:
-        ld1             {v0.16b}, [x2], x3
-        ld1             {v1.16b}, [x9], x3
-        subs            w5,  w5,  #2
-        st1             {v0.16b}, [x0], x1
-        st1             {v1.16b}, [x8], x1
-        b.gt            16b
-        ret
-32:
-        ldp             x6,  x7,  [x2]
-        ldp             x8,  x9,  [x2, #16]
-        stp             x6,  x7,  [x0]
-        subs            w5,  w5,  #1
-        stp             x8,  x9,  [x0, #16]
-        add             x2,  x2,  x3
-        add             x0,  x0,  x1
-        b.gt            32b
-        ret
-64:
-        ldp             x6,  x7,  [x2]
-        ldp             x8,  x9,  [x2, #16]
-        stp             x6,  x7,  [x0]
-        ldp             x10, x11, [x2, #32]
-        stp             x8,  x9,  [x0, #16]
-        subs            w5,  w5,  #1
-        ldp             x12, x13, [x2, #48]
-        stp             x10, x11, [x0, #32]
-        stp             x12, x13, [x0, #48]
-        add             x2,  x2,  x3
-        add             x0,  x0,  x1
-        b.gt            64b
-        ret
-128:
-        ldp             q0,  q1,  [x2]
-        ldp             q2,  q3,  [x2, #32]
-        stp             q0,  q1,  [x0]
-        ldp             q4,  q5,  [x2, #64]
-        stp             q2,  q3,  [x0, #32]
-        ldp             q6,  q7,  [x2, #96]
-        subs            w5,  w5,  #1
-        stp             q4,  q5,  [x0, #64]
-        stp             q6,  q7,  [x0, #96]
-        add             x2,  x2,  x3
-        add             x0,  x0,  x1
-        b.gt            128b
-        ret
-
-L(put_tbl):
-        .hword L(put_tbl) - 128b
-        .hword L(put_tbl) -  64b
-        .hword L(put_tbl) -  32b
-        .hword L(put_tbl) - 160b
-        .hword L(put_tbl) -   8b
-        .hword L(put_tbl) -   4b
-        .hword L(put_tbl) -   2b
-endfunc
-
-
-// This has got the same signature as the prep_8tap functions,
-// and assumes that x8 is set to (24-clz(w)), and x7 to w*2.
-function prep
-        adr             x9,  L(prep_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-4:
-        ld1             {v0.s}[0], [x1], x2
-        ld1             {v1.s}[0], [x1], x2
-        subs            w4,  w4,  #2
-        ushll           v0.8h, v0.8b, #4
-        ushll           v1.8h, v1.8b, #4
-        st1             {v0.4h, v1.4h}, [x0], #16
-        b.gt            4b
-        ret
-8:
-        ld1             {v0.8b}, [x1], x2
-        ld1             {v1.8b}, [x1], x2
-        subs            w4,  w4,  #2
-        ushll           v0.8h, v0.8b, #4
-        ushll           v1.8h, v1.8b, #4
-        st1             {v0.8h, v1.8h}, [x0], #32
-        b.gt            8b
-        ret
-160:
-        add             x9,  x1,  x2
-        lsl             x2,  x2,  #1
-16:
-        ld1             {v0.16b}, [x1], x2
-        ld1             {v1.16b}, [x9], x2
-        subs            w4,  w4,  #2
-        ushll           v4.8h, v0.8b,  #4
-        ushll2          v5.8h, v0.16b, #4
-        ushll           v6.8h, v1.8b,  #4
-        ushll2          v7.8h, v1.16b, #4
-        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
-        b.gt            16b
-        ret
-320:
-        add             x8,  x0,  w3, uxtw
-32:
-        ld1             {v0.16b, v1.16b},  [x1], x2
-        subs            w4,  w4,  #2
-        ushll           v4.8h,  v0.8b,  #4
-        ushll2          v5.8h,  v0.16b, #4
-        ld1             {v2.16b, v3.16b},  [x1], x2
-        ushll           v6.8h,  v1.8b,  #4
-        ushll2          v7.8h,  v1.16b, #4
-        ushll           v16.8h, v2.8b,  #4
-        st1             {v4.8h,  v5.8h},  [x0], x7
-        ushll2          v17.8h, v2.16b, #4
-        st1             {v6.8h,  v7.8h},  [x8], x7
-        ushll           v18.8h, v3.8b,  #4
-        st1             {v16.8h, v17.8h}, [x0], x7
-        ushll2          v19.8h, v3.16b, #4
-        st1             {v18.8h, v19.8h}, [x8], x7
-        b.gt            32b
-        ret
-640:
-        add             x8,  x0,  #32
-        mov             x6,  #64
-64:
-        ldp             q0,  q1,  [x1]
-        subs            w4,  w4,  #1
-        ushll           v4.8h,  v0.8b,  #4
-        ushll2          v5.8h,  v0.16b, #4
-        ldp             q2,  q3,  [x1, #32]
-        ushll           v6.8h,  v1.8b,  #4
-        ushll2          v7.8h,  v1.16b, #4
-        add             x1,  x1,  x2
-        ushll           v16.8h, v2.8b,  #4
-        st1             {v4.8h,  v5.8h},  [x0], x6
-        ushll2          v17.8h, v2.16b, #4
-        ushll           v18.8h, v3.8b,  #4
-        st1             {v6.8h,  v7.8h},  [x8], x6
-        ushll2          v19.8h, v3.16b, #4
-        st1             {v16.8h, v17.8h}, [x0], x6
-        st1             {v18.8h, v19.8h}, [x8], x6
-        b.gt            64b
-        ret
-1280:
-        add             x8,  x0,  #64
-        mov             x6,  #128
-128:
-        ldp             q0,  q1,  [x1]
-        ldp             q2,  q3,  [x1, #32]
-        ushll           v16.8h,  v0.8b,  #4
-        ushll2          v17.8h,  v0.16b, #4
-        ushll           v18.8h,  v1.8b,  #4
-        ushll2          v19.8h,  v1.16b, #4
-        ushll           v20.8h,  v2.8b,  #4
-        ushll2          v21.8h,  v2.16b, #4
-        ldp             q4,  q5,  [x1, #64]
-        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
-        ushll           v22.8h,  v3.8b,  #4
-        ushll2          v23.8h,  v3.16b, #4
-        ushll           v24.8h,  v4.8b,  #4
-        ushll2          v25.8h,  v4.16b, #4
-        ushll           v26.8h,  v5.8b,  #4
-        ushll2          v27.8h,  v5.16b, #4
-        ldp             q6,  q7,  [x1, #96]
-        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
-        ushll           v28.8h,  v6.8b,  #4
-        ushll2          v29.8h,  v6.16b, #4
-        ushll           v30.8h,  v7.8b,  #4
-        ushll2          v31.8h,  v7.16b, #4
-        subs            w4,  w4,  #1
-        add             x1,  x1,  x2
-        st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
-        st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
-        b.gt            128b
-        ret
-
-L(prep_tbl):
-        .hword L(prep_tbl) - 1280b
-        .hword L(prep_tbl) -  640b
-        .hword L(prep_tbl) -  320b
-        .hword L(prep_tbl) -  160b
-        .hword L(prep_tbl) -    8b
-        .hword L(prep_tbl) -    4b
-endfunc
-
-
-.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
-        ld1             {\d0\wd}[0], [\s0], \strd
-        ld1             {\d1\wd}[0], [\s1], \strd
-.ifnb \d2
-        ld1             {\d2\wd}[0], [\s0], \strd
-        ld1             {\d3\wd}[0], [\s1], \strd
-.endif
-.ifnb \d4
-        ld1             {\d4\wd}[0], [\s0], \strd
-.endif
-.ifnb \d5
-        ld1             {\d5\wd}[0], [\s1], \strd
-.endif
-.ifnb \d6
-        ld1             {\d6\wd}[0], [\s0], \strd
-.endif
-.endm
-.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
-        ld1             {\d0\wd}, [\s0], \strd
-        ld1             {\d1\wd}, [\s1], \strd
-.ifnb \d2
-        ld1             {\d2\wd}, [\s0], \strd
-        ld1             {\d3\wd}, [\s1], \strd
-.endif
-.ifnb \d4
-        ld1             {\d4\wd}, [\s0], \strd
-.endif
-.ifnb \d5
-        ld1             {\d5\wd}, [\s1], \strd
-.endif
-.ifnb \d6
-        ld1             {\d6\wd}, [\s0], \strd
-.endif
-.endm
-.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
-        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
-.endm
-.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
-        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
-.endm
-.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
-        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
-.endm
-.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
-        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
-.endm
-.macro interleave_1 wd, r0, r1, r2, r3, r4
-        trn1            \r0\wd, \r0\wd, \r1\wd
-        trn1            \r1\wd, \r1\wd, \r2\wd
-.ifnb \r3
-        trn1            \r2\wd, \r2\wd, \r3\wd
-        trn1            \r3\wd, \r3\wd, \r4\wd
-.endif
-.endm
-.macro interleave_1_h r0, r1, r2, r3, r4
-        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
-.endm
-.macro interleave_1_s r0, r1, r2, r3, r4
-        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
-.endm
-.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
-        trn1            \r0\wd,  \r0\wd, \r2\wd
-        trn1            \r1\wd,  \r1\wd, \r3\wd
-        trn1            \r2\wd,  \r2\wd, \r4\wd
-        trn1            \r3\wd,  \r3\wd, \r5\wd
-.endm
-.macro interleave_2_s r0, r1, r2, r3, r4, r5
-        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
-.endm
-.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
-        uxtl            \r0\().8h, \r0\().8b
-        uxtl            \r1\().8h, \r1\().8b
-.ifnb \r2
-        uxtl            \r2\().8h, \r2\().8b
-        uxtl            \r3\().8h, \r3\().8b
-.endif
-.ifnb \r4
-        uxtl            \r4\().8h, \r4\().8b
-.endif
-.ifnb \r5
-        uxtl            \r5\().8h, \r5\().8b
-.endif
-.ifnb \r6
-        uxtl            \r6\().8h, \r6\().8b
-.endif
-.endm
-.macro mul_mla_4 d, s0, s1, s2, s3, wd
-        mul             \d\wd,  \s0\wd,  v0.h[0]
-        mla             \d\wd,  \s1\wd,  v0.h[1]
-        mla             \d\wd,  \s2\wd,  v0.h[2]
-        mla             \d\wd,  \s3\wd,  v0.h[3]
-.endm
-.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
-        mul             \d0\().8h, \s0\().8h, v0.h[0]
-        mul             \d1\().8h, \s1\().8h, v0.h[0]
-        mla             \d0\().8h, \s1\().8h, v0.h[1]
-        mla             \d1\().8h, \s2\().8h, v0.h[1]
-        mla             \d0\().8h, \s2\().8h, v0.h[2]
-        mla             \d1\().8h, \s3\().8h, v0.h[2]
-        mla             \d0\().8h, \s3\().8h, v0.h[3]
-        mla             \d1\().8h, \s4\().8h, v0.h[3]
-        mla             \d0\().8h, \s4\().8h, v0.h[4]
-        mla             \d1\().8h, \s5\().8h, v0.h[4]
-        mla             \d0\().8h, \s5\().8h, v0.h[5]
-        mla             \d1\().8h, \s6\().8h, v0.h[5]
-        mla             \d0\().8h, \s6\().8h, v0.h[6]
-        mla             \d1\().8h, \s7\().8h, v0.h[6]
-        mla             \d0\().8h, \s7\().8h, v0.h[7]
-        mla             \d1\().8h, \s8\().8h, v0.h[7]
-.endm
-.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
-        mul             \d0\().8h, \s0\().8h, v0.h[0]
-        mul             \d1\().8h, \s2\().8h, v0.h[0]
-        mla             \d0\().8h, \s1\().8h, v0.h[1]
-        mla             \d1\().8h, \s3\().8h, v0.h[1]
-        mla             \d0\().8h, \s2\().8h, v0.h[2]
-        mla             \d1\().8h, \s4\().8h, v0.h[2]
-        mla             \d0\().8h, \s3\().8h, v0.h[3]
-        mla             \d1\().8h, \s5\().8h, v0.h[3]
-        mla             \d0\().8h, \s4\().8h, v0.h[4]
-        mla             \d1\().8h, \s6\().8h, v0.h[4]
-        mla             \d0\().8h, \s5\().8h, v0.h[5]
-        mla             \d1\().8h, \s7\().8h, v0.h[5]
-        mla             \d0\().8h, \s6\().8h, v0.h[6]
-        mla             \d1\().8h, \s8\().8h, v0.h[6]
-        mla             \d0\().8h, \s7\().8h, v0.h[7]
-        mla             \d1\().8h, \s9\().8h, v0.h[7]
-.endm
-.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
-        mul             \d0\().8h, \s0\().8h,  v0.h[0]
-        mul             \d1\().8h, \s4\().8h,  v0.h[0]
-        mla             \d0\().8h, \s1\().8h,  v0.h[1]
-        mla             \d1\().8h, \s5\().8h,  v0.h[1]
-        mla             \d0\().8h, \s2\().8h,  v0.h[2]
-        mla             \d1\().8h, \s6\().8h,  v0.h[2]
-        mla             \d0\().8h, \s3\().8h,  v0.h[3]
-        mla             \d1\().8h, \s7\().8h,  v0.h[3]
-        mla             \d0\().8h, \s4\().8h,  v0.h[4]
-        mla             \d1\().8h, \s8\().8h,  v0.h[4]
-        mla             \d0\().8h, \s5\().8h,  v0.h[5]
-        mla             \d1\().8h, \s9\().8h,  v0.h[5]
-        mla             \d0\().8h, \s6\().8h,  v0.h[6]
-        mla             \d1\().8h, \s10\().8h, v0.h[6]
-        mla             \d0\().8h, \s7\().8h,  v0.h[7]
-        mla             \d1\().8h, \s11\().8h, v0.h[7]
-.endm
-.macro sqrshrun_b shift, r0, r1, r2, r3
-        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
-.ifnb \r1
-        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
-.endif
-.ifnb \r2
-        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
-        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
-.endif
-.endm
-.macro srshr_h shift, r0, r1, r2, r3
-        srshr           \r0\().8h, \r0\().8h,  #\shift
-.ifnb \r1
-        srshr           \r1\().8h, \r1\().8h,  #\shift
-.endif
-.ifnb \r2
-        srshr           \r2\().8h, \r2\().8h,  #\shift
-        srshr           \r3\().8h, \r3\().8h,  #\shift
-.endif
-.endm
-.macro st_h strd, reg, lanes
-        st1             {\reg\().h}[0], [x0], \strd
-        st1             {\reg\().h}[1], [x8], \strd
-.if \lanes > 2
-        st1             {\reg\().h}[2], [x0], \strd
-        st1             {\reg\().h}[3], [x8], \strd
-.endif
-.endm
-.macro st_s strd, r0, r1, r2, r3
-        st1             {\r0\().s}[0], [x0], \strd
-        st1             {\r0\().s}[1], [x8], \strd
-.ifnb \r1
-        st1             {\r1\().s}[0], [x0], \strd
-        st1             {\r1\().s}[1], [x8], \strd
-.endif
-.endm
-.macro st_d strd, r0, r1, r2, r3
-        st1             {\r0\().d}[0], [x0], \strd
-        st1             {\r0\().d}[1], [x8], \strd
-.ifnb \r1
-        st1             {\r1\().d}[0], [x0], \strd
-        st1             {\r1\().d}[1], [x8], \strd
-.endif
-.endm
-.macro shift_store_4 type, strd, r0, r1, r2, r3
-.ifc \type, put
-        sqrshrun_b      6,     \r0, \r1, \r2, \r3
-        st_s            \strd, \r0, \r1, \r2, \r3
-.else
-        srshr_h         2,     \r0, \r1, \r2, \r3
-        st_d            \strd, \r0, \r1, \r2, \r3
-.endif
-.endm
-.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
-        st1             {\r0\wd}, [x0], \strd
-        st1             {\r1\wd}, [x8], \strd
-.ifnb \r2
-        st1             {\r2\wd}, [x0], \strd
-        st1             {\r3\wd}, [x8], \strd
-.endif
-.ifnb \r4
-        st1             {\r4\wd}, [x0], \strd
-        st1             {\r5\wd}, [x8], \strd
-        st1             {\r6\wd}, [x0], \strd
-        st1             {\r7\wd}, [x8], \strd
-.endif
-.endm
-.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
-        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
-.endm
-.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
-        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
-.endm
-.macro shift_store_8 type, strd, r0, r1, r2, r3
-.ifc \type, put
-        sqrshrun_b      6,     \r0, \r1, \r2, \r3
-        st_8b           \strd, \r0, \r1, \r2, \r3
-.else
-        srshr_h         2,     \r0, \r1, \r2, \r3
-        st_16b          \strd, \r0, \r1, \r2, \r3
-.endif
-.endm
-.macro shift_store_16 type, strd, r0, r1, r2, r3
-.ifc \type, put
-        sqrshrun        \r0\().8b,  \r0\().8h, #6
-        sqrshrun2       \r0\().16b, \r1\().8h, #6
-        sqrshrun        \r2\().8b,  \r2\().8h, #6
-        sqrshrun2       \r2\().16b, \r3\().8h, #6
-        st_16b          \strd, \r0, \r2
-.else
-        srshr_h         2,     \r0, \r1, \r2, \r3
-        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
-        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
-.endif
-.endm
-
-.macro make_8tap_fn op, type, type_h, type_v
-function \op\()_8tap_\type\()_8bpc_neon, export=1
-        mov             x8,  \type_h
-        mov             x9,  \type_v
-        b               \op\()_8tap
-endfunc
-.endm
-
-// No spaces in these expressions, due to gas-preprocessor.
-#define REGULAR ((0*15<<7)|3*15)
-#define SMOOTH  ((1*15<<7)|4*15)
-#define SHARP   ((2*15<<7)|3*15)
-
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
-make_8tap_fn \type, regular,        REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
-make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
-make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
-make_8tap_fn \type, sharp,          SHARP,   SHARP
-make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
-make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
-
-function \type\()_8tap
-        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
-        mul             \mx,  \mx, w10
-        mul             \my,  \my, w10
-        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
-        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
-.ifc \type, prep
-        uxtw            \d_strd, \w
-        lsl             \d_strd, \d_strd, #1
-.endif
-
-        clz             w8,  \w
-        tst             \mx, #(0x7f << 14)
-        sub             w8,  w8,  #24
-        movrel          x10, X(mc_subpel_filters), -8
-        b.ne            L(\type\()_8tap_h)
-        tst             \my, #(0x7f << 14)
-        b.ne            L(\type\()_8tap_v)
-        b               \type
-
-L(\type\()_8tap_h):
-        cmp             \w,  #4
-        ubfm            w9,  \mx, #7, #13
-        and             \mx, \mx, #0x7f
-        b.le            4f
-        mov             \mx,  w9
-4:
-        tst             \my,  #(0x7f << 14)
-        add             \xmx, x10, \mx, uxtw #3
-        b.ne            L(\type\()_8tap_hv)
-
-        adr             x9,  L(\type\()_8tap_h_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-20:     // 2xN h
-.ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
-        sub             \src,  \src,  #1
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd,  \d_strd,  #1
-        lsl             \s_strd,  \s_strd,  #1
-        sxtl            v0.8h,  v0.8b
-2:
-        ld1             {v4.8b},  [\src], \s_strd
-        ld1             {v6.8b},  [\sr2], \s_strd
-        uxtl            v4.8h,  v4.8b
-        uxtl            v6.8h,  v6.8b
-        ext             v5.16b, v4.16b, v4.16b, #2
-        ext             v7.16b, v6.16b, v6.16b, #2
-        subs            \h,  \h,  #2
-        trn1            v3.2s,  v4.2s,  v6.2s
-        trn2            v6.2s,  v4.2s,  v6.2s
-        trn1            v4.2s,  v5.2s,  v7.2s
-        trn2            v7.2s,  v5.2s,  v7.2s
-        mul             v3.4h,  v3.4h,  v0.h[0]
-        mla             v3.4h,  v4.4h,  v0.h[1]
-        mla             v3.4h,  v6.4h,  v0.h[2]
-        mla             v3.4h,  v7.4h,  v0.h[3]
-        srshr           v3.4h,  v3.4h,  #2
-        sqrshrun        v3.8b,  v3.8h,  #4
-        st1             {v3.h}[0], [\dst], \d_strd
-        st1             {v3.h}[1], [\ds2], \d_strd
-        b.gt            2b
-        ret
-.endif
-
-40:     // 4xN h
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0], [\xmx]
-        sub             \src,  \src,  #1
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd,  \d_strd,  #1
-        lsl             \s_strd,  \s_strd,  #1
-        sxtl            v0.8h,  v0.8b
-4:
-        ld1             {v16.8b}, [\src], \s_strd
-        ld1             {v20.8b}, [\sr2], \s_strd
-        uxtl            v16.8h,  v16.8b
-        uxtl            v20.8h,  v20.8b
-        ext             v17.16b, v16.16b, v16.16b, #2
-        ext             v18.16b, v16.16b, v16.16b, #4
-        ext             v19.16b, v16.16b, v16.16b, #6
-        ext             v21.16b, v20.16b, v20.16b, #2
-        ext             v22.16b, v20.16b, v20.16b, #4
-        ext             v23.16b, v20.16b, v20.16b, #6
-        subs            \h,  \h,  #2
-        mul             v16.4h,  v16.4h,  v0.h[0]
-        mla             v16.4h,  v17.4h,  v0.h[1]
-        mla             v16.4h,  v18.4h,  v0.h[2]
-        mla             v16.4h,  v19.4h,  v0.h[3]
-        mul             v20.4h,  v20.4h,  v0.h[0]
-        mla             v20.4h,  v21.4h,  v0.h[1]
-        mla             v20.4h,  v22.4h,  v0.h[2]
-        mla             v20.4h,  v23.4h,  v0.h[3]
-        srshr           v16.4h,  v16.4h,  #2
-        srshr           v20.4h,  v20.4h,  #2
-.ifc \type, put
-        sqrshrun        v16.8b,  v16.8h,  #4
-        sqrshrun        v20.8b,  v20.8h,  #4
-        st1             {v16.s}[0], [\dst], \d_strd
-        st1             {v20.s}[0], [\ds2], \d_strd
-.else
-        st1             {v16.4h}, [\dst], \d_strd
-        st1             {v20.4h}, [\ds2], \d_strd
-.endif
-        b.gt            4b
-        ret
-
-80:     // 8xN h
-        ld1             {v0.8b}, [\xmx]
-        sub             \src,  \src,  #3
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd,  \d_strd,  #1
-        lsl             \s_strd,  \s_strd,  #1
-        sxtl            v0.8h, v0.8b
-8:
-        ld1             {v16.8b, v17.8b},  [\src], \s_strd
-        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
-        uxtl            v16.8h,  v16.8b
-        uxtl            v17.8h,  v17.8b
-        uxtl            v20.8h,  v20.8b
-        uxtl            v21.8h,  v21.8b
-
-        mul             v18.8h,  v16.8h,  v0.h[0]
-        mul             v22.8h,  v20.8h,  v0.h[0]
-.irpc i, 1234567
-        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
-        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
-        mla             v18.8h,  v19.8h,  v0.h[\i]
-        mla             v22.8h,  v23.8h,  v0.h[\i]
-.endr
-        subs            \h,  \h,  #2
-        srshr           v18.8h,  v18.8h, #2
-        srshr           v22.8h,  v22.8h, #2
-.ifc \type, put
-        sqrshrun        v18.8b,  v18.8h, #4
-        sqrshrun        v22.8b,  v22.8h, #4
-        st1             {v18.8b}, [\dst], \d_strd
-        st1             {v22.8b}, [\ds2], \d_strd
-.else
-        st1             {v18.8h}, [\dst], \d_strd
-        st1             {v22.8h}, [\ds2], \d_strd
-.endif
-        b.gt            8b
-        ret
-160:
-320:
-640:
-1280:   // 16xN, 32xN, ... h
-        ld1             {v0.8b}, [\xmx]
-        sub             \src,  \src,  #3
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \s_strd,  \s_strd,  #1
-        sxtl            v0.8h, v0.8b
-
-        sub             \s_strd,  \s_strd,  \w, uxtw
-        sub             \s_strd,  \s_strd,  #8
-.ifc \type, put
-        lsl             \d_strd,  \d_strd,  #1
-        sub             \d_strd,  \d_strd,  \w, uxtw
-.endif
-161:
-        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
-        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
-        mov             \mx, \w
-        uxtl            v16.8h,  v16.8b
-        uxtl            v17.8h,  v17.8b
-        uxtl            v18.8h,  v18.8b
-        uxtl            v20.8h,  v20.8b
-        uxtl            v21.8h,  v21.8b
-        uxtl            v22.8h,  v22.8b
-
-16:
-        mul             v24.8h,  v16.8h,  v0.h[0]
-        mul             v25.8h,  v17.8h,  v0.h[0]
-        mul             v26.8h,  v20.8h,  v0.h[0]
-        mul             v27.8h,  v21.8h,  v0.h[0]
-.irpc i, 1234567
-        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
-        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
-        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
-        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
-        mla             v24.8h,  v28.8h,  v0.h[\i]
-        mla             v25.8h,  v29.8h,  v0.h[\i]
-        mla             v26.8h,  v30.8h,  v0.h[\i]
-        mla             v27.8h,  v31.8h,  v0.h[\i]
-.endr
-        srshr           v24.8h,  v24.8h, #2
-        srshr           v25.8h,  v25.8h, #2
-        srshr           v26.8h,  v26.8h, #2
-        srshr           v27.8h,  v27.8h, #2
-        subs            \mx, \mx, #16
-.ifc \type, put
-        sqrshrun        v24.8b,  v24.8h, #4
-        sqrshrun2       v24.16b, v25.8h, #4
-        sqrshrun        v26.8b,  v26.8h, #4
-        sqrshrun2       v26.16b, v27.8h, #4
-        st1             {v24.16b}, [\dst], #16
-        st1             {v26.16b}, [\ds2], #16
-.else
-        st1             {v24.8h, v25.8h}, [\dst], #32
-        st1             {v26.8h, v27.8h}, [\ds2], #32
-.endif
-        b.le            9f
-
-        mov             v16.16b, v18.16b
-        mov             v20.16b, v22.16b
-        ld1             {v17.8b, v18.8b}, [\src], #16
-        ld1             {v21.8b, v22.8b}, [\sr2], #16
-        uxtl            v17.8h,  v17.8b
-        uxtl            v18.8h,  v18.8b
-        uxtl            v21.8h,  v21.8b
-        uxtl            v22.8h,  v22.8b
-        b               16b
-
-9:
-        add             \dst,  \dst,  \d_strd
-        add             \ds2,  \ds2,  \d_strd
-        add             \src,  \src,  \s_strd
-        add             \sr2,  \sr2,  \s_strd
-
-        subs            \h,  \h,  #2
-        b.gt            161b
-        ret
-
-L(\type\()_8tap_h_tbl):
-        .hword L(\type\()_8tap_h_tbl) - 1280b
-        .hword L(\type\()_8tap_h_tbl) -  640b
-        .hword L(\type\()_8tap_h_tbl) -  320b
-        .hword L(\type\()_8tap_h_tbl) -  160b
-        .hword L(\type\()_8tap_h_tbl) -   80b
-        .hword L(\type\()_8tap_h_tbl) -   40b
-        .hword L(\type\()_8tap_h_tbl) -   20b
-        .hword 0
-
-
-L(\type\()_8tap_v):
-        cmp             \h,  #4
-        ubfm            w9,  \my, #7, #13
-        and             \my, \my, #0x7f
-        b.le            4f
-        mov             \my, w9
-4:
-        add             \xmy, x10, \my, uxtw #3
-
-        adr             x9,  L(\type\()_8tap_v_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-20:     // 2xN v
-.ifc \type, put
-        b.gt            28f
-
-        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
-        sub             \src,  \src,  \s_strd
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \s_strd,  \s_strd,  #1
-        lsl             \d_strd,  \d_strd,  #1
-        sxtl            v0.8h, v0.8b
-
-        // 2x2 v
-        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
-        interleave_1_h  v1, v2, v3, v4, v5
-        b.gt            24f
-        uxtl_b          v1, v2, v3, v4
-        mul_mla_4       v6, v1, v2, v3, v4, .4h
-        sqrshrun_b      6,  v6
-        st_h            \d_strd, v6, 2
-        ret
-
-24:     // 2x4 v
-        load_h          \sr2, \src, \s_strd, v6, v7
-        interleave_1_h  v5, v6, v7
-        interleave_2_s  v1, v2, v3, v4, v5, v6
-        uxtl_b          v1, v2, v3, v4
-        mul_mla_4       v6, v1, v2, v3, v4, .8h
-        sqrshrun_b      6,  v6
-        st_h            \d_strd, v6, 4
-        ret
-
-28:     // 2x8, 2x16 v
-        ld1             {v0.8b}, [\xmy]
-        sub             \sr2,  \src,  \s_strd, lsl #1
-        add             \ds2,  \dst,  \d_strd
-        sub             \src,  \sr2,  \s_strd
-        lsl             \d_strd,  \d_strd,  #1
-        lsl             \s_strd,  \s_strd,  #1
-        sxtl            v0.8h, v0.8b
-
-        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
-        interleave_1_h  v1,  v2,  v3,  v4,  v5
-        interleave_1_h  v5,  v6,  v7
-        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
-        uxtl_b          v1,  v2,  v3,  v4
-216:
-        subs            \h,  \h,  #8
-        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
-        load_h          \sr2, \src, \s_strd, v20, v21, v22, v23
-        interleave_1_h  v7,  v16, v17, v18, v19
-        interleave_1_h  v19, v20, v21, v22, v23
-        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
-        interleave_2_s  v17, v18, v19, v20, v21, v22
-        uxtl_b          v5,  v6,  v7,  v16
-        uxtl_b          v17, v18, v19, v20
-        mul_mla_8_4     v30, v31, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16, v17, v18, v19, v20
-        sqrshrun_b      6,   v30, v31
-        st_h            \d_strd, v30, 4
-        st_h            \d_strd, v31, 4
-        b.le            0f
-        mov             v1.16b,  v17.16b
-        mov             v2.16b,  v18.16b
-        mov             v3.16b,  v19.16b
-        mov             v4.16b,  v20.16b
-        mov             v5.16b,  v21.16b
-        mov             v6.16b,  v22.16b
-        mov             v7.16b,  v23.16b
-        b               216b
-0:
-        ret
-.endif
-
-40:
-        b.gt            480f
-
-        // 4x2, 4x4 v
-        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
-        sub             \src, \src, \s_strd
-        add             \ds2, \dst, \d_strd
-        add             \sr2, \src, \s_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h, v0.8b
-
-        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
-        interleave_1_s  v1, v2, v3, v4, v5
-        uxtl_b          v1, v2, v3, v4
-        mul_mla_4       v6, v1, v2, v3, v4, .8h
-        shift_store_4   \type, \d_strd, v6
-        b.le            0f
-        load_s          \sr2, \src, \s_strd, v6, v7
-        interleave_1_s  v5, v6, v7
-        uxtl_b          v5, v6
-        mul_mla_4       v7, v3, v4, v5, v6, .8h
-        shift_store_4   \type, \d_strd, v7
-0:
-        ret
-
-480:    // 4x8, 4x16 v
-        ld1             {v0.8b}, [\xmy]
-        sub             \sr2, \src, \s_strd, lsl #1
-        add             \ds2, \dst, \d_strd
-        sub             \src, \sr2, \s_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h, v0.8b
-
-        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
-        interleave_1_s  v16, v17, v18
-        interleave_1_s  v18, v19, v20, v21, v22
-        uxtl_b          v16, v17
-        uxtl_b          v18, v19, v20, v21
-
-48:
-        subs            \h,  \h,  #4
-        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
-        interleave_1_s  v22, v23, v24, v25, v26
-        uxtl_b          v22, v23, v24, v25
-        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
-        shift_store_4   \type, \d_strd, v1, v2
-        b.le            0f
-        subs            \h,  \h,  #4
-        load_s          \sr2,  \src, \s_strd, v27, v16, v17, v18
-        interleave_1_s  v26, v27, v16, v17, v18
-        uxtl_b          v26, v27, v16, v17
-        mul_mla_8_2     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16, v17
-        shift_store_4   \type, \d_strd, v1, v2
-        b.le            0f
-        subs            \h,  \h,  #4
-        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
-        interleave_1_s  v18, v19, v20, v21, v22
-        uxtl_b          v18, v19, v20, v21
-        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
-        shift_store_4   \type, \d_strd, v1, v2
-        b               48b
-0:
-        ret
-
-80:
-        b.gt            880f
-
-        // 8x2, 8x4 v
-        cmp             \h,  #2
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
-        sub             \src, \src, \s_strd
-        add             \ds2, \dst, \d_strd
-        add             \sr2, \src, \s_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h, v0.8b
-
-        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
-        uxtl_b          v1, v2, v3, v4, v5
-        mul_mla_4       v6, v1, v2, v3, v4, .8h
-        mul_mla_4       v7, v2, v3, v4, v5, .8h
-        shift_store_8   \type, \d_strd, v6, v7
-        b.le            0f
-        load_8b         \sr2, \src, \s_strd, v6, v7
-        uxtl_b          v6, v7
-        mul_mla_4       v1, v3, v4, v5, v6, .8h
-        mul_mla_4       v2, v4, v5, v6, v7, .8h
-        shift_store_8   \type, \d_strd, v1, v2
-0:
-        ret
-
-880:    // 8x8, 8x16, 8x32 v
-1680:   // 16x8, 16x16, ...
-320:    // 32x8, 32x16, ...
-640:
-1280:
-        ld1             {v0.8b}, [\xmy]
-        sub             \src, \src, \s_strd
-        sub             \src, \src, \s_strd, lsl #1
-        sxtl            v0.8h, v0.8b
-        mov             \my,  \h
-168:
-        add             \ds2, \dst, \d_strd
-        add             \sr2, \src, \s_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-
-        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
-        uxtl_b          v16, v17, v18, v19, v20, v21, v22
-
-88:
-        subs            \h,  \h,  #2
-        load_8b         \sr2, \src, \s_strd, v23, v24
-        uxtl_b          v23, v24
-        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
-        shift_store_8   \type, \d_strd, v1, v2
-        b.le            9f
-        subs            \h,  \h,  #2
-        load_8b         \sr2, \src, \s_strd, v25, v26
-        uxtl_b          v25, v26
-        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
-        shift_store_8   \type, \d_strd, v3, v4
-        b.le            9f
-        subs            \h,  \h,  #4
-        load_8b         \sr2, \src, \s_strd, v27, v16, v17, v18
-        uxtl_b          v27, v16, v17, v18
-        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
-        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
-        shift_store_8   \type, \d_strd, v1, v2, v3, v4
-        b.le            9f
-        subs            \h,  \h,  #4
-        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
-        uxtl_b          v19, v20, v21, v22
-        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
-        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
-        shift_store_8   \type, \d_strd, v1, v2, v3, v4
-        b.gt            88b
-9:
-        subs            \w,  \w,  #8
-        b.le            0f
-        asr             \s_strd, \s_strd, #1
-        asr             \d_strd, \d_strd, #1
-        msub            \src, \s_strd, \xmy, \src
-        msub            \dst, \d_strd, \xmy, \dst
-        sub             \src, \src, \s_strd, lsl #3
-        mov             \h,  \my
-        add             \src, \src, #8
-.ifc \type, put
-        add             \dst, \dst, #8
-.else
-        add             \dst, \dst, #16
-.endif
-        b               168b
-0:
-        ret
-
-160:
-        b.gt            1680b
-
-        // 16x4 v
-        add             \xmy, \xmy, #2
-        ld1             {v0.s}[0], [\xmy]
-        sub             \src, \src, \s_strd
-        add             \ds2, \dst, \d_strd
-        add             \sr2, \src, \s_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h, v0.8b
-
-        cmp             \h,  #2
-        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
-        uxtl            v16.8h, v1.8b
-        uxtl            v17.8h, v2.8b
-        uxtl            v18.8h, v3.8b
-        uxtl            v19.8h, v4.8b
-        uxtl            v20.8h, v5.8b
-        uxtl2           v23.8h, v1.16b
-        uxtl2           v24.8h, v2.16b
-        uxtl2           v25.8h, v3.16b
-        uxtl2           v26.8h, v4.16b
-        uxtl2           v27.8h, v5.16b
-        mul_mla_4       v1,  v16, v17, v18, v19, .8h
-        mul_mla_4       v16, v17, v18, v19, v20, .8h
-        mul_mla_4       v2,  v23, v24, v25, v26, .8h
-        mul_mla_4       v17, v24, v25, v26, v27, .8h
-        shift_store_16  \type, \d_strd, v1, v2, v16, v17
-        b.le            0f
-        load_16b        \sr2, \src, \s_strd, v6,  v7
-        uxtl            v21.8h, v6.8b
-        uxtl            v22.8h, v7.8b
-        uxtl2           v28.8h, v6.16b
-        uxtl2           v29.8h, v7.16b
-        mul_mla_4       v1,  v18, v19, v20, v21, .8h
-        mul_mla_4       v3,  v19, v20, v21, v22, .8h
-        mul_mla_4       v2,  v25, v26, v27, v28, .8h
-        mul_mla_4       v4,  v26, v27, v28, v29, .8h
-        shift_store_16  \type, \d_strd, v1, v2, v3, v4
-0:
-        ret
-
-L(\type\()_8tap_v_tbl):
-        .hword L(\type\()_8tap_v_tbl) - 1280b
-        .hword L(\type\()_8tap_v_tbl) -  640b
-        .hword L(\type\()_8tap_v_tbl) -  320b
-        .hword L(\type\()_8tap_v_tbl) -  160b
-        .hword L(\type\()_8tap_v_tbl) -   80b
-        .hword L(\type\()_8tap_v_tbl) -   40b
-        .hword L(\type\()_8tap_v_tbl) -   20b
-        .hword 0
-
-L(\type\()_8tap_hv):
-        cmp             \h,  #4
-        ubfm            w9,  \my, #7, #13
-        and             \my, \my, #0x7f
-        b.le            4f
-        mov             \my,  w9
-4:
-        add             \xmy,  x10, \my, uxtw #3
-
-        adr             x9,  L(\type\()_8tap_hv_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-20:
-.ifc \type, put
-        add             \xmx,  \xmx,  #2
-        ld1             {v0.s}[0],  [\xmx]
-        b.gt            280f
-        add             \xmy,  \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
-
-        // 2x2, 2x4 hv
-        sub             \sr2, \src, #1
-        sub             \src, \sr2, \s_strd
-        add             \ds2, \dst, \d_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h,  v0.8b
-        sxtl            v1.8h,  v1.8b
-        mov             x15, x30
-
-        ld1             {v28.8b}, [\src], \s_strd
-        uxtl            v28.8h,  v28.8b
-        ext             v29.16b, v28.16b, v28.16b, #2
-        mul             v28.4h,  v28.4h,  v0.4h
-        mul             v29.4h,  v29.4h,  v0.4h
-        addv            h28, v28.4h
-        addv            h29, v29.4h
-        trn1            v16.4h, v28.4h, v29.4h
-        srshr           v16.4h, v16.4h, #2
-        bl              L(\type\()_8tap_filter_2)
-
-        trn1            v16.2s, v16.2s, v28.2s
-        trn1            v17.2s, v28.2s, v30.2s
-        mov             v18.8b, v30.8b
-
-2:
-        bl              L(\type\()_8tap_filter_2)
-
-        trn1            v18.2s, v18.2s, v28.2s
-        trn1            v19.2s, v28.2s, v30.2s
-        smull           v2.4s,  v16.4h, v1.h[0]
-        smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v2.4s,  v19.4h, v1.h[3]
-
-        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
-        sqxtun          v2.8b,  v2.8h
-        subs            \h,  \h,  #2
-        st1             {v2.h}[0], [\dst], \d_strd
-        st1             {v2.h}[1], [\ds2], \d_strd
-        b.le            0f
-        mov             v16.8b, v18.8b
-        mov             v17.8b, v19.8b
-        mov             v18.8b, v30.8b
-        b               2b
-
-280:    // 2x8, 2x16, 2x32 hv
-        ld1             {v1.8b},  [\xmy]
-        sub             \src, \src, #1
-        sub             \sr2, \src, \s_strd, lsl #1
-        sub             \src, \sr2, \s_strd
-        add             \ds2, \dst, \d_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h,  v0.8b
-        sxtl            v1.8h,  v1.8b
-        mov             x15, x30
-
-        ld1             {v28.8b}, [\src], \s_strd
-        uxtl            v28.8h,  v28.8b
-        ext             v29.16b, v28.16b, v28.16b, #2
-        mul             v28.4h,  v28.4h,  v0.4h
-        mul             v29.4h,  v29.4h,  v0.4h
-        addv            h28, v28.4h
-        addv            h29, v29.4h
-        trn1            v16.4h, v28.4h, v29.4h
-        srshr           v16.4h, v16.4h, #2
-
-        bl              L(\type\()_8tap_filter_2)
-        trn1            v16.2s, v16.2s, v28.2s
-        trn1            v17.2s, v28.2s, v30.2s
-        mov             v18.8b, v30.8b
-        bl              L(\type\()_8tap_filter_2)
-        trn1            v18.2s, v18.2s, v28.2s
-        trn1            v19.2s, v28.2s, v30.2s
-        mov             v20.8b, v30.8b
-        bl              L(\type\()_8tap_filter_2)
-        trn1            v20.2s, v20.2s, v28.2s
-        trn1            v21.2s, v28.2s, v30.2s
-        mov             v22.8b, v30.8b
-
-28:
-        bl              L(\type\()_8tap_filter_2)
-        trn1            v22.2s, v22.2s, v28.2s
-        trn1            v23.2s, v28.2s, v30.2s
-        smull           v2.4s,  v16.4h, v1.h[0]
-        smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v2.4s,  v19.4h, v1.h[3]
-        smlal           v2.4s,  v20.4h, v1.h[4]
-        smlal           v2.4s,  v21.4h, v1.h[5]
-        smlal           v2.4s,  v22.4h, v1.h[6]
-        smlal           v2.4s,  v23.4h, v1.h[7]
-
-        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
-        sqxtun          v2.8b,  v2.8h
-        subs            \h,  \h,  #2
-        st1             {v2.h}[0], [\dst], \d_strd
-        st1             {v2.h}[1], [\ds2], \d_strd
-        b.le            0f
-        mov             v16.8b, v18.8b
-        mov             v17.8b, v19.8b
-        mov             v18.8b, v20.8b
-        mov             v19.8b, v21.8b
-        mov             v20.8b, v22.8b
-        mov             v21.8b, v23.8b
-        mov             v22.8b, v30.8b
-        b               28b
-
-0:
-        br              x15
-
-L(\type\()_8tap_filter_2):
-        ld1             {v28.8b},  [\sr2], \s_strd
-        ld1             {v30.8b},  [\src], \s_strd
-        uxtl            v28.8h,  v28.8b
-        uxtl            v30.8h,  v30.8b
-        ext             v29.16b, v28.16b, v28.16b, #2
-        ext             v31.16b, v30.16b, v30.16b, #2
-        trn1            v27.2s,  v28.2s,  v30.2s
-        trn2            v30.2s,  v28.2s,  v30.2s
-        trn1            v28.2s,  v29.2s,  v31.2s
-        trn2            v31.2s,  v29.2s,  v31.2s
-        mul             v27.4h,  v27.4h,  v0.h[0]
-        mla             v27.4h,  v28.4h,  v0.h[1]
-        mla             v27.4h,  v30.4h,  v0.h[2]
-        mla             v27.4h,  v31.4h,  v0.h[3]
-        srshr           v28.4h,  v27.4h,  #2
-        trn2            v30.2s,  v28.2s,  v28.2s
-        ret
-.endif
-
-40:
-        add             \xmx, \xmx, #2
-        ld1             {v0.s}[0],  [\xmx]
-        b.gt            480f
-        add             \xmy, \xmy,  #2
-        ld1             {v1.s}[0],  [\xmy]
-        sub             \sr2, \src, #1
-        sub             \src, \sr2, \s_strd
-        add             \ds2, \dst, \d_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h,  v0.8b
-        sxtl            v1.8h,  v1.8b
-        mov             x15, x30
-
-        // 4x2, 4x4 hv
-        ld1             {v26.8b}, [\src], \s_strd
-        uxtl            v26.8h,  v26.8b
-        ext             v28.16b, v26.16b, v26.16b, #2
-        ext             v29.16b, v26.16b, v26.16b, #4
-        ext             v30.16b, v26.16b, v26.16b, #6
-        mul             v31.4h,  v26.4h,  v0.h[0]
-        mla             v31.4h,  v28.4h,  v0.h[1]
-        mla             v31.4h,  v29.4h,  v0.h[2]
-        mla             v31.4h,  v30.4h,  v0.h[3]
-        srshr           v16.4h,  v31.4h,  #2
-
-        bl              L(\type\()_8tap_filter_4)
-        mov             v17.8b, v28.8b
-        mov             v18.8b, v29.8b
-
-4:
-        smull           v2.4s,  v16.4h, v1.h[0]
-        bl              L(\type\()_8tap_filter_4)
-        smull           v3.4s,  v17.4h, v1.h[0]
-        smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal           v3.4s,  v18.4h, v1.h[1]
-        smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v3.4s,  v28.4h, v1.h[2]
-        smlal           v2.4s,  v28.4h, v1.h[3]
-        smlal           v3.4s,  v29.4h, v1.h[3]
-        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
-        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
-        subs            \h,  \h,  #2
-.ifc \type, put
-        sqxtun          v2.8b,  v2.8h
-        sqxtun          v3.8b,  v3.8h
-        st1             {v2.s}[0], [\dst], \d_strd
-        st1             {v3.s}[0], [\ds2], \d_strd
-.else
-        st1             {v2.4h}, [\dst], \d_strd
-        st1             {v3.4h}, [\ds2], \d_strd
-.endif
-        b.le            0f
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v28.16b
-        mov             v18.16b, v29.16b
-        b               4b
-
-480:    // 4x8, 4x16, 4x32 hv
-        ld1             {v1.8b},  [\xmy]
-        sub             \src, \src, #1
-        sub             \sr2, \src, \s_strd, lsl #1
-        sub             \src, \sr2, \s_strd
-        add             \ds2, \dst, \d_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-        sxtl            v0.8h,  v0.8b
-        sxtl            v1.8h,  v1.8b
-        mov             x15, x30
-
-        ld1             {v26.8b}, [\src], \s_strd
-        uxtl            v26.8h,  v26.8b
-        ext             v28.16b, v26.16b, v26.16b, #2
-        ext             v29.16b, v26.16b, v26.16b, #4
-        ext             v30.16b, v26.16b, v26.16b, #6
-        mul             v31.4h,  v26.4h,  v0.h[0]
-        mla             v31.4h,  v28.4h,  v0.h[1]
-        mla             v31.4h,  v29.4h,  v0.h[2]
-        mla             v31.4h,  v30.4h,  v0.h[3]
-        srshr           v16.4h,  v31.4h,  #2
-
-        bl              L(\type\()_8tap_filter_4)
-        mov             v17.8b, v28.8b
-        mov             v18.8b, v29.8b
-        bl              L(\type\()_8tap_filter_4)
-        mov             v19.8b, v28.8b
-        mov             v20.8b, v29.8b
-        bl              L(\type\()_8tap_filter_4)
-        mov             v21.8b, v28.8b
-        mov             v22.8b, v29.8b
-
-48:
-        smull           v2.4s,  v16.4h, v1.h[0]
-        bl              L(\type\()_8tap_filter_4)
-        smull           v3.4s,  v17.4h, v1.h[0]
-        smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal           v3.4s,  v18.4h, v1.h[1]
-        smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v3.4s,  v19.4h, v1.h[2]
-        smlal           v2.4s,  v19.4h, v1.h[3]
-        smlal           v3.4s,  v20.4h, v1.h[3]
-        smlal           v2.4s,  v20.4h, v1.h[4]
-        smlal           v3.4s,  v21.4h, v1.h[4]
-        smlal           v2.4s,  v21.4h, v1.h[5]
-        smlal           v3.4s,  v22.4h, v1.h[5]
-        smlal           v2.4s,  v22.4h, v1.h[6]
-        smlal           v3.4s,  v28.4h, v1.h[6]
-        smlal           v2.4s,  v28.4h, v1.h[7]
-        smlal           v3.4s,  v29.4h, v1.h[7]
-        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
-        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
-        subs            \h,  \h,  #2
-.ifc \type, put
-        sqxtun          v2.8b,  v2.8h
-        sqxtun          v3.8b,  v3.8h
-        st1             {v2.s}[0], [\dst], \d_strd
-        st1             {v3.s}[0], [\ds2], \d_strd
-.else
-        st1             {v2.4h}, [\dst], \d_strd
-        st1             {v3.4h}, [\ds2], \d_strd
-.endif
-        b.le            0f
-        mov             v16.8b,  v18.8b
-        mov             v17.8b,  v19.8b
-        mov             v18.8b,  v20.8b
-        mov             v19.8b,  v21.8b
-        mov             v20.8b,  v22.8b
-        mov             v21.8b,  v28.8b
-        mov             v22.8b,  v29.8b
-        b               48b
-0:
-        br              x15
-
-L(\type\()_8tap_filter_4):
-        ld1             {v26.8b}, [\sr2], \s_strd
-        ld1             {v27.8b}, [\src], \s_strd
-        uxtl            v26.8h,  v26.8b
-        uxtl            v27.8h,  v27.8b
-        ext             v28.16b, v26.16b, v26.16b, #2
-        ext             v29.16b, v26.16b, v26.16b, #4
-        ext             v30.16b, v26.16b, v26.16b, #6
-        mul             v31.4h,  v26.4h,  v0.h[0]
-        mla             v31.4h,  v28.4h,  v0.h[1]
-        mla             v31.4h,  v29.4h,  v0.h[2]
-        mla             v31.4h,  v30.4h,  v0.h[3]
-        ext             v28.16b, v27.16b, v27.16b, #2
-        ext             v29.16b, v27.16b, v27.16b, #4
-        ext             v30.16b, v27.16b, v27.16b, #6
-        mul             v27.4h,  v27.4h,  v0.h[0]
-        mla             v27.4h,  v28.4h,  v0.h[1]
-        mla             v27.4h,  v29.4h,  v0.h[2]
-        mla             v27.4h,  v30.4h,  v0.h[3]
-        srshr           v28.4h,  v31.4h,  #2
-        srshr           v29.4h,  v27.4h,  #2
-        ret
-
-80:
-160:
-320:
-        b.gt            880f
-        add             \xmy,  \xmy,  #2
-        ld1             {v0.8b},  [\xmx]
-        ld1             {v1.s}[0],  [\xmy]
-        sub             \src,  \src,  #3
-        sub             \src,  \src,  \s_strd
-        sxtl            v0.8h,  v0.8b
-        sxtl            v1.8h,  v1.8b
-        mov             x15, x30
-        mov             \my,  \h
-
-164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd, \d_strd, #1
-        lsl             \s_strd, \s_strd, #1
-
-        ld1             {v28.8b, v29.8b},  [\src], \s_strd
-        uxtl            v28.8h,  v28.8b
-        uxtl            v29.8h,  v29.8b
-        mul             v24.8h,  v28.8h,  v0.h[0]
-.irpc i, 1234567
-        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
-        mla             v24.8h,  v26.8h,  v0.h[\i]
-.endr
-        srshr           v16.8h,  v24.8h, #2
-
-        bl              L(\type\()_8tap_filter_8)
-        mov             v17.16b, v24.16b
-        mov             v18.16b, v25.16b
-
-8:
-        smull           v2.4s,  v16.4h, v1.h[0]
-        smull2          v3.4s,  v16.8h, v1.h[0]
-        bl              L(\type\()_8tap_filter_8)
-        smull           v4.4s,  v17.4h, v1.h[0]
-        smull2          v5.4s,  v17.8h, v1.h[0]
-        smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal2          v3.4s,  v17.8h, v1.h[1]
-        smlal           v4.4s,  v18.4h, v1.h[1]
-        smlal2          v5.4s,  v18.8h, v1.h[1]
-        smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal2          v3.4s,  v18.8h, v1.h[2]
-        smlal           v4.4s,  v24.4h, v1.h[2]
-        smlal2          v5.4s,  v24.8h, v1.h[2]
-        smlal           v2.4s,  v24.4h, v1.h[3]
-        smlal2          v3.4s,  v24.8h, v1.h[3]
-        smlal           v4.4s,  v25.4h, v1.h[3]
-        smlal2          v5.4s,  v25.8h, v1.h[3]
-        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
-        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
-        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
-        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
-        subs            \h,  \h,  #2
-.ifc \type, put
-        sqxtun          v2.8b,  v2.8h
-        sqxtun          v4.8b,  v4.8h
-        st1             {v2.8b}, [\dst], \d_strd
-        st1             {v4.8b}, [\ds2], \d_strd
-.else
-        st1             {v2.8h}, [\dst], \d_strd
-        st1             {v4.8h}, [\ds2], \d_strd
-.endif
-        b.le            9f
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v24.16b
-        mov             v18.16b, v25.16b
-        b               8b
-9:
-        subs            \w,  \w,  #8
-        b.le            0f
-        asr             \s_strd,  \s_strd,  #1
-        asr             \d_strd,  \d_strd,  #1
-        msub            \src,  \s_strd,  \xmy,  \src
-        msub            \dst,  \d_strd,  \xmy,  \dst
-        sub             \src,  \src,  \s_strd,  lsl #2
-        mov             \h,  \my
-        add             \src,  \src,  #8
-.ifc \type, put
-        add             \dst,  \dst,  #8
-.else
-        add             \dst,  \dst,  #16
-.endif
-        b               164b
-
-880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
-640:
-1280:
-        ld1             {v0.8b},  [\xmx]
-        ld1             {v1.8b},  [\xmy]
-        sub             \src,  \src,  #3
-        sub             \src,  \src,  \s_strd
-        sub             \src,  \src,  \s_strd, lsl #1
-        sxtl            v0.8h,  v0.8b
-        sxtl            v1.8h,  v1.8b
-        mov             x15, x30
-        mov             \my,  \h
-
-168:
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd, \d_strd, #1
-        lsl             \s_strd, \s_strd, #1
-
-        ld1             {v28.8b, v29.8b},  [\src], \s_strd
-        uxtl            v28.8h,  v28.8b
-        uxtl            v29.8h,  v29.8b
-        mul             v24.8h,  v28.8h,  v0.h[0]
-.irpc i, 1234567
-        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
-        mla             v24.8h,  v26.8h,  v0.h[\i]
-.endr
-        srshr           v16.8h,  v24.8h, #2
-
-        bl              L(\type\()_8tap_filter_8)
-        mov             v17.16b, v24.16b
-        mov             v18.16b, v25.16b
-        bl              L(\type\()_8tap_filter_8)
-        mov             v19.16b, v24.16b
-        mov             v20.16b, v25.16b
-        bl              L(\type\()_8tap_filter_8)
-        mov             v21.16b, v24.16b
-        mov             v22.16b, v25.16b
-
-88:
-        smull           v2.4s,  v16.4h, v1.h[0]
-        smull2          v3.4s,  v16.8h, v1.h[0]
-        bl              L(\type\()_8tap_filter_8)
-        smull           v4.4s,  v17.4h, v1.h[0]
-        smull2          v5.4s,  v17.8h, v1.h[0]
-        smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal2          v3.4s,  v17.8h, v1.h[1]
-        smlal           v4.4s,  v18.4h, v1.h[1]
-        smlal2          v5.4s,  v18.8h, v1.h[1]
-        smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal2          v3.4s,  v18.8h, v1.h[2]
-        smlal           v4.4s,  v19.4h, v1.h[2]
-        smlal2          v5.4s,  v19.8h, v1.h[2]
-        smlal           v2.4s,  v19.4h, v1.h[3]
-        smlal2          v3.4s,  v19.8h, v1.h[3]
-        smlal           v4.4s,  v20.4h, v1.h[3]
-        smlal2          v5.4s,  v20.8h, v1.h[3]
-        smlal           v2.4s,  v20.4h, v1.h[4]
-        smlal2          v3.4s,  v20.8h, v1.h[4]
-        smlal           v4.4s,  v21.4h, v1.h[4]
-        smlal2          v5.4s,  v21.8h, v1.h[4]
-        smlal           v2.4s,  v21.4h, v1.h[5]
-        smlal2          v3.4s,  v21.8h, v1.h[5]
-        smlal           v4.4s,  v22.4h, v1.h[5]
-        smlal2          v5.4s,  v22.8h, v1.h[5]
-        smlal           v2.4s,  v22.4h, v1.h[6]
-        smlal2          v3.4s,  v22.8h, v1.h[6]
-        smlal           v4.4s,  v24.4h, v1.h[6]
-        smlal2          v5.4s,  v24.8h, v1.h[6]
-        smlal           v2.4s,  v24.4h, v1.h[7]
-        smlal2          v3.4s,  v24.8h, v1.h[7]
-        smlal           v4.4s,  v25.4h, v1.h[7]
-        smlal2          v5.4s,  v25.8h, v1.h[7]
-        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
-        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
-        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
-        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
-        subs            \h,  \h,  #2
-.ifc \type, put
-        sqxtun          v2.8b,  v2.8h
-        sqxtun          v4.8b,  v4.8h
-        st1             {v2.8b}, [\dst], \d_strd
-        st1             {v4.8b}, [\ds2], \d_strd
-.else
-        st1             {v2.8h}, [\dst], \d_strd
-        st1             {v4.8h}, [\ds2], \d_strd
-.endif
-        b.le            9f
-        mov             v16.16b, v18.16b
-        mov             v17.16b, v19.16b
-        mov             v18.16b, v20.16b
-        mov             v19.16b, v21.16b
-        mov             v20.16b, v22.16b
-        mov             v21.16b, v24.16b
-        mov             v22.16b, v25.16b
-        b               88b
-9:
-        subs            \w,  \w,  #8
-        b.le            0f
-        asr             \s_strd,  \s_strd,  #1
-        asr             \d_strd,  \d_strd,  #1
-        msub            \src,  \s_strd,  \xmy,  \src
-        msub            \dst,  \d_strd,  \xmy,  \dst
-        sub             \src,  \src,  \s_strd,  lsl #3
-        mov             \h,  \my
-        add             \src,  \src,  #8
-.ifc \type, put
-        add             \dst,  \dst,  #8
-.else
-        add             \dst,  \dst,  #16
-.endif
-        b               168b
-0:
-        br              x15
-
-L(\type\()_8tap_filter_8):
-        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
-        ld1             {v30.8b, v31.8b},  [\src], \s_strd
-        uxtl            v28.8h,  v28.8b
-        uxtl            v29.8h,  v29.8b
-        uxtl            v30.8h,  v30.8b
-        uxtl            v31.8h,  v31.8b
-        mul             v24.8h,  v28.8h,  v0.h[0]
-        mul             v25.8h,  v30.8h,  v0.h[0]
-.irpc i, 1234567
-        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
-        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
-        mla             v24.8h,  v26.8h,  v0.h[\i]
-        mla             v25.8h,  v27.8h,  v0.h[\i]
-.endr
-        srshr           v24.8h,  v24.8h, #2
-        srshr           v25.8h,  v25.8h, #2
-        ret
-
-L(\type\()_8tap_hv_tbl):
-        .hword L(\type\()_8tap_hv_tbl) - 1280b
-        .hword L(\type\()_8tap_hv_tbl) -  640b
-        .hword L(\type\()_8tap_hv_tbl) -  320b
-        .hword L(\type\()_8tap_hv_tbl) -  160b
-        .hword L(\type\()_8tap_hv_tbl) -   80b
-        .hword L(\type\()_8tap_hv_tbl) -   40b
-        .hword L(\type\()_8tap_hv_tbl) -   20b
-        .hword 0
-endfunc
-
-
-function \type\()_bilin_8bpc_neon, export=1
-        dup             v1.16b, \mx
-        dup             v3.16b, \my
-        mov             w9,  #16
-        sub             w8, w9, \mx
-        sub             w9, w9, \my
-        dup             v0.16b, w8
-        dup             v2.16b, w9
-.ifc \type, prep
-        uxtw            \d_strd, \w
-        lsl             \d_strd, \d_strd, #1
-.endif
-
-        clz             w8,  \w
-        sub             w8,  w8,  #24
-        cbnz            \mx, L(\type\()_bilin_h)
-        cbnz            \my, L(\type\()_bilin_v)
-        b               \type
-
-L(\type\()_bilin_h):
-        cbnz            \my, L(\type\()_bilin_hv)
-
-        adr             x9,  L(\type\()_bilin_h_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-20:     // 2xN h
-.ifc \type, put
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd,  \d_strd,  #1
-        lsl             \s_strd,  \s_strd,  #1
-2:
-        ld1             {v4.s}[0],  [\src], \s_strd
-        ld1             {v6.s}[0],  [\sr2], \s_strd
-        ext             v5.8b,  v4.8b,  v4.8b, #1
-        ext             v7.8b,  v6.8b,  v6.8b, #1
-        trn1            v4.4h,  v4.4h,  v6.4h
-        trn1            v5.4h,  v5.4h,  v7.4h
-        subs            \h,  \h,  #2
-        umull           v4.8h,  v4.8b,  v0.8b
-        umlal           v4.8h,  v5.8b,  v1.8b
-        uqrshrn         v4.8b,  v4.8h,  #4
-        st1             {v4.h}[0], [\dst], \d_strd
-        st1             {v4.h}[1], [\ds2], \d_strd
-        b.gt            2b
-        ret
-.endif
-
-40:     // 4xN h
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd,  \d_strd,  #1
-        lsl             \s_strd,  \s_strd,  #1
-4:
-        ld1             {v4.8b}, [\src], \s_strd
-        ld1             {v6.8b}, [\sr2], \s_strd
-        ext             v5.8b,  v4.8b,  v4.8b, #1
-        ext             v7.8b,  v6.8b,  v6.8b, #1
-        trn1            v4.2s,  v4.2s,  v6.2s
-        trn1            v5.2s,  v5.2s,  v7.2s
-        subs            \h,  \h,  #2
-        umull           v4.8h,  v4.8b,  v0.8b
-        umlal           v4.8h,  v5.8b,  v1.8b
-.ifc \type, put
-        uqrshrn         v4.8b,  v4.8h,  #4
-        st1             {v4.s}[0], [\dst], \d_strd
-        st1             {v4.s}[1], [\ds2], \d_strd
-.else
-        st1             {v4.d}[0], [\dst], \d_strd
-        st1             {v4.d}[1], [\ds2], \d_strd
-.endif
-        b.gt            4b
-        ret
-
-80:     // 8xN h
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \d_strd,  \d_strd,  #1
-        lsl             \s_strd,  \s_strd,  #1
-8:
-        ld1             {v4.16b}, [\src], \s_strd
-        ld1             {v6.16b}, [\sr2], \s_strd
-        ext             v5.16b, v4.16b, v4.16b, #1
-        ext             v7.16b, v6.16b, v6.16b, #1
-        subs            \h,  \h,  #2
-        umull           v4.8h,  v4.8b,  v0.8b
-        umull           v6.8h,  v6.8b,  v0.8b
-        umlal           v4.8h,  v5.8b,  v1.8b
-        umlal           v6.8h,  v7.8b,  v1.8b
-.ifc \type, put
-        uqrshrn         v4.8b,  v4.8h,  #4
-        uqrshrn         v6.8b,  v6.8h,  #4
-        st1             {v4.8b}, [\dst], \d_strd
-        st1             {v6.8b}, [\ds2], \d_strd
-.else
-        st1             {v4.8h}, [\dst], \d_strd
-        st1             {v6.8h}, [\ds2], \d_strd
-.endif
-        b.gt            8b
-        ret
-160:
-320:
-640:
-1280:   // 16xN, 32xN, ... h
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \s_strd,  \s_strd,  #1
-
-        sub             \s_strd,  \s_strd,  \w, uxtw
-        sub             \s_strd,  \s_strd,  #8
-.ifc \type, put
-        lsl             \d_strd,  \d_strd,  #1
-        sub             \d_strd,  \d_strd,  \w, uxtw
-.endif
-161:
-        ld1             {v16.d}[1],  [\src], #8
-        ld1             {v20.d}[1],  [\sr2], #8
-        mov             \mx, \w
-
-16:
-        ld1             {v18.16b},  [\src], #16
-        ld1             {v22.16b},  [\sr2], #16
-        ext             v17.16b, v16.16b, v18.16b, #8
-        ext             v19.16b, v16.16b, v18.16b, #9
-        ext             v21.16b, v20.16b, v22.16b, #8
-        ext             v23.16b, v20.16b, v22.16b, #9
-        umull           v16.8h,  v17.8b,  v0.8b
-        umull2          v17.8h,  v17.16b, v0.16b
-        umull           v20.8h,  v21.8b,  v0.8b
-        umull2          v21.8h,  v21.16b, v0.16b
-        umlal           v16.8h,  v19.8b,  v1.8b
-        umlal2          v17.8h,  v19.16b, v1.16b
-        umlal           v20.8h,  v23.8b,  v1.8b
-        umlal2          v21.8h,  v23.16b, v1.16b
-        subs            \mx, \mx, #16
-.ifc \type, put
-        uqrshrn         v16.8b,  v16.8h, #4
-        uqrshrn2        v16.16b, v17.8h, #4
-        uqrshrn         v20.8b,  v20.8h, #4
-        uqrshrn2        v20.16b, v21.8h, #4
-        st1             {v16.16b}, [\dst], #16
-        st1             {v20.16b}, [\ds2], #16
-.else
-        st1             {v16.8h, v17.8h}, [\dst], #32
-        st1             {v20.8h, v21.8h}, [\ds2], #32
-.endif
-        b.le            9f
-
-        mov             v16.16b, v18.16b
-        mov             v20.16b, v22.16b
-        b               16b
-
-9:
-        add             \dst,  \dst,  \d_strd
-        add             \ds2,  \ds2,  \d_strd
-        add             \src,  \src,  \s_strd
-        add             \sr2,  \sr2,  \s_strd
-
-        subs            \h,  \h,  #2
-        b.gt            161b
-        ret
-
-L(\type\()_bilin_h_tbl):
-        .hword L(\type\()_bilin_h_tbl) - 1280b
-        .hword L(\type\()_bilin_h_tbl) -  640b
-        .hword L(\type\()_bilin_h_tbl) -  320b
-        .hword L(\type\()_bilin_h_tbl) -  160b
-        .hword L(\type\()_bilin_h_tbl) -   80b
-        .hword L(\type\()_bilin_h_tbl) -   40b
-        .hword L(\type\()_bilin_h_tbl) -   20b
-        .hword 0
-
-
-L(\type\()_bilin_v):
-        cmp             \h,  #4
-        adr             x9,  L(\type\()_bilin_v_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-20:     // 2xN v
-.ifc \type, put
-        cmp             \h,  #2
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \s_strd,  \s_strd,  #1
-        lsl             \d_strd,  \d_strd,  #1
-
-        // 2x2 v
-        ld1             {v16.h}[0], [\src], \s_strd
-        b.gt            24f
-        ld1             {v17.h}[0], [\sr2], \s_strd
-        ld1             {v18.h}[0], [\src], \s_strd
-        trn1            v16.4h, v16.4h, v17.4h
-        trn1            v17.4h, v17.4h, v18.4h
-        umull           v4.8h,  v16.8b,  v2.8b
-        umlal           v4.8h,  v17.8b,  v3.8b
-        uqrshrn         v4.8b,  v4.8h,  #4
-        st1             {v4.h}[0], [\dst]
-        st1             {v4.h}[1], [\ds2]
-        ret
-24:     // 2x4, 2x8, ... v
-        ld1             {v17.h}[0], [\sr2], \s_strd
-        ld1             {v18.h}[0], [\src], \s_strd
-        ld1             {v19.h}[0], [\sr2], \s_strd
-        ld1             {v20.h}[0], [\src], \s_strd
-        trn1            v16.4h, v16.4h, v17.4h
-        trn1            v17.4h, v17.4h, v18.4h
-        trn1            v18.4h, v18.4h, v19.4h
-        trn1            v19.4h, v19.4h, v20.4h
-        trn1            v16.2s, v16.2s, v18.2s
-        trn1            v17.2s, v17.2s, v19.2s
-        umull           v4.8h,  v16.8b,  v2.8b
-        umlal           v4.8h,  v17.8b,  v3.8b
-        subs            \h,  \h,  #4
-        uqrshrn         v4.8b,  v4.8h,  #4
-        st1             {v4.h}[0], [\dst], \d_strd
-        st1             {v4.h}[1], [\ds2], \d_strd
-        st1             {v4.h}[2], [\dst], \d_strd
-        st1             {v4.h}[3], [\ds2], \d_strd
-        b.le            0f
-        mov             v16.8b, v20.8b
-        b               24b
-0:
-        ret
-.endif
-
-40:     // 4xN v
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \s_strd,  \s_strd,  #1
-        lsl             \d_strd,  \d_strd,  #1
-        ld1             {v16.s}[0], [\src], \s_strd
-4:
-        ld1             {v17.s}[0], [\sr2], \s_strd
-        ld1             {v18.s}[0], [\src], \s_strd
-        trn1            v16.2s, v16.2s, v17.2s
-        trn1            v17.2s, v17.2s, v18.2s
-        umull           v4.8h,  v16.8b,  v2.8b
-        umlal           v4.8h,  v17.8b,  v3.8b
-        subs            \h,  \h,  #2
-.ifc \type, put
-        uqrshrn         v4.8b,  v4.8h,  #4
-        st1             {v4.s}[0], [\dst], \d_strd
-        st1             {v4.s}[1], [\ds2], \d_strd
-.else
-        st1             {v4.d}[0], [\dst], \d_strd
-        st1             {v4.d}[1], [\ds2], \d_strd
-.endif
-        b.le            0f
-        mov             v16.8b, v18.8b
-        b               4b
-0:
-        ret
-
-80:     // 8xN v
-        add             \ds2,  \dst,  \d_strd
-        add             \sr2,  \src,  \s_strd
-        lsl             \s_strd,  \s_strd,  #1
-        lsl             \d_strd,  \d_strd,  #1
-        ld1             {v16.8b}, [\src], \s_strd
-8:
-        ld1             {v17.8b}, [\sr2], \s_strd
-        ld1             {v18.8b}, [\src], \s_strd
-        umull           v4.8h,  v16.8b,  v2.8b
-        umull           v5.8h,  v17.8b,  v2.8b
-        umlal           v4.8h,  v17.8b,  v3.8b
-        umlal           v5.8h,  v18.8b,  v3.8b
-        subs            \h,  \h,  #2
-.ifc \type, put
-        uqrshrn         v4.8b,  v4.8h,  #4
-        uqrshrn         v5.8b,  v5.8h,  #4
-        st1             {v4.8b}, [\dst], \d_strd
-        st1             {v5.8b}, [\ds2], \d_strd
-.else
-        st1             {v4.8h}, [\dst], \d_strd
-        st1             {v5.8h}, [\ds2], \d_strd
-.endif
-        b.le            0f
-        mov             v16.8b, v18.8b
-        b               8b
-0:
-        ret
-
-160:    // 16xN, 32xN, ...
-320:
-640:
-1280:
-        mov             \my,  \h
-1:
-        add             \ds2, \dst, \d_strd
-        add             \sr2, \src, \s_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-
-        ld1             {v16.16b}, [\src], \s_strd
-2:
-        ld1             {v17.16b}, [\sr2], \s_strd
-        ld1             {v18.16b}, [\src], \s_strd
-        umull           v4.8h,  v16.8b,  v2.8b
-        umull2          v5.8h,  v16.16b, v2.16b
-        umull           v6.8h,  v17.8b,  v2.8b
-        umull2          v7.8h,  v17.16b, v2.16b
-        umlal           v4.8h,  v17.8b,  v3.8b
-        umlal2          v5.8h,  v17.16b, v3.16b
-        umlal           v6.8h,  v18.8b,  v3.8b
-        umlal2          v7.8h,  v18.16b, v3.16b
-        subs            \h,  \h,  #2
-.ifc \type, put
-        uqrshrn         v4.8b,  v4.8h,  #4
-        uqrshrn2        v4.16b, v5.8h,  #4
-        uqrshrn         v6.8b,  v6.8h,  #4
-        uqrshrn2        v6.16b, v7.8h,  #4
-        st1             {v4.16b}, [\dst], \d_strd
-        st1             {v6.16b}, [\ds2], \d_strd
-.else
-        st1             {v4.8h, v5.8h}, [\dst], \d_strd
-        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
-.endif
-        b.le            9f
-        mov             v16.16b, v18.16b
-        b               2b
-9:
-        subs            \w,  \w,  #16
-        b.le            0f
-        asr             \s_strd, \s_strd, #1
-        asr             \d_strd, \d_strd, #1
-        msub            \src, \s_strd, \xmy, \src
-        msub            \dst, \d_strd, \xmy, \dst
-        sub             \src, \src, \s_strd, lsl #1
-        mov             \h,  \my
-        add             \src, \src, #16
-.ifc \type, put
-        add             \dst, \dst, #16
-.else
-        add             \dst, \dst, #32
-.endif
-        b               1b
-0:
-        ret
-
-L(\type\()_bilin_v_tbl):
-        .hword L(\type\()_bilin_v_tbl) - 1280b
-        .hword L(\type\()_bilin_v_tbl) -  640b
-        .hword L(\type\()_bilin_v_tbl) -  320b
-        .hword L(\type\()_bilin_v_tbl) -  160b
-        .hword L(\type\()_bilin_v_tbl) -   80b
-        .hword L(\type\()_bilin_v_tbl) -   40b
-        .hword L(\type\()_bilin_v_tbl) -   20b
-        .hword 0
-
-L(\type\()_bilin_hv):
-        uxtl            v2.8h, v2.8b
-        uxtl            v3.8h, v3.8b
-        adr             x9,  L(\type\()_bilin_hv_tbl)
-        ldrh            w8,  [x9, x8, lsl #1]
-        sub             x9,  x9,  w8, uxtw
-        br              x9
-
-20:     // 2xN hv
-.ifc \type, put
-        add             \sr2, \src, \s_strd
-        add             \ds2, \dst, \d_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-
-        ld1             {v28.s}[0],  [\src], \s_strd
-        ext             v29.8b, v28.8b, v28.8b, #1
-        umull           v16.8h, v28.8b, v0.8b
-        umlal           v16.8h, v29.8b, v1.8b
-
-2:
-        ld1             {v28.s}[0],  [\sr2], \s_strd
-        ld1             {v30.s}[0],  [\src], \s_strd
-        ext             v29.8b, v28.8b, v28.8b, #1
-        ext             v31.8b, v30.8b, v30.8b, #1
-        trn1            v28.4h, v28.4h, v30.4h
-        trn1            v29.4h, v29.4h, v31.4h
-        umull           v17.8h, v28.8b, v0.8b
-        umlal           v17.8h, v29.8b, v1.8b
-
-        trn1            v16.2s, v16.2s, v17.2s
-
-        mul             v4.4h,  v16.4h, v2.4h
-        mla             v4.4h,  v17.4h, v3.4h
-        uqrshrn         v4.8b,  v4.8h,  #8
-        subs            \h,  \h,  #2
-        st1             {v4.h}[0], [\dst], \d_strd
-        st1             {v4.h}[1], [\ds2], \d_strd
-        b.le            0f
-        trn2            v16.2s, v17.2s, v17.2s
-        b               2b
-0:
-        ret
-.endif
-
-40:     // 4xN hv
-        add             \sr2, \src, \s_strd
-        add             \ds2, \dst, \d_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-
-        ld1             {v28.8b},  [\src], \s_strd
-        ext             v29.8b, v28.8b, v28.8b, #1
-        umull           v16.8h, v28.8b, v0.8b
-        umlal           v16.8h, v29.8b, v1.8b
-
-4:
-        ld1             {v28.8b},  [\sr2], \s_strd
-        ld1             {v30.8b},  [\src], \s_strd
-        ext             v29.8b, v28.8b, v28.8b, #1
-        ext             v31.8b, v30.8b, v30.8b, #1
-        trn1            v28.2s, v28.2s, v30.2s
-        trn1            v29.2s, v29.2s, v31.2s
-        umull           v17.8h, v28.8b, v0.8b
-        umlal           v17.8h, v29.8b, v1.8b
-
-        trn1            v16.2d, v16.2d, v17.2d
-
-        mul             v4.8h,  v16.8h, v2.8h
-        mla             v4.8h,  v17.8h, v3.8h
-        subs            \h,  \h,  #2
-.ifc \type, put
-        uqrshrn         v4.8b,  v4.8h,  #8
-        st1             {v4.s}[0], [\dst], \d_strd
-        st1             {v4.s}[1], [\ds2], \d_strd
-.else
-        urshr           v4.8h,  v4.8h,  #4
-        st1             {v4.d}[0], [\dst], \d_strd
-        st1             {v4.d}[1], [\ds2], \d_strd
-.endif
-        b.le            0f
-        trn2            v16.2d, v17.2d, v17.2d
-        b               4b
-0:
-        ret
-
-80:     // 8xN, 16xN, ... hv
-160:
-320:
-640:
-1280:
-        mov             \my,  \h
-
-1:
-        add             \sr2, \src, \s_strd
-        add             \ds2, \dst, \d_strd
-        lsl             \s_strd, \s_strd, #1
-        lsl             \d_strd, \d_strd, #1
-
-        ld1             {v28.16b},  [\src], \s_strd
-        ext             v29.16b, v28.16b, v28.16b, #1
-        umull           v16.8h, v28.8b, v0.8b
-        umlal           v16.8h, v29.8b, v1.8b
-
-2:
-        ld1             {v28.16b},  [\sr2], \s_strd
-        ld1             {v30.16b},  [\src], \s_strd
-        ext             v29.16b, v28.16b, v28.16b, #1
-        ext             v31.16b, v30.16b, v30.16b, #1
-        umull           v17.8h, v28.8b, v0.8b
-        umlal           v17.8h, v29.8b, v1.8b
-        umull           v18.8h, v30.8b, v0.8b
-        umlal           v18.8h, v31.8b, v1.8b
-
-        mul             v4.8h,  v16.8h, v2.8h
-        mla             v4.8h,  v17.8h, v3.8h
-        mul             v5.8h,  v17.8h, v2.8h
-        mla             v5.8h,  v18.8h, v3.8h
-        subs            \h,  \h,  #2
-.ifc \type, put
-        uqrshrn         v4.8b,  v4.8h,  #8
-        uqrshrn         v5.8b,  v5.8h,  #8
-        st1             {v4.8b}, [\dst], \d_strd
-        st1             {v5.8b}, [\ds2], \d_strd
-.else
-        urshr           v4.8h,  v4.8h,  #4
-        urshr           v5.8h,  v5.8h,  #4
-        st1             {v4.8h}, [\dst], \d_strd
-        st1             {v5.8h}, [\ds2], \d_strd
-.endif
-        b.le            9f
-        mov             v16.16b, v18.16b
-        b               2b
-9:
-        subs            \w,  \w,  #8
-        b.le            0f
-        asr             \s_strd,  \s_strd,  #1
-        asr             \d_strd,  \d_strd,  #1
-        msub            \src,  \s_strd,  \xmy,  \src
-        msub            \dst,  \d_strd,  \xmy,  \dst
-        sub             \src,  \src,  \s_strd,  lsl #1
-        mov             \h,  \my
-        add             \src,  \src,  #8
-.ifc \type, put
-        add             \dst,  \dst,  #8
-.else
-        add             \dst,  \dst,  #16
-.endif
-        b               1b
-0:
-        ret
-
-L(\type\()_bilin_hv_tbl):
-        .hword L(\type\()_bilin_hv_tbl) - 1280b
-        .hword L(\type\()_bilin_hv_tbl) -  640b
-        .hword L(\type\()_bilin_hv_tbl) -  320b
-        .hword L(\type\()_bilin_hv_tbl) -  160b
-        .hword L(\type\()_bilin_hv_tbl) -   80b
-        .hword L(\type\()_bilin_hv_tbl) -   40b
-        .hword L(\type\()_bilin_hv_tbl) -   20b
-        .hword 0
-endfunc
-.endm
-
-filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
-filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
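
As a rough scalar sketch of what the "put" bilinear paths removed above compute per output pixel (derived from reading the NEON instructions: the weights are 16-mx/mx and 16-my/my, with rounding shifts of 4 and 8; function names here are illustrative only, not dav1d API):

    #include <stdint.h>

    /* Horizontal-only put bilinear, as in the 2xN/4xN/8xN h paths:
     * umull/umlal with (16-mx, mx), then uqrshrn #4. */
    static uint8_t bilin_h_put(const uint8_t *src, int x, int mx) {
        int v = src[x] * (16 - mx) + src[x + 1] * mx;
        v = (v + 8) >> 4;                   /* rounding shift, as uqrshrn #4 */
        return v > 255 ? 255 : (uint8_t)v;  /* unsigned saturation */
    }

    /* Combined hv put path: the horizontal stage keeps the unshifted,
     * 16-scaled intermediate; the vertical stage blends two such rows with
     * (16-my, my) before a rounding shift by 8 (uqrshrn #8). */
    static uint8_t bilin_hv_put(const uint8_t *row0, const uint8_t *row1,
                                int x, int mx, int my) {
        int h0 = row0[x] * (16 - mx) + row0[x + 1] * mx;
        int h1 = row1[x] * (16 - mx) + row1[x + 1] * mx;
        int v  = h0 * (16 - my) + h1 * my;
        v = (v + 128) >> 8;
        return v > 255 ? 255 : (uint8_t)v;
    }

The prep variants store the unnarrowed intermediates instead (no #4 shift in the h pass, urshr #4 in the hv pass), which is why their store widths differ in the .ifc \type branches above.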
--- a/third_party/dav1d/src/arm/asm.S
+++ b/third_party/dav1d/src/arm/asm.S
@@ -49,24 +49,16 @@
 #define A @
 #define T
 #else
 #define A
 #define T @
 #endif
 #endif
 
-#if !defined(PIC)
-#if defined(__PIC__)
-#define PIC __PIC__
-#elif defined(__pic__)
-#define PIC __pic__
-#endif
-#endif
-
 #ifndef PRIVATE_PREFIX
 #define PRIVATE_PREFIX dav1d_
 #endif
 
 #define PASTE(a,b) a ## b
 #define CONCAT(a,b) PASTE(a,b)
 
 #ifdef PREFIX
@@ -124,11 +116,9 @@ EXTERN\name:
 .endm
 
 #ifdef __APPLE__
 #define L(x) L ## x
 #else
 #define L(x) .L ## x
 #endif
 
-#define X(x) CONCAT(EXTERN, x)
-
 #endif /* __DAV1D_SRC_ARM_ASM_S__ */
deleted file mode 100644
--- a/third_party/dav1d/src/arm/looprestoration_init_tmpl.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "src/cpu.h"
-#include "src/looprestoration.h"
-
-#include "common/attributes.h"
-#include "common/intops.h"
-#include "src/tables.h"
-
-#if BITDEPTH == 8 && ARCH_AARCH64
-// This calculates things slightly differently than the reference C version.
-// This version calculates roughly this:
-// int16_t sum = 0;
-// for (int i = 0; i < 7; i++)
-//     sum += src[idx] * fh[i];
-// int16_t sum2 = (src[x] << 7) - (1 << (BITDEPTH + 6)) + rounding_off_h;
-// sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
-// sum += 2048;
-void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
-                                const pixel *src, ptrdiff_t stride,
-                                const int16_t fh[7], const intptr_t w,
-                                int h, enum LrEdgeFlags edges);
-// This calculates things slightly differently than the reference C version.
-// This version calculates roughly this:
-// fv[3] += 128;
-// int32_t sum = 0;
-// for (int i = 0; i < 7; i++)
-//     sum += mid[idx] * fv[i];
-// sum = (sum + rounding_off_v) >> round_bits_v;
-// This function assumes that the width is a multiple of 8.
-void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
-                                const int16_t *mid, int w, int h,
-                                const int16_t fv[7], enum LrEdgeFlags edges,
-                                ptrdiff_t mid_stride);
-void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
-                            const pixel *src, int w, int h);
-
-static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
-                               const pixel (*const left)[4],
-                               const pixel *lpf, const ptrdiff_t lpf_stride,
-                               const int w, const int h, const int16_t fh[7],
-                               const int16_t fv[7], const enum LrEdgeFlags edges)
-{
-    ALIGN_STK_32(int16_t, mid, 68 * 384,);
-    int mid_stride = (w + 7) & ~7;
-
-    // Horizontal filter
-    dav1d_wiener_filter_h_neon(&mid[2 * mid_stride], left, dst, dst_stride,
-                               fh, w, h, edges);
-    if (edges & LR_HAVE_TOP)
-        dav1d_wiener_filter_h_neon(mid, NULL, lpf, lpf_stride,
-                                   fh, w, 2, edges);
-    if (edges & LR_HAVE_BOTTOM)
-        dav1d_wiener_filter_h_neon(&mid[(2 + h) * mid_stride], NULL,
-                                   lpf + 6 * PXSTRIDE(lpf_stride), lpf_stride,
-                                   fh, w, 2, edges);
-
-    // Vertical filter
-    if (w >= 8)
-        dav1d_wiener_filter_v_neon(dst, dst_stride, &mid[2*mid_stride],
-                                   w & ~7, h, fv, edges, mid_stride * sizeof(*mid));
-    if (w & 7) {
-        // For uneven widths, do a full 8 pixel wide filtering into a temp
-        // buffer and copy out the narrow slice of pixels separately into dest.
-        ALIGN_STK_16(pixel, tmp, 64 * 8,);
-        dav1d_wiener_filter_v_neon(tmp, w & 7, &mid[2*mid_stride + (w & ~7)],
-                                   w & 7, h, fv, edges, mid_stride * sizeof(*mid));
-        dav1d_copy_narrow_neon(dst + (w & ~7), dst_stride, tmp, w & 7, h);
-    }
-}
-#endif
-
-void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
-    const unsigned flags = dav1d_get_cpu_flags();
-
-    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-
-#if BITDEPTH == 8 && ARCH_AARCH64
-    c->wiener = wiener_filter_neon;
-#endif
-}
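
The comments in the removed looprestoration_init_tmpl.c above give pseudo-code for what dav1d_wiener_filter_h_neon and dav1d_wiener_filter_v_neon compute per sample. A scalar transcription of that pseudo-code, as a sketch only (edge padding is omitted, the 7-tap indexing is a plausible centring, and the bitdepth-dependent rounding constants are taken as parameters rather than reproduced from the C reference):

    #include <stdint.h>
    #include <stddef.h>
    #include <limits.h>

    static int iclip(int v, int lo, int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Horizontal pass, per the removed comment. */
    static int16_t wiener_h_sample(const uint8_t *src, int x, const int16_t fh[7],
                                   int bitdepth, int round_bits_h,
                                   int rounding_off_h) {
        int sum = 0;
        for (int i = 0; i < 7; i++)
            sum += src[x + i - 3] * fh[i];   /* 7-tap window around x */
        int sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
        sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
        return (int16_t)(sum + 2048);
    }

    /* Vertical pass, per the removed comment (fv[3] has 128 added up front). */
    static int wiener_v_sample(const int16_t *mid, int x, ptrdiff_t mid_stride,
                               const int16_t fv[7], int round_bits_v,
                               int rounding_off_v) {
        int32_t sum = 0;
        for (int i = 0; i < 7; i++)
            sum += mid[x + (i - 3) * mid_stride] * fv[i];
        return (sum + rounding_off_v) >> round_bits_v;
    }

The removed wiener_filter_neon wrapper then runs the h pass over the body plus two rows of top/bottom context into the mid buffer, runs the v pass over multiples of 8 columns, and uses dav1d_copy_narrow_neon for the leftover columns, as described in its inline comment.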
--- a/third_party/dav1d/src/arm/mc_init_tmpl.c
+++ b/third_party/dav1d/src/arm/mc_init_tmpl.c
@@ -25,73 +25,23 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
 #include "src/mc.h"
 #include "src/cpu.h"
 
-decl_mc_fn(dav1d_put_8tap_regular_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_regular_smooth_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_regular_sharp_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_smooth_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_smooth_regular_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_smooth_sharp_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_sharp_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_sharp_regular_8bpc_neon);
-decl_mc_fn(dav1d_put_8tap_sharp_smooth_8bpc_neon);
-decl_mc_fn(dav1d_put_bilin_8bpc_neon);
-
-decl_mct_fn(dav1d_prep_8tap_regular_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_regular_smooth_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_regular_sharp_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_smooth_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_smooth_regular_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_smooth_sharp_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_sharp_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_sharp_regular_8bpc_neon);
-decl_mct_fn(dav1d_prep_8tap_sharp_smooth_8bpc_neon);
-decl_mct_fn(dav1d_prep_bilin_8bpc_neon);
-
 decl_avg_fn(dav1d_avg_8bpc_neon);
 decl_w_avg_fn(dav1d_w_avg_8bpc_neon);
 decl_mask_fn(dav1d_mask_8bpc_neon);
 
 void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
-#define init_mc_fn(type, name, suffix) \
-    c->mc[type] = dav1d_put_##name##_8bpc_##suffix
-#define init_mct_fn(type, name, suffix) \
-    c->mct[type] = dav1d_prep_##name##_8bpc_##suffix
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
 #if BITDEPTH == 8
-#if ARCH_AARCH64
-    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
-    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
-    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               neon);
-
-    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
-    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
-    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
-    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
-    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
-    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
-    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
-    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
-    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
-    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
-#endif
-
     c->avg = dav1d_avg_8bpc_neon;
     c->w_avg = dav1d_w_avg_8bpc_neon;
     c->mask = dav1d_mask_8bpc_neon;
 #endif
 }
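
The init_mc_fn/init_mct_fn macros removed in this hunk wire the mc/mct function-pointer tables to the NEON symbols declared above; an illustrative expansion of two of them:

    /* init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon)
     *     => c->mc[FILTER_2D_8TAP_REGULAR] = dav1d_put_8tap_regular_8bpc_neon;
     * init_mct_fn(FILTER_2D_BILINEAR, bilin, neon)
     *     => c->mct[FILTER_2D_BILINEAR] = dav1d_prep_bilin_8bpc_neon;
     */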
--- a/third_party/dav1d/src/cdef_apply_tmpl.c
+++ b/third_party/dav1d/src/cdef_apply_tmpl.c
@@ -83,56 +83,56 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameC
                              const Av1Filter *const lflvl,
                              const int by_start, const int by_end)
 {
     const Dav1dDSPContext *const dsp = f->dsp;
     enum CdefEdgeFlags edges = HAVE_BOTTOM | (by_start > 0 ? HAVE_TOP : 0);
     pixel *ptrs[3] = { p[0], p[1], p[2] };
     const int sbsz = 16;
     const int sb64w = f->sb128w << 1;
-    const int damping = f->frame_hdr->cdef.damping + BITDEPTH - 8;
-    const enum Dav1dPixelLayout layout = f->cur.p.layout;
+    const int damping = f->frame_hdr.cdef.damping + BITDEPTH - 8;
+    const enum Dav1dPixelLayout layout = f->cur.p.p.layout;
     const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
     const int has_chroma = layout != DAV1D_PIXEL_LAYOUT_I400;
     const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
 
     // FIXME a design improvement that could be made here is to keep a set of
     // flags for each block position on whether the block was filtered; if not,
     // the backup of pre-filter data is empty, and the restore is therefore
     // unnecessary as well.
 
     for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= HAVE_TOP) {
         const int tf = f->lf.top_pre_cdef_toggle;
         if (by + 2 >= f->bh) edges &= ~HAVE_BOTTOM;
 
         if (edges & HAVE_BOTTOM) {
             // backup pre-filter data for next iteration
-            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.stride,
+            backup2lines(f->lf.cdef_line_ptr[!tf], ptrs, f->cur.p.stride,
                          8, f->bw * 4, layout);
         }
 
         pixel lr_bak[2 /* idx */][3 /* plane */][8 /* y */][2 /* x */];
         pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
         edges &= ~HAVE_LEFT;
         edges |= HAVE_RIGHT;
         for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= HAVE_LEFT) {
             const int sb128x = sbx >>1;
             const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
             const int cdef_idx = lflvl[sb128x].cdef_idx[sb64_idx];
             if (cdef_idx == -1 ||
-                (!f->frame_hdr->cdef.y_strength[cdef_idx] &&
-                 !f->frame_hdr->cdef.uv_strength[cdef_idx]))
+                (!f->frame_hdr.cdef.y_strength[cdef_idx] &&
+                 !f->frame_hdr.cdef.uv_strength[cdef_idx]))
             {
                 last_skip = 1;
                 goto next_sb;
             }
 
-            const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
-            const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
+            const int y_lvl = f->frame_hdr.cdef.y_strength[cdef_idx];
+            const int uv_lvl = f->frame_hdr.cdef.uv_strength[cdef_idx];
             pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
             for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
                  bx += 2, edges |= HAVE_LEFT)
             {
                 if (bx + 2 >= f->bw) edges &= ~HAVE_RIGHT;
 
                 // check if this 8x8 block had any coded coefficients; if not,
                 // go to the next block
@@ -143,51 +143,51 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameC
                 {
                     last_skip = 1;
                     goto next_b;
                 }
 
                 if (last_skip && edges & HAVE_LEFT) {
                     // we didn't backup the prefilter data because it wasn't
                     // there, so do it here instead
-                    backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout);
+                    backup2x8(lr_bak[bit], bptrs, f->cur.p.stride, 0, layout);
                 }
                 if (edges & HAVE_RIGHT) {
                     // backup pre-filter data for next iteration
-                    backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout);
+                    backup2x8(lr_bak[!bit], bptrs, f->cur.p.stride, 8, layout);
                 }
 
                 // the actual filter
                 const int y_pri_lvl = (y_lvl >> 2) << (BITDEPTH - 8);
                 int y_sec_lvl = y_lvl & 3;
                 y_sec_lvl += y_sec_lvl == 3;
                 y_sec_lvl <<= BITDEPTH - 8;
                 const int uv_pri_lvl = (uv_lvl >> 2) << (BITDEPTH - 8);
                 int uv_sec_lvl = uv_lvl & 3;
                 uv_sec_lvl += uv_sec_lvl == 3;
                 uv_sec_lvl <<= BITDEPTH - 8;
                 unsigned variance;
-                const int dir = dsp->cdef.dir(bptrs[0], f->cur.stride[0],
+                const int dir = dsp->cdef.dir(bptrs[0], f->cur.p.stride[0],
                                               &variance);
                 if (y_lvl) {
-                    dsp->cdef.fb[0](bptrs[0], f->cur.stride[0], lr_bak[bit][0],
+                    dsp->cdef.fb[0](bptrs[0], f->cur.p.stride[0], lr_bak[bit][0],
                                     (pixel *const [2]) {
                                         &f->lf.cdef_line_ptr[tf][0][0][bx * 4],
                                         &f->lf.cdef_line_ptr[tf][0][1][bx * 4],
                                     },
                                     adjust_strength(y_pri_lvl, variance),
                                     y_sec_lvl, y_pri_lvl ? dir : 0,
                                     damping, edges);
                 }
                 if (uv_lvl && has_chroma) {
                     const int uvdir =
-                        f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
+                        f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I422 ? dir :
                         ((uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir];
                     for (int pl = 1; pl <= 2; pl++) {
-                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1],
+                        dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.p.stride[1],
                                              lr_bak[bit][pl],
                                              (pixel *const [2]) {
                                                  &f->lf.cdef_line_ptr[tf][pl][0][bx * 4 >> ss_hor],
                                                  &f->lf.cdef_line_ptr[tf][pl][1][bx * 4 >> ss_hor],
                                              },
                                              uv_pri_lvl, uv_sec_lvl,
                                              uv_pri_lvl ? uvdir : 0,
                                              damping - 1, edges);
@@ -204,14 +204,14 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameC
             }
 
         next_sb:
             iptrs[0] += sbsz * 4;
             iptrs[1] += sbsz * 4 >> ss_hor;
             iptrs[2] += sbsz * 4 >> ss_hor;
         }
 
-        ptrs[0] += 8 * PXSTRIDE(f->cur.stride[0]);
-        ptrs[1] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
-        ptrs[2] += 8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
+        ptrs[0] += 8 * PXSTRIDE(f->cur.p.stride[0]);
+        ptrs[1] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
+        ptrs[2] += 8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
         f->lf.top_pre_cdef_toggle ^= 1;
     }
 }
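
The cdef_apply hunk above derives the per-block filter strengths identically on both sides of this backout: the coded y_lvl/uv_lvl values split into a primary strength (upper bits) and a secondary strength (low 2 bits, where the coded value 3 stands for 4), and both are scaled up for bit depths above 8. A standalone sketch of that arithmetic (not part of the patch; split_cdef_strength is a hypothetical helper name, and bitdepth is assumed to be 8 or 10 as in the templated dav1d sources):

static void split_cdef_strength(const int lvl, const int bitdepth,
                                int *const pri, int *const sec)
{
    *pri = (lvl >> 2) << (bitdepth - 8);  /* upper bits: primary strength */
    int s = lvl & 3;                      /* low 2 bits: secondary strength */
    s += s == 3;                          /* coded value 3 stands for 4 */
    *sec = s << (bitdepth - 8);
}
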
--- a/third_party/dav1d/src/cdf.c
+++ b/third_party/dav1d/src/cdf.c
@@ -4067,17 +4067,17 @@ void dav1d_init_states(CdfThreadContext 
     memcpy(cdf_init[qcat].cdf->kfym, default_kf_y_mode_cdf,
            sizeof(default_kf_y_mode_cdf));
     cdf_init[qcat].cdf->coef = av1_default_coef_cdf[qcat];
     cdf_init[qcat].cdf->mv = default_mv_cdf;
     cdf_init[qcat].cdf->dmv = default_mv_cdf;
     dav1d_cdf_thread_ref(cdf, &cdf_init[qcat]);
 }
 
-void dav1d_update_tile_cdf(const Dav1dFrameHeader *const hdr,
+void dav1d_update_tile_cdf(const Av1FrameHeader *const hdr,
                            CdfContext *const dst,
                            const CdfContext *const src)
 {
     int i, j, k, l;
 
 #define update_cdf_1d(n1d, name) \
     do { \
         memcpy(dst->name, src->name, sizeof(*dst->name) * n1d); \
@@ -4133,17 +4133,17 @@ void dav1d_update_tile_cdf(const Dav1dFr
     update_cdf_3d(2, 2, 9, coef.eob_bin_256);
     update_cdf_3d(2, 2, 10, coef.eob_bin_512);
     update_cdf_3d(2, 2, 11, coef.eob_bin_1024);
     update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit);
     update_cdf_4d(N_TX_SIZES, 2, 4, 3, coef.eob_base_tok);
     update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 4, coef.base_tok);
     update_bit_2d(2, 3, coef.dc_sign);
     update_cdf_4d(4, 2, 21, 4, coef.br_tok);
-    update_cdf_2d(3, DAV1D_MAX_SEGMENTS, m.seg_id);
+    update_cdf_2d(3, NUM_SEGMENTS, m.seg_id);
     update_cdf_1d(8, m.cfl_sign);
     update_cdf_2d(6, 16, m.cfl_alpha);
     update_bit_0d(m.restore_wiener);
     update_bit_0d(m.restore_sgrproj);
     update_cdf_1d(3, m.restore_switchable);
     update_cdf_1d(4, m.delta_q);
     update_cdf_2d(5, 4, m.delta_lf);
     update_bit_2d(7, 3, m.pal_y);
@@ -4166,17 +4166,17 @@ void dav1d_update_tile_cdf(const Dav1dFr
             update_bit_1d(10, dmv.comp[k].classN);
             update_bit_0d(dmv.comp[k].sign);
         }
         return;
     }
 
     update_bit_1d(3, m.skip_mode);
     update_cdf_2d(4, N_INTRA_PRED_MODES, m.y_mode);
-    update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS, m.filter);
+    update_cdf_3d(2, 8, N_SWITCHABLE_FILTERS, m.filter);
     update_bit_1d(6, m.newmv_mode);
     update_bit_1d(2, m.globalmv_mode);
     update_bit_1d(6, m.refmv_mode);
     update_bit_1d(3, m.drl_bit);
     update_cdf_2d(8, N_COMP_INTER_PRED_MODES, m.comp_inter_mode);
     update_bit_1d(4, m.intra);
     update_bit_1d(5, m.comp);
     update_bit_1d(5, m.comp_dir);
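
The update_* macros in this file copy the adapted per-tile CDFs back into the frame-level CdfContext; only update_cdf_1d is fully visible above, and it is a plain memcpy of the first n1d entries of the named array. As an illustration (not part of the patch), update_cdf_1d(4, m.delta_q) expands to roughly:

    memcpy(dst->m.delta_q, src->m.delta_q, sizeof(*dst->m.delta_q) * 4);

where dst and src are the CdfContext arguments of dav1d_update_tile_cdf and m.delta_q is the uint16_t delta_q[4 + 1] array declared in cdf.h below.
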
--- a/third_party/dav1d/src/cdf.h
+++ b/third_party/dav1d/src/cdf.h
@@ -35,17 +35,17 @@
 #include "src/thread_data.h"
 
 typedef struct CdfModeContext {
     uint16_t y_mode[4][N_INTRA_PRED_MODES + 1];
     uint16_t use_filter_intra[N_BS_SIZES][2];
     uint16_t filter_intra[5 + 1];
     uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
     uint16_t angle_delta[8][8];
-    uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
+    uint16_t filter[2][8][N_SWITCHABLE_FILTERS + 1];
     uint16_t newmv_mode[6][2];
     uint16_t globalmv_mode[2][2];
     uint16_t refmv_mode[6][2];
     uint16_t drl_bit[3][2];
     uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES + 1];
     uint16_t intra[4][2];
     uint16_t comp[5][2];
     uint16_t comp_dir[5][2];
@@ -63,17 +63,17 @@ typedef struct CdfModeContext {
     uint16_t txsz[N_TX_SIZES - 1][3][4];
     uint16_t txpart[7][3][2];
     uint16_t txtp_inter[4][N_TX_SIZES][N_TX_TYPES + 1];
     uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
     uint16_t skip[3][2];
     uint16_t skip_mode[3][2];
     uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
     uint16_t seg_pred[3][2];
-    uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
+    uint16_t seg_id[3][NUM_SEGMENTS + 1];
     uint16_t cfl_sign[8 + 1];
     uint16_t cfl_alpha[6][16 + 1];
     uint16_t restore_wiener[2];
     uint16_t restore_sgrproj[2];
     uint16_t restore_switchable[3 + 1];
     uint16_t delta_q[4 + 1];
     uint16_t delta_lf[5][4 + 1];
     uint16_t obmc[N_BS_SIZES][2];
@@ -127,17 +127,17 @@ typedef struct CdfContext {
 typedef struct CdfThreadContext {
     CdfContext *cdf;
     Dav1dRef *ref; ///< allocation origin
     struct thread_data *t;
     atomic_uint *progress;
 } CdfThreadContext;
 
 void dav1d_init_states(CdfThreadContext *cdf, int qidx);
-void dav1d_update_tile_cdf(const Dav1dFrameHeader *hdr, CdfContext *dst,
+void dav1d_update_tile_cdf(const Av1FrameHeader *hdr, CdfContext *dst,
                          const CdfContext *src);
 
 void dav1d_cdf_thread_alloc(CdfThreadContext *cdf, struct thread_data *t);
 void dav1d_cdf_thread_ref(CdfThreadContext *dst, CdfThreadContext *src);
 void dav1d_cdf_thread_unref(CdfThreadContext *cdf);
 
 /*
  * These are binary signals (so a signal is either "done" or "not done").
--- a/third_party/dav1d/src/cpu.h
+++ b/third_party/dav1d/src/cpu.h
@@ -25,20 +25,18 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_SRC_CPU_H__
 #define __DAV1D_SRC_CPU_H__
 
 #include "config.h"
 
-#include "dav1d/common.h"
-
 #if ARCH_AARCH64 || ARCH_ARM
 #include "src/arm/cpu.h"
 #elif ARCH_X86
 #include "src/x86/cpu.h"
 #endif
 
 unsigned dav1d_get_cpu_flags(void);
-DAV1D_API void dav1d_set_cpu_flags_mask(const unsigned mask);
+void dav1d_set_cpu_flags_mask(const unsigned mask);
 
 #endif /* __DAV1D_SRC_CPU_H__ */
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -23,54 +23,49 @@
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
 #include <errno.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "dav1d/data.h"
 
 #include "common/validate.h"
 
 #include "src/data.h"
 #include "src/ref.h"
 
 uint8_t * dav1d_data_create(Dav1dData *const buf, const size_t sz) {
     validate_input_or_ret(buf != NULL, NULL);
 
     buf->ref = dav1d_ref_create(sz);
     if (!buf->ref) return NULL;
     buf->data = buf->ref->const_data;
-    buf->sz = buf->m.size = sz;
-    buf->m.timestamp = buf->m.offset = INT64_MIN;
-    buf->m.duration = ~0ULL;
+    buf->sz = sz;
 
     return buf->ref->data;
 }
 
 int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr, const size_t sz,
                     void (*free_callback)(const uint8_t *data, void *user_data),
                     void *user_data)
 {
     validate_input_or_ret(buf != NULL, -EINVAL);
     validate_input_or_ret(ptr != NULL, -EINVAL);
     validate_input_or_ret(free_callback != NULL, -EINVAL);
 
     buf->ref = dav1d_ref_wrap(ptr, free_callback, user_data);
     if (!buf->ref) return -ENOMEM;
     buf->data = ptr;
-    buf->sz = buf->m.size = sz;
-    buf->m.timestamp = buf->m.offset = INT64_MIN;
-    buf->m.duration = ~0ULL;
+    buf->sz = sz;
 
     return 0;
 }
 
 void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data == NULL);
     validate_input(src != NULL);
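
After this backout, dav1d_data_create() and dav1d_data_wrap() only record the buffer size again; the timestamp/offset/duration metadata initialization is gone. A minimal usage sketch of dav1d_data_create() as it stands in this hunk (wrap_obu is a hypothetical helper; obu_buf/obu_len are assumed to hold an already-demuxed AV1 OBU stream, and error handling beyond the NULL check is omitted):

#include <stdint.h>
#include <string.h>

#include "dav1d/data.h"

static int wrap_obu(Dav1dData *const data, const uint8_t *const obu_buf,
                    const size_t obu_len)
{
    /* allocates a ref-counted buffer of obu_len bytes and points data at it */
    uint8_t *const dst = dav1d_data_create(data, obu_len);
    if (!dst) return -1;
    memcpy(dst, obu_buf, obu_len); /* copy the compressed OBUs into the ref */
    return 0;
}
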
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -44,45 +44,45 @@
 #include "src/env.h"
 #include "src/qm.h"
 #include "src/recon.h"
 #include "src/ref.h"
 #include "src/tables.h"
 #include "src/thread_task.h"
 #include "src/warpmv.h"
 
-static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
-                              const Dav1dFrameHeader *const frame_hdr,
+static void init_quant_tables(const Av1SequenceHeader *const seq_hdr,
+                              const Av1FrameHeader *const frame_hdr,
                               const int qidx, uint16_t (*dq)[3][2])
 {
     for (int i = 0; i < (frame_hdr->segmentation.enabled ? 8 : 1); i++) {
         const int yac = frame_hdr->segmentation.enabled ?
             iclip_u8(qidx + frame_hdr->segmentation.seg_data.d[i].delta_q) : qidx;
         const int ydc = iclip_u8(yac + frame_hdr->quant.ydc_delta);
         const int uac = iclip_u8(yac + frame_hdr->quant.uac_delta);
         const int udc = iclip_u8(yac + frame_hdr->quant.udc_delta);
         const int vac = iclip_u8(yac + frame_hdr->quant.vac_delta);
         const int vdc = iclip_u8(yac + frame_hdr->quant.vdc_delta);
 
-        dq[i][0][0] = dav1d_dq_tbl[seq_hdr->hbd][ydc][0];
-        dq[i][0][1] = dav1d_dq_tbl[seq_hdr->hbd][yac][1];
-        dq[i][1][0] = dav1d_dq_tbl[seq_hdr->hbd][udc][0];
-        dq[i][1][1] = dav1d_dq_tbl[seq_hdr->hbd][uac][1];
-        dq[i][2][0] = dav1d_dq_tbl[seq_hdr->hbd][vdc][0];
-        dq[i][2][1] = dav1d_dq_tbl[seq_hdr->hbd][vac][1];
+        dq[i][0][0] = dav1d_dq_tbl[seq_hdr->bpc > 8][ydc][0];
+        dq[i][0][1] = dav1d_dq_tbl[seq_hdr->bpc > 8][yac][1];
+        dq[i][1][0] = dav1d_dq_tbl[seq_hdr->bpc > 8][udc][0];
+        dq[i][1][1] = dav1d_dq_tbl[seq_hdr->bpc > 8][uac][1];
+        dq[i][2][0] = dav1d_dq_tbl[seq_hdr->bpc > 8][vdc][0];
+        dq[i][2][1] = dav1d_dq_tbl[seq_hdr->bpc > 8][vac][1];
     }
 }
 
 static int read_mv_component_diff(Dav1dTileContext *const t,
                                   CdfMvComponent *const mv_comp,
                                   const int have_fp)
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
-    const int have_hp = f->frame_hdr->hp;
+    const int have_hp = f->frame_hdr.hp;
     const int sign = msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
     const int cl = msac_decode_symbol_adapt(&ts->msac, mv_comp->classes, 11);
     int up, fp, hp;
 
     if (!cl) {
         up = msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
         if (have_fp) {
             fp = msac_decode_symbol_adapt(&ts->msac, mv_comp->class0_fp[up], 4);
@@ -278,17 +278,17 @@ static void find_matching_ref(const Dav1
         masks[0] |= 1ULL << 32;
     }
 #undef matches
 }
 
 static void derive_warpmv(const Dav1dTileContext *const t,
                           const int bw4, const int bh4,
                           const uint64_t masks[2], const struct mv mv,
-                          Dav1dWarpedMotionParams *const wmp)
+                          WarpedMotionParams *const wmp)
 {
     int pts[8][2 /* in, out */][2 /* x, y */], np = 0;
     const Dav1dFrameContext *const f = t->f;
     const ptrdiff_t b4_stride = f->b4_stride;
     const refmvs *const r = &f->mvs[t->by * b4_stride + t->bx];
 
 #define add_sample(dx, dy, sx, sy, rp) do { \
     pts[np][0][0] = 16 * (2 * dx + sx * bs(rp)[0]) - 8; \
@@ -347,19 +347,19 @@ static void derive_warpmv(const Dav1dTil
         // replace the discarded samples;
         mvd[i] = mvd[j];
         memcpy(pts[i], pts[j], sizeof(*pts));
     }
 
     if (!dav1d_find_affine_int(pts, ret, bw4, bh4, mv, wmp, t->bx, t->by) &&
         !dav1d_get_shear_params(wmp))
     {
-        wmp->type = DAV1D_WM_TYPE_AFFINE;
+        wmp->type = WM_TYPE_AFFINE;
     } else
-        wmp->type = DAV1D_WM_TYPE_IDENTITY;
+        wmp->type = WM_TYPE_IDENTITY;
 }
 
 static inline int findoddzero(const uint8_t *buf, int len) {
     for (int n = 0; n < len; n++)
         if (!buf[n * 2]) return 1;
     return 0;
 }
 
@@ -417,21 +417,21 @@ static void read_pal_plane(Dav1dTileCont
             used_cache[i++] = cache[n];
     const int n_used_cache = i;
 
     // parse new entries
     uint16_t *const pal = f->frame_thread.pass ?
         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                             ((t->bx >> 1) + (t->by & 1))][pl] : t->pal[pl];
     if (i < pal_sz) {
-        int prev = pal[i++] = msac_decode_bools(&ts->msac, f->cur.p.bpc);
+        int prev = pal[i++] = msac_decode_bools(&ts->msac, f->cur.p.p.bpc);
 
         if (i < pal_sz) {
-            int bits = f->cur.p.bpc - 3 + msac_decode_bools(&ts->msac, 2);
-            const int max = (1 << f->cur.p.bpc) - 1;
+            int bits = f->cur.p.p.bpc - 3 + msac_decode_bools(&ts->msac, 2);
+            const int max = (1 << f->cur.p.p.bpc) - 1;
 
             do {
                 const int delta = msac_decode_bools(&ts->msac, bits);
                 prev = pal[i++] = imin(prev + delta + !pl, max);
                 if (prev + !pl >= max) {
                     for (; i < pal_sz; i++)
                         pal[i] = max;
                     break;
@@ -473,27 +473,27 @@ static void read_pal_uv(Dav1dTileContext
 
     // V pal coding
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
     uint16_t *const pal = f->frame_thread.pass ?
         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                             ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
     if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) {
-        const int bits = f->cur.p.bpc - 4 + msac_decode_bools(&ts->msac, 2);
-        int prev = pal[0] = msac_decode_bools(&ts->msac, f->cur.p.bpc);
-        const int max = (1 << f->cur.p.bpc) - 1;
+        const int bits = f->cur.p.p.bpc - 4 + msac_decode_bools(&ts->msac, 2);
+        int prev = pal[0] = msac_decode_bools(&ts->msac, f->cur.p.p.bpc);
+        const int max = (1 << f->cur.p.p.bpc) - 1;
         for (int i = 1; i < b->pal_sz[1]; i++) {
             int delta = msac_decode_bools(&ts->msac, bits);
             if (delta && msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta = -delta;
             prev = pal[i] = (prev + delta) & max;
         }
     } else {
         for (int i = 0; i < b->pal_sz[1]; i++)
-            pal[i] = msac_decode_bools(&ts->msac, f->cur.p.bpc);
+            pal[i] = msac_decode_bools(&ts->msac, f->cur.p.p.bpc);
     }
     if (DEBUG_BLOCK_INFO) {
         printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
         for (int n = 0; n < b->pal_sz[1]; n++)
             printf("%c%02x", n ? ' ' : '[', pal[n]);
         printf("]\n");
     }
 }
@@ -608,38 +608,38 @@ static void read_vartx_tree(Dav1dTileCon
 {
     const Dav1dFrameContext *const f = t->f;
     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
     const int bw4 = b_dim[0], bh4 = b_dim[1];
 
     // var-tx tree coding
     b->tx_split[0] = b->tx_split[1] = 0;
     b->max_ytx = dav1d_max_txfm_size_for_bs[bs][0];
-    if (f->frame_hdr->segmentation.lossless[b->seg_id] ||
+    if (f->frame_hdr.segmentation.lossless[b->seg_id] ||
         b->max_ytx == TX_4X4)
     {
         b->max_ytx = b->uvtx = TX_4X4;
-        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+        if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
             rep_macro(type, t->dir tx, off, TX_4X4)
             case_set(bh4, l., 1, by4);
             case_set(bw4, a->, 0, bx4);
 #undef set_ctx
         }
-    } else if (f->frame_hdr->txfm_mode != DAV1D_TX_SWITCHABLE || b->skip) {
-        if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE) {
+    } else if (f->frame_hdr.txfm_mode != TX_SWITCHABLE || b->skip) {
+        if (f->frame_hdr.txfm_mode == TX_SWITCHABLE) {
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
             rep_macro(type, t->dir tx, off, mul * b_dim[2 + diridx])
             case_set(bh4, l., 1, by4);
             case_set(bw4, a->, 0, bx4);
 #undef set_ctx
         } else {
-            assert(f->frame_hdr->txfm_mode == DAV1D_TX_LARGEST);
+            assert(f->frame_hdr.txfm_mode == TX_LARGEST);
         }
-        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.p.layout];
     } else {
         assert(imin(bw4, bh4) <= 16 || b->max_ytx == TX_64X64);
         int y, x, y_off, x_off;
         const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
         for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
             for (x = 0, x_off = 0; x < bw4; x += ytx->w, x_off++) {
                 read_tx_tree(t, b->max_ytx, 0, b->tx_split, x_off, y_off);
                 // contexts are updated inside read_tx_tree()
@@ -647,30 +647,30 @@ static void read_vartx_tree(Dav1dTileCon
             }
             t->bx -= x;
             t->by += ytx->h;
         }
         t->by -= y;
         if (DEBUG_BLOCK_INFO)
             printf("Post-vartxtree[%x/%x]: r=%d\n",
                    b->tx_split[0], b->tx_split[1], t->ts->msac.rng);
-        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+        b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.p.layout];
     }
 }
 
 static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
                                             const int by, const int bx,
                                             const int w4, int h4,
                                             const uint8_t *ref_seg_map,
                                             const ptrdiff_t stride)
 {
     unsigned seg_id = 8;
 
-    assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
-    if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame],
+    assert(f->frame_hdr.primary_ref_frame != PRIMARY_REF_NONE);
+    if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr.primary_ref_frame],
                                   (by + h4) * 4, PLANE_TYPE_BLOCK))
     {
         return 8;
     }
 
     ref_seg_map += by * stride + bx;
     do {
         for (int x = 0; x < w4; x++)
@@ -689,25 +689,25 @@ static int decode_b(Dav1dTileContext *co
                     const enum EdgeFlags intra_edge_flags)
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
     Av1Block b_mem, *const b = f->frame_thread.pass ?
         &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem;
     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
     const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
     const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
     const int bw4 = b_dim[0], bh4 = b_dim[1];
     const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
     const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
     const int have_left = t->bx > ts->tiling.col_start;
     const int have_top = t->by > ts->tiling.row_start;
-    const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
+    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
                            (bw4 > ss_hor || t->bx & 1) &&
                            (bh4 > ss_ver || t->by & 1);
 
     if (f->frame_thread.pass == 2) {
         if (b->intra) {
             f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
 
             const enum IntraPredMode y_mode_nofilt =
@@ -722,17 +722,17 @@ static int decode_b(Dav1dTileContext *co
             if (has_chroma) {
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
                 rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
                 case_set(cbh4, l., 1, cby4);
                 case_set(cbw4, a->, 0, cbx4);
 #undef set_ctx
             }
         } else {
-            if (f->frame_hdr->frame_type & 1 /* not intrabc */ &&
+            if (f->frame_hdr.frame_type & 1 /* not intrabc */ &&
                 b->comp_type == COMP_INTER_NONE && b->motion_mode == MM_WARP)
             {
                 uint64_t mask[2] = { 0, 0 };
                 find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
                                   have_left, have_top, b->ref[0], mask);
                 derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
             }
             if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
@@ -758,34 +758,34 @@ static int decode_b(Dav1dTileContext *co
     }
 
     const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
 
     b->bl = bl;
     b->bp = bp;
     b->bs = bs;
 
-    const Dav1dSegmentationData *seg = NULL;
+    const Av1SegmentationData *seg = NULL;
 
     // segment_id (if seg_feature for skip/ref/gmv is enabled)
     int seg_pred = 0;
-    if (f->frame_hdr->segmentation.enabled) {
-        if (!f->frame_hdr->segmentation.update_map) {
+    if (f->frame_hdr.segmentation.enabled) {
+        if (!f->frame_hdr.segmentation.update_map) {
             if (f->prev_segmap) {
                 unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
                                                        f->prev_segmap,
                                                        f->b4_stride);
                 if (seg_id >= 8) return -1;
                 b->seg_id = seg_id;
             } else {
                 b->seg_id = 0;
             }
-            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
-        } else if (f->frame_hdr->segmentation.seg_data.preskip) {
-            if (f->frame_hdr->segmentation.temporal &&
+            seg = &f->frame_hdr.segmentation.seg_data.d[b->seg_id];
+        } else if (f->frame_hdr.segmentation.seg_data.preskip) {
+            if (f->frame_hdr.segmentation.temporal &&
                 (seg_pred = msac_decode_bool_adapt(&ts->msac,
                                        ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
                                                           t->l.seg_pred[by4]])))
             {
                 // temporal predicted seg_id
                 if (f->prev_segmap) {
                     unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx,
                                                            w4, h4,
@@ -798,38 +798,38 @@ static int decode_b(Dav1dTileContext *co
                 }
             } else {
                 int seg_ctx;
                 const unsigned pred_seg_id =
                     get_cur_frame_segid(t->by, t->bx, have_top, have_left,
                                         &seg_ctx, f->cur_segmap, f->b4_stride);
                 const unsigned diff = msac_decode_symbol_adapt(&ts->msac,
                                                    ts->cdf.m.seg_id[seg_ctx],
-                                                   DAV1D_MAX_SEGMENTS);
+                                                   NUM_SEGMENTS);
                 const unsigned last_active_seg_id =
-                    f->frame_hdr->segmentation.seg_data.last_active_segid;
+                    f->frame_hdr.segmentation.seg_data.last_active_segid;
                 b->seg_id = neg_deinterleave(diff, pred_seg_id,
                                              last_active_seg_id + 1);
                 if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
-                if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
+                if (b->seg_id >= NUM_SEGMENTS) b->seg_id = 0; // error?
             }
 
             if (DEBUG_BLOCK_INFO)
                 printf("Post-segid[preskip;%d]: r=%d\n",
                        b->seg_id, ts->msac.rng);
 
-            seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+            seg = &f->frame_hdr.segmentation.seg_data.d[b->seg_id];
         }
     } else {
         b->seg_id = 0;
     }
 
     // skip_mode
     if ((!seg || (!seg->globalmv && seg->ref == -1 && !seg->skip)) &&
-        f->frame_hdr->skip_mode_enabled && imin(bw4, bh4) > 1)
+        f->frame_hdr.skip_mode_enabled && imin(bw4, bh4) > 1)
     {
         const int smctx = t->a->skip_mode[bx4] + t->l.skip_mode[by4];
         b->skip_mode = msac_decode_bool_adapt(&ts->msac,
                                               ts->cdf.m.skip_mode[smctx]);
         if (DEBUG_BLOCK_INFO)
             printf("Post-skipmode[%d]: r=%d\n", b->skip_mode, ts->msac.rng);
     } else {
         b->skip_mode = 0;
@@ -841,21 +841,21 @@ static int decode_b(Dav1dTileContext *co
     } else {
         const int sctx = t->a->skip[bx4] + t->l.skip[by4];
         b->skip = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.skip[sctx]);
         if (DEBUG_BLOCK_INFO)
             printf("Post-skip[%d]: r=%d\n", b->skip, ts->msac.rng);
     }
 
     // segment_id
-    if (f->frame_hdr->segmentation.enabled &&
-        f->frame_hdr->segmentation.update_map &&
-        !f->frame_hdr->segmentation.seg_data.preskip)
+    if (f->frame_hdr.segmentation.enabled &&
+        f->frame_hdr.segmentation.update_map &&
+        !f->frame_hdr.segmentation.seg_data.preskip)
     {
-        if (!b->skip && f->frame_hdr->segmentation.temporal &&
+        if (!b->skip && f->frame_hdr.segmentation.temporal &&
             (seg_pred = msac_decode_bool_adapt(&ts->msac,
                                    ts->cdf.m.seg_pred[t->a->seg_pred[bx4] +
                                                       t->l.seg_pred[by4]])))
         {
             // temporal predicted seg_id
             if (f->prev_segmap) {
                 unsigned seg_id = get_prev_frame_segid(f, t->by, t->bx, w4, h4,
                                                        f->prev_segmap,
@@ -870,143 +870,143 @@ static int decode_b(Dav1dTileContext *co
             const unsigned pred_seg_id =
                 get_cur_frame_segid(t->by, t->bx, have_top, have_left,
                                     &seg_ctx, f->cur_segmap, f->b4_stride);
             if (b->skip) {
                 b->seg_id = pred_seg_id;
             } else {
                 const unsigned diff = msac_decode_symbol_adapt(&ts->msac,
                                                    ts->cdf.m.seg_id[seg_ctx],
-                                                   DAV1D_MAX_SEGMENTS);
+                                                   NUM_SEGMENTS);
                 const unsigned last_active_seg_id =
-                    f->frame_hdr->segmentation.seg_data.last_active_segid;
+                    f->frame_hdr.segmentation.seg_data.last_active_segid;
                 b->seg_id = neg_deinterleave(diff, pred_seg_id,
                                              last_active_seg_id + 1);
                 if (b->seg_id > last_active_seg_id) b->seg_id = 0; // error?
             }
-            if (b->seg_id >= DAV1D_MAX_SEGMENTS) b->seg_id = 0; // error?
+            if (b->seg_id >= NUM_SEGMENTS) b->seg_id = 0; // error?
         }
 
-        seg = &f->frame_hdr->segmentation.seg_data.d[b->seg_id];
+        seg = &f->frame_hdr.segmentation.seg_data.d[b->seg_id];
 
         if (DEBUG_BLOCK_INFO)
             printf("Post-segid[postskip;%d]: r=%d\n",
                    b->seg_id, ts->msac.rng);
     }
 
     // cdef index
     if (!b->skip) {
-        const int idx = f->seq_hdr->sb128 ? ((t->bx & 16) >> 4) +
+        const int idx = f->seq_hdr.sb128 ? ((t->bx & 16) >> 4) +
                                            ((t->by & 16) >> 3) : 0;
         if (t->cur_sb_cdef_idx_ptr[idx] == -1) {
-            const int v = msac_decode_bools(&ts->msac, f->frame_hdr->cdef.n_bits);
+            const int v = msac_decode_bools(&ts->msac, f->frame_hdr.cdef.n_bits);
             t->cur_sb_cdef_idx_ptr[idx] = v;
             if (bw4 > 16) t->cur_sb_cdef_idx_ptr[idx + 1] = v;
             if (bh4 > 16) t->cur_sb_cdef_idx_ptr[idx + 2] = v;
             if (bw4 == 32 && bh4 == 32) t->cur_sb_cdef_idx_ptr[idx + 3] = v;
 
             if (DEBUG_BLOCK_INFO)
                 printf("Post-cdef_idx[%d]: r=%d\n",
                         *t->cur_sb_cdef_idx_ptr, ts->msac.rng);
         }
     }
 
     // delta-q/lf
-    if (!(t->bx & (31 >> !f->seq_hdr->sb128)) &&
-        !(t->by & (31 >> !f->seq_hdr->sb128)))
+    if (!(t->bx & (31 >> !f->seq_hdr.sb128)) &&
+        !(t->by & (31 >> !f->seq_hdr.sb128)))
     {
         const int prev_qidx = ts->last_qidx;
-        const int have_delta_q = f->frame_hdr->delta.q.present &&
-            (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);
+        const int have_delta_q = f->frame_hdr.delta.q.present &&
+            (bs != (f->seq_hdr.sb128 ? BS_128x128 : BS_64x64) || !b->skip);
 
         int8_t prev_delta_lf[4];
         memcpy(prev_delta_lf, ts->last_delta_lf, 4);
 
         if (have_delta_q) {
             int delta_q = msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.delta_q, 4);
             if (delta_q == 3) {
                 const int n_bits = 1 + msac_decode_bools(&ts->msac, 3);
                 delta_q = msac_decode_bools(&ts->msac, n_bits) + 1 + (1 << n_bits);
             }
             if (delta_q) {
                 if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta_q = -delta_q;
-                delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
+                delta_q *= 1 << f->frame_hdr.delta.q.res_log2;
             }
             ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
             if (have_delta_q && DEBUG_BLOCK_INFO)
                 printf("Post-delta_q[%d->%d]: r=%d\n",
                        delta_q, ts->last_qidx, ts->msac.rng);
 
-            if (f->frame_hdr->delta.lf.present) {
-                const int n_lfs = f->frame_hdr->delta.lf.multi ?
-                    f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
+            if (f->frame_hdr.delta.lf.present) {
+                const int n_lfs = f->frame_hdr.delta.lf.multi ?
+                    f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 ? 4 : 2 : 1;
 
                 for (int i = 0; i < n_lfs; i++) {
                     int delta_lf =
                         msac_decode_symbol_adapt(&ts->msac,
-                        ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
+                        ts->cdf.m.delta_lf[i + f->frame_hdr.delta.lf.multi], 4);
                     if (delta_lf == 3) {
                         const int n_bits = 1 + msac_decode_bools(&ts->msac, 3);
                         delta_lf = msac_decode_bools(&ts->msac, n_bits) +
                                    1 + (1 << n_bits);
                     }
                     if (delta_lf) {
                         if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB))
                             delta_lf = -delta_lf;
-                        delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
+                        delta_lf *= 1 << f->frame_hdr.delta.lf.res_log2;
                     }
                     ts->last_delta_lf[i] =
                         iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
                     if (have_delta_q && DEBUG_BLOCK_INFO)
                         printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
                                ts->msac.rng);
                 }
             }
         }
-        if (ts->last_qidx == f->frame_hdr->quant.yac) {
+        if (ts->last_qidx == f->frame_hdr.quant.yac) {
             // assign frame-wide q values to this sb
             ts->dq = f->dq;
         } else if (ts->last_qidx != prev_qidx) {
             // find sb-specific quant parameters
-            init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
+            init_quant_tables(&f->seq_hdr, &f->frame_hdr, ts->last_qidx, ts->dqmem);
             ts->dq = ts->dqmem;
         }
         if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) {
             // assign frame-wide lf values to this sb
             ts->lflvl = f->lf.lvl;
         } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) {
             // find sb-specific lf lvl parameters
-            dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf);
+            dav1d_calc_lf_values(ts->lflvlmem, &f->frame_hdr, ts->last_delta_lf);
             ts->lflvl = ts->lflvlmem;
         }
     }
 
     if (b->skip_mode) {
         b->intra = 0;
-    } else if (f->frame_hdr->frame_type & 1) {
+    } else if (f->frame_hdr.frame_type & 1) {
         if (seg && (seg->ref >= 0 || seg->globalmv)) {
             b->intra = !seg->ref;
         } else {
             const int ictx = get_intra_ctx(t->a, &t->l, by4, bx4,
                                            have_top, have_left);
             b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intra[ictx]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-intra[%d]: r=%d\n", b->intra, ts->msac.rng);
         }
-    } else if (f->frame_hdr->allow_intrabc) {
+    } else if (f->frame_hdr.allow_intrabc) {
         b->intra = !msac_decode_bool_adapt(&ts->msac, ts->cdf.m.intrabc);
         if (DEBUG_BLOCK_INFO)
             printf("Post-intrabcflag[%d]: r=%d\n", b->intra, ts->msac.rng);
     } else {
         b->intra = 1;
     }
 
     // intra/inter-specific stuff
     if (b->intra) {
-        uint16_t *const ymode_cdf = f->frame_hdr->frame_type & 1 ?
+        uint16_t *const ymode_cdf = f->frame_hdr.frame_type & 1 ?
             ts->cdf.m.y_mode[dav1d_ymode_size_context[bs]] :
             ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
                         [dav1d_intra_mode_context[t->l.mode[by4]]];
         b->y_mode = msac_decode_symbol_adapt(&ts->msac, ymode_cdf,
                                               N_INTRA_PRED_MODES);
         if (DEBUG_BLOCK_INFO)
             printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
 
@@ -1017,17 +1017,17 @@ static int decode_b(Dav1dTileContext *co
             uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
             const int angle = msac_decode_symbol_adapt(&ts->msac, acdf, 7);
             b->y_angle = angle - 3;
         } else {
             b->y_angle = 0;
         }
 
         if (has_chroma) {
-            const int cfl_allowed = f->frame_hdr->segmentation.lossless[b->seg_id] ?
+            const int cfl_allowed = f->frame_hdr.segmentation.lossless[b->seg_id] ?
                 cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
             uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
             b->uv_mode = msac_decode_symbol_adapt(&ts->msac, uvmode_cdf,
                                          N_UV_INTRA_PRED_MODES - !cfl_allowed);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
 
             if (b->uv_mode == CFL_PRED) {
@@ -1063,17 +1063,17 @@ static int decode_b(Dav1dTileContext *co
                 const int angle = msac_decode_symbol_adapt(&ts->msac, acdf, 7);
                 b->uv_angle = angle - 3;
             } else {
                 b->uv_angle = 0;
             }
         }
 
         b->pal_sz[0] = b->pal_sz[1] = 0;
-        if (f->frame_hdr->allow_screen_content_tools &&
+        if (f->frame_hdr.allow_screen_content_tools &&
             imax(bw4, bh4) <= 16 && bw4 + bh4 >= 4)
         {
             const int sz_ctx = b_dim[2] + b_dim[3] - 2;
             if (b->y_mode == DC_PRED) {
                 const int pal_ctx = (t->a->pal_sz[bx4] > 0) + (t->l.pal_sz[by4] > 0);
                 const int use_y_pal =
                     msac_decode_bool_adapt(&ts->msac, ts->cdf.m.pal_y[sz_ctx][pal_ctx]);
                 if (DEBUG_BLOCK_INFO)
@@ -1089,17 +1089,17 @@ static int decode_b(Dav1dTileContext *co
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
                 if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
                     read_pal_uv(t, b, sz_ctx, bx4, by4);
             }
         }
 
         if (b->y_mode == DC_PRED && !b->pal_sz[0] &&
-            imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr->filter_intra)
+            imax(b_dim[2], b_dim[3]) <= 3 && f->seq_hdr.filter_intra)
         {
             const int is_filter = msac_decode_bool_adapt(&ts->msac,
                                             ts->cdf.m.use_filter_intra[bs]);
             if (is_filter) {
                 b->y_mode = FILTER_PRED;
                 b->y_angle = msac_decode_symbol_adapt(&ts->msac,
                                                   ts->cdf.m.filter_intra, 5);
             }
@@ -1128,24 +1128,24 @@ static int decode_b(Dav1dTileContext *co
             } else
                 pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
             read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
         }
 
         const TxfmInfo *t_dim;
-        if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
+        if (f->frame_hdr.segmentation.lossless[b->seg_id]) {
             b->tx = b->uvtx = (int) TX_4X4;
             t_dim = &dav1d_txfm_dimensions[TX_4X4];
         } else {
             b->tx = dav1d_max_txfm_size_for_bs[bs][0];
-            b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
+            b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.p.layout];
             t_dim = &dav1d_txfm_dimensions[b->tx];
-            if (f->frame_hdr->txfm_mode == DAV1D_TX_SWITCHABLE && t_dim->max > TX_4X4) {
+            if (f->frame_hdr.txfm_mode == TX_SWITCHABLE && t_dim->max > TX_4X4) {
                 const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
                 uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
                 int depth = msac_decode_symbol_adapt(&ts->msac, tx_cdf,
                                                      imin(t_dim->max + 1, 3));
 
                 while (depth--) {
                     b->tx = t_dim->sub;
                     t_dim = &dav1d_txfm_dimensions[b->tx];
@@ -1158,42 +1158,42 @@ static int decode_b(Dav1dTileContext *co
         // reconstruction
         if (f->frame_thread.pass == 1) {
             f->bd_fn.read_coef_blocks(t, bs, b);
         } else {
             f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
         }
 
         dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
-                                   f->frame_hdr, (const uint8_t (*)[8][2])
+                                   &f->frame_hdr, (const uint8_t (*)[8][2])
                                    &ts->lflvl[b->seg_id][0][0][0],
                                    t->bx, t->by, f->w4, f->h4, bs,
-                                   b->tx, b->uvtx, f->cur.p.layout,
+                                   b->tx, b->uvtx, f->cur.p.p.layout,
                                    &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
                                    has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
                                    has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
 
         // update contexts
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
         rep_macro(type, t->dir tx_intra, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
         rep_macro(type, t->dir tx, off, mul * (((uint8_t *) &t_dim->lw)[diridx])); \
         rep_macro(type, t->dir mode, off, mul * y_mode_nofilt); \
         rep_macro(type, t->dir pal_sz, off, mul * b->pal_sz[0]); \
         rep_macro(type, t->dir seg_pred, off, mul * seg_pred); \
         rep_macro(type, t->dir skip_mode, off, 0); \
         rep_macro(type, t->dir intra, off, mul); \
         rep_macro(type, t->dir skip, off, mul * b->skip); \
         /* see aomedia bug 2183 for why we use luma coordinates here */ \
         rep_macro(type, t->pal_sz_uv[diridx], off, mul * (has_chroma ? b->pal_sz[1] : 0)); \
-        if (f->frame_hdr->frame_type & 1) { \
+        if (f->frame_hdr.frame_type & 1) { \
             rep_macro(type, t->dir comp_type, off, mul * COMP_INTER_NONE); \
             rep_macro(type, t->dir ref[0], off, mul * ((uint8_t) -1)); \
             rep_macro(type, t->dir ref[1], off, mul * ((uint8_t) -1)); \
-            rep_macro(type, t->dir filter[0], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
-            rep_macro(type, t->dir filter[1], off, mul * DAV1D_N_SWITCHABLE_FILTERS); \
+            rep_macro(type, t->dir filter[0], off, mul * N_SWITCHABLE_FILTERS); \
+            rep_macro(type, t->dir filter[1], off, mul * N_SWITCHABLE_FILTERS); \
         }
         const enum IntraPredMode y_mode_nofilt =
             b->y_mode == FILTER_PRED ? DC_PRED : b->y_mode;
         case_set(bh4, l., 1, by4);
         case_set(bw4, a->, 0, bx4);
 #undef set_ctx
         if (b->pal_sz[0]) {
             uint16_t *const pal = f->frame_thread.pass ?
@@ -1216,41 +1216,41 @@ static int decode_b(Dav1dTileContext *co
                                         ((t->bx >> 1) + (t->by & 1))][pl] : t->pal[pl];
                 // see aomedia bug 2183 for why we use luma coordinates here
                 for (int x = 0; x < bw4; x++)
                     memcpy(t->al_pal[0][bx4 + x][pl], pal, 16);
                 for (int y = 0; y < bh4; y++)
                     memcpy(t->al_pal[1][by4 + y][pl], pal, 16);
             }
         }
-        if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+        if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
             splat_intraref(f->mvs, f->b4_stride, t->by, t->bx, bs,
                            y_mode_nofilt);
         }
-    } else if (!(f->frame_hdr->frame_type & 1)) {
+    } else if (!(f->frame_hdr.frame_type & 1)) {
         // intra block copy
         candidate_mv mvstack[8];
         int n_mvs;
         mv mvlist[2][2];
         av1_find_ref_mvs(mvstack, &n_mvs, mvlist, NULL,
                          (int[2]) { -1, -1 }, f->bw, f->bh,
                          bs, bp, t->by, t->bx, ts->tiling.col_start,
                          ts->tiling.col_end, ts->tiling.row_start,
                          ts->tiling.row_end, f->libaom_cm);
 
         if (mvlist[0][0].y | mvlist[0][0].x)
             b->mv[0] = mvlist[0][0];
         else if (mvlist[0][1].y | mvlist[0][1].x)
             b->mv[0] = mvlist[0][1];
         else {
-            if (t->by - (16 << f->seq_hdr->sb128) < ts->tiling.row_start) {
+            if (t->by - (16 << f->seq_hdr.sb128) < ts->tiling.row_start) {
                 b->mv[0].y = 0;
-                b->mv[0].x = -(512 << f->seq_hdr->sb128) - 2048;
+                b->mv[0].x = -(512 << f->seq_hdr.sb128) - 2048;
             } else {
-                b->mv[0].y = -(512 << f->seq_hdr->sb128);
+                b->mv[0].y = -(512 << f->seq_hdr.sb128);
                 b->mv[0].x = 0;
             }
         }
 
         const struct mv ref = b->mv[0];
         read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0);
 
         // clip intrabc motion vector to decoded parts of current tile
@@ -1276,19 +1276,19 @@ static int decode_b(Dav1dTileContext *co
             src_right -= src_right - ts->tiling.col_end * 4;
         }
         // check against top tile boundary and adjust if necessary
         if (src_top < border_top) {
             src_bottom += border_top - src_top;
             src_top    += border_top - src_top;
         }
 
-        const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
-        const int sby = (t->by >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
-        const int sb_size = 1 << (6 + f->seq_hdr->sb128);
+        const int sbx = (t->bx >> (4 + f->seq_hdr.sb128)) << (6 + f->seq_hdr.sb128);
+        const int sby = (t->by >> (4 + f->seq_hdr.sb128)) << (6 + f->seq_hdr.sb128);
+        const int sb_size = 1 << (6 + f->seq_hdr.sb128);
         // check for overlap with current superblock
         if (src_bottom > sby && src_right > sbx) {
             if (src_top - border_top >= src_bottom - sby) {
                 // if possible move src up into the previous superblock row
                 src_top    -= src_bottom - sby;
                 src_bottom -= src_bottom - sby;
             } else if (src_left - border_left >= src_right - sbx) {
                 // if possible move src left into the previous superblock
@@ -1346,48 +1346,48 @@ static int decode_b(Dav1dTileContext *co
         }
     } else {
         // inter-specific mode/mv coding
         int is_comp, has_subpel_filter;
 
         if (b->skip_mode) {
             is_comp = 1;
         } else if ((!seg || (seg->ref == -1 && !seg->globalmv && !seg->skip)) &&
-                   f->frame_hdr->switchable_comp_refs && imin(bw4, bh4) > 1)
+                   f->frame_hdr.switchable_comp_refs && imin(bw4, bh4) > 1)
         {
             const int ctx = get_comp_ctx(t->a, &t->l, by4, bx4,
                                          have_top, have_left);
             is_comp = msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp[ctx]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-compflag[%d]: r=%d\n", is_comp, ts->msac.rng);
         } else {
             is_comp = 0;
         }
 
         if (b->skip_mode) {
-            b->ref[0] = f->frame_hdr->skip_mode_refs[0];
-            b->ref[1] = f->frame_hdr->skip_mode_refs[1];
+            b->ref[0] = f->frame_hdr.skip_mode_refs[0];
+            b->ref[1] = f->frame_hdr.skip_mode_refs[1];
             b->comp_type = COMP_INTER_AVG;
             b->inter_mode = NEARESTMV_NEARESTMV;
             b->drl_idx = 0;
             has_subpel_filter = 0;
 
             candidate_mv mvstack[8];
             int n_mvs, ctx;
             mv mvlist[2][2];
             av1_find_ref_mvs(mvstack, &n_mvs, mvlist, &ctx,
                              (int[2]) { b->ref[0], b->ref[1] }, f->bw, f->bh,
                              bs, bp, t->by, t->bx, ts->tiling.col_start,
                              ts->tiling.col_end, ts->tiling.row_start,
                              ts->tiling.row_end, f->libaom_cm);
 
             b->mv[0] = mvstack[0].this_mv;
             b->mv[1] = mvstack[0].comp_mv;
-            fix_mv_precision(f->frame_hdr, &b->mv[0]);
-            fix_mv_precision(f->frame_hdr, &b->mv[1]);
+            fix_mv_precision(&f->frame_hdr, &b->mv[0]);
+            fix_mv_precision(&f->frame_hdr, &b->mv[1]);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-skipmodeblock[mv=1:y=%d,x=%d,2:y=%d,x=%d,refs=%d+%d\n",
                        b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
                        b->ref[0], b->ref[1]);
         } else if (is_comp) {
             const int dir_ctx = get_comp_dir_ctx(t->a, &t->l, by4, bx4,
                                                  have_top, have_left);
             if (msac_decode_bool_adapt(&ts->msac, ts->cdf.m.comp_dir[dir_ctx])) {
@@ -1497,61 +1497,60 @@ static int decode_b(Dav1dTileContext *co
                 }
             }
 
 #define assign_comp_mv(idx, pfx) \
             switch (im[idx]) { \
             case NEARMV: \
             case NEARESTMV: \
                 b->mv[idx] = mvstack[b->drl_idx].pfx##_mv; \
-                fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
+                fix_mv_precision(&f->frame_hdr, &b->mv[idx]); \
                 break; \
             case GLOBALMV: \
                 has_subpel_filter |= \
-                    f->frame_hdr->gmv[b->ref[idx]].type == DAV1D_WM_TYPE_TRANSLATION; \
-                b->mv[idx] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[idx]], \
-                                        t->bx, t->by, bw4, bh4, f->frame_hdr); \
-                fix_mv_precision(f->frame_hdr, &b->mv[idx]); \
+                    f->frame_hdr.gmv[b->ref[idx]].type == WM_TYPE_TRANSLATION; \
+                b->mv[idx] = get_gmv_2d(&f->frame_hdr.gmv[b->ref[idx]], \
+                                        t->bx, t->by, bw4, bh4, &f->frame_hdr); \
+                fix_mv_precision(&f->frame_hdr, &b->mv[idx]); \
                 break; \
             case NEWMV: \
                 b->mv[idx] = mvstack[b->drl_idx].pfx##_mv; \
                 read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \
-                                 !f->frame_hdr->force_integer_mv); \
+                                 !f->frame_hdr.force_integer_mv); \
                 break; \
             }
             has_subpel_filter = imin(bw4, bh4) == 1 ||
                                 b->inter_mode != GLOBALMV_GLOBALMV;
             assign_comp_mv(0, this);
             assign_comp_mv(1, comp);
 #undef assign_comp_mv
             if (DEBUG_BLOCK_INFO)
                 printf("Post-residual_mv[1:y=%d,x=%d,2:y=%d,x=%d]: r=%d\n",
                        b->mv[0].y, b->mv[0].x, b->mv[1].y, b->mv[1].x,
                        ts->msac.rng);
 
             // jnt_comp vs. seg vs. wedge
             int is_segwedge = 0;
-            if (f->seq_hdr->masked_compound) {
+            if (f->seq_hdr.masked_compound) {
                 const int mask_ctx = get_mask_comp_ctx(t->a, &t->l, by4, bx4);
 
                 is_segwedge = msac_decode_bool_adapt(&ts->msac,
                                                  ts->cdf.m.mask_comp[mask_ctx]);
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-segwedge_vs_jntavg[%d,ctx=%d]: r=%d\n",
                            is_segwedge, mask_ctx, ts->msac.rng);
             }
 
             if (!is_segwedge) {
-                if (f->seq_hdr->jnt_comp) {
+                if (f->seq_hdr.jnt_comp) {
                     const int jnt_ctx =
-                        get_jnt_comp_ctx(f->seq_hdr->order_hint_n_bits,
-                                         f->cur.frame_hdr->frame_offset,
-                                         f->refp[b->ref[0]].p.frame_hdr->frame_offset,
-                                         f->refp[b->ref[1]].p.frame_hdr->frame_offset,
-                                         t->a, &t->l, by4, bx4);
+                        get_jnt_comp_ctx(f->seq_hdr.order_hint_n_bits,
+                                         f->cur.p.poc, f->refp[b->ref[0]].p.poc,
+                                         f->refp[b->ref[1]].p.poc, t->a, &t->l,
+                                         by4, bx4);
                     b->comp_type = COMP_INTER_WEIGHTED_AVG +
                         msac_decode_bool_adapt(&ts->msac,
                                                ts->cdf.m.jnt_comp[jnt_ctx]);
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-jnt_comp[%d,ctx=%d[ac:%d,ar:%d,lc:%d,lr:%d]]: r=%d\n",
                                b->comp_type == COMP_INTER_AVG,
                                jnt_ctx, t->a->comp_type[bx4], t->a->ref[0][bx4],
                                t->l.comp_type[by4], t->l.ref[0][by4],
@@ -1632,21 +1631,21 @@ static int decode_b(Dav1dTileContext *co
             if ((seg && (seg->skip || seg->globalmv)) ||
                 msac_decode_bool_adapt(&ts->msac, ts->cdf.m.newmv_mode[ctx & 7]))
             {
                 if ((seg && (seg->skip || seg->globalmv)) ||
                     !msac_decode_bool_adapt(&ts->msac,
                                         ts->cdf.m.globalmv_mode[(ctx >> 3) & 1]))
                 {
                     b->inter_mode = GLOBALMV;
-                    b->mv[0] = get_gmv_2d(&f->frame_hdr->gmv[b->ref[0]],
-                                          t->bx, t->by, bw4, bh4, f->frame_hdr);
-                    fix_mv_precision(f->frame_hdr, &b->mv[0]);
+                    b->mv[0] = get_gmv_2d(&f->frame_hdr.gmv[b->ref[0]],
+                                          t->bx, t->by, bw4, bh4, &f->frame_hdr);
+                    fix_mv_precision(&f->frame_hdr, &b->mv[0]);
                     has_subpel_filter = imin(bw4, bh4) == 1 ||
-                        f->frame_hdr->gmv[b->ref[0]].type == DAV1D_WM_TYPE_TRANSLATION;
+                        f->frame_hdr.gmv[b->ref[0]].type == WM_TYPE_TRANSLATION;
                 } else {
                     has_subpel_filter = 1;
                     if (msac_decode_bool_adapt(&ts->msac,
                                        ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
                     {
                         b->inter_mode = NEARMV;
                         b->drl_idx = 1;
                         if (n_mvs > 2) {
@@ -1663,17 +1662,17 @@ static int decode_b(Dav1dTileContext *co
                     } else {
                         b->inter_mode = NEARESTMV;
                         b->drl_idx = 0;
                     }
                     if (b->drl_idx >= 2) {
                         b->mv[0] = mvstack[b->drl_idx].this_mv;
                     } else {
                         b->mv[0] = mvlist[0][b->drl_idx];
-                        fix_mv_precision(f->frame_hdr, &b->mv[0]);
+                        fix_mv_precision(&f->frame_hdr, &b->mv[0]);
                     }
                 }
 
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-intermode[%d,drl=%d,mv=y:%d,x:%d,n_mvs=%d]: r=%d\n",
                            b->inter_mode, b->drl_idx, b->mv[0].y, b->mv[0].x, n_mvs,
                            ts->msac.rng);
             } else {
@@ -1689,73 +1688,73 @@ static int decode_b(Dav1dTileContext *co
                         b->drl_idx += msac_decode_bool_adapt(&ts->msac,
                                                  ts->cdf.m.drl_bit[drl_ctx_v2]);
                     }
                 }
                 if (n_mvs > 1) {
                     b->mv[0] = mvstack[b->drl_idx].this_mv;
                 } else {
                     b->mv[0] = mvlist[0][0];
-                    fix_mv_precision(f->frame_hdr, &b->mv[0]);
+                    fix_mv_precision(&f->frame_hdr, &b->mv[0]);
                 }
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-intermode[%d,drl=%d]: r=%d\n",
                            b->inter_mode, b->drl_idx, ts->msac.rng);
                 read_mv_residual(t, &b->mv[0], &ts->cdf.mv,
-                                 !f->frame_hdr->force_integer_mv);
+                                 !f->frame_hdr.force_integer_mv);
                 if (DEBUG_BLOCK_INFO)
                     printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
                            b->mv[0].y, b->mv[0].x, ts->msac.rng);
             }
 
             // interintra flags
             const int ii_sz_grp = dav1d_ymode_size_context[bs];
-            if (f->seq_hdr->inter_intra &&
+            if (f->seq_hdr.inter_intra &&
                 interintra_allowed_mask & (1 << bs) &&
                 msac_decode_bool_adapt(&ts->msac, ts->cdf.m.interintra[ii_sz_grp]))
             {
                 b->interintra_mode = msac_decode_symbol_adapt(&ts->msac,
                                           ts->cdf.m.interintra_mode[ii_sz_grp],
                                           N_INTER_INTRA_PRED_MODES);
                 const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
                 b->interintra_type = INTER_INTRA_BLEND +
                     msac_decode_bool_adapt(&ts->msac,
                                            ts->cdf.m.interintra_wedge[wedge_ctx]);
                 if (b->interintra_type == INTER_INTRA_WEDGE)
                     b->wedge_idx = msac_decode_symbol_adapt(&ts->msac,
                                             ts->cdf.m.wedge_idx[wedge_ctx], 16);
             } else {
                 b->interintra_type = INTER_INTRA_NONE;
             }
-            if (DEBUG_BLOCK_INFO && f->seq_hdr->inter_intra &&
+            if (DEBUG_BLOCK_INFO && f->seq_hdr.inter_intra &&
                 interintra_allowed_mask & (1 << bs))
             {
                 printf("Post-interintra[t=%d,m=%d,w=%d]: r=%d\n",
                        b->interintra_type, b->interintra_mode,
                        b->wedge_idx, ts->msac.rng);
             }
 
             // motion variation
-            if (f->frame_hdr->switchable_motion_mode &&
+            if (f->frame_hdr.switchable_motion_mode &&
                 b->interintra_type == INTER_INTRA_NONE && imin(bw4, bh4) >= 2 &&
                 // is not warped global motion
-                !(!f->frame_hdr->force_integer_mv && b->inter_mode == GLOBALMV &&
-                  f->frame_hdr->gmv[b->ref[0]].type > DAV1D_WM_TYPE_TRANSLATION) &&
+                !(!f->frame_hdr.force_integer_mv && b->inter_mode == GLOBALMV &&
+                  f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) &&
                 // has overlappable neighbours
                 ((have_left && findoddzero(&t->l.intra[by4 + 1], h4 >> 1)) ||
                  (have_top && findoddzero(&t->a->intra[bx4 + 1], w4 >> 1))))
             {
                 // reaching here means the block allows obmc - check warp by
                 // finding matching-ref blocks in top/left edges
                 uint64_t mask[2] = { 0, 0 };
                 find_matching_ref(t, intra_edge_flags, bw4, bh4, w4, h4,
                                   have_left, have_top, b->ref[0], mask);
                 const int allow_warp = !f->svc[b->ref[0]][0].scale &&
-                    !f->frame_hdr->force_integer_mv &&
-                    f->frame_hdr->warp_motion && (mask[0] | mask[1]);
+                    !f->frame_hdr.force_integer_mv &&
+                    f->frame_hdr.warp_motion && (mask[0] | mask[1]);
 
                 b->motion_mode = allow_warp ?
                     msac_decode_symbol_adapt(&ts->msac, ts->cdf.m.motion_mode[bs], 3) :
                     msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
                 if (b->motion_mode == MM_WARP) {
                     has_subpel_filter = 0;
                     derive_warpmv(t, bw4, bh4, mask, b->mv[0], &t->warpmv);
 #define signabs(v) v < 0 ? '-' : ' ', abs(v)
@@ -1780,46 +1779,46 @@ static int decode_b(Dav1dTileContext *co
                            PRIu64 "x]\n", b->motion_mode, ts->msac.rng, mask[0],
                             mask[1]);
             } else {
                 b->motion_mode = MM_TRANSLATION;
             }
         }
 
         // subpel filter
-        enum Dav1dFilterMode filter[2];
-        if (f->frame_hdr->subpel_filter_mode == DAV1D_FILTER_SWITCHABLE) {
+        enum FilterMode filter[2];
+        if (f->frame_hdr.subpel_filter_mode == FILTER_SWITCHABLE) {
             if (has_subpel_filter) {
                 const int comp = b->comp_type != COMP_INTER_NONE;
                 const int ctx1 = get_filter_ctx(t->a, &t->l, comp, 0, b->ref[0],
                                                 by4, bx4);
                 filter[0] = msac_decode_symbol_adapt(&ts->msac,
-                    ts->cdf.m.filter[0][ctx1], DAV1D_N_SWITCHABLE_FILTERS);
-                if (f->seq_hdr->dual_filter) {
+                    ts->cdf.m.filter[0][ctx1], N_SWITCHABLE_FILTERS);
+                if (f->seq_hdr.dual_filter) {
                     const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
                                                     b->ref[0], by4, bx4);
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-subpel_filter1[%d,ctx=%d]: r=%d\n",
                                filter[0], ctx1, ts->msac.rng);
                     filter[1] = msac_decode_symbol_adapt(&ts->msac,
-                        ts->cdf.m.filter[1][ctx2], DAV1D_N_SWITCHABLE_FILTERS);
+                        ts->cdf.m.filter[1][ctx2], N_SWITCHABLE_FILTERS);
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
                                filter[1], ctx2, ts->msac.rng);
                 } else {
                     filter[1] = filter[0];
                     if (DEBUG_BLOCK_INFO)
                         printf("Post-subpel_filter[%d,ctx=%d]: r=%d\n",
                                filter[0], ctx1, ts->msac.rng);
                 }
             } else {
-                filter[0] = filter[1] = DAV1D_FILTER_8TAP_REGULAR;
+                filter[0] = filter[1] = FILTER_8TAP_REGULAR;
             }
         } else {
-            filter[0] = filter[1] = f->frame_hdr->subpel_filter_mode;
+            filter[0] = filter[1] = f->frame_hdr.subpel_filter_mode;
         }
         b->filter2d = dav1d_filter_2d[filter[1]][filter[0]];
 
         read_vartx_tree(t, b, bs, bx4, by4);
 
         // reconstruction
         if (f->frame_thread.pass == 1) {
             f->bd_fn.read_coef_blocks(t, bs, b);
@@ -1827,19 +1826,19 @@ static int decode_b(Dav1dTileContext *co
             if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
         }
 
         const int is_globalmv =
             b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
         const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
             &ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
         dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
-                                   f->frame_hdr, lf_lvls, t->bx, t->by,
+                                   &f->frame_hdr, lf_lvls, t->bx, t->by,
                                    f->w4, f->h4, b->skip, bs, b->tx_split,
-                                   b->uvtx, f->cur.p.layout,
+                                   b->uvtx, f->cur.p.p.layout,
                                    &t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
                                    has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
                                    has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
 
         // context updates
         if (is_comp) {
             splat_tworef_mv(f->mvs, f->b4_stride, t->by, t->bx, bs,
                             b->inter_mode, b->ref[0], b->ref[1],
@@ -1874,18 +1873,18 @@ static int decode_b(Dav1dTileContext *co
             rep_macro(type, t->dir uvmode, off, mul * DC_PRED)
             case_set(cbh4, l., 1, cby4);
             case_set(cbw4, a->, 0, cbx4);
 #undef set_ctx
         }
     }
 
     // update contexts
-    if (f->frame_hdr->segmentation.enabled &&
-        f->frame_hdr->segmentation.update_map)
+    if (f->frame_hdr.segmentation.enabled &&
+        f->frame_hdr.segmentation.update_map)
     {
         uint8_t *seg_ptr = &f->cur_segmap[t->by * f->b4_stride + t->bx];
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
         for (int y = 0; y < bh4; y++) { \
             rep_macro(type, seg_ptr, 0, mul * b->seg_id); \
             seg_ptr += f->b4_stride; \
         }
         case_set(bw4, NULL, 0, 0);
@@ -1919,40 +1918,40 @@ static int decode_sb(Dav1dTileContext *c
     }
 
     uint16_t *pc;
     enum BlockPartition bp;
     int ctx, bx8, by8;
     if (f->frame_thread.pass != 2) {
         if (0 && bl == BL_64X64)
             printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
-                   f->frame_hdr->frame_offset, t->by, t->bx, bl, t->ts->msac.rng);
+                   f->frame_hdr.frame_offset, t->by, t->bx, bl, t->ts->msac.rng);
         bx8 = (t->bx & 31) >> 1;
         by8 = (t->by & 31) >> 1;
         ctx = get_partition_ctx(t->a, &t->l, bl, by8, bx8);
         pc = t->ts->cdf.m.partition[bl][ctx];
     }
 
     if (have_h_split && have_v_split) {
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
         } else {
             const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
                 bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
             bp = msac_decode_symbol_adapt(&t->ts->msac, pc, n_part);
-            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
+            if (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
                 (bp == PARTITION_V || bp == PARTITION_V4 ||
                  bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
             {
                 return 1;
             }
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
-                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx, bp,
+                       f->frame_hdr.frame_offset, t->by, t->bx, bl, ctx, bp,
                        t->ts->msac.rng);
         }
         const uint8_t *const b = dav1d_block_sizes[bl][bp];
 
         switch (bp) {
         case PARTITION_NONE:
             if (decode_b(t, bl, b[0], PARTITION_NONE, node->o))
                 return -1;
@@ -2104,20 +2103,21 @@ static int decode_sb(Dav1dTileContext *c
         default: assert(0);
         }
     } else if (have_h_split) {
         unsigned is_split;
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             is_split = b->bl != bl;
         } else {
-            is_split = msac_decode_bool(&t->ts->msac, gather_top_partition_prob(pc, bl) >> EC_PROB_SHIFT);
+            const uint16_t cdf[2] = { gather_top_partition_prob(pc, bl), 0 };
+            is_split = msac_decode_symbol(&t->ts->msac, cdf, 2);
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
-                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+                       f->frame_hdr.frame_offset, t->by, t->bx, bl, ctx,
                        is_split ? PARTITION_SPLIT : PARTITION_H, t->ts->msac.rng);
         }
 
         assert(bl < BL_8X8);
         if (is_split) {
             const EdgeBranch *const branch = (const EdgeBranch *) node;
             bp = PARTITION_SPLIT;
             if (decode_sb(t, bl + 1, branch->split[0])) return 1;
@@ -2132,22 +2132,23 @@ static int decode_sb(Dav1dTileContext *c
         }
     } else {
         assert(have_v_split);
         unsigned is_split;
         if (f->frame_thread.pass == 2) {
             const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
             is_split = b->bl != bl;
         } else {
-            is_split = msac_decode_bool(&t->ts->msac, gather_left_partition_prob(pc, bl) >> EC_PROB_SHIFT);
-            if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
+            uint16_t cdf[2] = { gather_left_partition_prob(pc, bl), 0 };
+            is_split = msac_decode_symbol(&t->ts->msac, cdf, 2);
+            if (f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
                 return 1;
             if (DEBUG_BLOCK_INFO)
                 printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
-                       f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
+                       f->frame_hdr.frame_offset, t->by, t->bx, bl, ctx,
                        is_split ? PARTITION_SPLIT : PARTITION_V, t->ts->msac.rng);
         }
 
         assert(bl < BL_8X8);
         if (is_split) {
             const EdgeBranch *const branch = (const EdgeBranch *) node;
             bp = PARTITION_SPLIT;
             if (decode_sb(t, bl + 1, branch->split[0])) return 1;
@@ -2190,430 +2191,362 @@ static void reset_context(BlockContext *
     memset(ctx->tx, TX_64X64, sizeof(ctx->tx));
     if (!keyframe) {
         memset(ctx->ref, -1, sizeof(ctx->ref));
         memset(ctx->comp_type, 0, sizeof(ctx->comp_type));
         memset(ctx->mode, NEARESTMV, sizeof(ctx->mode));
     }
     memset(ctx->lcoef, 0x40, sizeof(ctx->lcoef));
     memset(ctx->ccoef, 0x40, sizeof(ctx->ccoef));
-    memset(ctx->filter, DAV1D_N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
+    memset(ctx->filter, N_SWITCHABLE_FILTERS, sizeof(ctx->filter));
     memset(ctx->seg_pred, 0, sizeof(ctx->seg_pred));
     memset(ctx->pal_sz, 0, sizeof(ctx->pal_sz));
 }
 
 static void setup_tile(Dav1dTileState *const ts,
                        const Dav1dFrameContext *const f,
                        const uint8_t *const data, const size_t sz,
                        const int tile_row, const int tile_col,
                        const int tile_start_off)
 {
-    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
-    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
-    const int col_sb_end = f->frame_hdr->tiling.col_start_sb[tile_col + 1];
-    const int row_sb_start = f->frame_hdr->tiling.row_start_sb[tile_row];
-    const int row_sb_end = f->frame_hdr->tiling.row_start_sb[tile_row + 1];
+    const int col_sb_start = f->frame_hdr.tiling.col_start_sb[tile_col];
+    const int col_sb128_start = col_sb_start >> !f->seq_hdr.sb128;
+    const int col_sb_end = f->frame_hdr.tiling.col_start_sb[tile_col + 1];
+    const int row_sb_start = f->frame_hdr.tiling.row_start_sb[tile_row];
+    const int row_sb_end = f->frame_hdr.tiling.row_start_sb[tile_row + 1];
     const int sb_shift = f->sb_shift;
 
     ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];
     ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];
     ts->cdf = *f->in_cdf.cdf;
-    ts->last_qidx = f->frame_hdr->quant.yac;
+    ts->last_qidx = f->frame_hdr.quant.yac;
     memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
 
-    msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);
+    msac_init(&ts->msac, data, sz, f->frame_hdr.disable_cdf_update);
 
     ts->tiling.row = tile_row;
     ts->tiling.col = tile_col;
     ts->tiling.col_start = col_sb_start << sb_shift;
     ts->tiling.col_end = imin(col_sb_end << sb_shift, f->bw);
     ts->tiling.row_start = row_sb_start << sb_shift;
     ts->tiling.row_end = imin(row_sb_end << sb_shift, f->bh);
 
     // Reference Restoration Unit (used for exp coding)
-    int sb_idx, unit_idx;
-    if (f->frame_hdr->super_res.enabled) {
-        // vertical components only
-        sb_idx = (ts->tiling.row_start >> 5) * f->sr_sb128w;
-        unit_idx = (ts->tiling.row_start & 16) >> 3;
-    } else {
-        sb_idx = (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start;
-        unit_idx = ((ts->tiling.row_start & 16) >> 3) +
-                   ((ts->tiling.col_start & 16) >> 4);
-    }
+    Av1Filter *const lf_mask =
+        f->lf.mask + (ts->tiling.row_start >> 5) * f->sb128w + col_sb128_start;
+    const int unit_idx = ((ts->tiling.row_start & 16) >> 3) +
+                         ((ts->tiling.col_start & 16) >> 4);
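// The unit_idx arithmetic above picks one of the four 64x64 restoration-unit
// slots inside a 128x128 superblock: row_start/col_start are in 4px-block
// units, so bit 4 (value 16) indicates which 64px half of its superblock the
// tile starts in; (row & 16) >> 3 contributes 0 or 2 and (col & 16) >> 4
// contributes 0 or 1, giving a row-major index 0..3 into lr[p][...].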
     for (int p = 0; p < 3; p++) {
-        if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
-            continue;
-
-        if (f->frame_hdr->super_res.enabled) {
-            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-            const int d = f->frame_hdr->super_res.width_scale_denominator;
-            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
-            const int rnd = (8 << unit_size_log2) - 1, shift = unit_size_log2 + 3;
-            const int x = ((4 * ts->tiling.col_start * d >> ss_hor) + rnd) >> shift;
-            const int px_x = x << (unit_size_log2 + ss_hor);
-            const int u_idx = unit_idx + ((px_x & 64) >> 6);
-            const int sb128x = px_x >> 7;
-            if (sb128x >= f->sr_sb128w) continue;
-            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx + sb128x].lr[p][u_idx];
-        } else {
-            ts->lr_ref[p] = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
-        }
-
+        ts->lr_ref[p] = &lf_mask->lr[p][unit_idx];
         ts->lr_ref[p]->filter_v[0] = 3;
         ts->lr_ref[p]->filter_v[1] = -7;
         ts->lr_ref[p]->filter_v[2] = 15;
         ts->lr_ref[p]->filter_h[0] = 3;
         ts->lr_ref[p]->filter_h[1] = -7;
         ts->lr_ref[p]->filter_h[2] = 15;
         ts->lr_ref[p]->sgr_weights[0] = -32;
         ts->lr_ref[p]->sgr_weights[1] = 31;
     }
 
     if (f->n_tc > 1)
         atomic_init(&ts->progress, row_sb_start);
 }
 
-static void read_restoration_info(Dav1dTileContext *const t,
-                                  Av1RestorationUnit *const lr, const int p,
-                                  const enum Dav1dRestorationType frame_type)
-{
-    const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
-
-    if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
-        const int filter =
-            msac_decode_symbol_adapt(&ts->msac,
-                                     ts->cdf.m.restore_switchable, 3);
-        lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
-                                          DAV1D_RESTORATION_WIENER :
-                            DAV1D_RESTORATION_NONE;
-    } else {
-        const unsigned type =
-            msac_decode_bool_adapt(&ts->msac,
-                                   frame_type == DAV1D_RESTORATION_WIENER ?
-                                       ts->cdf.m.restore_wiener :
-                                       ts->cdf.m.restore_sgrproj);
-        lr->type = type ? frame_type : DAV1D_RESTORATION_NONE;
-    }
-
-    if (lr->type == DAV1D_RESTORATION_WIENER) {
-        lr->filter_v[0] =
-            !p ? msac_decode_subexp(&ts->msac,
-                                    ts->lr_ref[p]->filter_v[0] + 5, 16,
-                                    1) - 5:
-                 0;
-        lr->filter_v[1] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_v[1] + 23, 32,
-                               2) - 23;
-        lr->filter_v[2] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_v[2] + 17, 64,
-                               3) - 17;
-
-        lr->filter_h[0] =
-            !p ? msac_decode_subexp(&ts->msac,
-                                    ts->lr_ref[p]->filter_h[0] + 5, 16,
-                                    1) - 5:
-                0;
-        lr->filter_h[1] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_h[1] + 23, 32,
-                               2) - 23;
-        lr->filter_h[2] =
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->filter_h[2] + 17, 64,
-                               3) - 17;
-        memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
-        ts->lr_ref[p] = lr;
-        if (DEBUG_BLOCK_INFO)
-            printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
-                   p, lr->filter_v[0], lr->filter_v[1],
-                   lr->filter_v[2], lr->filter_h[0],
-                   lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
-    } else if (lr->type == DAV1D_RESTORATION_SGRPROJ) {
-        const unsigned idx = msac_decode_bools(&ts->msac, 4);
-        lr->sgr_idx = idx;
-        lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->sgr_weights[0] + 96, 128,
-                               4) - 96 :
-            0;
-        lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
-            msac_decode_subexp(&ts->msac,
-                               ts->lr_ref[p]->sgr_weights[1] + 32, 128,
-                               4) - 32 :
-            iclip(128 - lr->sgr_weights[0], -32, 95);
-        memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
-        memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
-        ts->lr_ref[p] = lr;
-        if (DEBUG_BLOCK_INFO)
-            printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
-                   p, lr->sgr_idx, lr->sgr_weights[0],
-                   lr->sgr_weights[1], ts->msac.rng);
-    }
-}
-
 int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
     const Dav1dFrameContext *const f = t->f;
-    const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
+    const enum BlockLevel root_bl = f->seq_hdr.sb128 ? BL_128X128 : BL_64X64;
     Dav1dTileState *const ts = t->ts;
     const Dav1dContext *const c = f->c;
     const int sb_step = f->sb_step;
     const int tile_row = ts->tiling.row, tile_col = ts->tiling.col;
-    const int col_sb_start = f->frame_hdr->tiling.col_start_sb[tile_col];
-    const int col_sb128_start = col_sb_start >> !f->seq_hdr->sb128;
+    const int col_sb_start = f->frame_hdr.tiling.col_start_sb[tile_col];
+    const int col_sb128_start = col_sb_start >> !f->seq_hdr.sb128;
 
-    reset_context(&t->l, !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
+    reset_context(&t->l, !(f->frame_hdr.frame_type & 1), f->frame_thread.pass);
     if (f->frame_thread.pass == 2) {
         for (t->bx = ts->tiling.col_start,
              t->a = f->a + col_sb128_start + tile_row * f->sb128w;
              t->bx < ts->tiling.col_end; t->bx += sb_step)
         {
-            if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
-                return 1;
             if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
                 return 1;
-            if (t->bx & 16 || f->seq_hdr->sb128)
+            if (t->bx & 16 || f->seq_hdr.sb128)
                 t->a++;
         }
         f->bd_fn.backup_ipred_edge(t);
         return 0;
     }
 
-    // error out on symbol decoder overread
-    if (ts->msac.cnt < -15) return 1;
+    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
 
-    if (c->n_fc > 1 && f->frame_hdr->use_ref_frame_mvs) {
+    if (c->n_fc > 1 && f->frame_hdr.use_ref_frame_mvs) {
         for (int n = 0; n < 7; n++)
             if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step),
                                           PLANE_TYPE_BLOCK))
             {
                 return 1;
             }
         av1_init_ref_mv_tile_row(f->libaom_cm,
                                  ts->tiling.col_start, ts->tiling.col_end,
                                  t->by, imin(t->by + sb_step, f->bh));
     }
     memset(t->pal_sz_uv[1], 0, sizeof(*t->pal_sz_uv));
     const int sb128y = t->by >> 5;
     for (t->bx = ts->tiling.col_start, t->a = f->a + col_sb128_start + tile_row * f->sb128w,
          t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
          t->bx < ts->tiling.col_end; t->bx += sb_step)
     {
-        if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
-            return 1;
         if (root_bl == BL_128X128) {
             t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
             t->cur_sb_cdef_idx_ptr[0] = -1;
             t->cur_sb_cdef_idx_ptr[1] = -1;
             t->cur_sb_cdef_idx_ptr[2] = -1;
             t->cur_sb_cdef_idx_ptr[3] = -1;
         } else {
             t->cur_sb_cdef_idx_ptr =
                 &t->lf_mask->cdef_idx[((t->bx & 16) >> 4) +
                                       ((t->by & 16) >> 3)];
             t->cur_sb_cdef_idx_ptr[0] = -1;
         }
         // Restoration filter
         for (int p = 0; p < 3; p++) {
-            if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
+            if (f->frame_hdr.restoration.type[p] == RESTORATION_NONE)
                 continue;
+            const int by = t->by >> (ss_ver & !!p);
+            const int bx = t->bx >> (ss_hor & !!p);
+            const int bh = f->bh >> (ss_ver & !!p);
+            const int bw = f->bw >> (ss_hor & !!p);
 
-            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-            const int unit_size_log2 = f->frame_hdr->restoration.unit_size[!!p];
-            const int y = t->by * 4 >> ss_ver;
-            const int h = (f->cur.p.h + ss_ver) >> ss_ver;
-
-            const int unit_size = 1 << unit_size_log2;
-            const unsigned mask = unit_size - 1;
-            if (y & mask) continue;
-            const int half_unit = unit_size >> 1;
+            const int unit_size_log2 =
+                f->frame_hdr.restoration.unit_size[!!p];
+            // 4pel unit size
+            const int b_unit_size = 1 << (unit_size_log2 - 2);
+            const unsigned mask = b_unit_size - 1;
+            if (by & mask || bx & mask) continue;
+            const int half_unit = b_unit_size >> 1;
             // Round half up at frame boundaries, if there's more than one
             // restoration unit
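// Worked example of this rule (illustrative numbers only): with a 256-pixel
// restoration unit (unit_size_log2 = 8), b_unit_size is 64 4px blocks and
// half_unit is 32; if the plane is bh = 80 blocks tall, the unit position
// by = 64 leaves only 16 blocks, so by + half_unit = 96 > 80 and the row is
// skipped, i.e. the leftover strip is folded into the unit above rather than
// becoming a tiny unit of its own.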
-            if (y && y + half_unit > h) continue;
-
-            const enum Dav1dRestorationType frame_type = f->frame_hdr->restoration.type[p];
+            const int bottom_round = by && by + half_unit > bh;
+            const int right_round = bx && bx + half_unit > bw;
+            if (bottom_round || right_round) continue;
+            const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4);
+            Av1RestorationUnit *const lr = &t->lf_mask->lr[p][unit_idx];
+            const enum RestorationType frame_type =
+                f->frame_hdr.restoration.type[p];
 
-            if (f->frame_hdr->super_res.enabled) {
-                const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
-                const int n_units = imax(1, (w + half_unit) >> unit_size_log2);
+            if (frame_type == RESTORATION_SWITCHABLE) {
+                const int filter =
+                    msac_decode_symbol_adapt(&ts->msac,
+                                             ts->cdf.m.restore_switchable, 3);
+                lr->type = filter ? filter == 2 ? RESTORATION_SGRPROJ :
+                                                  RESTORATION_WIENER :
+                                    RESTORATION_NONE;
+            } else {
+                const unsigned type =
+                    msac_decode_bool_adapt(&ts->msac,
+                                           frame_type == RESTORATION_WIENER ?
+                                               ts->cdf.m.restore_wiener :
+                                               ts->cdf.m.restore_sgrproj);
+                lr->type = type ? frame_type : RESTORATION_NONE;
+            }
 
-                const int d = f->frame_hdr->super_res.width_scale_denominator;
-                const int rnd = unit_size * 8 - 1, shift = unit_size_log2 + 3;
-                const int x0 = ((4 *  t->bx            * d >> ss_hor) + rnd) >> shift;
-                const int x1 = ((4 * (t->bx + sb_step) * d >> ss_hor) + rnd) >> shift;
+            if (lr->type == RESTORATION_WIENER) {
+                lr->filter_v[0] =
+                    !p ? msac_decode_subexp(&ts->msac,
+                                            ts->lr_ref[p]->filter_v[0] + 5, 16,
+                                            1) - 5:
+                         0;
+                lr->filter_v[1] =
+                    msac_decode_subexp(&ts->msac,
+                                       ts->lr_ref[p]->filter_v[1] + 23, 32,
+                                       2) - 23;
+                lr->filter_v[2] =
+                    msac_decode_subexp(&ts->msac,
+                                       ts->lr_ref[p]->filter_v[2] + 17, 64,
+                                       3) - 17;
 
-                for (int x = x0; x < imin(x1, n_units); x++) {
-                    const int px_x = x << (unit_size_log2 + ss_hor);
-                    const int sb_idx = (t->by >> 5) * f->sr_sb128w + (px_x >> 7);
-                    const int unit_idx = ((t->by & 16) >> 3) + ((px_x & 64) >> 6);
-                    Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
-
-                    read_restoration_info(t, lr, p, frame_type);
-                }
-            } else {
-                const int x = 4 * t->bx >> ss_hor;
-                if (x & mask) continue;
-                const int w = (f->cur.p.w + ss_hor) >> ss_hor;
-                // Round half up at frame boundaries, if there's more than one
-                // restoration unit
-                if (x && x + half_unit > w) continue;
-                const int sb_idx = (t->by >> 5) * f->sr_sb128w + (t->bx >> 5);
-                const int unit_idx = ((t->by & 16) >> 3) + ((t->bx & 16) >> 4);
-                Av1RestorationUnit *const lr = &f->lf.lr_mask[sb_idx].lr[p][unit_idx];
-
-                read_restoration_info(t, lr, p, frame_type);
+                lr->filter_h[0] =
+                    !p ? msac_decode_subexp(&ts->msac,
+                                            ts->lr_ref[p]->filter_h[0] + 5, 16,
+                                            1) - 5:
+                        0;
+                lr->filter_h[1] =
+                    msac_decode_subexp(&ts->msac,
+                                       ts->lr_ref[p]->filter_h[1] + 23, 32,
+                                       2) - 23;
+                lr->filter_h[2] =
+                    msac_decode_subexp(&ts->msac,
+                                       ts->lr_ref[p]->filter_h[2] + 17, 64,
+                                       3) - 17;
+                memcpy(lr->sgr_weights, ts->lr_ref[p]->sgr_weights, sizeof(lr->sgr_weights));
+                ts->lr_ref[p] = lr;
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-lr_wiener[pl=%d,v[%d,%d,%d],h[%d,%d,%d]]: r=%d\n",
+                           p, lr->filter_v[0], lr->filter_v[1],
+                           lr->filter_v[2], lr->filter_h[0],
+                           lr->filter_h[1], lr->filter_h[2], ts->msac.rng);
+            } else if (lr->type == RESTORATION_SGRPROJ) {
+                const unsigned idx = msac_decode_bools(&ts->msac, 4);
+                lr->sgr_idx = idx;
+                lr->sgr_weights[0] = dav1d_sgr_params[idx][0] ?
+                    msac_decode_subexp(&ts->msac,
+                                       ts->lr_ref[p]->sgr_weights[0] + 96, 128,
+                                       4) - 96 :
+                    0;
+                lr->sgr_weights[1] = dav1d_sgr_params[idx][1] ?
+                    msac_decode_subexp(&ts->msac,
+                                       ts->lr_ref[p]->sgr_weights[1] + 32, 128,
+                                       4) - 32 :
+                    iclip(128 - lr->sgr_weights[0], -32, 95);
+                memcpy(lr->filter_v, ts->lr_ref[p]->filter_v, sizeof(lr->filter_v));
+                memcpy(lr->filter_h, ts->lr_ref[p]->filter_h, sizeof(lr->filter_h));
+                ts->lr_ref[p] = lr;
+                if (DEBUG_BLOCK_INFO)
+                    printf("Post-lr_sgrproj[pl=%d,idx=%d,w[%d,%d]]: r=%d\n",
+                           p, lr->sgr_idx, lr->sgr_weights[0],
+                           lr->sgr_weights[1], ts->msac.rng);
             }
         }
         if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
             return 1;
-        if (t->bx & 16 || f->seq_hdr->sb128) {
+        if (t->bx & 16 || f->seq_hdr.sb128) {
             t->a++;
             t->lf_mask++;
         }
     }
 
     // backup pre-loopfilter pixels for intra prediction of the next sbrow
     if (f->frame_thread.pass != 1)
         f->bd_fn.backup_ipred_edge(t);
 
     // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
     // up the initial value in neighbour tiles when running the loopfilter
     int align_h = (f->bh + 31) & ~31;
     memcpy(&f->lf.tx_lpf_right_edge[0][align_h * tile_col + t->by],
            &t->l.tx_lpf_y[t->by & 16], sb_step);
-    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
     align_h >>= ss_ver;
+
     memcpy(&f->lf.tx_lpf_right_edge[1][align_h * tile_col + (t->by >> ss_ver)],
            &t->l.tx_lpf_uv[(t->by & 16) >> ss_ver], sb_step >> ss_ver);
 
     return 0;
 }
 
 int dav1d_decode_frame(Dav1dFrameContext *const f) {
     const Dav1dContext *const c = f->c;
     int retval = -EINVAL;
 
     if (f->n_tc > 1) {
-        if (f->frame_hdr->tiling.cols * f->sbh > f->tile_thread.titsati_sz) {
+        if (f->frame_hdr.tiling.cols * f->sbh > f->tile_thread.titsati_sz) {
             freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
             f->tile_thread.task_idx_to_sby_and_tile_idx =
                 malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) *
-                       f->frame_hdr->tiling.cols * f->sbh);
+                       f->frame_hdr.tiling.cols * f->sbh);
             if (!f->tile_thread.task_idx_to_sby_and_tile_idx) return -ENOMEM;
-            f->tile_thread.titsati_sz = f->frame_hdr->tiling.cols * f->sbh;
+            f->tile_thread.titsati_sz = f->frame_hdr.tiling.cols * f->sbh;
         }
-        if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||
-            f->tile_thread.titsati_init[1] != f->sbh ||
-            f->tile_thread.titsati_init[2] != f->frame_hdr->tiling.rows)
+        if (f->tile_thread.titsati_init[0] != f->frame_hdr.tiling.cols ||
+            f->tile_thread.titsati_init[1] != f->sbh)
         {
             for (int tile_row = 0, tile_idx = 0;
-                 tile_row < f->frame_hdr->tiling.rows; tile_row++)
+                 tile_row < f->frame_hdr.tiling.rows; tile_row++)
             {
-                for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
-                     sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
+                for (int sby = f->frame_hdr.tiling.row_start_sb[tile_row];
+                     sby < f->frame_hdr.tiling.row_start_sb[tile_row + 1]; sby++)
                 {
-                    for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
+                    for (int tile_col = 0; tile_col < f->frame_hdr.tiling.cols;
                          tile_col++, tile_idx++)
                     {
                         f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][0] = sby;
                         f->tile_thread.task_idx_to_sby_and_tile_idx[tile_idx][1] =
-                            tile_row * f->frame_hdr->tiling.cols + tile_col;
+                            tile_row * f->frame_hdr.tiling.cols + tile_col;
                     }
                 }
             }
-            f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols;
+            f->tile_thread.titsati_init[0] = f->frame_hdr.tiling.cols;
             f->tile_thread.titsati_init[1] = f->sbh;
-            f->tile_thread.titsati_init[2] = f->frame_hdr->tiling.rows;
         }
     }
 
-    if (f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows > f->n_ts) {
-        f->ts = realloc(f->ts, f->frame_hdr->tiling.cols *
-                               f->frame_hdr->tiling.rows * sizeof(*f->ts));
+    if (f->frame_hdr.tiling.cols * f->frame_hdr.tiling.rows > f->n_ts) {
+        f->ts = realloc(f->ts, f->frame_hdr.tiling.cols *
+                               f->frame_hdr.tiling.rows * sizeof(*f->ts));
         if (!f->ts) return -ENOMEM;
         for (int n = f->n_ts;
-             n < f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; n++)
+             n < f->frame_hdr.tiling.cols * f->frame_hdr.tiling.rows; n++)
         {
             Dav1dTileState *const ts = &f->ts[n];
             pthread_mutex_init(&ts->tile_thread.lock, NULL);
             pthread_cond_init(&ts->tile_thread.cond, NULL);
         }
         if (c->n_fc > 1) {
             freep(&f->frame_thread.tile_start_off);
             f->frame_thread.tile_start_off =
                 malloc(sizeof(*f->frame_thread.tile_start_off) *
-                       f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows);
+                       f->frame_hdr.tiling.cols * f->frame_hdr.tiling.rows);
             if (!f->frame_thread.tile_start_off) return -ENOMEM;
         }
-        f->n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+        f->n_ts = f->frame_hdr.tiling.cols * f->frame_hdr.tiling.rows;
     }
 
     if (c->n_fc > 1) {
         int tile_idx = 0;
-        for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
-            int row_off = f->frame_hdr->tiling.row_start_sb[tile_row] *
+        for (int tile_row = 0; tile_row < f->frame_hdr.tiling.rows; tile_row++) {
+            int row_off = f->frame_hdr.tiling.row_start_sb[tile_row] *
                           f->sb_step * 4 * f->sb128w * 128;
-            int b_diff = (f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
-                          f->frame_hdr->tiling.row_start_sb[tile_row]) * f->sb_step * 4;
-            for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+            int b_diff = (f->frame_hdr.tiling.row_start_sb[tile_row + 1] -
+                          f->frame_hdr.tiling.row_start_sb[tile_row]) * f->sb_step * 4;
+            for (int tile_col = 0; tile_col < f->frame_hdr.tiling.cols; tile_col++) {
                 f->frame_thread.tile_start_off[tile_idx++] = row_off + b_diff *
-                    f->frame_hdr->tiling.col_start_sb[tile_col] * f->sb_step * 4;
+                    f->frame_hdr.tiling.col_start_sb[tile_col] * f->sb_step * 4;
             }
         }
     }
 
-    if (f->sb128w * f->frame_hdr->tiling.rows > f->a_sz) {
+    if (f->sb128w * f->frame_hdr.tiling.rows > f->a_sz) {
         freep(&f->a);
-        f->a = malloc(f->sb128w * f->frame_hdr->tiling.rows * sizeof(*f->a));
+        f->a = malloc(f->sb128w * f->frame_hdr.tiling.rows * sizeof(*f->a));
         if (!f->a) return -ENOMEM;
-        f->a_sz = f->sb128w * f->frame_hdr->tiling.rows;
+        f->a_sz = f->sb128w * f->frame_hdr.tiling.rows;
     }
 
     // update allocation of block contexts for above
     if (f->sb128w > f->lf.line_sz) {
         dav1d_freep_aligned(&f->lf.cdef_line);
+        dav1d_freep_aligned(&f->lf.lr_lpf_line);
 
         // note that we allocate all pixel arrays as if we were dealing with
         // 10 bits/component data
         uint16_t *ptr = f->lf.cdef_line =
             dav1d_alloc_aligned(f->b4_stride * 4 * 12 * sizeof(uint16_t), 32);
-        if (!ptr) return -ENOMEM;
+
+        uint16_t *lr_ptr = f->lf.lr_lpf_line =
+            dav1d_alloc_aligned(f->b4_stride * 4 * 3 * 12 * sizeof(uint16_t), 32);
+
+        if (!ptr || !lr_ptr) {
+            if (ptr) dav1d_free_aligned(ptr);
+            if (lr_ptr) dav1d_free_aligned(lr_ptr);
+            return -ENOMEM;
+        }
 
         for (int pl = 0; pl <= 2; pl++) {
             f->lf.cdef_line_ptr[0][pl][0] = ptr + f->b4_stride * 4 * 0;
             f->lf.cdef_line_ptr[0][pl][1] = ptr + f->b4_stride * 4 * 1;
             f->lf.cdef_line_ptr[1][pl][0] = ptr + f->b4_stride * 4 * 2;
             f->lf.cdef_line_ptr[1][pl][1] = ptr + f->b4_stride * 4 * 3;
             ptr += f->b4_stride * 4 * 4;
+
+            f->lf.lr_lpf_line_ptr[pl] = lr_ptr;
+            lr_ptr += f->b4_stride * 4 * 12;
         }
 
         f->lf.line_sz = f->sb128w;
     }
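// A rough gloss on the sizing in the block above, based only on what is
// visible here: cdef_line rows are b4_stride * 4 pixels wide (4 pixels per
// 4px-block column) and the buffer is 12 such rows, i.e. 3 planes times the
// 4 line pointers set per plane ([0..1][pl][0..1]); lr_lpf_line adds 12 rows
// per plane (the 3 * 12 factor). Both are allocated as uint16_t so the
// 10 bits/component worst case mentioned in the note always fits.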
 
-    const ptrdiff_t lr_stride = (f->sr_cur.p.p.w + 31) & ~31;
-    if (lr_stride > f->lf.lr_line_sz) {
-        dav1d_freep_aligned(&f->lf.lr_lpf_line);
-
-        uint16_t *lr_ptr = f->lf.lr_lpf_line =
-            dav1d_alloc_aligned(lr_stride * 3 * 12 * sizeof(uint16_t), 32);
-
-        if (!lr_ptr) return -ENOMEM;
-
-        for (int pl = 0; pl <= 2; pl++) {
-            f->lf.lr_lpf_line_ptr[pl] = lr_ptr;
-            lr_ptr += lr_stride * 12;
-        }
-
-        f->lf.lr_line_sz = lr_stride;
-    }
-
     // update allocation for loopfilter masks
     if (f->sb128w * f->sb128h > f->lf.mask_sz) {
         freep(&f->lf.mask);
         freep(&f->lf.level);
         freep(&f->frame_thread.b);
         f->lf.mask = malloc(f->sb128w * f->sb128h * sizeof(*f->lf.mask));
         f->lf.level = malloc(f->sb128w * f->sb128h * 32 * 32 *
                              sizeof(*f->lf.level));
@@ -2641,95 +2574,84 @@ int dav1d_decode_frame(Dav1dFrameContext
             {
                 return -ENOMEM;
             }
             memset(f->frame_thread.cf, 0,
                    sizeof(int32_t) * 3 * f->sb128w * f->sb128h * 128 * 128);
         }
         f->lf.mask_sz = f->sb128w * f->sb128h;
     }
-    f->sr_sb128w = (f->sr_cur.p.p.w + 127) >> 7;
-    if (f->sr_sb128w * f->sb128h > f->lf.lr_mask_sz) {
-        freep(&f->lf.lr_mask);
-        f->lf.lr_mask = malloc(f->sr_sb128w * f->sb128h * sizeof(*f->lf.lr_mask));
-        if (!f->lf.lr_mask) return -ENOMEM;
-        f->lf.lr_mask_sz = f->sr_sb128w * f->sb128h;
+    if (f->frame_hdr.loopfilter.sharpness != f->lf.last_sharpness) {
+        dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr.loopfilter.sharpness);
+        f->lf.last_sharpness = f->frame_hdr.loopfilter.sharpness;
     }
-    if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
-        dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
-        f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
-    }
-    dav1d_calc_lf_values(f->lf.lvl, f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
+    dav1d_calc_lf_values(f->lf.lvl, &f->frame_hdr, (int8_t[4]) { 0, 0, 0, 0 });
     memset(f->lf.mask, 0, sizeof(*f->lf.mask) * f->sb128w * f->sb128h);
 
     if (f->sbh * f->sb128w * 128 > f->ipred_edge_sz) {
         dav1d_freep_aligned(&f->ipred_edge[0]);
         uint16_t *ptr = f->ipred_edge[0] =
             dav1d_alloc_aligned(f->sb128w * 128 * f->sbh * 3 * sizeof(uint16_t), 32);
         if (!f->ipred_edge[0]) return -ENOMEM;
         f->ipred_edge_sz = f->sbh * f->sb128w * 128;
         f->ipred_edge[1] = &ptr[f->ipred_edge_sz];
         f->ipred_edge[2] = &ptr[f->ipred_edge_sz * 2];
     }
 
-    if (f->sb128h * f->frame_hdr->tiling.cols > f->lf.re_sz) {
+    if (f->sb128h * f->frame_hdr.tiling.cols > f->lf.re_sz) {
         freep(&f->lf.tx_lpf_right_edge[0]);
         f->lf.tx_lpf_right_edge[0] = malloc((f->sb128h * 32 * 2) *
-                                            f->frame_hdr->tiling.cols);
+                                            f->frame_hdr.tiling.cols);
         if (!f->lf.tx_lpf_right_edge[0]) return -ENOMEM;
         f->lf.tx_lpf_right_edge[1] = f->lf.tx_lpf_right_edge[0] +
-                                     f->sb128h * 32 * f->frame_hdr->tiling.cols;
-        f->lf.re_sz = f->sb128h * f->frame_hdr->tiling.cols;
+                                     f->sb128h * 32 * f->frame_hdr.tiling.cols;
+        f->lf.re_sz = f->sb128h * f->frame_hdr.tiling.cols;
     }
 
     // init ref mvs
-    if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+    if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
         f->mvs = f->mvs_ref->data;
-        const int order_hint_n_bits = f->seq_hdr->order_hint * f->seq_hdr->order_hint_n_bits;
+        const int order_hint_n_bits = f->seq_hdr.order_hint * f->seq_hdr.order_hint_n_bits;
         const int ret = av1_init_ref_mv_common(f->libaom_cm, f->bw >> 1, f->bh >> 1,
-                                               f->b4_stride, f->seq_hdr->sb128,
-                                               f->mvs, f->ref_mvs,
-                                               f->cur.frame_hdr->frame_offset,
-                                               f->refpoc,
-                                               f->refrefpoc, f->frame_hdr->gmv,
-                                               f->frame_hdr->hp, f->frame_hdr->force_integer_mv,
-                                               f->frame_hdr->use_ref_frame_mvs,
+                                               f->b4_stride, f->seq_hdr.sb128,
+                                               f->mvs, f->ref_mvs, f->cur.p.poc, f->refpoc,
+                                               f->refrefpoc, f->frame_hdr.gmv,
+                                               f->frame_hdr.hp, f->frame_hdr.force_integer_mv,
+                                               f->frame_hdr.use_ref_frame_mvs,
                                                order_hint_n_bits);
         if (ret < 0) return -ENOMEM;
-        if (c->n_fc == 1 && f->frame_hdr->use_ref_frame_mvs)
+        if (c->n_fc == 1 && f->frame_hdr.use_ref_frame_mvs)
             av1_init_ref_mv_tile_row(f->libaom_cm, 0, f->bw, 0, f->bh);
     }
 
     // setup dequant tables
-    init_quant_tables(f->seq_hdr, f->frame_hdr, f->frame_hdr->quant.yac, f->dq);
-    if (f->frame_hdr->quant.qm)
+    init_quant_tables(&f->seq_hdr, &f->frame_hdr, f->frame_hdr.quant.yac, f->dq);
+    if (f->frame_hdr.quant.qm)
         for (int j = 0; j < N_RECT_TX_SIZES; j++) {
-            f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr->quant.qm_y][0][j];
-            f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr->quant.qm_u][1][j];
-            f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr->quant.qm_v][1][j];
+            f->qm[0][j][0] = dav1d_qm_tbl[f->frame_hdr.quant.qm_y][0][j];
+            f->qm[0][j][1] = dav1d_qm_tbl[f->frame_hdr.quant.qm_u][1][j];
+            f->qm[0][j][2] = dav1d_qm_tbl[f->frame_hdr.quant.qm_v][1][j];
         }
-    for (int i = f->frame_hdr->quant.qm; i < 2; i++)
+    for (int i = f->frame_hdr.quant.qm; i < 2; i++)
         for (int tx = 0; tx < N_RECT_TX_SIZES; tx++)
             for (int pl = 0; pl < 3; pl++)
                 f->qm[i][tx][pl] = dav1d_qm_tbl[15][!!pl][tx];
 
     // setup jnt_comp weights
-    if (f->frame_hdr->switchable_comp_refs) {
+    if (f->frame_hdr.switchable_comp_refs) {
         for (int i = 0; i < 7; i++) {
-            const unsigned ref0poc = f->refp[i].p.frame_hdr->frame_offset;
+            const unsigned ref0poc = f->refp[i].p.poc;
 
             for (int j = i + 1; j < 7; j++) {
-                const unsigned ref1poc = f->refp[j].p.frame_hdr->frame_offset;
+                const unsigned ref1poc = f->refp[j].p.poc;
 
-                const unsigned d1 =
-                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref0poc,
-                                          f->cur.frame_hdr->frame_offset)), 31);
-                const unsigned d0 =
-                    imin(abs(get_poc_diff(f->seq_hdr->order_hint_n_bits, ref1poc,
-                                          f->cur.frame_hdr->frame_offset)), 31);
+                const unsigned d1 = imin(abs(get_poc_diff(f->seq_hdr.order_hint_n_bits,
+                                                          ref0poc, f->cur.p.poc)), 31);
+                const unsigned d0 = imin(abs(get_poc_diff(f->seq_hdr.order_hint_n_bits,
+                                                          ref1poc, f->cur.p.poc)), 31);
                 const int order = d0 <= d1;
 
                 static const uint8_t quant_dist_weight[3][2] = {
                     { 2, 3 }, { 2, 5 }, { 2, 7 }
                 };
                 static const uint8_t quant_dist_lookup_table[4][2] = {
                     { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 }
                 };
@@ -2745,235 +2667,223 @@ int dav1d_decode_frame(Dav1dFrameContext
 
                 f->jnt_weights[i][j] = quant_dist_lookup_table[k][order];
             }
         }
     }
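// The d0/d1 distances above rely on a wrap-aware POC difference; below is a
// hedged, stand-alone approximation of that arithmetic (get_poc_diff itself
// is defined elsewhere, so poc_diff_sketch is an assumed equivalent, not the
// real helper): the raw difference is reduced modulo 2^order_hint_n_bits and
// sign-extended, so with 7 order-hint bits, poc 126 vs poc 2 yields -4
// rather than 124.
#include <assert.h>

static int poc_diff_sketch(const int bits, const int poc0, const int poc1) {
    const int sign_bit = 1 << (bits - 1);
    const int diff = poc0 - poc1;
    return (diff & (sign_bit - 1)) - (diff & sign_bit);
}

int main(void) {
    assert(poc_diff_sketch(7, 126, 2) == -4);
    assert(poc_diff_sketch(7, 2, 126) == 4);
    return 0;
}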
 
     // init loopfilter pointers
     f->lf.mask_ptr = f->lf.mask;
-    f->lf.p[0] = f->cur.data[0];
-    f->lf.p[1] = f->cur.data[1];
-    f->lf.p[2] = f->cur.data[2];
-    f->lf.sr_p[0] = f->sr_cur.p.data[0];
-    f->lf.sr_p[1] = f->sr_cur.p.data[1];
-    f->lf.sr_p[2] = f->sr_cur.p.data[2];
+    f->lf.p[0] = f->cur.p.data[0];
+    f->lf.p[1] = f->cur.p.data[1];
+    f->lf.p[2] = f->cur.p.data[2];
     f->lf.tile_row = 1;
 
     dav1d_cdf_thread_wait(&f->in_cdf);
-    if (f->frame_hdr->refresh_context)
+    if (f->frame_hdr.refresh_context)
         memcpy(f->out_cdf.cdf, f->in_cdf.cdf, sizeof(*f->in_cdf.cdf));
 
     // parse individual tiles per tile group
     int update_set = 0, tile_row = 0, tile_col = 0;
     for (int i = 0; i < f->n_tile_data; i++) {
         const uint8_t *data = f->tile[i].data.data;
         size_t size = f->tile[i].data.sz;
 
         for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
             size_t tile_sz;
             if (j == f->tile[i].end) {
                 tile_sz = size;
             } else {
-                if (f->frame_hdr->tiling.n_bytes > size) goto error;
+                if (f->frame_hdr.tiling.n_bytes > size) goto error;
                 tile_sz = 0;
-                for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++)
+                for (unsigned k = 0; k < f->frame_hdr.tiling.n_bytes; k++)
                     tile_sz |= (unsigned)*data++ << (k * 8);
                 tile_sz++;
-                size -= f->frame_hdr->tiling.n_bytes;
+                size -= f->frame_hdr.tiling.n_bytes;
                 if (tile_sz > size) goto error;
             }
 
             setup_tile(&f->ts[j], f, data, tile_sz, tile_row, tile_col++,
                        c->n_fc > 1 ? f->frame_thread.tile_start_off[j] : 0);
 
-            if (tile_col == f->frame_hdr->tiling.cols) {
+            if (tile_col == f->frame_hdr.tiling.cols) {
                 tile_col = 0;
                 tile_row++;
             }
-            if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context)
+            if (j == f->frame_hdr.tiling.update && f->frame_hdr.refresh_context)
                 update_set = 1;
             data += tile_sz;
             size -= tile_sz;
         }
     }
 
     // 2-pass decoding:
     // - enabled for frame-threading, so that one frame can do symbol parsing
     //   as another (or multiple) are doing reconstruction. One advantage here
     //   is that although reconstruction is limited by reference availability,
     //   symbol parsing is not. Therefore, symbol parsing can effectively use
     //   row and col tile threading, but reconstruction only col tile threading;
     // - pass 0 means no 2-pass;
     // - pass 1 means symbol parsing only;
     // - pass 2 means reconstruction and loop filtering.
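// A minimal, self-contained sketch of the pass scheme described above, using
// hypothetical parse_symbols()/reconstruct() helpers rather than the real
// dav1d entry points: pass 0 does both steps in one sweep, pass 1 only parses
// symbols, pass 2 only reconstructs, mirroring the
// "pass = uses_2pass .. 2 * uses_2pass" loop that follows.
#include <stdio.h>

static void parse_symbols(int sbrow) { printf("parse sbrow %d\n", sbrow); }
static void reconstruct(int sbrow)   { printf("recon sbrow %d\n", sbrow); }

static void run_passes(const int uses_2pass, const int n_sbrows) {
    for (int pass = uses_2pass; pass <= 2 * uses_2pass; pass++)
        for (int sbrow = 0; sbrow < n_sbrows; sbrow++) {
            if (pass != 2) parse_symbols(sbrow); // pass 0 or 1
            if (pass != 1) reconstruct(sbrow);   // pass 0 or 2
        }
}

int main(void) { run_passes(1, 2); return 0; }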
 
-    const int uses_2pass = c->n_fc > 1 && f->frame_hdr->refresh_context;
+    const int uses_2pass = c->n_fc > 1 && f->frame_hdr.refresh_context;
     for (f->frame_thread.pass = uses_2pass;
          f->frame_thread.pass <= 2 * uses_2pass; f->frame_thread.pass++)
     {
         const enum PlaneType progress_plane_type =
             f->frame_thread.pass == 0 ? PLANE_TYPE_ALL :
             f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y;
 
-        for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
-            reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
+        for (int n = 0; n < f->sb128w * f->frame_hdr.tiling.rows; n++)
+            reset_context(&f->a[n], !(f->frame_hdr.frame_type & 1), f->frame_thread.pass);
 
         if (f->n_tc == 1) {
             Dav1dTileContext *const t = f->tc;
 
             // no tile threading - we explicitly interleave tile/sbrow decoding
             // and post-filtering, so that the full process runs in-line, so
             // that frame threading is still possible
-            for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+            for (int tile_row = 0; tile_row < f->frame_hdr.tiling.rows; tile_row++) {
                 const int sbh_end =
-                    imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
-                for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
+                    imin(f->frame_hdr.tiling.row_start_sb[tile_row + 1], f->sbh);
+                for (int sby = f->frame_hdr.tiling.row_start_sb[tile_row];
                      sby < sbh_end; sby++)
                 {
-                    t->by = sby << (4 + f->seq_hdr->sb128);
-                    for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
-                        t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
+                    t->by = sby << (4 + f->seq_hdr.sb128);
+                    for (int tile_col = 0; tile_col < f->frame_hdr.tiling.cols; tile_col++) {
+                        t->ts = &f->ts[tile_row * f->frame_hdr.tiling.cols + tile_col];
 
                         if (dav1d_decode_tile_sbrow(t)) goto error;
                     }
 
                     // loopfilter + cdef + restoration
                     if (f->frame_thread.pass != 1)
                         f->bd_fn.filter_sbrow(f, sby);
-                    dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
+                    dav1d_thread_picture_signal(&f->cur, (sby + 1) * f->sb_step * 4,
                                                 progress_plane_type);
                 }
             }
         } else {
             // signal available tasks to worker threads
             int num_tasks;
 
             pthread_mutex_lock(&f->tile_thread.lock);
             assert(!f->tile_thread.tasks_left);
-            if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) {
+            if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr.tiling.cols) {
                 // we can (or in fact, if >, we need to) do full tile decoding.
                 // loopfilter happens below
-                num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
+                num_tasks = f->frame_hdr.tiling.cols * f->frame_hdr.tiling.rows;
             } else {
                 // we need to interleave sbrow decoding for all tile cols in a
                 // tile row, since otherwise subsequent threads will be blocked
                 // waiting for the post-filter to complete
-                num_tasks = f->sbh * f->frame_hdr->tiling.cols;
+                num_tasks = f->sbh * f->frame_hdr.tiling.cols;
             }
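// Worked example of the task split above (illustrative numbers only): with
// tiling.cols = 4, tiling.rows = 2 and sbh = 16, the full-tile case queues
// cols * rows = 8 tasks (one per tile), while the interleaved case queues
// sbh * cols = 64 tasks (one per superblock row per tile column), which
// avoids workers blocking on the row-ordered post-filter below.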
             f->tile_thread.num_tasks = f->tile_thread.tasks_left = num_tasks;
             pthread_cond_broadcast(&f->tile_thread.cond);
             pthread_mutex_unlock(&f->tile_thread.lock);
 
             // loopfilter + cdef + restoration
-            for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
-                for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
-                     sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
+            for (int tile_row = 0; tile_row < f->frame_hdr.tiling.rows; tile_row++) {
+                for (int sby = f->frame_hdr.tiling.row_start_sb[tile_row];
+                     sby < f->frame_hdr.tiling.row_start_sb[tile_row + 1]; sby++)
                 {
-                    for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
+                    for (int tile_col = 0; tile_col < f->frame_hdr.tiling.cols;
                          tile_col++)
                     {
                         int progress;
                         Dav1dTileState *const ts =
-                            &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
+                            &f->ts[tile_row * f->frame_hdr.tiling.cols + tile_col];
 
                         if ((progress = atomic_load(&ts->progress)) <= sby) {
                             pthread_mutex_lock(&ts->tile_thread.lock);
                             while ((progress = atomic_load(&ts->progress)) <= sby)
                                 pthread_cond_wait(&ts->tile_thread.cond,
                                                   &ts->tile_thread.lock);
                             pthread_mutex_unlock(&ts->tile_thread.lock);
                         }
                         if (progress == TILE_ERROR) {
-                            dav1d_thread_picture_signal(&f->sr_cur, FRAME_ERROR,
-                                                        PLANE_TYPE_ALL);
+                            dav1d_thread_picture_signal(&f->cur, FRAME_ERROR,
+                                                        progress_plane_type);
                             const uint64_t all_mask = ~0ULL >> (64 - f->n_tc);
                             pthread_mutex_lock(&f->tile_thread.lock);
                             while (f->tile_thread.available != all_mask)
                                 pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock);
                             pthread_mutex_unlock(&f->tile_thread.lock);
                             goto error;
                         }
                     }
 
                     // loopfilter + cdef + restoration
                     if (f->frame_thread.pass != 1)
                         f->bd_fn.filter_sbrow(f, sby);
-                    dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
+                    dav1d_thread_picture_signal(&f->cur, (sby + 1) * f->sb_step * 4,
                                                 progress_plane_type);
                 }
             }
 
             const uint64_t all_mask = ~0ULL >> (64 - f->n_tc);
             pthread_mutex_lock(&f->tile_thread.lock);
             while (f->tile_thread.available != all_mask)
                 pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock);
             pthread_mutex_unlock(&f->tile_thread.lock);
         }
 
-        if (f->frame_thread.pass <= 1 && f->frame_hdr->refresh_context) {
+        if (f->frame_thread.pass <= 1 && f->frame_hdr.refresh_context) {
             // cdf update
             if (update_set)
-                dav1d_update_tile_cdf(f->frame_hdr, f->out_cdf.cdf,
-                                      &f->ts[f->frame_hdr->tiling.update].cdf);
+                dav1d_update_tile_cdf(&f->frame_hdr, f->out_cdf.cdf,
+                                      &f->ts[f->frame_hdr.tiling.update].cdf);
             dav1d_cdf_thread_signal(&f->out_cdf);
         }
         if (f->frame_thread.pass == 1) {
             assert(c->n_fc > 1);
             for (int tile_idx = 0;
-                 tile_idx < f->frame_hdr->tiling.rows * f->frame_hdr->tiling.cols;
+                 tile_idx < f->frame_hdr.tiling.rows * f->frame_hdr.tiling.cols;
                  tile_idx++)
             {
                 Dav1dTileState *const ts = &f->ts[tile_idx];
                 const int tile_start_off = f->frame_thread.tile_start_off[tile_idx];
                 ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * 2];
                 ts->frame_thread.cf = &((int32_t *) f->frame_thread.cf)[tile_start_off * 3];
                 if (f->n_tc > 0) {
-                    unsigned row_sb_start = f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
+                    unsigned row_sb_start = f->frame_hdr.tiling.row_start_sb[ts->tiling.row];
                     atomic_init(&ts->progress, row_sb_start);
                 }
             }
         }
     }
 
     retval = 0;
 error:
-    dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
+    dav1d_thread_picture_signal(&f->cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
                                 PLANE_TYPE_ALL);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
         dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
 
-    dav1d_picture_unref(&f->cur);
-    dav1d_thread_picture_unref(&f->sr_cur);
+    dav1d_thread_picture_unref(&f->cur);
     dav1d_cdf_thread_unref(&f->in_cdf);
-    if (f->frame_hdr->refresh_context) {
+    if (f->frame_hdr.refresh_context) {
         dav1d_cdf_thread_signal(&f->out_cdf);
         dav1d_cdf_thread_unref(&f->out_cdf);
     }
     dav1d_ref_dec(&f->cur_segmap_ref);
     dav1d_ref_dec(&f->prev_segmap_ref);
     dav1d_ref_dec(&f->mvs_ref);
-    dav1d_ref_dec(&f->seq_hdr_ref);
-    dav1d_ref_dec(&f->frame_hdr_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
         dav1d_data_unref(&f->tile[i].data);
 
     return retval;
 }
 
-static int get_upscale_x0(const int in_w, const int out_w, const int step) {
-    const int err = out_w * step - (in_w << 14);
-    const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
-    return x0 & 0x3fff;
-}
-
 int dav1d_submit_frame(Dav1dContext *const c) {
     Dav1dFrameContext *f;
     int res = -1;
 
     // wait for c->out_delayed[next] and move into c->out if visible
     Dav1dThreadPicture *out_delayed;
     if (c->n_fc > 1) {
         const unsigned next = c->frame_thread.next++;
@@ -2982,39 +2892,33 @@ int dav1d_submit_frame(Dav1dContext *con
 
         f = &c->fc[next];
         pthread_mutex_lock(&f->frame_thread.td.lock);
         while (f->n_tile_data > 0)
             pthread_cond_wait(&f->frame_thread.td.cond,
                               &f->frame_thread.td.lock);
         out_delayed = &c->frame_thread.out_delayed[next];
         if (out_delayed->p.data[0]) {
-            if (out_delayed->visible)
+            if (out_delayed->visible && !out_delayed->flushed)
                 dav1d_picture_ref(&c->out, &out_delayed->p);
             dav1d_thread_picture_unref(out_delayed);
         }
     } else {
         f = c->fc;
     }
 
     f->seq_hdr = c->seq_hdr;
-    f->seq_hdr_ref = c->seq_hdr_ref;
-    dav1d_ref_inc(f->seq_hdr_ref);
     f->frame_hdr = c->frame_hdr;
-    f->frame_hdr_ref = c->frame_hdr_ref;
-    c->frame_hdr = NULL;
-    c->frame_hdr_ref = NULL;
-    f->dsp = &c->dsp[f->seq_hdr->hbd];
-
-    const int bpc = 8 + 2 * f->seq_hdr->hbd;
+    const int bd_idx = (f->seq_hdr.bpc - 8) >> 1;
+    f->dsp = &c->dsp[bd_idx];
 
     if (!f->dsp->ipred.intra_pred[DC_PRED]) {
-        Dav1dDSPContext *const dsp = &c->dsp[f->seq_hdr->hbd];
+        Dav1dDSPContext *const dsp = &c->dsp[bd_idx];
 
-        switch (bpc) {
+        switch (f->seq_hdr.bpc) {
 #define assign_bitdepth_case(bd) \
         case bd: \
             dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
             dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
             dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \
             dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
             dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \
             dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
@@ -3023,172 +2927,153 @@ int dav1d_submit_frame(Dav1dContext *con
         assign_bitdepth_case(8);
 #endif
 #if CONFIG_10BPC
         assign_bitdepth_case(10);
 #endif
 #undef assign_bitdepth_case
         default:
             fprintf(stderr, "Compiled without support for %d-bit decoding\n",
-                    8 + 2 * f->seq_hdr->hbd);
+                    f->seq_hdr.bpc);
             res = -ENOPROTOOPT;
             goto error;
         }
     }
 
 #define assign_bitdepth_case(bd) \
         f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
         f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
         f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
         f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
         f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
-    if (!f->seq_hdr->hbd) {
+    if (f->seq_hdr.bpc <= 8) {
 #if CONFIG_8BPC
         assign_bitdepth_case(8);
 #endif
     } else {
 #if CONFIG_10BPC
         assign_bitdepth_case(16);
 #endif
     }
 #undef assign_bitdepth_case
 
-    int ref_coded_width[7];
-    if (f->frame_hdr->frame_type & 1) {
-        if (f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE) {
-            const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+    if (f->frame_hdr.frame_type & 1) {
+        if (f->frame_hdr.primary_ref_frame != PRIMARY_REF_NONE) {
+            const int pri_ref = f->frame_hdr.refidx[f->frame_hdr.primary_ref_frame];
             if (!c->refs[pri_ref].p.p.data[0]) {
                 res = -EINVAL;
                 goto error;
             }
         }
         for (int i = 0; i < 7; i++) {
-            const int refidx = f->frame_hdr->refidx[i];
+            const int refidx = f->frame_hdr.refidx[i];
             if (!c->refs[refidx].p.p.data[0] ||
-                f->frame_hdr->width[0] * 2 < c->refs[refidx].p.p.p.w ||
-                f->frame_hdr->height * 2 < c->refs[refidx].p.p.p.h ||
-                f->frame_hdr->width[0] > c->refs[refidx].p.p.p.w * 16 ||
-                f->frame_hdr->height > c->refs[refidx].p.p.p.h * 16 ||
-                f->seq_hdr->layout != c->refs[refidx].p.p.p.layout ||
-                bpc != c->refs[refidx].p.p.p.bpc)
+                f->frame_hdr.width * 2 < c->refs[refidx].p.p.p.w ||
+                f->frame_hdr.height * 2 < c->refs[refidx].p.p.p.h ||
+                f->frame_hdr.width > c->refs[refidx].p.p.p.w * 16 ||
+                f->frame_hdr.height > c->refs[refidx].p.p.p.h * 16 ||
+                f->seq_hdr.layout != c->refs[refidx].p.p.p.layout ||
+                f->seq_hdr.bpc != c->refs[refidx].p.p.p.bpc)
             {
                 for (int j = 0; j < i; j++)
                     dav1d_thread_picture_unref(&f->refp[j]);
                 res = -EINVAL;
                 goto error;
             }
             dav1d_thread_picture_ref(&f->refp[i], &c->refs[refidx].p);
-            ref_coded_width[i] = c->refs[refidx].p.p.frame_hdr->width[0];
-            if (f->frame_hdr->width[0] != c->refs[refidx].p.p.p.w ||
-                f->frame_hdr->height != c->refs[refidx].p.p.p.h)
+            if (f->frame_hdr.width  != c->refs[refidx].p.p.p.w ||
+                f->frame_hdr.height != c->refs[refidx].p.p.p.h)
             {
 #define scale_fac(ref_sz, this_sz) \
-    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
+    (((ref_sz << 14) + (this_sz >> 1)) / this_sz)
                 f->svc[i][0].scale = scale_fac(c->refs[refidx].p.p.p.w,
-                                               f->frame_hdr->width[0]);
+                                               f->frame_hdr.width);
                 f->svc[i][1].scale = scale_fac(c->refs[refidx].p.p.p.h,
-                                               f->frame_hdr->height);
+                                               f->frame_hdr.height);
+#undef scale_fac
                 f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
                 f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
             } else {
                 f->svc[i][0].scale = 0;
             }
-            f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
-                                     !f->frame_hdr->force_integer_mv &&
-                                     !dav1d_get_shear_params(&f->frame_hdr->gmv[i]);
         }
     }
 
     // setup entropy
-    if (f->frame_hdr->primary_ref_frame == DAV1D_PRIMARY_REF_NONE) {
-        dav1d_init_states(&f->in_cdf, f->frame_hdr->quant.yac);
+    if (f->frame_hdr.primary_ref_frame == PRIMARY_REF_NONE) {
+        dav1d_init_states(&f->in_cdf, f->frame_hdr.quant.yac);
     } else {
-        const int pri_ref = f->frame_hdr->refidx[f->frame_hdr->primary_ref_frame];
+        const int pri_ref = f->frame_hdr.refidx[f->frame_hdr.primary_ref_frame];
         dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
     }
-    if (f->frame_hdr->refresh_context) {
+    if (f->frame_hdr.refresh_context) {
         dav1d_cdf_thread_alloc(&f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
     }
 
     // FIXME qsort so tiles are in order (for frame threading)
     memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
     memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
     f->n_tile_data = c->n_tile_data;
     c->n_tile_data = 0;
 
     // allocate frame
-    res = dav1d_thread_picture_alloc(&f->sr_cur, f->frame_hdr->width[1],
-                                     f->frame_hdr->height,
-                                     f->seq_hdr->layout, bpc,
-                                     c->n_fc > 1 ? &f->frame_thread.td : NULL,
-                                     f->frame_hdr->show_frame, &c->allocator);
-    if (res < 0) goto error;
-
-    f->sr_cur.p.m = f->tile[0].data.m;
-    f->sr_cur.p.frame_hdr = f->frame_hdr;
-    f->sr_cur.p.frame_hdr_ref = f->frame_hdr_ref;
-    dav1d_ref_inc(f->frame_hdr_ref);
-    f->sr_cur.p.seq_hdr = f->seq_hdr;
-    f->sr_cur.p.seq_hdr_ref = f->seq_hdr_ref;
-    dav1d_ref_inc(f->seq_hdr_ref);
-
-    if (f->frame_hdr->super_res.enabled) {
-        res = dav1d_picture_alloc_copy(&f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
-        if (res < 0) goto error;
-    } else {
-        dav1d_picture_ref(&f->cur, &f->sr_cur.p);
+    if ((res = dav1d_thread_picture_alloc(&f->cur, f->frame_hdr.width,
+                                          f->frame_hdr.height,
+                                          f->seq_hdr.layout, f->seq_hdr.bpc,
+                                          c->n_fc > 1 ? &f->frame_thread.td : NULL,
+                                          f->frame_hdr.show_frame,
+                                          &c->allocator)) < 0)
+    {
+        goto error;
     }
 
-    if (f->frame_hdr->super_res.enabled) {
-        f->resize_step[0] = scale_fac(f->cur.p.w, f->sr_cur.p.p.w);
-        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int in_cw = (f->cur.p.w + ss_hor) >> ss_hor;
-        const int out_cw = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
-        f->resize_step[1] = scale_fac(in_cw, out_cw);
-#undef scale_fac
-        f->resize_start[0] = get_upscale_x0(f->cur.p.w, f->sr_cur.p.p.w, f->resize_step[0]);
-        f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
-    }
+    f->cur.p.poc = f->frame_hdr.frame_offset;
+    f->cur.p.p.type = f->frame_hdr.frame_type;
+    f->cur.p.p.pri = f->seq_hdr.pri;
+    f->cur.p.p.trc = f->seq_hdr.trc;
+    f->cur.p.p.mtrx = f->seq_hdr.mtrx;
+    f->cur.p.p.chr = f->seq_hdr.chr;
+    f->cur.p.p.fullrange = f->seq_hdr.color_range;
 
     // move f->cur into output queue
     if (c->n_fc == 1) {
-        if (f->frame_hdr->show_frame)
-            dav1d_picture_ref(&c->out, &f->sr_cur.p);
+        if (f->frame_hdr.show_frame)
+            dav1d_picture_ref(&c->out, &f->cur.p);
     } else {
-        dav1d_thread_picture_ref(out_delayed, &f->sr_cur);
+        dav1d_thread_picture_ref(out_delayed, &f->cur);
     }
 
-    f->w4 = (f->frame_hdr->width[0] + 3) >> 2;
-    f->h4 = (f->frame_hdr->height + 3) >> 2;
-    f->bw = ((f->frame_hdr->width[0] + 7) >> 3) << 1;
-    f->bh = ((f->frame_hdr->height + 7) >> 3) << 1;
+    f->w4 = (f->frame_hdr.width + 3) >> 2;
+    f->h4 = (f->frame_hdr.height + 3) >> 2;
+    f->bw = ((f->frame_hdr.width + 7) >> 3) << 1;
+    f->bh = ((f->frame_hdr.height + 7) >> 3) << 1;
     f->sb128w = (f->bw + 31) >> 5;
     f->sb128h = (f->bh + 31) >> 5;
-    f->sb_shift = 4 + f->seq_hdr->sb128;
-    f->sb_step = 16 << f->seq_hdr->sb128;
+    f->sb_shift = 4 + f->seq_hdr.sb128;
+    f->sb_step = 16 << f->seq_hdr.sb128;
     f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
     f->b4_stride = (f->bw + 31) & ~31;
 
     // ref_mvs
-    if ((f->frame_hdr->frame_type & 1) || f->frame_hdr->allow_intrabc) {
+    if ((f->frame_hdr.frame_type & 1) || f->frame_hdr.allow_intrabc) {
         f->mvs_ref = dav1d_ref_create(f->sb128h * 32 * f->b4_stride *
                                       sizeof(*f->mvs));
         f->mvs = f->mvs_ref->data;
-        if (!f->frame_hdr->allow_intrabc) {
+        if (!f->frame_hdr.allow_intrabc) {
             for (int i = 0; i < 7; i++)
-                f->refpoc[i] = f->refp[i].p.frame_hdr->frame_offset;
+                f->refpoc[i] = f->refp[i].p.poc;
         } else {
             memset(f->refpoc, 0, sizeof(f->refpoc));
         }
-        if (f->frame_hdr->use_ref_frame_mvs) {
+        if (f->frame_hdr.use_ref_frame_mvs) {
             for (int i = 0; i < 7; i++) {
-                const int refidx = f->frame_hdr->refidx[i];
+                const int refidx = f->frame_hdr.refidx[i];
                 if (c->refs[refidx].refmvs != NULL &&
-                    ref_coded_width[i] == f->cur.p.w &&
-                    f->refp[i].p.p.h == f->cur.p.h)
+                    f->refp[i].p.p.w == f->cur.p.p.w &&
+                    f->refp[i].p.p.h == f->cur.p.p.h)
                 {
                     f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
                     dav1d_ref_inc(f->ref_mvs_ref[i]);
                     f->ref_mvs[i] = c->refs[refidx].refmvs->data;
                 } else {
                     f->ref_mvs[i] = NULL;
                     f->ref_mvs_ref[i] = NULL;
                 }
@@ -3199,38 +3084,46 @@ int dav1d_submit_frame(Dav1dContext *con
             memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
         }
     } else {
         f->mvs_ref = NULL;
         memset(f->ref_mvs_ref, 0, sizeof(f->ref_mvs_ref));
     }
 
     // segmap
-    if (f->frame_hdr->segmentation.enabled) {
+    if (f->frame_hdr.segmentation.enabled) {
+
         // By default, the previous segmentation map is not initialised.
         f->prev_segmap_ref = NULL;
         f->prev_segmap = NULL;
 
         // We might need a previous frame's segmentation map. This
         // happens if there is either no update or a temporal update.
-        if (f->frame_hdr->segmentation.temporal || !f->frame_hdr->segmentation.update_map) {
-            const int pri_ref = f->frame_hdr->primary_ref_frame;
-            assert(pri_ref != DAV1D_PRIMARY_REF_NONE);
-            const int ref_w = ((ref_coded_width[pri_ref] + 7) >> 3) << 1;
+        if (f->frame_hdr.segmentation.temporal || !f->frame_hdr.segmentation.update_map) {
+            const int pri_ref = f->frame_hdr.primary_ref_frame;
+            assert(pri_ref != PRIMARY_REF_NONE);
+            const int ref_w = ((f->refp[pri_ref].p.p.w + 7) >> 3) << 1;
             const int ref_h = ((f->refp[pri_ref].p.p.h + 7) >> 3) << 1;
             if (ref_w == f->bw && ref_h == f->bh) {
-                f->prev_segmap_ref = c->refs[f->frame_hdr->refidx[pri_ref]].segmap;
+                f->prev_segmap_ref = c->refs[f->frame_hdr.refidx[pri_ref]].segmap;
                 if (f->prev_segmap_ref) {
                     dav1d_ref_inc(f->prev_segmap_ref);
                     f->prev_segmap = f->prev_segmap_ref->data;
                 }
             }
+            // It is an error to signal a temporal update if the
+            // previous frame was the wrong size or had no
+            // segmentation data.
+            if (f->frame_hdr.segmentation.temporal && !f->prev_segmap_ref) {
+                res = -EINVAL;
+                goto error;
+            }
         }
 
-        if (f->frame_hdr->segmentation.update_map) {
+        if (f->frame_hdr.segmentation.update_map) {
             // We're updating an existing map, but need somewhere to
             // put the new values. Allocate them here (the data
             // actually gets set elsewhere)
             f->cur_segmap_ref = dav1d_ref_create(f->b4_stride * 32 * f->sb128h);
             f->cur_segmap = f->cur_segmap_ref->data;
         } else if (f->prev_segmap_ref) {
             // We're not updating an existing map, and we have a valid
             // reference. Use that.
@@ -3246,48 +3139,53 @@ int dav1d_submit_frame(Dav1dContext *con
     } else {
         f->cur_segmap = NULL;
         f->cur_segmap_ref = NULL;
         f->prev_segmap_ref = NULL;
     }
 
     // update references etc.
     for (int i = 0; i < 8; i++) {
-        if (f->frame_hdr->refresh_frame_flags & (1 << i)) {
+        if (f->frame_hdr.refresh_frame_flags & (1 << i)) {
             if (c->refs[i].p.p.data[0])
                 dav1d_thread_picture_unref(&c->refs[i].p);
-            dav1d_thread_picture_ref(&c->refs[i].p, &f->sr_cur);
+            dav1d_thread_picture_ref(&c->refs[i].p, &f->cur);
 
             if (c->cdf[i].cdf) dav1d_cdf_thread_unref(&c->cdf[i]);
-            if (f->frame_hdr->refresh_context) {
+            if (f->frame_hdr.refresh_context) {
                 dav1d_cdf_thread_ref(&c->cdf[i], &f->out_cdf);
             } else {
                 dav1d_cdf_thread_ref(&c->cdf[i], &f->in_cdf);
             }
+            c->refs[i].lf_mode_ref_deltas =
+                f->frame_hdr.loopfilter.mode_ref_deltas;
+            c->refs[i].seg_data = f->frame_hdr.segmentation.seg_data;
+            memcpy(c->refs[i].gmv, f->frame_hdr.gmv, sizeof(c->refs[i].gmv));
+            c->refs[i].film_grain = f->frame_hdr.film_grain.data;
 
             dav1d_ref_dec(&c->refs[i].segmap);
             c->refs[i].segmap = f->cur_segmap_ref;
             if (f->cur_segmap_ref)
                 dav1d_ref_inc(f->cur_segmap_ref);
             dav1d_ref_dec(&c->refs[i].refmvs);
-            if (!f->frame_hdr->allow_intrabc) {
+            if (!f->frame_hdr.allow_intrabc) {
                 c->refs[i].refmvs = f->mvs_ref;
                 if (f->mvs_ref)
                     dav1d_ref_inc(f->mvs_ref);
             }
             memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
+            c->refs[i].qidx = f->frame_hdr.quant.yac;
         }
     }
 
     if (c->n_fc == 1) {
-        const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
         if ((res = dav1d_decode_frame(f)) < 0) {
             dav1d_picture_unref(&c->out);
             for (int i = 0; i < 8; i++) {
-                if (refresh_frame_flags & (1 << i)) {
+                if (f->frame_hdr.refresh_frame_flags & (1 << i)) {
                     if (c->refs[i].p.p.data[0])
                         dav1d_thread_picture_unref(&c->refs[i].p);
                     if (c->cdf[i].cdf)
                         dav1d_cdf_thread_unref(&c->cdf[i]);
                     dav1d_ref_dec(&c->refs[i].segmap);
                     dav1d_ref_dec(&c->refs[i].refmvs);
                 }
             }
@@ -3296,29 +3194,26 @@ int dav1d_submit_frame(Dav1dContext *con
     } else {
         pthread_cond_signal(&f->frame_thread.td.cond);
         pthread_mutex_unlock(&f->frame_thread.td.lock);
     }
 
     return 0;
 error:
     dav1d_cdf_thread_unref(&f->in_cdf);
-    if (f->frame_hdr->refresh_context)
+    if (f->frame_hdr.refresh_context)
         dav1d_cdf_thread_unref(&f->out_cdf);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
         dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
     dav1d_picture_unref(&c->out);
-    dav1d_picture_unref(&f->cur);
-    dav1d_thread_picture_unref(&f->sr_cur);
+    dav1d_thread_picture_unref(&f->cur);
     dav1d_ref_dec(&f->mvs_ref);
-    dav1d_ref_dec(&f->seq_hdr_ref);
-    dav1d_ref_dec(&f->frame_hdr_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
         dav1d_data_unref(&f->tile[i].data);
     f->n_tile_data = 0;
 
     if (c->n_fc > 1) {
         pthread_cond_signal(&f->frame_thread.td.cond);
         pthread_mutex_unlock(&f->frame_thread.td.lock);
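
For reference, a minimal standalone sketch (not part of the patch) of the Q14 scale factor and Q10 step that the decode.c hunks above restore for scaled references via the scale_fac macro and the svc[] setup. The function name, variable names, and the demo sizes below are illustrative only.

    #include <stdio.h>

    /* rounded ratio ref_sz / this_sz in 14-bit fixed point,
     * matching the restored scale_fac macro above */
    static int ref_scale_q14(int ref_sz, int this_sz) {
        return ((ref_sz << 14) + (this_sz >> 1)) / this_sz;
    }

    int main(void) {
        const int ref_w = 1920, cur_w = 960;   /* reference twice as wide */
        const int scale = ref_scale_q14(ref_w, cur_w);
        const int step  = (scale + 8) >> 4;    /* Q10, as in the svc[] setup */
        printf("scale=%d (Q14), step=%d (Q10)\n", scale, step);
        /* equal sizes give scale = 1 << 14 and step = 1 << 10 */
        return 0;
    }
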
--- a/third_party/dav1d/src/env.h
+++ b/third_party/dav1d/src/env.h
@@ -86,58 +86,55 @@ static inline int get_partition_ctx(cons
                                     const enum BlockLevel bl,
                                     const int yb8, const int xb8)
 {
     return ((a->partition[xb8] >> (4 - bl)) & 1) +
           (((l->partition[yb8] >> (4 - bl)) & 1) << 1);
 }
 
 static inline unsigned cdf_element_prob(const uint16_t *const cdf, const int e) {
-    assert(e > 0);
-    return cdf[e - 1] - cdf[e];
+    return (e > 0 ? cdf[e - 1] : 32768) - cdf[e];
 }
 
 static inline unsigned gather_left_partition_prob(const uint16_t *const in,
                                                   const enum BlockLevel bl)
 {
-    unsigned out = 0;
-    out += cdf_element_prob(in, PARTITION_H);
+    unsigned out = 32768;
+    out -= cdf_element_prob(in, PARTITION_H);
     if (bl != BL_128X128)
-        out += cdf_element_prob(in, PARTITION_H4);
-    // Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
-    //  PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
-    out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
-    return out;
+        out -= cdf_element_prob(in, PARTITION_H4);
+    out -= cdf_element_prob(in, PARTITION_SPLIT);
+    out -= cdf_element_prob(in, PARTITION_T_TOP_SPLIT);
+    out -= cdf_element_prob(in, PARTITION_T_BOTTOM_SPLIT);
+    out -= cdf_element_prob(in, PARTITION_T_LEFT_SPLIT);
+    return 32768 - out;
 }
 
 static inline unsigned gather_top_partition_prob(const uint16_t *const in,
                                                  const enum BlockLevel bl)
 {
-    unsigned out = 0;
+    unsigned out = 32768;
+    out -= cdf_element_prob(in, PARTITION_V);
     if (bl != BL_128X128)
-        out += cdf_element_prob(in, PARTITION_V4);
-    // Exploit the fact that cdfs for PARTITION_T_LEFT_SPLIT and PARTITION_T_RIGHT_SPLIT,
-    //  and PARTITION_V, PARTITION_SPLIT and PARTITION_T_TOP_SPLIT are neighbors.
-    out += in[PARTITION_T_LEFT_SPLIT - 1] - in[PARTITION_T_RIGHT_SPLIT];
-    out += in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
-    return out;
+        out -= cdf_element_prob(in, PARTITION_V4);
+    out -= cdf_element_prob(in, PARTITION_SPLIT);
+    out -= cdf_element_prob(in, PARTITION_T_TOP_SPLIT);
+    out -= cdf_element_prob(in, PARTITION_T_LEFT_SPLIT);
+    out -= cdf_element_prob(in, PARTITION_T_RIGHT_SPLIT);
+    return 32768 - out;
 }
 
 static inline enum TxfmTypeSet get_ext_txtp_set(const enum RectTxfmSize tx,
                                                 const int inter,
-                                                const Dav1dFrameHeader *const hdr,
+                                                const Av1FrameHeader *const hdr,
                                                 const int seg_id)
 {
-    if (!hdr->segmentation.qidx[seg_id]) {
-        if (hdr->segmentation.lossless[seg_id]) {
-            assert(tx == (int) TX_4X4);
-            return TXTP_SET_LOSSLESS;
-        } else {
-            return TXTP_SET_DCT;
-        }
+    if (hdr->segmentation.lossless[seg_id]) {
+        assert(tx == (int) TX_4X4);
+        return TXTP_SET_LOSSLESS;
     }
 
     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
 
     if (t_dim->max >= TX_64X64)
         return TXTP_SET_DCT;
 
     if (t_dim->max == TX_32X32)
@@ -151,32 +148,32 @@ static inline enum TxfmTypeSet get_ext_t
     if (inter)
         return txsqsz == TX_16X16 ? TXTP_SET_DT9_ID_1D : TXTP_SET_ALL;
     else
         return txsqsz == TX_16X16 ? TXTP_SET_DT4_ID : TXTP_SET_DT4_ID_1D;
 }
 
 static inline enum TxfmType get_uv_intra_txtp(const enum IntraPredMode uv_mode,
                                               const enum RectTxfmSize tx,
-                                              const Dav1dFrameHeader *const hdr,
+                                              const Av1FrameHeader *const hdr,
                                               const int seg_id)
 {
     if (hdr->segmentation.lossless[seg_id]) {
         assert(tx == (int) TX_4X4);
         return WHT_WHT;
     }
 
     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
 
     return t_dim->max == TX_32X32 ? DCT_DCT : dav1d_txtp_from_uvmode[uv_mode];
 }
 
 static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
                                               const enum TxfmType ytxtp,
-                                              const Dav1dFrameHeader *const hdr,
+                                              const Av1FrameHeader *const hdr,
                                               const int seg_id)
 {
     if (hdr->segmentation.lossless[seg_id]) {
         assert(uvt_dim->max == TX_4X4);
         return WHT_WHT;
     }
 
     if (uvt_dim->max == TX_32X32)
@@ -192,28 +189,28 @@ static inline enum TxfmType get_uv_inter
 }
 
 static inline int get_filter_ctx(const BlockContext *const a,
                                  const BlockContext *const l,
                                  const int comp, const int dir, const int ref,
                                  const int yb4, const int xb4)
 {
     const int a_filter = (a->ref[0][xb4] == ref || a->ref[1][xb4] == ref) ?
-                         a->filter[dir][xb4] : DAV1D_N_SWITCHABLE_FILTERS;
+                         a->filter[dir][xb4] : N_SWITCHABLE_FILTERS;
     const int l_filter = (l->ref[0][yb4] == ref || l->ref[1][yb4] == ref) ?
-                         l->filter[dir][yb4] : DAV1D_N_SWITCHABLE_FILTERS;
+                         l->filter[dir][yb4] : N_SWITCHABLE_FILTERS;
 
     if (a_filter == l_filter) {
         return comp * 4 + a_filter;
-    } else if (a_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+    } else if (a_filter == N_SWITCHABLE_FILTERS) {
         return comp * 4 + l_filter;
-    } else if (l_filter == DAV1D_N_SWITCHABLE_FILTERS) {
+    } else if (l_filter == N_SWITCHABLE_FILTERS) {
         return comp * 4 + a_filter;
     } else {
-        return comp * 4 + DAV1D_N_SWITCHABLE_FILTERS;
+        return comp * 4 + N_SWITCHABLE_FILTERS;
     }
 }
 
 static inline int get_comp_ctx(const BlockContext *const a,
                                const BlockContext *const l,
                                const int yb4, const int xb4,
                                const int have_top, const int have_left)
 {
@@ -714,44 +711,44 @@ static inline int get_br_ctx(const uint8
         break;
     case TX_CLASS_V:
         if (y == 0) return mag + 7;
         break;
     }
     return mag + 14;
 }
 
-static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
+static inline mv get_gmv_2d(const WarpedMotionParams *const gmv,
                             const int bx4, const int by4,
                             const int bw4, const int bh4,
-                            const Dav1dFrameHeader *const hdr)
+                            const Av1FrameHeader *const hdr)
 {
     switch (gmv->type) {
-    case DAV1D_WM_TYPE_ROT_ZOOM:
+    case WM_TYPE_ROT_ZOOM:
         assert(gmv->matrix[5] ==  gmv->matrix[2]);
         assert(gmv->matrix[4] == -gmv->matrix[3]);
         // fall-through
     default:
-    case DAV1D_WM_TYPE_AFFINE: {
+    case WM_TYPE_AFFINE: {
         const int x = bx4 * 4 + bw4 * 2 - 1;
         const int y = by4 * 4 + bh4 * 2 - 1;
         const int xc = (gmv->matrix[2] - (1 << 16)) * x +
                        gmv->matrix[3] * y + gmv->matrix[0];
         const int yc = (gmv->matrix[5] - (1 << 16)) * y +
                        gmv->matrix[4] * x + gmv->matrix[1];
         const int shift = 16 - (3 - !hdr->hp);
         const int round = (1 << shift) >> 1;
         return (mv) {
             .y = apply_sign(((abs(yc) + round) >> shift) << !hdr->hp, yc),
             .x = apply_sign(((abs(xc) + round) >> shift) << !hdr->hp, xc),
         };
     }
-    case DAV1D_WM_TYPE_TRANSLATION:
+    case WM_TYPE_TRANSLATION:
         return (mv) {
             .y = gmv->matrix[0] >> 13,
             .x = gmv->matrix[1] >> 13,
         };
-    case DAV1D_WM_TYPE_IDENTITY:
+    case WM_TYPE_IDENTITY:
         return (mv) { .x = 0, .y = 0 };
     }
 }
 
 #endif /* __DAV1D_SRC_ENV_H__ */
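
For reference, a minimal standalone sketch (not part of the patch) of the per-element probability lookup that the env.h hunk above restores: cdf_element_prob() treats the implicit cdf[-1] as 32768, and the partition gathers subtract element probabilities from that total. This assumes, as the 32768 constant in the hunk suggests, a 15-bit inverse CDF; the example table and names below are hypothetical.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* same shape as the restored cdf_element_prob() */
    static unsigned element_prob(const uint16_t *cdf, int e) {
        return (e > 0 ? cdf[e - 1] : 32768) - cdf[e];
    }

    int main(void) {
        /* 4-symbol example: cdf[i] holds the mass remaining after symbol i */
        const uint16_t cdf[4] = { 20000, 12000, 4000, 0 };
        unsigned sum = 0;
        for (int e = 0; e < 4; e++)
            sum += element_prob(cdf, e);
        assert(sum == 32768);  /* element probabilities partition 32768 */
        printf("p(0)=%u p(3)=%u\n", element_prob(cdf, 0), element_prob(cdf, 3));
        return 0;
    }
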
deleted file mode 100644
--- a/third_party/dav1d/src/film_grain.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __DAV1D_SRC_FILM_GRAIN_H__
-#define __DAV1D_SRC_FILM_GRAIN_H__
-
-#include "dav1d/dav1d.h"
-
-void dav1d_apply_grain_8bpc(Dav1dPicture *const out,
-                            const Dav1dPicture *const in);
-
-void dav1d_apply_grain_10bpc(Dav1dPicture *const out,
-                             const Dav1dPicture *const in);
-
-#endif /* __DAV1D_SRC_FILM_GRAIN_H__ */
deleted file mode 100644
--- a/third_party/dav1d/src/film_grain_tmpl.c
+++ /dev/null
@@ -1,512 +0,0 @@
-/*
- * Copyright © 2018, Niklas Haas
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <assert.h>
-#include <stdint.h>
-
-#include "common.h"
-#include "common/intops.h"
-#include "common/bitdepth.h"
-#include "tables.h"
-
-#include "film_grain.h"
-
-#if BITDEPTH == 8
-typedef int8_t entry;
-#else
-typedef int16_t entry;
-#endif
-
-enum {
-    GRAIN_WIDTH  = 82,
-    GRAIN_HEIGHT = 73,
-    SUB_GRAIN_WIDTH = 44,
-    SUB_GRAIN_HEIGHT = 38,
-    SUB_GRAIN_OFFSET = 6,
-    BLOCK_SIZE = 32,
-    SCALING_SIZE = 1 << BITDEPTH,
-};
-
-static inline int get_random_number(const int bits, unsigned *state) {
-    const int r = *state;
-    unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
-    *state = (r >> 1) | (bit << 15);
-
-    return (*state >> (16 - bits)) & ((1 << bits) - 1);
-}
-
-static inline int round2(const int x, const int shift) {
-    return (x + ((1 << shift) >> 1)) >> shift;
-}
-
-enum {
-    GRAIN_CENTER = 128 << (BITDEPTH - 8),
-    GRAIN_MIN = -GRAIN_CENTER,
-    GRAIN_MAX = (256 << (BITDEPTH - 8)) - 1 - GRAIN_CENTER,
-};
-
-static void generate_grain_y(const Dav1dPicture *const in,
-                             entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])
-{
-    const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
-    unsigned seed = data->seed;
-    const int shift = 12 - BITDEPTH + data->grain_scale_shift;
-
-    for (int y = 0; y < GRAIN_HEIGHT; y++) {
-        for (int x = 0; x < GRAIN_WIDTH; x++) {
-            const int value = get_random_number(11, &seed);
-            buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
-        }
-    }
-
-    const int ar_pad = 3;
-    const int ar_lag = data->ar_coeff_lag;
-
-    for (int y = ar_pad; y < GRAIN_HEIGHT; y++) {
-        for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) {
-            const int8_t *coeff = data->ar_coeffs_y;
-            int sum = 0;
-            for (int dy = -ar_lag; dy <= 0; dy++) {
-                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
-                    if (!dx && !dy)
-                        break;
-                    sum += *(coeff++) * buf[y + dy][x + dx];
-                }
-            }
-
-            int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
-            buf[y][x] = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-        }
-    }
-}
-
-static void generate_grain_uv(const Dav1dPicture *const in, int uv,
-                              entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],
-                              entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH])
-{
-    const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
-    unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
-    const int shift = 12 - BITDEPTH + data->grain_scale_shift;
-
-    const int subx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
-
-    const int chromaW = subx ? SUB_GRAIN_WIDTH  : GRAIN_WIDTH;
-    const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
-
-    for (int y = 0; y < chromaH; y++) {
-        for (int x = 0; x < chromaW; x++) {
-            const int value = get_random_number(11, &seed);
-            buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift);
-        }
-    }
-
-    const int ar_pad = 3;
-    const int ar_lag = data->ar_coeff_lag;
-
-    for (int y = ar_pad; y < chromaH; y++) {
-        for (int x = ar_pad; x < chromaW - ar_pad; x++) {
-            const int8_t *coeff = data->ar_coeffs_uv[uv];
-            int sum = 0;
-            for (int dy = -ar_lag; dy <= 0; dy++) {
-                for (int dx = -ar_lag; dx <= ar_lag; dx++) {
-                    // For the final (current) pixel, we need to add in the
-                    // contribution from the luma grain texture
-                    if (!dx && !dy) {
-                        if (!data->num_y_points)
-                            break;
-                        int luma = 0;
-                        const int lumaX = ((x - ar_pad) << subx) + ar_pad;
-                        const int lumaY = ((y - ar_pad) << suby) + ar_pad;
-                        for (int i = 0; i <= suby; i++) {
-                            for (int j = 0; j <= subx; j++) {
-                                luma += buf_y[lumaY + i][lumaX + j];
-                            }
-                        }
-                        luma = round2(luma, subx + suby);
-                        sum += luma * (*coeff);
-                        break;
-                    }
-
-                    sum += *(coeff++) * buf[y + dy][x + dx];
-                }
-            }
-
-            const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
-            buf[y][x] = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-        }
-    }
-}
-
-static void generate_scaling(const uint8_t points[][2], int num,
-                             uint8_t scaling[SCALING_SIZE])
-{
-    const int shift_x = BITDEPTH - 8;
-
-    // Fill up the preceding entries with the initial value
-    for (int i = 0; i < points[0][0] << shift_x; i++)
-        scaling[i] = points[0][1];
-
-    // Linearly interpolate the values in the middle
-    for (int i = 0; i < num - 1; i++) {
-        const int bx = points[i][0] << shift_x;
-        const int by = points[i][1];
-        const int ex = points[i+1][0] << shift_x;
-        const int ey = points[i+1][1];
-        const int dx = ex - bx;
-        const int dy = ey - by;
-        const int delta = dy * ((0xFFFF + (dx >> 1))) / dx;
-        for (int x = 0; x < dx; x++) {
-            const int v = by + ((x * delta + 0x8000) >> 16);
-            scaling[bx + x] = v;
-        }
-    }
-
-    // Fill up the remaining entries with the final value
-    for (int i = points[num - 1][0] << shift_x; i < SCALING_SIZE; i++)
-        scaling[i] = points[num - 1][1];
-}
-
-// samples from the correct block of a grain LUT, while taking into account the
-// offsets provided by the offsets cache
-static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
-                               int offsets[2][2], int subx, int suby,
-                               int bx, int by, int x, int y)
-{
-    const int randval = offsets[bx][by];
-    const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
-    const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
-    return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
-                    [offx + x + (BLOCK_SIZE >> subx) * bx];
-}
-
-static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in,
-                           entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
-                           uint8_t scaling[SCALING_SIZE], int row_num)
-{
-    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
-    const int rows = 1 + (data->overlap_flag && row_num > 0);
-
-    int min_value, max_value;
-    if (data->clip_to_restricted_range) {
-        min_value = 16 << (BITDEPTH - 8);
-        max_value = 235 << (BITDEPTH - 8);
-    } else {
-        min_value = 0;
-        max_value = (1 << BITDEPTH) - 1;
-    }
-
-    // seed[0] contains the current row, seed[1] contains the previous
-    unsigned seed[2];
-    for (int i = 0; i < rows; i++) {
-        seed[i] = data->seed;
-        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
-        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
-    }
-
-    const ptrdiff_t stride = out->stride[0];
-    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
-    assert(stride == in->stride[0]);
-    pixel *const src_row = (pixel *)  in->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
-    pixel *const dst_row = (pixel *) out->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
-
-    int offsets[2 /* col offset */][2 /* row offset */];
-
-    // process this row in BLOCK_SIZE^2 blocks
-    const int bh = imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE);
-    for (int bx = 0; bx < out->p.w; bx += BLOCK_SIZE) {
-        const int bw = imin(BLOCK_SIZE, out->p.w - bx);
-
-        if (data->overlap_flag && bx) {
-            // shift previous offsets left
-            for (int i = 0; i < rows; i++)
-                offsets[1][i] = offsets[0][i];
-        }
-
-        // update current offsets
-        for (int i = 0; i < rows; i++)
-            offsets[0][i] = get_random_number(8, &seed[i]);
-
-        // x/y block offsets to compensate for overlapped regions
-        const int ystart = data->overlap_flag && row_num ? 2 : 0;
-        const int xstart = data->overlap_flag && bx      ? 2 : 0;
-
-        static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
-
-#define add_noise_y(x, y, grain)                                                \
-            pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x));         \
-            pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x));         \
-            int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
-            *dst = iclip(*src + noise, min_value, max_value);
-
-        for (int y = ystart; y < bh; y++) {
-            // Non-overlapped image region (straightforward)
-            for (int x = xstart; x < bw; x++) {
-                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
-                add_noise_y(x, y, grain);
-            }
-
-            // Special case for overlapped column
-            for (int x = 0; x < xstart; x++) {
-                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
-                int old   = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
-                grain = round2(old * w[x][0] + grain * w[x][1], 5);
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-                add_noise_y(x, y, grain);
-            }
-        }
-
-        for (int y = 0; y < ystart; y++) {
-            // Special case for overlapped row (sans corner)
-            for (int x = xstart; x < bw; x++) {
-                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
-                int old   = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
-                grain = round2(old * w[y][0] + grain * w[y][1], 5);
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-                add_noise_y(x, y, grain);
-            }
-
-            // Special case for doubly-overlapped corner
-            for (int x = 0; x < xstart; x++) {
-                // Blend the top pixel with the top left block
-                int top = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y);
-                int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y);
-                top = round2(old * w[x][0] + top * w[x][1], 5);
-                top = iclip(top, GRAIN_MIN, GRAIN_MAX);
-
-                // Blend the current pixel with the left block
-                int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y);
-                old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y);
-                grain = round2(old * w[x][0] + grain * w[x][1], 5);
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-
-                // Mix the row rows together and apply grain
-                grain = round2(top * w[y][0] + grain * w[y][1], 5);
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-                add_noise_y(x, y, grain);
-            }
-        }
-    }
-}
-
-static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const in,
-                            entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
-                            uint8_t scaling[SCALING_SIZE], int uv, int row_num)
-{
-    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
-    const int rows = 1 + (data->overlap_flag && row_num > 0);
-
-    int min_value, max_value;
-    if (data->clip_to_restricted_range) {
-        min_value = 16 << (BITDEPTH - 8);
-        if (out->seq_hdr->mtrx == DAV1D_MC_IDENTITY) {
-            max_value = 235 << (BITDEPTH - 8);
-        } else {
-            max_value = 240 << (BITDEPTH - 8);
-        }
-    } else {
-        min_value = 0;
-        max_value = (1 << BITDEPTH) - 1;
-    }
-
-    const int sx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int sy = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
-
-    // seed[0] contains the current row, seed[1] contains the previous
-    unsigned seed[2];
-    for (int i = 0; i < rows; i++) {
-        seed[i] = data->seed;
-        seed[i] ^= (((row_num - i) * 37  + 178) & 0xFF) << 8;
-        seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
-    }
-
-    const ptrdiff_t stride = out->stride[1];
-    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
-    assert(stride == in->stride[1]);
-
-    const int by = row_num * (BLOCK_SIZE >> sy);
-    pixel *const dst_row = (pixel *) out->data[1 + uv] + PXSTRIDE(stride) * by;
-    pixel *const src_row = (pixel *)  in->data[1 + uv] + PXSTRIDE(stride) * by;
-    pixel *const luma_row = (pixel *) out->data[0] + PXSTRIDE(out->stride[0]) * row_num * BLOCK_SIZE;
-
-    int offsets[2 /* col offset */][2 /* row offset */];
-
-    // process this row in BLOCK_SIZE^2 blocks (subsampled)
-    const int bh = (imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE) + sy) >> sy;
-    for (int bx = 0; bx < (out->p.w + sx) >> sx; bx += BLOCK_SIZE >> sx) {
-        const int bw = (imin(BLOCK_SIZE, out->p.w - (bx << sx)) + sx) >> sx;
-        if (data->overlap_flag && bx) {
-            // shift previous offsets left
-            for (int i = 0; i < rows; i++)
-                offsets[1][i] = offsets[0][i];
-        }
-
-        // update current offsets
-        for (int i = 0; i < rows; i++)
-            offsets[0][i] = get_random_number(8, &seed[i]);
-
-        // x/y block offsets to compensate for overlapped regions
-        const int ystart = data->overlap_flag && row_num ? (2 >> sy) : 0;
-        const int xstart = data->overlap_flag && bx      ? (2 >> sx) : 0;
-
-        static const int w[2 /* sub */][2 /* off */][2] = {
-            { { 27, 17 }, { 17, 27 } },
-            { { 23, 22 } },
-        };
-
-#define add_noise_uv(x, y, grain)                                               \
-            const int lx = (bx + x) << sx;                                      \
-            const int ly = y << sy;                                             \
-            pixel *luma = luma_row + ly * PXSTRIDE(out->stride[0]) + lx;        \
-            pixel avg = luma[0];                                                \
-            if (sx && lx + 1 < out->p.w)                                        \
-                avg = (avg + luma[1] + 1) >> 1;                                 \
-                                                                                \
-            pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x));         \
-            pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x));         \
-            int val = avg;                                                      \
-            if (!data->chroma_scaling_from_luma) {                              \
-                int combined = avg * data->uv_luma_mult[uv] +                   \
-                               *src * data->uv_mult[uv];                        \
-                val = iclip_pixel( (combined >> 6) +                            \
-                                   (data->uv_offset[uv] * (1 << (BITDEPTH - 8))) );   \
-            }                                                                   \
-                                                                                \
-            int noise = round2(scaling[ val ] * (grain), data->scaling_shift);  \
-            *dst = iclip(*src + noise, min_value, max_value);
-
-        for (int y = ystart; y < bh; y++) {
-            // Non-overlapped image region (straightforward)
-            for (int x = xstart; x < bw; x++) {
-                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
-                add_noise_uv(x, y, grain);
-            }
-
-            // Special case for overlapped column
-            for (int x = 0; x < xstart; x++) {
-                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
-                int old   = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
-                grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-                add_noise_uv(x, y, grain);
-            }
-        }
-
-        for (int y = 0; y < ystart; y++) {
-            // Special case for overlapped row (sans corner)
-            for (int x = xstart; x < bw; x++) {
-                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
-                int old   = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
-                grain = (old * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-                add_noise_uv(x, y, grain);
-            }
-
-            // Special case for doubly-overlapped corner
-            for (int x = 0; x < xstart; x++) {
-                // Blend the top pixel with the top left block
-                int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y);
-                int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y);
-                top = (old * w[sx][x][0] + top * w[sx][x][1] + 16) >> 5;
-                top = iclip(top, GRAIN_MIN, GRAIN_MAX);
-
-                // Blend the current pixel with the left block
-                int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y);
-                old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y);
-                grain = (old * w[sx][x][0] + grain * w[sx][x][1] + 16) >> 5;
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-
-                // Mix the row rows together and apply to image
-                grain = (top * w[sy][y][0] + grain * w[sy][y][1] + 16) >> 5;
-                grain = iclip(grain, GRAIN_MIN, GRAIN_MAX);
-                add_noise_uv(x, y, grain);
-            }
-        }
-    }
-}
-
-void bitfn(dav1d_apply_grain)(Dav1dPicture *const out,
-                              const Dav1dPicture *const in)
-{
-    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
-
-    entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];
-    uint8_t scaling[3][SCALING_SIZE];
-
-    // Generate grain LUTs as needed
-    generate_grain_y(out, grain_lut[0]); // always needed
-    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
-        generate_grain_uv(out, 0, grain_lut[1], grain_lut[0]);
-    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
-        generate_grain_uv(out, 1, grain_lut[2], grain_lut[0]);
-
-    // Generate scaling LUTs as needed
-    if (data->num_y_points)
-        generate_scaling(data->y_points, data->num_y_points, scaling[0]);
-    if (data->num_uv_points[0])
-        generate_scaling(data->uv_points[0], data->num_uv_points[0], scaling[1]);
-    if (data->num_uv_points[1])
-        generate_scaling(data->uv_points[1], data->num_uv_points[1], scaling[2]);
-
-    // Synthesize grain for the affected planes
-    int rows = (out->p.h + 31) >> 5;
-    for (int row = 0; row < rows; row++) {
-        if (data->num_y_points)
-            apply_to_row_y(out, in, grain_lut[0], scaling[0], row);
-
-        if (data->chroma_scaling_from_luma) {
-            apply_to_row_uv(out, in, grain_lut[1], scaling[0], 0, row);
-            apply_to_row_uv(out, in, grain_lut[2], scaling[0], 1, row);
-        } else {
-            if (data->num_uv_points[0])
-                apply_to_row_uv(out, in, grain_lut[1], scaling[1], 0, row);
-            if (data->num_uv_points[1])
-                apply_to_row_uv(out, in, grain_lut[2], scaling[2], 1, row);
-        }
-    }
-
-    // Copy over the non-modified planes
-    // TODO: eliminate in favor of per-plane refs
-    if (!data->num_y_points) {
-        assert(out->stride[0] == in->stride[0]);
-        memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
-    }
-
-    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
-        for (int i = 0; i < 2; i++) {
-            if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
-                const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
-                assert(out->stride[1] == in->stride[1]);
-                memcpy(out->data[1+i], in->data[1+i],
-                       (out->p.h >> suby) * out->stride[1]);
-            }
-        }
-    }
-}
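
For reference, a minimal standalone sketch (not part of the patch) of the two helpers that drive the removed film_grain_tmpl.c above: the 16-bit LFSR in get_random_number() and the rounding shift in round2(). The helpers are copied in shape from the deleted code; the small demo loop, seed choice, and printed values are illustrative only.

    #include <stdio.h>

    /* 16-bit LFSR: taps at bits 0, 1, 3, 12; returns the top `bits` bits */
    static int get_random_number(const int bits, unsigned *state) {
        const int r = *state;
        const unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
        *state = (r >> 1) | (bit << 15);
        return (*state >> (16 - bits)) & ((1 << bits) - 1);
    }

    /* round-to-nearest right shift */
    static int round2(const int x, const int shift) {
        return (x + ((1 << shift) >> 1)) >> shift;
    }

    int main(void) {
        unsigned seed = 0x49d8;              /* any 16-bit seed */
        for (int i = 0; i < 4; i++)          /* draw a few 11-bit values */
            printf("%d ", get_random_number(11, &seed));
        printf("\nround2(1000, 3) = %d\n", round2(1000, 3));  /* prints 125 */
        return 0;
    }
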
--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@@ -70,91 +70,80 @@ struct Dav1dContext {
     unsigned n_fc;
 
     // cache of OBUs that make up a single frame before we submit them
     // to a frame worker to be decoded
     struct {
         Dav1dData data;
         int start, end;
     } tile[256];
-    int n_tile_data;
+    int n_tile_data, have_seq_hdr, have_frame_hdr;
     int n_tiles;
-    Dav1dRef *seq_hdr_ref;
-    Dav1dSequenceHeader *seq_hdr;
-    Dav1dRef *frame_hdr_ref;
-    Dav1dFrameHeader *frame_hdr;
+    Av1SequenceHeader seq_hdr; // FIXME make ref?
+    Av1FrameHeader frame_hdr; // FIXME make ref?
 
     // decoded output picture queue
     Dav1dData in;
     Dav1dPicture out;
     struct {
         Dav1dThreadPicture *out_delayed;
         unsigned next;
-        // dummy is a pointer to prevent compiler errors about atomic_load()
-        // not taking const arguments; the const attribute is not taken
-        // from pointers
-        atomic_int flush_mem, *flush;
     } frame_thread;
 
     // reference/entropy state
     struct {
         Dav1dThreadPicture p;
         Dav1dRef *segmap;
+        Av1SegmentationDataSet seg_data;
         Dav1dRef *refmvs;
         unsigned refpoc[7];
+        WarpedMotionParams gmv[7];
+        Av1LoopfilterModeRefDeltas lf_mode_ref_deltas;
+        Av1FilmGrainData film_grain;
+        uint8_t qidx;
     } refs[8];
     CdfThreadContext cdf[8];
 
     Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
 
     // tree to keep track of which edges are available
     struct {
         EdgeNode *root[2 /* BL_128X128 vs. BL_64X64 */];
         EdgeBranch branch_sb128[1 + 4 + 16 + 64];
         EdgeBranch branch_sb64[1 + 4 + 16];
         EdgeTip tip_sb128[256];
         EdgeTip tip_sb64[64];
     } intra_edge;
 
     Dav1dPicAllocator allocator;
-    int apply_grain;
-    int operating_point;
-    unsigned operating_point_idc;
-    int all_layers;
 };
 
 struct Dav1dFrameContext {
-    Dav1dRef *seq_hdr_ref;
-    Dav1dSequenceHeader *seq_hdr;
-    Dav1dRef *frame_hdr_ref;
-    Dav1dFrameHeader *frame_hdr;
-    Dav1dThreadPicture refp[7];
-    Dav1dPicture cur; // during block coding / reconstruction
-    Dav1dThreadPicture sr_cur; // after super-resolution upscaling
+    Av1SequenceHeader seq_hdr;
+    Av1FrameHeader frame_hdr;
+    Dav1dThreadPicture refp[7], cur;
     Dav1dRef *mvs_ref;
     refmvs *mvs, *ref_mvs[7];
     Dav1dRef *ref_mvs_ref[7];
     Dav1dRef *cur_segmap_ref, *prev_segmap_ref;
     uint8_t *cur_segmap;
     const uint8_t *prev_segmap;
     unsigned refpoc[7], refrefpoc[7][7];
-    uint8_t gmv_warp_allowed[7];
     CdfThreadContext in_cdf, out_cdf;
     struct {
         Dav1dData data;
         int start, end;
     } tile[256];
     int n_tile_data;
 
     // for scalable references
     struct ScalableMotionParams {
         int scale; // if no scaling, this is 0
         int step;
     } svc[7][2 /* x, y */];
-    int resize_step[2 /* y, uv */], resize_start[2 /* y, uv */];
 
     const Dav1dContext *c;
     Dav1dTileContext *tc;
     int n_tc;
     Dav1dTileState *ts;
     int n_ts;
     const Dav1dDSPContext *dsp;
     struct {
@@ -163,18 +152,18 @@ struct Dav1dFrameContext {
         filter_sbrow_fn filter_sbrow;
         backup_ipred_edge_fn backup_ipred_edge;
         read_coef_blocks_fn read_coef_blocks;
     } bd_fn;
 
     int ipred_edge_sz;
     pixel *ipred_edge[3];
     ptrdiff_t b4_stride;
-    int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step, sr_sb128w;
-    uint16_t dq[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+    int w4, h4, bw, bh, sb128w, sb128h, sbh, sb_shift, sb_step;
+    uint16_t dq[NUM_SEGMENTS][3 /* plane */][2 /* dc/ac */];
     const uint8_t *qm[2 /* is_1d */][N_RECT_TX_SIZES][3 /* plane */];
     BlockContext *a;
     int a_sz /* w*tile_rows */;
     AV1_COMMON *libaom_cm; // FIXME
     uint8_t jnt_weights[7][7];
 
     struct {
         struct thread_data td;
@@ -194,42 +183,41 @@ struct Dav1dFrameContext {
         // start offsets per tile
         int *tile_start_off;
     } frame_thread;
 
     // loopfilter
     struct {
         uint8_t (*level)[4];
         Av1Filter *mask;
-        Av1Restoration *lr_mask;
         int top_pre_cdef_toggle;
-        int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
+        int mask_sz /* w*h */, line_sz /* w */, re_sz /* h */;
         Av1FilterLUT lim_lut;
         int last_sharpness;
         uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
         uint8_t *tx_lpf_right_edge[2];
         pixel *cdef_line;
         pixel *cdef_line_ptr[2 /* pre, post */][3 /* plane */][2 /* y */];
         pixel *lr_lpf_line;
         pixel *lr_lpf_line_ptr[3 /* plane */];
 
         // in-loop filter per-frame state keeping
         int tile_row; // for carry-over at tile row edges
-        pixel *p[3], *sr_p[3];
+        pixel *p[3];
         Av1Filter *mask_ptr, *prev_mask_ptr;
     } lf;
 
     // threading (refer to tc[] for per-thread things)
     struct FrameTileThreadData {
         uint64_t available;
         pthread_mutex_t lock;
         pthread_cond_t cond, icond;
         int tasks_left, num_tasks;
         int (*task_idx_to_sby_and_tile_idx)[2];
-        int titsati_sz, titsati_init[3];
+        int titsati_sz, titsati_init[2];
     } tile_thread;
 };
 
 struct Dav1dTileState {
     struct {
         int col_start, col_end, row_start, row_end; // in 4px units
         int col, row; // in tile units
     } tiling;
@@ -242,17 +230,17 @@ struct Dav1dTileState {
         pthread_mutex_t lock;
         pthread_cond_t cond;
     } tile_thread;
     struct {
         uint8_t *pal_idx;
         coef *cf;
     } frame_thread;
 
-    uint16_t dqmem[DAV1D_MAX_SEGMENTS][3 /* plane */][2 /* dc/ac */];
+    uint16_t dqmem[NUM_SEGMENTS][3 /* plane */][2 /* dc/ac */];
     const uint16_t (*dq)[3][2];
     int last_qidx;
 
     int8_t last_delta_lf[4];
     uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
     const uint8_t (*lflvl)[4][8][2];
 
     Av1RestorationUnit *lr_ref[3];
@@ -266,17 +254,17 @@ struct Dav1dTileContext {
     coef *cf;
     pixel *emu_edge; // stride=192 for non-SVC, or 320 for SVC
     // FIXME types can be changed to pixel (and dynamically allocated)
     // which would make copy/assign operations slightly faster?
     uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
     uint16_t pal[3 /* plane */][8 /* palette_idx */];
     uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
     uint8_t txtp_map[32 * 32]; // inter-only
-    Dav1dWarpedMotionParams warpmv;
+    WarpedMotionParams warpmv;
     union {
         void *mem;
         uint8_t *pal_idx;
         int16_t *ac;
         pixel *interintra, *lap;
         coef *compinter;
     } scratch;
     ALIGN(uint8_t scratch_seg_mask[128 * 128], 32);
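
Note: the internal.h hunk above backs out the switch to ref-counted sequence/frame headers (a Dav1dRef plus a pointer into it) and restores headers embedded by value in the context structs. A rough sketch of the two ownership models, using hypothetical names rather than the dav1d ref API, is:

/*
 * Illustrative only: the "shared" layout mirrors the removed lines
 * (seq_hdr_ref + seq_hdr pointer), the "embedded" layout mirrors the
 * restored lines (header stored by value, no lifetime management).
 */
#include <stdlib.h>

typedef struct { int profile, max_width, max_height; } SeqHdr;

/* toy refcount wrapper standing in for a Dav1dRef-style object */
typedef struct { int refcount; SeqHdr hdr; } SeqHdrRef;

typedef struct {
    SeqHdrRef *seq_hdr_ref;  /* owns/shares the heap allocation */
    SeqHdr *seq_hdr;         /* convenience pointer into seq_hdr_ref */
} CtxShared;

typedef struct {
    SeqHdr seq_hdr;          /* plain copy, freed with the context */
} CtxEmbedded;

int main(void) {
    SeqHdrRef *ref = calloc(1, sizeof(*ref));
    if (!ref) return 1;
    ref->refcount = 1;

    CtxShared shared = { .seq_hdr_ref = ref, .seq_hdr = &ref->hdr };
    CtxEmbedded embedded = { .seq_hdr = *shared.seq_hdr };  /* by-value copy */
    (void)embedded;

    if (--ref->refcount == 0)  /* last user releases the shared header */
        free(ref);
    return 0;
}
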
--- a/third_party/dav1d/src/intra_edge.c
+++ b/third_party/dav1d/src/intra_edge.c
@@ -23,17 +23,16 @@
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
 #include <assert.h>
-#include <stdlib.h>
 
 #include "src/intra_edge.h"
 #include "src/levels.h"
 
 struct ModeSelMem {
     EdgeBranch *nwc[3 /* 64x64, 32x32, 16x16 */];
     EdgeTip *nt;
 };
--- a/third_party/dav1d/src/ipred.h
+++ b/third_party/dav1d/src/ipred.h
@@ -23,18 +23,16 @@
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_SRC_IPRED_H__
 #define __DAV1D_SRC_IPRED_H__
 
-#include <stddef.h>
-
 #include "common/bitdepth.h"
 
 #include "src/levels.h"
 
 /*
  * Intra prediction.
  * - a is the angle (in degrees) for directional intra predictors. For other
  *   modes, it is ignored;
--- a/third_party/dav1d/src/itx.h
+++ b/third_party/dav1d/src/itx.h
@@ -23,18 +23,16 @@
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_SRC_ITX_H__
 #define __DAV1D_SRC_ITX_H__
 
-#include <stddef.h>
-
 #include "common/bitdepth.h"
 
 #include "src/levels.h"
 
 #define decl_itx_fn(name) \
 void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob)
 typedef decl_itx_fn(*itxfm_fn);
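
Note: the itx_1d.c hunk that follows backs out the per-stage clamping in the 1-D inverse DCTs: the removed CLIP(a) wrapper expanded to iclip(a, min, max) with min = -max - 1, where max was passed in per bit depth, and the restored code stores the unclamped intermediates. A small standalone illustration of that clamp, using a local helper rather than dav1d's iclip and an example range bound, is:

/* Sketch of the clamp the removed CLIP macro performed. */
#include <assert.h>

static int clip_int(const int v, const int min, const int max) {
    return v < min ? min : v > max ? max : v;
}

int main(void) {
    const int max = (1 << 18) - 1;  /* example intermediate range bound */
    const int min = -max - 1;       /* same derivation as the removed code */
    assert(clip_int(max + 100, min, max) == max);  /* overflow clamps high */
    assert(clip_int(min - 100, min, max) == min);  /* underflow clamps low */
    assert(clip_int(42, min, max) == 42);          /* in-range passes through */
    return 0;
}
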
 
--- a/third_party/dav1d/src/itx_1d.c
+++ b/third_party/dav1d/src/itx_1d.c
@@ -27,149 +27,143 @@
 
 #include "config.h"
 
 #include <stddef.h>
 #include <stdint.h>
 
 #include "common/attributes.h"
 
-#define CLIP(a) iclip(a, min, max)
-
 static void NOINLINE
 inv_dct4_1d(const coef *const in, const ptrdiff_t in_s,
-            coef *const out, const ptrdiff_t out_s, const int max)
+            coef *const out, const ptrdiff_t out_s)
 {
-    const int min = -max - 1;
     const int in0 = in[0 * in_s], in1 = in[1 * in_s];
     const int in2 = in[2 * in_s], in3 = in[3 * in_s];
 
     int t0 = ((in0 + in2) * 2896 + 2048) >> 12;
     int t1 = ((in0 - in2) * 2896 + 2048) >> 12;
     int t2 = (in1 * 1567 - in3 * 3784 + 2048) >> 12;
     int t3 = (in1 * 3784 + in3 * 1567 + 2048) >> 12;
 
-    out[0 * out_s] = CLIP(t0 + t3);
-    out[1 * out_s] = CLIP(t1 + t2);
-    out[2 * out_s] = CLIP(t1 - t2);
-    out[3 * out_s] = CLIP(t0 - t3);
+    out[0 * out_s] = t0 + t3;
+    out[1 * out_s] = t1 + t2;
+    out[2 * out_s] = t1 - t2;
+    out[3 * out_s] = t0 - t3;
 }
 
 static void NOINLINE
 inv_dct8_1d(const coef *const in, const ptrdiff_t in_s,
-            coef *const out, const ptrdiff_t out_s, const int max)
+            coef *const out, const ptrdiff_t out_s)
 {
-    const int min = -max - 1;
     coef tmp[4];
 
-    inv_dct4_1d(in, in_s * 2, tmp, 1, max);
+    inv_dct4_1d(in, in_s * 2, tmp, 1);
 
     const int in1 = in[1 * in_s], in3 = in[3 * in_s];
     const int in5 = in[5 * in_s], in7 = in[7 * in_s];
 
     int t4a = (in1 *  799 - in7 * 4017 + 2048) >> 12;
     int t5a = (in5 * 3406 - in3 * 2276 + 2048) >> 12;
     int t6a = (in5 * 2276 + in3 * 3406 + 2048) >> 12;
     int t7a = (in1 * 4017 + in7 *  799 + 2048) >> 12;
 
-    int t4  = CLIP(t4a + t5a);
-        t5a = CLIP(t4a - t5a);
-    int t7  = CLIP(t7a + t6a);
-        t6a = CLIP(t7a - t6a);
+    int t4  = t4a + t5a;
+        t5a = t4a - t5a;
+    int t7  = t7a + t6a;
+        t6a = t7a - t6a;
 
     int t5  = ((t6a - t5a) * 2896 + 2048) >> 12;
     int t6  = ((t6a + t5a) * 2896 + 2048) >> 12;
 
-    out[0 * out_s] = CLIP(tmp[0] + t7);
-    out[1 * out_s] = CLIP(tmp[1] + t6);
-    out[2 * out_s] = CLIP(tmp[2] + t5);
-    out[3 * out_s] = CLIP(tmp[3] + t4);
-    out[4 * out_s] = CLIP(tmp[3] - t4);
-    out[5 * out_s] = CLIP(tmp[2] - t5);
-    out[6 * out_s] = CLIP(tmp[1] - t6);
-    out[7 * out_s] = CLIP(tmp[0] - t7);
+    out[0 * out_s] = tmp[0] + t7;
+    out[1 * out_s] = tmp[1] + t6;
+    out[2 * out_s] = tmp[2] + t5;
+    out[3 * out_s] = tmp[3] + t4;
+    out[4 * out_s] = tmp[3] - t4;
+    out[5 * out_s] = tmp[2] - t5;
+    out[6 * out_s] = tmp[1] - t6;
+    out[7 * out_s] = tmp[0] - t7;
 }
 
 static void NOINLINE
 inv_dct16_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int max)
+             coef *const out, const ptrdiff_t out_s)
 {
-    const int min = -max - 1;
     coef tmp[8];
 
-    inv_dct8_1d(in, in_s * 2, tmp, 1, max);
+    inv_dct8_1d(in, in_s * 2, tmp, 1);
 
     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];
     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];
     const int in9  = in[ 9 * in_s], in11 = in[11 * in_s];
     const int in13 = in[13 * in_s], in15 = in[15 * in_s];
 
     int t8a  = (in1  *  401 - in15 * 4076 + 2048) >> 12;
     int t15a = (in1  * 4076 + in15 *  401 + 2048) >> 12;
     int t9a  = (in9  * 3166 - in7  * 2598 + 2048) >> 12;
     int t14a = (in9  * 2598 + in7  * 3166 + 2048) >> 12;
     int t10a = (in5  * 1931 - in11 * 3612 + 2048) >> 12;
     int t13a = (in5  * 3612 + in11 * 1931 + 2048) >> 12;
     int t11a = (in13 * 3920 - in3  * 1189 + 2048) >> 12;
     int t12a = (in13 * 1189 + in3  * 3920 + 2048) >> 12;
 
-    int t8  = CLIP(t8a  + t9a);
-    int t9  = CLIP(t8a  - t9a);
-    int t10 = CLIP(t11a - t10a);
-    int t11 = CLIP(t11a + t10a);
-    int t12 = CLIP(t12a + t13a);
-    int t13 = CLIP(t12a - t13a);
-    int t14 = CLIP(t15a - t14a);
-    int t15 = CLIP(t15a + t14a);
+    int t8  = t8a  + t9a;
+    int t9  = t8a  - t9a;
+    int t10 = t11a - t10a;
+    int t11 = t11a + t10a;
+    int t12 = t12a + t13a;
+    int t13 = t12a - t13a;
+    int t14 = t15a - t14a;
+    int t15 = t15a + t14a;
 
     t9a  = (  t14 * 1567 - t9  * 3784  + 2048) >> 12;
     t14a = (  t14 * 3784 + t9  * 1567  + 2048) >> 12;
     t10a = (-(t13 * 3784 + t10 * 1567) + 2048) >> 12;
     t13a = (  t13 * 1567 - t10 * 3784  + 2048) >> 12;
 
-    t8a  = CLIP(t8   + t11);
-    t9   = CLIP(t9a  + t10a);
-    t10  = CLIP(t9a  - t10a);
-    t11a = CLIP(t8   - t11);
-    t12a = CLIP(t15  - t12);
-    t13  = CLIP(t14a - t13a);
-    t14  = CLIP(t14a + t13a);
-    t15a = CLIP(t15  + t12);
+    t8a  = t8   + t11;
+    t9   = t9a  + t10a;
+    t10  = t9a  - t10a;
+    t11a = t8   - t11;
+    t12a = t15  - t12;
+    t13  = t14a - t13a;
+    t14  = t14a + t13a;
+    t15a = t15  + t12;
 
     t10a = ((t13  - t10)  * 2896 + 2048) >> 12;
     t13a = ((t13  + t10)  * 2896 + 2048) >> 12;
     t11  = ((t12a - t11a) * 2896 + 2048) >> 12;
     t12  = ((t12a + t11a) * 2896 + 2048) >> 12;
 
-    out[ 0 * out_s] = CLIP(tmp[0] + t15a);
-    out[ 1 * out_s] = CLIP(tmp[1] + t14);
-    out[ 2 * out_s] = CLIP(tmp[2] + t13a);
-    out[ 3 * out_s] = CLIP(tmp[3] + t12);
-    out[ 4 * out_s] = CLIP(tmp[4] + t11);
-    out[ 5 * out_s] = CLIP(tmp[5] + t10a);
-    out[ 6 * out_s] = CLIP(tmp[6] + t9);
-    out[ 7 * out_s] = CLIP(tmp[7] + t8a);
-    out[ 8 * out_s] = CLIP(tmp[7] - t8a);
-    out[ 9 * out_s] = CLIP(tmp[6] - t9);
-    out[10 * out_s] = CLIP(tmp[5] - t10a);
-    out[11 * out_s] = CLIP(tmp[4] - t11);
-    out[12 * out_s] = CLIP(tmp[3] - t12);
-    out[13 * out_s] = CLIP(tmp[2] - t13a);
-    out[14 * out_s] = CLIP(tmp[1] - t14);
-    out[15 * out_s] = CLIP(tmp[0] - t15a);
+    out[ 0 * out_s] = tmp[0] + t15a;
+    out[ 1 * out_s] = tmp[1] + t14;
+    out[ 2 * out_s] = tmp[2] + t13a;
+    out[ 3 * out_s] = tmp[3] + t12;
+    out[ 4 * out_s] = tmp[4] + t11;
+    out[ 5 * out_s] = tmp[5] + t10a;
+    out[ 6 * out_s] = tmp[6] + t9;
+    out[ 7 * out_s] = tmp[7] + t8a;
+    out[ 8 * out_s] = tmp[7] - t8a;
+    out[ 9 * out_s] = tmp[6] - t9;
+    out[10 * out_s] = tmp[5] - t10a;
+    out[11 * out_s] = tmp[4] - t11;
+    out[12 * out_s] = tmp[3] - t12;
+    out[13 * out_s] = tmp[2] - t13a;
+    out[14 * out_s] = tmp[1] - t14;
+    out[15 * out_s] = tmp[0] - t15a;
 }
 
 static void NOINLINE
 inv_dct32_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int max)
+             coef *const out, const ptrdiff_t out_s)
 {
-    const int min = -max - 1;
     coef tmp[16];
 
-    inv_dct16_1d(in, in_s * 2, tmp, 1, max);
+    inv_dct16_1d(in, in_s * 2, tmp, 1);
 
     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];
     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];
     const int in9  = in[ 9 * in_s], in11 = in[11 * in_s];
     const int in13 = in[13 * in_s], in15 = in[15 * in_s];
     const int in17 = in[17 * in_s], in19 = in[19 * in_s];
     const int in21 = in[21 * in_s], in23 = in[23 * in_s];
     const int in25 = in[25 * in_s], in27 = in[27 * in_s];
@@ -187,136 +181,135 @@ inv_dct32_1d(const coef *const in, const
     int t27a = (in5  * 3973 + in27 *  995 + 2048) >> 12;
     int t21a = (in21 * 3513 - in11 * 2106 + 2048) >> 12;
     int t26a = (in21 * 2106 + in11 * 3513 + 2048) >> 12;
     int t22a = (in13 * 2440 - in19 * 3290 + 2048) >> 12;
     int t25a = (in13 * 3290 + in19 * 2440 + 2048) >> 12;
     int t23a = (in29 * 4052 - in3  *  601 + 2048) >> 12;
     int t24a = (in29 *  601 + in3  * 4052 + 2048) >> 12;
 
-    int t16 = CLIP(t16a + t17a);
-    int t17 = CLIP(t16a - t17a);
-    int t18 = CLIP(t19a - t18a);
-    int t19 = CLIP(t19a + t18a);
-    int t20 = CLIP(t20a + t21a);
-    int t21 = CLIP(t20a - t21a);
-    int t22 = CLIP(t23a - t22a);
-    int t23 = CLIP(t23a + t22a);
-    int t24 = CLIP(t24a + t25a);
-    int t25 = CLIP(t24a - t25a);
-    int t26 = CLIP(t27a - t26a);
-    int t27 = CLIP(t27a + t26a);
-    int t28 = CLIP(t28a + t29a);
-    int t29 = CLIP(t28a - t29a);
-    int t30 = CLIP(t31a - t30a);
-    int t31 = CLIP(t31a + t30a);
+    int t16 = t16a + t17a;
+    int t17 = t16a - t17a;
+    int t18 = t19a - t18a;
+    int t19 = t19a + t18a;
+    int t20 = t20a + t21a;
+    int t21 = t20a - t21a;
+    int t22 = t23a - t22a;
+    int t23 = t23a + t22a;
+    int t24 = t24a + t25a;
+    int t25 = t24a - t25a;
+    int t26 = t27a - t26a;
+    int t27 = t27a + t26a;
+    int t28 = t28a + t29a;
+    int t29 = t28a - t29a;
+    int t30 = t31a - t30a;
+    int t31 = t31a + t30a;
 
     t17a = (  t30 *  799 - t17 * 4017  + 2048) >> 12;
     t30a = (  t30 * 4017 + t17 *  799  + 2048) >> 12;
     t18a = (-(t29 * 4017 + t18 *  799) + 2048) >> 12;
     t29a = (  t29 *  799 - t18 * 4017  + 2048) >> 12;
     t21a = (  t26 * 3406 - t21 * 2276  + 2048) >> 12;
     t26a = (  t26 * 2276 + t21 * 3406  + 2048) >> 12;
     t22a = (-(t25 * 2276 + t22 * 3406) + 2048) >> 12;
     t25a = (  t25 * 3406 - t22 * 2276  + 2048) >> 12;
 
-    t16a = CLIP(t16  + t19);
-    t17  = CLIP(t17a + t18a);
-    t18  = CLIP(t17a - t18a);
-    t19a = CLIP(t16  - t19);
-    t20a = CLIP(t23  - t20);
-    t21  = CLIP(t22a - t21a);
-    t22  = CLIP(t22a + t21a);
-    t23a = CLIP(t23  + t20);
-    t24a = CLIP(t24  + t27);
-    t25  = CLIP(t25a + t26a);
-    t26  = CLIP(t25a - t26a);
-    t27a = CLIP(t24  - t27);
-    t28a = CLIP(t31  - t28);
-    t29  = CLIP(t30a - t29a);
-    t30  = CLIP(t30a + t29a);
-    t31a = CLIP(t31  + t28);
+    t16a = t16  + t19;
+    t17  = t17a + t18a;
+    t18  = t17a - t18a;
+    t19a = t16  - t19;
+    t20a = t23  - t20;
+    t21  = t22a - t21a;
+    t22  = t22a + t21a;
+    t23a = t23  + t20;
+    t24a = t24  + t27;
+    t25  = t25a + t26a;
+    t26  = t25a - t26a;
+    t27a = t24  - t27;
+    t28a = t31  - t28;
+    t29  = t30a - t29a;
+    t30  = t30a + t29a;
+    t31a = t31  + t28;
 
     t18a = (  t29  * 1567 - t18  * 3784  + 2048) >> 12;
     t29a = (  t29  * 3784 + t18  * 1567  + 2048) >> 12;
     t19  = (  t28a * 1567 - t19a * 3784  + 2048) >> 12;
     t28  = (  t28a * 3784 + t19a * 1567  + 2048) >> 12;
     t20  = (-(t27a * 3784 + t20a * 1567) + 2048) >> 12;
     t27  = (  t27a * 1567 - t20a * 3784  + 2048) >> 12;
     t21a = (-(t26  * 3784 + t21  * 1567) + 2048) >> 12;
     t26a = (  t26  * 1567 - t21  * 3784  + 2048) >> 12;
 
-    t16  = CLIP(t16a + t23a);
-    t17a = CLIP(t17  + t22);
-    t18  = CLIP(t18a + t21a);
-    t19a = CLIP(t19  + t20);
-    t20a = CLIP(t19  - t20);
-    t21  = CLIP(t18a - t21a);
-    t22a = CLIP(t17  - t22);
-    t23  = CLIP(t16a - t23a);
-    t24  = CLIP(t31a - t24a);
-    t25a = CLIP(t30  - t25);
-    t26  = CLIP(t29a - t26a);
-    t27a = CLIP(t28  - t27);
-    t28a = CLIP(t28  + t27);
-    t29  = CLIP(t29a + t26a);
-    t30a = CLIP(t30  + t25);
-    t31  = CLIP(t31a + t24a);
+    t16  = t16a + t23a;
+    t17a = t17  + t22;
+    t18  = t18a + t21a;
+    t19a = t19  + t20;
+    t20a = t19  - t20;
+    t21  = t18a - t21a;
+    t22a = t17  - t22;
+    t23  = t16a - t23a;
+    t24  = t31a - t24a;
+    t25a = t30  - t25;
+    t26  = t29a - t26a;
+    t27a = t28  - t27;
+    t28a = t28  + t27;
+    t29  = t29a + t26a;
+    t30a = t30  + t25;
+    t31  = t31a + t24a;
 
     t20  = ((t27a - t20a) * 2896 + 2048) >> 12;
     t27  = ((t27a + t20a) * 2896 + 2048) >> 12;
     t21a = ((t26  - t21 ) * 2896 + 2048) >> 12;
     t26a = ((t26  + t21 ) * 2896 + 2048) >> 12;
     t22  = ((t25a - t22a) * 2896 + 2048) >> 12;
     t25  = ((t25a + t22a) * 2896 + 2048) >> 12;
     t23a = ((t24  - t23 ) * 2896 + 2048) >> 12;
     t24a = ((t24  + t23 ) * 2896 + 2048) >> 12;
 
-    out[ 0 * out_s] = CLIP(tmp[ 0] + t31);
-    out[ 1 * out_s] = CLIP(tmp[ 1] + t30a);
-    out[ 2 * out_s] = CLIP(tmp[ 2] + t29);
-    out[ 3 * out_s] = CLIP(tmp[ 3] + t28a);
-    out[ 4 * out_s] = CLIP(tmp[ 4] + t27);
-    out[ 5 * out_s] = CLIP(tmp[ 5] + t26a);
-    out[ 6 * out_s] = CLIP(tmp[ 6] + t25);
-    out[ 7 * out_s] = CLIP(tmp[ 7] + t24a);
-    out[ 8 * out_s] = CLIP(tmp[ 8] + t23a);
-    out[ 9 * out_s] = CLIP(tmp[ 9] + t22);
-    out[10 * out_s] = CLIP(tmp[10] + t21a);
-    out[11 * out_s] = CLIP(tmp[11] + t20);
-    out[12 * out_s] = CLIP(tmp[12] + t19a);
-    out[13 * out_s] = CLIP(tmp[13] + t18);
-    out[14 * out_s] = CLIP(tmp[14] + t17a);
-    out[15 * out_s] = CLIP(tmp[15] + t16);
-    out[16 * out_s] = CLIP(tmp[15] - t16);
-    out[17 * out_s] = CLIP(tmp[14] - t17a);
-    out[18 * out_s] = CLIP(tmp[13] - t18);
-    out[19 * out_s] = CLIP(tmp[12] - t19a);
-    out[20 * out_s] = CLIP(tmp[11] - t20);
-    out[21 * out_s] = CLIP(tmp[10] - t21a);
-    out[22 * out_s] = CLIP(tmp[ 9] - t22);
-    out[23 * out_s] = CLIP(tmp[ 8] - t23a);
-    out[24 * out_s] = CLIP(tmp[ 7] - t24a);
-    out[25 * out_s] = CLIP(tmp[ 6] - t25);
-    out[26 * out_s] = CLIP(tmp[ 5] - t26a);
-    out[27 * out_s] = CLIP(tmp[ 4] - t27);
-    out[28 * out_s] = CLIP(tmp[ 3] - t28a);
-    out[29 * out_s] = CLIP(tmp[ 2] - t29);
-    out[30 * out_s] = CLIP(tmp[ 1] - t30a);
-    out[31 * out_s] = CLIP(tmp[ 0] - t31);
+    out[ 0 * out_s] = tmp[ 0] + t31;
+    out[ 1 * out_s] = tmp[ 1] + t30a;
+    out[ 2 * out_s] = tmp[ 2] + t29;
+    out[ 3 * out_s] = tmp[ 3] + t28a;
+    out[ 4 * out_s] = tmp[ 4] + t27;
+    out[ 5 * out_s] = tmp[ 5] + t26a;
+    out[ 6 * out_s] = tmp[ 6] + t25;
+    out[ 7 * out_s] = tmp[ 7] + t24a;
+    out[ 8 * out_s] = tmp[ 8] + t23a;
+    out[ 9 * out_s] = tmp[ 9] + t22;
+    out[10 * out_s] = tmp[10] + t21a;
+    out[11 * out_s] = tmp[11] + t20;
+    out[12 * out_s] = tmp[12] + t19a;
+    out[13 * out_s] = tmp[13] + t18;
+    out[14 * out_s] = tmp[14] + t17a;
+    out[15 * out_s] = tmp[15] + t16;
+    out[16 * out_s] = tmp[15] - t16;
+    out[17 * out_s] = tmp[14] - t17a;
+    out[18 * out_s] = tmp[13] - t18;
+    out[19 * out_s] = tmp[12] - t19a;
+    out[20 * out_s] = tmp[11] - t20;
+    out[21 * out_s] = tmp[10] - t21a;
+    out[22 * out_s] = tmp[ 9] - t22;
+    out[23 * out_s] = tmp[ 8] - t23a;
+    out[24 * out_s] = tmp[ 7] - t24a;
+    out[25 * out_s] = tmp[ 6] - t25;
+    out[26 * out_s] = tmp[ 5] - t26a;
+    out[27 * out_s] = tmp[ 4] - t27;
+    out[28 * out_s] = tmp[ 3] - t28a;
+    out[29 * out_s] = tmp[ 2] - t29;
+    out[30 * out_s] = tmp[ 1] - t30a;
+    out[31 * out_s] = tmp[ 0] - t31;
 }
 
 static void NOINLINE
 inv_dct64_1d(const coef *const in, const ptrdiff_t in_s,
-             coef *const out, const ptrdiff_t out_s, const int max)
+             coef *const out, const ptrdiff_t out_s)
 {
-    const int min = -max - 1;
     coef tmp[32];
 
-    inv_dct32_1d(in, in_s * 2, tmp, 1, max);
+    inv_dct32_1d(in, in_s * 2, tmp, 1);
 
     const int in1  = in[ 1 * in_s], in3  = in[ 3 * in_s];
     const int in5  = in[ 5 * in_s], in7  = in[ 7 * in_s];
     const int in9  = in[ 9 * in_s], in11 = in[11 * in_s];
     const int in13 = in[13 * in_s], in15 = in[15 * in_s];
     const int in17 = in[17 * in_s], in19 = in[19 * in_s];
     const int in21 = in[21 * in_s], in23 = in[23 * in_s];
     const int in25 = in[25 * in_s], in27 = in[27 * in_s];
@@ -358,48 +351,48 @@ inv_dct64_1d(const coef *const in, const
     int t57a = (in25 * 3349 + in39 * 2359 + 2048) >> 12;
     int t58a = (in41 * 2191 + in23 * 3461 + 2048) >> 12;
     int t59a = (in9  * 3996 + in55 *  897 + 2048) >> 12;
     int t60a = (in49 * 1474 + in15 * 3822 + 2048) >> 12;
     int t61a = (in17 * 3745 + in47 * 1660 + 2048) >> 12;
     int t62a = (in33 * 2824 + in31 * 2967 + 2048) >> 12;
     int t63a = (in1  * 4095 + in63 *  101 + 2048) >> 12;
 
-    int t32 = CLIP(t32a + t33a);
-    int t33 = CLIP(t32a - t33a);
-    int t34 = CLIP(t35a - t34a);
-    int t35 = CLIP(t35a + t34a);
-    int t36 = CLIP(t36a + t37a);
-    int t37 = CLIP(t36a - t37a);
-    int t38 = CLIP(t39a - t38a);
-    int t39 = CLIP(t39a + t38a);
-    int t40 = CLIP(t40a + t41a);
-    int t41 = CLIP(t40a - t41a);
-    int t42 = CLIP(t43a - t42a);
-    int t43 = CLIP(t43a + t42a);
-    int t44 = CLIP(t44a + t45a);
-    int t45 = CLIP(t44a - t45a);
-    int t46 = CLIP(t47a - t46a);
-    int t47 = CLIP(t47a + t46a);
-    int t48 = CLIP(t48a + t49a);
-    int t49 = CLIP(t48a - t49a);
-    int t50 = CLIP(t51a - t50a);
-    int t51 = CLIP(t51a + t50a);
-    int t52 = CLIP(t52a + t53a);
-    int t53 = CLIP(t52a - t53a);
-    int t54 = CLIP(t55a - t54a);
-    int t55 = CLIP(t55a + t54a);
-    int t56 = CLIP(t56a + t57a);
-    int t57 = CLIP(t56a - t57a);
-    int t58 = CLIP(t59a - t58a);
-    int t59 = CLIP(t59a + t58a);
-    int t60 = CLIP(t60a + t61a);
-    int t61 = CLIP(t60a - t61a);
-    int t62 = CLIP(t63a - t62a);
-    int t63 = CLIP(t63a + t62a);
+    int t32 = t32a + t33a;
+    int t33 = t32a - t33a;
+    int t34 = t35a - t34a;
+    int t35 = t35a + t34a;
+    int t36 = t36a + t37a;
+    int t37 = t36a - t37a;
+    int t38 = t39a - t38a;
+    int t39 = t39a + t38a;
+    int t40 = t40a + t41a;
+    int t41 = t40a - t41a;
+    int t42 = t43a - t42a;
+    int t43 = t43a + t42a;
+    int t44 = t44a + t45a;
+    int t45 = t44a - t45a;
+    int t46 = t47a - t46a;
+    int t47 = t47a + t46a;
+    int t48 = t48a + t49a;
+    int t49 = t48a - t49a;
+    int t50 = t51a - t50a;
+    int t51 = t51a + t50a;
+    int t52 = t52a + t53a;
+    int t53 = t52a - t53a;
+    int t54 = t55a - t54a;
+    int t55 = t55a + t54a;
+    int t56 = t56a + t57a;
+    int t57 = t56a - t57a;
+    int t58 = t59a - t58a;
+    int t59 = t59a + t58a;
+    int t60 = t60a + t61a;
+    int t61 = t60a - t61a;
+    int t62 = t63a - t62a;
+    int t63 = t63a + t62a;
 
     t33a = (t33 * -4076 + t62 *   401 + 2048) >> 12;
     t34a = (t34 * - 401 + t61 * -4076 + 2048) >> 12;
     t37a = (t37 * -2598 + t58 *  3166 + 2048) >> 12;
     t38a = (t38 * -3166 + t57 * -2598 + 2048) >> 12;
     t41a = (t41 * -3612 + t54 *  1931 + 2048) >> 12;
     t42a = (t42 * -1931 + t53 * -3612 + 2048) >> 12;
     t45a = (t45 * -1189 + t50 *  3920 + 2048) >> 12;
@@ -408,48 +401,48 @@ inv_dct64_1d(const coef *const in, const
     t50a = (t45 *  3920 + t50 *  1189 + 2048) >> 12;
     t53a = (t42 * -3612 + t53 *  1931 + 2048) >> 12;
     t54a = (t41 *  1931 + t54 *  3612 + 2048) >> 12;
     t57a = (t38 * -2598 + t57 *  3166 + 2048) >> 12;
     t58a = (t37 *  3166 + t58 *  2598 + 2048) >> 12;
     t61a = (t34 * -4076 + t61 *   401 + 2048) >> 12;
     t62a = (t33 *   401 + t62 *  4076 + 2048) >> 12;
 
-    t32a = CLIP(t32  + t35);
-    t33  = CLIP(t33a + t34a);
-    t34  = CLIP(t33a - t34a);
-    t35a = CLIP(t32  - t35);
-    t36a = CLIP(t39  - t36);
-    t37  = CLIP(t38a - t37a);
-    t38  = CLIP(t38a + t37a);
-    t39a = CLIP(t39  + t36);
-    t40a = CLIP(t40  + t43);
-    t41  = CLIP(t41a + t42a);
-    t42  = CLIP(t41a - t42a);
-    t43a = CLIP(t40  - t43);
-    t44a = CLIP(t47  - t44);
-    t45  = CLIP(t46a - t45a);
-    t46  = CLIP(t46a + t45a);
-    t47a = CLIP(t47  + t44);
-    t48a = CLIP(t48  + t51);
-    t49  = CLIP(t49a + t50a);
-    t50  = CLIP(t49a - t50a);
-    t51a = CLIP(t48  - t51);
-    t52a = CLIP(t55  - t52);
-    t53  = CLIP(t54a - t53a);
-    t54  = CLIP(t54a + t53a);
-    t55a = CLIP(t55  + t52);
-    t56a = CLIP(t56  + t59);
-    t57  = CLIP(t57a + t58a);
-    t58  = CLIP(t57a - t58a);
-    t59a = CLIP(t56  - t59);
-    t60a = CLIP(t63  - t60);
-    t61  = CLIP(t62a - t61a);
-    t62  = CLIP(t62a + t61a);
-    t63a = CLIP(t63  + t60);
+    t32a = t32  + t35;
+    t33  = t33a + t34a;
+    t34  = t33a - t34a;
+    t35a = t32  - t35;
+    t36a = t39  - t36;
+    t37  = t38a - t37a;
+    t38  = t38a + t37a;
+    t39a = t39  + t36;
+    t40a = t40  + t43;
+    t41  = t41a + t42a;
+    t42  = t41a - t42a;
+    t43a = t40  - t43;
+    t44a = t47  - t44;
+    t45  = t46a - t45a;
+    t46  = t46a + t45a;
+    t47a = t47  + t44;
+    t48a = t48  + t51;
+    t49  = t49a + t50a;
+    t50  = t49a - t50a;
+    t51a = t48  - t51;
+    t52a = t55  - t52;
+    t53  = t54a - t53a;
+    t54  = t54a + t53a;
+    t55a = t55  + t52;
+    t56a = t56  + t59;
+    t57  = t57a + t58a;
+    t58  = t57a - t58a;
+    t59a = t56  - t59;
+    t60a = t63  - t60;
+    t61  = t62a - t61a;
+    t62  = t62a + t61a;
+    t63a = t63  + t60;
 
     t34a = (t34  * -4017 + t61  *   799 + 2048) >> 12;
     t35  = (t35a * -4017 + t60a *   799 + 2048) >> 12;
     t36  = (t36a * - 799 + t59a * -4017 + 2048) >> 12;
     t37a = (t37  * - 799 + t58  * -4017 + 2048) >> 12;
     t42a = (t42  * -2276 + t53  *  3406 + 2048) >> 12;
     t43  = (t43a * -2276 + t52a *  3406 + 2048) >> 12;
     t44  = (t44a * -3406 + t51a * -2276 + 2048) >> 12;
@@ -458,48 +451,48 @@ inv_dct64_1d(const coef *const in, const
     t51  = (t44a * -2276 + t51a *  3406 + 2048) >> 12;
     t52  = (t43a *  3406 + t52a *  2276 + 2048) >> 12;
     t53a = (t42  *  3406 + t53  *  2276 + 2048) >> 12;
     t58a = (t37  * -4017 + t58  *   799 + 2048) >> 12;
     t59  = (t36a * -4017 + t59a *   799 + 2048) >> 12;
     t60  = (t35a *   799 + t60a *  4017 + 2048) >> 12;
     t61a = (t34  *   799 + t61  *  4017 + 2048) >> 12;
 
-    t32  = CLIP(t32a + t39a);
-    t33a = CLIP(t33  + t38);
-    t34  = CLIP(t34a + t37a);
-    t35a = CLIP(t35  + t36);
-    t36a = CLIP(t35  - t36);
-    t37  = CLIP(t34a - t37a);
-    t38a = CLIP(t33  - t38);
-    t39  = CLIP(t32a - t39a);
-    t40  = CLIP(t47a - t40a);
-    t41a = CLIP(t46  - t41);
-    t42  = CLIP(t45a - t42a);
-    t43a = CLIP(t44  - t43);
-    t44a = CLIP(t44  + t43);
-    t45  = CLIP(t45a + t42a);
-    t46a = CLIP(t46  + t41);
-    t47  = CLIP(t47a + t40a);
-    t48  = CLIP(t48a + t55a);
-    t49a = CLIP(t49  + t54);
-    t50  = CLIP(t50a + t53a);
-    t51a = CLIP(t51  + t52);
-    t52a = CLIP(t51  - t52);
-    t53  = CLIP(t50a - t53a);
-    t54a = CLIP(t49  - t54);
-    t55  = CLIP(t48a - t55a);
-    t56  = CLIP(t63a - t56a);
-    t57a = CLIP(t62  - t57);
-    t58  = CLIP(t61a - t58a);
-    t59a = CLIP(t60  - t59);
-    t60a = CLIP(t60  + t59);
-    t61  = CLIP(t61a + t58a);
-    t62a = CLIP(t62  + t57);
-    t63  = CLIP(t63a + t56a);
+    t32  = t32a + t39a;
+    t33a = t33  + t38;
+    t34  = t34a + t37a;
+    t35a = t35  + t36;
+    t36a = t35  - t36;
+    t37  = t34a - t37a;
+    t38a = t33  - t38;
+    t39  = t32a - t39a;
+    t40  = t47a - t40a;
+    t41a = t46  - t41;
+    t42  = t45a - t42a;
+    t43a = t44  - t43;
+    t44a = t44  + t43;
+    t45  = t45a + t42a;
+    t46a = t46  + t41;
+    t47  = t47a + t40a;
+    t48  = t48a + t55a;
+    t49a = t49  + t54;
+    t50  = t50a + t53a;
+    t51a = t51  + t52;
+    t52a = t51  - t52;
+    t53  = t50a - t53a;
+    t54a = t49  - t54;
+    t55  = t48a - t55a;
+    t56  = t63a - t56a;
+    t57a = t62  - t57;
+    t58  = t61a - t58a;
+    t59a = t60  - t59;
+    t60a = t60  + t59;
+    t61  = t61a + t58a;
+    t62a = t62  + t57;
+    t63  = t63a + t56a;
 
     t36  = (t36a * -3784 + t59a *  1567 + 2048) >> 12;
     t37a = (t37  * -3784 + t58  *  1567 + 2048) >> 12;
     t38  = (t38a * -3784 + t57a *  1567 + 2048) >> 12;
     t39a = (t39  * -3784 + t56  *  1567 + 2048) >> 12;
     t40a = (t40  * -1567 + t55  * -3784 + 2048) >> 12;
     t41  = (t41a * -1567 + t54a * -3784 + 2048) >> 12;
     t42a = (t42  * -1567 + t53  * -3784 + 2048) >> 12;
@@ -508,48 +501,48 @@ inv_dct64_1d(const coef *const in, const
     t53a = (t42  * -3784 + t53  *  1567 + 2048) >> 12;
     t54  = (t41a * -3784 + t54a *  1567 + 2048) >> 12;
     t55a = (t40  * -3784 + t55  *  1567 + 2048) >> 12;
     t56a = (t39  *  1567 + t56  *  3784 + 2048) >> 12;
     t57  = (t38a *  1567 + t57a *  3784 + 2048) >> 12;
     t58a = (t37  *  1567 + t58  *  3784 + 2048) >> 12;
     t59  = (t36a *  1567 + t59a *  3784 + 2048) >> 12;
 
-    t32a = CLIP(t32  + t47);
-    t33  = CLIP(t33a + t46a);
-    t34a = CLIP(t34  + t45);
-    t35  = CLIP(t35a + t44a);
-    t36a = CLIP(t36  + t43);
-    t37  = CLIP(t37a + t42a);
-    t38a = CLIP(t38  + t41);
-    t39  = CLIP(t39a + t40a);
-    t40  = CLIP(t39a - t40a);
-    t41a = CLIP(t38  - t41);
-    t42  = CLIP(t37a - t42a);
-    t43a = CLIP(t36  - t43);
-    t44  = CLIP(t35a - t44a);
-    t45a = CLIP(t34  - t45);
-    t46  = CLIP(t33a - t46a);
-    t47a = CLIP(t32  - t47);
-    t48a = CLIP(t63  - t48);
-    t49  = CLIP(t62a - t49a);
-    t50a = CLIP(t61  - t50);
-    t51  = CLIP(t60a - t51a);
-    t52a = CLIP(t59  - t52);
-    t53  = CLIP(t58a - t53a);
-    t54a = CLIP(t57  - t54);
-    t55  = CLIP(t56a - t55a);
-    t56  = CLIP(t56a + t55a);
-    t57a = CLIP(t57  + t54);
-    t58  = CLIP(t58a + t53a);
-    t59a = CLIP(t59  + t52);
-    t60  = CLIP(t60a + t51a);
-    t61a = CLIP(t61  + t50);
-    t62  = CLIP(t62a + t49a);
-    t63a = CLIP(t63  + t48);
+    t32a = t32  + t47;
+    t33  = t33a + t46a;
+    t34a = t34  + t45;
+    t35  = t35a + t44a;
+    t36a = t36  + t43;
+    t37  = t37a + t42a;
+    t38a = t38  + t41;
+    t39  = t39a + t40a;
+    t40  = t39a - t40a;
+    t41a = t38  - t41;
+    t42  = t37a - t42a;
+    t43a = t36  - t43;
+    t44  = t35a - t44a;
+    t45a = t34  - t45;
+    t46  = t33a - t46a;
+    t47a = t32  - t47;
+    t48a = t63  - t48;
+    t49  = t62a - t49a;
+    t50a = t61  - t50;
+    t51  = t60a - t51a;
+    t52a = t59  - t52;
+    t53  = t58a - t53a;
+    t54a = t57  - t54;
+    t55  = t56a - t55a;
+    t56  = t56a + t55a;
+    t57a = t57  + t54;
+    t58  = t58a + t53a;
+    t59a = t59  + t52;
+    t60  = t60a + t51a;
+    t61a = t61  + t50;
+    t62  = t62a + t49a;
+    t63a = t63  + t48;
 
     t40a = (t40  * -2896 + t55  * 2896 + 2048) >> 12;
     t41  = (t41a * -2896 + t54a * 2896 + 2048) >> 12;
     t42a = (t42  * -2896 + t53  * 2896 + 2048) >> 12;
     t43  = (t43a * -2896 + t52a * 2896 + 2048) >> 12;
     t44a = (t44  * -2896 + t51  * 2896 + 2048) >> 12;
     t45  = (t45a * -2896 + t50a * 2896 + 2048) >> 12;
     t46a = (t46  * -2896 + t49  * 2896 + 2048) >> 12;
@@ -558,85 +551,85 @@ inv_dct64_1d(const coef *const in, const
     t49a = (t46  *  2896 + t49  * 2896 + 2048) >> 12;
     t50  = (t45a *  2896 + t50a * 2896 + 2048) >> 12;
     t51a = (t44  *  2896 + t51  * 2896 + 2048) >> 12;
     t52  = (t43a *  2896 + t52a * 2896 + 2048) >> 12;
     t53a = (t42  *  2896 + t53  * 2896 + 2048) >> 12;
     t54  = (t41a *  2896 + t54a * 2896 + 2048) >> 12;
     t55a = (t40  *  2896 + t55  * 2896 + 2048) >> 12;
 
-    out[ 0 * out_s] = CLIP(tmp[ 0] + t63a);
-    out[ 1 * out_s] = CLIP(tmp[ 1] + t62);
-    out[ 2 * out_s] = CLIP(tmp[ 2] + t61a);
-    out[ 3 * out_s] = CLIP(tmp[ 3] + t60);
-    out[ 4 * out_s] = CLIP(tmp[ 4] + t59a);
-    out[ 5 * out_s] = CLIP(tmp[ 5] + t58);
-    out[ 6 * out_s] = CLIP(tmp[ 6] + t57a);
-    out[ 7 * out_s] = CLIP(tmp[ 7] + t56);
-    out[ 8 * out_s] = CLIP(tmp[ 8] + t55a);
-    out[ 9 * out_s] = CLIP(tmp[ 9] + t54);
-    out[10 * out_s] = CLIP(tmp[10] + t53a);
-    out[11 * out_s] = CLIP(tmp[11] + t5