Merge mozilla-central to mozilla-inbound. a=merge on a CLOSED TREE
author Razvan Maries <rmaries@mozilla.com>
Wed, 16 Jan 2019 19:09:08 +0200
changeset 514191 3c6af4081f3a1494e47daed014cfc65d784cc208
parent 514190 138110dd654b811c6e0e8e2873f59740f23dd932 (current diff)
parent 514076 1312db5d495953cc6e18b16d082d06d611a21166 (diff)
child 514192 adb2bf52cba2bd1c0093396a6b7d32fced5e6066
push id 1953
push user ffxbld-merge
push date Mon, 11 Mar 2019 12:10:20 +0000
treeherder mozilla-release@9c35dcbaa899 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers merge
milestone 66.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Merge mozilla-central to mozilla-inbound. a=merge on a CLOSED TREE
js/src/jit/arm64/Assembler-arm64.h
js/src/jit/arm64/MacroAssembler-arm64.cpp
--- a/accessible/ipc/win/HandlerProvider.cpp
+++ b/accessible/ipc/win/HandlerProvider.cpp
@@ -130,18 +130,18 @@ void HandlerProvider::GetAndSerializePay
 
 HRESULT
 HandlerProvider::GetHandlerPayloadSize(
     NotNull<mscom::IInterceptor*> aInterceptor,
     NotNull<DWORD*> aOutPayloadSize) {
   MOZ_ASSERT(mscom::IsCurrentThreadMTA());
 
   if (!IsTargetInterfaceCacheable()) {
-    *aOutPayloadSize = mscom::StructToStream::GetEmptySize();
-    return S_OK;
+    // No handler, so no payload for this instance.
+    return E_NOTIMPL;
   }
 
   MutexAutoLock lock(mMutex);
 
   GetAndSerializePayload(lock, aInterceptor);
 
   if (!mSerializer || !(*mSerializer)) {
     // Failed payload serialization is non-fatal
@@ -373,16 +373,21 @@ void HandlerProvider::BuildInitialIA2Dat
 bool HandlerProvider::IsTargetInterfaceCacheable() {
   return MarshalAs(mTargetUnkIid) == NEWEST_IA2_IID ||
          mTargetUnkIid == IID_IAccessibleHyperlink;
 }
 
 HRESULT
 HandlerProvider::WriteHandlerPayload(NotNull<mscom::IInterceptor*> aInterceptor,
                                      NotNull<IStream*> aStream) {
+  if (!IsTargetInterfaceCacheable()) {
+    // No handler, so no payload for this instance.
+    return E_NOTIMPL;
+  }
+
   MutexAutoLock lock(mMutex);
 
   if (!mSerializer || !(*mSerializer)) {
     // Failed payload serialization is non-fatal
     mscom::StructToStream emptyStruct;
     return emptyStruct.Write(aStream);
   }
 
@@ -430,16 +435,23 @@ HandlerProvider::GetEffectiveOutParamIid
   // IAccessible2_2::accessibleWithCaret
   static_assert(&NEWEST_IA2_IID == &IID_IAccessible2_3,
                 "You have modified NEWEST_IA2_IID. This code needs updating.");
   if ((aCallIid == IID_IAccessible2_2 || aCallIid == IID_IAccessible2_3) &&
       aCallMethod == 47) {
     return NEWEST_IA2_IID;
   }
 
+  // IAccessible::get_accSelection
+  if ((aCallIid == IID_IAccessible || aCallIid == IID_IAccessible2 ||
+       aCallIid == IID_IAccessible2_2 || aCallIid == IID_IAccessible2_3) &&
+      aCallMethod == 19) {
+    return IID_IEnumVARIANT;
+  }
+
   MOZ_ASSERT(false);
   return IID_IUnknown;
 }
 
 HRESULT
 HandlerProvider::NewInstance(
     REFIID aIid, mscom::InterceptorTargetPtr<IUnknown> aTarget,
     NotNull<mscom::IHandlerProvider**> aOutNewPayload) {
--- a/browser/base/content/browser.js
+++ b/browser/base/content/browser.js
@@ -4150,20 +4150,18 @@ const BrowserSearch = {
    *
    * @param engine
    *        (nsISearchEngine) The engine handling the search.
    * @param source
    *        (string) Where the search originated from. See BrowserUsageTelemetry for
    *        allowed values.
    * @param type
    *        (string) Indicates how the user selected the search item.
-   * @param where
-   *        (string) Where was the search link opened (e.g. new tab, current tab, ..).
    */
-  recordOneoffSearchInTelemetry(engine, source, type, where) {
+  recordOneoffSearchInTelemetry(engine, source, type) {
     try {
       const details = {type, isOneOff: true};
       BrowserUsageTelemetry.recordSearch(gBrowser, engine, source, details);
     } catch (ex) {
       Cu.reportError(ex);
     }
   },
 };
--- a/browser/base/content/urlbarBindings.xml
+++ b/browser/base/content/urlbarBindings.xml
@@ -694,18 +694,17 @@ file, You can obtain one at http://mozil
           let action = this._parseActionUrl(url);
 
           if (selectedOneOff && selectedOneOff.engine) {
             // If there's a selected one-off button then load a search using
             // the one-off's engine.
             [url, postData] =
               this._parseAndRecordSearchEngineLoad(selectedOneOff.engine,
                                                    this.oneOffSearchQuery,
-                                                   event, where,
-                                                   openUILinkParams);
+                                                   event);
           } else if (action) {
             switch (action.type) {
               case "visiturl":
                 // Unifiedcomplete uses fixupURI to tell if something is a visit
                 // or a search, and passes out the fixedURI as the url param.
                 // By using that uri we would end up passing a different string
                 // to the docshell that may run a different not-found heuristic.
                 // For example, "mozilla/run" would be fixed by unifiedcomplete
@@ -771,18 +770,16 @@ file, You can obtain one at http://mozil
                 const actionDetails = {
                   isSuggestion: !!action.params.searchSuggestion,
                   alias: action.params.alias,
                 };
                 [url, postData] = this._parseAndRecordSearchEngineLoad(
                   action.params.engineName,
                   action.params.searchSuggestion || action.params.searchQuery,
                   event,
-                  where,
-                  openUILinkParams,
                   actionDetails
                 );
                 break;
               case "extension":
                 this.handleRevert();
                 // Give the extension control of handling the command.
                 let searchString = action.params.content;
                 let keyword = action.params.keyword;
@@ -899,26 +896,24 @@ file, You can obtain one at http://mozil
           this.selectionStart = this.selectionEnd = 0;
         ]]></body>
       </method>
 
       <method name="_parseAndRecordSearchEngineLoad">
         <parameter name="engineOrEngineName"/>
         <parameter name="query"/>
         <parameter name="event"/>
-        <parameter name="openUILinkWhere"/>
-        <parameter name="openUILinkParams"/>
         <parameter name="searchActionDetails"/>
         <body><![CDATA[
           let engine =
             typeof(engineOrEngineName) == "string" ?
               Services.search.getEngineByName(engineOrEngineName) :
               engineOrEngineName;
           let isOneOff = this.popup.oneOffSearchButtons
-              .maybeRecordTelemetry(event, openUILinkWhere, openUILinkParams);
+              .maybeRecordTelemetry(event);
           // Infer the type of the event which triggered the search.
           let eventType = "unknown";
           if (event instanceof KeyboardEvent) {
             eventType = "key";
           } else if (event instanceof MouseEvent) {
             eventType = "mouse";
           }
           // Augment the search action details object.
--- a/browser/components/controlcenter/content/panel.inc.xul
+++ b/browser/components/controlcenter/content/panel.inc.xul
@@ -59,16 +59,17 @@
             when-connection="not-secure secure secure-ev secure-cert-user-overridden extension">
         <vbox id="identity-popup-content-blocking-content" flex="1">
           <hbox align="start">
             <label id="content-blocking-label"
                    class="identity-popup-headline"
                    flex="1">&contentBlocking.title;</label>
             <toolbarbutton id="tracking-protection-preferences-button"
                            class="identity-popup-preferences-button subviewbutton"
+                           tooltiptext="&identity.contentBlockingPreferences.tooltip;"
                            oncommand="ContentBlocking.openPreferences('identityPopup-TP-preferencesButton'); gIdentityHandler.recordClick('cb_prefs_button');" />
           </hbox>
 
           <description id="identity-popup-content-blocking-detected"
                        crop="end">&contentBlocking.detected;</description>
           <description id="identity-popup-content-blocking-not-detected"
                        crop="end">&contentBlocking.notDetected;</description>
 
--- a/browser/components/search/content/search-one-offs.js
+++ b/browser/components/search/content/search-one-offs.js
@@ -982,23 +982,19 @@ class SearchOneOffs {
 
   /**
    * If the given event is related to the one-offs, this method records
    * one-off telemetry for it.  this.telemetryOrigin will be appended to the
    * computed source, so make sure you set that first.
    *
    * @param {Event} aEvent
    *        An event, like a click on a one-off button.
-   * @param {string} aOpenUILinkWhere
-   *        The "where" passed to openUILink.
-   * @param {object} aOpenUILinkParams
-   *        The "params" passed to openUILink.
    * @returns {boolean} True if telemetry was recorded and false if not.
    */
-  maybeRecordTelemetry(aEvent, aOpenUILinkWhere, aOpenUILinkParams) {
+  maybeRecordTelemetry(aEvent) {
     if (!aEvent) {
       return false;
     }
 
     let source = null;
     let type = "unknown";
     let engine = null;
     let target = aEvent.originalTarget;
@@ -1024,21 +1020,17 @@ class SearchOneOffs {
     if (!source) {
       return false;
     }
 
     if (this.telemetryOrigin) {
       source += "-" + this.telemetryOrigin;
     }
 
-    let tabBackground = aOpenUILinkWhere == "tab" &&
-      aOpenUILinkParams &&
-      aOpenUILinkParams.inBackground;
-    let where = tabBackground ? "tab-background" : aOpenUILinkWhere;
-    BrowserSearch.recordOneoffSearchInTelemetry(engine, source, type, where);
+    BrowserSearch.recordOneoffSearchInTelemetry(engine, source, type);
     return true;
   }
 
   _resetAddEngineMenuTimeout() {
     if (this._addEngineMenuTimeout) {
       clearTimeout(this._addEngineMenuTimeout);
     }
     this._addEngineMenuTimeout = setTimeout(() => {
--- a/browser/components/search/content/searchbar.js
+++ b/browser/components/search/content/searchbar.js
@@ -299,17 +299,17 @@ class MozSearchbar extends MozXULElement
 
     BrowserUsageTelemetry.recordSearchbarSelectedResultMethod(
       aEvent,
       selection ? selection.index : -1
     );
 
     if (!selection || (selection.index == -1)) {
       oneOffRecorded = this.textbox.popup.oneOffButtons
-        .maybeRecordTelemetry(aEvent, aWhere, aParams);
+        .maybeRecordTelemetry(aEvent);
       if (!oneOffRecorded) {
         let source = "unknown";
         let type = "unknown";
         let target = aEvent.originalTarget;
         if (aEvent instanceof KeyboardEvent) {
           type = "key";
         } else if (aEvent instanceof MouseEvent) {
           type = "mouse";
@@ -320,18 +320,17 @@ class MozSearchbar extends MozXULElement
         } else if (aEvent instanceof XULCommandEvent) {
           if (target.getAttribute("anonid") == "paste-and-search") {
             source = "paste";
           }
         }
         if (!aEngine) {
           aEngine = this.currentEngine;
         }
-        BrowserSearch.recordOneoffSearchInTelemetry(aEngine, source, type,
-          aWhere);
+        BrowserSearch.recordOneoffSearchInTelemetry(aEngine, source, type);
       }
     }
 
     // This is a one-off search only if oneOffRecorded is true.
     this.doSearch(textValue, aWhere, aEngine, aParams, oneOffRecorded);
 
     if (aWhere == "tab" && aParams && aParams.inBackground)
       this.focus();
--- a/browser/components/urlbar/UrlbarView.jsm
+++ b/browser/components/urlbar/UrlbarView.jsm
@@ -24,17 +24,17 @@ class UrlbarView {
     this.panel = input.panel;
     this.controller = input.controller;
     this.document = this.panel.ownerDocument;
     this.window = this.document.defaultView;
 
     this._mainContainer = this.panel.querySelector(".urlbarView-body-inner");
     this._rows = this.panel.querySelector(".urlbarView-results");
 
-    this._rows.addEventListener("click", this);
+    this._rows.addEventListener("mouseup", this);
 
     // For the horizontal fade-out effect, set the overflow attribute on result
     // rows when they overflow.
     this._rows.addEventListener("overflow", this);
     this._rows.addEventListener("underflow", this);
 
     this.controller.setView(this);
     this.controller.addQueryListener(this);
@@ -341,17 +341,22 @@ class UrlbarView {
     let methodName = "_on_" + event.type;
     if (methodName in this) {
       this[methodName](event);
     } else {
       throw new Error("Unrecognized UrlbarView event: " + event.type);
     }
   }
 
-  _on_click(event) {
+  _on_mouseup(event) {
+    if (event.button == 2) {
+      // Ignore right clicks.
+      return;
+    }
+
     let row = event.target;
     while (!row.classList.contains("urlbarView-row")) {
       row = row.parentNode;
     }
     let resultIndex = row.getAttribute("resultIndex");
     let result = this._queryContext.results[resultIndex];
     if (result) {
       this.input.pickResult(event, result);
--- a/browser/locales/en-US/chrome/browser/browser.dtd
+++ b/browser/locales/en-US/chrome/browser/browser.dtd
@@ -873,16 +873,18 @@ you can use these alternative items. Oth
 <!ENTITY identity.moreInfoLinkText2 "More Information">
 
 <!ENTITY identity.clearSiteData "Clear Cookies and Site Data…">
 
 <!ENTITY identity.permissions "Permissions">
 <!ENTITY identity.permissionsEmpty "You have not granted this site any special permissions.">
 <!ENTITY identity.permissionsReloadHint "You may need to reload the page for changes to apply.">
 <!ENTITY identity.permissionsPreferences.tooltip "Open Permissions Preferences">
+<!ENTITY identity.contentBlockingPreferences.tooltip "Open Content Blocking Preferences">
+
 
 <!-- Name for the tabs toolbar as spoken by screen readers.
      The word "toolbar" is appended automatically and should not be contained below! -->
 <!ENTITY tabsToolbar.label "Browser tabs">
 
 <!-- LOCALIZATION NOTE (syncTabsMenu3.label): This appears in the history menu -->
 <!ENTITY syncTabsMenu3.label     "Synced Tabs">
 
--- a/dom/base/DocumentInlines.h
+++ b/dom/base/DocumentInlines.h
@@ -14,18 +14,18 @@
 namespace mozilla {
 namespace dom {
 
 inline HTMLBodyElement* Document::GetBodyElement() {
   return static_cast<HTMLBodyElement*>(GetHtmlChildElement(nsGkAtoms::body));
 }
 
 template <typename T>
-size_t Document::FindDocStyleSheetInsertionPoint(
-    const nsTArray<T>& aDocSheets, const StyleSheet& aSheet) {
+size_t Document::FindDocStyleSheetInsertionPoint(const nsTArray<T>& aDocSheets,
+                                                 const StyleSheet& aSheet) {
   nsStyleSheetService* sheetService = nsStyleSheetService::GetInstance();
 
   // lowest index first
   int32_t newDocIndex = IndexOfSheet(aSheet);
 
   size_t count = aDocSheets.Length();
   size_t index = 0;
   for (; index < count; index++) {
--- a/dom/base/IdentifierMapEntry.h
+++ b/dom/base/IdentifierMapEntry.h
@@ -26,17 +26,17 @@
 class nsIContent;
 class nsContentList;
 class nsBaseContentList;
 
 namespace mozilla {
 namespace dom {
 class Document;
 class Element;
-}
+}  // namespace dom
 
 /**
  * Right now our identifier map entries contain information for 'name'
  * and 'id' mappings of a given string. This is so that
  * nsHTMLDocument::ResolveName only has to do one hash lookup instead
  * of two. It's not clear whether this still matters for performance.
  *
  * We also store the document.all result list here. This is mainly so that
--- a/dom/bindings/RemoteObjectProxy.h
+++ b/dom/bindings/RemoteObjectProxy.h
@@ -170,18 +170,18 @@ class RemoteObjectProxy : public RemoteO
                               const js::Class* aClasp) const {
     return RemoteObjectProxyBase::CreateProxyObject(aCx, aNative, aClasp);
   }
 
  protected:
   using RemoteObjectProxyBase::RemoteObjectProxyBase;
 
  private:
-  bool DefinePropertiesAndFunctions(
-      JSContext* aCx, JS::Handle<JSObject*> aHolder) const final {
+  bool DefinePropertiesAndFunctions(JSContext* aCx,
+                                    JS::Handle<JSObject*> aHolder) const final {
     return JS_DefineProperties(aCx, aHolder, P) &&
            JS_DefineFunctions(aCx, aHolder, F);
   }
 };
 
 /**
  * Returns true if aObj is a proxy object that represents an object implementing
  * the WebIDL interface for aProtoID.
--- a/dom/html/HTMLFormSubmission.cpp
+++ b/dom/html/HTMLFormSubmission.cpp
@@ -818,34 +818,34 @@ void GetEnumAttr(nsGenericHTMLElement* a
 
   nsresult rv;
 
   // Get action
   nsCOMPtr<nsIURI> actionURL;
   rv = aForm->GetActionURL(getter_AddRefs(actionURL), aOriginatingElement);
   NS_ENSURE_SUCCESS(rv, rv);
 
- // Check if CSP allows this form-action
- nsCOMPtr<nsIContentSecurityPolicy> csp;
- rv = aForm->NodePrincipal()->GetCsp(getter_AddRefs(csp));
- NS_ENSURE_SUCCESS(rv, rv);
- if (csp) {
-   bool permitsFormAction = true;
+  // Check if CSP allows this form-action
+  nsCOMPtr<nsIContentSecurityPolicy> csp;
+  rv = aForm->NodePrincipal()->GetCsp(getter_AddRefs(csp));
+  NS_ENSURE_SUCCESS(rv, rv);
+  if (csp) {
+    bool permitsFormAction = true;
 
-   // form-action is only enforced if explicitly defined in the
-   // policy - do *not* consult default-src, see:
-   // http://www.w3.org/TR/CSP2/#directive-default-src
-   rv = csp->Permits(aForm, nullptr /* nsICSPEventListener */, actionURL,
-                     nsIContentSecurityPolicy::FORM_ACTION_DIRECTIVE, true,
-                     &permitsFormAction);
-   NS_ENSURE_SUCCESS(rv, rv);
-   if (!permitsFormAction) {
-     return NS_ERROR_CSP_FORM_ACTION_VIOLATION;
-   }
- }
+    // form-action is only enforced if explicitly defined in the
+    // policy - do *not* consult default-src, see:
+    // http://www.w3.org/TR/CSP2/#directive-default-src
+    rv = csp->Permits(aForm, nullptr /* nsICSPEventListener */, actionURL,
+                      nsIContentSecurityPolicy::FORM_ACTION_DIRECTIVE, true,
+                      &permitsFormAction);
+    NS_ENSURE_SUCCESS(rv, rv);
+    if (!permitsFormAction) {
+      return NS_ERROR_CSP_FORM_ACTION_VIOLATION;
+    }
+  }
 
   // Get target
   // The target is the originating element formtarget attribute if the element
   // is a submit control and has such an attribute.
   // Otherwise, the target is the form owner's target attribute,
   // if it has such an attribute.
   // Finally, if one of the child nodes of the head element is a base element
   // with a target attribute, then the value of the target attribute of the
--- a/dom/html/HTMLInputElement.h
+++ b/dom/html/HTMLInputElement.h
@@ -745,17 +745,18 @@ class HTMLInputElement final : public ns
    * The following functions are called from datetime picker to let input box
    * know the current state of the picker or to update the input box on changes.
    */
   void GetDateTimeInputBoxValue(DateTimeValue& aValue);
 
   /*
    * This locates the inner datetimebox UA Widget element and only the
    * UA Widget
-   * element. This should fold into GetDateTimeBoxElement() when the XBL binding is removed.
+   * element. This should fold into GetDateTimeBoxElement() when the XBL binding
+   * is removed.
    */
   Element* GetDateTimeBoxElementInUAWidget();
 
   /*
    * This allows chrome JavaScript to dispatch event to the inner datetimebox
    * anonymous or UA Widget element and access nsIDateTimeInputArea
    * implementation.
    */
--- a/dom/indexedDB/ActorsParent.cpp
+++ b/dom/indexedDB/ActorsParent.cpp
@@ -28,16 +28,17 @@
 #include "mozilla/ErrorNames.h"
 #include "mozilla/LazyIdleThread.h"
 #include "mozilla/Maybe.h"
 #include "mozilla/Preferences.h"
 #include "mozilla/SnappyCompressOutputStream.h"
 #include "mozilla/SnappyUncompressInputStream.h"
 #include "mozilla/StaticPtr.h"
 #include "mozilla/storage.h"
+#include "mozilla/Telemetry.h"
 #include "mozilla/Unused.h"
 #include "mozilla/UniquePtrExtensions.h"
 #include "mozilla/dom/ContentParent.h"
 #include "mozilla/dom/File.h"
 #include "mozilla/dom/FileBlobImpl.h"
 #include "mozilla/dom/StructuredCloneTags.h"
 #include "mozilla/dom/TabParent.h"
 #include "mozilla/dom/filehandle/ActorsParent.h"
@@ -8324,16 +8325,27 @@ nsresult DeserializeStructuredCloneFile(
     id = text.ToInteger(&rv);
   }
   if (NS_WARN_IF(NS_FAILED(rv))) {
     return rv;
   }
 
   RefPtr<FileInfo> fileInfo = aFileManager->GetFileInfo(id);
   MOZ_ASSERT(fileInfo);
+  // XXX In bug 1432133, for some reasons FileInfo object cannot be got. This
+  // is just a short-term fix, and we are working on finding the real cause
+  // in bug 1519859.
+  if (!fileInfo) {
+    IDB_WARNING(
+        "Corrupt structured clone data detected in IndexedDB. Failing the "
+        "database request. Bug 1519859 will address this problem.");
+    Telemetry::ScalarAdd(Telemetry::ScalarID::IDB_FAILURE_FILEINFO_ERROR, 1);
+
+    return NS_ERROR_DOM_INDEXEDDB_UNKNOWN_ERR;
+  }
 
   aFile->mFileInfo.swap(fileInfo);
   aFile->mType = type;
 
   return NS_OK;
 }
 
 nsresult DeserializeStructuredCloneFiles(FileManager* aFileManager,
--- a/dom/ipc/MemoryReportRequest.cpp
+++ b/dom/ipc/MemoryReportRequest.cpp
@@ -49,18 +49,18 @@ MemoryReportRequestHost::~MemoryReportRe
 NS_IMPL_ISUPPORTS(MemoryReportRequestClient, nsIRunnable)
 
 /* static */ void MemoryReportRequestClient::Start(
     uint32_t aGeneration, bool aAnonymize, bool aMinimizeMemoryUsage,
     const MaybeFileDesc& aDMDFile, const nsACString& aProcessString,
     const ReportCallback& aReportCallback,
     const FinishCallback& aFinishCallback) {
   RefPtr<MemoryReportRequestClient> request = new MemoryReportRequestClient(
-      aGeneration, aAnonymize, aDMDFile, aProcessString,
-      aReportCallback, aFinishCallback);
+      aGeneration, aAnonymize, aDMDFile, aProcessString, aReportCallback,
+      aFinishCallback);
 
   DebugOnly<nsresult> rv;
   if (aMinimizeMemoryUsage) {
     nsCOMPtr<nsIMemoryReporterManager> mgr =
         do_GetService("@mozilla.org/memory-reporter-manager;1");
     rv = mgr->MinimizeMemoryUsage(request);
     // mgr will eventually call actor->Run()
   } else {
@@ -122,18 +122,17 @@ NS_IMPL_ISUPPORTS(HandleReportCallback, 
 class FinishReportingCallback final : public nsIFinishReportingCallback {
  public:
   using FinishCallback = typename MemoryReportRequestClient::FinishCallback;
 
   NS_DECL_ISUPPORTS
 
   explicit FinishReportingCallback(uint32_t aGeneration,
                                    const FinishCallback& aFinishCallback)
-      : mGeneration(aGeneration),
-        mFinishCallback(aFinishCallback) {}
+      : mGeneration(aGeneration), mFinishCallback(aFinishCallback) {}
 
   NS_IMETHOD Callback(nsISupports* aUnused) override {
     return mFinishCallback(mGeneration) ? NS_OK : NS_ERROR_FAILURE;
   }
 
  private:
   ~FinishReportingCallback() = default;
 
--- a/dom/media/MediaManager.cpp
+++ b/dom/media/MediaManager.cpp
@@ -4175,64 +4175,64 @@ SourceListener::InitializeAsync() {
                                   __func__);
                    return;
                  }
                }
 
                LOG("started all sources");
                aHolder.Resolve(true, __func__);
              })
-      ->Then(GetMainThreadSerialEventTarget(), __func__,
-             [self = RefPtr<SourceListener>(this), this]() {
-               if (mStopped) {
-                 // We were shut down during the async init
-                 return SourceListenerPromise::CreateAndResolve(true, __func__);
-               }
-
-               for (DeviceState* state :
-                    {mAudioDeviceState.get(), mVideoDeviceState.get()}) {
-                 if (!state) {
-                   continue;
-                 }
-                 MOZ_DIAGNOSTIC_ASSERT(!state->mTrackEnabled);
-                 MOZ_DIAGNOSTIC_ASSERT(!state->mDeviceEnabled);
-                 MOZ_DIAGNOSTIC_ASSERT(!state->mStopped);
-
-                 state->mDeviceEnabled = true;
-                 state->mTrackEnabled = true;
-                 state->mTrackEnabledTime = TimeStamp::Now();
-
-                 if (state == mVideoDeviceState.get() &&
-                     !mStream->IsDestroyed()) {
-                   mStream->SetPullingEnabled(kVideoTrack, true);
-                 }
-               }
-               return SourceListenerPromise::CreateAndResolve(true, __func__);
-             },
-             [self = RefPtr<SourceListener>(this),
-              this](RefPtr<MediaMgrError>&& aResult) {
-               if (mStopped) {
-                 return SourceListenerPromise::CreateAndReject(
-                     std::move(aResult), __func__);
-               }
-
-               for (DeviceState* state :
-                    {mAudioDeviceState.get(), mVideoDeviceState.get()}) {
-                 if (!state) {
-                   continue;
-                 }
-                 MOZ_DIAGNOSTIC_ASSERT(!state->mTrackEnabled);
-                 MOZ_DIAGNOSTIC_ASSERT(!state->mDeviceEnabled);
-                 MOZ_DIAGNOSTIC_ASSERT(!state->mStopped);
-
-                 state->mStopped = true;
-               }
-               return SourceListenerPromise::CreateAndReject(std::move(aResult),
-                                                             __func__);
-             });
+      ->Then(
+          GetMainThreadSerialEventTarget(), __func__,
+          [self = RefPtr<SourceListener>(this), this]() {
+            if (mStopped) {
+              // We were shut down during the async init
+              return SourceListenerPromise::CreateAndResolve(true, __func__);
+            }
+
+            for (DeviceState* state :
+                 {mAudioDeviceState.get(), mVideoDeviceState.get()}) {
+              if (!state) {
+                continue;
+              }
+              MOZ_DIAGNOSTIC_ASSERT(!state->mTrackEnabled);
+              MOZ_DIAGNOSTIC_ASSERT(!state->mDeviceEnabled);
+              MOZ_DIAGNOSTIC_ASSERT(!state->mStopped);
+
+              state->mDeviceEnabled = true;
+              state->mTrackEnabled = true;
+              state->mTrackEnabledTime = TimeStamp::Now();
+
+              if (state == mVideoDeviceState.get() && !mStream->IsDestroyed()) {
+                mStream->SetPullingEnabled(kVideoTrack, true);
+              }
+            }
+            return SourceListenerPromise::CreateAndResolve(true, __func__);
+          },
+          [self = RefPtr<SourceListener>(this),
+           this](RefPtr<MediaMgrError>&& aResult) {
+            if (mStopped) {
+              return SourceListenerPromise::CreateAndReject(std::move(aResult),
+                                                            __func__);
+            }
+
+            for (DeviceState* state :
+                 {mAudioDeviceState.get(), mVideoDeviceState.get()}) {
+              if (!state) {
+                continue;
+              }
+              MOZ_DIAGNOSTIC_ASSERT(!state->mTrackEnabled);
+              MOZ_DIAGNOSTIC_ASSERT(!state->mDeviceEnabled);
+              MOZ_DIAGNOSTIC_ASSERT(!state->mStopped);
+
+              state->mStopped = true;
+            }
+            return SourceListenerPromise::CreateAndReject(std::move(aResult),
+                                                          __func__);
+          });
 }
 
 void SourceListener::Stop() {
   MOZ_ASSERT(NS_IsMainThread(), "Only call on main thread");
 
   if (mStopped) {
     return;
   }
--- a/dom/media/gtest/mp4_demuxer/TestParser.cpp
+++ b/dom/media/gtest/mp4_demuxer/TestParser.cpp
@@ -89,17 +89,17 @@ TEST(MP4Metadata, EmptyStream) {
   // We can seek anywhere in any MPEG4.
   EXPECT_TRUE(metadata.CanSeek());
   EXPECT_FALSE(metadata.Crypto().Ref()->valid);
 }
 
 TEST(MoofParser, EmptyStream) {
   RefPtr<ByteStream> stream = new TestStream(nullptr, 0);
 
-  MoofParser parser(stream, 0, false);
+  MoofParser parser(stream, 0, false, true);
   EXPECT_EQ(0u, parser.mOffset);
   EXPECT_TRUE(parser.ReachedEnd());
 
   MediaByteRangeSet byteRanges;
   EXPECT_FALSE(parser.RebuildFragmentedIndex(byteRanges));
 
   EXPECT_TRUE(parser.GetCompositionRange(byteRanges).IsNull());
   EXPECT_TRUE(parser.mInitRange.IsEmpty());
@@ -399,17 +399,17 @@ TEST(MoofParser, test_case_mp4) {
   length = ArrayLength(testFiles);
 
   for (size_t test = 0; test < length; ++test) {
     nsTArray<uint8_t> buffer = ReadTestFile(tests[test].mFilename);
     ASSERT_FALSE(buffer.IsEmpty());
     RefPtr<ByteStream> stream =
         new TestStream(buffer.Elements(), buffer.Length());
 
-    MoofParser parser(stream, 0, false);
+    MoofParser parser(stream, 0, false, true);
     EXPECT_EQ(0u, parser.mOffset) << tests[test].mFilename;
     EXPECT_FALSE(parser.ReachedEnd()) << tests[test].mFilename;
     EXPECT_TRUE(parser.mInitRange.IsEmpty()) << tests[test].mFilename;
 
     RefPtr<MediaByteBuffer> metadataBuffer = parser.Metadata();
     EXPECT_TRUE(metadataBuffer) << tests[test].mFilename;
 
     EXPECT_FALSE(parser.mInitRange.IsEmpty()) << tests[test].mFilename;
--- a/dom/media/mediasource/ContainerParser.cpp
+++ b/dom/media/mediasource/ContainerParser.cpp
@@ -521,17 +521,18 @@ class MP4ContainerParser : public Contai
     if (initSegment) {
       mResource = new SourceBufferResource();
       DDLINKCHILD("resource", mResource.get());
       mStream = new MP4Stream(mResource);
       // We use a timestampOffset of 0 for ContainerParser, and require
       // consumers of ParseStartAndEndTimestamps to add their timestamp offset
       // manually. This allows the ContainerParser to be shared across different
       // timestampOffsets.
-      mParser = new MoofParser(mStream, 0, /* aIsAudio = */ false);
+      mParser = new MoofParser(mStream, 0, /* aIsAudio = */ false,
+                               /* aIsMultitrackParser */ true);
       DDLINKCHILD("parser", mParser.get());
       mInitData = new MediaByteBuffer();
       mCompleteInitSegmentRange = MediaByteRange();
       mCompleteMediaHeaderRange = MediaByteRange();
       mCompleteMediaSegmentRange = MediaByteRange();
       mGlobalOffset = mTotalParsed;
     } else if (!mStream || !mParser) {
       mTotalParsed += aData->Length();
--- a/dom/media/mp4/MP4Metadata.cpp
+++ b/dom/media/mp4/MP4Metadata.cpp
@@ -455,17 +455,17 @@ MP4Metadata::ResultAndIndice MP4Metadata
   UniquePtr<IndiceWrapper> indice;
   indice = mozilla::MakeUnique<IndiceWrapper>(indiceRawData);
 
   return {NS_OK, std::move(indice)};
 }
 
 /*static*/ MP4Metadata::ResultAndByteBuffer MP4Metadata::Metadata(
     ByteStream* aSource) {
-  auto parser = mozilla::MakeUnique<MoofParser>(aSource, 0, false);
+  auto parser = mozilla::MakeUnique<MoofParser>(aSource, 0, false, true);
   RefPtr<mozilla::MediaByteBuffer> buffer = parser->Metadata();
   if (!buffer) {
     return {MediaResult(NS_ERROR_DOM_MEDIA_METADATA_ERR,
                         RESULT_DETAIL("Cannot parse metadata")),
             nullptr};
   }
   return {NS_OK, std::move(buffer)};
 }
--- a/dom/media/mp4/MoofParser.cpp
+++ b/dom/media/mp4/MoofParser.cpp
@@ -50,17 +50,17 @@ bool MoofParser::RebuildFragmentedIndex(
   bool foundValidMoof = false;
 
   for (Box box(&aContext, mOffset); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("moov") && mInitRange.IsEmpty()) {
       mInitRange = MediaByteRange(0, box.Range().mEnd);
       ParseMoov(box);
     } else if (box.IsType("moof")) {
       Moof moof(box, mTrex, mMvhd, mMdhd, mEdts, mSinf, &mLastDecodeTime,
-                mIsAudio);
+                mIsAudio, mIsMultitrackParser);
 
       if (!moof.IsValid() && !box.Next().IsAvailable()) {
         // Moof isn't valid abort search for now.
         break;
       }
 
       if (!mMoofs.IsEmpty()) {
         // Stitch time ranges together in the case of a (hopefully small) time
@@ -222,21 +222,21 @@ void MoofParser::ParseMoov(Box& aBox) {
 }
 
 void MoofParser::ParseTrak(Box& aBox) {
   Tkhd tkhd;
   for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("tkhd")) {
       tkhd = Tkhd(box);
     } else if (box.IsType("mdia")) {
-      if (!mTrex.mTrackId || tkhd.mTrackId == mTrex.mTrackId) {
+      if (mIsMultitrackParser || tkhd.mTrackId == mTrex.mTrackId) {
         ParseMdia(box, tkhd);
       }
     } else if (box.IsType("edts") &&
-               (!mTrex.mTrackId || tkhd.mTrackId == mTrex.mTrackId)) {
+               (mIsMultitrackParser || tkhd.mTrackId == mTrex.mTrackId)) {
       mEdts = Edts(box);
     }
   }
 }
 
 void MoofParser::ParseMdia(Box& aBox, Tkhd& aTkhd) {
   for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("mdhd")) {
@@ -246,22 +246,18 @@ void MoofParser::ParseMdia(Box& aBox, Tk
     }
   }
 }
 
 void MoofParser::ParseMvex(Box& aBox) {
   for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("trex")) {
       Trex trex = Trex(box);
-      if (!mTrex.mTrackId || trex.mTrackId == mTrex.mTrackId) {
-        auto trackId = mTrex.mTrackId;
+      if (mIsMultitrackParser || trex.mTrackId == mTrex.mTrackId) {
         mTrex = trex;
-        // Keep the original trackId, as should it be 0 we want to continue
-        // parsing all tracks.
-        mTrex.mTrackId = trackId;
       }
     }
   }
 }
 
 void MoofParser::ParseMinf(Box& aBox) {
   for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("stbl")) {
@@ -294,18 +290,18 @@ void MoofParser::ParseStbl(Box& aBox) {
           return;
         }
       }
     }
   }
 }
 
 void MoofParser::ParseStsd(Box& aBox) {
-  if (mTrex.mTrackId == 0) {
-    // If mTrex.mTrackId is 0, then the parser is being used to read multiple
+  if (mIsMultitrackParser) {
+    // If mIsMultitrackParser, then the parser is being used to read multiple
     // tracks metadata, and it is not a sane operation to try and map multiple
     // sample description boxes, from different tracks, onto the parser, which
     // is modeled around storing metadata for a single track.
     return;
   }
   MOZ_ASSERT(
       mSampleDescriptions.IsEmpty(),
       "Shouldn't have any sample descriptions when starting to parse stsd");
@@ -348,22 +344,24 @@ class CtsComparator {
     return aA->mCompositionRange.start == aB->mCompositionRange.start;
   }
   bool LessThan(Sample* const aA, Sample* const aB) const {
     return aA->mCompositionRange.start < aB->mCompositionRange.start;
   }
 };
 
 Moof::Moof(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts,
-           Sinf& aSinf, uint64_t* aDecodeTime, bool aIsAudio)
+           Sinf& aSinf, uint64_t* aDecodeTime, bool aIsAudio,
+           bool aIsMultitrackParser)
     : mRange(aBox.Range()), mTfhd(aTrex), mMaxRoundingError(35000) {
   nsTArray<Box> psshBoxes;
   for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("traf")) {
-      ParseTraf(box, aTrex, aMvhd, aMdhd, aEdts, aSinf, aDecodeTime, aIsAudio);
+      ParseTraf(box, aTrex, aMvhd, aMdhd, aEdts, aSinf, aDecodeTime, aIsAudio,
+                aIsMultitrackParser);
     }
     if (box.IsType("pssh")) {
       psshBoxes.AppendElement(box);
     }
   }
 
   // The EME spec requires that PSSH boxes which are contiguous in the
   // file are dispatched to the media element in a single "encrypted" event.
@@ -501,24 +499,24 @@ bool Moof::ProcessCencAuxInfo(AtomType a
   for (int i = 0; i < cencRanges.Length(); i++) {
     mIndex[i].mCencRange = cencRanges[i];
   }
   return true;
 }
 
 void Moof::ParseTraf(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd,
                      Edts& aEdts, Sinf& aSinf, uint64_t* aDecodeTime,
-                     bool aIsAudio) {
+                     bool aIsAudio, bool aIsMultitrackParser) {
   MOZ_ASSERT(aDecodeTime);
   Tfdt tfdt;
 
   for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("tfhd")) {
       mTfhd = Tfhd(box, aTrex);
-    } else if (!aTrex.mTrackId || mTfhd.mTrackId == aTrex.mTrackId) {
+    } else if (aIsMultitrackParser || mTfhd.mTrackId == aTrex.mTrackId) {
       if (box.IsType("tfdt")) {
         tfdt = Tfdt(box);
       } else if (box.IsType("sgpd")) {
         Sgpd sgpd(box);
         if (sgpd.IsValid() && sgpd.mGroupingType == "seig") {
           mFragmentSampleEncryptionInfoEntries.Clear();
           if (!mFragmentSampleEncryptionInfoEntries.AppendElements(
                   sgpd.mEntries, mozilla::fallible)) {
@@ -546,17 +544,17 @@ void Moof::ParseTraf(Box& aBox, Trex& aT
         if (!mSaios.AppendElement(Saio(box, aSinf.mDefaultEncryptionType),
                                   mozilla::fallible)) {
           LOG(Moof, "OOM");
           return;
         }
       }
     }
   }
-  if (aTrex.mTrackId && mTfhd.mTrackId != aTrex.mTrackId) {
+  if (!aIsMultitrackParser && mTfhd.mTrackId != aTrex.mTrackId) {
     return;
   }
   // Now search for TRUN boxes.
   uint64_t decodeTime =
       tfdt.IsValid() ? tfdt.mBaseMediaDecodeTime : *aDecodeTime;
   for (Box box = aBox.FirstChild(); box.IsAvailable(); box = box.Next()) {
     if (box.IsType("trun")) {
       if (ParseTrun(box, aMvhd, aMdhd, aEdts, &decodeTime, aIsAudio).isOk()) {
--- a/dom/media/mp4/MoofParser.h
+++ b/dom/media/mp4/MoofParser.h
@@ -222,17 +222,18 @@ class Sgpd final : public Atom  // Sampl
 // these are encrypted when parsing the track fragment header (tfhd).
 struct SampleDescriptionEntry {
   bool mIsEncryptedEntry = false;
 };
 
 class Moof final : public Atom {
  public:
   Moof(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts,
-       Sinf& aSinf, uint64_t* aDecoderTime, bool aIsAudio);
+       Sinf& aSinf, uint64_t* aDecoderTime, bool aIsAudio,
+       bool aIsMultitrackParser);
   bool GetAuxInfo(AtomType aType, FallibleTArray<MediaByteRange>* aByteRanges);
   void FixRounding(const Moof& aMoof);
 
   mozilla::MediaByteRange mRange;
   mozilla::MediaByteRange mMdatRange;
   MP4Interval<Microseconds> mTimeRange;
   FallibleTArray<Sample> mIndex;
 
@@ -243,17 +244,18 @@ class Moof final : public Atom {
   Tfhd mTfhd;
   FallibleTArray<Saiz> mSaizs;
   FallibleTArray<Saio> mSaios;
   nsTArray<nsTArray<uint8_t>> mPsshes;
 
  private:
   // aDecodeTime is updated to the end of the parsed TRAF on return.
   void ParseTraf(Box& aBox, Trex& aTrex, Mvhd& aMvhd, Mdhd& aMdhd, Edts& aEdts,
-                 Sinf& aSinf, uint64_t* aDecodeTime, bool aIsAudio);
+                 Sinf& aSinf, uint64_t* aDecodeTime, bool aIsAudio,
+                 bool aIsMultitrackParser);
   // aDecodeTime is updated to the end of the parsed TRUN on return.
   Result<Ok, nsresult> ParseTrun(Box& aBox, Mvhd& aMvhd, Mdhd& aMdhd,
                                  Edts& aEdts, uint64_t* aDecodeTime,
                                  bool aIsAudio);
   // Process the sample auxiliary information used by common encryption.
   // aScheme is used to select the appropriate auxiliary information and should
   // be set based on the encryption scheme used by the track being processed.
   // Note, the term cenc here refers to the standard, not the specific scheme
@@ -262,24 +264,28 @@ class Moof final : public Atom {
   bool ProcessCencAuxInfo(AtomType aScheme);
   uint64_t mMaxRoundingError;
 };
 
 DDLoggedTypeDeclName(MoofParser);
 
 class MoofParser : public DecoderDoctorLifeLogger<MoofParser> {
  public:
-  MoofParser(ByteStream* aSource, uint32_t aTrackId, bool aIsAudio)
+  MoofParser(ByteStream* aSource, uint32_t aTrackId, bool aIsAudio,
+             bool aIsMultitrackParser = false)
       : mSource(aSource),
         mOffset(0),
         mTrex(aTrackId),
         mIsAudio(aIsAudio),
-        mLastDecodeTime(0) {
-    // Setting the mTrex.mTrackId to 0 is a nasty work around for calculating
-    // the composition range for MSE. We need an array of tracks.
+        mLastDecodeTime(0),
+        mIsMultitrackParser(aIsMultitrackParser) {
+    // Setting mIsMultitrackParser is a nasty work around for calculating
+    // the composition range for MSE that causes the parser to parse multiple
+    // tracks. Ideally we'd store an array of tracks with different metadata
+    // for each.
     DDLINKCHILD("source", aSource);
   }
   bool RebuildFragmentedIndex(const mozilla::MediaByteRangeSet& aByteRanges);
   // If *aCanEvict is set to true. then will remove all moofs already parsed
   // from index then rebuild the index. *aCanEvict is set to true upon return if
   // some moofs were removed.
   bool RebuildFragmentedIndex(const mozilla::MediaByteRangeSet& aByteRanges,
                               bool* aCanEvict);
@@ -321,12 +327,13 @@ class MoofParser : public DecoderDoctorL
   nsTArray<Moof>& Moofs() { return mMoofs; }
 
  private:
   void ScanForMetadata(mozilla::MediaByteRange& aMoov);
   nsTArray<Moof> mMoofs;
   nsTArray<MediaByteRange> mMediaRanges;
   bool mIsAudio;
   uint64_t mLastDecodeTime;
+  bool mIsMultitrackParser;
 };
 }  // namespace mozilla
 
 #endif
--- a/dom/media/webrtc/MediaEngineWebRTCAudio.cpp
+++ b/dom/media/webrtc/MediaEngineWebRTCAudio.cpp
@@ -600,30 +600,29 @@ nsresult MediaEngineWebRTCMicrophoneSour
   MOZ_ASSERT(mStream, "SetTrack must have been called before ::Stop");
 
   if (mState == kStopped) {
     // Already stopped - this is allowed
     return NS_OK;
   }
 
   RefPtr<MediaEngineWebRTCMicrophoneSource> that = this;
-  NS_DispatchToMainThread(
-      media::NewRunnableFrom([that, stream = mStream]() {
-        if (stream->IsDestroyed()) {
-          return NS_OK;
-        }
+  NS_DispatchToMainThread(media::NewRunnableFrom([that, stream = mStream]() {
+    if (stream->IsDestroyed()) {
+      return NS_OK;
+    }
 
-        stream->GraphImpl()->AppendMessage(MakeUnique<StartStopMessage>(
-            that->mInputProcessing, StartStopMessage::Stop));
-        CubebUtils::AudioDeviceID deviceID = that->mDeviceInfo->DeviceID();
-        Maybe<CubebUtils::AudioDeviceID> id = Some(deviceID);
-        stream->CloseAudioInput(id, that->mInputProcessing);
+    stream->GraphImpl()->AppendMessage(MakeUnique<StartStopMessage>(
+        that->mInputProcessing, StartStopMessage::Stop));
+    CubebUtils::AudioDeviceID deviceID = that->mDeviceInfo->DeviceID();
+    Maybe<CubebUtils::AudioDeviceID> id = Some(deviceID);
+    stream->CloseAudioInput(id, that->mInputProcessing);
 
-        return NS_OK;
-      }));
+    return NS_OK;
+  }));
 
   MOZ_ASSERT(mState == kStarted, "Should be started when stopping");
   mState = kStopped;
 
   return NS_OK;
 }
 
 void MediaEngineWebRTCMicrophoneSource::GetSettings(
--- a/dom/power/PowerManagerService.cpp
+++ b/dom/power/PowerManagerService.cpp
@@ -147,18 +147,19 @@ already_AddRefed<WakeLock> PowerManagerS
 }
 
 }  // namespace power
 }  // namespace dom
 }  // namespace mozilla
 
 NS_DEFINE_NAMED_CID(NS_POWERMANAGERSERVICE_CID);
 
-NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsIPowerManagerService,
-                                         mozilla::dom::power::PowerManagerService::GetInstance)
+NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(
+    nsIPowerManagerService,
+    mozilla::dom::power::PowerManagerService::GetInstance)
 
 static const mozilla::Module::CIDEntry kPowerManagerCIDs[] = {
     // clang-format off
   { &kNS_POWERMANAGERSERVICE_CID, false, nullptr, nsIPowerManagerServiceConstructor, mozilla::Module::ALLOW_IN_GPU_PROCESS },
   { nullptr }
     // clang-format on
 };
 
@@ -166,18 +167,19 @@ static const mozilla::Module::ContractID
     // clang-format off
   { POWERMANAGERSERVICE_CONTRACTID, &kNS_POWERMANAGERSERVICE_CID, mozilla::Module::ALLOW_IN_GPU_PROCESS },
   { nullptr }
     // clang-format on
 };
 
 // We mark the power module as being available in the GPU process because the
 // appshell depends on the power manager service.
-static const mozilla::Module kPowerManagerModule = {mozilla::Module::kVersion,
-                                                    kPowerManagerCIDs,
-                                                    kPowerManagerContracts,
-                                                    nullptr,
-                                                    nullptr,
-                                                    nullptr,
-                                                    nullptr,
-                                                    mozilla::Module::ALLOW_IN_GPU_PROCESS};
+static const mozilla::Module kPowerManagerModule = {
+    mozilla::Module::kVersion,
+    kPowerManagerCIDs,
+    kPowerManagerContracts,
+    nullptr,
+    nullptr,
+    nullptr,
+    nullptr,
+    mozilla::Module::ALLOW_IN_GPU_PROCESS};
 
 NSMODULE_DEFN(nsPowerManagerModule) = &kPowerManagerModule;
--- a/dom/script/LoadedScript.cpp
+++ b/dom/script/LoadedScript.cpp
@@ -35,19 +35,17 @@ NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END
 NS_IMPL_CYCLE_COLLECTION_TRACE_BEGIN(LoadedScript)
 NS_IMPL_CYCLE_COLLECTION_TRACE_END
 
 NS_IMPL_CYCLE_COLLECTING_ADDREF(LoadedScript)
 NS_IMPL_CYCLE_COLLECTING_RELEASE(LoadedScript)
 
 LoadedScript::LoadedScript(ScriptKind aKind, ScriptFetchOptions* aFetchOptions,
                            nsIURI* aBaseURL)
-    : mKind(aKind),
-      mFetchOptions(aFetchOptions),
-      mBaseURL(aBaseURL) {
+    : mKind(aKind), mFetchOptions(aFetchOptions), mBaseURL(aBaseURL) {
   MOZ_ASSERT(mFetchOptions);
   MOZ_ASSERT(mBaseURL);
 }
 
 LoadedScript::~LoadedScript() { DropJSObjects(this); }
 
 void LoadedScript::AssociateWithScript(JSScript* aScript) {
   // Set a JSScript's private value to point to this object and
--- a/dom/script/ScriptLoader.cpp
+++ b/dom/script/ScriptLoader.cpp
@@ -2296,19 +2296,18 @@ nsresult ScriptLoader::FillCompileOption
   }
 
   if (mDocument) {
     mDocument->NoteScriptTrackingStatus(aRequest->mURL, aRequest->IsTracking());
   }
 
   bool isScriptElement =
       !aRequest->IsModuleRequest() || aRequest->AsModuleRequest()->IsTopLevel();
-  aOptions->setIntroductionInfoToCaller(jsapi.cx(),
-                                        isScriptElement ? "scriptElement"
-                                                        : "importedModule");
+  aOptions->setIntroductionInfoToCaller(
+      jsapi.cx(), isScriptElement ? "scriptElement" : "importedModule");
   aOptions->setFileAndLine(aRequest->mURL.get(), aRequest->mLineNo);
   aOptions->setIsRunOnce(true);
   aOptions->setNoScriptRval(true);
   if (aRequest->mHasSourceMapURL) {
     aOptions->setSourceMapURL(aRequest->mSourceMapURL.get());
   }
   if (aRequest->mOriginPrincipal) {
     nsIPrincipal* scriptPrin = nsContentUtils::ObjectPrincipal(aScopeChain);
--- a/dom/smil/SMILKeySpline.h
+++ b/dom/smil/SMILKeySpline.h
@@ -22,20 +22,20 @@ class SMILKeySpline {
   }
 
     /**
      * Creates a new key spline control point description.
      *
      * aX1, etc. are the x1, y1, x2, y2 cubic Bezier control points as defined
      * by SMILANIM 3.2.3. They must each be in the range 0.0 <= x <= 1.0
      */
-  SMILKeySpline(double aX1, double aY1, double aX2, double aY2)
-      : mX1(0), mY1(0), mX2(0), mY2(0) {
-    Init(aX1, aY1, aX2, aY2);
-  }
+    SMILKeySpline(double aX1, double aY1, double aX2, double aY2)
+        : mX1(0), mY1(0), mX2(0), mY2(0) {
+      Init(aX1, aY1, aX2, aY2);
+    }
 
     double X1() const { return mX1; }
     double Y1() const { return mY1; }
     double X2() const { return mX2; }
     double Y2() const { return mY2; }
 
     void Init(double aX1, double aY1, double aX2, double aY2);
 
--- a/dom/svg/SVGContentUtils.h
+++ b/dom/svg/SVGContentUtils.h
@@ -186,18 +186,17 @@ class SVGContentUtils {
    */
   static float GetFontXHeight(mozilla::dom::Element* aElement);
   static float GetFontXHeight(nsIFrame* aFrame);
   static float GetFontXHeight(ComputedStyle*, nsPresContext*);
 
   /*
    * Report a localized error message to the error console.
    */
-  static nsresult ReportToConsole(dom::Document* doc,
-                                  const char* aWarning,
+  static nsresult ReportToConsole(dom::Document* doc, const char* aWarning,
                                   const char16_t** aParams,
                                   uint32_t aParamsLength);
 
   static Matrix GetCTM(dom::SVGElement* aElement, bool aScreenCTM);
 
   /**
    * Gets the tight bounds-space stroke bounds of the non-scaling-stroked rect
    * aRect.
--- a/dom/svg/SVGIntegerPair.h
+++ b/dom/svg/SVGIntegerPair.h
@@ -72,18 +72,18 @@ class SVGIntegerPair {
   struct DOMAnimatedInteger final : public mozilla::dom::SVGAnimatedInteger {
     DOMAnimatedInteger(SVGIntegerPair* aVal, PairIndex aIndex,
                        SVGElement* aSVGElement)
         : mozilla::dom::SVGAnimatedInteger(aSVGElement),
           mVal(aVal),
           mIndex(aIndex) {}
     virtual ~DOMAnimatedInteger();
 
-    SVGIntegerPair* mVal;    // kept alive because it belongs to content
-    PairIndex mIndex;        // are we the first or second integer
+    SVGIntegerPair* mVal;  // kept alive because it belongs to content
+    PairIndex mIndex;      // are we the first or second integer
 
     virtual int32_t BaseVal() override { return mVal->GetBaseValue(mIndex); }
     virtual void SetBaseVal(int32_t aValue) override {
       mVal->SetBaseValue(aValue, mIndex, mSVGElement);
     }
 
     // Script may have modified animation parameters or timeline -- DOM getters
     // need to flush any resample requests to reflect these modifications.
--- a/dom/svg/SVGNumberPair.h
+++ b/dom/svg/SVGNumberPair.h
@@ -73,18 +73,18 @@ class SVGNumberPair {
   struct DOMAnimatedNumber final : public mozilla::dom::SVGAnimatedNumber {
     DOMAnimatedNumber(SVGNumberPair* aVal, PairIndex aIndex,
                       SVGElement* aSVGElement)
         : mozilla::dom::SVGAnimatedNumber(aSVGElement),
           mVal(aVal),
           mIndex(aIndex) {}
     virtual ~DOMAnimatedNumber();
 
-    SVGNumberPair* mVal;    // kept alive because it belongs to content
-    PairIndex mIndex;       // are we the first or second number
+    SVGNumberPair* mVal;  // kept alive because it belongs to content
+    PairIndex mIndex;     // are we the first or second number
 
     virtual float BaseVal() override { return mVal->GetBaseValue(mIndex); }
     virtual void SetBaseVal(float aValue) override {
       MOZ_ASSERT(mozilla::IsFinite(aValue));
       mVal->SetBaseValue(aValue, mIndex, mSVGElement);
     }
 
     // Script may have modified animation parameters or timeline -- DOM getters
--- a/dom/svg/SVGTests.cpp
+++ b/dom/svg/SVGTests.cpp
@@ -41,17 +41,18 @@ already_AddRefed<DOMSVGStringList> SVGTe
                                          AsSVGElement(), true, LANGUAGE);
 }
 
 bool SVGTests::HasExtension(const nsAString& aExtension) const {
 #define SVG_SUPPORTED_EXTENSION(str) \
   if (aExtension.EqualsLiteral(str)) return true;
   SVG_SUPPORTED_EXTENSION("http://www.w3.org/1999/xhtml")
   nsNameSpaceManager* nameSpaceManager = nsNameSpaceManager::GetInstance();
-  if (AsSVGElement()->IsInChromeDocument() || !nameSpaceManager->mMathMLDisabled) {
+  if (AsSVGElement()->IsInChromeDocument() ||
+      !nameSpaceManager->mMathMLDisabled) {
     SVG_SUPPORTED_EXTENSION("http://www.w3.org/1998/Math/MathML")
   }
 #undef SVG_SUPPORTED_EXTENSION
 
   return false;
 }
 
 bool SVGTests::IsConditionalProcessingAttribute(
--- a/dom/workers/WorkerDebugger.cpp
+++ b/dom/workers/WorkerDebugger.cpp
@@ -455,18 +455,17 @@ RefPtr<PerformanceInfoPromise> WorkerDeb
       }
     }
   }
 
   // getting the worker URL
   RefPtr<nsIURI> scriptURI = mWorkerPrivate->GetResolvedScriptURI();
   if (NS_WARN_IF(!scriptURI)) {
     // This can happen at shutdown, let's stop here.
-    return PerformanceInfoPromise::CreateAndReject(NS_ERROR_FAILURE,
-                                                   __func__);
+    return PerformanceInfoPromise::CreateAndReject(NS_ERROR_FAILURE, __func__);
   }
   nsCString url = scriptURI->GetSpecOrDefault();
 
   // Workers only produce metrics for a single category -
   // DispatchCategory::Worker. We still return an array of CategoryDispatch so
   // the PerformanceInfo struct is common to all performance counters throughout
   // Firefox.
   FallibleTArray<CategoryDispatch> items;
--- a/dom/xbl/nsXBLService.cpp
+++ b/dom/xbl/nsXBLService.cpp
@@ -147,18 +147,18 @@ class nsXBLStreamListener final : public
 
  private:
   ~nsXBLStreamListener();
 
   nsCOMPtr<nsIStreamListener> mInner;
   AutoTArray<nsXBLBindingRequest*, 8> mBindingRequests;
 
   nsWeakPtr mBoundDocument;
-  nsCOMPtr<nsIXMLContentSink> mSink;       // Only set until OnStartRequest
-  nsCOMPtr<Document> mBindingDocument;     // Only set until OnStartRequest
+  nsCOMPtr<nsIXMLContentSink> mSink;    // Only set until OnStartRequest
+  nsCOMPtr<Document> mBindingDocument;  // Only set until OnStartRequest
 };
 
 /* Implementation file */
 NS_IMPL_ISUPPORTS(nsXBLStreamListener, nsIStreamListener, nsIRequestObserver,
                   nsIDOMEventListener)
 
 nsXBLStreamListener::nsXBLStreamListener(Document* aBoundDocument,
                                          nsIXMLContentSink* aSink,
--- a/editor/libeditor/EditorUtils.cpp
+++ b/editor/libeditor/EditorUtils.cpp
@@ -30,19 +30,17 @@ using namespace dom;
 
 DOMIterator::DOMIterator(nsINode& aNode MOZ_GUARD_OBJECT_NOTIFIER_PARAM_IN_IMPL)
     : mIter(&mPostOrderIter) {
   MOZ_GUARD_OBJECT_NOTIFIER_INIT;
   DebugOnly<nsresult> rv = mIter->Init(&aNode);
   MOZ_ASSERT(NS_SUCCEEDED(rv));
 }
 
-nsresult DOMIterator::Init(nsRange& aRange) {
-  return mIter->Init(&aRange);
-}
+nsresult DOMIterator::Init(nsRange& aRange) { return mIter->Init(&aRange); }
 
 DOMIterator::DOMIterator(MOZ_GUARD_OBJECT_NOTIFIER_ONLY_PARAM_IN_IMPL)
     : mIter(&mPostOrderIter) {
   MOZ_GUARD_OBJECT_NOTIFIER_INIT;
 }
 
 void DOMIterator::AppendList(
     const BoolDomIterFunctor& functor,
--- a/editor/spellchecker/TextServicesDocument.cpp
+++ b/editor/spellchecker/TextServicesDocument.cpp
@@ -1,41 +1,41 @@
 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #include "TextServicesDocument.h"
 
 #include "FilteredContentIterator.h"  // for FilteredContentIterator
-#include "mozilla/Assertions.h"   // for MOZ_ASSERT, etc
-#include "mozilla/EditorUtils.h"  // for AutoTransactionBatchExternal
+#include "mozilla/Assertions.h"       // for MOZ_ASSERT, etc
+#include "mozilla/EditorUtils.h"      // for AutoTransactionBatchExternal
 #include "mozilla/dom/Element.h"
 #include "mozilla/dom/Selection.h"
 #include "mozilla/mozalloc.h"    // for operator new, etc
 #include "mozilla/TextEditor.h"  // for TextEditor
 #include "nsAString.h"           // for nsAString::Length, etc
 #include "nsContentUtils.h"      // for nsContentUtils
 #include "nsComposeTxtSrvFilter.h"
-#include "nsDebug.h"                    // for NS_ENSURE_TRUE, etc
-#include "nsDependentSubstring.h"       // for Substring
-#include "nsError.h"                    // for NS_OK, NS_ERROR_FAILURE, etc
-#include "nsGenericHTMLElement.h"       // for nsGenericHTMLElement
-#include "nsIContent.h"                 // for nsIContent, etc
-#include "nsID.h"                       // for NS_GET_IID
-#include "nsIEditor.h"                  // for nsIEditor, etc
-#include "nsINode.h"                    // for nsINode
-#include "nsISelectionController.h"     // for nsISelectionController, etc
-#include "nsISupportsBase.h"            // for nsISupports
-#include "nsISupportsUtils.h"           // for NS_IF_ADDREF, NS_ADDREF, etc
-#include "mozilla/intl/WordBreaker.h"   // for WordRange, WordBreaker
-#include "nsRange.h"                    // for nsRange
-#include "nsString.h"                   // for nsString, nsAutoString
-#include "nscore.h"                     // for nsresult, NS_IMETHODIMP, etc
-#include "mozilla/UniquePtr.h"          // for UniquePtr
+#include "nsDebug.h"                   // for NS_ENSURE_TRUE, etc
+#include "nsDependentSubstring.h"      // for Substring
+#include "nsError.h"                   // for NS_OK, NS_ERROR_FAILURE, etc
+#include "nsGenericHTMLElement.h"      // for nsGenericHTMLElement
+#include "nsIContent.h"                // for nsIContent, etc
+#include "nsID.h"                      // for NS_GET_IID
+#include "nsIEditor.h"                 // for nsIEditor, etc
+#include "nsINode.h"                   // for nsINode
+#include "nsISelectionController.h"    // for nsISelectionController, etc
+#include "nsISupportsBase.h"           // for nsISupports
+#include "nsISupportsUtils.h"          // for NS_IF_ADDREF, NS_ADDREF, etc
+#include "mozilla/intl/WordBreaker.h"  // for WordRange, WordBreaker
+#include "nsRange.h"                   // for nsRange
+#include "nsString.h"                  // for nsString, nsAutoString
+#include "nscore.h"                    // for nsresult, NS_IMETHODIMP, etc
+#include "mozilla/UniquePtr.h"         // for UniquePtr
 
 namespace mozilla {
 
 using namespace dom;
 
 class OffsetEntry final {
  public:
   OffsetEntry(nsINode* aNode, int32_t aOffset, int32_t aLength)
--- a/editor/spellchecker/TextServicesDocument.h
+++ b/editor/spellchecker/TextServicesDocument.h
@@ -24,17 +24,17 @@ namespace mozilla {
 
 class FilteredContentIterator;
 class OffsetEntry;
 class TextEditor;
 
 namespace dom {
 class Document;
 class Element;
-};
+};  // namespace dom
 
 /**
  * The TextServicesDocument presents the document in as a bunch of flattened
  * text blocks. Each text block can be retrieved as an nsString.
  */
 class TextServicesDocument final : public nsIEditActionListener {
  private:
   enum class IteratorStatus : uint8_t {
--- a/gfx/gl/GLContextProviderEGL.cpp
+++ b/gfx/gl/GLContextProviderEGL.cpp
@@ -305,18 +305,17 @@ GLContextEGL::GLContextEGL(CreateContext
       mFallbackSurface(CreateFallbackSurface(config)),
       mContext(context) {
 #ifdef DEBUG
   printf_stderr("Initializing context %p surface %p on display %p\n", mContext,
                 mSurface, EGL_DISPLAY());
 #endif
 }
 
-void
-GLContextEGL::OnMarkDestroyed() {
+void GLContextEGL::OnMarkDestroyed() {
   if (mSurfaceOverride != EGL_NO_SURFACE) {
     SetEGLSurfaceOverride(EGL_NO_SURFACE);
   }
 }
 
 GLContextEGL::~GLContextEGL() {
   MarkDestroyed();
 
--- a/gfx/layers/FrameMetrics.cpp
+++ b/gfx/layers/FrameMetrics.cpp
@@ -12,17 +12,18 @@ namespace mozilla {
 namespace layers {
 
 const ScrollableLayerGuid::ViewID ScrollableLayerGuid::NULL_SCROLL_ID = 0;
 
 void FrameMetrics::RecalculateLayoutViewportOffset() {
   if (!mIsRootContent) {
     return;
   }
-  KeepLayoutViewportEnclosingVisualViewport(GetVisualViewport(), mLayoutViewport);
+  KeepLayoutViewportEnclosingVisualViewport(GetVisualViewport(),
+                                            mLayoutViewport);
 }
 
 /* static */ void FrameMetrics::KeepLayoutViewportEnclosingVisualViewport(
     const CSSRect& aVisualViewport, CSSRect& aLayoutViewport) {
   // If the visual viewport is contained within the layout viewport, we don't
   // need to make any adjustments, so we can exit early.
   //
   // Additionally, if the composition bounds changes (due to an orientation
--- a/gfx/layers/LayersTypes.h
+++ b/gfx/layers/LayersTypes.h
@@ -8,17 +8,17 @@
 #define GFX_LAYERSTYPES_H
 
 #include <stdint.h>  // for uint32_t
 
 #include "Units.h"
 #include "mozilla/DefineEnum.h"  // for MOZ_DEFINE_ENUM
 #include "mozilla/gfx/Point.h"   // for IntPoint
 #include "mozilla/Maybe.h"
-#include "mozilla/TimeStamp.h"   // for TimeStamp
+#include "mozilla/TimeStamp.h"  // for TimeStamp
 #include "mozilla/TypedEnumBits.h"
 #include "nsRegion.h"
 #include "nsStyleConsts.h"
 
 #include <stdio.h>            // FILE
 #include "mozilla/Logging.h"  // for PR_LOG
 
 #ifndef MOZ_LAYERS_HAVE_LOG
@@ -415,17 +415,17 @@ MOZ_DEFINE_ENUM_CLASS_WITH_BASE(ScrollDi
 MOZ_DEFINE_ENUM_CLASS_WITH_BASE(CompositionPayloadType, uint8_t, (
   eKeyPress,
   eAPZScroll,
   eAPZPinchZoom
 ));
 // clang-format on
 
 struct CompositionPayload {
-  bool operator ==(const CompositionPayload& aOther) const {
+  bool operator==(const CompositionPayload& aOther) const {
     return mType == aOther.mType && mTimeStamp == aOther.mTimeStamp;
   }
   /* The type of payload that is in this composition */
   CompositionPayloadType mType;
   /* When this payload was generated */
   TimeStamp mTimeStamp;
 };
 
--- a/gfx/layers/apz/util/APZCCallbackHelper.cpp
+++ b/gfx/layers/apz/util/APZCCallbackHelper.cpp
@@ -200,17 +200,18 @@ static ScreenMargin ScrollFrame(nsIConte
       }
     } else {
       // Correct the display port due to the difference between mScrollOffset
       // and the actual scroll offset.
       displayPortMargins = APZCCallbackHelper::AdjustDisplayPortForScrollDelta(
           aRequest, actualScrollOffset);
     }
   } else if (aRequest.IsRootContent() &&
-             aRequest.GetScrollOffset() != aRequest.GetLayoutViewport().TopLeft()) {
+             aRequest.GetScrollOffset() !=
+                 aRequest.GetLayoutViewport().TopLeft()) {
     // APZ uses the visual viewport's offset to calculate where to place the
     // display port, so the display port is misplaced when a pinch zoom occurs.
     //
     // We need to force a display port adjustment in the following paint to
     // account for a difference between mScrollOffset and the actual scroll
     // offset in repaints requested by
     // AsyncPanZoomController::NotifyLayersUpdated.
     displayPortMargins = APZCCallbackHelper::AdjustDisplayPortForScrollDelta(
--- a/gfx/layers/ipc/CrossProcessCompositorBridgeParent.h
+++ b/gfx/layers/ipc/CrossProcessCompositorBridgeParent.h
@@ -165,18 +165,19 @@ class CrossProcessCompositorBridgeParent
       const LayersId& aLayersId) override;
   bool DeallocPAPZCTreeManagerParent(PAPZCTreeManagerParent* aActor) override;
 
   PAPZParent* AllocPAPZParent(const LayersId& aLayersId) override;
   bool DeallocPAPZParent(PAPZParent* aActor) override;
 
   void UpdatePaintTime(LayerTransactionParent* aLayerTree,
                        const TimeDuration& aPaintTime) override;
-  void RegisterPayload(LayerTransactionParent* aLayerTree,
-                       const InfallibleTArray<CompositionPayload>& aPayload) override;
+  void RegisterPayload(
+      LayerTransactionParent* aLayerTree,
+      const InfallibleTArray<CompositionPayload>& aPayload) override;
 
   PWebRenderBridgeParent* AllocPWebRenderBridgeParent(
       const wr::PipelineId& aPipelineId,
       const LayoutDeviceIntSize& aSize) override;
   bool DeallocPWebRenderBridgeParent(PWebRenderBridgeParent* aActor) override;
 
   void ObserveLayersUpdate(LayersId aLayersId, LayersObserverEpoch aEpoch,
                            bool aActive) override;
--- a/gfx/layers/ipc/LayersMessageUtils.h
+++ b/gfx/layers/ipc/LayersMessageUtils.h
@@ -705,35 +705,35 @@ struct ParamTraits<mozilla::layers::Comp
 
 template <>
 struct ParamTraits<mozilla::layers::SimpleLayerAttributes>
     : public PlainOldDataSerializer<mozilla::layers::SimpleLayerAttributes> {};
 
 template <>
 struct ParamTraits<mozilla::layers::ScrollUpdateInfo>
     : public PlainOldDataSerializer<mozilla::layers::ScrollUpdateInfo> {};
- 
+
 template <>
 struct ParamTraits<mozilla::layers::CompositionPayloadType>
     : public ContiguousEnumSerializerInclusive<
           mozilla::layers::CompositionPayloadType,
           mozilla::layers::CompositionPayloadType::eKeyPress,
           mozilla::layers::kHighestCompositionPayloadType> {};
 
 template <>
-struct ParamTraits<mozilla::layers::CompositionPayload>
-{
+struct ParamTraits<mozilla::layers::CompositionPayload> {
   typedef mozilla::layers::CompositionPayload paramType;
 
   static void Write(Message* aMsg, const paramType& aParam) {
     WriteParam(aMsg, aParam.mType);
     WriteParam(aMsg, aParam.mTimeStamp);
   }
 
-  static bool Read(const Message* aMsg, PickleIterator* aIter, paramType* aResult) {
+  static bool Read(const Message* aMsg, PickleIterator* aIter,
+                   paramType* aResult) {
     return ReadParam(aMsg, aIter, &aResult->mType) &&
            ReadParam(aMsg, aIter, &aResult->mTimeStamp);
   }
 };
 
 } /* namespace IPC */
 
 #endif /* mozilla_layers_LayersMessageUtils */
--- a/gfx/layers/wr/ClipManager.cpp
+++ b/gfx/layers/wr/ClipManager.cpp
@@ -15,17 +15,19 @@
 #include "nsDisplayList.h"
 #include "nsStyleStructInlines.h"
 #include "UnitTransforms.h"
 
 #define CLIP_LOG(...)
 
 //#define CLIP_LOG(...) printf_stderr("CLIP: " __VA_ARGS__)
 
+// clang-format off
 //#define CLIP_LOG(...) if (XRE_IsContentProcess()) printf_stderr("CLIP: " __VA_ARGS__)
+// clang-format on
 
 namespace mozilla {
 namespace layers {
 
 ClipManager::ClipManager() : mManager(nullptr), mBuilder(nullptr) {}
 
 void ClipManager::BeginBuild(WebRenderLayerManager* aManager,
                              wr::DisplayListBuilder& aBuilder) {
--- a/gfx/layers/wr/RenderRootStateManager.cpp
+++ b/gfx/layers/wr/RenderRootStateManager.cpp
@@ -6,210 +6,165 @@
 
 #include "mozilla/layers/RenderRootStateManager.h"
 
 #include "mozilla/layers/WebRenderBridgeChild.h"
 
 namespace mozilla {
 namespace layers {
 
-RenderRootStateManager::RenderRootStateManager(WebRenderLayerManager* aLayerManager)
-  : mLayerManager(aLayerManager)
-  , mDestroyed(false)
-{
-}
+RenderRootStateManager::RenderRootStateManager(
+    WebRenderLayerManager* aLayerManager)
+    : mLayerManager(aLayerManager), mDestroyed(false) {}
 
-RenderRootStateManager::~RenderRootStateManager()
-{}
+RenderRootStateManager::~RenderRootStateManager() {}
 
-// RenderRootStateManager shares its ref count with the WebRenderLayerManager that
-// created it. You can think of the two classes as being one unit, except there
-// are multiple RenderRootStateManagers per WebRenderLayerManager. Since we need
-// to reference the WebRenderLayerManager and it needs to reference us, this
-// avoids us needing to involve the cycle collector.
-void
-RenderRootStateManager::AddRef()
-{
-  mLayerManager->AddRef();
-}
+// RenderRootStateManager shares its ref count with the WebRenderLayerManager
+// that created it. You can think of the two classes as being one unit, except
+// there are multiple RenderRootStateManagers per WebRenderLayerManager. Since
+// we need to reference the WebRenderLayerManager and it needs to reference us,
+// this avoids us needing to involve the cycle collector.
+void RenderRootStateManager::AddRef() { mLayerManager->AddRef(); }
 
-void
-RenderRootStateManager::Release()
-{
-  mLayerManager->Release();
-}
+void RenderRootStateManager::Release() { mLayerManager->Release(); }
 
-
-WebRenderBridgeChild*
-RenderRootStateManager::WrBridge() const
-{
+WebRenderBridgeChild* RenderRootStateManager::WrBridge() const {
   return mLayerManager->WrBridge();
 }
 
-WebRenderCommandBuilder&
-RenderRootStateManager::CommandBuilder()
-{
+WebRenderCommandBuilder& RenderRootStateManager::CommandBuilder() {
   return mLayerManager->CommandBuilder();
 }
 
 RenderRootStateManager::WebRenderUserDataRefTable*
-RenderRootStateManager::GetWebRenderUserDataTable()
-{
+RenderRootStateManager::GetWebRenderUserDataTable() {
   return mLayerManager->GetWebRenderUserDataTable();
 }
 
-wr::IpcResourceUpdateQueue&
-RenderRootStateManager::AsyncResourceUpdates()
-{
+wr::IpcResourceUpdateQueue& RenderRootStateManager::AsyncResourceUpdates() {
   MOZ_ASSERT(NS_IsMainThread());
 
   if (!mAsyncResourceUpdates) {
     mAsyncResourceUpdates.emplace(WrBridge());
 
     RefPtr<Runnable> task = NewRunnableMethod(
-      "RenderRootStateManager::FlushAsyncResourceUpdates",
-      this, &RenderRootStateManager::FlushAsyncResourceUpdates);
+        "RenderRootStateManager::FlushAsyncResourceUpdates", this,
+        &RenderRootStateManager::FlushAsyncResourceUpdates);
     NS_DispatchToMainThread(task.forget());
   }
 
   return mAsyncResourceUpdates.ref();
 }
 
-void
-RenderRootStateManager::Destroy()
-{
+void RenderRootStateManager::Destroy() {
   ClearAsyncAnimations();
 
   if (WrBridge()) {
     // Just clear ImageKeys, they are deleted during WebRenderAPI destruction.
     DiscardLocalImages();
     // CompositorAnimations are cleared by WebRenderBridgeParent.
     mDiscardedCompositorAnimationsIds.Clear();
   }
 
   mActiveCompositorAnimationIds.clear();
 
   mDestroyed = true;
 }
 
-void
-RenderRootStateManager::FlushAsyncResourceUpdates()
-{
+void RenderRootStateManager::FlushAsyncResourceUpdates() {
   MOZ_ASSERT(NS_IsMainThread());
 
   if (!mAsyncResourceUpdates) {
     return;
   }
 
   if (!IsDestroyed() && WrBridge()) {
     WrBridge()->UpdateResources(mAsyncResourceUpdates.ref());
   }
 
   mAsyncResourceUpdates.reset();
 }
 
-void
-RenderRootStateManager::AddImageKeyForDiscard(wr::ImageKey key)
-{
+void RenderRootStateManager::AddImageKeyForDiscard(wr::ImageKey key) {
   mImageKeysToDelete.AppendElement(key);
 }
 
-void
-RenderRootStateManager::AddBlobImageKeyForDiscard(wr::BlobImageKey key)
-{
+void RenderRootStateManager::AddBlobImageKeyForDiscard(wr::BlobImageKey key) {
   mBlobImageKeysToDelete.AppendElement(key);
 }
 
-void
-RenderRootStateManager::DiscardImagesInTransaction(wr::IpcResourceUpdateQueue& aResources)
-{
+void RenderRootStateManager::DiscardImagesInTransaction(
+    wr::IpcResourceUpdateQueue& aResources) {
   for (const auto& key : mImageKeysToDelete) {
     aResources.DeleteImage(key);
   }
   for (const auto& key : mBlobImageKeysToDelete) {
     aResources.DeleteBlobImage(key);
   }
   mImageKeysToDelete.Clear();
   mBlobImageKeysToDelete.Clear();
 }
 
-void
-RenderRootStateManager::DiscardLocalImages()
-{
+void RenderRootStateManager::DiscardLocalImages() {
   // Removes images but doesn't tell the parent side about them
   // This is useful in empty / failed transactions where we created
   // image keys but didn't tell the parent about them yet.
   mImageKeysToDelete.Clear();
   mBlobImageKeysToDelete.Clear();
 }
 
-void
-RenderRootStateManager::ClearCachedResources()
-{
+void RenderRootStateManager::ClearCachedResources() {
   mActiveCompositorAnimationIds.clear();
   mDiscardedCompositorAnimationsIds.Clear();
 }
 
-void
-RenderRootStateManager::AddActiveCompositorAnimationId(uint64_t aId)
-{
+void RenderRootStateManager::AddActiveCompositorAnimationId(uint64_t aId) {
   // In layers-free mode we track the active compositor animation ids on the
   // client side so that we don't try to discard the same animation id multiple
   // times. We could just ignore the multiple-discard on the parent side, but
   // checking on the content side reduces IPC traffic.
   mActiveCompositorAnimationIds.insert(aId);
 }
 
-void
-RenderRootStateManager::AddCompositorAnimationsIdForDiscard(uint64_t aId)
-{
+void RenderRootStateManager::AddCompositorAnimationsIdForDiscard(uint64_t aId) {
   if (mActiveCompositorAnimationIds.erase(aId)) {
-    // For layers-free ensure we don't try to discard an animation id that wasn't
-    // active. We also remove it from mActiveCompositorAnimationIds so we don't
-    // discard it again unless it gets re-activated.
+    // For layers-free ensure we don't try to discard an animation id that
+    // wasn't active. We also remove it from mActiveCompositorAnimationIds so we
+    // don't discard it again unless it gets re-activated.
     mDiscardedCompositorAnimationsIds.AppendElement(aId);
   }
 }
 
-void
-RenderRootStateManager::DiscardCompositorAnimations()
-{
-  if (WrBridge()->IPCOpen() &&
-      !mDiscardedCompositorAnimationsIds.IsEmpty()) {
-    WrBridge()->
-      SendDeleteCompositorAnimations(mDiscardedCompositorAnimationsIds);
+void RenderRootStateManager::DiscardCompositorAnimations() {
+  if (WrBridge()->IPCOpen() && !mDiscardedCompositorAnimationsIds.IsEmpty()) {
+    WrBridge()->SendDeleteCompositorAnimations(
+        mDiscardedCompositorAnimationsIds);
   }
   mDiscardedCompositorAnimationsIds.Clear();
 }
 
-void
-RenderRootStateManager::RegisterAsyncAnimation(const wr::ImageKey& aKey,
-                                              SharedSurfacesAnimation* aAnimation)
-{
+void RenderRootStateManager::RegisterAsyncAnimation(
+    const wr::ImageKey& aKey, SharedSurfacesAnimation* aAnimation) {
   mAsyncAnimations.insert(std::make_pair(wr::AsUint64(aKey), aAnimation));
 }
 
-void
-RenderRootStateManager::DeregisterAsyncAnimation(const wr::ImageKey& aKey)
-{
+void RenderRootStateManager::DeregisterAsyncAnimation(
+    const wr::ImageKey& aKey) {
   mAsyncAnimations.erase(wr::AsUint64(aKey));
 }
 
-void
-RenderRootStateManager::ClearAsyncAnimations()
-{
+void RenderRootStateManager::ClearAsyncAnimations() {
   for (const auto& i : mAsyncAnimations) {
     i.second->Invalidate(this);
   }
   mAsyncAnimations.clear();
 }
 
-void
-RenderRootStateManager::WrReleasedImages(const nsTArray<wr::ExternalImageKeyPair>& aPairs)
-{
+void RenderRootStateManager::WrReleasedImages(
+    const nsTArray<wr::ExternalImageKeyPair>& aPairs) {
   // A SharedSurfaceAnimation object's lifetime is tied to its owning
   // ImageContainer. When the ImageContainer is released,
   // SharedSurfaceAnimation::Destroy is called which should ensure it is removed
   // from the layer manager. Whenever the namespace for the
   // WebRenderLayerManager itself is invalidated (e.g. we changed windows, or
   // were destroyed ourselves), we callback into the SharedSurfaceAnimation
   // object to remove its image key for us and any bound surfaces. If, for any
   // reason, we somehow missed an WrReleasedImages call before the animation
@@ -218,59 +173,46 @@ RenderRootStateManager::WrReleasedImages
   for (const auto& pair : aPairs) {
     auto i = mAsyncAnimations.find(wr::AsUint64(pair.key));
     if (i != mAsyncAnimations.end()) {
       i->second->ReleasePreviousFrame(this, pair.id);
     }
   }
 }
 
-void
-RenderRootStateManager::AddWebRenderParentCommand(const WebRenderParentCommand& aCmd)
-{
+void RenderRootStateManager::AddWebRenderParentCommand(
+    const WebRenderParentCommand& aCmd) {
   WrBridge()->AddWebRenderParentCommand(aCmd);
 }
-void
-RenderRootStateManager::UpdateResources(wr::IpcResourceUpdateQueue& aResources)
-{
+void RenderRootStateManager::UpdateResources(
+    wr::IpcResourceUpdateQueue& aResources) {
   WrBridge()->UpdateResources(aResources);
 }
-void
-RenderRootStateManager::AddPipelineIdForAsyncCompositable(const wr::PipelineId& aPipelineId,
-                                                   const CompositableHandle& aHandle)
-{
+void RenderRootStateManager::AddPipelineIdForAsyncCompositable(
+    const wr::PipelineId& aPipelineId, const CompositableHandle& aHandle) {
   WrBridge()->AddPipelineIdForAsyncCompositable(aPipelineId, aHandle);
 }
-void
-RenderRootStateManager::AddPipelineIdForCompositable(const wr::PipelineId& aPipelineId,
-                                              const CompositableHandle& aHandle)
-{
+void RenderRootStateManager::AddPipelineIdForCompositable(
+    const wr::PipelineId& aPipelineId, const CompositableHandle& aHandle) {
   WrBridge()->AddPipelineIdForCompositable(aPipelineId, aHandle);
 }
-void
-RenderRootStateManager::RemovePipelineIdForCompositable(const wr::PipelineId& aPipelineId)
-{
+void RenderRootStateManager::RemovePipelineIdForCompositable(
+    const wr::PipelineId& aPipelineId) {
   WrBridge()->RemovePipelineIdForCompositable(aPipelineId);
 }
-  /// Release TextureClient that is bounded to ImageKey.
-  /// It is used for recycling TextureClient.
-void
-RenderRootStateManager::ReleaseTextureOfImage(const wr::ImageKey& aKey)
-{
+/// Release TextureClient that is bounded to ImageKey.
+/// It is used for recycling TextureClient.
+void RenderRootStateManager::ReleaseTextureOfImage(const wr::ImageKey& aKey) {
   WrBridge()->ReleaseTextureOfImage(aKey);
 }
 
-wr::FontInstanceKey
-RenderRootStateManager::GetFontKeyForScaledFont(gfx::ScaledFont* aScaledFont,
-                                              wr::IpcResourceUpdateQueue* aResources)
-{
+wr::FontInstanceKey RenderRootStateManager::GetFontKeyForScaledFont(
+    gfx::ScaledFont* aScaledFont, wr::IpcResourceUpdateQueue* aResources) {
   return WrBridge()->GetFontKeyForScaledFont(aScaledFont, aResources);
 }
 
-wr::FontKey
-RenderRootStateManager::GetFontKeyForUnscaledFont(gfx::UnscaledFont* aUnscaledFont,
-                                                wr::IpcResourceUpdateQueue* aResources)
-{
+wr::FontKey RenderRootStateManager::GetFontKeyForUnscaledFont(
+    gfx::UnscaledFont* aUnscaledFont, wr::IpcResourceUpdateQueue* aResources) {
   return WrBridge()->GetFontKeyForUnscaledFont(aUnscaledFont, aResources);
 }
 
-} // namespace layers
-} // namespace mozilla
\ No newline at end of file
+}  // namespace layers
+}  // namespace mozilla
\ No newline at end of file
--- a/gfx/layers/wr/RenderRootStateManager.h
+++ b/gfx/layers/wr/RenderRootStateManager.h
@@ -11,88 +11,90 @@
 
 #include "mozilla/layers/IpcResourceUpdateQueue.h"
 #include "mozilla/layers/WebRenderCommandBuilder.h"
 
 namespace mozilla {
 
 namespace layers {
 
-class RenderRootStateManager
-{
-  typedef nsTHashtable<nsRefPtrHashKey<WebRenderUserData>> WebRenderUserDataRefTable;
+class RenderRootStateManager {
+  typedef nsTHashtable<nsRefPtrHashKey<WebRenderUserData>>
+      WebRenderUserDataRefTable;
 
-public:
+ public:
   void AddRef();
   void Release();
 
   explicit RenderRootStateManager(WebRenderLayerManager* aLayerManager);
 
   void Destroy();
   bool IsDestroyed() { return mDestroyed; }
   wr::IpcResourceUpdateQueue& AsyncResourceUpdates();
   WebRenderBridgeChild* WrBridge() const;
   WebRenderCommandBuilder& CommandBuilder();
   WebRenderUserDataRefTable* GetWebRenderUserDataTable();
-  WebRenderLayerManager* LayerManager()
-  {
-    return mLayerManager;
-  }
+  WebRenderLayerManager* LayerManager() { return mLayerManager; }
 
   void AddImageKeyForDiscard(wr::ImageKey key);
   void AddBlobImageKeyForDiscard(wr::BlobImageKey key);
   void DiscardImagesInTransaction(wr::IpcResourceUpdateQueue& aResources);
   void DiscardLocalImages();
 
   void ClearCachedResources();
 
   // Methods to manage the compositor animation ids. Active animations are still
   // going, and when they end we discard them and remove them from the active
   // list.
   void AddActiveCompositorAnimationId(uint64_t aId);
   void AddCompositorAnimationsIdForDiscard(uint64_t aId);
   void DiscardCompositorAnimations();
 
-  void RegisterAsyncAnimation(const wr::ImageKey& aKey, SharedSurfacesAnimation* aAnimation);
+  void RegisterAsyncAnimation(const wr::ImageKey& aKey,
+                              SharedSurfacesAnimation* aAnimation);
   void DeregisterAsyncAnimation(const wr::ImageKey& aKey);
   void ClearAsyncAnimations();
   void WrReleasedImages(const nsTArray<wr::ExternalImageKeyPair>& aPairs);
 
   void AddWebRenderParentCommand(const WebRenderParentCommand& aCmd);
   void UpdateResources(wr::IpcResourceUpdateQueue& aResources);
   void AddPipelineIdForAsyncCompositable(const wr::PipelineId& aPipelineId,
                                          const CompositableHandle& aHandlee);
   void AddPipelineIdForCompositable(const wr::PipelineId& aPipelineId,
                                     const CompositableHandle& aHandlee);
   void RemovePipelineIdForCompositable(const wr::PipelineId& aPipelineId);
   /// Release TextureClient that is bounded to ImageKey.
   /// It is used for recycling TextureClient.
   void ReleaseTextureOfImage(const wr::ImageKey& aKey);
-  wr::FontInstanceKey GetFontKeyForScaledFont(gfx::ScaledFont* aScaledFont,
-                                              wr::IpcResourceUpdateQueue* aResources = nullptr);
-  wr::FontKey GetFontKeyForUnscaledFont(gfx::UnscaledFont* aUnscaledFont,
-                                        wr::IpcResourceUpdateQueue* aResources = nullptr);
+  wr::FontInstanceKey GetFontKeyForScaledFont(
+      gfx::ScaledFont* aScaledFont,
+      wr::IpcResourceUpdateQueue* aResources = nullptr);
+  wr::FontKey GetFontKeyForUnscaledFont(
+      gfx::UnscaledFont* aUnscaledFont,
+      wr::IpcResourceUpdateQueue* aResources = nullptr);
 
   void FlushAsyncResourceUpdates();
-private:
+
+ private:
   ~RenderRootStateManager();
   WebRenderLayerManager* mLayerManager;
   Maybe<wr::IpcResourceUpdateQueue> mAsyncResourceUpdates;
   nsTArray<wr::ImageKey> mImageKeysToDelete;
   nsTArray<wr::BlobImageKey> mBlobImageKeysToDelete;
-  std::unordered_map<uint64_t, RefPtr<SharedSurfacesAnimation>> mAsyncAnimations;
+  std::unordered_map<uint64_t, RefPtr<SharedSurfacesAnimation>>
+      mAsyncAnimations;
 
   // Set of compositor animation ids for which there are active animations (as
   // of the last transaction) on the compositor side.
   std::unordered_set<uint64_t> mActiveCompositorAnimationIds;
   // Compositor animation ids for animations that are done now and that we want
   // the compositor to discard information for.
   nsTArray<uint64_t> mDiscardedCompositorAnimationsIds;
 
   bool mDestroyed;
 
   friend class WebRenderLayerManager;
 };
 
-} // namespace layers
-} // namespace mozilla
+}  // namespace layers
+}  // namespace mozilla
 
 #endif /* GFX_RENDERROOTSTATEMANAGER_H */
--- a/gfx/layers/wr/WebRenderCommandBuilder.cpp
+++ b/gfx/layers/wr/WebRenderCommandBuilder.cpp
@@ -1062,18 +1062,19 @@ static bool IsItemProbablyActive(nsDispl
     case DisplayItemType::TYPE_TRANSFORM: {
       nsDisplayTransform* transformItem =
           static_cast<nsDisplayTransform*>(aItem);
       const Matrix4x4Flagged& t = transformItem->GetTransform();
       Matrix t2d;
       bool is2D = t.Is2D(&t2d);
       GP("active: %d\n", transformItem->MayBeAnimated(aDisplayListBuilder));
       return transformItem->MayBeAnimated(aDisplayListBuilder, false) ||
-             !is2D || HasActiveChildren(*transformItem->GetChildren(),
-                                        aDisplayListBuilder);
+             !is2D ||
+             HasActiveChildren(*transformItem->GetChildren(),
+                               aDisplayListBuilder);
     }
     case DisplayItemType::TYPE_OPACITY: {
       nsDisplayOpacity* opacityItem = static_cast<nsDisplayOpacity*>(aItem);
       bool active = opacityItem->NeedsActiveLayer(aDisplayListBuilder,
                                                   opacityItem->Frame(), false);
       GP("active: %d\n", active);
       return active || HasActiveChildren(*opacityItem->GetChildren(),
                                          aDisplayListBuilder);
@@ -2298,18 +2299,17 @@ Maybe<wr::WrImageMask> WebRenderCommandB
     bool paintFinished =
         aMaskItem->PaintMask(aDisplayListBuilder, context, &maskPainted);
     if (!maskPainted) {
       return Nothing();
     }
 
     recorder->FlushItem(IntRect(0, 0, size.width, size.height));
     TakeExternalSurfaces(recorder, maskData->mExternalSurfaces,
-                         mManager->GetRenderRootStateManager(),
-                         aResources);
+                         mManager->GetRenderRootStateManager(), aResources);
     recorder->Finish();
 
     Range<uint8_t> bytes((uint8_t*)recorder->mOutputStream.mData,
                          recorder->mOutputStream.mLength);
     wr::BlobImageKey key =
         wr::BlobImageKey{mManager->WrBridge()->GetNextImageKey()};
     wr::ImageDescriptor descriptor(size, 0, dt->GetFormat(),
                                    wr::OpacityType::HasAlphaChannel);
--- a/gfx/thebes/gfxFont.h
+++ b/gfx/thebes/gfxFont.h
@@ -1935,19 +1935,17 @@ class gfxFont {
 
   // subclasses may provide (possibly hinted) glyph widths (in font units);
   // if they do not override this, harfbuzz will use unhinted widths
   // derived from the font tables
   virtual bool ProvidesGlyphWidths() const { return false; }
 
   // The return value is interpreted as a horizontal advance in 16.16 fixed
   // point format.
-  virtual int32_t GetGlyphWidth(uint16_t aGID) {
-    return -1;
-  }
+  virtual int32_t GetGlyphWidth(uint16_t aGID) { return -1; }
 
   bool IsSpaceGlyphInvisible(DrawTarget* aRefDrawTarget,
                              const gfxTextRun* aTextRun);
 
   void AddGlyphChangeObserver(GlyphChangeObserver* aObserver);
   void RemoveGlyphChangeObserver(GlyphChangeObserver* aObserver);
 
   // whether font contains substitution lookups containing spaces
--- a/image/decoders/nsWebPDecoder.cpp
+++ b/image/decoders/nsWebPDecoder.cpp
@@ -303,20 +303,20 @@ void nsWebPDecoder::ApplyColorProfile(co
         ("[this=%p] nsWebPDecoder::ApplyColorProfile -- bad color profile\n",
          this));
     return;
   }
 
   uint32_t profileSpace = qcms_profile_get_color_space(mInProfile);
   if (profileSpace == icSigGrayData) {
     // WebP doesn't produce grayscale data, this must be corrupt.
-    MOZ_LOG(
-        sWebPLog, LogLevel::Error,
-        ("[this=%p] nsWebPDecoder::ApplyColorProfile -- ignoring grayscale color profile\n",
-         this));
+    MOZ_LOG(sWebPLog, LogLevel::Error,
+            ("[this=%p] nsWebPDecoder::ApplyColorProfile -- ignoring grayscale "
+             "color profile\n",
+             this));
     return;
   }
 
   // Calculate rendering intent.
   int intent = gfxPlatform::GetRenderingIntent();
   if (intent == -1) {
     intent = qcms_profile_get_rendering_intent(mInProfile);
   }
--- a/ipc/chromium/src/base/at_exit.cc
+++ b/ipc/chromium/src/base/at_exit.cc
@@ -10,25 +10,23 @@
 namespace base {
 
 // Keep a stack of registered AtExitManagers.  We always operate on the most
 // recent, and we should never have more than one outside of testing, when we
 // use the shadow version of the constructor.  We don't protect this for
 // thread-safe access, since it will only be modified in testing.
 static AtExitManager* g_top_manager = NULL;
 
-AtExitManager::AtExitManager() : lock_("AtExitManager"),
-                                 next_manager_(NULL) {
+AtExitManager::AtExitManager() : lock_("AtExitManager"), next_manager_(NULL) {
   DCHECK(!g_top_manager);
   g_top_manager = this;
 }
 
-AtExitManager::AtExitManager(bool shadow) : lock_("AtExitManager"),
-                                            next_manager_(g_top_manager)
- {
+AtExitManager::AtExitManager(bool shadow)
+    : lock_("AtExitManager"), next_manager_(g_top_manager) {
   DCHECK(shadow || !g_top_manager);
   g_top_manager = this;
 }
 
 AtExitManager::~AtExitManager() {
   if (!g_top_manager) {
     NOTREACHED() << "Tried to ~AtExitManager without an AtExitManager";
     return;
--- a/ipc/chromium/src/base/time_win.cc
+++ b/ipc/chromium/src/base/time_win.cc
@@ -238,25 +238,23 @@ class NowSingleton {
     // pass it into the NowSingleton constructor.
     static mozilla::StaticMutex mutex;
     static NowSingleton now(mutex);
     return now;
   }
 
  private:
   explicit NowSingleton(mozilla::StaticMutex& aMutex)
-    : lock_(aMutex)
-    , rollover_(TimeDelta::FromMilliseconds(0))
-    , last_seen_(0)
-  {
-  }
+      : lock_(aMutex),
+        rollover_(TimeDelta::FromMilliseconds(0)),
+        last_seen_(0) {}
   ~NowSingleton() = default;
 
   mozilla::StaticMutex& lock_;  // To protected last_seen_ and rollover_.
-  TimeDelta rollover_;  // Accumulation of time lost due to rollover.
+  TimeDelta rollover_;          // Accumulation of time lost due to rollover.
   DWORD last_seen_;  // The last timeGetTime value we saw, to detect rollover.
 
   DISALLOW_COPY_AND_ASSIGN(NowSingleton);
 };
 
 }  // namespace
 
 // static
--- a/ipc/chromium/src/chrome/common/ipc_channel_posix.cc
+++ b/ipc/chromium/src/chrome/common/ipc_channel_posix.cc
@@ -145,19 +145,17 @@ class PipeMap {
     // a member variable.  So we have to have this separate variable and pass
     // it into the PipeMap constructor.
     static mozilla::StaticMutex mutex;
     static PipeMap map(mutex);
     return map;
   }
 
  private:
-  explicit PipeMap(mozilla::StaticMutex& aMutex)
-    : lock_(aMutex)
-  {}
+  explicit PipeMap(mozilla::StaticMutex& aMutex) : lock_(aMutex) {}
   ~PipeMap() = default;
 
   mozilla::StaticMutex& lock_;
   typedef std::map<std::string, int> ChannelToFDMap;
   ChannelToFDMap map_;
 };
 
 // This is the file descriptor number that a client process expects to find its
--- a/ipc/glue/CrashReporterHost.cpp
+++ b/ipc/glue/CrashReporterHost.cpp
@@ -15,40 +15,46 @@
 #include "nsICrashService.h"
 #include "nsXULAppAPI.h"
 
 // Consistency checking for nsICrashService constants.  We depend on the
 // equivalence between nsICrashService values and GeckoProcessType values
 // in the code below.  Making them equal also ensures that if new process
 // types are added, people will know they may need to add crash reporting
 // support in various places because compilation errors will be triggered here.
-static_assert(nsICrashService::PROCESS_TYPE_MAIN == (int)GeckoProcessType_Default,
+static_assert(nsICrashService::PROCESS_TYPE_MAIN ==
+                  (int)GeckoProcessType_Default,
               "GeckoProcessType enum is out of sync with nsICrashService!");
-static_assert(nsICrashService::PROCESS_TYPE_PLUGIN == (int)GeckoProcessType_Plugin,
+static_assert(nsICrashService::PROCESS_TYPE_PLUGIN ==
+                  (int)GeckoProcessType_Plugin,
               "GeckoProcessType enum is out of sync with nsICrashService!");
-static_assert(nsICrashService::PROCESS_TYPE_CONTENT == (int)GeckoProcessType_Content,
+static_assert(nsICrashService::PROCESS_TYPE_CONTENT ==
+                  (int)GeckoProcessType_Content,
               "GeckoProcessType enum is out of sync with nsICrashService!");
-static_assert(nsICrashService::PROCESS_TYPE_IPDLUNITTEST == (int)GeckoProcessType_IPDLUnitTest,
+static_assert(nsICrashService::PROCESS_TYPE_IPDLUNITTEST ==
+                  (int)GeckoProcessType_IPDLUnitTest,
               "GeckoProcessType enum is out of sync with nsICrashService!");
-static_assert(nsICrashService::PROCESS_TYPE_GMPLUGIN == (int)GeckoProcessType_GMPlugin,
+static_assert(nsICrashService::PROCESS_TYPE_GMPLUGIN ==
+                  (int)GeckoProcessType_GMPlugin,
               "GeckoProcessType enum is out of sync with nsICrashService!");
 static_assert(nsICrashService::PROCESS_TYPE_GPU == (int)GeckoProcessType_GPU,
               "GeckoProcessType enum is out of sync with nsICrashService!");
 static_assert(nsICrashService::PROCESS_TYPE_VR == (int)GeckoProcessType_VR,
               "GeckoProcessType enum is out of sync with nsICrashService!");
 static_assert(nsICrashService::PROCESS_TYPE_RDD == (int)GeckoProcessType_RDD,
               "GeckoProcessType enum is out of sync with nsICrashService!");
-static_assert(nsICrashService::PROCESS_TYPE_SOCKET == (int)GeckoProcessType_Socket,
+static_assert(nsICrashService::PROCESS_TYPE_SOCKET ==
+                  (int)GeckoProcessType_Socket,
               "GeckoProcessType enum is out of sync with nsICrashService!");
 // Add new static asserts here if you add more process types.
 // Update this static assert as well.
-static_assert(nsICrashService::PROCESS_TYPE_SOCKET + 1 == (int)GeckoProcessType_End,
+static_assert(nsICrashService::PROCESS_TYPE_SOCKET + 1 ==
+                  (int)GeckoProcessType_End,
               "GeckoProcessType enum is out of sync with nsICrashService!");
 
-
 namespace mozilla {
 namespace ipc {
 
 CrashReporterHost::CrashReporterHost(GeckoProcessType aProcessType,
                                      const Shmem& aShmem,
                                      CrashReporter::ThreadId aThreadId)
     : mProcessType(aProcessType),
       mShmem(aShmem),
@@ -111,24 +117,24 @@ bool CrashReporterHost::FinalizeCrashRep
   // as a historical artifact.
   if (mProcessType == GeckoProcessType_GMPlugin) {
     type.AssignLiteral("plugin");
   } else {
     // This check will pick up some cases that will never happen (e.g. IPDL
     // unit tests), but that's OK.
     switch (mProcessType) {
 #define GECKO_PROCESS_TYPE(enum_name, string_name, xre_name) \
-      case GeckoProcessType_##enum_name: \
-        type.AssignLiteral(string_name); \
-        break;
+  case GeckoProcessType_##enum_name:                         \
+    type.AssignLiteral(string_name);                         \
+    break;
 #include "mozilla/GeckoProcessTypes.h"
 #undef GECKO_PROCESS_TYPE
-    default:
-      NS_ERROR("unknown process type");
-      break;
+      default:
+        NS_ERROR("unknown process type");
+        break;
     }
   }
   annotations[CrashReporter::Annotation::ProcessType] = type;
 
   char startTime[32];
   SprintfLiteral(startTime, "%lld", static_cast<long long>(mStartTime));
   annotations[CrashReporter::Annotation::StartupTime] =
       nsDependentCString(startTime);
@@ -183,23 +189,23 @@ bool CrashReporterHost::FinalizeCrashRep
   }
 
   if (aProcessType == GeckoProcessType_Plugin &&
       aCrashType == nsICrashService::CRASH_TYPE_HANG) {
     telemetryKey.AssignLiteral("pluginhang");
   } else {
     switch (aProcessType) {
 #define GECKO_PROCESS_TYPE(enum_name, string_name, xre_name) \
-      case GeckoProcessType_##enum_name:                     \
-        telemetryKey.AssignLiteral(string_name);             \
-        break;
+  case GeckoProcessType_##enum_name:                         \
+    telemetryKey.AssignLiteral(string_name);                 \
+    break;
 #include "mozilla/GeckoProcessTypes.h"
 #undef GECKO_PROCESS_TYPE
-      // We can't really hit this, thanks to the above switch, but having it here
-      // will placate the compiler.
+      // We can't really hit this, thanks to the above switch, but having it
+      // here will placate the compiler.
       default:
         NS_ERROR("unknown process type");
         return;
     }
   }
 
   RefPtr<Promise> promise;
   crashService->AddCrash(processType, aCrashType, aChildDumpID,
--- a/js/src/builtin/ModuleObject.cpp
+++ b/js/src/builtin/ModuleObject.cpp
@@ -376,17 +376,18 @@ bool IndirectBindingMap::lookup(jsid nam
   Rooted<UniquePtr<IndirectBindingMap>> rootedBindings(cx, std::move(bindings));
   RootedObject object(
       cx, NewProxyObject(cx, &proxyHandler, priv, nullptr, options));
   if (!object) {
     return nullptr;
   }
 
   SetProxyReservedSlot(object, ExportsSlot, ObjectValue(*exports));
-  SetProxyReservedSlot(object, BindingsSlot, PrivateValue(rootedBindings.release()));
+  SetProxyReservedSlot(object, BindingsSlot,
+                       PrivateValue(rootedBindings.release()));
 
   return &object->as<ModuleNamespaceObject>();
 }
 
 ModuleObject& ModuleNamespaceObject::module() {
   return GetProxyPrivate(this).toObject().as<ModuleObject>();
 }
 
--- a/js/src/frontend/NameFunctions.cpp
+++ b/js/src/frontend/NameFunctions.cpp
@@ -980,18 +980,18 @@ class NameResolver {
 
     // It would be nice to common up the repeated |parents[initialParents]|
     // in a single variable, but the #if condition required to prevent an
     // unused-variable warning across three separate conditionally-expanded
     // macros would be super-ugly.  :-(
     MOZ_ASSERT(parents[initialParents] == cur,
                "pushed child shouldn't change underneath us");
 
-    AlwaysPoison(&parents[initialParents], 0xFF, sizeof(parents[initialParents]),
-                 MemCheckKind::MakeUndefined);
+    AlwaysPoison(&parents[initialParents], 0xFF,
+                 sizeof(parents[initialParents]), MemCheckKind::MakeUndefined);
 
     return true;
   }
 };
 
 } /* anonymous namespace */
 
 bool frontend::NameFunctions(JSContext* cx, ParseNode* pn) {
--- a/js/src/gc/GC.cpp
+++ b/js/src/gc/GC.cpp
@@ -587,17 +587,17 @@ inline size_t Arena::finalize(FreeOp* fo
                                 thing - thingSize, this);
         newListTail = newListTail->nextSpanUnchecked(this);
       }
       firstThingOrSuccessorOfLastMarkedThing = thing + thingSize;
       nmarked++;
     } else {
       t->finalize(fop);
       AlwaysPoison(t, JS_SWEPT_TENURED_PATTERN, thingSize,
-             MemCheckKind::MakeUndefined);
+                   MemCheckKind::MakeUndefined);
       gcTracer.traceTenuredFinalize(t);
     }
   }
 
   if (nmarked == 0) {
     // Do nothing. The caller will update the arena appropriately.
     MOZ_ASSERT(newListTail == &newListHead);
     DebugOnlyPoison(data, JS_SWEPT_TENURED_PATTERN, sizeof(data),
@@ -3582,18 +3582,18 @@ void GCRuntime::sweepFromBackgroundThrea
     ZoneList zones;
     zones.transferFrom(backgroundSweepZones.ref());
     LifoAlloc freeLifoAlloc(JSContext::TEMP_LIFO_ALLOC_PRIMARY_CHUNK_SIZE);
     freeLifoAlloc.transferFrom(&lifoBlocksToFree.ref());
 
     AutoUnlockHelperThreadState unlock(lock);
     sweepBackgroundThings(zones, freeLifoAlloc);
 
-    // The main thread may call queueZonesAndStartBackgroundSweep() while this is
-    // running so we must check there is no more work after releasing the
+    // The main thread may call queueZonesAndStartBackgroundSweep() while this
+    // is running so we must check there is no more work after releasing the
     // lock.
   } while (!backgroundSweepZones.ref().isEmpty());
 }
 
 void GCRuntime::waitBackgroundSweepEnd() {
   sweepTask.join();
 
   // TODO: Improve assertion to work in incremental GC?
@@ -3637,18 +3637,17 @@ void GCRuntime::startBackgroundFree() {
     AutoLockHelperThreadState lock;
     freeTask.startOrRunIfIdle(lock);
   } else {
     freeTask.joinAndRunFromMainThread(rt);
   }
 }
 
 void BackgroundFreeTask::run() {
-  AutoTraceLog logFreeing(TraceLoggerForCurrentThread(),
-                           TraceLogger_GCFree);
+  AutoTraceLog logFreeing(TraceLoggerForCurrentThread(), TraceLogger_GCFree);
 
   AutoLockHelperThreadState lock;
 
   runtime()->gc.freeFromBackgroundThread(lock);
 
   // Signal to the main thread that we're about to finish, because we release
   // the lock again before GCParallelTask's state is changed to finished.
   setFinishing(lock);
@@ -3661,26 +3660,25 @@ void GCRuntime::freeFromBackgroundThread
 
     Nursery::BufferSet buffers;
     mozilla::Swap(buffers, buffersToFreeAfterMinorGC.ref());
 
     AutoUnlockHelperThreadState unlock(lock);
 
     lifoBlocks.freeAll();
 
-    for (Nursery::BufferSet::Range r = buffers.all(); !r.empty(); r.popFront()) {
+    for (Nursery::BufferSet::Range r = buffers.all(); !r.empty();
+         r.popFront()) {
       rt->defaultFreeOp()->free_(r.front());
     }
   } while (!lifoBlocksToFree.ref().isEmpty() ||
            !buffersToFreeAfterMinorGC.ref().empty());
 }
 
-void GCRuntime::waitBackgroundFreeEnd() {
-  freeTask.join();
-}
+void GCRuntime::waitBackgroundFreeEnd() { freeTask.join(); }
 
 struct IsAboutToBeFinalizedFunctor {
   template <typename T>
   bool operator()(Cell** t) {
     mozilla::DebugOnly<const Cell*> prior = *t;
     bool result = IsAboutToBeFinalizedUnbarriered(reinterpret_cast<T**>(t));
     // Sweep should not have to deal with moved pointers, since moving GC
     // handles updating the UID table manually.
@@ -6930,18 +6928,18 @@ static bool ShouldCleanUpEverything(JS::
                                     JSGCInvocationKind gckind) {
   // During shutdown, we must clean everything up, for the sake of leak
   // detection. When a runtime has no contexts, or we're doing a GC before a
   // shutdown CC, those are strong indications that we're shutting down.
   return IsShutdownGC(reason) || gckind == GC_SHRINK;
 }
 
 static bool ShouldSweepOnBackgroundThread(JS::gcreason::Reason reason) {
-  return reason != JS::gcreason::DESTROY_RUNTIME &&
-         !gcTracer.traceEnabled() && CanUseExtraThreads();
+  return reason != JS::gcreason::DESTROY_RUNTIME && !gcTracer.traceEnabled() &&
+         CanUseExtraThreads();
 }
 
 void GCRuntime::incrementalSlice(SliceBudget& budget,
                                  JS::gcreason::Reason reason,
                                  AutoGCSession& session) {
   AutoDisableBarriers disableBarriers(rt);
 
   bool destroyingRuntime = (reason == JS::gcreason::DESTROY_RUNTIME);
--- a/js/src/gc/Heap.h
+++ b/js/src/gc/Heap.h
@@ -161,18 +161,19 @@ class FreeSpan {
       // The last space points to the next free span (which may be empty).
       const FreeSpan* next = nextSpan(arena);
       first = next->first;
       last = next->last;
     } else {
       return nullptr;  // The span is empty.
     }
     checkSpan(arena);
-    DebugOnlyPoison(reinterpret_cast<void*>(thing), JS_ALLOCATED_TENURED_PATTERN,
-                    thingSize, MemCheckKind::MakeUndefined);
+    DebugOnlyPoison(reinterpret_cast<void*>(thing),
+                    JS_ALLOCATED_TENURED_PATTERN, thingSize,
+                    MemCheckKind::MakeUndefined);
     return reinterpret_cast<TenuredCell*>(thing);
   }
 
   inline void checkSpan(const Arena* arena) const;
   inline void checkRange(uintptr_t first, uintptr_t last,
                          const Arena* arena) const;
 };
 
@@ -232,17 +233,17 @@ class Arena {
  private:
   /*
    * When recursive marking uses too much stack we delay marking of
    * arenas and link them into a list for later processing. This
    * uses the following fields.
    */
   static const size_t DELAYED_MARKING_FLAG_BITS = 3;
   static const size_t DELAYED_MARKING_ARENA_BITS =
-    JS_BITS_PER_WORD - 8 - DELAYED_MARKING_FLAG_BITS;
+      JS_BITS_PER_WORD - 8 - DELAYED_MARKING_FLAG_BITS;
   size_t onDelayedMarkingList_ : 1;
   size_t hasDelayedBlackMarking_ : 1;
   size_t hasDelayedGrayMarking_ : 1;
   size_t nextDelayedMarkingArena_ : DELAYED_MARKING_ARENA_BITS;
   static_assert(
       DELAYED_MARKING_ARENA_BITS >= JS_BITS_PER_WORD - ArenaShift,
       "Arena::nextDelayedMarkingArena_ packing assumes that ArenaShift has "
       "enough bits to cover allocKind and delayed marking state.");
--- a/js/src/gc/Marking.cpp
+++ b/js/src/gc/Marking.cpp
@@ -969,17 +969,17 @@ struct ParticipatesInCCFunctor {
     return TypeParticipatesInCC<T>::value;
   }
 };
 
 static bool TraceKindParticipatesInCC(JS::TraceKind kind) {
   return DispatchTraceKindTyped(ParticipatesInCCFunctor(), kind);
 }
 
-#endif // DEBUG
+#endif  // DEBUG
 
 template <typename T>
 bool js::GCMarker::mark(T* thing) {
   if (IsInsideNursery(thing)) {
     return false;
   }
   AssertShouldMarkInZone(thing);
   TenuredCell* cell = TenuredCell::fromPointer(thing);
--- a/js/src/gc/Nursery.cpp
+++ b/js/src/gc/Nursery.cpp
@@ -67,18 +67,17 @@ static_assert(sizeof(js::NurseryChunk) =
               "Nursery chunk size must match gc::Chunk size.");
 
 } /* namespace js */
 
 inline void js::NurseryChunk::poisonAndInit(JSRuntime* rt, size_t extent) {
   MOZ_ASSERT(extent <= ChunkSize);
   MOZ_MAKE_MEM_UNDEFINED(this, extent);
 
-  Poison(this, JS_FRESH_NURSERY_PATTERN, extent,
-         MemCheckKind::MakeUndefined);
+  Poison(this, JS_FRESH_NURSERY_PATTERN, extent, MemCheckKind::MakeUndefined);
 
   new (&trailer) gc::ChunkTrailer(rt, &rt->gc.storeBuffer());
 }
 
 inline void js::NurseryChunk::poisonAfterSweep(size_t extent) {
   MOZ_ASSERT(extent <= ChunkSize);
   // We can poison the same chunk more than once, so first make sure memory
   // sanitizers will let us poison it.
@@ -177,19 +176,17 @@ bool js::Nursery::init(uint32_t maxNurse
   if (!runtime()->gc.storeBuffer().enable()) {
     return false;
   }
 
   MOZ_ASSERT(isEnabled());
   return true;
 }
 
-js::Nursery::~Nursery() {
-  disable();
-}
+js::Nursery::~Nursery() { disable(); }
 
 void js::Nursery::enable() {
   MOZ_ASSERT(isEmpty());
   MOZ_ASSERT(!runtime()->gc.isVerifyPreBarriersEnabled());
   if (isEnabled() || !chunkCountLimit()) {
     return;
   }
 
--- a/js/src/gc/Nursery.h
+++ b/js/src/gc/Nursery.h
@@ -307,18 +307,17 @@ class Nursery {
 
   MOZ_MUST_USE bool queueDictionaryModeObjectToSweep(NativeObject* obj);
 
   size_t sizeOfHeapCommitted() const {
     return allocatedChunkCount() * gc::ChunkSize;
   }
   size_t sizeOfMallocedBuffers(mozilla::MallocSizeOf mallocSizeOf) const {
     size_t total = 0;
-    for (BufferSet::Range r = mallocedBuffers.all(); !r.empty();
-         r.popFront()) {
+    for (BufferSet::Range r = mallocedBuffers.all(); !r.empty(); r.popFront()) {
       total += mallocSizeOf(r.front());
     }
     total += mallocedBuffers.shallowSizeOfExcludingThis(mallocSizeOf);
     return total;
   }
 
   // The number of bytes from the start position to the end of the nursery.
   // pass maxChunkCount(), allocatedChunkCount() or chunkCountLimit()
--- a/js/src/gc/Statistics.cpp
+++ b/js/src/gc/Statistics.cpp
@@ -176,18 +176,19 @@ Phase Statistics::lookupChildPhase(Phase
   for (phase = phaseKinds[phaseKind].firstPhase; phase != Phase::NONE;
        phase = phases[phase].nextWithPhaseKind) {
     if (phases[phase].parent == currentPhase()) {
       break;
     }
   }
 
   if (phase == Phase::NONE) {
-      MOZ_CRASH_UNSAFE_PRINTF("Child phase kind %u not found under current phase kind %u",
-                              unsigned(phaseKind), unsigned(currentPhaseKind()));
+    MOZ_CRASH_UNSAFE_PRINTF(
+        "Child phase kind %u not found under current phase kind %u",
+        unsigned(phaseKind), unsigned(currentPhaseKind()));
   }
 
   return phase;
 }
 
 inline decltype(mozilla::MakeEnumeratedRange(Phase::FIRST, Phase::LIMIT))
 AllPhases() {
   return mozilla::MakeEnumeratedRange(Phase::FIRST, Phase::LIMIT);
--- a/js/src/jit-test/tests/debug/bug-1192401.js
+++ b/js/src/jit-test/tests/debug/bug-1192401.js
@@ -1,5 +1,6 @@
+// |jit-test| --more-compartments
 const dbg = new Debugger();
 const g = evalcx("lazy");
 dbg.addDebuggee(g);
 dbg.memory.trackingAllocationSites = true;
 g.eval("this.alloc = {}");
--- a/js/src/jit-test/tests/realms/basic.js
+++ b/js/src/jit-test/tests/realms/basic.js
@@ -73,8 +73,15 @@ function testTypedArrayLazyBuffer(global
     var arr1 = new global.Int32Array(1);
     var arr2 = new Int32Array(arr1);
     assertEq(objectGlobal(arr2.buffer), this);
     global.buf = arr1.buffer;
     global.eval("assertEq(objectGlobal(buf), this);");
 }
 testTypedArrayLazyBuffer(newGlobal());
 testTypedArrayLazyBuffer(newGlobal({sameCompartmentAs: this}));
+
+function testEvalcx() {
+    var g = newGlobal();
+    evalcx("this.x = 7", g);
+    assertEq(g.x, 7);
+}
+testEvalcx();
--- a/js/src/jit/BaselineCompiler.cpp
+++ b/js/src/jit/BaselineCompiler.cpp
@@ -1735,18 +1735,18 @@ static const VMFunction GetFunctionThisI
 
 template <typename Handler>
 bool BaselineCodeGen<Handler>::emit_JSOP_FUNCTIONTHIS() {
   MOZ_ASSERT(function());
   MOZ_ASSERT(!function()->isArrow());
 
   frame.pushThis();
 
-  // In strict mode code or self-hosted functions, |this| is left alone.
-  if (script->strict() || (function() && function()->isSelfHostedBuiltin())) {
+  // In strict mode code, |this| is left alone.
+  if (script->strict()) {
     return true;
   }
 
   // Load |thisv| in R0. Skip the call if it's already an object.
   Label skipCall;
   frame.popRegsAndSync(1);
   masm.branchTestObject(Assembler::Equal, R0, &skipCall);
 
--- a/js/src/jit/CodeGenerator.cpp
+++ b/js/src/jit/CodeGenerator.cpp
@@ -5408,19 +5408,19 @@ static const VMFunction DefVarInfo =
     FunctionInfo<DefVarFn>(DefVarOperation, "DefVarOperation");
 
 void CodeGenerator::visitDefVar(LDefVar* lir) {
   Register envChain = ToRegister(lir->environmentChain());
 
   JSScript* script = current->mir()->info().script();
   jsbytecode* pc = lir->mir()->resumePoint()->pc();
 
-  pushArg(ImmPtr(pc));                    // jsbytecode*
-  pushArg(ImmGCPtr(script));              // JSScript*
-  pushArg(envChain);                      // JSObject*
+  pushArg(ImmPtr(pc));        // jsbytecode*
+  pushArg(ImmGCPtr(script));  // JSScript*
+  pushArg(envChain);          // JSObject*
 
   callVM(DefVarInfo, lir);
 }
 
 typedef bool (*DefLexicalFn)(JSContext*, HandleObject, HandleScript,
                              jsbytecode*);
 static const VMFunction DefLexicalInfo =
     FunctionInfo<DefLexicalFn>(DefLexicalOperation, "DefLexicalOperation");
--- a/js/src/jit/IonBuilder.cpp
+++ b/js/src/jit/IonBuilder.cpp
@@ -12847,18 +12847,18 @@ AbortReasonOr<Ok> IonBuilder::jsop_check
 
   return Ok();
 }
 
 AbortReasonOr<Ok> IonBuilder::jsop_functionthis() {
   MOZ_ASSERT(info().funMaybeLazy());
   MOZ_ASSERT(!info().funMaybeLazy()->isArrow());
 
-  if (script()->strict() || info().funMaybeLazy()->isSelfHostedBuiltin()) {
-    // No need to wrap primitive |this| in strict mode or self-hosted code.
+  if (script()->strict()) {
+    // No need to wrap primitive |this| in strict mode.
     current->pushSlot(info().thisSlot());
     return Ok();
   }
 
   if (thisTypes && (thisTypes->getKnownMIRType() == MIRType::Object ||
                     (thisTypes->empty() && baselineFrame_ &&
                      baselineFrame_->thisType.isSomeObject()))) {
     // This is safe, because if the entry type of |this| is an object, it
--- a/js/src/jit/JitFrames.cpp
+++ b/js/src/jit/JitFrames.cpp
@@ -145,18 +145,17 @@ class IonTryNoteFilter {
   }
 
   bool operator()(const JSTryNote* note) { return note->stackDepth <= depth_; }
 };
 
 class TryNoteIterIon : public TryNoteIter<IonTryNoteFilter> {
  public:
   TryNoteIterIon(JSContext* cx, const InlineFrameIterator& frame)
-      : TryNoteIter(cx, frame.script(), frame.pc(),
-                    IonTryNoteFilter(frame)) {}
+      : TryNoteIter(cx, frame.script(), frame.pc(), IonTryNoteFilter(frame)) {}
 };
 
 static void HandleExceptionIon(JSContext* cx, const InlineFrameIterator& frame,
                                ResumeFromException* rfe,
                                bool* hitBailoutException) {
   if (cx->realm()->isDebuggee()) {
     // We need to bail when there is a catchable exception, and we are the
     // debuggee of a Debugger with a live onExceptionUnwind hook, or if a
@@ -318,18 +317,17 @@ class BaselineTryNoteFilter {
     uint32_t currDepth = frame_->numValueSlots() - frame_->script()->nfixed();
     return note->stackDepth <= currDepth;
   }
 };
 
 class TryNoteIterBaseline : public TryNoteIter<BaselineTryNoteFilter> {
  public:
   TryNoteIterBaseline(JSContext* cx, BaselineFrame* frame, jsbytecode* pc)
-      : TryNoteIter(cx, frame->script(), pc, BaselineTryNoteFilter(frame)) {
-  }
+      : TryNoteIter(cx, frame->script(), pc, BaselineTryNoteFilter(frame)) {}
 };
 
 // Close all live iterators on a BaselineFrame due to exception unwinding. The
 // pc parameter is updated to where the envs have been unwound to.
 static void CloseLiveIteratorsBaselineForUncatchableException(
     JSContext* cx, const JSJitFrameIter& frame, jsbytecode* pc) {
   for (TryNoteIterBaseline tni(cx, frame.baselineFrame(), pc); !tni.done();
        ++tni) {
--- a/js/src/jit/arm64/Assembler-arm64.h
+++ b/js/src/jit/arm64/Assembler-arm64.h
@@ -38,26 +38,26 @@ static constexpr Register ScratchReg2{Re
 static constexpr ARMRegister ScratchReg2_64 = {ScratchReg2, 64};
 
 static constexpr FloatRegister ReturnDoubleReg = {FloatRegisters::d0,
                                                   FloatRegisters::Double};
 static constexpr FloatRegister ScratchDoubleReg = {FloatRegisters::d31,
                                                    FloatRegisters::Double};
 struct ScratchDoubleScope : public AutoFloatRegisterScope {
   explicit ScratchDoubleScope(MacroAssembler& masm)
-    : AutoFloatRegisterScope(masm, ScratchDoubleReg) {}
+      : AutoFloatRegisterScope(masm, ScratchDoubleReg) {}
 };
 
 static constexpr FloatRegister ReturnFloat32Reg = {FloatRegisters::s0,
                                                    FloatRegisters::Single};
 static constexpr FloatRegister ScratchFloat32Reg = {FloatRegisters::s31,
                                                     FloatRegisters::Single};
 struct ScratchFloat32Scope : public AutoFloatRegisterScope {
   explicit ScratchFloat32Scope(MacroAssembler& masm)
-    : AutoFloatRegisterScope(masm, ScratchFloat32Reg) {}
+      : AutoFloatRegisterScope(masm, ScratchFloat32Reg) {}
 };
 
 static constexpr Register InvalidReg{Registers::invalid_reg};
 static constexpr FloatRegister InvalidFloatReg = {FloatRegisters::invalid_fpreg,
                                                   FloatRegisters::Single};
 
 static constexpr Register OsrFrameReg{Registers::x3};
 static constexpr Register CallTempReg0{Registers::x9};
--- a/js/src/jit/arm64/MacroAssembler-arm64.cpp
+++ b/js/src/jit/arm64/MacroAssembler-arm64.cpp
@@ -488,20 +488,20 @@ void MacroAssembler::storeRegsInMask(Liv
     dest.offset -= reg.size();
     if (reg.isDouble()) {
       storeDouble(reg, dest);
     } else if (reg.isSingle()) {
       storeFloat32(reg, dest);
     } else {
       MOZ_CRASH("Unknown register type.");
     }
-
   }
   MOZ_ASSERT(numFpu == 0);
-  // Padding to keep the stack aligned, taken from the x64 and mips64 implementations.
+  // Padding to keep the stack aligned, taken from the x64 and mips64
+  // implementations.
   diffF -= diffF % sizeof(uintptr_t);
   MOZ_ASSERT(diffF == 0);
 }
 
 void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
                                          LiveRegisterSet ignore) {
   // The offset of the data from the stack pointer.
   uint32_t offset = 0;
--- a/js/src/jsapi.cpp
+++ b/js/src/jsapi.cpp
@@ -3552,18 +3552,18 @@ CompileOptions& CompileOptions::setIntro
   RootedScript maybeScript(cx);
   const char* filename;
   unsigned lineno;
   uint32_t pcOffset;
   bool mutedErrors;
   DescribeScriptedCallerForCompilation(cx, &maybeScript, &filename, &lineno,
                                        &pcOffset, &mutedErrors);
   if (filename) {
-    return setIntroductionInfo(filename, introductionType, lineno,
-                               maybeScript, pcOffset);
+    return setIntroductionInfo(filename, introductionType, lineno, maybeScript,
+                               pcOffset);
   } else {
     return setIntroductionType(introductionType);
   }
 }
 
 #if defined(JS_BUILD_BINAST)
 
 JSScript* JS::DecodeBinAST(JSContext* cx, const ReadOnlyCompileOptions& options,
--- a/js/src/jsfriendapi.cpp
+++ b/js/src/jsfriendapi.cpp
@@ -1323,19 +1323,21 @@ JS_FRIEND_API void js::SetWindowProxyCla
 }
 
 JS_FRIEND_API void js::SetWindowProxy(JSContext* cx, HandleObject global,
                                       HandleObject windowProxy) {
   AssertHeapIsIdle();
   CHECK_THREAD(cx);
 
   cx->check(global, windowProxy);
+  MOZ_ASSERT(IsWindowProxy(windowProxy));
 
-  MOZ_ASSERT(IsWindowProxy(windowProxy));
-  global->as<GlobalObject>().setWindowProxy(windowProxy);
+  GlobalObject& globalObj = global->as<GlobalObject>();
+  globalObj.setWindowProxy(windowProxy);
+  globalObj.lexicalEnvironment().setWindowProxyThisValue(windowProxy);
 }
 
 JS_FRIEND_API JSObject* js::ToWindowIfWindowProxy(JSObject* obj) {
   if (IsWindowProxy(obj)) {
     return &obj->nonCCWGlobal();
   }
   return obj;
 }
--- a/js/src/shell/js.cpp
+++ b/js/src/shell/js.cpp
@@ -3762,16 +3762,23 @@ static void SetStandardRealmOptions(JS::
       .setBigIntEnabled(enableBigInt)
 #endif
       .setStreamsEnabled(enableStreams);
 }
 
 static JSObject* NewSandbox(JSContext* cx, bool lazy) {
   JS::RealmOptions options;
   SetStandardRealmOptions(options);
+
+  if (defaultToSameCompartment) {
+    options.creationOptions().setExistingCompartment(cx->global());
+  } else {
+    options.creationOptions().setNewCompartmentAndZone();
+  }
+
   RootedObject obj(cx,
                    JS_NewGlobalObject(cx, &sandbox_class, nullptr,
                                       JS::DontFireOnNewGlobalHook, options));
   if (!obj) {
     return nullptr;
   }
 
   {
@@ -3843,27 +3850,23 @@ static bool EvalInContext(JSContext* cx,
     return true;
   }
 
   JS::AutoFilename filename;
   unsigned lineno;
 
   DescribeScriptedCaller(cx, &filename, &lineno);
   {
-    Maybe<JSAutoRealm> ar;
-    unsigned flags;
-    JSObject* unwrapped = UncheckedUnwrap(sobj, true, &flags);
-    if (flags & Wrapper::CROSS_COMPARTMENT) {
-      sobj = unwrapped;
-      ar.emplace(cx, sobj);
-    }
+    sobj = UncheckedUnwrap(sobj, true);
+
+    JSAutoRealm ar(cx, sobj);
 
     sobj = ToWindowIfWindowProxy(sobj);
 
-    if (!(sobj->getClass()->flags & JSCLASS_IS_GLOBAL)) {
+    if (!JS_IsGlobalObject(sobj)) {
       JS_ReportErrorASCII(cx, "Invalid scope argument to evalcx");
       return false;
     }
 
     JS::CompileOptions opts(cx);
     opts.setFileAndLine(filename.get(), lineno);
 
     JS::SourceText<char16_t> srcBuf;
--- a/js/src/vm/Debugger.cpp
+++ b/js/src/vm/Debugger.cpp
@@ -4208,19 +4208,19 @@ static T* findDebuggerInVector(Debugger*
     }
   }
   MOZ_ASSERT(p != vec->end());
   return p;
 }
 
 // a ReadBarriered version for findDebuggerInVector
 // TODO: Bug 1515934 - findDebuggerInVector<T> triggers read barriers.
-static ReadBarriered<Debugger*>*
-findDebuggerInVector(Debugger* dbg,
-                     Vector<ReadBarriered<Debugger*>, 0, js::SystemAllocPolicy>* vec) {
+static ReadBarriered<Debugger*>* findDebuggerInVector(
+    Debugger* dbg,
+    Vector<ReadBarriered<Debugger*>, 0, js::SystemAllocPolicy>* vec) {
   ReadBarriered<Debugger*>* p;
   for (p = vec->begin(); p != vec->end(); p++) {
     if (p->unbarrieredGet() == dbg) {
       break;
     }
   }
   MOZ_ASSERT(p != vec->end());
   return p;
--- a/js/src/vm/EnvironmentObject.cpp
+++ b/js/src/vm/EnvironmentObject.cpp
@@ -1128,27 +1128,30 @@ LexicalEnvironmentObject::createHollowFo
 
 bool LexicalEnvironmentObject::isExtensible() const {
   return NativeObject::isExtensible();
 }
 
 Value LexicalEnvironmentObject::thisValue() const {
   MOZ_ASSERT(isExtensible());
   Value v = getReservedSlot(THIS_VALUE_OR_SCOPE_SLOT);
-  if (v.isObject()) {
-    // A WindowProxy may have been attached after this environment was
-    // created so check ToWindowProxyIfWindow again. For example,
-    // GlobalObject::createInternal will construct its lexical environment
-    // before SetWindowProxy can be called.
-    // See also: js::GetThisValue / js::GetThisValueOfLexical
-    return ObjectValue(*ToWindowProxyIfWindow(&v.toObject()));
-  }
+
+  // Windows must never be exposed to script. setWindowProxyThisValue should
+  // have set this to the WindowProxy.
+  MOZ_ASSERT_IF(v.isObject(), !IsWindow(&v.toObject()));
+
   return v;
 }
 
+void LexicalEnvironmentObject::setWindowProxyThisValue(JSObject* obj) {
+  MOZ_ASSERT(isGlobal());
+  MOZ_ASSERT(IsWindowProxy(obj));
+  setReservedSlot(THIS_VALUE_OR_SCOPE_SLOT, ObjectValue(*obj));
+}
+
 const Class LexicalEnvironmentObject::class_ = {
     "LexicalEnvironment",
     JSCLASS_HAS_RESERVED_SLOTS(LexicalEnvironmentObject::RESERVED_SLOTS) |
         JSCLASS_IS_ANONYMOUS,
     JS_NULL_CLASS_OPS,
     JS_NULL_CLASS_SPEC,
     JS_NULL_CLASS_EXT,
     JS_NULL_OBJECT_OPS};
--- a/js/src/vm/EnvironmentObject.h
+++ b/js/src/vm/EnvironmentObject.h
@@ -562,16 +562,18 @@ class LexicalEnvironmentObject : public 
 
   // Is this the global lexical scope?
   bool isGlobal() const { return enclosingEnvironment().is<GlobalObject>(); }
 
   GlobalObject& global() const {
     return enclosingEnvironment().as<GlobalObject>();
   }
 
+  void setWindowProxyThisValue(JSObject* obj);
+
   // Global and non-syntactic lexical scopes are extensible. All other
   // lexical scopes are not.
   bool isExtensible() const;
 
   // Is this a syntactic (i.e. corresponds to a source text) lexical
   // environment?
   bool isSyntactic() const { return !isExtensible() || isGlobal(); }
 
--- a/js/src/vm/Interpreter.cpp
+++ b/js/src/vm/Interpreter.cpp
@@ -114,22 +114,24 @@ bool js::BoxNonStrictThis(JSContext* cx,
   return true;
 }
 
 bool js::GetFunctionThis(JSContext* cx, AbstractFramePtr frame,
                          MutableHandleValue res) {
   MOZ_ASSERT(frame.isFunctionFrame());
   MOZ_ASSERT(!frame.callee()->isArrow());
 
-  if (frame.thisArgument().isObject() || frame.callee()->strict() ||
-      frame.callee()->isSelfHostedBuiltin()) {
+  if (frame.thisArgument().isObject() || frame.callee()->strict()) {
     res.set(frame.thisArgument());
     return true;
   }
 
+  MOZ_ASSERT(!frame.callee()->isSelfHostedBuiltin(),
+             "Self-hosted builtins must be strict");
+
   RootedValue thisv(cx, frame.thisArgument());
 
   // If there is a NSVO on environment chain, use it as basis for fallback
   // global |this|. This gives a consistent definition of global lexical
   // |this| between function and global contexts.
   //
   // NOTE: If only non-syntactic WithEnvironments are on the chain, we use the
   // global lexical |this| value. This is for compatibility with the Subscript
@@ -1100,18 +1102,17 @@ class InterpreterTryNoteFilter {
  public:
   explicit InterpreterTryNoteFilter(const InterpreterRegs& regs)
       : regs_(regs) {}
   bool operator()(const JSTryNote* note) {
     return note->stackDepth <= regs_.stackDepth();
   }
 };
 
-class TryNoteIterInterpreter
-    : public TryNoteIter<InterpreterTryNoteFilter> {
+class TryNoteIterInterpreter : public TryNoteIter<InterpreterTryNoteFilter> {
  public:
   TryNoteIterInterpreter(JSContext* cx, const InterpreterRegs& regs)
       : TryNoteIter(cx, regs.fp()->script(), regs.pc,
                     InterpreterTryNoteFilter(regs)) {}
 };
 
 static void UnwindIteratorsForUncatchableException(
     JSContext* cx, const InterpreterRegs& regs) {
--- a/js/src/vm/Interpreter.h
+++ b/js/src/vm/Interpreter.h
@@ -410,17 +410,16 @@ class MOZ_STACK_CLASS TryNoteIter {
 
   bool pcInRange() const {
     // This checks both ends of the range at once
     // because unsigned integers wrap on underflow.
     uint32_t offset = pcOffset_;
     uint32_t start = tn_->start;
     uint32_t length = tn_->length;
     return offset - start < length;
-
   }
   bool done() const { return tn_ == tnEnd_; }
   const JSTryNote* operator*() const { return tn_; }
 };
 
 bool HandleClosingGeneratorReturn(JSContext* cx, AbstractFramePtr frame,
                                   bool ok);
 
--- a/js/src/vm/JSScript-inl.h
+++ b/js/src/vm/JSScript-inl.h
@@ -114,17 +114,17 @@ inline js::RegExpObject* JSScript::getRe
 inline js::GlobalObject& JSScript::global() const {
   /*
    * A JSScript always marks its realm's global (via bindings) so we can
    * assert that maybeGlobal is non-null here.
    */
   return *realm()->maybeGlobal();
 }
 
-inline bool JSScript::hasGlobal(const js::GlobalObject *global) const {
+inline bool JSScript::hasGlobal(const js::GlobalObject* global) const {
   return global == realm()->unsafeUnbarrieredMaybeGlobal();
 }
 
 inline js::LexicalScope* JSScript::maybeNamedLambdaScope() const {
   // Dynamically created Functions via the 'new Function' are considered
   // named lambdas but they do not have the named lambda scope of
   // textually-created named lambdas.
   js::Scope* scope = outermostScope();
--- a/js/src/vm/JSScript.cpp
+++ b/js/src/vm/JSScript.cpp
@@ -1377,17 +1377,18 @@ ScriptSourceObject* ScriptSourceObject::
   }
 
   // The slots below should either be populated by a call to initFromOptions or,
   // if this is a non-canonical ScriptSourceObject, they are unused. Poison
   // them.
   obj->initReservedSlot(ELEMENT_SLOT, MagicValue(JS_GENERIC_MAGIC));
   obj->initReservedSlot(ELEMENT_PROPERTY_SLOT, MagicValue(JS_GENERIC_MAGIC));
   obj->initReservedSlot(INTRODUCTION_SCRIPT_SLOT, MagicValue(JS_GENERIC_MAGIC));
-  obj->initReservedSlot(INTRODUCTION_SOURCE_OBJECT_SLOT, MagicValue(JS_GENERIC_MAGIC));
+  obj->initReservedSlot(INTRODUCTION_SOURCE_OBJECT_SLOT,
+                        MagicValue(JS_GENERIC_MAGIC));
 
   return obj;
 }
 
 ScriptSourceObject* ScriptSourceObject::create(JSContext* cx,
                                                ScriptSource* source) {
   return createInternal(cx, source, nullptr);
 }
--- a/js/src/vm/SavedStacks.cpp
+++ b/js/src/vm/SavedStacks.cpp
@@ -1672,18 +1672,18 @@ bool SavedStacks::getLocation(JSContext*
     }
   }
 
   locationp.set(p->value());
   return true;
 }
 
 void SavedStacks::chooseSamplingProbability(Realm* realm) {
-  // Use unbarriered version to prevent triggering read barrier while collecting,
-  // this is safe as long as global does not escape.
+  // Use unbarriered version to prevent triggering read barrier while
+  // collecting, this is safe as long as global does not escape.
   GlobalObject* global = realm->unsafeUnbarrieredMaybeGlobal();
   if (!global) {
     return;
   }
 
   GlobalObject::DebuggerVector* dbgs = global->getDebuggers();
   if (!dbgs || dbgs->empty()) {
     return;
@@ -1698,18 +1698,17 @@ void SavedStacks::chooseSamplingProbabil
     // such that the vector gets reallocated.
     MOZ_ASSERT(dbgs->begin() == begin);
     // Use unbarrieredGet() to prevent triggering read barrier while collecting,
     // this is safe as long as dbgp does not escape.
     Debugger* dbgp = p->unbarrieredGet();
 
     if (dbgp->trackingAllocationSites && dbgp->enabled) {
       foundAnyDebuggers = true;
-      probability =
-          std::max(dbgp->allocationSamplingProbability, probability);
+      probability = std::max(dbgp->allocationSamplingProbability, probability);
     }
   }
   MOZ_ASSERT(foundAnyDebuggers);
 
   if (!bernoulliSeeded) {
     mozilla::Array<uint64_t, 2> seed;
     GenerateXorShift128PlusSeed(seed);
     bernoulli.setRandomState(seed[0], seed[1]);
--- a/js/src/vm/SelfHosting.cpp
+++ b/js/src/vm/SelfHosting.cpp
@@ -3342,16 +3342,17 @@ bool JSRuntime::cloneSelfHostedFunctionS
   RootedScope emptyGlobalScope(cx, &cx->global()->emptyGlobalScope());
   if (!CloneScriptIntoFunction(cx, emptyGlobalScope, targetFun, sourceScript)) {
     return false;
   }
   MOZ_ASSERT(!targetFun->isInterpretedLazy());
 
   MOZ_ASSERT(sourceFun->nargs() == targetFun->nargs());
   MOZ_ASSERT(sourceScript->hasRest() == targetFun->nonLazyScript()->hasRest());
+  MOZ_ASSERT(targetFun->strict(), "Self-hosted builtins must be strict");
 
   // The target function might have been relazified after its flags changed.
   targetFun->setFlags(targetFun->flags() | sourceFun->flags());
   return true;
 }
 
 bool JSRuntime::getUnclonedSelfHostedValue(JSContext* cx,
                                            HandlePropertyName name,
--- a/js/src/wasm/AsmJS.cpp
+++ b/js/src/wasm/AsmJS.cpp
@@ -1320,17 +1320,17 @@ class MOZ_STACK_CLASS JS_HAZ_ROOTED Modu
   };
 
   using SigSet = HashSet<HashableSig, HashableSig>;
   using FuncImportMap = HashMap<NamedSig, uint32_t, NamedSig>;
   using GlobalMap = HashMap<PropertyName*, Global*>;
   using MathNameMap = HashMap<PropertyName*, MathBuiltin>;
   using ArrayViewVector = Vector<ArrayView>;
 
-protected:
+ protected:
   JSContext* cx_;
   CodeNode* moduleFunctionNode_;
   PropertyName* moduleFunctionName_;
   PropertyName* globalArgumentName_ = nullptr;
   PropertyName* importArgumentName_ = nullptr;
   PropertyName* bufferArgumentName_ = nullptr;
   MathNameMap standardLibraryMathNames_;
   RootedFunction dummyFunction_;
--- a/js/src/wasm/WasmCompile.cpp
+++ b/js/src/wasm/WasmCompile.cpp
@@ -254,18 +254,20 @@ static const double arm64IonBytecodesPer
 
 // Tiering cutoff values: if code section sizes are below these values (when
 // divided by the effective number of cores) we do not tier, because we guess
 // that parallel Ion compilation will be fast enough.
 
 static const double x64DesktopTierCutoff = x64IonBytecodesPerMs * tierCutoffMs;
 static const double x86DesktopTierCutoff = x86IonBytecodesPerMs * tierCutoffMs;
 static const double x86MobileTierCutoff = x86DesktopTierCutoff / 2;  // Guess
-static const double arm32MobileTierCutoff = arm32IonBytecodesPerMs * tierCutoffMs;
-static const double arm64MobileTierCutoff = arm64IonBytecodesPerMs * tierCutoffMs;
+static const double arm32MobileTierCutoff =
+    arm32IonBytecodesPerMs * tierCutoffMs;
+static const double arm64MobileTierCutoff =
+    arm64IonBytecodesPerMs * tierCutoffMs;
 
 static double CodesizeCutoff(SystemClass cls) {
   switch (cls) {
     case SystemClass::DesktopX86:
     case SystemClass::DesktopUnknown32:
       return x86DesktopTierCutoff;
     case SystemClass::DesktopX64:
     case SystemClass::DesktopUnknown64:
--- a/layout/build/nsLayoutModule.cpp
+++ b/layout/build/nsLayoutModule.cpp
@@ -696,17 +696,14 @@ static void LayoutModuleDtor() {
   mozilla::image::ShutdownModule();
   gfxPlatform::Shutdown();
   gfx::gfxVars::Shutdown();
 
   nsScriptSecurityManager::Shutdown();
   xpcModuleDtor();
 }
 
-static const mozilla::Module kLayoutModule = {mozilla::Module::kVersion,
-                                              kLayoutCIDs,
-                                              kLayoutContracts,
-                                              kLayoutCategories,
-                                              nullptr,
-                                              Initialize,
-                                              LayoutModuleDtor};
+static const mozilla::Module kLayoutModule = {
+    mozilla::Module::kVersion, kLayoutCIDs, kLayoutContracts,
+    kLayoutCategories,         nullptr,     Initialize,
+    LayoutModuleDtor};
 
 NSMODULE_DEFN(nsLayoutModule) = &kLayoutModule;
--- a/layout/generic/nsIFrame.h
+++ b/layout/generic/nsIFrame.h
@@ -9,39 +9,40 @@
 #ifndef nsIFrame_h___
 #define nsIFrame_h___
 
 #ifndef MOZILLA_INTERNAL_API
 #error This header/class should only be used within Mozilla code. It should not be used by extensions.
 #endif
 
 #if (defined(XP_WIN) && !defined(HAVE_64BIT_BUILD)) || defined(ANDROID)
-// Blink's magic depth limit from its HTML parser (513) plus as much as fits in the
-// default run-time stack on armv7 Android on Dalvik when using display: block minus
-// a bit just to be sure. The Dalvik default stack crashes at 588. ART can do a few
-// frames more. Using the same number for 32-bit Windows for consistency. Over there,
-// Blink's magic depth of 513 doesn't fit in the default stack of 1 MB, but this magic
-// depth fits when the default is grown by mere 192 KB (tested in 64 KB increments).
+// Blink's magic depth limit from its HTML parser (513) plus as much as fits in
+// the default run-time stack on armv7 Android on Dalvik when using display:
+// block minus a bit just to be sure. The Dalvik default stack crashes at 588.
+// ART can do a few frames more. Using the same number for 32-bit Windows for
+// consistency. Over there, Blink's magic depth of 513 doesn't fit in the
+// default stack of 1 MB, but this magic depth fits when the default is grown by
+// mere 192 KB (tested in 64 KB increments).
 //
 // 32-bit Windows has a different limit compared to 64-bit desktop, because the
-// default stack size affects all threads and consumes address space. Fixing that
-// is bug 1257522.
+// default stack size affects all threads and consumes address space. Fixing
+// that is bug 1257522.
 //
-// 32-bit Android on ARM already happens to have defaults that are close enough to
-// what makes sense as a temporary measure on Windows, so adjusting the Android
-// stack can be a follow-up. The stack on 64-bit ARM needs adjusting in any case
-// before 64-bit ARM can become tier-1. See bug 1400811.
+// 32-bit Android on ARM already happens to have defaults that are close enough
+// to what makes sense as a temporary measure on Windows, so adjusting the
+// Android stack can be a follow-up. The stack on 64-bit ARM needs adjusting in
+// any case before 64-bit ARM can become tier-1. See bug 1400811.
 //
-// Ideally, we'd get rid of this smaller limit and make 32-bit Windows and Android
-// capable of working with the Linux/Mac/Win64 number below.
+// Ideally, we'd get rid of this smaller limit and make 32-bit Windows and
+// Android capable of working with the Linux/Mac/Win64 number below.
 #define MAX_REFLOW_DEPTH 585
 #else
-// Blink's magic depth limit from its HTML parser times two. Also just about fits
-// within the system default runtime stack limit of 8 MB on 64-bit Mac and Linux with
-// display: table-cell.
+// Blink's magic depth limit from its HTML parser times two. Also just about
+// fits within the system default runtime stack limit of 8 MB on 64-bit Mac and
+// Linux with display: table-cell.
 #define MAX_REFLOW_DEPTH 1026
 #endif
 
 /* nsIFrame is in the process of being deCOMtaminated, i.e., this file is
    eventually going to be eliminated, and all callers will use nsFrame instead.
    At the moment we're midway through this process, so you will see inlined
    functions and member variables in this file.  -dwh */
 
--- a/layout/generic/nsTextFrame.cpp
+++ b/layout/generic/nsTextFrame.cpp
@@ -1893,17 +1893,17 @@ bool BuildTextRunsScanner::ContinueTextR
         }
 
         aFrame = aFrame->GetParent();
       }
       return false;
     };
 
     const nsIFrame* ancestor =
-      nsLayoutUtils::FindNearestCommonAncestorFrame(aFrame1, aFrame2);
+        nsLayoutUtils::FindNearestCommonAncestorFrame(aFrame1, aFrame2);
     MOZ_ASSERT(ancestor);
 
     // Map inline-end and inline-start to physical sides for checking presence
     // of non-zero margin/border/padding.
     Side side1 = wm.PhysicalSide(eLogicalSideIEnd);
     Side side2 = wm.PhysicalSide(eLogicalSideIStart);
     // If the frames have an embedding level that is opposite to the writing
     // mode, we need to swap which sides we're checking.
--- a/layout/style/nsDOMCSSDeclaration.h
+++ b/layout/style/nsDOMCSSDeclaration.h
@@ -28,17 +28,17 @@ class DeclarationBlock;
 struct DeclarationBlockMutationClosure;
 namespace css {
 class Loader;
 class Rule;
 }  // namespace css
 namespace dom {
 class Document;
 class Element;
-}
+}  // namespace dom
 
 struct MutationClosureData {
   MutationClosureData() : mClosure(nullptr), mElement(nullptr), mModType(0) {}
 
   // mClosure is non-null as long as the closure hasn't been called.
   // This is needed so that it can be guaranteed that
   // InlineStyleDeclarationWillChange is always called before
   // SetInlineStyleDeclaration.
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@@ -16,27 +16,30 @@ LOCAL_INCLUDES += [
     '/third_party/dav1d/src/',
 ]
 
 CFLAGS += [
     # find the config.h file.
     '-I%s/dist/include/dav1d/' % TOPOBJDIR,
 ]
 
-if CONFIG['CC_TYPE'] == 'clang':
-    CFLAGS += ['-mstack-alignment=32']
-elif CONFIG['CC_TYPE'] == 'gcc':
-    CFLAGS += ['-mpreferred-stack-boundary=5']
+# This is Linux only for now
 
 # Attaching config.asm file
 if CONFIG['CPU_ARCH'] == 'x86':
     ASFLAGS += ['-I%s/media/libdav1d/asm/x86_32/' % TOPSRCDIR]
     SOURCES += ['x86_32/config.asm']
 
 if CONFIG['CPU_ARCH'] == 'x86_64':
+    # Change the default stack alignment (16) to 32
+    if CONFIG['CC_TYPE'] == 'clang':
+        CFLAGS += ['-mstack-alignment=32']
+    elif CONFIG['CC_TYPE'] == 'gcc':
+        CFLAGS += ['-mpreferred-stack-boundary=5']
+
     if CONFIG['OS_TARGET'] == 'Darwin':
         ASFLAGS += ['-I%s/media/libdav1d/asm/x86_64/osx/' % TOPSRCDIR]
         SOURCES += ['x86_64/osx/config.asm']
     else:
         ASFLAGS += ['-I%s/media/libdav1d/asm/x86_64/' % TOPSRCDIR]
         SOURCES += ['x86_64/config.asm']
 
 if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
--- a/media/libdav1d/asm/x86_32/config.asm
+++ b/media/libdav1d/asm/x86_32/config.asm
@@ -1,9 +1,11 @@
 ; Autogenerated by the Meson build system.
 ; Do not edit, your changes will be lost.
 
 %define ARCH_X86_32 1
 
 %define ARCH_X86_64 0
 
-%define STACK_ALIGNMENT 32
+%define PIC 1
 
+%define STACK_ALIGNMENT 16
+
--- a/media/libdav1d/asm/x86_64/config.asm
+++ b/media/libdav1d/asm/x86_64/config.asm
@@ -1,11 +1,9 @@
 ; Autogenerated by the Meson build system.
 ; Do not edit, your changes will be lost.
 
 %define ARCH_X86_32 0
 
 %define ARCH_X86_64 1
 
-%define PIC 1
-
 %define STACK_ALIGNMENT 32
 
--- a/media/libdav1d/config.h
+++ b/media/libdav1d/config.h
@@ -31,35 +31,39 @@
 #define ARCH_X86 0
 #endif
 
 // Set both bitdepeth in every case
 #define CONFIG_16BPC 1
 #define CONFIG_8BPC 1
 
 // Enable asm
-#if ARCH_X86_64 == 1 && defined(__linux__) && !defined(__ANDROID__)
+#if (ARCH_X86_32 == 1 || ARCH_X86_64 == 1) && defined(__linux__) && \
+    !defined(__ANDROID__)
 #define HAVE_ASM 1
 #else
 #define HAVE_ASM 0
 #endif
 
 // Set memory aligment
 #if defined(__ANDROID__) && (ARCH_ARM == 1 || ARCH_X86_32 == 1)
 #define HAVE_MEMALIGN 1
 #elif ARCH_X86_64 == 1 && (defined(_WIN32) || defined(__CYGWIN__)) && \
     defined(MOZ_ASAN)
 #define HAVE_ALIGNED_MALLOC 1
 #else
 #define HAVE_POSIX_MEMALIGN 1
 #endif
 
 // unistd.h is used by tools, which we do not
-// built, so we do not really care.
+// build, so we do not really care.
 #define HAVE_UNISTD_H 1
 
 // Important when asm is enabled
 #if defined(__APPLE__)
 #define PREFIX 1
 #endif
 
-// aligment is 32 in evry case
+#if ARCH_X86_32 == 1 && defined(__linux__) && !defined(__ANDROID__)
+#define STACK_ALIGNMENT 16
+#else
 #define STACK_ALIGNMENT 32
+#endif
--- a/media/libdav1d/moz.build
+++ b/media/libdav1d/moz.build
@@ -20,26 +20,29 @@ EXPORTS.dav1d += [
 
 # entrypoint source files
 SOURCES += [
     '../../third_party/dav1d/src/lib.c',
     '../../third_party/dav1d/src/thread_task.c',
 ]
 
 # Enable ASM on Linux for now.
-if CONFIG['OS_TARGET'] == 'Linux' and CONFIG['CPU_ARCH'] == 'x86_64':
+if CONFIG['OS_TARGET'] == 'Linux' and (CONFIG['CPU_ARCH'] in ('x86', 'x86_64')):
+    # Default stack alignment is 16 bytes
     DIRS += ['asm']
-    if CONFIG['CC_TYPE'] == 'clang':
-        CFLAGS += ['-mstack-alignment=32']
-        SOURCES['../../third_party/dav1d/src/lib.c'].flags += ['-mstackrealign']
-        SOURCES['../../third_party/dav1d/src/thread_task.c'].flags += ['-mstackrealign']
-    elif CONFIG['CC_TYPE'] == 'gcc':
-        CFLAGS += ['-mpreferred-stack-boundary=5']
-        SOURCES['../../third_party/dav1d/src/lib.c'].flags += ['-mincoming-stack-boundary=4']
-        SOURCES['../../third_party/dav1d/src/thread_task.c'].flags += ['-mincoming-stack-boundary=4']
+    if CONFIG['CPU_ARCH'] == 'x86_64':
+        # Update stack alignment to 32 bytes
+        if CONFIG['CC_TYPE'] == 'clang':
+            CFLAGS += ['-mstack-alignment=32']
+            SOURCES['../../third_party/dav1d/src/lib.c'].flags += ['-mstackrealign']
+            SOURCES['../../third_party/dav1d/src/thread_task.c'].flags += ['-mstackrealign']
+        elif CONFIG['CC_TYPE'] == 'gcc':
+            CFLAGS += ['-mpreferred-stack-boundary=5']
+            SOURCES['../../third_party/dav1d/src/lib.c'].flags += ['-mincoming-stack-boundary=4']
+            SOURCES['../../third_party/dav1d/src/thread_task.c'].flags += ['-mincoming-stack-boundary=4']
 
 # common sources
 SOURCES += [
     '../../third_party/dav1d/src/cdf.c',
     '../../third_party/dav1d/src/cpu.c',
     '../../third_party/dav1d/src/data.c',
     '../../third_party/dav1d/src/decode.c',
     '../../third_party/dav1d/src/dequant_tables.c',
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -15,15 +15,15 @@ origin:
   description: dav1d, a fast AV1 decoder
 
   # Full URL for the package's homepage/etc
   # Usually different from repository url
   url: https://code.videolan.org/videolan/dav1d
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit b53a99b97f93d0eb15d1f532739ca062fe44b4ca
+  release: commit f813285c1d1a5421e0180efbb7cbdd377cd31c69 (2019-01-13T22:08:25.000Z).
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
   # Multiple licenses can be specified (as a YAML list)
   # A "LICENSE" file must exist containing the full license text
   license: BSD-2-Clause
--- a/mobile/android/components/geckoview/GeckoViewHistory.h
+++ b/mobile/android/components/geckoview/GeckoViewHistory.h
@@ -16,17 +16,17 @@
 #include "mozilla/StaticPtr.h"
 
 class nsIWidget;
 
 namespace mozilla {
 namespace dom {
 class Document;
 }
-}
+}  // namespace mozilla
 
 struct VisitedURI {
   nsCOMPtr<nsIURI> mURI;
   bool mVisited = false;
 };
 
 struct TrackedURI {
   // Per `IHistory`, these are not owning references.
--- a/modules/libpref/init/all.js
+++ b/modules/libpref/init/all.js
@@ -1900,21 +1900,20 @@ pref("network.http.rcwn.small_resource_s
 
 pref("network.http.rcwn.min_wait_before_racing_ms", 0);
 pref("network.http.rcwn.max_wait_before_racing_ms", 500);
 
 // The ratio of the transaction count for the focused window and the count of
 // all available active connections.
 pref("network.http.focused_window_transaction_ratio", "0.9");
 
-// XXX Disable for intranet downloading issue.
 // This is the size of the flow control window (KB) (i.e., the amount of data
 // that the parent can send to the child before getting an ack). 0 for disable
 // the flow control.
-pref("network.http.send_window_size", 0);
+pref("network.http.send_window_size", 1024);
 
 // Whether or not we give more priority to active tab.
 // Note that this requires restart for changes to take effect.
 #ifdef ANDROID
 // disabled because of bug 1382274
 pref("network.http.active_tab_priority", false);
 #else
 pref("network.http.active_tab_priority", true);
--- a/netwerk/base/nsIOService.cpp
+++ b/netwerk/base/nsIOService.cpp
@@ -444,17 +444,18 @@ void nsIOService::NotifySocketProcessPre
 
   if (!XRE_IsParentProcess()) {
     return;
   }
 
   dom::Pref pref(nsCString(aName), /* isLocked */ false, null_t(), null_t());
   Preferences::GetPreference(&pref);
   auto sendPrefUpdate = [pref]() {
-    Unused << gIOService->mSocketProcess->GetActor()->SendPreferenceUpdate(pref);
+    Unused << gIOService->mSocketProcess->GetActor()->SendPreferenceUpdate(
+        pref);
   };
   CallOrWaitForSocketProcess(sendPrefUpdate);
 }
 
 void nsIOService::OnProcessLaunchComplete(SocketProcessHost *aHost,
                                           bool aSucceeded) {
   MOZ_ASSERT(NS_IsMainThread());
 
@@ -464,23 +465,24 @@ void nsIOService::OnProcessLaunchComplet
 
   if (mShutdown || !SocketProcessReady()) {
     return;
   }
 
   if (!mPendingEvents.IsEmpty()) {
     nsTArray<std::function<void()>> pendingEvents;
     mPendingEvents.SwapElements(pendingEvents);
-    for (auto& func : pendingEvents) {
+    for (auto &func : pendingEvents) {
       func();
     }
   }
 }
 
-void nsIOService::CallOrWaitForSocketProcess(const std::function<void()>& aFunc) {
+void nsIOService::CallOrWaitForSocketProcess(
+    const std::function<void()> &aFunc) {
   MOZ_ASSERT(NS_IsMainThread());
   if (IsSocketProcessLaunchComplete() && SocketProcessReady()) {
     aFunc();
   } else {
     mPendingEvents.AppendElement(aFunc);  // infallible
   }
 }
 
@@ -494,19 +496,19 @@ void nsIOService::OnProcessUnexpectedShu
 
   LOG(("nsIOService::OnProcessUnexpectedShutdown\n"));
   DestroySocketProcess();
 }
 
 RefPtr<MemoryReportingProcess> nsIOService::GetSocketProcessMemoryReporter() {
   // Check the prefs here again, since we don't want to create
   // SocketProcessMemoryReporter for some tests.
-  if (!Preferences::GetBool("network.process.enabled") || !SocketProcessReady()) {
+  if (!Preferences::GetBool("network.process.enabled") ||
+      !SocketProcessReady()) {
     return nullptr;
-
   }
 
   return new SocketProcessMemoryReporter();
 }
 
 NS_IMETHODIMP
 nsIOService::SocketProcessTelemetryPing() {
   CallOrWaitForSocketProcess([]() {
--- a/netwerk/ipc/SocketProcessHost.cpp
+++ b/netwerk/ipc/SocketProcessHost.cpp
@@ -57,18 +57,18 @@ class OfflineObserver final : public nsI
       if (!mProcessHost->IsConnected() ||
           mProcessHost->GetActor()->SendSetOffline(
               !strcmp(offline, "true") ? true : false)) {
         return NS_ERROR_NOT_AVAILABLE;
       }
     } else if (!strcmp(aTopic, NS_XPCOM_WILL_SHUTDOWN_OBSERVER_ID)) {
       nsCOMPtr<nsIObserverService> obs =
           mozilla::services::GetObserverService();
-        obs->RemoveObserver(this, NS_IPC_IOSERVICE_SET_OFFLINE_TOPIC);
-        obs->RemoveObserver(this, NS_XPCOM_WILL_SHUTDOWN_OBSERVER_ID);
+      obs->RemoveObserver(this, NS_IPC_IOSERVICE_SET_OFFLINE_TOPIC);
+      obs->RemoveObserver(this, NS_XPCOM_WILL_SHUTDOWN_OBSERVER_ID);
     }
 
     return NS_OK;
   }
   virtual ~OfflineObserver() = default;
 
   SocketProcessHost* mProcessHost;
 };
--- a/netwerk/protocol/http/HttpChannelChild.cpp
+++ b/netwerk/protocol/http/HttpChannelChild.cpp
@@ -875,17 +875,21 @@ void HttpChannelChild::OnTransportAndDat
       }
       mUnreportBytesRead = 0;
     }
   }
 }
 
 bool HttpChannelChild::NeedToReportBytesRead() {
   if (mCacheNeedToReportBytesReadInitialized) {
-    return mNeedToReportBytesRead;
+    // No need to call |SendBytesRead| when diversion starts, since the parent
+    // process will suspend for diversion, which is triggered earlier, during
+    // OnStartRequest on the child side. The parent takes over the flow control
+    // after the diverting starts, so sending |SendBytesRead| is redundant.
+    return mNeedToReportBytesRead && !mDivertingToParent;
   }
 
   // Might notify parent for partial cache, and the IPC message is ignored by
   // parent.
   int64_t contentLength = -1;
   if (gHttpHandler->SendWindowSize() == 0 || mIsFromCache ||
       NS_FAILED(GetContentLength(&contentLength)) ||
       contentLength < gHttpHandler->SendWindowSize()) {
--- a/netwerk/protocol/http/HttpChannelParent.cpp
+++ b/netwerk/protocol/http/HttpChannelParent.cpp
@@ -1604,16 +1604,17 @@ HttpChannelParent::OnDataAvailable(nsIRe
     aCount -= toRead;
     toRead = std::min<uint32_t>(aCount, kCopyChunkSize);
   }
 
   if (NeedFlowControl()) {
     // We're going to run out of sending window size
     if (mSendWindowSize > 0 && mSendWindowSize <= count) {
       MOZ_ASSERT(!mSuspendedForFlowControl);
+      LOG(("  suspend the channel due to e10s backpressure"));
       Unused << mChannel->Suspend();
       mSuspendedForFlowControl = true;
       mHasSuspendedByBackPressure = true;
     } else if (!mResumedTimestamp.IsNull()) {
       // Calculate the delay when the first packet arrived after resume
       Telemetry::AccumulateTimeDelta(
           Telemetry::NETWORK_BACK_PRESSURE_SUSPENSION_DELAY_TIME_MS,
           mResumedTimestamp);
@@ -1647,25 +1648,27 @@ bool HttpChannelParent::NeedFlowControl(
     mNeedFlowControl = false;
   }
   mCacheNeedFlowControlInitialized = true;
   return mNeedFlowControl;
 }
 
 mozilla::ipc::IPCResult HttpChannelParent::RecvBytesRead(
     const int32_t& aCount) {
-  if (!NeedFlowControl()) {
+  // no more flow control after diversion starts
+  if (!NeedFlowControl() || mDivertingFromChild) {
     return IPC_OK();
   }
 
   LOG(("HttpChannelParent::RecvBytesRead [this=%p count=%" PRId32 "]\n", this,
        aCount));
 
   if (mSendWindowSize <= 0 && mSendWindowSize + aCount > 0) {
     MOZ_ASSERT(mSuspendedForFlowControl);
+    LOG(("  resume the channel due to e10s backpressure relief"));
     Unused << mChannel->Resume();
     mSuspendedForFlowControl = false;
 
     mResumedTimestamp = TimeStamp::Now();
   }
   mSendWindowSize += aCount;
   return IPC_OK();
 }
@@ -2071,16 +2074,24 @@ nsresult HttpChannelParent::SuspendForDi
     // of this suspend, therefore mEventQ should not be suspened so we need to
     // resume it once.
     mEventQ->Resume();
   }
 
   rv = mParentListener->SuspendForDiversion();
   MOZ_ASSERT(NS_SUCCEEDED(rv));
 
+  // After we suspend for diversion, we don't need the flow control since the
+  // channel is suspended until all the data is consumed and no more e10s later.
+  // No point to have another redundant suspension.
+  if (mSuspendedForFlowControl) {
+    Unused << mChannel->Resume();
+    mSuspendedForFlowControl = false;
+  }
+
   // Once this is set, no more OnStart/OnData/OnStop callbacks should be sent
   // to the child.
   mDivertingFromChild = true;
 
   return NS_OK;
 }
 
 nsresult HttpChannelParent::SuspendMessageDiversion() {
--- a/parser/htmlparser/nsParserModule.cpp
+++ b/parser/htmlparser/nsParserModule.cpp
@@ -10,18 +10,17 @@
 
 //----------------------------------------------------------------------
 
 NS_GENERIC_FACTORY_CONSTRUCTOR(nsParser)
 
 NS_DEFINE_NAMED_CID(NS_PARSER_CID);
 
 static const mozilla::Module::CIDEntry kParserCIDs[] = {
-    {&kNS_PARSER_CID, false, nullptr, nsParserConstructor},
-    {nullptr}};
+    {&kNS_PARSER_CID, false, nullptr, nsParserConstructor}, {nullptr}};
 
 static nsresult Initialize() {
   nsresult rv = nsHTMLTags::AddRefTable();
   NS_ENSURE_SUCCESS(rv, rv);
 
 #ifdef DEBUG
   CheckElementTable();
   nsHTMLTags::TestTagTable();
--- a/services/common/docs/RemoteSettings.rst
+++ b/services/common/docs/RemoteSettings.rst
@@ -145,16 +145,37 @@ It basically consists in:
 #. (*optional*) Allow attachments on entries
 
 And once done:
 
 #. Create, modify or delete entries and let reviewers approve the changes
 #. Wait for Firefox to pick-up the changes for your settings key
 
 
+Global Notifications
+====================
+
+The polling for changes process sends two notifications that observers can register for:
+
+* ``remote-settings:changes-poll-start``: Polling for changes is starting, triggered either by the scheduled timer or a push broadcast.
+* ``remote-settings:changes-poll-end``: Polling for changes has ended.
+
+.. code-block:: javascript
+
+    const observer = {
+      observe(aSubject, aTopic, aData) {
+        Services.obs.removeObserver(this, "remote-settings:changes-poll-start");
+
+        const { expectedTimestamp } = JSON.parse(aData);
+        console.log("Polling started", expectedTimestamp ? "from push broadcast" : "by scheduled trigger");
+      },
+    };
+    Services.obs.addObserver(observer, "remote-settings:changes-poll-start");
+
+
 Advanced Options
 ================
 
 ``filterFunc``: custom filtering function
 -----------------------------------------
 
 By default, the entries returned by ``.get()`` are filtered based on the JEXL expression result from the ``filter_expression`` field. The ``filterFunc`` option allows to execute a custom filter (async) function, that should return the record (modified or not) if kept or a falsy value if filtered out.
 
--- a/services/settings/remote-settings.js
+++ b/services/settings/remote-settings.js
@@ -163,16 +163,18 @@ function remoteSettingsFunction() {
         UptakeTelemetry.report(TELEMETRY_HISTOGRAM_KEY,
                                UptakeTelemetry.STATUS.BACKOFF);
         throw new Error(`Server is asking clients to back off; retry in ${Math.ceil(remainingMilliseconds / 1000)}s.`);
       } else {
         gPrefs.clearUserPref(PREF_SETTINGS_SERVER_BACKOFF);
       }
     }
 
+    Services.obs.notifyObservers(null, "remote-settings:changes-poll-start", JSON.stringify({ expectedTimestamp }));
+
     const lastEtag = gPrefs.getCharPref(PREF_SETTINGS_LAST_ETAG, "");
 
     let pollResult;
     try {
       pollResult = await Utils.fetchLatestChanges(remoteSettings.pollingEndpoint, { expectedTimestamp, lastEtag });
     } catch (e) {
       // Report polling error to Uptake Telemetry.
       let report;
@@ -221,16 +223,17 @@ function remoteSettingsFunction() {
       const client = await _client(bucket, collection);
       if (!client) {
         continue;
       }
       // Start synchronization! It will be a no-op if the specified `lastModified` equals
       // the one in the local database.
       try {
         await client.maybeSync(last_modified, { loadDump });
+
         // Save last time this client was successfully synced.
         Services.prefs.setIntPref(client.lastCheckTimePref, checkedServerTimeInSeconds);
       } catch (e) {
         if (!firstError) {
           firstError = e;
           firstError.details = change;
         }
       }
@@ -240,17 +243,17 @@ function remoteSettingsFunction() {
       throw firstError;
     }
 
     // Save current Etag for next poll.
     if (currentEtag) {
       gPrefs.setCharPref(PREF_SETTINGS_LAST_ETAG, currentEtag);
     }
 
-    Services.obs.notifyObservers(null, "remote-settings-changes-polled");
+    Services.obs.notifyObservers(null, "remote-settings:changes-poll-end");
   };
 
   /**
    * Returns an object with polling status information and the list of
    * known remote settings collections.
    */
   remoteSettings.inspect = async () => {
     const { changes, currentEtag: serverTimestamp } = await Utils.fetchLatestChanges(remoteSettings.pollingEndpoint);
--- a/services/settings/test/unit/test_remote_settings_poll.js
+++ b/services/settings/test/unit/test_remote_settings_poll.js
@@ -55,16 +55,40 @@ function run_test() {
 
   registerCleanupFunction(function() {
     server.stop(function() { });
   });
 }
 
 add_task(clear_state);
 
+
+add_task(async function test_an_event_is_sent_on_start() {
+  server.registerPathHandler(CHANGES_PATH, (request, response) => {
+    response.write(JSON.stringify({ data: [] }));
+    response.setHeader("ETag", '"42"');
+    response.setHeader("Date", (new Date()).toUTCString());
+    response.setStatusLine(null, 200, "OK");
+  });
+  let notificationObserved = null;
+  const observer = {
+    observe(aSubject, aTopic, aData) {
+      Services.obs.removeObserver(this, "remote-settings:changes-poll-start");
+      notificationObserved = JSON.parse(aData);
+    },
+  };
+  Services.obs.addObserver(observer, "remote-settings:changes-poll-start");
+
+  await RemoteSettings.pollChanges({ expectedTimestamp: 13 });
+
+  Assert.equal(notificationObserved.expectedTimestamp, 13, "start notification should have been observed");
+});
+add_task(clear_state);
+
+
 add_task(async function test_check_success() {
   const startHistogram = getUptakeTelemetrySnapshot(TELEMETRY_HISTOGRAM_KEY);
   const serverTime = 8000;
 
   server.registerPathHandler(CHANGES_PATH, serveChangesEntries(serverTime, [{
     id: "330a0c5f-fadf-ff0b-40c8-4eb0d924ff6a",
     last_modified: 1100,
     host: "localhost",
@@ -81,25 +105,25 @@ add_task(async function test_check_succe
   // add a test kinto client that will respond to lastModified information
   // for a collection called 'test-collection'.
   // Let's use a bucket that is not the default one (`test-bucket`).
   Services.prefs.setCharPref("services.settings.test_bucket", "test-bucket");
   const c = RemoteSettings("test-collection", { bucketNamePref: "services.settings.test_bucket" });
   let maybeSyncCalled = false;
   c.maybeSync = () => { maybeSyncCalled = true; };
 
-  // Ensure that the remote-settings-changes-polled notification works
+  // Ensure that the remote-settings:changes-poll-end notification works
   let notificationObserved = false;
   const observer = {
     observe(aSubject, aTopic, aData) {
-      Services.obs.removeObserver(this, "remote-settings-changes-polled");
+      Services.obs.removeObserver(this, "remote-settings:changes-poll-end");
       notificationObserved = true;
     },
   };
-  Services.obs.addObserver(observer, "remote-settings-changes-polled");
+  Services.obs.addObserver(observer, "remote-settings:changes-poll-end");
 
   await RemoteSettings.pollChanges();
 
   // It didn't fail, hence we are sure that the unknown collection ``some-other-bucket/test-collection``
   // was ignored, otherwise it would have tried to reach the network.
 
   Assert.ok(maybeSyncCalled, "maybeSync was called");
   Assert.ok(notificationObserved, "a notification should have been observed");
@@ -126,17 +150,17 @@ add_task(async function test_update_time
     id: "028261ad-16d4-40c2-a96a-66f72914d125",
     last_modified: 42,
     host: "localhost",
     bucket: "main",
     collection: "whatever-collection",
   }]));
 
   await new Promise((resolve) => {
-    const e = "remote-settings-changes-polled";
+    const e = "remote-settings:changes-poll-end";
     const changesPolledObserver = {
       observe(aSubject, aTopic, aData) {
         Services.obs.removeObserver(this, e);
         resolve();
       },
     };
     Services.obs.addObserver(changesPolledObserver, e);
     remoteSettings.notify(null);
@@ -159,25 +183,25 @@ add_task(async function test_check_up_to
       response.setHeader("Date", (new Date(serverTime)).toUTCString());
       response.setStatusLine(null, 304, "Service Not Modified");
     }
   }
   server.registerPathHandler(CHANGES_PATH, server304);
 
   Services.prefs.setCharPref(PREF_LAST_ETAG, '"1100"');
 
-  // Ensure that the remote-settings-changes-polled notification is sent.
+  // Ensure that the remote-settings:changes-poll-end notification is sent.
   let notificationObserved = false;
   const observer = {
     observe(aSubject, aTopic, aData) {
-      Services.obs.removeObserver(this, "remote-settings-changes-polled");
+      Services.obs.removeObserver(this, "remote-settings:changes-poll-end");
       notificationObserved = true;
     },
   };
-  Services.obs.addObserver(observer, "remote-settings-changes-polled");
+  Services.obs.addObserver(observer, "remote-settings:changes-poll-end");
 
   // If server has no change, a 304 is received, maybeSync() is not called.
   let maybeSyncCalled = false;
   const c = RemoteSettings("test-collection", {
     bucketName: "test-bucket",
   });
   c.maybeSync = () => { maybeSyncCalled = true; };
 
@@ -252,16 +276,17 @@ add_task(async function test_client_last
   Services.prefs.setIntPref(c.lastCheckTimePref, 0);
 
   await RemoteSettings.pollChanges({ expectedTimestamp: '"42"' });
 
   notEqual(Services.prefs.getIntPref(c.lastCheckTimePref), 0);
 });
 add_task(clear_state);
 
+
 add_task(async function test_success_with_partial_list() {
   function partialList(request, response) {
     const entries = [{
       id: "028261ad-16d4-40c2-a96a-66f72914d125",
       last_modified: 43,
       host: "localhost",
       bucket: "main",
       collection: "cid-1",
@@ -348,21 +373,21 @@ add_task(async function test_server_erro
     }));
     response.setStatusLine(null, 503, "Service Unavailable");
   }
   server.registerPathHandler(CHANGES_PATH, simulateErrorResponse);
 
   let notificationObserved = false;
   const observer = {
     observe(aSubject, aTopic, aData) {
-      Services.obs.removeObserver(this, "remote-settings-changes-polled");
+      Services.obs.removeObserver(this, "remote-settings:changes-poll-end");
       notificationObserved = true;
     },
   };
-  Services.obs.addObserver(observer, "remote-settings-changes-polled");
+  Services.obs.addObserver(observer, "remote-settings:changes-poll-end");
   Services.prefs.setIntPref(PREF_LAST_UPDATE, 42);
 
   // pollChanges() fails with adequate error and no notification.
   let error;
   try {
     await RemoteSettings.pollChanges();
   } catch (e) {
     error = e;
--- a/storage/mozStorageConnection.cpp
+++ b/storage/mozStorageConnection.cpp
@@ -598,18 +598,20 @@ nsresult Connection::initialize() {
   // in memory database requested, sqlite uses a magic file name
   int srv = ::sqlite3_open_v2(":memory:", &mDBConn, mFlags, GetVFSName());
   if (srv != SQLITE_OK) {
     mDBConn = nullptr;
     return convertResultCode(srv);
   }
 
 #ifdef MOZ_SQLITE_FTS3_TOKENIZER
-  srv = ::sqlite3_db_config(mDBConn, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
-  MOZ_ASSERT(srv == SQLITE_OK, "SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER should be enabled");
+  srv =
+      ::sqlite3_db_config(mDBConn, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
+  MOZ_ASSERT(srv == SQLITE_OK,
+             "SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER should be enabled");
 #endif
 
   // Do not set mDatabaseFile or mFileURL here since this is a "memory"
   // database.
 
   nsresult rv = initializeInternal();
   NS_ENSURE_SUCCESS(rv, rv);
 
@@ -637,18 +639,20 @@ nsresult Connection::initialize(nsIFile 
   int srv = ::sqlite3_open_v2(NS_ConvertUTF16toUTF8(path).get(), &mDBConn,
                               mFlags, vfs);
   if (srv != SQLITE_OK) {
     mDBConn = nullptr;
     return convertResultCode(srv);
   }
 
 #ifdef MOZ_SQLITE_FTS3_TOKENIZER
-  srv = ::sqlite3_db_config(mDBConn, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
-  MOZ_ASSERT(srv == SQLITE_OK, "SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER should be enabled");
+  srv =
+      ::sqlite3_db_config(mDBConn, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
+  MOZ_ASSERT(srv == SQLITE_OK,
+             "SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER should be enabled");
 #endif
 
   // Do not set mFileURL here since this is database does not have an associated
   // URL.
   mDatabaseFile = aDatabaseFile;
 
   rv = initializeInternal();
   NS_ENSURE_SUCCESS(rv, rv);
@@ -671,18 +675,20 @@ nsresult Connection::initialize(nsIFileU
 
   int srv = ::sqlite3_open_v2(spec.get(), &mDBConn, mFlags, GetVFSName());
   if (srv != SQLITE_OK) {
     mDBConn = nullptr;
     return convertResultCode(srv);
   }
 
 #ifdef MOZ_SQLITE_FTS3_TOKENIZER
-  srv = ::sqlite3_db_config(mDBConn, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
-  MOZ_ASSERT(srv == SQLITE_OK, "SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER should be enabled");
+  srv =
+      ::sqlite3_db_config(mDBConn, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
+  MOZ_ASSERT(srv == SQLITE_OK,
+             "SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER should be enabled");
 #endif
 
   // Set both mDatabaseFile and mFileURL here.
   mFileURL = aFileURL;
   mDatabaseFile = databaseFile;
 
   rv = initializeInternal();
   NS_ENSURE_SUCCESS(rv, rv);
--- a/testing/geckodriver/src/marionette.rs
+++ b/testing/geckodriver/src/marionette.rs
@@ -15,20 +15,20 @@ use std::io::Error as IoError;
 use std::io::ErrorKind;
 use std::io::Result as IoResult;
 use std::net::{TcpListener, TcpStream};
 use std::path::PathBuf;
 use std::sync::Mutex;
 use std::thread;
 use std::time;
 use webdriver::capabilities::CapabilitiesMatching;
-use webdriver::command::WebDriverCommand::{AcceptAlert, AddCookie, CloseWindow, DeleteCookie,
-                                           DeleteCookies, DeleteSession, DismissAlert,
-                                           ElementClear, ElementClick, ElementSendKeys,
-                                           ExecuteAsyncScript, ExecuteScript,
+use webdriver::command::WebDriverCommand::{AcceptAlert, AddCookie, NewWindow, CloseWindow,
+                                           DeleteCookie, DeleteCookies, DeleteSession,
+                                           DismissAlert, ElementClear, ElementClick,
+                                           ElementSendKeys, ExecuteAsyncScript, ExecuteScript,
                                            Extension, FindElement, FindElementElement,
                                            FindElementElements, FindElements, FullscreenWindow,
                                            Get, GetActiveElement, GetAlertText, GetCSSValue,
                                            GetCookies, GetCurrentUrl, GetElementAttribute,
                                            GetElementProperty, GetElementRect, GetElementTagName,
                                            GetElementText, GetNamedCookie, GetPageSource,
                                            GetTimeouts, GetTitle, GetWindowHandle,
                                            GetWindowHandles, GetWindowRect, GoBack, GoForward,
@@ -36,21 +36,21 @@ use webdriver::command::WebDriverCommand
                                            MinimizeWindow, NewSession, PerformActions, Refresh,
                                            ReleaseActions, SendAlertText, SetTimeouts,
                                            SetWindowRect, Status, SwitchToFrame,
                                            SwitchToParentFrame, SwitchToWindow,
                                            TakeElementScreenshot, TakeScreenshot};
 use webdriver::command::{ActionsParameters, AddCookieParameters, GetNamedCookieParameters,
                          GetParameters, JavascriptCommandParameters, LocatorParameters,
                          NewSessionParameters, SwitchToFrameParameters, SwitchToWindowParameters,
-                         TimeoutsParameters, WindowRectParameters};
+                         TimeoutsParameters, WindowRectParameters, NewWindowParameters};
 use webdriver::command::{WebDriverCommand, WebDriverMessage};
 use webdriver::common::{Cookie, FrameId, WebElement, ELEMENT_KEY, FRAME_KEY, WINDOW_KEY};
 use webdriver::error::{ErrorStatus, WebDriverError, WebDriverResult};
-use webdriver::response::{CloseWindowResponse, CookieResponse, CookiesResponse,
+use webdriver::response::{NewWindowResponse, CloseWindowResponse, CookieResponse, CookiesResponse,
                           ElementRectResponse, NewSessionResponse, TimeoutsResponse,
                           ValueResponse, WebDriverResponse, WindowRectResponse};
 use webdriver::server::{Session, WebDriverHandler};
 
 use crate::build::BuildInfo;
 use crate::capabilities::{FirefoxCapabilities, FirefoxOptions};
 use crate::logging;
 use crate::prefs;
@@ -517,16 +517,38 @@ impl MarionetteSession {
                 WebDriverResponse::Timeouts(TimeoutsResponse {
                     script: script,
                     page_load: page_load,
                     implicit: implicit,
                 })
             }
             Status => panic!("Got status command that should already have been handled"),
             GetWindowHandles => WebDriverResponse::Generic(resp.to_value_response(false)?),
+            NewWindow(_) => {
+                let handle: String = try_opt!(
+                    try_opt!(
+                        resp.result.get("handle"),
+                        ErrorStatus::UnknownError,
+                        "Failed to find handle field"
+                    ).as_str(),
+                    ErrorStatus::UnknownError,
+                    "Failed to interpret handle as string"
+                ).into();
+                let typ: String = try_opt!(
+                    try_opt!(
+                        resp.result.get("type"),
+                        ErrorStatus::UnknownError,
+                        "Failed to find type field"
+                    ).as_str(),
+                    ErrorStatus::UnknownError,
+                    "Failed to interpret type as string"
+                ).into();
+
+                WebDriverResponse::NewWindow(NewWindowResponse { handle, typ })
+            }
             CloseWindow => {
                 let data = try_opt!(
                     resp.result.as_array(),
                     ErrorStatus::UnknownError,
                     "Failed to interpret value as array"
                 );
                 let handles = data
                     .iter()
@@ -783,16 +805,17 @@ impl MarionetteCommand {
 
         let (opt_name, opt_parameters) = match msg.command {
             Status => panic!("Got status command that should already have been handled"),
             AcceptAlert => {
                 // Needs to be updated to "WebDriver:AcceptAlert" for Firefox 63
                 (Some("WebDriver:AcceptDialog"), None)
             }
             AddCookie(ref x) => (Some("WebDriver:AddCookie"), Some(x.to_marionette())),
+            NewWindow(ref x) => (Some("WebDriver:NewWindow"), Some(x.to_marionette())),
             CloseWindow => (Some("WebDriver:CloseWindow"), None),
             DeleteCookie(ref x) => {
                 let mut data = Map::new();
                 data.insert("name".to_string(), Value::String(x.clone()));
                 (Some("WebDriver:DeleteCookie"), Some(Ok(data)))
             }
             DeleteCookies => (Some("WebDriver:DeleteAllCookies"), None),
             DeleteSession => {
@@ -1420,16 +1443,26 @@ impl ToMarionette for LocatorParameters 
         Ok(try_opt!(
             serde_json::to_value(self)?.as_object(),
             ErrorStatus::UnknownError,
             "Expected an object"
         ).clone())
     }
 }
 
+impl ToMarionette for NewWindowParameters {
+    /// Builds the Marionette parameter map; the "type" entry is only
+    /// present when a type hint was supplied.
+    fn to_marionette(&self) -> WebDriverResult<Map<String, Value>> {
+        let hint = self.type_hint.iter();
+        let entries = hint.map(|t| ("type".to_string(), Value::String(t.clone())));
+        Ok(entries.collect())
+    }
+}
+
 impl ToMarionette for SwitchToFrameParameters {
     fn to_marionette(&self) -> WebDriverResult<Map<String, Value>> {
         let mut data = Map::new();
         let key = match self.id {
             None => None,
             Some(FrameId::Short(_)) => Some("id"),
             Some(FrameId::Element(_)) => Some("element"),
         };
--- a/testing/web-platform/meta/mediacapture-streams/MediaStream-default-feature-policy.https.html.ini
+++ b/testing/web-platform/meta/mediacapture-streams/MediaStream-default-feature-policy.https.html.ini
@@ -1,21 +1,32 @@
 [MediaStream-default-feature-policy.https.sub.html]
   [Default "microphone" feature policy ["self"\] disallows cross-origin iframes.]
     expected: FAIL
 
   [Default "camera" feature policy ["self"\] disallows cross-origin iframes.]
     expected: FAIL
 
-  [Default "camera; microphone" feature policy ["self"\] disallows cross-origin iframes.]
+  [Default "camera;microphone" feature policy ["self"\] disallows cross-origin iframes.]
+    expected: FAIL
+
+  [Feature policy "microphone" can be enabled in cross-origin iframes using "allow" attribute.]
+    expected: FAIL
+
+  [Feature policy "camera" can be enabled in cross-origin iframes using "allow" attribute.]
     expected: FAIL
 
 
 [MediaStream-default-feature-policy.https.html]
   [Default "microphone" feature policy ["self"\] disallows cross-origin iframes.]
     expected: FAIL
 
   [Default "camera" feature policy ["self"\] disallows cross-origin iframes.]
     expected: FAIL
 
-  [Default "camera; microphone" feature policy ["self"\] disallows cross-origin iframes.]
+  [Default "camera;microphone" feature policy ["self"\] disallows cross-origin iframes.]
     expected: FAIL
 
+  [Feature policy "microphone" can be enabled in cross-origin iframes using "allow" attribute.]
+    expected: FAIL
+
+  [Feature policy "camera" can be enabled in cross-origin iframes using "allow" attribute.]
+    expected: FAIL
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/meta/webdriver/tests/new_window/new_window.py.ini
@@ -0,0 +1,2 @@
+[new_window.py]
+  disabled: os == "android": Fennec doesn't support opening new windows
--- a/testing/web-platform/tests/feature-policy/resources/featurepolicy.js
+++ b/testing/web-platform/tests/feature-policy/resources/featurepolicy.js
@@ -77,17 +77,17 @@ function test_feature_availability_with_
   };
   test_feature_availability(null, test, src, test_result, allow_attribute);
 }
 
 // If this page is intended to test the named feature (according to the URL),
 // tests the feature availability and posts the result back to the parent.
 // Otherwise, does nothing.
 function test_feature_in_iframe(feature_name, feature_promise_factory) {
-  if (location.hash.includes(feature_name)) {
+  if (location.hash.endsWith(`#${feature_name}`)) {
     feature_promise_factory().then(
         () => window.parent.postMessage('#OK', '*'),
         (e) => window.parent.postMessage('#' + e.name, '*'));
   }
 }
 
 // Returns true if the URL for this page indicates that it is embedded in an
 // iframe.
--- a/testing/web-platform/tests/mediacapture-streams/MediaStream-default-feature-policy.https.html
+++ b/testing/web-platform/tests/mediacapture-streams/MediaStream-default-feature-policy.https.html
@@ -2,61 +2,78 @@
 <body>
   <script src=/resources/testharness.js></script>
   <script src=/resources/testharnessreport.js></script>
   <script src=/common/get-host-info.sub.js></script>
   <script src=/feature-policy/resources/featurepolicy.js></script>
   <script>
   'use strict';
 
-  // The promise_factory must return a promise that runs the feature and
-  // resolves if feature usage is successful, otherwise rejects. Using
-  // getUserMedia is successful if at least one mic/camera is returned when
-  // mic/camera has been explicitly allowed by feature policy.
-  function promise_factory(allowed_features) {
-    return new Promise((resolve, reject) => {
-      navigator.mediaDevices.getUserMedia({video: true, audio: true}).then(
-          function(stream) {
-            // If microphone is allowed, there should be at least one microphone
-            // in the result. If camera is allowed, there should be at least one
-            // camera in the result.
-            if ((allowed_features.includes('microphone') &&
-                 stream.getAudioTracks().length == 0) ||
-                (allowed_features.includes('camera') &&
-                 stream.getVideoTracks().length == 0)) {
-                reject('Feature policy allowed feature but devices not ' +
-                    'present.');
-            } else {
-              // Otherwise the result is expected.
-              resolve();
-            }
-          },
-          function(error) { reject(error); });
-    });
-  };
+  // Requests the given media and throws unless every requested kind yielded
+  // a track; any stream obtained is stopped before the promise settles.
+  async function gUM({audio, video}) {
+    let stream;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({audio, video});
+      const noAudio = () => stream.getAudioTracks().length == 0;
+      const noVideo = () => stream.getVideoTracks().length == 0;
+      if ((audio && noAudio()) || (video && noVideo())) {
+        throw {name: `All requested devices must be present with ` +
+                     `audio ${audio} and video ${video}, or fail`};
+      }
+    } finally {
+      if (stream) stream.getTracks().forEach(track => track.stop());
+    }
+  }
 
-  var cross_domain = get_host_info().HTTPS_REMOTE_ORIGIN;
+  // Passes only when gUM() rejects with NotAllowedError.
+  async function must_disallow_gUM({audio, video}) {
+    let denied = false;
+    try {
+      await gUM({audio, video});
+    } catch (e) {
+      if (e.name != 'NotAllowedError') throw e;
+      denied = true;
+    }
+    if (!denied) throw {name: `audio ${audio} and video ${video} ` +
+                              `constraints must not be allowed.`};
+  }
+
+  const cross_domain = get_host_info().HTTPS_REMOTE_ORIGIN;
   run_all_fp_tests_allow_self(
-      cross_domain,
-      'microphone',
-      'NotAllowedError',
-      function() {
-        return promise_factory('microphone');
-      });
-
-  run_all_fp_tests_allow_self(
-      cross_domain,
-      'camera',
-      'NotAllowedError',
-      function() {
-        return promise_factory('camera');
-      });
+    cross_domain,
+    'microphone',
+    'NotAllowedError',
+    async () => {
+      await gUM({audio: true});
+      if (window.location.href.includes(cross_domain)) {
+        await must_disallow_gUM({video: true});
+        await must_disallow_gUM({audio: true, video: true});
+      }
+    }
+  );
 
   run_all_fp_tests_allow_self(
     cross_domain,
-    'camera; microphone',
+    'camera',
     'NotAllowedError',
-    function() {
-      return promise_factory('camera; microphone');
-    });
+    async () => {
+      await gUM({video: true});
+      if (window.location.href.includes(cross_domain)) {
+        await must_disallow_gUM({audio: true});
+        await must_disallow_gUM({audio: true, video: true});
+      }
+    }
+  );
+
+  run_all_fp_tests_allow_self(
+    cross_domain,
+    'camera;microphone',
+    'NotAllowedError',
+    async () => {
+      await gUM({audio: true, video: true});
+      await gUM({audio: true});
+      await gUM({video: true});
+    }
+  );
   </script>
 </body>
 
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/webdriver/tests/new_window/__init__.py
@@ -0,0 +1,10 @@
+def opener(session):
+    """Return the result of evaluating ``window.opener`` via the session."""
+    script = "\n        return window.opener;\n        "
+    return session.execute_script(script)
+
+
+def window_name(session):
+    """Return the result of evaluating ``window.name`` via the session."""
+    script = "\n        return window.name;\n        "
+    return session.execute_script(script)
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/webdriver/tests/new_window/new.py
@@ -0,0 +1,52 @@
+import pytest
+
+from webdriver.transport import Response
+
+from tests.support.asserts import assert_error, assert_success
+
+
+def new_window(session, type_hint=None):
+    """POST a New Window command carrying ``type_hint`` as its "type"."""
+    endpoint = "session/{session_id}/window/new".format(**vars(session))
+    return session.transport.send("POST", endpoint, {"type": type_hint})
+
+
+def test_null_parameter_value(session, http):
+    # Posting None as the request body must be rejected as invalid argument.
+    with http.post("/session/%s/window/new" % session.session_id, None) as raw:
+        assert_error(Response.from_http(raw), "invalid argument")
+
+
+def test_no_browsing_context(session, closed_window):
+    # With the closed_window fixture active the command must fail.
+    assert_error(new_window(session), "no such window")
+
+
+@pytest.mark.parametrize("type_hint", [True, 42, 4.2, [], {}])
+def test_type_with_invalid_type(session, type_hint):
+    # Each of the parametrized non-string hints must be rejected.
+    assert_error(new_window(session, type_hint), "invalid argument")
+
+
+def test_type_with_null_value(session):
+    """A null type hint must add exactly one handle of type tab or window."""
+    before = session.handles
+
+    created = assert_success(new_window(session, type_hint=None))
+    after = session.handles
+
+    assert len(after) == len(before) + 1
+    assert created["handle"] in after and created["handle"] not in before
+    assert created["type"] in ["tab", "window"]
+
+
+def test_type_with_unknown_value(session):
+    """An unknown hint must add exactly one handle of type tab or window."""
+    before = session.handles
+
+    created = assert_success(new_window(session, type_hint="foo"))
+    after = session.handles
+
+    assert len(after) == len(before) + 1
+    assert created["handle"] in after and created["handle"] not in before
+    assert created["type"] in ["tab", "window"]
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/webdriver/tests/new_window/new_tab.py
@@ -0,0 +1,48 @@
+from tests.support.asserts import assert_success
+
+from . import opener, window_name
+
+
+def new_window(session, type_hint=None):
+    """POST a New Window command carrying ``type_hint`` as its "type"."""
+    endpoint = "session/{session_id}/window/new".format(**vars(session))
+    return session.transport.send("POST", endpoint, {"type": type_hint})
+
+
+def test_new_tab(session):
+    """A "tab" hint must add exactly one handle, reported with type "tab"."""
+    before = session.handles
+
+    created = assert_success(new_window(session, type_hint="tab"))
+    after = session.handles
+
+    assert len(after) == len(before) + 1
+    assert created["handle"] in after and created["handle"] not in before
+    assert created["type"] == "tab"
+
+
+def test_new_tab_opens_about_blank(session):
+    """After selecting the new tab's handle, the URL must be about:blank."""
+    created = assert_success(new_window(session, type_hint="tab"))
+    assert created["type"] == "tab"
+
+    session.handle = created["handle"]
+    assert session.url == "about:blank"
+
+
+def test_new_tab_sets_no_window_name(session):
+    """After selecting the new tab's handle, window_name() must be empty."""
+    created = assert_success(new_window(session, type_hint="tab"))
+    assert created["type"] == "tab"
+
+    session.handle = created["handle"]
+    assert window_name(session) == ""
+
+
+def test_new_tab_sets_no_opener(session):
+    """After selecting the new tab's handle, opener() must return None."""
+    created = assert_success(new_window(session, type_hint="tab"))
+    assert created["type"] == "tab"
+
+    session.handle = created["handle"]
+    assert opener(session) is None
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/webdriver/tests/new_window/new_window.py
@@ -0,0 +1,48 @@
+from tests.support.asserts import assert_success
+
+from . import opener, window_name
+
+
+def new_window(session, type_hint=None):
+    """POST a New Window command carrying ``type_hint`` as its "type"."""
+    endpoint = "session/{session_id}/window/new".format(**vars(session))
+    return session.transport.send("POST", endpoint, {"type": type_hint})
+
+
+def test_type_with_window(session):
+    """A "window" hint must add exactly one handle of type "window"."""
+    before = session.handles
+
+    created = assert_success(new_window(session, type_hint="window"))
+    after = session.handles
+
+    assert len(after) == len(before) + 1
+    assert created["handle"] in after and created["handle"] not in before
+    assert created["type"] == "window"
+
+
+def test_new_window_opens_about_blank(session):
+    """After selecting the new window's handle, the URL must be about:blank."""
+    created = assert_success(new_window(session, type_hint="window"))
+    assert created["type"] == "window"
+
+    session.handle = created["handle"]
+    assert session.url == "about:blank"
+
+
+def test_new_window_sets_no_window_name(session):
+    """After selecting the new window's handle, window_name() must be empty."""
+    created = assert_success(new_window(session, type_hint="window"))
+    assert created["type"] == "window"
+
+    session.handle = created["handle"]
+    assert window_name(session) == ""
+
+
+def test_new_window_sets_no_opener(session):
+    """After selecting the new window's handle, opener() must return None."""
+    created = assert_success(new_window(session, type_hint="window"))
+    assert created["type"] == "window"
+
+    session.handle = created["handle"]
+    assert opener(session) is None
new file mode 100644
--- /dev/null
+++ b/testing/web-platform/tests/webdriver/tests/new_window/user_prompts.py
@@ -0,0 +1,121 @@
+# META: timeout=long
+
+import pytest
+
+from tests.support.asserts import assert_dialog_handled, assert_error, assert_success
+
+
+def new_window(session, type_hint=None):
+    """POST a New Window command carrying ``type_hint`` as its "type"."""
+    endpoint = "session/{session_id}/window/new".format(**vars(session))
+    return session.transport.send("POST", endpoint, {"type": type_hint})
+
+
+@pytest.fixture
+def check_user_prompt_closed_without_exception(session, create_dialog):
+    """Return a checker: with a dialog open, New Window must still succeed
+    and the dialog must afterwards be handled with ``retval``."""
+    def check(dialog_type, retval):
+        before = session.handles
+
+        create_dialog(dialog_type, text=dialog_type)
+
+        created = assert_success(new_window(session))
+        after = session.handles
+
+        assert len(after) == len(before) + 1
+        assert created["handle"] in after and created["handle"] not in before
+
+        assert_dialog_handled(session, expected_text=dialog_type, expected_retval=retval)
+
+    return check
+
+
+@pytest.fixture
+def check_user_prompt_closed_with_exception(session, create_dialog):
+    """Return a checker: with a dialog open, New Window must fail, the dialog
+    must be handled with ``retval`` and the handle count must not change."""
+    def check(dialog_type, retval):
+        handle_count = len(session.handles)
+
+        create_dialog(dialog_type, text=dialog_type)
+
+        assert_error(new_window(session), "unexpected alert open")
+
+        assert_dialog_handled(session, expected_text=dialog_type, expected_retval=retval)
+        assert len(session.handles) == handle_count
+
+    return check
+
+
+@pytest.fixture
+def check_user_prompt_not_closed_but_exception(session, create_dialog):
+    """Return a checker: New Window must fail while the dialog stays open
+    (it is dismissed here) and the handle count must not change."""
+    def check(dialog_type):
+        handle_count = len(session.handles)
+
+        create_dialog(dialog_type, text=dialog_type)
+
+        assert_error(new_window(session), "unexpected alert open")
+
+        assert session.alert.text == dialog_type
+        session.alert.dismiss()
+        assert len(session.handles) == handle_count
+
+    return check
+
+
+# "accept": the prompt is closed with the accept return value and the
+# New Window command itself succeeds.
+@pytest.mark.capabilities({"unhandledPromptBehavior": "accept"})
+@pytest.mark.parametrize(
+    "dialog_type, retval",
+    [("alert", None), ("confirm", True), ("prompt", "")])
+def test_accept(check_user_prompt_closed_without_exception, dialog_type, retval):
+    check_user_prompt_closed_without_exception(dialog_type, retval)
+
+
+# "accept and notify": the prompt is closed with the accept return value
+# but the New Window command reports an error.
+@pytest.mark.capabilities({"unhandledPromptBehavior": "accept and notify"})
+@pytest.mark.parametrize(
+    "dialog_type, retval",
+    [("alert", None), ("confirm", True), ("prompt", "")])
+def test_accept_and_notify(check_user_prompt_closed_with_exception, dialog_type, retval):
+    check_user_prompt_closed_with_exception(dialog_type, retval)
+
+
+# "dismiss": the prompt is closed with the dismiss return value and the
+# New Window command itself succeeds.
+@pytest.mark.capabilities({"unhandledPromptBehavior": "dismiss"})
+@pytest.mark.parametrize(
+    "dialog_type, retval",
+    [("alert", None), ("confirm", False), ("prompt", None)])
+def test_dismiss(check_user_prompt_closed_without_exception, dialog_type, retval):
+    check_user_prompt_closed_without_exception(dialog_type, retval)
+
+
+# "dismiss and notify": the prompt is closed with the dismiss return value
+# but the New Window command reports an error.
+@pytest.mark.capabilities({"unhandledPromptBehavior": "dismiss and notify"})
+@pytest.mark.parametrize(
+    "dialog_type, retval",
+    [("alert", None), ("confirm", False), ("prompt", None)])
+def test_dismiss_and_notify(check_user_prompt_closed_with_exception, dialog_type, retval):
+    check_user_prompt_closed_with_exception(dialog_type, retval)
+
+
+@pytest.mark.capabilities({"unhandledPromptBehavior": "ignore"})
+@pytest.mark.parametrize("dialog_type", ["alert", "confirm", "prompt"])
+def test_ignore(check_user_prompt_not_closed_but_exception, dialog_type):
+    check_user_prompt_not_closed_but_exception(dialog_type)  # prompt stays open; command must error
+
+
+# No unhandledPromptBehavior capability is set here: the prompt must be
+# closed with the dismiss return values and the command must report an error.
+@pytest.mark.parametrize(
+    "dialog_type, retval",
+    [("alert", None), ("confirm", False), ("prompt", None)])
+def test_default(check_user_prompt_closed_with_exception, dialog_type, retval):
+    check_user_prompt_closed_with_exception(dialog_type, retval)
--- a/testing/webdriver/src/command.rs
+++ b/testing/webdriver/src/command.rs
@@ -18,16 +18,17 @@ pub enum WebDriverCommand<T: WebDriverEx
     GetCurrentUrl,
     GoBack,
     GoForward,
     Refresh,
     GetTitle,
     GetPageSource,
     GetWindowHandle,
     GetWindowHandles,
+    NewWindow(NewWindowParameters),
     CloseWindow,
     GetWindowRect,
     SetWindowRect(WindowRectParameters),
     MinimizeWindow,
     MaximizeWindow,
     FullscreenWindow,
     SwitchToWindow(SwitchToWindowParameters),
     SwitchToFrame(SwitchToFrameParameters),
@@ -115,16 +116,17 @@ impl<U: WebDriverExtensionRoute> WebDriv
             Route::GetCurrentUrl => WebDriverCommand::GetCurrentUrl,
             Route::GoBack => WebDriverCommand::GoBack,
             Route::GoForward => WebDriverCommand::GoForward,
             Route::Refresh => WebDriverCommand::Refresh,
             Route::GetTitle => WebDriverCommand::GetTitle,
             Route::GetPageSource => WebDriverCommand::GetPageSource,
             Route::GetWindowHandle => WebDriverCommand::GetWindowHandle,
             Route::GetWindowHandles => WebDriverCommand::GetWindowHandles,
+            Route::NewWindow => WebDriverCommand::NewWindow(serde_json::from_str(raw_body)?),
             Route::CloseWindow => WebDriverCommand::CloseWindow,
             Route::GetTimeouts => WebDriverCommand::GetTimeouts,
             Route::SetTimeouts => WebDriverCommand::SetTimeouts(serde_json::from_str(raw_body)?),
             Route::GetWindowRect | Route::GetWindowPosition | Route::GetWindowSize => {
                 WebDriverCommand::GetWindowRect
             }
             Route::SetWindowRect | Route::SetWindowPosition | Route::SetWindowSize => {
                 WebDriverCommand::SetWindowRect(serde_json::from_str(raw_body)?)
@@ -465,16 +467,22 @@ impl CapabilitiesMatching for NewSession
         match self {
             NewSessionParameters::Spec(x) => x.match_browser(browser_capabilities),
             NewSessionParameters::Legacy(x) => x.match_browser(browser_capabilities),
         }
     }
 }
 
 #[derive(Debug, PartialEq, Serialize, Deserialize)]
+pub struct NewWindowParameters {
+    #[serde(rename = "type")] // the JSON key is "type"
+    pub type_hint: Option<String>, // None when "type" is absent or null
+}
+
+#[derive(Debug, PartialEq, Serialize, Deserialize)]
 pub struct SendKeysParameters {
     pub text: String,
 }
 
 #[derive(Debug, PartialEq, Serialize, Deserialize)]
 pub struct SwitchToFrameParameters {
     pub id: Option<FrameId>,
 }
@@ -935,16 +943,55 @@ mod tests {
             alwaysMatch: Capabilities::new(),
             firstMatch: vec![Capabilities::new()],
         });
 
         check_deserialize(&json, &data);
     }
 
     #[test]
+    fn test_json_new_window_parameters_without_type() {
+        // An empty JSON object carries no type hint.
+        let expected = NewWindowParameters { type_hint: None };
+        let body = "{}";
+        check_deserialize(&body, &expected);
+    }
+
+    #[test]
+    fn test_json_new_window_parameters_with_optional_null_type() {
+        // An explicit null "type" deserializes to no type hint.
+        let expected = NewWindowParameters { type_hint: None };
+        let body = r#"{"type":null}"#;
+        check_deserialize(&body, &expected);
+    }
+
+    #[test]
+    fn test_json_new_window_parameters_with_supported_type() {
+        // The hint string "tab" is carried through unchanged.
+        let expected = NewWindowParameters { type_hint: Some(String::from("tab")) };
+        let body = r#"{"type":"tab"}"#;
+        check_deserialize(&body, &expected);
+    }
+
+    #[test]
+    fn test_json_new_window_parameters_with_unknown_type() {
+        // An unrecognised hint string is still accepted by deserialization.
+        let expected = NewWindowParameters { type_hint: Some(String::from("foo")) };
+        let body = r#"{"type":"foo"}"#;
+        check_deserialize(&body, &expected);
+    }
+
+    #[test]
+    fn test_json_new_window_parameters_with_invalid_type() {
+        // A numeric "type" must fail to deserialize.
+        let parsed: Result<NewWindowParameters, _> = serde_json::from_str(r#"{"type":3}"#);
+        assert!(parsed.is_err());
+    }
+
+    #[test]
     fn test_json_send_keys_parameters_with_value() {
         let json = r#"{"text":"foo"}"#;
         let data = SendKeysParameters { text: "foo".into() };
 
         check_deserialize(&json, &data);
     }
 
     #[test]
--- a/testing/webdriver/src/httpapi.rs
+++ b/testing/webdriver/src/httpapi.rs
@@ -20,16 +20,17 @@ fn standard_routes<U: WebDriverExtension
         (Method::GET, "/session/{sessionId}/title", Route::GetTitle),
         (Method::GET, "/session/{sessionId}/source", Route::GetPageSource),
         (Method::GET, "/session/{sessionId}/window", Route::GetWindowHandle),
         (
             Method::GET,
             "/session/{sessionId}/window/handles",
             Route::GetWindowHandles,
         ),
+        (Method::POST, "/session/{sessionId}/window/new", Route::NewWindow),
         (Method::DELETE, "/session/{sessionId}/window", Route::CloseWindow),
         (
             Method::GET,
             "/session/{sessionId}/window/size",
             Route::GetWindowSize,
         ),
         (
             Method::POST,
@@ -224,16 +225,17 @@ pub enum Route<U: WebDriverExtensionRout
     GetCurrentUrl,
     GoBack,
     GoForward,
     Refresh,
     GetTitle,
     GetPageSource,
     GetWindowHandle,
     GetWindowHandles,
+    NewWindow,
     CloseWindow,
     GetWindowSize,     // deprecated
     SetWindowSize,     // deprecated
     GetWindowPosition, // deprecated
     SetWindowPosition, // deprecated
     GetWindowRect,
     SetWindowRect,
     MinimizeWindow,
--- a/testing/webdriver/src/response.rs
+++ b/testing/webdriver/src/response.rs
@@ -1,15 +1,16 @@
 use crate::common::Cookie;
 use serde::ser::{Serialize, Serializer};
 use serde_json::Value;
 
 #[derive(Debug, PartialEq, Serialize)]
 #[serde(untagged, remote = "Self")]
 pub enum WebDriverResponse {
+    NewWindow(NewWindowResponse),
     CloseWindow(CloseWindowResponse),
     Cookie(CookieResponse),
     Cookies(CookiesResponse),
     DeleteSession,
     ElementRect(ElementRectResponse),
     Generic(ValueResponse),
     NewSession(NewSessionResponse),
     Timeouts(TimeoutsResponse),
@@ -28,16 +29,23 @@ impl Serialize for WebDriverResponse {
             value: &'a WebDriverResponse,
         }
 
         Wrapper { value: self }.serialize(serializer)
     }
 }
 
 #[derive(Debug, PartialEq, Serialize)]
+pub struct NewWindowResponse {
+    pub handle: String,
+    #[serde(rename = "type")] // serialized under the JSON key "type"
+    pub typ: String,
+}
+
+#[derive(Debug, PartialEq, Serialize)]
 pub struct CloseWindowResponse(pub Vec<String>);
 
 #[derive(Clone, Debug, PartialEq, Serialize)]
 pub struct CookieResponse(pub Cookie);
 
 #[derive(Debug, PartialEq, Serialize)]
 pub struct CookiesResponse(pub Vec<Cookie>);
 
@@ -129,16 +137,27 @@ pub struct WindowRectResponse {
 #[cfg(test)]
 mod tests {
     use super::*;
     use crate::common::Date;
     use crate::test::check_serialize;
     use serde_json;
 
     #[test]
+    fn test_json_new_window_response() {
+        // The response is wrapped in "value" and `typ` is emitted as "type".
+        let window = NewWindowResponse {
+            handle: String::from("42"),
+            typ: String::from("window"),
+        };
+        let expected = r#"{"value":{"handle":"42","type":"window"}}"#;
+        check_serialize(&expected, &WebDriverResponse::NewWindow(window));
+    }
+
+    #[test]
     fn test_json_close_window_response() {
         let json = r#"{"value":["1234"]}"#;
         let data = WebDriverResponse::CloseWindow(CloseWindowResponse(vec!["1234".into()]));
 
         check_serialize(&json, &data);
     }
 
     #[test]
--- a/third_party/dav1d/.gitlab-ci.yml
+++ b/third_party/dav1d/.gitlab-ci.yml
@@ -1,12 +1,23 @@
 stages:
+    - style
     - build
     - test
 
+# Fails the job when git grep finds a tab or carriage return in any file
+# outside */compat/*; otherwise /bin/true lets the job succeed.
+style-check:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
+    stage: style
+    tags: [debian, amd64]
+    script:
+        - git grep -n -e $'\t' --or -e $'\r' -- . ':(exclude)*/compat/*' && exit 1
+        - /bin/true
+
 build-debian:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release --werror
@@ -19,16 +30,29 @@ build-debian-static:
     tags:
         - debian
         - amd64
     script:
         - meson build --buildtype release --default-library static --werror
         - ninja -C build
         - cd build && meson test -v
 
+build-debian32:
+    image: registry.videolan.org:5000/dav1d-debian-unstable:20181218135732
+    stage: build
+    tags:
+        - debian
+        - amd64
+    script:
+        - meson build --buildtype release
+                      --werror
+                      --cross-file /opt/crossfiles/linux32.meson
+        - ninja -C build
+        - cd build && meson test -v
+
 build-win32:
     image: registry.videolan.org:5000/dav1d-debian-unstable:20181114201132
     stage: build
     tags:
         - win32
     script:
         - meson build --buildtype release
                       --werror
--- a/third_party/dav1d/include/common/attributes.h
+++ b/third_party/dav1d/include/common/attributes.h
@@ -33,25 +33,26 @@
 #include <stddef.h>
 
 #ifdef __GNUC__
 #define ATTR_ALIAS __attribute__((may_alias))
 #else
 #define ATTR_ALIAS
 #endif
 
-#if ARCH_X86
+#if ARCH_X86_64
+/* x86-64 needs 32-byte alignment for AVX2. */
 #define ALIGN_32_VAL 32
 #define ALIGN_16_VAL 16
-#elif ARCH_ARM || ARCH_AARCH64
-// ARM doesn't benefit from anything more than 16 byte alignment.
+#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
+/* ARM doesn't benefit from more than 16-byte alignment; x86-32 also uses 16. */
 #define ALIGN_32_VAL 16
 #define ALIGN_16_VAL 16
 #else
-// No need for extra alignment on platforms without assembly.
+/* No need for extra alignment on platforms without assembly. */
 #define ALIGN_32_VAL 8
 #define ALIGN_16_VAL 8
 #endif
 
 /*
  * API for variables, struct members (ALIGN()) like:
  * uint8_t var[1][2][3][4]
  * becomes:
--- a/third_party/dav1d/include/compat/msvc/stdatomic.h
+++ b/third_party/dav1d/include/compat/msvc/stdatomic.h
@@ -18,53 +18,53 @@
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-
-#ifndef MSCVER_STDATOMIC_H_
-#define MSCVER_STDATOMIC_H_
-
+
+#ifndef MSCVER_STDATOMIC_H_
+#define MSCVER_STDATOMIC_H_
+
 #if !defined(__cplusplus) && defined(_MSC_VER)
 
 #pragma warning(push)
 #pragma warning(disable:4067)    /* newline for __has_include_next */
 
 #if defined(__clang__) && __has_include_next(<stdatomic.h>)
    /* use the clang stdatomic.h with clang-cl*/
 #  include_next <stdatomic.h>
 #else /* ! stdatomic.h */
 
-#include <windows.h>
-
-#include "common/attributes.h"
-
-typedef volatile LONG  __declspec(align(32)) atomic_int;
-typedef volatile ULONG __declspec(align(32)) atomic_uint;
-
-typedef enum {
-    memory_order_relaxed,
-    memory_order_acquire
-} msvc_atomic_memory_order;
-
-#define atomic_init(p_a, v)           do { *(p_a) = (v); } while(0)
-#define atomic_store(p_a, v)          InterlockedExchange((LONG*)p_a, v)
-#define atomic_load(p_a)              InterlockedCompareExchange((LONG*)p_a, 0, 0)
-#define atomic_load_explicit(p_a, mo) atomic_load(p_a)
-
-/*
- * TODO use a special call to increment/decrement
- * using InterlockedIncrement/InterlockedDecrement
- */
-#define atomic_fetch_add(p_a, inc)    InterlockedExchangeAdd(p_a, inc)
-#define atomic_fetch_sub(p_a, dec)    InterlockedExchangeAdd(p_a, -(dec))
-
-#endif /* ! stdatomic.h */
-
-#pragma warning(pop)
-
-#endif /* !defined(__cplusplus) && defined(_MSC_VER) */
-
-#endif /* MSCVER_STDATOMIC_H_ */
+#include <windows.h>
+
+#include "common/attributes.h"
+
+typedef volatile LONG  __declspec(align(32)) atomic_int;
+typedef volatile ULONG __declspec(align(32)) atomic_uint;
+
+typedef enum {
+    memory_order_relaxed,
+    memory_order_acquire
+} msvc_atomic_memory_order;
+
+#define atomic_init(p_a, v)           do { *(p_a) = (v); } while(0)
+#define atomic_store(p_a, v)          InterlockedExchange((LONG*)p_a, v)
+#define atomic_load(p_a)              InterlockedCompareExchange((LONG*)p_a, 0, 0)
+#define atomic_load_explicit(p_a, mo) atomic_load(p_a)
+
+/*
+ * TODO use a special call to increment/decrement
+ * using InterlockedIncrement/InterlockedDecrement
+ */
+#define atomic_fetch_add(p_a, inc)    InterlockedExchangeAdd(p_a, inc)
+#define atomic_fetch_sub(p_a, dec)    InterlockedExchangeAdd(p_a, -(dec))
+
+#endif /* ! stdatomic.h */
+
+#pragma warning(pop)
+
+#endif /* !defined(__cplusplus) && defined(_MSC_VER) */
+
+#endif /* MSCVER_STDATOMIC_H_ */
--- a/third_party/dav1d/include/dav1d/common.h
+++ b/third_party/dav1d/include/dav1d/common.h
@@ -39,23 +39,32 @@
         #define DAV1D_API __attribute__ ((visibility ("default")))
       #else
         #define DAV1D_API
       #endif
     #endif
 #endif
 
 /**
+ * A reference-counted object wrapper for a user-configurable pointer.
+ */
+typedef struct Dav1dUserData {
+    const uint8_t *data; ///< data pointer
+    struct Dav1dRef *ref; ///< allocation origin
+} Dav1dUserData;
+
+/**
  * Input packet metadata which are copied from the input data used to
  * decode each image into the matching structure of the output image
  * returned back to the user. Since these are metadata fields, they
  * can be used for other purposes than the documented ones, they will
  * still be passed from input data to output picture without being
  * used internally.
  */
 typedef struct Dav1dDataProps {
     int64_t timestamp; ///< container timestamp of input data, INT64_MIN if unknown (default)
     int64_t duration; ///< container duration of input data, 0 if unknown (default)
     int64_t offset; ///< stream offset of input data, -1 if unknown (default)
     size_t size; ///< packet size, default Dav1dData.sz
+    struct Dav1dUserData user_data; ///< user-configurable data, default NULL members
 } Dav1dDataProps;
 
 #endif // __DAV1D_COMMON_H__
--- a/third_party/dav1d/include/dav1d/data.h
+++ b/third_party/dav1d/include/dav1d/data.h
@@ -53,26 +53,57 @@ DAV1D_API uint8_t * dav1d_data_create(Da
 /**
  * Wrap an existing data array.
  *
  * @param          data Input context.
  * @param           buf The data to be wrapped.
  * @param            sz Size of the data.
  * @param free_callback Function to be called when we release our last
  *                      reference to this data. In this callback, $buf will be
- *                      the $buf argument to this function, and $user_data
- *                      will be the $user_data input argument to this function.
- * @param     user_data Opaque parameter passed to free_callback().
+ *                      the $buf argument to this function, and $cookie will
+ *                      be the $cookie input argument to this function.
+ * @param        cookie Opaque parameter passed to free_callback().
  *
  * @return 0 on success. A negative errno value on error.
  */
 DAV1D_API int dav1d_data_wrap(Dav1dData *data, const uint8_t *buf, size_t sz,
-                              void (*free_callback)(const uint8_t *buf, void *user_data),
-                              void *user_data);
+                              void (*free_callback)(const uint8_t *buf, void *cookie),
+                              void *cookie);
+
+/**
+ * Wrap a user-provided data pointer into a reference counted object.
+ *
+ * data->m.user_data field will be initialized to wrap the provided $user_data
+ * pointer.
+ *
+ * $free_callback will be called on the same thread that released the last
+ * reference. If frame threading is used, make sure $free_callback is
+ * thread-safe.
+ *
+ * @param          data Input context.
+ * @param     user_data The user data to be wrapped.
+ * @param free_callback Function to be called when we release our last
+ *                      reference to this data. In this callback, $user_data
+ *                      will be the $user_data argument to this function, and
+ *                      $cookie will be the $cookie input argument to this
+ *                      function.
+ * @param        cookie Opaque parameter passed to $free_callback.
+ *
+ * @return 0 on success. A negative errno value on error.
+ */
+DAV1D_API int dav1d_data_wrap_user_data(Dav1dData *data,
+                                        const uint8_t *user_data,
+                                        void (*free_callback)(const uint8_t *user_data,
+                                                              void *cookie),
+                                        void *cookie);
 
 /**
  * Free the data reference.
  *
+ * The reference count for data->m.user_data will be decremented (if it has been
+ * initialized with dav1d_data_wrap_user_data). The $data object will be memset
+ * to 0.
+ *
  * @param data Input context.
  */
 DAV1D_API void dav1d_data_unref(Dav1dData *data);
 
 #endif /* __DAV1D_DATA_H__ */
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -142,16 +142,19 @@ elif cc.has_function('memalign', prefix 
     cdata.set('HAVE_MEMALIGN', 1)
 endif
 
 if (host_machine.cpu_family() == 'aarch64' or
     host_machine.cpu_family().startswith('arm'))
     if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
         cdata.set('HAVE_GETAUXVAL', 1)
     endif
+    if cc.has_function('elf_aux_info', prefix : '#include <sys/auxv.h>', args : test_args)
+        cdata.set('HAVE_ELF_AUX_INFO', 1)
+    endif
 endif
 
 # Compiler flag tests
 
 if cc.has_argument('-fvisibility=hidden')
     add_project_arguments('-fvisibility=hidden', language: 'c')
 else
     warning('Compiler does not support -fvisibility=hidden, all symbols will be public!')
@@ -162,16 +165,22 @@ endif
 # it is not an error and silently tolerated
 optional_arguments = [
   '-Wundef',
   '-Werror=vla',
   '-Wno-maybe-uninitialized',
   '-Wno-unused-parameter',
   '-Werror=missing-prototypes',
 ]
+if cc.get_id() == 'msvc'
+    optional_arguments += [
+      '-wd4028', # parameter different from declaration
+      '-wd4996'  # use of POSIX functions
+    ]
+endif
 
 if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
     optional_arguments += '-fomit-frame-pointer'
     optional_arguments += '-ffast-math'
 endif
 
 add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
 
@@ -186,40 +195,45 @@ if fuzzing_engine == 'libfuzzer'
 endif
 
 # Stack alignments flags
 
 stackalign_flag = []
 stackrealign_flag = []
 
 if host_machine.cpu_family().startswith('x86')
-    if cc.has_argument('-mpreferred-stack-boundary=5')
-        stackalign_flag = ['-mpreferred-stack-boundary=5']
-        stackrealign_flag = ['-mincoming-stack-boundary=4']
-        cdata_asm.set('STACK_ALIGNMENT', 32)
-        cdata.set('STACK_ALIGNMENT', 32)
-    elif cc.has_argument('-mpreferred-stack-boundary=4')
-        stackalign_flag = ['-mpreferred-stack-boundary=4']
-        stackrealign_flag = ['-mincoming-stack-boundary=4']
-        cdata_asm.set('STACK_ALIGNMENT', 16)
-        cdata.set('STACK_ALIGNMENT', 16)
-    elif cc.has_argument('-mstack-alignment=32')
-        stackalign_flag = ['-mstack-alignment=32']
-        stackrealign_flag = ['-mstackrealign']
-        cdata_asm.set('STACK_ALIGNMENT', 32)
-        cdata.set('STACK_ALIGNMENT', 32)
+    if host_machine.cpu_family() == 'x86_64'
+        if cc.has_argument('-mpreferred-stack-boundary=5')
+            stackalign_flag = ['-mpreferred-stack-boundary=5']
+            stackrealign_flag = ['-mincoming-stack-boundary=4']
+            stack_alignment = 32
+        elif cc.has_argument('-mstack-alignment=32')
+            stackalign_flag = ['-mstack-alignment=32']
+            stackrealign_flag = ['-mstackrealign']
+            stack_alignment = 32
+        else
+            stack_alignment = 16
+        endif
     else
-        if host_machine.cpu_family() == 'x86_64'
-            cdata_asm.set('STACK_ALIGNMENT', 16)
-            cdata.set('STACK_ALIGNMENT', 16)
+        if host_machine.system() == 'linux' or host_machine.system() == 'darwin'
+            stack_alignment = 16
+        elif cc.has_argument('-mpreferred-stack-boundary=4')
+            stackalign_flag = ['-mpreferred-stack-boundary=4']
+            stackrealign_flag = ['-mincoming-stack-boundary=2']
+            stack_alignment = 16
+        elif cc.has_argument('-mstack-alignment=16')
+            stackalign_flag = ['-mstack-alignment=16']
+            stackrealign_flag = ['-mstackrealign']
+            stack_alignment = 16
         else
-            cdata_asm.set('STACK_ALIGNMENT', 4)
-            cdata.set('STACK_ALIGNMENT', 4)
+            stack_alignment = 4
         endif
     endif
+    cdata_asm.set('STACK_ALIGNMENT', stack_alignment)
+    cdata.set('STACK_ALIGNMENT', stack_alignment)
 endif
 
 cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64')
 cdata.set10('ARCH_ARM',     host_machine.cpu_family().startswith('arm'))
 if (is_asm_enabled and
     (host_machine.cpu_family() == 'aarch64' or
      host_machine.cpu_family().startswith('arm')))
 
@@ -251,23 +265,22 @@ endif
 
 if host_machine.cpu_family().startswith('x86')
     cdata.set10('ARCH_X86', true)
     if host_machine.cpu_family() == 'x86_64'
         cdata_asm.set10('ARCH_X86_64', true)
         cdata.set10('ARCH_X86_64', true)
         cdata_asm.set10('ARCH_X86_32', false)
         cdata.set10('ARCH_X86_32', false)
-
-        cdata_asm.set10('PIC', true)
     else
         cdata_asm.set10('ARCH_X86_64', false)
         cdata.set10('ARCH_X86_64', false)
         cdata_asm.set10('ARCH_X86_32', true)
         cdata.set10('ARCH_X86_32', true)
+        cdata_asm.set10('PIC', true)
     endif
 else
     cdata.set10('ARCH_X86', false)
     cdata.set10('ARCH_X86_64', false)
     cdata.set10('ARCH_X86_32', false)
 endif
 
 if cc.symbols_have_underscore_prefix()
--- a/third_party/dav1d/src/arm/cpu.c
+++ b/third_party/dav1d/src/arm/cpu.c
@@ -32,16 +32,21 @@
 #if defined(HAVE_GETAUXVAL) && ARCH_ARM
 #include <sys/auxv.h>
 
 #ifndef HWCAP_ARM_NEON
 #define HWCAP_ARM_NEON (1 << 12)
 #endif
 #define NEON_HWCAP HWCAP_ARM_NEON
 
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+#include <sys/auxv.h>
+
+#define NEON_HWCAP HWCAP_NEON
+
 #elif defined(__ANDROID__)
 #include <stdio.h>
 #include <string.h>
 
 static unsigned parse_proc_cpuinfo(const char *flag) {
     FILE *file = fopen("/proc/cpuinfo", "r");
     if (!file)
         return 0;
@@ -67,19 +72,25 @@ static unsigned parse_proc_cpuinfo(const
     return 0;
 }
 #endif
 
 unsigned dav1d_get_cpu_flags_arm(void) {
     unsigned flags = 0;
 #if ARCH_AARCH64
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
+#elif defined(__ARM_NEON)
+    flags |= DAV1D_ARM_CPU_FLAG_NEON;
 #elif defined(HAVE_GETAUXVAL) && ARCH_ARM
     unsigned long hw_cap = getauxval(AT_HWCAP);
     flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
+    unsigned long hw_cap = 0;
+    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+    flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
 #elif defined(__ANDROID__)
     flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
 #elif defined(__APPLE__)
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
 #elif defined(_WIN32)
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
 #endif
 
--- a/third_party/dav1d/src/data.c
+++ b/third_party/dav1d/src/data.c
@@ -22,74 +22,124 @@
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include "config.h"
 
+#include <assert.h>
 #include <errno.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "dav1d/data.h"
 
 #include "common/validate.h"
 
 #include "src/data.h"
 #include "src/ref.h"
 
-uint8_t * dav1d_data_create(Dav1dData *const buf, const size_t sz) {
+uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
     validate_input_or_ret(buf != NULL, NULL);
 
     buf->ref = dav1d_ref_create(sz);
     if (!buf->ref) return NULL;
     buf->data = buf->ref->const_data;
     buf->sz = buf->m.size = sz;
     buf->m.timestamp = INT64_MIN;
     buf->m.duration = 0;
     buf->m.offset = -1;
+    buf->m.user_data.data = NULL;
+    buf->m.user_data.ref = NULL;
 
     return buf->ref->data;
 }
 
-int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr, const size_t sz,
-                    void (*free_callback)(const uint8_t *data, void *user_data),
-                    void *user_data)
+int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
+                             const size_t sz,
+                             void (*const free_callback)(const uint8_t *data,
+                                                         void *cookie),
+                             void *const cookie)
 {
     validate_input_or_ret(buf != NULL, -EINVAL);
     validate_input_or_ret(ptr != NULL, -EINVAL);
     validate_input_or_ret(free_callback != NULL, -EINVAL);
 
-    buf->ref = dav1d_ref_wrap(ptr, free_callback, user_data);
+    buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
     if (!buf->ref) return -ENOMEM;
     buf->data = ptr;
     buf->sz = buf->m.size = sz;
     buf->m.timestamp = INT64_MIN;
     buf->m.duration = 0;
     buf->m.offset = -1;
+    buf->m.user_data.data = NULL;
+    buf->m.user_data.ref = NULL;
 
     return 0;
 }
 
+int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
+                                       const uint8_t *const user_data,
+                                       void (*const free_callback)(const uint8_t *user_data,
+                                                                   void *cookie),
+                                       void *const cookie)
+{
+    validate_input_or_ret(buf != NULL, -EINVAL);
+    validate_input_or_ret(free_callback != NULL, -EINVAL);
+
+    buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie);
+    if (!buf->m.user_data.ref) return -ENOMEM;
+    buf->m.user_data.data = user_data;
+
+    return 0;
+}
+
+
+void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
+    validate_input(dst != NULL);
+    validate_input(dst->data == NULL);
+    validate_input(src != NULL);
+
+    if (src->ref) {
+        validate_input(src->data != NULL);
+        dav1d_ref_inc(src->ref);
+    }
+    if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
+    *dst = *src;
+}
+
 void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data == NULL);
     validate_input(src != NULL);
 
     if (src->ref)
         validate_input(src->data != NULL);
 
     *dst = *src;
     memset(src, 0, sizeof(*src));
 }
 
-void dav1d_data_unref(Dav1dData *const buf) {
+void dav1d_data_props_copy(Dav1dDataProps *const dst,
+                           const Dav1dDataProps *const src)
+{
+    assert(dst != NULL);
+    assert(src != NULL);
+
+    dav1d_ref_dec(&dst->user_data.ref);
+    *dst = *src;
+    if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
+}
+
+void dav1d_data_unref_internal(Dav1dData *const buf) {
     validate_input(buf != NULL);
 
+    struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
     if (buf->ref) {
         validate_input(buf->data != NULL);
         dav1d_ref_dec(&buf->ref);
     }
     memset(buf, 0, sizeof(*buf));
+    dav1d_ref_dec(&user_data_ref);
 }
--- a/third_party/dav1d/src/data.h
+++ b/third_party/dav1d/src/data.h
@@ -25,14 +25,34 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #ifndef __DAV1D_SRC_DATA_H__
 #define __DAV1D_SRC_DATA_H__
 
 #include "dav1d/data.h"
 
+void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
+
 /**
  * Move a data reference.
  */
 void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
 
+/**
+ * Copy the source properties to the destination and increase the
+ * user_data's reference count (if it's not NULL).
+ */
+void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src);
+
+uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
+int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
+                             void (*free_callback)(const uint8_t *data,
+                                                   void *user_data),
+                             void *user_data);
+int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
+                                       const uint8_t *user_data,
+                                       void (*free_callback)(const uint8_t *user_data,
+                                                             void *cookie),
+                                       void *cookie);
+void dav1d_data_unref_internal(Dav1dData *buf);
+
 #endif /* __DAV1D_SRC_DATA_H__ */
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -1259,26 +1259,27 @@ static int decode_b(Dav1dTileContext *co
         if (has_chroma) {
             if (bw4 < 2 &&  ss_hor)
                 border_left += 4;
             if (bh4 < 2 &&  ss_ver)
                 border_top  += 4;
         }
         int src_left   = t->bx * 4 + (b->mv[0].x >> 3);
         int src_top    = t->by * 4 + (b->mv[0].y >> 3);
-        int src_right  = src_left + w4 * 4;
-        int src_bottom = src_top  + h4 * 4;
+        int src_right  = src_left + bw4 * 4;
+        int src_bottom = src_top  + bh4 * 4;
+        const int border_right = ((ts->tiling.col_end + (bw4 - 1)) & ~(bw4 - 1)) * 4;
 
         // check against left or right tile boundary and adjust if necessary
         if (src_left < border_left) {
             src_right += border_left - src_left;
             src_left  += border_left - src_left;
-        } else if (src_right > ts->tiling.col_end * 4) {
-            src_left  -= src_right - ts->tiling.col_end * 4;
-            src_right -= src_right - ts->tiling.col_end * 4;
+        } else if (src_right > border_right) {
+            src_left  -= src_right - border_right;
+            src_right -= src_right - border_right;
         }
         // check against top tile boundary and adjust if necessary
         if (src_top < border_top) {
             src_bottom += border_top - src_top;
             src_top    += border_top - src_top;
         }
 
         const int sbx = (t->bx >> (4 + f->seq_hdr->sb128)) << (6 + f->seq_hdr->sb128);
@@ -1900,16 +1901,71 @@ static int decode_b(Dav1dTileContext *co
             if (bw4 == 32) // this should be mask >> 16, but it's 0xffffffff anyway
                 (*noskip_mask)[1] |= mask;
         }
     }
 
     return 0;
 }
 
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+
+#include <sanitizer/msan_interface.h>
+
+static int checked_decode_b(Dav1dTileContext *const t,
+                            const enum BlockLevel bl,
+                            const enum BlockSize bs,
+                            const enum BlockPartition bp,
+                            const enum EdgeFlags intra_edge_flags)
+{
+    const Dav1dFrameContext *const f = t->f;
+    const int err = decode_b(t, bl, bs, bp, intra_edge_flags);
+
+    if (err == 0 && !(f->frame_thread.pass & 1)) {
+        const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+        const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+        const uint8_t *const b_dim = dav1d_block_dimensions[bs];
+        const int bw4 = b_dim[0], bh4 = b_dim[1];
+        const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
+        const int has_chroma = f->seq_hdr->layout != DAV1D_PIXEL_LAYOUT_I400 &&
+                               (bw4 > ss_hor || t->bx & 1) &&
+                               (bh4 > ss_ver || t->by & 1);
+
+        for (int p = 0; p < 1 + 2 * has_chroma; p++) {
+            const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+            const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+            const int stride = f->cur.stride[!!p];
+            const int bx = t->bx & ~ss_hor;
+            const int by = t->by & ~ss_ver;
+            const int width  = w4 << (2 - ss_hor + (bw4 == ss_hor));
+            const int height = h4 << (2 - ss_ver + (bh4 == ss_ver));
+
+            const uint8_t *data = f->cur.data[p] + (by << (2 - ss_ver)) * stride +
+                                  (bx << (2 - ss_hor + !!f->seq_hdr->hbd));
+
+            for (int y = 0; y < height; data += stride, y++) {
+                const size_t line_sz = width << !!f->seq_hdr->hbd;
+                if (__msan_test_shadow(data, line_sz) != -1) {
+                    fprintf(stderr, "B[%d](%d, %d) w4:%d, h4:%d, row:%d\n",
+                            p, bx, by, w4, h4, y);
+                    __msan_check_mem_is_initialized(data, line_sz);
+                }
+            }
+        }
+    }
+
+    return err;
+}
+
+#define decode_b checked_decode_b
+
+#endif /* __has_feature(memory_sanitizer) */
+#endif /* defined(__has_feature) */
+
 static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
                      const EdgeNode *const node)
 {
     const Dav1dFrameContext *const f = t->f;
     const int hsz = 16 >> bl;
     const int have_h_split = f->bw > t->bx + hsz;
     const int have_v_split = f->bh > t->by + hsz;
 
@@ -2941,31 +2997,31 @@ error:
     dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
                                 PLANE_TYPE_ALL);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
         dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
 
-    dav1d_picture_unref(&f->cur);
+    dav1d_picture_unref_internal(&f->cur);
     dav1d_thread_picture_unref(&f->sr_cur);
     dav1d_cdf_thread_unref(&f->in_cdf);
     if (f->frame_hdr->refresh_context) {
         dav1d_cdf_thread_signal(&f->out_cdf);
         dav1d_cdf_thread_unref(&f->out_cdf);
     }
     dav1d_ref_dec(&f->cur_segmap_ref);
     dav1d_ref_dec(&f->prev_segmap_ref);
     dav1d_ref_dec(&f->mvs_ref);
     dav1d_ref_dec(&f->seq_hdr_ref);
     dav1d_ref_dec(&f->frame_hdr_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
-        dav1d_data_unref(&f->tile[i].data);
+        dav1d_data_unref_internal(&f->tile[i].data);
 
     return retval;
 }
 
 static int get_upscale_x0(const int in_w, const int out_w, const int step) {
     const int err = out_w * step - (in_w << 14);
     const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
     return x0 & 0x3fff;
@@ -3119,29 +3175,23 @@ int dav1d_submit_frame(Dav1dContext *con
     memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
     memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
     f->n_tile_data = c->n_tile_data;
     c->n_tile_data = 0;
 
     // allocate frame
     res = dav1d_thread_picture_alloc(&f->sr_cur, f->frame_hdr->width[1],
                                      f->frame_hdr->height,
-                                     f->seq_hdr->layout, bpc,
+                                     f->seq_hdr, f->seq_hdr_ref,
+                                     f->frame_hdr, f->frame_hdr_ref,
+                                     bpc, &f->tile[0].data.m,
                                      c->n_fc > 1 ? &f->frame_thread.td : NULL,
                                      f->frame_hdr->show_frame, &c->allocator);
     if (res < 0) goto error;
 
-    f->sr_cur.p.m = f->tile[0].data.m;
-    f->sr_cur.p.frame_hdr = f->frame_hdr;
-    f->sr_cur.p.frame_hdr_ref = f->frame_hdr_ref;
-    dav1d_ref_inc(f->frame_hdr_ref);
-    f->sr_cur.p.seq_hdr = f->seq_hdr;
-    f->sr_cur.p.seq_hdr_ref = f->seq_hdr_ref;
-    dav1d_ref_inc(f->seq_hdr_ref);
-
     if (f->frame_hdr->super_res.enabled) {
         res = dav1d_picture_alloc_copy(&f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
         if (res < 0) goto error;
     } else {
         dav1d_picture_ref(&f->cur, &f->sr_cur.p);
     }
 
     if (f->frame_hdr->super_res.enabled) {
@@ -3295,17 +3345,17 @@ int dav1d_submit_frame(Dav1dContext *con
             }
             memcpy(c->refs[i].refpoc, f->refpoc, sizeof(f->refpoc));
         }
     }
 
     if (c->n_fc == 1) {
         const unsigned refresh_frame_flags = f->frame_hdr->refresh_frame_flags;
         if ((res = dav1d_decode_frame(f)) < 0) {
-            dav1d_picture_unref(&c->out);
+            dav1d_picture_unref_internal(&c->out);
             for (int i = 0; i < 8; i++) {
                 if (refresh_frame_flags & (1 << i)) {
                     if (c->refs[i].p.p.data[0])
                         dav1d_thread_picture_unref(&c->refs[i].p);
                     dav1d_cdf_thread_unref(&c->cdf[i]);
                     dav1d_ref_dec(&c->refs[i].segmap);
                     dav1d_ref_dec(&c->refs[i].refmvs);
                 }
@@ -3323,27 +3373,27 @@ error:
     if (f->frame_hdr->refresh_context)
         dav1d_cdf_thread_unref(&f->out_cdf);
     for (int i = 0; i < 7; i++) {
         if (f->refp[i].p.data[0])
             dav1d_thread_picture_unref(&f->refp[i]);
         dav1d_ref_dec(&f->ref_mvs_ref[i]);
     }
     if (c->n_fc == 1)
-        dav1d_picture_unref(&c->out);
+        dav1d_picture_unref_internal(&c->out);
     else
         dav1d_thread_picture_unref(out_delayed);
-    dav1d_picture_unref(&f->cur);
+    dav1d_picture_unref_internal(&f->cur);
     dav1d_thread_picture_unref(&f->sr_cur);
     dav1d_ref_dec(&f->mvs_ref);
     dav1d_ref_dec(&f->seq_hdr_ref);
     dav1d_ref_dec(&f->frame_hdr_ref);
 
     for (int i = 0; i < f->n_tile_data; i++)
-        dav1d_data_unref(&f->tile[i].data);
+        dav1d_data_unref_internal(&f->tile[i].data);
     f->n_tile_data = 0;
 
     if (c->n_fc > 1) {
         pthread_cond_signal(&f->frame_thread.td.cond);
         pthread_mutex_unlock(&f->frame_thread.td.lock);
     }
 
     return res;
--- a/third_party/dav1d/src/ext/x86/x86inc.asm
+++ b/third_party/dav1d/src/ext/x86/x86inc.asm
@@ -84,26 +84,23 @@
         SECTION .rdata align=%1
     %elif WIN64
         SECTION .rdata align=%1
     %else
         SECTION .rodata align=%1
     %endif
 %endmacro
 
-%if WIN64
-    %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-    %undef PIC
-%endif
-%ifdef PIC
+%if ARCH_X86_64
+    %define PIC 1 ; always use PIC on x86-64
     default rel
+%elifidn __OUTPUT_FORMAT__,win32
+    %define PIC 0 ; PIC isn't used on 32-bit Windows
+%elifndef PIC
+    %define PIC 0
 %endif
 
 %ifdef __NASM_VER__
     %use smartalign
 %endif
 
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
@@ -215,16 +212,28 @@ DECLARE_REG_SIZE bp, bpl, null
 DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 
 %if ARCH_X86_64
     %define gprsize 8
 %else
     %define gprsize 4
 %endif
 
+%macro LEA 2
+%if ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPUs
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
+
 %macro PUSH 1
     push %1
     %ifidn rstk, rsp
         %assign stack_offset stack_offset+gprsize
     %endif
 %endmacro
 
 %macro POP 1
@@ -668,17 +677,17 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
             %endif
         %endmacro
         %rotate 1
     %endrep
 %endmacro
 
 BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
 
-%macro TAIL_CALL 2 ; callee, is_nonadjacent
+%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
     %if has_epilogue
         call %1
         RET
     %elif %2
         jmp %1
     %endif
     annotate_function_size
 %endmacro
--- a/third_party/dav1d/src/ipred_tmpl.c
+++ b/third_party/dav1d/src/ipred_tmpl.c
@@ -417,17 +417,17 @@ static void ipred_z1_c(pixel *dst, const
                        const int width, const int height, int angle,
                        const int max_width, const int max_height
                        HIGHBD_DECL_SUFFIX)
 {
     const int is_sm = (angle >> 9) & 0x1;
     const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle < 90);
-    int dx = dav1d_dr_intra_derivative[angle];
+    int dx = dav1d_dr_intra_derivative[angle >> 1];
     pixel top_out[(64 + 64) * 2];
     const pixel *top;
     int max_base_x;
     const int upsample_above = enable_intra_edge_filter ?
         get_upsample(width + height, 90 - angle, is_sm) : 0;
     if (upsample_above) {
         upsample_edge(top_out, width + height, &topleft_in[1], -1,
                       width + imin(width, height) HIGHBD_TAIL_SUFFIX);
@@ -471,18 +471,18 @@ static void ipred_z2_c(pixel *dst, const
                        const int width, const int height, int angle,
                        const int max_width, const int max_height
                        HIGHBD_DECL_SUFFIX)
 {
     const int is_sm = (angle >> 9) & 0x1;
     const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle > 90 && angle < 180);
-    int dy = dav1d_dr_intra_derivative[angle - 90];
-    int dx = dav1d_dr_intra_derivative[180 - angle];
+    int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
+    int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
     const int upsample_left = enable_intra_edge_filter ?
         get_upsample(width + height, 180 - angle, is_sm) : 0;
     const int upsample_above = enable_intra_edge_filter ?
         get_upsample(width + height, angle - 90, is_sm) : 0;
     pixel edge[64 * 2 + 64 * 2 + 1];
     pixel *const topleft = &edge[height * 2];
 
     if (upsample_above) {
@@ -552,17 +552,17 @@ static void ipred_z3_c(pixel *dst, const
                        const int width, const int height, int angle,
                        const int max_width, const int max_height
                        HIGHBD_DECL_SUFFIX)
 {
     const int is_sm = (angle >> 9) & 0x1;
     const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     assert(angle > 180);
-    int dy = dav1d_dr_intra_derivative[270 - angle];
+    int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
     pixel left_out[(64 + 64) * 2];
     const pixel *left;
     int max_base_y;
     const int upsample_left = enable_intra_edge_filter ?
         get_upsample(width + height, angle - 180, is_sm) : 0;
     if (upsample_left) {
         upsample_edge(left_out, width + height,
                       &topleft_in[-(width + height)],
--- a/third_party/dav1d/src/itx_tmpl.c
+++ b/third_party/dav1d/src/itx_tmpl.c
@@ -53,34 +53,37 @@ inv_txfm_add_c(pixel *dst, const ptrdiff
     const ptrdiff_t sh = imin(h, 32), sw = imin(w, 32);
     assert((h >= 4 || h <= 64) && (w >= 4 || w <= 64));
     // Maximum value for h and w is 64
     coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
     const int is_rect2 = w * 2 == h || h * 2 == w;
     const int bitdepth = bitdepth_from_max(bitdepth_max);
     const int row_clip_max = (1 << (bitdepth + 8 - 1)) - 1;
     const int col_clip_max = (1 << (imax(bitdepth + 6, 16) - 1)) -1;
-    const int col_clip_min = -col_clip_max - 1;
 
     if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
     const int rnd1 = (1 << shift1) >> 1;
     for (i = 0; i < sh; i++) {
         if (w != sw || is_rect2) {
             for (j = 0; j < sw; j++) {
                 in_mem[j] = coeff[i + j * sh];
                 if (is_rect2)
                     in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
             }
             first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
         } else {
             first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
         }
         for (j = 0; j < w; j++)
+#if BITDEPTH == 8
+            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
+#else
             tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
-                                   col_clip_min, col_clip_max);
+                                   -col_clip_max - 1, col_clip_max);
+#endif
     }
 
     if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
     const int rnd2 = (1 << shift2) >> 1;
     for (i = 0; i < w; i++) {
         second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
         for (j = 0; j < h; j++)
             dst[i + j * PXSTRIDE(stride)] =
--- a/third_party/dav1d/src/lib.c
+++ b/third_party/dav1d/src/lib.c
@@ -189,21 +189,21 @@ int dav1d_parse_sequence_header(Dav1dSeq
     int res;
 
     validate_input_or_ret(out != NULL, -EINVAL);
 
     Dav1dSettings s;
     dav1d_default_settings(&s);
 
     Dav1dContext *c;
-    res	= dav1d_open(&c, &s);
+    res = dav1d_open(&c, &s);
     if (res < 0) return res;
 
     if (ptr) {
-        res = dav1d_data_wrap(&buf, ptr, sz, dummy_free, NULL);
+        res = dav1d_data_wrap_internal(&buf, ptr, sz, dummy_free, NULL);
         if (res < 0) goto error;
     }
 
     while (buf.sz > 0) {
         res = dav1d_parse_obus(c, &buf, 1);
         if (res < 0) goto error;
 
         assert((size_t)res <= buf.sz);
@@ -215,17 +215,17 @@ int dav1d_parse_sequence_header(Dav1dSeq
         res = -EINVAL;
         goto error;
     }
 
     memcpy(out, c->seq_hdr, sizeof(*out));
 
     res = 0;
 error:
-    dav1d_data_unref(&buf);
+    dav1d_data_unref_internal(&buf);
     dav1d_close(&c);
 
     return res;
 }
 
 int dav1d_send_data(Dav1dContext *const c, Dav1dData *const in)
 {
     validate_input_or_ret(c != NULL, -EINVAL);
@@ -252,18 +252,18 @@ static int output_image(Dav1dContext *co
     if (!c->apply_grain || !has_grain) {
         dav1d_picture_move_ref(out, in);
         return 0;
     }
 
     // Apply film grain to a new copy of the image to avoid corrupting refs
     int res = dav1d_picture_alloc_copy(out, in->p.w, in);
     if (res < 0) {
-        dav1d_picture_unref(in);
-        dav1d_picture_unref(out);
+        dav1d_picture_unref_internal(in);
+        dav1d_picture_unref_internal(out);
         return res;
     }
 
     switch (out->p.bpc) {
 #if CONFIG_8BPC
     case 8:
         dav1d_apply_grain_8bpc(out, in);
         break;
@@ -273,29 +273,29 @@ static int output_image(Dav1dContext *co
     case 12:
         dav1d_apply_grain_16bpc(out, in);
         break;
 #endif
     default:
         assert(0);
     }
 
-    dav1d_picture_unref(in);
+    dav1d_picture_unref_internal(in);
     return 0;
 }
 
 static int output_picture_ready(Dav1dContext *const c) {
 
     if (!c->out.data[0]) return 0;
 
     // skip lower spatial layers
     if (c->operating_point_idc && !c->all_layers) {
         const int max_spatial_id = ulog2(c->operating_point_idc >> 8);
         if (max_spatial_id > c->out.frame_hdr->spatial_id) {
-            dav1d_picture_unref(&c->out);
+            dav1d_picture_unref_internal(&c->out);
             return 0;
         }
     }
 
     return 1;
 }
 
 static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
@@ -341,22 +341,22 @@ int dav1d_get_picture(Dav1dContext *cons
     if (!in->data) {
         if (c->n_fc == 1) return -EAGAIN;
         return drain_picture(c, out);
     }
 
     while (in->sz > 0) {
         res = dav1d_parse_obus(c, in, 0);
         if (res < 0) {
-            dav1d_data_unref(in);
+            dav1d_data_unref_internal(in);
         } else {
             assert((size_t)res <= in->sz);
             in->sz -= res;
             in->data += res;
-            if (!in->sz) dav1d_data_unref(in);
+            if (!in->sz) dav1d_data_unref_internal(in);
         }
         if (output_picture_ready(c))
             break;
         if (res < 0)
             return res;
     }
 
     if (output_picture_ready(c))
@@ -364,17 +364,17 @@ int dav1d_get_picture(Dav1dContext *cons
 
     if (c->n_fc > 1 && drain)
         return drain_picture(c, out);
 
     return -EAGAIN;
 }
 
 void dav1d_flush(Dav1dContext *const c) {
-    dav1d_data_unref(&c->in);
+    dav1d_data_unref_internal(&c->in);
     c->drain = 0;
 
     if (c->n_fc == 1) return;
 
     // mark each currently-running frame as flushing, so that we
     // exit out as quickly as the running thread checks this flag
     atomic_store(c->frame_thread.flush, 1);
     for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
@@ -477,29 +477,62 @@ void dav1d_close(Dav1dContext **const c_
         free(f->lf.lr_mask);
         free(f->lf.level);
         free(f->lf.tx_lpf_right_edge[0]);
         av1_free_ref_mv_common(f->libaom_cm);
         dav1d_free_aligned(f->lf.cdef_line);
         dav1d_free_aligned(f->lf.lr_lpf_line);
     }
     dav1d_free_aligned(c->fc);
-    dav1d_data_unref(&c->in);
+    dav1d_data_unref_internal(&c->in);
     if (c->n_fc > 1) {
         for (unsigned n = 0; n < c->n_fc; n++)
             if (c->frame_thread.out_delayed[n].p.data[0])
                 dav1d_thread_picture_unref(&c->frame_thread.out_delayed[n]);
         free(c->frame_thread.out_delayed);
     }
     for (int n = 0; n < c->n_tile_data; n++)
-        dav1d_data_unref(&c->tile[n].data);
+        dav1d_data_unref_internal(&c->tile[n].data);
     for (int n = 0; n < 8; n++) {
         dav1d_cdf_thread_unref(&c->cdf[n]);
         if (c->refs[n].p.p.data[0])
             dav1d_thread_picture_unref(&c->refs[n].p);
         dav1d_ref_dec(&c->refs[n].refmvs);
         dav1d_ref_dec(&c->refs[n].segmap);
     }
     dav1d_ref_dec(&c->seq_hdr_ref);
     dav1d_ref_dec(&c->frame_hdr_ref);
 
     dav1d_freep_aligned(c_out);
 }
+
+void dav1d_picture_unref(Dav1dPicture *const p) {
+    dav1d_picture_unref_internal(p);
+}
+
+uint8_t *dav1d_data_create(Dav1dData *const buf, const size_t sz) {
+    return dav1d_data_create_internal(buf, sz);
+}
+
+int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr,
+                    const size_t sz,
+                    void (*const free_callback)(const uint8_t *data,
+                                                void *user_data),
+                    void *const user_data)
+{
+    return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
+}
+
+int dav1d_data_wrap_user_data(Dav1dData *const buf,
+                              const uint8_t *const user_data,
+                              void (*const free_callback)(const uint8_t *user_data,
+                                                          void *cookie),
+                              void *const cookie)
+{
+    return dav1d_data_wrap_user_data_internal(buf,
+                                              user_data,
+                                              free_callback,
+                                              cookie);
+}
+
+void dav1d_data_unref(Dav1dData *const buf) {
+    dav1d_data_unref_internal(buf);
+}
--- a/third_party/dav1d/src/lr_apply_tmpl.c
+++ b/third_party/dav1d/src/lr_apply_tmpl.c
@@ -42,17 +42,18 @@ enum LrRestorePlanes {
 // The loop filter buffer stores 12 rows of pixels. A superblock block will
 // contain at most 2 stripes. Each stripe requires 4 rows pixels (2 above
 // and 2 below) the final 4 rows are used to swap the bottom of the last
 // stripe with the top of the next super block row.
 static void backup_lpf(const Dav1dFrameContext *const f,
                        pixel *dst, const ptrdiff_t dst_stride,
                        const pixel *src, const ptrdiff_t src_stride,
                        const int ss_ver, const int sb128,
-                       int row, const int row_h, const int src_w, const int ss_hor)
+                       int row, const int row_h, const int src_w,
+                       const int h, const int ss_hor)
 {
     const int dst_w = f->frame_hdr->super_res.enabled ?
                       (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
 
     // The first stripe of the frame is shorter by 8 luma pixel rows.
     int stripe_h = (64 - 8 * !row) >> ss_ver;
 
     if (row) {
@@ -69,28 +70,35 @@ static void backup_lpf(const Dav1dFrameC
                    &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
     }
 
     dst += 4 * PXSTRIDE(dst_stride);
     src += (stripe_h - 2) * PXSTRIDE(src_stride);
 
     if (f->frame_hdr->super_res.enabled) {
         while (row + stripe_h <= row_h) {
+            const int n_lines = 4 - (row + stripe_h + 1 == h);
             f->dsp->mc.resize(dst, dst_stride, src, src_stride,
-                              dst_w, src_w, 4, f->resize_step[ss_hor],
+                              dst_w, src_w, n_lines, f->resize_step[ss_hor],
                               f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
             row += stripe_h; // unmodified stripe_h for the 1st stripe
             stripe_h = 64 >> ss_ver;
             src += stripe_h * PXSTRIDE(src_stride);
-            dst += 4 * PXSTRIDE(dst_stride);
+            dst += n_lines * PXSTRIDE(dst_stride);
+            if (n_lines == 3) {
+                pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], dst_w);
+                dst += PXSTRIDE(dst_stride);
+            }
         }
     } else {
         while (row + stripe_h <= row_h) {
+            const int n_lines = 4 - (row + stripe_h + 1 == h);
             for (int i = 0; i < 4; i++) {
-                pixel_copy(dst, src, src_w);
+                pixel_copy(dst, i == n_lines ? &dst[-PXSTRIDE(dst_stride)] :
+                                               src, src_w);
                 dst += PXSTRIDE(dst_stride);
                 src += PXSTRIDE(src_stride);
             }
             row += stripe_h; // unmodified stripe_h for the 1st stripe
             stripe_h = 64 >> ss_ver;
             src += (stripe_h - 4) * PXSTRIDE(src_stride);
         }
     }
@@ -105,43 +113,43 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFram
 
     // TODO Also check block level restore type to reduce copying.
     const int restore_planes =
         ((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
         ((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
         ((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
 
     if (restore_planes & LR_RESTORE_Y) {
-        const int h = f->bh << 2;
+        const int h = f->cur.p.h;
         const int w = f->bw << 2;
-        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 4);
+        const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
         const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
         backup_lpf(f, f->lf.lr_lpf_line_ptr[0], lr_stride,
                    src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
-                   0, f->seq_hdr->sb128, y_stripe, row_h, w, 0);
+                   0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
     }
     if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
         const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
         const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-        const int h = f->bh << (2 - ss_ver);
+        const int h = (f->cur.p.h + ss_ver) >> ss_ver;
         const int w = f->bw << (2 - ss_hor);
-        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 4);
+        const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
         const ptrdiff_t offset_uv = offset >> ss_ver;
         const int y_stripe =
             (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
 
         if (restore_planes & LR_RESTORE_U) {
             backup_lpf(f, f->lf.lr_lpf_line_ptr[1], lr_stride,
                        src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
+                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
         }
         if (restore_planes & LR_RESTORE_V) {
             backup_lpf(f, f->lf.lr_lpf_line_ptr[2], lr_stride,
                        src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
-                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, ss_hor);
+                       ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
         }
     }
 }
 
 static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
                       const pixel (*left)[4], int x, int y,
                       const int plane, const int unit_w, const int row_h,
                       const Av1RestorationUnit *const lr, enum LrEdgeFlags edges)
--- a/third_party/dav1d/src/obu.c
+++ b/third_party/dav1d/src/obu.c
@@ -216,25 +216,25 @@ static int parse_seq_hdr(Dav1dContext *c
     } else {
         hdr->pri = DAV1D_COLOR_PRI_UNKNOWN;
         hdr->trc = DAV1D_TRC_UNKNOWN;
         hdr->mtrx = DAV1D_MC_UNKNOWN;
     }
     if (hdr->monochrome) {
         hdr->color_range = dav1d_get_bits(gb, 1);
         hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
-        hdr->ss_hor = hdr->ss_ver = 0;
+        hdr->ss_hor = hdr->ss_ver = 1;
         hdr->chr = DAV1D_CHR_UNKNOWN;
         hdr->separate_uv_delta_q = 0;
     } else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
                hdr->trc == DAV1D_TRC_SRGB &&
                hdr->mtrx == DAV1D_MC_IDENTITY)
     {
         hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
-        hdr->ss_hor = hdr->ss_ver = 1;
+        hdr->ss_hor = hdr->ss_ver = 0;
         hdr->color_range = 1;
         if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
             goto error;
     } else {
         hdr->color_range = dav1d_get_bits(gb, 1);
         switch (hdr->profile) {
         case 0: hdr->layout = DAV1D_PIXEL_LAYOUT_I420;
                 hdr->ss_hor = hdr->ss_ver = 1;
@@ -253,18 +253,18 @@ static int parse_seq_hdr(Dav1dContext *c
             hdr->layout = hdr->ss_hor ?
                           hdr->ss_ver ? DAV1D_PIXEL_LAYOUT_I420 :
                                         DAV1D_PIXEL_LAYOUT_I422 :
                                         DAV1D_PIXEL_LAYOUT_I444;
             break;
         }
         hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
                    dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
-        hdr->separate_uv_delta_q = dav1d_get_bits(gb, 1);
     }
+    hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-colorinfo: off=%ld\n",
            dav1d_get_bits_pos(gb) - init_bit_pos);
 #endif
 
     hdr->film_grain_present = dav1d_get_bits(gb, 1);
 #if DEBUG_SEQ_HDR
     printf("SEQHDR: post-filmgrain: off=%ld\n",
@@ -1278,17 +1278,17 @@ int dav1d_parse_obus(Dav1dContext *const
         memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
         c->frame_hdr->temporal_id = temporal_id;
         c->frame_hdr->spatial_id = spatial_id;
         if ((res = parse_frame_hdr(c, &gb)) < 0) {
             c->frame_hdr = NULL;
             return res;
         }
         for (int n = 0; n < c->n_tile_data; n++)
-            dav1d_data_unref(&c->tile[n].data);
+            dav1d_data_unref_internal(&c->tile[n].data);
         c->n_tile_data = 0;
         c->n_tiles = 0;
         if (type != OBU_FRAME) {
             // This is actually a frame header OBU so read the
             // trailing bit and check for overrun.
             dav1d_get_bits(&gb, 1);
             if (check_for_overrun(&gb, init_bit_pos, len)) {
                 c->frame_hdr = NULL;
@@ -1318,27 +1318,25 @@ int dav1d_parse_obus(Dav1dContext *const
         if (check_for_overrun(&gb, init_bit_pos, len))
             return -EINVAL;
         // The current bit position is a multiple of 8 (because we
         // just aligned it) and less than 8*pkt_bytelen because
         // otherwise the overrun check would have fired.
         const unsigned bit_pos = dav1d_get_bits_pos(&gb);
         assert((bit_pos & 7) == 0);
         assert(pkt_bytelen >= (bit_pos >> 3));
-        dav1d_ref_inc(in->ref);
-        c->tile[c->n_tile_data].data.ref = in->ref;
-        c->tile[c->n_tile_data].data.m = in->m;
-        c->tile[c->n_tile_data].data.data = in->data + (bit_pos >> 3);
+        dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
+        c->tile[c->n_tile_data].data.data += bit_pos >> 3;
         c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
         // ensure tile groups are in order and sane, see 6.10.1
         if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
             c->tile[c->n_tile_data].start != c->n_tiles)
         {
             for (int i = 0; i <= c->n_tile_data; i++)
-                dav1d_data_unref(&c->tile[i].data);
+                dav1d_data_unref_internal(&c->tile[i].data);
             c->n_tile_data = 0;
             c->n_tiles = 0;
             goto error;
         }
         c->n_tiles += 1 + c->tile[c->n_tile_data].end -
                           c->tile[c->n_tile_data].start;
         c->n_tile_data++;
         break;
@@ -1354,17 +1352,17 @@ int dav1d_parse_obus(Dav1dContext *const
     }
 
     if (c->seq_hdr && c->frame_hdr) {
         if (c->frame_hdr->show_existing_frame) {
             if (!c->refs[c->frame_hdr->existing_frame_idx].p.p.data[0]) return -EINVAL;
             if (c->n_fc == 1) {
                 dav1d_picture_ref(&c->out,
                                   &c->refs[c->frame_hdr->existing_frame_idx].p.p);
-                c->out.m = in->m;
+                dav1d_data_props_copy(&c->out.m, &in->m);
             } else {
                 // need to append this to the frame output queue
                 const unsigned next = c->frame_thread.next++;
                 if (c->frame_thread.next == c->n_fc)
                     c->frame_thread.next = 0;
 
                 Dav1dFrameContext *const f = &c->fc[next];
                 pthread_mutex_lock(&f->frame_thread.td.lock);
@@ -1378,17 +1376,17 @@ int dav1d_parse_obus(Dav1dContext *const
                                                                    memory_order_relaxed);
                     if (out_delayed->visible && progress != FRAME_ERROR)
                         dav1d_picture_ref(&c->out, &out_delayed->p);
                     dav1d_thread_picture_unref(out_delayed);
                 }
                 dav1d_thread_picture_ref(out_delayed,
                                          &c->refs[c->frame_hdr->existing_frame_idx].p);
                 out_delayed->visible = 1;
-                out_delayed->p.m = in->m;
+                dav1d_data_props_copy(&out_delayed->p.m, &in->m);
                 pthread_mutex_unlock(&f->frame_thread.td.lock);
             }
             if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
                 const int r = c->frame_hdr->existing_frame_idx;
                 for (int i = 0; i < 8; i++) {
                     if (i == r) continue;
 
                     if (c->refs[i].p.p.data[0])
--- a/third_party/dav1d/src/picture.c
+++ b/third_party/dav1d/src/picture.c
@@ -94,18 +94,19 @@ static void free_buffer(const uint8_t *c
 
     pic_ctx->allocator.release_picture_callback(&pic_ctx->pic,
                                                 pic_ctx->allocator.cookie);
     free(pic_ctx);
 }
 
 static int picture_alloc_with_edges(Dav1dPicture *const p,
                                     const int w, const int h,
-                                    const enum Dav1dPixelLayout layout,
-                                    const int bpc,
+                                    Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
+                                    Dav1dFrameHeader *frame_hdr,  Dav1dRef *frame_hdr_ref,
+                                    const int bpc, const Dav1dDataProps *props,
                                     Dav1dPicAllocator *const p_allocator,
                                     const size_t extra, void **const extra_ptr)
 {
     if (p->data[0]) {
         fprintf(stderr, "Picture already allocated!\n");
         return -1;
     }
     assert(bpc > 0 && bpc <= 16);
@@ -115,17 +116,21 @@ static int picture_alloc_with_edges(Dav1
         return -ENOMEM;
     }
 
     p->p.w = w;
     p->p.h = h;
     p->m.timestamp = INT64_MIN;
     p->m.duration = 0;
     p->m.offset = -1;
-    p->p.layout = layout;
+    p->m.user_data.data = NULL;
+    p->m.user_data.ref = NULL;
+    p->seq_hdr = seq_hdr;
+    p->frame_hdr = frame_hdr;
+    p->p.layout = seq_hdr->layout;
     p->p.bpc = bpc;
     int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
     if (res < 0) {
         free(pic_ctx);
         return -ENOMEM;
     }
 
     pic_ctx->allocator = *p_allocator;
@@ -133,77 +138,80 @@ static int picture_alloc_with_edges(Dav1
 
     if (!(p->ref = dav1d_ref_wrap(p->data[0], free_buffer, pic_ctx))) {
         p_allocator->release_picture_callback(p, p_allocator->cookie);
         free(pic_ctx);
         fprintf(stderr, "Failed to wrap picture: %s\n", strerror(errno));
         return -ENOMEM;
     }
 
+    p->seq_hdr_ref = seq_hdr_ref;
+    if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
+
+    p->frame_hdr_ref = frame_hdr_ref;
+    if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
+
+    dav1d_data_props_copy(&p->m, props);
+
     if (extra && extra_ptr)
         *extra_ptr = &pic_ctx->extra_ptr;
 
     return 0;
 }
 
 int dav1d_thread_picture_alloc(Dav1dThreadPicture *const p,
                                const int w, const int h,
-                               const enum Dav1dPixelLayout layout, const int bpc,
+                               Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
+                               Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
+                               const int bpc, const Dav1dDataProps *props,
                                struct thread_data *const t, const int visible,
                                Dav1dPicAllocator *const p_allocator)
 {
     p->t = t;
 
     const int res =
-        picture_alloc_with_edges(&p->p, w, h, layout, bpc, p_allocator,
+        picture_alloc_with_edges(&p->p, w, h,
+                                 seq_hdr, seq_hdr_ref,
+                                 frame_hdr, frame_hdr_ref,
+                                 bpc, props, p_allocator,
                                  t != NULL ? sizeof(atomic_int) * 2 : 0,
                                  (void **) &p->progress);
     if (res) return res;
 
     p->visible = visible;
     if (t) {
         atomic_init(&p->progress[0], 0);
         atomic_init(&p->progress[1], 0);
     }
     return res;
 }
 
 int dav1d_picture_alloc_copy(Dav1dPicture *const dst, const int w,
                              const Dav1dPicture *const src)
 {
     struct pic_ctx_context *const pic_ctx = src->ref->user_data;
-    const int res = picture_alloc_with_edges(dst, w, src->p.h, src->p.layout,
-                                             src->p.bpc, &pic_ctx->allocator,
+    const int res = picture_alloc_with_edges(dst, w, src->p.h,
+                                             src->seq_hdr, src->seq_hdr_ref,
+                                             src->frame_hdr, src->frame_hdr_ref,
+                                             src->p.bpc, &src->m, &pic_ctx->allocator,
                                              0, NULL);
-
-    if (!res) {
-        dst->p = src->p;
-        dst->m = src->m;
-        dst->p.w = w;
-        dst->frame_hdr = src->frame_hdr;
-        dst->frame_hdr_ref = src->frame_hdr_ref;
-        if (dst->frame_hdr_ref) dav1d_ref_inc(dst->frame_hdr_ref);
-        dst->seq_hdr = src->seq_hdr;
-        dst->seq_hdr_ref = src->seq_hdr_ref;
-        if (dst->seq_hdr_ref) dav1d_ref_inc(dst->seq_hdr_ref);
-    }
-
     return res;
 }
 
 void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data[0] == NULL);
     validate_input(src != NULL);
 
     if (src->ref) {
         validate_input(src->data[0] != NULL);
         dav1d_ref_inc(src->ref);
         if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
         if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
+        if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
     }
     *dst = *src;
 }
 
 void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
     validate_input(dst != NULL);
     validate_input(dst->data[0] == NULL);
     validate_input(src != NULL);
@@ -219,30 +227,31 @@ void dav1d_thread_picture_ref(Dav1dThrea
                               const Dav1dThreadPicture *src)
 {
     dav1d_picture_ref(&dst->p, &src->p);
     dst->t = src->t;
     dst->visible = src->visible;
     dst->progress = src->progress;
 }
 
-void dav1d_picture_unref(Dav1dPicture *const p) {
+void dav1d_picture_unref_internal(Dav1dPicture *const p) {
     validate_input(p != NULL);
 
     if (p->ref) {
         validate_input(p->data[0] != NULL);
         dav1d_ref_dec(&p->ref);
         dav1d_ref_dec(&p->seq_hdr_ref);
         dav1d_ref_dec(&p->frame_hdr_ref);
+        dav1d_ref_dec(&p->m.user_data.ref);
     }
     memset(p, 0, sizeof(*p));
 }
 
 void dav1d_thread_picture_unref(Dav1dThreadPicture *const p) {
-    dav1d_picture_unref(&p->p);
+    dav1d_picture_unref_internal(&p->p);
 
     p->t = NULL;
     p->progress = NULL;
 }
 
 int dav1d_thread_picture_wait(const Dav1dThreadPicture *const p,
                               int y_unclipped, const enum PlaneType plane_type)
 {
--- a/third_party/dav1d/src/picture.h
+++ b/third_party/dav1d/src/picture.h
@@ -29,16 +29,17 @@
 #define __DAV1D_SRC_PICTURE_H__
 
 #include <stdatomic.h>
 
 #include "src/thread.h"
 #include "dav1d/picture.h"
 
 #include "src/thread_data.h"
+#include "src/ref.h"
 
 enum PlaneType {
     PLANE_TYPE_Y,
     PLANE_TYPE_UV,
     PLANE_TYPE_BLOCK,
     PLANE_TYPE_ALL,
 };
 
@@ -50,17 +51,19 @@ typedef struct Dav1dThreadPicture {
     // [1] pixel data
     atomic_uint *progress;
 } Dav1dThreadPicture;
 
 /*
  * Allocate a picture with custom border size.
  */
 int dav1d_thread_picture_alloc(Dav1dThreadPicture *p, int w, int h,
-                               enum Dav1dPixelLayout layout, int bpc,
+                               Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
+                               Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
+                               int bpc, const Dav1dDataProps *props,
                                struct thread_data *t, int visible,
                                Dav1dPicAllocator *);
 
 /**
  * Allocate a picture with identical metadata to an existing picture.
  * The width is a separate argument so this function can be used for
  * super-res, where the width changes, but everything else is the same.
  * For the more typical use case of allocating a new image of the same
@@ -104,10 +107,11 @@ int dav1d_thread_picture_wait(const Dav1
  * PLANE_TYPE_BLOCK), pixel data (pass 2, PLANE_TYPE_Y) or both (no
  * 2-pass decoding; PLANE_TYPE_ALL).
  */
 void dav1d_thread_picture_signal(const Dav1dThreadPicture *p, int y,
                                  enum PlaneType plane_type);
 
 int default_picture_allocator(Dav1dPicture *, void *cookie);
 void default_picture_release(Dav1dPicture *, void *cookie);
+void dav1d_picture_unref_internal(Dav1dPicture *p);
 
 #endif /* __DAV1D_SRC_PICTURE_H__ */
--- a/third_party/dav1d/src/recon_tmpl.c
+++ b/third_party/dav1d/src/recon_tmpl.c
@@ -200,18 +200,19 @@ static int decode_coefs(Dav1dTileContext
             } while (tok < 15);
         }
 
         levels[x * stride + y] = cf[rc] = tok;
     }
 
     // residual and sign
     int dc_sign = 1;
+    const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
     const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
-    const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
+    const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
     const int dq_shift = imax(0, t_dim->ctx - 2);
     const int bitdepth = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
     const int cf_min = -(1 << (7 + bitdepth));
     const int cf_max = (1 << (7 + bitdepth)) - 1;
     for (int i = 0; i <= eob; i++) {
         const int rc = scan[i];
         int tok = cf[rc];
         if (!tok) continue;
--- a/third_party/dav1d/src/ref.c
+++ b/third_party/dav1d/src/ref.c
@@ -40,31 +40,32 @@ Dav1dRef *dav1d_ref_create(const size_t 
     Dav1dRef *res;
     void *data = dav1d_alloc_aligned(size, 32);
     if (!data) {
         return NULL;
     }
 
     res = dav1d_ref_wrap(data, default_free_callback, data);
     if (!res) {
-        free(data);
+        dav1d_free_aligned(data);
+    } else {
+        res->data = data;
     }
 
     return res;
 }
 
 Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
                          void (*free_callback)(const uint8_t *data, void *user_data),
                          void *user_data)
 {
     Dav1dRef *res = malloc(sizeof(Dav1dRef));
     if (!res) return NULL;
 
-    if (ptr == user_data)
-        res->data = user_data;
+    res->data = NULL;
     res->const_data = ptr;
     atomic_init(&res->ref_cnt, 1);
     res->free_callback = free_callback;
     res->user_data = user_data;
 
     return res;
 }
 
@@ -81,10 +82,10 @@ void dav1d_ref_dec(Dav1dRef **const pref
     if (atomic_fetch_sub(&ref->ref_cnt, 1) == 1) {
         ref->free_callback(ref->const_data, ref->user_data);
         free(ref);
     }
     *pref = NULL;
 }
 
 int dav1d_ref_is_writable(Dav1dRef *const ref) {
-    return atomic_load(&ref->ref_cnt) == 1;
+    return atomic_load(&ref->ref_cnt) == 1 && ref->data;
 }
--- a/third_party/dav1d/src/tables.c
+++ b/third_party/dav1d/src/tables.c
@@ -770,47 +770,46 @@ const uint8_t dav1d_sm_weights[128] = {
     144, 138, 133, 127, 121, 116, 111, 106,
     101,  96,  91,  86,  82,  77,  73,  69,
      65,  61,  57,  54,  50,  47,  44,  41,
      38,  35,  32,  29,  27,  25,  22,  20,
      18,  16,  15,  13,  12,  10,   9,   8,
       7,   6,   6,   5,   5,   4,   4,   4
 };
 
-const int16_t dav1d_dr_intra_derivative[90] = {
-    // More evenly spread out angles and limited to 10-bit
+const uint16_t dav1d_dr_intra_derivative[44] = {
     // Values that are 0 will never be used
-       0, 0, 0,       // Approx angle
-    1023, 0, 0,       // 3, ...
-     547, 0, 0,       // 6, ...
-     372, 0, 0, 0, 0, // 9, ...
-     273, 0, 0,       // 14, ...
-     215, 0, 0,       // 17, ...
-     178, 0, 0,       // 20, ...
-     151, 0, 0,       // 23, ... (113 & 203 are base angles)
-     132, 0, 0,       // 26, ...
-     116, 0, 0,       // 29, ...
-     102, 0, 0, 0,    // 32, ...
-      90, 0, 0,       // 36, ...
-      80, 0, 0,       // 39, ...
-      71, 0, 0,       // 42, ...
-      64, 0, 0,       // 45, ... (45 & 135 are base angles)
-      57, 0, 0,       // 48, ...
-      51, 0, 0,       // 51, ...
-      45, 0, 0, 0,    // 54, ...
-      40, 0, 0,       // 58, ...
-      35, 0, 0,       // 61, ...
-      31, 0, 0,       // 64, ...
-      27, 0, 0,       // 67, ... (67 & 157 are base angles)
-      23, 0, 0,       // 70, ...
-      19, 0, 0,       // 73, ...
-      15, 0, 0, 0, 0, // 76, ...
-      11, 0, 0,       // 81, ...
-       7, 0, 0,       // 84, ...
-       3, 0, 0,       // 87, ...
+          0,    // Angles:
+    1023, 0,    //  3,  93, 183
+     547,       //  6,  96, 186
+     372, 0, 0, //  9,  99, 189
+     273,       // 14, 104, 194
+     215, 0,    // 17, 107, 197
+     178,       // 20, 110, 200
+     151, 0,    // 23, 113, 203 (113 & 203 are base angles)
+     132,       // 26, 116, 206
+     116, 0,    // 29, 119, 209
+     102, 0,    // 32, 122, 212
+      90,       // 36, 126, 216
+      80, 0,    // 39, 129, 219
+      71,       // 42, 132, 222
+      64, 0,    // 45, 135, 225 (45 & 135 are base angles)
+      57,       // 48, 138, 228
+      51, 0,    // 51, 141, 231
+      45, 0,    // 54, 144, 234
+      40,       // 58, 148, 238
+      35, 0,    // 61, 151, 241
+      31,       // 64, 154, 244
+      27, 0,    // 67, 157, 247 (67 & 157 are base angles)
+      23,       // 70, 160, 250
+      19, 0,    // 73, 163, 253
+      15, 0,    // 76, 166, 256
+      11, 0,    // 81, 171, 261
+       7,       // 84, 174, 264
+       3        // 87, 177, 267
 };
 
 const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
     {
          -6,  10,  -5,   2,  -3,   1,  -3,   1,
          -4,   6,  -3,   2,  -3,   2,  -3,   1,
           0,   0,  10,   0,   1,  10,   1,   2,
           0,   0,   6,   0,   2,   6,   2,   2,
--- a/third_party/dav1d/src/tables.h
+++ b/third_party/dav1d/src/tables.h
@@ -109,16 +109,16 @@ extern const Dav1dWarpedMotionParams dav
 extern const int16_t dav1d_sgr_params[16][4];
 extern const int dav1d_sgr_x_by_xplus1[256];
 
 extern const int8_t dav1d_mc_subpel_filters[5][15][8];
 extern const int8_t dav1d_mc_warp_filter[193][8];
 extern const int16_t dav1d_resize_filter[64][8];
 
 extern const uint8_t dav1d_sm_weights[128];
-extern const int16_t dav1d_dr_intra_derivative[90];
+extern const uint16_t dav1d_dr_intra_derivative[44];
 extern const int8_t dav1d_filter_intra_taps[5][64];
 
 extern const uint8_t dav1d_obmc_masks[64];
 
 extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs
 
 #endif /* __DAV1D_SRC_TABLES_H__ */
--- a/third_party/dav1d/src/x86/cpu.c
+++ b/third_party/dav1d/src/x86/cpu.c
@@ -20,16 +20,18 @@
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "config.h"
+
 #include <stdint.h>
 
 #include "src/x86/cpu.h"
 
 void dav1d_cpu_cpuid(uint32_t *info, int leaf);
 uint64_t dav1d_cpu_xgetbv(int xcr);
 
 unsigned dav1d_get_cpu_flags_x86(void) {
@@ -42,26 +44,29 @@ unsigned dav1d_get_cpu_flags_x86(void) {
     if (n_ids >= 1) {
         dav1d_cpu_cpuid(info, 1);
         if (info[3] & (1 << 25)) flags |= DAV1D_X86_CPU_FLAG_SSE;
         if (info[3] & (1 << 26)) flags |= DAV1D_X86_CPU_FLAG_SSE2;
         if (info[2] & (1 <<  0)) flags |= DAV1D_X86_CPU_FLAG_SSE3;
         if (info[2] & (1 <<  9)) flags |= DAV1D_X86_CPU_FLAG_SSSE3;
         if (info[2] & (1 << 19)) flags |= DAV1D_X86_CPU_FLAG_SSE41;
         if (info[2] & (1 << 20)) flags |= DAV1D_X86_CPU_FLAG_SSE42;
+#if ARCH_X86_64
+        /* We only support >128-bit SIMD on x86-64. */
         if (info[2] & (1 << 27)) /* OSXSAVE */ {
             uint64_t xcr = dav1d_cpu_xgetbv(0);
             if ((xcr & 0x00000006) == 0x00000006) /* XMM/YMM */ {
                 if (info[2] & (1 << 28)) flags |= DAV1D_X86_CPU_FLAG_AVX;
                 if (n_ids >= 7) {
                     dav1d_cpu_cpuid(info, 7);
                     if (info[1] & (1 <<  5)) flags |= DAV1D_X86_CPU_FLAG_AVX2;
                     if ((xcr & 0x000000e0) == 0x000000e0) /* ZMM/OPMASK */ {
                         if ((info[1] & 0xd0030000) == 0xd0030000)
                             flags |= DAV1D_X86_CPU_FLAG_AVX512;
                     }
                 }
             }
         }
+#endif
     }
 
     return flags;
 }
--- a/third_party/dav1d/src/x86/ipred.asm
+++ b/third_party/dav1d/src/x86/ipred.asm
@@ -23,17 +23,17 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 %if ARCH_X86_64
 
-SECTION_RODATA 32
+SECTION_RODATA 64
 
 %macro SMOOTH_WEIGHT_TABLE 1-*
     %rep %0
         db %1-128, 127-%1
         %rotate 1
     %endrep
 %endmacro
 
@@ -52,73 +52,82 @@ smooth_weights: SMOOTH_WEIGHT_TABLE     
     196, 189, 182, 176, 169, 163, 156, 150, \
     144, 138, 133, 127, 121, 116, 111, 106, \
     101,  96,  91,  86,  82,  77,  73,  69, \
      65,  61,  57,  54,  50,  47,  44,  41, \
      38,  35,  32,  29,  27,  25,  22,  20, \
      18,  16,  15,  13,  12,  10,   9,   8, \
       7,   6,   6,   5,   5,   4,   4,   4
 
-; Note that the order of (some of) the following z constants matter
 z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
 z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
               db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
               db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
 z_filter_s:   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
               db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
+pb_12:        times 4 db 12 ; those are just placed here for alignment.
+pb_14:        times 4 db 14
+z3_shuf:      db  8,  7,  7,  6,  6,  5,  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
 z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
 z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
-z_upsample:   db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
-z_shuf_w4:    db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z_upsample1:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+z_upsample2:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  8
+z_upsample3:  db  0,  0,  0,  0,  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5
+z1_shuf_w4:   db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z3_shuf_w4:   db  4,  3,  3,  2,  2,  1,  1,  0, 12, 11, 11, 10, 10,  9,  9,  8
+z_transpose4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
 z_base_inc:   dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
               dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
 
 ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
 filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
               db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
 filter_shuf2: db  3,  4,  3,  4,  5,  6,  5,  6,  7,  2,  7,  2,  1, -1,  1, -1
-filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11, 15, -1, 15, -1
+filter_shuf3: db  3,  4,  3,  4,  5,  6,  5,  6,  7, 11,  7, 11; 15, -1, 15, -1
+pb_127_m127:  times 2 db 127, -127
 ipred_v_shuf: db  0,  1,  0,  1,  4,  5,  4,  5,  8,  9,  8,  9, 12, 13, 12, 13
               db  2,  3,  2,  3,  6,  7,  6,  7, 10, 11, 10, 11, 14, 15, 14, 15
 ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
-              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4,  0,  0,  0,  0
+              db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
+pw_64:        times 2 dw 64
 
-pb_0to15:
 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
                         db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                         ; w=8, w_pad=1 as well as second half of previous one
 cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
                         times 5 db 6, 7
                         ; w=16,w_pad=2
                         db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                         times 8 db 14, 15
                         ; w=16,w_pad=3
                         db 0, 1, 2, 3, 4, 5
                         times 13 db 6, 7
+pb_15to0:               db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-pb_1:   times 4 db 1
-pb_2:   times 4 db 2
-pb_4:   times 4 db 4
-pb_8:   times 4 db 8
-pb_12:  times 4 db 12
-pb_14:  times 4 db 14
-pb_15   times 4 db 15
-pb_31:  times 4 db 31
-pb_128: times 4 db 128
-pw_1:   times 2 dw 1
-pw_8:   times 2 dw 8
-pw_62:  times 2 dw 62
-pw_64:  times 2 dw 64
-pw_128: times 2 dw 128
-pw_255: times 2 dw 255
-pw_512: times 2 dw 512
+%define pb_0to15 cfl_ac_w16_pad_shuffle
+%define pb_1  (ipred_h_shuf+12)
+%define pb_2  (ipred_h_shuf+20)
+%define pb_3  (ipred_h_shuf+ 4)
+%define pb_4  (ipred_h_shuf+24)
+%define pb_7  (ipred_h_shuf+ 0)
+%define pb_8  (z_upsample2 +12)
+%define pb_15 (z_filter_s  +32)
+%define pw_8  (z_filter_k  +32)
 
-pb_36_m4:    times 2 db  36,   -4
-pb_127_m127: times 2 db 127, -127
+pb_27:    times 4 db 27
+pb_31:    times 4 db 31
+pb_128:   times 4 db 128
+pw_1:     times 2 dw 1
+pw_62:    times 2 dw 62
+pw_128:   times 2 dw 128
+pw_255:   times 2 dw 255
+pw_512:   times 2 dw 512
+pb_36_m4: times 2 db 36, -4
 
 %macro JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - 2*4)
     %xdefine %%base mangle(private_prefix %+ _%1_%2)
     %%table:
     %rep %0 - 2
         dd %%base %+ .%3 - (%%table - 2*4)
         %rotate 1
@@ -133,16 +142,17 @@ JMP_TABLE ipred_smooth_v, avx2, w4, w8, 
 JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_paeth,    avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_filter,   avx2, w4, w8, w16, w32
 JMP_TABLE ipred_dc,       avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
 JMP_TABLE ipred_dc_left,  avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_h,        avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_z1,       avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3,       avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_cfl,      avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
 JMP_TABLE pal_pred,       avx2, w4, w8, w16, w32, w64
 
 cextern dr_intra_derivative
@@ -1293,37 +1303,36 @@ cglobal ipred_z1, 3, 8, 0, dst, stride, 
     lea                  r6, [ipred_z1_avx2_table]
     tzcnt                wd, wm
     movifnidn        angled, anglem
     movifnidn            hd, hm
     lea                  r7, [dr_intra_derivative]
     inc                 tlq
     movsxd               wq, [r6+wq*4]
     add                  wq, r6
-    movzx               dxd, angleb
+    mov                 dxd, angled
+    and                 dxd, 0x7e
     add              angled, 165 ; ~90
-    movzx               dxd, word [r7+dxq*2]
+    movzx               dxd, word [r7+dxq]
     xor              angled, 0x4ff ; d = 90 - angle
     vpbroadcastd         m3, [pw_512]
     vpbroadcastd         m4, [pw_62]
     vpbroadcastd         m5, [pw_64]
     jmp                  wq
 .w4:
     cmp              angleb, 40
     jae .w4_no_upsample
     lea                 r3d, [angleq-1024]
     sar                 r3d, 7
     add                 r3d, hd
     jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
     ALLOC_STACK         -32, 8
     mova                xm1, [tlq-1]
-    pshufb              xm0, xm1, [z_upsample]
-    vpbroadcastd        xm2, [pb_8]
-    pminub              xm2, [z_filter_s+6]
-    pshufb              xm1, xm2
+    pshufb              xm0, xm1, [z_upsample1]
+    pshufb              xm1, [z_upsample2]
     vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
     add                 dxd, dxd        ; pw_512 (which is already in m3)
     pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
     pextrd         [rsp+16], xm1, 3 ; top[max_base_x]
     pmaddubsw           xm1, xm2
     movd                xm7, dxd
     mov                 r3d, dxd ; xpos
     vpbroadcastw         m7, xm7
@@ -1370,47 +1379,47 @@ cglobal ipred_z1, 3, 8, 0, dst, stride, 
     lea                dstq, [dstq+strideq*4]
     sub                  hd, 4
     jg .w4_upsample_loop
     RET
 ALIGN function_align
 .filter_strength: ; w4/w8/w16
     ; The C version uses a lot of branches, but we can do all the comparisons
     ; in parallel and use popcnt to get the final filter strength value.
+%define base r3-z_filter_t0
+    lea                  r3, [z_filter_t0]
     movd                xm0, maxbased
     movd                xm2, angled
-    lea                  r3, [z_filter_t0]
     shr              angled, 8 ; is_sm << 1
     vpbroadcastb         m0, xm0
     vpbroadcastb         m2, xm2
-    pcmpeqb              m1, m0, [r3-z_filter_t0+z_filter_wh]
+    pcmpeqb              m1, m0, [base+z_filter_wh]
     pand                 m1, m2
     mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
     pcmpgtb              m1, m2
     pmovmskb            r5d, m1
     popcnt              r5d, r5d ; sets ZF which can be used by caller
     ret
 .w4_no_upsample:
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -16, 11
     mov            maxbased, 7
     test             angled, 0x400 ; !enable_intra_edge_filter
     jnz .w4_main
     lea            maxbased, [hq+3]
     call .filter_strength
     mov            maxbased, 7
     jz .w4_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
-    vpbroadcastd         m7, [pb_8]
+    vpbroadcastd         m7, [base+pb_8]
     vbroadcasti128       m2, [tlq-1]
-    pminub               m1, m7, [r3-z_filter_k+z_filter_s+4]
-    vpbroadcastd         m8, [r3+r5*4+12*0]
-    pminub               m7, [r3-z_filter_k+z_filter_s+12]
-    vpbroadcastd         m9, [r3+r5*4+12*1]
-    vpbroadcastd        m10, [r3+r5*4+12*2]
+    pminub               m1, m7, [base+z_filter_s]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0]
+    pminub               m7, [base+z_filter_s+8]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
     pshufb               m0, m2, m1
     shufps               m1, m7, q2121
     pmaddubsw            m0, m8
     pshufb               m1, m2, m1
     pmaddubsw            m1, m9
     pshufb               m2, m7
     pmaddubsw            m2, m10
     paddw                m0, m1
@@ -1427,17 +1436,17 @@ ALIGN function_align
     movd                xm6, dxd
     vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
     vpbroadcastb         m7, [tlq+maxbaseq]
     shl            maxbased, 6
     vpbroadcastw         m6, xm6
     mov                 r3d, dxd ; xpos
     movd                xm9, maxbased
     vpbroadcastw         m9, xm9
-    vbroadcasti128       m8, [z_shuf_w4]
+    vbroadcasti128       m8, [z1_shuf_w4]
     psrlw                m7, 8  ; top[max_base_x]
     paddw               m10, m6, m6
     psubw                m9, m0 ; max_base_x
     vpblendd             m6, m10, 0xcc
     mova                xm0, xm10
     paddw                m6, m0 ; xpos2 xpos3 xpos0 xpos1
     paddw               m10, m10
 .w4_loop:
@@ -1497,17 +1506,17 @@ ALIGN function_align
     ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -32, 8
     movu                xm2, [z_filter_s+6]
     mova                xm0, [tlq-1]
     movd                xm6, hd
     vinserti128          m0, [tlq+7], 1
     vpbroadcastb        xm6, xm6
-    vbroadcasti128       m1, [z_upsample]
+    vbroadcasti128       m1, [z_upsample1]
     pminub              xm6, xm2
     vpbroadcastd         m7, [pb_36_m4]
     vinserti128          m2, xm6, 1
     add                 dxd, dxd
     pshufb               m1, m0, m1
     pshufb               m2, m0, m2
     movd                xm6, dxd
     pmaddubsw            m1, m7
@@ -1556,73 +1565,67 @@ ALIGN function_align
     movhps [dstq+strideq*2], xm0
     movq   [dstq+strideq*1], xm1
     movhps [dstq+r2       ], xm1
     lea                dstq, [dstq+strideq*4]
     sub                  hd, 4
     jg .w8_upsample_loop
     RET
 .w8_no_intra_edge_filter:
-    mov                 r3d, 15
-    cmp                  hd, 8
-    cmova          maxbased, r3d
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(h+7, 15)
     jmp .w8_main
 .w8_no_upsample:
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -32, 10
     lea            maxbased, [hq+7]
     test             angled, 0x400
     jnz .w8_no_intra_edge_filter
     call .filter_strength
-    vpbroadcastd        xm6, [pb_15]
-    pminub              xm6, xm0 ; imin(h, 8) + 7
-    movd           maxbased, xm6
-    movzx          maxbased, maxbaseb
     jz .w8_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
     movu                xm2, [tlq]
-    pminub              xm1, xm6, [r3-z_filter_k+z_filter_s+18]
+    pminub              xm1, xm0, [base+z_filter_s+14]
     vinserti128          m2, [tlq-1], 1
-    vinserti128          m1, [r3-z_filter_k+z_filter_s+ 4], 1
-    vpbroadcastd         m7, [r3+r5*4+12*0]
-    pminub              xm6, [r3-z_filter_k+z_filter_s+26]
-    vinserti128          m6, [r3-z_filter_k+z_filter_s+12], 1
-    pshufb               m0, m2, m1
-    pmaddubsw            m0, m7
-    vpbroadcastd         m7, [r3+r5*4+12*1]
+    vinserti128          m1, [base+z_filter_s+ 0], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
+    pminub              xm0, [base+z_filter_s+22]
+    vinserti128          m0, [base+z_filter_s+ 8], 1
+    pshufb               m6, m2, m1
+    pmaddubsw            m6, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1]
     movzx               r3d, byte [tlq+15]
-    shufps               m1, m6, q2121
+    shufps               m1, m0, q2121
     pshufb               m1, m2, m1
     pmaddubsw            m1, m7
-    paddw                m0, m1
+    paddw                m1, m6
     sub                 r5d, 3
     jnz .w8_3tap
     ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
     ; which also results in an awkward edge case where out[w*2] is
     ; slightly different from out[max_base_x] when h > w.
     vpbroadcastd         m7, [z_filter_k+4*8]
     movzx               r2d, byte [tlq+14]
-    pshufb               m2, m6
+    pshufb               m2, m0
     pmaddubsw            m2, m7
     sub                 r2d, r3d
     lea                 r2d, [r2+r3*8+4]
     shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
     mov            [rsp+16], r2b
-    paddw                m0, m2
+    paddw                m1, m2
 .w8_3tap:
-    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
     sar                 r5d, 1
     mov                 tlq, rsp
     add                 r5d, 17 ; w*2 + (filter_strength == 3)
     cmp                  hd, 8
     cmova          maxbased, r5d
     mov            [tlq+r5], r3b
-    vextracti128        xm1, m0, 1
-    packuswb            xm1, xm0
-    mova              [tlq], xm1
+    vextracti128        xm0, m1, 1
+    packuswb            xm0, xm1
+    mova              [tlq], xm0
 .w8_main:
     movd                xm2, dxd
     vbroadcasti128       m0, [z_base_inc]
     vpbroadcastw         m2, xm2
     vpbroadcastb         m7, [tlq+maxbaseq]
     shl            maxbased, 6
     movd                xm9, maxbased
     vbroadcasti128       m8, [z_filter_s+2]
@@ -1663,58 +1666,50 @@ ALIGN function_align
     movq   [dstq+strideq*0], xm7
     movq   [dstq+strideq*1], xm7
     lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
     jg .w8_end_loop
 .w8_end:
     RET
 .w16_no_intra_edge_filter:
-    mov                 r3d, 31
-    cmp                  hd, 16
-    cmova          maxbased, r3d
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(h+15, 31)
     jmp .w16_main
 ALIGN function_align
 .w16:
     %assign stack_offset org_stack_offset
     ALLOC_STACK         -64, 12
     lea            maxbased, [hq+15]
     test             angled, 0x400
     jnz .w16_no_intra_edge_filter
     call .filter_strength
-    vpbroadcastd         m1, [pb_31]
-    pminub               m0, m1 ; imin(h, 16) + 15
-    movd           maxbased, xm0
-    movzx          maxbased, maxbaseb
     jz .w16_main ; filter_strength == 0
-    lea                  r3, [z_filter_k-4]
-    vpbroadcastd         m1, [pb_12]
-    vpbroadcastd        m11, [pb_15]
-    vbroadcasti128       m6, [r3-z_filter_k+z_filter_s+12]
-    vinserti128          m2, m6, [r3-z_filter_k+z_filter_s+4], 0
-    vinserti128          m6, [r3-z_filter_k+z_filter_s+20], 1
+    vpbroadcastd         m1, [base+pb_12]
+    vbroadcasti128       m6, [base+z_filter_s+8]
+    vinserti128          m2, m6, [base+z_filter_s], 0
+    vinserti128          m6, [base+z_filter_s+16], 1
     mova               xm10, [tlq-1]
     vinserti128         m10, [tlq+3], 1
-    vpbroadcastd         m9, [r3+r5*4+12*0]
-    vbroadcasti128       m7, [r3-z_filter_k+z_filter_s+18]
-    vinserti128          m8, m7, [r3-z_filter_k+z_filter_s+10], 0
-    vinserti128          m7, [r3-z_filter_k+z_filter_s+26], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0]
+    vbroadcasti128       m7, [base+z_filter_s+14]
+    vinserti128          m8, m7, [base+z_filter_s+6], 0
+    vinserti128          m7, [base+z_filter_s+22], 1
     psubw                m0, m1
-    pminub               m0, m11 ; imin(h+3, 15)
     movu               xm11, [tlq+12]
     vinserti128         m11, [tlq+16], 1
     pminub               m8, m0
     pminub               m7, m0
     pshufb               m0, m10, m2
     shufps               m2, m6, q2121
     pmaddubsw            m0, m9
     pshufb               m1, m11, m8
     shufps               m8, m7, q2121
     pmaddubsw            m1, m9
-    vpbroadcastd         m9, [r3+r5*4+12*1]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
     movzx               r3d, byte [tlq+31]
     pshufb               m2, m10, m2
     pmaddubsw            m2, m9
     pshufb               m8, m11, m8
     pmaddubsw            m8, m9
     paddw                m0, m2
     paddw                m1, m8
     sub                 r5d, 3
@@ -2126,16 +2121,1180 @@ ALIGN function_align
     mova          [dstq+ 0], m7
     mova          [dstq+32], m7
     add                dstq, strideq
     dec                  hd
     jg .w64_end_loop
 .w64_end:
     RET
 
+cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase ; entry: load shared constants, look up dy, then jump to the handler for this block height
+    %assign org_stack_offset stack_offset
+    lea                  r6, [ipred_z3_avx2_table] ; r6 = base of the per-height jump table
+    tzcnt                hd, hm ; table index = trailing-zero count of h
+    movifnidn        angled, anglem
+    lea                  r7, [dr_intra_derivative+45*2-1]
+    dec                 tlq
+    movsxd               hq, [r6+hq*4] ; 32-bit table entry, relative to the table base
+    sub              angled, 180
+    add                  hq, r6 ; absolute address of the selected handler
+    mov                 dyd, angled
+    neg                 dyd
+    xor              angled, 0x400 ; flip bit 10; the handlers test it as !enable_intra_edge_filter
+    or                  dyq, ~0x7e
+    movzx               dyd, word [r7+dyq] ; dy = 16-bit load relative to dr_intra_derivative
+    vpbroadcastd         m3, [pw_512] ; m3: multiplier for every pmulhrsw below
+    vpbroadcastd         m4, [pw_62] ; m4: mask that extracts "frac << 1" from ypos
+    vpbroadcastd         m5, [pw_64] ; m5: minuend for the complementary weight
+    mov              org_wd, wd ; keep the original width; wd is counted down by the column loops
+    jmp                  hq
+.h4: ; height-4 handler: use the upsampled-edge path when the two checks below allow it
+    lea                  r7, [strideq*3] ; r7 = stride*3, offset of the 4th output row
+    cmp              angleb, 40
+    jae .h4_no_upsample ; low byte of angle >= 40
+    lea                 r4d, [angleq-1024]
+    sar                 r4d, 7
+    add                 r4d, wd
+    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
+    ALLOC_STACK         -32, 9
+    movu                xm8, [tlq-7] ; 16 edge bytes
+    pshufb              xm0, xm8, [z_upsample3]
+    vpbroadcastb        xm2, xm8
+    pshufb              xm1, xm8, [z_filter_s+2]
+    mova           [rsp+16], xm2 ; top[max_base_y]
+    vpbroadcastd        xm2, [pb_36_m4] ; tap pairs for the two pmaddubsw below
+    add                 dyd, dyd ; dy is doubled on this path
+    pmaddubsw           xm0, xm2
+    pmaddubsw           xm1, xm2
+    movd                xm7, dyd
+    mov                 r2d, dyd ; r2d = running position for the scalar base computation
+    vpbroadcastw         m7, xm7
+    paddw               xm1, xm0
+    pmulhrsw            xm1, xm3
+    pslldq               m6, m7, 8
+    paddw               xm2, xm7, xm7
+    paddw                m6, m7
+    packuswb            xm1, xm1
+    paddw                m6, m2
+    punpcklbw           xm1, xm8 ; interleave the filtered bytes with the original edge bytes
+    mova                xm8, [z_transpose4]
+    psllw                m7, 2 ; per-iteration position step = 4*dy (4 columns per iteration)
+    pshufb              xm1, [pb_15to0]
+    mova              [rsp], xm1 ; upsampled edge stored at rsp+0
+.h4_upsample_loop: ; produces 4 output columns (4 bytes in each of 4 rows) per iteration
+    lea                 r4d, [r2+dyq] ; next position
+    shr                 r2d, 6 ; position >> 6 = byte offset into the buffer at rsp
+    vpbroadcastq         m1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6
+    vpbroadcastq         m2, [rsp+r4]
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6
+    movq                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6
+    movhps              xm0, [rsp+r4]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2
+    psllw                m2, 8
+    por                  m1, m2 ; byte pairs of (64 - masked pos, masked pos) weights
+    pmaddubsw            m0, m1
+    paddw                m6, m7 ; advance the vector positions
+    pmulhrsw             m0, m3
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm8 ; z_transpose4 shuffle
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h4_upsample_loop
+    RET
+ALIGN function_align
+.filter_strength: ; h4/h8/h16
+%define base r4-z_filter_t0
+    lea                  r4, [z_filter_t0] ; r4 stays live after return: callers address tables as base+symbol
+    movd                xm0, maxbased
+    movd                xm2, angled
+    shr              angled, 8 ; is_sm << 1
+    vpbroadcastb         m0, xm0 ; low byte of maxbase in every lane
+    vpbroadcastb         m2, xm2 ; low byte of angle in every lane
+    pcmpeqb              m1, m0, [base+z_filter_wh]
+    pand                 m1, m2 ; keep the angle byte only in lanes whose z_filter_wh entry matched
+    mova                xm2, [r4+angleq*8] ; 16 bytes of z_filter_t0 selected by is_sm
+    pcmpgtb              m1, m2
+    pmovmskb            r5d, m1
+    popcnt              r5d, r5d ; r5d = filter strength; ZF is set when it is 0 (callers branch with jz)
+    ret
+.h4_no_upsample: ; height 4 without upsampling: optionally filter the edge into a stack buffer, then fall through to .h4_main
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -16, 12
+    mov            maxbased, 7
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h4_main
+    lea            maxbased, [wq+3] ; argument for .filter_strength
+    call .filter_strength
+    mov            maxbased, 7 ; mov leaves the flags from popcnt intact for the jz below
+    jz .h4_main ; filter_strength == 0
+    vpbroadcastd         m7, [base+pb_7]
+    vbroadcasti128       m2, [tlq-14] ; 16 edge bytes in both lanes
+    pmaxub               m1, m7, [base+z_filter_s-4]
+    vpbroadcastd         m8, [base+z_filter_k-4+r5*4+12*0] ; three tap pairs selected by the strength in r5
+    pmaxub               m7, [base+z_filter_s+4]
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1]
+    vpbroadcastd        m10, [base+z_filter_k-4+r5*4+12*2]
+    pshufb               m0, m2, m1
+    shufps               m1, m7, q2121
+    pmaddubsw            m0, m8
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m9
+    pshufb               m2, m7
+    pmaddubsw            m2, m10
+    paddw                m0, m1
+    paddw                m0, m2 ; sum of the three tap-pair products
+    pmulhrsw             m0, m3
+    mov                 r4d, 9
+    lea                 tlq, [rsp+15] ; from here on the edge is read from the stack copy
+    cmp                  wd, 4
+    cmova          maxbased, r4d ; w > 4: maxbase = 9, otherwise it stays 7
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova              [rsp], xm0 ; 16 filtered edge bytes
+.h4_main: ; height-4 prediction loop; expects the edge pointer in tlq, maxbase in maxbased and dy in dyd
+    movd                xm6, dyd
+    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
+    mov                  r4, tlq
+    sub                 tlq, 4
+    neg                 dyq ; negative step: bases are used as decreasing offsets from tlq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4] ; byte at (edge pointer - maxbase)
+    lea                  r4, [dyq+63] ; ypos
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf_w4]
+    neg            maxbaseq ; maxbased = -(maxbase*64 - 63), the scalar loop bound
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8  ; top[max_base_y]
+    paddw               m10, m6, m6
+    psubw                m9, m0 ; max_base_y
+    vpblendd             m6, m10, 0xcc
+    mova                xm0, xm10
+    paddw                m6, m0 ; ypos2 ypos3 ypos0 ypos1
+    paddw               m10, m10 ; vector step = 4*dy (4 columns per iteration)
+    mova               xm11, [z_transpose4]
+.h4_loop: ; produces 4 output columns (4 bytes in each of 4 rows) per iteration
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base0
+    vpbroadcastq         m1, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base1
+    vpbroadcastq         m2, [tlq+r5]
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base2
+    movq                xm0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base3
+    movhps              xm0, [tlq+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac << 1
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; (32 - frac) << 1
+    psllw                m2, 8
+    pshufb               m0, m8
+    por                  m1, m2     ; (32-frac, frac) << 1
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m6 ; base < max_base_y
+    pmulhrsw             m0, m3
+    paddsw               m6, m10    ; ypos += dy
+    vpblendvb            m0, m7, m0, m1 ; lanes failing the compare take top[max_base_y]
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    pshufb              xm1, xm11   ; transpose
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r7       ], xm1, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jz .h4_end
+    cmp                 r4d, maxbased
+    jg .h4_loop ; keep interpolating while the scalar ypos is above the bound
+    packuswb            xm7, xm7 ; words -> bytes so the fill below stores top[max_base_y] bytes
+.h4_end_loop: ; fill the remaining columns, 4 at a time, with the constant in xm7
+    movd   [dstq+strideq*0], xm7
+    movd   [dstq+strideq*1], xm7
+    movd   [dstq+strideq*2], xm7
+    movd   [dstq+r7       ], xm7
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h4_end_loop
+.h4_end:
+    RET
+ALIGN function_align
+.h8: ; height-8 handler: use the upsampled-edge path when the combined check below passes
+    lea                 r4d, [angleq+216]
+    mov                 r4b, wb ; low byte replaced by w so one compare covers all conditions
+    cmp                 r4d, 8
+    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 8
+    and                 r4d, 4
+    mova                xm0, [tlq-15]
+    vinserti128          m0, [tlq- 9], 1
+    movd                xm1, r4d
+    movu                xm2, [z_filter_s+2]
+    vinserti128          m2, [z_filter_s+6], 1
+    vpbroadcastb        xm1, xm1 ; w & 4
+    vpbroadcastd         m7, [pb_36_m4] ; tap pairs for the two pmaddubsw below
+    pmaxub              xm1, [z_upsample3] ; clip 4x8
+    vinserti128          m1, [z_upsample1], 1
+    add                 dyd, dyd ; dy is doubled on this path
+    pshufb               m1, m0, m1
+    pshufb               m2, m0, m2
+    vinserti128          m0, [tlq-7], 1
+    movd                xm6, dyd
+    pmaddubsw            m1, m7
+    pmaddubsw            m2, m7
+    vpbroadcastw         m6, xm6
+    mov                 r2d, dyd ; r2d = running position for the scalar base computation
+    lea                  r5, [strideq*3] ; r5 = stride*3
+    paddw                m7, m6, m6
+    paddw                m1, m2
+    vpblendd             m6, m7, 0xf0
+    pmulhrsw             m1, m3
+    pslldq               m2, m7, 8
+    paddw                m7, m7 ; per-iteration position step = 4*dy (4 columns per iteration)
+    paddw                m6, m2
+    vbroadcasti128       m2, [pb_15to0]
+    packuswb             m1, m1
+    punpcklbw            m1, m0 ; interleave the filtered bytes with the original edge bytes
+    pshufb               m1, m2
+    vextracti128   [rsp+ 0], m1, 1 ; 32-byte upsampled edge stored at rsp
+    mova           [rsp+16], xm1
+.h8_upsample_loop: ; produces 4 output columns (4 bytes in each of 8 rows) per iteration
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base0
+    movu                xm0, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base1
+    vinserti128          m0, [rsp+r4], 1
+    lea                 r4d, [r2+dyq]
+    shr                 r2d, 6 ; base2
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    punpcklqdq           m1, m2, m2 ; frac0 frac1
+    pmaddubsw            m0, m1
+    movu                xm1, [rsp+r2]
+    lea                 r2d, [r4+dyq]
+    shr                 r4d, 6 ; base3
+    vinserti128          m1, [rsp+r4], 1
+    punpckhqdq           m2, m2 ; frac2 frac3
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    paddw                m6, m7 ; advance the vector positions
+    pmulhrsw             m1, m3
+    lea                  r4, [dstq+strideq*4] ; r4 = pointer to rows 4-7
+    psllw                m1, 8
+    por                  m0, m1
+    vextracti128        xm1, m0, 1
+    punpcklbw           xm2, xm0, xm1
+    punpckhbw           xm0, xm1
+    movd   [dstq+strideq*0], xm2
+    pextrd [dstq+strideq*1], xm2, 1
+    pextrd [dstq+strideq*2], xm2, 2
+    pextrd [dstq+r5       ], xm2, 3
+    movd   [r4  +strideq*0], xm0
+    pextrd [r4  +strideq*1], xm0, 1
+    pextrd [r4  +strideq*2], xm0, 2
+    pextrd [r4  +r5       ], xm0, 3
+    add                dstq, 4
+    sub                  wd, 4
+    jg .h8_upsample_loop
+    RET
+.h8_no_intra_edge_filter: ; edge filter disabled: clamp maxbase (entered holding w+7) and go straight to the prediction loop
+    and            maxbased, 7
+    or             maxbased, 8 ; imin(w+7, 15)
+    jmp .h8_main
+.h8_no_upsample: ; height 8 without upsampling: optionally filter the edge into a stack buffer, then fall through to .h8_main
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 10
+    lea            maxbased, [wq+7]
+    test             angled, 0x400 ; bit 10 set means !enable_intra_edge_filter
+    jnz .h8_no_intra_edge_filter
+    call .filter_strength ; returns the strength in r5d, ZF set when it is 0
+    jz .h8_main ; filter_strength == 0
+    vpbroadcastd        xm6, [base+pb_15]
+    pcmpeqb             xm1, xm1 ; all-ones, i.e. -1 in every byte
+    psubusb             xm6, xm0
+    psubb               xm6, xm1 ; w == 4 ? 5 : 1
+    movu                xm2, [tlq-16]
+    pmaxub              xm1, xm6, [base+z_filter_s]
+    vinserti128          m2, [tlq-14], 1
+    vinserti128          m1, [base+z_filter_s+12], 1
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0] ; first tap pair for this strength
+    pmaxub              xm6, [base+z_filter_s+ 8]
+    vinserti128          m6, [base+z_filter_s+20], 1
+    pshufb               m0, m2, m1
+    pmaddubsw            m0, m7
+    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*1] ; second tap pair
+    movzx               r4d, byte [tlq-15] ; edge byte kept in r4 for the scalar fix-ups below
+    shufps               m1, m6, q2121
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m7
+    paddw                m0, m1
+    sub                 r5d, 3
+    jnz .h8_3tap ; strength != 3: skip the extra tap pair
+    vpbroadcastd         m7, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq-14]
+    pshufb               m2, m6
+    pmaddubsw            m2, m7
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4] ; 7*tl[-15] + tl[-14] + 4
+    shr                 r2d, 3
+    mov            [rsp+15], r2b ; scalar-computed byte stored next to the filtered block
+    paddw                m0, m2
+.h8_3tap:
+    pmulhrsw             m0, m3
+    sar                 r5d, 1 ; r5d is strength-3 here (0 or negative)
+    lea                 tlq, [rsp+31] ; from here on the edge is read from the stack copy
+    add                 r5d, 17
+    cmp                  wd, 8
+    cmova          maxbased, r5d ; w > 8: maxbase = r5d
+    neg                  r5
+    mov            [tlq+r5], r4b ; original tl[-15] byte written at tlq - r5
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova           [tlq-15], xm0 ; 16 filtered edge bytes
+.h8_main: ; height-8 prediction loop; each iteration pushes 2 columns (16 bytes) onto the stack for .h8_transpose
+    movd                xm2, dyd
+    vbroadcasti128       m0, [z_base_inc]
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq ; negative step: bases are used as decreasing offsets from tlq
+    vpbroadcastw         m2, xm2
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4] ; byte at (edge pointer - maxbase)
+    lea                  r4, [dyq+63] ; scalar ypos
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq ; maxbased = -(maxbase*64 - 63), the scalar loop bound
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8 ; each word now holds that byte value
+    psubw                m9, m0
+    paddw                m6, m2, m2 ; m6 = 2*dy, the per-iteration step
+    vpblendd             m2, m6, 0x0f ; low lane gets 2*dy, high lane keeps dy
+.h8_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base of the first column
+    pand                 m0, m4, m2
+    psubw                m1, m5, m0
+    psllw                m0, 8
+    por                  m1, m0 ; byte pairs of (64 - masked ypos, masked ypos) weights
+    vbroadcasti128       m0, [tlq+r4]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base of the second column
+    vinserti128          m0, [tlq+r5], 0
+    sub                 rsp, 8*2 ; reserve 16 bytes for this column pair
+    pshufb               m0, m8
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m2
+    paddsw               m2, m6
+    pmulhrsw             m0, m3
+    vpblendvb            m0, m7, m0, m1 ; lanes failing the compare take the value in m7
+    vextracti128        xm1, m0, 1
+    psllw               xm0, 8
+    por                 xm0, xm1 ; interleave rows (partial transpose)
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jz .h8_transpose
+    cmp                 r4d, maxbased
+    jg .h8_loop ; keep interpolating while the scalar ypos is above the bound
+    packuswb            xm0, xm7, xm7 ; 16 bytes of the fill value
+.h8_end_loop: ; push the fill value for every remaining column pair
+    sub                 rsp, 8*2
+    mova              [rsp], xm0
+    sub                  wd, 2
+    jg .h8_end_loop
+.h8_transpose: ; pop the column pairs pushed by .h8_main and write them out as 8 rows; xm0 holds the last pair pushed
+    mova                xm2, [rsp+16*1]
+    sub              org_wd, 8 ; these flags feed both the cmovg and the jge below
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovg              dstq, r6 ; org_w > 8: start at dst + org_w - 8 and walk left
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+    lea                  r6, [dstq+strideq*4] ; r6 = pointer to rows 4-7
+    jge .h8_w8
+    add                 rsp, 16*2 ; org_w < 8: only two column pairs were pushed
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+r2       ], xm1, 3
+    movd   [r6  +strideq*0], xm2
+    pextrd [r6  +strideq*1], xm2, 1
+    pextrd [r6  +strideq*2], xm2, 2
+    pextrd [r6  +r2       ], xm2, 3
+    jmp .h8_end
+.h8_w8_loop:
+    mova                xm0, [rsp+16*0]
+    mova                xm2, [rsp+16*1]
+    punpcklwd           xm1, xm2, xm0
+    punpckhwd           xm2, xm0
+.h8_w8: ; w8/w16/w32
+    mova                xm0, [rsp+16*2]
+    mova                xm4, [rsp+16*3]
+    add                 rsp, 16*4 ; release the four column pairs consumed by this iteration
+    punpcklwd           xm3, xm4, xm0
+    punpckhwd           xm4, xm0
+    punpckldq           xm0, xm3, xm1
+    punpckhdq           xm3, xm1
+    punpckldq           xm1, xm4, xm2
+    punpckhdq           xm4, xm2
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    movq   [dstq+strideq*2], xm3
+    movhps [dstq+r2       ], xm3
+    movq   [r6  +strideq*0], xm1
+    movhps [r6  +strideq*1], xm1
+    movq   [r6  +strideq*2], xm4
+    movhps [r6  +r2       ], xm4
+    sub                dstq, 8 ; move 8 columns to the left
+    sub                  r6, 8
+    sub              org_wd, 8
+    jge .h8_w8_loop
+.h8_end:
+    RET
+.h16_no_intra_edge_filter: ; edge filter disabled: clamp maxbase (entered holding w+15) and go straight to the prediction loop
+    and            maxbased, 15
+    or             maxbased, 16 ; imin(w+15, 31)
+    jmp .h16_main
+ALIGN function_align
+.h16: ; height-16 handler: optionally filter the edge into a stack buffer, then fall through to .h16_main
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -64, 12
+    lea            maxbased, [wq+15]
+    test             angled, 0x400 ; bit 10 set means !enable_intra_edge_filter
+    jnz .h16_no_intra_edge_filter
+    call .filter_strength ; returns the strength in r5d, ZF set when it is 0
+    jz .h16_main ; filter_strength == 0
+    vpbroadcastd        m11, [base+pb_27]
+    vpbroadcastd         m1, [base+pb_1]
+    vbroadcasti128       m6, [base+z_filter_s+12]
+    vinserti128          m2, m6, [base+z_filter_s+4], 0
+    vinserti128          m6, [base+z_filter_s+20], 1
+    movu               xm10, [tlq-18]
+    vinserti128         m10, [tlq-14], 1
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*0] ; first tap pair for this strength
+    vbroadcasti128       m7, [base+z_filter_s+8]
+    vinserti128          m8, m7, [base+z_filter_s+0], 0
+    vinserti128          m7, [base+z_filter_s+16], 1
+    psubusb             m11, m0
+    por                  m1, m11
+    movu               xm11, [tlq-32]
+    vinserti128         m11, [tlq-28], 1
+    pmaxub               m8, m1 ; raise the shuffle indices to at least the bound in m1
+    pmaxub               m7, m1
+    pshufb               m0, m10, m2
+    shufps               m2, m6, q2121
+    pmaddubsw            m0, m9
+    pshufb               m1, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw            m1, m9
+    vpbroadcastd         m9, [base+z_filter_k-4+r5*4+12*1] ; second tap pair
+    movzx               r4d, byte [tlq-31] ; edge byte kept in r4 for the scalar fix-ups below
+    pshufb               m2, m10, m2
+    pmaddubsw            m2, m9
+    pshufb               m8, m11, m8
+    pmaddubsw            m8, m9
+    paddw                m0, m2
+    paddw                m1, m8
+    sub                 r5d, 3
+    jnz .h16_3tap ; strength != 3: skip the extra tap pair
+    vpbroadcastd         m9, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq-30]
+    pshufb              m10, m6
+    pmaddubsw           m10, m9
+    pshufb              m11, m7
+    pmaddubsw           m11, m9
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4] ; 7*tl[-31] + tl[-30] + 4
+    shr                 r2d, 3
+    mov            [rsp+31], r2b ; scalar-computed byte stored next to the filtered block
+    paddw                m0, m10
+    paddw                m1, m11
+.h16_3tap:
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    sar                 r5d, 1 ; r5d is strength-3 here (0 or negative)
+    lea                 tlq, [rsp+63] ; from here on the edge is read from the stack copy
+    add                 r5d, 33
+    cmp                  wd, 16
+    cmova          maxbased, r5d ; w > 16: maxbase = r5d
+    neg                  r5
+    mov            [tlq+r5], r4b ; original tl[-31] byte written at tlq - r5
+    packuswb             m0, m1
+    vpermq               m0, m0, q2031
+    mova           [tlq-31], m0 ; 32 filtered edge bytes
+.h16_main: ; height-16 prediction loop; each iteration pushes 2 columns (32 bytes) onto the stack for .h16_transpose
+    movd                xm6, dyd
+    vbroadcasti128       m0, [z_base_inc]
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq ; negative step: bases are used as decreasing offsets from tlq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4] ; byte at (edge pointer - maxbase), used as the fill value
+    lea                  r4, [dyq+63] ; scalar ypos
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq ; maxbased = -(maxbase*64 - 63), the scalar loop bound
+    vpbroadcastw         m9, xm9
+    psubw                m9, m0 ; compare threshold for the first group of 8 rows
+    paddw               m11, m6, m6 ; m11 = 2*dy, the per-iteration step
+    psubw               m10, m9, m3 ; 64*8
+    vpblendd             m6, m11, 0xf0 ; low lane keeps dy, high lane gets 2*dy
+.h16_loop:
+    lea                  r5, [r4+dyq]
+    sar                  r4, 6 ; base of the first column
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1 ; byte pairs of (64 - masked ypos, masked ypos) weights
+    movu                xm0, [tlq+r4-0]
+    movu                xm1, [tlq+r4-8]
+    lea                  r4, [r5+dyq]
+    sar                  r5, 6 ; base of the second column
+    vinserti128          m0, [tlq+r5-0], 1
+    vinserti128          m1, [tlq+r5-8], 1
+    sub                 rsp, 32 ; reserve 32 bytes for this column pair
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2 ; word masks -> byte mask matching the packed pixels
+    paddsw               m6, m11
+    vpblendvb            m0, m7, m0, m1 ; bytes failing the compare take the fill value
+    vpermq               m0, m0, q3120
+    mova              [rsp], m0
+    sub                  wd, 2
+    jz .h16_transpose
+    cmp                 r4d, maxbased
+    jg .h16_loop ; keep interpolating while the scalar ypos is above the bound
+    mova                 m0, m7 ; .h16_transpose expects the last pushed pair in m0
+.h16_end_loop: ; push the fill value for every remaining column pair
+    sub                 rsp, 32
+    mova              [rsp], m7
+    sub                  wd, 2
+    jg .h16_end_loop
+.h16_transpose: ; pop the column pairs pushed by .h16_main and write them out as 16 rows; m0 holds the last pair pushed
+    mova                 m2, [rsp+32*1]
+    sub              org_wd, 8 ; these flags feed both the cmovg and the jge below
+    lea                  r2, [strideq*3]
+    lea                  r6, [dstq+org_wq]
+    cmovg              dstq, r6 ; org_w > 8: start at dst + org_w - 8 and walk left
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    lea                  r3, [strideq*5]
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+    lea                  r4, [strideq+r2*2] ; stride*7
+    jge .h16_w8
+    add                 rsp, 32*2 ; org_w < 8: only two column pairs were pushed
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    vextracti128        xm0, m0, 1
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    lea                dstq, [dstq+strideq*8] ; continue with rows 8-15
+    vextracti128        xm1, m1, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    movd   [dstq+strideq*4], xm1
+    pextrd [dstq+r3       ], xm1, 1
+    pextrd [dstq+r2*2     ], xm1, 2
+    pextrd [dstq+r4       ], xm1, 3
+    jmp .h16_end
+.h16_w8_loop:
+    mova                 m0, [rsp+32*0]
+    mova                 m2, [rsp+32*1]
+    punpcklbw            m1, m2, m0
+    punpckhbw            m2, m0
+    punpcklbw            m0, m1, m2
+    punpckhbw            m1, m2
+.h16_w8: ; 8 columns per iteration
+    mova                 m2, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    lea                  r6, [dstq+strideq*8] ; r6 = pointer to rows 8-15
+    add                 rsp, 32*4 ; release the four column pairs consumed by this iteration
+    punpcklbw            m3, m4, m2
+    punpckhbw            m4, m2
+    punpcklbw            m2, m3, m4
+    punpckhbw            m3, m4
+    punpckldq            m4, m2, m0
+    punpckhdq            m2, m0
+    punpckldq            m0, m3, m1
+    punpckhdq            m3, m1
+    movq   [dstq+strideq*0], xm4
+    movhps [dstq+strideq*1], xm4
+    vextracti128        xm4, m4, 1
+    movq   [dstq+strideq*2], xm2
+    movhps [dstq+r2       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+strideq*4], xm0
+    movhps [dstq+r3       ], xm0
+    vextracti128        xm0, m0, 1
+    movq   [dstq+r2*2     ], xm3
+    movhps [dstq+r4       ], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*0], xm4
+    movhps   [r6+strideq*1], xm4
+    movq     [r6+strideq*2], xm2
+    movhps   [r6+r2       ], xm2
+    movq     [r6+strideq*4], xm0
+    movhps   [r6+r3       ], xm0
+    movq     [r6+r2*2     ], xm3
+    movhps   [r6+r4       ], xm3
+    sub                dstq, 8 ; move 8 columns to the left
+    sub              org_wd, 8
+    jge .h16_w8_loop
+.h16_end:
+    RET
+ALIGN function_align
+.h32: ; height-32 handler: unless the edge filter is disabled, filter 64 edge bytes with the fixed z_filter_k+4*2 taps into a stack buffer, then fall through to .h32_main
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -96, 15
+    lea            maxbased, [wq+31]
+    and            maxbased, 31
+    or             maxbased, 32 ; imin(w+31, 63)
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h32_main
+    vbroadcasti128       m0, [pb_0to15]
+    mov                 r4d, 21
+    mov                 r5d, 3
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    sub                 r4d, wd ; 21-w
+    cmovg               r5d, r4d ; r5d = 21-w when that is positive, else 3
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    sub                 r4d, 8 ; 13-w
+    movd                xm1, r5d
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movd                xm2, r4d
+    vpbroadcastb         m1, xm1
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    vpbroadcastb         m2, xm2
+    pmaxsb               m1, m0 ; clip 16x32 and (32|64)x32
+    movu                 m7, [z_filter_s+4]
+    pshufb              m11, m1
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vinserti128          m7, [z_filter_s+16], 0
+    pmaxsb               m2, m0 ; clip 8x32
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0] ; first tap pair
+    pshufb              m12, m2
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    shufps               m8, m7, q1021
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1] ; second tap pair
+    pshufb              m10, m11, m8
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m8
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m8
+    pmaddubsw           m10, m9
+    shufps               m8, m7, q2121
+    paddw                m1, m10
+    pshufb              m10, m14, m8
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2] ; third tap pair
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    movzx               r4d, byte [tlq-63] ; two edge bytes for the scalar fix-up below
+    movzx               r2d, byte [tlq-62]
+    paddw                m0, m11
+    paddw                m2, m12
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m7
+    pmaddubsw           m14, m9
+    paddw                m1, m13
+    paddw                m6, m14
+    sub                 r2d, r4d
+    lea                 r2d, [r2+r4*8+4] ; edge case for 64x32
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    shr                 r2d, 3
+    mov            [rsp+31], r2b ; scalar-computed byte stored next to the filtered block
+    lea                 tlq, [rsp+95] ; from here on the edge is read from the stack copy
+    mov            [tlq-65], r4b ; original tl[-63] byte
+    mov                 r4d, 65
+    cmp                  wd, 32
+    cmova          maxbased, r4d ; w > 32: maxbase = 65
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0 ; 64 filtered edge bytes
+    mova           [tlq-31], m1
+.h32_main: ; height-32 prediction loop; each iteration pushes 1 column (32 bytes) onto the stack for .h32_transpose
+    movd                xm6, dyd
+    mov                  r4, tlq
+    sub                 tlq, 8
+    neg                 dyq ; negative step: bases are used as decreasing offsets from tlq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4] ; byte at (edge pointer - maxbase), used as the fill value
+    lea                  r4, [dyq+63] ; scalar ypos
+    movd                xm9, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq ; maxbased = -(maxbase*64 - 63), the scalar loop bound
+    vpbroadcastw         m9, xm9
+    psubw                m9, [z_base_inc]
+    mova                m11, m6 ; per-iteration step = dy (one column per iteration)
+    psubw               m10, m9, m3 ; 64*8
+.h32_loop:
+    mov                  r5, r4
+    sar                  r5, 6 ; base of this column
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1 ; byte pairs of (64 - masked ypos, masked ypos) weights
+    movu                xm0, [tlq+r5- 0]
+    vinserti128          m0, [tlq+r5-16], 1
+    movu                xm1, [tlq+r5- 8]
+    vinserti128          m1, [tlq+r5-24], 1
+    sub                 rsp, 32 ; reserve 32 bytes for this column
+    add                  r4, dyq
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2 ; word masks -> byte mask matching the packed pixels
+    paddsw               m6, m11
+    vpblendvb            m0, m7, m0, m1 ; bytes failing the compare take the fill value
+    mova              [rsp], m0
+    dec                  wd
+    jz .h32_transpose
+    cmp                 r4d, maxbased
+    jg .h32_loop ; keep interpolating while the scalar ypos is above the bound
+.h32_end_loop: ; push the fill value for every remaining column
+    sub                 rsp, 32
+    mova              [rsp], m7
+    dec                  wd
+    jg .h32_end_loop
+.h32_transpose: ; pop the columns pushed by .h32_main, 8 at a time, and write them out as 32 rows of 8 bytes
+    lea                dstq, [dstq+org_wq-8] ; start at dst + org_w - 8 and walk left
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h32_w8_loop:
+    mova                 m7, [rsp+32*0]
+    mova                 m6, [rsp+32*1]
+    mova                 m5, [rsp+32*2]
+    mova                 m4, [rsp+32*3]
+    mova                 m3, [rsp+32*4]
+    mova                 m2, [rsp+32*5]
+    mova                 m1, [rsp+32*6]
+    mova                 m0, [rsp+32*7]
+    lea                  r6, [dstq+strideq*8] ; r6 = pointer to rows 8-15
+    add                 rsp, 32*8 ; release the eight columns consumed by this iteration
+    punpcklbw            m8, m0, m1
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    movq   [dstq+strideq*0], xm6
+    movhps [dstq+strideq*1], xm6
+    vextracti128        xm6, m6, 1 ; the high lane is stored 16 rows further down
+    movq   [dstq+strideq*2], xm7
+    movhps [dstq+r2       ], xm7
+    vextracti128        xm7, m7, 1
+    movq   [dstq+strideq*4], xm2
+    movhps [dstq+r3       ], xm2
+    vextracti128        xm2, m2, 1
+    movq   [dstq+r2*2     ], xm8
+    movhps [dstq+r4       ], xm8
+    vextracti128        xm8, m8, 1
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    vextracti128        xm3, m3, 1
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    vextracti128        xm1, m1, 1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    vextracti128        xm5, m5, 1
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    lea                  r6, [r6+strideq*8] ; rows 16-23
+    vextracti128        xm0, m0, 1
+    movq     [r6+strideq*0], xm6
+    movhps   [r6+strideq*1], xm6
+    movq     [r6+strideq*2], xm7
+    movhps   [r6+r2       ], xm7
+    movq     [r6+strideq*4], xm2
+    movhps   [r6+r3       ], xm2
+    movq     [r6+r2*2     ], xm8
+    movhps   [r6+r4       ], xm8
+    lea                  r6, [r6+strideq*8] ; rows 24-31
+    movq     [r6+strideq*0], xm3
+    movhps   [r6+strideq*1], xm3
+    movq     [r6+strideq*2], xm1
+    movhps   [r6+r2       ], xm1
+    movq     [r6+strideq*4], xm5
+    movhps   [r6+r3       ], xm5
+    movq     [r6+r2*2     ], xm0
+    movhps   [r6+r4       ], xm0
+    sub                dstq, 8 ; move 8 columns to the left
+    sub              org_wd, 8
+    jg .h32_w8_loop
+    RET
+ALIGN function_align
+.h64:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK        -128, 16
+    lea            maxbased, [wq+63]
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .h64_main
+    mov                 r4d, 21
+    vpbroadcastb       xm11, [tlq-127]
+    vpblendd           xm11, [tlq-130], 0x0e ; 120-127
+    sub                 r4d, wd ; 21-w
+    mov                 r5d, 3
+    vinserti128         m11, [tlq-116], 1    ; 104-111
+    movu                 m7, [z_filter_s+4]
+    cmp                  wd, 32
+    cmove               r4d, r5d
+    vinserti128          m8, m7, [z_filter_s+8], 1
+    vbroadcasti128       m6, [pb_0to15]
+    movd                xm1, r4d
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    movu               xm12, [tlq-122]       ; 112-119
+    vinserti128         m12, [tlq-108], 1    ;  96-103
+    vpbroadcastb         m1, xm1
+    movu               xm13, [tlq- 98]       ;  88- 95
+    vinserti128         m13, [tlq- 84], 1    ;  72- 79
+    movu               xm14, [tlq- 90]       ;  80- 87
+    vinserti128         m14, [tlq- 76], 1    ;  64- 71
+    vinserti128          m7, [z_filter_s+16], 0
+    pshufb               m0, m11, m8
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pmaxsb               m1, m6 ; clip (16|32)x64
+    pshufb              m13, m1
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    shufps              m15, m8, m7, q1021
+    pshufb              m10, m11, m15
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
+    shufps              m10, m8, m7, q2132
+    pshufb              m11, m10
+    pmaddubsw           m11, m9
+    pshufb              m12, m10
+    pmaddubsw           m12, m9
+    pshufb              m13, m10
+    pmaddubsw           m13, m9
+    pshufb              m14, m10
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    movu               xm11, [tlq-66]    ; 56-63
+    vinserti128         m11, [tlq-52], 1 ; 40-47
+    movu               xm12, [tlq-58]    ; 48-55
+    vinserti128         m12, [tlq-44], 1 ; 32-39
+    movu               xm13, [tlq-34]    ; 24-31
+    vinserti128         m13, [tlq-20], 1 ;  8-15
+    movu               xm14, [tlq-28]    ; 16-23
+    vinserti128         m14, [tlq-14], 1 ;  0- 7
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    lea                 tlq, [rsp+127]
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova          [tlq-127], m0
+    mova          [tlq- 95], m1
+    pshufb               m0, m11, m10
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m10
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m10
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m7
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb               m7, m11, m15
+    pmaddubsw            m7, m9
+    paddw                m0, m7
+    pshufb               m7, m12, m15
+    pmaddubsw            m7, m9
+    paddw                m2, m7
+    pshufb               m7, m13, m15
+    pmaddubsw            m7, m9
+    paddw                m1, m7
+    pshufb               m7, m14, m10
+    pmaddubsw            m7, m9
+    paddw                m6, m7
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m15
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq-63], m0
+    mova           [tlq-31], m1
+.h64_main:
+    movd                xm6, dyd
+    mov                  r4, tlq
+    sub                 tlq, 24
+    neg                 dyq
+    vpbroadcastw         m6, xm6
+    sub                  r4, maxbaseq
+    shl            maxbased, 6
+    vpbroadcastb         m7, [r4]
+    lea                  r4, [dyq+63]
+    movd               xm10, maxbased
+    sub            maxbased, 63
+    vbroadcasti128       m8, [z3_shuf]
+    neg            maxbaseq
+    mova                xm1, [z_base_inc+16]
+    vinserti128          m1, [z_base_inc], 1
+    vpbroadcastw        m10, xm10
+    psllw                m0, m3, 2   ; 64*32
+    psubw               m10, m1
+    mova                m14, m6
+    psubw               m11, m10, m3 ; 64*8
+    psubw               m12, m10, m0
+    psubw               m13, m11, m0
+.h64_loop:
+    mov                  r5, r4
+    sar                  r5, 6
+    movu                 m0, [tlq+r5-0]
+    movu                 m1, [tlq+r5-8]
+    pand                 m2, m4, m6
+    psubw                m9, m5, m2
+    psllw                m2, 8
+    por                  m9, m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m10, m6
+    pcmpgtw              m2, m11, m6
+    packsswb             m1, m2
+    vpblendvb            m2, m7, m0, m1
+    movu                 m0, [tlq+r5-32]
+    movu                 m1, [tlq+r5-40]
+    add                  r4, dyq
+    sub                 rsp, 64
+    mova           [rsp+32], m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pcmpgtw              m9, m12, m6
+    pcmpgtw              m2, m13, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    paddsw               m6, m14
+    packsswb             m9, m2
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m9
+    mova              [rsp], m0
+    dec                  wd
+    jz .h64_transpose
+    cmp                 r4d, maxbased
+    jg .h64_loop
+.h64_end_loop:
+    sub                 rsp, 64
+    mova           [rsp+32], m7
+    mova           [rsp+ 0], m7
+    dec                  wd
+    jg .h64_end_loop
+.h64_transpose:
+    lea                  r2, [strideq*3]
+    lea                  r3, [strideq*5]
+    imul                 r5, strideq, -8
+    lea                dstq, [dstq+org_wq-16]
+    lea                  r4, [strideq+r2*2] ; stride*7
+.h64_transpose_loop0:
+    lea                  r6, [rsp+16*3]
+.h64_transpose_loop:
+    mova                xm0, [r6+64*15]
+    vinserti128          m0, [r6+64* 7], 1
+    mova                xm1, [r6+64*14]
+    vinserti128          m1, [r6+64* 6], 1
+    mova                xm2, [r6+64*13]
+    vinserti128          m2, [r6+64* 5], 1
+    mova                xm3, [r6+64*12]
+    vinserti128          m3, [r6+64* 4], 1
+    mova                xm4, [r6+64*11]
+    vinserti128          m4, [r6+64* 3], 1
+    mova                xm5, [r6+64*10]
+    vinserti128          m5, [r6+64* 2], 1
+    mova                xm6, [r6+64* 9]
+    vinserti128          m6, [r6+64* 1], 1
+    mova                xm7, [r6+64* 8]
+    vinserti128          m7, [r6+64* 0], 1
+    sub                  r6, 16
+    punpcklbw            m8, m0, m1
+    punpckhbw            m0, m1
+    punpcklbw            m1, m2, m3
+    punpckhbw            m2, m3
+    punpcklbw            m3, m4, m5
+    punpckhbw            m4, m5
+    punpcklbw            m5, m6, m7
+    punpckhbw            m6, m7
+    punpcklwd            m7, m8, m1
+    punpckhwd            m8, m1
+    punpcklwd            m1, m0, m2
+    punpckhwd            m0, m2
+    punpcklwd            m2, m3, m5
+    punpckhwd            m3, m5
+    punpcklwd            m5, m4, m6
+    punpckhwd            m4, m6
+    punpckldq            m6, m7, m2
+    punpckhdq            m7, m2
+    punpckldq            m2, m8, m3
+    punpckhdq            m8, m3
+    punpckldq            m3, m1, m5
+    punpckhdq            m1, m5
+    punpckldq            m5, m0, m4
+    punpckhdq            m0, m4
+    vpermq               m6, m6, q3120
+    vpermq               m7, m7, q3120
+    vpermq               m2, m2, q3120
+    vpermq               m8, m8, q3120
+    vpermq               m3, m3, q3120
+    vpermq               m1, m1, q3120
+    vpermq               m5, m5, q3120
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm6
+    vextracti128 [dstq+strideq*1], m6, 1
+    mova         [dstq+strideq*2], xm7
+    vextracti128 [dstq+r2       ], m7, 1
+    mova         [dstq+strideq*4], xm2
+    vextracti128 [dstq+r3       ], m2, 1
+    mova         [dstq+r2*2     ], xm8
+    vextracti128 [dstq+r4       ], m8, 1
+    sub               dstq, r5
+    mova         [dstq+strideq*0], xm3
+    vextracti128 [dstq+strideq*1], m3, 1
+    mova         [dstq+strideq*2], xm1
+    vextracti128 [dstq+r2       ], m1, 1
+    mova         [dstq+strideq*4], xm5
+    vextracti128 [dstq+r3       ], m5, 1
+    mova         [dstq+r2*2     ], xm0
+    vextracti128 [dstq+r4       ], m0, 1
+    sub                dstq, r5
+    cmp                  r6, rsp
+    jae .h64_transpose_loop
+    add                 rsp, 64*16
+    lea                dstq, [dstq+r5*8-16]
+    sub              org_wd, 16
+    jg .h64_transpose_loop0
+.h64_end:
+    RET
+
 %macro FILTER_XMM 4 ; dst, src, tmp, shuf
 %ifnum %4
     pshufb             xm%2, xm%4
 %else
     pshufb             xm%2, %4
 %endif
     pshufd             xm%1, xm%2, q0000 ; p0 p1
     pmaddubsw          xm%1, xm2
@@ -2163,17 +3322,17 @@ ALIGN function_align
     paddw               m%1, m%3
     pshufd              m%3, m%2, q2222
     pmaddubsw           m%3, m4
     paddw               m%1, m%3
     pshufd              m%3, m%2, q3333
     pmaddubsw           m%3, m5
     paddw               m%1, m%3
     psraw               m%1, 4
-    vperm2i128          m%3, m%1, m%1, 0x01
+    vpermq              m%3, m%1, q1032
     packuswb            m%1, m%3
 %endmacro
 
 ; The ipred_filter SIMD processes 4x2 blocks in the following order which
 ; increases parallelism compared to doing things row by row. One redundant
 ; block is calculated for w8 and w16, two for w32.
 ;     w4     w8       w16             w32
 ;     1     1 2     1 2 3 5     1 2 3 5 b c d f
@@ -2244,20 +3403,22 @@ ALIGN function_align
     movq   [dstq+strideq*0], xm6
     movhps [dstq+strideq*1], xm6
     lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
     jg .w8_loop
     RET
 ALIGN function_align
 .w16:
+%if WIN64
     %assign stack_offset stack_offset - stack_size_padded
     %assign xmm_regs_used 15
     %assign stack_size_padded 0x98
     SUB                 rsp, stack_size_padded
+%endif
     sub                  hd, 2
     TAIL_CALL .w16_main, 0
 .w16_main:
 %if WIN64
     movaps       [rsp+0xa8], xmm6
     movaps       [rsp+0xb8], xmm7
     movaps       [rsp+0x28], xmm8
     movaps       [rsp+0x38], xmm9
--- a/third_party/dav1d/src/x86/ipred_init_tmpl.c
+++ b/third_party/dav1d/src/x86/ipred_init_tmpl.c
@@ -34,53 +34,65 @@ decl_angular_ipred_fn(dav1d_ipred_dc_top
 decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
 decl_angular_ipred_fn(dav1d_ipred_h_avx2);
 decl_angular_ipred_fn(dav1d_ipred_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_paeth_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
 decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
 decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
 
 decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
 decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2);
 decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
 decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
 
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2);
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2);
 
 decl_pal_pred_fn(dav1d_pal_pred_avx2);
 
+decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
 
 void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH == 8
-    c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
+    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_ssse3;
+    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_ssse3;
+    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_ssse3;
+    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_ssse3;
+    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_ssse3;
+    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
 #if BITDEPTH == 8 && ARCH_X86_64
     c->intra_pred[DC_PRED]       = dav1d_ipred_dc_avx2;
     c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_avx2;
     c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_avx2;
     c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_avx2;
     c->intra_pred[HOR_PRED]      = dav1d_ipred_h_avx2;
     c->intra_pred[VERT_PRED]     = dav1d_ipred_v_avx2;
     c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_avx2;
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_avx2;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
     c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2;
+    c->intra_pred[Z3_PRED]       = dav1d_ipred_z3_avx2;
     c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;
 
     c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;
     c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_avx2;
     c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_avx2;
     c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
 
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2;
--- a/third_party/dav1d/src/x86/ipred_ssse3.asm
+++ b/third_party/dav1d/src/x86/ipred_ssse3.asm
@@ -24,31 +24,42 @@
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
 SECTION_RODATA 16
 
+pb_128   : times 8 db 128
+pd_32768 : times 1 dd 32768
+
 %macro JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - 2*4)
     %xdefine %%base mangle(private_prefix %+ _%1_%2)
     %%table:
     %rep %0 - 2
         dd %%base %+ .%3 - (%%table - 2*4)
         %rotate 1
     %endrep
 %endmacro
 
-JMP_TABLE      ipred_h,  ssse3, w4, w8, w16, w32, w64
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+
+JMP_TABLE ipred_h,       ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc,      ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+                                s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64
 
 SECTION .text
 
-
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
 %macro IPRED_SET   3                                          ; width, stride, stride size pshuflw_imm8
     pshuflw                      m1, m0, %3                   ; extend 8 byte for 2 pos
     punpcklqdq                   m1, m1
     mova           [dstq +      %2], m1
 %if %1 > 16
     mova           [dstq + 16 + %2], m1
 %endif
 %if %1 > 32
@@ -88,25 +99,374 @@ SECTION .text
     lea                        dstq, [dstq+strideq*4]
     sub                          hd, 4
     jg .w%1
     RET
 %endmacro
 
 INIT_XMM ssse3
 cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
-    lea                          r5, [ipred_h_ssse3_table]
+    LEA                          r5, ipred_h_ssse3_table ; LEA is a macro, not the lea instruction; presumably the x86inc helper that also works without rip-relative addressing - TODO confirm
     tzcnt                        wd, wm
     movifnidn                    hd, hm
     movsxd                       wq, [r5+wq*4]
     add                          wq, r5
     lea                    stride3q, [strideq*3]
     jmp                          wq
 .w4:
     IPRED_H                       4
 .w8:
     IPRED_H                       8
 .w16:
     IPRED_H                      16
 .w32:
     IPRED_H                      32
 .w64:
     IPRED_H                      64
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3 ; vertical prediction: every output row is a copy of the bytes starting at tl+1
+    LEA                  r5, ipred_dc_splat_ssse3_table ; base used to index the .s4-.s64 entries of the ipred_dc jump table
+    tzcnt                wd, wm                         ; ctz(width) is the table index
+    movu                 m0, [tlq+ 1]                   ; 64 bytes are loaded unconditionally; the selected .sN loop
+    movu                 m1, [tlq+17]                   ; stores only as many of m0-m3 as its width needs
+    movu                 m2, [tlq+33]
+    movu                 m3, [tlq+49]
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]                  ; 32-bit table entry, stored relative to the table base
+    add                  wq, r5                         ; -> absolute address of the .sN store loop in ipred_dc
+    lea            stride3q, [strideq*3]
+    jmp                  wq                             ; the store loop ends with RET, so this is a tail jump
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3 ; DC prediction: fill the block with the rounded mean of the h bytes preceding tl and the w bytes starting at tl+1
+    movifnidn                    hd, hm
+    movifnidn                    wd, wm
+    tzcnt                       r6d, hd                            ; ctz(height): index of the .hN table entry
+    lea                         r5d, [wq+hq]                       ; r5d = width + height
+    movd                         m4, r5d
+    tzcnt                       r5d, r5d
+    movd                         m5, r5d                            ; m5 = ctz(width + height), shift count for "psrlw m0, m5"
+    LEA                          r5, ipred_dc_ssse3_table
+    tzcnt                        wd, wd
+    movsxd                       r6, [r5+r6*4]
+    movsxd                       wq, [r5+wq*4+20]                   ; +20 skips the five .hN entries to reach the .wN entries
+    pcmpeqd                      m3, m3                             ; m3 = all ones, i.e. -1 in every byte and word
+    psrlw                        m4, 1                             ; dc = (width + height) >> 1;
+    add                          r6, r5
+    add                          wq, r5
+    lea                    stride3q, [strideq*3]
+    jmp r6                                                          ; sum the left edge first (.hN); each .hN then jumps to wq (.wN)
+.h4:
+    movd                         m0, [tlq-4]
+    pmaddubsw                    m0, m3                             ; each word = -(sum of an adjacent byte pair)
+    jmp                          wq
+.w4:
+    movd                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    psubw                        m0, m4                             ; negated left sums minus the rounding term
+    paddw                        m0, m1                             ; plus the negated top sums
+    pmaddwd                      m0, m3                             ; multiply words by -1 and add pairs: positive dword sums
+    cmp                          hd, 4
+    jg .w4_mul
+    psrlw                        m0, 3                             ; dc >>= ctz(width + height);
+    jmp .w4_end
+.w4_mul:                                                           ; height > 4
+    punpckhqdq                   m1, m0, m0
+    paddw                        m0, m1                             ; fold the high qword into the low qword
+    psrlq                        m1, m0, 32
+    paddw                        m0, m1                             ; low word = total sum including the rounding term
+    psrlw                        m0, 2
+    mov                         r6d, 0x5556                         ; 21846, just above 65536/3
+    mov                         r2d, 0x3334                         ; 13108, just above 65536/5
+    test                         hd, 8
+    cmovz                       r6d, r2d                            ; height bit 3 clear -> use the /5 multiplier
+    movd                         m5, r6d
+    pmulhuw                      m0, m5                             ; high 16 bits of the unsigned product: approximate divide by 3 or 5
+.w4_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1                             ; all-zero shuffle mask: broadcast byte 0 to every byte
+.s4:                                                               ; store loop, 4 rows per iteration; also reachable through ipred_dc_splat_ssse3_table
+    movd           [dstq+strideq*0], m0
+    movd           [dstq+strideq*1], m0
+    movd           [dstq+strideq*2], m0
+    movd           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s4
+    RET
+ALIGN function_align
+.h8:
+    movq                         m0, [tlq-8]
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w8:
+    movq                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    psubw                        m4, m0                             ; m4 = rounding term + (positive) left sums
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4                             ; low qword = -(left sums + rounding term)
+    paddw                        m0, m1                             ; plus the negated top sums
+    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3                             ; negate and pair-add: low dword = positive total
+    psrlw                        m0, m5                             ; >> ctz(width + height)
+    cmp                          hd, 8
+    je .w8_end                                                      ; width == height: width + height is a power of two, the shift is the whole divide
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    cmp                          hd, 32
+    cmovz                       r6d, r2d                            ; height == 32 -> /5 multiplier, otherwise /3
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w8_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1                             ; broadcast byte 0
+.s8:                                                               ; store loop, 4 rows of 8 bytes per iteration
+    movq           [dstq+strideq*0], m0
+    movq           [dstq+strideq*1], m0
+    movq           [dstq+strideq*2], m0
+    movq           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s8
+    RET
+ALIGN function_align
+.h16:
+    mova                         m0, [tlq-16]                       ; NOTE(review): aligned load - assumes tl is 16-byte aligned; confirm against callers
+    pmaddubsw                    m0, m3
+    jmp                          wq
+.w16:
+    movu                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1                             ; negated left + top pair sums
+    psubw                        m4, m0                             ; m4 = rounding term + positive sums
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4                             ; low qword = -(all sums + rounding term)
+    pshuflw                      m1, m0, q1032                  ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3                             ; negate and pair-add: low dword = positive total
+    psrlw                        m0, m5                             ; >> ctz(width + height)
+    cmp                          hd, 16
+    je .w16_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 8|32
+    cmovz                       r6d, r2d                            ; neither bit 3 nor bit 5 of height set -> /5 multiplier
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w16_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1                             ; broadcast byte 0
+.s16:                                                              ; store loop, 4 rows of 16 bytes per iteration
+    mova           [dstq+strideq*0], m0
+    mova           [dstq+strideq*1], m0
+    mova           [dstq+strideq*2], m0
+    mova           [dstq+stride3q ], m0
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s16
+    RET
+ALIGN function_align
+.h32:
+    mova                         m0, [tlq-32]
+    pmaddubsw                    m0, m3
+    mova                         m2, [tlq-16]
+    pmaddubsw                    m2, m3
+    paddw                        m0, m2                             ; negated pair sums of all 32 left bytes
+    jmp wq
+.w32:
+    movu                         m1, [tlq+1]
+    pmaddubsw                    m1, m3
+    movu                         m2, [tlq+17]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    paddw                        m0, m1                             ; negated left + top pair sums
+    psubw                        m4, m0                             ; m4 = rounding term + positive sums
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4                             ; low qword = -(all sums + rounding term)
+    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3                             ; negate and pair-add: low dword = positive total
+    psrlw                        m0, m5                             ; >> ctz(width + height)
+    cmp                          hd, 32
+    je .w32_end
+    lea                         r2d, [hq*2]                         ; NOTE(review): dead - r2d is overwritten two lines below before being read
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 64|16
+    cmovz                       r6d, r2d                            ; neither bit 4 nor bit 6 of height set -> /5 multiplier
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w32_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1                             ; broadcast byte 0
+    mova                         m1, m0                             ; .s32 stores m0 and m1 per row
+.s32:                                                              ; store loop, 4 rows of 32 bytes per iteration
+    mova                     [dstq], m0
+    mova                  [dstq+16], m1
+    mova             [dstq+strideq], m0
+    mova          [dstq+strideq+16], m1
+    mova           [dstq+strideq*2], m0
+    mova        [dstq+strideq*2+16], m1
+    mova            [dstq+stride3q], m0
+    mova         [dstq+stride3q+16], m1
+    lea                        dstq, [dstq+strideq*4]
+    sub                          hd, 4
+    jg .s32
+    RET
+ALIGN function_align
+.h64:
+    mova                         m0, [tlq-64]
+    mova                         m1, [tlq-48]
+    pmaddubsw                    m0, m3
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    mova                         m1, [tlq-32]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1
+    mova                         m1, [tlq-16]
+    pmaddubsw                    m1, m3
+    paddw                        m0, m1                             ; negated pair sums of all 64 left bytes
+    jmp wq
+.w64:
+    movu                         m1, [tlq+ 1]
+    movu                         m2, [tlq+17]
+    pmaddubsw                    m1, m3
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    movu                         m2, [tlq+33]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    movu                         m2, [tlq+49]
+    pmaddubsw                    m2, m3
+    paddw                        m1, m2
+    paddw                        m0, m1                             ; negated left + top pair sums
+    psubw                        m4, m0                             ; m4 = rounding term + positive sums
+    punpckhqdq                   m0, m0
+    psubw                        m0, m4                             ; low qword = -(all sums + rounding term)
+    pshuflw                      m1, m0, q1032                   ; psrlq  m1, m0, 32
+    paddw                        m0, m1
+    pmaddwd                      m0, m3                             ; negate and pair-add: low dword = positive total
+    psrlw                        m0, m5                             ; >> ctz(width + height)
+    cmp                          hd, 64
+    je .w64_end
+    mov                         r6d, 0x5556
+    mov                         r2d, 0x3334
+    test                         hd, 32
+    cmovz                       r6d, r2d                            ; bit 5 of height clear -> /5 multiplier
+    movd                         m1, r6d
+    pmulhuw                      m0, m1
+.w64_end:
+    pxor                         m1, m1
+    pshufb                       m0, m1                             ; broadcast byte 0
+    mova                         m1, m0                             ; .s64 stores m0-m3 per row
+    mova                         m2, m0
+    mova                         m3, m0
+.s64:                                                              ; store loop, 2 rows of 64 bytes per iteration
+    mova                     [dstq], m0
+    mova                  [dstq+16], m1
+    mova                  [dstq+32], m2
+    mova                  [dstq+48], m3
+    mova             [dstq+strideq], m0
+    mova          [dstq+strideq+16], m1
+    mova          [dstq+strideq+32], m2
+    mova          [dstq+strideq+48], m3
+    lea                        dstq, [dstq+strideq*2]
+    sub                          hd, 2
+    jg .s64
+    RET
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_left, 3, 7, 6, dst, stride, tl, w, h, stride3 ; DC prediction from the h bytes preceding tl only; the .hN labels are also entered from ipred_dc_top
+    LEA                  r5, ipred_dc_left_ssse3_table
+    mov                  hd, hm                ; zero upper half
+    tzcnt               r6d, hd
+    sub                 tlq, hq                ; tlq = tl - height: first byte to be summed
+    tzcnt                wd, wm
+    movu                 m0, [tlq]
+    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768] ; pd_32768 addressed relative to the table base in r5
+    movd                 m2, r6d
+    psrld                m3, m2                ; m3 = 32768 >> ctz(height), multiplier for the pmulhrsw in .h4
+    movsxd               r6, [r5+r6*4]
+    pcmpeqd              m2, m2                ; m2 = all ones (-1)
+    pmaddubsw            m0, m2                ; each word = -(sum of an adjacent byte pair)
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table ; rebase r5 onto the .sN entries of the ipred_dc table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5                ; wq = address of the .sN store loop for this width
+    jmp                  r6                    ; enter the fall-through chain below at the label matching the height
+.h64:
+    movu                 m1, [tlq+48]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+    movu                 m1, [tlq+32]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h32:
+    movu                 m1, [tlq+16]                           ; unaligned when jumping here from dc_top
+    pmaddubsw            m1, m2
+    paddw                m0, m1
+.h16:
+    pshufd               m1, m0, q3232                          ; high qword copied into the low qword (like psrldq m1, m0, 8)
+    paddw                m0, m1
+.h8:
+    pshuflw              m1, m0, q1032                          ; psrlq               m1, m0, 32
+    paddw                m0, m1
+.h4:
+    pmaddwd              m0, m2                ; negate and pair-add: low dword = positive sum
+    pmulhrsw             m0, m3                ; with m3 = 32768 >> k this yields (sum + (1 << (k-1))) >> k, a rounded divide by 1 << k
+    lea            stride3q, [strideq*3]
+    pxor                 m1, m1
+    pshufb               m0, m1                ; all-zero shuffle mask: broadcast byte 0 to every byte
+    mova                 m1, m0                ; m0-m3 all hold the fill value for the widest store loop
+    mova                 m2, m0
+    mova                 m3, m0
+    jmp                  wq                    ; tail jump into the .sN store loop of ipred_dc
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_128, 2, 7, 6, dst, stride, tl, w, h, stride3 ; fill the block with the constant byte 128; no edge pixels are read
+    LEA                  r5, ipred_dc_splat_ssse3_table
+    tzcnt                wd, wm                ; ctz(width) is the table index
+    movifnidn            hd, hm
+    movsxd               wq, [r5+wq*4]
+    movddup              m0, [r5-ipred_dc_splat_ssse3_table+pb_128] ; pb_128 is 8 bytes of 128; movddup duplicates that qword into both halves
+    mova                 m1, m0                ; m0-m3 all hold the fill value for the widest store loop
+    mova                 m2, m0
+    mova                 m3, m0
+    add                  wq, r5                ; wq = address of the .sN store loop in ipred_dc
+    lea            stride3q, [strideq*3]
+    jmp                  wq                    ; the store loop ends with RET, so this is a tail jump
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+;                                    const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc_top, 3, 7, 6, dst, stride, tl, w, h ; DC prediction from the w bytes starting at tl+1 only; reuses the summing code of ipred_dc_left
+    LEA                  r5, ipred_dc_left_ssse3_table
+    tzcnt                wd, wm
+    inc                 tlq                    ; tlq = tl + 1: first byte to be summed
+    movu                 m0, [tlq]
+    movifnidn            hd, hm
+    movd                 m3, [r5-ipred_dc_left_ssse3_table+pd_32768] ; pd_32768 addressed relative to the table base in r5
+    movd                 m2, wd
+    psrld                m3, m2                ; m3 = 32768 >> ctz(width), multiplier for the pmulhrsw in ipred_dc_left's .h4
+    movsxd               r6, [r5+wq*4]         ; ipred_dc_left's table indexed by ctz(width): picks the .hN label with N = width
+    pcmpeqd              m2, m2                ; m2 = all ones (-1)
+    pmaddubsw            m0, m2                ; each word = -(sum of an adjacent byte pair)
+    add                  r6, r5
+    add                  r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table ; rebase r5 onto the .sN entries of the ipred_dc table
+    movsxd               wq, [r5+wq*4]
+    add                  wq, r5                ; wq = address of the .sN store loop for this width
+    jmp                  r6                    ; continue in ipred_dc_left, which finishes with "jmp wq"
+
--- a/third_party/dav1d/src/x86/itx.asm
+++ b/third_party/dav1d/src/x86/itx.asm
@@ -108,16 +108,25 @@ pw_2440x8:  COEF_X8  2440
 pw_m601x8:  COEF_X8  -601
 pw_4052x8:  COEF_X8  4052
 
 idct64_mul: COEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
             COEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
             COEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
             COEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301
 
+pw_201_4091x8:   dw   201*8, 4091*8 ; word pairs pre-multiplied by 8; ITX_UNPACK_MULHRSW addresses the labels below relative to this first one
+pw_m601_4052x8:  dw  -601*8, 4052*8
+pw_995_3973x8:   dw   995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8:  dw  1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8:  dw  2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
 %define o_idct64_offset idct64_mul - (o_base) - 8
 
 SECTION .text
 
 ; Code size reduction trickery: Intead of using rip-relative loads with
 ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
 ; single rip-relative lea and then address things relative from that with
 ; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
@@ -210,92 +219,86 @@ SECTION .text
 %endif
     paddd               m%3, m%5
     paddd               m%2, m%5
     psrad               m%3, 12
     psrad               m%2, 12
     packssdw            m%2, m%3
 %endmacro
 
-%macro ITX_MULHRSW_SHL3 4 ; dst/src, tmp, coef[1-2]
-    vpbroadcastd        m%2, [pw_%3_%4]
-    psllw               m%2, 3
-    pmulhrsw            m%1, m%2
-%endmacro
-
 %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
     ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
     vpbroadcastd        m%6, [o(pw_2896x8)]
     paddw               m%5, m%1, m%3
     psubw               m%1, m%3
     pmulhrsw            m%1, m%6 ; t1
     pmulhrsw            m%5, m%6 ; t0
-    psubw               m%3, m%1, m%2
-    paddw               m%2, m%1
-    paddw               m%1, m%5, m%4
-    psubw               m%4, m%5, m%4
+    psubsw              m%3, m%1, m%2 ; t1 - t2, signed-saturating
+    paddsw              m%2, m%1 ; t2 + t1, signed-saturating
+    paddsw              m%1, m%5, m%4 ; t0 + t3, signed-saturating
+    psubsw              m%4, m%5, m%4 ; t0 - t3, signed-saturating
 %endmacro
 
 %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
     ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
     ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
     ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
-    paddw               m%9, m%2, m%6  ; t4
-    psubw               m%2, m%6       ; t5a
-    paddw              m%10, m%8, m%4  ; t7
-    psubw               m%8, m%4       ; t6a
+    paddsw              m%9, m%2, m%6  ; t4 (this and the next three butterflies use signed-saturating word add/sub)
+    psubsw              m%2, m%6       ; t5a
+    paddsw             m%10, m%8, m%4  ; t7
+    psubsw              m%8, m%4       ; t6a
     vpbroadcastd        m%4, [o(pw_2896x8)]
     psubw               m%6, m%1, m%5
     paddw               m%1, m%5
     psubw               m%5, m%8, m%2
     paddw               m%8, m%2
     pmulhrsw            m%1, m%4       ; t0
     pmulhrsw            m%6, m%4       ; t1
     pmulhrsw            m%8, m%4       ; t6
     pmulhrsw            m%5, m%4       ; t5
-    psubw               m%4, m%1, m%7  ; dct4 out3
-    paddw               m%1, m%7       ; dct4 out0
-    paddw               m%7, m%6, m%3  ; dct4 out1
-    psubw               m%6, m%3       ; dct4 out2
-    paddw               m%2, m%7, m%8  ; out1
-    psubw               m%7, m%8       ; out6
-    psubw               m%8, m%1, m%10 ; out7
-    paddw               m%1, m%10      ; out0
-    paddw               m%3, m%6, m%5  ; out2
-    psubw               m%6, m%5       ; out5
-    psubw               m%5, m%4, m%9  ; out4
-    paddw               m%4, m%9       ; out3
+    psubsw              m%4, m%1, m%7  ; dct4 out3 (all twelve closing butterflies use signed-saturating word add/sub)
+    paddsw              m%1, m%7       ; dct4 out0
+    paddsw              m%7, m%6, m%3  ; dct4 out1
+    psubsw              m%6, m%3       ; dct4 out2
+    paddsw              m%2, m%7, m%8  ; out1
+    psubsw              m%7, m%8       ; out6
+    psubsw              m%8, m%1, m%10 ; out7
+    paddsw              m%1, m%10      ; out0
+    paddsw              m%3, m%6, m%5  ; out2
+    psubsw              m%6, m%5       ; out5
+    psubsw              m%5, m%4, m%9  ; out4
+    paddsw              m%4, m%9       ; out3
 %endmacro
 
 ; in1 = %1, in3  = %2, in5  = %3, in7  = %4
 ; in9 = %5, in11 = %6, in13 = %7, in15 = %8
 %macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
     ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
     ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
     ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
     ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
-    psubw               m%9, m%2, m%6 ; t13
-    paddw               m%6, m%2      ; t12
-    psubw               m%2, m%8, m%4 ; t14
-    paddw               m%8, m%4      ; t15
-    psubw               m%4, m%7, m%3 ; t10
-    paddw               m%3, m%7      ; t11
-    psubw               m%7, m%1, m%5 ; t9
-    paddw               m%1, m%5      ; t8
+    psubsw              m%9, m%2, m%6 ; t13 (this group of eight butterflies uses signed-saturating word add/sub)
+    paddsw              m%6, m%2      ; t12
+    psubsw              m%2, m%8, m%4 ; t14
+    paddsw              m%8, m%4      ; t15
+    psubsw              m%4, m%7, m%3 ; t10
+    paddsw              m%3, m%7      ; t11
+    psubsw              m%7, m%1, m%5 ; t9
+    paddsw              m%1, m%5      ; t8
     ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
     ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
     vpbroadcastd       m%10, [o(pw_2896x8)]
-    psubw               m%5, m%2, m%9 ; t10
-    paddw               m%2, m%9      ; t9
-    psubw               m%9, m%1, m%3 ; t11a
-    paddw               m%1, m%3      ; t8a
-    psubw               m%3, m%7, m%4 ; t13
-    paddw               m%7, m%4      ; t14
-    psubw               m%4, m%8, m%6 ; t12a
-    paddw               m%8, m%6      ; t15a
+    psubsw              m%5, m%2, m%9 ; t10 (this group of eight butterflies also saturates; the four before the final pmulhrsw do not)
+    paddsw              m%2, m%9      ; t9
+    psubsw              m%9, m%1, m%3 ; t11a
+    paddsw              m%1, m%3      ; t8a
+    psubsw              m%3, m%7, m%4 ; t13
+    paddsw              m%7, m%4      ; t14
+    psubsw              m%4, m%8, m%6 ; t12a
+    paddsw              m%8, m%6      ; t15a
     paddw               m%6, m%3, m%5 ; t13a
     psubw               m%3, m%5      ; t10a
     paddw               m%5, m%4, m%9 ; t12
     psubw               m%4, m%9      ; t11
     REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
 %endmacro
 
 %macro WRAP_XMM 1+
@@ -450,18 +453,18 @@ ALIGN function_align
     punpcklqdq           m0, m3
     ITX_MUL2X_PACK        2, 1, 3, 4, 1567, 3784
 %if %0 == 1
     pmulhrsw             m0, m%1
 %else
     vpbroadcastd         m4, [o(pw_2896x8)]
     pmulhrsw             m0, m4     ; t0 t1
 %endif
-    psubw                m1, m0, m2 ; out3 out2
-    paddw                m0, m2     ; out0 out1
+    psubsw               m1, m0, m2 ; out3 out2
+    paddsw               m0, m2     ; out0 out1
 %endmacro
 
 %macro IADST4_1D_PACKED 0
     punpcklwd            m2, m1, m0
     punpckhwd            m3, m1, m0
     psubw                m0, m1
     punpckhqdq           m1, m1
     paddw                m1, m0 ; in0 - in2 + in3
@@ -685,57 +688,57 @@ cglobal iidentity_4x4_internal, 0, 5, 6,
     punpcklwd            m3, m1     ; in2 in6
     psubw                m1, m0, m2
     paddw                m0, m2
     punpcklqdq           m0, m1     ; in0+in4 in0-in4
     ITX_MUL2X_PACK        5, 1, 2, 6,  799, 4017, 1 ; t4a t7a
     ITX_MUL2X_PACK        4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
     ITX_MUL2X_PACK        3, 1, 2, 6, 1567, 3784    ; t3 t2
     vpbroadcastd         m6, [o(pw_2896x8)]
-    psubw                m2, m5, m4 ; t4 t7
-    paddw                m5, m4     ; t5a t6a
+    psubsw               m2, m5, m4 ; t4 t7
+    paddsw               m5, m4     ; t5a t6a
     pshufd               m4, m2, q1032
     psubw                m1, m2, m4
     paddw                m4, m2
     vpblendd             m4, m4, m1, 0xcc
     pmulhrsw             m0, m6     ; t0 t1
     pmulhrsw             m4, m6     ; t6 t5
-    psubw                m1, m0, m3 ; tmp3 tmp2
-    paddw                m0, m3     ; tmp0 tmp1
+    psubsw               m1, m0, m3 ; tmp3 tmp2
+    paddsw               m0, m3     ; tmp0 tmp1
     shufps               m2, m5, m4, q1032 ; t7 t6
     vpblendd             m5, m5, m4, 0xcc  ; t4 t5
-    psubw                m3, m0, m2 ; out7 out6
-    paddw                m0, m2     ; out0 out1
-    psubw                m2, m1, m5 ; out4 out5
-    paddw                m1, m5     ; out3 out2
+    psubsw               m3, m0, m2 ; out7 out6
+    paddsw               m0, m2     ; out0 out1
+    psubsw               m2, m1, m5 ; out4 out5
+    paddsw               m1, m5     ; out3 out2
 %endmacro
 
 %macro IADST8_1D_PACKED 0
     vpbroadcastd         m6, [o(pd_2048)]
     punpckhwd            m0, m4, m3 ; 0 7
     punpckhwd            m1, m5, m2 ; 2 5
     punpcklwd            m2, m5     ; 4 3
     punpcklwd            m3, m4     ; 6 1
     ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
     ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
     ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
     ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
-    psubw                m4, m0, m2 ; t4 t5
-    paddw                m0, m2     ; t0 t1
-    psubw                m5, m1, m3 ; t6 t7
-    paddw                m1, m3     ; t2 t3
+    psubsw               m4, m0, m2 ; t4 t5
+    paddsw               m0, m2     ; t0 t1
+    psubsw               m5, m1, m3 ; t6 t7
+    paddsw               m1, m3     ; t2 t3
     shufps               m2, m5, m4, q1032
     punpckhwd            m4, m2
     punpcklwd            m5, m2
     ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
     ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
-    psubw                m2, m0, m1        ; t2 t3
-    paddw                m0, m1            ; out0 -out7
-    psubw                m1, m4, m5        ; t7 t6
-    paddw                m4, m5            ; out6 -out1
+    psubsw               m2, m0, m1        ; t2 t3
+    paddsw               m0, m1            ; out0 -out7
+    psubsw               m1, m4, m5        ; t7 t6
+    paddsw               m4, m5            ; out6 -out1
     vpbroadcastd         m5, [o(pw_2896x8)]
     vpblendd             m3, m0, m4, 0x33  ; out6 -out7
     vpblendd             m0, m0, m4, 0xcc  ; out0 -out1
     shufps               m4, m2, m1, q1032 ; t3 t7
     vpblendd             m1, m2, m1, 0xcc  ; t2 t6
     psubw                m2, m1, m4        ; t2-t3 t6-t7
     paddw                m1, m4            ; t2+t3 t6+t7
     pmulhrsw             m2, m5            ; out4 -out5
@@ -973,37 +976,37 @@ cglobal iidentity_4x8_internal, 0, 5, 7,
     punpcklwd            m6, m2     ; dct4  in3  in1
     ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
     ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
     ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
     ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
     ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 1 ; t4a  t7a
     ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 1 ; t5a  t6a
     ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
-    psubw                m2, m8, m0 ; t9  t14
-    paddw                m8, m0     ; t8  t15
-    psubw                m0, m1, m5 ; t10 t13
-    paddw                m1, m5     ; t11 t12
+    psubsw               m2, m8, m0 ; t9  t14
+    paddsw               m8, m0     ; t8  t15
+    psubsw               m0, m1, m5 ; t10 t13
+    paddsw               m1, m5     ; t11 t12
 %if mmsize > 16
     vbroadcasti128       m5, [o(deint_shuf)]
 %else
     mova                 m5, [o(deint_shuf)]
 %endif
     pshufb               m8, m5
     pshufb               m1, m5
     vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
     ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 4   ; t9a  t14a
     vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
     ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 4   ; t10a t13a
-    psubw                m5, m7, m3 ; t5a t6a
-    paddw                m7, m3     ; t4  t7
-    psubw                m4, m8, m1 ; t11a t12a
-    paddw                m8, m1     ; t8a  t15a
-    paddw                m1, m2, m0 ; t9   t14
-    psubw                m2, m0     ; t10  t13
+    psubsw               m5, m7, m3 ; t5a t6a
+    paddsw               m7, m3     ; t4  t7
+    psubsw               m4, m8, m1 ; t11a t12a
+    paddsw               m8, m1     ; t8a  t15a
+    paddsw               m1, m2, m0 ; t9   t14
+    psubsw               m2, m0     ; t10  t13
     punpckhqdq           m0, m8, m1 ; t15a t14
     punpcklqdq           m8, m1     ; t8a  t9
     pshufd               m3, m5, q1032
     psubw                m1, m5, m3
     paddw                m3, m5
     vpblendd             m3, m3, m1, 0xcc ; t6 t5
     vpbroadcastd         m1, [o(pw_2896x8)]
     punpckhqdq           m5, m4, m2 ; t12a t13
@@ -1011,30 +1014,30 @@ cglobal iidentity_4x8_internal, 0, 5, 7,
     psubw                m4, m5, m2
     paddw                m5, m2
     pmulhrsw             m9, m1     ; t0   t1
     pmulhrsw             m3, m1     ; t6   t5
     pmulhrsw             m4, m1     ; t11  t10a
     pmulhrsw             m5, m1     ; t12  t13a
     shufps               m2, m7, m3, q1032 ; t7 t6
     vpblendd             m7, m7, m3, 0xcc  ; t4 t5
-    psubw                m1, m9, m6 ; dct4 out3 out2
-    paddw                m9, m6     ; dct4 out0 out1
-    psubw                m3, m9, m2 ; dct8 out7 out6
-    paddw                m9, m2     ; dct8 out0 out1
-    psubw                m2, m1, m7 ; dct8 out4 out5
-    paddw                m1, m7     ; dct8 out3 out2
-    psubw                m7, m9, m0 ; out15 out14
-    paddw                m0, m9     ; out0  out1
-    psubw                m6, m1, m5 ; out12 out13
-    paddw                m1, m5     ; out3  out2
-    psubw                m5, m2, m4 ; out11 out10
-    paddw                m2, m4     ; out4  out5
-    psubw                m4, m3, m8 ; out8  out9
-    paddw                m3, m8     ; out7  out6
+    psubsw               m1, m9, m6 ; dct4 out3 out2
+    paddsw               m9, m6     ; dct4 out0 out1
+    psubsw               m3, m9, m2 ; dct8 out7 out6
+    paddsw               m9, m2     ; dct8 out0 out1
+    psubsw               m2, m1, m7 ; dct8 out4 out5
+    paddsw               m1, m7     ; dct8 out3 out2
+    psubsw               m7, m9, m0 ; out15 out14
+    paddsw               m0, m9     ; out0  out1
+    psubsw               m6, m1, m5 ; out12 out13
+    paddsw               m1, m5     ; out3  out2
+    psubsw               m5, m2, m4 ; out11 out10
+    paddsw               m2, m4     ; out4  out5
+    psubsw               m4, m3, m8 ; out8  out9
+    paddsw               m3, m8     ; out7  out6
 %endmacro
 
 INV_TXFM_4X16_FN dct, dct,      0
 INV_TXFM_4X16_FN dct, identity, 15
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst
 
 cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
@@ -1145,46 +1148,46 @@ ALIGN function_align
     punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
     punpcklwd            m0, m3     ; in0  in15 in2  in13
     punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
     punpcklwd            m1, m2     ; in4  in11 in6  in9
     ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
     ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
     ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
     ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
-    psubw                m2, m0, m3 ; t9a  t8a  t11a t10a
-    paddw                m0, m3     ; t1a  t0a  t3a  t2a
-    psubw                m3, m1, m4 ; t13a t12a t15a t14a
-    paddw                m1, m4     ; t5a  t4a  t7a  t6a
+    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
+    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
+    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
+    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
     ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
     psubw                m6, m7, m5
     ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
     vpbroadcastd         m6, [o(pw_m3784_1567)]
     vpbroadcastd         m5, [o(pw_1567_3784)]
-    psubw                m4, m0, m1 ; t5   t4   t7   t6
-    paddw                m0, m1     ; t1   t0   t3   t2
-    psubw                m1, m2, m3 ; t13a t12a t15a t14a
-    paddw                m2, m3     ; t9a  t8a  t11a t10a
-    psubw                m3, m7, m6
+    psubsw               m4, m0, m1 ; t5   t4   t7   t6
+    paddsw               m0, m1     ; t1   t0   t3   t2
+    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
+    paddsw               m2, m3     ; t9a  t8a  t11a t10a
+    psubw                m3, m7, m6 ; pw_3784_m1567
     vpblendd             m6, m6, m3, 0xf0
     ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
     ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
     vbroadcasti128       m5, [o(deint_shuf)]
     pshufb               m0, m5
     pshufb               m2, m5
     vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
     vinserti128          m0, m0, xm2, 1    ; t1   t0   t9a  t8a
     vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
     vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
     vpbroadcastd         m5, [o(pw_2896x8)]
     pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
-    psubw                m1, m0, m3        ; t3a t2a t11 t10
-    paddw                m0, m3     ; -out15  out0   out14 -out1
-    paddw                m3, m4, m2 ; -out3   out12  out2  -out13
-    psubw                m4, m2            ; t6 t7 t14a t15a
+    psubsw               m1, m0, m3        ; t3a t2a t11 t10
+    paddsw               m0, m3     ; -out15  out0   out14 -out1
+    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
+    psubsw               m4, m2            ; t6 t7 t14a t15a
     shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
     vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
     paddw                m1, m2, m4
     psubw                m2, m4
     pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
     pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
     ret
 
@@ -1894,63 +1897,63 @@ ALIGN function_align
     ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
     ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
     ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
     ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
     ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
     ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
     ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
     ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
-    psubw                m4, m0, m5 ; t9a  t8a
-    paddw                m0, m5     ; t1a  t0a
-    psubw                m5, m1, m6 ; t11a t10a
-    paddw                m1, m6     ; t3a  t2a
-    psubw                m6, m2, m7 ; t13a t12a
-    paddw                m2, m7     ; t5a  t4a
-    psubw                m7, m3, m8 ; t15a t14a
-    paddw                m3, m8     ; t7a  t6a
+    psubsw               m4, m0, m5 ; t9a  t8a
+    paddsw               m0, m5     ; t1a  t0a
+    psubsw               m5, m1, m6 ; t11a t10a
+    paddsw               m1, m6     ; t3a  t2a
+    psubsw               m6, m2, m7 ; t13a t12a
+    paddsw               m2, m7     ; t5a  t4a
+    psubsw               m7, m3, m8 ; t15a t14a
+    paddsw               m3, m8     ; t7a  t6a
     vpbroadcastd        m11, [o(pw_m4017_799)]
     vpbroadcastd        m12, [o(pw_799_4017)]
     pxor                 m9, m9
     ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
-    psubw                m8, m9, m11
+    psubw                m8, m9, m11 ; pw_4017_m799
     ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
     vpbroadcastd        m11, [o(pw_m2276_3406)]
     vpbroadcastd        m12, [o(pw_3406_2276)]
     ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
-    psubw                m8, m9, m11
+    psubw                m8, m9, m11 ; pw_2276_m3406
     ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
-    psubw                m8, m1, m3 ; t7   t6
-    paddw                m1, m3     ; t3   t2
-    psubw                m3, m0, m2 ; t5   t4
-    paddw                m0, m2     ; t1   t0
-    psubw                m2, m5, m7 ; t14a t15a
-    paddw                m7, m5     ; t10a t11a
-    psubw                m5, m4, m6 ; t12a t13a
-    paddw                m4, m6     ; t8a  t9a
+    psubsw               m8, m1, m3 ; t7   t6
+    paddsw               m1, m3     ; t3   t2
+    psubsw               m3, m0, m2 ; t5   t4
+    paddsw               m0, m2     ; t1   t0
+    psubsw               m2, m5, m7 ; t14a t15a
+    paddsw               m7, m5     ; t10a t11a
+    psubsw               m5, m4, m6 ; t12a t13a
+    paddsw               m4, m6     ; t8a  t9a
     vpbroadcastd        m11, [o(pw_m3784_1567)]
     vpbroadcastd        m12, [o(pw_1567_3784)]
     ITX_MUL2X_PACK        3, 6, _, 10, 11, 12, 4 ; t4a t5a
-    psubw                m6, m9, m11
+    psubw                m6, m9, m11 ; pw_3784_m1567
     ITX_MUL2X_PACK        8, 12, _, 10, 12, 6, 4 ; t6a t7a
     vpbroadcastd        m11, [o(pw_m1567_3784)]
     vpbroadcastd        m12, [o(pw_3784_1567)]
     ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 4 ; t15 t14
-    psubw                m6, m9, m11
+    psubw                m6, m9, m11 ; pw_1567_m3784
     ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 4 ; t13 t12
     vbroadcasti128      m11, [o(deint_shuf)]
     vpbroadcastd        m12, [o(pw_2896x8)]
-    psubw                m6, m0, m1        ;  t3a    t2a
-    paddw                m0, m1            ; -out15  out0
-    paddw                m1, m2, m5        ; -out13  out2
-    psubw                m5, m2            ;  t15a   t14a
-    paddw                m2, m4, m7        ; -out1  out14
-    psubw                m4, m7            ;  t10    t11
-    psubw                m7, m3, m8        ;  t6     t7
-    paddw                m8, m3            ; -out3   out12
+    psubsw               m6, m0, m1        ;  t3a    t2a
+    paddsw               m0, m1            ; -out15  out0
+    paddsw               m1, m2, m5        ; -out13  out2
+    psubsw               m5, m2            ;  t15a   t14a
+    paddsw               m2, m4, m7        ; -out1  out14
+    psubsw               m4, m7            ;  t10    t11
+    psubsw               m7, m3, m8        ;  t6     t7
+    paddsw               m8, m3            ; -out3   out12
     REPX    {pshufb x, m11}, m6, m4, m0, m2
     vpblendd             m3, m6, m4, 0xcc  ;  t3a    t11
     shufps               m6, m6, m4, q1032 ;  t2a    t10
     vpblendd             m4, m5, m7, 0xcc  ;  t15a   t7
     shufps               m5, m5, m7, q1032 ;  t14a   t6
     shufps               m7, m2, m0, q1032 ;  out14 -out15
     vpblendd             m0, m0, m2, 0x33  ; -out1   out0
     paddw                m2, m5, m4        ; -out5   out4
@@ -2572,35 +2575,35 @@ ALIGN function_align
     jmp m(idct_16x8_internal).end2
 ALIGN function_align
 .main:
     vpbroadcastd        m10, [o(pd_2048)]
     ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
     ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
     ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
     ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
-    psubw                m8, m2, m6 ; t6
-    paddw                m2, m6     ; t2
-    psubw                m6, m0, m4 ; t4
-    paddw                m0, m4     ; t0
-    psubw                m4, m5, m1 ; t7
-    paddw                m5, m1     ; t3
-    psubw                m1, m7, m3 ; t5
-    paddw                m7, m3     ; t1
+    psubsw               m8, m2, m6 ; t6
+    paddsw               m2, m6     ; t2
+    psubsw               m6, m0, m4 ; t4
+    paddsw               m0, m4     ; t0
+    psubsw               m4, m5, m1 ; t7
+    paddsw               m5, m1     ; t3
+    psubsw               m1, m7, m3 ; t5
+    paddsw               m7, m3     ; t1
     ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
     ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
-    psubw                m9, m6, m8 ;  t7
-    paddw                m6, m8     ;  out6
+    psubsw               m9, m6, m8 ;  t7
+    paddsw               m6, m8     ;  out6
     vpbroadcastd         m8, [o(pw_2896x8)]
-    psubw                m3, m7, m5 ;  t3
-    paddw                m7, m5     ; -out7
-    psubw                m5, m0, m2 ;  t2
-    paddw                m0, m2     ;  out0
-    psubw                m2, m1, m4 ;  t6
-    paddw                m1, m4     ; -out1
+    psubsw               m3, m7, m5 ;  t3
+    paddsw               m7, m5     ; -out7
+    psubsw               m5, m0, m2 ;  t2
+    paddsw               m0, m2     ;  out0
+    psubsw               m2, m1, m4 ;  t6
+    paddsw               m1, m4     ; -out1
     psubw                m4, m5, m3
     paddw                m3, m5
     psubw                m5, m2, m9
     paddw                m2, m9
     pmulhrsw             m2, m8     ;  out2
     pmulhrsw             m3, m8     ; -out3
     pmulhrsw             m4, m8     ;  out4
     pmulhrsw             m5, m8     ; -out5
@@ -2951,35 +2954,35 @@ ALIGN function_align
     mova                 m1, [rsp+gprsize+32*2] ; in9
     mova [rsp+gprsize+32*2], m14 ; tmp7
     mova                 m9, [rsp+gprsize+32*1] ; in1
     mova [rsp+gprsize+32*1], m10 ; tmp5
     mova                m14, [rsp+gprsize+32*0] ; in15
     mova [rsp+gprsize+32*0], m6  ; tmp3
     IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
     mova                 m6, [rsp+gprsize+32*1] ; tmp5
-    psubw               m15, m0, m14  ; out15
-    paddw                m0, m14      ; out0
-    psubw               m14, m2, m13  ; out14
-    paddw                m2, m13      ; out1
+    psubsw              m15, m0, m14  ; out15
+    paddsw               m0, m14      ; out0
+    psubsw              m14, m2, m13  ; out14
+    paddsw               m2, m13      ; out1
     mova [rsp+gprsize+32*1], m2
-    psubw               m13, m4, m11  ; out13
-    paddw                m2, m4, m11  ; out2
-    psubw               m11, m8, m7   ; out11
-    paddw                m4, m8, m7   ; out4
+    psubsw              m13, m4, m11  ; out13
+    paddsw               m2, m4, m11  ; out2
+    psubsw              m11, m8, m7   ; out11
+    paddsw               m4, m8, m7   ; out4
     mova                 m7, [rsp+gprsize+32*2] ; tmp7
-    psubw               m10, m6, m5   ; out10
-    paddw                m5, m6       ; out5
-    psubw                m8, m7, m9   ; out8
-    paddw                m7, m9       ; out7
-    psubw                m9, m12, m3  ; out9
-    paddw                m6, m12, m3  ; out6
+    psubsw              m10, m6, m5   ; out10
+    paddsw               m5, m6       ; out5
+    psubsw               m8, m7, m9   ; out8
+    paddsw               m7, m9       ; out7
+    psubsw               m9, m12, m3  ; out9
+    paddsw               m6, m12, m3  ; out6
     mova                 m3, [rsp+gprsize+32*0] ; tmp3
-    psubw               m12, m3, m1   ; out12
-    paddw                m3, m1       ; out3
+    psubsw              m12, m3, m1   ; out12
+    paddsw               m3, m1       ; out3
     ret
 
 INV_TXFM_16X16_FN adst, dct
 INV_TXFM_16X16_FN adst, adst
 INV_TXFM_16X16_FN adst, flipadst
 
 cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
@@ -3004,86 +3007,86 @@ ALIGN function_align
 .main:
     vpbroadcastd        m15, [o(pd_2048)]
     mova [rsp+gprsize+32*1], m0
     mova [rsp+gprsize+32*2], m4
     ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
     ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
     ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
     ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
-    psubw                m0, m2, m10  ; t10a
-    paddw                m2, m10      ; t2a
-    psubw               m10, m13, m5  ; t11a
-    paddw               m13, m5       ; t3a
-    psubw                m5, m6, m14  ; t14a
-    paddw                m6, m14      ; t6a
-    psubw               m14, m9, m1   ; t15a
-    paddw                m9, m1       ; t7a
+    psubsw               m0, m2, m10  ; t10a
+    paddsw               m2, m10      ; t2a
+    psubsw              m10, m13, m5  ; t11a
+    paddsw              m13, m5       ; t3a
+    psubsw               m5, m6, m14  ; t14a
+    paddsw               m6, m14      ; t6a
+    psubsw              m14, m9, m1   ; t15a
+    paddsw               m9, m1       ; t7a
     ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
     ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
-    psubw                m1, m10, m14 ; t14a
-    paddw               m10, m14      ; t10a
-    psubw               m14, m0, m5   ; t15a
-    paddw                m0, m5       ; t11a
-    psubw                m5, m2, m6   ; t6
-    paddw                m2, m6       ; t2
-    psubw                m6, m13, m9  ; t7
-    paddw               m13, m9       ; t3
+    psubsw               m1, m10, m14 ; t14a
+    paddsw              m10, m14      ; t10a
+    psubsw              m14, m0, m5   ; t15a
+    paddsw               m0, m5       ; t11a
+    psubsw               m5, m2, m6   ; t6
+    paddsw               m2, m6       ; t2
+    psubsw               m6, m13, m9  ; t7
+    paddsw              m13, m9       ; t3
     ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
     ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
     mova                 m9, [rsp+gprsize+32*0] ; in15
     mova [rsp+gprsize+32*0], m10 ; t10a
     mova                 m4, [rsp+gprsize+32*1] ; in0
     mova [rsp+gprsize+32*1], m6  ; t6a
     mova                 m6, [rsp+gprsize+32*2] ; in4
     mova [rsp+gprsize+32*2], m2  ; t2
     ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
     ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
     ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
     ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
-    psubw               m10, m4, m8  ; t8a
-    paddw                m8, m4      ; t0a
-    psubw                m4, m9, m7  ; t9a
-    paddw                m9, m7      ; t1a
-    psubw                m7, m6, m12 ; t12a
-    paddw                m6, m12     ; t4a
-    psubw               m12, m11, m3 ; t13a
-    paddw               m11, m3      ; t5a
+    psubsw              m10, m4, m8  ; t8a
+    paddsw               m8, m4      ; t0a
+    psubsw               m4, m9, m7  ; t9a
+    paddsw               m9, m7      ; t1a
+    psubsw               m7, m6, m12 ; t12a
+    paddsw               m6, m12     ; t4a
+    psubsw              m12, m11, m3 ; t13a
+    paddsw              m11, m3      ; t5a
     ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
     ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
-    psubw                m3, m9, m11 ; t5
-    paddw                m9, m11     ; t1
-    psubw               m11, m4, m12 ; t12a
-    paddw                m4, m12     ; t8a
-    paddw               m12, m8, m6  ; t0
-    psubw                m8, m6      ; t4
-    paddw                m6, m10, m7 ; t9a
-    psubw               m10, m7      ; t13a
+    psubsw               m3, m9, m11 ; t5
+    paddsw               m9, m11     ; t1
+    psubsw              m11, m4, m12 ; t12a
+    paddsw               m4, m12     ; t8a
+    paddsw              m12, m8, m6  ; t0
+    psubsw               m8, m6      ; t4
+    paddsw               m6, m10, m7 ; t9a
+    psubsw              m10, m7      ; t13a
     ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
     ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
     mova                 m7, [rsp+gprsize+32*0] ; t10a
     mova                 m2, [rsp+gprsize+32*1] ; t6a
-    paddw               m15, m9, m13  ; -out15
-    psubw                m9, m13      ;  t3a
-    paddw               m13, m11, m1  ; -out13
-    psubw               m11, m1       ;  t15a
-    psubw                m1, m4, m7   ;  t10
-    paddw                m7, m4       ; -out1
-    psubw                m4, m3, m2   ;  t6
-    paddw                m3, m2       ; -out3
-    paddw                m2, m10, m14 ;  out2
-    psubw               m10, m14      ;  t14a
-    paddw               m14, m6, m0   ;  out14
-    psubw                m6, m0       ;  t11
+    paddsw              m15, m9, m13  ; -out15
+    psubsw               m9, m13      ;  t3a
+    paddsw              m13, m11, m1  ; -out13
+    psubsw              m11, m1       ;  t15a
+    psubsw               m1, m4, m7   ;  t10
+    paddsw               m7, m4       ; -out1
+    psubsw               m4, m3, m2   ;  t6
+    paddsw               m3, m2       ; -out3
+    paddsw               m2, m10, m14 ;  out2
+    psubsw              m10, m14      ;  t14a
+    paddsw              m14, m6, m0   ;  out14
+    psubsw               m6, m0       ;  t11
     mova                 m0, [rsp+gprsize+32*2] ; t2
     mova [rsp+gprsize+32*1], m7
-    psubw                m7, m12, m0  ;  t2a
-    paddw                m0, m12      ;  out0
-    paddw               m12, m8, m5   ;  out12
-    psubw                m8, m5       ;  t7
+    psubsw               m7, m12, m0  ;  t2a
+    paddsw               m0, m12      ;  out0
+    paddsw              m12, m8, m5   ;  out12
+    psubsw               m8, m5       ;  t7
     paddw                m5, m10, m11 ; -out5
     psubw               m10, m11      ;  out10
     psubw               m11, m4, m8   ; -out11
     paddw                m4, m8       ;  out4
     psubw                m8, m7, m9   ;  out8
     paddw                m7, m9       ; -out7
     psubw                m9, m1, m6   ; -out9
     paddw                m6, m1       ;  out6
@@ -3272,16 +3275,25 @@ ALIGN function_align
     mova                m11, [%1+%2*3]
     mova                m12, [%1+%2*4]
     mova                m13, [%1+%2*5]
     mova                m14, [%1+%2*6]
     mova                m15, [%1+%2*7]
 %endif
 %endmacro
 
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
+    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
+    punpcklwd           m%1, m%2, m%2
+    pmulhrsw            m%1, m%3
+    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
+    punpckhwd           m%2, m%2
+    pmulhrsw            m%2, m%3
+%endmacro
+
 cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
     lea                 rax, [o_base]
     test               eobd, eobd
     jz .dconly
     PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
     %undef cmp
     cmp                eobd, 106
     jle .fast
@@ -3426,32 +3438,21 @@ ALIGN function_align
 .main_fast: ; bottom half is zero
     call m(idct_8x16_internal).main
     mova                 m8, [rsp+gprsize+0*32]
     mova [rsp+gprsize+0*32], m0
     mova                 m9, [rsp+gprsize+1*32]
     mova [rsp+gprsize+1*32], m1
     mova                 m0, [rsp+gprsize+2*32]
     mova [rsp+gprsize+2*32], m6
-    punpcklwd            m1, m8, m8
-    punpckhwd            m8, m8
-    punpcklwd           m15, m9, m9
-    punpckhwd            m9, m9
-    punpcklwd           m14, m0, m0
-    punpckhwd            m0, m0
-    punpcklwd           m13, m11, m11
-    punpckhwd           m11, m11
-    ITX_MULHRSW_SHL3      1,  6,   201, 4091 ; t16a, t31a
-    ITX_MULHRSW_SHL3      8,  6,  m601, 4052 ; t23a, t24a
-    ITX_MULHRSW_SHL3     15,  6,   995, 3973 ; t20a, t27a
-    ITX_MULHRSW_SHL3      9,  6, m1380, 3857 ; t19a, t28a
-    ITX_MULHRSW_SHL3     14,  6,  1751, 3703 ; t18a, t29a
-    ITX_MULHRSW_SHL3      0,  6, m2106, 3513 ; t21a, t26a
-    ITX_MULHRSW_SHL3     13,  6,  2440, 3290 ; t22a, t25a
-    ITX_MULHRSW_SHL3     11,  6, m2751, 3035 ; t17a, t30a
+    lea                  r5, [rax-(o_base)+pw_201_4091x8]
+    ITX_UNPACK_MULHRSW    1,  8,  6,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
+    ITX_UNPACK_MULHRSW   15,  9,  6,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+    ITX_UNPACK_MULHRSW   14,  0,  6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+    ITX_UNPACK_MULHRSW   13, 11,  6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
     jmp .main2
 ALIGN function_align
 .main:
     call m(idct_8x16_internal).main
     mova                 m8, [rsp+gprsize+0*32]
     mova [rsp+gprsize+0*32], m0
     mova                 m9, [rsp+gprsize+1*32]
     mova [rsp+gprsize+1*32], m1
@@ -3469,84 +3470,84 @@ ALIGN function_align
     ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
     ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
     ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
     ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
     ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
     ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
     ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
 .main2:
-    psubw                m6, m1, m11  ; t17 t30
-    paddw                m1, m11      ; t16 t31
-    psubw               m11, m9, m14  ; t18 t29
-    paddw                m9, m14      ; t19 t28
-    psubw               m14, m15, m0  ; t21 t26
-    paddw               m15, m0       ; t20 t27
-    psubw                m0, m8, m13  ; t22 t25
-    paddw                m8, m13      ; t23 t24
+    psubsw               m6, m1, m11  ; t17 t30
+    paddsw               m1, m11      ; t16 t31
+    psubsw              m11, m9, m14  ; t18 t29
+    paddsw               m9, m14      ; t19 t28
+    psubsw              m14, m15, m0  ; t21 t26
+    paddsw              m15, m0       ; t20 t27
+    psubsw               m0, m8, m13  ; t22 t25
+    paddsw               m8, m13      ; t23 t24
     ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
     ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
     ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
     ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
-    psubw               m13, m1, m9   ; t19a t28a
-    paddw                m1, m9       ; t16a t31a
-    psubw                m9, m8, m15  ; t20a t27a
-    paddw                m8, m15      ; t23a t24a
-    psubw               m15, m6, m11  ; t18  t29
-    paddw                m6, m11      ; t17  t30
-    psubw               m11, m0, m14  ; t21  t26
-    paddw                m0, m14      ; t22  t25
+    psubsw              m13, m1, m9   ; t19a t28a
+    paddsw               m1, m9       ; t16a t31a
+    psubsw               m9, m8, m15  ; t20a t27a
+    paddsw               m8, m15      ; t23a t24a
+    psubsw              m15, m6, m11  ; t18  t29
+    paddsw               m6, m11      ; t17  t30
+    psubsw              m11, m0, m14  ; t21  t26
+    paddsw               m0, m14      ; t22  t25
     ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 1 ; t18a t29a
     ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 1 ; t19  t28
     ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 1 ; t20  t27
     ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
     vbroadcasti128      m12, [o(deint_shuf)]
     REPX    {pshufb x, m12}, m0, m1, m6, m8
-    psubw               m14, m1, m8   ; t23  t24
-    paddw                m1, m8       ; t16  t31
-    psubw                m8, m6, m0   ; t22a t25a
-    paddw                m6, m0       ; t17a t30a
-    psubw                m0, m15, m11 ; t21  t26
-    paddw               m15, m11      ; t18  t29
-    psubw               m11, m13, m9  ; t20a t27a
-    paddw               m13, m9       ; t19a t28a
+    psubsw              m14, m1, m8   ; t23  t24
+    paddsw               m1, m8       ; t16  t31
+    psubsw               m8, m6, m0   ; t22a t25a
+    paddsw               m6, m0       ; t17a t30a
+    psubsw               m0, m15, m11 ; t21  t26
+    paddsw              m15, m11      ; t18  t29
+    psubsw              m11, m13, m9  ; t20a t27a
+    paddsw              m13, m9       ; t19a t28a
     vpbroadcastd        m12, [o(pw_2896x8)]
-    punpcklqdq            m9, m11, m0 ; t20a t21
-    punpckhqdq           m11, m0      ; t27a t26
-    punpcklqdq            m0, m14, m8 ; t23  t22a
-    punpckhqdq           m14, m8      ; t24  t25a
-    psubw                 m8, m11, m9 ; t20  t21a
-    paddw                m11, m9      ; t27  t26a
-    psubw                 m9, m14, m0 ; t23a t22
-    paddw                m14, m0      ; t24a t25
-    REPX   {pmulhrsw x, m12}, m8, m9, m14, m11
+    punpcklqdq           m9, m11, m0  ; t20a t21
+    punpckhqdq          m11, m0       ; t27a t26
+    punpcklqdq           m0, m14, m8  ; t23  t22a
+    punpckhqdq          m14, m8       ; t24  t25a
+    psubw                m8, m11, m9  ; t20  t21a
+    paddw               m11, m9       ; t27  t26a
+    psubw                m9, m14, m0  ; t23a t22
+    paddw               m14, m0       ; t24a t25
+    REPX  {pmulhrsw x, m12}, m8, m9, m14, m11
     punpcklqdq           m0, m1, m6   ; t16  t17a
     punpckhqdq           m1, m6       ; t31  t30a
-    psubw               m10, m5, m8   ; out20 out21
-    paddw                m5, m8       ; out11 out10
-    psubw                m6, m3, m14  ; out24 out25
-    paddw                m3, m14      ; out7  out6
-    psubw                m8, m7, m0   ; out16 out17
-    paddw                m7, m0       ; out15 out14
+    psubsw              m10, m5, m8   ; out20 out21
+    paddsw               m5, m8       ; out11 out10
+    psubsw               m6, m3, m14  ; out24 out25
+    paddsw               m3, m14      ; out7  out6
+    psubsw               m8, m7, m0   ; out16 out17
+    paddsw               m7, m0       ; out15 out14
     mova                 m0, [rsp+gprsize+0*32]
     punpcklqdq          m12, m13, m15 ; t19a t18
     punpckhqdq          m13, m15      ; t28a t29
-    psubw               m15, m0, m1   ; out31 out30
-    paddw                m0, m1       ; out0  out1
+    psubsw              m15, m0, m1   ; out31 out30
+    paddsw               m0, m1       ; out0  out1
     mova                 m1, [rsp+gprsize+1*32]
     mova [rsp+gprsize+0*32], m6
     mova                 m6, [rsp+gprsize+2*32]
-    psubw               m14, m1, m13  ; out28 out29
-    paddw                m1, m13      ; out3  out2
-    psubw               m13, m2, m11  ; out27 out26
-    paddw                m2, m11      ; out4  out5
-    psubw               m11, m4, m9   ; out23 out22
-    paddw                m4, m9       ; out8  out9
-    psubw                m9, m6, m12  ; out19 out18
-    paddw                m6, m12      ; out12 out13
+    psubsw              m14, m1, m13  ; out28 out29
+    paddsw               m1, m13      ; out3  out2
+    psubsw              m13, m2, m11  ; out27 out26
+    paddsw               m2, m11      ; out4  out5
+    psubsw              m11, m4, m9   ; out23 out22
+    paddsw               m4, m9       ; out8  out9
+    psubsw               m9, m6, m12  ; out19 out18
+    paddsw               m6, m12      ; out12 out13
     ret
 
 %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
     vbroadcasti128      m%1, [cq+16*%3]
     vbroadcasti128      m%2, [cq+16*%4]
     shufpd              m%1, m%1, m%2, 0x0c
 %endmacro
 
@@ -3867,18 +3868,18 @@ cglobal inv_txfm_add_identity_identity_3
     pxor                m15, m15
     REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
                                 8,  9, 10, 11, 12, 13, 14, 15
 %endif
 %endmacro
 
 %macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
     mova                m%4, [%2]
-    paddw               m%3, m%1, m%4
-    psubw               m%1, m%4
+    paddsw              m%3, m%1, m%4
+    psubsw              m%1, m%4
     pmovzxbw            m%4, [dstq+%6]
     pmulhrsw            m%3, m%5
     pmulhrsw            m%1, m%5
     paddw               m%3, m%4
     pmovzxbw            m%4, [r2+%7]
     paddw               m%1, m%4
     packuswb            m%3, m%1
     vpermq              m%3, m%3, q3120
@@ -4051,89 +4052,89 @@ ALIGN function_align
     vpbroadcastd        m15, [o(pd_2048)]
     ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
     ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
     ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
     ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
     ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
     ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
 .main2:
-    psubw                m7, m12, m4  ; t18
-    paddw               m12, m4       ; t19
-    psubw                m4, m2, m10  ; t21
-    paddw                m2, m10      ; t20
-    psubw               m10, m14, m6  ; t22
-    paddw               m14, m6       ; t23
-    psubw                m6, m1, m9   ; t25
-    paddw                m1, m9       ; t24
-    psubw                m9, m13, m5  ; t26
-    paddw               m13, m5       ; t27
-    psubw                m5, m3, m11  ; t29
-    paddw                m3, m11      ; t28
+    psubsw               m7, m12, m4  ; t18
+    paddsw              m12, m4       ; t19
+    psubsw               m4, m2, m10  ; t21
+    paddsw               m2, m10      ; t20
+    psubsw              m10, m14, m6  ; t22
+    paddsw              m14, m6       ; t23
+    psubsw               m6, m1, m9   ; t25
+    paddsw               m1, m9       ; t24
+    psubsw               m9, m13, m5  ; t26
+    paddsw              m13, m5       ; t27
+    psubsw               m5, m3, m11  ; t29
+    paddsw               m3, m11      ; t28
     ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
     ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
     ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
-    psubw                m8, m14, m2  ; t20a
-    paddw               m14, m2       ; t23a
-    psubw                m2, m1, m13  ; t27a
-    paddw                m1, m13      ; t24a
-    psubw               m13, m6, m9   ; t21
-    paddw                m6, m9       ; t22
-    psubw                m9, m10, m4  ; t26
-    paddw               m10, m4       ; t25
+    psubsw               m8, m14, m2  ; t20a
+    paddsw              m14, m2       ; t23a
+    psubsw               m2, m1, m13  ; t27a
+    paddsw               m1, m13      ; t24a
+    psubsw              m13, m6, m9   ; t21
+    paddsw               m6, m9       ; t22
+    psubsw               m9, m10, m4  ; t26
+    paddsw              m10, m4       ; t25
     ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
     ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
     mova                 m4, [rsp+gprsize+32*0] ; in31
     mova [rsp+gprsize+32*0], m6  ; t22
     mova                 m6, [rsp+gprsize+32*1] ; in15
     mova [rsp+gprsize+32*1], m14 ; t23a
     mova                m14, [rsp+gprsize+32*2] ; in17
     mova [rsp+gprsize+32*2], m1  ; t24a
     ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
     ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
-    psubw                m1, m0, m14  ; t17
-    paddw                m0, m14      ; t16
-    psubw               m14, m4, m6   ; t30
-    paddw                m4, m6       ; t31
+    psubsw               m1, m0, m14  ; t17
+    paddsw               m0, m14      ; t16
+    psubsw              m14, m4, m6   ; t30
+    paddsw               m4, m6       ; t31
     ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
-    psubw                m6, m0, m12  ; t19a
-    paddw                m0, m12      ; t16a
-    psubw               m12, m4, m3   ; t28a
-    paddw                m4, m3       ; t31a
-    psubw                m3, m14, m5  ; t18
-    paddw               m14, m5       ; t17
-    psubw                m5, m1, m7   ; t29
-    paddw                m1, m7       ; t30
+    psubsw               m6, m0, m12  ; t19a
+    paddsw               m0, m12      ; t16a
+    psubsw              m12, m4, m3   ; t28a
+    paddsw               m4, m3       ; t31a
+    psubsw               m3, m14, m5  ; t18
+    paddsw              m14, m5       ; t17
+    psubsw               m5, m1, m7   ; t29
+    paddsw               m1, m7       ; t30
     ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
     ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
-    psubw                m7, m1, m10  ; t25a
-    paddw                m1, m10      ; t30a
-    psubw               m10, m5, m9   ; t21
-    paddw                m5, m9       ; t18
-    psubw                m9, m12, m2  ; t20a
-    paddw               m12, m2       ; t19a
-    psubw                m2, m3, m13  ; t26
-    paddw                m3, m13      ; t29
-    psubw               m13, m6, m8   ; t27a
-    paddw                m6, m8       ; t28a
+    psubsw               m7, m1, m10  ; t25a
+    paddsw               m1, m10      ; t30a
+    psubsw              m10, m5, m9   ; t21
+    paddsw               m5, m9       ; t18
+    psubsw               m9, m12, m2  ; t20a
+    paddsw              m12, m2       ; t19a
+    psubsw               m2, m3, m13  ; t26
+    paddsw               m3, m13      ; t29
+    psubsw              m13, m6, m8   ; t27a
+    paddsw               m6, m8       ; t28a
     mova       [tmp1q-32*2], m5
     mova       [tmp1q-32*1], m12
     mova       [tmp2q+32*0], m6
     mova       [tmp2q+32*1], m3
     mova       [tmp2q+32*2], m1
     mova                 m5, [rsp+gprsize+32*0] ; t22
     mova                 m6, [rsp+gprsize+32*1] ; t23
     mova                 m3, [rsp+gprsize+32*2] ; t24a
     vpbroadcastd         m8, [o(pw_2896x8)]
-    psubw                m1, m14, m5  ; t22a
-    paddw               m14, m5       ; t17a
-    psubw                m5, m0, m6   ; t23
-    paddw                m0, m6       ; t16
-    psubw                m6, m4, m3   ; t24
-    paddw                m4, m3       ; t31
+    psubsw               m1, m14, m5  ; t22a
+    paddsw              m14, m5       ; t17a
+    psubsw               m5, m0, m6   ; t23
+    paddsw               m0, m6       ; t16
+    psubsw               m6, m4, m3   ; t24
+    paddsw               m4, m3       ; t31
     mova       [tmp1q-32*4], m0
     mova       [tmp1q-32*3], m14
     mova       [tmp2q+32*3], m4
     psubw                m3, m13, m9  ; t20
     paddw               m13, m9       ; t27
     psubw                m9, m2, m10  ; t21a
     paddw                m2, m10      ; t26a
     psubw               m10, m7, m1   ; t22
@@ -4236,23 +4237,23 @@ ALIGN function_align
     IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
     IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
     IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
     ret
 
 ; Perform the final sumsub step and YMM lane shuffling
 %macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
     mova                m%3, [tmp2q+32*( 3-%1)]
-    psubw               m%4, m%1, m%3
-    paddw               m%1, m%3
+    psubsw              m%4, m%1, m%3
+    paddsw              m%1, m%3
     mova                m%3, [tmp1q+32*(11-%2)]
     mova         [tmp1q+32*(11-%2)+16], xm%4
     vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
-    paddw               m%4, m%2, m%3
-    psubw               m%2, m%3
+    paddsw              m%4, m%2, m%3
+    psubsw              m%2, m%3
     mova         [tmp1q+32*(11-%2)], xm%2
     vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
     vperm2i128          m%2, m%1, m%4, 0x31
     vinserti128         m%1, m%1, xm%4, 1
 %endmacro
 
 cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
     lea                 rax, [o_base]
@@ -4703,22 +4704,22 @@ cglobal inv_txfm_add_identity_identity_3
 %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
 %if %1 & 1
     mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
     mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
 %else
     mova                m%5, [tmp1q-32*(45-%1)]
     mova                m%4, [tmp2q-32*(20+%1)]
 %endif
-    psubw               m%6, m%5, m%4 ; idct32 out31-n
-    paddw               m%5, m%4      ; idct32 out 0+n
-    psubw               m%4, m%6, m%3 ; out32+n
-    paddw               m%6, m%3      ; out31-n
-    psubw               m%3, m%5, m%2 ; out63-n
-    paddw               m%5, m%2      ; out 0+n
+    psubsw              m%6, m%5, m%4 ; idct32 out31-n
+    paddsw              m%5, m%4      ; idct32 out 0+n
+    psubsw              m%4, m%6, m%3 ; out32+n
+    paddsw              m%6, m%3      ; out31-n
+    psubsw              m%3, m%5, m%2 ; out63-n
+    paddsw              m%5, m%2      ; out 0+n
 %if %0 == 6 ; pass 1
 %if %1 & 1
     mova [tmp2q-32*(19-%1)], m%4
     mova [tmp1q-32*(14+%1)], m%6
     mova [tmp1q+32*(18-%1)], m%3
     mova [tmp2q-32*(51-%1)], m%5
 %else
     mova [tmp1q-32*(13-%1)], m%4
@@ -4943,35 +4944,35 @@ ALIGN function_align