Bug 1329988 - Always use ANGLE's less-slow transpose:true path. - r=kvark
author Jeff Gilbert <jgilbert@mozilla.com>
Tue, 14 Feb 2017 15:21:37 -0800
changeset 373210 20c8c1cd5a3faec93c9108f7553a6bd6d40b5793
parent 373209 04b72382940faf1a2f7175ba3f28e4407ba23f9c
child 373211 fbd5216f6a4979679dfab4f415fc86777dd88b01
push id 10863
push user jlorenzo@mozilla.com
push date Mon, 06 Mar 2017 23:02:23 +0000
treeherder mozilla-aurora@0931190cd725 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers kvark
bugs 1329988
milestone 54.0a1
Bug 1329988 - Always use ANGLE's less-slow transpose:true path. - r=kvark MozReview-Commit-ID: ugVzpBlwCP
dom/canvas/WebGLContextGL.cpp
--- a/dom/canvas/WebGLContextGL.cpp
+++ b/dom/canvas/WebGLContextGL.cpp
@@ -2030,55 +2030,102 @@ WebGLContext::UniformNfv(const char* fun
         &gl::GLContext::fUniform4fv
     };
     const auto func = kFuncList[N-1];
 
     MakeContextCurrent();
     (gl->*func)(loc->mLoc, numElementsToUpload, elemBytes);
 }
 
+static inline void  // Copies one matrix of `width` columns x `height` rows from column-major to row-major order.
+MatrixAxBToRowMajor(const uint8_t width, const uint8_t height,  // width = column count, height = row count
+                    const float* __restrict srcColMajor,  // Read as width*height floats, one column after another.
+                    float* __restrict dstRowMajor)  // Written as width*height floats, one row after another; __restrict promises it does not alias the source.
+{
+    for (uint8_t x = 0; x < width; ++x) {  // x: column index
+        for (uint8_t y = 0; y < height; ++y) {  // y: row index within column x
+            dstRowMajor[y * width + x] = srcColMajor[x * height + y];  // Element (row y, column x): source stride is `height`, destination stride is `width`.
+        }
+    }
+}
+
 void
 WebGLContext::UniformMatrixAxBfv(const char* funcName, uint8_t A, uint8_t B,
-                                 WebGLUniformLocation* loc, bool transpose,
+                                 WebGLUniformLocation* loc, const bool transpose,
                                  const Float32Arr& arr, GLuint elemOffset,
                                  GLuint elemCountOverride)
 {
     size_t elemCount;
     if (!ValidateArrOffsetAndCount(this, funcName, arr.elemCount, elemOffset,
                                    elemCountOverride, &elemCount))
     {
         return;
     }
     const auto elemBytes = arr.elemBytes + elemOffset;
 
-    uint32_t numElementsToUpload;
+    uint32_t numMatsToUpload;  // Passed by pointer to the validator below; afterwards used as the matrix count for the upload.
     if (!ValidateUniformMatrixArraySetter(loc, A, B, LOCAL_GL_FLOAT, elemCount,
-                                          transpose, funcName, &numElementsToUpload))
+                                          transpose, funcName, &numMatsToUpload))
     {
         return;
     }
     MOZ_ASSERT(!loc->mInfo->mSamplerTexList, "Should not be a sampler.");
 
+    ////
+
+    bool uploadTranspose = transpose;  // Defaults to the caller's request; forced to true if the workaround below runs.
+    const float* uploadBytes = elemBytes;  // Defaults to the caller's data; redirected to `temp` if the workaround below runs.
+
+    UniqueBuffer temp;  // Holds the CPU-transposed copy, if one is made. NOTE(review): presumably frees the malloc'd block on scope exit - confirm against UniqueBuffer.
+    if (!transpose && gl->WorkAroundDriverBugs() && gl->IsANGLE() &&  // Only when the caller did not request a transpose and driver workarounds are enabled on ANGLE...
+        gl->IsAtLeast(gl::ContextProfile::OpenGLES, 300))  // ...and the context is ES 3.0+. NOTE(review): presumably because earlier ES versions reject transpose=true - confirm.
+    {
+        // ANGLE is really slow at non-GL-transposed matrices.
+        const size_t kElemsPerMat = A * B;  // Number of floats in one AxB matrix.
+
+        temp = malloc(numMatsToUpload * kElemsPerMat * sizeof(float));  // Byte size of all matrices being uploaded.
+        if (!temp) {
+            ErrorOutOfMemory("%s: Failed to alloc temporary buffer for transposition.",
+                             funcName);
+            return;  // Nothing is uploaded when the temporary buffer cannot be allocated.
+        }
+
+        auto srcItr = (const float*)elemBytes;  // Read cursor over the caller's matrices.
+        auto dstItr = (float*)temp.get();  // Write cursor over the temporary buffer.
+        const auto srcEnd = srcItr + numMatsToUpload * kElemsPerMat;  // One past the last source float to convert.
+
+        while (srcItr != srcEnd) {  // Convert one whole matrix per iteration.
+            MatrixAxBToRowMajor(A, B, srcItr, dstItr);  // A is passed as width (columns), B as height (rows).
+            srcItr += kElemsPerMat;
+            dstItr += kElemsPerMat;
+        }
+
+        uploadBytes = (const float*)temp.get();  // Upload the row-major copy instead of the caller's data...
+        uploadTranspose = true;  // ...and tell GL the data is transposed, so the uniform ends up holding the same matrix.
+    }
+
+    ////
+
     static const decltype(&gl::GLContext::fUniformMatrix2fv) kFuncList[] = {
         &gl::GLContext::fUniformMatrix2fv,
         &gl::GLContext::fUniformMatrix2x3fv,
         &gl::GLContext::fUniformMatrix2x4fv,
 
         &gl::GLContext::fUniformMatrix3x2fv,
         &gl::GLContext::fUniformMatrix3fv,
         &gl::GLContext::fUniformMatrix3x4fv,
 
         &gl::GLContext::fUniformMatrix4x2fv,
         &gl::GLContext::fUniformMatrix4x3fv,
         &gl::GLContext::fUniformMatrix4fv
     };
     const auto func = kFuncList[3*(A-2) + (B-2)];
 
     MakeContextCurrent();
-    (gl->*func)(loc->mLoc, numElementsToUpload, transpose, elemBytes);
+    (gl->*func)(loc->mLoc, numMatsToUpload, uploadTranspose, uploadBytes);  // Calls the entry point picked from kFuncList by (A, B), with the possibly substituted transpose flag and data pointer.
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
 void
 WebGLContext::UseProgram(WebGLProgram* prog)
 {
     if (IsContextLost())