Bug 1478269 - Fix the outputs and clobbers of inline assembly blocks in yuv_row_posix.cpp. r=jrmuizel
author: Mike Hommey <mh+mozilla@glandium.org>
Fri, 24 Aug 2018 10:12:21 +0900
changeset 433741 e471c532fd11675b0cfafa7c161a2e5e147944b3
parent 433740 a69ed9d1f49c097a67d6f67bdc0e91d7f26e32b4
child 433742 5955f883a957e165be245385c640fa3a64fabd8d
push id: 34521
push user: ebalazs@mozilla.com
push date: Wed, 29 Aug 2018 09:43:53 +0000
treeherder: mozilla-central@b75561ff5ffe [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: jrmuizel
bugs: 1478269
milestone: 63.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1478269 - Fix the outputs and clobbers of inline assembly blocks in yuv_row_posix.cpp. r=jrmuizel While the current code compiles fine with the file as it is, with LTO enabled, some functions end up inlined into their callers and their callers, recursively, and the compiler doesn't know some of the registers have been modified by the assembly, leading to bad decisions, and bad behavior at runtime. The same problem would likely happen if we were using UNIFIED_SOURCES in the directory. Differential Revision: https://phabricator.services.mozilla.com/D4200
gfx/ycbcr/yuv_row_posix.cpp
--- a/gfx/ycbcr/yuv_row_posix.cpp
+++ b/gfx/ycbcr/yuv_row_posix.cpp
@@ -14,17 +14,17 @@ extern "C" {
 // We don't need CPUID guards here, since x86-64 implies SSE2.
 
 // AMD64 ABI uses register paremters.
 void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                               const uint8* u_buf,  // rsi
                               const uint8* v_buf,  // rdx
                               uint8* rgb_buf,      // rcx
                               int width) {         // r8
-  asm(
+  asm volatile(
   "jmp    1f\n"
 "0:"
   "movzb  (%[u_buf]),%%r10\n"
   "add    $0x1,%[u_buf]\n"
   "movzb  (%[v_buf]),%%r11\n"
   "add    $0x1,%[v_buf]\n"
   "movq   2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
   "movzb  (%[y_buf]),%%r10\n"
@@ -56,34 +56,33 @@ void FastConvertYUVToRGB32Row(const uint
   "paddsw %%xmm1,%%xmm0\n"
   "movzb  (%[y_buf]),%%r10\n"
   "movq   (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
   "paddsw %%xmm0,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movd   %%xmm1,0x0(%[rgb_buf])\n"
 "3:"
-  :
-  : [y_buf] "r"(y_buf),
-    [u_buf] "r"(u_buf),
-    [v_buf] "r"(v_buf),
-    [rgb_buf] "r"(rgb_buf),
-    [width] "r"(width),
-    [kCoefficientsRgbY] "r" (kCoefficientsRgbY)
-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+  : [y_buf] "+r"(y_buf),
+    [u_buf] "+r"(u_buf),
+    [v_buf] "+r"(v_buf),
+    [rgb_buf] "+r"(rgb_buf),
+    [width] "+r"(width)
+  : [kCoefficientsRgbY] "r" (kCoefficientsRgbY)
+  : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
 );
 }
 
 void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
                         const uint8* u_buf,  // rsi
                         const uint8* v_buf,  // rdx
                         uint8* rgb_buf,      // rcx
                         int width,           // r8
                         int source_dx) {     // r9
-  asm(
+  asm volatile(
   "xor    %%r11,%%r11\n"
   "sub    $0x2,%[width]\n"
   "js     1f\n"
 
 "0:"
   "mov    %%r11,%%r10\n"
   "sar    $0x11,%%r10\n"
   "movzb  (%[u_buf],%%r10,1),%%rax\n"
@@ -124,35 +123,34 @@ void ScaleYUVToRGB32Row(const uint8* y_b
   "movzb  (%[y_buf],%%r11,1),%%rax\n"
   "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
   "paddsw %%xmm0,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movd   %%xmm1,0x0(%[rgb_buf])\n"
 
 "2:"
-  :
+  : [rgb_buf] "+r"(rgb_buf),
+    [width] "+r"(width)
   : [y_buf] "r"(y_buf),
     [u_buf] "r"(u_buf),
     [v_buf] "r"(v_buf),
-    [rgb_buf] "r"(rgb_buf),
-    [width] "r"(width),
     [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
     [source_dx] "r"(static_cast<long>(source_dx))
-  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
+  : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
 );
 }
 
 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width,
                               int source_dx) {
-  asm(
+  asm volatile(
   "xor    %%r11,%%r11\n"   // x = 0
   "sub    $0x2,%[width]\n"
   "js     2f\n"
   "cmp    $0x20000,%[source_dx]\n"   // if source_dx >= 2.0
   "jl     0f\n"
   "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
 "0:"
 
@@ -241,25 +239,24 @@ void LinearScaleYUVToRGB32Row(const uint
   "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
 
   "paddsw %%xmm0,%%xmm1\n"
   "psraw  $0x6,%%xmm1\n"
   "packuswb %%xmm1,%%xmm1\n"
   "movd   %%xmm1,0x0(%[rgb_buf])\n"
 
 "3:"
-  :
+  : [rgb_buf] "+r"(rgb_buf),
+    [width] "+r"(width)
   : [y_buf] "r"(y_buf),
     [u_buf] "r"(u_buf),
     [v_buf] "r"(v_buf),
-    [rgb_buf] "r"(rgb_buf),
-    [width] "r"(width),
     [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
     [source_dx] "r"(static_cast<long>(source_dx))
-  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
+  : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
 );
 }
 
 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
 
 // PIC version is slower because less registers are available, so
 // non-PIC is used on platforms where it is possible.
 void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,