author | Mike Hommey <mh+mozilla@glandium.org> |
Fri, 24 Aug 2018 10:12:21 +0900 (2018-08-24) | |
changeset 433741 | e471c532fd11675b0cfafa7c161a2e5e147944b3 |
parent 433740 | a69ed9d1f49c097a67d6f67bdc0e91d7f26e32b4 |
child 433742 | 5955f883a957e165be245385c640fa3a64fabd8d |
push id | 34521 |
push user | ebalazs@mozilla.com |
push date | Wed, 29 Aug 2018 09:43:53 +0000 (2018-08-29) |
treeherder | mozilla-central@b75561ff5ffe [default view] [failures only] |
perfherder | [talos] [build metrics] [platform microbench] (compared to previous push) |
reviewers | jrmuizel |
bugs | 1478269 |
milestone | 63.0a1 |
first release with | nightly linux32, nightly linux64, nightly mac, nightly win32, nightly win64 |
last release without | nightly linux32, nightly linux64, nightly mac, nightly win32, nightly win64 |
--- a/gfx/ycbcr/yuv_row_posix.cpp +++ b/gfx/ycbcr/yuv_row_posix.cpp @@ -14,17 +14,17 @@ extern "C" { // We don't need CPUID guards here, since x86-64 implies SSE2. // AMD64 ABI uses register paremters. void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi const uint8* u_buf, // rsi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 - asm( + asm volatile( "jmp 1f\n" "0:" "movzb (%[u_buf]),%%r10\n" "add $0x1,%[u_buf]\n" "movzb (%[v_buf]),%%r11\n" "add $0x1,%[v_buf]\n" "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n" "movzb (%[y_buf]),%%r10\n" @@ -56,34 +56,33 @@ void FastConvertYUVToRGB32Row(const uint "paddsw %%xmm1,%%xmm0\n" "movzb (%[y_buf]),%%r10\n" "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n" "paddsw %%xmm0,%%xmm1\n" "psraw $0x6,%%xmm1\n" "packuswb %%xmm1,%%xmm1\n" "movd %%xmm1,0x0(%[rgb_buf])\n" "3:" - : - : [y_buf] "r"(y_buf), - [u_buf] "r"(u_buf), - [v_buf] "r"(v_buf), - [rgb_buf] "r"(rgb_buf), - [width] "r"(width), - [kCoefficientsRgbY] "r" (kCoefficientsRgbY) - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" + : [y_buf] "+r"(y_buf), + [u_buf] "+r"(u_buf), + [v_buf] "+r"(v_buf), + [rgb_buf] "+r"(rgb_buf), + [width] "+r"(width) + : [kCoefficientsRgbY] "r" (kCoefficientsRgbY) + : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" ); } void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi const uint8* u_buf, // rsi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width, // r8 int source_dx) { // r9 - asm( + asm volatile( "xor %%r11,%%r11\n" "sub $0x2,%[width]\n" "js 1f\n" "0:" "mov %%r11,%%r10\n" "sar $0x11,%%r10\n" "movzb (%[u_buf],%%r10,1),%%rax\n" @@ -124,35 +123,34 @@ void ScaleYUVToRGB32Row(const uint8* y_b "movzb (%[y_buf],%%r11,1),%%rax\n" "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" "paddsw %%xmm0,%%xmm1\n" "psraw $0x6,%%xmm1\n" "packuswb %%xmm1,%%xmm1\n" "movd %%xmm1,0x0(%[rgb_buf])\n" "2:" - : + : [rgb_buf] "+r"(rgb_buf), + [width] "+r"(width) : [y_buf] "r"(y_buf), [u_buf] "r"(u_buf), [v_buf] "r"(v_buf), 
- [rgb_buf] "r"(rgb_buf), - [width] "r"(width), [kCoefficientsRgbY] "r" (kCoefficientsRgbY), [source_dx] "r"(static_cast<long>(source_dx)) - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" + : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" ); } void LinearScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width, int source_dx) { - asm( + asm volatile( "xor %%r11,%%r11\n" // x = 0 "sub $0x2,%[width]\n" "js 2f\n" "cmp $0x20000,%[source_dx]\n" // if source_dx >= 2.0 "jl 0f\n" "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less "0:" @@ -241,25 +239,24 @@ void LinearScaleYUVToRGB32Row(const uint "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" "paddsw %%xmm0,%%xmm1\n" "psraw $0x6,%%xmm1\n" "packuswb %%xmm1,%%xmm1\n" "movd %%xmm1,0x0(%[rgb_buf])\n" "3:" - : + : [rgb_buf] "+r"(rgb_buf), + [width] "+r"(width) : [y_buf] "r"(y_buf), [u_buf] "r"(u_buf), [v_buf] "r"(v_buf), - [rgb_buf] "r"(rgb_buf), - [width] "r"(width), [kCoefficientsRgbY] "r" (kCoefficientsRgbY), [source_dx] "r"(static_cast<long>(source_dx)) - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" + : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" ); } #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) // PIC version is slower because less registers are available, so // non-PIC is used on platforms where it is possible. void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,