gfx/ycbcr/win64.patch
author Andrzej Hunt <andrzej@ahunt.org>
Tue, 15 Mar 2016 14:44:13 -0700
changeset 323712 876a1f819d83ef8035e579dd9247693c9526875b
parent 69368 1461de626881724cb66b4b6fd29662c0eaf260e0
permissions -rw-r--r--
Bug 1254797 - Post: Ensure we ignore deleted sites in pinned query r=rnewman, a=ritu Pinned sites should be deleted directly, however I'm not confident enough in my knowledge of sync to be certain that we won't end up with deleted pinned sites in our table. (We use normal bookmark deletion for removing pinned sites.) MozReview-Commit-ID: SSLDkSXWlI

diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win64.cpp
@@ -0,0 +1,205 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+// The x64 compiler supports neither MMX nor inline assembler, so use SSE2 intrinsics.
+// Byte views into kCoefficientsRgbY: U entries start at byte 2048, V entries at 4096.
+#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
+#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
+
+#include <emmintrin.h>
+
+// Converts one row of YUV samples to 32-bit pixels through the coefficient
+// tables; each u/v sample is shared by two consecutive y samples.
+static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width) {
+  uint8* const y_table = reinterpret_cast<uint8*>(kCoefficientsRgbY);
+  __m128i chroma, pixel0, pixel1;
+
+  // Two pixels per pass: one summed U+V table term feeds both Y lookups.
+  for (; width >= 2; width -= 2, rgb_buf += 8) {
+    chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
+    pixel0 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * *y_buf++));
+    pixel1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * *y_buf++));
+    pixel0 = _mm_adds_epi16(pixel0, chroma);
+    pixel1 = _mm_adds_epi16(pixel1, chroma);
+
+    // Place both pixels' 16-bit lanes side by side, shift right by 6, then
+    // saturate every lane to an unsigned byte and store the 8 result bytes.
+    pixel0 = _mm_srai_epi16(_mm_unpacklo_epi64(pixel0, pixel1), 6);
+    pixel0 = _mm_packus_epi16(pixel0, pixel0);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), pixel0);
+  }
+
+  // Remaining pixel. Note that a bare `width` test also passes when negative.
+  if (width) {
+    chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
+    pixel0 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * *y_buf));
+    pixel0 = _mm_srai_epi16(_mm_adds_epi16(pixel0, chroma), 6);
+    pixel0 = _mm_packus_epi16(pixel0, pixel0);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(pixel0);
+  }
+}
+
+// Converts a row of YUV samples to 32-bit pixels with point-sampled
+// horizontal scaling. The source position x starts at 0 and advances by
+// source_dx per output pixel; x >> 16 indexes y_buf and x >> 17 indexes
+// u_buf/v_buf. Samples are used as read, with no blending of neighbours.
+static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int source_dx) {
+  uint8* const y_table = reinterpret_cast<uint8*>(kCoefficientsRgbY);
+  __m128i chroma, pixel0, pixel1;
+  int x = 0;
+
+  for (; width >= 2; width -= 2, rgb_buf += 8) {
+    // u/v are read once per pair, at the first pixel's position.
+    const uint8 u = u_buf[x >> 17];
+    const uint8 v = v_buf[x >> 17];
+    const uint8 y0 = y_buf[x >> 16];
+    x += source_dx;
+    const uint8 y1 = y_buf[x >> 16];
+    x += source_dx;
+
+    chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    pixel0 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * y0));
+    pixel1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * y1));
+    pixel0 = _mm_adds_epi16(pixel0, chroma);
+    pixel1 = _mm_adds_epi16(pixel1, chroma);
+
+    // Put both pixels in one register, shift every 16-bit lane right by 6,
+    // saturate the lanes to unsigned bytes and write the 8 result bytes.
+    pixel0 = _mm_srai_epi16(_mm_unpacklo_epi64(pixel0, pixel1), 6);
+    pixel0 = _mm_packus_epi16(pixel0, pixel0);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), pixel0);
+  }
+
+  // Remaining pixel. Note that a bare `width` test also passes when negative.
+  if (width) {
+    const uint8 u = u_buf[x >> 17];
+    const uint8 v = v_buf[x >> 17];
+    const uint8 y = y_buf[x >> 16];
+    chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    pixel0 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * y));
+    pixel0 = _mm_srai_epi16(_mm_adds_epi16(pixel0, chroma), 6);
+    pixel0 = _mm_packus_epi16(pixel0, pixel0);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(pixel0);
+  }
+}
+
+// Converts a row of YUV samples to 32-bit pixels with horizontal scaling
+// that blends each sample with the next one (index + 1), weighted by the low
+// bits of the source position x; x advances by source_dx per output pixel.
+// NOTE(review): the odd trailing pixel is read unblended -- confirm intended.
+static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width,
+                                          int source_dx) {
+  uint8* const y_table = reinterpret_cast<uint8*>(kCoefficientsRgbY);
+  __m128i chroma, pixel0, pixel1;
+  // x starts at 32768 instead of 0 when source_dx is at least 0x20000.
+  int x = (source_dx >= 0x20000) ? 32768 : 0;
+
+  for (; width >= 2; width -= 2, rgb_buf += 8) {
+    // Blended u and v, computed once per pair at the first pixel's position.
+    const uint32 uv_weight = (x & 0x1fffe);
+    const uint8 u_near = u_buf[x >> 17];
+    const uint8 u_next = u_buf[(x >> 17) + 1];
+    const uint8 v_near = v_buf[x >> 17];
+    const uint8 v_next = v_buf[(x >> 17) + 1];
+    const uint32 u = (uv_weight * u_next + (uv_weight ^ 0x1fffe) * u_near) >> 17;
+    const uint32 v = (uv_weight * v_next + (uv_weight ^ 0x1fffe) * v_near) >> 17;
+
+    // Blended y for the first pixel of the pair.
+    uint32 y_weight = (x & 0xffff);
+    uint8 y_near = y_buf[x >> 16];
+    uint8 y_next = y_buf[(x >> 16) + 1];
+    const uint32 y_first = (y_weight * y_next + (y_weight ^ 0xffff) * y_near) >> 16;
+    x += source_dx;
+
+    // Blended y for the second pixel, one source_dx step further along.
+    y_weight = (x & 0xffff);
+    y_near = y_buf[x >> 16];
+    y_next = y_buf[(x >> 16) + 1];
+    const uint32 y_second = (y_weight * y_next + (y_weight ^ 0xffff) * y_near) >> 16;
+    x += source_dx;
+
+    chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    pixel0 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * y_first));
+    pixel1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * y_second));
+    pixel0 = _mm_adds_epi16(pixel0, chroma);
+    pixel1 = _mm_adds_epi16(pixel1, chroma);
+
+    // Put both pixels in one register, shift every 16-bit lane right by 6,
+    // saturate the lanes to unsigned bytes and write the 8 result bytes.
+    pixel0 = _mm_srai_epi16(_mm_unpacklo_epi64(pixel0, pixel1), 6);
+    pixel0 = _mm_packus_epi16(pixel0, pixel0);
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), pixel0);
+  }
+
+  // Remaining pixel. Note that a bare `width` test also passes when negative.
+  if (width) {
+    const uint32 u = u_buf[x >> 17];
+    const uint32 v = v_buf[x >> 17];
+    const uint32 y = y_buf[x >> 16];
+
+    chroma = _mm_adds_epi16(
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    pixel0 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(y_table + 8 * y));
+    pixel0 = _mm_srai_epi16(_mm_adds_epi16(pixel0, chroma), 6);
+    pixel0 = _mm_packus_epi16(pixel0, pixel0);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(pixel0);
+  }
+}
+
+// Row conversion entry point with C linkage; in this file it always
+// forwards, arguments untouched, to the SSE2 implementation.
+void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width) {
+  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
+}
+
+// Point-sampled scaling entry point with C linkage; in this file it always
+// forwards, arguments untouched, to the SSE2 implementation.
+void ScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                        const uint8* v_buf, uint8* rgb_buf,
+                        int width, int source_dx) {
+  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf,
+                          width, source_dx);
+}
+
+// Blending (linear) scaling entry point with C linkage; in this file it
+// always forwards, arguments untouched, to the SSE2 implementation.
+void LinearScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
+                              const uint8* v_buf, uint8* rgb_buf,
+                              int width, int source_dx) {
+  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf,
+                                v_buf, rgb_buf,
+                                width, source_dx);
+}
+
+} // extern "C"