☠☠ backed out by cfb5b914e2b4 ☠ ☠ | |
author | Chris Double <chris.double@double.co.nz> |
Mon, 23 Aug 2010 13:47:33 +1200 | |
changeset 51230 | dbbb9575aae174c5e25b01a67ed755fdf807219c |
parent 51229 | 894a305625c263a354f8ef66962d0b8f8bd0dfcb |
child 51231 | 573c5fa45cc43cc07a6bd5c6e6a6245b6d7a3579 |
push id | unknown |
push user | unknown |
push date | unknown |
reviewers | roc, blocking |
bugs | 577743 |
milestone | 2.0b5pre |
first release with | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
last release without | nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
|
--- a/content/media/nsMediaDecoder.cpp +++ b/content/media/nsMediaDecoder.cpp @@ -45,28 +45,23 @@ #include "nsIDocument.h" #include "nsThreadUtils.h" #include "nsIDOMHTMLMediaElement.h" #include "nsNetUtil.h" #include "nsHTMLMediaElement.h" #include "nsAutoLock.h" #include "nsIRenderingContext.h" #include "gfxContext.h" -#include "gfxImageSurface.h" #include "nsPresContext.h" #include "nsDOMError.h" #include "nsDisplayList.h" #ifdef MOZ_SVG #include "nsSVGEffects.h" #endif -#if defined(XP_MACOSX) -#include "gfxQuartzImageSurface.h" -#endif - // Number of milliseconds between progress events as defined by spec #define PROGRESS_MS 350 // Number of milliseconds of no data before a stall event is fired as defined by spec #define STALL_MS 3000 nsMediaDecoder::nsMediaDecoder() : mElement(0),
--- a/gfx/layers/ImageLayers.h +++ b/gfx/layers/ImageLayers.h @@ -109,16 +109,17 @@ protected: * (because layers can only be used on the main thread) and we want to * be able to set the current Image from any thread, to facilitate * video playback without involving the main thread, for example. */ class THEBES_API ImageContainer { THEBES_INLINE_DECL_THREADSAFE_REFCOUNTING(ImageContainer) public: + ImageContainer() {} virtual ~ImageContainer() {} /** * Create an Image in one of the given formats. * Picks the "best" format from the list and creates an Image of that * format. * Returns null if this backend does not support any of the formats. */ @@ -174,16 +175,23 @@ public: /** * Set a new layer manager for this image container. It must be * either of the same type as the container's current layer manager, * or null. TRUE is returned on success. */ virtual PRBool SetLayerManager(LayerManager *aManager) = 0; + /** + * Sets a size that the image is expected to be rendered at. + * This is a hint for image backends to optimize scaling. + * Default implementation in this class is to ignore the hint. + */ + virtual void SetScaleHint(const gfxIntSize& /* aScaleHint */) { } + protected: LayerManager* mManager; ImageContainer(LayerManager* aManager) : mManager(aManager) {} }; /** * A Layer which renders an Image.
--- a/gfx/layers/basic/BasicImages.cpp +++ b/gfx/layers/basic/BasicImages.cpp @@ -99,39 +99,50 @@ protected: /** * We handle YCbCr by converting to RGB when the image is initialized * (which should be done off the main thread). The RGB results are stored * in a memory buffer and converted to a cairo surface lazily. */ class BasicPlanarYCbCrImage : public PlanarYCbCrImage, public BasicImageImplData { public: - BasicPlanarYCbCrImage() : - PlanarYCbCrImage(static_cast<BasicImageImplData*>(this)) + /** + * aScaleHint is a size that the image is expected to be rendered at. + * This is a hint for image backends to optimize scaling. + */ + BasicPlanarYCbCrImage(const gfxIntSize& aScaleHint) : + PlanarYCbCrImage(static_cast<BasicImageImplData*>(this)), + mScaleHint(aScaleHint) {} virtual void SetData(const Data& aData); virtual already_AddRefed<gfxASurface> GetAsSurface(); protected: nsAutoArrayPtr<PRUint8> mBuffer; nsCountedRef<nsMainThreadSurfaceRef> mSurface; + gfxIntSize mScaleHint; }; void BasicPlanarYCbCrImage::SetData(const Data& aData) { // Do some sanity checks to prevent integer overflow if (aData.mYSize.width > 16384 || aData.mYSize.height > 16384) { NS_ERROR("Illegal width or height"); return; } - size_t size = aData.mPicSize.width*aData.mPicSize.height*4; - mBuffer = new PRUint8[size]; + // 'prescale' is true if the scaling is to be done as part of the + // YCbCr to RGB conversion rather than on the RGB data when rendered. + PRBool prescale = mScaleHint.width > 0 && mScaleHint.height > 0; + gfxIntSize size(prescale ? mScaleHint.width : aData.mPicSize.width, + prescale ? 
mScaleHint.height : aData.mPicSize.height); + + mBuffer = new PRUint8[size.width * size.height * 4]; if (!mBuffer) { // out of memory return; } gfx::YUVType type = gfx::YV12; if (aData.mYSize.width == aData.mCbCrSize.width && aData.mYSize.height == aData.mCbCrSize.height) { @@ -144,30 +155,47 @@ BasicPlanarYCbCrImage::SetData(const Dat else if (aData.mYSize.width / 2 == aData.mCbCrSize.width && aData.mYSize.height / 2 == aData.mCbCrSize.height ) { type = gfx::YV12; } else { NS_ERROR("YCbCr format not supported"); } - // Convert from YCbCr to RGB now - gfx::ConvertYCbCrToRGB32(aData.mYChannel, + // Convert from YCbCr to RGB now, scaling the image if needed. + if (size != aData.mPicSize) { + gfx::ScaleYCbCrToRGB32(aData.mYChannel, aData.mCbChannel, aData.mCrChannel, mBuffer, - aData.mPicX, - aData.mPicY, aData.mPicSize.width, aData.mPicSize.height, + size.width, + size.height, aData.mYStride, aData.mCbCrStride, - aData.mPicSize.width*4, - type); - mSize = aData.mPicSize; + size.width*4, + type, + gfx::ROTATE_0); + } + else { + gfx::ConvertYCbCrToRGB32(aData.mYChannel, + aData.mCbChannel, + aData.mCrChannel, + mBuffer, + aData.mPicX, + aData.mPicY, + aData.mPicSize.width, + aData.mPicSize.height, + aData.mYStride, + aData.mCbCrStride, + aData.mPicSize.width*4, + type); + } + mSize = size; } static cairo_user_data_key_t imageSurfaceDataKey; static void DestroyBuffer(void* aBuffer) { delete[] static_cast<PRUint8*>(aBuffer); @@ -213,29 +241,32 @@ BasicPlanarYCbCrImage::GetAsSurface() /** * Our image container is very simple. It's really just a factory * for the image objects. We use a Monitor to synchronize access to * mImage. 
*/ class BasicImageContainer : public ImageContainer { public: BasicImageContainer(BasicLayerManager* aManager) : - ImageContainer(aManager), mMonitor("BasicImageContainer") + ImageContainer(aManager), mMonitor("BasicImageContainer"), + mScaleHint(-1, -1) {} virtual already_AddRefed<Image> CreateImage(const Image::Format* aFormats, PRUint32 aNumFormats); virtual void SetCurrentImage(Image* aImage); virtual already_AddRefed<Image> GetCurrentImage(); virtual already_AddRefed<gfxASurface> GetCurrentAsSurface(gfxIntSize* aSize); virtual gfxIntSize GetCurrentSize(); virtual PRBool SetLayerManager(LayerManager *aManager); + virtual void SetScaleHint(const gfxIntSize& aScaleHint); protected: Monitor mMonitor; nsRefPtr<Image> mImage; + gfxIntSize mScaleHint; }; /** * Returns true if aFormat is in the given format array. */ static PRBool FormatInList(const Image::Format* aFormats, PRUint32 aNumFormats, Image::Format aFormat) @@ -252,17 +283,18 @@ already_AddRefed<Image> BasicImageContainer::CreateImage(const Image::Format* aFormats, PRUint32 aNumFormats) { nsRefPtr<Image> image; // Prefer cairo surfaces because they're native for us if (FormatInList(aFormats, aNumFormats, Image::CAIRO_SURFACE)) { image = new BasicCairoImage(); } else if (FormatInList(aFormats, aNumFormats, Image::PLANAR_YCBCR)) { - image = new BasicPlanarYCbCrImage(); + MonitorAutoEnter mon(mMonitor); + image = new BasicPlanarYCbCrImage(mScaleHint); } return image.forget(); } void BasicImageContainer::SetCurrentImage(Image* aImage) { MonitorAutoEnter mon(mMonitor); @@ -298,16 +330,22 @@ BasicImageContainer::GetCurrentAsSurface gfxIntSize BasicImageContainer::GetCurrentSize() { MonitorAutoEnter mon(mMonitor); return !mImage ? 
gfxIntSize(0,0) : ToImageData(mImage)->GetSize(); } +void BasicImageContainer::SetScaleHint(const gfxIntSize& aScaleHint) +{ + MonitorAutoEnter mon(mMonitor); + mScaleHint = aScaleHint; +} + PRBool BasicImageContainer::SetLayerManager(LayerManager *aManager) { if (aManager && aManager->GetBackendType() != LayerManager::LAYERS_BASIC) { return PR_FALSE; }
--- a/gfx/ycbcr/README +++ b/gfx/ycbcr/README @@ -16,8 +16,9 @@ picture_region.patch: Change Chromium co remove_scale.patch: Removes Chromium scaling code. export.patch: Fix export for building on comm-central win64_mac64.patch: Fallback to C implementation on Windows and Mac OS X 64 bit yv24.patch: Adds YCbCr 4:4:4 support row_c_fix.patch: Fix broken C fallback code (See bug 561385). bug572034_mac_64bit.patch: Fix x86_64 linux code so it works on OS X. solaris.patch: Adds Solaris support, fallback to C implementation on SPARC +add_scale.patch: re-adds Chromium scaling code
new file mode 100644 --- /dev/null +++ b/gfx/ycbcr/add_scale.patch @@ -0,0 +1,953 @@ +diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp +index 40ce10f..7d46629 100644 +--- a/gfx/ycbcr/yuv_convert.cpp ++++ b/gfx/ycbcr/yuv_convert.cpp +@@ -82,10 +82,139 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf, + + #ifdef ARCH_CPU_X86_FAMILY + // MMX used for FastConvertYUVToRGB32Row requires emms instruction. + if (has_mmx) + EMMS(); + #endif + } + ++// Scale a frame of YUV to 32 bit ARGB. ++void ScaleYCbCrToRGB32(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int height, ++ int scaled_width, ++ int scaled_height, ++ int y_pitch, ++ int uv_pitch, ++ int rgb_pitch, ++ YUVType yuv_type, ++ Rotate view_rotate) { ++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0; ++ unsigned int x_shift = yuv_type == YV24 ? 0 : 1; ++ bool has_mmx = supports_mmx(); ++ // Diagram showing origin and direction of source sampling. ++ // ->0 4<- ++ // 7 3 ++ // ++ // 6 5 ++ // ->1 2<- ++ // Rotations that start at right side of image. ++ if ((view_rotate == ROTATE_180) || ++ (view_rotate == ROTATE_270) || ++ (view_rotate == MIRROR_ROTATE_0) || ++ (view_rotate == MIRROR_ROTATE_90)) { ++ y_buf += width - 1; ++ u_buf += width / 2 - 1; ++ v_buf += width / 2 - 1; ++ width = -width; ++ } ++ // Rotations that start at bottom of image. ++ if ((view_rotate == ROTATE_90) || ++ (view_rotate == ROTATE_180) || ++ (view_rotate == MIRROR_ROTATE_90) || ++ (view_rotate == MIRROR_ROTATE_180)) { ++ y_buf += (height - 1) * y_pitch; ++ u_buf += ((height >> y_shift) - 1) * uv_pitch; ++ v_buf += ((height >> y_shift) - 1) * uv_pitch; ++ height = -height; ++ } ++ ++ // Handle zero sized destination. 
++ if (scaled_width == 0 || scaled_height == 0) ++ return; ++ int scaled_dx = width * 16 / scaled_width; ++ int scaled_dy = height * 16 / scaled_height; ++ ++ int scaled_dx_uv = scaled_dx; ++ ++ if ((view_rotate == ROTATE_90) || ++ (view_rotate == ROTATE_270)) { ++ int tmp = scaled_height; ++ scaled_height = scaled_width; ++ scaled_width = tmp; ++ tmp = height; ++ height = width; ++ width = tmp; ++ int original_dx = scaled_dx; ++ int original_dy = scaled_dy; ++ scaled_dx = ((original_dy >> 4) * y_pitch) << 4; ++ scaled_dx_uv = ((original_dy >> 4) * uv_pitch) << 4; ++ scaled_dy = original_dx; ++ if (view_rotate == ROTATE_90) { ++ y_pitch = -1; ++ uv_pitch = -1; ++ height = -height; ++ } else { ++ y_pitch = 1; ++ uv_pitch = 1; ++ } ++ } ++ ++ for (int y = 0; y < scaled_height; ++y) { ++ uint8* dest_pixel = rgb_buf + y * rgb_pitch; ++ int scaled_y = (y * height / scaled_height); ++ const uint8* y_ptr = y_buf + scaled_y * y_pitch; ++ const uint8* u_ptr = u_buf + (scaled_y >> y_shift) * uv_pitch; ++ const uint8* v_ptr = v_buf + (scaled_y >> y_shift) * uv_pitch; ++ ++#if defined(_MSC_VER) ++ if (scaled_width == (width * 2)) { ++ DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width); ++ } else if ((scaled_dx & 15) == 0) { // Scaling by integer scale factor. ++ if (scaled_dx_uv == scaled_dx) { // Not rotated. ++ if (scaled_dx == 16) { // Not scaled ++ if (has_mmx) ++ FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width); ++ else ++ FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width, x_shift); ++ } else { // Simple scale down. 
ie half ++ ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width, scaled_dx >> 4); ++ } ++ } else { ++ RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width, ++ scaled_dx >> 4, scaled_dx_uv >> 4); ++ } ++#else ++ if (scaled_dx == 16) { // Not scaled ++ if (has_mmx) ++ FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width); ++ else ++ FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width, x_shift); ++#endif ++ } else { ++ if (has_mmx) ++ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width, scaled_dx); ++ else ++ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, ++ dest_pixel, scaled_width, scaled_dx, x_shift); ++ ++ } ++ } ++ ++ // MMX used for FastConvertYUVToRGB32Row requires emms instruction. ++ if (has_mmx) ++ EMMS(); ++} ++ + } // namespace gfx + } // namespace mozilla +diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h +index c0b678d..a7e5b68 100644 +--- a/gfx/ycbcr/yuv_convert.h ++++ b/gfx/ycbcr/yuv_convert.h +@@ -15,27 +15,56 @@ namespace gfx { + // Type of YUV surface. + // The value of these enums matter as they are used to shift vertical indices. + enum YUVType { + YV12 = 0, // YV12 is half width and half height chroma channels. + YV16 = 1, // YV16 is half width and full height chroma channels. + YV24 = 2 // YV24 is full width and full height chroma channels. + }; + ++// Mirror means flip the image horizontally, as in looking in a mirror. ++// Rotate happens after mirroring. ++enum Rotate { ++ ROTATE_0, // Rotation off. ++ ROTATE_90, // Rotate clockwise. ++ ROTATE_180, // Rotate upside down. ++ ROTATE_270, // Rotate counter clockwise. ++ MIRROR_ROTATE_0, // Mirror horizontally. ++ MIRROR_ROTATE_90, // Mirror then Rotate clockwise. ++ MIRROR_ROTATE_180, // Mirror vertically. ++ MIRROR_ROTATE_270 // Transpose. ++}; ++ + // Convert a frame of YUV to 32 bit ARGB. 
+ // Pass in YV16/YV12 depending on source format + NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int pic_x, + int pic_y, + int pic_width, + int pic_height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type); + ++// Scale a frame of YUV to 32 bit ARGB. ++// Supports rotation and mirroring. ++void ScaleYCbCrToRGB32(const uint8* yplane, ++ const uint8* uplane, ++ const uint8* vplane, ++ uint8* rgbframe, ++ int frame_width, ++ int frame_height, ++ int scaled_width, ++ int scaled_height, ++ int ystride, ++ int uvstride, ++ int rgbstride, ++ YUVType yuv_type, ++ Rotate view_rotate); ++ + } // namespace gfx + } // namespace mozilla + + #endif // MEDIA_BASE_YUV_CONVERT_H_ +diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h +index 8519008..96969ec 100644 +--- a/gfx/ycbcr/yuv_row.h ++++ b/gfx/ycbcr/yuv_row.h +@@ -24,16 +24,64 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, + void FastConvertYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + unsigned int x_shift); + + ++// Can do 1x, half size or any scale down by an integer amount. ++// Step can be negative (mirroring, rotate 180). ++// This is the third fastest of the scalers. ++void ConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int step); ++ ++// Rotate is like Convert, but applies different step to Y versus U and V. ++// This allows rotation by 90 or 270, by stepping by stride. ++// This is the forth fastest of the scalers. ++void RotateConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int ystep, ++ int uvstep); ++ ++// Doubler does 4 pixels at a time. Each pixel is replicated. ++// This is the fastest of the scalers. 
++void DoubleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width); ++ ++// Handles arbitrary scaling up or down. ++// Mirroring is supported, but not 90 or 270 degree rotation. ++// Chroma is under sampled every 2 pixels for performance. ++// This is the slowest of the scalers. ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx); ++ ++void ScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx, ++ unsigned int x_shift); ++ + } // extern "C" + + // x64 uses MMX2 (SSE) so emms is not required. + #if defined(ARCH_CPU_X86) + #if defined(_MSC_VER) + #define EMMS() __asm emms + #else + #define EMMS() asm("emms") +diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp +index b5c0018..49eced2 100644 +--- a/gfx/ycbcr/yuv_row_c.cpp ++++ b/gfx/ycbcr/yuv_row_c.cpp +@@ -172,10 +172,31 @@ void FastConvertYUVToRGB32Row_C(const uint8* y_buf, + v = v_buf[x + 1]; + } + YuvPixel(y1, u, v, rgb_buf + 4); + } + rgb_buf += 8; // Advance 2 pixels. + } + } + ++// 28.4 fixed point is used. A shift by 4 isolates the integer. ++// A shift by 5 is used to further subsample the chrominence channels. ++// & 15 isolates the fixed point fraction. >> 2 to get the upper 2 bits, ++// for 1/4 pixel accurate interpolation. 
++void ScaleYUVToRGB32Row_C(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx, ++ unsigned int x_shift) { ++ int scaled_x = 0; ++ for (int x = 0; x < width; ++x) { ++ uint8 u = u_buf[scaled_x >> (4 + x_shift)]; ++ uint8 v = v_buf[scaled_x >> (4 + x_shift)]; ++ uint8 y0 = y_buf[scaled_x >> 4]; ++ YuvPixel(y0, u, v, rgb_buf); ++ rgb_buf += 4; ++ scaled_x += scaled_dx; ++ } ++} + } // extern "C" + +diff --git a/gfx/ycbcr/yuv_row_linux.cpp b/gfx/ycbcr/yuv_row_linux.cpp +index 9f7625c..bff02b3 100644 +--- a/gfx/ycbcr/yuv_row_linux.cpp ++++ b/gfx/ycbcr/yuv_row_linux.cpp +@@ -16,16 +16,24 @@ extern "C" { + void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); + } + ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx) { ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1); ++} + #else + + #define RGBY(i) { \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + 0 \ + } +@@ -365,16 +373,86 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" + ); + } ++ ++void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi ++ const uint8* u_buf, // rsi ++ const uint8* v_buf, // rdx ++ uint8* rgb_buf, // rcx ++ int width, // r8 ++ int scaled_dx) { // r9 ++ asm( ++ "xor %%r11,%%r11\n" ++ "sub $0x2,%4\n" ++ "js scalenext\n" ++ ++"scaleloop:" ++ "mov %%r11,%%r10\n" ++ "sar $0x5,%%r10\n" ++ "movzb (%1,%%r10,1),%%rax\n" ++ "movq 2048(%5,%%rax,8),%%xmm0\n" ++ "movzb (%2,%%r10,1),%%rax\n" ++ "movq 
4096(%5,%%rax,8),%%xmm1\n" ++ "lea (%%r11,%6),%%r10\n" ++ "sar $0x4,%%r11\n" ++ "movzb (%0,%%r11,1),%%rax\n" ++ "paddsw %%xmm1,%%xmm0\n" ++ "movq (%5,%%rax,8),%%xmm1\n" ++ "lea (%%r10,%6),%%r11\n" ++ "sar $0x4,%%r10\n" ++ "movzb (%0,%%r10,1),%%rax\n" ++ "movq (%5,%%rax,8),%%xmm2\n" ++ "paddsw %%xmm0,%%xmm1\n" ++ "paddsw %%xmm0,%%xmm2\n" ++ "shufps $0x44,%%xmm2,%%xmm1\n" ++ "psraw $0x6,%%xmm1\n" ++ "packuswb %%xmm1,%%xmm1\n" ++ "movq %%xmm1,0x0(%3)\n" ++ "add $0x8,%3\n" ++ "sub $0x2,%4\n" ++ "jns scaleloop\n" ++ ++"scalenext:" ++ "add $0x1,%4\n" ++ "js scaledone\n" ++ ++ "mov %%r11,%%r10\n" ++ "sar $0x5,%%r10\n" ++ "movzb (%1,%%r10,1),%%rax\n" ++ "movq 2048(%5,%%rax,8),%%xmm0\n" ++ "movzb (%2,%%r10,1),%%rax\n" ++ "movq 4096(%5,%%rax,8),%%xmm1\n" ++ "paddsw %%xmm1,%%xmm0\n" ++ "sar $0x4,%%r11\n" ++ "movzb (%0,%%r11,1),%%rax\n" ++ "movq (%5,%%rax,8),%%xmm1\n" ++ "paddsw %%xmm0,%%xmm1\n" ++ "psraw $0x6,%%xmm1\n" ++ "packuswb %%xmm1,%%xmm1\n" ++ "movd %%xmm1,0x0(%3)\n" ++ ++"scaledone:" ++ : ++ : "r"(y_buf), // %0 ++ "r"(u_buf), // %1 ++ "r"(v_buf), // %2 ++ "r"(rgb_buf), // %3 ++ "r"(width), // %4 ++ "r" (kCoefficientsRgbY), // %5 ++ "r"(static_cast<long>(scaled_dx)) // %6 ++ : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" ++); ++} ++ + #endif // __SUNPRO_CC + + #else // ARCH_CPU_X86_64 + + #ifdef __SUNPRO_CC + void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, +@@ -493,13 +571,87 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + "2:" + "popa\n" + "ret\n" + ".previous\n" + ); + ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx); ++ ++ asm( ++ ".global ScaleYUVToRGB32Row\n" ++"ScaleYUVToRGB32Row:\n" ++ "pusha\n" ++ "mov 0x24(%esp),%edx\n" ++ "mov 0x28(%esp),%edi\n" ++ "mov 0x2c(%esp),%esi\n" ++ "mov 0x30(%esp),%ebp\n" ++ "mov 0x34(%esp),%ecx\n" ++ "xor %ebx,%ebx\n" ++ "jmp 
scaleend\n" ++ ++"scaleloop:" ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%edi,%eax,1),%eax\n" ++ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%esi,%eax,1),%eax\n" ++ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" ++ "mov %ebx,%eax\n" ++ "add 0x38(%esp),%ebx\n" ++ "sar $0x4,%eax\n" ++ "movzbl (%edx,%eax,1),%eax\n" ++ "movq kCoefficientsRgbY(,%eax,8),%mm1\n" ++ "mov %ebx,%eax\n" ++ "add 0x38(%esp),%ebx\n" ++ "sar $0x4,%eax\n" ++ "movzbl (%edx,%eax,1),%eax\n" ++ "movq kCoefficientsRgbY(,%eax,8),%mm2\n" ++ "paddsw %mm0,%mm1\n" ++ "paddsw %mm0,%mm2\n" ++ "psraw $0x6,%mm1\n" ++ "psraw $0x6,%mm2\n" ++ "packuswb %mm2,%mm1\n" ++ "movntq %mm1,0x0(%ebp)\n" ++ "add $0x8,%ebp\n" ++"scaleend:" ++ "sub $0x2,%ecx\n" ++ "jns scaleloop\n" ++ ++ "and $0x1,%ecx\n" ++ "je scaledone\n" ++ ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%edi,%eax,1),%eax\n" ++ "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%esi,%eax,1),%eax\n" ++ "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" ++ "mov %ebx,%eax\n" ++ "sar $0x4,%eax\n" ++ "movzbl (%edx,%eax,1),%eax\n" ++ "movq kCoefficientsRgbY(,%eax,8),%mm1\n" ++ "paddsw %mm0,%mm1\n" ++ "psraw $0x6,%mm1\n" ++ "packuswb %mm1,%mm1\n" ++ "movd %mm1,0x0(%ebp)\n" ++ ++"scaledone:" ++ "popa\n" ++ "ret\n" ++); ++ + #endif // __SUNPRO_CC + #endif // ARCH_CPU_X86_64 + #endif // !ARCH_CPU_X86_FAMILY + } // extern "C" + +diff --git a/gfx/ycbcr/yuv_row_mac.cpp b/gfx/ycbcr/yuv_row_mac.cpp +index a1d0058..5acf825 100644 +--- a/gfx/ycbcr/yuv_row_mac.cpp ++++ b/gfx/ycbcr/yuv_row_mac.cpp +@@ -16,16 +16,24 @@ extern "C" { + void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); + } + ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ 
int scaled_dx) { ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1); ++} + #else + + #define RGBY(i) { \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + 0 \ + } +@@ -313,11 +321,96 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + MacConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, + &kCoefficientsRgbY[0][0]); + } + ++extern void MacScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx, ++ int16 *kCoefficientsRgbY); ++ ++ __asm__( ++"_MacScaleYUVToRGB32Row:\n" ++ "pusha\n" ++ "mov 0x24(%esp),%edx\n" ++ "mov 0x28(%esp),%edi\n" ++ "mov 0x2c(%esp),%esi\n" ++ "mov 0x30(%esp),%ebp\n" ++ "mov 0x3c(%esp),%ecx\n" ++ "xor %ebx,%ebx\n" ++ "jmp Lscaleend\n" ++ ++"Lscaleloop:" ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%edi,%eax,1),%eax\n" ++ "movq 2048(%ecx,%eax,8),%mm0\n" ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%esi,%eax,1),%eax\n" ++ "paddsw 4096(%ecx,%eax,8),%mm0\n" ++ "mov %ebx,%eax\n" ++ "add 0x38(%esp),%ebx\n" ++ "sar $0x4,%eax\n" ++ "movzbl (%edx,%eax,1),%eax\n" ++ "movq 0(%ecx,%eax,8),%mm1\n" ++ "mov %ebx,%eax\n" ++ "add 0x38(%esp),%ebx\n" ++ "sar $0x4,%eax\n" ++ "movzbl (%edx,%eax,1),%eax\n" ++ "movq 0(%ecx,%eax,8),%mm2\n" ++ "paddsw %mm0,%mm1\n" ++ "paddsw %mm0,%mm2\n" ++ "psraw $0x6,%mm1\n" ++ "psraw $0x6,%mm2\n" ++ "packuswb %mm2,%mm1\n" ++ "movntq %mm1,0x0(%ebp)\n" ++ "add $0x8,%ebp\n" ++"Lscaleend:" ++ "sub $0x2,0x34(%esp)\n" ++ "jns Lscaleloop\n" ++ ++ "and $0x1,0x34(%esp)\n" ++ "je Lscaledone\n" ++ ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%edi,%eax,1),%eax\n" ++ "movq 2048(%ecx,%eax,8),%mm0\n" ++ "mov %ebx,%eax\n" ++ "sar $0x5,%eax\n" ++ "movzbl (%esi,%eax,1),%eax\n" ++ "paddsw 4096(%ecx,%eax,8),%mm0\n" ++ "mov 
%ebx,%eax\n" ++ "sar $0x4,%eax\n" ++ "movzbl (%edx,%eax,1),%eax\n" ++ "movq 0(%ecx,%eax,8),%mm1\n" ++ "paddsw %mm0,%mm1\n" ++ "psraw $0x6,%mm1\n" ++ "packuswb %mm1,%mm1\n" ++ "movd %mm1,0x0(%ebp)\n" ++ ++"Lscaledone:" ++ "popa\n" ++ "ret\n" ++); ++ ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx) { ++ ++ MacScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, ++ &kCoefficientsRgbY[0][0]); ++} ++ + #endif // ARCH_CPU_PPC || ARCH_CPU_64_BITS + } // extern "C" + +diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp +index 699ac77..a1700fc 100644 +--- a/gfx/ycbcr/yuv_row_win.cpp ++++ b/gfx/ycbcr/yuv_row_win.cpp +@@ -11,17 +11,26 @@ extern "C" { + // PPC implementation uses C fallback + void FastConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); + } +- ++ ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int scaled_dx) { ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1); ++} ++ + #else + + + #define RGBY(i) { \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ + 0 \ +@@ -307,11 +316,280 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, + movd [ebp], mm1 + convertdone : + + popad + ret + } + } + ++__declspec(naked) ++void ConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int step) { ++ __asm { ++ pushad ++ mov edx, [esp + 32 + 4] // Y ++ mov edi, [esp + 32 + 8] // U ++ mov esi, [esp + 32 + 12] // V ++ mov ebp, [esp + 32 + 16] // rgb ++ mov ecx, [esp + 32 + 20] // width ++ mov ebx, [esp + 32 + 24] // step ++ jmp wend ++ ++ wloop : ++ 
movzx eax, byte ptr [edi] ++ add edi, ebx ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ movzx eax, byte ptr [esi] ++ add esi, ebx ++ paddsw mm0, [kCoefficientsRgbV + 8 * eax] ++ movzx eax, byte ptr [edx] ++ add edx, ebx ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ movzx eax, byte ptr [edx] ++ add edx, ebx ++ movq mm2, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ paddsw mm2, mm0 ++ psraw mm1, 6 ++ psraw mm2, 6 ++ packuswb mm1, mm2 ++ movntq [ebp], mm1 ++ add ebp, 8 ++ wend : ++ sub ecx, 2 ++ jns wloop ++ ++ and ecx, 1 // odd number of pixels? ++ jz wdone ++ ++ movzx eax, byte ptr [edi] ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ movzx eax, byte ptr [esi] ++ paddsw mm0, [kCoefficientsRgbV + 8 * eax] ++ movzx eax, byte ptr [edx] ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ psraw mm1, 6 ++ packuswb mm1, mm1 ++ movd [ebp], mm1 ++ wdone : ++ ++ popad ++ ret ++ } ++} ++ ++__declspec(naked) ++void RotateConvertYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int ystep, ++ int uvstep) { ++ __asm { ++ pushad ++ mov edx, [esp + 32 + 4] // Y ++ mov edi, [esp + 32 + 8] // U ++ mov esi, [esp + 32 + 12] // V ++ mov ebp, [esp + 32 + 16] // rgb ++ mov ecx, [esp + 32 + 20] // width ++ jmp wend ++ ++ wloop : ++ movzx eax, byte ptr [edi] ++ mov ebx, [esp + 32 + 28] // uvstep ++ add edi, ebx ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ movzx eax, byte ptr [esi] ++ add esi, ebx ++ paddsw mm0, [kCoefficientsRgbV + 8 * eax] ++ movzx eax, byte ptr [edx] ++ mov ebx, [esp + 32 + 24] // ystep ++ add edx, ebx ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ movzx eax, byte ptr [edx] ++ add edx, ebx ++ movq mm2, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ paddsw mm2, mm0 ++ psraw mm1, 6 ++ psraw mm2, 6 ++ packuswb mm1, mm2 ++ movntq [ebp], mm1 ++ add ebp, 8 ++ wend : ++ sub ecx, 2 ++ jns wloop ++ ++ and ecx, 1 // odd number of pixels? 
++ jz wdone ++ ++ movzx eax, byte ptr [edi] ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ movzx eax, byte ptr [esi] ++ paddsw mm0, [kCoefficientsRgbV + 8 * eax] ++ movzx eax, byte ptr [edx] ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ psraw mm1, 6 ++ packuswb mm1, mm1 ++ movd [ebp], mm1 ++ wdone : ++ ++ popad ++ ret ++ } ++} ++ ++__declspec(naked) ++void DoubleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width) { ++ __asm { ++ pushad ++ mov edx, [esp + 32 + 4] // Y ++ mov edi, [esp + 32 + 8] // U ++ mov esi, [esp + 32 + 12] // V ++ mov ebp, [esp + 32 + 16] // rgb ++ mov ecx, [esp + 32 + 20] // width ++ jmp wend ++ ++ wloop : ++ movzx eax, byte ptr [edi] ++ add edi, 1 ++ movzx ebx, byte ptr [esi] ++ add esi, 1 ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ movzx eax, byte ptr [edx] ++ paddsw mm0, [kCoefficientsRgbV + 8 * ebx] ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ psraw mm1, 6 ++ packuswb mm1, mm1 ++ punpckldq mm1, mm1 ++ movntq [ebp], mm1 ++ ++ movzx ebx, byte ptr [edx + 1] ++ add edx, 2 ++ paddsw mm0, [kCoefficientsRgbY + 8 * ebx] ++ psraw mm0, 6 ++ packuswb mm0, mm0 ++ punpckldq mm0, mm0 ++ movntq [ebp+8], mm0 ++ add ebp, 16 ++ wend : ++ sub ecx, 4 ++ jns wloop ++ ++ add ecx, 4 ++ jz wdone ++ ++ movzx eax, byte ptr [edi] ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ movzx eax, byte ptr [esi] ++ paddsw mm0, [kCoefficientsRgbV + 8 * eax] ++ movzx eax, byte ptr [edx] ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ psraw mm1, 6 ++ packuswb mm1, mm1 ++ jmp wend1 ++ ++ wloop1 : ++ movd [ebp], mm1 ++ add ebp, 4 ++ wend1 : ++ sub ecx, 1 ++ jns wloop1 ++ wdone : ++ popad ++ ret ++ } ++} ++ ++// This version does general purpose scaling by any amount, up or down. ++// The only thing it can not do it rotation by 90 or 270. ++// For performance the chroma is under sampled, reducing cost of a 3x ++// 1080p scale from 8.4 ms to 5.4 ms. 
++__declspec(naked) ++void ScaleYUVToRGB32Row(const uint8* y_buf, ++ const uint8* u_buf, ++ const uint8* v_buf, ++ uint8* rgb_buf, ++ int width, ++ int dx) { ++ __asm { ++ pushad ++ mov edx, [esp + 32 + 4] // Y ++ mov edi, [esp + 32 + 8] // U ++ mov esi, [esp + 32 + 12] // V ++ mov ebp, [esp + 32 + 16] // rgb ++ mov ecx, [esp + 32 + 20] // width ++ xor ebx, ebx // x ++ jmp scaleend ++ ++ scaleloop : ++ mov eax, ebx ++ sar eax, 5 ++ movzx eax, byte ptr [edi + eax] ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ mov eax, ebx ++ sar eax, 5 ++ movzx eax, byte ptr [esi + eax] ++ paddsw mm0, [kCoefficientsRgbV + 8 * eax] ++ mov eax, ebx ++ add ebx, [esp + 32 + 24] // x += dx ++ sar eax, 4 ++ movzx eax, byte ptr [edx + eax] ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ mov eax, ebx ++ add ebx, [esp + 32 + 24] // x += dx ++ sar eax, 4 ++ movzx eax, byte ptr [edx + eax] ++ movq mm2, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ paddsw mm2, mm0 ++ psraw mm1, 6 ++ psraw mm2, 6 ++ packuswb mm1, mm2 ++ movntq [ebp], mm1 ++ add ebp, 8 ++ scaleend : ++ sub ecx, 2 ++ jns scaleloop ++ ++ and ecx, 1 // odd number of pixels? ++ jz scaledone ++ ++ mov eax, ebx ++ sar eax, 5 ++ movzx eax, byte ptr [edi + eax] ++ movq mm0, [kCoefficientsRgbU + 8 * eax] ++ mov eax, ebx ++ sar eax, 5 ++ movzx eax, byte ptr [esi + eax] ++ paddsw mm0, [kCoefficientsRgbV + 8 * eax] ++ mov eax, ebx ++ sar eax, 4 ++ movzx eax, byte ptr [edx + eax] ++ movq mm1, [kCoefficientsRgbY + 8 * eax] ++ paddsw mm1, mm0 ++ psraw mm1, 6 ++ packuswb mm1, mm1 ++ movd [ebp], mm1 ++ ++ scaledone : ++ popad ++ ret ++ } ++} ++ + #endif // ARCH_CPU_64_BITS + } // extern "C" +
--- a/gfx/ycbcr/update.sh +++ b/gfx/ycbcr/update.sh @@ -10,8 +10,9 @@ patch -p3 <convert.patch patch -p3 <picture_region.patch patch -p3 <remove_scale.patch patch -p3 <export.patch patch -p3 <win64_mac64.patch patch -p3 <yv24.patch patch -p3 <row_c_fix.patch patch -p3 <bug572034_mac_64bit.patch patch -p3 <bug577645_movntq.patch +patch -p3 <add_scale.patch
--- a/gfx/ycbcr/yuv_convert.cpp +++ b/gfx/ycbcr/yuv_convert.cpp @@ -84,10 +84,139 @@ NS_GFX_(void) ConvertYCbCrToRGB32(const #ifdef ARCH_CPU_X86_FAMILY // SSE used for FastConvertYUVToRGB32Row requires emms instruction. if (has_sse) EMMS(); #endif } +// Scale a frame of YUV to 32 bit ARGB. +void ScaleYCbCrToRGB32(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int height, + int scaled_width, + int scaled_height, + int y_pitch, + int uv_pitch, + int rgb_pitch, + YUVType yuv_type, + Rotate view_rotate) { + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; + unsigned int x_shift = yuv_type == YV24 ? 0 : 1; + bool has_mmx = supports_mmx(); + // Diagram showing origin and direction of source sampling. + // ->0 4<- + // 7 3 + // + // 6 5 + // ->1 2<- + // Rotations that start at right side of image. + if ((view_rotate == ROTATE_180) || + (view_rotate == ROTATE_270) || + (view_rotate == MIRROR_ROTATE_0) || + (view_rotate == MIRROR_ROTATE_90)) { + y_buf += width - 1; + u_buf += width / 2 - 1; + v_buf += width / 2 - 1; + width = -width; + } + // Rotations that start at bottom of image. + if ((view_rotate == ROTATE_90) || + (view_rotate == ROTATE_180) || + (view_rotate == MIRROR_ROTATE_90) || + (view_rotate == MIRROR_ROTATE_180)) { + y_buf += (height - 1) * y_pitch; + u_buf += ((height >> y_shift) - 1) * uv_pitch; + v_buf += ((height >> y_shift) - 1) * uv_pitch; + height = -height; + } + + // Handle zero sized destination. 
+ if (scaled_width == 0 || scaled_height == 0) + return; + int scaled_dx = width * 16 / scaled_width; + int scaled_dy = height * 16 / scaled_height; + + int scaled_dx_uv = scaled_dx; + + if ((view_rotate == ROTATE_90) || + (view_rotate == ROTATE_270)) { + int tmp = scaled_height; + scaled_height = scaled_width; + scaled_width = tmp; + tmp = height; + height = width; + width = tmp; + int original_dx = scaled_dx; + int original_dy = scaled_dy; + scaled_dx = ((original_dy >> 4) * y_pitch) << 4; + scaled_dx_uv = ((original_dy >> 4) * uv_pitch) << 4; + scaled_dy = original_dx; + if (view_rotate == ROTATE_90) { + y_pitch = -1; + uv_pitch = -1; + height = -height; + } else { + y_pitch = 1; + uv_pitch = 1; + } + } + + for (int y = 0; y < scaled_height; ++y) { + uint8* dest_pixel = rgb_buf + y * rgb_pitch; + int scaled_y = (y * height / scaled_height); + const uint8* y_ptr = y_buf + scaled_y * y_pitch; + const uint8* u_ptr = u_buf + (scaled_y >> y_shift) * uv_pitch; + const uint8* v_ptr = v_buf + (scaled_y >> y_shift) * uv_pitch; + +#if defined(_MSC_VER) + if (scaled_width == (width * 2)) { + DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width); + } else if ((scaled_dx & 15) == 0) { // Scaling by integer scale factor. + if (scaled_dx_uv == scaled_dx) { // Not rotated. + if (scaled_dx == 16) { // Not scaled + if (has_mmx) + FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width); + else + FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width, x_shift); + } else { // Simple scale down. 
ie half + ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width, scaled_dx >> 4); + } + } else { + RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width, + scaled_dx >> 4, scaled_dx_uv >> 4); + } +#else + if (scaled_dx == 16) { // Not scaled + if (has_mmx) + FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width); + else + FastConvertYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width, x_shift); +#endif + } else { + if (has_mmx) + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width, scaled_dx); + else + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, + dest_pixel, scaled_width, scaled_dx, x_shift); + + } + } + + // MMX used for FastConvertYUVToRGB32Row requires emms instruction. + if (has_mmx) + EMMS(); +} + } // namespace gfx } // namespace mozilla
--- a/gfx/ycbcr/yuv_convert.h +++ b/gfx/ycbcr/yuv_convert.h @@ -15,27 +15,56 @@ namespace gfx { // Type of YUV surface. // The value of these enums matter as they are used to shift vertical indices. enum YUVType { YV12 = 0, // YV12 is half width and half height chroma channels. YV16 = 1, // YV16 is half width and full height chroma channels. YV24 = 2 // YV24 is full width and full height chroma channels. }; +// Mirror means flip the image horizontally, as in looking in a mirror. +// Rotate happens after mirroring. +enum Rotate { + ROTATE_0, // Rotation off. + ROTATE_90, // Rotate clockwise. + ROTATE_180, // Rotate upside down. + ROTATE_270, // Rotate counter clockwise. + MIRROR_ROTATE_0, // Mirror horizontally. + MIRROR_ROTATE_90, // Mirror then Rotate clockwise. + MIRROR_ROTATE_180, // Mirror vertically. + MIRROR_ROTATE_270 // Transpose. +}; + // Convert a frame of YUV to 32 bit ARGB. // Pass in YV16/YV12 depending on source format NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane, const uint8* uplane, const uint8* vplane, uint8* rgbframe, int pic_x, int pic_y, int pic_width, int pic_height, int ystride, int uvstride, int rgbstride, YUVType yuv_type); +// Scale a frame of YUV to 32 bit ARGB. +// Supports rotation and mirroring. +void ScaleYCbCrToRGB32(const uint8* yplane, + const uint8* uplane, + const uint8* vplane, + uint8* rgbframe, + int frame_width, + int frame_height, + int scaled_width, + int scaled_height, + int ystride, + int uvstride, + int rgbstride, + YUVType yuv_type, + Rotate view_rotate); + } // namespace gfx } // namespace mozilla #endif // MEDIA_BASE_YUV_CONVERT_H_
--- a/gfx/ycbcr/yuv_row.h +++ b/gfx/ycbcr/yuv_row.h @@ -24,16 +24,64 @@ void FastConvertYUVToRGB32Row(const uint void FastConvertYUVToRGB32Row_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width, unsigned int x_shift); +// Can do 1x, half size or any scale down by an integer amount. +// Step can be negative (mirroring, rotate 180). +// This is the third fastest of the scalers. +void ConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int step); + +// Rotate is like Convert, but applies different step to Y versus U and V. +// This allows rotation by 90 or 270, by stepping by stride. +// This is the fourth fastest of the scalers. +void RotateConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int ystep, + int uvstep); + +// Doubler does 4 pixels at a time. Each pixel is replicated. +// This is the fastest of the scalers. +void DoubleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +// Handles arbitrary scaling up or down. +// Mirroring is supported, but not 90 or 270 degree rotation. +// Chroma is undersampled every 2 pixels for performance. +// This is the slowest of the scalers. +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx); + +void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx, + unsigned int x_shift); + } // extern "C" // x64 uses MMX2 (SSE) so emms is not required. #if defined(ARCH_CPU_X86) #if defined(_MSC_VER) #define EMMS() __asm emms #else #define EMMS() asm("emms")
--- a/gfx/ycbcr/yuv_row_c.cpp +++ b/gfx/ycbcr/yuv_row_c.cpp @@ -172,10 +172,31 @@ void FastConvertYUVToRGB32Row_C(const ui v = v_buf[x + 1]; } YuvPixel(y1, u, v, rgb_buf + 4); } rgb_buf += 8; // Advance 2 pixels. } } +// 28.4 fixed point is used. A shift by 4 isolates the integer. +// A shift by (4 + x_shift) further subsamples the chrominance channels. +// & 15 would isolate the fixed point fraction, but this routine does not +// interpolate: it point samples using only the integer part. +void ScaleYUVToRGB32Row_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx, + unsigned int x_shift) { + int scaled_x = 0; + for (int x = 0; x < width; ++x) { + uint8 u = u_buf[scaled_x >> (4 + x_shift)]; + uint8 v = v_buf[scaled_x >> (4 + x_shift)]; + uint8 y0 = y_buf[scaled_x >> 4]; + YuvPixel(y0, u, v, rgb_buf); + rgb_buf += 4; + scaled_x += scaled_dx; + } +} } // extern "C"
--- a/gfx/ycbcr/yuv_row_linux.cpp +++ b/gfx/ycbcr/yuv_row_linux.cpp @@ -16,16 +16,24 @@ extern "C" { void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); } +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx) { + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1); +} #else #define RGBY(i) { \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ 0 \ } @@ -365,16 +373,86 @@ void FastConvertYUVToRGB32Row(const uint "r"(u_buf), // %1 "r"(v_buf), // %2 "r"(rgb_buf), // %3 "r"(width), // %4 "r" (kCoefficientsRgbY) // %5 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" ); } + +void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width, // r8 + int scaled_dx) { // r9 + asm( + "xor %%r11,%%r11\n" + "sub $0x2,%4\n" + "js scalenext\n" + +"scaleloop:" + "mov %%r11,%%r10\n" + "sar $0x5,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + "movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "lea (%%r11,%6),%%r10\n" + "sar $0x4,%%r11\n" + "movzb (%0,%%r11,1),%%rax\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%rax,8),%%xmm1\n" + "lea (%%r10,%6),%%r11\n" + "sar $0x4,%%r10\n" + "movzb (%0,%%r10,1),%%rax\n" + "movq (%5,%%rax,8),%%xmm2\n" + "paddsw %%xmm0,%%xmm1\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movq %%xmm1,0x0(%3)\n" + "add $0x8,%3\n" + "sub $0x2,%4\n" + "jns scaleloop\n" + +"scalenext:" + "add $0x1,%4\n" + "js scaledone\n" + + "mov %%r11,%%r10\n" + "sar $0x5,%%r10\n" + "movzb (%1,%%r10,1),%%rax\n" + "movq 2048(%5,%%rax,8),%%xmm0\n" + 
"movzb (%2,%%r10,1),%%rax\n" + "movq 4096(%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "sar $0x4,%%r11\n" + "movzb (%0,%%r11,1),%%rax\n" + "movq (%5,%%rax,8),%%xmm1\n" + "paddsw %%xmm0,%%xmm1\n" + "psraw $0x6,%%xmm1\n" + "packuswb %%xmm1,%%xmm1\n" + "movd %%xmm1,0x0(%3)\n" + +"scaledone:" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (kCoefficientsRgbY), // %5 + "r"(static_cast<long>(scaled_dx)) // %6 + : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" +); +} + #endif // __SUNPRO_CC #else // ARCH_CPU_X86_64 #ifdef __SUNPRO_CC void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -493,13 +571,87 @@ void FastConvertYUVToRGB32Row(const uint "packuswb %mm1,%mm1\n" "movd %mm1,0x0(%ebp)\n" "2:" "popa\n" "ret\n" ".previous\n" ); +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx); + + asm( + ".global ScaleYUVToRGB32Row\n" +"ScaleYUVToRGB32Row:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + "xor %ebx,%ebx\n" + "jmp scaleend\n" + +"scaleloop:" + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x4,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x4,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"scaleend:" + "sub $0x2,%ecx\n" + "jns scaleloop\n" + + 
"and $0x1,%ecx\n" + "je scaledone\n" + + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x4,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +"scaledone:" + "popa\n" + "ret\n" +); + #endif // __SUNPRO_CC #endif // ARCH_CPU_X86_64 #endif // !ARCH_CPU_X86_FAMILY } // extern "C"
--- a/gfx/ycbcr/yuv_row_mac.cpp +++ b/gfx/ycbcr/yuv_row_mac.cpp @@ -16,16 +16,24 @@ extern "C" { void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); } +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx) { + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1); +} #else #define RGBY(i) { \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ 0 \ } @@ -313,11 +321,96 @@ void FastConvertYUVToRGB32Row(const uint const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { MacConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, &kCoefficientsRgbY[0][0]); } +extern void MacScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx, + int16 *kCoefficientsRgbY); + + __asm__( +"_MacScaleYUVToRGB32Row:\n" + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x3c(%esp),%ecx\n" + "xor %ebx,%ebx\n" + "jmp Lscaleend\n" + +"Lscaleloop:" + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x4,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "mov %ebx,%eax\n" + "add 0x38(%esp),%ebx\n" + "sar $0x4,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movntq %mm1,0x0(%ebp)\n" + "add $0x8,%ebp\n" +"Lscaleend:" + "sub 
$0x2,0x34(%esp)\n" + "jns Lscaleloop\n" + + "and $0x1,0x34(%esp)\n" + "je Lscaledone\n" + + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%edi,%eax,1),%eax\n" + "movq 2048(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x5,%eax\n" + "movzbl (%esi,%eax,1),%eax\n" + "paddsw 4096(%ecx,%eax,8),%mm0\n" + "mov %ebx,%eax\n" + "sar $0x4,%eax\n" + "movzbl (%edx,%eax,1),%eax\n" + "movq 0(%ecx,%eax,8),%mm1\n" + "paddsw %mm0,%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm1\n" + "movd %mm1,0x0(%ebp)\n" + +"Lscaledone:" + "popa\n" + "ret\n" +); + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx) { + + MacScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, + &kCoefficientsRgbY[0][0]); +} + #endif // ARCH_CPU_PPC || ARCH_CPU_64_BITS } // extern "C"
--- a/gfx/ycbcr/yuv_row_win.cpp +++ b/gfx/ycbcr/yuv_row_win.cpp @@ -11,17 +11,26 @@ extern "C" { // PPC implementation uses C fallback void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); } - + +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int scaled_dx) { + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, scaled_dx, 1); +} + #else #define RGBY(i) { \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ 0 \ @@ -307,11 +316,280 @@ void FastConvertYUVToRGB32Row(const uint movd [ebp], mm1 convertdone : popad ret } } +__declspec(naked) +void ConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int step) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + mov ebx, [esp + 32 + 24] // step + jmp wend + + wloop : + movzx eax, byte ptr [edi] + add edi, ebx + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + add esi, ebx + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm1, [kCoefficientsRgbY + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + wend : + sub ecx, 2 + jns wloop + + and ecx, 1 // odd number of pixels? 
+ jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + wdone : + + popad + ret + } +} + +__declspec(naked) +void RotateConvertYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int ystep, + int uvstep) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend + + wloop : + movzx eax, byte ptr [edi] + mov ebx, [esp + 32 + 28] // uvstep + add edi, ebx + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + add esi, ebx + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + mov ebx, [esp + 32 + 24] // ystep + add edx, ebx + movq mm1, [kCoefficientsRgbY + 8 * eax] + movzx eax, byte ptr [edx] + add edx, ebx + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + wend : + sub ecx, 2 + jns wloop + + and ecx, 1 // odd number of pixels? 
+ jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + wdone : + + popad + ret + } +} + +__declspec(naked) +void DoubleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + jmp wend + + wloop : + movzx eax, byte ptr [edi] + add edi, 1 + movzx ebx, byte ptr [esi] + add esi, 1 + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [edx] + paddsw mm0, [kCoefficientsRgbV + 8 * ebx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + punpckldq mm1, mm1 + movntq [ebp], mm1 + + movzx ebx, byte ptr [edx + 1] + add edx, 2 + paddsw mm0, [kCoefficientsRgbY + 8 * ebx] + psraw mm0, 6 + packuswb mm0, mm0 + punpckldq mm0, mm0 + movntq [ebp+8], mm0 + add ebp, 16 + wend : + sub ecx, 4 + jns wloop + + add ecx, 4 + jz wdone + + movzx eax, byte ptr [edi] + movq mm0, [kCoefficientsRgbU + 8 * eax] + movzx eax, byte ptr [esi] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + movzx eax, byte ptr [edx] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + jmp wend1 + + wloop1 : + movd [ebp], mm1 + add ebp, 4 + wend1 : + sub ecx, 1 + jns wloop1 + wdone : + popad + ret + } +} + +// This version does general purpose scaling by any amount, up or down. +// The only thing it can not do it rotation by 90 or 270. +// For performance the chroma is under sampled, reducing cost of a 3x +// 1080p scale from 8.4 ms to 5.4 ms. 
+__declspec(naked) +void ScaleYUVToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width, + int dx) { + __asm { + pushad + mov edx, [esp + 32 + 4] // Y + mov edi, [esp + 32 + 8] // U + mov esi, [esp + 32 + 12] // V + mov ebp, [esp + 32 + 16] // rgb + mov ecx, [esp + 32 + 20] // width + xor ebx, ebx // x + jmp scaleend + + scaleloop : + mov eax, ebx + sar eax, 5 + movzx eax, byte ptr [edi + eax] + movq mm0, [kCoefficientsRgbU + 8 * eax] + mov eax, ebx + sar eax, 5 + movzx eax, byte ptr [esi + eax] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + mov eax, ebx + add ebx, [esp + 32 + 24] // x += dx + sar eax, 4 + movzx eax, byte ptr [edx + eax] + movq mm1, [kCoefficientsRgbY + 8 * eax] + mov eax, ebx + add ebx, [esp + 32 + 24] // x += dx + sar eax, 4 + movzx eax, byte ptr [edx + eax] + movq mm2, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + paddsw mm2, mm0 + psraw mm1, 6 + psraw mm2, 6 + packuswb mm1, mm2 + movntq [ebp], mm1 + add ebp, 8 + scaleend : + sub ecx, 2 + jns scaleloop + + and ecx, 1 // odd number of pixels? + jz scaledone + + mov eax, ebx + sar eax, 5 + movzx eax, byte ptr [edi + eax] + movq mm0, [kCoefficientsRgbU + 8 * eax] + mov eax, ebx + sar eax, 5 + movzx eax, byte ptr [esi + eax] + paddsw mm0, [kCoefficientsRgbV + 8 * eax] + mov eax, ebx + sar eax, 4 + movzx eax, byte ptr [edx + eax] + movq mm1, [kCoefficientsRgbY + 8 * eax] + paddsw mm1, mm0 + psraw mm1, 6 + packuswb mm1, mm1 + movd [ebp], mm1 + + scaledone : + popad + ret + } +} + #endif // ARCH_CPU_64_BITS } // extern "C"
--- a/layout/generic/nsVideoFrame.cpp +++ b/layout/generic/nsVideoFrame.cpp @@ -248,16 +248,20 @@ nsVideoFrame::BuildLayer(nsDisplayListBu // the largest rectangle that fills our content-box and has the // correct aspect ratio. nsPresContext* presContext = PresContext(); gfxRect r = gfxRect(presContext->AppUnitsToGfxUnits(area.x), presContext->AppUnitsToGfxUnits(area.y), presContext->AppUnitsToGfxUnits(area.width), presContext->AppUnitsToGfxUnits(area.height)); r = CorrectForAspectRatio(r, videoSize); + r.Round(); + gfxIntSize scaleHint(static_cast<PRInt32>(r.Width()), + static_cast<PRInt32>(r.Height())); + container->SetScaleHint(scaleHint); nsRefPtr<ImageLayer> layer = static_cast<ImageLayer*> (aBuilder->LayerBuilder()->GetLeafLayerFor(aBuilder, aManager, aItem)); if (!layer) { layer = aManager->CreateImageLayer(); if (!layer) return nsnull; }