Bug 749711 - Lots of WebGL texture conversion fixes and improvements - r=jgilbert
author Benoit Jacob <bjacob@mozilla.com>
Mon, 07 May 2012 13:05:32 -0400
changeset 93461 9f87dbd4d39c
parent 93460 3bc06857dcd2
child 93462 16fe44589079
push id 9160
push user bjacob@mozilla.com
push date Tue, 08 May 2012 13:48:11 +0000
treeherder mozilla-inbound@800b381a1638 [default view] [failures only]
perfherder [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers jgilbert
bugs 749711
milestone 15.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 749711 - Lots of WebGL texture conversion fixes and improvements - r=jgilbert

 * Templatize pack/unpack routines
 ** Inside anonymous namespace in a naive attempt to not hammer PGO linker memory usage.
 * Support conversions changing texel size
 * Support conversion from integer to float formats
 * Support RGBA32F properly
 * Avoid compiling useless paths (code size down to 17k from 44k)
content/canvas/src/Makefile.in
content/canvas/src/WebGLContext.h
content/canvas/src/WebGLContextGL.cpp
content/canvas/src/WebGLTexelConversions.cpp
content/canvas/src/WebGLTexelConversions.h
--- a/content/canvas/src/Makefile.in
+++ b/content/canvas/src/Makefile.in
@@ -77,16 +77,17 @@ CPPSRCS += \
 	WebGLContext.cpp \
 	WebGLContextGL.cpp \
 	WebGLContextUtils.cpp \
 	WebGLContextReporter.cpp \
 	WebGLContextValidate.cpp \
 	WebGLExtensionStandardDerivatives.cpp \
 	WebGLExtensionTextureFilterAnisotropic.cpp \
 	WebGLExtensionLoseContext.cpp \
+	WebGLTexelConversions.cpp \
 	$(NULL)
 
 DEFINES += -DUSE_ANGLE
 USE_ANGLE=1
 
 else
 
 CPPSRCS += WebGLContextNotSupported.cpp
--- a/content/canvas/src/WebGLContext.h
+++ b/content/canvas/src/WebGLContext.h
@@ -123,26 +123,61 @@ enum FakeBlackStatus { DoNotNeedFakeBlac
 struct VertexAttrib0Status {
     enum { Default, EmulatedUninitializedArray, EmulatedInitializedArray };
 };
 
 struct BackbufferClearingStatus {
     enum { NotClearedSinceLastPresented, ClearedToDefaultValues, HasBeenDrawnTo };
 };
 
-struct WebGLTexelFormat {
-    enum { Generic, Auto, RGBA8, RGB8, RGBX8, BGRA8, BGR8, BGRX8, RGBA5551, RGBA4444, RGB565, R8, RA8, A8,
-           RGBA32F, RGB32F, A32F, R32F, RA32F };
+namespace WebGLTexelConversions {
+
+/*
+ * The formats that may participate, either as source or destination formats,
+ * in WebGL texture conversions. This includes:
+ *  - all the formats accepted by WebGL.texImage2D, e.g. RGBA4444
+ *  - additional formats provided by extensions, e.g. RGB32F
+ *  - additional source formats, depending on browser details, used when uploading
+ *    textures from DOM elements. See gfxImageSurface::Format().
+ */
+enum WebGLTexelFormat
+{
+    // dummy error code returned by GetWebGLTexelFormat in error cases,
+    // after assertion failure (so this never happens in debug builds)
+    BadFormat,
+    // dummy pseudo-format meaning "use the other format".
+    // For example, if SrcFormat=Auto and DstFormat=RGB8, then the source
+    // is implicitly treated as being RGB8 itself.
+    Auto,
+    // 1-channel formats
+    R8,
+    A8,
+    R32F, // used for OES_texture_float extension
+    A32F, // used for OES_texture_float extension
+    // 2-channel formats
+    RA8,
+    RA32F,
+    // 3-channel formats
+    RGB8,
+    BGRX8, // used for DOM elements. Source format only.
+    RGB565,
+    RGB32F, // used for OES_texture_float extension
+    // 4-channel formats
+    RGBA8,
+    BGRA8, // used for DOM elements
+    RGBA5551,
+    RGBA4444,
+    RGBA32F // used for OES_texture_float extension
 };
 
-struct WebGLTexelPremultiplicationOp {
-    enum { Generic, None, Premultiply, Unmultiply };
-};
-
-int GetWebGLTexelFormat(GLenum format, GLenum type);
+} // end namespace WebGLTexelConversions
+
+using WebGLTexelConversions::WebGLTexelFormat;
+
+WebGLTexelFormat GetWebGLTexelFormat(GLenum format, GLenum type);
 
 // Zero is not an integer power of two.
 inline bool is_pot_assuming_nonnegative(WebGLsizei x)
 {
     return x && (x & (x-1)) == 0;
 }
 
 /* Each WebGL object class WebGLFoo wants to:
@@ -1200,36 +1235,36 @@ protected:
     void MakeContextCurrent() { gl->MakeCurrent(); }
 
     // helpers
     void TexImage2D_base(WebGLenum target, WebGLint level, WebGLenum internalformat,
                          WebGLsizei width, WebGLsizei height, WebGLsizei srcStrideOrZero, WebGLint border,
                          WebGLenum format, WebGLenum type,
                          void *data, PRUint32 byteLength,
                          int jsArrayType,
-                         int srcFormat, bool srcPremultiplied);
+                         WebGLTexelFormat srcFormat, bool srcPremultiplied);
     void TexSubImage2D_base(WebGLenum target, WebGLint level,
                             WebGLint xoffset, WebGLint yoffset,
                             WebGLsizei width, WebGLsizei height, WebGLsizei srcStrideOrZero,
                             WebGLenum format, WebGLenum type,
                             void *pixels, PRUint32 byteLength,
                             int jsArrayType,
-                            int srcFormat, bool srcPremultiplied);
+                            WebGLTexelFormat srcFormat, bool srcPremultiplied);
     void TexParameter_base(WebGLenum target, WebGLenum pname,
                            WebGLint *intParamPtr, WebGLfloat *floatParamPtr);
 
     void ConvertImage(size_t width, size_t height, size_t srcStride, size_t dstStride,
                       const PRUint8*src, PRUint8 *dst,
-                      int srcFormat, bool srcPremultiplied,
-                      int dstFormat, bool dstPremultiplied,
+                      WebGLTexelFormat srcFormat, bool srcPremultiplied,
+                      WebGLTexelFormat dstFormat, bool dstPremultiplied,
                       size_t dstTexelSize);
 
     nsresult DOMElementToImageSurface(dom::Element* imageOrCanvas,
                                       gfxImageSurface **imageOut,
-                                      int *format);
+                                      WebGLTexelFormat *format);
 
     void CopyTexSubImage2D_base(WebGLenum target,
                                 WebGLint level,
                                 WebGLenum internalformat,
                                 WebGLint xoffset,
                                 WebGLint yoffset,
                                 WebGLint x,
                                 WebGLint y,
--- a/content/canvas/src/WebGLContextGL.cpp
+++ b/content/canvas/src/WebGLContextGL.cpp
@@ -4291,208 +4291,19 @@ WebGLContext::StencilOpSeparate(WebGLenu
         !ValidateStencilOpEnum(dpfail, "stencilOpSeparate: dpfail") ||
         !ValidateStencilOpEnum(dppass, "stencilOpSeparate: dppass"))
         return;
 
     MakeContextCurrent();
     gl->fStencilOpSeparate(face, sfail, dpfail, dppass);
 }
 
-struct WebGLImageConverter
-{
-    bool flip;
-    size_t width, height, srcStride, dstStride, srcTexelSize, dstTexelSize;
-    const PRUint8 *src;
-    PRUint8 *dst;
-
-    WebGLImageConverter()
-    {
-        memset(this, 0, sizeof(WebGLImageConverter));
-    }
-
-    template<typename SrcType, typename DstType, typename UnpackType,
-         void unpackingFunc(const SrcType*, UnpackType*),
-         void packingFunc(const UnpackType*, DstType*)>
-    void run()
-    {
-        // Note -- even though the functions take UnpackType, the
-        // pointers below are all in terms of PRUint8; otherwise
-        // pointer math starts getting tricky.
-        for (size_t src_row = 0; src_row < height; ++src_row) {
-            size_t dst_row = flip ? (height - 1 - src_row) : src_row;
-            PRUint8 *dst_row_ptr = dst + dst_row * dstStride;
-            const PRUint8 *src_row_ptr = src + src_row * srcStride;
-            const PRUint8 *src_row_end = src_row_ptr + width * srcTexelSize; // != src_row_ptr + byteStride
-            while (src_row_ptr != src_row_end) {
-                UnpackType tmp[4];
-                unpackingFunc(reinterpret_cast<const SrcType*>(src_row_ptr), tmp);
-                packingFunc(tmp, reinterpret_cast<DstType*>(dst_row_ptr));
-                src_row_ptr += srcTexelSize;
-                dst_row_ptr += dstTexelSize;
-            }
-        }
-    }
-};
-
-void
-WebGLContext::ConvertImage(size_t width, size_t height, size_t srcStride, size_t dstStride,
-                           const PRUint8*src, PRUint8 *dst,
-                           int srcFormat, bool srcPremultiplied,
-                           int dstFormat, bool dstPremultiplied,
-                           size_t dstTexelSize)
-{
-    if (width <= 0 || height <= 0)
-        return;
-
-    if (srcFormat == dstFormat &&
-        srcPremultiplied == dstPremultiplied)
-    {
-        // fast exit path: we just have to memcpy all the rows.
-        //
-        // The case where absolutely nothing needs to be done is supposed to have
-        // been handled earlier (in TexImage2D_base, etc).
-        //
-        // So the case we're handling here is when even though no format conversion is needed,
-        // we still might have to flip vertically and/or to adjust to a different stride.
-
-        NS_ASSERTION(mPixelStoreFlipY || srcStride != dstStride, "Performance trap -- should handle this case earlier, to avoid memcpy");
-
-        size_t row_size = width * dstTexelSize; // doesn't matter, src and dst formats agree
-        const PRUint8* src_row = src;
-        const PRUint8* src_end = src + height * srcStride;
-
-        PRUint8* dst_row = mPixelStoreFlipY ? dst + (height-1) * dstStride : dst;
-        ptrdiff_t dstStrideSigned(dstStride);
-        ptrdiff_t dst_delta = mPixelStoreFlipY ? -dstStrideSigned : dstStrideSigned;
-
-        while(src_row != src_end) {
-            memcpy(dst_row, src_row, row_size);
-            src_row += srcStride;
-            dst_row += dst_delta;
-        }
-        return;
-    }
-
-    WebGLImageConverter converter;
-    converter.flip = mPixelStoreFlipY;
-    converter.width = width;
-    converter.height = height;
-    converter.srcStride = srcStride;
-    converter.dstStride = dstStride;
-    converter.dstTexelSize = dstTexelSize;
-    converter.src = src;
-    converter.dst = dst;
-
-    int premultiplicationOp = (!srcPremultiplied && dstPremultiplied) ? WebGLTexelPremultiplicationOp::Premultiply
-                            : (srcPremultiplied && !dstPremultiplied) ? WebGLTexelPremultiplicationOp::Unmultiply
-                            : WebGLTexelPremultiplicationOp::None;
-
-#define HANDLE_DSTFORMAT(format, SrcType, DstType, unpackFunc, packFunc) \
-        case WebGLTexelFormat::format: \
-            switch (premultiplicationOp) { \
-                case WebGLTexelPremultiplicationOp::Premultiply: \
-                    converter.run<SrcType, DstType, PRUint8,          \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc##Premultiply>(); \
-                break; \
-                case WebGLTexelPremultiplicationOp::Unmultiply: \
-                    converter.run<SrcType, DstType, PRUint8, \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc##Unmultiply>(); \
-                break; \
-                default: \
-                    converter.run<SrcType, DstType, PRUint8, \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc>(); \
-                break; \
-            } \
-            break;
-
-#define HANDLE_SRCFORMAT(format, size, SrcType, unpackFunc) \
-        case WebGLTexelFormat::format: \
-            converter.srcTexelSize = size; \
-            switch (dstFormat) { \
-                HANDLE_DSTFORMAT(RGBA8,    SrcType, PRUint8,  unpackFunc, packRGBA8ToRGBA8) \
-                HANDLE_DSTFORMAT(RGB8,     SrcType, PRUint8,  unpackFunc, packRGBA8ToRGB8) \
-                HANDLE_DSTFORMAT(R8,       SrcType, PRUint8,  unpackFunc, packRGBA8ToR8) \
-                HANDLE_DSTFORMAT(RA8,      SrcType, PRUint8,  unpackFunc, packRGBA8ToRA8) \
-                HANDLE_DSTFORMAT(RGBA5551, SrcType, PRUint16, unpackFunc, packRGBA8ToUnsignedShort5551) \
-                HANDLE_DSTFORMAT(RGBA4444, SrcType, PRUint16, unpackFunc, packRGBA8ToUnsignedShort4444) \
-                HANDLE_DSTFORMAT(RGB565,   SrcType, PRUint16, unpackFunc, packRGBA8ToUnsignedShort565) \
-                /* A8 needs to be special-cased as it doesn't have color channels to premultiply */ \
-                case WebGLTexelFormat::A8: \
-                    converter.run<SrcType, PRUint8, PRUint8,          \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packRGBA8ToA8>(); \
-                    break; \
-                default: \
-                    NS_ASSERTION(false, "Coding error?! Should never reach this point."); \
-                    return; \
-            } \
-            break;
-
-#define HANDLE_FLOAT_DSTFORMAT(format, unpackFunc, packFunc) \
-        case WebGLTexelFormat::format: \
-            switch (premultiplicationOp) { \
-                case WebGLTexelPremultiplicationOp::Premultiply: \
-                    converter.run<float, float, float,                \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc##Premultiply>(); \
-                break; \
-                case WebGLTexelPremultiplicationOp::Unmultiply: \
-                    NS_ASSERTION(false, "Floating point can't be un-premultiplied -- we have no premultiplied source data!"); \
-                break; \
-                default: \
-                    converter.run<float, float, float,                \
-                                  WebGLTexelConversions::unpackFunc, \
-                                  WebGLTexelConversions::packFunc>(); \
-                break; \
-            } \
-            break;
-
-#define HANDLE_FLOAT_SRCFORMAT(format, size, unpackFunc)                \
-        case WebGLTexelFormat::format:                                  \
-            converter.srcTexelSize = size;                              \
-            switch (dstFormat) {                                        \
-                HANDLE_FLOAT_DSTFORMAT(RGB32F, unpackFunc, packRGBA32FToRGB32F) \
-                HANDLE_FLOAT_DSTFORMAT(A32F,   unpackFunc, packRGBA32FToA32F) \
-                HANDLE_FLOAT_DSTFORMAT(R32F,   unpackFunc, packRGBA32FToR32F) \
-                HANDLE_FLOAT_DSTFORMAT(RA32F,  unpackFunc, packRGBA32FToRA32F) \
-                default: \
-                    NS_ASSERTION(false, "Coding error?! Should never reach this point."); \
-                    return; \
-            } \
-            break;
-        
-    switch (srcFormat) {
-        HANDLE_SRCFORMAT(RGBA8,    4, PRUint8,  unpackRGBA8ToRGBA8)
-        HANDLE_SRCFORMAT(RGBX8,    4, PRUint8,  unpackRGB8ToRGBA8)
-        HANDLE_SRCFORMAT(RGB8,     3, PRUint8,  unpackRGB8ToRGBA8)
-        HANDLE_SRCFORMAT(BGRA8,    4, PRUint8,  unpackBGRA8ToRGBA8)
-        HANDLE_SRCFORMAT(BGRX8,    4, PRUint8,  unpackBGR8ToRGBA8)
-        HANDLE_SRCFORMAT(BGR8,     3, PRUint8,  unpackBGR8ToRGBA8)
-        HANDLE_SRCFORMAT(R8,       1, PRUint8,  unpackR8ToRGBA8)
-        HANDLE_SRCFORMAT(A8,       1, PRUint8,  unpackA8ToRGBA8)
-        HANDLE_SRCFORMAT(RA8,      2, PRUint8,  unpackRA8ToRGBA8)
-        HANDLE_SRCFORMAT(RGBA5551, 2, PRUint16, unpackRGBA5551ToRGBA8)
-        HANDLE_SRCFORMAT(RGBA4444, 2, PRUint16, unpackRGBA4444ToRGBA8)
-        HANDLE_SRCFORMAT(RGB565,   2, PRUint16, unpackRGB565ToRGBA8)
-        HANDLE_FLOAT_SRCFORMAT(RGB32F,  12, unpackRGB32FToRGBA32F)
-        HANDLE_FLOAT_SRCFORMAT(RA32F,    8, unpackRA32FToRGBA32F)
-        HANDLE_FLOAT_SRCFORMAT(R32F,     4, unpackR32FToRGBA32F)
-        HANDLE_FLOAT_SRCFORMAT(A32F,     4, unpackA32FToRGBA32F)
-        default:
-            NS_ASSERTION(false, "Coding error?! Should never reach this point.");
-            return;
-    }
-}
-
 nsresult
 WebGLContext::DOMElementToImageSurface(Element* imageOrCanvas,
-                                       gfxImageSurface **imageOut, int *format)
+                                       gfxImageSurface **imageOut, WebGLTexelFormat *format)
 {
     if (!imageOrCanvas) {
         return NS_ERROR_FAILURE;
     }        
 
     PRUint32 flags =
         nsLayoutUtils::SFE_WANT_NEW_SURFACE |
         nsLayoutUtils::SFE_WANT_IMAGE_SURFACE;
@@ -4551,26 +4362,26 @@ WebGLContext::DOMElementToImageSurface(E
 
     gfxImageSurface* surf = static_cast<gfxImageSurface*>(res.mSurface.get());
 
     res.mSurface.forget();
     *imageOut = surf;
 
     switch (surf->Format()) {
         case gfxASurface::ImageFormatARGB32:
-            *format = WebGLTexelFormat::BGRA8; // careful, our ARGB means BGRA
+            *format = WebGLTexelConversions::BGRA8; // careful, our ARGB means BGRA
             break;
         case gfxASurface::ImageFormatRGB24:
-            *format = WebGLTexelFormat::BGRX8; // careful, our RGB24 is not tightly packed. Whence BGRX8.
+            *format = WebGLTexelConversions::BGRX8; // careful, our RGB24 is not tightly packed. Whence BGRX8.
             break;
         case gfxASurface::ImageFormatA8:
-            *format = WebGLTexelFormat::A8;
+            *format = WebGLTexelConversions::A8;
             break;
         case gfxASurface::ImageFormatRGB16_565:
-            *format = WebGLTexelFormat::RGB565;
+            *format = WebGLTexelConversions::RGB565;
             break;
         default:
             NS_ASSERTION(false, "Unsupported image format. Unimplemented.");
             return NS_ERROR_NOT_IMPLEMENTED;
     }
 
     return NS_OK;
 }
@@ -5639,17 +5450,17 @@ GLenum WebGLContext::CheckedTexImage2D(G
 
 void
 WebGLContext::TexImage2D_base(WebGLenum target, WebGLint level, WebGLenum internalformat,
                               WebGLsizei width, WebGLsizei height, WebGLsizei srcStrideOrZero,
                               WebGLint border,
                               WebGLenum format, WebGLenum type,
                               void *data, PRUint32 byteLength,
                               int jsArrayType, // a TypedArray format enum, or -1 if not relevant
-                              int srcFormat, bool srcPremultiplied)
+                              WebGLTexelFormat srcFormat, bool srcPremultiplied)
 {
     switch (target) {
         case LOCAL_GL_TEXTURE_2D:
             break;
         case LOCAL_GL_TEXTURE_CUBE_MAP_POSITIVE_X:
         case LOCAL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
         case LOCAL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
         case LOCAL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
@@ -5694,24 +5505,29 @@ WebGLContext::TexImage2D_base(WebGLenum 
         if (!(is_pot_assuming_nonnegative(width) &&
               is_pot_assuming_nonnegative(height)))
             return ErrorInvalidValue("texImage2D: with level > 0, width and height must be powers of two");
     }
 
     if (border != 0)
         return ErrorInvalidValue("TexImage2D: border must be 0");
 
-    PRUint32 texelSize = 0;
-    if (!ValidateTexFormatAndType(format, type, jsArrayType, &texelSize, "texImage2D"))
-        return;
+    PRUint32 dstTexelSize = 0;
+    if (!ValidateTexFormatAndType(format, type, jsArrayType, &dstTexelSize, "texImage2D"))
+        return;
+
+    WebGLTexelFormat dstFormat = GetWebGLTexelFormat(format, type);
+    WebGLTexelFormat actualSrcFormat = srcFormat == WebGLTexelConversions::Auto ? dstFormat : srcFormat;
+
+    PRUint32 srcTexelSize = WebGLTexelConversions::TexelBytesForFormat(actualSrcFormat);
 
     CheckedUint32 checked_neededByteLength = 
-        GetImageSize(height, width, texelSize, mPixelStoreUnpackAlignment); 
-
-    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * texelSize;
+        GetImageSize(height, width, srcTexelSize, mPixelStoreUnpackAlignment);
+
+    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * srcTexelSize;
 
     CheckedUint32 checked_alignedRowSize =
         RoundedToNextMultipleOf(checked_plainRowSize.value(), mPixelStoreUnpackAlignment);
 
     if (!checked_neededByteLength.valid())
         return ErrorInvalidOperation("texImage2D: integer overflow computing the needed buffer size");
 
     PRUint32 bytesNeeded = checked_neededByteLength.value();
@@ -5729,40 +5545,39 @@ WebGLContext::TexImage2D_base(WebGLenum 
 
     // Handle ES2 and GL differences in floating point internal formats.  Note that
     // format == internalformat, as checked above and as required by ES.
     internalformat = InternalFormatForFormatAndType(format, type, gl->IsGLES2());
 
     GLenum error = LOCAL_GL_NO_ERROR;
 
     if (byteLength) {
-        int dstFormat = GetWebGLTexelFormat(format, type);
-        int actualSrcFormat = srcFormat == WebGLTexelFormat::Auto ? dstFormat : srcFormat;
         size_t srcStride = srcStrideOrZero ? srcStrideOrZero : checked_alignedRowSize.value();
 
-        size_t dstPlainRowSize = texelSize * width;
+        size_t dstPlainRowSize = dstTexelSize * width;
         size_t unpackAlignment = mPixelStoreUnpackAlignment;
         size_t dstStride = ((dstPlainRowSize + unpackAlignment-1) / unpackAlignment) * unpackAlignment;
 
         if (actualSrcFormat == dstFormat &&
             srcPremultiplied == mPixelStorePremultiplyAlpha &&
             srcStride == dstStride &&
             !mPixelStoreFlipY)
         {
             // no conversion, no flipping, so we avoid copying anything and just pass the source pointer
             error = CheckedTexImage2D(target, level, internalformat,
                                       width, height, border, format, type, data);
         }
         else
         {
-            nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[bytesNeeded]);
+            size_t convertedDataSize = height * dstStride;
+            nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[convertedDataSize]);
             ConvertImage(width, height, srcStride, dstStride,
                         (PRUint8*)data, convertedData,
                         actualSrcFormat, srcPremultiplied,
-                        dstFormat, mPixelStorePremultiplyAlpha, texelSize);
+                        dstFormat, mPixelStorePremultiplyAlpha, dstTexelSize);
             error = CheckedTexImage2D(target, level, internalformat,
                                       width, height, border, format, type, convertedData);
         }
     } else {
         // We need some zero pages, because GL doesn't guarantee the
         // contents of a texture allocated with NULL data.
         // Hopefully calloc will just mmap zero pages here.
         void *tempZeroData = calloc(1, bytesNeeded);
@@ -5809,17 +5624,17 @@ WebGLContext::TexImage2D(JSContext* cx, 
 {
     if (!IsContextStable())
         return;
 
     return TexImage2D_base(target, level, internalformat, width, height, 0, border, format, type,
                            pixels ? pixels->mData : 0,
                            pixels ? pixels->mLength : 0,
                            pixels ? (int)JS_GetTypedArrayType(pixels->mObj, cx) : -1,
-                           WebGLTexelFormat::Auto, false);
+                           WebGLTexelConversions::Auto, false);
 }
 
 NS_IMETHODIMP
 WebGLContext::TexImage2D_imageData(WebGLenum target, WebGLint level, WebGLenum internalformat,
                                    WebGLsizei width, WebGLsizei height, WebGLint border,
                                    WebGLenum format, WebGLenum type,
                                    JSObject *pixels, JSContext *cx)
 {
@@ -5827,17 +5642,17 @@ WebGLContext::TexImage2D_imageData(WebGL
         return NS_OK;
 
     NS_ABORT_IF_FALSE(JS_IsTypedArrayObject(pixels, cx), "bad pixels object");
 
     TexImage2D_base(target, level, internalformat, width, height, 4*width, border, format, type,
                     pixels ? JS_GetArrayBufferViewData(pixels, cx) : 0,
                     pixels ? JS_GetArrayBufferViewByteLength(pixels, cx) : 0,
                     -1,
-                    WebGLTexelFormat::RGBA8, false);
+                    WebGLTexelConversions::RGBA8, false);
     return NS_OK;
 }
 
 void
 WebGLContext::TexImage2D(JSContext* cx, WebGLenum target, WebGLint level,
                          WebGLenum internalformat, WebGLenum format,
                          WebGLenum type, ImageData* pixels, ErrorResult& rv)
 {
@@ -5848,17 +5663,17 @@ WebGLContext::TexImage2D(JSContext* cx, 
         // Spec says to generate an INVALID_VALUE error
         return ErrorInvalidValue("texImage2D: null ImageData");
     }
     
     Uint8ClampedArray arr(cx, pixels->GetDataObject());
     return TexImage2D_base(target, level, internalformat, pixels->GetWidth(),
                            pixels->GetHeight(), 4*pixels->GetWidth(), 0,
                            format, type, arr.mData, arr.mLength, -1,
-                           WebGLTexelFormat::RGBA8, false);
+                           WebGLTexelConversions::RGBA8, false);
 }
 
 
 NS_IMETHODIMP
 WebGLContext::TexImage2D_dom(WebGLenum target, WebGLint level, WebGLenum internalformat,
                              WebGLenum format, GLenum type, Element* elt)
 {
     ErrorResult rv;
@@ -5872,17 +5687,17 @@ WebGLContext::TexImage2D(JSContext* /* u
                          WebGLenum format, WebGLenum type, Element* elt,
                          ErrorResult& rv)
 {
     if (!IsContextStable())
         return;
 
     nsRefPtr<gfxImageSurface> isurf;
 
-    int srcFormat;
+    WebGLTexelFormat srcFormat;
     rv = DOMElementToImageSurface(elt, getter_AddRefs(isurf), &srcFormat);
     if (rv.Failed())
         return;
 
     PRUint32 byteLength = isurf->Stride() * isurf->Height();
 
     return TexImage2D_base(target, level, internalformat,
                            isurf->Width(), isurf->Height(), isurf->Stride(), 0,
@@ -5903,17 +5718,17 @@ WebGLContext::TexSubImage2D(PRInt32)
 
 void
 WebGLContext::TexSubImage2D_base(WebGLenum target, WebGLint level,
                                  WebGLint xoffset, WebGLint yoffset,
                                  WebGLsizei width, WebGLsizei height, WebGLsizei srcStrideOrZero,
                                  WebGLenum format, WebGLenum type,
                                  void *pixels, PRUint32 byteLength,
                                  int jsArrayType,
-                                 int srcFormat, bool srcPremultiplied)
+                                 WebGLTexelFormat srcFormat, bool srcPremultiplied)
 {
     switch (target) {
         case LOCAL_GL_TEXTURE_2D:
         case LOCAL_GL_TEXTURE_CUBE_MAP_POSITIVE_X:
         case LOCAL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
         case LOCAL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
         case LOCAL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
         case LOCAL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
@@ -5938,27 +5753,32 @@ WebGLContext::TexSubImage2D_base(WebGLen
         return ErrorInvalidValue("texSubImage2D: width or height exceeds maximum texture size");
 
     if (level >= 1) {
         if (!(is_pot_assuming_nonnegative(width) &&
               is_pot_assuming_nonnegative(height)))
             return ErrorInvalidValue("texSubImage2D: with level > 0, width and height must be powers of two");
     }
 
-    PRUint32 texelSize = 0;
-    if (!ValidateTexFormatAndType(format, type, jsArrayType, &texelSize, "texSubImage2D"))
-        return;
+    PRUint32 dstTexelSize = 0;
+    if (!ValidateTexFormatAndType(format, type, jsArrayType, &dstTexelSize, "texSubImage2D"))
+        return;
+
+    WebGLTexelFormat dstFormat = GetWebGLTexelFormat(format, type);
+    WebGLTexelFormat actualSrcFormat = srcFormat == WebGLTexelConversions::Auto ? dstFormat : srcFormat;
+
+    PRUint32 srcTexelSize = WebGLTexelConversions::TexelBytesForFormat(actualSrcFormat);
 
     if (width == 0 || height == 0)
         return; // ES 2.0 says it has no effect, we better return right now
 
     CheckedUint32 checked_neededByteLength = 
-        GetImageSize(height, width, texelSize, mPixelStoreUnpackAlignment);
-
-    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * texelSize;
+        GetImageSize(height, width, srcTexelSize, mPixelStoreUnpackAlignment);
+
+    CheckedUint32 checked_plainRowSize = CheckedUint32(width) * srcTexelSize;
 
     CheckedUint32 checked_alignedRowSize = 
         RoundedToNextMultipleOf(checked_plainRowSize.value(), mPixelStoreUnpackAlignment);
 
     if (!checked_neededByteLength.valid())
         return ErrorInvalidOperation("texSubImage2D: integer overflow computing the needed buffer size");
 
     PRUint32 bytesNeeded = checked_neededByteLength.value();
@@ -5981,39 +5801,38 @@ WebGLContext::TexSubImage2D_base(WebGLen
         return ErrorInvalidValue("texSubImage2D: subtexture rectangle out of bounds");
     
     // Require the format and type in texSubImage2D to match that of the existing texture as created by texImage2D
     if (imageInfo.Format() != format || imageInfo.Type() != type)
         return ErrorInvalidOperation("texSubImage2D: format or type doesn't match the existing texture");
 
     MakeContextCurrent();
 
-    int dstFormat = GetWebGLTexelFormat(format, type);
-    int actualSrcFormat = srcFormat == WebGLTexelFormat::Auto ? dstFormat : srcFormat;
     size_t srcStride = srcStrideOrZero ? srcStrideOrZero : checked_alignedRowSize.value();
 
-    size_t dstPlainRowSize = texelSize * width;
+    size_t dstPlainRowSize = dstTexelSize * width;
     // There are checks above to ensure that this won't overflow.
     size_t dstStride = RoundedToNextMultipleOf(dstPlainRowSize, mPixelStoreUnpackAlignment).value();
 
     if (actualSrcFormat == dstFormat &&
         srcPremultiplied == mPixelStorePremultiplyAlpha &&
         srcStride == dstStride &&
         !mPixelStoreFlipY)
     {
         // no conversion, no flipping, so we avoid copying anything and just pass the source pointer
         gl->fTexSubImage2D(target, level, xoffset, yoffset, width, height, format, type, pixels);
     }
     else
     {
-        nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[bytesNeeded]);
+        size_t convertedDataSize = height * dstStride;
+        nsAutoArrayPtr<PRUint8> convertedData(new PRUint8[convertedDataSize]);
         ConvertImage(width, height, srcStride, dstStride,
                     (const PRUint8*)pixels, convertedData,
                     actualSrcFormat, srcPremultiplied,
-                    dstFormat, mPixelStorePremultiplyAlpha, texelSize);
+                    dstFormat, mPixelStorePremultiplyAlpha, dstTexelSize);
 
         gl->fTexSubImage2D(target, level, xoffset, yoffset, width, height, format, type, convertedData);
     }
 }
 
 NS_IMETHODIMP
 WebGLContext::TexSubImage2D_array(WebGLenum target, WebGLint level,
                                   WebGLint xoffset, WebGLint yoffset,
@@ -6046,17 +5865,17 @@ WebGLContext::TexSubImage2D(JSContext* c
 
     if (!pixels)
         return ErrorInvalidValue("TexSubImage2D: pixels must not be null!");
 
     return TexSubImage2D_base(target, level, xoffset, yoffset,
                               width, height, 0, format, type,
                               pixels->mData, pixels->mLength,
                               JS_GetTypedArrayType(pixels->mObj, cx),
-                              WebGLTexelFormat::Auto, false);
+                              WebGLTexelConversions::Auto, false);
 }
 
 NS_IMETHODIMP
 WebGLContext::TexSubImage2D_imageData(WebGLenum target, WebGLint level,
                                       WebGLint xoffset, WebGLint yoffset,
                                       WebGLsizei width, WebGLsizei height,
                                       WebGLenum format, WebGLenum type,
                                       JSObject *pixels, JSContext *cx)
@@ -6070,17 +5889,17 @@ WebGLContext::TexSubImage2D_imageData(We
     }
 
     NS_ABORT_IF_FALSE(JS_IsTypedArrayObject(pixels, cx), "bad pixels object");
 
     TexSubImage2D_base(target, level, xoffset, yoffset,
                        width, height, 4*width, format, type,
                        JS_GetArrayBufferViewData(pixels, cx), JS_GetArrayBufferViewByteLength(pixels, cx),
                        -1,
-                       WebGLTexelFormat::RGBA8, false);
+                       WebGLTexelConversions::RGBA8, false);
     return NS_OK;
 }
 
 void
 WebGLContext::TexSubImage2D(JSContext* cx, WebGLenum target, WebGLint level,
                             WebGLint xoffset, WebGLint yoffset,
                             WebGLenum format, WebGLenum type, ImageData* pixels,
                             ErrorResult& rv)
@@ -6092,17 +5911,17 @@ WebGLContext::TexSubImage2D(JSContext* c
         return ErrorInvalidValue("TexSubImage2D: pixels must not be null!");
 
     Uint8ClampedArray arr(cx, pixels->GetDataObject());
     return TexSubImage2D_base(target, level, xoffset, yoffset,
                               pixels->GetWidth(), pixels->GetHeight(),
                               4*pixels->GetWidth(), format, type,
                               arr.mData, arr.mLength,
                               -1,
-                              WebGLTexelFormat::RGBA8, false);
+                              WebGLTexelConversions::RGBA8, false);
 }
 
 NS_IMETHODIMP
 WebGLContext::TexSubImage2D_dom(WebGLenum target, WebGLint level,
                                 WebGLint xoffset, WebGLint yoffset,
                                 WebGLenum format, WebGLenum type,
                                 Element *elt)
 {
@@ -6117,17 +5936,17 @@ WebGLContext::TexSubImage2D(JSContext* /
                             WebGLenum format, WebGLenum type,
                             dom::Element* elt, ErrorResult& rv)
 {
     if (!IsContextStable())
         return;
 
     nsRefPtr<gfxImageSurface> isurf;
 
-    int srcFormat;
+    WebGLTexelFormat srcFormat;
     rv = DOMElementToImageSurface(elt, getter_AddRefs(isurf), &srcFormat);
     if (rv.Failed())
         return;
 
     PRUint32 byteLength = isurf->Stride() * isurf->Height();
 
     return TexSubImage2D_base(target, level,
                               xoffset, yoffset,
@@ -6227,62 +6046,62 @@ BaseTypeAndSizeFromUniformType(WebGLenum
         default:
             return false;
     }
 
     return true;
 }
 
 
-int mozilla::GetWebGLTexelFormat(GLenum format, GLenum type)
+WebGLTexelFormat mozilla::GetWebGLTexelFormat(GLenum format, GLenum type)
 {
     if (type == LOCAL_GL_UNSIGNED_BYTE) {
         switch (format) {
             case LOCAL_GL_RGBA:
-                return WebGLTexelFormat::RGBA8;
+                return WebGLTexelConversions::RGBA8;
             case LOCAL_GL_RGB:
-                return WebGLTexelFormat::RGB8;
+                return WebGLTexelConversions::RGB8;
             case LOCAL_GL_ALPHA:
-                return WebGLTexelFormat::A8;
+                return WebGLTexelConversions::A8;
             case LOCAL_GL_LUMINANCE:
-                return WebGLTexelFormat::R8;
+                return WebGLTexelConversions::R8;
             case LOCAL_GL_LUMINANCE_ALPHA:
-                return WebGLTexelFormat::RA8;
+                return WebGLTexelConversions::RA8;
             default:
-                NS_ASSERTION(false, "Coding mistake?! Should never reach this point.");
-                return WebGLTexelFormat::Generic;
+                NS_ABORT_IF_FALSE(false, "Coding mistake?! Should never reach this point.");
+                return WebGLTexelConversions::BadFormat;
         }
     } else if (type == LOCAL_GL_FLOAT) {
         // OES_texture_float
         switch (format) {
             case LOCAL_GL_RGBA:
-                return WebGLTexelFormat::RGBA32F;
+                return WebGLTexelConversions::RGBA32F;
             case LOCAL_GL_RGB:
-                return WebGLTexelFormat::RGB32F;
+                return WebGLTexelConversions::RGB32F;
             case LOCAL_GL_ALPHA:
-                return WebGLTexelFormat::A32F;
+                return WebGLTexelConversions::A32F;
             case LOCAL_GL_LUMINANCE:
-                return WebGLTexelFormat::R32F;
+                return WebGLTexelConversions::R32F;
             case LOCAL_GL_LUMINANCE_ALPHA:
-                return WebGLTexelFormat::RA32F;
+                return WebGLTexelConversions::RA32F;
             default:
-                NS_ASSERTION(false, "Coding mistake?! Should never reach this point.");
-                return WebGLTexelFormat::Generic;
+                NS_ABORT_IF_FALSE(false, "Coding mistake?! Should never reach this point.");
+                return WebGLTexelConversions::BadFormat;
         }
     } else {
         switch (type) {
             case LOCAL_GL_UNSIGNED_SHORT_4_4_4_4:
-                return WebGLTexelFormat::RGBA4444;
+                return WebGLTexelConversions::RGBA4444;
             case LOCAL_GL_UNSIGNED_SHORT_5_5_5_1:
-                return WebGLTexelFormat::RGBA5551;
+                return WebGLTexelConversions::RGBA5551;
             case LOCAL_GL_UNSIGNED_SHORT_5_6_5:
-                return WebGLTexelFormat::RGB565;
+                return WebGLTexelConversions::RGB565;
             default:
-                NS_ASSERTION(false, "Coding mistake?! Should never reach this point.");
-                return WebGLTexelFormat::Generic;
+                NS_ABORT_IF_FALSE(false, "Coding mistake?! Should never reach this point.");
+                return WebGLTexelConversions::BadFormat;
         }
     }
 }
 
 WebGLenum
 InternalFormatForFormatAndType(WebGLenum format, WebGLenum type, bool isGLES2)
 {
     // ES2 requires that format == internalformat; floating-point is
new file mode 100644
--- /dev/null
+++ b/content/canvas/src/WebGLTexelConversions.cpp
@@ -0,0 +1,382 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "WebGLTexelConversions.h"
+
+namespace mozilla {
+
+using namespace WebGLTexelConversions;
+
+namespace {
+
+/** @class WebGLImageConverter
+ *
+ * This class is just a helper to implement WebGLContext::ConvertImage below.
+ *
+ * Design comments:
+ *
+ * WebGLContext::ConvertImage has to handle hundreds of format conversion paths.
+ * It is important to minimize executable code size here. Instead of passing around
+ * a large number of function parameters hundreds of times, we create a
+ * WebGLImageConverter object once, storing these parameters, and then we call
+ * the run() method on it.
+ */
+class WebGLImageConverter
+{
+    const size_t mWidth, mHeight; // image dimensions, in texels
+    const void* const mSrcStart;
+    void* const mDstStart;
+    const ptrdiff_t mSrcStride, mDstStride; // in bytes; signed, a negative stride walks rows backwards
+    bool mAlreadyRun;
+    bool mSuccess; // set only once a conversion loop has run to completion
+
+    /*
+     * Returns sizeof(texel)/sizeof(type). The point is that we will iterate over
+     * texels with typed pointers and this value will tell us by how much we need
+     * to increment these pointers to advance to the next texel.
+     */
+    template<int Format>
+    static size_t NumElementsPerTexelForFormat() {
+        switch (Format) {
+            case R8:
+            case A8:
+            case R32F:
+            case A32F:
+            case RGBA5551:
+            case RGBA4444:
+            case RGB565:
+                return 1;
+            case RA8:
+            case RA32F:
+                return 2;
+            case RGB8:
+            case RGB32F:
+                return 3;
+            case RGBA8:
+            case BGRA8:
+            case BGRX8:
+            case RGBA32F:
+                return 4;
+            default:
+                NS_ABORT_IF_FALSE(false, "Unknown texel format. Coding mistake?");
+                return 0;
+        }
+    }
+
+    /*
+     * This is the completely format-specific templatized conversion function,
+     * that will be instantiated hundreds of times for all different combinations.
+     * It is important to avoid generating useless code here. In particular, many
+     * instantiations of this function template will never be called, so we try
+     * to return immediately in these cases to allow the compiler to avoid generating
+     * useless code.
+     */
+    template<WebGLTexelFormat SrcFormat,
+             WebGLTexelFormat DstFormat,
+             WebGLTexelPremultiplicationOp PremultiplicationOp>
+    void run()
+    {
+        // check for never-called cases. We early-return to allow the compiler
+        // to avoid generating this code. It would be tempting to abort() instead,
+        // as returning early does leave the destination surface with uninitialized
+        // data, but that would not allow the compiler to avoid generating this code.
+        // So instead, we return early, so Success() will return false, and the caller
+        // must check that and abort in that case. See WebGLContext::ConvertImage.
+
+        if (SrcFormat == DstFormat &&
+            PremultiplicationOp == NoPremultiplicationOp)
+        {
+            // Should have used a fast exit path earlier, rather than entering this function.
+            // we explicitly return here to allow the compiler to avoid generating this code
+            return;
+        }
+
+        // Only textures uploaded from DOM elements or ImageData can allow DstFormat != SrcFormat.
+        // DOM elements can only give BGRA8, BGRX8, A8, RGB565 formats. See DOMElementToImageSurface.
+        // ImageData is always RGBA8. So all other SrcFormat will always satisfy DstFormat==SrcFormat,
+        // so we can avoid compiling the code for all the unreachable paths.
+        const bool CanSrcFormatComeFromDOMElementOrImageData
+            = SrcFormat == BGRA8 ||
+              SrcFormat == BGRX8 ||
+              SrcFormat == A8 ||
+              SrcFormat == RGB565 ||
+              SrcFormat == RGBA8;
+        if (!CanSrcFormatComeFromDOMElementOrImageData &&
+            SrcFormat != DstFormat)
+        {
+            return;
+        }
+
+        // Likewise, only textures uploaded from DOM elements or ImageData can possibly have to be unpremultiplied.
+        if (!CanSrcFormatComeFromDOMElementOrImageData &&
+            PremultiplicationOp == Unpremultiply)
+        {
+            return;
+        }
+
+        // there is no point in premultiplication/unpremultiplication
+        // in the following cases:
+        //  - the source format has no alpha
+        //  - the source format has no color
+        //  - the destination format has no color
+        if (!HasAlpha(SrcFormat) ||
+            !HasColor(SrcFormat) ||
+            !HasColor(DstFormat))
+        {
+
+            if (PremultiplicationOp != NoPremultiplicationOp)
+            {
+                return;
+            }
+        }
+
+        // end of early return cases.
+
+        NS_ABORT_IF_FALSE(!mAlreadyRun, "converter should be run only once!");
+        mAlreadyRun = true;
+
+        // gather some compile-time meta-data about the formats at hand.
+
+        typedef
+            typename DataTypeForFormat<SrcFormat>::Type
+            SrcType;
+        typedef
+            typename DataTypeForFormat<DstFormat>::Type
+            DstType;
+
+        const int IntermediateSrcFormat
+            = IntermediateFormat<SrcFormat>::Value;
+        const int IntermediateDstFormat
+            = IntermediateFormat<DstFormat>::Value;
+        typedef
+            typename DataTypeForFormat<IntermediateSrcFormat>::Type
+            IntermediateSrcType;
+        typedef
+            typename DataTypeForFormat<IntermediateDstFormat>::Type
+            IntermediateDstType;
+
+        const size_t NumElementsPerSrcTexel = NumElementsPerTexelForFormat<SrcFormat>();
+        const size_t NumElementsPerDstTexel = NumElementsPerTexelForFormat<DstFormat>();
+        const size_t MaxElementsPerTexel = 4;
+        NS_ABORT_IF_FALSE(NumElementsPerSrcTexel <= MaxElementsPerTexel, "unhandled format");
+        NS_ABORT_IF_FALSE(NumElementsPerDstTexel <= MaxElementsPerTexel, "unhandled format");
+
+        // we assume that the strides are multiples of the sizeof of respective types, so
+        // that we can iterate over the src and dst images using typed pointers (uint8_t*,
+        // uint16_t*, float*). If that ever stops being true, we'll have to revert to
+        // always iterating using uint8_t* pointers regardless of the types at hand.
+        // The strides are signed (ConvertImage passes a negative dst stride to flip rows), so
+        // sizeof() is cast to ptrdiff_t: dividing by an unsigned size_t would lose the sign.
+        NS_ABORT_IF_FALSE(mSrcStride % static_cast<ptrdiff_t>(sizeof(SrcType)) == 0 &&
+                          mDstStride % static_cast<ptrdiff_t>(sizeof(DstType)) == 0,
+                          "Unsupported: texture stride is not a multiple of sizeof(type)");
+        const ptrdiff_t srcStrideInElements = mSrcStride / static_cast<ptrdiff_t>(sizeof(SrcType));
+        const ptrdiff_t dstStrideInElements = mDstStride / static_cast<ptrdiff_t>(sizeof(DstType));
+
+        const SrcType *srcRowStart = static_cast<const SrcType*>(mSrcStart);
+        DstType *dstRowStart = static_cast<DstType*>(mDstStart);
+
+        // the loop performing the texture format conversion
+        for (size_t i = 0; i < mHeight; ++i) {
+            const SrcType *srcRowEnd = srcRowStart + mWidth * NumElementsPerSrcTexel;
+            const SrcType *srcPtr = srcRowStart;
+            DstType *dstPtr = dstRowStart;
+            while (srcPtr != srcRowEnd) {
+                // convert a single texel. We proceed in 3 steps: unpack the source texel
+                // to the corresponding intermediate format (e.g. unpack RGB565 to RGBA8),
+                // convert the resulting data type to the destination type (e.g. convert
+                // from RGBA8 to RGBA32F), and finally pack the destination texel
+                // (e.g. pack RGBA32F to RGB32F).
+                IntermediateSrcType unpackedSrc[MaxElementsPerTexel];
+                IntermediateDstType unpackedDst[MaxElementsPerTexel];
+
+                // unpack a src texel to corresponding intermediate src format.
+                // for example, unpack RGB565 to RGBA8
+                unpack<SrcFormat>(srcPtr, unpackedSrc);
+                // convert the data type to the destination type, if needed.
+                // for example, convert RGBA8 to RGBA32F
+                convertType(unpackedSrc, unpackedDst);
+                // pack the destination texel.
+                // for example, pack RGBA32F to RGB32F
+                pack<DstFormat, PremultiplicationOp>(unpackedDst, dstPtr);
+
+                srcPtr += NumElementsPerSrcTexel;
+                dstPtr += NumElementsPerDstTexel;
+            }
+            srcRowStart += srcStrideInElements;
+            dstRowStart += dstStrideInElements;
+        }
+
+        mSuccess = true;
+        return;
+    }
+
+    template<WebGLTexelFormat SrcFormat, WebGLTexelFormat DstFormat>
+    void run(WebGLTexelPremultiplicationOp premultiplicationOp)
+    {
+        #define WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(PremultiplicationOp) \
+            case PremultiplicationOp: \
+                return run<SrcFormat, DstFormat, PremultiplicationOp>();
+
+        switch (premultiplicationOp) {
+            WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(NoPremultiplicationOp)
+            WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(Premultiply)
+            WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP(Unpremultiply)
+            default:
+                NS_ABORT_IF_FALSE(false, "unhandled case. Coding mistake?");
+        }
+
+        #undef WEBGLIMAGECONVERTER_CASE_PREMULTIPLICATIONOP
+    }
+
+    template<WebGLTexelFormat SrcFormat>
+    void run(WebGLTexelFormat dstFormat,
+             WebGLTexelPremultiplicationOp premultiplicationOp)
+    {
+        #define WEBGLIMAGECONVERTER_CASE_DSTFORMAT(DstFormat) \
+            case DstFormat: \
+                return run<SrcFormat, DstFormat>(premultiplicationOp);
+
+        switch (dstFormat) {
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(R8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(A8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(R32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(A32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RA8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RA32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGB8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGB565)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGB32F)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA8)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA5551)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA4444)
+            WEBGLIMAGECONVERTER_CASE_DSTFORMAT(RGBA32F)
+            default:
+                NS_ABORT_IF_FALSE(false, "unhandled case. Coding mistake?");
+        }
+
+        #undef WEBGLIMAGECONVERTER_CASE_DSTFORMAT
+    }
+
+public:
+
+    void run(WebGLTexelFormat srcFormat,
+             WebGLTexelFormat dstFormat,
+             WebGLTexelPremultiplicationOp premultiplicationOp)
+    {
+        #define WEBGLIMAGECONVERTER_CASE_SRCFORMAT(SrcFormat) \
+            case SrcFormat: \
+                return run<SrcFormat>(dstFormat, premultiplicationOp);
+
+        switch (srcFormat) {
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(R8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(A8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(R32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(A32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RA8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RA32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGB8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(BGRX8) // source format only
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGB565)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGB32F)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(BGRA8)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA5551)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA4444)
+            WEBGLIMAGECONVERTER_CASE_SRCFORMAT(RGBA32F)
+            default:
+                NS_ABORT_IF_FALSE(false, "unhandled case. Coding mistake?");
+        }
+
+        #undef WEBGLIMAGECONVERTER_CASE_SRCFORMAT
+    }
+
+    WebGLImageConverter(size_t width, size_t height,
+                        const void* srcStart, void* dstStart,
+                        ptrdiff_t srcStride, ptrdiff_t dstStride)
+        : mWidth(width), mHeight(height),
+          mSrcStart(srcStart), mDstStart(dstStart),
+          mSrcStride(srcStride), mDstStride(dstStride),
+          mAlreadyRun(false), mSuccess(false)
+    {}
+
+    bool Success() const {
+        return mSuccess;
+    }
+};
+
+} // end anonymous namespace
+
+// Converts a width x height image from srcFormat to dstFormat. Rows are read srcStride bytes
+// apart starting at src and written dstStride bytes apart into dst, bottom-up when
+// mPixelStoreFlipY is set, with alpha (un)premultiplication applied as the flags require.
+void
+WebGLContext::ConvertImage(size_t width, size_t height, size_t srcStride, size_t dstStride,
+                           const uint8_t* src, uint8_t *dst,
+                           WebGLTexelFormat srcFormat, bool srcPremultiplied,
+                           WebGLTexelFormat dstFormat, bool dstPremultiplied,
+                           size_t dstTexelSize)
+{
+    if (width == 0 || height == 0)
+        return;
+
+    const bool FormatsRequireNoPremultiplicationOp =
+        !HasAlpha(srcFormat) ||
+        !HasColor(srcFormat) ||
+        !HasColor(dstFormat);
+
+    if (srcFormat == dstFormat &&
+        (FormatsRequireNoPremultiplicationOp || srcPremultiplied == dstPremultiplied))
+    {
+        // fast exit path: we just have to memcpy all the rows.
+        // The case where absolutely nothing needs to be done is supposed to have
+        // been handled earlier (in TexImage2D_base, etc).
+        // So the case we're handling here is when even though no format conversion is needed,
+        // we still might have to flip vertically and/or to adjust to a different stride.
+
+        NS_ABORT_IF_FALSE(mPixelStoreFlipY || srcStride != dstStride, "Performance trap -- should handle this case earlier, to avoid memcpy");
+
+        size_t row_size = width * dstTexelSize; // doesn't matter, src and dst formats agree
+        const uint8_t* ptr = src;
+        const uint8_t* src_end = src + height * srcStride;
+
+        uint8_t* dst_row = mPixelStoreFlipY
+                           ? dst + (height-1) * dstStride
+                           : dst;
+        ptrdiff_t dstStrideSigned(dstStride);
+        ptrdiff_t dst_delta = mPixelStoreFlipY ? -dstStrideSigned : dstStrideSigned;
+
+        while(ptr != src_end) {
+            memcpy(dst_row, ptr, row_size);
+            ptr += srcStride;
+            dst_row += dst_delta;
+        }
+        return;
+    }
+
+    uint8_t* dstStart = dst;
+    ptrdiff_t signedDstStride = dstStride;
+    if (mPixelStoreFlipY) {
+        dstStart = dst + (height - 1) * dstStride;
+        signedDstStride = -signedDstStride; // negate the signed copy, not the unsigned size_t
+    }
+
+    WebGLImageConverter converter(width, height, src, dstStart, srcStride, signedDstStride);
+
+    const WebGLTexelPremultiplicationOp premultiplicationOp
+        = FormatsRequireNoPremultiplicationOp     ? NoPremultiplicationOp
+        : (!srcPremultiplied && dstPremultiplied) ? Premultiply
+        : (srcPremultiplied && !dstPremultiplied) ? Unpremultiply
+                                                  : NoPremultiplicationOp;
+
+    converter.run(srcFormat, dstFormat, premultiplicationOp);
+
+    if (!converter.Success()) {
+        // the dst image may be left uninitialized, so we better not try to continue even
+        // in release builds. This should never happen anyway: it would be a bug in our code.
+        NS_RUNTIMEABORT("programming mistake in WebGL texture conversions");
+    }
+}
+
+} // end namespace mozilla 
--- a/content/canvas/src/WebGLTexelConversions.h
+++ b/content/canvas/src/WebGLTexelConversions.h
@@ -20,449 +20,676 @@
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-// the pixel conversions code here is originally from this file:
-//   http://trac.webkit.org/browser/trunk/WebCore/platform/graphics/GraphicsContext3D.cpp
-
-// Keep as much as possible unchanged to ease sharing code with the WebKit guys.
-// Changes:
-//  * added BGR8 path, we need it in Mozilla to load textures from DOMElements
-//  * enclosing in a namespace WebGLTexelConversions to make it clear it is, in profilers and in symbol table dumps
-//  * added __restrict keywords. Although non-standard, this is very well supported across all compilers
-//    that I know of (GCC/LLVM/MSC/ICC/XLC...)
-//  * optimized scaleFactor computation in Unmultiply functions (1 div instead of 2)
-
 #ifndef WEBGLTEXELCONVERSIONS_H_
 #define WEBGLTEXELCONVERSIONS_H_
 
 #ifdef __SUNPRO_CC
 #define __restrict
 #endif
 
 #include "WebGLContext.h"
+#include "mozilla/StandardInteger.h"
 
 #if defined _MSC_VER
 #define FORCE_INLINE __forceinline
 #elif defined __GNUC__
 #define FORCE_INLINE __attribute__((always_inline)) inline
 #else
 #define FORCE_INLINE inline
 #endif
 
 namespace mozilla {
 
 namespace WebGLTexelConversions {
 
+enum WebGLTexelPremultiplicationOp
+{
+    NoPremultiplicationOp, // neither case below applies, or the formats make the operation pointless
+    Premultiply,           // src is not premultiplied but dst must be
+    Unpremultiply          // src is premultiplied but dst must not be
+};
+
+// Compile-time predicate: Value is true iff Format is one of the floating-point
+// texel formats (the *32F ones). DataTypeForFormat and IntermediateFormat use it
+// to select float as element type and RGBA32F as intermediate format.
+template<int Format>
+struct IsFloatFormat
+{
+    static const bool Value =
+        Format == R32F || Format == A32F || Format == RA32F ||
+        Format == RGB32F || Format == RGBA32F;
+};
+
+// Compile-time predicate: Value is true iff Format is one of the packed 16-bit
+// formats, which DataTypeForFormat maps to uint16_t.
+template<int Format>
+struct Is16bppFormat
+{
+    static const bool Value =
+        Format == RGB565 || Format == RGBA5551 || Format == RGBA4444;
+};
+
+template<int Format,
+         bool IsFloat = IsFloatFormat<Format>::Value,
+         bool Is16bpp = Is16bppFormat<Format>::Value>
+struct DataTypeForFormat // maps a texel format to the C type used to read/write its data
+{
+    typedef uint8_t Type; // default: formats that are neither float nor 16bpp
+};
+// float formats are accessed as float
+template<int Format>
+struct DataTypeForFormat<Format, true, false>
+{
+    typedef float Type;
+};
+// 16bpp formats are accessed as uint16_t
+template<int Format>
+struct DataTypeForFormat<Format, false, true>
+{
+    typedef uint16_t Type;
+};
+
+template<int Format>
+struct IntermediateFormat // the format that texels of Format are unpacked to during conversion
+{
+    static const int Value = IsFloatFormat<Format>::Value ? RGBA32F : RGBA8; // float formats use RGBA32F, all others RGBA8
+};
+
+// Returns the size in bytes of one texel of the given format (0, after NS_ABORT_IF_FALSE, if unknown).
+inline size_t TexelBytesForFormat(int format) {
+    switch (format) {
+        // 8-bit formats: one byte per component
+        case R8: case A8:
+            return 1;
+        case RA8:
+            return 2;
+        case RGB8:
+            return 3;
+        case RGBA8: case BGRA8: case BGRX8:
+            return 4;
+        // packed formats: the whole texel is a single 16-bit value
+        case RGBA5551: case RGBA4444: case RGB565:
+            return 2;
+        // float formats: four bytes per component
+        case R32F: case A32F:
+            return 4;
+        case RA32F:
+            return 8;
+        case RGB32F:
+            return 12;
+        case RGBA32F:
+            return 16;
+        default:
+            NS_ABORT_IF_FALSE(false, "Unknown texel format. Coding mistake?");
+            return 0;
+    }
+}
+
+// True iff the given format is one of the texel formats that carry an alpha channel.
+FORCE_INLINE bool HasAlpha(int format) {
+    switch (format) {
+        case A8: case A32F: case RA8: case RA32F:
+        case RGBA8: case BGRA8: case RGBA32F:
+        case RGBA4444: case RGBA5551:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// True iff texels of the given format carry color data, i.e. the format is one of
+// the convertible formats other than the alpha-only A8 and A32F. Any other value
+// yields false.
+FORCE_INLINE bool HasColor(int format) {
+    switch (format) {
+        case R8: case R32F:
+        case RA8: case RA32F:
+        case RGB8: case BGRX8: case RGB565: case RGB32F:
+        case RGBA8: case BGRA8: case RGBA32F:
+        case RGBA4444: case RGBA5551:
+            return true;
+        default:
+            return false;
+    }
+}
+
+
 /****** BEGIN CODE SHARED WITH WEBKIT ******/
 
+// the pack/unpack functions here are originally from this file:
+//   http://trac.webkit.org/browser/trunk/WebCore/platform/graphics/GraphicsContext3D.cpp
+
 //----------------------------------------------------------------------
 // Pixel unpacking routines.
 
-FORCE_INLINE void unpackRGBA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<int Format, typename SrcType, typename DstType>
+FORCE_INLINE void
+unpack(const SrcType* __restrict src,
+       DstType* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = source[3];
+    NS_ABORT_IF_FALSE(false, "Unimplemented texture format conversion");
 }
 
-FORCE_INLINE void unpackRGB8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGBA8, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = 0xFF;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void unpackBGRA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGB8, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[2];
-    destination[1] = source[1];
-    destination[2] = source[0];
-    destination[3] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = 0xFF;
 }
 
-FORCE_INLINE void unpackBGR8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<BGRA8, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[2];
-    destination[1] = source[1];
-    destination[2] = source[0];
-    destination[3] = 0xFF;
+    dst[0] = src[2];
+    dst[1] = src[1];
+    dst[2] = src[0];
+    dst[3] = src[3];
+}
+
+template<> FORCE_INLINE void
+unpack<BGRX8, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[2];
+    dst[1] = src[1];
+    dst[2] = src[0];
+    dst[3] = 0xFF;
 }
 
-FORCE_INLINE void unpackRGBA5551ToRGBA8(const uint16_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGBA5551, uint16_t, uint8_t>(const uint16_t* __restrict src, uint8_t* __restrict dst)
 {
-    uint16_t packedValue = source[0];
-    uint8_t r = packedValue >> 11;
+    uint16_t packedValue = src[0];
+    uint8_t r = (packedValue >> 11) & 0x1F;
     uint8_t g = (packedValue >> 6) & 0x1F;
     uint8_t b = (packedValue >> 1) & 0x1F;
-    destination[0] = (r << 3) | (r & 0x7);
-    destination[1] = (g << 3) | (g & 0x7);
-    destination[2] = (b << 3) | (b & 0x7);
-    destination[3] = (packedValue & 0x1) ? 0xFF : 0x0;
+    dst[0] = (r << 3) | (r & 0x7);
+    dst[1] = (g << 3) | (g & 0x7);
+    dst[2] = (b << 3) | (b & 0x7);
+    dst[3] = (packedValue & 0x1) ? 0xFF : 0;
 }
 
-FORCE_INLINE void unpackRGBA4444ToRGBA8(const uint16_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGBA4444, uint16_t, uint8_t>(const uint16_t* __restrict src, uint8_t* __restrict dst)
 {
-    uint16_t packedValue = source[0];
-    uint8_t r = packedValue >> 12;
+    uint16_t packedValue = src[0];
+    uint8_t r = (packedValue >> 12) & 0x0F;
     uint8_t g = (packedValue >> 8) & 0x0F;
     uint8_t b = (packedValue >> 4) & 0x0F;
     uint8_t a = packedValue & 0x0F;
-    destination[0] = r << 4 | r;
-    destination[1] = g << 4 | g;
-    destination[2] = b << 4 | b;
-    destination[3] = a << 4 | a;
+    dst[0] = (r << 4) | r;
+    dst[1] = (g << 4) | g;
+    dst[2] = (b << 4) | b;
+    dst[3] = (a << 4) | a;
 }
 
-FORCE_INLINE void unpackRGB565ToRGBA8(const uint16_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGB565, uint16_t, uint8_t>(const uint16_t* __restrict src, uint8_t* __restrict dst)
 {
-    uint16_t packedValue = source[0];
-    uint8_t r = packedValue >> 11;
+    uint16_t packedValue = src[0];
+    uint8_t r = (packedValue >> 11) & 0x1F;
     uint8_t g = (packedValue >> 5) & 0x3F;
     uint8_t b = packedValue & 0x1F;
-    destination[0] = (r << 3) | (r & 0x7);
-    destination[1] = (g << 2) | (g & 0x3);
-    destination[2] = (b << 3) | (b & 0x7);
-    destination[3] = 0xFF;
+    dst[0] = (r << 3) | (r & 0x7);
+    dst[1] = (g << 2) | (g & 0x3);
+    dst[2] = (b << 3) | (b & 0x7);
+    dst[3] = 0xFF;
 }
 
-FORCE_INLINE void unpackR8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<R8, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = 0xFF;
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = 0xFF;
 }
 
-FORCE_INLINE void unpackRA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RA8, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = source[1];
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = src[1];
 }
 
-FORCE_INLINE void unpackA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+unpack<A8, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = 0x0;
-    destination[1] = 0x0;
-    destination[2] = 0x0;
-    destination[3] = source[0];
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = src[0];
 }
 
-FORCE_INLINE void unpackRGB32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGBA32F, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = 1;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void unpackR32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RGB32F, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = 1;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = 1.0f;
 }
 
-FORCE_INLINE void unpackRA32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<R32F, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[0];
-    destination[2] = source[0];
-    destination[3] = source[1];
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = 1.0f;
 }
 
-FORCE_INLINE void unpackA32FToRGBA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+unpack<RA32F, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = 0;
-    destination[1] = 0;
-    destination[2] = 0;
-    destination[3] = source[0];
+    dst[0] = src[0];
+    dst[1] = src[0];
+    dst[2] = src[0];
+    dst[3] = src[1];
+}
+
+template<> FORCE_INLINE void
+unpack<A32F, float, float>(const float* __restrict src, float* __restrict dst)
+{
+    dst[0] = 0;
+    dst[1] = 0;
+    dst[2] = 0;
+    dst[3] = src[0];
 }
 
 //----------------------------------------------------------------------
 // Pixel packing routines.
 //
 
-FORCE_INLINE void packRGBA8ToA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<int Format, int PremultiplicationOp, typename SrcType, typename DstType>
+FORCE_INLINE void
+pack(const SrcType* __restrict src,
+     DstType* __restrict dst)
 {
-    destination[0] = source[3];
+    NS_ABORT_IF_FALSE(false, "Unimplemented texture format conversion");
 }
 
-FORCE_INLINE void packRGBA8ToR8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<A8, NoPremultiplicationOp, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
+    dst[0] = src[3];
 }
 
-FORCE_INLINE void packRGBA8ToR8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<A8, Premultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
+    dst[0] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<A8, Unpremultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[3];
 }
 
-// FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToR8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<R8, NoPremultiplicationOp, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
+    dst[0] = src[0];
+}
+
+template<> FORCE_INLINE void
+pack<R8, Premultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
 }
 
-FORCE_INLINE void packRGBA8ToRA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<R8, Unpremultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[3];
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
 }
 
-FORCE_INLINE void packRGBA8ToRA8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RA8, NoPremultiplicationOp, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<RA8, Premultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = src[3];
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToRA8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RA8, Unpremultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<RGB8, NoPremultiplicationOp, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+}
+
+template<> FORCE_INLINE void
+pack<RGB8, Premultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
 }
 
-FORCE_INLINE void packRGBA8ToRGB8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB8, Unpremultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
 }
 
-FORCE_INLINE void packRGBA8ToRGB8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA8, NoPremultiplicationOp, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+}
+
+template<> FORCE_INLINE void
+pack<RGBA8, Premultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
+    dst[3] = src[3];
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToRGB8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
-{
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
-}
-
-// This is only used when the source format is different than kSourceFormatRGBA8.
-FORCE_INLINE void packRGBA8ToRGBA8(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA8, Unpremultiply, uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
-    destination[3] = source[3];
-}
-
-FORCE_INLINE void packRGBA8ToRGBA8Premultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
-{
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
-    destination[3] = source[3];
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    dst[0] = srcR;
+    dst[1] = srcG;
+    dst[2] = srcB;
+    dst[3] = src[3];
 }
 
-// FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToRGBA8Unmultiply(const uint8_t* __restrict source, uint8_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA4444, NoPremultiplicationOp, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    destination[0] = sourceR;
-    destination[1] = sourceG;
-    destination[2] = sourceB;
-    destination[3] = source[3];
+    *dst = ( ((src[0] & 0xF0) << 8)
+           | ((src[1] & 0xF0) << 4)
+           | (src[2] & 0xF0)
+           | (src[3] >> 4) );
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort4444(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA4444, Premultiply, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    *destination = (((source[0] & 0xF0) << 8)
-                    | ((source[1] & 0xF0) << 4)
-                    | (source[2] & 0xF0)
-                    | (source[3] >> 4));
-}
-
-FORCE_INLINE void packRGBA8ToUnsignedShort4444Premultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
-{
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF0) << 8)
-                    | ((sourceG & 0xF0) << 4)
-                    | (sourceB & 0xF0)
-                    | (source[3] >> 4));
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF0) << 8)
+           | ((srcG & 0xF0) << 4)
+           | (srcB & 0xF0)
+           | (src[3] >> 4));
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToUnsignedShort4444Unmultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA4444, Unpremultiply, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF0) << 8)
-                    | ((sourceG & 0xF0) << 4)
-                    | (sourceB & 0xF0)
-                    | (source[3] >> 4));
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF0) << 8)
+           | ((srcG & 0xF0) << 4)
+           | (srcB & 0xF0)
+           | (src[3] >> 4));
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort5551(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA5551, NoPremultiplicationOp, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    *destination = (((source[0] & 0xF8) << 8)
-                    | ((source[1] & 0xF8) << 3)
-                    | ((source[2] & 0xF8) >> 2)
-                    | (source[3] >> 7));
+    *dst = ( ((src[0] & 0xF8) << 8)
+           | ((src[1] & 0xF8) << 3)
+           | ((src[2] & 0xF8) >> 2)
+           | (src[3] >> 7));
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort5551Premultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA5551, Premultiply, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xF8) << 3)
-                    | ((sourceB & 0xF8) >> 2)
-                    | (source[3] >> 7));
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xF8) << 3)
+           | ((srcB & 0xF8) >> 2)
+           | (src[3] >> 7));
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToUnsignedShort5551Unmultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA5551, Unpremultiply, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xF8) << 3)
-                    | ((sourceB & 0xF8) >> 2)
-                    | (source[3] >> 7));
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xF8) << 3)
+           | ((srcB & 0xF8) >> 2)
+           | (src[3] >> 7));
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort565(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB565, NoPremultiplicationOp, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    *destination = (((source[0] & 0xF8) << 8)
-                    | ((source[1] & 0xFC) << 3)
-                    | ((source[2] & 0xF8) >> 3));
+    *dst = ( ((src[0] & 0xF8) << 8)
+           | ((src[1] & 0xFC) << 3)
+           | ((src[2] & 0xF8) >> 3));
 }
 
-FORCE_INLINE void packRGBA8ToUnsignedShort565Premultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB565, Premultiply, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] / 255.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xFC) << 3)
-                    | ((sourceB & 0xF8) >> 3));
+    float scaleFactor = src[3] / 255.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xFC) << 3)
+           | ((srcB & 0xF8) >> 3));
 }
 
 // FIXME: this routine is lossy and must be removed.
-FORCE_INLINE void packRGBA8ToUnsignedShort565Unmultiply(const uint8_t* __restrict source, uint16_t* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB565, Unpremultiply, uint8_t, uint16_t>(const uint8_t* __restrict src, uint16_t* __restrict dst)
 {
-    float scaleFactor = source[3] ? 255.0f / source[3] : 1.0f;
-    uint8_t sourceR = static_cast<uint8_t>(static_cast<float>(source[0]) * scaleFactor);
-    uint8_t sourceG = static_cast<uint8_t>(static_cast<float>(source[1]) * scaleFactor);
-    uint8_t sourceB = static_cast<uint8_t>(static_cast<float>(source[2]) * scaleFactor);
-    *destination = (((sourceR & 0xF8) << 8)
-                    | ((sourceG & 0xFC) << 3)
-                    | ((sourceB & 0xF8) >> 3));
+    float scaleFactor = src[3] ? 255.0f / src[3] : 1.0f;
+    uint8_t srcR = static_cast<uint8_t>(src[0] * scaleFactor);
+    uint8_t srcG = static_cast<uint8_t>(src[1] * scaleFactor);
+    uint8_t srcB = static_cast<uint8_t>(src[2] * scaleFactor);
+    *dst = ( ((srcR & 0xF8) << 8)
+           | ((srcG & 0xFC) << 3)
+           | ((srcB & 0xF8) >> 3));
 }
 
-FORCE_INLINE void packRGBA32FToRGB32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB32F, NoPremultiplicationOp, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[1];
-    destination[2] = source[2];
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
 }
 
-FORCE_INLINE void packRGBA32FToRGB32FPremultiply(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGB32F, Premultiply, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
-    destination[1] = source[1] * scaleFactor;
-    destination[2] = source[2] * scaleFactor;
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = src[1] * scaleFactor;
+    dst[2] = src[2] * scaleFactor;
 }
 
-FORCE_INLINE void packRGBA32FToRGBA32FPremultiply(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA32F, NoPremultiplicationOp, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
-    destination[1] = source[1] * scaleFactor;
-    destination[2] = source[2] * scaleFactor;
-    destination[3] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
 }
 
-FORCE_INLINE void packRGBA32FToA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RGBA32F, Premultiply, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[3];
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = src[1] * scaleFactor;
+    dst[2] = src[2] * scaleFactor;
+    dst[3] = src[3];
 }
 
-// identical to above, to avoid special-casing
-FORCE_INLINE void packRGBA32FToA32FPremultiply(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<A32F, NoPremultiplicationOp, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[3];
+    dst[0] = src[3];
 }
 
-FORCE_INLINE void packRGBA32FToR32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<A32F, Premultiply, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
+    dst[0] = src[3];
 }
 
-FORCE_INLINE void packRGBA32FToR32FPremultiply(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<R32F, NoPremultiplicationOp, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
+    dst[0] = src[0];
+}
+
+template<> FORCE_INLINE void
+pack<R32F, Premultiply, float, float>(const float* __restrict src, float* __restrict dst)
+{
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
 }
 
-
-FORCE_INLINE void packRGBA32FToRA32F(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RA32F, NoPremultiplicationOp, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    destination[0] = source[0];
-    destination[1] = source[3];
+    dst[0] = src[0];
+    dst[1] = src[3];
 }
 
-FORCE_INLINE void packRGBA32FToRA32FPremultiply(const float* __restrict source, float* __restrict destination)
+template<> FORCE_INLINE void
+pack<RA32F, Premultiply, float, float>(const float* __restrict src, float* __restrict dst)
 {
-    float scaleFactor = source[3];
-    destination[0] = source[0] * scaleFactor;
-    destination[1] = scaleFactor;
+    float scaleFactor = src[3];
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = scaleFactor;
 }
 
 /****** END CODE SHARED WITH WEBKIT ******/
 
+template<typename SrcType, typename DstType> FORCE_INLINE void
+convertType(const SrcType* __restrict src, DstType* __restrict dst)
+{
+    NS_ABORT_IF_FALSE(false, "Unimplemented texture format conversion");
+}
+
+template<> FORCE_INLINE void
+convertType<uint8_t, uint8_t>(const uint8_t* __restrict src, uint8_t* __restrict dst)
+{
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+}
+
+template<> FORCE_INLINE void
+convertType<float, float>(const float* __restrict src, float* __restrict dst)
+{
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+}
+
+template<> FORCE_INLINE void
+convertType<uint8_t, float>(const uint8_t* __restrict src, float* __restrict dst)
+{
+    const float scaleFactor = 1.f / 255.0f;
+    dst[0] = src[0] * scaleFactor;
+    dst[1] = src[1] * scaleFactor;
+    dst[2] = src[2] * scaleFactor;
+    dst[3] = src[3] * scaleFactor;
+}
+
+#undef FORCE_INLINE
+
 } // end namespace WebGLTexelConversions
 
 } // end namespace mozilla
 
 #endif // WEBGLTEXELCONVERSIONS_H_