Bug 1551084 - Part 3. Use templates to reduce code duplication in QCMS. r=miko
☠ ☠ backed out by 1e50992f4133 ☠ ☠
author: Andrew Osmond <aosmond@mozilla.com>
Thu, 09 May 2019 16:49:38 -0400
changeset 475666 99874bf8941910abd3c10f0ba5a762e7f1dc60f5
parent 475665 d73949bd98e99a892a282a85b68aeb44cd972ffc
child 475667 2a5ae3eb40ce2eb31f3708a8c82690fa8858ce14
push id: 113222
push user: aosmond@gmail.com
push date: Mon, 27 May 2019 15:17:15 +0000
treeherder: mozilla-inbound@af04f8907fab [default view] [failures only]
perfherder: [talos] [build metrics] [platform microbench] (compared to previous push)
reviewers: miko
bugs: 1551084
milestone: 69.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 1551084 - Part 3. Use templates to reduce code duplication in QCMS. r=miko No functional change. This allows us to support both alpha and non-alpha variants with the same implementation, in addition to laying the ground work for QCMS to support BGRA as an input and output. Differential Revision: https://phabricator.services.mozilla.com/D30820
gfx/qcms/qcms.h
gfx/qcms/qcmsint.h
gfx/qcms/transform-altivec.cpp
gfx/qcms/transform-sse1.cpp
gfx/qcms/transform-sse2.cpp
gfx/qcms/transform.cpp
--- a/gfx/qcms/qcms.h
+++ b/gfx/qcms/qcms.h
@@ -162,17 +162,17 @@ void qcms_profile_precache_output_transf
 
 qcms_transform* qcms_transform_create(
 		qcms_profile *in, qcms_data_type in_type,
 		qcms_profile* out, qcms_data_type out_type,
 		qcms_intent intent);
 
 void qcms_transform_release(qcms_transform *);
 
-void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length);
+void qcms_transform_data(qcms_transform *transform, const void *src, void *dest, size_t length);
 
 void qcms_enable_iccv4();
 
 #ifdef  __cplusplus
 }
 #endif
 
 #endif
--- a/gfx/qcms/qcmsint.h
+++ b/gfx/qcms/qcmsint.h
@@ -69,17 +69,17 @@ struct _qcms_transform {
 	size_t output_gamma_lut_b_length;
 
 	size_t output_gamma_lut_gray_length;
 
 	struct precache_output *output_table_r;
 	struct precache_output *output_table_g;
 	struct precache_output *output_table_b;
 
-	void (*transform_fn)(struct _qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length);
+	void (*transform_fn)(const struct _qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length);
 };
 
 struct matrix {
 	float m[3][3];
 	bool invalid;
 };
 
 struct qcms_modular_transform;
@@ -259,39 +259,39 @@ static inline float uInt16Number_to_floa
 	return ((int32_t)a)/65535.f;
 }
 
 
 void precache_release(struct precache_output *p);
 bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries);
 bool get_rgb_colorants(struct matrix *colorants, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries);
 
-void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
-                                          unsigned char *src,
+void qcms_transform_data_rgb_out_lut_sse2(const qcms_transform *transform,
+                                          const unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
-void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
-                                          unsigned char *src,
+void qcms_transform_data_rgba_out_lut_sse2(const qcms_transform *transform,
+                                          const unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
-void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
-                                          unsigned char *src,
+void qcms_transform_data_rgb_out_lut_sse1(const qcms_transform *transform,
+                                          const unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
-void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
-                                          unsigned char *src,
+void qcms_transform_data_rgba_out_lut_sse1(const qcms_transform *transform,
+                                          const unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
 
-void qcms_transform_data_rgb_out_lut_altivec(qcms_transform *transform,
-                                             unsigned char *src,
+void qcms_transform_data_rgb_out_lut_altivec(const qcms_transform *transform,
+                                             const unsigned char *src,
                                              unsigned char *dest,
                                              size_t length);
-void qcms_transform_data_rgba_out_lut_altivec(qcms_transform *transform,
-                                              unsigned char *src,
+void qcms_transform_data_rgba_out_lut_altivec(const qcms_transform *transform,
+                                              const unsigned char *src,
                                               unsigned char *dest,
                                               size_t length);
 
 extern bool qcms_supports_iccv4;
 
 #ifdef _MSC_VER
 
 long __cdecl _InterlockedIncrement(long volatile *);
@@ -305,36 +305,30 @@ long __cdecl _InterlockedDecrement(long 
 #else
 
 #define qcms_atomic_increment(x) __sync_add_and_fetch(&x, 1)
 #define qcms_atomic_decrement(x) __sync_sub_and_fetch(&x, 1)
 
 #endif
 
 
-#ifdef NATIVE_OUTPUT
-# define RGB_OUTPUT_COMPONENTS 4
-# define RGBA_OUTPUT_COMPONENTS 4
-# ifdef IS_LITTLE_ENDIAN
-#  define OUTPUT_A_INDEX 3
-#  define OUTPUT_R_INDEX 2
-#  define OUTPUT_G_INDEX 1
-#  define OUTPUT_B_INDEX 0
-# else
-#  define OUTPUT_A_INDEX 0
-#  define OUTPUT_R_INDEX 1
-#  define OUTPUT_G_INDEX 2
-#  define OUTPUT_B_INDEX 3
-# endif
-#else
-# define RGB_OUTPUT_COMPONENTS 3
-# define RGBA_OUTPUT_COMPONENTS 4
-# define OUTPUT_R_INDEX 0
-# define OUTPUT_G_INDEX 1
-# define OUTPUT_B_INDEX 2
-# define OUTPUT_A_INDEX 3
-#endif
+#define RGB_COMPONENTS 3
+#define RGBA_COMPONENTS 4
+
+#define RGBA_R_INDEX 0
+#define RGBA_G_INDEX 1
+#define RGBA_B_INDEX 2
+#define RGBA_A_INDEX 3
+
+#define BGRA_B_INDEX 0
+#define BGRA_G_INDEX 1
+#define BGRA_R_INDEX 2
+#define BGRA_A_INDEX 3
+
+#define NO_A_INDEX   0xFF
+
+#define A_INDEX_COMPONENTS(kAIndex)    ((kAIndex) == NO_A_INDEX ? RGB_COMPONENTS : RGBA_COMPONENTS)
 
 #ifdef __cplusplus
 }
 #endif
 
 #endif
--- a/gfx/qcms/transform-altivec.cpp
+++ b/gfx/qcms/transform-altivec.cpp
@@ -32,134 +32,24 @@ static const ALIGN float clampMaxValueX4
 
 inline vector float load_aligned_float(float *dataPtr)
 {
 	vector float data = vec_lde(0, dataPtr);
 	vector unsigned char moveToStart = vec_lvsl(0, dataPtr);
 	return vec_perm(data, data, moveToStart);
 }
 
-void qcms_transform_data_rgb_out_lut_altivec(qcms_transform *transform,
-                                             unsigned char *src,
-                                             unsigned char *dest,
-                                             size_t length)
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_template_lut_altivec(const qcms_transform *transform,
+                                                     const unsigned char *src,
+                                                     unsigned char *dest,
+                                                     size_t length)
 {
 	unsigned int i;
-	float (*mat)[4] = transform->matrix;
-	char input_back[32];
-	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
-	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-	 * because they don't work on stack variables. gcc 4.4 does do the right thing
-	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-	float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-	/* share input and output locations to save having to keep the
- 	 * locations in separate registers */
-	uint32_t const *output = (uint32_t*)input;
-
-	/* deref *transform now to avoid it in loop */
-	const float *igtbl_r = transform->input_gamma_table_r;
-	const float *igtbl_g = transform->input_gamma_table_g;
-	const float *igtbl_b = transform->input_gamma_table_b;
-
-	/* deref *transform now to avoid it in loop */
-	const uint8_t *otdata_r = &transform->output_table_r->data[0];
-	const uint8_t *otdata_g = &transform->output_table_g->data[0];
-	const uint8_t *otdata_b = &transform->output_table_b->data[0];
-
-	/* input matrix values never change */
-	const vector float mat0 = vec_ldl(0, (vector float*)mat[0]);
-	const vector float mat1 = vec_ldl(0, (vector float*)mat[1]);
-	const vector float mat2 = vec_ldl(0, (vector float*)mat[2]);
-
-	/* these values don't change, either */
-	const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0);
-	const vector float min = (vector float)vec_splat_u32(0);
-	const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0);
-
-	/* working variables */
-	vector float vec_r, vec_g, vec_b, result;
-
-	/* CYA */
-	if (!length)
-		return;
-
-	/* one pixel is handled outside of the loop */
-	length--;
-
-	/* setup for transforming 1st pixel */
-	vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
-	vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
-	vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
-	src += 3;
-
-	/* transform all but final pixel */
-
-	for (i=0; i<length; i++)
-	{
-		/* position values from gamma tables */
-		vec_r = vec_splat(vec_r, 0);
-		vec_g = vec_splat(vec_g, 0);
-		vec_b = vec_splat(vec_b, 0);
-
-		/* gamma * matrix */
-		vec_r = vec_madd(vec_r, mat0, min);
-		vec_g = vec_madd(vec_g, mat1, min);
-		vec_b = vec_madd(vec_b, mat2, min);
-
-		/* crunch, crunch, crunch */
-		vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
-		vec_r = vec_max(min, vec_r);
-		vec_r = vec_min(max, vec_r);
-		result = vec_madd(vec_r, scale, min);
-
-		/* store calc'd output tables indices */
-		vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);
-
-		/* load for next loop while store completes */
-		vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
-		vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
-		vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
-		src += 3;
-
-		/* use calc'd indices to output RGB values */
-		dest[0] = otdata_r[output[0]];
-		dest[1] = otdata_g[output[1]];
-		dest[2] = otdata_b[output[2]];
-		dest += 3;
-	}
-
-	/* handle final (maybe only) pixel */
-
-	vec_r = vec_splat(vec_r, 0);
-	vec_g = vec_splat(vec_g, 0);
-	vec_b = vec_splat(vec_b, 0);
-
-	vec_r = vec_madd(vec_r, mat0, min);
-	vec_g = vec_madd(vec_g, mat1, min);
-	vec_b = vec_madd(vec_b, mat2, min);
-
-	vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
-	vec_r = vec_max(min, vec_r);
-	vec_r = vec_min(max, vec_r);
-	result = vec_madd(vec_r, scale, min);
-
-	vec_st(vec_ctu(vec_round(result),0),0,(vector unsigned int*)output);
-
-	dest[0] = otdata_r[output[0]];
-	dest[1] = otdata_g[output[1]];
-	dest[2] = otdata_b[output[2]];
-}
-
-void qcms_transform_data_rgba_out_lut_altivec(qcms_transform *transform,
-                                              unsigned char *src,
-                                              unsigned char *dest,
-                                              size_t length)
-{
-	unsigned int i;
-	float (*mat)[4] = transform->matrix;
+	const float (*mat)[4] = transform->matrix;
 	char input_back[32];
 	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
 	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
 	 * because they don't work on stack variables. gcc 4.4 does do the right thing
 	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
 	float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
 	/* share input and output locations to save having to keep the
 	 * locations in separate registers */
@@ -179,91 +69,113 @@ void qcms_transform_data_rgba_out_lut_al
 	const vector float mat0 = vec_ldl(0, (vector float*)mat[0]);
 	const vector float mat1 = vec_ldl(0, (vector float*)mat[1]);
 	const vector float mat2 = vec_ldl(0, (vector float*)mat[2]);
 
 	/* these values don't change, either */
 	const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0);
 	const vector float min = (vector float)vec_splat_u32(0);
 	const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0);
+	const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 
 	/* working variables */
 	vector float vec_r, vec_g, vec_b, result;
 	unsigned char alpha;
 
 	/* CYA */
 	if (!length)
 		return;
 
 	/* one pixel is handled outside of the loop */
 	length--;
 
 	/* setup for transforming 1st pixel */
-	vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
-	vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
-	vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
-	alpha = src[3];
-	src += 4;
+	vec_r = load_aligned_float((float*)&igtbl_r[src[kRIndex]]);
+	vec_g = load_aligned_float((float*)&igtbl_r[src[kGIndex]]);
+	vec_b = load_aligned_float((float*)&igtbl_r[src[kBIndex]]);
+	if (kAIndex != NO_A_INDEX) {
+		alpha = src[kAIndex];
+	}
+	src += components;
 
 	/* transform all but final pixel */
 
 	for (i=0; i<length; i++)
 	{
 		/* position values from gamma tables */
 		vec_r = vec_splat(vec_r, 0);
 		vec_g = vec_splat(vec_g, 0);
 		vec_b = vec_splat(vec_b, 0);
 
 		/* gamma * matrix */
 		vec_r = vec_madd(vec_r, mat0, min);
 		vec_g = vec_madd(vec_g, mat1, min);
 		vec_b = vec_madd(vec_b, mat2, min);
 
 		/* store alpha for this pixel; load alpha for next */
-		dest[3] = alpha;
-		alpha = src[3];
+		if (kAIndex != NO_A_INDEX) {
+			dest[kAIndex] = alpha;
+			alpha = src[kAIndex];
+		}
 
 		/* crunch, crunch, crunch */
 		vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
 		vec_r = vec_max(min, vec_r);
 		vec_r = vec_min(max, vec_r);
 		result = vec_madd(vec_r, scale, min);
 
 		/* store calc'd output tables indices */
 		vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);
 
 		/* load gamma values for next loop while store completes */
-		vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
-		vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
-		vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
-		src += 4;
+		vec_r = load_aligned_float((float*)&igtbl_r[src[kRIndex]]);
+		vec_g = load_aligned_float((float*)&igtbl_r[src[kGIndex]]);
+		vec_b = load_aligned_float((float*)&igtbl_r[src[kBIndex]]);
+		src += components;
 
 		/* use calc'd indices to output RGB values */
-		dest[0] = otdata_r[output[0]];
-		dest[1] = otdata_g[output[1]];
-		dest[2] = otdata_b[output[2]];
-		dest += 4;
+		dest[kRIndex] = otdata_r[output[0]];
+		dest[kGIndex] = otdata_g[output[1]];
+		dest[kBIndex] = otdata_b[output[2]];
+		dest += components;
 	}
 
 	/* handle final (maybe only) pixel */
 
 	vec_r = vec_splat(vec_r, 0);
 	vec_g = vec_splat(vec_g, 0);
 	vec_b = vec_splat(vec_b, 0);
 
 	vec_r = vec_madd(vec_r, mat0, min);
 	vec_g = vec_madd(vec_g, mat1, min);
 	vec_b = vec_madd(vec_b, mat2, min);
 
-	dest[3] = alpha;
+	if (kAIndex != NO_A_INDEX) {
+		dest[kAIndex] = alpha;
+	}
 
 	vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
 	vec_r = vec_max(min, vec_r);
 	vec_r = vec_min(max, vec_r);
 	result = vec_madd(vec_r, scale, min);
 
 	vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);
 
-	dest[0] = otdata_r[output[0]];
-	dest[1] = otdata_g[output[1]];
-	dest[2] = otdata_b[output[2]];
+	dest[kRIndex] = otdata_r[output[0]];
+	dest[kGIndex] = otdata_g[output[1]];
+	dest[kBIndex] = otdata_b[output[2]];
 }
 
+void qcms_transform_data_rgb_out_lut_altivec(const qcms_transform *transform,
+                                             const unsigned char *src,
+                                             unsigned char *dest,
+                                             size_t length)
+{
+    qcms_transform_data_template_lut_altivec<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
+}
+
+void qcms_transform_data_rgba_out_lut_altivec(const qcms_transform *transform,
+                                              const unsigned char *src,
+                                              unsigned char *dest,
+                                              size_t length)
+{
+    qcms_transform_data_template_lut_altivec<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
+}
--- a/gfx/qcms/transform-sse1.cpp
+++ b/gfx/qcms/transform-sse1.cpp
@@ -5,23 +5,24 @@
 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
 #define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
 static const ALIGN float floatScaleX4[4] =
     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
 static const ALIGN float clampMaxValueX4[4] =
     { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
 
-void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length)
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_template_lut_sse1(const qcms_transform *transform,
+                                                  const unsigned char *src,
+                                                  unsigned char *dest,
+                                                  size_t length)
 {
     unsigned int i;
-    float (*mat)[4] = transform->matrix;
+    const float (*mat)[4] = transform->matrix;
     char input_back[32];
     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
      * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
      * because they don't work on stack variables. gcc 4.4 does do the right thing
      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
     /* share input and output locations to save having to keep the
      * locations in separate registers */
@@ -41,213 +42,119 @@ void qcms_transform_data_rgb_out_lut_sse
     const __m128 mat0  = _mm_load_ps(mat[0]);
     const __m128 mat1  = _mm_load_ps(mat[1]);
     const __m128 mat2  = _mm_load_ps(mat[2]);
 
     /* these values don't change, either */
     const __m128 max   = _mm_load_ps(clampMaxValueX4);
     const __m128 min   = _mm_setzero_ps();
     const __m128 scale = _mm_load_ps(floatScaleX4);
-
-    /* working variables */
-    __m128 vec_r, vec_g, vec_b, result;
-
-    /* CYA */
-    if (!length)
-        return;
-
-    /* one pixel is handled outside of the loop */
-    length--;
-
-    /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    src += 3;
-
-    /* transform all but final pixel */
-
-    for (i=0; i<length; i++)
-    {
-        /* position values from gamma tables */
-        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-        /* gamma * matrix */
-        vec_r = _mm_mul_ps(vec_r, mat0);
-        vec_g = _mm_mul_ps(vec_g, mat1);
-        vec_b = _mm_mul_ps(vec_b, mat2);
-
-        /* crunch, crunch, crunch */
-        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-        vec_r  = _mm_max_ps(min, vec_r);
-        vec_r  = _mm_min_ps(max, vec_r);
-        result = _mm_mul_ps(vec_r, scale);
-
-        /* store calc'd output tables indices */
-        *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
-        result = _mm_movehl_ps(result, result);
-        *((__m64 *)&output[2]) = _mm_cvtps_pi32(result) ;
-
-        /* load for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 3;
-
-        /* use calc'd indices to output RGB values */
-        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
-        dest += RGB_OUTPUT_COMPONENTS;
-    }
-
-    /* handle final (maybe only) pixel */
-
-    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-    vec_r = _mm_mul_ps(vec_r, mat0);
-    vec_g = _mm_mul_ps(vec_g, mat1);
-    vec_b = _mm_mul_ps(vec_b, mat2);
-
-    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-    vec_r  = _mm_max_ps(min, vec_r);
-    vec_r  = _mm_min_ps(max, vec_r);
-    result = _mm_mul_ps(vec_r, scale);
-
-    *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
-    result = _mm_movehl_ps(result, result);
-    *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
-
-    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
-
-    _mm_empty();
-}
-
-void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
-                                           unsigned char *src,
-                                           unsigned char *dest,
-                                           size_t length)
-{
-    unsigned int i;
-    float (*mat)[4] = transform->matrix;
-    char input_back[32];
-    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
-     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-     * because they don't work on stack variables. gcc 4.4 does do the right thing
-     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-    /* share input and output locations to save having to keep the
-     * locations in separate registers */
-    uint32_t const * output = (uint32_t*)input;
-
-    /* deref *transform now to avoid it in loop */
-    const float *igtbl_r = transform->input_gamma_table_r;
-    const float *igtbl_g = transform->input_gamma_table_g;
-    const float *igtbl_b = transform->input_gamma_table_b;
-
-    /* deref *transform now to avoid it in loop */
-    const uint8_t *otdata_r = &transform->output_table_r->data[0];
-    const uint8_t *otdata_g = &transform->output_table_g->data[0];
-    const uint8_t *otdata_b = &transform->output_table_b->data[0];
-
-    /* input matrix values never change */
-    const __m128 mat0  = _mm_load_ps(mat[0]);
-    const __m128 mat1  = _mm_load_ps(mat[1]);
-    const __m128 mat2  = _mm_load_ps(mat[2]);
-
-    /* these values don't change, either */
-    const __m128 max   = _mm_load_ps(clampMaxValueX4);
-    const __m128 min   = _mm_setzero_ps();
-    const __m128 scale = _mm_load_ps(floatScaleX4);
+    const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 
     /* working variables */
     __m128 vec_r, vec_g, vec_b, result;
     unsigned char alpha;
 
     /* CYA */
     if (!length)
         return;
 
     /* one pixel is handled outside of the loop */
     length--;
 
     /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    alpha = src[3];
-    src += 4;
+    vec_r = _mm_load_ss(&igtbl_r[src[kRIndex]]);
+    vec_g = _mm_load_ss(&igtbl_g[src[kGIndex]]);
+    vec_b = _mm_load_ss(&igtbl_b[src[kBIndex]]);
+    if (kAIndex != NO_A_INDEX) {
+        alpha = src[kAIndex];
+    }
+    src += components;
 
     /* transform all but final pixel */
 
     for (i=0; i<length; i++)
     {
         /* position values from gamma tables */
         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
 
         /* gamma * matrix */
         vec_r = _mm_mul_ps(vec_r, mat0);
         vec_g = _mm_mul_ps(vec_g, mat1);
         vec_b = _mm_mul_ps(vec_b, mat2);
 
         /* store alpha for this pixel; load alpha for next */
-        dest[OUTPUT_A_INDEX] = alpha;
-        alpha   = src[3];
+        if (kAIndex != NO_A_INDEX) {
+            dest[kAIndex] = alpha;
+            alpha = src[kAIndex];
+        }
 
         /* crunch, crunch, crunch */
         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
         vec_r  = _mm_max_ps(min, vec_r);
         vec_r  = _mm_min_ps(max, vec_r);
         result = _mm_mul_ps(vec_r, scale);
 
         /* store calc'd output tables indices */
         *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
         result = _mm_movehl_ps(result, result);
         *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
 
         /* load gamma values for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 4;
+        vec_r = _mm_load_ss(&igtbl_r[src[kRIndex]]);
+        vec_g = _mm_load_ss(&igtbl_g[src[kGIndex]]);
+        vec_b = _mm_load_ss(&igtbl_b[src[kBIndex]]);
+        src += components;
 
         /* use calc'd indices to output RGB values */
-        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
-        dest += 4;
+        dest[kRIndex] = otdata_r[output[0]];
+        dest[kGIndex] = otdata_g[output[1]];
+        dest[kBIndex] = otdata_b[output[2]];
+        dest += components;
     }
 
     /* handle final (maybe only) pixel */
 
     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
 
     vec_r = _mm_mul_ps(vec_r, mat0);
     vec_g = _mm_mul_ps(vec_g, mat1);
     vec_b = _mm_mul_ps(vec_b, mat2);
 
-    dest[OUTPUT_A_INDEX] = alpha;
+    if (kAIndex != NO_A_INDEX) {
+        dest[kAIndex] = alpha;
+    }
 
     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     vec_r  = _mm_max_ps(min, vec_r);
     vec_r  = _mm_min_ps(max, vec_r);
     result = _mm_mul_ps(vec_r, scale);
 
     *((__m64 *)&output[0]) = _mm_cvtps_pi32(result);
     result = _mm_movehl_ps(result, result);
     *((__m64 *)&output[2]) = _mm_cvtps_pi32(result);
 
-    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
+    dest[kRIndex] = otdata_r[output[0]];
+    dest[kGIndex] = otdata_g[output[1]];
+    dest[kBIndex] = otdata_b[output[2]];
 
     _mm_empty();
 }
+
+void qcms_transform_data_rgb_out_lut_sse1(const qcms_transform *transform,
+                                          const unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length)
+{
+    qcms_transform_data_template_lut_sse1<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
+}
+
+void qcms_transform_data_rgba_out_lut_sse1(const qcms_transform *transform,
+                                           const unsigned char *src,
+                                           unsigned char *dest,
+                                           size_t length)
+{
+    qcms_transform_data_template_lut_sse1<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
+}
--- a/gfx/qcms/transform-sse2.cpp
+++ b/gfx/qcms/transform-sse2.cpp
@@ -5,23 +5,24 @@
 /* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
 #define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
 #define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
 static const ALIGN float floatScaleX4[4] =
     { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
 static const ALIGN float clampMaxValueX4[4] =
     { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
 
-void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
-                                          unsigned char *src,
-                                          unsigned char *dest,
-                                          size_t length)
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_template_lut_sse2(const qcms_transform *transform,
+                                                  const unsigned char *src,
+                                                  unsigned char *dest,
+                                                  size_t length)
 {
     unsigned int i;
-    float (*mat)[4] = transform->matrix;
+    const float (*mat)[4] = transform->matrix;
     char input_back[32];
     /* Ensure we have a buffer that's 16 byte aligned regardless of the original
      * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
      * because they don't work on stack variables. gcc 4.4 does do the right thing
      * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
     float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
     /* share input and output locations to save having to keep the
      * locations in separate registers */
@@ -41,203 +42,113 @@ void qcms_transform_data_rgb_out_lut_sse
     const __m128 mat0  = _mm_load_ps(mat[0]);
     const __m128 mat1  = _mm_load_ps(mat[1]);
     const __m128 mat2  = _mm_load_ps(mat[2]);
 
     /* these values don't change, either */
     const __m128 max   = _mm_load_ps(clampMaxValueX4);
     const __m128 min   = _mm_setzero_ps();
     const __m128 scale = _mm_load_ps(floatScaleX4);
-
-    /* working variables */
-    __m128 vec_r, vec_g, vec_b, result;
-
-    /* CYA */
-    if (!length)
-        return;
-
-    /* one pixel is handled outside of the loop */
-    length--;
-
-    /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    src += 3;
-
-    /* transform all but final pixel */
-
-    for (i=0; i<length; i++)
-    {
-        /* position values from gamma tables */
-        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-        /* gamma * matrix */
-        vec_r = _mm_mul_ps(vec_r, mat0);
-        vec_g = _mm_mul_ps(vec_g, mat1);
-        vec_b = _mm_mul_ps(vec_b, mat2);
-
-        /* crunch, crunch, crunch */
-        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-        vec_r  = _mm_max_ps(min, vec_r);
-        vec_r  = _mm_min_ps(max, vec_r);
-        result = _mm_mul_ps(vec_r, scale);
-
-        /* store calc'd output tables indices */
-        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
-
-        /* load for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 3;
-
-        /* use calc'd indices to output RGB values */
-        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
-        dest += RGB_OUTPUT_COMPONENTS;
-    }
-
-    /* handle final (maybe only) pixel */
-
-    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
-    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
-    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
-
-    vec_r = _mm_mul_ps(vec_r, mat0);
-    vec_g = _mm_mul_ps(vec_g, mat1);
-    vec_b = _mm_mul_ps(vec_b, mat2);
-
-    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
-    vec_r  = _mm_max_ps(min, vec_r);
-    vec_r  = _mm_min_ps(max, vec_r);
-    result = _mm_mul_ps(vec_r, scale);
-
-    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
-
-    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
-}
-
-void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
-                                           unsigned char *src,
-                                           unsigned char *dest,
-                                           size_t length)
-{
-    unsigned int i;
-    float (*mat)[4] = transform->matrix;
-    char input_back[32];
-    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
-     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
-     * because they don't work on stack variables. gcc 4.4 does do the right thing
-     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
-    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
-    /* share input and output locations to save having to keep the
-     * locations in separate registers */
-    uint32_t const * output = (uint32_t*)input;
-
-    /* deref *transform now to avoid it in loop */
-    const float *igtbl_r = transform->input_gamma_table_r;
-    const float *igtbl_g = transform->input_gamma_table_g;
-    const float *igtbl_b = transform->input_gamma_table_b;
-
-    /* deref *transform now to avoid it in loop */
-    const uint8_t *otdata_r = &transform->output_table_r->data[0];
-    const uint8_t *otdata_g = &transform->output_table_g->data[0];
-    const uint8_t *otdata_b = &transform->output_table_b->data[0];
-
-    /* input matrix values never change */
-    const __m128 mat0  = _mm_load_ps(mat[0]);
-    const __m128 mat1  = _mm_load_ps(mat[1]);
-    const __m128 mat2  = _mm_load_ps(mat[2]);
-
-    /* these values don't change, either */
-    const __m128 max   = _mm_load_ps(clampMaxValueX4);
-    const __m128 min   = _mm_setzero_ps();
-    const __m128 scale = _mm_load_ps(floatScaleX4);
+    const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 
     /* working variables */
     __m128 vec_r, vec_g, vec_b, result;
     unsigned char alpha;
 
     /* CYA */
     if (!length)
         return;
 
     /* one pixel is handled outside of the loop */
     length--;
 
     /* setup for transforming 1st pixel */
-    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-    alpha = src[3];
-    src += 4;
+    vec_r = _mm_load_ss(&igtbl_r[src[kRIndex]]);
+    vec_g = _mm_load_ss(&igtbl_g[src[kGIndex]]);
+    vec_b = _mm_load_ss(&igtbl_b[src[kBIndex]]);
+    if (kAIndex != NO_A_INDEX) {
+        alpha = src[kAIndex];
+    }
+    src += components;
 
     /* transform all but final pixel */
 
     for (i=0; i<length; i++)
     {
         /* position values from gamma tables */
         vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
         vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
         vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
 
         /* gamma * matrix */
         vec_r = _mm_mul_ps(vec_r, mat0);
         vec_g = _mm_mul_ps(vec_g, mat1);
         vec_b = _mm_mul_ps(vec_b, mat2);
 
         /* store alpha for this pixel; load alpha for next */
-        dest[OUTPUT_A_INDEX] = alpha;
-        alpha   = src[3];
+        if (kAIndex != NO_A_INDEX) {
+            dest[kAIndex] = alpha;
+            alpha = src[kAIndex];
+        }
 
         /* crunch, crunch, crunch */
         vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
         vec_r  = _mm_max_ps(min, vec_r);
         vec_r  = _mm_min_ps(max, vec_r);
         result = _mm_mul_ps(vec_r, scale);
 
         /* store calc'd output tables indices */
         _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
 
         /* load gamma values for next loop while store completes */
-        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
-        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
-        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
-        src += 4;
+        vec_r = _mm_load_ss(&igtbl_r[src[kRIndex]]);
+        vec_g = _mm_load_ss(&igtbl_g[src[kGIndex]]);
+        vec_b = _mm_load_ss(&igtbl_b[src[kBIndex]]);
+        src += components;
 
         /* use calc'd indices to output RGB values */
-        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
-        dest += RGBA_OUTPUT_COMPONENTS;
+        dest[kRIndex] = otdata_r[output[0]];
+        dest[kGIndex] = otdata_g[output[1]];
+        dest[kBIndex] = otdata_b[output[2]];
+        dest += components;
     }
 
     /* handle final (maybe only) pixel */
 
     vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
     vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
     vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
 
     vec_r = _mm_mul_ps(vec_r, mat0);
     vec_g = _mm_mul_ps(vec_g, mat1);
     vec_b = _mm_mul_ps(vec_b, mat2);
 
-    dest[OUTPUT_A_INDEX] = alpha;
+    if (kAIndex != NO_A_INDEX) {
+        dest[kAIndex] = alpha;
+    }
 
     vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
     vec_r  = _mm_max_ps(min, vec_r);
     vec_r  = _mm_min_ps(max, vec_r);
     result = _mm_mul_ps(vec_r, scale);
 
     _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
 
-    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
-    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
-    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
+    dest[kRIndex] = otdata_r[output[0]];
+    dest[kGIndex] = otdata_g[output[1]];
+    dest[kBIndex] = otdata_b[output[2]];
 }
 
+void qcms_transform_data_rgb_out_lut_sse2(const qcms_transform *transform,
+                                          const unsigned char *src,
+                                          unsigned char *dest,
+                                          size_t length)
+{
+  qcms_transform_data_template_lut_sse2<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
+}
 
+void qcms_transform_data_rgba_out_lut_sse2(const qcms_transform *transform,
+                                           const unsigned char *src,
+                                           unsigned char *dest,
+                                           size_t length)
+{
+  qcms_transform_data_template_lut_sse2<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
+}
--- a/gfx/qcms/transform.cpp
+++ b/gfx/qcms/transform.cpp
@@ -320,20 +320,20 @@ bool get_rgb_colorants(struct matrix *co
 {
 	*colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
 	*colorants = adapt_matrix_to_D50(*colorants, white_point);
 
 	return (colorants->invalid ? true : false);
 }
 
 #if 0
-static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+static void qcms_transform_data_rgb_out_pow(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
 	int i;
-	float (*mat)[4] = transform->matrix;
+	const float (*mat)[4] = transform->matrix;
 	for (i=0; i<length; i++) {
 		unsigned char device_r = *src++;
 		unsigned char device_g = *src++;
 		unsigned char device_b = *src++;
 
 		float linear_r = transform->input_gamma_table_r[device_r];
 		float linear_g = transform->input_gamma_table_g[device_g];
 		float linear_b = transform->input_gamma_table_b[device_b];
@@ -349,113 +349,114 @@ static void qcms_transform_data_rgb_out_
 		dest[OUTPUT_R_INDEX] = clamp_u8(255*out_device_r);
 		dest[OUTPUT_G_INDEX] = clamp_u8(255*out_device_g);
 		dest[OUTPUT_B_INDEX] = clamp_u8(255*out_device_b);
 		dest += RGB_OUTPUT_COMPONENTS;
 	}
 }
 #endif
 
-static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
-{
-	unsigned int i;
-	for (i = 0; i < length; i++) {
-		float out_device_r, out_device_g, out_device_b;
-		unsigned char device = *src++;
-
-		float linear = transform->input_gamma_table_gray[device];
-
-                out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
-		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
-		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
-
-		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
-		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
-		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
-		dest += RGB_OUTPUT_COMPONENTS;
-	}
-}
-
 /* Alpha is not corrected.
    A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
    RGB Is?" Tech Memo 17 (December 14, 1998).
 	See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
 */
 
-static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_gray_template_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
+	const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 	unsigned int i;
 	for (i = 0; i < length; i++) {
 		float out_device_r, out_device_g, out_device_b;
 		unsigned char device = *src++;
-		unsigned char alpha = *src++;
+		unsigned char alpha;
+		if (kAIndex != NO_A_INDEX) {
+			alpha = *src++;
+		}
 
 		float linear = transform->input_gamma_table_gray[device];
 
                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 
-		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
-		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
-		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
-		dest[OUTPUT_A_INDEX] = alpha;
-		dest += RGBA_OUTPUT_COMPONENTS;
+		dest[kRIndex] = clamp_u8(out_device_r*255);
+		dest[kGIndex] = clamp_u8(out_device_g*255);
+		dest[kBIndex] = clamp_u8(out_device_b*255);
+		if (kAIndex != NO_A_INDEX) {
+			dest[kAIndex] = alpha;
+		}
+		dest += components;
 	}
 }
 
+static void qcms_transform_data_gray_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
+{
+	qcms_transform_data_gray_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
+}
 
-static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+static void qcms_transform_data_graya_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
+	qcms_transform_data_gray_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
+}
+
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_gray_template_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
+{
+	const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 	unsigned int i;
 	for (i = 0; i < length; i++) {
 		unsigned char device = *src++;
+		unsigned char alpha;
+		if (kAIndex != NO_A_INDEX) {
+		       alpha = *src++;
+		}
 		uint16_t gray;
 
 		float linear = transform->input_gamma_table_gray[device];
 
 		/* we could round here... */
 		gray = linear * PRECACHE_OUTPUT_MAX;
 
-		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
-		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
-		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
-		dest += RGB_OUTPUT_COMPONENTS;
+		dest[kRIndex] = transform->output_table_r->data[gray];
+		dest[kGIndex] = transform->output_table_g->data[gray];
+		dest[kBIndex] = transform->output_table_b->data[gray];
+		if (kAIndex != NO_A_INDEX) {
+			dest[kAIndex] = alpha;
+		}
+		dest += components;
 	}
 }
 
-static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+static void qcms_transform_data_gray_out_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
-	unsigned int i;
-	for (i = 0; i < length; i++) {
-		unsigned char device = *src++;
-		unsigned char alpha = *src++;
-		uint16_t gray;
-
-		float linear = transform->input_gamma_table_gray[device];
+	qcms_transform_data_gray_template_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
+}
 
-		/* we could round here... */
-		gray = linear * PRECACHE_OUTPUT_MAX;
-
-		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
-		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
-		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
-		dest[OUTPUT_A_INDEX] = alpha;
-		dest += RGBA_OUTPUT_COMPONENTS;
-	}
+static void qcms_transform_data_graya_out_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
+{
+	qcms_transform_data_gray_template_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 }
 
-static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_template_lut_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
+	const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 	unsigned int i;
-	float (*mat)[4] = transform->matrix;
+	const float (*mat)[4] = transform->matrix;
 	for (i = 0; i < length; i++) {
-		unsigned char device_r = *src++;
-		unsigned char device_g = *src++;
-		unsigned char device_b = *src++;
+		unsigned char device_r = src[kRIndex];
+		unsigned char device_g = src[kGIndex];
+		unsigned char device_b = src[kBIndex];
+		unsigned char alpha;
+		if (kAIndex != NO_A_INDEX) {
+			alpha = src[kAIndex];
+		}
+		src += components;
 		uint16_t r, g, b;
 
 		float linear_r = transform->input_gamma_table_r[device_r];
 		float linear_g = transform->input_gamma_table_g[device_g];
 		float linear_b = transform->input_gamma_table_b[device_b];
 
 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
@@ -465,69 +466,46 @@ static void qcms_transform_data_rgb_out_
 		out_linear_g = clamp_float(out_linear_g);
 		out_linear_b = clamp_float(out_linear_b);
 
 		/* we could round here... */
 		r = out_linear_r * PRECACHE_OUTPUT_MAX;
 		g = out_linear_g * PRECACHE_OUTPUT_MAX;
 		b = out_linear_b * PRECACHE_OUTPUT_MAX;
 
-		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
-		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
-		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
-		dest += RGB_OUTPUT_COMPONENTS;
+		dest[kRIndex] = transform->output_table_r->data[r];
+		dest[kGIndex] = transform->output_table_g->data[g];
+		dest[kBIndex] = transform->output_table_b->data[b];
+		if (kAIndex != NO_A_INDEX) {
+			dest[kAIndex] = alpha;
+		}
+		dest += components;
 	}
 }
 
-static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+static void qcms_transform_data_rgb_out_lut_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
-	unsigned int i;
-	float (*mat)[4] = transform->matrix;
-	for (i = 0; i < length; i++) {
-		unsigned char device_r = *src++;
-		unsigned char device_g = *src++;
-		unsigned char device_b = *src++;
-		unsigned char alpha = *src++;
-		uint16_t r, g, b;
-
-		float linear_r = transform->input_gamma_table_r[device_r];
-		float linear_g = transform->input_gamma_table_g[device_g];
-		float linear_b = transform->input_gamma_table_b[device_b];
+	qcms_transform_data_template_lut_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
+}
 
-		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
-		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
-		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
-
-		out_linear_r = clamp_float(out_linear_r);
-		out_linear_g = clamp_float(out_linear_g);
-		out_linear_b = clamp_float(out_linear_b);
-
-		/* we could round here... */
-		r = out_linear_r * PRECACHE_OUTPUT_MAX;
-		g = out_linear_g * PRECACHE_OUTPUT_MAX;
-		b = out_linear_b * PRECACHE_OUTPUT_MAX;
-
-		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
-		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
-		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
-		dest[OUTPUT_A_INDEX] = alpha;
-		dest += RGBA_OUTPUT_COMPONENTS;
-	}
+static void qcms_transform_data_rgba_out_lut_precache(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
+{
+	qcms_transform_data_template_lut_precache<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 }
 
 // Not used
 /* 
-static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
+static void qcms_transform_data_clut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
 	unsigned int i;
 	int xy_len = 1;
 	int x_len = transform->grid_size;
 	int len = x_len * x_len;
-	float* r_table = transform->r_clut;
-	float* g_table = transform->g_clut;
-	float* b_table = transform->b_clut;
+	const float* r_table = transform->r_clut;
+	const float* g_table = transform->g_clut;
+	const float* b_table = transform->b_clut;
   
 	for (i = 0; i < length; i++) {
 		unsigned char in_r = *src++;
 		unsigned char in_g = *src++;
 		unsigned char in_b = *src++;
 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 
 		int x = floorf(linear_r * (transform->grid_size-1));
@@ -571,33 +549,39 @@ static void qcms_transform_data_clut(qcm
 }
 */
 
 static int int_div_ceil(int value, int div) {
 	return ((value  + div - 1) / div);
 }
 
 // Using lcms' tetra interpolation algorithm.
-static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_tetra_clut_template(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
+	const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 	unsigned int i;
 	int xy_len = 1;
 	int x_len = transform->grid_size;
 	int len = x_len * x_len;
 	float* r_table = transform->r_clut;
 	float* g_table = transform->g_clut;
 	float* b_table = transform->b_clut;
 	float c0_r, c1_r, c2_r, c3_r;
 	float c0_g, c1_g, c2_g, c3_g;
 	float c0_b, c1_b, c2_b, c3_b;
 	float clut_r, clut_g, clut_b;
 	for (i = 0; i < length; i++) {
-		unsigned char in_r = *src++;
-		unsigned char in_g = *src++;
-		unsigned char in_b = *src++;
-		unsigned char in_a = *src++;
+		unsigned char in_r = src[kRIndex];
+		unsigned char in_g = src[kGIndex];
+		unsigned char in_b = src[kBIndex];
+		unsigned char in_a;
+		if (kAIndex != NO_A_INDEX) {
+			in_a = src[kAIndex];
+		}
+		src += components;
 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
 
 		int x = in_r * (transform->grid_size-1) / 255;
 		int y = in_g * (transform->grid_size-1) / 255;
 		int z = in_b * (transform->grid_size-1) / 255;
 		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
 		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
 		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
@@ -678,146 +662,49 @@ static void qcms_transform_data_tetra_cl
 				}
 			}
 		}
 				
 		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
 		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
 		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
 
-		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
-		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
-		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
-		dest[OUTPUT_A_INDEX] = in_a;
-		dest += RGBA_OUTPUT_COMPONENTS;
+		dest[kRIndex] = clamp_u8(clut_r*255.0f);
+		dest[kGIndex] = clamp_u8(clut_g*255.0f);
+		dest[kBIndex] = clamp_u8(clut_b*255.0f);
+		if (kAIndex != NO_A_INDEX) {
+			dest[kAIndex] = in_a;
+		}
+		dest += components;
 	}	
 }
 
-// Using lcms' tetra interpolation code.
-static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
-	unsigned int i;
-	int xy_len = 1;
-	int x_len = transform->grid_size;
-	int len = x_len * x_len;
-	float* r_table = transform->r_clut;
-	float* g_table = transform->g_clut;
-	float* b_table = transform->b_clut;
-	float c0_r, c1_r, c2_r, c3_r;
-	float c0_g, c1_g, c2_g, c3_g;
-	float c0_b, c1_b, c2_b, c3_b;
-	float clut_r, clut_g, clut_b;
-	for (i = 0; i < length; i++) {
-		unsigned char in_r = *src++;
-		unsigned char in_g = *src++;
-		unsigned char in_b = *src++;
-		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
-
-		int x = in_r * (transform->grid_size-1) / 255;
-		int y = in_g * (transform->grid_size-1) / 255;
-		int z = in_b * (transform->grid_size-1) / 255;
-		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
-		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
-		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
-		float rx = linear_r * (transform->grid_size-1) - x;
-		float ry = linear_g * (transform->grid_size-1) - y;
-		float rz = linear_b * (transform->grid_size-1) - z;
-
-		c0_r = CLU(r_table, x, y, z);
-		c0_g = CLU(g_table, x, y, z);
-		c0_b = CLU(b_table, x, y, z);
+static void qcms_transform_data_tetra_clut_rgba(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
+	qcms_transform_data_tetra_clut_template<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
+}
 
-		if( rx >= ry ) {
-			if (ry >= rz) { //rx >= ry && ry >= rz
-				c1_r = CLU(r_table, x_n, y, z) - c0_r;
-				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
-				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
-				c1_g = CLU(g_table, x_n, y, z) - c0_g;
-				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
-				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
-				c1_b = CLU(b_table, x_n, y, z) - c0_b;
-				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
-				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
-			} else { 
-				if (rx >= rz) { //rx >= rz && rz >= ry
-					c1_r = CLU(r_table, x_n, y, z) - c0_r;
-					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
-					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
-					c1_g = CLU(g_table, x_n, y, z) - c0_g;
-					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
-					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
-					c1_b = CLU(b_table, x_n, y, z) - c0_b;
-					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
-					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
-				} else { //rz > rx && rx >= ry
-					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
-					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
-					c3_r = CLU(r_table, x, y, z_n) - c0_r;
-					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
-					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
-					c3_g = CLU(g_table, x, y, z_n) - c0_g;
-					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
-					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
-					c3_b = CLU(b_table, x, y, z_n) - c0_b;
-				}
-			}
-		} else {
-			if (rx >= rz) { //ry > rx && rx >= rz
-				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
-				c2_r = CLU(r_table, x, y_n, z) - c0_r;
-				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
-				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
-				c2_g = CLU(g_table, x, y_n, z) - c0_g;
-				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
-				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
-				c2_b = CLU(b_table, x, y_n, z) - c0_b;
-				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
-			} else {
-				if (ry >= rz) { //ry >= rz && rz > rx 
-					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
-					c2_r = CLU(r_table, x, y_n, z) - c0_r;
-					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
-					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
-					c2_g = CLU(g_table, x, y_n, z) - c0_g;
-					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
-					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
-					c2_b = CLU(b_table, x, y_n, z) - c0_b;
-					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
-				} else { //rz > ry && ry > rx
-					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
-					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
-					c3_r = CLU(r_table, x, y, z_n) - c0_r;
-					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
-					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
-					c3_g = CLU(g_table, x, y, z_n) - c0_g;
-					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
-					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
-					c3_b = CLU(b_table, x, y, z_n) - c0_b;
-				}
-			}
-		}
-				
-		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
-		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
-		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
-
-		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
-		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
-		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
-		dest += RGB_OUTPUT_COMPONENTS;
-	}	
+static void qcms_transform_data_tetra_clut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length) {
+	qcms_transform_data_tetra_clut_template<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
 }
 
-static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+template <size_t kRIndex, size_t kGIndex, size_t kBIndex, size_t kAIndex = NO_A_INDEX>
+static void qcms_transform_data_template_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
+	const unsigned int components = A_INDEX_COMPONENTS(kAIndex);
 	unsigned int i;
-	float (*mat)[4] = transform->matrix;
+	const float (*mat)[4] = transform->matrix;
 	for (i = 0; i < length; i++) {
-		unsigned char device_r = *src++;
-		unsigned char device_g = *src++;
-		unsigned char device_b = *src++;
+		unsigned char device_r = src[kRIndex];
+		unsigned char device_g = src[kGIndex];
+		unsigned char device_b = src[kBIndex];
+		unsigned char alpha;
+		if (kAIndex != NO_A_INDEX) {
+			alpha = src[kAIndex];
+		}
+		src += components;
 		float out_device_r, out_device_g, out_device_b;
 
 		float linear_r = transform->input_gamma_table_r[device_r];
 		float linear_g = transform->input_gamma_table_g[device_g];
 		float linear_b = transform->input_gamma_table_b[device_b];
 
 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
@@ -829,66 +716,41 @@ static void qcms_transform_data_rgb_out_
 
 		out_device_r = lut_interp_linear(out_linear_r, 
 				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
 		out_device_g = lut_interp_linear(out_linear_g, 
 				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
 		out_device_b = lut_interp_linear(out_linear_b, 
 				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 
-		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
-		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
-		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
-		dest += RGB_OUTPUT_COMPONENTS;
+		dest[kRIndex] = clamp_u8(out_device_r*255);
+		dest[kGIndex] = clamp_u8(out_device_g*255);
+		dest[kBIndex] = clamp_u8(out_device_b*255);
+		if (kAIndex != NO_A_INDEX) {
+			dest[kAIndex] = alpha;
+		}
+		dest += components;
 	}
 }
 
-static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+static void qcms_transform_data_rgb_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
-	unsigned int i;
-	float (*mat)[4] = transform->matrix;
-	for (i = 0; i < length; i++) {
-		unsigned char device_r = *src++;
-		unsigned char device_g = *src++;
-		unsigned char device_b = *src++;
-		unsigned char alpha = *src++;
-		float out_device_r, out_device_g, out_device_b;
-
-		float linear_r = transform->input_gamma_table_r[device_r];
-		float linear_g = transform->input_gamma_table_g[device_g];
-		float linear_b = transform->input_gamma_table_b[device_b];
+	qcms_transform_data_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX>(transform, src, dest, length);
+}
 
-		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
-		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
-		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
-
-		out_linear_r = clamp_float(out_linear_r);
-		out_linear_g = clamp_float(out_linear_g);
-		out_linear_b = clamp_float(out_linear_b);
-
-		out_device_r = lut_interp_linear(out_linear_r, 
-				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
-		out_device_g = lut_interp_linear(out_linear_g, 
-				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
-		out_device_b = lut_interp_linear(out_linear_b, 
-				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
-
-		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
-		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
-		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
-		dest[OUTPUT_A_INDEX] = alpha;
-		dest += RGBA_OUTPUT_COMPONENTS;
-	}
+static void qcms_transform_data_rgba_out_lut(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
+{
+	qcms_transform_data_template_lut<RGBA_R_INDEX, RGBA_G_INDEX, RGBA_B_INDEX, RGBA_A_INDEX>(transform, src, dest, length);
 }
 
 #if 0
-static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
+static void qcms_transform_data_rgb_out_linear(const qcms_transform *transform, const unsigned char *src, unsigned char *dest, size_t length)
 {
 	int i;
-	float (*mat)[4] = transform->matrix;
+	const float (*mat)[4] = transform->matrix;
 	for (i = 0; i < length; i++) {
 		unsigned char device_r = *src++;
 		unsigned char device_g = *src++;
 		unsigned char device_b = *src++;
 
 		float linear_r = transform->input_gamma_table_r[device_r];
 		float linear_g = transform->input_gamma_table_g[device_g];
 		float linear_b = transform->input_gamma_table_b[device_b];
@@ -1408,18 +1270,18 @@ qcms_transform* qcms_transform_create(
 	}
 	return transform;
 }
 
 #if defined(__GNUC__) && defined(__i386__)
 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
 __attribute__((__force_align_arg_pointer__))
 #endif
-void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
+void qcms_transform_data(qcms_transform *transform, const void *src, void *dest, size_t length)
 {
-	transform->transform_fn(transform, (unsigned char*)src, (unsigned char*)dest, length);
+	transform->transform_fn(transform, (const unsigned char*)src, (unsigned char*)dest, length);
 }
 
 bool qcms_supports_iccv4;
 void qcms_enable_iccv4()
 {
 	qcms_supports_iccv4 = true;
 }