Bug 674230 - Merge from qcms github branch v4; r=jmuizelaar
authorBenoit Girard <b56girard@gmail.com>
Wed, 27 Jul 2011 10:11:19 -0400
changeset 73442 77305d8301519cde4f1b4f3a861d9cff41017e3e
parent 73441 467e3f707ca1cfea92f606aac44dd0fe9c34642b
child 73443 86ff239e3948b71dc041981a2dfd5a97c05d12e0
push id20871
push usereakhgari@mozilla.com
push dateThu, 28 Jul 2011 14:37:48 +0000
treeherdermozilla-central@fe48bbfeff94 [default view] [failures only]
perfherder[talos] [build metrics] [platform microbench] (compared to previous push)
reviewersjmuizelaar
bugs674230
milestone8.0a1
first release with
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
last release without
nightly linux32
nightly linux64
nightly mac
nightly win32
nightly win64
Bug 674230 - Merge from qcms github branch v4; r=jmuizelaar
gfx/qcms/Makefile.in
gfx/qcms/chain.c
gfx/qcms/chain.h
gfx/qcms/iccread.c
gfx/qcms/matrix.c
gfx/qcms/matrix.h
gfx/qcms/qcms.h
gfx/qcms/qcmsint.h
gfx/qcms/transform.c
gfx/qcms/transform_util.c
gfx/qcms/transform_util.h
--- a/gfx/qcms/Makefile.in
+++ b/gfx/qcms/Makefile.in
@@ -8,17 +8,23 @@ include $(DEPTH)/config/autoconf.mk
 MODULE       = qcms
 LIBRARY_NAME = mozqcms
 LIBXUL_LIBRARY = 1
 GRE_MODULE      = 1
 DIST_INSTALL = 1
 
 EXPORTS      = qcms.h qcmstypes.h
 
-CSRCS = iccread.c transform.c
+CSRCS = \
+  chain.c \
+  iccread.c \
+  matrix.c \
+  transform.c \
+  transform_util.c \
+  $(NULL)
 
 ifeq (86,$(findstring 86,$(OS_TEST)))
 CSRCS += transform-sse2.c
 ifdef _MSC_VER
 ifneq ($(OS_ARCH)_$(OS_TEST),WINNT_x86_64)
 	CSRCS += transform-sse1.c
 endif
 else
new file mode 100644
--- /dev/null
+++ b/gfx/qcms/chain.c
@@ -0,0 +1,989 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
+//  qcms
+//  Copyright (C) 2009 Mozilla Corporation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining 
+// a copy of this software and associated documentation files (the "Software"), 
+// to deal in the Software without restriction, including without limitation 
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in 
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+#include <string.h> //memcpy
+#include "qcmsint.h"
+#include "transform_util.h"
+#include "matrix.h"
+
+static struct matrix build_lut_matrix(struct lutType *lut)
+{
+	struct matrix result;
+	if (lut) {
+		result.m[0][0] = s15Fixed16Number_to_float(lut->e00);
+		result.m[0][1] = s15Fixed16Number_to_float(lut->e01);
+		result.m[0][2] = s15Fixed16Number_to_float(lut->e02);
+		result.m[1][0] = s15Fixed16Number_to_float(lut->e10);
+		result.m[1][1] = s15Fixed16Number_to_float(lut->e11);
+		result.m[1][2] = s15Fixed16Number_to_float(lut->e12);
+		result.m[2][0] = s15Fixed16Number_to_float(lut->e20);
+		result.m[2][1] = s15Fixed16Number_to_float(lut->e21);
+		result.m[2][2] = s15Fixed16Number_to_float(lut->e22);
+		result.invalid = false;
+	} else {
+		memset(&result, 0, sizeof(struct matrix));
+		result.invalid = true;
+	}
+	return result;
+}
+
+static struct matrix build_mAB_matrix(struct lutmABType *lut)
+{
+	struct matrix result;
+	if (lut) {
+		result.m[0][0] = s15Fixed16Number_to_float(lut->e00);
+		result.m[0][1] = s15Fixed16Number_to_float(lut->e01);
+		result.m[0][2] = s15Fixed16Number_to_float(lut->e02);
+		result.m[1][0] = s15Fixed16Number_to_float(lut->e10);
+		result.m[1][1] = s15Fixed16Number_to_float(lut->e11);
+		result.m[1][2] = s15Fixed16Number_to_float(lut->e12);
+		result.m[2][0] = s15Fixed16Number_to_float(lut->e20);
+		result.m[2][1] = s15Fixed16Number_to_float(lut->e21);
+		result.m[2][2] = s15Fixed16Number_to_float(lut->e22);
+		result.invalid = false;
+	} else {
+		memset(&result, 0, sizeof(struct matrix));
+		result.invalid = true;
+	}
+	return result;
+}
+
+//Based on lcms cmsLab2XYZ
+#define f(t) (t <= (24.0f/116.0f)*(24.0f/116.0f)*(24.0f/116.0f)) ? ((841.0/108.0) * t + (16.0/116.0)) : pow(t,1.0/3.0)
+#define f_1(t) (t <= (24.0f/116.0f)) ? ((108.0/841.0) * (t - (16.0/116.0))) : (t * t * t)
+static void qcms_transform_module_LAB_to_XYZ(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	// lcms: D50 XYZ values
+	float WhitePointX = 0.9642f;
+	float WhitePointY = 1.0f;
+	float WhitePointZ = 0.8249f;
+	for (i = 0; i < length; i++) {
+		float device_L = *src++ * 100.0f;
+		float device_a = *src++ * 255.0f - 128.0f;
+		float device_b = *src++ * 255.0f - 128.0f;
+		float y = (device_L + 16.0f) / 116.0f;
+
+		float X = f_1((y + 0.002f * device_a)) * WhitePointX;
+		float Y = f_1(y) * WhitePointY;
+		float Z = f_1((y - 0.005f * device_b)) * WhitePointZ;
+		*dest++ = X / (1.0 + 32767.0/32768.0);
+		*dest++ = Y / (1.0 + 32767.0/32768.0);
+		*dest++ = Z / (1.0 + 32767.0/32768.0);
+	}
+}
+
+//Based on lcms cmsXYZ2Lab
+static void qcms_transform_module_XYZ_to_LAB(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+        // lcms: D50 XYZ values
+        float WhitePointX = 0.9642f;
+        float WhitePointY = 1.0f;
+        float WhitePointZ = 0.8249f;
+        for (i = 0; i < length; i++) {
+                float device_x = *src++ * (1.0 + 32767.0/32768.0) / WhitePointX;
+                float device_y = *src++ * (1.0 + 32767.0/32768.0) / WhitePointY;
+                float device_z = *src++ * (1.0 + 32767.0/32768.0) / WhitePointZ;
+
+		float fx = f(device_x);
+		float fy = f(device_y);
+		float fz = f(device_z);
+
+                float L = 116.0f*fy - 16.0f;
+                float a = 500.0f*(fx - fy);
+                float b = 200.0f*(fy - fz);
+                *dest++ = L / 100.0f;
+                *dest++ = (a+128.0f) / 255.0f;
+                *dest++ = (b+128.0f) / 255.0f;
+        }
+
+}
+
+static void qcms_transform_module_clut_only(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	int xy_len = 1;
+	int x_len = transform->grid_size;
+	int len = x_len * x_len;
+	float* r_table = transform->r_clut;
+	float* g_table = transform->g_clut;
+	float* b_table = transform->b_clut;
+
+	for (i = 0; i < length; i++) {
+		float linear_r = *src++;
+		float linear_g = *src++;
+		float linear_b = *src++;
+
+		int x = floor(linear_r * (transform->grid_size-1));
+		int y = floor(linear_g * (transform->grid_size-1));
+		int z = floor(linear_b * (transform->grid_size-1));
+		int x_n = ceil(linear_r * (transform->grid_size-1));
+		int y_n = ceil(linear_g * (transform->grid_size-1));
+		int z_n = ceil(linear_b * (transform->grid_size-1));
+		float x_d = linear_r * (transform->grid_size-1) - x;
+		float y_d = linear_g * (transform->grid_size-1) - y;
+		float z_d = linear_b * (transform->grid_size-1) - z;
+
+		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
+		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
+		float r_y1 = lerp(r_x1, r_x2, y_d);
+		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
+		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
+		float r_y2 = lerp(r_x3, r_x4, y_d);
+		float clut_r = lerp(r_y1, r_y2, z_d);
+
+		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
+		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
+		float g_y1 = lerp(g_x1, g_x2, y_d);
+		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
+		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
+		float g_y2 = lerp(g_x3, g_x4, y_d);
+		float clut_g = lerp(g_y1, g_y2, z_d);
+
+		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
+		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
+		float b_y1 = lerp(b_x1, b_x2, y_d);
+		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
+		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
+		float b_y2 = lerp(b_x3, b_x4, y_d);
+		float clut_b = lerp(b_y1, b_y2, z_d);
+
+		*dest++ = clamp_float(clut_r);
+		*dest++ = clamp_float(clut_g);
+		*dest++ = clamp_float(clut_b);
+	}
+}
+
+static void qcms_transform_module_clut(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	int xy_len = 1;
+	int x_len = transform->grid_size;
+	int len = x_len * x_len;
+	float* r_table = transform->r_clut;
+	float* g_table = transform->g_clut;
+	float* b_table = transform->b_clut;
+	for (i = 0; i < length; i++) {
+		float device_r = *src++;
+		float device_g = *src++;
+		float device_b = *src++;
+		float linear_r = lut_interp_linear_float(device_r,
+				transform->input_clut_table_r, transform->input_clut_table_length);
+		float linear_g = lut_interp_linear_float(device_g,
+				transform->input_clut_table_g, transform->input_clut_table_length);
+		float linear_b = lut_interp_linear_float(device_b,
+				transform->input_clut_table_b, transform->input_clut_table_length);
+
+		int x = floor(linear_r * (transform->grid_size-1));
+		int y = floor(linear_g * (transform->grid_size-1));
+		int z = floor(linear_b * (transform->grid_size-1));
+		int x_n = ceil(linear_r * (transform->grid_size-1));
+		int y_n = ceil(linear_g * (transform->grid_size-1));
+		int z_n = ceil(linear_b * (transform->grid_size-1));
+		float x_d = linear_r * (transform->grid_size-1) - x;
+		float y_d = linear_g * (transform->grid_size-1) - y;
+		float z_d = linear_b * (transform->grid_size-1) - z;
+
+		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
+		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
+		float r_y1 = lerp(r_x1, r_x2, y_d);
+		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
+		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
+		float r_y2 = lerp(r_x3, r_x4, y_d);
+		float clut_r = lerp(r_y1, r_y2, z_d);
+
+		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
+		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
+		float g_y1 = lerp(g_x1, g_x2, y_d);
+		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
+		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
+		float g_y2 = lerp(g_x3, g_x4, y_d);
+		float clut_g = lerp(g_y1, g_y2, z_d);
+
+		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
+		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
+		float b_y1 = lerp(b_x1, b_x2, y_d);
+		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
+		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
+		float b_y2 = lerp(b_x3, b_x4, y_d);
+		float clut_b = lerp(b_y1, b_y2, z_d);
+
+		float pcs_r = lut_interp_linear_float(clut_r,
+				transform->output_clut_table_r, transform->output_clut_table_length);
+		float pcs_g = lut_interp_linear_float(clut_g,
+				transform->output_clut_table_g, transform->output_clut_table_length);
+		float pcs_b = lut_interp_linear_float(clut_b,
+				transform->output_clut_table_b, transform->output_clut_table_length);
+
+		*dest++ = clamp_float(pcs_r);
+		*dest++ = clamp_float(pcs_g);
+		*dest++ = clamp_float(pcs_b);
+	}
+}
+
+/* NOT USED
+static void qcms_transform_module_tetra_clut(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	int xy_len = 1;
+	int x_len = transform->grid_size;
+	int len = x_len * x_len;
+	float* r_table = transform->r_clut;
+	float* g_table = transform->g_clut;
+	float* b_table = transform->b_clut;
+	float c0_r, c1_r, c2_r, c3_r;
+	float c0_g, c1_g, c2_g, c3_g;
+	float c0_b, c1_b, c2_b, c3_b;
+	float clut_r, clut_g, clut_b;
+	float pcs_r, pcs_g, pcs_b;
+	for (i = 0; i < length; i++) {
+		float device_r = *src++;
+		float device_g = *src++;
+		float device_b = *src++;
+		float linear_r = lut_interp_linear_float(device_r,
+				transform->input_clut_table_r, transform->input_clut_table_length);
+		float linear_g = lut_interp_linear_float(device_g,
+				transform->input_clut_table_g, transform->input_clut_table_length);
+		float linear_b = lut_interp_linear_float(device_b,
+				transform->input_clut_table_b, transform->input_clut_table_length);
+
+		int x = floor(linear_r * (transform->grid_size-1));
+		int y = floor(linear_g * (transform->grid_size-1));
+		int z = floor(linear_b * (transform->grid_size-1));
+		int x_n = ceil(linear_r * (transform->grid_size-1));
+		int y_n = ceil(linear_g * (transform->grid_size-1));
+		int z_n = ceil(linear_b * (transform->grid_size-1));
+		float rx = linear_r * (transform->grid_size-1) - x;
+		float ry = linear_g * (transform->grid_size-1) - y;
+		float rz = linear_b * (transform->grid_size-1) - z;
+
+		c0_r = CLU(r_table, x, y, z);
+		c0_g = CLU(g_table, x, y, z);
+		c0_b = CLU(b_table, x, y, z);
+		if( rx >= ry ) {
+			if (ry >= rz) { //rx >= ry && ry >= rz
+				c1_r = CLU(r_table, x_n, y, z) - c0_r;
+				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
+				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+				c1_g = CLU(g_table, x_n, y, z) - c0_g;
+				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
+				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+				c1_b = CLU(b_table, x_n, y, z) - c0_b;
+				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
+				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+			} else {
+				if (rx >= rz) { //rx >= rz && rz >= ry
+					c1_r = CLU(r_table, x_n, y, z) - c0_r;
+					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
+					c1_g = CLU(g_table, x_n, y, z) - c0_g;
+					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
+					c1_b = CLU(b_table, x_n, y, z) - c0_b;
+					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
+				} else { //rz > rx && rx >= ry
+					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
+					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+					c3_r = CLU(r_table, x, y, z_n) - c0_r;
+					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
+					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+					c3_g = CLU(g_table, x, y, z_n) - c0_g;
+					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
+					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+					c3_b = CLU(b_table, x, y, z_n) - c0_b;
+				}
+			}
+		} else {
+			if (rx >= rz) { //ry > rx && rx >= rz
+				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
+				c2_r = CLU(r_table, x_n, y_n, z) - c0_r;
+				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
+				c2_g = CLU(g_table, x_n, y_n, z) - c0_g;
+				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
+				c2_b = CLU(b_table, x_n, y_n, z) - c0_b;
+				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+			} else {
+				if (ry >= rz) { //ry >= rz && rz > rx 
+					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+					c2_r = CLU(r_table, x, y_n, z) - c0_r;
+					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
+					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+					c2_g = CLU(g_table, x, y_n, z) - c0_g;
+					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
+					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+					c2_b = CLU(b_table, x, y_n, z) - c0_b;
+					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
+				} else { //rz > ry && ry > rx
+					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+					c2_r = CLU(r_table, x, y_n, z) - c0_r;
+					c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+					c2_g = CLU(g_table, x, y_n, z) - c0_g;
+					c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+					c2_b = CLU(b_table, x, y_n, z) - c0_b;
+					c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+				}
+			}
+		}
+
+		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
+		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
+		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
+
+		pcs_r = lut_interp_linear_float(clut_r,
+				transform->output_clut_table_r, transform->output_clut_table_length);
+		pcs_g = lut_interp_linear_float(clut_g,
+				transform->output_clut_table_g, transform->output_clut_table_length);
+		pcs_b = lut_interp_linear_float(clut_b,
+				transform->output_clut_table_b, transform->output_clut_table_length);
+		*dest++ = clamp_float(pcs_r);
+		*dest++ = clamp_float(pcs_g);
+		*dest++ = clamp_float(pcs_b);
+	}
+}
+*/
+
+static void qcms_transform_module_gamma_table(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	float out_r, out_g, out_b;
+	for (i = 0; i < length; i++) {
+		float in_r = *src++;
+		float in_g = *src++;
+		float in_b = *src++;
+
+		out_r = lut_interp_linear_float(in_r, transform->input_clut_table_r, 256);
+		out_g = lut_interp_linear_float(in_g, transform->input_clut_table_g, 256);
+		out_b = lut_interp_linear_float(in_b, transform->input_clut_table_b, 256);
+
+		*dest++ = clamp_float(out_r);
+		*dest++ = clamp_float(out_g);
+		*dest++ = clamp_float(out_b);
+	}
+}
+
+static void qcms_transform_module_gamma_lut(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	float out_r, out_g, out_b;
+	for (i = 0; i < length; i++) {
+		float in_r = *src++;
+		float in_g = *src++;
+		float in_b = *src++;
+
+		out_r = lut_interp_linear(in_r,
+				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
+		out_g = lut_interp_linear(in_g,
+				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
+		out_b = lut_interp_linear(in_b,
+				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
+
+		*dest++ = clamp_float(out_r);
+		*dest++ = clamp_float(out_g);
+		*dest++ = clamp_float(out_b);
+	}
+}
+
+static void qcms_transform_module_matrix_translate(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	struct matrix mat;
+
+	/* store the results in column major mode
+	 * this makes doing the multiplication with sse easier */
+	mat.m[0][0] = transform->matrix.m[0][0];
+	mat.m[1][0] = transform->matrix.m[0][1];
+	mat.m[2][0] = transform->matrix.m[0][2];
+	mat.m[0][1] = transform->matrix.m[1][0];
+	mat.m[1][1] = transform->matrix.m[1][1];
+	mat.m[2][1] = transform->matrix.m[1][2];
+	mat.m[0][2] = transform->matrix.m[2][0];
+	mat.m[1][2] = transform->matrix.m[2][1];
+	mat.m[2][2] = transform->matrix.m[2][2];
+
+	for (i = 0; i < length; i++) {
+		float in_r = *src++;
+		float in_g = *src++;
+		float in_b = *src++;
+
+		float out_r = mat.m[0][0]*in_r + mat.m[1][0]*in_g + mat.m[2][0]*in_b + transform->tx;
+		float out_g = mat.m[0][1]*in_r + mat.m[1][1]*in_g + mat.m[2][1]*in_b + transform->ty;
+		float out_b = mat.m[0][2]*in_r + mat.m[1][2]*in_g + mat.m[2][2]*in_b + transform->tz;
+
+		*dest++ = clamp_float(out_r);
+		*dest++ = clamp_float(out_g);
+		*dest++ = clamp_float(out_b);
+	}
+}
+
+static void qcms_transform_module_matrix(struct qcms_modular_transform *transform, float *src, float *dest, size_t length)
+{
+	size_t i;
+	struct matrix mat;
+
+	/* store the results in column major mode
+	 * this makes doing the multiplication with sse easier */
+	mat.m[0][0] = transform->matrix.m[0][0];
+	mat.m[1][0] = transform->matrix.m[0][1];
+	mat.m[2][0] = transform->matrix.m[0][2];
+	mat.m[0][1] = transform->matrix.m[1][0];
+	mat.m[1][1] = transform->matrix.m[1][1];
+	mat.m[2][1] = transform->matrix.m[1][2];
+	mat.m[0][2] = transform->matrix.m[2][0];
+	mat.m[1][2] = transform->matrix.m[2][1];
+	mat.m[2][2] = transform->matrix.m[2][2];
+
+	for (i = 0; i < length; i++) {
+		float in_r = *src++;
+		float in_g = *src++;
+		float in_b = *src++;
+
+		float out_r = mat.m[0][0]*in_r + mat.m[1][0]*in_g + mat.m[2][0]*in_b;
+		float out_g = mat.m[0][1]*in_r + mat.m[1][1]*in_g + mat.m[2][1]*in_b;
+		float out_b = mat.m[0][2]*in_r + mat.m[1][2]*in_g + mat.m[2][2]*in_b;
+
+		*dest++ = clamp_float(out_r);
+		*dest++ = clamp_float(out_g);
+		*dest++ = clamp_float(out_b);
+	}
+}
+
+static struct qcms_modular_transform* qcms_modular_transform_alloc() {
+	return calloc(1, sizeof(struct qcms_modular_transform));
+}
+
+static void qcms_modular_transform_release(struct qcms_modular_transform *transform)
+{
+	struct qcms_modular_transform *next_transform;
+	while (transform != NULL) {
+		next_transform = transform->next_transform;
+		// clut may use a single block of memory.
+		// Perhaps we should remove this to simply the code.
+		if (transform->input_clut_table_r + transform->input_clut_table_length == transform->input_clut_table_g && transform->input_clut_table_g + transform->input_clut_table_length == transform->input_clut_table_b) {
+			if (transform->input_clut_table_r) free(transform->input_clut_table_r);
+		} else {
+			if (transform->input_clut_table_r) free(transform->input_clut_table_r);
+			if (transform->input_clut_table_g) free(transform->input_clut_table_g);
+			if (transform->input_clut_table_b) free(transform->input_clut_table_b);
+		}
+		if (transform->r_clut + 1 == transform->g_clut && transform->g_clut + 1 == transform->b_clut) {
+			if (transform->r_clut) free(transform->r_clut);
+		} else {
+			if (transform->r_clut) free(transform->r_clut);
+			if (transform->g_clut) free(transform->g_clut);
+			if (transform->b_clut) free(transform->b_clut);
+		}
+		if (transform->output_clut_table_r + transform->output_clut_table_length == transform->output_clut_table_g && transform->output_clut_table_g+ transform->output_clut_table_length == transform->output_clut_table_b) {
+			if (transform->output_clut_table_r) free(transform->output_clut_table_r);
+		} else {
+			if (transform->output_clut_table_r) free(transform->output_clut_table_r);
+			if (transform->output_clut_table_g) free(transform->output_clut_table_g);
+			if (transform->output_clut_table_b) free(transform->output_clut_table_b);
+		}
+		if (transform->output_gamma_lut_r) free(transform->output_gamma_lut_r);
+		if (transform->output_gamma_lut_g) free(transform->output_gamma_lut_g);
+		if (transform->output_gamma_lut_b) free(transform->output_gamma_lut_b);
+		free(transform);
+		transform = next_transform;
+	}
+}
+
+/* Set transform to be the next element in the linked list. */
+static void append_transform(struct qcms_modular_transform *transform, struct qcms_modular_transform ***next_transform)
+{
+	**next_transform = transform;
+	while (transform) {
+		*next_transform = &(transform->next_transform);
+		transform = transform->next_transform;
+	}
+}
+
+/* reverse the transformation list (used by mBA) */
+static struct qcms_modular_transform* reverse_transform(struct qcms_modular_transform *transform) 
+{
+	struct qcms_modular_transform *prev_transform = NULL;
+	while (transform != NULL) {
+		struct qcms_modular_transform *next_transform = transform->next_transform;
+		transform->next_transform = prev_transform;
+		prev_transform = transform;
+		transform = next_transform;
+	}
+	
+	return prev_transform;
+}
+
+#define EMPTY_TRANSFORM_LIST NULL
+static struct qcms_modular_transform* qcms_modular_transform_create_mAB(struct lutmABType *lut)
+{
+	struct qcms_modular_transform *first_transform = NULL;
+	struct qcms_modular_transform **next_transform = &first_transform;
+	struct qcms_modular_transform *transform = NULL;
+
+	if (lut->a_curves[0] != NULL) {
+		size_t clut_length;
+		float *clut;
+
+		// If the A curve is present this also implies the 
+		// presence of a CLUT.
+		if (!lut->clut_table) 
+			goto fail;
+
+		// Prepare A curve.
+		transform = qcms_modular_transform_alloc();
+		if (!transform)
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->input_clut_table_r = build_input_gamma_table(lut->a_curves[0]);
+		transform->input_clut_table_g = build_input_gamma_table(lut->a_curves[1]);
+		transform->input_clut_table_b = build_input_gamma_table(lut->a_curves[2]);
+		transform->transform_module_fn = qcms_transform_module_gamma_table;
+		if (lut->num_grid_points[0] != lut->num_grid_points[1] ||
+			lut->num_grid_points[1] != lut->num_grid_points[2] ) {
+			//XXX: We don't currently support clut that are not squared!
+			goto fail;
+		}
+
+		// Prepare CLUT
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		clut_length = sizeof(float)*pow(lut->num_grid_points[0], 3)*3;
+		clut = malloc(clut_length);
+		if (!clut)
+			goto fail;
+		memcpy(clut, lut->clut_table, clut_length);
+		transform->r_clut = clut + 0;
+		transform->g_clut = clut + 1;
+		transform->b_clut = clut + 2;
+		transform->grid_size = lut->num_grid_points[0];
+		transform->transform_module_fn = qcms_transform_module_clut_only;
+	}
+	if (lut->m_curves[0] != NULL) {
+		// M curve imples the presence of a Matrix
+
+		// Prepare M curve
+		transform = qcms_modular_transform_alloc();
+		if (!transform)
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->input_clut_table_r = build_input_gamma_table(lut->m_curves[0]);
+		transform->input_clut_table_g = build_input_gamma_table(lut->m_curves[1]);
+		transform->input_clut_table_b = build_input_gamma_table(lut->m_curves[2]);
+		transform->transform_module_fn = qcms_transform_module_gamma_table;
+
+		// Prepare Matrix
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->matrix = build_mAB_matrix(lut);
+		if (transform->matrix.invalid)
+			goto fail;
+		transform->tx = s15Fixed16Number_to_float(lut->e03);
+		transform->ty = s15Fixed16Number_to_float(lut->e13);
+		transform->tz = s15Fixed16Number_to_float(lut->e23);
+		transform->transform_module_fn = qcms_transform_module_matrix_translate;
+	}
+	if (lut->b_curves[0] != NULL) {
+		// Prepare B curve
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->input_clut_table_r = build_input_gamma_table(lut->b_curves[0]);
+		transform->input_clut_table_g = build_input_gamma_table(lut->b_curves[1]);
+		transform->input_clut_table_b = build_input_gamma_table(lut->b_curves[2]);
+		transform->transform_module_fn = qcms_transform_module_gamma_table;
+	} else {
+		// B curve is mandatory
+		goto fail;
+	}
+
+	if (lut->reversed) {
+		// mBA are identical to mAB except that the transformation order
+		// is reversed
+		first_transform = reverse_transform(first_transform);
+	}
+
+	return first_transform;
+fail:
+	qcms_modular_transform_release(first_transform);
+	return NULL;
+}
+
+static struct qcms_modular_transform* qcms_modular_transform_create_lut(struct lutType *lut)
+{
+	struct qcms_modular_transform *first_transform = NULL;
+	struct qcms_modular_transform **next_transform = &first_transform;
+	struct qcms_modular_transform *transform = NULL;
+
+	size_t in_curve_len, clut_length, out_curve_len;
+	float *in_curves, *clut, *out_curves;
+
+	// Prepare Matrix
+	transform = qcms_modular_transform_alloc();
+	if (!transform) 
+		goto fail;
+	append_transform(transform, &next_transform);
+	transform->matrix = build_lut_matrix(lut);
+	if (transform->matrix.invalid)
+		goto fail;
+	transform->transform_module_fn = qcms_transform_module_matrix;
+
+	// Prepare input curves
+	transform = qcms_modular_transform_alloc();
+	if (!transform) 
+		goto fail;
+	append_transform(transform, &next_transform);
+	in_curve_len = sizeof(float)*lut->num_input_table_entries * 3;
+	in_curves = malloc(in_curve_len);
+	if (!in_curves) 
+		goto fail;
+	memcpy(in_curves, lut->input_table, in_curve_len);
+	transform->input_clut_table_r = in_curves + lut->num_input_table_entries * 0;
+	transform->input_clut_table_g = in_curves + lut->num_input_table_entries * 1;
+	transform->input_clut_table_b = in_curves + lut->num_input_table_entries * 2;
+	transform->input_clut_table_length = lut->num_input_table_entries;
+
+	// Prepare table
+	clut_length = sizeof(float)*pow(lut->num_clut_grid_points, 3)*3;
+	clut = malloc(clut_length);
+	if (!clut) 
+		goto fail;
+	memcpy(clut, lut->clut_table, clut_length);
+	transform->r_clut = clut + 0;
+	transform->g_clut = clut + 1;
+	transform->b_clut = clut + 2;
+	transform->grid_size = lut->num_clut_grid_points;
+
+	// Prepare output curves
+	out_curve_len = sizeof(float) * lut->num_output_table_entries * 3;
+	out_curves = malloc(out_curve_len);
+	if (!out_curves) 
+		goto fail;
+	memcpy(out_curves, lut->output_table, out_curve_len);
+	transform->output_clut_table_r = out_curves + lut->num_output_table_entries * 0;
+	transform->output_clut_table_g = out_curves + lut->num_output_table_entries * 1;
+	transform->output_clut_table_b = out_curves + lut->num_output_table_entries * 2;
+	transform->output_clut_table_length = lut->num_output_table_entries;
+	transform->transform_module_fn = qcms_transform_module_clut;
+
+	return first_transform;
+fail:
+	qcms_modular_transform_release(first_transform);
+	return NULL;
+}
+
+struct qcms_modular_transform* qcms_modular_transform_create_input(qcms_profile *in)
+{
+	struct qcms_modular_transform *first_transform = NULL;
+	struct qcms_modular_transform **next_transform = &first_transform;
+
+	if (in->A2B0) {
+		struct qcms_modular_transform *lut_transform;
+		lut_transform = qcms_modular_transform_create_lut(in->A2B0);
+		if (!lut_transform)
+			goto fail;
+		append_transform(lut_transform, &next_transform);
+	} else if (in->mAB && in->mAB->num_in_channels == 3 && in->mAB->num_out_channels == 3) {
+		struct qcms_modular_transform *mAB_transform;
+		mAB_transform = qcms_modular_transform_create_mAB(in->mAB);
+		if (!mAB_transform)
+			goto fail;
+		append_transform(mAB_transform, &next_transform);
+
+	} else {
+		struct qcms_modular_transform *transform;
+
+		transform = qcms_modular_transform_alloc();
+		if (!transform)
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->input_clut_table_r = build_input_gamma_table(in->redTRC);
+		transform->input_clut_table_g = build_input_gamma_table(in->greenTRC);
+		transform->input_clut_table_b = build_input_gamma_table(in->blueTRC);
+		transform->transform_module_fn = qcms_transform_module_gamma_table;
+		if (!transform->input_clut_table_r || !transform->input_clut_table_g ||
+				!transform->input_clut_table_b) {
+			goto fail;
+		}
+
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->matrix.m[0][0] = 1/1.999969482421875f;
+		transform->matrix.m[0][1] = 0.f;
+		transform->matrix.m[0][2] = 0.f;
+		transform->matrix.m[1][0] = 0.f;
+		transform->matrix.m[1][1] = 1/1.999969482421875f;
+		transform->matrix.m[1][2] = 0.f;
+		transform->matrix.m[2][0] = 0.f;
+		transform->matrix.m[2][1] = 0.f;
+		transform->matrix.m[2][2] = 1/1.999969482421875f;
+		transform->matrix.invalid = false;
+		transform->transform_module_fn = qcms_transform_module_matrix;
+
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->matrix = build_colorant_matrix(in);
+		transform->transform_module_fn = qcms_transform_module_matrix;
+	}
+
+	return first_transform;
+fail:
+	qcms_modular_transform_release(first_transform);
+	return EMPTY_TRANSFORM_LIST;
+}
+static struct qcms_modular_transform* qcms_modular_transform_create_output(qcms_profile *out)
+{
+	struct qcms_modular_transform *first_transform = NULL;
+	struct qcms_modular_transform **next_transform = &first_transform;
+
+	if (out->B2A0) {
+		struct qcms_modular_transform *lut_transform;
+		lut_transform = qcms_modular_transform_create_lut(out->B2A0);
+		if (!lut_transform) 
+			goto fail;
+		append_transform(lut_transform, &next_transform);
+	} else if (out->mBA && out->mBA->num_in_channels == 3 && out->mBA->num_out_channels == 3) {
+		struct qcms_modular_transform *lut_transform;
+		lut_transform = qcms_modular_transform_create_mAB(out->mBA);
+		if (!lut_transform) 
+			goto fail;
+		append_transform(lut_transform, &next_transform);
+	} else if (out->redTRC && out->greenTRC && out->blueTRC) {
+		struct qcms_modular_transform *transform;
+
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->matrix = matrix_invert(build_colorant_matrix(out));
+		transform->transform_module_fn = qcms_transform_module_matrix;
+
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		transform->matrix.m[0][0] = 1.999969482421875f;
+		transform->matrix.m[0][1] = 0.f;
+		transform->matrix.m[0][2] = 0.f;
+		transform->matrix.m[1][0] = 0.f;
+		transform->matrix.m[1][1] = 1.999969482421875f;
+		transform->matrix.m[1][2] = 0.f;
+		transform->matrix.m[2][0] = 0.f;
+		transform->matrix.m[2][1] = 0.f;
+		transform->matrix.m[2][2] = 1.999969482421875f;
+		transform->matrix.invalid = false;
+		transform->transform_module_fn = qcms_transform_module_matrix;
+
+		transform = qcms_modular_transform_alloc();
+		if (!transform) 
+			goto fail;
+		append_transform(transform, &next_transform);
+		build_output_lut(out->redTRC, &transform->output_gamma_lut_r,
+			&transform->output_gamma_lut_r_length);
+		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g,
+			&transform->output_gamma_lut_g_length);
+		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b,
+			&transform->output_gamma_lut_b_length);
+		transform->transform_module_fn = qcms_transform_module_gamma_lut;
+
+		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g ||
+				!transform->output_gamma_lut_b) {
+			goto fail;
+		}
+	} else {
+		assert(0 && "Unsupported output profile workflow.");
+		return NULL;
+	}
+
+	return first_transform;
+fail:
+	qcms_modular_transform_release(first_transform);
+	return EMPTY_TRANSFORM_LIST;
+}
+
+/* Not Completed
+// Simplify the transformation chain to an equivalent transformation chain
+static struct qcms_modular_transform* qcms_modular_transform_reduce(struct qcms_modular_transform *transform)
+{
+	struct qcms_modular_transform *first_transform = NULL;
+	struct qcms_modular_transform *curr_trans = transform;
+	struct qcms_modular_transform *prev_trans = NULL;
+	while (curr_trans) {
+		struct qcms_modular_transform *next_trans = curr_trans->next_transform;
+		if (curr_trans->transform_module_fn == qcms_transform_module_matrix) {
+			if (next_trans && next_trans->transform_module_fn == qcms_transform_module_matrix) {
+				curr_trans->matrix = matrix_multiply(curr_trans->matrix, next_trans->matrix);
+				goto remove_next;	
+			}
+		}
+		if (curr_trans->transform_module_fn == qcms_transform_module_gamma_table) {
+			bool isLinear = true;
+			uint16_t i;
+			for (i = 0; isLinear && i < 256; i++) {
+				isLinear &= (int)(curr_trans->input_clut_table_r[i] * 255) == i;
+				isLinear &= (int)(curr_trans->input_clut_table_g[i] * 255) == i;
+				isLinear &= (int)(curr_trans->input_clut_table_b[i] * 255) == i;
+			}
+			goto remove_current;
+		}
+		
+next_transform:
+		if (!next_trans) break;
+		prev_trans = curr_trans;
+		curr_trans = next_trans;
+		continue;
+remove_current:
+		if (curr_trans == transform) {
+			//Update head
+			transform = next_trans;
+		} else {
+			prev_trans->next_transform = next_trans;
+		}
+		curr_trans->next_transform = NULL;
+		qcms_modular_transform_release(curr_trans);
+		//return transform;
+		return qcms_modular_transform_reduce(transform);
+remove_next:
+		curr_trans->next_transform = next_trans->next_transform;
+		next_trans->next_transform = NULL;
+		qcms_modular_transform_release(next_trans);
+		continue;
+	}
+	return transform;
+}
+*/
+
+static struct qcms_modular_transform* qcms_modular_transform_create(qcms_profile *in, qcms_profile *out)
+{
+	struct qcms_modular_transform *first_transform = NULL;
+	struct qcms_modular_transform **next_transform = &first_transform;
+
+	if (in->color_space == RGB_SIGNATURE) {
+		struct qcms_modular_transform* rgb_to_pcs;
+		rgb_to_pcs = qcms_modular_transform_create_input(in);
+		if (!rgb_to_pcs) 
+			goto fail;
+		append_transform(rgb_to_pcs, &next_transform);
+	} else {
+		assert(0 && "input color space not supported");
+		goto fail;
+	}
+
+	if (in->pcs == LAB_SIGNATURE && out->pcs == XYZ_SIGNATURE) {
+		struct qcms_modular_transform* lab_to_pcs;
+		lab_to_pcs = qcms_modular_transform_alloc();
+		if (!lab_to_pcs) 
+			goto fail;
+		append_transform(lab_to_pcs, &next_transform);
+		lab_to_pcs->transform_module_fn = qcms_transform_module_LAB_to_XYZ;
+	}
+
+	// This does not improve accuracy in practice, something is wrong here.
+	//if (in->chromaticAdaption.invalid == false) {
+	//	struct qcms_modular_transform* chromaticAdaption;
+	//	chromaticAdaption = qcms_modular_transform_alloc();
+	//	if (!chromaticAdaption) 
+	//		goto fail;
+	//	append_transform(chromaticAdaption, &next_transform);
+	//	chromaticAdaption->matrix = matrix_invert(in->chromaticAdaption);
+	//	chromaticAdaption->transform_module_fn = qcms_transform_module_matrix;
+	//}
+
+        if (in->pcs == XYZ_SIGNATURE && out->pcs == LAB_SIGNATURE) {
+		struct qcms_modular_transform* pcs_to_lab;
+		pcs_to_lab = qcms_modular_transform_alloc();
+		if (!pcs_to_lab) 
+			goto fail;
+		append_transform(pcs_to_lab, &next_transform);
+		pcs_to_lab->transform_module_fn = qcms_transform_module_XYZ_to_LAB;
+	}
+
+	if (out->color_space == RGB_SIGNATURE) {
+		struct qcms_modular_transform* pcs_to_rgb;
+		pcs_to_rgb = qcms_modular_transform_create_output(out);
+		if (!pcs_to_rgb) 
+			goto fail;
+		append_transform(pcs_to_rgb, &next_transform);
+	} else {
+		assert(0 && "output color space not supported");
+		goto fail;
+	}
+	// Not Completed
+	//return qcms_modular_transform_reduce(first_transform);
+	return first_transform;
+fail:
+	qcms_modular_transform_release(first_transform);
+	return EMPTY_TRANSFORM_LIST;
+}
+
+static float* qcms_modular_transform_data(struct qcms_modular_transform *transform, float *src, float *dest, size_t len)
+{
+        while (transform != NULL) {
+                // Keep swaping src/dest when performing a transform to use less memory.
+                float *new_src = dest;
+		const void *transform_fn = transform->transform_module_fn;
+		if (transform_fn != qcms_transform_module_gamma_table &&
+		    transform_fn != qcms_transform_module_gamma_lut &&
+		    transform_fn != qcms_transform_module_clut &&
+		    transform_fn != qcms_transform_module_clut_only &&
+		    transform_fn != qcms_transform_module_matrix &&
+		    transform_fn != qcms_transform_module_matrix_translate &&
+		    transform_fn != qcms_transform_module_LAB_to_XYZ &&
+		    transform_fn != qcms_transform_module_XYZ_to_LAB) {
+			assert(0 && "Unsupported transform module");
+			return NULL;
+		}
+                transform->transform_module_fn(transform,src,dest,len);
+                dest = src;
+                src = new_src;
+                transform = transform->next_transform;
+        }
+        // The results end up in the src buffer because of the switching
+        return src;
+}
+
+float* qcms_chain_transform(qcms_profile *in, qcms_profile *out, float *src, float *dest, size_t lutSize)
+{
+	struct qcms_modular_transform *transform_list = qcms_modular_transform_create(in, out);
+	if (transform_list != NULL) {
+		float *lut = qcms_modular_transform_data(transform_list, src, dest, lutSize/3);
+		qcms_modular_transform_release(transform_list);
+		return lut;
+	}
+	return NULL;
+}
new file mode 100644
--- /dev/null
+++ b/gfx/qcms/chain.h
@@ -0,0 +1,30 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining 
+// a copy of this software and associated documentation files (the "Software"), 
+// to deal in the Software without restriction, including without limitation 
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in 
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _QCMS_CHAIN_H
+#define _QCMS_CHAIN_H
+
+// Generates and returns a 3D LUT with lutSize^3 samples using the provided src/dest.
+float* qcms_chain_transform(qcms_profile *in, qcms_profile *out, float *src, float *dest, size_t lutSize);
+
+#endif
--- a/gfx/qcms/iccread.c
+++ b/gfx/qcms/iccread.c
@@ -1,8 +1,9 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
 //  qcms
 //  Copyright (C) 2009 Mozilla Foundation
 //  Copyright (C) 1998-2007 Marti Maria
 //
 // Permission is hereby granted, free of charge, to any person obtaining 
 // a copy of this software and associated documentation files (the "Software"), 
 // to deal in the Software without restriction, including without limitation 
 // the rights to use, copy, modify, merge, publish, distribute, sublicense, 
@@ -18,19 +19,23 @@
 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include <math.h>
 #include <assert.h>
 #include <stdlib.h>
-#include <string.h>
+#include <string.h> //memset
 #include "qcmsint.h"
 
+/* It might be worth having a unified limit on content controlled
+ * allocation per profile. This would remove the need for many
+ * of the arbitrary limits that we used */
+
 typedef uint32_t be32;
 typedef uint16_t be16;
 
 #if 0
 not used yet
 /* __builtin_bswap isn't available in older gccs
  * so open code it for now */
 static be32 cpu_to_be32(int32_t v)
@@ -115,22 +120,25 @@ static uint8_t read_u8(struct mem_source
 	}
 }
 
 static s15Fixed16Number read_s15Fixed16Number(struct mem_source *mem, size_t offset)
 {
 	return read_u32(mem, offset);
 }
 
-#if 0
+static uInt8Number read_uInt8Number(struct mem_source *mem, size_t offset)
+{
+	return read_u8(mem, offset);
+}
+
 static uInt16Number read_uInt16Number(struct mem_source *mem, size_t offset)
 {
 	return read_u16(mem, offset);
 }
-#endif
 
 #define BAD_VALUE_PROFILE NULL
 #define INVALID_PROFILE NULL
 #define NO_MEM_PROFILE NULL
 
 /* An arbitrary 4MB limit on profile size */
 #define MAX_PROFILE_SIZE 1024*1024*4
 #define MAX_TAG_COUNT 1024
@@ -139,26 +147,31 @@ static void check_CMM_type_signature(str
 {
 	//uint32_t CMM_type_signature = read_u32(src, 4);
 	//TODO: do the check?
 
 }
 
 static void check_profile_version(struct mem_source *src)
 {
+
+	/*
 	uint8_t major_revision = read_u8(src, 8 + 0);
 	uint8_t minor_revision = read_u8(src, 8 + 1);
+	*/
 	uint8_t reserved1      = read_u8(src, 8 + 2);
 	uint8_t reserved2      = read_u8(src, 8 + 3);
+	/* Checking the version doesn't buy us anything
 	if (major_revision != 0x4) {
 		if (major_revision > 0x2)
 			invalid_source(src, "Unsupported major revision");
 		if (minor_revision > 0x40)
 			invalid_source(src, "Unsupported minor revision");
 	}
+	*/
 	if (reserved1 != 0 || reserved2 != 0)
 		invalid_source(src, "Invalid reserved bytes");
 }
 
 #define INPUT_DEVICE_PROFILE   0x73636e72 // 'scnr'
 #define DISPLAY_DEVICE_PROFILE 0x6d6e7472 // 'mntr'
 #define OUTPUT_DEVICE_PROFILE  0x70727472 // 'prtr'
 #define DEVICE_LINK_PROFILE    0x6c696e6b // 'link'
@@ -167,18 +180,19 @@ static void check_profile_version(struct
 #define NAMED_COLOR_PROFILE    0x6e6d636c // 'nmcl'
 
 static void read_class_signature(qcms_profile *profile, struct mem_source *mem)
 {
 	profile->class = read_u32(mem, 12);
 	switch (profile->class) {
 		case DISPLAY_DEVICE_PROFILE:
 		case INPUT_DEVICE_PROFILE:
+		case OUTPUT_DEVICE_PROFILE:
+		case COLOR_SPACE_PROFILE:
 			break;
-		case OUTPUT_DEVICE_PROFILE:
 		default:
 			invalid_source(mem, "Invalid  Profile/Device Class signature");
 	}
 }
 
 static void read_color_space(qcms_profile *profile, struct mem_source *mem)
 {
 	profile->color_space = read_u32(mem, 16);
@@ -186,16 +200,28 @@ static void read_color_space(qcms_profil
 		case RGB_SIGNATURE:
 		case GRAY_SIGNATURE:
 			break;
 		default:
 			invalid_source(mem, "Unsupported colorspace");
 	}
 }
 
+static void read_pcs(qcms_profile *profile, struct mem_source *mem)
+{
+	profile->pcs = read_u32(mem, 20);
+	switch (profile->pcs) {
+		case XYZ_SIGNATURE:
+		case LAB_SIGNATURE:
+			break;
+		default:
+			invalid_source(mem, "Unsupported pcs");
+	}
+}
+
 struct tag
 {
 	uint32_t signature;
 	uint32_t offset;
 	uint32_t size;
 };
 
 struct tag_index {
@@ -235,16 +261,19 @@ qcms_bool qcms_profile_is_bogus(qcms_pro
        float rX, rY, rZ, gX, gY, gZ, bX, bY, bZ;
        bool negative;
        unsigned i;
 
        // We currently only check the bogosity of RGB profiles
        if (profile->color_space != RGB_SIGNATURE)
 	       return false;
 
+       if (profile->A2B0 || profile->B2A0)
+               return false;
+
        rX = s15Fixed16Number_to_float(profile->redColorant.X);
        rY = s15Fixed16Number_to_float(profile->redColorant.Y);
        rZ = s15Fixed16Number_to_float(profile->redColorant.Z);
 
        gX = s15Fixed16Number_to_float(profile->greenColorant.X);
        gY = s15Fixed16Number_to_float(profile->greenColorant.Y);
        gZ = s15Fixed16Number_to_float(profile->greenColorant.Z);
 
@@ -295,33 +324,64 @@ qcms_bool qcms_profile_is_bogus(qcms_pro
 #define TAG_bXYZ 0x6258595a
 #define TAG_gXYZ 0x6758595a
 #define TAG_rXYZ 0x7258595a
 #define TAG_rTRC 0x72545243
 #define TAG_bTRC 0x62545243
 #define TAG_gTRC 0x67545243
 #define TAG_kTRC 0x6b545243
 #define TAG_A2B0 0x41324230
+#define TAG_B2A0 0x42324130
+#define TAG_CHAD 0x63686164
 
 static struct tag *find_tag(struct tag_index index, uint32_t tag_id)
 {
 	unsigned int i;
 	struct tag *tag = NULL;
 	for (i = 0; i < index.count; i++) {
 		if (index.tags[i].signature == tag_id) {
 			return &index.tags[i];
 		}
 	}
 	return tag;
 }
 
-#define XYZ_TYPE   0x58595a20 // 'XYZ '
-#define CURVE_TYPE 0x63757276 // 'curv'
-#define LUT16_TYPE 0x6d667432 // 'mft2'
-#define LUT8_TYPE  0x6d667431 // 'mft1'
+#define XYZ_TYPE		0x58595a20 // 'XYZ '
+#define CURVE_TYPE		0x63757276 // 'curv'
+#define PARAMETRIC_CURVE_TYPE	0x70617261 // 'para'
+#define LUT16_TYPE		0x6d667432 // 'mft2'
+#define LUT8_TYPE		0x6d667431 // 'mft1'
+#define LUT_MAB_TYPE		0x6d414220 // 'mAB '
+#define LUT_MBA_TYPE		0x6d424120 // 'mBA '
+#define CHROMATIC_TYPE		0x73663332 // 'sf32'
+
+static struct matrix read_tag_s15Fixed16ArrayType(struct mem_source *src, struct tag_index index, uint32_t tag_id)
+{
+	struct tag *tag = find_tag(index, tag_id);
+	struct matrix matrix;
+	if (tag) {
+		uint8_t i;
+		uint32_t offset = tag->offset;
+		uint32_t type = read_u32(src, offset);
+
+		// Check mandatory type signature for s16Fixed16ArrayType
+		if (type != CHROMATIC_TYPE) {
+			invalid_source(src, "unexpected type, expected 'sf32'");
+		}
+
+		for (i = 0; i < 9; i++) {
+			matrix.m[i/3][i%3] = s15Fixed16Number_to_float(read_s15Fixed16Number(src, offset+8+i*4));
+		}
+		matrix.invalid = false;
+	} else {
+		matrix.invalid = true;
+		invalid_source(src, "missing sf32tag");
+	}
+	return matrix;
+}
 
 static struct XYZNumber read_tag_XYZType(struct mem_source *src, struct tag_index index, uint32_t tag_id)
 {
 	struct XYZNumber num = {0, 0, 0};
 	struct tag *tag = find_tag(index, tag_id);
 	if (tag) {
 		uint32_t offset = tag->offset;
 
@@ -332,100 +392,366 @@ static struct XYZNumber read_tag_XYZType
 		num.Y = read_s15Fixed16Number(src, offset+12);
 		num.Z = read_s15Fixed16Number(src, offset+16);
 	} else {
 		invalid_source(src, "missing xyztag");
 	}
 	return num;
 }
 
-static struct curveType *read_tag_curveType(struct mem_source *src, struct tag_index index, uint32_t tag_id)
+// Read the tag at a given offset rather then the tag_index. 
+// This method is used when reading mAB tags where nested curveType are
+// present that are not part of the tag_index.
+static struct curveType *read_curveType(struct mem_source *src, uint32_t offset, uint32_t *len)
 {
-	struct tag *tag = find_tag(index, tag_id);
+	static const size_t COUNT_TO_LENGTH[5] = {1, 3, 4, 5, 7};
 	struct curveType *curve = NULL;
-	if (tag) {
-		uint32_t offset = tag->offset;
-		uint32_t type = read_u32(src, offset);
-		uint32_t count = read_u32(src, offset+8);
-		unsigned int i;
+	uint32_t type = read_u32(src, offset);
+	uint32_t count;
+	int i;
 
-		if (type != CURVE_TYPE) {
-			invalid_source(src, "unexpected type, expected CURV");
-			return NULL;
-		}
+	if (type != CURVE_TYPE && type != PARAMETRIC_CURVE_TYPE) {
+		invalid_source(src, "unexpected type, expected CURV or PARA");
+		return NULL;
+	}
+
+	if (type == CURVE_TYPE) {
+		count = read_u32(src, offset+8);
 
 #define MAX_CURVE_ENTRIES 40000 //arbitrary
 		if (count > MAX_CURVE_ENTRIES) {
 			invalid_source(src, "curve size too large");
 			return NULL;
 		}
 		curve = malloc(sizeof(struct curveType) + sizeof(uInt16Number)*count);
 		if (!curve)
 			return NULL;
 
 		curve->count = count;
+		curve->type = type;
+
 		for (i=0; i<count; i++) {
-			curve->data[i] = read_u16(src, offset + 12 + i *2);
+			curve->data[i] = read_u16(src, offset + 12 + i*2);
+		}
+		*len = 12 + count * 2;
+	} else { //PARAMETRIC_CURVE_TYPE
+		count = read_u16(src, offset+8);
+
+		if (count > 4) {
+			invalid_source(src, "parametric function type not supported.");
+			return NULL;
+		}
+
+		curve = malloc(sizeof(struct curveType));
+		if (!curve)
+			return NULL;
+
+		curve->count = count;
+		curve->type = type;
+
+		for (i=0; i < COUNT_TO_LENGTH[count]; i++) {
+			curve->parameter[i] = s15Fixed16Number_to_float(read_s15Fixed16Number(src, offset + 12 + i*4));	
 		}
+		*len = 12 + COUNT_TO_LENGTH[count] * 4;
+
+		if ((count == 1 || count == 2)) {
+			/* we have a type 1 or type 2 function that has a division by 'a' */
+			float a = curve->parameter[1];
+			if (a == 0.f)
+				invalid_source(src, "parametricCurve definition causes division by zero.");
+		}
+	}
+
+	return curve;
+}
+
+static struct curveType *read_tag_curveType(struct mem_source *src, struct tag_index index, uint32_t tag_id)
+{
+	struct tag *tag = find_tag(index, tag_id);
+	struct curveType *curve = NULL;
+	if (tag) {
+		uint32_t len;
+		return read_curveType(src, tag->offset, &len);
 	} else {
 		invalid_source(src, "missing curvetag");
 	}
 
 	return curve;
 }
 
-/* This function's not done yet */
+#define MAX_CLUT_SIZE 500000 // arbitrary
+#define MAX_CHANNELS 10 // arbitrary
+static void read_nested_curveType(struct mem_source *src, struct curveType *(*curveArray)[MAX_CHANNELS], uint8_t num_channels, uint32_t curve_offset)
+{
+	uint32_t channel_offset = 0;
+	int i;
+	for (i = 0; i < num_channels; i++) {
+		uint32_t tag_len;
+
+		(*curveArray)[i] = read_curveType(src, curve_offset + channel_offset, &tag_len);
+		if (!(*curveArray)[i]) {
+			invalid_source(src, "invalid nested curveType curve");
+		}
+
+		channel_offset += tag_len;
+		// 4 byte aligned
+		if ((tag_len % 4) != 0)
+			channel_offset += 4 - (tag_len % 4);
+	}
+
+}
+
+static void mAB_release(struct lutmABType *lut)
+{
+	uint8_t i;
+
+	for (i = 0; i < lut->num_in_channels; i++){
+		free(lut->a_curves[i]);
+	}
+	for (i = 0; i < lut->num_out_channels; i++){
+		free(lut->b_curves[i]);
+		free(lut->m_curves[i]);
+	}
+	free(lut);
+}
+
+/* See section 10.10 for specs */
+static struct lutmABType *read_tag_lutmABType(struct mem_source *src, struct tag_index index, uint32_t tag_id)
+{
+	struct tag *tag = find_tag(index, tag_id);
+	uint32_t offset = tag->offset;
+	uint32_t a_curve_offset, b_curve_offset, m_curve_offset;
+	uint32_t matrix_offset;
+	uint32_t clut_offset;
+	uint32_t clut_size = 1;
+	uint8_t clut_precision;
+	uint32_t type = read_u32(src, offset);
+	uint8_t num_in_channels, num_out_channels;
+	struct lutmABType *lut;
+	int i;
+
+	if (type != LUT_MAB_TYPE && type != LUT_MBA_TYPE) {
+		return NULL;
+	}
+
+	num_in_channels = read_u8(src, offset + 8);
+	num_out_channels = read_u8(src, offset + 8);
+	if (num_in_channels > MAX_CHANNELS || num_out_channels > MAX_CHANNELS)
+		return NULL;
+
+	// We require 3in/out channels since we only support RGB->XYZ (or RGB->LAB)
+	// XXX: If we remove this restriction make sure that the number of channels
+	//      is less or equal to the maximum number of mAB curves in qcmsint.h
+	//      also check for clut_size overflow.
+	if (num_in_channels != 3 || num_out_channels != 3)
+		return NULL;
+
+	// some of this data is optional and is denoted by a zero offset
+	// we also use this to track their existance
+	a_curve_offset = read_u32(src, offset + 28);
+	clut_offset = read_u32(src, offset + 24);
+	m_curve_offset = read_u32(src, offset + 20);
+	matrix_offset = read_u32(src, offset + 16);
+	b_curve_offset = read_u32(src, offset + 12);
+
+	// Convert offsets relative to the tag to relative to the profile
+	// preserve zero for optional fields
+	if (a_curve_offset)
+		a_curve_offset += offset;
+	if (clut_offset)
+		clut_offset += offset;
+	if (m_curve_offset)
+		m_curve_offset += offset;
+	if (matrix_offset)
+		matrix_offset += offset;
+	if (b_curve_offset)
+		b_curve_offset += offset;
+
+	if (clut_offset) {
+		assert (num_in_channels == 3);
+		// clut_size can not overflow since lg(256^num_in_channels) = 24 bits.
+		for (i = 0; i < num_in_channels; i++) {
+			clut_size *= read_u8(src, clut_offset + i);
+		}
+	} else {
+		clut_size = 0;
+	}
+
+	// 24bits * 3 won't overflow either
+	clut_size = clut_size * num_out_channels;
+
+	if (clut_size > MAX_CLUT_SIZE)
+		return NULL;
+
+	lut = malloc(sizeof(struct lutmABType) + (clut_size) * sizeof(float));
+	if (!lut)
+		return NULL;
+	// we'll fill in the rest below
+	memset(lut, 0, sizeof(struct lutmABType));
+	lut->clut_table   = &lut->clut_table_data[0];
+
+	for (i = 0; i < num_in_channels; i++) {
+		lut->num_grid_points[i] = read_u8(src, clut_offset + i);
+	}
+
+	// Reverse the processing of transformation elements for mBA type.
+	lut->reversed = (type == LUT_MBA_TYPE);
+
+	lut->num_in_channels = num_in_channels;
+	lut->num_out_channels = num_out_channels;
+
+	if (matrix_offset) {
+		// read the matrix if we have it
+		lut->e00 = read_s15Fixed16Number(src, matrix_offset+4*0);
+		lut->e01 = read_s15Fixed16Number(src, matrix_offset+4*1);
+		lut->e02 = read_s15Fixed16Number(src, matrix_offset+4*2);
+		lut->e10 = read_s15Fixed16Number(src, matrix_offset+4*3);
+		lut->e11 = read_s15Fixed16Number(src, matrix_offset+4*4);
+		lut->e12 = read_s15Fixed16Number(src, matrix_offset+4*5);
+		lut->e20 = read_s15Fixed16Number(src, matrix_offset+4*6);
+		lut->e21 = read_s15Fixed16Number(src, matrix_offset+4*7);
+		lut->e22 = read_s15Fixed16Number(src, matrix_offset+4*8);
+		lut->e03 = read_s15Fixed16Number(src, matrix_offset+4*9);
+		lut->e13 = read_s15Fixed16Number(src, matrix_offset+4*10);
+		lut->e23 = read_s15Fixed16Number(src, matrix_offset+4*11);
+	}
+
+	if (a_curve_offset) {
+		read_nested_curveType(src, &lut->a_curves, num_in_channels, a_curve_offset);
+	}
+	if (m_curve_offset) {
+		read_nested_curveType(src, &lut->m_curves, num_out_channels, m_curve_offset);
+	}
+	if (b_curve_offset) {
+		read_nested_curveType(src, &lut->b_curves, num_out_channels, b_curve_offset);
+	} else {
+		invalid_source(src, "B curves required");
+	}
+
+	if (clut_offset) {
+		clut_precision = read_u8(src, clut_offset + 16);
+		if (clut_precision == 1) {
+			for (i = 0; i < clut_size; i++) {
+				lut->clut_table[i] = uInt8Number_to_float(read_uInt8Number(src, clut_offset + 20 + i*1));
+			}
+		} else if (clut_precision == 2) {
+			for (i = 0; i < clut_size; i++) {
+				lut->clut_table[i] = uInt16Number_to_float(read_uInt16Number(src, clut_offset + 20 + i*2));
+			}
+		} else {
+			invalid_source(src, "Invalid clut precision");
+		}
+	}
+
+	if (!src->valid) {
+		mAB_release(lut);
+		return NULL;
+	}
+
+	return lut;
+}
+
 static struct lutType *read_tag_lutType(struct mem_source *src, struct tag_index index, uint32_t tag_id)
 {
 	struct tag *tag = find_tag(index, tag_id);
 	uint32_t offset = tag->offset;
 	uint32_t type = read_u32(src, offset);
 	uint16_t num_input_table_entries;
 	uint16_t num_output_table_entries;
 	uint8_t in_chan, grid_points, out_chan;
+	uint32_t clut_offset, output_offset;
 	uint32_t clut_size;
+	size_t entry_size;
 	struct lutType *lut;
 	int i;
 
-	num_input_table_entries  = read_u16(src, offset + 48);
-	num_output_table_entries = read_u16(src, offset + 50);
+	/* I'm not sure why the spec specifies a fixed number of entries for LUT8 tables even though
+	 * they have room for the num_entries fields */
+	if (type == LUT8_TYPE) {
+		num_input_table_entries = 256;
+		num_output_table_entries = 256;
+		entry_size = 1;
+	} else if (type == LUT16_TYPE) {
+		num_input_table_entries  = read_u16(src, offset + 48);
+		num_output_table_entries = read_u16(src, offset + 50);
+		entry_size = 2;
+	} else {
+		assert(0); // the caller checks that this doesn't happen
+		invalid_source(src, "Unexpected lut type");
+		return NULL;
+	}
 
 	in_chan     = read_u8(src, offset + 8);
 	out_chan    = read_u8(src, offset + 9);
 	grid_points = read_u8(src, offset + 10);
 
-	if (!src->valid)
-		return NULL;
-
-	clut_size = in_chan * grid_points * out_chan;
-#define MAX_CLUT_SIZE 10000 // arbitrary
+	clut_size = pow(grid_points, in_chan);
 	if (clut_size > MAX_CLUT_SIZE) {
 		return NULL;
 	}
 
-	if (type != LUT16_TYPE && type != LUT8_TYPE)
+	if (in_chan != 3 || out_chan != 3) {
 		return NULL;
+	}
 
-	lut = malloc(sizeof(struct lutType) + (clut_size + num_input_table_entries + num_output_table_entries)*sizeof(uint8_t));
-	if (!lut)
+	lut = malloc(sizeof(struct lutType) + (num_input_table_entries * in_chan + clut_size*out_chan + num_output_table_entries * out_chan)*sizeof(float));
+	if (!lut) {
 		return NULL;
+	}
+
+	/* compute the offsets of tables */
+	lut->input_table  = &lut->table_data[0];
+	lut->clut_table   = &lut->table_data[in_chan*num_input_table_entries];
+	lut->output_table = &lut->table_data[in_chan*num_input_table_entries + clut_size*out_chan];
+
+	lut->num_input_table_entries  = num_input_table_entries;
+	lut->num_output_table_entries = num_output_table_entries;
 	lut->num_input_channels   = read_u8(src, offset + 8);
 	lut->num_output_channels  = read_u8(src, offset + 9);
 	lut->num_clut_grid_points = read_u8(src, offset + 10);
 	lut->e00 = read_s15Fixed16Number(src, offset+12);
 	lut->e01 = read_s15Fixed16Number(src, offset+16);
 	lut->e02 = read_s15Fixed16Number(src, offset+20);
 	lut->e10 = read_s15Fixed16Number(src, offset+24);
 	lut->e11 = read_s15Fixed16Number(src, offset+28);
 	lut->e12 = read_s15Fixed16Number(src, offset+32);
 	lut->e20 = read_s15Fixed16Number(src, offset+36);
 	lut->e21 = read_s15Fixed16Number(src, offset+40);
 	lut->e22 = read_s15Fixed16Number(src, offset+44);
 
-	//TODO: finish up
+	for (i = 0; i < lut->num_input_table_entries * in_chan; i++) {
+		if (type == LUT8_TYPE) {
+			lut->input_table[i] = uInt8Number_to_float(read_uInt8Number(src, offset + 52 + i * entry_size));
+		} else {
+			lut->input_table[i] = uInt16Number_to_float(read_uInt16Number(src, offset + 52 + i * entry_size));
+		}
+	}
+
+	clut_offset = offset + 52 + lut->num_input_table_entries * in_chan * entry_size;
+	for (i = 0; i < clut_size * out_chan; i+=3) {
+		if (type == LUT8_TYPE) {
+			lut->clut_table[i+0] = uInt8Number_to_float(read_uInt8Number(src, clut_offset + i*entry_size + 0));
+			lut->clut_table[i+1] = uInt8Number_to_float(read_uInt8Number(src, clut_offset + i*entry_size + 1));
+			lut->clut_table[i+2] = uInt8Number_to_float(read_uInt8Number(src, clut_offset + i*entry_size + 2));
+		} else {
+			lut->clut_table[i+0] = uInt16Number_to_float(read_uInt16Number(src, clut_offset + i*entry_size + 0));
+			lut->clut_table[i+1] = uInt16Number_to_float(read_uInt16Number(src, clut_offset + i*entry_size + 2));
+			lut->clut_table[i+2] = uInt16Number_to_float(read_uInt16Number(src, clut_offset + i*entry_size + 4));
+		}
+	}
+
+	output_offset = clut_offset + clut_size * out_chan * entry_size;
+	for (i = 0; i < lut->num_output_table_entries * out_chan; i++) {
+		if (type == LUT8_TYPE) {
+			lut->output_table[i] = uInt8Number_to_float(read_uInt8Number(src, output_offset + i*entry_size));
+		} else {
+			lut->output_table[i] = uInt16Number_to_float(read_uInt16Number(src, output_offset + i*entry_size));
+		}
+	}
+
 	return lut;
 }
 
 static void read_rendering_intent(qcms_profile *profile, struct mem_source *src)
 {
 	profile->rendering_intent = read_u32(src, 64);
 	switch (profile->rendering_intent) {
 		case QCMS_INTENT_PERCEPTUAL:
@@ -438,16 +764,18 @@ static void read_rendering_intent(qcms_p
 	}
 }
 
 qcms_profile *qcms_profile_create(void)
 {
 	return calloc(sizeof(qcms_profile), 1);
 }
 
+
+
 /* build sRGB gamma table */
 /* based on cmsBuildParametricGamma() */
 static uint16_t *build_sRGB_gamma_table(int num_entries)
 {
 	int i;
 	/* taken from lcms: Build_sRGBGamma() */
 	double gamma = 2.4;
 	double a = 1./1.055;
@@ -488,53 +816,46 @@ static uint16_t *build_sRGB_gamma_table(
 
 static struct curveType *curve_from_table(uint16_t *table, int num_entries)
 {
 	struct curveType *curve;
 	int i;
 	curve = malloc(sizeof(struct curveType) + sizeof(uInt16Number)*num_entries);
 	if (!curve)
 		return NULL;
+	curve->type = CURVE_TYPE;
 	curve->count = num_entries;
 	for (i = 0; i < num_entries; i++) {
 		curve->data[i] = table[i];
 	}
 	return curve;
 }
 
 static uint16_t float_to_u8Fixed8Number(float a)
 {
-	if (a > (255. + 255./256))
+	if (a > (255.f + 255.f/256))
 		return 0xffff;
-	else if (a < 0.)
+	else if (a < 0.f)
 		return 0;
 	else
-		return floor(a*256. + .5);
+		return floor(a*256.f + .5f);
 }
 
 static struct curveType *curve_from_gamma(float gamma)
 {
 	struct curveType *curve;
 	int num_entries = 1;
 	curve = malloc(sizeof(struct curveType) + sizeof(uInt16Number)*num_entries);
 	if (!curve)
 		return NULL;
 	curve->count = num_entries;
 	curve->data[0] = float_to_u8Fixed8Number(gamma);
 	return curve;
 }
 
-static void qcms_profile_fini(qcms_profile *profile)
-{
-	free(profile->redTRC);
-	free(profile->blueTRC);
-	free(profile->greenTRC);
-	free(profile->grayTRC);
-	free(profile);
-}
 
 //XXX: it would be nice if we had a way of ensuring
 // everything in a profile was initialized regardless of how it was created
 
 //XXX: should this also be taking a black_point?
 /* similar to CGColorSpaceCreateCalibratedRGB */
 qcms_profile* qcms_profile_create_rgb_with_gamma(
 		qcms_CIE_xyY white_point,
@@ -542,26 +863,26 @@ qcms_profile* qcms_profile_create_rgb_wi
 		float gamma)
 {
 	qcms_profile* profile = qcms_profile_create();
 	if (!profile)
 		return NO_MEM_PROFILE;
 
 	//XXX: should store the whitepoint
 	if (!set_rgb_colorants(profile, white_point, primaries)) {
-		qcms_profile_fini(profile);
+		qcms_profile_release(profile);
 		return INVALID_PROFILE;
 	}
 
 	profile->redTRC = curve_from_gamma(gamma);
 	profile->blueTRC = curve_from_gamma(gamma);
 	profile->greenTRC = curve_from_gamma(gamma);
 
 	if (!profile->redTRC || !profile->blueTRC || !profile->greenTRC) {
-		qcms_profile_fini(profile);
+		qcms_profile_release(profile);
 		return NO_MEM_PROFILE;
 	}
 	profile->class = DISPLAY_DEVICE_PROFILE;
 	profile->rendering_intent = QCMS_INTENT_PERCEPTUAL;
 	profile->color_space = RGB_SIGNATURE;
 	return profile;
 }
 
@@ -571,26 +892,26 @@ qcms_profile* qcms_profile_create_rgb_wi
 		uint16_t *table, int num_entries)
 {
 	qcms_profile* profile = qcms_profile_create();
 	if (!profile)
 		return NO_MEM_PROFILE;
 
 	//XXX: should store the whitepoint
 	if (!set_rgb_colorants(profile, white_point, primaries)) {
-		qcms_profile_fini(profile);
+		qcms_profile_release(profile);
 		return INVALID_PROFILE;
 	}
 
 	profile->redTRC = curve_from_table(table, num_entries);
 	profile->blueTRC = curve_from_table(table, num_entries);
 	profile->greenTRC = curve_from_table(table, num_entries);
 
 	if (!profile->redTRC || !profile->blueTRC || !profile->greenTRC) {
-		qcms_profile_fini(profile);
+		qcms_profile_release(profile);
 		return NO_MEM_PROFILE;
 	}
 	profile->class = DISPLAY_DEVICE_PROFILE;
 	profile->rendering_intent = QCMS_INTENT_PERCEPTUAL;
 	profile->color_space = RGB_SIGNATURE;
 	return profile;
 }
 
@@ -692,94 +1013,139 @@ qcms_profile* qcms_profile_from_memory(c
 	if (!profile)
 		return NO_MEM_PROFILE;
 
 	check_CMM_type_signature(src);
 	check_profile_version(src);
 	read_class_signature(profile, src);
 	read_rendering_intent(profile, src);
 	read_color_space(profile, src);
+	read_pcs(profile, src);
 	//TODO read rest of profile stuff
 
 	if (!src->valid)
 		goto invalid_profile;
 
 	index = read_tag_table(profile, src);
 	if (!src->valid || !index.tags)
 		goto invalid_tag_table;
 
-	if (profile->class == DISPLAY_DEVICE_PROFILE || profile->class == INPUT_DEVICE_PROFILE) {
-		if (profile->color_space == RGB_SIGNATURE) {
+	if (find_tag(index, TAG_CHAD)) {
+		profile->chromaticAdaption = read_tag_s15Fixed16ArrayType(src, index, TAG_CHAD);
+	} else {
+		profile->chromaticAdaption.invalid = true; //Signal the data is not present
+	}
 
-			profile->redColorant = read_tag_XYZType(src, index, TAG_rXYZ);
-			profile->blueColorant = read_tag_XYZType(src, index, TAG_bXYZ);
-			profile->greenColorant = read_tag_XYZType(src, index, TAG_gXYZ);
+	if (profile->class == DISPLAY_DEVICE_PROFILE || profile->class == INPUT_DEVICE_PROFILE ||
+            profile->class == OUTPUT_DEVICE_PROFILE  || profile->class == COLOR_SPACE_PROFILE) {
+		if (profile->color_space == RGB_SIGNATURE) {
+			if (find_tag(index, TAG_A2B0)) {
+				if (read_u32(src, find_tag(index, TAG_A2B0)->offset) == LUT8_TYPE ||
+				    read_u32(src, find_tag(index, TAG_A2B0)->offset) == LUT16_TYPE) {
+					profile->A2B0 = read_tag_lutType(src, index, TAG_A2B0);
+				} else if (read_u32(src, find_tag(index, TAG_A2B0)->offset) == LUT_MAB_TYPE) {
+					profile->mAB = read_tag_lutmABType(src, index, TAG_A2B0);
+				}
+			}
+			if (find_tag(index, TAG_B2A0)) {
+				if (read_u32(src, find_tag(index, TAG_B2A0)->offset) == LUT8_TYPE ||
+				    read_u32(src, find_tag(index, TAG_B2A0)->offset) == LUT16_TYPE) {
+					profile->B2A0 = read_tag_lutType(src, index, TAG_B2A0);
+				} else if (read_u32(src, find_tag(index, TAG_B2A0)->offset) == LUT_MBA_TYPE) {
+					profile->mBA = read_tag_lutmABType(src, index, TAG_B2A0);
+				}
+			}
+			if (find_tag(index, TAG_rXYZ) || !qcms_supports_iccv4) {
+				profile->redColorant = read_tag_XYZType(src, index, TAG_rXYZ);
+				profile->greenColorant = read_tag_XYZType(src, index, TAG_gXYZ);
+				profile->blueColorant = read_tag_XYZType(src, index, TAG_bXYZ);
+			}
 
 			if (!src->valid)
 				goto invalid_tag_table;
 
-			profile->redTRC = read_tag_curveType(src, index, TAG_rTRC);
-			profile->blueTRC = read_tag_curveType(src, index, TAG_bTRC);
-			profile->greenTRC = read_tag_curveType(src, index, TAG_gTRC);
+			if (find_tag(index, TAG_rTRC) || !qcms_supports_iccv4) {
+				profile->redTRC = read_tag_curveType(src, index, TAG_rTRC);
+				profile->greenTRC = read_tag_curveType(src, index, TAG_gTRC);
+				profile->blueTRC = read_tag_curveType(src, index, TAG_bTRC);
 
-			if (!profile->redTRC || !profile->blueTRC || !profile->greenTRC)
-				goto invalid_tag_table;
-
+				if (!profile->redTRC || !profile->blueTRC || !profile->greenTRC)
+					goto invalid_tag_table;
+			}
 		} else if (profile->color_space == GRAY_SIGNATURE) {
 
 			profile->grayTRC = read_tag_curveType(src, index, TAG_kTRC);
 			if (!profile->grayTRC)
 				goto invalid_tag_table;
 
 		} else {
+			assert(0 && "read_color_space protects against entering here");
 			goto invalid_tag_table;
 		}
-	} else if (0 && profile->class == OUTPUT_DEVICE_PROFILE) {
-		profile->A2B0 = read_tag_lutType(src, index, TAG_A2B0);
 	} else {
 		goto invalid_tag_table;
 	}
 
 	if (!src->valid)
 		goto invalid_tag_table;
 
 	free(index.tags);
 
 	return profile;
 
 invalid_tag_table:
 	free(index.tags);
 invalid_profile:
-	qcms_profile_fini(profile);
+	qcms_profile_release(profile);
 	return INVALID_PROFILE;
 }
 
 qcms_intent qcms_profile_get_rendering_intent(qcms_profile *profile)
 {
 	return profile->rendering_intent;
 }
 
 icColorSpaceSignature
 qcms_profile_get_color_space(qcms_profile *profile)
 {
 	return profile->color_space;
 }
 
+static void lut_release(struct lutType *lut)
+{
+	free(lut);
+}
+
 void qcms_profile_release(qcms_profile *profile)
 {
 	if (profile->output_table_r)
 		precache_release(profile->output_table_r);
 	if (profile->output_table_g)
 		precache_release(profile->output_table_g);
 	if (profile->output_table_b)
 		precache_release(profile->output_table_b);
 
-	qcms_profile_fini(profile);
+	if (profile->A2B0)
+		lut_release(profile->A2B0);
+	if (profile->B2A0)
+		lut_release(profile->B2A0);
+
+	if (profile->mAB)
+		mAB_release(profile->mAB);
+	if (profile->mBA)
+		mAB_release(profile->mBA);
+
+	free(profile->redTRC);
+	free(profile->blueTRC);
+	free(profile->greenTRC);
+	free(profile->grayTRC);
+	free(profile);
 }
 
+
 #include <stdio.h>
 qcms_profile* qcms_profile_from_file(FILE *file)
 {
 	uint32_t length, remaining_length;
 	qcms_profile *profile;
 	size_t read_length;
 	be32 length_be;
 	void *data;
new file mode 100644
--- /dev/null
+++ b/gfx/qcms/matrix.c
@@ -0,0 +1,136 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining 
+// a copy of this software and associated documentation files (the "Software"), 
+// to deal in the Software without restriction, including without limitation 
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in 
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#include <stdlib.h>
+#include "qcmsint.h"
+#include "matrix.h"
+
+struct vector matrix_eval(struct matrix mat, struct vector v)
+{
+	struct vector result;
+	result.v[0] = mat.m[0][0]*v.v[0] + mat.m[0][1]*v.v[1] + mat.m[0][2]*v.v[2];
+	result.v[1] = mat.m[1][0]*v.v[0] + mat.m[1][1]*v.v[1] + mat.m[1][2]*v.v[2];
+	result.v[2] = mat.m[2][0]*v.v[0] + mat.m[2][1]*v.v[1] + mat.m[2][2]*v.v[2];
+	return result;
+}
+
+//XXX: should probably pass by reference and we could
+//probably reuse this computation in matrix_invert
+float matrix_det(struct matrix mat)
+{
+	float det;
+	det = mat.m[0][0]*mat.m[1][1]*mat.m[2][2] +
+		mat.m[0][1]*mat.m[1][2]*mat.m[2][0] +
+		mat.m[0][2]*mat.m[1][0]*mat.m[2][1] -
+		mat.m[0][0]*mat.m[1][2]*mat.m[2][1] -
+		mat.m[0][1]*mat.m[1][0]*mat.m[2][2] -
+		mat.m[0][2]*mat.m[1][1]*mat.m[2][0];
+	return det;
+}
+
+/* from pixman and cairo and Mathematics for Game Programmers */
+/* lcms uses gauss-jordan elimination with partial pivoting which is
+ * less efficient and not as numerically stable. See Mathematics for
+ * Game Programmers. */
+struct matrix matrix_invert(struct matrix mat)
+{
+	struct matrix dest_mat;
+	int i,j;
+	static int a[3] = { 2, 2, 1 };
+	static int b[3] = { 1, 0, 0 };
+
+	/* inv  (A) = 1/det (A) * adj (A) */
+	float det = matrix_det(mat);
+
+	if (det == 0) {
+		dest_mat.invalid = true;
+	} else {
+		dest_mat.invalid = false;
+	}
+
+	det = 1/det;
+
+	for (j = 0; j < 3; j++) {
+		for (i = 0; i < 3; i++) {
+			double p;
+			int ai = a[i];
+			int aj = a[j];
+			int bi = b[i];
+			int bj = b[j];
+
+			p = mat.m[ai][aj] * mat.m[bi][bj] -
+				mat.m[ai][bj] * mat.m[bi][aj];
+			if (((i + j) & 1) != 0)
+				p = -p;
+
+			dest_mat.m[j][i] = det * p;
+		}
+	}
+	return dest_mat;
+}
+
+struct matrix matrix_identity(void)
+{
+	struct matrix i;
+	i.m[0][0] = 1;
+	i.m[0][1] = 0;
+	i.m[0][2] = 0;
+	i.m[1][0] = 0;
+	i.m[1][1] = 1;
+	i.m[1][2] = 0;
+	i.m[2][0] = 0;
+	i.m[2][1] = 0;
+	i.m[2][2] = 1;
+	i.invalid = false;
+	return i;
+}
+
+struct matrix matrix_invalid(void)
+{
+	struct matrix inv = matrix_identity();
+	inv.invalid = true;
+	return inv;
+}
+
+
+/* from pixman */
+/* MAT3per... */
+struct matrix matrix_multiply(struct matrix a, struct matrix b)
+{
+	struct matrix result;
+	int dx, dy;
+	int o;
+	for (dy = 0; dy < 3; dy++) {
+		for (dx = 0; dx < 3; dx++) {
+			double v = 0;
+			for (o = 0; o < 3; o++) {
+				v += a.m[dy][o] * b.m[o][dx];
+			}
+			result.m[dy][dx] = v;
+		}
+	}
+	result.invalid = a.invalid || b.invalid;
+	return result;
+}
+
+
new file mode 100644
--- /dev/null
+++ b/gfx/qcms/matrix.h
@@ -0,0 +1,39 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining 
+// a copy of this software and associated documentation files (the "Software"), 
+// to deal in the Software without restriction, including without limitation 
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in 
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _QCMS_MATRIX_H
+#define _QCMS_MATRIX_H
+
+struct vector {
+        float v[3];
+};
+
+struct vector matrix_eval(struct matrix mat, struct vector v);
+float matrix_det(struct matrix mat);
+struct matrix matrix_identity(void);
+struct matrix matrix_multiply(struct matrix a, struct matrix b);
+struct matrix matrix_invert(struct matrix mat);
+
+struct matrix matrix_invalid(void);
+
+#endif
--- a/gfx/qcms/qcms.h
+++ b/gfx/qcms/qcms.h
@@ -142,15 +142,15 @@ qcms_transform* qcms_transform_create(
 		qcms_profile *in, qcms_data_type in_type,
 		qcms_profile* out, qcms_data_type out_type,
 		qcms_intent intent);
 
 void qcms_transform_release(qcms_transform *);
 
 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length);
 
-
+void qcms_enable_iccv4();
 
 #ifdef  __cplusplus
 }
 #endif
 
 #endif
--- a/gfx/qcms/qcmsint.h
+++ b/gfx/qcms/qcmsint.h
@@ -1,8 +1,9 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
 #include "qcms.h"
 #include "qcmstypes.h"
 
 /* used as a lookup table for the output transformation.
  * we refcount them so we only need to have one around per output
  * profile, instead of duplicating them per transform */
 struct precache_output
 {
@@ -24,16 +25,29 @@ struct precache_output
 #endif
 
 struct _qcms_transform {
 	float ALIGN matrix[3][4];
 	float *input_gamma_table_r;
 	float *input_gamma_table_g;
 	float *input_gamma_table_b;
 
+	float *input_clut_table_r;
+	float *input_clut_table_g;
+	float *input_clut_table_b;
+	uint16_t input_clut_table_length;
+	float *r_clut;
+	float *g_clut;
+	float *b_clut;
+	uint16_t grid_size;
+	float *output_clut_table_r;
+	float *output_clut_table_g;
+	float *output_clut_table_b;
+	uint16_t output_clut_table_length;
+ 
 	float *input_gamma_table_gray;
 
 	float out_gamma_r;
 	float out_gamma_g;
 	float out_gamma_b;
 
 	float out_gamma_gray;
 
@@ -51,31 +65,97 @@ struct _qcms_transform {
 
 	struct precache_output *output_table_r;
 	struct precache_output *output_table_g;
 	struct precache_output *output_table_b;
 
 	void (*transform_fn)(struct _qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length);
 };
 
+struct matrix {
+	float m[3][3];
+	bool invalid;
+};
+struct qcms_modular_transform {
+	struct matrix matrix;
+	float tx, ty, tz;
+
+	float *input_clut_table_r;
+	float *input_clut_table_g;
+	float *input_clut_table_b;
+	uint16_t input_clut_table_length;
+	float *r_clut;
+	float *g_clut;
+	float *b_clut;
+	uint16_t grid_size;
+	float *output_clut_table_r;
+	float *output_clut_table_g;
+	float *output_clut_table_b;
+	uint16_t output_clut_table_length;
+ 
+	uint16_t *output_gamma_lut_r;
+	uint16_t *output_gamma_lut_g;
+	uint16_t *output_gamma_lut_b;
+
+	size_t output_gamma_lut_r_length;
+	size_t output_gamma_lut_g_length;
+	size_t output_gamma_lut_b_length;
+
+	void (*transform_module_fn)(struct qcms_modular_transform *transform, float *src, float *dest, size_t length);	
+	struct qcms_modular_transform *next_transform;
+};
+
 typedef int32_t s15Fixed16Number;
 typedef uint16_t uInt16Number;
+typedef uint8_t uInt8Number;
 
 struct XYZNumber {
 	s15Fixed16Number X;
 	s15Fixed16Number Y;
 	s15Fixed16Number Z;
 };
 
 struct curveType {
+	uint32_t type;
 	uint32_t count;
+	float parameter[7];
 	uInt16Number data[];
 };
 
-struct lutType {
+struct lutmABType {
+	uint8_t num_in_channels;
+	uint8_t num_out_channels;
+	// 16 is the upperbound, actual is 0..num_in_channels.
+	uint8_t num_grid_points[16];
+
+	s15Fixed16Number e00;
+	s15Fixed16Number e01;
+	s15Fixed16Number e02;
+	s15Fixed16Number e03;
+	s15Fixed16Number e10;
+	s15Fixed16Number e11;
+	s15Fixed16Number e12;
+	s15Fixed16Number e13;
+	s15Fixed16Number e20;
+	s15Fixed16Number e21;
+	s15Fixed16Number e22;
+	s15Fixed16Number e23;
+
+	// reversed elements (for mBA)
+	bool reversed;
+
+	float *clut_table;
+	struct curveType *a_curves[10];
+	struct curveType *b_curves[10];
+	struct curveType *m_curves[10];
+	float clut_table_data[];
+};
+
+/* should lut8Type and lut16Type be different types? */
+struct lutType { // used by lut8Type/lut16Type (mft2) only
 	uint8_t num_input_channels;
 	uint8_t num_output_channels;
 	uint8_t num_clut_grid_points;
 
 	s15Fixed16Number e00;
 	s15Fixed16Number e01;
 	s15Fixed16Number e02;
 	s15Fixed16Number e10;
@@ -83,19 +163,21 @@ struct lutType {
 	s15Fixed16Number e12;
 	s15Fixed16Number e20;
 	s15Fixed16Number e21;
 	s15Fixed16Number e22;
 
 	uint16_t num_input_table_entries;
 	uint16_t num_output_table_entries;
 
-	uint16_t *input_table;
-	uint16_t *clut_table;
-	uint16_t *output_table;
+	float *input_table;
+	float *clut_table;
+	float *output_table;
+
+	float table_data[];
 };
 #if 0
 /* this is from an intial idea of having the struct correspond to the data in
  * the file. I decided that it wasn't a good idea.
  */
 struct tag_value {
 	uint32_t type;
 	union {
@@ -108,49 +190,69 @@ struct tag_value {
 			} XYZNumber;
 		} XYZType;
 	};
 }; // I guess we need to pack this?
 #endif
 
 #define RGB_SIGNATURE  0x52474220
 #define GRAY_SIGNATURE 0x47524159
+#define XYZ_SIGNATURE  0x58595A20
+#define LAB_SIGNATURE  0x4C616220
 
 struct _qcms_profile {
 	uint32_t class;
 	uint32_t color_space;
+	uint32_t pcs;
 	qcms_intent rendering_intent;
 	struct XYZNumber redColorant;
 	struct XYZNumber blueColorant;
 	struct XYZNumber greenColorant;
 	struct curveType *redTRC;
 	struct curveType *blueTRC;
 	struct curveType *greenTRC;
 	struct curveType *grayTRC;
 	struct lutType *A2B0;
+	struct lutType *B2A0;
+	struct lutmABType *mAB;
+	struct lutmABType *mBA;
+	struct matrix chromaticAdaption;
 
 	struct precache_output *output_table_r;
 	struct precache_output *output_table_g;
 	struct precache_output *output_table_b;
 };
 
 #ifdef _MSC_VER
 #define inline _inline
 #endif
 
+/* produces the nearest float to 'a' with a maximum error
+ * of 1/1024 which happens for large values like 0x40000040 */
 static inline float s15Fixed16Number_to_float(s15Fixed16Number a)
 {
 	return ((int32_t)a)/65536.f;
 }
 
 static inline s15Fixed16Number double_to_s15Fixed16Number(double v)
 {
 	return (int32_t)(v*65536);
 }
 
+static inline float uInt8Number_to_float(uInt8Number a)
+{
+	return ((int32_t)a)/255.f;
+}
+
+static inline float uInt16Number_to_float(uInt16Number a)
+{
+	return ((int32_t)a)/65535.f;
+}
+
+
 void precache_release(struct precache_output *p);
 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries);
 
 void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
 void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
@@ -160,8 +262,10 @@ void qcms_transform_data_rgba_out_lut_ss
 void qcms_transform_data_rgb_out_lut_sse1(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
 void qcms_transform_data_rgba_out_lut_sse1(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length);
+
+extern qcms_bool qcms_supports_iccv4;
--- a/gfx/qcms/transform.c
+++ b/gfx/qcms/transform.c
@@ -1,8 +1,9 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
 //  qcms
 //  Copyright (C) 2009 Mozilla Corporation
 //  Copyright (C) 1998-2007 Marti Maria
 //
 // Permission is hereby granted, free of charge, to any person obtaining 
 // a copy of this software and associated documentation files (the "Software"), 
 // to deal in the Software without restriction, including without limitation 
 // the rights to use, copy, modify, merge, publish, distribute, sublicense, 
@@ -18,392 +19,27 @@
 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include <stdlib.h>
 #include <math.h>
 #include <assert.h>
+#include <string.h> //memcpy
 #include "qcmsint.h"
+#include "chain.h"
+#include "matrix.h"
+#include "transform_util.h"
 
 /* for MSVC, GCC, Intel, and Sun compilers */
 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
 #define X86
 #endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
 
-//XXX: could use a bettername
-typedef uint16_t uint16_fract_t;
-
-/* value must be a value between 0 and 1 */
-//XXX: is the above a good restriction to have?
-float lut_interp_linear(double value, uint16_t *table, int length)
-{
-	int upper, lower;
-	value = value * (length - 1); // scale to length of the array
-	upper = ceil(value);
-	lower = floor(value);
-	//XXX: can we be more performant here?
-	value = table[upper]*(1. - (upper - value)) + table[lower]*(upper - value);
-	/* scale the value */
-	return value * (1./65535.);
-}
-
-/* same as above but takes and returns a uint16_t value representing a range from 0..1 */
-uint16_t lut_interp_linear16(uint16_t input_value, uint16_t *table, int length)
-{
-	/* Start scaling input_value to the length of the array: 65535*(length-1).
-	 * We'll divide out the 65535 next */
-	uint32_t value = (input_value * (length - 1));
-	uint32_t upper = (value + 65534) / 65535; /* equivalent to ceil(value/65535) */
-	uint32_t lower = value / 65535;           /* equivalent to floor(value/65535) */
-	/* interp is the distance from upper to value scaled to 0..65535 */
-	uint32_t interp = value % 65535;
-
-	value = (table[upper]*(interp) + table[lower]*(65535 - interp))/65535; // 0..65535*65535
-
-	return value;
-}
-
-/* same as above but takes an input_value from 0..PRECACHE_OUTPUT_MAX
- * and returns a uint8_t value representing a range from 0..1 */
-static
-uint8_t lut_interp_linear_precache_output(uint32_t input_value, uint16_t *table, int length)
-{
-	/* Start scaling input_value to the length of the array: PRECACHE_OUTPUT_MAX*(length-1).
-	 * We'll divide out the PRECACHE_OUTPUT_MAX next */
-	uint32_t value = (input_value * (length - 1));
-
-	/* equivalent to ceil(value/PRECACHE_OUTPUT_MAX) */
-	uint32_t upper = (value + PRECACHE_OUTPUT_MAX-1) / PRECACHE_OUTPUT_MAX;
-	/* equivalent to floor(value/PRECACHE_OUTPUT_MAX) */
-	uint32_t lower = value / PRECACHE_OUTPUT_MAX;
-	/* interp is the distance from upper to value scaled to 0..PRECACHE_OUTPUT_MAX */
-	uint32_t interp = value % PRECACHE_OUTPUT_MAX;
-
-	/* the table values range from 0..65535 */
-	value = (table[upper]*(interp) + table[lower]*(PRECACHE_OUTPUT_MAX - interp)); // 0..(65535*PRECACHE_OUTPUT_MAX)
-
-	/* round and scale */
-	value += (PRECACHE_OUTPUT_MAX*65535/255)/2;
-        value /= (PRECACHE_OUTPUT_MAX*65535/255); // scale to 0..255
-	return value;
-}
-
-#if 0
-/* if we use a different representation i.e. one that goes from 0 to 0x1000 we can be more efficient
- * because we can avoid the divisions and use a shifting instead */
-/* same as above but takes and returns a uint16_t value representing a range from 0..1 */
-uint16_t lut_interp_linear16(uint16_t input_value, uint16_t *table, int length)
-{
-	uint32_t value = (input_value * (length - 1));
-	uint32_t upper = (value + 4095) / 4096; /* equivalent to ceil(value/4096) */
-	uint32_t lower = value / 4096;           /* equivalent to floor(value/4096) */
-	uint32_t interp = value % 4096;
-
-	value = (table[upper]*(interp) + table[lower]*(4096 - interp))/4096; // 0..4096*4096
-
-	return value;
-}
-#endif
-
-void compute_curve_gamma_table_type1(float gamma_table[256], double gamma)
-{
-	unsigned int i;
-	for (i = 0; i < 256; i++) {
-		gamma_table[i] = pow(i/255., gamma);
-	}
-}
-
-void compute_curve_gamma_table_type2(float gamma_table[256], uint16_t *table, int length)
-{
-	unsigned int i;
-	for (i = 0; i < 256; i++) {
-		gamma_table[i] = lut_interp_linear(i/255., table, length);
-	}
-}
-
-void compute_curve_gamma_table_type0(float gamma_table[256])
-{
-	unsigned int i;
-	for (i = 0; i < 256; i++) {
-		gamma_table[i] = i/255.;
-	}
-}
-
-unsigned char clamp_u8(float v)
-{
-	if (v > 255.)
-		return 255;
-	else if (v < 0)
-		return 0;
-	else
-		return floor(v+.5);
-}
-
-struct vector {
-	float v[3];
-};
-
-struct matrix {
-	float m[3][3];
-	bool invalid;
-};
-
-struct vector matrix_eval(struct matrix mat, struct vector v)
-{
-	struct vector result;
-	result.v[0] = mat.m[0][0]*v.v[0] + mat.m[0][1]*v.v[1] + mat.m[0][2]*v.v[2];
-	result.v[1] = mat.m[1][0]*v.v[0] + mat.m[1][1]*v.v[1] + mat.m[1][2]*v.v[2];
-	result.v[2] = mat.m[2][0]*v.v[0] + mat.m[2][1]*v.v[1] + mat.m[2][2]*v.v[2];
-	return result;
-}
-
-//XXX: should probably pass by reference and we could
-//probably reuse this computation in matrix_invert
-float matrix_det(struct matrix mat)
-{
-	float det;
-	det = mat.m[0][0]*mat.m[1][1]*mat.m[2][2] +
-		mat.m[0][1]*mat.m[1][2]*mat.m[2][0] +
-		mat.m[0][2]*mat.m[1][0]*mat.m[2][1] -
-		mat.m[0][0]*mat.m[1][2]*mat.m[2][1] -
-		mat.m[0][1]*mat.m[1][0]*mat.m[2][2] -
-		mat.m[0][2]*mat.m[1][1]*mat.m[2][0];
-	return det;
-}
-
-/* from pixman and cairo and Mathematics for Game Programmers */
-/* lcms uses gauss-jordan elimination with partial pivoting which is
- * less efficient and not as numerically stable. See Mathematics for
- * Game Programmers. */
-struct matrix matrix_invert(struct matrix mat)
-{
-	struct matrix dest_mat;
-	int i,j;
-	static int a[3] = { 2, 2, 1 };
-	static int b[3] = { 1, 0, 0 };
-
-	/* inv  (A) = 1/det (A) * adj (A) */
-	float det = matrix_det(mat);
-
-	if (det == 0) {
-		dest_mat.invalid = true;
-	} else {
-		dest_mat.invalid = false;
-	}
-
-	det = 1/det;
-
-	for (j = 0; j < 3; j++) {
-		for (i = 0; i < 3; i++) {
-			double p;
-			int ai = a[i];
-			int aj = a[j];
-			int bi = b[i];
-			int bj = b[j];
-
-			p = mat.m[ai][aj] * mat.m[bi][bj] -
-				mat.m[ai][bj] * mat.m[bi][aj];
-			if (((i + j) & 1) != 0)
-				p = -p;
-
-			dest_mat.m[j][i] = det * p;
-		}
-	}
-	return dest_mat;
-}
-
-struct matrix matrix_identity(void)
-{
-	struct matrix i;
-	i.m[0][0] = 1;
-	i.m[0][1] = 0;
-	i.m[0][2] = 0;
-	i.m[1][0] = 0;
-	i.m[1][1] = 1;
-	i.m[1][2] = 0;
-	i.m[2][0] = 0;
-	i.m[2][1] = 0;
-	i.m[2][2] = 1;
-	i.invalid = false;
-	return i;
-}
-
-static struct matrix matrix_invalid(void)
-{
-	struct matrix inv = matrix_identity();
-	inv.invalid = true;
-	return inv;
-}
-
-
-/* from pixman */
-/* MAT3per... */
-struct matrix matrix_multiply(struct matrix a, struct matrix b)
-{
-	struct matrix result;
-	int dx, dy;
-	int o;
-	for (dy = 0; dy < 3; dy++) {
-		for (dx = 0; dx < 3; dx++) {
-			double v = 0;
-			for (o = 0; o < 3; o++) {
-				v += a.m[dy][o] * b.m[o][dx];
-			}
-			result.m[dy][dx] = v;
-		}
-	}
-	result.invalid = a.invalid || b.invalid;
-	return result;
-}
-
-float u8Fixed8Number_to_float(uint16_t x)
-{
-	// 0x0000 = 0.
-	// 0x0100 = 1.
-	// 0xffff = 255  + 255/256
-	return x/256.;
-}
-
-float *build_input_gamma_table(struct curveType *TRC)
-{
-	float *gamma_table = malloc(sizeof(float)*256);
-	if (gamma_table) {
-		if (TRC->count == 0) {
-			compute_curve_gamma_table_type0(gamma_table);
-		} else if (TRC->count == 1) {
-			compute_curve_gamma_table_type1(gamma_table, u8Fixed8Number_to_float(TRC->data[0]));
-		} else {
-			compute_curve_gamma_table_type2(gamma_table, TRC->data, TRC->count);
-		}
-	}
-	return gamma_table;
-}
-
-struct matrix build_colorant_matrix(qcms_profile *p)
-{
-	struct matrix result;
-	result.m[0][0] = s15Fixed16Number_to_float(p->redColorant.X);
-	result.m[0][1] = s15Fixed16Number_to_float(p->greenColorant.X);
-	result.m[0][2] = s15Fixed16Number_to_float(p->blueColorant.X);
-	result.m[1][0] = s15Fixed16Number_to_float(p->redColorant.Y);
-	result.m[1][1] = s15Fixed16Number_to_float(p->greenColorant.Y);
-	result.m[1][2] = s15Fixed16Number_to_float(p->blueColorant.Y);
-	result.m[2][0] = s15Fixed16Number_to_float(p->redColorant.Z);
-	result.m[2][1] = s15Fixed16Number_to_float(p->greenColorant.Z);
-	result.m[2][2] = s15Fixed16Number_to_float(p->blueColorant.Z);
-	result.invalid = false;
-	return result;
-}
-
-/* The following code is copied nearly directly from lcms.
- * I think it could be much better. For example, Argyll seems to have better code in
- * icmTable_lookup_bwd and icmTable_setup_bwd. However, for now this is a quick way
- * to a working solution and allows for easy comparing with lcms. */
-uint16_fract_t lut_inverse_interp16(uint16_t Value, uint16_t LutTable[], int length)
-{
-        int l = 1;
-        int r = 0x10000;
-        int x = 0, res;       // 'int' Give spacing for negative values
-        int NumZeroes, NumPoles;
-        int cell0, cell1;
-        double val2;
-        double y0, y1, x0, x1;
-        double a, b, f;
-
-        // July/27 2001 - Expanded to handle degenerated curves with an arbitrary
-        // number of elements containing 0 at the begining of the table (Zeroes)
-        // and another arbitrary number of poles (FFFFh) at the end.
-        // First the zero and pole extents are computed, then value is compared.
-
-        NumZeroes = 0;
-        while (LutTable[NumZeroes] == 0 && NumZeroes < length-1)
-                        NumZeroes++;
-
-        // There are no zeros at the beginning and we are trying to find a zero, so
-        // return anything. It seems zero would be the less destructive choice
-	/* I'm not sure that this makes sense, but oh well... */
-        if (NumZeroes == 0 && Value == 0)
-            return 0;
-
-        NumPoles = 0;
-        while (LutTable[length-1- NumPoles] == 0xFFFF && NumPoles < length-1)
-                        NumPoles++;
-
-        // Does the curve belong to this case?
-        if (NumZeroes > 1 || NumPoles > 1)
-        {               
-                int a, b;
-
-                // Identify if value fall downto 0 or FFFF zone             
-                if (Value == 0) return 0;
-               // if (Value == 0xFFFF) return 0xFFFF;
-
-                // else restrict to valid zone
-
-                a = ((NumZeroes-1) * 0xFFFF) / (length-1);               
-                b = ((length-1 - NumPoles) * 0xFFFF) / (length-1);
-                                                                
-                l = a - 1;
-                r = b + 1;
-        }
-
-
-        // Seems not a degenerated case... apply binary search
-
-        while (r > l) {
-
-                x = (l + r) / 2;
-
-		res = (int) lut_interp_linear16((uint16_fract_t) (x-1), LutTable, length);
-
-                if (res == Value) {
-
-                    // Found exact match. 
-                    
-                    return (uint16_fract_t) (x - 1);
-                }
-
-                if (res > Value) r = x - 1;
-                else l = x + 1;
-        }
-
-        // Not found, should we interpolate?
-
-                
-        // Get surrounding nodes
-        
-        val2 = (length-1) * ((double) (x - 1) / 65535.0);
-
-        cell0 = (int) floor(val2);
-        cell1 = (int) ceil(val2);
-           
-        if (cell0 == cell1) return (uint16_fract_t) x;
-
-        y0 = LutTable[cell0] ;
-        x0 = (65535.0 * cell0) / (length-1); 
-
-        y1 = LutTable[cell1] ;
-        x1 = (65535.0 * cell1) / (length-1);
-
-        a = (y1 - y0) / (x1 - x0);
-        b = y0 - a * x0;
-
-        if (fabs(a) < 0.01) return (uint16_fract_t) x;
-
-        f = ((Value - b) / a);
-
-        if (f < 0.0) return (uint16_fract_t) 0;
-        if (f >= 65535.0) return (uint16_fract_t) 0xFFFF;
-
-        return (uint16_fract_t) floor(f + 0.5);                        
-        
-}
-
 // Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
 // This is just an approximation, I am not handling all the non-linear
 // aspects of the RGB to XYZ process, and assumming that the gamma correction
 // has transitive property in the tranformation chain.
 //
 // the alghoritm:
 //
 //            - First I build the absolute conversion matrix using
@@ -588,89 +224,16 @@ qcms_bool set_rgb_colorants(qcms_profile
 
 	profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
 	profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
 	profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
 
 	return true;
 }
 
-/*
- The number of entries needed to invert a lookup table should not
- necessarily be the same as the original number of entries.  This is
- especially true of lookup tables that have a small number of entries.
-
- For example:
- Using a table like:
-    {0, 3104, 14263, 34802, 65535}
- invert_lut will produce an inverse of:
-    {3, 34459, 47529, 56801, 65535}
- which has an maximum error of about 9855 (pixel difference of ~38.346)
-
- For now, we punt the decision of output size to the caller. */
-static uint16_t *invert_lut(uint16_t *table, int length, int out_length)
-{
-	int i;
-	/* for now we invert the lut by creating a lut of size out_length
-	 * and attempting to lookup a value for each entry using lut_inverse_interp16 */
-	uint16_t *output = malloc(sizeof(uint16_t)*out_length);
-	if (!output)
-		return NULL;
-
-	for (i = 0; i < out_length; i++) {
-		double x = ((double) i * 65535.) / (double) (out_length - 1);
-		uint16_fract_t input = floor(x + .5);
-		output[i] = lut_inverse_interp16(input, table, length);
-	}
-	return output;
-}
-
-static uint16_t *build_linear_table(int length)
-{
-	int i;
-	uint16_t *output = malloc(sizeof(uint16_t)*length);
-	if (!output)
-		return NULL;
-
-	for (i = 0; i < length; i++) {
-		double x = ((double) i * 65535.) / (double) (length - 1);
-		uint16_fract_t input = floor(x + .5);
-		output[i] = input;
-	}
-	return output;
-}
-
-static uint16_t *build_pow_table(float gamma, int length)
-{
-	int i;
-	uint16_t *output = malloc(sizeof(uint16_t)*length);
-	if (!output)
-		return NULL;
-
-	for (i = 0; i < length; i++) {
-		uint16_fract_t result;
-		double x = ((double) i) / (double) (length - 1);
-		x = pow(x, gamma);
-                //XXX turn this conversion into a function
-		result = floor(x*65535. + .5);
-		output[i] = result;
-	}
-	return output;
-}
-
-static float clamp_float(float a)
-{
-	if (a > 1.f)
-		return 1.f;
-	else if (a < 0.f)
-		return 0.f;
-	else
-		return a;
-}
-
 #if 0
 static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 {
 	int i;
 	float (*mat)[4] = transform->matrix;
 	for (i=0; i<length; i++) {
 		unsigned char device_r = *src++;
 		unsigned char device_g = *src++;
@@ -843,16 +406,302 @@ static void qcms_transform_data_rgba_out
 
 		*dest++ = transform->output_table_r->data[r];
 		*dest++ = transform->output_table_g->data[g];
 		*dest++ = transform->output_table_b->data[b];
 		*dest++ = alpha;
 	}
 }
 
+// Not used
+/* 
+static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
+	unsigned int i;
+	int xy_len = 1;
+	int x_len = transform->grid_size;
+	int len = x_len * x_len;
+	float* r_table = transform->r_clut;
+	float* g_table = transform->g_clut;
+	float* b_table = transform->b_clut;
+  
+	for (i = 0; i < length; i++) {
+		unsigned char in_r = *src++;
+		unsigned char in_g = *src++;
+		unsigned char in_b = *src++;
+		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
+
+		int x = floor(linear_r * (transform->grid_size-1));
+		int y = floor(linear_g * (transform->grid_size-1));
+		int z = floor(linear_b * (transform->grid_size-1));
+		int x_n = ceil(linear_r * (transform->grid_size-1));
+		int y_n = ceil(linear_g * (transform->grid_size-1));
+		int z_n = ceil(linear_b * (transform->grid_size-1));
+		float x_d = linear_r * (transform->grid_size-1) - x; 
+		float y_d = linear_g * (transform->grid_size-1) - y;
+		float z_d = linear_b * (transform->grid_size-1) - z; 
+
+		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
+		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
+		float r_y1 = lerp(r_x1, r_x2, y_d);
+		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
+		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
+		float r_y2 = lerp(r_x3, r_x4, y_d);
+		float clut_r = lerp(r_y1, r_y2, z_d);
+
+		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
+		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
+		float g_y1 = lerp(g_x1, g_x2, y_d);
+		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
+		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
+		float g_y2 = lerp(g_x3, g_x4, y_d);
+		float clut_g = lerp(g_y1, g_y2, z_d);
+
+		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
+		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
+		float b_y1 = lerp(b_x1, b_x2, y_d);
+		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
+		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
+		float b_y2 = lerp(b_x3, b_x4, y_d);
+		float clut_b = lerp(b_y1, b_y2, z_d);
+
+		*dest++ = clamp_u8(clut_r*255.0f);
+		*dest++ = clamp_u8(clut_g*255.0f);
+		*dest++ = clamp_u8(clut_b*255.0f);
+	}	
+}
+*/
+
+// Using lcms' tetra interpolation algorithm.
+static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
+	unsigned int i;
+	int xy_len = 1;
+	int x_len = transform->grid_size;
+	int len = x_len * x_len;
+	float* r_table = transform->r_clut;
+	float* g_table = transform->g_clut;
+	float* b_table = transform->b_clut;
+	float c0_r, c1_r, c2_r, c3_r;
+	float c0_g, c1_g, c2_g, c3_g;
+	float c0_b, c1_b, c2_b, c3_b;
+	float clut_r, clut_g, clut_b;
+	for (i = 0; i < length; i++) {
+		unsigned char in_r = *src++;
+		unsigned char in_g = *src++;
+		unsigned char in_b = *src++;
+		unsigned char in_a = *src++;
+		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
+
+		int x = floor(linear_r * (transform->grid_size-1));
+		int y = floor(linear_g * (transform->grid_size-1));
+		int z = floor(linear_b * (transform->grid_size-1));
+		int x_n = ceil(linear_r * (transform->grid_size-1));
+		int y_n = ceil(linear_g * (transform->grid_size-1));
+		int z_n = ceil(linear_b * (transform->grid_size-1));
+		float rx = linear_r * (transform->grid_size-1) - x; 
+		float ry = linear_g * (transform->grid_size-1) - y;
+		float rz = linear_b * (transform->grid_size-1) - z; 
+
+		c0_r = CLU(r_table, x, y, z);
+		c0_g = CLU(g_table, x, y, z);
+		c0_b = CLU(b_table, x, y, z);
+
+		if( rx >= ry ) {
+			if (ry >= rz) { //rx >= ry && ry >= rz
+				c1_r = CLU(r_table, x_n, y, z) - c0_r;
+				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
+				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+				c1_g = CLU(g_table, x_n, y, z) - c0_g;
+				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
+				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+				c1_b = CLU(b_table, x_n, y, z) - c0_b;
+				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
+				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+			} else { 
+				if (rx >= rz) { //rx >= rz && rz >= ry
+					c1_r = CLU(r_table, x_n, y, z) - c0_r;
+					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
+					c1_g = CLU(g_table, x_n, y, z) - c0_g;
+					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
+					c1_b = CLU(b_table, x_n, y, z) - c0_b;
+					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
+				} else { //rz > rx && rx >= ry
+					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
+					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+					c3_r = CLU(r_table, x, y, z_n) - c0_r;
+					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
+					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+					c3_g = CLU(g_table, x, y, z_n) - c0_g;
+					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
+					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+					c3_b = CLU(b_table, x, y, z_n) - c0_b;
+				}
+			}
+		} else {
+			if (rx >= rz) { //ry > rx && rx >= rz
+				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
+				c2_r = CLU(r_table, x, y_n, z) - c0_r;
+				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
+				c2_g = CLU(g_table, x, y_n, z) - c0_g;
+				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
+				c2_b = CLU(b_table, x, y_n, z) - c0_b;
+				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+			} else {
+				if (ry >= rz) { //ry >= rz && rz > rx 
+					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+					c2_r = CLU(r_table, x, y_n, z) - c0_r;
+					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
+					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+					c2_g = CLU(g_table, x, y_n, z) - c0_g;
+					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
+					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+					c2_b = CLU(b_table, x, y_n, z) - c0_b;
+					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
+				} else { //rz > ry && ry > rx
+					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
+					c3_r = CLU(r_table, x, y, z_n) - c0_r;
+					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
+					c3_g = CLU(g_table, x, y, z_n) - c0_g;
+					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
+					c3_b = CLU(b_table, x, y, z_n) - c0_b;
+				}
+			}
+		}
+				
+		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
+		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
+		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
+
+		*dest++ = clamp_u8(clut_r*255.0f);
+		*dest++ = clamp_u8(clut_g*255.0f);
+		*dest++ = clamp_u8(clut_b*255.0f);
+		*dest++ = in_a;
+	}	
+}
+
+// Using lcms' tetra interpolation code.
+static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
+	unsigned int i;
+	int xy_len = 1;
+	int x_len = transform->grid_size;
+	int len = x_len * x_len;
+	float* r_table = transform->r_clut;
+	float* g_table = transform->g_clut;
+	float* b_table = transform->b_clut;
+	float c0_r, c1_r, c2_r, c3_r;
+	float c0_g, c1_g, c2_g, c3_g;
+	float c0_b, c1_b, c2_b, c3_b;
+	float clut_r, clut_g, clut_b;
+	for (i = 0; i < length; i++) {
+		unsigned char in_r = *src++;
+		unsigned char in_g = *src++;
+		unsigned char in_b = *src++;
+		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
+
+		int x = floor(linear_r * (transform->grid_size-1));
+		int y = floor(linear_g * (transform->grid_size-1));
+		int z = floor(linear_b * (transform->grid_size-1));
+		int x_n = ceil(linear_r * (transform->grid_size-1));
+		int y_n = ceil(linear_g * (transform->grid_size-1));
+		int z_n = ceil(linear_b * (transform->grid_size-1));
+		float rx = linear_r * (transform->grid_size-1) - x; 
+		float ry = linear_g * (transform->grid_size-1) - y;
+		float rz = linear_b * (transform->grid_size-1) - z; 
+
+		c0_r = CLU(r_table, x, y, z);
+		c0_g = CLU(g_table, x, y, z);
+		c0_b = CLU(b_table, x, y, z);
+
+		if( rx >= ry ) {
+			if (ry >= rz) { //rx >= ry && ry >= rz
+				c1_r = CLU(r_table, x_n, y, z) - c0_r;
+				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
+				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+				c1_g = CLU(g_table, x_n, y, z) - c0_g;
+				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
+				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+				c1_b = CLU(b_table, x_n, y, z) - c0_b;
+				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
+				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+			} else { 
+				if (rx >= rz) { //rx >= rz && rz >= ry
+					c1_r = CLU(r_table, x_n, y, z) - c0_r;
+					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
+					c1_g = CLU(g_table, x_n, y, z) - c0_g;
+					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
+					c1_b = CLU(b_table, x_n, y, z) - c0_b;
+					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
+				} else { //rz > rx && rx >= ry
+					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
+					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
+					c3_r = CLU(r_table, x, y, z_n) - c0_r;
+					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
+					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
+					c3_g = CLU(g_table, x, y, z_n) - c0_g;
+					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
+					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
+					c3_b = CLU(b_table, x, y, z_n) - c0_b;
+				}
+			}
+		} else {
+			if (rx >= rz) { //ry > rx && rx >= rz
+				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
+				c2_r = CLU(r_table, x, y_n, z) - c0_r;
+				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
+				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
+				c2_g = CLU(g_table, x, y_n, z) - c0_g;
+				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
+				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
+				c2_b = CLU(b_table, x, y_n, z) - c0_b;
+				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
+			} else {
+				if (ry >= rz) { //ry >= rz && rz > rx 
+					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+					c2_r = CLU(r_table, x, y_n, z) - c0_r;
+					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
+					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+					c2_g = CLU(g_table, x, y_n, z) - c0_g;
+					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
+					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+					c2_b = CLU(b_table, x, y_n, z) - c0_b;
+					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
+				} else { //rz > ry && ry > rx
+					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
+					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
+					c3_r = CLU(r_table, x, y, z_n) - c0_r;
+					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
+					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
+					c3_g = CLU(g_table, x, y, z_n) - c0_g;
+					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
+					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
+					c3_b = CLU(b_table, x, y, z_n) - c0_b;
+				}
+			}
+		}
+				
+		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
+		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
+		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
+
+		*dest++ = clamp_u8(clut_r*255.0f);
+		*dest++ = clamp_u8(clut_g*255.0f);
+		*dest++ = clamp_u8(clut_b*255.0f);
+	}	
+}
+
 static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
 {
 	unsigned int i;
 	float (*mat)[4] = transform->matrix;
 	for (i = 0; i < length; i++) {
 		unsigned char device_r = *src++;
 		unsigned char device_g = *src++;
 		unsigned char device_b = *src++;
@@ -865,19 +714,22 @@ static void qcms_transform_data_rgb_out_
 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 
 		out_linear_r = clamp_float(out_linear_r);
 		out_linear_g = clamp_float(out_linear_g);
 		out_linear_b = clamp_float(out_linear_b);
 
-		out_device_r = lut_interp_linear(out_linear_r, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
-		out_device_g = lut_interp_linear(out_linear_g, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
-		out_device_b = lut_interp_linear(out_linear_b, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
+		out_device_r = lut_interp_linear(out_linear_r, 
+				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
+		out_device_g = lut_interp_linear(out_linear_g, 
+				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
+		out_device_b = lut_interp_linear(out_linear_b, 
+				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 
 		*dest++ = clamp_u8(out_device_r*255);
 		*dest++ = clamp_u8(out_device_g*255);
 		*dest++ = clamp_u8(out_device_b*255);
 	}
 }
 
 static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
@@ -898,19 +750,22 @@ static void qcms_transform_data_rgba_out
 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
 
 		out_linear_r = clamp_float(out_linear_r);
 		out_linear_g = clamp_float(out_linear_g);
 		out_linear_b = clamp_float(out_linear_b);
 
-		out_device_r = lut_interp_linear(out_linear_r, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
-		out_device_g = lut_interp_linear(out_linear_g, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
-		out_device_b = lut_interp_linear(out_linear_b, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
+		out_device_r = lut_interp_linear(out_linear_r, 
+				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
+		out_device_g = lut_interp_linear(out_linear_g, 
+				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
+		out_device_b = lut_interp_linear(out_linear_b, 
+				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
 
 		*dest++ = clamp_u8(out_device_r*255);
 		*dest++ = clamp_u8(out_device_g*255);
 		*dest++ = clamp_u8(out_device_b*255);
 		*dest++ = alpha;
 	}
 }
 
@@ -1025,65 +880,16 @@ void qcms_transform_release(qcms_transfo
 
 	free(t->output_gamma_lut_r);
 	free(t->output_gamma_lut_g);
 	free(t->output_gamma_lut_b);
 
 	transform_free(t);
 }
 
-static void compute_precache_pow(uint8_t *output, float gamma)
-{
-	uint32_t v = 0;
-	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
-		//XXX: don't do integer/float conversion... and round?
-		output[v] = 255. * pow(v/(double)PRECACHE_OUTPUT_MAX, gamma);
-	}
-}
-
-void compute_precache_lut(uint8_t *output, uint16_t *table, int length)
-{
-	uint32_t v = 0;
-	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
-		output[v] = lut_interp_linear_precache_output(v, table, length);
-	}
-}
-
-void compute_precache_linear(uint8_t *output)
-{
-	uint32_t v = 0;
-	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
-		//XXX: round?
-		output[v] = v / (PRECACHE_OUTPUT_SIZE/256);
-	}
-}
-
-qcms_bool compute_precache(struct curveType *trc, uint8_t *output)
-{
-	if (trc->count == 0) {
-		compute_precache_linear(output);
-	} else if (trc->count == 1) {
-		compute_precache_pow(output, 1./u8Fixed8Number_to_float(trc->data[0]));
-	} else {
-		uint16_t *inverted;
-		int inverted_size = trc->count;
-		//XXX: the choice of a minimum of 256 here is not backed by any theory, measurement or data, however it is what lcms uses.
-		// the maximum number we would need is 65535 because that's the accuracy used for computing the precache table
-		if (inverted_size < 256)
-			inverted_size = 256;
-
-		inverted = invert_lut(trc->data, trc->count, inverted_size);
-		if (!inverted)
-			return false;
-		compute_precache_lut(output, inverted, inverted_size);
-		free(inverted);
-	}
-	return true;
-}
-
 #ifdef X86
 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
 // mozilla/jpeg)
  // -------------------------------------------------------------------------
 #if defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_CPUID
 /* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
    register - I'm not sure if that ever happens on windows, but cpuid isn't
@@ -1155,43 +961,56 @@ static int sse_version_available(void)
 
 	return sse_version;
 #else
 	return 0;
 #endif
 }
 #endif
 
-void build_output_lut(struct curveType *trc,
-		uint16_t **output_gamma_lut, size_t *output_gamma_lut_length)
-{
-	if (trc->count == 0) {
-		*output_gamma_lut = build_linear_table(4096);
-		*output_gamma_lut_length = 4096;
-	} else if (trc->count == 1) {
-		float gamma = 1./u8Fixed8Number_to_float(trc->data[0]);
-		*output_gamma_lut = build_pow_table(gamma, 4096);
-		*output_gamma_lut_length = 4096;
-	} else {
-		//XXX: the choice of a minimum of 256 here is not backed by any theory, measurement or data, however it is what lcms uses.
-		*output_gamma_lut_length = trc->count;
-		if (*output_gamma_lut_length < 256)
-			*output_gamma_lut_length = 256;
+static const struct matrix bradford_matrix = {{	{ 0.8951f, 0.2664f,-0.1614f},
+						{-0.7502f, 1.7135f, 0.0367f},
+						{ 0.0389f,-0.0685f, 1.0296f}}, 
+						false};
+
+static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
+						    { 0.4323053f, 0.5183603f, 0.0492912f},
+						    {-0.0085287f, 0.0400428f, 0.9684867f}}, 
+						    false};
 
-		*output_gamma_lut = invert_lut(trc->data, trc->count, *output_gamma_lut_length);
-	}
-
+// See ICCv4 E.3
+struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
+	float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
+		  (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
+	float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
+		  (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
+	float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
+		  (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
+	struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
+	return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
 }
 
 void qcms_profile_precache_output_transform(qcms_profile *profile)
 {
 	/* we only support precaching on rgb profiles */
 	if (profile->color_space != RGB_SIGNATURE)
 		return;
 
+	/* don't precache since we will use the B2A LUT */
+	if (profile->B2A0)
+		return;
+
+	/* don't precache since we will use the mBA LUT */
+	if (profile->mBA)
+		return;
+
+	/* don't precache if we do not have the TRC curves */
+	if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
+		return;
+
 	if (!profile->output_table_r) {
 		profile->output_table_r = precache_create();
 		if (profile->output_table_r &&
 				!compute_precache(profile->redTRC, profile->output_table_r->data)) {
 			precache_release(profile->output_table_r);
 			profile->output_table_r = NULL;
 		}
 	}
@@ -1208,21 +1027,77 @@ void qcms_profile_precache_output_transf
 		if (profile->output_table_b &&
 				!compute_precache(profile->blueTRC, profile->output_table_b->data)) {
 			precache_release(profile->output_table_b);
 			profile->output_table_b = NULL;
 		}
 	}
 }
 
+/* Replace the current transformation with a LUT transformation using a given number of sample points */
+qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out, 
+                                                 int samples, qcms_data_type in_type)
+{
+	/* The range between which 2 consecutive sample points can be used to interpolate */
+	uint16_t x,y,z;
+	uint32_t l;
+	uint32_t lutSize = 3 * samples * samples * samples;
+	float* src = NULL;
+	float* dest = NULL;
+	float* lut = NULL;
+
+	src = malloc(lutSize*sizeof(float));
+	dest = malloc(lutSize*sizeof(float));
+
+	if (src && dest) {
+		/* Prepare a list of points we want to sample */
+		l = 0;
+		for (x = 0; x < samples; x++) {
+			for (y = 0; y < samples; y++) {
+				for (z = 0; z < samples; z++) {
+					src[l++] = x / (float)(samples-1);
+					src[l++] = y / (float)(samples-1);
+					src[l++] = z / (float)(samples-1);
+				}
+			}
+		}
+
+		lut = qcms_chain_transform(in, out, src, dest, lutSize);
+		if (lut) {
+			transform->r_clut = &lut[0];
+			transform->g_clut = &lut[1];
+			transform->b_clut = &lut[2];
+			transform->grid_size = samples;
+			if (in_type == QCMS_DATA_RGBA_8) {
+				transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
+			} else {
+				transform->transform_fn = qcms_transform_data_tetra_clut;
+			}
+		}
+	}
+
+
+	//XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
+	if (src && lut != src) {
+		free(src);
+	} else if (dest && lut != src) {
+		free(dest);
+	}
+
+	if (lut == NULL) {
+		return NULL;
+	}
+	return transform;
+}
+
 #define NO_MEM_TRANSFORM NULL
 
 qcms_transform* qcms_transform_create(
 		qcms_profile *in, qcms_data_type in_type,
-		qcms_profile* out, qcms_data_type out_type,
+		qcms_profile *out, qcms_data_type out_type,
 		qcms_intent intent)
 {
 	bool precache = false;
 
         qcms_transform *transform = transform_alloc();
         if (!transform) {
 		return NULL;
 	}
@@ -1234,40 +1109,59 @@ qcms_transform* qcms_transform_create(
         }
 
 	if (out->output_table_r &&
 			out->output_table_g &&
 			out->output_table_b) {
 		precache = true;
 	}
 
+	if (qcms_supports_iccv4 && (in->A2B0 || out->B2A0 || in->mAB || out->mAB)) {
+		// Precache the transformation to a CLUT 33x33x33 in size.
+		// 33 is used by many profiles and works well in pratice. 
+		// This evenly divides 256 into blocks of 8x8x8.
+		// TODO For transforming small data sets of about 200x200 or less
+		// precaching should be avoided.
+		qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
+		if (!result) {
+            		assert(0 && "precacheLUT failed");
+			transform_free(transform);
+			return NULL;
+		}
+		return result;
+	}
+
 	if (precache) {
 		transform->output_table_r = precache_reference(out->output_table_r);
 		transform->output_table_g = precache_reference(out->output_table_g);
 		transform->output_table_b = precache_reference(out->output_table_b);
 	} else {
+		if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
+			qcms_transform_release(transform);
+			return NO_MEM_TRANSFORM;
+		}
 		build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
 		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
 		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
 		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
 			qcms_transform_release(transform);
 			return NO_MEM_TRANSFORM;
 		}
 	}
 
         if (in->color_space == RGB_SIGNATURE) {
-            struct matrix in_matrix, out_matrix, result;
+		struct matrix in_matrix, out_matrix, result;
 
-            if (in_type != QCMS_DATA_RGB_8 &&
+		if (in_type != QCMS_DATA_RGB_8 &&
                     in_type != QCMS_DATA_RGBA_8){
-                assert(0 && "input type");
-                transform_free(transform);
-                return NULL;
-            }
-	    if (precache) {
+                	assert(0 && "input type");
+			transform_free(transform);
+                	return NULL;
+            	}
+		if (precache) {
 #ifdef X86
 		    if (sse_version_available() >= 2) {
 			    if (in_type == QCMS_DATA_RGB_8)
 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
 			    else
 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
 
 #if !(defined(_MSC_VER) && defined(_M_AMD64))
@@ -1277,96 +1171,102 @@ qcms_transform* qcms_transform_create(
 		    if (sse_version_available() >= 1) {
 			    if (in_type == QCMS_DATA_RGB_8)
 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
 			    else
 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
 #endif
 		    } else
 #endif
-		    {
-			    if (in_type == QCMS_DATA_RGB_8)
-				    transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
-			    else
-				    transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
-		    }
-	    } else {
-		    if (in_type == QCMS_DATA_RGB_8)
-			    transform->transform_fn = qcms_transform_data_rgb_out_lut;
-		    else
-			    transform->transform_fn = qcms_transform_data_rgba_out_lut;
-	    }
+			{
+				if (in_type == QCMS_DATA_RGB_8)
+					transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
+				else
+					transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
+			}
+		} else {
+			if (in_type == QCMS_DATA_RGB_8)
+				transform->transform_fn = qcms_transform_data_rgb_out_lut;
+			else
+				transform->transform_fn = qcms_transform_data_rgba_out_lut;
+		}
 
-            //XXX: avoid duplicating tables if we can
-            transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
-            transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
-            transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
+		//XXX: avoid duplicating tables if we can
+		transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
+		transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
+		transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
+		if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
+			qcms_transform_release(transform);
+			return NO_MEM_TRANSFORM;
+		}
 
-	    if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
-		    qcms_transform_release(transform);
-		    return NO_MEM_TRANSFORM;
-	    }
 
-            /* build combined colorant matrix */
-            in_matrix = build_colorant_matrix(in);
-            out_matrix = build_colorant_matrix(out);
-            out_matrix = matrix_invert(out_matrix);
-            if (out_matrix.invalid) {
-                qcms_transform_release(transform);
-                return NULL;
-            }
-            result = matrix_multiply(out_matrix, in_matrix);
+		/* build combined colorant matrix */
+		in_matrix = build_colorant_matrix(in);
+		out_matrix = build_colorant_matrix(out);
+		out_matrix = matrix_invert(out_matrix);
+		if (out_matrix.invalid) {
+			qcms_transform_release(transform);
+			return NULL;
+		}
+		result = matrix_multiply(out_matrix, in_matrix);
 
-            /* store the results in column major mode
-             * this makes doing the multiplication with sse easier */
-            transform->matrix[0][0] = result.m[0][0];
-            transform->matrix[1][0] = result.m[0][1];
-            transform->matrix[2][0] = result.m[0][2];
-            transform->matrix[0][1] = result.m[1][0];
-            transform->matrix[1][1] = result.m[1][1];
-            transform->matrix[2][1] = result.m[1][2];
-            transform->matrix[0][2] = result.m[2][0];
-            transform->matrix[1][2] = result.m[2][1];
-            transform->matrix[2][2] = result.m[2][2];
+		/* store the results in column major mode
+		 * this makes doing the multiplication with sse easier */
+		transform->matrix[0][0] = result.m[0][0];
+		transform->matrix[1][0] = result.m[0][1];
+		transform->matrix[2][0] = result.m[0][2];
+		transform->matrix[0][1] = result.m[1][0];
+		transform->matrix[1][1] = result.m[1][1];
+		transform->matrix[2][1] = result.m[1][2];
+		transform->matrix[0][2] = result.m[2][0];
+		transform->matrix[1][2] = result.m[2][1];
+		transform->matrix[2][2] = result.m[2][2];
 
-        } else if (in->color_space == GRAY_SIGNATURE) {
-            if (in_type != QCMS_DATA_GRAY_8 &&
-                    in_type != QCMS_DATA_GRAYA_8){
-                assert(0 && "input type");
-                transform_free(transform);
-                return NULL;
-            }
+	} else if (in->color_space == GRAY_SIGNATURE) {
+		if (in_type != QCMS_DATA_GRAY_8 &&
+				in_type != QCMS_DATA_GRAYA_8){
+			assert(0 && "input type");
+			transform_free(transform);
+			return NULL;
+		}
 
-            transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
-	    if (!transform->input_gamma_table_gray) {
-		    qcms_transform_release(transform);
-		    return NO_MEM_TRANSFORM;
-	    }
+		transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
+		if (!transform->input_gamma_table_gray) {
+			qcms_transform_release(transform);
+			return NO_MEM_TRANSFORM;
+		}
 
-	    if (precache) {
-		    if (in_type == QCMS_DATA_GRAY_8) {
-			    transform->transform_fn = qcms_transform_data_gray_out_precache;
-		    } else {
-			    transform->transform_fn = qcms_transform_data_graya_out_precache;
-		    }
-	    } else {
-		    if (in_type == QCMS_DATA_GRAY_8) {
-			    transform->transform_fn = qcms_transform_data_gray_out_lut;
-		    } else {
-			    transform->transform_fn = qcms_transform_data_graya_out_lut;
-		    }
-	    }
+		if (precache) {
+			if (in_type == QCMS_DATA_GRAY_8) {
+				transform->transform_fn = qcms_transform_data_gray_out_precache;
+			} else {
+				transform->transform_fn = qcms_transform_data_graya_out_precache;
+			}
+		} else {
+			if (in_type == QCMS_DATA_GRAY_8) {
+				transform->transform_fn = qcms_transform_data_gray_out_lut;
+			} else {
+				transform->transform_fn = qcms_transform_data_graya_out_lut;
+			}
+		}
 	} else {
 		assert(0 && "unexpected colorspace");
-		qcms_transform_release(transform);
-		return NO_MEM_TRANSFORM;
+		transform_free(transform);
+		return NULL;
 	}
 	return transform;
 }
 
 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
 __attribute__((__force_align_arg_pointer__))
 #endif
 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
 {
 	transform->transform_fn(transform, src, dest, length);
 }
+
+qcms_bool qcms_supports_iccv4;
+void qcms_enable_iccv4()
+{
+	qcms_supports_iccv4 = true;
+}
new file mode 100644
--- /dev/null
+++ b/gfx/qcms/transform_util.c
@@ -0,0 +1,539 @@
+#define _ISOC99_SOURCE  /* for INFINITY */
+
+#include <math.h>
+#include <assert.h>
+#include <string.h> //memcpy
+#include "qcmsint.h"
+#include "transform_util.h"
+#include "matrix.h"
+
+#if !defined(INFINITY)
+#define INFINITY HUGE_VAL
+#endif
+
+#define PARAMETRIC_CURVE_TYPE 0x70617261 //'para'
+
+/* value must be a value between 0 and 1 */
+//XXX: is the above a good restriction to have?
+float lut_interp_linear(double value, uint16_t *table, int length)
+{
+	int upper, lower;
+	value = value * (length - 1); // scale to length of the array
+	upper = ceil(value);
+	lower = floor(value);
+	//XXX: can we be more performant here?
+	value = table[upper]*(1. - (upper - value)) + table[lower]*(upper - value);
+	/* scale the value */
+	return value * (1./65535.);
+}
+
+/* same as above but takes and returns a uint16_t value representing a range from 0..1 */
+uint16_t lut_interp_linear16(uint16_t input_value, uint16_t *table, int length)
+{
+	/* Start scaling input_value to the length of the array: 65535*(length-1).
+	 * We'll divide out the 65535 next */
+	uint32_t value = (input_value * (length - 1));
+	uint32_t upper = (value + 65534) / 65535; /* equivalent to ceil(value/65535) */
+	uint32_t lower = value / 65535;           /* equivalent to floor(value/65535) */
+	/* interp is the distance from upper to value scaled to 0..65535 */
+	uint32_t interp = value % 65535;
+
+	value = (table[upper]*(interp) + table[lower]*(65535 - interp))/65535; // 0..65535*65535
+
+	return value;
+}
+
+/* same as above but takes an input_value from 0..PRECACHE_OUTPUT_MAX
+ * and returns a uint8_t value representing a range from 0..1 */
+static
+uint8_t lut_interp_linear_precache_output(uint32_t input_value, uint16_t *table, int length)
+{
+	/* Start scaling input_value to the length of the array: PRECACHE_OUTPUT_MAX*(length-1).
+	 * We'll divide out the PRECACHE_OUTPUT_MAX next */
+	uint32_t value = (input_value * (length - 1));
+
+	/* equivalent to ceil(value/PRECACHE_OUTPUT_MAX) */
+	uint32_t upper = (value + PRECACHE_OUTPUT_MAX-1) / PRECACHE_OUTPUT_MAX;
+	/* equivalent to floor(value/PRECACHE_OUTPUT_MAX) */
+	uint32_t lower = value / PRECACHE_OUTPUT_MAX;
+	/* interp is the distance from upper to value scaled to 0..PRECACHE_OUTPUT_MAX */
+	uint32_t interp = value % PRECACHE_OUTPUT_MAX;
+
+	/* the table values range from 0..65535 */
+	value = (table[upper]*(interp) + table[lower]*(PRECACHE_OUTPUT_MAX - interp)); // 0..(65535*PRECACHE_OUTPUT_MAX)
+
+	/* round and scale */
+	value += (PRECACHE_OUTPUT_MAX*65535/255)/2;
+        value /= (PRECACHE_OUTPUT_MAX*65535/255); // scale to 0..255
+	return value;
+}
+
+/* value must be a value between 0 and 1 */
+//XXX: is the above a good restriction to have?
+float lut_interp_linear_float(float value, float *table, int length)
+{
+        int upper, lower;
+        value = value * (length - 1);
+        upper = ceil(value);
+        lower = floor(value);
+        //XXX: can we be more performant here?
+        value = table[upper]*(1. - (upper - value)) + table[lower]*(upper - value);
+        /* scale the value */
+        return value;
+}
+
+#if 0
+/* if we use a different representation i.e. one that goes from 0 to 0x1000 we can be more efficient
+ * because we can avoid the divisions and use a shifting instead */
+/* same as above but takes and returns a uint16_t value representing a range from 0..1 */
+uint16_t lut_interp_linear16(uint16_t input_value, uint16_t *table, int length)
+{
+	uint32_t value = (input_value * (length - 1));
+	uint32_t upper = (value + 4095) / 4096; /* equivalent to ceil(value/4096) */
+	uint32_t lower = value / 4096;           /* equivalent to floor(value/4096) */
+	uint32_t interp = value % 4096;
+
+	value = (table[upper]*(interp) + table[lower]*(4096 - interp))/4096; // 0..4096*4096
+
+	return value;
+}
+#endif
+
+void compute_curve_gamma_table_type1(float gamma_table[256], double gamma)
+{
+	unsigned int i;
+	for (i = 0; i < 256; i++) {
+		gamma_table[i] = pow(i/255., gamma);
+	}
+}
+
+void compute_curve_gamma_table_type2(float gamma_table[256], uint16_t *table, int length)
+{
+	unsigned int i;
+	for (i = 0; i < 256; i++) {
+		gamma_table[i] = lut_interp_linear(i/255., table, length);
+	}
+}
+
+void compute_curve_gamma_table_type_parametric(float gamma_table[256], float parameter[7], int count)
+{
+        size_t X;
+        float interval;
+        float a, b, c, e, f;
+        float y = parameter[0];
+        if (count == 0) {
+                a = 1;
+                b = 0;
+                c = 0;
+                e = 0;
+                f = 0;
+                interval = -INFINITY;
+        } else if(count == 1) {
+                a = parameter[1];
+                b = parameter[2];
+                c = 0;
+                e = 0;
+                f = 0;
+                interval = -1 * parameter[2] / parameter[1];
+        } else if(count == 2) {
+                a = parameter[1];
+                b = parameter[2];
+                c = 0;
+                e = parameter[3];
+                f = parameter[3];
+                interval = -1 * parameter[2] / parameter[1];
+        } else if(count == 3) {
+                a = parameter[1];
+                b = parameter[2];
+                c = parameter[3];
+                e = -c;
+                f = 0;
+                interval = parameter[4];
+        } else if(count == 4) {
+                a = parameter[1];
+                b = parameter[2];
+                c = parameter[3];
+                e = parameter[5] - c;
+                f = parameter[6];
+                interval = parameter[4];
+        } else {
+                assert(0 && "invalid parametric function type.");
+                a = 1;
+                b = 0;
+                c = 0;
+                e = 0;
+                f = 0;
+                interval = -INFINITY;
+        }       
+        for (X = 0; X < 256; X++) {
+                if (X >= interval) {
+                        // XXX The equations are not exactly as definied in the spec but are
+                        //     algebraic equivilent.
+                        // TODO Should division by 255 be for the whole expression.
+                        gamma_table[X] = pow(a * X / 255. + b, y) + c + e;
+                } else {
+                        gamma_table[X] = c * X / 255. + f;
+                }
+        }
+}
+
+void compute_curve_gamma_table_type0(float gamma_table[256])
+{
+	unsigned int i;
+	for (i = 0; i < 256; i++) {
+		gamma_table[i] = i/255.;
+	}
+}
+
+
+float clamp_float(float a)
+{
+        if (a > 1.)
+                return 1.;
+        else if (a < 0)
+                return 0;
+        else
+                return a;
+}
+
+unsigned char clamp_u8(float v)
+{
+	if (v > 255.)
+		return 255;
+	else if (v < 0)
+		return 0;
+	else
+		return floor(v+.5);
+}
+
+float u8Fixed8Number_to_float(uint16_t x)
+{
+	// 0x0000 = 0.
+	// 0x0100 = 1.
+	// 0xffff = 255  + 255/256
+	return x/256.;
+}
+
+float *build_input_gamma_table(struct curveType *TRC)
+{
+	float *gamma_table;
+
+	if (!TRC) return NULL;
+	gamma_table = malloc(sizeof(float)*256);
+	if (gamma_table) {
+		if (TRC->type == PARAMETRIC_CURVE_TYPE) {
+			compute_curve_gamma_table_type_parametric(gamma_table, TRC->parameter, TRC->count);
+		} else {
+			if (TRC->count == 0) {
+				compute_curve_gamma_table_type0(gamma_table);
+			} else if (TRC->count == 1) {
+				compute_curve_gamma_table_type1(gamma_table, u8Fixed8Number_to_float(TRC->data[0]));
+			} else {
+				compute_curve_gamma_table_type2(gamma_table, TRC->data, TRC->count);
+			}
+		}
+	}
+        return gamma_table;
+}
+
+struct matrix build_colorant_matrix(qcms_profile *p)
+{
+	struct matrix result;
+	result.m[0][0] = s15Fixed16Number_to_float(p->redColorant.X);
+	result.m[0][1] = s15Fixed16Number_to_float(p->greenColorant.X);
+	result.m[0][2] = s15Fixed16Number_to_float(p->blueColorant.X);
+	result.m[1][0] = s15Fixed16Number_to_float(p->redColorant.Y);
+	result.m[1][1] = s15Fixed16Number_to_float(p->greenColorant.Y);
+	result.m[1][2] = s15Fixed16Number_to_float(p->blueColorant.Y);
+	result.m[2][0] = s15Fixed16Number_to_float(p->redColorant.Z);
+	result.m[2][1] = s15Fixed16Number_to_float(p->greenColorant.Z);
+	result.m[2][2] = s15Fixed16Number_to_float(p->blueColorant.Z);
+	result.invalid = false;
+	return result;
+}
+
+/* The following code is copied nearly directly from lcms.
+ * I think it could be much better. For example, Argyll seems to have better code in
+ * icmTable_lookup_bwd and icmTable_setup_bwd. However, for now this is a quick way
+ * to a working solution and allows for easy comparing with lcms. */
+uint16_fract_t lut_inverse_interp16(uint16_t Value, uint16_t LutTable[], int length)
+{
+        int l = 1;
+        int r = 0x10000;
+        int x = 0, res;       // 'int' Give spacing for negative values
+        int NumZeroes, NumPoles;
+        int cell0, cell1;
+        double val2;
+        double y0, y1, x0, x1;
+        double a, b, f;
+
+        // July/27 2001 - Expanded to handle degenerated curves with an arbitrary
+        // number of elements containing 0 at the begining of the table (Zeroes)
+        // and another arbitrary number of poles (FFFFh) at the end.
+        // First the zero and pole extents are computed, then value is compared.
+
+        NumZeroes = 0;
+        while (LutTable[NumZeroes] == 0 && NumZeroes < length-1)
+                        NumZeroes++;
+
+        // There are no zeros at the beginning and we are trying to find a zero, so
+        // return anything. It seems zero would be the less destructive choice
+	/* I'm not sure that this makes sense, but oh well... */
+        if (NumZeroes == 0 && Value == 0)
+            return 0;
+
+        NumPoles = 0;
+        while (LutTable[length-1- NumPoles] == 0xFFFF && NumPoles < length-1)
+                        NumPoles++;
+
+        // Does the curve belong to this case?
+        if (NumZeroes > 1 || NumPoles > 1)
+        {               
+                int a, b;
+
+                // Identify if value fall downto 0 or FFFF zone             
+                if (Value == 0) return 0;
+               // if (Value == 0xFFFF) return 0xFFFF;
+
+                // else restrict to valid zone
+
+                a = ((NumZeroes-1) * 0xFFFF) / (length-1);               
+                b = ((length-1 - NumPoles) * 0xFFFF) / (length-1);
+                                                                
+                l = a - 1;
+                r = b + 1;
+        }
+
+
+        // Seems not a degenerated case... apply binary search
+
+        while (r > l) {
+
+                x = (l + r) / 2;
+
+		res = (int) lut_interp_linear16((uint16_fract_t) (x-1), LutTable, length);
+
+                if (res == Value) {
+
+                    // Found exact match. 
+                    
+                    return (uint16_fract_t) (x - 1);
+                }
+
+                if (res > Value) r = x - 1;
+                else l = x + 1;
+        }
+
+        // Not found, should we interpolate?
+
+                
+        // Get surrounding nodes
+        
+        val2 = (length-1) * ((double) (x - 1) / 65535.0);
+
+        cell0 = (int) floor(val2);
+        cell1 = (int) ceil(val2);
+           
+        if (cell0 == cell1) return (uint16_fract_t) x;
+
+        y0 = LutTable[cell0] ;
+        x0 = (65535.0 * cell0) / (length-1); 
+
+        y1 = LutTable[cell1] ;
+        x1 = (65535.0 * cell1) / (length-1);
+
+        a = (y1 - y0) / (x1 - x0);
+        b = y0 - a * x0;
+
+        if (fabs(a) < 0.01) return (uint16_fract_t) x;
+
+        f = ((Value - b) / a);
+
+        if (f < 0.0) return (uint16_fract_t) 0;
+        if (f >= 65535.0) return (uint16_fract_t) 0xFFFF;
+
+        return (uint16_fract_t) floor(f + 0.5);                        
+
+}
+
+/*
+ The number of entries needed to invert a lookup table should not
+ necessarily be the same as the original number of entries.  This is
+ especially true of lookup tables that have a small number of entries.
+
+ For example:
+ Using a table like:
+    {0, 3104, 14263, 34802, 65535}
+ invert_lut will produce an inverse of:
+    {3, 34459, 47529, 56801, 65535}
+ which has an maximum error of about 9855 (pixel difference of ~38.346)
+
+ For now, we punt the decision of output size to the caller. */
+static uint16_t *invert_lut(uint16_t *table, int length, int out_length)
+{
+        int i;
+        /* for now we invert the lut by creating a lut of size out_length
+         * and attempting to lookup a value for each entry using lut_inverse_interp16 */
+        uint16_t *output = malloc(sizeof(uint16_t)*out_length);
+        if (!output)
+                return NULL;
+
+        for (i = 0; i < out_length; i++) {
+                double x = ((double) i * 65535.) / (double) (out_length - 1);
+                uint16_fract_t input = floor(x + .5);
+                output[i] = lut_inverse_interp16(input, table, length);
+        }
+        return output;
+}
+
+static void compute_precache_pow(uint8_t *output, float gamma)
+{
+	uint32_t v = 0;
+	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
+		//XXX: don't do integer/float conversion... and round?
+		output[v] = 255. * pow(v/(double)PRECACHE_OUTPUT_MAX, gamma);
+	}
+}
+
+void compute_precache_lut(uint8_t *output, uint16_t *table, int length)
+{
+	uint32_t v = 0;
+	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
+		output[v] = lut_interp_linear_precache_output(v, table, length);
+	}
+}
+
+void compute_precache_linear(uint8_t *output)
+{
+	uint32_t v = 0;
+	for (v = 0; v < PRECACHE_OUTPUT_SIZE; v++) {
+		//XXX: round?
+		output[v] = v / (PRECACHE_OUTPUT_SIZE/256);
+	}
+}
+
+qcms_bool compute_precache(struct curveType *trc, uint8_t *output)
+{
+        
+        if (trc->type == PARAMETRIC_CURVE_TYPE) {
+                        float gamma_table[256];
+                        uint16_t gamma_table_uint[256];
+                        uint16_t i;
+                        uint16_t *inverted;
+                        int inverted_size = 256;
+
+                        compute_curve_gamma_table_type_parametric(gamma_table, trc->parameter, trc->count);
+                        for(i = 0; i < 256; i++) {
+                                gamma_table_uint[i] = (uint16_t)(gamma_table[i] * 65535);
+                        }
+
+                        //XXX: the choice of a minimum of 256 here is not backed by any theory, 
+                        //     measurement or data, howeve r it is what lcms uses.
+                        //     the maximum number we would need is 65535 because that's the 
+                        //     accuracy used for computing the pre cache table
+                        if (inverted_size < 256)
+                                inverted_size = 256;
+
+                        inverted = invert_lut(gamma_table_uint, 256, inverted_size);
+                        if (!inverted)
+                                return false;
+                        compute_precache_lut(output, inverted, inverted_size);
+                        free(inverted);
+        } else {
+                if (trc->count == 0) {
+                        compute_precache_linear(output);
+                } else if (trc->count == 1) {
+                        compute_precache_pow(output, 1./u8Fixed8Number_to_float(trc->data[0]));
+                } else {
+                        uint16_t *inverted;
+                        int inverted_size = trc->count;
+                        //XXX: the choice of a minimum of 256 here is not backed by any theory, 
+                        //     measurement or data, howeve r it is what lcms uses.
+                        //     the maximum number we would need is 65535 because that's the 
+                        //     accuracy used for computing the pre cache table
+                        if (inverted_size < 256)
+                                inverted_size = 256;
+
+                        inverted = invert_lut(trc->data, trc->count, inverted_size);
+                        if (!inverted)
+                                return false;
+                        compute_precache_lut(output, inverted, inverted_size);
+                        free(inverted);
+                }
+        }
+        return true;
+}
+
+
+static uint16_t *build_linear_table(int length)
+{
+        int i;
+        uint16_t *output = malloc(sizeof(uint16_t)*length);
+        if (!output)
+                return NULL;
+
+        for (i = 0; i < length; i++) {
+                double x = ((double) i * 65535.) / (double) (length - 1);
+                uint16_fract_t input = floor(x + .5);
+                output[i] = input;
+        }
+        return output;
+}
+
+static uint16_t *build_pow_table(float gamma, int length)
+{
+        int i;
+        uint16_t *output = malloc(sizeof(uint16_t)*length);
+        if (!output)
+                return NULL;
+
+        for (i = 0; i < length; i++) {
+                uint16_fract_t result;
+                double x = ((double) i) / (double) (length - 1);
+                x = pow(x, gamma);                //XXX turn this conversion into a function
+                result = floor(x*65535. + .5);
+                output[i] = result;
+        }
+        return output;
+}
+
+void build_output_lut(struct curveType *trc,
+                uint16_t **output_gamma_lut, size_t *output_gamma_lut_length)
+{
+        if (trc->type == PARAMETRIC_CURVE_TYPE) {
+                float gamma_table[256];
+                uint16_t i;
+                uint16_t *output = malloc(sizeof(uint16_t)*256);
+
+                if (!output) {
+                        *output_gamma_lut = NULL;
+                        return;
+                }
+
+                compute_curve_gamma_table_type_parametric(gamma_table, trc->parameter, trc->count);
+                *output_gamma_lut_length = 256;
+                for(i = 0; i < 256; i++) {
+                        output[i] = (uint16_t)(gamma_table[i] * 65535);
+                }
+                *output_gamma_lut = output;
+        } else {
+                if (trc->count == 0) {
+                        *output_gamma_lut = build_linear_table(4096);
+                        *output_gamma_lut_length = 4096;
+                } else if (trc->count == 1) {
+                        float gamma = 1./u8Fixed8Number_to_float(trc->data[0]);
+                        *output_gamma_lut = build_pow_table(gamma, 4096);
+                        *output_gamma_lut_length = 4096;
+                } else {
+                        //XXX: the choice of a minimum of 256 here is not backed by any theory, 
+                        //     measurement or data, however it is what lcms uses.
+                        *output_gamma_lut_length = trc->count;
+                        if (*output_gamma_lut_length < 256)
+                                *output_gamma_lut_length = 256;
+
+                        *output_gamma_lut = invert_lut(trc->data, trc->count, *output_gamma_lut_length);
+                }
+        }
+
+}
+
new file mode 100644
--- /dev/null
+++ b/gfx/qcms/transform_util.h
@@ -0,0 +1,59 @@
+/* vim: set ts=8 sw=8 noexpandtab: */
+//  qcms
+//  Copyright (C) 2009 Mozilla Foundation
+//  Copyright (C) 1998-2007 Marti Maria
+//
+// Permission is hereby granted, free of charge, to any person obtaining 
+// a copy of this software and associated documentation files (the "Software"), 
+// to deal in the Software without restriction, including without limitation 
+// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
+// and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in 
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
+// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _QCMS_TRANSFORM_UTIL_H
+#define _QCMS_TRANSFORM_UTIL_H
+
+#include <stdlib.h>
+
+#define CLU(table,x,y,z) table[(x*len + y*x_len + z*xy_len)*3]
+
+//XXX: could use a bettername
+typedef uint16_t uint16_fract_t;
+
+float lut_interp_linear(double value, uint16_t *table, int length);
+float lut_interp_linear_float(float value, float *table, int length);
+uint16_t lut_interp_linear16(uint16_t input_value, uint16_t *table, int length);
+
+
+static inline float lerp(float a, float b, float t)
+{
+        return a*(1.f-t) + b*t;
+}
+
+unsigned char clamp_u8(float v);
+float clamp_float(float a);
+
+float u8Fixed8Number_to_float(uint16_t x);
+
+
+float *build_input_gamma_table(struct curveType *TRC);
+struct matrix build_colorant_matrix(qcms_profile *p);
+void build_output_lut(struct curveType *trc,
+                      uint16_t **output_gamma_lut, size_t *output_gamma_lut_length);
+
+struct matrix matrix_invert(struct matrix mat);
+qcms_bool compute_precache(struct curveType *trc, uint8_t *output);
+
+
+#endif