Bug 1215681 - Fix int sizes in intel-aes.s, r=wtc
author: Jed Davis <jld@mozilla.com>
Fri, 05 Feb 2016 22:50:47 +0100
changeset 11898 87ce1a18f0ff14ffd752f35e1e10f50951a18601
parent 11897 ebad0907084511f01f24644a3563041ca64178a7
child 11899 ff8696cc41bbc9f6751626d4093ea6bdc464afec
push id: 994
push user: kaie@kuix.de
push date: Mon, 15 Feb 2016 23:39:43 +0000
reviewers: wtc
bugs: 1215681
Bug 1215681 - Fix int sizes in intel-aes.s, r=wtc
lib/freebl/intel-aes.s
--- a/lib/freebl/intel-aes.s
+++ b/lib/freebl/intel-aes.s
@@ -2,16 +2,30 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 	.text
 
 #define IV_OFFSET 16
 #define EXPANDED_KEY_OFFSET 48
 
+/*
+ * Warning: the length values used in this module are "unsigned int"
+ * in C, which is 32-bit.  When they're passed in registers, use only
+ * the low 32 bits, because the top half is unspecified.
+ *
+ * This is called from C code, so the contents of those bits can
+ * depend on the C compiler's optimization decisions.  This means that
+ * mistakes might not be obvious in testing if those bits happen to be
+ * zero in your build.
+ *
+ * Exception: 32-bit lea instructions use a 64-bit address because the
+ * address size doesn't affect the result, and that form is more
+ * compactly encoded and preferred by compilers over a 32-bit address.
+ */
 
 /* in %rdi : the key
    in %rsi : buffer for expanded key
 */
 	.type intel_aes_encrypt_init_128,@function
 	.globl intel_aes_encrypt_init_128
 	.align	16
 intel_aes_encrypt_init_128:
@@ -114,37 +128,38 @@ key_expansion128:
 	addq	$16, %rsi
 	ret
 	.size key_expansion128, .-key_expansion128
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_encrypt_ecb_128,@function
 	.globl intel_aes_encrypt_ecb_128
 	.align	16
 intel_aes_encrypt_ecb_128:
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdi), %xmm2
 	movdqu	160(%rdi), %xmm12
 	xor	%eax, %eax
-//	cmpq	$8*16, %r9
-	cmpq	$128, %r9
+//	cmpl	$8*16, %r9d
+	cmpl	$128, %r9d
 	jb	1f
-//	leaq	-8*16(%r9), %r11
-	leaq	-128(%r9), %r11
+//	leal	-8*16(%r9), %r11d
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -255,21 +270,21 @@ 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-//	addq	$8*16, %rax
-	addq	$128, %rax
-	cmpq	%r11, %rax
+//	addl	$8*16, %eax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm3
 	movdqu	32(%rdi), %xmm4
 	movdqu	48(%rdi), %xmm5
 	movdqu	64(%rdi), %xmm6
 	movdqu	80(%rdi), %xmm7
 	movdqu	96(%rdi), %xmm8
@@ -285,49 +300,50 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	xor	%eax, %eax
 	ret
 	.size intel_aes_encrypt_ecb_128, .-intel_aes_encrypt_ecb_128
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_decrypt_ecb_128,@function
 	.globl intel_aes_decrypt_ecb_128
 	.align	16
 intel_aes_decrypt_ecb_128:
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdi), %xmm2
 	movdqu	160(%rdi), %xmm12
 	xorl	%eax, %eax
-//	cmpq	$8*16, %r9
-	cmpq	$128, %r9
+//	cmpl	$8*16, %r9d
+	cmpl	$128, %r9d
 	jb	1f
-//	leaq	-8*16(%r9), %r11
-	leaq	-128(%r9), %r11
+//	leal	-8*16(%r9), %r11d
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -438,21 +454,21 @@ 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-//	addq	$8*16, %rax
-	addq	$128, %rax
-	cmpq	%r11, %rax
+//	addl	$8*16, %eax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm3
 	movdqu	32(%rdi), %xmm4
 	movdqu	48(%rdi), %xmm5
 	movdqu	64(%rdi), %xmm6
 	movdqu	80(%rdi), %xmm7
 	movdqu	96(%rdi), %xmm8
@@ -468,39 +484,40 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
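-	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm7, %xmm1 */
-	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm7, %xmm1 */
-	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm7, %xmm1 */
-	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm7, %xmm1 */
+	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
+	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
+	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
+	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */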
 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	xor	%eax, %eax
 	ret
 	.size intel_aes_decrypt_ecb_128, .-intel_aes_decrypt_ecb_128
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_encrypt_cbc_128,@function
 	.globl intel_aes_encrypt_cbc_128
 	.align	16
 intel_aes_encrypt_cbc_128:
-	testq	%r9, %r9
+	testl	%r9d, %r9d
 	je	2f
 
 //	leaq	IV_OFFSET(%rdi), %rdx
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	16(%rdi), %rdx
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdx), %xmm0
@@ -527,52 +544,53 @@ 1:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
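-	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmma, %xmm1 */
-	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmmb, %xmm1 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
+	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */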
 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
 	movdqa	%xmm1, %xmm0
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	1b
 
 	movdqu	%xmm0, (%rdx)
 
 2:	xor	%eax, %eax
 	ret
 	.size intel_aes_encrypt_cbc_128, .-intel_aes_encrypt_cbc_128
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_decrypt_cbc_128,@function
 	.globl intel_aes_decrypt_cbc_128
 	.align	16
 intel_aes_decrypt_cbc_128:
 //	leaq	IV_OFFSET(%rdi), %rdx
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	16(%rdi), %rdx
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdx), %xmm0   /* iv */
 	movdqu	(%rdi), %xmm2   /* first key block */
 	movdqu	160(%rdi), %xmm12 /* last key block */
 	xorl	%eax, %eax
-	cmpq	$128, %r9
+	cmpl	$128, %r9d
 	jb	1f
-	leaq	-128(%r9), %r11
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3 /* 1st data block */
 	movdqu	16(%r8, %rax), %xmm4 /* 2d data block */
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -699,20 +717,20 @@ 2:	movdqu	(%r8, %rax), %xmm3 /* 1st data
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-	addq	$128, %rax
-	cmpq	%r11, %rax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm3
 	movdqu	32(%rdi), %xmm4
 	movdqu	48(%rdi), %xmm5
 	movdqu	64(%rdi), %xmm6
 	movdqu	80(%rdi), %xmm7
 	movdqu	96(%rdi), %xmm8
@@ -731,18 +749,18 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
 	pxor	%xmm0, %xmm1
 	movdqu	%xmm1, (%rsi, %rax)
 	movdqa	%xmm13, %xmm0
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	movdqu	%xmm0, (%rdx)
 
 	xor	%eax, %eax
 	ret
 	.size intel_aes_decrypt_cbc_128, .-intel_aes_decrypt_cbc_128
         
@@ -868,37 +886,38 @@ key_expansion192:
 	addq	$8, %rsi
 	ret
 	.size key_expansion192, .-key_expansion192
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_encrypt_ecb_192,@function
 	.globl intel_aes_encrypt_ecb_192
 	.align	16
 intel_aes_encrypt_ecb_192:
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdi), %xmm2
 	movdqu	192(%rdi), %xmm14
 	xorl	%eax, %eax
-//	cmpq	$8*16, %r9
-	cmpq	$128, %r9
+//	cmpl	$8*16, %r9d
+	cmpl	$128, %r9d
 	jb	1f
-//	leaq	-8*16(%r9), %r11
-	leaq	-128(%r9), %r11
+//	leal	-8*16(%r9), %r11d
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -1028,21 +1047,21 @@ 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-//	addq	$8*16, %rax
-	addq	$128, %rax
-	cmpq	%r11, %rax
+//	addl	$8*16, %eax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm3
 	movdqu	32(%rdi), %xmm4
 	movdqu	48(%rdi), %xmm5
 	movdqu	64(%rdi), %xmm6
 	movdqu	80(%rdi), %xmm7
 	movdqu	96(%rdi), %xmm8
@@ -1062,49 +1081,50 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	xor	%eax, %eax
 	ret
 	.size intel_aes_encrypt_ecb_192, .-intel_aes_encrypt_ecb_192
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_decrypt_ecb_192,@function
 	.globl intel_aes_decrypt_ecb_192
 	.align	16
 intel_aes_decrypt_ecb_192:
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdi), %xmm2
 	movdqu	192(%rdi), %xmm14
 	xorl	%eax, %eax
-//	cmpq	$8*16, %r9
-	cmpq	$128, %r9
+//	cmpl	$8*16, %r9d
+	cmpl	$128, %r9d
 	jb	1f
-//	leaq	-8*16(%r9), %r11
-	leaq	-128(%r9), %r11
+//	leal	-8*16(%r9), %r11d
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -1234,21 +1254,21 @@ 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-//	addq	$8*16, %rax
-	addq	$128, %rax
-	cmpq	%r11, %rax
+//	addl	$8*16, %eax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm3
 	movdqu	32(%rdi), %xmm4
 	movdqu	48(%rdi), %xmm5
 	movdqu	64(%rdi), %xmm6
 	movdqu	80(%rdi), %xmm7
 	movdqu	96(%rdi), %xmm8
@@ -1268,39 +1288,40 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	xor	%eax, %eax
 	ret
 	.size intel_aes_decrypt_ecb_192, .-intel_aes_decrypt_ecb_192
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_encrypt_cbc_192,@function
 	.globl intel_aes_encrypt_cbc_192
 	.align	16
 intel_aes_encrypt_cbc_192:
-	testq	%r9, %r9
+	testl	%r9d, %r9d
 	je	2f
 
 //	leaq	IV_OFFSET(%rdi), %rdx
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	16(%rdi), %rdx
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdx), %xmm0
@@ -1331,50 +1352,51 @@ 1:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xce	/* aesenclast %xmm14, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
 	movdqa	%xmm1, %xmm0
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	1b
 
 	movdqu	%xmm0, (%rdx)
 
 2:	xor	%eax, %eax
 	ret
 	.size intel_aes_encrypt_cbc_192, .-intel_aes_encrypt_cbc_192
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_decrypt_cbc_192,@function
 	.globl intel_aes_decrypt_cbc_192
 	.align	16
 intel_aes_decrypt_cbc_192:
 	leaq	16(%rdi), %rdx
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdx), %xmm0
 	movdqu	(%rdi), %xmm2
 	movdqu	192(%rdi), %xmm14
 	xorl	%eax, %eax
-	cmpq	$128, %r9
+	cmpl	$128, %r9d
 	jb	1f
-	leaq	-128(%r9), %r11
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -1520,20 +1542,20 @@ 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-	addq	$128, %rax
-	cmpq	%r11, %rax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm3
 	movdqu	32(%rdi), %xmm4
 	movdqu	48(%rdi), %xmm5
 	movdqu	64(%rdi), %xmm6
 	movdqu	80(%rdi), %xmm7
 	movdqu	96(%rdi), %xmm8
@@ -1556,18 +1578,18 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
 	pxor	%xmm0, %xmm1
 	movdqu	%xmm1, (%rsi, %rax)
 	movdqa	%xmm15, %xmm0
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	movdqu	%xmm0, (%rdx)
 
 	xor	%eax, %eax
 	ret
 	.size intel_aes_decrypt_cbc_192, .-intel_aes_decrypt_cbc_192
 
@@ -1700,37 +1722,38 @@ key_expansion256:
 	addq	$16, %rsi
 	ret
 	.size key_expansion256, .-key_expansion256
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_encrypt_ecb_256,@function
 	.globl intel_aes_encrypt_ecb_256
 	.align	16
 intel_aes_encrypt_ecb_256:
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdi), %xmm2
 	movdqu	224(%rdi), %xmm15
 	xorl	%eax, %eax
-//	cmpq	$8*16, %r9
-	cmpq	$128, %r9
+//	cmpl	$8*16, %r9d
+	cmpl	$128, %r9d
 	jb	1f
-//	leaq	-8*16(%r9), %r11
-	leaq	-128(%r9), %r11
+//	leal	-8*16(%r9), %r11d
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -1879,21 +1902,21 @@ 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-//	addq	$8*16, %rax
-	addq	$128, %rax
-	cmpq	%r11, %rax
+//	addl	$8*16, %eax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	(%rdi), %xmm8
 	movdqu	16(%rdi), %xmm2
 	movdqu	32(%rdi), %xmm3
 	movdqu	48(%rdi), %xmm4
 	movdqu	64(%rdi), %xmm5
 	movdqu	80(%rdi), %xmm6
@@ -1919,49 +1942,50 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	xor	%eax, %eax
 	ret
 	.size intel_aes_encrypt_ecb_256, .-intel_aes_encrypt_ecb_256
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_decrypt_ecb_256,@function
 	.globl intel_aes_decrypt_ecb_256
 	.align	16
 intel_aes_decrypt_ecb_256:
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdi), %xmm2
 	movdqu	224(%rdi), %xmm15
 	xorl	%eax, %eax
-//	cmpq	$8*16, %r9
-	cmpq	$128, %r9
+//	cmpl	$8*16, %r9d
+	cmpl	$128, %r9d
 	jb	1f
-//	leaq	-8*16(%r9), %r11
-	leaq	-128(%r9), %r11
+//	leal	-8*16(%r9), %r11d
+	leal	-128(%r9), %r11d
 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -2110,21 +2134,21 @@ 2:	movdqu	(%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-//	addq	$8*16, %rax
-	addq	$128, %rax
-	cmpq	%r11, %rax
+//	addl	$8*16, %eax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm2
 	movdqu	32(%rdi), %xmm3
 	movdqu	48(%rdi), %xmm4
 	movdqu	64(%rdi), %xmm5
 	movdqu	80(%rdi), %xmm6
 	movdqu	96(%rdi), %xmm7
@@ -2150,39 +2174,40 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
 	movdqu	112(%rdi), %xmm8
 	movdqu	%xmm1, (%rsi, %rax)
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	xor	%eax, %eax
 	ret
 	.size intel_aes_decrypt_ecb_256, .-intel_aes_decrypt_ecb_256
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_encrypt_cbc_256,@function
 	.globl intel_aes_encrypt_cbc_256
 	.align	16
 intel_aes_encrypt_cbc_256:
-	testq	%r9, %r9
+	testl	%r9d, %r9d
 	je	2f
 
 //	leaq	IV_OFFSET(%rdi), %rdx
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	16(%rdi), %rdx
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdx), %xmm0
@@ -2218,54 +2243,55 @@ 1:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
 	movdqu	%xmm1, (%rsi, %rax)
 	movdqa	%xmm1, %xmm0
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	1b
 
 	movdqu	%xmm0, (%rdx)
 
 2:	xor	%eax, %eax
 	ret
 	.size intel_aes_encrypt_cbc_256, .-intel_aes_encrypt_cbc_256
 
 
 /* in %rdi : cx - context
    in %rsi : output - pointer to output buffer
    in %rdx : outputLen - pointer to variable for length of output
-             (filled by caller)
-   in %rcx : maxOutputLen - length of output buffer
+             (already filled in by caller)
+   in %ecx : maxOutputLen - length of output buffer
+             (already checked by caller)
    in %r8  : input - pointer to input buffer
-   in %r9  : inputLen - length of input buffer
+   in %r9d : inputLen - length of input buffer
    on stack: blocksize - AES blocksize (always 16, unused)
 */
 	.type intel_aes_decrypt_cbc_256,@function
 	.globl intel_aes_decrypt_cbc_256
 	.align	16
 intel_aes_decrypt_cbc_256:
 //	leaq	IV_OFFSET(%rdi), %rdx
 //	leaq	EXPANDED_KEY_OFFSET(%rdi), %rdi
 	leaq	16(%rdi), %rdx
 	leaq	48(%rdi), %rdi
 
 	movdqu	(%rdx), %xmm0
 	movdqu	(%rdi), %xmm2
 	movdqu	224(%rdi), %xmm15
 	xorl	%eax, %eax
-//	cmpq	$8*16, %r9
-	cmpq	$128, %r9
+//	cmpl	$8*16, %r9d
+	cmpl	$128, %r9d
 	jb	1f
-//	leaq	-8*16(%r9), %r11
-	leaq	-128(%r9), %r11
+//	leal	-8*16(%r9), %r11d
+	leal	-128(%r9), %r11d
 2:	movdqu  (%r8, %rax), %xmm3
 	movdqu	16(%r8, %rax), %xmm4
 	movdqu	32(%r8, %rax), %xmm5
 	movdqu	48(%r8, %rax), %xmm6
 	movdqu	64(%r8, %rax), %xmm7
 	movdqu	80(%r8, %rax), %xmm8
 	movdqu	96(%r8, %rax), %xmm9
 	movdqu	112(%r8, %rax), %xmm10
@@ -2430,21 +2456,21 @@ 2:	movdqu  (%r8, %rax), %xmm3
 	movdqu	%xmm3, (%rsi, %rax)
 	movdqu	%xmm4, 16(%rsi, %rax)
 	movdqu	%xmm5, 32(%rsi, %rax)
 	movdqu	%xmm6, 48(%rsi, %rax)
 	movdqu	%xmm7, 64(%rsi, %rax)
 	movdqu	%xmm8, 80(%rsi, %rax)
 	movdqu	%xmm9, 96(%rsi, %rax)
 	movdqu	%xmm10, 112(%rsi, %rax)
-//	addq	$8*16, %rax
-	addq	$128, %rax
-	cmpq	%r11, %rax
+//	addl	$8*16, %eax
+	addl	$128, %eax
+	cmpl	%r11d, %eax
 	jbe	2b
-1:	cmpq	%rax, %r9
+1:	cmpl	%eax, %r9d
 	je	5f
 
 	movdqu	16(%rdi), %xmm2
 	movdqu	32(%rdi), %xmm3
 	movdqu	48(%rdi), %xmm4
 	movdqu	64(%rdi), %xmm5
 	movdqu	80(%rdi), %xmm6
 	movdqu	96(%rdi), %xmm7
@@ -2472,17 +2498,17 @@ 4:	movdqu	(%r8, %rax), %xmm1
 	.byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
 	.byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
 	.byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
 	movdqu	112(%rdi), %xmm8
 	pxor	%xmm0, %xmm1
 	movdqu	(%r8, %rax), %xmm0  /* fetch the IV before we store the block */
 	movdqu	%xmm1, (%rsi, %rax) /* in case input buf = output buf */
-	addq	$16, %rax
-	cmpq	%rax, %r9
+	addl	$16, %eax
+	cmpl	%eax, %r9d
 	jne	4b
 
 5:	movdqu	%xmm0, (%rdx)
 
 	xor	%eax, %eax
 	ret
 	.size intel_aes_decrypt_cbc_256, .-intel_aes_decrypt_cbc_256