Bug 979703: Implementation of AES in different modes of operation, using AES-NI and PCLMULQDQ-NI, for WIN32 and WIN64 platforms. r=wtc.

author      Shay Gueron <shay.gueron@intel.com>
date        Tue, 08 Apr 2014 19:01:48 -0700
changeset   11109  281db105f1f735de5cd75dd15269c4b053966a49
parent      11108  aa6f6ad199a91c2f6ded93576c1e79deeaea7cb9
child       11110  edaddc2875178a1d3350a88569dd2b30cd3ccf5b
push id     358
push user   wtc@google.com
push date   Wed, 09 Apr 2014 02:02:16 +0000
bugs        979703
lib/freebl/Makefile
lib/freebl/ctr.c
lib/freebl/ctr.h
lib/freebl/intel-aes-x64-masm.asm
lib/freebl/intel-aes-x86-masm.asm
lib/freebl/intel-aes.h
lib/freebl/intel-gcm-wrap.c
lib/freebl/intel-gcm-x64-masm.asm
lib/freebl/intel-gcm-x86-masm.asm
lib/freebl/rijndael.c
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -130,31 +130,37 @@ ifdef NS_USE_GCC
 else
 # MSVC
     MPI_SRCS += mpi_x86_asm.c
     DEFINES += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE 
     DEFINES += -DMP_ASSEMBLY_DIV_2DX1D -DMP_USE_UINT_DIGIT -DMP_NO_MP_WORD
     ifdef BUILD_OPT
 	OPTIMIZER += -Ox  # maximum optimization for freebl
     endif
+    DEFINES += -DUSE_HW_AES -DINTEL_GCM
+    ASFILES += intel-aes-x86-masm.asm intel-gcm-x86-masm.asm
+    EXTRA_SRCS += intel-gcm-wrap.c
 endif
 else
     # -DMP_NO_MP_WORD
     DEFINES += -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
 ifdef NS_USE_GCC
 # Ideally, we should use amd64 assembly code, but it's not yet mingw-w64
 # compatible.
 else
 # MSVC
     ifdef BUILD_OPT
 	OPTIMIZER += -Ox  # maximum optimization for freebl
     endif
     ASFILES  = arcfour-amd64-masm.asm mpi_amd64_masm.asm mp_comba_amd64_masm.asm
     DEFINES += -DNSS_BEVAND_ARCFOUR -DMPI_AMD64 -DMP_ASSEMBLY_MULTIPLY
     DEFINES += -DNSS_USE_COMBA
+    DEFINES += -DUSE_HW_AES -DINTEL_GCM
+    ASFILES += intel-aes-x64-masm.asm intel-gcm-x64-masm.asm
+    EXTRA_SRCS += intel-gcm-wrap.c
     MPI_SRCS += mpi_amd64.c
 endif
 endif
 endif
 
 ifeq ($(OS_TARGET),IRIX)
 ifeq ($(USE_N32),1)
     ASFILES  = mpi_mips.s
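The new -DUSE_HW_AES and -DINTEL_GCM defines only compile the AES-NI/PCLMULQDQ
paths in; taking them at run time is still expected to be gated by a CPU
feature check in the C sources. As a rough illustration of such a check (a
sketch, not code from this patch; the helper name is made up), AES-NI is
CPUID.1:ECX bit 25 and PCLMULQDQ is CPUID.1:ECX bit 1:

    #include <intrin.h>

    /* Illustrative sketch only -- not part of this patch. */
    static int has_aesni_and_pclmul(void)
    {
        int regs[4];                     /* EAX, EBX, ECX, EDX */
        __cpuid(regs, 1);                /* basic feature leaf */
        return ((regs[2] >> 25) & 1) &&  /* ECX bit 25: AES-NI */
               ((regs[2] >>  1) & 1);    /* ECX bit 1:  PCLMULQDQ */
    }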
--- a/lib/freebl/ctr.c
+++ b/lib/freebl/ctr.c
@@ -7,16 +7,21 @@
 #endif
 #include "prtypes.h"
 #include "blapit.h"
 #include "blapii.h"
 #include "ctr.h"
 #include "pkcs11t.h"
 #include "secerr.h"
 
+#ifdef USE_HW_AES
+#include "intel-aes.h"
+#include "rijndael.h"
+#endif
+
 SECStatus
 CTR_InitContext(CTRContext *ctr, void *context, freeblCipherFunc cipher,
 		const unsigned char *param, unsigned int blocksize)
 {
     const CK_AES_CTR_PARAMS *ctrParams = (const CK_AES_CTR_PARAMS *)param;
 
     if (ctrParams->ulCounterBits == 0 ||
 	ctrParams->ulCounterBits > blocksize * PR_BITS_PER_BYTE) {
@@ -160,8 +165,61 @@ CTR_Update(CTRContext *ctr, unsigned cha
     if (rv != SECSuccess) {
 	return SECFailure;
     }
     ctr_xor(outbuf, inbuf, ctr->buffer, inlen);
     ctr->bufPtr = inlen;
     *outlen += inlen;
     return SECSuccess;
 }
+
+#if defined(USE_HW_AES) && defined(_MSC_VER)
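+/*
+ * CTR mode using the AES-NI assembler core: drain any keystream bytes
+ * left over in ctr->buffer, hand every whole 16-byte block to the ctr
+ * worker, then generate one more keystream block for a trailing
+ * partial block, if any.
+ */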
+SECStatus
+CTR_Update_HW_AES(CTRContext *ctr, unsigned char *outbuf,
+		unsigned int *outlen, unsigned int maxout,
+		const unsigned char *inbuf, unsigned int inlen,
+		unsigned int blocksize)
+{
+    unsigned int tmp;
+    SECStatus rv;
+
+    if (maxout < inlen) {
+	*outlen = inlen;
+	PORT_SetError(SEC_ERROR_OUTPUT_LEN);
+	return SECFailure;
+    }
+    *outlen = 0;
+    if (ctr->bufPtr != blocksize) {
+	unsigned int needed = PR_MIN(blocksize-ctr->bufPtr, inlen);
+	ctr_xor(outbuf, inbuf, ctr->buffer+ctr->bufPtr, needed);
+	ctr->bufPtr += needed;
+	outbuf += needed;
+	inbuf += needed;
+	*outlen += needed;
+	inlen -= needed;
+	if (inlen == 0) {
+	    return SECSuccess;
+	}
+	PORT_Assert(ctr->bufPtr == blocksize);
+    }
+
+    intel_aes_ctr_worker(((AESContext*)(ctr->context))->Nr)(
+	ctr, outbuf, outlen, maxout, inbuf, inlen, blocksize);
+    *outlen += inlen & (-16);
+    outbuf += inlen & (-16);
+    inbuf += inlen & (-16);
+    inlen &= 16 - 1;
+
+    if (inlen == 0) {
+	return SECSuccess;
+    }
+    rv = (*ctr->cipher)(ctr->context, ctr->buffer, &tmp, blocksize,
+			ctr->counter, blocksize, blocksize);
+    ctr_GetNextCtr(ctr->counter, ctr->counterBits, blocksize);
+    if (rv != SECSuccess) {
+	return SECFailure;
+    }
+    ctr_xor(outbuf, inbuf, ctr->buffer, inlen);
+    ctr->bufPtr = inlen;
+    *outlen += inlen;
+    return SECSuccess;
+}
+#endif
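A side note on the arithmetic above: the assembler worker only consumes whole
16-byte blocks, so the byte count is split with masks instead of a division.
A minimal stand-alone illustration (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned int inlen = 100;                    /* example length */
        unsigned int whole = inlen & (unsigned)-16;  /* 96: handled by the ctr worker */
        unsigned int rest  = inlen & (16 - 1);       /* 4: trailing partial block */
        printf("%u = %u whole-block bytes + %u leftover\n", inlen, whole, rest);
        return 0;
    }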
--- a/lib/freebl/ctr.h
+++ b/lib/freebl/ctr.h
@@ -36,9 +36,16 @@ CTRContext * CTR_CreateContext(void *con
 
 void CTR_DestroyContext(CTRContext *ctr, PRBool freeit);
 
 SECStatus CTR_Update(CTRContext *ctr, unsigned char *outbuf,
 			unsigned int *outlen, unsigned int maxout,
 			const unsigned char *inbuf, unsigned int inlen,
 			unsigned int blocksize);
 
+#ifdef USE_HW_AES
+SECStatus CTR_Update_HW_AES(CTRContext *ctr, unsigned char *outbuf,
+			unsigned int *outlen, unsigned int maxout,
+			const unsigned char *inbuf, unsigned int inlen,
+			unsigned int blocksize);
 #endif
+
+#endif
new file mode 100644
--- /dev/null
+++ b/lib/freebl/intel-aes-x64-masm.asm
@@ -0,0 +1,969 @@
+; LICENSE:
+; This submission to NSS is to be made available under the terms of the
+; Mozilla Public License, v. 2.0. You can obtain one at
+; http://mozilla.org/MPL/2.0/.
+;###############################################################################
+; Copyright(c) 2014, Intel Corp.
+; Developers and authors:
+; Shay Gueron and Vlad Krasnov
+; Intel Corporation, Israel Development Centre, Haifa, Israel
+; Please send feedback directly to crypto.feedback.alias@intel.com
+
+
+.DATA
+ALIGN 16
+Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
+Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
+Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
+Lcon1 dd 1,1,1,1
+Lcon2 dd 1bh,1bh,1bh,1bh
+
+.CODE
+
+ctx     textequ <rcx>
+output  textequ <rdx>
+input   textequ <r8>
+inputLen textequ <r9d>
+
+
+aes_rnd MACRO i
+    movdqu  xmm8, [i*16 + ctx]
+    aesenc  xmm0, xmm8
+    aesenc  xmm1, xmm8
+    aesenc  xmm2, xmm8
+    aesenc  xmm3, xmm8
+    aesenc  xmm4, xmm8
+    aesenc  xmm5, xmm8
+    aesenc  xmm6, xmm8
+    aesenc  xmm7, xmm8
+    ENDM
+
+aes_last_rnd MACRO i
+    movdqu  xmm8, [i*16 + ctx]
+    aesenclast  xmm0, xmm8
+    aesenclast  xmm1, xmm8
+    aesenclast  xmm2, xmm8
+    aesenclast  xmm3, xmm8
+    aesenclast  xmm4, xmm8
+    aesenclast  xmm5, xmm8
+    aesenclast  xmm6, xmm8
+    aesenclast  xmm7, xmm8
+    ENDM
+
+aes_dec_rnd MACRO i
+    movdqu  xmm8, [i*16 + ctx]
+    aesdec  xmm0, xmm8
+    aesdec  xmm1, xmm8
+    aesdec  xmm2, xmm8
+    aesdec  xmm3, xmm8
+    aesdec  xmm4, xmm8
+    aesdec  xmm5, xmm8
+    aesdec  xmm6, xmm8
+    aesdec  xmm7, xmm8
+    ENDM
+
+aes_dec_last_rnd MACRO i
+    movdqu  xmm8, [i*16 + ctx]
+    aesdeclast  xmm0, xmm8
+    aesdeclast  xmm1, xmm8
+    aesdeclast  xmm2, xmm8
+    aesdeclast  xmm3, xmm8
+    aesdeclast  xmm4, xmm8
+    aesdeclast  xmm5, xmm8
+    aesdeclast  xmm6, xmm8
+    aesdeclast  xmm7, xmm8
+    ENDM
+
+
+gen_aes_ecb_func MACRO enc, rnds
+
+LOCAL   loop8
+LOCAL   loop1
+LOCAL   bail
+
+        xor     inputLen, inputLen
+        mov     input,      [rsp + 1*8 + 8*4]
+        mov     inputLen,   [rsp + 1*8 + 8*5]
+
+        sub     rsp, 3*16
+
+        movdqu  [rsp + 0*16], xmm6
+        movdqu  [rsp + 1*16], xmm7
+        movdqu  [rsp + 2*16], xmm8
+
+        lea     ctx, [48+ctx]
+
+loop8:
+        cmp     inputLen, 8*16
+        jb      loop1
+
+        movdqu  xmm0, [0*16 + input]
+        movdqu  xmm1, [1*16 + input]
+        movdqu  xmm2, [2*16 + input]
+        movdqu  xmm3, [3*16 + input]
+        movdqu  xmm4, [4*16 + input]
+        movdqu  xmm5, [5*16 + input]
+        movdqu  xmm6, [6*16 + input]
+        movdqu  xmm7, [7*16 + input]
+
+        movdqu  xmm8, [0*16 + ctx]
+        pxor    xmm0, xmm8
+        pxor    xmm1, xmm8
+        pxor    xmm2, xmm8
+        pxor    xmm3, xmm8
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm8
+        pxor    xmm6, xmm8
+        pxor    xmm7, xmm8
+
+IF enc eq 1
+        rnd textequ <aes_rnd>
+        lastrnd textequ <aes_last_rnd>
+        aesinst textequ <aesenc>
+        aeslastinst textequ <aesenclast>
+ELSE
+        rnd textequ <aes_dec_rnd>
+        lastrnd textequ <aes_dec_last_rnd>
+        aesinst textequ <aesdec>
+        aeslastinst textequ <aesdeclast>
+ENDIF
+
+        i = 1
+        WHILE i LT rnds
+            rnd i
+            i = i+1
+            ENDM
+        lastrnd rnds
+
+        movdqu  [0*16 + output], xmm0
+        movdqu  [1*16 + output], xmm1
+        movdqu  [2*16 + output], xmm2
+        movdqu  [3*16 + output], xmm3
+        movdqu  [4*16 + output], xmm4
+        movdqu  [5*16 + output], xmm5
+        movdqu  [6*16 + output], xmm6
+        movdqu  [7*16 + output], xmm7
+
+        lea input, [8*16 + input]
+        lea output, [8*16 + output]
+        sub inputLen, 8*16
+        jmp loop8
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm0, [input]
+        movdqu  xmm7, [0*16 + ctx]
+        pxor    xmm0, xmm7
+
+        i = 1
+    WHILE i LT rnds
+            movdqu  xmm7, [i*16 + ctx]
+            aesinst  xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu  xmm7, [rnds*16 + ctx]
+        aeslastinst xmm0, xmm7
+
+        movdqu  [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        xor rax, rax
+
+        movdqu  xmm6, [rsp + 0*16]
+        movdqu  xmm7, [rsp + 1*16]
+        movdqu  xmm8, [rsp + 2*16]
+        add     rsp, 3*16
+        ret
+ENDM
+
+intel_aes_encrypt_ecb_128 PROC
+gen_aes_ecb_func 1, 10
+intel_aes_encrypt_ecb_128 ENDP
+
+intel_aes_encrypt_ecb_192 PROC
+gen_aes_ecb_func 1, 12
+intel_aes_encrypt_ecb_192 ENDP
+
+intel_aes_encrypt_ecb_256 PROC
+gen_aes_ecb_func 1, 14
+intel_aes_encrypt_ecb_256 ENDP
+
+intel_aes_decrypt_ecb_128 PROC
+gen_aes_ecb_func 0, 10
+intel_aes_decrypt_ecb_128 ENDP
+
+intel_aes_decrypt_ecb_192 PROC
+gen_aes_ecb_func 0, 12
+intel_aes_decrypt_ecb_192 ENDP
+
+intel_aes_decrypt_ecb_256 PROC
+gen_aes_ecb_func 0, 14
+intel_aes_decrypt_ecb_256 ENDP
+
+
+KEY textequ <rcx>
+KS  textequ <rdx>
+ITR textequ <r8>
+
+intel_aes_encrypt_init_128  PROC
+
+    movdqu  xmm1, [KEY]
+    movdqu  [KS], xmm1
+    movdqa  xmm2, xmm1
+
+    lea ITR, Lcon1
+    movdqa  xmm0, [ITR]
+    lea ITR, Lmask
+    movdqa  xmm4, [ITR]
+
+    mov ITR, 8
+
+Lenc_128_ks_loop:
+        lea KS, [16 + KS]
+        dec ITR
+
+        pshufb  xmm2, xmm4
+        aesenclast  xmm2, xmm0
+        pslld   xmm0, 1
+        movdqa  xmm3, xmm1
+        pslldq  xmm3, 4
+        pxor    xmm1, xmm3
+        pslldq  xmm3, 4
+        pxor    xmm1, xmm3
+        pslldq  xmm3, 4
+        pxor    xmm1, xmm3
+        pxor    xmm1, xmm2
+        movdqu  [KS], xmm1
+        movdqa  xmm2, xmm1
+
+        jne Lenc_128_ks_loop
+
+    lea ITR, Lcon2
+    movdqa  xmm0, [ITR]
+
+    pshufb  xmm2, xmm4
+    aesenclast  xmm2, xmm0
+    pslld   xmm0, 1
+    movdqa  xmm3, xmm1
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pxor    xmm1, xmm2
+    movdqu  [16 + KS], xmm1
+    movdqa  xmm2, xmm1
+
+    pshufb  xmm2, xmm4
+    aesenclast  xmm2, xmm0
+    movdqa  xmm3, xmm1
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pxor    xmm1, xmm2
+    movdqu  [32 + KS], xmm1
+    movdqa  xmm2, xmm1
+
+    ret
+intel_aes_encrypt_init_128  ENDP
+
+
+intel_aes_decrypt_init_128  PROC
+
+    push    KS
+    push    KEY
+
+    call    intel_aes_encrypt_init_128
+
+    pop     KEY
+    pop     KS
+
+    movdqu  xmm0, [0*16 + KS]
+    movdqu  xmm1, [10*16 + KS]
+    movdqu  [10*16 + KS], xmm0
+    movdqu  [0*16 + KS], xmm1
+
+    i = 1
+    WHILE i LT 5
+        movdqu  xmm0, [i*16 + KS]
+        movdqu  xmm1, [(10-i)*16 + KS]
+
+        aesimc  xmm0, xmm0
+        aesimc  xmm1, xmm1
+
+        movdqu  [(10-i)*16 + KS], xmm0
+        movdqu  [i*16 + KS], xmm1
+
+        i = i+1
+    ENDM
+
+    movdqu  xmm0, [5*16 + KS]
+    aesimc  xmm0, xmm0
+    movdqu  [5*16 + KS], xmm0
+    ret
+intel_aes_decrypt_init_128  ENDP
+
+
+intel_aes_encrypt_init_192  PROC
+
+    sub     rsp, 16*2
+    movdqu  [16*0 + rsp], xmm6
+    movdqu  [16*1 + rsp], xmm7
+
+    movdqu  xmm1, [KEY]
+    mov     ITR, [16 + KEY]
+    movd    xmm3, ITR
+
+    movdqu  [KS], xmm1
+    movdqa  xmm5, xmm3
+
+    lea ITR, Lcon1
+    movdqu  xmm0, [ITR]
+    lea ITR, Lmask192
+    movdqu  xmm4, [ITR]
+
+    mov ITR, 4
+
+Lenc_192_ks_loop:
+        movdqa  xmm2, xmm3
+        pshufb  xmm2, xmm4
+        aesenclast xmm2, xmm0
+        pslld   xmm0, 1
+
+        movdqa  xmm6, xmm1
+        movdqa  xmm7, xmm3
+        pslldq  xmm6, 4
+        pslldq  xmm7, 4
+        pxor    xmm1, xmm6
+        pxor    xmm3, xmm7
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pxor    xmm1, xmm2
+        pshufd  xmm2, xmm1, 0ffh
+        pxor    xmm3, xmm2
+
+        movdqa  xmm6, xmm1
+        shufpd  xmm5, xmm1, 00h
+        shufpd  xmm6, xmm3, 01h
+
+        movdqu  [16 + KS], xmm5
+        movdqu  [32 + KS], xmm6
+
+        movdqa  xmm2, xmm3
+        pshufb  xmm2, xmm4
+        aesenclast  xmm2, xmm0
+        pslld   xmm0, 1
+
+        movdqa  xmm6, xmm1
+        movdqa  xmm7, xmm3
+        pslldq  xmm6, 4
+        pslldq  xmm7, 4
+        pxor    xmm1, xmm6
+        pxor    xmm3, xmm7
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pxor    xmm1, xmm2
+        pshufd  xmm2, xmm1, 0ffh
+        pxor    xmm3, xmm2
+
+        movdqu  [48 + KS], xmm1
+        movdqa  xmm5, xmm3
+
+        lea KS, [48 + KS]
+
+        dec ITR
+        jnz Lenc_192_ks_loop
+
+    movdqu  [16 + KS], xmm5
+
+    movdqu  xmm7, [16*1 + rsp]
+    movdqu  xmm6, [16*0 + rsp]
+    add rsp, 16*2
+    ret
+intel_aes_encrypt_init_192  ENDP
+
+intel_aes_decrypt_init_192  PROC
+    push    KS
+    push    KEY
+
+    call    intel_aes_encrypt_init_192
+
+    pop     KEY
+    pop     KS
+
+    movdqu  xmm0, [0*16 + KS]
+    movdqu  xmm1, [12*16 + KS]
+    movdqu  [12*16 + KS], xmm0
+    movdqu  [0*16 + KS], xmm1
+
+    i = 1
+    WHILE i LT 6
+        movdqu  xmm0, [i*16 + KS]
+        movdqu  xmm1, [(12-i)*16 + KS]
+
+        aesimc  xmm0, xmm0
+        aesimc  xmm1, xmm1
+
+        movdqu  [(12-i)*16 + KS], xmm0
+        movdqu  [i*16 + KS], xmm1
+
+        i = i+1
+    ENDM
+
+    movdqu  xmm0, [6*16 + KS]
+    aesimc  xmm0, xmm0
+    movdqu  [6*16 + KS], xmm0
+    ret
+intel_aes_decrypt_init_192  ENDP
+
+
+intel_aes_encrypt_init_256  PROC
+    sub     rsp, 16*2
+    movdqu  [16*0 + rsp], xmm6
+    movdqu  [16*1 + rsp], xmm7
+
+    movdqu  xmm1, [16*0 + KEY]
+    movdqu  xmm3, [16*1 + KEY]
+
+    movdqu  [16*0 + KS], xmm1
+    movdqu  [16*1 + KS], xmm3
+
+    lea ITR, Lcon1
+    movdqu  xmm0, [ITR]
+    lea ITR, Lmask256
+    movdqu  xmm5, [ITR]
+
+    pxor    xmm6, xmm6
+
+    mov ITR, 6
+
+Lenc_256_ks_loop:
+
+        movdqa  xmm2, xmm3
+        pshufb  xmm2, xmm5
+        aesenclast  xmm2, xmm0
+        pslld   xmm0, 1
+        movdqa  xmm4, xmm1
+        pslldq  xmm4, 4
+        pxor    xmm1, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm1, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm1, xmm4
+        pxor    xmm1, xmm2
+        movdqu  [16*2 + KS], xmm1
+
+        pshufd  xmm2, xmm1, 0ffh
+        aesenclast  xmm2, xmm6
+        movdqa  xmm4, xmm3
+        pslldq  xmm4, 4
+        pxor    xmm3, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm3, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm3, xmm4
+        pxor    xmm3, xmm2
+        movdqu  [16*3 + KS], xmm3
+
+        lea KS, [32 + KS]
+        dec ITR
+        jnz Lenc_256_ks_loop
+
+    movdqa  xmm2, xmm3
+    pshufb  xmm2, xmm5
+    aesenclast  xmm2, xmm0
+    movdqa  xmm4, xmm1
+    pslldq  xmm4, 4
+    pxor    xmm1, xmm4
+    pslldq  xmm4, 4
+    pxor    xmm1, xmm4
+    pslldq  xmm4, 4
+    pxor    xmm1, xmm4
+    pxor    xmm1, xmm2
+    movdqu  [16*2 + KS], xmm1
+
+    movdqu  xmm7, [16*1 + rsp]
+    movdqu  xmm6, [16*0 + rsp]
+    add rsp, 16*2
+    ret
+
+intel_aes_encrypt_init_256  ENDP
+
+
+intel_aes_decrypt_init_256  PROC
+    push    KS
+    push    KEY
+
+    call    intel_aes_encrypt_init_256
+
+    pop     KEY
+    pop     KS
+
+    movdqu  xmm0, [0*16 + KS]
+    movdqu  xmm1, [14*16 + KS]
+    movdqu  [14*16 + KS], xmm0
+    movdqu  [0*16 + KS], xmm1
+
+    i = 1
+    WHILE i LT 7
+        movdqu  xmm0, [i*16 + KS]
+        movdqu  xmm1, [(14-i)*16 + KS]
+
+        aesimc  xmm0, xmm0
+        aesimc  xmm1, xmm1
+
+        movdqu  [(14-i)*16 + KS], xmm0
+        movdqu  [i*16 + KS], xmm1
+
+        i = i+1
+    ENDM
+
+    movdqu  xmm0, [7*16 + KS]
+    aesimc  xmm0, xmm0
+    movdqu  [7*16 + KS], xmm0
+    ret
+intel_aes_decrypt_init_256  ENDP
+
+
+
+gen_aes_cbc_enc_func MACRO rnds
+
+LOCAL   loop1
+LOCAL   bail
+
+        mov     input,      [rsp + 1*8 + 8*4]
+        mov     inputLen,   [rsp + 1*8 + 8*5]
+
+        sub     rsp, 3*16
+
+        movdqu  [rsp + 0*16], xmm6
+        movdqu  [rsp + 1*16], xmm7
+        movdqu  [rsp + 2*16], xmm8
+
+        lea     ctx, [48+ctx]
+
+        movdqu  xmm0, [-32+ctx]
+
+        movdqu  xmm2, [0*16 + ctx]
+        movdqu  xmm3, [1*16 + ctx]
+        movdqu  xmm4, [2*16 + ctx]
+        movdqu  xmm5, [3*16 + ctx]
+        movdqu  xmm6, [4*16 + ctx]
+        movdqu  xmm7, [5*16 + ctx]
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm1, [input]
+        pxor    xmm1, xmm2
+        pxor    xmm0, xmm1
+
+        aesenc  xmm0, xmm3
+        aesenc  xmm0, xmm4
+        aesenc  xmm0, xmm5
+        aesenc  xmm0, xmm6
+        aesenc  xmm0, xmm7
+
+        i = 6
+    WHILE i LT rnds
+            movdqu  xmm8, [i*16 + ctx]
+            aesenc  xmm0, xmm8
+            i = i+1
+        ENDM
+        movdqu  xmm8, [rnds*16 + ctx]
+        aesenclast xmm0, xmm8
+
+        movdqu  [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        movdqu  [-32+ctx], xmm0
+
+        xor rax, rax
+
+        movdqu  xmm6, [rsp + 0*16]
+        movdqu  xmm7, [rsp + 1*16]
+        movdqu  xmm8, [rsp + 2*16]
+        add     rsp, 3*16
+        ret
+
+ENDM
+
+gen_aes_cbc_dec_func MACRO rnds
+
+LOCAL   loop8
+LOCAL   loop1
+LOCAL   dec1
+LOCAL   bail
+
+        mov     input,      [rsp + 1*8 + 8*4]
+        mov     inputLen,   [rsp + 1*8 + 8*5]
+
+        sub     rsp, 3*16
+
+        movdqu  [rsp + 0*16], xmm6
+        movdqu  [rsp + 1*16], xmm7
+        movdqu  [rsp + 2*16], xmm8
+
+        lea     ctx, [48+ctx]
+
+loop8:
+        cmp     inputLen, 8*16
+        jb      dec1
+
+        movdqu  xmm0, [0*16 + input]
+        movdqu  xmm1, [1*16 + input]
+        movdqu  xmm2, [2*16 + input]
+        movdqu  xmm3, [3*16 + input]
+        movdqu  xmm4, [4*16 + input]
+        movdqu  xmm5, [5*16 + input]
+        movdqu  xmm6, [6*16 + input]
+        movdqu  xmm7, [7*16 + input]
+
+        movdqu  xmm8, [0*16 + ctx]
+        pxor    xmm0, xmm8
+        pxor    xmm1, xmm8
+        pxor    xmm2, xmm8
+        pxor    xmm3, xmm8
+        pxor    xmm4, xmm8
+        pxor    xmm5, xmm8
+        pxor    xmm6, xmm8
+        pxor    xmm7, xmm8
+
+        i = 1
+        WHILE i LT rnds
+            aes_dec_rnd i
+            i = i+1
+            ENDM
+        aes_dec_last_rnd rnds
+
+        movdqu  xmm8, [-32 + ctx]
+        pxor    xmm0, xmm8
+        movdqu  xmm8, [0*16 + input]
+        pxor    xmm1, xmm8
+        movdqu  xmm8, [1*16 + input]
+        pxor    xmm2, xmm8
+        movdqu  xmm8, [2*16 + input]
+        pxor    xmm3, xmm8
+        movdqu  xmm8, [3*16 + input]
+        pxor    xmm4, xmm8
+        movdqu  xmm8, [4*16 + input]
+        pxor    xmm5, xmm8
+        movdqu  xmm8, [5*16 + input]
+        pxor    xmm6, xmm8
+        movdqu  xmm8, [6*16 + input]
+
+        movdqu  [0*16 + output], xmm0
+        movdqu  [1*16 + output], xmm1
+        movdqu  [2*16 + output], xmm2
+        movdqu  [3*16 + output], xmm3
+        movdqu  [4*16 + output], xmm4
+        movdqu  [5*16 + output], xmm5
+        movdqu  [6*16 + output], xmm6
+        movdqu  [7*16 + output], xmm7
+        movdqu  [-32 + ctx], xmm8
+
+        lea input, [8*16 + input]
+        lea output, [8*16 + output]
+        sub inputLen, 8*16
+        jmp loop8
+dec1:
+
+        movdqu  xmm3, [-32 + ctx]
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm0, [input]
+        movdqa  xmm4, xmm0
+        movdqu  xmm7, [0*16 + ctx]
+        pxor    xmm0, xmm7
+
+        i = 1
+    WHILE i LT rnds
+            movdqu  xmm7, [i*16 + ctx]
+            aesdec  xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu  xmm7, [rnds*16 + ctx]
+        aesdeclast xmm0, xmm7
+        pxor    xmm3, xmm0
+
+        movdqu  [output], xmm3
+        movdqa  xmm3, xmm4
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        movdqu  [-32 + ctx], xmm3
+        xor rax, rax
+
+        movdqu  xmm6, [rsp + 0*16]
+        movdqu  xmm7, [rsp + 1*16]
+        movdqu  xmm8, [rsp + 2*16]
+        add     rsp, 3*16
+        ret
+ENDM
+
+intel_aes_encrypt_cbc_128 PROC
+gen_aes_cbc_enc_func  10
+intel_aes_encrypt_cbc_128 ENDP
+
+intel_aes_encrypt_cbc_192 PROC
+gen_aes_cbc_enc_func  12
+intel_aes_encrypt_cbc_192 ENDP
+
+intel_aes_encrypt_cbc_256 PROC
+gen_aes_cbc_enc_func  14
+intel_aes_encrypt_cbc_256 ENDP
+
+intel_aes_decrypt_cbc_128 PROC
+gen_aes_cbc_dec_func  10
+intel_aes_decrypt_cbc_128 ENDP
+
+intel_aes_decrypt_cbc_192 PROC
+gen_aes_cbc_dec_func  12
+intel_aes_decrypt_cbc_192 ENDP
+
+intel_aes_decrypt_cbc_256 PROC
+gen_aes_cbc_dec_func  14
+intel_aes_decrypt_cbc_256 ENDP
+
+
+
+ctrCtx textequ <r10>
+CTR textequ <r11d>
+CTRSave textequ <eax>
+
+gen_aes_ctr_func MACRO rnds
+
+LOCAL   loop8
+LOCAL   loop1
+LOCAL   enc1
+LOCAL   bail
+
+        mov     input,      [rsp + 8*1 + 4*8]
+        mov     inputLen,   [rsp + 8*1 + 5*8]
+
+        mov     ctrCtx, ctx
+        mov     ctx, [8+ctrCtx]
+        lea     ctx, [48+ctx]
+
+        sub     rsp, 3*16
+        movdqu  [rsp + 0*16], xmm6
+        movdqu  [rsp + 1*16], xmm7
+        movdqu  [rsp + 2*16], xmm8
+
+
+        push    rbp
+        mov     rbp, rsp
+        sub     rsp, 8*16
+        and     rsp, -16
+
+
+        movdqu  xmm0, [16+ctrCtx]
+        mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
+        bswap   CTRSave
+        movdqu  xmm1, [ctx + 0*16]
+
+        pxor    xmm0, xmm1
+
+        movdqa  [rsp + 0*16], xmm0
+        movdqa  [rsp + 1*16], xmm0
+        movdqa  [rsp + 2*16], xmm0
+        movdqa  [rsp + 3*16], xmm0
+        movdqa  [rsp + 4*16], xmm0
+        movdqa  [rsp + 5*16], xmm0
+        movdqa  [rsp + 6*16], xmm0
+        movdqa  [rsp + 7*16], xmm0
+
+        inc     CTRSave
+        mov     CTR, CTRSave
+        bswap   CTR
+        xor     CTR, DWORD PTR [ctx + 3*4]
+        mov     DWORD PTR [rsp + 1*16 + 3*4], CTR
+
+        inc     CTRSave
+        mov     CTR, CTRSave
+        bswap   CTR
+        xor     CTR, DWORD PTR [ctx + 3*4]
+        mov     DWORD PTR [rsp + 2*16 + 3*4], CTR
+
+        inc     CTRSave
+        mov     CTR, CTRSave
+        bswap   CTR
+        xor     CTR, DWORD PTR [ctx + 3*4]
+        mov     DWORD PTR [rsp + 3*16 + 3*4], CTR
+
+        inc     CTRSave
+        mov     CTR, CTRSave
+        bswap   CTR
+        xor     CTR, DWORD PTR [ctx + 3*4]
+        mov     DWORD PTR [rsp + 4*16 + 3*4], CTR
+
+        inc     CTRSave
+        mov     CTR, CTRSave
+        bswap   CTR
+        xor     CTR, DWORD PTR [ctx + 3*4]
+        mov     DWORD PTR [rsp + 5*16 + 3*4], CTR
+
+        inc     CTRSave
+        mov     CTR, CTRSave
+        bswap   CTR
+        xor     CTR, DWORD PTR [ctx + 3*4]
+        mov     DWORD PTR [rsp + 6*16 + 3*4], CTR
+
+        inc     CTRSave
+        mov     CTR, CTRSave
+        bswap   CTR
+        xor     CTR, DWORD PTR [ctx + 3*4]
+        mov     DWORD PTR [rsp + 7*16 + 3*4], CTR
+
+
+loop8:
+        cmp     inputLen, 8*16
+        jb      loop1
+
+        movdqu  xmm0, [0*16 + rsp]
+        movdqu  xmm1, [1*16 + rsp]
+        movdqu  xmm2, [2*16 + rsp]
+        movdqu  xmm3, [3*16 + rsp]
+        movdqu  xmm4, [4*16 + rsp]
+        movdqu  xmm5, [5*16 + rsp]
+        movdqu  xmm6, [6*16 + rsp]
+        movdqu  xmm7, [7*16 + rsp]
+
+        i = 1
+        WHILE i LE 8
+            aes_rnd i
+
+            inc     CTRSave
+            mov     CTR, CTRSave
+            bswap   CTR
+            xor     CTR, DWORD PTR [ctx + 3*4]
+            mov     DWORD PTR [rsp + (i-1)*16 + 3*4], CTR
+
+            i = i+1
+        ENDM
+        WHILE i LT rnds
+            aes_rnd i
+            i = i+1
+            ENDM
+        aes_last_rnd rnds
+
+        movdqu  xmm8, [0*16 + input]
+        pxor    xmm0, xmm8
+        movdqu  xmm8, [1*16 + input]
+        pxor    xmm1, xmm8
+        movdqu  xmm8, [2*16 + input]
+        pxor    xmm2, xmm8
+        movdqu  xmm8, [3*16 + input]
+        pxor    xmm3, xmm8
+        movdqu  xmm8, [4*16 + input]
+        pxor    xmm4, xmm8
+        movdqu  xmm8, [5*16 + input]
+        pxor    xmm5, xmm8
+        movdqu  xmm8, [6*16 + input]
+        pxor    xmm6, xmm8
+        movdqu  xmm8, [7*16 + input]
+        pxor    xmm7, xmm8
+
+        movdqu  [0*16 + output], xmm0
+        movdqu  [1*16 + output], xmm1
+        movdqu  [2*16 + output], xmm2
+        movdqu  [3*16 + output], xmm3
+        movdqu  [4*16 + output], xmm4
+        movdqu  [5*16 + output], xmm5
+        movdqu  [6*16 + output], xmm6
+        movdqu  [7*16 + output], xmm7
+
+        lea input, [8*16 + input]
+        lea output, [8*16 + output]
+        sub inputLen, 8*16
+        jmp loop8
+
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm0, [rsp]
+        add     rsp, 16
+
+        i = 1
+    WHILE i LT rnds
+            movdqu  xmm7, [i*16 + ctx]
+            aesenc  xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu  xmm7, [rnds*16 + ctx]
+        aesenclast xmm0, xmm7
+
+        movdqu  xmm7, [input]
+        pxor    xmm0, xmm7
+        movdqu  [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+
+        movdqu  xmm0, [rsp]
+        movdqu  xmm1, [ctx + 0*16]
+        pxor    xmm0, xmm1
+        movdqu  [16+ctrCtx], xmm0
+
+
+        xor     rax, rax
+        mov     rsp, rbp
+        pop     rbp
+
+        movdqu  xmm6, [rsp + 0*16]
+        movdqu  xmm7, [rsp + 1*16]
+        movdqu  xmm8, [rsp + 2*16]
+        add     rsp, 3*16
+
+        ret
+ENDM
+
+
+intel_aes_encrypt_ctr_128 PROC
+gen_aes_ctr_func  10
+intel_aes_encrypt_ctr_128 ENDP
+
+intel_aes_encrypt_ctr_192 PROC
+gen_aes_ctr_func  12
+intel_aes_encrypt_ctr_192 ENDP
+
+intel_aes_encrypt_ctr_256 PROC
+gen_aes_ctr_func  14
+intel_aes_encrypt_ctr_256 ENDP
+
+
+END
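For readers more at home with intrinsics, the round-key derivation in
Lenc_128_ks_loop above corresponds roughly to the C sketch below (illustrative
only, not part of the patch). It relies on AESENCLAST's ShiftRows being a
no-op when all four columns are equal, so only SubBytes and the rcon xor
remain:

    #include <emmintrin.h>   /* SSE2 */
    #include <tmmintrin.h>   /* _mm_shuffle_epi8 */
    #include <wmmintrin.h>   /* _mm_aesenclast_si128 */

    /* One AES-128 key-expansion step (sketch). prev is the previous round
     * key; rcon holds the round constant in every dword. The caller doubles
     * rcon each round (Lcon1) and switches to 1Bh for the last two rounds
     * (Lcon2), as the assembler does. */
    static __m128i aes128_expand_step(__m128i prev, __m128i rcon)
    {
        /* Broadcast RotWord(w3) into every dword (the Lmask shuffle),
         * then SubBytes it and xor in rcon via AESENCLAST. */
        const __m128i rotmask = _mm_set1_epi32(0x0c0f0e0d);
        __m128i t = _mm_aesenclast_si128(_mm_shuffle_epi8(prev, rotmask), rcon);

        /* Prefix-xor of the previous round key (the pslldq/pxor chain). */
        __m128i k = prev;
        k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
        k = _mm_xor_si128(k, _mm_slli_si128(k, 8));
        return _mm_xor_si128(k, t);
    }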
new file mode 100644
--- /dev/null
+++ b/lib/freebl/intel-aes-x86-masm.asm
@@ -0,0 +1,949 @@
+; LICENSE:
+; This submission to NSS is to be made available under the terms of the
+; Mozilla Public License, v. 2.0. You can obtain one at
+; http://mozilla.org/MPL/2.0/.
+;###############################################################################
+; Copyright(c) 2014, Intel Corp.
+; Developers and authors:
+; Shay Gueron and Vlad Krasnov
+; Intel Corporation, Israel Development Centre, Haifa, Israel
+; Please send feedback directly to crypto.feedback.alias@intel.com
+
+
+.MODEL FLAT, C
+.XMM
+
+.DATA
+ALIGN 16
+Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
+Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
+Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
+Lcon1 dd 1,1,1,1
+Lcon2 dd 1bh,1bh,1bh,1bh
+
+.CODE
+
+ctx     textequ <ecx>
+output  textequ <edx>
+input   textequ <eax>
+inputLen textequ <edi>
+
+
+aes_rnd MACRO i
+    movdqu  xmm7, [i*16 + ctx]
+    aesenc  xmm0, xmm7
+    aesenc  xmm1, xmm7
+    aesenc  xmm2, xmm7
+    aesenc  xmm3, xmm7
+    aesenc  xmm4, xmm7
+    aesenc  xmm5, xmm7
+    aesenc  xmm6, xmm7
+    ENDM
+
+aes_last_rnd MACRO i
+    movdqu  xmm7, [i*16 + ctx]
+    aesenclast  xmm0, xmm7
+    aesenclast  xmm1, xmm7
+    aesenclast  xmm2, xmm7
+    aesenclast  xmm3, xmm7
+    aesenclast  xmm4, xmm7
+    aesenclast  xmm5, xmm7
+    aesenclast  xmm6, xmm7
+    ENDM
+
+aes_dec_rnd MACRO i
+    movdqu  xmm7, [i*16 + ctx]
+    aesdec  xmm0, xmm7
+    aesdec  xmm1, xmm7
+    aesdec  xmm2, xmm7
+    aesdec  xmm3, xmm7
+    aesdec  xmm4, xmm7
+    aesdec  xmm5, xmm7
+    aesdec  xmm6, xmm7
+    ENDM
+
+aes_dec_last_rnd MACRO i
+    movdqu  xmm7, [i*16 + ctx]
+    aesdeclast  xmm0, xmm7
+    aesdeclast  xmm1, xmm7
+    aesdeclast  xmm2, xmm7
+    aesdeclast  xmm3, xmm7
+    aesdeclast  xmm4, xmm7
+    aesdeclast  xmm5, xmm7
+    aesdeclast  xmm6, xmm7
+    ENDM
+
+
+gen_aes_ecb_func MACRO enc, rnds
+
+LOCAL   loop7
+LOCAL   loop1
+LOCAL   bail
+
+        push    inputLen
+
+        mov     ctx,    [esp + 2*4 + 0*4]
+        mov     output,     [esp + 2*4 + 1*4]
+        mov     input,      [esp + 2*4 + 4*4]
+        mov     inputLen,   [esp + 2*4 + 5*4]
+
+        lea     ctx, [44+ctx]
+
+loop7:
+        cmp     inputLen, 7*16
+        jb      loop1
+
+        movdqu  xmm0, [0*16 + input]
+        movdqu  xmm1, [1*16 + input]
+        movdqu  xmm2, [2*16 + input]
+        movdqu  xmm3, [3*16 + input]
+        movdqu  xmm4, [4*16 + input]
+        movdqu  xmm5, [5*16 + input]
+        movdqu  xmm6, [6*16 + input]
+
+        movdqu  xmm7, [0*16 + ctx]
+        pxor    xmm0, xmm7
+        pxor    xmm1, xmm7
+        pxor    xmm2, xmm7
+        pxor    xmm3, xmm7
+        pxor    xmm4, xmm7
+        pxor    xmm5, xmm7
+        pxor    xmm6, xmm7
+
+IF enc eq 1
+        rnd textequ <aes_rnd>
+        lastrnd textequ <aes_last_rnd>
+        aesinst textequ <aesenc>
+        aeslastinst textequ <aesenclast>
+ELSE
+        rnd textequ <aes_dec_rnd>
+        lastrnd textequ <aes_dec_last_rnd>
+        aesinst textequ <aesdec>
+        aeslastinst textequ <aesdeclast>
+ENDIF
+
+        i = 1
+        WHILE i LT rnds
+            rnd i
+            i = i+1
+            ENDM
+        lastrnd rnds
+
+        movdqu  [0*16 + output], xmm0
+        movdqu  [1*16 + output], xmm1
+        movdqu  [2*16 + output], xmm2
+        movdqu  [3*16 + output], xmm3
+        movdqu  [4*16 + output], xmm4
+        movdqu  [5*16 + output], xmm5
+        movdqu  [6*16 + output], xmm6
+
+        lea input, [7*16 + input]
+        lea output, [7*16 + output]
+        sub inputLen, 7*16
+        jmp loop7
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm0, [input]
+        movdqu  xmm7, [0*16 + ctx]
+        pxor    xmm0, xmm7
+
+        i = 1
+    WHILE i LT rnds
+            movdqu  xmm7, [i*16 + ctx]
+            aesinst  xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu  xmm7, [rnds*16 + ctx]
+        aeslastinst xmm0, xmm7
+
+        movdqu  [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        xor eax, eax
+        pop     inputLen
+        ret
+
+ENDM
+
+ALIGN 16
+intel_aes_encrypt_ecb_128 PROC
+gen_aes_ecb_func 1, 10
+intel_aes_encrypt_ecb_128 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ecb_192 PROC
+gen_aes_ecb_func 1, 12
+intel_aes_encrypt_ecb_192 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ecb_256 PROC
+gen_aes_ecb_func 1, 14
+intel_aes_encrypt_ecb_256 ENDP
+
+ALIGN 16
+intel_aes_decrypt_ecb_128 PROC
+gen_aes_ecb_func 0, 10
+intel_aes_decrypt_ecb_128 ENDP
+
+ALIGN 16
+intel_aes_decrypt_ecb_192 PROC
+gen_aes_ecb_func 0, 12
+intel_aes_decrypt_ecb_192 ENDP
+
+ALIGN 16
+intel_aes_decrypt_ecb_256 PROC
+gen_aes_ecb_func 0, 14
+intel_aes_decrypt_ecb_256 ENDP
+
+
+KEY textequ <ecx>
+KS  textequ <edx>
+ITR textequ <eax>
+
+ALIGN 16
+intel_aes_encrypt_init_128  PROC
+
+    mov     KEY,        [esp + 1*4 + 0*4]
+    mov     KS,         [esp + 1*4 + 1*4]
+
+
+    movdqu  xmm1, [KEY]
+    movdqu  [KS], xmm1
+    movdqa  xmm2, xmm1
+
+    lea ITR, Lcon1
+    movdqa  xmm0, [ITR]
+    lea ITR, Lmask
+    movdqa  xmm4, [ITR]
+
+    mov ITR, 8
+
+Lenc_128_ks_loop:
+        lea KS, [16 + KS]
+        dec ITR
+
+        pshufb  xmm2, xmm4
+        aesenclast  xmm2, xmm0
+        pslld   xmm0, 1
+        movdqa  xmm3, xmm1
+        pslldq  xmm3, 4
+        pxor    xmm1, xmm3
+        pslldq  xmm3, 4
+        pxor    xmm1, xmm3
+        pslldq  xmm3, 4
+        pxor    xmm1, xmm3
+        pxor    xmm1, xmm2
+        movdqu  [KS], xmm1
+        movdqa  xmm2, xmm1
+
+        jne Lenc_128_ks_loop
+
+    lea ITR, Lcon2
+    movdqa  xmm0, [ITR]
+
+    pshufb  xmm2, xmm4
+    aesenclast  xmm2, xmm0
+    pslld   xmm0, 1
+    movdqa  xmm3, xmm1
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pxor    xmm1, xmm2
+    movdqu  [16 + KS], xmm1
+    movdqa  xmm2, xmm1
+
+    pshufb  xmm2, xmm4
+    aesenclast  xmm2, xmm0
+    movdqa  xmm3, xmm1
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pslldq  xmm3, 4
+    pxor    xmm1, xmm3
+    pxor    xmm1, xmm2
+    movdqu  [32 + KS], xmm1
+    movdqa  xmm2, xmm1
+
+    ret
+intel_aes_encrypt_init_128  ENDP
+
+
+ALIGN 16
+intel_aes_decrypt_init_128  PROC
+
+    mov     KEY,        [esp + 1*4 + 0*4]
+    mov     KS,         [esp + 1*4 + 1*4]
+
+    push    KS
+    push    KEY
+
+    call    intel_aes_encrypt_init_128
+
+    pop     KEY
+    pop     KS
+
+    movdqu  xmm0, [0*16 + KS]
+    movdqu  xmm1, [10*16 + KS]
+    movdqu  [10*16 + KS], xmm0
+    movdqu  [0*16 + KS], xmm1
+
+    i = 1
+    WHILE i LT 5
+        movdqu  xmm0, [i*16 + KS]
+        movdqu  xmm1, [(10-i)*16 + KS]
+
+        aesimc  xmm0, xmm0
+        aesimc  xmm1, xmm1
+
+        movdqu  [(10-i)*16 + KS], xmm0
+        movdqu  [i*16 + KS], xmm1
+
+        i = i+1
+    ENDM
+
+    movdqu  xmm0, [5*16 + KS]
+    aesimc  xmm0, xmm0
+    movdqu  [5*16 + KS], xmm0
+    ret
+intel_aes_decrypt_init_128  ENDP
+
+
+ALIGN 16
+intel_aes_encrypt_init_192  PROC
+
+    mov     KEY, [esp + 1*4 + 0*4]
+    mov     KS,  [esp + 1*4 + 1*4]
+
+    pxor    xmm3, xmm3
+    movdqu  xmm1, [KEY]
+    pinsrd  xmm3, DWORD PTR [16 + KEY], 0
+    pinsrd  xmm3, DWORD PTR [20 + KEY], 1
+
+    movdqu  [KS], xmm1
+    movdqa  xmm5, xmm3
+
+    lea ITR, Lcon1
+    movdqu  xmm0, [ITR]
+    lea ITR, Lmask192
+    movdqu  xmm4, [ITR]
+
+    mov ITR, 4
+
+Lenc_192_ks_loop:
+        movdqa  xmm2, xmm3
+        pshufb  xmm2, xmm4
+        aesenclast xmm2, xmm0
+        pslld   xmm0, 1
+
+        movdqa  xmm6, xmm1
+        movdqa  xmm7, xmm3
+        pslldq  xmm6, 4
+        pslldq  xmm7, 4
+        pxor    xmm1, xmm6
+        pxor    xmm3, xmm7
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pxor    xmm1, xmm2
+        pshufd  xmm2, xmm1, 0ffh
+        pxor    xmm3, xmm2
+
+        movdqa  xmm6, xmm1
+        shufpd  xmm5, xmm1, 00h
+        shufpd  xmm6, xmm3, 01h
+
+        movdqu  [16 + KS], xmm5
+        movdqu  [32 + KS], xmm6
+
+        movdqa  xmm2, xmm3
+        pshufb  xmm2, xmm4
+        aesenclast  xmm2, xmm0
+        pslld   xmm0, 1
+
+        movdqa  xmm6, xmm1
+        movdqa  xmm7, xmm3
+        pslldq  xmm6, 4
+        pslldq  xmm7, 4
+        pxor    xmm1, xmm6
+        pxor    xmm3, xmm7
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pslldq  xmm6, 4
+        pxor    xmm1, xmm6
+        pxor    xmm1, xmm2
+        pshufd  xmm2, xmm1, 0ffh
+        pxor    xmm3, xmm2
+
+        movdqu  [48 + KS], xmm1
+        movdqa  xmm5, xmm3
+
+        lea KS, [48 + KS]
+
+        dec ITR
+        jnz Lenc_192_ks_loop
+
+    movdqu  [16 + KS], xmm5
+    ret
+intel_aes_encrypt_init_192  ENDP
+
+ALIGN 16
+intel_aes_decrypt_init_192  PROC
+    mov     KEY,        [esp + 1*4 + 0*4]
+    mov     KS,         [esp + 1*4 + 1*4]
+
+    push    KS
+    push    KEY
+
+    call    intel_aes_encrypt_init_192
+
+    pop     KEY
+    pop     KS
+
+    movdqu  xmm0, [0*16 + KS]
+    movdqu  xmm1, [12*16 + KS]
+    movdqu  [12*16 + KS], xmm0
+    movdqu  [0*16 + KS], xmm1
+
+    i = 1
+    WHILE i LT 6
+        movdqu  xmm0, [i*16 + KS]
+        movdqu  xmm1, [(12-i)*16 + KS]
+
+        aesimc  xmm0, xmm0
+        aesimc  xmm1, xmm1
+
+        movdqu  [(12-i)*16 + KS], xmm0
+        movdqu  [i*16 + KS], xmm1
+
+        i = i+1
+    ENDM
+
+    movdqu  xmm0, [6*16 + KS]
+    aesimc  xmm0, xmm0
+    movdqu  [6*16 + KS], xmm0
+    ret
+intel_aes_decrypt_init_192  ENDP
+
+ALIGN 16
+intel_aes_encrypt_init_256  PROC
+
+    mov     KEY,    [esp + 1*4 + 0*4]
+    mov     KS,     [esp + 1*4 + 1*4]
+    movdqu  xmm1, [16*0 + KEY]
+    movdqu  xmm3, [16*1 + KEY]
+
+    movdqu  [16*0 + KS], xmm1
+    movdqu  [16*1 + KS], xmm3
+
+    lea ITR, Lcon1
+    movdqu  xmm0, [ITR]
+    lea ITR, Lmask256
+    movdqu  xmm5, [ITR]
+
+    pxor    xmm6, xmm6
+
+    mov ITR, 6
+
+Lenc_256_ks_loop:
+
+        movdqa  xmm2, xmm3
+        pshufb  xmm2, xmm5
+        aesenclast  xmm2, xmm0
+        pslld   xmm0, 1
+        movdqa  xmm4, xmm1
+        pslldq  xmm4, 4
+        pxor    xmm1, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm1, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm1, xmm4
+        pxor    xmm1, xmm2
+        movdqu  [16*2 + KS], xmm1
+
+        pshufd  xmm2, xmm1, 0ffh
+        aesenclast  xmm2, xmm6
+        movdqa  xmm4, xmm3
+        pslldq  xmm4, 4
+        pxor    xmm3, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm3, xmm4
+        pslldq  xmm4, 4
+        pxor    xmm3, xmm4
+        pxor    xmm3, xmm2
+        movdqu  [16*3 + KS], xmm3
+
+        lea KS, [32 + KS]
+        dec ITR
+        jnz Lenc_256_ks_loop
+
+    movdqa  xmm2, xmm3
+    pshufb  xmm2, xmm5
+    aesenclast  xmm2, xmm0
+    movdqa  xmm4, xmm1
+    pslldq  xmm4, 4
+    pxor    xmm1, xmm4
+    pslldq  xmm4, 4
+    pxor    xmm1, xmm4
+    pslldq  xmm4, 4
+    pxor    xmm1, xmm4
+    pxor    xmm1, xmm2
+    movdqu  [16*2 + KS], xmm1
+
+    ret
+intel_aes_encrypt_init_256  ENDP
+
+ALIGN 16
+intel_aes_decrypt_init_256  PROC
+    mov     KEY,        [esp + 1*4 + 0*4]
+    mov     KS,         [esp + 1*4 + 1*4]
+
+    push    KS
+    push    KEY
+
+    call    intel_aes_encrypt_init_256
+
+    pop     KEY
+    pop     KS
+
+    movdqu  xmm0, [0*16 + KS]
+    movdqu  xmm1, [14*16 + KS]
+    movdqu  [14*16 + KS], xmm0
+    movdqu  [0*16 + KS], xmm1
+
+    i = 1
+    WHILE i LT 7
+        movdqu  xmm0, [i*16 + KS]
+        movdqu  xmm1, [(14-i)*16 + KS]
+
+        aesimc  xmm0, xmm0
+        aesimc  xmm1, xmm1
+
+        movdqu  [(14-i)*16 + KS], xmm0
+        movdqu  [i*16 + KS], xmm1
+
+        i = i+1
+    ENDM
+
+    movdqu  xmm0, [7*16 + KS]
+    aesimc  xmm0, xmm0
+    movdqu  [7*16 + KS], xmm0
+    ret
+intel_aes_decrypt_init_256  ENDP
+
+
+
+gen_aes_cbc_enc_func MACRO rnds
+
+LOCAL   loop1
+LOCAL   bail
+
+        push    inputLen
+
+        mov     ctx,    [esp + 2*4 + 0*4]
+        mov     output,     [esp + 2*4 + 1*4]
+        mov     input,      [esp + 2*4 + 4*4]
+        mov     inputLen,   [esp + 2*4 + 5*4]
+
+        lea     ctx, [44+ctx]
+
+        movdqu  xmm0, [-32+ctx]
+
+        movdqu  xmm2, [0*16 + ctx]
+        movdqu  xmm3, [1*16 + ctx]
+        movdqu  xmm4, [2*16 + ctx]
+        movdqu  xmm5, [3*16 + ctx]
+        movdqu  xmm6, [4*16 + ctx]
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm1, [input]
+        pxor    xmm1, xmm2
+        pxor    xmm0, xmm1
+
+        aesenc  xmm0, xmm3
+        aesenc  xmm0, xmm4
+        aesenc  xmm0, xmm5
+        aesenc  xmm0, xmm6
+
+        i = 5
+    WHILE i LT rnds
+            movdqu  xmm7, [i*16 + ctx]
+            aesenc  xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu  xmm7, [rnds*16 + ctx]
+        aesenclast xmm0, xmm7
+
+        movdqu  [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        movdqu  [-32+ctx], xmm0
+
+        xor eax, eax
+        pop inputLen
+        ret
+
+ENDM
+
+gen_aes_cbc_dec_func MACRO rnds
+
+LOCAL   loop7
+LOCAL   loop1
+LOCAL   dec1
+LOCAL   bail
+
+        push    inputLen
+
+        mov     ctx,    [esp + 2*4 + 0*4]
+        mov     output,     [esp + 2*4 + 1*4]
+        mov     input,      [esp + 2*4 + 4*4]
+        mov     inputLen,   [esp + 2*4 + 5*4]
+
+        lea     ctx, [44+ctx]
+
+loop7:
+        cmp     inputLen, 7*16
+        jb      dec1
+
+        movdqu  xmm0, [0*16 + input]
+        movdqu  xmm1, [1*16 + input]
+        movdqu  xmm2, [2*16 + input]
+        movdqu  xmm3, [3*16 + input]
+        movdqu  xmm4, [4*16 + input]
+        movdqu  xmm5, [5*16 + input]
+        movdqu  xmm6, [6*16 + input]
+
+        movdqu  xmm7, [0*16 + ctx]
+        pxor    xmm0, xmm7
+        pxor    xmm1, xmm7
+        pxor    xmm2, xmm7
+        pxor    xmm3, xmm7
+        pxor    xmm4, xmm7
+        pxor    xmm5, xmm7
+        pxor    xmm6, xmm7
+
+        i = 1
+        WHILE i LT rnds
+            aes_dec_rnd i
+            i = i+1
+            ENDM
+        aes_dec_last_rnd rnds
+
+        movdqu  xmm7, [-32 + ctx]
+        pxor    xmm0, xmm7
+        movdqu  xmm7, [0*16 + input]
+        pxor    xmm1, xmm7
+        movdqu  xmm7, [1*16 + input]
+        pxor    xmm2, xmm7
+        movdqu  xmm7, [2*16 + input]
+        pxor    xmm3, xmm7
+        movdqu  xmm7, [3*16 + input]
+        pxor    xmm4, xmm7
+        movdqu  xmm7, [4*16 + input]
+        pxor    xmm5, xmm7
+        movdqu  xmm7, [5*16 + input]
+        pxor    xmm6, xmm7
+        movdqu  xmm7, [6*16 + input]
+
+        movdqu  [0*16 + output], xmm0
+        movdqu  [1*16 + output], xmm1
+        movdqu  [2*16 + output], xmm2
+        movdqu  [3*16 + output], xmm3
+        movdqu  [4*16 + output], xmm4
+        movdqu  [5*16 + output], xmm5
+        movdqu  [6*16 + output], xmm6
+        movdqu  [-32 + ctx], xmm7
+
+        lea input, [7*16 + input]
+        lea output, [7*16 + output]
+        sub inputLen, 7*16
+        jmp loop7
+dec1:
+
+        movdqu  xmm3, [-32 + ctx]
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm0, [input]
+        movdqa  xmm4, xmm0
+        movdqu  xmm7, [0*16 + ctx]
+        pxor    xmm0, xmm7
+
+        i = 1
+    WHILE i LT rnds
+            movdqu  xmm7, [i*16 + ctx]
+            aesdec  xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu  xmm7, [rnds*16 + ctx]
+        aesdeclast xmm0, xmm7
+        pxor    xmm3, xmm0
+
+        movdqu  [output], xmm3
+        movdqa  xmm3, xmm4
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        movdqu  [-32 + ctx], xmm3
+        xor eax, eax
+        pop     inputLen
+        ret
+ENDM
+
+ALIGN 16
+intel_aes_encrypt_cbc_128 PROC
+gen_aes_cbc_enc_func  10
+intel_aes_encrypt_cbc_128 ENDP
+
+ALIGN 16
+intel_aes_encrypt_cbc_192 PROC
+gen_aes_cbc_enc_func  12
+intel_aes_encrypt_cbc_192 ENDP
+
+ALIGN 16
+intel_aes_encrypt_cbc_256 PROC
+gen_aes_cbc_enc_func  14
+intel_aes_encrypt_cbc_256 ENDP
+
+ALIGN 16
+intel_aes_decrypt_cbc_128 PROC
+gen_aes_cbc_dec_func  10
+intel_aes_decrypt_cbc_128 ENDP
+
+ALIGN 16
+intel_aes_decrypt_cbc_192 PROC
+gen_aes_cbc_dec_func  12
+intel_aes_decrypt_cbc_192 ENDP
+
+ALIGN 16
+intel_aes_decrypt_cbc_256 PROC
+gen_aes_cbc_dec_func  14
+intel_aes_decrypt_cbc_256 ENDP
+
+
+
+ctrCtx textequ <esi>
+CTR textequ <ebx>
+
+gen_aes_ctr_func MACRO rnds
+
+LOCAL   loop7
+LOCAL   loop1
+LOCAL   enc1
+LOCAL   bail
+
+        push    inputLen
+        push    ctrCtx
+        push    CTR
+        push    ebp
+
+        mov     ctrCtx, [esp + 4*5 + 0*4]
+        mov     output, [esp + 4*5 + 1*4]
+        mov     input,  [esp + 4*5 + 4*4]
+        mov     inputLen, [esp + 4*5 + 5*4]
+
+        mov     ctx, [4+ctrCtx]
+        lea     ctx, [44+ctx]
+
+        mov     ebp, esp
+        sub     esp, 7*16
+        and     esp, -16
+
+        movdqu  xmm0, [8+ctrCtx]
+        mov     ctrCtx, [ctrCtx + 8 + 3*4]
+        bswap   ctrCtx
+        movdqu  xmm1, [ctx + 0*16]
+
+        pxor    xmm0, xmm1
+
+        movdqa  [esp + 0*16], xmm0
+        movdqa  [esp + 1*16], xmm0
+        movdqa  [esp + 2*16], xmm0
+        movdqa  [esp + 3*16], xmm0
+        movdqa  [esp + 4*16], xmm0
+        movdqa  [esp + 5*16], xmm0
+        movdqa  [esp + 6*16], xmm0
+
+        inc     ctrCtx
+        mov     CTR, ctrCtx
+        bswap   CTR
+        xor     CTR, [ctx + 3*4]
+        mov     [esp + 1*16 + 3*4], CTR
+
+        inc     ctrCtx
+        mov     CTR, ctrCtx
+        bswap   CTR
+        xor     CTR, [ctx + 3*4]
+        mov     [esp + 2*16 + 3*4], CTR
+
+        inc     ctrCtx
+        mov     CTR, ctrCtx
+        bswap   CTR
+        xor     CTR, [ctx + 3*4]
+        mov     [esp + 3*16 + 3*4], CTR
+
+        inc     ctrCtx
+        mov     CTR, ctrCtx
+        bswap   CTR
+        xor     CTR, [ctx + 3*4]
+        mov     [esp + 4*16 + 3*4], CTR
+
+        inc     ctrCtx
+        mov     CTR, ctrCtx
+        bswap   CTR
+        xor     CTR, [ctx + 3*4]
+        mov     [esp + 5*16 + 3*4], CTR
+
+        inc     ctrCtx
+        mov     CTR, ctrCtx
+        bswap   CTR
+        xor     CTR, [ctx + 3*4]
+        mov     [esp + 6*16 + 3*4], CTR
+
+
+loop7:
+        cmp     inputLen, 7*16
+        jb      loop1
+
+        movdqu  xmm0, [0*16 + esp]
+        movdqu  xmm1, [1*16 + esp]
+        movdqu  xmm2, [2*16 + esp]
+        movdqu  xmm3, [3*16 + esp]
+        movdqu  xmm4, [4*16 + esp]
+        movdqu  xmm5, [5*16 + esp]
+        movdqu  xmm6, [6*16 + esp]
+
+        i = 1
+        WHILE i LE 7
+            aes_rnd i
+
+            inc     ctrCtx
+            mov     CTR, ctrCtx
+            bswap   CTR
+            xor     CTR, [ctx + 3*4]
+            mov     [esp + (i-1)*16 + 3*4], CTR
+
+            i = i+1
+        ENDM
+        WHILE i LT rnds
+            aes_rnd i
+            i = i+1
+            ENDM
+        aes_last_rnd rnds
+
+        movdqu  xmm7, [0*16 + input]
+        pxor    xmm0, xmm7
+        movdqu  xmm7, [1*16 + input]
+        pxor    xmm1, xmm7
+        movdqu  xmm7, [2*16 + input]
+        pxor    xmm2, xmm7
+        movdqu  xmm7, [3*16 + input]
+        pxor    xmm3, xmm7
+        movdqu  xmm7, [4*16 + input]
+        pxor    xmm4, xmm7
+        movdqu  xmm7, [5*16 + input]
+        pxor    xmm5, xmm7
+        movdqu  xmm7, [6*16 + input]
+        pxor    xmm6, xmm7
+
+        movdqu  [0*16 + output], xmm0
+        movdqu  [1*16 + output], xmm1
+        movdqu  [2*16 + output], xmm2
+        movdqu  [3*16 + output], xmm3
+        movdqu  [4*16 + output], xmm4
+        movdqu  [5*16 + output], xmm5
+        movdqu  [6*16 + output], xmm6
+
+        lea input, [7*16 + input]
+        lea output, [7*16 + output]
+        sub inputLen, 7*16
+        jmp loop7
+
+
+loop1:
+        cmp     inputLen, 1*16
+        jb      bail
+
+        movdqu  xmm0, [esp]
+        add     esp, 16
+
+        i = 1
+    WHILE i LT rnds
+            movdqu  xmm7, [i*16 + ctx]
+            aesenc  xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu  xmm7, [rnds*16 + ctx]
+        aesenclast xmm0, xmm7
+
+        movdqu  xmm7, [input]
+        pxor    xmm0, xmm7
+        movdqu  [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+
+        mov     ctrCtx, [ebp + 4*5 + 0*4]
+        movdqu  xmm0, [esp]
+        movdqu  xmm1, [ctx + 0*16]
+        pxor    xmm0, xmm1
+        movdqu  [8+ctrCtx], xmm0
+
+
+        xor     eax, eax
+        mov     esp, ebp
+        pop     ebp
+        pop     CTR
+        pop     ctrCtx
+        pop     inputLen
+        ret
+ENDM
+
+
+ALIGN 16
+intel_aes_encrypt_ctr_128 PROC
+gen_aes_ctr_func  10
+intel_aes_encrypt_ctr_128 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ctr_192 PROC
+gen_aes_ctr_func  12
+intel_aes_encrypt_ctr_192 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ctr_256 PROC
+gen_aes_ctr_func  14
+intel_aes_encrypt_ctr_256 ENDP
+
+
+END
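The counter handling in gen_aes_ctr_func above keeps pre-whitened counter
blocks (counter xor round key 0) in a scratch area and only rewrites their
last 32-bit word per block. A rough C rendering of that word update
(illustrative only; the helper and argument names are made up):

    #include <stdint.h>
    #include <stdlib.h>   /* _byteswap_ulong (MSVC) */
    #include <string.h>

    /* blk: one cached counter block, already xored with round key 0.
     * rk0: round key 0. ctr_host: running counter in host byte order. */
    static void store_ctr_word(unsigned char *blk, const unsigned char *rk0,
                               uint32_t ctr_host)
    {
        uint32_t word = _byteswap_ulong(ctr_host);  /* back to big-endian */
        uint32_t rk;
        memcpy(&rk, rk0 + 12, 4);                   /* last dword of round key 0 */
        word ^= rk;                                 /* keep the block pre-whitened */
        memcpy(blk + 12, &word, 4);                 /* low 32 bits of the block */
    }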
--- a/lib/freebl/intel-aes.h
+++ b/lib/freebl/intel-aes.h
@@ -28,16 +28,22 @@ SECStatus intel_aes_encrypt_cbc_128(AESC
 				    unsigned int inputLen,
 				    unsigned int blocksize);
 SECStatus intel_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output,
 				    unsigned int *outputLen,
 				    unsigned int maxOutputLen,
 				    const unsigned char *input,
 				    unsigned int inputLen,
 				    unsigned int blocksize);
+SECStatus intel_aes_encrypt_ctr_128(CTRContext *cx, unsigned char *output,
+				    unsigned int *outputLen,
+				    unsigned int maxOutputLen,
+				    const unsigned char *input,
+				    unsigned int inputLen,
+				    unsigned int blocksize);
 SECStatus intel_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output,
 				    unsigned int *outputLen,
 				    unsigned int maxOutputLen,
 				    const unsigned char *input,
 				    unsigned int inputLen,
 				    unsigned int blocksize);
 SECStatus intel_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output,
 				    unsigned int *outputLen,
@@ -52,16 +58,22 @@ SECStatus intel_aes_encrypt_cbc_192(AESC
 				    unsigned int inputLen,
 				    unsigned int blocksize);
 SECStatus intel_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output,
 				    unsigned int *outputLen,
 				    unsigned int maxOutputLen,
 				    const unsigned char *input,
 				    unsigned int inputLen,
 				    unsigned int blocksize);
+SECStatus intel_aes_encrypt_ctr_192(CTRContext *cx, unsigned char *output,
+				    unsigned int *outputLen,
+				    unsigned int maxOutputLen,
+				    const unsigned char *input,
+				    unsigned int inputLen,
+				    unsigned int blocksize);
 SECStatus intel_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output,
 				    unsigned int *outputLen,
 				    unsigned int maxOutputLen,
 				    const unsigned char *input,
 				    unsigned int inputLen,
 				    unsigned int blocksize);
 SECStatus intel_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output,
 				    unsigned int *outputLen,
@@ -76,16 +88,22 @@ SECStatus intel_aes_encrypt_cbc_256(AESC
 				    unsigned int inputLen,
 				    unsigned int blocksize);
 SECStatus intel_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output,
 				    unsigned int *outputLen,
 				    unsigned int maxOutputLen,
 				    const unsigned char *input,
 				    unsigned int inputLen,
 				    unsigned int blocksize);
+SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output,
+				    unsigned int *outputLen,
+				    unsigned int maxOutputLen,
+				    const unsigned char *input,
+				    unsigned int inputLen,
+				    unsigned int blocksize);
 
 
 #define intel_aes_ecb_worker(encrypt, keysize) \
   ((encrypt)						\
    ? ((keysize) == 16 ? intel_aes_encrypt_ecb_128 :	\
       (keysize) == 24 ? intel_aes_encrypt_ecb_192 :	\
       intel_aes_encrypt_ecb_256)			\
    : ((keysize) == 16 ? intel_aes_decrypt_ecb_128 :	\
@@ -97,16 +115,21 @@ SECStatus intel_aes_decrypt_cbc_256(AESC
   ((encrypt)						\
    ? ((keysize) == 16 ? intel_aes_encrypt_cbc_128 :	\
       (keysize) == 24 ? intel_aes_encrypt_cbc_192 :	\
       intel_aes_encrypt_cbc_256)			\
    : ((keysize) == 16 ? intel_aes_decrypt_cbc_128 :	\
       (keysize) == 24 ? intel_aes_decrypt_cbc_192 :	\
       intel_aes_decrypt_cbc_256))
 
+#define intel_aes_ctr_worker(nr) \
+   ((nr) == 10 ? intel_aes_encrypt_ctr_128 :	\
+    (nr) == 12 ? intel_aes_encrypt_ctr_192 :	\
+    intel_aes_encrypt_ctr_256)
+
 
 #define intel_aes_init(encrypt, keysize) \
   do {					 			\
       if (encrypt) {			 			\
 	  if (keysize == 16)					\
 	      intel_aes_encrypt_init_128(key, cx->expandedKey);	\
 	  else if (keysize == 24)				\
 	      intel_aes_encrypt_init_192(key, cx->expandedKey);	\
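intel_aes_ctr_worker selects by round count rather than key size: Nr is 10, 12
or 14 for AES-128, AES-192 and AES-256, and CTR always uses the encryption
direction. A hedged usage sketch (the typedef and helper names are made up;
the prototype matches the declarations above):

    typedef SECStatus (*intel_ctr_fn)(CTRContext *cx, unsigned char *output,
                                      unsigned int *outputLen,
                                      unsigned int maxOutputLen,
                                      const unsigned char *input,
                                      unsigned int inputLen,
                                      unsigned int blocksize);

    /* Resolves to intel_aes_encrypt_ctr_128/192/256 depending on nr. */
    static intel_ctr_fn pick_ctr_worker(int nr)
    {
        return intel_aes_ctr_worker(nr);
    }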
--- a/lib/freebl/intel-gcm-wrap.c
+++ b/lib/freebl/intel-gcm-wrap.c
@@ -1,14 +1,14 @@
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 /* Copyright(c) 2013, Intel Corp. */
 
-/* Wrapper funcions for Intel optimized implementation of AES-GCM */
+/* Wrapper functions for Intel optimized implementation of AES-GCM */
 
 #ifdef USE_HW_AES
 
 #ifdef FREEBL_NO_DEPEND
 #include "stubs.h"
 #endif
 
 #include "blapii.h"
@@ -19,22 +19,18 @@
 #include "prtypes.h"
 #include "pkcs11t.h"
 
 #include <limits.h>
 
 #include "intel-gcm.h"
 #include "rijndael.h"
 
-#if defined(__INTEL_COMPILER)
-#include <ia32intrin.h> 
-#elif defined(__GNUC__) || defined(__SUNPRO_C)
 #include <emmintrin.h>
 #include <tmmintrin.h>
-#endif
 
 
 struct intel_AES_GCMContextStr{
     unsigned char Htbl[16*AES_BLOCK_SIZE];
     unsigned char X0[AES_BLOCK_SIZE];
     unsigned char T[AES_BLOCK_SIZE];
     unsigned char CTR[AES_BLOCK_SIZE];
     AESContext *aes_context;
new file mode 100644
--- /dev/null
+++ b/lib/freebl/intel-gcm-x64-masm.asm
@@ -0,0 +1,1301 @@
+; LICENSE:
+; This submission to NSS is to be made available under the terms of the
+; Mozilla Public License, v. 2.0. You can obtain one at
+; http://mozilla.org/MPL/2.0/.
+;###############################################################################
+; Copyright(c) 2014, Intel Corp.
+; Developers and authors:
+; Shay Gueron and Vlad Krasnov
+; Intel Corporation, Israel Development Centre, Haifa, Israel
+; Please send feedback directly to crypto.feedback.alias@intel.com
+
+
+.DATA
+ALIGN 16
+Lone            dq 1,0
+Ltwo            dq 2,0
+Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
+Lpoly           dq 01h, 0c200000000000000h
+
+.CODE
+
+
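+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; GFMUL: DST = SRC1 * SRC2 in GF(2^128), reduced by the GHASH polynomial.
+; The 256-bit carry-less product is built with three PCLMULQDQs (Karatsuba),
+; then folded back to 128 bits with two multiplications by Lpoly.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;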
+GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
+    vpclmulqdq  TMP1, SRC2, SRC1, 0h
+    vpclmulqdq  TMP4, SRC2, SRC1, 011h
+
+    vpshufd     TMP2, SRC2, 78
+    vpshufd     TMP3, SRC1, 78
+    vpxor       TMP2, TMP2, SRC2
+    vpxor       TMP3, TMP3, SRC1
+
+    vpclmulqdq  TMP2, TMP2, TMP3, 0h
+    vpxor       TMP2, TMP2, TMP1
+    vpxor       TMP2, TMP2, TMP4
+
+    vpslldq     TMP3, TMP2, 8
+    vpsrldq     TMP2, TMP2, 8
+
+    vpxor       TMP1, TMP1, TMP3
+    vpxor       TMP4, TMP4, TMP2
+
+    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
+    vpshufd     TMP3, TMP1, 78
+    vpxor       TMP1, TMP2, TMP3
+
+    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
+    vpshufd     TMP3, TMP1, 78
+    vpxor       TMP1, TMP2, TMP3
+
+    vpxor       DST, TMP1, TMP4
+
+    ENDM
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Generates the final GCM tag
+; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
+;                       unsigned char *Tp,
+;                       unsigned int Mlen,
+;                       unsigned int Alen,
+;                       unsigned char *X0,
+;                       unsigned char *TAG);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmTAG PROC
+
+Htbl    textequ <rcx>
+Tp      textequ <rdx>
+Mlen    textequ <r8>
+Alen    textequ <r9>
+X0      textequ <r10>
+TAG     textequ <r11>
+
+T       textequ <xmm0>
+TMP0    textequ <xmm1>
+
+    mov     X0, [rsp + 1*8 + 4*8]
+    mov     TAG, [rsp + 1*8 + 5*8]
+
+    vzeroupper
+    vmovdqu T, XMMWORD PTR[Tp]
+    vpxor   TMP0, TMP0, TMP0
+
+    shl     Mlen, 3
+    shl     Alen, 3
+
+    ;vpinsrq    TMP0, TMP0, Mlen, 0
+    ;vpinsrq    TMP0, TMP0, Alen, 1
+    ; work around the ml64.exe vpinsrq issue
+    vpinsrd TMP0, TMP0, r8d, 0
+    vpinsrd TMP0, TMP0, r9d, 2
+    shr Mlen, 32
+    shr Alen, 32
+    vpinsrd TMP0, TMP0, r8d, 1
+    vpinsrd TMP0, TMP0, r9d, 3
+
+    vpxor   T, T, TMP0
+    vmovdqu TMP0, XMMWORD PTR[Htbl]
+    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
+
+    vpshufb T, T, [Lbswap_mask]
+    vpxor   T, T, [X0]
+    vmovdqu XMMWORD PTR[TAG], T
+    vzeroupper
+
+    ret
+
+intel_aes_gcmTAG ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Generates the H table
+; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmINIT PROC
+
+Htbl    textequ <rcx>
+KS      textequ <rdx>
+NR      textequ <r8d>
+
+T       textequ <xmm0>
+TMP0    textequ <xmm1>
+
+    vzeroupper
+    ; AES-ENC(0)
+    vmovdqu T, XMMWORD PTR[KS]
+    lea KS, [16 + KS]
+    dec NR
+Lenc_loop:
+        vaesenc T, T, [KS]
+        lea KS, [16 + KS]
+        dec NR
+        jnz Lenc_loop
+
+    vaesenclast T, T, [KS]
+    vpshufb T, T, [Lbswap_mask]
+
+    ; Calculate H' = GFMUL(H, 2)
+    vpsrad  xmm3, T, 31
+    vpshufd xmm3, xmm3, 0ffh
+    vpand   xmm5, xmm3, [Lpoly]
+    vpsrld  xmm3, T, 31
+    vpslld  xmm4, T, 1
+    vpslldq xmm3, xmm3, 4
+    vpxor   T, xmm4, xmm3
+    vpxor   T, T, xmm5
+
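+    ; Build the hash-key table: Htbl[i] holds H^(i+1) for i = 0..7, and
+    ; Htbl + 8*16 holds the XOR of the two 64-bit halves of each power,
+    ; precomputed for the Karatsuba middle products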
+    vmovdqu TMP0, T
+    vmovdqu XMMWORD PTR[Htbl + 0*16], T
+
+    vpshufd xmm2, T, 78
+    vpxor   xmm2, xmm2, T
+    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
+
+    i = 1
+    WHILE i LT 8
+        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
+        vmovdqu XMMWORD PTR[Htbl + i*16], T
+        vpshufd xmm2, T, 78
+        vpxor   xmm2, xmm2, T
+        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
+        i = i+1
+        ENDM
+    vzeroupper
+    ret
+intel_aes_gcmINIT ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authenticate only
+; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmAAD PROC
+
+Htbl    textequ <rcx>
+inp     textequ <rdx>
+len     textequ <r8>
+Tp      textequ <r9>
+hlp0    textequ <r10>
+
+DATA    textequ <xmm0>
+T       textequ <xmm1>
+TMP0    textequ <xmm2>
+TMP1    textequ <xmm3>
+TMP2    textequ <xmm4>
+TMP3    textequ <xmm5>
+TMP4    textequ <xmm6>
+Xhi     textequ <xmm7>
+
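+; KARATSUBA_AAD accumulates the three Karatsuba partial products of DATA with
+; H^(i+1) into TMP0, TMP1 and TMP2; the reduction is deferred until all blocks
+; of the group have been folded in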
+KARATSUBA_AAD MACRO i
+    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
+    vpxor       TMP0, TMP0, TMP3
+    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
+    vpxor       TMP1, TMP1, TMP3
+    vpshufd     TMP3, DATA, 78
+    vpxor       TMP3, TMP3, DATA
+    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
+    vpxor       TMP2, TMP2, TMP3
+ENDM
+
+    test  len, len
+    jnz   LbeginAAD
+    ret
+
+LbeginAAD:
+    vzeroupper
+
+    sub rsp, 2*16
+    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
+    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
+
+    vpxor   Xhi, Xhi, Xhi
+
+    vmovdqu T, XMMWORD PTR[Tp]
+    ; we hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, the first n%8 blocks are hashed first
+    mov hlp0, len
+    and hlp0, 128-1
+    jz  Lmod_loop
+
+    and len, -128
+    sub hlp0, 16
+
+    ; Prefix block
+    vmovdqu DATA, XMMWORD PTR[inp]
+    vpshufb DATA, DATA, [Lbswap_mask]
+    vpxor   DATA, DATA, T
+
+    vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
+    vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
+    vpshufd     TMP3, DATA, 78
+    vpxor       TMP3, TMP3, DATA
+    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
+
+    lea     inp, [inp+16]
+    test    hlp0, hlp0
+    jnz     Lpre_loop
+    jmp     Lred1
+
+    ; hash the remaining prefix blocks (up to 7 prefix blocks in total)
+Lpre_loop:
+
+        sub hlp0, 16
+
+        vmovdqu DATA, XMMWORD PTR[inp]
+        vpshufb DATA, DATA, [Lbswap_mask]
+
+        vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
+        vpxor       TMP0, TMP0, TMP3
+        vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
+        vpxor       TMP1, TMP1, TMP3
+        vpshufd     TMP3, DATA, 78
+        vpxor       TMP3, TMP3, DATA
+        vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
+        vpxor       TMP2, TMP2, TMP3
+
+        test    hlp0, hlp0
+        lea     inp, [inp+16]
+        jnz     Lpre_loop
+
+Lred1:
+
+    vpxor       TMP2, TMP2, TMP0
+    vpxor       TMP2, TMP2, TMP1
+    vpsrldq     TMP3, TMP2, 8
+    vpslldq     TMP2, TMP2, 8
+
+    vpxor       Xhi, TMP1, TMP3
+    vpxor       T, TMP0, TMP2
+
+
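+    ; Main loop: each iteration multiplies 8 blocks by H^8..H^1 (aggregated
+    ; GHASH); the reduction of the running hash in T:Xhi is interleaved with
+    ; the multiplications (stages 1a/1b, 2a/2b) and folded into the last block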
+Lmod_loop:
+
+        sub len, 16*8
+        jb  Ldone
+        ; Block #0
+        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
+        vpshufb DATA, DATA, [Lbswap_mask]
+
+        vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
+        vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
+        vpshufd     TMP3, DATA, 78
+        vpxor       TMP3, TMP3, DATA
+        vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
+
+        ; Block #1
+        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 1
+
+        ; Block #2
+        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
+        vpshufb DATA, DATA, [Lbswap_mask]
+
+        vpclmulqdq  TMP4, T, [Lpoly], 010h         ;reduction stage 1a
+        vpalignr    T, T, T, 8
+
+        KARATSUBA_AAD 2
+
+        vpxor       T, T, TMP4                          ;reduction stage 1b
+
+        ; Block #3
+        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 3
+        ; Block #4
+        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
+        vpshufb DATA, DATA, [Lbswap_mask]
+
+        vpclmulqdq  TMP4, T, [Lpoly], 010h        ;reduction stage 2a
+        vpalignr    T, T, T, 8
+
+        KARATSUBA_AAD 4
+
+        vpxor       T, T, TMP4                          ;reduction stage 2b
+        ; Block #5
+        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 5
+
+        vpxor   T, T, Xhi                               ;reduction finalize
+        ; Block #6
+        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 6
+        ; Block #7
+        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        vpxor   DATA, DATA, T
+        KARATSUBA_AAD 7
+        ; Aggregated 8 blocks, now karatsuba fixup
+        vpxor   TMP2, TMP2, TMP0
+        vpxor   TMP2, TMP2, TMP1
+        vpsrldq TMP3, TMP2, 8
+        vpslldq TMP2, TMP2, 8
+
+        vpxor   Xhi, TMP1, TMP3
+        vpxor   T, TMP0, TMP2
+
+        lea inp, [inp + 16*8]
+        jmp Lmod_loop
+
+Ldone:
+    vpclmulqdq  TMP4, T, [Lpoly], 010h
+    vpalignr    T, T, T, 8
+    vpxor       T, T, TMP4
+
+    vpclmulqdq  TMP4, T, [Lpoly], 010h
+    vpalignr    T, T, T, 8
+    vpxor       T, T, TMP4
+
+    vpxor       T, T, Xhi
+    vmovdqu     XMMWORD PTR[Tp], T
+    vzeroupper
+
+    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
+    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
+    add rsp, 16*2
+
+    ret
+
+intel_aes_gcmAAD ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Encrypt and Authenticate
+; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmENC PROC
+
+PT      textequ <rcx>
+CT      textequ <rdx>
+Htbl    textequ <r8>
+Gctx    textequ <r8>
+len     textequ <r9>
+KS      textequ <r10>
+NR      textequ <eax>
+
+aluCTR  textequ <r11d>
+aluKSl  textequ <r12d>
+aluTMP  textequ <r13d>
+
+T       textequ <xmm0>
+TMP0    textequ <xmm1>
+TMP1    textequ <xmm2>
+TMP2    textequ <xmm3>
+TMP3    textequ <xmm4>
+TMP4    textequ <xmm5>
+TMP5    textequ <xmm6>
+CTR0    textequ <xmm7>
+CTR1    textequ <xmm8>
+CTR2    textequ <xmm9>
+CTR3    textequ <xmm10>
+CTR4    textequ <xmm11>
+CTR5    textequ <xmm12>
+CTR6    textequ <xmm13>
+CTR7    textequ <xmm14>
+BSWAPMASK   textequ <xmm15>
+
+ROUND MACRO i
+    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
+    vaesenc CTR0, CTR0, TMP3
+    vaesenc CTR1, CTR1, TMP3
+    vaesenc CTR2, CTR2, TMP3
+    vaesenc CTR3, CTR3, TMP3
+    vaesenc CTR4, CTR4, TMP3
+    vaesenc CTR5, CTR5, TMP3
+    vaesenc CTR6, CTR6, TMP3
+    vaesenc CTR7, CTR7, TMP3
+ENDM
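+; ROUNDMUL performs AES round i on all 8 counter blocks and, interleaved with
+; it, accumulates the three Karatsuba partial products of the staged
+; ciphertext block in TMP5 (against H^(i+1)) into TMP0, TMP1 and TMP2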
+ROUNDMUL MACRO i
+    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
+
+    vaesenc CTR0, CTR0, TMP3
+    vaesenc CTR1, CTR1, TMP3
+    vaesenc CTR2, CTR2, TMP3
+    vaesenc CTR3, CTR3, TMP3
+
+    vpshufd TMP4, TMP5, 78
+    vpxor   TMP4, TMP4, TMP5
+
+    vaesenc CTR4, CTR4, TMP3
+    vaesenc CTR5, CTR5, TMP3
+    vaesenc CTR6, CTR6, TMP3
+    vaesenc CTR7, CTR7, TMP3
+
+    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
+    vpxor       TMP0, TMP0, TMP3
+    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
+    vpclmulqdq  TMP3, TMP5, TMP4, 011h
+    vpxor       TMP1, TMP1, TMP3
+    vpclmulqdq  TMP3, TMP5, TMP4, 000h
+    vpxor       TMP2, TMP2, TMP3
+ENDM
+KARATSUBA MACRO i
+    vpshufd TMP4, TMP5, 78
+    vpxor   TMP4, TMP4, TMP5
+    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
+    vpxor       TMP0, TMP0, TMP3
+    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
+    vpclmulqdq  TMP3, TMP5, TMP4, 011h
+    vpxor       TMP1, TMP1, TMP3
+    vpclmulqdq  TMP3, TMP5, TMP4, 000h
+    vpxor       TMP2, TMP2, TMP3
+ENDM
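+; NEXTCTR advances counter block i in the stack staging area. The blocks are
+; stored there already XORed with round key 0; only the last 32-bit word of
+; the counter changes, so a scalar copy is incremented, converted back to
+; big-endian, XORed with the matching key word and patched in place.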
+NEXTCTR MACRO i
+    add aluCTR, 1
+    mov aluTMP, aluCTR
+    xor aluTMP, aluKSl
+    bswap   aluTMP
+    mov [3*4 + 8*16 + i*16 + rsp], aluTMP
+ENDM
+
+
+    test  len, len
+    jnz   LbeginENC
+    ret
+
+LbeginENC:
+
+    vzeroupper
+    push    r11
+    push    r12
+    push    r13
+    push    rbp
+    sub rsp, 10*16
+    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
+    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
+    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
+    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
+    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
+    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
+    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
+    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
+    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
+    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
+
+    mov rbp, rsp
+    sub rsp, 16*16
+    and rsp, -16
+
+    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
+    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
+    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
+    mov     KS, [16*16 + 3*16 + Gctx]
+    mov     NR, [4 + KS]
+    lea     KS, [48 + KS]
+
+    vpshufb CTR0, CTR0, BSWAPMASK
+
+    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
+    mov aluKSl, [3*4 + KS]
+    bswap   aluCTR
+    bswap   aluKSl
+
+    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
+    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
+    vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
+
+    cmp len, 128
+    jb  LEncDataSingles
+; Prepare the "top" counters
+    vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
+
+; Encrypt the initial 8 blocks
+    sub len, 128
+    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
+    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
+    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
+    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
+    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
+    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
+    vpaddd  CTR7, CTR6, XMMWORD PTR[Lone]
+
+    vpshufb CTR0, CTR0, BSWAPMASK
+    vpshufb CTR1, CTR1, BSWAPMASK
+    vpshufb CTR2, CTR2, BSWAPMASK
+    vpshufb CTR3, CTR3, BSWAPMASK
+    vpshufb CTR4, CTR4, BSWAPMASK
+    vpshufb CTR5, CTR5, BSWAPMASK
+    vpshufb CTR6, CTR6, BSWAPMASK
+    vpshufb CTR7, CTR7, BSWAPMASK
+
+    vmovdqu TMP3, XMMWORD PTR[0*16 + KS]
+    vpxor   CTR0, CTR0, TMP3
+    vpxor   CTR1, CTR1, TMP3
+    vpxor   CTR2, CTR2, TMP3
+    vpxor   CTR3, CTR3, TMP3
+    vpxor   CTR4, CTR4, TMP3
+    vpxor   CTR5, CTR5, TMP3
+    vpxor   CTR6, CTR6, TMP3
+    vpxor   CTR7, CTR7, TMP3
+
+    ROUND   1
+
+    add aluCTR, 8
+    mov aluTMP, aluCTR
+    xor aluTMP, aluKSl
+    bswap   aluTMP
+    mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
+
+    ROUND   2
+    NEXTCTR 1
+    ROUND   3
+    NEXTCTR 2
+    ROUND   4
+    NEXTCTR 3
+    ROUND   5
+    NEXTCTR 4
+    ROUND   6
+    NEXTCTR 5
+    ROUND   7
+    NEXTCTR 6
+    ROUND   8
+    NEXTCTR 7
+    ROUND   9
+    vmovdqu TMP5, XMMWORD PTR[10*16 + KS]
+    cmp     NR, 10
+    je      @f
+
+    ROUND   10
+    ROUND   11
+    vmovdqu TMP5, XMMWORD PTR[12*16 + KS]
+    cmp     NR, 12
+    je      @f
+
+    ROUND   12
+    ROUND   13
+    vmovdqu TMP5, XMMWORD PTR[14*16 + KS]
+@@:
+    vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
+    vaesenclast CTR0, CTR0, TMP3
+    vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
+    vaesenclast CTR1, CTR1, TMP3
+    vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
+    vaesenclast CTR2, CTR2, TMP3
+    vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
+    vaesenclast CTR3, CTR3, TMP3
+    vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
+    vaesenclast CTR4, CTR4, TMP3
+    vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
+    vaesenclast CTR5, CTR5, TMP3
+    vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
+    vaesenclast CTR6, CTR6, TMP3
+    vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
+    vaesenclast CTR7, CTR7, TMP3
+
+    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
+    vpshufb CTR0, CTR0, BSWAPMASK
+    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
+    vpshufb CTR1, CTR1, BSWAPMASK
+    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
+    vpshufb CTR2, CTR2, BSWAPMASK
+    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
+    vpshufb CTR3, CTR3, BSWAPMASK
+    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
+    vpshufb CTR4, CTR4, BSWAPMASK
+    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
+    vpshufb CTR5, CTR5, BSWAPMASK
+    vmovdqu XMMWORD PTR[6*16 + CT], CTR6
+    vpshufb CTR6, CTR6, BSWAPMASK
+    vmovdqu XMMWORD PTR[7*16 + CT], CTR7
+    vpshufb TMP5, CTR7, BSWAPMASK
+
+    vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
+    vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
+    vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
+    vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
+    vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
+    vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
+    vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
+
+    lea CT, [8*16 + CT]
+    lea PT, [8*16 + PT]
+    jmp LEncDataOctets
+
+LEncDataOctets:
+        cmp len, 128
+        jb  LEndEncOctets
+        sub len, 128
+
+        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]
+        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]
+        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
+        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
+        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
+        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
+        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
+        vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
+
+        vpshufd TMP4, TMP5, 78
+        vpxor   TMP4, TMP4, TMP5
+        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
+        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
+        vpclmulqdq  TMP1, TMP5, TMP4, 011h
+        vpclmulqdq  TMP2, TMP5, TMP4, 000h
+
+        vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
+        ROUNDMUL 1
+        NEXTCTR 0
+        vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
+        ROUNDMUL 2
+        NEXTCTR 1
+        vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
+        ROUNDMUL 3
+        NEXTCTR 2
+        vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
+        ROUNDMUL 4
+        NEXTCTR 3
+        vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
+        ROUNDMUL 5
+        NEXTCTR 4
+        vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
+        ROUNDMUL 6
+        NEXTCTR 5
+        vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
+        ROUNDMUL 7
+        NEXTCTR 6
+
+        ROUND 8
+        NEXTCTR 7
+
+        vpxor   TMP0, TMP0, TMP1
+        vpxor   TMP0, TMP0, TMP2
+        vpsrldq TMP3, TMP0, 8
+        vpxor   TMP4, TMP1, TMP3
+        vpslldq TMP3, TMP0, 8
+        vpxor   T, TMP2, TMP3
+
+        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
+        vpalignr    T,T,T,8
+        vpxor       T, T, TMP1
+
+        ROUND 9
+
+        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
+        vpalignr    T,T,T,8
+        vpxor       T, T, TMP1
+
+        vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
+        cmp         NR, 10
+        je          @f
+
+        ROUND 10
+        ROUND 11
+        vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
+        cmp         NR, 12
+        je          @f
+
+        ROUND 12
+        ROUND 13
+        vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
+@@:
+        vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]
+        vaesenclast CTR0, CTR0, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
+        vaesenclast CTR1, CTR1, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
+        vaesenclast CTR2, CTR2, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
+        vaesenclast CTR3, CTR3, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
+        vaesenclast CTR4, CTR4, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
+        vaesenclast CTR5, CTR5, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
+        vaesenclast CTR6, CTR6, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
+        vaesenclast CTR7, CTR7, TMP3
+
+        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
+        vpshufb CTR0, CTR0, BSWAPMASK
+        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
+        vpshufb CTR1, CTR1, BSWAPMASK
+        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
+        vpshufb CTR2, CTR2, BSWAPMASK
+        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
+        vpshufb CTR3, CTR3, BSWAPMASK
+        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
+        vpshufb CTR4, CTR4, BSWAPMASK
+        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
+        vpshufb CTR5, CTR5, BSWAPMASK
+        vmovdqu XMMWORD PTR[6*16 + CT], CTR6
+        vpshufb CTR6, CTR6, BSWAPMASK
+        vmovdqu XMMWORD PTR[7*16 + CT], CTR7
+        vpshufb TMP5, CTR7, BSWAPMASK
+
+        vmovdqa XMMWORD PTR[1*16 + rsp], CTR6
+        vmovdqa XMMWORD PTR[2*16 + rsp], CTR5
+        vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
+        vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
+        vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
+        vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
+        vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
+
+        vpxor   T, T, TMP4
+
+        lea CT, [8*16 + CT]
+        lea PT, [8*16 + PT]
+        jmp LEncDataOctets
+
+LEndEncOctets:
+
+    vpshufd TMP4, TMP5, 78
+    vpxor   TMP4, TMP4, TMP5
+    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
+    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
+    vpclmulqdq  TMP1, TMP5, TMP4, 011h
+    vpclmulqdq  TMP2, TMP5, TMP4, 000h
+
+    vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
+    KARATSUBA 1
+    vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
+    KARATSUBA 2
+    vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
+    KARATSUBA 3
+    vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
+    KARATSUBA 4
+    vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
+    KARATSUBA 5
+    vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
+    KARATSUBA 6
+    vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]
+    KARATSUBA 7
+
+    vpxor   TMP0, TMP0, TMP1
+    vpxor   TMP0, TMP0, TMP2
+    vpsrldq TMP3, TMP0, 8
+    vpxor   TMP4, TMP1, TMP3
+    vpslldq TMP3, TMP0, 8
+    vpxor   T, TMP2, TMP3
+
+    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
+    vpalignr    T,T,T,8
+    vpxor       T, T, TMP1
+
+    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
+    vpalignr    T,T,T,8
+    vpxor       T, T, TMP1
+
+    vpxor       T, T, TMP4
+
+    sub aluCTR, 7
+
+LEncDataSingles:
+
+        cmp len, 16
+        jb  LEncDataTail
+        sub len, 16
+
+        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
+        NEXTCTR 0
+
+        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+        cmp NR, 10
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+        cmp NR, 12
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+        vaesenclast TMP1, TMP1, TMP2
+        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
+        vmovdqu XMMWORD PTR[CT], TMP1
+
+        lea PT, [16+PT]
+        lea CT, [16+CT]
+
+        vpshufb TMP1, TMP1, BSWAPMASK
+        vpxor   T, T, TMP1
+        vmovdqu TMP0, XMMWORD PTR[Htbl]
+        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
+
+        jmp LEncDataSingles
+
+LEncDataTail:
+
+    test    len, len
+    jz  LEncDataEnd
+
+    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]
+
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+    cmp NR, 10
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+    cmp NR, 12
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+    vaesenclast TMP1, TMP1, TMP2
+; zero a temp location
+    vpxor   TMP2, TMP2, TMP2
+    vmovdqa XMMWORD PTR[rsp], TMP2
+; copy as many bytes as needed
+    xor KS, KS
+
+@@:
+        cmp len, KS
+        je  @f
+        mov al, [PT + KS]
+        mov [rsp + KS], al
+        inc KS
+        jmp @b
+@@:
+    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
+    vmovdqa XMMWORD PTR[rsp], TMP1
+    xor KS, KS
+@@:
+        cmp len, KS
+        je  @f
+        mov al, [rsp + KS]
+        mov [CT + KS], al
+        inc KS
+        jmp @b
+@@:
+        cmp KS, 16
+        je  @f
+        mov BYTE PTR[rsp + KS], 0
+        inc KS
+        jmp @b
+@@:
+BAIL:
+    vmovdqa TMP1, XMMWORD PTR[rsp]
+    vpshufb TMP1, TMP1, BSWAPMASK
+    vpxor   T, T, TMP1
+    vmovdqu TMP0, XMMWORD PTR[Htbl]
+    GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
+
+LEncDataEnd:
+
+    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
+    bswap   aluCTR
+    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
+
+    mov rsp, rbp
+
+    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
+    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
+    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
+    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
+    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
+    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
+    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
+    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
+    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
+    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
+
+    add rsp, 10*16
+    pop rbp
+    pop r13
+    pop r12
+    pop r11
+
+    vzeroupper
+
+    ret
+intel_aes_gcmENC ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Decrypt and Authenticate
+; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmDEC PROC
+
+NEXTCTR MACRO i
+    add aluCTR, 1
+    mov aluTMP, aluCTR
+    xor aluTMP, aluKSl
+    bswap   aluTMP
+    mov [3*4 + i*16 + rsp], aluTMP
+ENDM
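+; For decryption the counter blocks live at the bottom of the scratch area
+; (no 8*16 offset): the GHASH input is the ciphertext itself, read directly
+; from memory, so no staging slots for the previous batch are needed.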
+
+PT      textequ <rdx>
+CT      textequ <rcx>
+
+    test  len, len
+    jnz   LbeginDEC
+    ret
+
+LbeginDEC:
+
+    vzeroupper
+    push    r11
+    push    r12
+    push    r13
+    push    rbp
+    sub rsp, 10*16
+    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
+    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
+    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
+    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
+    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
+    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
+    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
+    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
+    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
+    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
+
+    mov rbp, rsp
+    sub rsp, 8*16
+    and rsp, -16
+
+    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
+    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
+    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
+    mov     KS, [16*16 + 3*16 + Gctx]
+    mov     NR, [4 + KS]
+    lea     KS, [48 + KS]
+
+    vpshufb CTR0, CTR0, BSWAPMASK
+
+    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
+    mov aluKSl, [3*4 + KS]
+    bswap   aluCTR
+    bswap   aluKSl
+
+    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
+    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
+    vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
+
+    cmp len, 128
+    jb  LDecDataSingles
+; Prepare the "top" counters
+    vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
+    vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
+
+    NEXTCTR 1
+    NEXTCTR 2
+    NEXTCTR 3
+    NEXTCTR 4
+    NEXTCTR 5
+    NEXTCTR 6
+    NEXTCTR 7
+
+LDecDataOctets:
+        cmp len, 128
+        jb  LEndDecOctets
+        sub len, 128
+
+        vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
+        vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
+        vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
+        vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
+        vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
+        vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
+        vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
+        vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
+
+        vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        vpshufd TMP4, TMP5, 78
+        vpxor   TMP4, TMP4, TMP5
+        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
+        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
+        vpclmulqdq  TMP1, TMP5, TMP4, 011h
+        vpclmulqdq  TMP2, TMP5, TMP4, 000h
+
+        vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        ROUNDMUL 1
+        NEXTCTR 0
+        vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        ROUNDMUL 2
+        NEXTCTR 1
+        vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        ROUNDMUL 3
+        NEXTCTR 2
+        vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        ROUNDMUL 4
+        NEXTCTR 3
+        vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        ROUNDMUL 5
+        NEXTCTR 4
+        vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        ROUNDMUL 6
+        NEXTCTR 5
+        vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
+        vpshufb TMP5, TMP5, BSWAPMASK
+        vpxor   TMP5, TMP5, T
+        ROUNDMUL 7
+        NEXTCTR 6
+
+        ROUND 8
+        NEXTCTR 7
+
+        vpxor   TMP0, TMP0, TMP1
+        vpxor   TMP0, TMP0, TMP2
+        vpsrldq TMP3, TMP0, 8
+        vpxor   TMP4, TMP1, TMP3
+        vpslldq TMP3, TMP0, 8
+        vpxor   T, TMP2, TMP3
+
+        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
+        vpalignr    T,T,T,8
+        vpxor       T, T, TMP1
+
+        ROUND 9
+
+        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
+        vpalignr    T,T,T,8
+        vpxor       T, T, TMP1
+
+        vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
+        cmp         NR, 10
+        je          @f
+
+        ROUND 10
+        ROUND 11
+        vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
+        cmp         NR, 12
+        je          @f
+
+        ROUND 12
+        ROUND 13
+        vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
+@@:
+        vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + CT]
+        vaesenclast CTR0, CTR0, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + CT]
+        vaesenclast CTR1, CTR1, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + CT]
+        vaesenclast CTR2, CTR2, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + CT]
+        vaesenclast CTR3, CTR3, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + CT]
+        vaesenclast CTR4, CTR4, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + CT]
+        vaesenclast CTR5, CTR5, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + CT]
+        vaesenclast CTR6, CTR6, TMP3
+        vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + CT]
+        vaesenclast CTR7, CTR7, TMP3
+
+        vmovdqu XMMWORD PTR[0*16 + PT], CTR0
+        vmovdqu XMMWORD PTR[1*16 + PT], CTR1
+        vmovdqu XMMWORD PTR[2*16 + PT], CTR2
+        vmovdqu XMMWORD PTR[3*16 + PT], CTR3
+        vmovdqu XMMWORD PTR[4*16 + PT], CTR4
+        vmovdqu XMMWORD PTR[5*16 + PT], CTR5
+        vmovdqu XMMWORD PTR[6*16 + PT], CTR6
+        vmovdqu XMMWORD PTR[7*16 + PT], CTR7
+
+        vpxor   T, T, TMP4
+
+        lea CT, [8*16 + CT]
+        lea PT, [8*16 + PT]
+        jmp LDecDataOctets
+
+LEndDecOctets:
+
+    sub aluCTR, 7
+
+LDecDataSingles:
+
+        cmp len, 16
+        jb  LDecDataTail
+        sub len, 16
+
+        vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
+        NEXTCTR 0
+
+        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+        cmp NR, 10
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+        cmp NR, 12
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+        vaesenclast TMP1, TMP1, TMP2
+
+        vmovdqu TMP2, XMMWORD PTR[CT]
+        vpxor   TMP1, TMP1, TMP2
+        vmovdqu XMMWORD PTR[PT], TMP1
+
+        lea PT, [16+PT]
+        lea CT, [16+CT]
+
+        vpshufb TMP2, TMP2, BSWAPMASK
+        vpxor   T, T, TMP2
+        vmovdqu TMP0, XMMWORD PTR[Htbl]
+        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
+
+        jmp LDecDataSingles
+
+LDecDataTail:
+
+    test    len, len
+    jz      LDecDataEnd
+
+    vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
+    inc aluCTR
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+    cmp NR, 10
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+    cmp NR, 12
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+    vaesenclast TMP1, TMP1, TMP2
+; copy as many bytes as needed
+    xor KS, KS
+@@:
+        cmp len, KS
+        je  @f
+        mov al, [CT + KS]
+        mov [rsp + KS], al
+        inc KS
+        jmp @b
+@@:
+        cmp KS, 16
+        je  @f
+        mov BYTE PTR[rsp + KS], 0
+        inc KS
+        jmp @b
+@@:
+    vmovdqa TMP2, XMMWORD PTR[rsp]
+    vpshufb TMP2, TMP2, BSWAPMASK
+    vpxor   T, T, TMP2
+    vmovdqu TMP0, XMMWORD PTR[Htbl]
+    GFMUL   T, T, TMP0, TMP5, TMP2, TMP3, TMP4
+
+
+    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
+    vmovdqa XMMWORD PTR[rsp], TMP1
+    xor KS, KS
+@@:
+        cmp len, KS
+        je  @f
+        mov al, [rsp + KS]
+        mov [PT + KS], al
+        inc KS
+        jmp @b
+@@:
+        cmp KS, 16
+        je  @f
+        mov BYTE PTR[rsp + KS], 0
+        inc KS
+        jmp @b
+@@:
+
+LDecDataEnd:
+
+    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
+    bswap   aluCTR
+    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
+
+    mov rsp, rbp
+
+    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
+    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
+    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
+    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
+    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
+    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
+    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
+    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
+    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
+    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
+
+    add rsp, 10*16
+    pop rbp
+    pop r13
+    pop r12
+    pop r11
+
+    vzeroupper
+
+    ret
+intel_aes_gcmDEC ENDP
+
+
+END
new file mode 100644
--- /dev/null
+++ b/lib/freebl/intel-gcm-x86-masm.asm
@@ -0,0 +1,1212 @@
+; LICENSE:
+; This submission to NSS is to be made available under the terms of the
+; Mozilla Public License, v. 2.0. You can obtain one at http:
+; //mozilla.org/MPL/2.0/.
+;###############################################################################
+; Copyright(c) 2014, Intel Corp.
+; Developers and authors:
+; Shay Gueron and Vlad Krasnov
+; Intel Corporation, Israel Development Centre, Haifa, Israel
+; Please send feedback directly to crypto.feedback.alias@intel.com
+
+
+.MODEL FLAT, C
+.XMM
+
+.DATA
+ALIGN 16
+Lone            dq 1,0
+Ltwo            dq 2,0
+Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
+Lpoly           dq 01h, 0c200000000000000h
+
+.CODE
+
+
+GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
+    vpclmulqdq  TMP1, SRC2, SRC1, 0h
+    vpclmulqdq  TMP4, SRC2, SRC1, 011h
+
+    vpshufd     TMP2, SRC2, 78
+    vpshufd     TMP3, SRC1, 78
+    vpxor       TMP2, TMP2, SRC2
+    vpxor       TMP3, TMP3, SRC1
+
+    vpclmulqdq  TMP2, TMP2, TMP3, 0h
+    vpxor       TMP2, TMP2, TMP1
+    vpxor       TMP2, TMP2, TMP4
+
+    vpslldq     TMP3, TMP2, 8
+    vpsrldq     TMP2, TMP2, 8
+
+    vpxor       TMP1, TMP1, TMP3
+    vpxor       TMP4, TMP4, TMP2
+
+    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
+    vpshufd     TMP3, TMP1, 78
+    vpxor       TMP1, TMP2, TMP3
+
+    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
+    vpshufd     TMP3, TMP1, 78
+    vpxor       TMP1, TMP2, TMP3
+
+    vpxor       DST, TMP1, TMP4
+
+    ENDM
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Generates the final GCM tag
+; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
+;                       unsigned char *Tp,
+;                       unsigned int Mlen,
+;                       unsigned int Alen,
+;                       unsigned char* X0,
+;                       unsigned char* TAG);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmTAG PROC
+
+Htbl    textequ <eax>
+Tp      textequ <ecx>
+X0      textequ <edx>
+TAG     textequ <ebx>
+
+T       textequ <xmm0>
+TMP0    textequ <xmm1>
+
+    push    ebx
+
+    mov     Htbl,   [esp + 2*4 + 0*4]
+    mov     Tp,     [esp + 2*4 + 1*4]
+    mov     X0,     [esp + 2*4 + 4*4]
+    mov     TAG,    [esp + 2*4 + 5*4]
+
+    vzeroupper
+    vmovdqu T, XMMWORD PTR[Tp]
+
+    vpxor   TMP0, TMP0, TMP0
+    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
+    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
+    vpsllq  TMP0, TMP0, 3
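+    ; both 64-bit lanes now hold bit counts (bytes * 8), forming the 128-bit
+    ; GHASH length block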
+
+    vpxor   T, T, TMP0
+    vmovdqu TMP0, XMMWORD PTR[Htbl]
+    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
+
+    vpshufb T, T, [Lbswap_mask]
+    vpxor   T, T, [X0]
+    vmovdqu XMMWORD PTR[TAG], T
+    vzeroupper
+
+    pop ebx
+
+    ret
+
+intel_aes_gcmTAG ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Generates the H table
+; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmINIT PROC
+
+Htbl    textequ <eax>
+KS      textequ <ecx>
+NR      textequ <edx>
+
+T       textequ <xmm0>
+TMP0    textequ <xmm1>
+
+    mov     Htbl,   [esp + 4*1 + 0*4]
+    mov     KS,     [esp + 4*1 + 1*4]
+    mov     NR,     [esp + 4*1 + 2*4]
+
+    vzeroupper
+    ; AES-ENC(0)
+    vmovdqu T, XMMWORD PTR[KS]
+    lea KS, [16 + KS]
+    dec NR
+Lenc_loop:
+        vaesenc T, T, [KS]
+        lea KS, [16 + KS]
+        dec NR
+        jnz Lenc_loop
+
+    vaesenclast T, T, [KS]
+    vpshufb T, T, [Lbswap_mask]
+
+    ; Calculate H' = GFMUL(H, 2)
+    vpsrad  xmm3, T, 31
+    vpshufd xmm3, xmm3, 0ffh
+    vpand   xmm5, xmm3, [Lpoly]
+    vpsrld  xmm3, T, 31
+    vpslld  xmm4, T, 1
+    vpslldq xmm3, xmm3, 4
+    vpxor   T, xmm4, xmm3
+    vpxor   T, T, xmm5
+
+    vmovdqu TMP0, T
+    vmovdqu XMMWORD PTR[Htbl + 0*16], T
+
+    vpshufd xmm2, T, 78
+    vpxor   xmm2, xmm2, T
+    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
+
+    i = 1
+    WHILE i LT 8
+        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
+        vmovdqu XMMWORD PTR[Htbl + i*16], T
+        vpshufd xmm2, T, 78
+        vpxor   xmm2, xmm2, T
+        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
+        i = i+1
+        ENDM
+    vzeroupper
+    ret
+intel_aes_gcmINIT ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authenticate only
+; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmAAD PROC
+
+Htbl    textequ <eax>
+inp     textequ <ecx>
+len     textequ <edx>
+Tp      textequ <ebx>
+hlp0    textequ <esi>
+
+DATA    textequ <xmm0>
+T       textequ <xmm1>
+TMP0    textequ <xmm2>
+TMP1    textequ <xmm3>
+TMP2    textequ <xmm4>
+TMP3    textequ <xmm5>
+TMP4    textequ <xmm6>
+Xhi     textequ <xmm7>
+
+KARATSUBA_AAD MACRO i
+    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
+    vpxor       TMP0, TMP0, TMP3
+    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
+    vpxor       TMP1, TMP1, TMP3
+    vpshufd     TMP3, DATA, 78
+    vpxor       TMP3, TMP3, DATA
+    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
+    vpxor       TMP2, TMP2, TMP3
+ENDM
+
+    cmp   DWORD PTR[esp + 1*4 + 2*4], 0    ; Alen == 0?
+    jnz   LbeginAAD
+    ret
+
+LbeginAAD:
+    push    ebx
+    push    esi
+
+    mov     Htbl,   [esp + 4*3 + 0*4]
+    mov     inp,    [esp + 4*3 + 1*4]
+    mov     len,    [esp + 4*3 + 2*4]
+    mov     Tp,     [esp + 4*3 + 3*4]
+
+    vzeroupper
+
+    vpxor   Xhi, Xhi, Xhi
+
+    vmovdqu T, XMMWORD PTR[Tp]
+    ; we hash 8 blocks per iteration; if the total number of blocks is not a multiple of 8, the first n%8 blocks are hashed first
+    mov hlp0, len
+    and hlp0, 128-1
+    jz  Lmod_loop
+
+    and len, -128
+    sub hlp0, 16
+
+    ; Prefix block
+    vmovdqu DATA, XMMWORD PTR[inp]
+    vpshufb DATA, DATA, [Lbswap_mask]
+    vpxor   DATA, DATA, T
+
+    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
+    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
+    vpshufd     TMP3, DATA, 78
+    vpxor       TMP3, TMP3, DATA
+    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
+
+    lea     inp, [inp+16]
+    test    hlp0, hlp0
+    jnz     Lpre_loop
+    jmp     Lred1
+
+    ; hash the remaining prefix blocks (up to 7 prefix blocks in total)
+Lpre_loop:
+
+        sub hlp0, 16
+
+        vmovdqu DATA, XMMWORD PTR[inp]
+        vpshufb DATA, DATA, [Lbswap_mask]
+
+        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
+        vpxor       TMP0, TMP0, TMP3
+        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
+        vpxor       TMP1, TMP1, TMP3
+        vpshufd     TMP3, DATA, 78
+        vpxor       TMP3, TMP3, DATA
+        vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
+        vpxor       TMP2, TMP2, TMP3
+
+        test    hlp0, hlp0
+        lea     inp, [inp+16]
+        jnz     Lpre_loop
+
+Lred1:
+
+    vpxor       TMP2, TMP2, TMP0
+    vpxor       TMP2, TMP2, TMP1
+    vpsrldq     TMP3, TMP2, 8
+    vpslldq     TMP2, TMP2, 8
+
+    vpxor       Xhi, TMP1, TMP3
+    vpxor       T, TMP0, TMP2
+
+Lmod_loop:
+
+        sub len, 16*8
+        jb  Ldone
+        ; Block #0
+        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
+        vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]
+
+        vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
+        vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
+        vpshufd     TMP3, DATA, 78
+        vpxor       TMP3, TMP3, DATA
+        vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h
+
+        ; Block #1
+        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 1
+
+        ; Block #2
+        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
+        vpshufb DATA, DATA, [Lbswap_mask]
+
+        vpclmulqdq  TMP4, T, [Lpoly], 010h         ;reduction stage 1a
+        vpalignr    T, T, T, 8
+
+        KARATSUBA_AAD 2
+
+        vpxor       T, T, TMP4                          ;reduction stage 1b
+
+        ; Block #3
+        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 3
+        ; Block #4
+        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
+        vpshufb DATA, DATA, [Lbswap_mask]
+
+        vpclmulqdq  TMP4, T, [Lpoly], 010h        ;reduction stage 2a
+        vpalignr    T, T, T, 8
+
+        KARATSUBA_AAD 4
+
+        vpxor       T, T, TMP4                          ;reduction stage 2b
+        ; Block #5
+        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 5
+
+        vpxor   T, T, Xhi                               ;reduction finalize
+        ; Block #6
+        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        KARATSUBA_AAD 6
+        ; Block #7
+        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
+        vpshufb DATA, DATA, [Lbswap_mask]
+        vpxor   DATA, DATA, T
+        KARATSUBA_AAD 7
+        ; Aggregated 8 blocks, now karatsuba fixup
+        vpxor   TMP2, TMP2, TMP0
+        vpxor   TMP2, TMP2, TMP1
+        vpsrldq TMP3, TMP2, 8
+        vpslldq TMP2, TMP2, 8
+
+        vpxor   Xhi, TMP1, TMP3
+        vpxor   T, TMP0, TMP2
+
+        lea inp, [inp + 16*8]
+        jmp Lmod_loop
+
+Ldone:
+    vpclmulqdq  TMP4, T, [Lpoly], 010h
+    vpalignr    T, T, T, 8
+    vpxor       T, T, TMP4
+
+    vpclmulqdq  TMP4, T, [Lpoly], 010h
+    vpalignr    T, T, T, 8
+    vpxor       T, T, TMP4
+
+    vpxor       T, T, Xhi
+    vmovdqu     XMMWORD PTR[Tp], T
+    vzeroupper
+
+    pop esi
+    pop ebx
+    ret
+
+intel_aes_gcmAAD ENDP
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Encrypt and Authenticate
+; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ALIGN 16
+intel_aes_gcmENC PROC
+
+PT      textequ <eax>
+CT      textequ <ecx>
+Htbl    textequ <edx>
+Gctx    textequ <edx>
+len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
+KS      textequ <esi>
+NR      textequ <DWORD PTR[-40 + KS]>
+
+aluCTR  textequ <ebx>
+aluTMP  textequ <edi>
+
+T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
+TMP0    textequ <xmm1>
+TMP1    textequ <xmm2>
+TMP2    textequ <xmm3>
+TMP3    textequ <xmm4>
+TMP4    textequ <xmm5>
+TMP5    textequ <xmm6>
+
+CTR0    textequ <xmm0>
+CTR1    textequ <xmm1>
+CTR2    textequ <xmm2>
+CTR3    textequ <xmm3>
+CTR4    textequ <xmm4>
+CTR5    textequ <xmm5>
+CTR6    textequ <xmm6>
+
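+; In 32-bit mode only xmm0-xmm7 are available, so the hash value T stays in
+; memory (inside Gctx), 7 counter blocks are processed per iteration instead
+; of 8, and xmm7 is reserved for the round key inside ROUND.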
+ROUND MACRO i
+    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
+    vaesenc CTR0, CTR0, xmm7
+    vaesenc CTR1, CTR1, xmm7
+    vaesenc CTR2, CTR2, xmm7
+    vaesenc CTR3, CTR3, xmm7
+    vaesenc CTR4, CTR4, xmm7
+    vaesenc CTR5, CTR5, xmm7
+    vaesenc CTR6, CTR6, xmm7
+ENDM
+
+KARATSUBA MACRO i
+    vpshufd TMP4, TMP5, 78
+    vpxor   TMP4, TMP4, TMP5
+    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
+    vpxor       TMP0, TMP0, TMP3
+    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
+    vpclmulqdq  TMP3, TMP5, TMP4, 011h
+    vpxor       TMP1, TMP1, TMP3
+    vpclmulqdq  TMP3, TMP5, TMP4, 000h
+    vpxor       TMP2, TMP2, TMP3
+ENDM
+
+NEXTCTR MACRO i
+    add     aluCTR, 1
+    mov     aluTMP, aluCTR
+    bswap   aluTMP
+    xor     aluTMP, [3*4 + KS]
+    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
+ENDM
+
+    cmp DWORD PTR[1*4 + 3*4 + esp], 0
+    jne LbeginENC
+    ret
+
+LbeginENC:
+
+    vzeroupper
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+
+    mov ebp, esp
+    sub esp, 16*16
+    and esp, -16
+
+    mov PT, [ebp + 5*4 + 0*4]
+    mov CT, [ebp + 5*4 + 1*4]
+    mov Gctx, [ebp + 5*4 + 2*4]
+
+    mov     KS, [16*16 + 3*16 + Gctx]
+    lea     KS, [44 + KS]
+
+    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
+    bswap   aluCTR
+
+
+    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
+    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
+    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0
+
+    cmp len, 16*7
+    jb  LEncDataSingles
+; Prepare the "top" counters
+    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0
+
+    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
+    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
+; Encrypt the initial 7 blocks
+    sub len, 16*7
+    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
+    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
+    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
+    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
+    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
+    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
+
+    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]
+
+    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
+    vpxor   CTR0, CTR0, xmm7
+    vpxor   CTR1, CTR1, xmm7
+    vpxor   CTR2, CTR2, xmm7
+    vpxor   CTR3, CTR3, xmm7
+    vpxor   CTR4, CTR4, xmm7
+    vpxor   CTR5, CTR5, xmm7
+    vpxor   CTR6, CTR6, xmm7
+
+    ROUND   1
+
+    add aluCTR, 7
+    mov aluTMP, aluCTR
+    bswap   aluTMP
+    xor aluTMP, [KS + 3*4]
+    mov [8*16 + 0*16 + 3*4 + esp], aluTMP
+
+    ROUND   2
+    NEXTCTR 1
+    ROUND   3
+    NEXTCTR 2
+    ROUND   4
+    NEXTCTR 3
+    ROUND   5
+    NEXTCTR 4
+    ROUND   6
+    NEXTCTR 5
+    ROUND   7
+    NEXTCTR 6
+    ROUND   8
+    ROUND   9
+    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
+    cmp     NR, 10
+    je      @f
+
+    ROUND   10
+    ROUND   11
+    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
+    cmp     NR, 12
+    je      @f
+
+    ROUND   12
+    ROUND   13
+    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
+@@:
+    vaesenclast CTR0, CTR0, xmm7
+    vaesenclast CTR1, CTR1, xmm7
+    vaesenclast CTR2, CTR2, xmm7
+    vaesenclast CTR3, CTR3, xmm7
+    vaesenclast CTR4, CTR4, xmm7
+    vaesenclast CTR5, CTR5, xmm7
+    vaesenclast CTR6, CTR6, xmm7
+
+    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
+    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
+    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
+    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
+    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
+    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
+    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]
+
+    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
+    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
+    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
+    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
+    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
+    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
+    vmovdqu XMMWORD PTR[6*16 + CT], CTR6
+
+    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
+    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
+    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]
+
+    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
+    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
+    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
+    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
+    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
+    vmovdqa XMMWORD PTR[6*16 + esp], CTR0
+
+    lea CT, [7*16 + CT]
+    lea PT, [7*16 + PT]
+    jmp LEncData7
+
+LEncData7:
+        cmp len, 16*7
+        jb  LEndEnc7
+        sub len, 16*7
+
+        vpshufd TMP4, TMP5, 78
+        vpxor   TMP4, TMP4, TMP5
+        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
+        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
+        vpclmulqdq  TMP1, TMP5, TMP4, 011h
+        vpclmulqdq  TMP2, TMP5, TMP4, 000h
+
+        vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
+        KARATSUBA 1
+        vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
+        KARATSUBA 2
+        vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
+        KARATSUBA 3
+        vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
+        KARATSUBA 4
+        vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
+        KARATSUBA 5
+        vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
+        vpxor   TMP5, TMP5, T
+        KARATSUBA 6
+
+        vpxor   TMP0, TMP0, TMP1
+        vpxor   TMP0, TMP0, TMP2
+        vpsrldq TMP3, TMP0, 8
+        vpxor   TMP4, TMP1, TMP3
+        vpslldq TMP3, TMP0, 8
+        vpxor   TMP5, TMP2, TMP3
+
+        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
+        vpalignr    TMP5,TMP5,TMP5,8
+        vpxor       TMP5, TMP5, TMP1
+
+        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
+        vpalignr    TMP5,TMP5,TMP5,8
+        vpxor       TMP5, TMP5, TMP1
+
+        vpxor       TMP5, TMP5, TMP4
+        vmovdqu     T, TMP5
+
+        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
+        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
+        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
+        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
+        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
+        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
+        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]
+
+        ROUND 1
+        NEXTCTR 0
+        ROUND 2
+        NEXTCTR 1
+        ROUND 3
+        NEXTCTR 2
+        ROUND 4
+        NEXTCTR 3
+        ROUND 5
+        NEXTCTR 4
+        ROUND 6
+        NEXTCTR 5
+        ROUND 7
+        NEXTCTR 6
+
+        ROUND 8
+        ROUND 9
+
+        vmovdqu     xmm7, XMMWORD PTR[10*16 + KS]
+        cmp         NR, 10
+        je          @f
+
+        ROUND 10
+        ROUND 11
+        vmovdqu     xmm7, XMMWORD PTR[12*16 + KS]
+        cmp         NR, 12
+        je          @f
+
+        ROUND 12
+        ROUND 13
+        vmovdqu     xmm7, XMMWORD PTR[14*16 + KS]
+@@:
+        vaesenclast CTR0, CTR0, xmm7
+        vaesenclast CTR1, CTR1, xmm7
+        vaesenclast CTR2, CTR2, xmm7
+        vaesenclast CTR3, CTR3, xmm7
+        vaesenclast CTR4, CTR4, xmm7
+        vaesenclast CTR5, CTR5, xmm7
+        vaesenclast CTR6, CTR6, xmm7
+
+        vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
+        vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
+        vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
+        vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
+        vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
+        vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
+        vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]
+
+        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
+        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
+        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
+        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
+        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
+        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
+        vmovdqu XMMWORD PTR[6*16 + CT], CTR6
+
+        vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
+        vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
+        vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
+        vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
+        vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
+        vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
+        vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]
+
+        vmovdqa XMMWORD PTR[1*16 + esp], CTR5
+        vmovdqa XMMWORD PTR[2*16 + esp], CTR4
+        vmovdqa XMMWORD PTR[3*16 + esp], CTR3
+        vmovdqa XMMWORD PTR[4*16 + esp], CTR2
+        vmovdqa XMMWORD PTR[5*16 + esp], CTR1
+        vmovdqa XMMWORD PTR[6*16 + esp], CTR0
+
+        lea CT, [7*16 + CT]
+        lea PT, [7*16 + PT]
+        jmp LEncData7
+
+LEndEnc7:
+
+    vpshufd TMP4, TMP5, 78
+    vpxor   TMP4, TMP4, TMP5
+    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
+    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
+    vpclmulqdq  TMP1, TMP5, TMP4, 011h
+    vpclmulqdq  TMP2, TMP5, TMP4, 000h
+
+    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
+    KARATSUBA 1
+    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
+    KARATSUBA 2
+    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
+    KARATSUBA 3
+    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
+    KARATSUBA 4
+    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
+    KARATSUBA 5
+    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
+    vpxor   TMP5, TMP5, T
+    KARATSUBA 6
+
+    vpxor   TMP0, TMP0, TMP1
+    vpxor   TMP0, TMP0, TMP2
+    vpsrldq TMP3, TMP0, 8
+    vpxor   TMP4, TMP1, TMP3
+    vpslldq TMP3, TMP0, 8
+    vpxor   TMP5, TMP2, TMP3
+
+    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
+    vpalignr    TMP5,TMP5,TMP5,8
+    vpxor       TMP5, TMP5, TMP1
+
+    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
+    vpalignr    TMP5,TMP5,TMP5,8
+    vpxor       TMP5, TMP5, TMP1
+
+    vpxor       TMP5, TMP5, TMP4
+    vmovdqu     T, TMP5
+
+    sub aluCTR, 6
+
+LEncDataSingles:
+
+        cmp len, 16
+        jb  LEncDataTail
+        sub len, 16
+
+        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
+        NEXTCTR 0
+
+        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+        cmp NR, 10
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+        cmp NR, 12
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+        vaesenclast TMP1, TMP1, TMP2
+        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
+        vmovdqu XMMWORD PTR[CT], TMP1
+
+        lea PT, [16+PT]
+        lea CT, [16+CT]
+
+        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
+        vpxor   TMP1, TMP1, T
+
+        vmovdqu TMP0, XMMWORD PTR[Htbl]
+        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
+        vmovdqu T, TMP1
+
+        jmp LEncDataSingles
+
+LEncDataTail:
+
+    cmp len, 0
+    je  LEncDataEnd
+
+    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
+
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+    cmp NR, 10
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+    cmp NR, 12
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+    vaesenclast TMP1, TMP1, TMP2
+; zero a temp location
+    vpxor   TMP2, TMP2, TMP2
+    vmovdqa XMMWORD PTR[esp], TMP2
+; copy as many bytes as needed
+    xor KS, KS
+@@:
+        cmp len, KS
+        je  @f
+        mov di, [PT + KS]
+        mov [esp + KS], di
+        inc KS
+        jmp @b
+@@:
+    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
+    vmovdqa XMMWORD PTR[esp], TMP1
+    xor KS, KS
+@@:
+        cmp len, KS
+        je  @f
+        mov di, [esp + KS]
+        mov [CT + KS], di
+        inc KS
+        jmp @b
+@@:
+        cmp KS, 16
+        je  @f
+        mov BYTE PTR[esp + KS], 0
+        inc KS
+        jmp @b
+@@:
+    vmovdqa TMP1, XMMWORD PTR[esp]
+
+    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
+    vpxor   TMP1, TMP1, T
+
+    vmovdqu TMP0, XMMWORD PTR[Htbl]
+    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
+    vmovdqu T, TMP1
+
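+; write the updated counter back (big-endian) into the GCM context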
+LEncDataEnd:
+    inc     aluCTR
+    bswap   aluCTR
+    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
+
+    mov esp, ebp
+    pop edi
+    pop esi
+    pop ebx
+    pop ebp
+
+
+    vzeroupper
+
+    ret
+intel_aes_gcmENC ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Decrypt and Authenticate
+; void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx, unsigned int len);
+; (ciphertext in, plaintext out; see the argument loads below)
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
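+; NEXTCTR: advance the 32-bit counter and refresh the last dword of the
+; pre-whitened counter block i on the stack (the counter is kept big-endian
+; in the block, so it is byte-swapped and XORed with dword 3 of round key 0)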
+NEXTCTR MACRO i
+    add     aluCTR, 1
+    mov     aluTMP, aluCTR
+    bswap   aluTMP
+    xor     aluTMP, [3*4 + KS]
+    mov     [3*4 + i*16 + esp], aluTMP
+ENDM
+
+intel_aes_gcmDEC PROC
+
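+; return immediately when len (the fourth stack argument) is zero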
+    cmp DWORD PTR[1*4 + 3*4 + esp], 0
+    jne LbeginDEC
+    ret
+
+LbeginDEC:
+
+    vzeroupper
+    push    ebp
+    push    ebx
+    push    esi
+    push    edi
+
+    mov ebp, esp
+    sub esp, 8*16
+    and esp, -16
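+; reserve a 16-byte aligned stack area for the pre-whitened counter blocks;
+; slot 0 is reused as scratch for the final partial block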
+
+    mov CT, [ebp + 5*4 + 0*4]
+    mov PT, [ebp + 5*4 + 1*4]
+    mov Gctx, [ebp + 5*4 + 2*4]
+
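+; KS -> round key 0 of the expanded AES key schedule; aluCTR picks up the
+; current counter dword in host byte order; the counter block is cached on
+; the stack already XORed with round key 0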
+    mov     KS, [16*16 + 3*16 + Gctx]
+    lea     KS, [44 + KS]
+
+    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
+    bswap   aluCTR
+
+
+    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
+    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
+    vmovdqu XMMWORD PTR[0*16 + esp], TMP0
+
+    cmp len, 16*7
+    jb  LDecDataSingles
+    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
+    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
+    dec aluCTR
+
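+; main loop: GHASH seven ciphertext blocks with the precomputed Htbl powers,
+; then run the AES rounds on seven counter blocks and XOR with the ciphertext
+; to recover the plaintext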
+LDecData7:
+    cmp len, 16*7
+    jb  LDecData7End
+    sub len, 16*7
+
+    vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
+    vpxor   TMP5, TMP5, T
+    vpshufd TMP4, TMP5, 78
+    vpxor   TMP4, TMP4, TMP5
+    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
+    vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
+    vpclmulqdq  TMP1, TMP5, TMP4, 011h
+    vpclmulqdq  TMP2, TMP5, TMP4, 000h
+
+    NEXTCTR 0
+    vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
+    KARATSUBA 5
+    NEXTCTR 1
+    vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
+    KARATSUBA 4
+    NEXTCTR 2
+    vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
+    KARATSUBA 3
+    NEXTCTR 3
+    vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
+    KARATSUBA 2
+    NEXTCTR 4
+    vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
+    KARATSUBA 1
+    NEXTCTR 5
+    vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
+    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
+    KARATSUBA 0
+    NEXTCTR 6
+
+    vpxor   TMP0, TMP0, TMP1
+    vpxor   TMP0, TMP0, TMP2
+    vpsrldq TMP3, TMP0, 8
+    vpxor   TMP4, TMP1, TMP3
+    vpslldq TMP3, TMP0, 8
+    vpxor   TMP5, TMP2, TMP3
+
+    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
+    vpalignr    TMP5, TMP5, TMP5, 8
+    vpxor       TMP5, TMP5, TMP1
+
+    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
+    vpalignr    TMP5, TMP5, TMP5, 8
+    vpxor       TMP5, TMP5, TMP1
+
+    vpxor       TMP5, TMP5, TMP4
+    vmovdqu     T, TMP5
+
+    vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
+    vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
+    vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
+    vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
+    vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
+    vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
+    vmovdqa CTR6, XMMWORD PTR[6*16 + esp]
+
+    ROUND   1
+    ROUND   2
+    ROUND   3
+    ROUND   4
+    ROUND   5
+    ROUND   6
+    ROUND   7
+    ROUND   8
+    ROUND   9
+    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
+    cmp     NR, 10
+    je      @f
+
+    ROUND   10
+    ROUND   11
+    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
+    cmp     NR, 12
+    je      @f
+
+    ROUND   12
+    ROUND   13
+    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
+@@:
+    vaesenclast CTR0, CTR0, xmm7
+    vaesenclast CTR1, CTR1, xmm7
+    vaesenclast CTR2, CTR2, xmm7
+    vaesenclast CTR3, CTR3, xmm7
+    vaesenclast CTR4, CTR4, xmm7
+    vaesenclast CTR5, CTR5, xmm7
+    vaesenclast CTR6, CTR6, xmm7
+
+    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
+    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
+    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
+    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
+    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
+    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
+    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]
+
+    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
+    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
+    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
+    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
+    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
+    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
+    vmovdqu XMMWORD PTR[6*16 + PT], CTR6
+
+    lea CT, [7*16 + CT]
+    lea PT, [7*16 + PT]
+    jmp LDecData7
+
+LDecData7End:
+
+    NEXTCTR 0
+
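+; decrypt any remaining whole 16-byte blocks one at a time; for decryption
+; the ciphertext block is hashed into T before it is decrypted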
+LDecDataSingles:
+
+        cmp len, 16
+        jb  LDecDataTail
+        sub len, 16
+
+        vmovdqu TMP1, XMMWORD PTR[CT]
+        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
+        vpxor   TMP1, TMP1, T
+
+        vmovdqu TMP0, XMMWORD PTR[Htbl]
+        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
+        vmovdqu T, TMP1
+
+        vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
+        NEXTCTR 0
+
+        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+        cmp NR, 10
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+        cmp NR, 12
+        je  @f
+        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+        vaesenclast TMP1, TMP1, TMP2
+        vpxor   TMP1, TMP1, XMMWORD PTR[CT]
+        vmovdqu XMMWORD PTR[PT], TMP1
+
+        lea PT, [16+PT]
+        lea CT, [16+CT]
+        jmp LDecDataSingles
+
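+; final partial block: hash the zero-padded ciphertext into T, then XOR the
+; keystream with the remaining bytes to recover the plaintext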
+LDecDataTail:
+
+    cmp len, 0
+    je  LDecDataEnd
+
+    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
+    inc aluCTR
+    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
+    cmp NR, 10
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
+    cmp NR, 12
+    je  @f
+    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
+    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
+    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
+@@:
+    vaesenclast xmm7, TMP1, TMP2
+
+; copy the remaining len (< 16) ciphertext bytes into the scratch slot and
+; zero-pad it to a full block
+    xor KS, KS
+@@:
+        cmp len, KS
+        je  @f
+        mov di, [CT + KS]
+        mov [esp + KS], di
+        inc KS
+        jmp @b
+@@:
+        cmp KS, 16
+        je  @f
+        mov BYTE PTR[esp + KS], 0
+        inc KS
+        jmp @b
+@@:
+
+    vmovdqa TMP1, XMMWORD PTR[esp]
+    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
+    vpxor   TMP1, TMP1, T
+
+    vmovdqu TMP0, XMMWORD PTR[Htbl]
+    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
+    vmovdqu T, TMP1
+
+
+    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
+    vmovdqa XMMWORD PTR[esp], xmm7
+    xor     KS, KS
+@@:
+        cmp len, KS
+        je  @f
+        mov di, [esp + KS]
+        mov [PT + KS], di
+        inc KS
+        jmp @b
+@@:
+        cmp KS, 16
+        je  @f
+        mov BYTE PTR[PT + KS], 0
+        inc KS
+        jmp @b
+@@:
+
+LDecDataEnd:
+
+    bswap   aluCTR
+    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
+
+    mov esp, ebp
+    pop edi
+    pop esi
+    pop ebx
+    pop ebp
+
+    vzeroupper
+
+    ret
+intel_aes_gcmDEC ENDP
+
+
+END
--- a/lib/freebl/rijndael.c
+++ b/lib/freebl/rijndael.c
@@ -1175,17 +1175,21 @@ AES_InitContext(AESContext *cx, const un
 	cx->worker = (freeblCipherFunc)
 			(encrypt ? GCM_EncryptUpdate : GCM_DecryptUpdate);
 	cx->destroy = (freeblDestroyFunc) GCM_DestroyContext;
 	cx->isBlock = PR_FALSE;
 	}
 	break;
     case NSS_AES_CTR:
 	cx->worker_cx = CTR_CreateContext(cx, cx->worker, iv, blocksize);
-	cx->worker = (freeblCipherFunc) CTR_Update ;
+	cx->worker = (freeblCipherFunc) CTR_Update;
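+	/* When the CPU supports AES-NI (use_hw_aes), substitute the assembler
+	 * CTR implementation. */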
+#if defined(USE_HW_AES) && defined(_MSC_VER)
+	if (use_hw_aes)
+	    cx->worker = (freeblCipherFunc) CTR_Update_HW_AES;
+#endif
 	cx->destroy = (freeblDestroyFunc) CTR_DestroyContext;
 	cx->isBlock = PR_FALSE;
 	break;
     default:
 	/* everything has already been set up by aes_InitContext, just
 	 * return */
 	return SECSuccess;
     }