Bug 1152625 - Support AES HW acceleration on ARMv8. r=KevinJacobs,jcj
☠☠ backed out by 777b6070fe76 ☠ ☠
authorMakoto Kato <m_kato@ga2.so-net.ne.jp>
Tue, 30 Jul 2019 22:01:35 +0000
changeset 15241 009a7163c80a711bd681b155ecffa5962fd98a46
parent 15240 caf5d97f786fc3dad2f9318b228007a233b5f0bd
child 15242 085e429af86cc9e3808c076f0dd4a37ebe4a1a8c
push id3453
push userjjones@mozilla.com
push dateTue, 30 Jul 2019 22:01:44 +0000
reviewersKevinJacobs, jcj
bugs1152625
Bug 1152625 - Support AES HW acceleration on ARMv8. r=KevinJacobs,jcj Differential Revision: https://phabricator.services.mozilla.com/D34473
lib/freebl/Makefile
lib/freebl/aes-armv8.c
lib/freebl/aes-armv8.h
lib/freebl/freebl.gyp
lib/freebl/intel-aes.h
lib/freebl/rijndael.c
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -236,19 +236,34 @@ ifeq ($(CPU_ARCH),x86)
     DEFINES += -DMP_ASSEMBLY_DIV_2DX1D -DMP_USE_UINT_DIGIT
     DEFINES += -DMP_IS_LITTLE_ENDIAN
 endif
 ifeq ($(CPU_ARCH),arm)
     DEFINES += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE 
     DEFINES += -DMP_USE_UINT_DIGIT
     DEFINES += -DSHA_NO_LONG_LONG # avoid 64-bit arithmetic in SHA512
     MPI_SRCS += mpi_arm.c
+    ifdef CC_IS_CLANG
+        DEFINES += -DUSE_HW_AES
+        EXTRA_SRCS += aes-armv8.c
+    else ifeq (1,$(CC_IS_GCC))
+        # Old compiler doesn't support ARM AES.
+        ifneq (,$(filter 4.9,$(word 1,$(GCC_VERSION)).$(word 2,$(GCC_VERSION))))
+            DEFINES += -DUSE_HW_AES
+            EXTRA_SRCS += aes-armv8.c
+        endif
+        ifeq (,$(filter 0 1 2 3 4,$(word 1,$(GCC_VERSION))))
+            DEFINES += -DUSE_HW_AES
+            EXTRA_SRCS += aes-armv8.c
+        endif
+    endif
 endif
 ifeq ($(CPU_ARCH),aarch64)
-    EXTRA_SRCS += gcm-aarch64.c
+    DEFINES += -DUSE_HW_AES
+    EXTRA_SRCS += aes-armv8.c gcm-aarch64.c
 endif
 ifeq ($(CPU_ARCH),ppc)
 ifdef USE_64
     DEFINES += -DNSS_NO_INIT_SUPPORT
 endif # USE_64
 endif # ppc
 endif # Linux
 
@@ -756,11 +771,15 @@ endif
 
 ifdef INTEL_GCM_CLANG_CL
 #
 # clang-cl needs -mssse3
 #
 $(OBJDIR)/$(PROG_PREFIX)intel-gcm-wrap$(OBJ_SUFFIX): CFLAGS += -mssse3
 endif
 
+ifeq ($(CPU_ARCH),arm)
+$(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8
+endif
 ifeq ($(CPU_ARCH),aarch64)
+$(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto
 $(OBJDIR)/$(PROG_PREFIX)gcm-aarch64$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto
 endif
new file mode 100644
--- /dev/null
+++ b/lib/freebl/aes-armv8.c
@@ -0,0 +1,1168 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "secerr.h"
+#include "rijndael.h"
+
+#if (defined(__clang__) ||                            \
+     (defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+      (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 8))))
+
+#ifndef __ARM_FEATURE_CRYPTO
+#error "Compiler option is invalid"
+#endif
+
+#include <arm_neon.h>
+
+SECStatus
+arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaeseq_u8(state, key1);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key2);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key3);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key4);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key5);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key6);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key7);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key8);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key9);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key10);
+        /* AddRoundKey */
+        state = veorq_u8(state, key11);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+    }
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    if (inputLen == 0) {
+        return SECSuccess;
+    }
+
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaesdq_u8(state, key11);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key10);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key9);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key8);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key7);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key6);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key5);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key4);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key3);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key2);
+        /* AddRoundKey */
+        state = veorq_u8(state, key1);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+    }
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11;
+    uint8x16_t iv;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    /* iv */
+    iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16));
+
+    /* expandedKey */
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        state = veorq_u8(state, iv);
+
+        /* Rounds */
+        state = vaeseq_u8(state, key1);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key2);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key3);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key4);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key5);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key6);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key7);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key8);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key9);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key10);
+        /* AddRoundKey */
+        state = veorq_u8(state, key11);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+        iv = state;
+    }
+    vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t iv;
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    /* iv */
+    iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16));
+
+    /* expandedKey */
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state, old_state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        old_state = state;
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaesdq_u8(state, key11);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key10);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key9);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key8);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key7);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key6);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key5);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key4);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key3);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key2);
+        /* AddRoundKey */
+        state = veorq_u8(state, key1);
+
+        state = veorq_u8(state, iv);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+
+        iv = old_state;
+    }
+    vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13;
+    PRUint8 *key = (PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaeseq_u8(state, key1);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key2);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key3);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key4);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key5);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key6);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key7);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key8);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key9);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key10);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key11);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key12);
+        /* AddRoundKey */
+        state = veorq_u8(state, key13);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+    }
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaesdq_u8(state, key13);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key12);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key11);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key10);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key9);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key8);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key7);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key6);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key5);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key4);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key3);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key2);
+        /* AddRoundKey */
+        state = veorq_u8(state, key1);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+    }
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13;
+    uint8x16_t iv;
+    PRUint8 *key = (PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    /* iv */
+    iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16));
+
+    /* expandedKey */
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        state = veorq_u8(state, iv);
+
+        /* Rounds */
+        state = vaeseq_u8(state, key1);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key2);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key3);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key4);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key5);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key6);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key7);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key8);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key9);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key10);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key11);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key12);
+        state = veorq_u8(state, key13);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+        iv = state;
+    }
+    vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t iv;
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    /* iv */
+    iv = vld1q_u8(__builtin_assume_aligned(cx->iv, 16));
+
+    /* expandedKey */
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state, old_state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        old_state = state;
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaesdq_u8(state, key13);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key12);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key11);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key10);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key9);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key8);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key7);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key6);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key5);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key4);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key3);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key2);
+        /* AddRoundKey */
+        state = veorq_u8(state, key1);
+
+        state = veorq_u8(state, iv);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+
+        iv = old_state;
+    }
+    vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13, key14, key15;
+    PRUint8 *key = (PRUint8 *)cx->expandedKey;
+
+    if (inputLen == 0) {
+        return SECSuccess;
+    }
+
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+    key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+    key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaeseq_u8(state, key1);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key2);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key3);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key4);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key5);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key6);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key7);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key8);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key9);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key10);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key11);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key12);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key13);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key14);
+        /* AddRoundKey */
+        state = veorq_u8(state, key15);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+    }
+    return SECSuccess;
+}
+
+SECStatus
+arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13, key14, key15;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+    key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+    key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        state = vaesdq_u8(state, key15);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key14);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key13);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key12);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key11);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key10);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key9);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key8);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key7);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key6);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key5);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key4);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key3);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key2);
+        /* AddRoundKey */
+        state = veorq_u8(state, key1);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+    }
+
+    return SECSuccess;
+}
+
+/*
+ * AES-256 CBC encryption using the ARMv8-A Crypto Extension
+ * (AESE/AESMC) intrinsics.  Each plaintext block is XORed with the
+ * running IV before encryption, and the resulting ciphertext becomes
+ * the IV for the next block; the final ciphertext block is written
+ * back to cx->iv so a subsequent call continues the chain.
+ *
+ * NOTE(review): outputLen, maxOutputLen and blocksize are accepted for
+ * freeblCipherFunc signature compatibility but unused; inputLen is
+ * assumed to be a multiple of 16 (unsigned "inputLen -= 16" wraps
+ * otherwise).  Also, the IV is loaded below without an alignment
+ * assumption but stored at the end with
+ * __builtin_assume_aligned(cx->iv, 16) -- confirm cx->iv really is
+ * 16-byte aligned in AESContext.
+ */
+SECStatus
+arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    /* Aligned bounce buffer for targets that fault on unaligned NEON
+     * loads/stores. */
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13, key14, key15;
+    uint8x16_t iv;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    /* Nothing to do for empty input. */
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    /* iv */
+    iv = vld1q_u8(cx->iv);
+
+    /* expandedKey: preload all 15 round keys (AES-256). */
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+    key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+    key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+    /* Process one 16-byte block per iteration. */
+    while (inputLen > 0) {
+        uint8x16_t state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        /* Bounce through buf when input is not 8-byte aligned. */
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        input += 16;
+        inputLen -= 16;
+
+        /* CBC chaining: XOR plaintext with the previous ciphertext/IV. */
+        state = veorq_u8(state, iv);
+
+        /* Rounds */
+        /* 13 full rounds: AESE (AddRoundKey + SubBytes + ShiftRows)
+         * followed by AESMC (MixColumns). */
+        state = vaeseq_u8(state, key1);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key2);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key3);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key4);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key5);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key6);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key7);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key8);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key9);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key10);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key11);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key12);
+        state = vaesmcq_u8(state);
+        state = vaeseq_u8(state, key13);
+        state = vaesmcq_u8(state);
+        /* Final round: AESE without MixColumns. */
+        state = vaeseq_u8(state, key14);
+        /* AddRoundKey */
+        state = veorq_u8(state, key15);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        /* Mirror of the load path: bounce unaligned output through buf. */
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+        /* The ciphertext just produced chains into the next block. */
+        iv = state;
+    }
+    /* Persist the final IV for the next call on this context. */
+    vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+    return SECSuccess;
+}
+
+/*
+ * AES-256 CBC decryption using the ARMv8-A Crypto Extension
+ * (AESD/AESIMC) intrinsics.  Each ciphertext block is saved
+ * (old_state) before decryption so it can serve as the IV for the
+ * next block; the decrypted block is XORed with the running IV to
+ * recover the plaintext.  The last ciphertext block is written back
+ * to cx->iv so a subsequent call continues the chain.
+ *
+ * NOTE(review): outputLen, maxOutputLen and blocksize are accepted for
+ * freeblCipherFunc signature compatibility but unused; inputLen is
+ * assumed to be a multiple of 16 (unsigned "inputLen -= 16" wraps
+ * otherwise).  The IV is loaded without an alignment assumption but
+ * stored with __builtin_assume_aligned(cx->iv, 16) -- confirm cx->iv
+ * is 16-byte aligned in AESContext.
+ */
+SECStatus
+arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output,
+                        unsigned int *outputLen,
+                        unsigned int maxOutputLen,
+                        const unsigned char *input,
+                        unsigned int inputLen,
+                        unsigned int blocksize)
+{
+#if !defined(HAVE_UNALIGNED_ACCESS)
+    /* Aligned bounce buffer for targets that fault on unaligned NEON
+     * loads/stores. */
+    pre_align unsigned char buf[16] post_align;
+#endif
+    uint8x16_t iv;
+    uint8x16_t key1, key2, key3, key4, key5, key6, key7, key8, key9, key10;
+    uint8x16_t key11, key12, key13, key14, key15;
+    const PRUint8 *key = (const PRUint8 *)cx->expandedKey;
+
+    /* Nothing to do for empty input. */
+    if (!inputLen) {
+        return SECSuccess;
+    }
+
+    /* iv */
+    iv = vld1q_u8(cx->iv);
+
+    /* expandedKey: preload all 15 inverse-schedule round keys. */
+    key1 = vld1q_u8(__builtin_assume_aligned(key, 16));
+    key2 = vld1q_u8(__builtin_assume_aligned(key + 16, 16));
+    key3 = vld1q_u8(__builtin_assume_aligned(key + 32, 16));
+    key4 = vld1q_u8(__builtin_assume_aligned(key + 48, 16));
+    key5 = vld1q_u8(__builtin_assume_aligned(key + 64, 16));
+    key6 = vld1q_u8(__builtin_assume_aligned(key + 80, 16));
+    key7 = vld1q_u8(__builtin_assume_aligned(key + 96, 16));
+    key8 = vld1q_u8(__builtin_assume_aligned(key + 112, 16));
+    key9 = vld1q_u8(__builtin_assume_aligned(key + 128, 16));
+    key10 = vld1q_u8(__builtin_assume_aligned(key + 144, 16));
+    key11 = vld1q_u8(__builtin_assume_aligned(key + 160, 16));
+    key12 = vld1q_u8(__builtin_assume_aligned(key + 176, 16));
+    key13 = vld1q_u8(__builtin_assume_aligned(key + 192, 16));
+    key14 = vld1q_u8(__builtin_assume_aligned(key + 208, 16));
+    key15 = vld1q_u8(__builtin_assume_aligned(key + 224, 16));
+
+    /* Process one 16-byte block per iteration. */
+    while (inputLen > 0) {
+        uint8x16_t state, old_state;
+#if defined(HAVE_UNALIGNED_ACCESS)
+        state = vld1q_u8(input);
+#else
+        /* Bounce through buf when input is not 8-byte aligned. */
+        if ((uintptr_t)input & 0x7) {
+            memcpy(buf, input, 16);
+            state = vld1q_u8(__builtin_assume_aligned(buf, 16));
+        } else {
+            state = vld1q_u8(__builtin_assume_aligned(input, 8));
+        }
+#endif
+        /* Save the raw ciphertext; it becomes the next block's IV. */
+        old_state = state;
+        input += 16;
+        inputLen -= 16;
+
+        /* Rounds */
+        /* 13 full inverse rounds: AESD followed by AESIMC. */
+        state = vaesdq_u8(state, key15);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key14);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key13);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key12);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key11);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key10);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key9);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key8);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key7);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key6);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key5);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key4);
+        state = vaesimcq_u8(state);
+        state = vaesdq_u8(state, key3);
+        state = vaesimcq_u8(state);
+        /* Final round: AESD without InvMixColumns. */
+        state = vaesdq_u8(state, key2);
+        /* AddRoundKey */
+        state = veorq_u8(state, key1);
+
+        /* CBC chaining: XOR with the previous ciphertext/IV. */
+        state = veorq_u8(state, iv);
+
+#if defined(HAVE_UNALIGNED_ACCESS)
+        vst1q_u8(output, state);
+#else
+        /* Mirror of the load path: bounce unaligned output through buf. */
+        if ((uintptr_t)output & 0x7) {
+            vst1q_u8(__builtin_assume_aligned(buf, 16), state);
+            memcpy(output, buf, 16);
+        } else {
+            vst1q_u8(__builtin_assume_aligned(output, 8), state);
+        }
+#endif
+        output += 16;
+
+        iv = old_state;
+    }
+    /* Persist the final IV for the next call on this context. */
+    vst1q_u8(__builtin_assume_aligned(cx->iv, 16), iv);
+
+    return SECSuccess;
+}
+
+#endif
new file mode 100644
--- /dev/null
+++ b/lib/freebl/aes-armv8.h
@@ -0,0 +1,103 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/*
+ * Prototypes for the ARMv8 Crypto Extension AES block workers
+ * implemented in aes-armv8.c, one per (direction, mode, key size)
+ * combination.  All share the freeblCipherFunc-compatible signature
+ * used by rijndael.c.
+ */
+SECStatus arm_aes_encrypt_ecb_128(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_decrypt_ecb_128(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_encrypt_cbc_128(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_decrypt_cbc_128(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_encrypt_ecb_192(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_decrypt_ecb_192(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_encrypt_cbc_192(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_decrypt_cbc_192(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_encrypt_ecb_256(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_decrypt_ecb_256(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_encrypt_cbc_256(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+SECStatus arm_aes_decrypt_cbc_256(AESContext *cx, unsigned char *output,
+                                  unsigned int *outputLen,
+                                  unsigned int maxOutputLen,
+                                  const unsigned char *input,
+                                  unsigned int inputLen,
+                                  unsigned int blocksize);
+
+/* Select the ECB worker for a direction and key size in bytes
+ * (16/24/anything-else -> 128/192/256-bit worker).  Same interface as
+ * the former intel_aes_ecb_worker in intel-aes.h. */
+#define native_aes_ecb_worker(encrypt, keysize)                          \
+    ((encrypt)                                                           \
+         ? ((keysize) == 16 ? arm_aes_encrypt_ecb_128                    \
+                            : (keysize) == 24 ? arm_aes_encrypt_ecb_192  \
+                                              : arm_aes_encrypt_ecb_256) \
+         : ((keysize) == 16 ? arm_aes_decrypt_ecb_128                    \
+                            : (keysize) == 24 ? arm_aes_decrypt_ecb_192  \
+                                              : arm_aes_decrypt_ecb_256))
+
+/* Select the CBC worker; same key-size dispatch as above. */
+#define native_aes_cbc_worker(encrypt, keysize)                          \
+    ((encrypt)                                                           \
+         ? ((keysize) == 16 ? arm_aes_encrypt_cbc_128                    \
+                            : (keysize) == 24 ? arm_aes_encrypt_cbc_192  \
+                                              : arm_aes_encrypt_cbc_256) \
+         : ((keysize) == 16 ? arm_aes_decrypt_cbc_128                    \
+                            : (keysize) == 24 ? arm_aes_decrypt_cbc_192  \
+                                              : arm_aes_decrypt_cbc_256))
+
+/* Expand the key schedule on cx using the generic Rijndael expansion.
+ * NOTE(review): relies on cx, key, and Nk being in scope at the call
+ * site (aes_InitContext in rijndael.c); keysize is ignored here, unlike
+ * the Intel variant. */
+#define native_aes_init(encrypt, keysize)           \
+    do {                                            \
+        if (encrypt) {                              \
+            rijndael_key_expansion(cx, key, Nk);    \
+        } else {                                    \
+            rijndael_invkey_expansion(cx, key, Nk); \
+        }                                           \
+    } while (0)
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -128,16 +128,45 @@
       'cflags': [
         '-march=armv8-a+crypto'
       ],
       'cflags_mozilla': [
         '-march=armv8-a+crypto'
       ]
     },
     {
+      'target_name': 'armv8_c_lib',
+      'type': 'static_library',
+      'sources': [
+        'aes-armv8.c',
+      ],
+      'dependencies': [
+        '<(DEPTH)/exports.gyp:nss_exports'
+      ],
+      'conditions': [
+        [ 'target_arch=="arm"', {
+          'cflags': [
+            '-march=armv8-a',
+            '-mfpu=crypto-neon-fp-armv8'
+          ],
+          'cflags_mozilla': [
+            '-march=armv8-a',
+            '-mfpu=crypto-neon-fp-armv8'
+          ],
+        }, 'target_arch=="arm64" or target_arch=="aarch64"', {
+          'cflags': [
+            '-march=armv8-a+crypto'
+          ],
+          'cflags_mozilla': [
+            '-march=armv8-a+crypto'
+          ],
+        }]
+      ]
+    },
+    {
       'target_name': 'freebl',
       'type': 'static_library',
       'sources': [
         'loader.c'
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports'
       ]
@@ -155,16 +184,20 @@
         '<(DEPTH)/exports.gyp:nss_exports',
         'hw-acc-crypto',
       ],
       'conditions': [
         [ 'target_arch=="ia32" or target_arch=="x64"', {
           'dependencies': [
             'gcm-aes-x86_c_lib',
           ],
+        }, 'target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64"', {
+          'dependencies': [
+            'armv8_c_lib'
+          ],
         }],
         [ 'target_arch=="arm64" or target_arch=="aarch64"', {
           'dependencies': [
             'gcm-aes-aarch64_c_lib',
           ],
         }],
         [ 'OS=="linux"', {
           'defines!': [
@@ -197,16 +230,20 @@
         '<(DEPTH)/exports.gyp:nss_exports',
         'hw-acc-crypto',
       ],
       'conditions': [
         [ 'target_arch=="ia32" or target_arch=="x64"', {
           'dependencies': [
             'gcm-aes-x86_c_lib',
           ]
+        }, 'target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64"', {
+          'dependencies': [
+            'armv8_c_lib',
+          ],
         }],
         [ 'target_arch=="arm64" or target_arch=="aarch64"', {
           'dependencies': [
             'gcm-aes-aarch64_c_lib',
           ],
         }],
         [ 'OS!="linux"', {
           'conditions': [
@@ -424,16 +461,22 @@
           }],
           [ 'target_arch=="arm"', {
             'defines': [
               'MP_ASSEMBLY_MULTIPLY',
               'MP_ASSEMBLY_SQUARE',
               'MP_USE_UINT_DIGIT',
               'SHA_NO_LONG_LONG',
               'ARMHF',
+              'USE_HW_AES',
+            ],
+          }],
+          [ 'target_arch=="arm64" or target_arch=="aarch64"', {
+            'defines': [
+              'USE_HW_AES',
             ],
           }],
         ],
       }],
     ],
   },
   'variables': {
     'module': 'nss',
--- a/lib/freebl/intel-aes.h
+++ b/lib/freebl/intel-aes.h
@@ -95,40 +95,40 @@ SECStatus intel_aes_decrypt_cbc_256(AESC
                                     unsigned int blocksize);
 SECStatus intel_aes_encrypt_ctr_256(CTRContext *cx, unsigned char *output,
                                     unsigned int *outputLen,
                                     unsigned int maxOutputLen,
                                     const unsigned char *input,
                                     unsigned int inputLen,
                                     unsigned int blocksize);
 
-#define intel_aes_ecb_worker(encrypt, keysize)                             \
+#define native_aes_ecb_worker(encrypt, keysize)                            \
     ((encrypt)                                                             \
          ? ((keysize) == 16 ? intel_aes_encrypt_ecb_128                    \
                             : (keysize) == 24 ? intel_aes_encrypt_ecb_192  \
                                               : intel_aes_encrypt_ecb_256) \
          : ((keysize) == 16 ? intel_aes_decrypt_ecb_128                    \
                             : (keysize) == 24 ? intel_aes_decrypt_ecb_192  \
                                               : intel_aes_decrypt_ecb_256))
 
-#define intel_aes_cbc_worker(encrypt, keysize)                             \
+#define native_aes_cbc_worker(encrypt, keysize)                            \
     ((encrypt)                                                             \
          ? ((keysize) == 16 ? intel_aes_encrypt_cbc_128                    \
                             : (keysize) == 24 ? intel_aes_encrypt_cbc_192  \
                                               : intel_aes_encrypt_cbc_256) \
          : ((keysize) == 16 ? intel_aes_decrypt_cbc_128                    \
                             : (keysize) == 24 ? intel_aes_decrypt_cbc_192  \
                                               : intel_aes_decrypt_cbc_256))
 
 #define intel_aes_ctr_worker(nr)                         \
     ((nr) == 10 ? intel_aes_encrypt_ctr_128              \
                 : (nr) == 12 ? intel_aes_encrypt_ctr_192 \
                              : intel_aes_encrypt_ctr_256)
 
-#define intel_aes_init(encrypt, keysize)                          \
+#define native_aes_init(encrypt, keysize)                         \
     do {                                                          \
         if (encrypt) {                                            \
             if (keysize == 16)                                    \
                 intel_aes_encrypt_init_128(key, cx->expandedKey); \
             else if (keysize == 24)                               \
                 intel_aes_encrypt_init_192(key, cx->expandedKey); \
             else                                                  \
                 intel_aes_encrypt_init_256(key, cx->expandedKey); \
--- a/lib/freebl/rijndael.c
+++ b/lib/freebl/rijndael.c
@@ -15,19 +15,28 @@
 #include "blapi.h"
 #include "rijndael.h"
 
 #include "cts.h"
 #include "ctr.h"
 #include "gcm.h"
 #include "mpi.h"
 
+#if !defined(IS_LITTLE_ENDIAN) && !defined(NSS_X86_OR_X64)
+// not tested yet on big-endian ARM platforms
+#undef USE_HW_AES
+#endif
+
 #ifdef USE_HW_AES
+#ifdef NSS_X86_OR_X64
 #include "intel-aes.h"
+#else
+#include "aes-armv8.h"
 #endif
+#endif /* USE_HW_AES */
 #ifdef INTEL_GCM
 #include "intel-gcm.h"
 #endif /* INTEL_GCM */
 
 /* Forward declarations */
 void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
                                    unsigned int Nk);
 void rijndael_native_encryptBlock(AESContext *cx,
@@ -842,58 +851,62 @@ aes_InitContext(AESContext *cx, const un
     if (mode == NSS_AES_CBC && iv == NULL) {
         PORT_SetError(SEC_ERROR_INVALID_ARGS);
         return SECFailure;
     }
     if (!cx) {
         PORT_SetError(SEC_ERROR_INVALID_ARGS);
         return SECFailure;
     }
-    use_hw_aes = aesni_support() && (keysize % 8) == 0;
+#if defined(NSS_X86_OR_X64) || defined(USE_HW_AES)
+    use_hw_aes = (aesni_support() || arm_aes_support()) && (keysize % 8) == 0;
+#else
+    use_hw_aes = PR_FALSE;
+#endif
     /* Nb = (block size in bits) / 32 */
     cx->Nb = AES_BLOCK_SIZE / 4;
     /* Nk = (key size in bits) / 32 */
     Nk = keysize / 4;
     /* Obtain number of rounds from "table" */
     cx->Nr = RIJNDAEL_NUM_ROUNDS(Nk, cx->Nb);
     /* copy in the iv, if neccessary */
     if (mode == NSS_AES_CBC) {
         memcpy(cx->iv, iv, AES_BLOCK_SIZE);
 #ifdef USE_HW_AES
         if (use_hw_aes) {
             cx->worker = (freeblCipherFunc)
-                intel_aes_cbc_worker(encrypt, keysize);
+                native_aes_cbc_worker(encrypt, keysize);
         } else
 #endif
         {
             cx->worker = (freeblCipherFunc)(encrypt
                                                 ? &rijndael_encryptCBC
                                                 : &rijndael_decryptCBC);
         }
     } else {
 #ifdef USE_HW_AES
         if (use_hw_aes) {
             cx->worker = (freeblCipherFunc)
-                intel_aes_ecb_worker(encrypt, keysize);
+                native_aes_ecb_worker(encrypt, keysize);
         } else
 #endif
         {
             cx->worker = (freeblCipherFunc)(encrypt
                                                 ? &rijndael_encryptECB
                                                 : &rijndael_decryptECB);
         }
     }
     PORT_Assert((cx->Nb * (cx->Nr + 1)) <= RIJNDAEL_MAX_EXP_KEY_SIZE);
     if ((cx->Nb * (cx->Nr + 1)) > RIJNDAEL_MAX_EXP_KEY_SIZE) {
         PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
         return SECFailure;
     }
 #ifdef USE_HW_AES
     if (use_hw_aes) {
-        intel_aes_init(encrypt, keysize);
+        native_aes_init(encrypt, keysize);
     } else
 #endif
     {
         /* Generate expanded key */
         if (encrypt) {
             if (use_hw_aes && (cx->mode == NSS_AES_GCM || cx->mode == NSS_AES ||
                                cx->mode == NSS_AES_CTR)) {
                 PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32);