Bug 1400603 - freebl: Reorganize AES-GCM source code based on hw/sw implementation, r=franziskus
authorDaiki Ueno <dueno@redhat.com>
Fri, 22 Sep 2017 11:27:34 +0200
changeset 13603 e84403331d99bb1fcad4a879f42749332861e8e1
parent 13602 16eb0e48e4381bd8aa4128d75006d22215f6006f
child 13604 96a835be15192281d67a8b1046b7f159ff6deb12
push id2384
push userfranziskuskiefer@gmail.com
push dateFri, 22 Sep 2017 09:29:42 +0000
reviewersfranziskus
bugs1400603
Bug 1400603 - freebl: Reorganize AES-GCM source code based on hw/sw implementation, r=franziskus Reviewers: franziskus Reviewed By: franziskus Bug #: 1400603 Differential Revision: https://phabricator.services.mozilla.com/D65
lib/freebl/Makefile
lib/freebl/aes-x86.c
lib/freebl/freebl.gyp
lib/freebl/gcm-x86.c
lib/freebl/gcm.c
lib/freebl/gcm.h
lib/freebl/rijndael.c
lib/freebl/rijndael.h
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -105,17 +105,19 @@ endif
 ifdef FREEBL_PRELINK_COMMAND
 	DEFINES +=-DFREEBL_PRELINK_COMMAND=\"$(FREEBL_PRELINK_COMMAND)\"
 endif
 # NSS_X86 means the target is a 32-bits x86 CPU architecture
 # NSS_X64 means the target is a 64-bits 64 CPU architecture
 # NSS_X86_OR_X64 means the target is either x86 or x64
 ifeq (,$(filter-out i386 x386 x86 x86_64,$(CPU_ARCH)))
         DEFINES += -DNSS_X86_OR_X64
-        CFLAGS += -mpclmul -maes
+        EXTRA_SRCS += gcm-x86.c aes-x86.c
+$(OBJDIR)/gcm-x86.o: CFLAGS += -mpclmul -maes
+$(OBJDIR)/aes-x86.o: CFLAGS += -mpclmul -maes
 ifneq (,$(USE_64)$(USE_X32))
         DEFINES += -DNSS_X64
 else
         DEFINES += -DNSS_X86
 endif
 endif
 
 ifeq ($(OS_TARGET),OSF1)
new file mode 100644
--- /dev/null
+++ b/lib/freebl/aes-x86.c
@@ -0,0 +1,157 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+#include "rijndael.h"
+#include "secerr.h"
+
+#include <wmmintrin.h> /* aes-ni */
+
+#define EXPAND_KEY128(k, rcon, res)                   \
+    tmp_key = _mm_aeskeygenassist_si128(k, rcon);     \
+    tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF);       \
+    tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4));     \
+    tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
+    tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
+    res = _mm_xor_si128(tmp, tmp_key)
+
+static void
+native_key_expansion128(AESContext *cx, const unsigned char *key)
+{
+    __m128i *keySchedule = cx->keySchedule;
+    pre_align __m128i tmp_key post_align;
+    pre_align __m128i tmp post_align;
+    keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+    EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]);
+    EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]);
+    EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]);
+    EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]);
+    EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]);
+    EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]);
+    EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]);
+    EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]);
+    EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]);
+    EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]);
+}
+
+#define EXPAND_KEY192_PART1(res, k0, kt, rcon)                                \
+    tmp2 = _mm_slli_si128(k0, 4);                                             \
+    tmp1 = _mm_xor_si128(k0, tmp2);                                           \
+    tmp2 = _mm_slli_si128(tmp2, 4);                                           \
+    tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
+    tmp2 = _mm_aeskeygenassist_si128(kt, rcon);                               \
+    res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55))
+
+#define EXPAND_KEY192_PART2(res, k1, k2)             \
+    tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \
+    res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF))
+
+#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2)         \
+    EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1);                          \
+    EXPAND_KEY192_PART2(carry, res1, tmp3);                              \
+    res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1),       \
+                                           _mm_castsi128_pd(tmp3), 0));  \
+    res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3),       \
+                                           _mm_castsi128_pd(carry), 1)); \
+    EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2)
+
+static void
+native_key_expansion192(AESContext *cx, const unsigned char *key)
+{
+    __m128i *keySchedule = cx->keySchedule;
+    pre_align __m128i tmp1 post_align;
+    pre_align __m128i tmp2 post_align;
+    pre_align __m128i tmp3 post_align;
+    pre_align __m128i carry post_align;
+    keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+    keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
+    EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2],
+                  keySchedule[3], carry, 0x1, 0x2);
+    EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]);
+    EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5],
+                  keySchedule[6], carry, 0x4, 0x8);
+    EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]);
+    EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8],
+                  keySchedule[9], carry, 0x10, 0x20);
+    EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]);
+    EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11],
+                  keySchedule[12], carry, 0x40, 0x80);
+}
+
+#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X)                           \
+    tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X);    \
+    tmp2 = _mm_slli_si128(k1x, 4);                                            \
+    tmp1 = _mm_xor_si128(k1x, tmp2);                                          \
+    tmp2 = _mm_slli_si128(tmp2, 4);                                           \
+    tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
+    res = _mm_xor_si128(tmp1, tmp_key);
+
+#define EXPAND_KEY256(res1, res2, k1, k2, rcon)   \
+    EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \
+    EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA)
+
+static void
+native_key_expansion256(AESContext *cx, const unsigned char *key)
+{
+    __m128i *keySchedule = cx->keySchedule;
+    pre_align __m128i tmp_key post_align;
+    pre_align __m128i tmp1 post_align;
+    pre_align __m128i tmp2 post_align;
+    keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+    keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
+    EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0],
+                  keySchedule[1], 0x01);
+    EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2],
+                  keySchedule[3], 0x02);
+    EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4],
+                  keySchedule[5], 0x04);
+    EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6],
+                  keySchedule[7], 0x08);
+    EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8],
+                  keySchedule[9], 0x10);
+    EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10],
+                  keySchedule[11], 0x20);
+    EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12],
+                       keySchedule[13], 0xFF);
+}
+
+/*
+ * AES key expansion using aes-ni instructions.
+ */
+void
+rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
+                              unsigned int Nk)
+{
+    switch (Nk) {
+        case 4:
+            native_key_expansion128(cx, key);
+            return;
+        case 6:
+            native_key_expansion192(cx, key);
+            return;
+        case 8:
+            native_key_expansion256(cx, key);
+            return;
+        default:
+            /* This shouldn't happen (checked by the caller). */
+            return;
+    }
+}
+
+void
+rijndael_native_encryptBlock(AESContext *cx,
+                             unsigned char *output,
+                             const unsigned char *input)
+{
+    int i;
+    pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input);
+    m = _mm_xor_si128(m, cx->keySchedule[0]);
+    for (i = 1; i < cx->Nr; ++i) {
+        m = _mm_aesenc_si128(m, cx->keySchedule[i]);
+    }
+    m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]);
+    _mm_storeu_si128((__m128i *)output, m);
+}
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -18,16 +18,47 @@
       'cflags': [
         '-mssse3'
       ],
       'cflags_mozilla': [
         '-mssse3'
       ]
     },
     {
+      'target_name': 'gcm-aes-x86_c_lib',
+      'type': 'static_library',
+      'sources': [
+        'gcm-x86.c', 'aes-x86.c'
+      ],
+      'dependencies': [
+        '<(DEPTH)/exports.gyp:nss_exports'
+      ],
+      # Enable isa option for pclmul and aes-ni; supported since gcc 4.4.
+      # This is only supported by x84/x64. It's not needed for Windows,
+      # unless clang-cl is used.
+      'cflags_mozilla': [
+        '-mpclmul', '-maes'
+      ],
+      'conditions': [
+        [ 'OS=="linux" or OS=="android" or OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', {
+          'cflags': [
+            '-mpclmul', '-maes'
+          ],
+        }],
+        # macOS build doesn't use cflags.
+        [ 'OS=="mac"', {
+          'xcode_settings': {
+            'OTHER_CFLAGS': [
+              '-mpclmul', '-maes'
+            ],
+          },
+        }]
+      ]
+    },
+    {
       'target_name': 'freebl',
       'type': 'static_library',
       'sources': [
         'loader.c'
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports'
       ]
@@ -40,16 +71,21 @@
       'type': 'static_library',
       'includes': [
         'freebl_base.gypi',
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports',
       ],
       'conditions': [
+        [ 'target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [
+            'gcm-aes-x86_c_lib'
+          ],
+        }],
         [ 'OS=="linux"', {
           'defines!': [
             'FREEBL_NO_DEPEND',
             'FREEBL_LOWHASH',
             'USE_HW_AES',
             'INTEL_GCM',
           ],
           'conditions': [
@@ -71,16 +107,21 @@
       'type': 'shared_library',
       'includes': [
         'freebl_base.gypi',
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports',
       ],
       'conditions': [
+        [ 'target_arch=="ia32" or target_arch=="x64"', {
+          'dependencies': [
+            'gcm-aes-x86_c_lib'
+          ]
+        }],
         [ 'OS!="linux" and OS!="android"', {
           'conditions': [
             [ 'moz_fold_libs==0', {
               'dependencies': [
                 '<(DEPTH)/lib/util/util.gyp:nssutil3',
               ],
             }, {
               'libraries': [
@@ -149,37 +190,21 @@
       'SHLIB_SUFFIX=\"<(dll_suffix)\"',
       'SHLIB_PREFIX=\"<(dll_prefix)\"',
       'SHLIB_VERSION=\"3\"',
       'SOFTOKEN_SHLIB_VERSION=\"3\"',
       'RIJNDAEL_INCLUDE_TABLES',
       'MP_API_COMPATIBLE'
     ],
     'conditions': [
-      [ 'target_arch=="ia32" or target_arch=="x64"', {
-        'cflags_mozilla': [
-          '-mpclmul',
-          '-maes',
-        ],
-        'conditions': [
-          [ 'OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', {
-            'cflags': [
-              '-mpclmul',
-              '-maes',
-            ],
-          }],
-        ],
-      }],
       [ 'OS=="mac"', {
         'xcode_settings': {
           # I'm not sure since when this is supported.
           # But I hope that doesn't matter. We also assume this is x86/x64.
           'OTHER_CFLAGS': [
-            '-mpclmul',
-            '-maes',
             '-std=gnu99',
           ],
         },
       }],
       [ 'OS=="win" and target_arch=="ia32"', {
         'msvs_settings': {
           'VCCLCompilerTool': {
             #TODO: -Ox optimize flags
@@ -263,24 +288,16 @@
             'defines': [
               'MP_IS_LITTLE_ENDIAN',
               'MP_ASSEMBLY_MULTIPLY',
               'MP_ASSEMBLY_SQUARE',
               'MP_ASSEMBLY_DIV_2DX1D',
               'MP_USE_UINT_DIGIT',
             ],
           }],
-          [ 'target_arch=="ia32" or target_arch=="x64"', {
-            'cflags': [
-              # enable isa option for pclmul am aes-ni; supported since gcc 4.4
-              # This is only support by x84/x64. It's not needed for Windows.
-              '-mpclmul',
-              '-maes',
-            ],
-          }],
           [ 'target_arch=="arm"', {
             'defines': [
               'MP_ASSEMBLY_MULTIPLY',
               'MP_ASSEMBLY_SQUARE',
               'MP_USE_UINT_DIGIT',
               'SHA_NO_LONG_LONG',
               'ARMHF',
             ],
new file mode 100644
--- /dev/null
+++ b/lib/freebl/gcm-x86.c
@@ -0,0 +1,127 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+#include "gcm.h"
+#include "secerr.h"
+
+#include <wmmintrin.h> /* clmul */
+
+#define WRITE64(x, bytes)   \
+    (bytes)[0] = (x) >> 56; \
+    (bytes)[1] = (x) >> 48; \
+    (bytes)[2] = (x) >> 40; \
+    (bytes)[3] = (x) >> 32; \
+    (bytes)[4] = (x) >> 24; \
+    (bytes)[5] = (x) >> 16; \
+    (bytes)[6] = (x) >> 8;  \
+    (bytes)[7] = (x);
+
+SECStatus
+gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
+{
+    uint64_t tmp_out[2];
+    _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
+    /* maxout must be larger than 16 bytes (checked by the caller). */
+    WRITE64(tmp_out[0], outbuf + 8);
+    WRITE64(tmp_out[1], outbuf);
+    return SECSuccess;
+}
+
+SECStatus
+gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
+                unsigned int count)
+{
+    size_t i;
+    pre_align __m128i z_high post_align;
+    pre_align __m128i z_low post_align;
+    pre_align __m128i C post_align;
+    pre_align __m128i D post_align;
+    pre_align __m128i E post_align;
+    pre_align __m128i F post_align;
+    pre_align __m128i bin post_align;
+    pre_align __m128i Ci post_align;
+    pre_align __m128i tmp post_align;
+
+    for (i = 0; i < count; i++, buf += 16) {
+        bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
+                            ((uint16_t)buf[2] << 8) | buf[3],
+                            ((uint16_t)buf[4] << 8) | buf[5],
+                            ((uint16_t)buf[6] << 8) | buf[7],
+                            ((uint16_t)buf[8] << 8) | buf[9],
+                            ((uint16_t)buf[10] << 8) | buf[11],
+                            ((uint16_t)buf[12] << 8) | buf[13],
+                            ((uint16_t)buf[14] << 8) | buf[15]);
+        Ci = _mm_xor_si128(bin, ghash->x);
+
+        /* Do binary mult ghash->X = Ci * ghash->H. */
+        C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
+        D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
+        E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
+        F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
+        tmp = _mm_xor_si128(E, F);
+        z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
+        z_high = _mm_unpackhi_epi64(z_high, D);
+        z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
+        z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
+
+        /* Shift one to the left (multiply by x) as gcm spec is stupid. */
+        C = _mm_slli_si128(z_low, 8);
+        E = _mm_srli_epi64(C, 63);
+        D = _mm_slli_si128(z_high, 8);
+        F = _mm_srli_epi64(D, 63);
+        /* Carry over */
+        C = _mm_srli_si128(z_low, 8);
+        D = _mm_srli_epi64(C, 63);
+        z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
+        z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
+
+        /* Reduce */
+        C = _mm_slli_si128(z_low, 8);
+        /* D = z_low << 127 */
+        D = _mm_slli_epi64(C, 63);
+        /* E = z_low << 126 */
+        E = _mm_slli_epi64(C, 62);
+        /* F = z_low << 121 */
+        F = _mm_slli_epi64(C, 57);
+        /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
+        z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
+        C = _mm_srli_si128(z_low, 8);
+        /* D = z_low >> 1 */
+        D = _mm_slli_epi64(C, 63);
+        D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
+        /* E = z_low >> 2 */
+        E = _mm_slli_epi64(C, 62);
+        E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
+        /* F = z_low >> 7 */
+        F = _mm_slli_epi64(C, 57);
+        F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
+        /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
+        ghash->x = _mm_xor_si128(_mm_xor_si128(
+                                     _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
+                                 F);
+    }
+    return SECSuccess;
+}
+
+SECStatus
+gcm_HashInit_hw(gcmHashContext *ghash)
+{
+    ghash->ghash_mul = gcm_HashMult_hw;
+    ghash->x = _mm_setzero_si128();
+    /* MSVC requires __m64 to load epi64. */
+    ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
+                             ghash->h_low >> 32, (uint32_t)ghash->h_low);
+    ghash->hw = PR_TRUE;
+    return SECSuccess;
+}
+
+SECStatus
+gcm_HashZeroX_hw(gcmHashContext *ghash)
+{
+    ghash->x = _mm_setzero_si128();
+    return SECSuccess;
+}
--- a/lib/freebl/gcm.c
+++ b/lib/freebl/gcm.c
@@ -12,28 +12,60 @@
 #include "gcm.h"
 #include "ctr.h"
 #include "secerr.h"
 #include "prtypes.h"
 #include "pkcs11t.h"
 
 #include <limits.h>
 
-#ifdef NSS_X86_OR_X64
-#include <wmmintrin.h> /* clmul */
-#endif
-
 /* Forward declarations */
+SECStatus gcm_HashInit_hw(gcmHashContext *ghash);
+SECStatus gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf);
 SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
                           unsigned int count);
+SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash);
 SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf,
                             unsigned int count);
 SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf,
                               unsigned int count);
 
+/* Stub definitions for the above *_hw functions, which shouldn't be
+ * used unless NSS_X86_OR_X64 is defined */
+#ifndef NSS_X86_OR_X64
+SECStatus
+gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
+{
+    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+    return SECFailure;
+}
+
+SECStatus
+gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
+                unsigned int count)
+{
+    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+    return SECFailure;
+}
+
+SECStatus
+gcm_HashInit_hw(gcmHashContext *ghash)
+{
+    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+    return SECFailure;
+}
+
+SECStatus
+gcm_HashZeroX_hw(gcmHashContext *ghash)
+{
+    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+    return SECFailure;
+}
+#endif /* NSS_X86_OR_X64 */
+
 uint64_t
 get64(const unsigned char *bytes)
 {
     return ((uint64_t)bytes[0]) << 56 |
            ((uint64_t)bytes[1]) << 48 |
            ((uint64_t)bytes[2]) << 40 |
            ((uint64_t)bytes[3]) << 32 |
            ((uint64_t)bytes[4]) << 24 |
@@ -41,46 +73,38 @@ get64(const unsigned char *bytes)
            ((uint64_t)bytes[6]) << 8 |
            ((uint64_t)bytes[7]);
 }
 
 /* Initialize a gcmHashContext */
 SECStatus
 gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw)
 {
+    SECStatus rv = SECSuccess;
+
     ghash->cLen = 0;
     ghash->bufLen = 0;
     PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf));
 
     ghash->h_low = get64(H + 8);
     ghash->h_high = get64(H);
     if (clmul_support() && !sw) {
-#ifdef NSS_X86_OR_X64
-        ghash->ghash_mul = gcm_HashMult_hw;
-        ghash->x = _mm_setzero_si128();
-        /* MSVC requires __m64 to load epi64. */
-        ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
-                                 ghash->h_low >> 32, (uint32_t)ghash->h_low);
-        ghash->hw = PR_TRUE;
-#else
-        PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
-        return SECFailure;
-#endif /* NSS_X86_OR_X64 */
+        rv = gcm_HashInit_hw(ghash);
     } else {
 /* We fall back to the software implementation if we can't use / don't
          * want to use pclmul. */
 #ifdef HAVE_INT128_SUPPORT
         ghash->ghash_mul = gcm_HashMult_sftw;
 #else
         ghash->ghash_mul = gcm_HashMult_sftw32;
 #endif
         ghash->x_high = ghash->x_low = 0;
         ghash->hw = PR_FALSE;
     }
-    return SECSuccess;
+    return rv;
 }
 
 #ifdef HAVE_INT128_SUPPORT
 /* Binary multiplication x * y = r_high << 64 | r_low. */
 void
 bmul(uint64_t x, uint64_t y, uint64_t *r_high, uint64_t *r_low)
 {
     uint128_t x1, x2, x3, x4, x5;
@@ -278,112 +302,27 @@ gcm_HashMult_sftw32(gcmHashContext *ghas
                     (z_low_h << 63) ^ (z_low_h << 62) ^ (z_low_h << 57);
         ghash->x_high = z_high_h;
         ghash->x_low = z_high_l;
     }
     return SECSuccess;
 }
 #endif /* HAVE_INT128_SUPPORT */
 
-SECStatus
-gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
-                unsigned int count)
-{
-#ifdef NSS_X86_OR_X64
-    size_t i;
-    pre_align __m128i z_high post_align;
-    pre_align __m128i z_low post_align;
-    pre_align __m128i C post_align;
-    pre_align __m128i D post_align;
-    pre_align __m128i E post_align;
-    pre_align __m128i F post_align;
-    pre_align __m128i bin post_align;
-    pre_align __m128i Ci post_align;
-    pre_align __m128i tmp post_align;
-
-    for (i = 0; i < count; i++, buf += 16) {
-        bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
-                            ((uint16_t)buf[2] << 8) | buf[3],
-                            ((uint16_t)buf[4] << 8) | buf[5],
-                            ((uint16_t)buf[6] << 8) | buf[7],
-                            ((uint16_t)buf[8] << 8) | buf[9],
-                            ((uint16_t)buf[10] << 8) | buf[11],
-                            ((uint16_t)buf[12] << 8) | buf[13],
-                            ((uint16_t)buf[14] << 8) | buf[15]);
-        Ci = _mm_xor_si128(bin, ghash->x);
-
-        /* Do binary mult ghash->X = Ci * ghash->H. */
-        C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
-        D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
-        E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
-        F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
-        tmp = _mm_xor_si128(E, F);
-        z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
-        z_high = _mm_unpackhi_epi64(z_high, D);
-        z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
-        z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
-
-        /* Shift one to the left (multiply by x) as gcm spec is stupid. */
-        C = _mm_slli_si128(z_low, 8);
-        E = _mm_srli_epi64(C, 63);
-        D = _mm_slli_si128(z_high, 8);
-        F = _mm_srli_epi64(D, 63);
-        /* Carry over */
-        C = _mm_srli_si128(z_low, 8);
-        D = _mm_srli_epi64(C, 63);
-        z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
-        z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
-
-        /* Reduce */
-        C = _mm_slli_si128(z_low, 8);
-        /* D = z_low << 127 */
-        D = _mm_slli_epi64(C, 63);
-        /* E = z_low << 126 */
-        E = _mm_slli_epi64(C, 62);
-        /* F = z_low << 121 */
-        F = _mm_slli_epi64(C, 57);
-        /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
-        z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
-        C = _mm_srli_si128(z_low, 8);
-        /* D = z_low >> 1 */
-        D = _mm_slli_epi64(C, 63);
-        D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
-        /* E = z_low >> 2 */
-        E = _mm_slli_epi64(C, 62);
-        E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
-        /* F = z_low >> 7 */
-        F = _mm_slli_epi64(C, 57);
-        F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
-        /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
-        ghash->x = _mm_xor_si128(_mm_xor_si128(
-                                     _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
-                                 F);
-    }
-    return SECSuccess;
-#else
-    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
-    return SECFailure;
-#endif /* NSS_X86_OR_X64 */
-}
-
 static SECStatus
 gcm_zeroX(gcmHashContext *ghash)
 {
+    SECStatus rv = SECSuccess;
+
     if (ghash->hw) {
-#ifdef NSS_X86_OR_X64
-        ghash->x = _mm_setzero_si128();
-        return SECSuccess;
-#else
-        PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
-        return SECFailure;
-#endif /* NSS_X86_OR_X64 */
+        rv = gcm_HashZeroX_hw(ghash);
     }
 
     ghash->x_high = ghash->x_low = 0;
-    return SECSuccess;
+    return rv;
 }
 
 /*
  * implement GCM GHASH using the freebl GHASH function. The gcm_HashMult
  * function always takes AES_BLOCK_SIZE lengths of data. gcmHash_Update will
  * format the data properly.
  */
 SECStatus
@@ -498,25 +437,20 @@ gcmHash_Final(gcmHashContext *ghash, uns
 
     rv = ghash->ghash_mul(ghash, ghash->counterBuf,
                           (GCM_HASH_LEN_LEN * 2) / AES_BLOCK_SIZE);
     if (rv != SECSuccess) {
         goto cleanup;
     }
 
     if (ghash->hw) {
-#ifdef NSS_X86_OR_X64
-        uint64_t tmp_out[2];
-        _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
-        WRITE64(tmp_out[0], T + 8);
-        WRITE64(tmp_out[1], T);
-#else
-        PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
-        return SECFailure;
-#endif /* NSS_X86_OR_X64 */
+        rv = gcm_HashWrite_hw(ghash, T);
+        if (rv != SECSuccess) {
+            goto cleanup;
+        }
     } else {
         WRITE64(ghash->x_low, T + 8);
         WRITE64(ghash->x_high, T);
     }
 
     if (maxout > AES_BLOCK_SIZE) {
         maxout = AES_BLOCK_SIZE;
     }
--- a/lib/freebl/gcm.h
+++ b/lib/freebl/gcm.h
@@ -4,17 +4,31 @@
 
 #ifndef GCM_H
 #define GCM_H 1
 
 #include "blapii.h"
 #include <stdint.h>
 
 #ifdef NSS_X86_OR_X64
+/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */
+#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+    (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#undef NSS_DISABLE_SSE2
+#define NSS_DISABLE_SSE2 1
+#endif /* GCC <= 4.8 */
+
 #include <emmintrin.h> /* __m128i */
+
+#ifdef NSS_DISABLE_SSE2
+#undef NSS_DISABLE_SSE2
+#pragma GCC pop_options
+#endif /* NSS_DISABLE_SSE2 */
 #endif
 
 SEC_BEGIN_PROTOS
 
 #ifdef HAVE_INT128_SUPPORT
 typedef unsigned __int128 uint128_t;
 #endif
 
--- a/lib/freebl/rijndael.c
+++ b/lib/freebl/rijndael.c
@@ -22,16 +22,44 @@
 
 #ifdef USE_HW_AES
 #include "intel-aes.h"
 #endif
 #ifdef INTEL_GCM
 #include "intel-gcm.h"
 #endif /* INTEL_GCM */
 
+/* Forward declarations */
+void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
+                                   unsigned int Nk);
+void rijndael_native_encryptBlock(AESContext *cx,
+                                  unsigned char *output,
+                                  const unsigned char *input);
+
+/* Stub definitions for the above rijndael_native_* functions, which
+ * shouldn't be used unless NSS_X86_OR_X64 is defined */
+#ifndef NSS_X86_OR_X64
+void
+rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
+                              unsigned int Nk)
+{
+    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+    PORT_Assert(0);
+}
+
+void
+rijndael_native_encryptBlock(AESContext *cx,
+                             unsigned char *output,
+                             const unsigned char *input)
+{
+    PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+    PORT_Assert(0);
+}
+#endif /* NSS_X86_OR_X64 */
+
 /*
  * There are currently three ways to build this code, varying in performance
  * and code size.
  *
  * RIJNDAEL_INCLUDE_TABLES         Include all tables from rijndael32.tab
  * RIJNDAEL_GENERATE_VALUES        Do not store tables, generate the table
  *                                 values "on-the-fly", using gfm
  * RIJNDAEL_GENERATE_VALUES_MACRO  Same as above, but use macros
@@ -304,172 +332,16 @@ rijndael_key_expansion7(AESContext *cx, 
         if (i % Nk == 0)
             tmp = SUBBYTE(ROTBYTE(tmp)) ^ Rcon[i / Nk - 1];
         else if (i % Nk == 4)
             tmp = SUBBYTE(tmp);
         *pW = W[i - Nk] ^ tmp;
     }
 }
 
-#if defined(NSS_X86_OR_X64)
-#define EXPAND_KEY128(k, rcon, res)                   \
-    tmp_key = _mm_aeskeygenassist_si128(k, rcon);     \
-    tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF);       \
-    tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4));     \
-    tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
-    tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
-    res = _mm_xor_si128(tmp, tmp_key)
-
-static void
-native_key_expansion128(AESContext *cx, const unsigned char *key)
-{
-    __m128i *keySchedule = cx->keySchedule;
-    pre_align __m128i tmp_key post_align;
-    pre_align __m128i tmp post_align;
-    keySchedule[0] = _mm_loadu_si128((__m128i *)key);
-    EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]);
-    EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]);
-    EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]);
-    EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]);
-    EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]);
-    EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]);
-    EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]);
-    EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]);
-    EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]);
-    EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]);
-}
-
-#define EXPAND_KEY192_PART1(res, k0, kt, rcon)                                \
-    tmp2 = _mm_slli_si128(k0, 4);                                             \
-    tmp1 = _mm_xor_si128(k0, tmp2);                                           \
-    tmp2 = _mm_slli_si128(tmp2, 4);                                           \
-    tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
-    tmp2 = _mm_aeskeygenassist_si128(kt, rcon);                               \
-    res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55))
-
-#define EXPAND_KEY192_PART2(res, k1, k2)             \
-    tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \
-    res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF))
-
-#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2)         \
-    EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1);                          \
-    EXPAND_KEY192_PART2(carry, res1, tmp3);                              \
-    res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1),       \
-                                           _mm_castsi128_pd(tmp3), 0));  \
-    res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3),       \
-                                           _mm_castsi128_pd(carry), 1)); \
-    EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2)
-
-static void
-native_key_expansion192(AESContext *cx, const unsigned char *key)
-{
-    __m128i *keySchedule = cx->keySchedule;
-    pre_align __m128i tmp1 post_align;
-    pre_align __m128i tmp2 post_align;
-    pre_align __m128i tmp3 post_align;
-    pre_align __m128i carry post_align;
-    keySchedule[0] = _mm_loadu_si128((__m128i *)key);
-    keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
-    EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2],
-                  keySchedule[3], carry, 0x1, 0x2);
-    EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]);
-    EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5],
-                  keySchedule[6], carry, 0x4, 0x8);
-    EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]);
-    EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8],
-                  keySchedule[9], carry, 0x10, 0x20);
-    EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]);
-    EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11],
-                  keySchedule[12], carry, 0x40, 0x80);
-}
-
-#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X)                           \
-    tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X);    \
-    tmp2 = _mm_slli_si128(k1x, 4);                                            \
-    tmp1 = _mm_xor_si128(k1x, tmp2);                                          \
-    tmp2 = _mm_slli_si128(tmp2, 4);                                           \
-    tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
-    res = _mm_xor_si128(tmp1, tmp_key);
-
-#define EXPAND_KEY256(res1, res2, k1, k2, rcon)   \
-    EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \
-    EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA)
-
-static void
-native_key_expansion256(AESContext *cx, const unsigned char *key)
-{
-    __m128i *keySchedule = cx->keySchedule;
-    pre_align __m128i tmp_key post_align;
-    pre_align __m128i tmp1 post_align;
-    pre_align __m128i tmp2 post_align;
-    keySchedule[0] = _mm_loadu_si128((__m128i *)key);
-    keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
-    EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0],
-                  keySchedule[1], 0x01);
-    EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2],
-                  keySchedule[3], 0x02);
-    EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4],
-                  keySchedule[5], 0x04);
-    EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6],
-                  keySchedule[7], 0x08);
-    EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8],
-                  keySchedule[9], 0x10);
-    EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10],
-                  keySchedule[11], 0x20);
-    EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12],
-                       keySchedule[13], 0xFF);
-}
-
-#endif /* NSS_X86_OR_X64 */
-
-/*
- * AES key expansion using aes-ni instructions.
- */
-static void
-native_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
-{
-#ifdef NSS_X86_OR_X64
-    switch (Nk) {
-        case 4:
-            native_key_expansion128(cx, key);
-            return;
-        case 6:
-            native_key_expansion192(cx, key);
-            return;
-        case 8:
-            native_key_expansion256(cx, key);
-            return;
-        default:
-            /* This shouldn't happen. */
-            PORT_Assert(0);
-    }
-#else
-    PORT_Assert(0);
-#endif /* NSS_X86_OR_X64 */
-}
-
-static void
-native_encryptBlock(AESContext *cx,
-                    unsigned char *output,
-                    const unsigned char *input)
-{
-#ifdef NSS_X86_OR_X64
-    int i;
-    pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input);
-    m = _mm_xor_si128(m, cx->keySchedule[0]);
-    for (i = 1; i < cx->Nr; ++i) {
-        m = _mm_aesenc_si128(m, cx->keySchedule[i]);
-    }
-    m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]);
-    _mm_storeu_si128((__m128i *)output, m);
-#else
-    PORT_Assert(0);
-#endif /* NSS_X86_OR_X64 */
-}
-
 /* rijndael_key_expansion
  *
  * Generate the expanded key from the key input by the user.
  */
 static void
 rijndael_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
 {
     unsigned int i;
@@ -825,17 +697,17 @@ static SECStatus
 rijndael_encryptECB(AESContext *cx, unsigned char *output,
                     unsigned int *outputLen, unsigned int maxOutputLen,
                     const unsigned char *input, unsigned int inputLen)
 {
     AESBlockFunc *encryptor;
 
     if (aesni_support()) {
         /* Use hardware acceleration for normal AES parameters. */
-        encryptor = &native_encryptBlock;
+        encryptor = &rijndael_native_encryptBlock;
     } else {
         encryptor = &rijndael_encryptBlock128;
     }
     while (inputLen > 0) {
         (*encryptor)(cx, output, input);
         output += AES_BLOCK_SIZE;
         input += AES_BLOCK_SIZE;
         inputLen -= AES_BLOCK_SIZE;
@@ -1021,17 +893,17 @@ aes_InitContext(AESContext *cx, const un
 #endif
     {
         /* Generate expanded key */
         if (encrypt) {
             if (use_hw_aes && (cx->mode == NSS_AES_GCM || cx->mode == NSS_AES ||
                                cx->mode == NSS_AES_CTR)) {
                 PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32);
                 /* Prepare hardware key for normal AES parameters. */
-                native_key_expansion(cx, key, Nk);
+                rijndael_native_key_expansion(cx, key, Nk);
             } else {
                 rijndael_key_expansion(cx, key, Nk);
             }
         } else {
             rijndael_invkey_expansion(cx, key, Nk);
         }
     }
     cx->worker_cx = cx;
--- a/lib/freebl/rijndael.h
+++ b/lib/freebl/rijndael.h
@@ -3,18 +3,32 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
 #ifndef _RIJNDAEL_H_
 #define _RIJNDAEL_H_ 1
 
 #include "blapii.h"
 #include <stdint.h>
 
-#ifdef NSS_X86_OR_X64
-#include <wmmintrin.h> /* aes-ni */
+#if defined(NSS_X86_OR_X64)
+/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */
+#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \
+    (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#undef NSS_DISABLE_SSE2
+#define NSS_DISABLE_SSE2 1
+#endif /* GCC <= 4.8 */
+
+#include <emmintrin.h> /* __m128i */
+
+#ifdef NSS_DISABLE_SSE2
+#undef NSS_DISABLE_SSE2
+#pragma GCC pop_options
+#endif /* NSS_DISABLE_SSE2 */
 #endif
 
 typedef void AESBlockFunc(AESContext *cx,
                           unsigned char *output,
                           const unsigned char *input);
 
 /* RIJNDAEL_NUM_ROUNDS
  *