Bug 1566126 - freebl: POWER GHASH Vector Acceleration, r=mt
authorLauri Kasanen <cand@gmx.com>
Wed, 06 Nov 2019 19:17:48 +1100
changeset 15375 3d7e509d6d20ecd607a28fa6ce42e4ffd9c51443
parent 15374 73c28cad3dbb7c8eef8f970d69fc5504d96b69dc
child 15376 6125200fbc889fbb68447ff3fae644441842c55e
push id3568
push usermartin.thomson@gmail.com
push dateWed, 06 Nov 2019 08:19:41 +0000
reviewersmt
bugs1566126, 10000, 226033, 80606, 28851, 213899, 45233
Bug 1566126 - freebl: POWER GHASH Vector Acceleration, r=mt Implementation for POWER8 adapted from the ARM paper: https://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf Benchmark of `bltest -E -m aes_gcm -i tests/aes_gcm/plaintext10 \ -v tests/aes_gcm/iv10 -k tests/aes_gcm/key10 -5 10` on POWER8 3.3GHz. NSS_DISABLE_HW_CRYPTO=1 mode in symmkey opreps cxreps context op time(sec) thrgput aes_gcm_e 309Mb 192 5M 0 0.000 10000.000 10.001 30Mb mode in symmkey opreps cxreps context op time(sec) thrgput aes_gcm_e 829Mb 192 14M 0 0.000 10000.000 10.001 82Mb Notable operf results, sw: samples % image name symbol name 226033 59.3991 libfreeblpriv3.so bmul 80606 21.1824 libfreeblpriv3.so rijndael_encryptBlock128 28851 7.5817 libfreeblpriv3.so gcm_HashMult_sftw hw: 213899 56.2037 libfreeblpriv3.so rijndael_encryptBlock128 45233 11.8853 libfreeblpriv3.so gcm_HashMult_hw So the ghash part is ~5.6x faster. Signed-off-by: Lauri Kasanen <cand@gmx.com>
lib/freebl/Makefile
lib/freebl/altivec-types.h
lib/freebl/blapii.h
lib/freebl/blinit.c
lib/freebl/freebl.gyp
lib/freebl/gcm-ppc.c
lib/freebl/gcm.c
lib/freebl/gcm.h
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -258,16 +258,17 @@ ifeq ($(CPU_ARCH),x86)
 endif
 ifeq ($(CPU_ARCH),arm)
     DEFINES += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE 
     DEFINES += -DMP_USE_UINT_DIGIT
     DEFINES += -DSHA_NO_LONG_LONG # avoid 64-bit arithmetic in SHA512
     MPI_SRCS += mpi_arm.c
 endif
 ifeq ($(CPU_ARCH),ppc)
+    EXTRA_SRCS += gcm-ppc.c
 ifdef USE_64
     DEFINES += -DNSS_NO_INIT_SUPPORT
 endif # USE_64
 endif # ppc
 endif # Linux
 
 ifeq ($(OS_TARGET),AIX)
     DEFINES += -DMP_USE_UINT_DIGIT
@@ -780,8 +781,12 @@ endif
 
 ifeq ($(CPU_ARCH),arm)
 $(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a -mfpu=crypto-neon-fp-armv8
 endif
 ifeq ($(CPU_ARCH),aarch64)
 $(OBJDIR)/$(PROG_PREFIX)aes-armv8$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto
 $(OBJDIR)/$(PROG_PREFIX)gcm-aarch64$(OBJ_SUFFIX): CFLAGS += -march=armv8-a+crypto
 endif
+
+ifeq ($(CPU_ARCH),ppc)
+$(OBJDIR)/$(PROG_PREFIX)gcm-ppc$(OBJ_SUFFIX): CFLAGS += -mcrypto -maltivec
+endif
new file mode 100644
--- /dev/null
+++ b/lib/freebl/altivec-types.h
@@ -0,0 +1,23 @@
+/*
+ * altivec-types.h - shorter vector typedefs
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef _ALTIVEC_TYPES_H_
+#define _ALTIVEC_TYPES_H_ 1
+
+#include <altivec.h>
+
+typedef __vector unsigned char vec_u8;
+typedef __vector signed char vec_s8;
+typedef __vector unsigned short vec_u16;
+typedef __vector signed short vec_s16;
+typedef __vector unsigned int vec_u32;
+typedef __vector signed int vec_s32;
+typedef __vector unsigned long long vec_u64;
+typedef __vector signed long long vec_s64;
+typedef __vector float vec_f;
+
+#endif
--- a/lib/freebl/blapii.h
+++ b/lib/freebl/blapii.h
@@ -81,10 +81,11 @@ PRBool aesni_support();
 PRBool clmul_support();
 PRBool avx_support();
 PRBool ssse3_support();
 PRBool arm_neon_support();
 PRBool arm_aes_support();
 PRBool arm_pmull_support();
 PRBool arm_sha1_support();
 PRBool arm_sha2_support();
+PRBool ppc_crypto_support();
 
 #endif /* _BLAPII_H_ */
--- a/lib/freebl/blinit.c
+++ b/lib/freebl/blinit.c
@@ -24,16 +24,17 @@ static PRBool aesni_support_ = PR_FALSE;
 static PRBool clmul_support_ = PR_FALSE;
 static PRBool avx_support_ = PR_FALSE;
 static PRBool ssse3_support_ = PR_FALSE;
 static PRBool arm_neon_support_ = PR_FALSE;
 static PRBool arm_aes_support_ = PR_FALSE;
 static PRBool arm_sha1_support_ = PR_FALSE;
 static PRBool arm_sha2_support_ = PR_FALSE;
 static PRBool arm_pmull_support_ = PR_FALSE;
+static PRBool ppc_crypto_support_ = PR_FALSE;
 
 #ifdef NSS_X86_OR_X64
 /*
  * Adapted from the example code in "How to detect New Instruction support in
  * the 4th generation Intel Core processor family" by Max Locktyukhin.
  *
  * XGETBV:
  *   Reads an extended control register (XCR) specified by ECX into EDX:EAX.
@@ -343,24 +344,52 @@ arm_sha1_support()
 {
     return arm_sha1_support_;
 }
 PRBool
 arm_sha2_support()
 {
     return arm_sha2_support_;
 }
+PRBool
+ppc_crypto_support()
+{
+    return ppc_crypto_support_;
+}
+
+#if defined(__powerpc__)
+
+#include <sys/auxv.h>
+
+// Defines from cputable.h in Linux kernel - PPC, letting us build on older kernels
+#ifndef PPC_FEATURE2_VEC_CRYPTO
+#define PPC_FEATURE2_VEC_CRYPTO         0x02000000
+#endif
+
+static void
+CheckPPCSupport()
+{
+    char *disable_hw_crypto = PR_GetEnvSecure("NSS_DISABLE_PPC_GHASH");
+
+    long hwcaps = getauxval(AT_HWCAP2);
+
+    ppc_crypto_support_ = hwcaps & PPC_FEATURE2_VEC_CRYPTO && disable_hw_crypto == NULL;
+}
+
+#endif /* __powerpc__ */
 
 static PRStatus
 FreeblInit(void)
 {
 #ifdef NSS_X86_OR_X64
     CheckX86CPUSupport();
 #elif (defined(__aarch64__) || defined(__arm__))
     CheckARMSupport();
+#elif (defined(__powerpc__))
+    CheckPPCSupport();
 #endif
     return PR_SUCCESS;
 }
 
 SECStatus
 BL_Init()
 {
     if (PR_CallOnce(&coFreeblInit, FreeblInit) != PR_SUCCESS) {
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -128,16 +128,34 @@
       'cflags': [
         '-march=armv8-a+crypto'
       ],
       'cflags_mozilla': [
         '-march=armv8-a+crypto'
       ]
     },
     {
+      'target_name': 'gcm-aes-ppc_c_lib',
+      'type': 'static_library',
+      'sources': [
+        'gcm-ppc.c'
+      ],
+      'dependencies': [
+        '<(DEPTH)/exports.gyp:nss_exports'
+      ],
+      'cflags': [
+        '-mcrypto',
+        '-maltivec'
+      ],
+      'cflags_mozilla': [
+        '-mcrypto',
+        '-maltivec'
+      ]
+    },
+    {
       'target_name': 'armv8_c_lib',
       'type': 'static_library',
       'sources': [
         'aes-armv8.c',
       ],
       'dependencies': [
         '<(DEPTH)/exports.gyp:nss_exports'
       ],
@@ -194,16 +212,21 @@
             'armv8_c_lib'
           ],
         }],
         [ 'target_arch=="arm64" or target_arch=="aarch64"', {
           'dependencies': [
             'gcm-aes-aarch64_c_lib',
           ],
         }],
+        [ 'target_arch=="ppc64le"', {
+          'dependencies': [
+            'gcm-aes-ppc_c_lib',
+          ],
+        }],
         [ 'OS=="linux"', {
           'defines!': [
             'FREEBL_NO_DEPEND',
             'FREEBL_LOWHASH',
             'USE_HW_AES',
             'INTEL_GCM',
           ],
           'conditions': [
@@ -240,16 +263,21 @@
             'armv8_c_lib',
           ],
         }],
         [ 'target_arch=="arm64" or target_arch=="aarch64"', {
           'dependencies': [
             'gcm-aes-aarch64_c_lib',
           ],
         }],
+        [ 'target_arch=="ppc64" or target_arch=="ppc64le"', {
+          'dependencies': [
+            'gcm-aes-ppc_c_lib',
+          ],
+        }],
         [ 'OS!="linux"', {
           'conditions': [
             [ 'moz_fold_libs==0', {
               'dependencies': [
                 '<(DEPTH)/lib/util/util.gyp:nssutil3',
               ],
             }, {
               'libraries': [
new file mode 100644
--- /dev/null
+++ b/lib/freebl/gcm-ppc.c
@@ -0,0 +1,106 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+#include "gcm.h"
+#include "secerr.h"
+
+#if defined(USE_PPC_CRYPTO)
+
+SECStatus
+gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
+{
+    vec_xst_be((vec_u8) ghash->x, 0, outbuf);
+    return SECSuccess;
+}
+
+static vec_u64 vpmsumd(const vec_u64 a, const vec_u64 b)
+{
+#if defined(__clang__)
+    /* Clang uses a different name */
+    return __builtin_altivec_crypto_vpmsumd(a, b);
+#elif (__GNUC__ >= 10) || (__GNUC__ == 9 && __GNUC_MINOR__ >= 3) || \
+      (__GNUC__ == 8 && __GNUC_MINOR__ >= 4) || \
+      (__GNUC__ == 7 && __GNUC_MINOR__ >= 5)
+   /* GCC versions not affected by https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91275 */
+   return __builtin_crypto_vpmsumd(a, b);
+#else
+   /* GCC versions where this builtin is buggy */
+   vec_u64 vr;
+   __asm("vpmsumd %0, %1, %2" : "=v"(vr) : "v"(a), "v"(b));
+   return vr;
+#endif
+}
+
+SECStatus
+gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
+                unsigned int count)
+{
+    const vec_u8 leftshift = vec_splat_u8(1);
+    const vec_u64 onebit = (vec_u64) {1, 0};
+    const unsigned long long pd = 0xc2LLU << 56;
+
+    vec_u64 ci, v, r0, r1;
+    vec_u64 hibit;
+    unsigned i;
+
+    ci = ghash->x;
+
+    for (i = 0; i < count; i++, buf += 16) {
+        /* clang needs the following cast away from const; maybe a bug in 7.0.0 */
+        v = (vec_u64) vec_xl_be(0, (unsigned char *) buf);
+        ci ^= v;
+
+        /* Do binary mult ghash->X = C * ghash->H (Karatsuba). */
+        r0 = vpmsumd((vec_u64) {ci[0], 0}, (vec_u64) {ghash->h[0], 0});
+        r1 = vpmsumd((vec_u64) {ci[1], 0}, (vec_u64) {ghash->h[1], 0});
+        v = (vec_u64) {ci[0] ^ ci[1], ghash->h[0] ^ ghash->h[1]};
+        v = vpmsumd((vec_u64) {v[0], 0}, (vec_u64) {v[1], 0});
+        v ^= r0;
+        v ^= r1;
+        r0 ^= (vec_u64) {0, v[0]};
+        r1 ^= (vec_u64) {v[1], 0};
+
+        /* Shift one (multiply by x) as gcm spec is stupid. */
+        hibit = (vec_u64) vec_splat((vec_u8) r0, 15);
+        hibit = (vec_u64) vec_rl((vec_u8) hibit, leftshift);
+        hibit &= onebit;
+        r0 = vec_sll(r0, leftshift);
+        r1 = vec_sll(r1, leftshift);
+        r1 |= hibit;
+
+        /* Reduce */
+        v = vpmsumd((vec_u64) {r0[0], 0}, (vec_u64) {pd, 0});
+        r0 ^= (vec_u64) {0, v[0]};
+        r1 ^= (vec_u64) {v[1], 0};
+        v = vpmsumd((vec_u64) {r0[1], 0}, (vec_u64) {pd, 0});
+        r1 ^= v;
+        ci = r0 ^ r1;
+    }
+
+    ghash->x = ci;
+
+    return SECSuccess;
+}
+
+SECStatus
+gcm_HashInit_hw(gcmHashContext *ghash)
+{
+    ghash->x = (vec_u64) vec_splat_u32(0);
+    ghash->h = (vec_u64) {ghash->h_low, ghash->h_high};
+    ghash->ghash_mul = gcm_HashMult_hw;
+    ghash->hw = PR_TRUE;
+    return SECSuccess;
+}
+
+SECStatus
+gcm_HashZeroX_hw(gcmHashContext *ghash)
+{
+    ghash->x = (vec_u64) vec_splat_u32(0);
+    return SECSuccess;
+}
+
+#endif /* defined(USE_PPC_CRYPTO) */
--- a/lib/freebl/gcm.c
+++ b/lib/freebl/gcm.c
@@ -31,17 +31,17 @@ SECStatus gcm_HashMult_hw(gcmHashContext
 SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash);
 SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf,
                             unsigned int count);
 SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf,
                               unsigned int count);
 
 /* Stub definitions for the above *_hw functions, which shouldn't be
  * used unless NSS_X86_OR_X64 is defined */
-#if !defined(NSS_X86_OR_X64) && !defined(USE_ARM_GCM)
+#if !defined(NSS_X86_OR_X64) && !defined(USE_ARM_GCM) && !defined(USE_PPC_CRYPTO)
 SECStatus
 gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf)
 {
     PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
     return SECFailure;
 }
 
 SECStatus
@@ -60,17 +60,17 @@ gcm_HashInit_hw(gcmHashContext *ghash)
 }
 
 SECStatus
 gcm_HashZeroX_hw(gcmHashContext *ghash)
 {
     PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
     return SECFailure;
 }
-#endif /* !NSS_X86_OR_X64 && !USE_ARM_GCM */
+#endif /* !NSS_X86_OR_X64 && !USE_ARM_GCM && !USE_PPC_CRYPTO */
 
 uint64_t
 get64(const unsigned char *bytes)
 {
     return ((uint64_t)bytes[0]) << 56 |
            ((uint64_t)bytes[1]) << 48 |
            ((uint64_t)bytes[2]) << 40 |
            ((uint64_t)bytes[3]) << 32 |
@@ -89,16 +89,18 @@ gcmHash_InitContext(gcmHashContext *ghas
     ghash->cLen = 0;
     ghash->bufLen = 0;
     PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf));
 
     ghash->h_low = get64(H + 8);
     ghash->h_high = get64(H);
 #ifdef USE_ARM_GCM
     if (arm_pmull_support() && !sw) {
+#elif defined(USE_PPC_CRYPTO)
+    if (ppc_crypto_support() && !sw) {
 #else
     if (clmul_support() && !sw) {
 #endif
         rv = gcm_HashInit_hw(ghash);
     } else {
 /* We fall back to the software implementation if we can't use / don't
          * want to use pclmul. */
 #ifdef HAVE_INT128_SUPPORT
--- a/lib/freebl/gcm.h
+++ b/lib/freebl/gcm.h
@@ -25,16 +25,38 @@
 #pragma GCC pop_options
 #endif /* NSS_DISABLE_SSE2 */
 #endif
 
 #ifdef __aarch64__
 #include <arm_neon.h>
 #endif
 
+#ifdef __powerpc64__
+#include "altivec-types.h"
+
+/* The ghash freebl test tries to use this in C++, and gcc defines conflict. */
+#ifdef __cplusplus
+#undef pixel
+#undef vector
+#undef bool
+#endif
+
+/*
+ * PPC CRYPTO requires at least gcc 5 or clang. The LE check is purely
+ * because it's only been tested on LE. If you're interested in BE,
+ * please send a patch.
+ */
+#if (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5)) && \
+    defined(IS_LITTLE_ENDIAN)
+#define USE_PPC_CRYPTO
+#endif
+
+#endif
+
 SEC_BEGIN_PROTOS
 
 #ifdef HAVE_INT128_SUPPORT
 typedef unsigned __int128 uint128_t;
 #endif
 
 typedef struct GCMContextStr GCMContext;
 
@@ -62,16 +84,18 @@ SECStatus GCM_DecryptUpdate(GCMContext *
 typedef struct gcmHashContextStr gcmHashContext;
 typedef SECStatus (*ghash_t)(gcmHashContext *, const unsigned char *,
                              unsigned int);
 pre_align struct gcmHashContextStr {
 #ifdef NSS_X86_OR_X64
     __m128i x, h;
 #elif defined(__aarch64__)
     uint64x2_t x, h;
+#elif defined(USE_PPC_CRYPTO)
+    vec_u64 x, h;
 #endif
     uint64_t x_low, x_high, h_high, h_low;
     unsigned char buffer[MAX_BLOCK_SIZE];
     unsigned int bufLen;
     uint8_t counterBuf[16];
     uint64_t cLen;
     ghash_t ghash_mul;
     PRBool hw;