Bug 1252035 - Cachebleed (part 1: Remove duplicate weave) r=rrelyea
author: Richard Barnes <rlb@ipv.sx>
Wed, 09 Mar 2016 16:48:41 +0100
changeset 11977 c02a3bc19832c97d2a38ec75f6a5f1c44ab37b4e
parent 11970 d8064837a36f7b54bae52e0c709a2f94e9793587
child 11978 57f1c66f5b3aca0befaa005e64593c42610eca96
push id: 1051
push user: ttaubert@mozilla.com
push date: Wed, 09 Mar 2016 15:54:53 +0000
reviewers: rrelyea
bugs: 1252035
lib/freebl/Makefile
lib/freebl/mpi/mpmontg.c
lib/freebl/mpi/target.mk
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -107,17 +107,17 @@ ifeq ($(OS_TARGET),OSF1)
     MPI_SRCS += mpvalpha.c
 endif
 
 ifeq (OS2,$(OS_TARGET))
     ASFILES  = mpi_x86_os2.s
     DEFINES += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE 
     DEFINES += -DMP_ASSEMBLY_DIV_2DX1D
     DEFINES += -DMP_USE_UINT_DIGIT -DMP_NO_MP_WORD
-    DEFINES += -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
+    DEFINES += -DMP_IS_LITTLE_ENDIAN
 endif
 
 ifeq (,$(filter-out WINNT WIN95,$(OS_TARGET)))
 ifndef USE_64
 # 32-bit Windows
 ifdef NS_USE_GCC
 # Ideally, we want to use assembler
 #     ASFILES  = mpi_x86.s
@@ -143,17 +143,17 @@ else
 	EXTRA_SRCS += intel-gcm-wrap.c
 	ifeq ($(CLANG_CL),1)
 	    INTEL_GCM_CLANG_CL = 1
 	endif
     endif
 endif
 else
     # -DMP_NO_MP_WORD
-    DEFINES += -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
+    DEFINES += -DMP_IS_LITTLE_ENDIAN
 ifdef NS_USE_GCC
 # Ideally, we should use amd64 assembly code, but it's not yet mingw-w64
 # compatible.
 else
 # MSVC
     ifdef BUILD_OPT
 	OPTIMIZER += -Ox  # maximum optimization for freebl
     endif
@@ -198,30 +198,30 @@ endif
 endif # Darwin
 
 ifeq ($(OS_TARGET),Linux)
 ifeq ($(CPU_ARCH),x86_64)
     ASFILES  = arcfour-amd64-gas.s mpi_amd64_gas.s
     ASFLAGS += -fPIC -Wa,--noexecstack
     DEFINES += -DNSS_BEVAND_ARCFOUR -DMPI_AMD64 -DMP_ASSEMBLY_MULTIPLY
     DEFINES += -DNSS_USE_COMBA
-    DEFINES += -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
+    DEFINES += -DMP_IS_LITTLE_ENDIAN
 #   DEFINES += -DMPI_AMD64_ADD
     # comment the next four lines to turn off Intel HW acceleration.
     DEFINES += -DUSE_HW_AES -DINTEL_GCM
     ASFILES += intel-aes.s intel-gcm.s
     EXTRA_SRCS += intel-gcm-wrap.c
     INTEL_GCM = 1
     MPI_SRCS += mpi_amd64.c mp_comba.c
 endif
 ifeq ($(CPU_ARCH),x86)
     ASFILES  = mpi_x86.s
     DEFINES += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE 
     DEFINES += -DMP_ASSEMBLY_DIV_2DX1D -DMP_USE_UINT_DIGIT
-    DEFINES += -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
+    DEFINES += -DMP_IS_LITTLE_ENDIAN
     # The floating point ECC code doesn't work on Linux x86 (bug 311432).
     #ECL_USE_FP = 1
 endif
 ifeq ($(CPU_ARCH),arm)
     DEFINES += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE 
     DEFINES += -DMP_USE_UINT_DIGIT
     DEFINES += -DSHA_NO_LONG_LONG # avoid 64-bit arithmetic in SHA512
     MPI_SRCS += mpi_arm.c
@@ -449,17 +449,17 @@ else
  	    ASFILES += mp_comba_amd64_sun.s mpcpucache_amd64.s
 	    ASFLAGS += -xarch=generic64 -K PIC
             SOL_CFLAGS += -xprefetch=no
 	    SHA_SRCS =
  	    MPCPU_SRCS =
 	    # Intel acceleration for GCM does not build currently with Studio
 	endif
 	DEFINES += -DNSS_BEVAND_ARCFOUR -DMPI_AMD64 -DMP_ASSEMBLY_MULTIPLY
-	DEFINES += -DNSS_USE_COMBA -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
+	DEFINES += -DNSS_USE_COMBA -DMP_IS_LITTLE_ENDIAN
 	# comment the next two lines to turn off Intel HW acceleration
 	DEFINES += -DUSE_HW_AES
 	ASFILES += intel-aes.s
 	MPI_SRCS += mpi_amd64.c
     else
 	# Solaris x86
 	DEFINES += -DMP_USE_UINT_DIGIT
 	DEFINES += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE 
--- a/lib/freebl/mpi/mpmontg.c
+++ b/lib/freebl/mpi/mpmontg.c
@@ -16,25 +16,16 @@
 #include "mpi-priv.h"
 #include "mplogic.h"
 #include "mpprime.h"
 #ifdef MP_USING_MONT_MULF
 #include "montmulf.h"
 #endif
 #include <stddef.h> /* ptrdiff_t */
 
-/* if MP_CHAR_STORE_SLOW is defined, we  */
-/* need to know endianness of this platform. */
-#ifdef MP_CHAR_STORE_SLOW
-#if !defined(MP_IS_BIG_ENDIAN) && !defined(MP_IS_LITTLE_ENDIAN)
-#error "You must define MP_IS_BIG_ENDIAN or MP_IS_LITTLE_ENDIAN\n" \
-       "  if you define MP_CHAR_STORE_SLOW."
-#endif
-#endif
-
 #define STATIC
 
 #define MAX_ODD_INTS    32   /* 2 ** (WINDOW_BITS - 1) */
 
 /*! computes T = REDC(T), 2^b == R 
     \param T < RN
 */
 mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm)
@@ -501,17 +492,16 @@ mp_err mp_set_safe_modexp(int value)
  }
  return MP_BADARG;
 #endif
 }
 
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
 #define WEAVE_WORD_SIZE 4
 
-#ifndef MP_CHAR_STORE_SLOW
 /*
  * mpi_to_weave takes an array of bignums, a matrix in which each bignum 
  * occupies all the columns of a row, and transposes it into a matrix in 
  * which each bignum occupies a column of every row.  The first row of the
  * input matrix becomes the first column of the output matrix.  The n'th
  * row of input becomes the n'th column of output.  The input data is said
  * to be "interleaved" or "woven" into the output matrix.
  *
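
As the comment above describes, the surviving mpi_to_weave is a byte-level transpose: byte j of bignum i lands at offset j * count + i of the output buffer, so each bignum is spread across every row of the table instead of sitting in one contiguous, cache-line-aligned block. A minimal sketch of that idea on plain byte buffers, using hypothetical helper names rather than the NSS routine itself:

#include <stddef.h>

/* Hypothetical illustration, not the NSS function: interleave `count` bignums
 * of `row_len` bytes each so that byte j of bignum i ends up at
 * out[j * count + i]. */
static void
weave_bytes(const unsigned char *in, unsigned char *out,
            size_t row_len, size_t count)
{
    size_t i, j;
    for (i = 0; i < count; i++) {         /* which bignum */
        for (j = 0; j < row_len; j++) {   /* which byte of that bignum */
            out[j * count + i] = in[i * row_len + j];
        }
    }
}
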
@@ -601,255 +591,16 @@ mp_err weave_to_mpi(mp_int *a,          
 
   for (; pDest < endDest; pSrc += nBignums, pDest++) {
     *pDest = *pSrc;
   }
   s_mp_clamp(a);
   return MP_OKAY;
 }
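
weave_to_mpi undoes that transpose for a single entry, walking the woven buffer with a stride of nBignums as in the loop above. A corresponding sketch, with the same assumptions and hypothetical names as the one earlier:

/* Hypothetical illustration: gather bignum `which` back out of a woven buffer
 * holding `count` interleaved bignums of `row_len` bytes each. */
static void
unweave_bytes(const unsigned char *woven, unsigned char *out,
              size_t row_len, size_t count, size_t which)
{
    size_t j;
    for (j = 0; j < row_len; j++) {
        out[j] = woven[j * count + which];
    }
}
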
 
-#else
-
-/* Need a primitive that we know is 32 bits long... */
-/* this is true on all modern processors we know of today*/
-typedef unsigned int mp_weave_word;
-
-/*
- * on some platforms character stores into memory is very expensive since they
- * generate a read/modify/write operation on the bus. On those platforms
- * we need to do integer writes to the bus. Because of some unrolled code,
- * in this current code the size of mp_weave_word must be four. The code that
- * makes this assumption explicity is called out. (on some platforms a write
- * of 4 bytes still requires a single read-modify-write operation.
- *
- * This function is takes the identical parameters as the function above, 
- * however it lays out the final array differently. Where the previous function
- * treats the mpi_int as an byte array, this function treats it as an array of
- * mp_digits where each digit is stored in big endian order.
- * 
- * since we need to interleave on a byte by byte basis, we need to collect 
- * several mpi structures together into a single PRUint32 before we write. We
- * also need to make sure the PRUint32 is arranged so that the first value of
- * the first array winds up in b[0]. This means construction of that PRUint32
- * is endian specific (even though the layout of the mp_digits in the array 
- * is always big endian).
- *
- * The final data is stored as follows :
- *
- * Our same logical array p array, m is sizeof(mp_digit),
- * N is still count and n is now b_size. If we define p[i].digit[j]0 as the 
- * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as 
- * the next most significant byte of p[i].digit[j], ...  and p[i].digit[j]m-1
- * is the least significant byte. 
- * Our array would look like:
- * p[0].digit[0]0     p[1].digit[0]0    ...  p[N-2].digit[0]0    p[N-1].digit[0]0
- * p[0].digit[0]1     p[1].digit[0]1    ...  p[N-2].digit[0]1    p[N-1].digit[0]1
- *                .                                         .
- * p[0].digit[0]m-1   p[1].digit[0]m-1  ...  p[N-2].digit[0]m-1  p[N-1].digit[0]m-1
- * p[0].digit[1]0     p[1].digit[1]0    ...  p[N-2].digit[1]0    p[N-1].digit[1]0
- *                .                                         .
- *                .                                         .
- * p[0].digit[n-1]m-2 p[1].digit[n-1]m-2 ... p[N-2].digit[n-1]m-2 p[N-1].digit[n-1]m-2
- * p[0].digit[n-1]m-1 p[1].digit[n-1]m-1 ... p[N-2].digit[n-1]m-1 p[N-1].digit[n-1]m-1 
- *
- */
-mp_err mpi_to_weave(const mp_int *a, unsigned char *b, 
-					mp_size b_size, mp_size count)
-{
-  mp_size i;
-  mp_digit *digitsa0;
-  mp_digit *digitsa1;
-  mp_digit *digitsa2;
-  mp_digit *digitsa3;
-  mp_size   useda0;
-  mp_size   useda1;
-  mp_size   useda2;
-  mp_size   useda3;
-  mp_weave_word *weaved = (mp_weave_word *)b;
-
-  count = count/sizeof(mp_weave_word);
-
-  /* this code pretty much depends on this ! */
-#if MP_ARGCHK == 2
-  assert(WEAVE_WORD_SIZE == 4); 
-  assert(sizeof(mp_weave_word) == 4);
-#endif
-
-  digitsa0 = MP_DIGITS(&a[0]);
-  digitsa1 = MP_DIGITS(&a[1]);
-  digitsa2 = MP_DIGITS(&a[2]);
-  digitsa3 = MP_DIGITS(&a[3]);
-  useda0 = MP_USED(&a[0]);
-  useda1 = MP_USED(&a[1]);
-  useda2 = MP_USED(&a[2]);
-  useda3 = MP_USED(&a[3]);
-
-  ARGCHK(MP_SIGN(&a[0]) == MP_ZPOS, MP_BADARG);
-  ARGCHK(MP_SIGN(&a[1]) == MP_ZPOS, MP_BADARG);
-  ARGCHK(MP_SIGN(&a[2]) == MP_ZPOS, MP_BADARG);
-  ARGCHK(MP_SIGN(&a[3]) == MP_ZPOS, MP_BADARG);
-  ARGCHK(useda0 <= b_size, MP_BADARG);
-  ARGCHK(useda1 <= b_size, MP_BADARG);
-  ARGCHK(useda2 <= b_size, MP_BADARG);
-  ARGCHK(useda3 <= b_size, MP_BADARG);
-
-#define SAFE_FETCH(digit, used, word) ((word) < (used) ? (digit[word]) : 0)
-
-  for (i=0; i < b_size; i++) {
-    mp_digit d0 = SAFE_FETCH(digitsa0,useda0,i);
-    mp_digit d1 = SAFE_FETCH(digitsa1,useda1,i);
-    mp_digit d2 = SAFE_FETCH(digitsa2,useda2,i);
-    mp_digit d3 = SAFE_FETCH(digitsa3,useda3,i);
-    register mp_weave_word acc;
-
-/*
- * ONE_STEP takes the MSB of each of our current digits and places that
- * byte in the appropriate position for writing to the weaved array.
- *  On little endian:
- *   b3 b2 b1 b0
- *  On big endian:
- *   b0 b1 b2 b3
- *  When the data is written it would always wind up:
- *   b[0] = b0
- *   b[1] = b1
- *   b[2] = b2
- *   b[3] = b3
- *
- * Once we've written the MSB, we shift the whole digit up left one
- * byte, putting the Next Most Significant Byte in the MSB position,
- * so we we repeat the next one step that byte will be written.
- * NOTE: This code assumes sizeof(mp_weave_word) and MP_WEAVE_WORD_SIZE
- * is 4.
- */
-#ifdef MP_IS_LITTLE_ENDIAN 
-#define MPI_WEAVE_ONE_STEP \
-    acc  = (d0 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d0 <<= 8; /*b0*/ \
-    acc |= (d1 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d1 <<= 8; /*b1*/ \
-    acc |= (d2 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d2 <<= 8; /*b2*/ \
-    acc |= (d3 >> (MP_DIGIT_BIT-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
-    *weaved = acc; weaved += count;
-#else 
-#define MPI_WEAVE_ONE_STEP \
-    acc  = (d0 >> (MP_DIGIT_BIT-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
-    acc |= (d1 >> (MP_DIGIT_BIT-24)) & 0x00ff0000; d1 <<= 8; /*b1*/ \
-    acc |= (d2 >> (MP_DIGIT_BIT-16)) & 0x0000ff00; d2 <<= 8; /*b2*/ \
-    acc |= (d3 >> (MP_DIGIT_BIT-8))  & 0x000000ff; d3 <<= 8; /*b3*/ \
-    *weaved = acc; weaved += count;
-#endif 
-   switch (sizeof(mp_digit)) {
-   case 32:
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-   case 16:
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-   case 8:
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-   case 4:
-    MPI_WEAVE_ONE_STEP
-    MPI_WEAVE_ONE_STEP
-   case 2:
-    MPI_WEAVE_ONE_STEP
-   case 1:
-    MPI_WEAVE_ONE_STEP
-    break;
-   }
-  }
-
-  return MP_OKAY;
-}
-
-/* reverse the operation above for one entry.
- * b points to the offset into the weave array of the power we are
- * calculating */
-mp_err weave_to_mpi(mp_int *a, const unsigned char *b, 
-					mp_size b_size, mp_size count)
-{
-  mp_digit *pb = MP_DIGITS(a);
-  mp_digit *end = &pb[b_size];
-
-  MP_SIGN(a) = MP_ZPOS;
-  MP_USED(a) = b_size;
-
-  for (; pb < end; pb++) {
-    register mp_digit digit;
-
-    digit = *b << 8; b += count;
-#define MPI_UNWEAVE_ONE_STEP  digit |= *b; b += count; digit = digit << 8;
-    switch (sizeof(mp_digit)) {
-    case 32:
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-    case 16:
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-    case 8:
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-    case 4:
-	MPI_UNWEAVE_ONE_STEP 
-	MPI_UNWEAVE_ONE_STEP 
-    case 2:
-	break;
-    }
-    digit |= *b; b += count; 
-
-    *pb = digit;
-  }
-  s_mp_clamp(a);
-  return MP_OKAY;
-}
-#endif
-
-
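
For reference, the branch deleted above existed for platforms where byte stores turn into read-modify-write bus cycles: it produced the same interleaving, but assembled the current most significant byte of four digits into a single 32-bit word before each store, and kept each digit big-endian within the weave. A rough, hypothetical sketch of the two core steps, assuming 32-bit digits for simplicity (these are not the deleted macros themselves):

#include <stddef.h>
#include <stdint.h>

/* Pack the current MSB of four digits so that, stored on a little-endian
 * machine, memory order is b0 b1 b2 b3 (the role played by the deleted
 * little-endian MPI_WEAVE_ONE_STEP; digit_bits stands in for MP_DIGIT_BIT). */
static uint32_t
pack_msb_le(uint32_t d0, uint32_t d1, uint32_t d2, uint32_t d3,
            unsigned digit_bits)
{
    return  ((d0 >> (digit_bits - 8)) & 0xffu)
         | (((d1 >> (digit_bits - 8)) & 0xffu) << 8)
         | (((d2 >> (digit_bits - 8)) & 0xffu) << 16)
         | (((d3 >> (digit_bits - 8)) & 0xffu) << 24);
}

/* Rebuild one digit from bytes stored MSB-first, `count` bytes apart in the
 * weave (the role played by the deleted MPI_UNWEAVE_ONE_STEP loop). */
static uint32_t
gather_digit_be(const unsigned char *b, size_t count, unsigned digit_bytes)
{
    uint32_t digit = 0;
    unsigned k;
    for (k = 0; k < digit_bytes; k++) {
        digit = (digit << 8) | *b;
        b += count;
    }
    return digit;
}
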
 #define SQR(a,b) \
   MP_CHECKOK( mp_sqr(a, b) );\
   MP_CHECKOK( s_mp_redc(b, mmm) )
 
 #if defined(MP_MONT_USE_MP_MUL)
 #define MUL_NOWEAVE(x,a,b) \
   MP_CHECKOK( mp_mul(a, x, b) ); \
   MP_CHECKOK( s_mp_redc(b, mmm) ) 
--- a/lib/freebl/mpi/target.mk
+++ b/lib/freebl/mpi/target.mk
@@ -166,17 +166,17 @@ CFLAGS = -O -O2 -DAIX -DSYSV -qarch=com 
 OBJECT_MODE=64
 export OBJECT_MODE
 endif
 
 ifeq ($(TARGET),x86LINUX)
 #Linux
 AS_OBJS = mpi_x86.o
 MPICMN += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE -DMP_ASSEMBLY_DIV_2DX1D
-MPICMN += -DMP_MONT_USE_MP_MUL -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
+MPICMN += -DMP_MONT_USE_MP_MUL -DMP_IS_LITTLE_ENDIAN
 CFLAGS= -O2 -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
  -pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
  -DXP_UNIX -UDEBUG -DNDEBUG -D_REENTRANT $(MPICMN)
 #CFLAGS= -g -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
  -pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
  -DXP_UNIX -DDEBUG -UNDEBUG -D_REENTRANT $(MPICMN)
 #CFLAGS= -g -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
  -pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
@@ -188,17 +188,17 @@ MPICMN += -DMP_ASSEMBLY_MULTIPLY -DMP_AS
 MPICMN += -DMP_USE_UINT_DIGIT 
 AS_OBJS += mpi_arm.o
 endif
 
 ifeq ($(TARGET),AMD64SOLARIS)
 ASFLAGS += -xarch=generic64
 AS_OBJS = mpi_amd64.o mpi_amd64_sun.o
 MP_CONFIG = -DMP_ASSEMBLY_MULTIPLY -DMPI_AMD64
-MP_CONFIG += -DMP_CHAR_STORE_SLOW -DMP_IS_LITTLE_ENDIAN
+MP_CONFIG += -DMP_IS_LITTLE_ENDIAN
 CFLAGS = -xarch=generic64 -xO4 -I. -DMP_API_COMPATIBLE -DMP_IOFUNC $(MP_CONFIG)
 MPICMN += $(MP_CONFIG)
 
 mpi_amd64_asm.o: mpi_amd64_sun.s
 	$(AS) -xarch=generic64 -P -D_ASM mpi_amd64_sun.s
 endif
 
 ifeq ($(TARGET),WIN32)