Oops, wrong base line. This is the correct base line. BOBS_MPI_EXP
authorrelyea%netscape.com
Fri, 11 Nov 2005 20:04:12 +0000
branchBOBS_MPI_EXP
changeset 6288 b6ab804bb9e6d2d77d7eed571bac6962ef611de5
parent 6287 f4752cc76afe1cdb68782c07ebc490f6cfbc6959
child 13751 24ba8e0a285aec84a5d16b5f5dd4ebc111e76838
push idunknown
push userunknown
push dateunknown
Oops, wrong base line. This is the correct base line.
security/nss/lib/freebl/mpi/mpmontg.c
--- a/security/nss/lib/freebl/mpi/mpmontg.c
+++ b/security/nss/lib/freebl/mpi/mpmontg.c
@@ -44,38 +44,36 @@
  * the Motorola DSP56000" by Stephen R. Dusse' and Burton S. Kaliski Jr.
  * published in "Advances in Cryptology: Proceedings of EUROCRYPT '90"
  * "Lecture Notes in Computer Science" volume 473, 1991, pg 230-244,
  * published by Springer Verlag.
  */
 
 /* #define MP_USING_MONT_MULF 1 */
 #define MP_USING_CACHE_SAFE_MOD_EXP 1 
-#define MP_USING_WEAVE_COPY 1 
-#define MP_CHAR_STORE_SLOW 1
+/*#define MP_USING_WEAVE_COPY 1  */
+/*#define MP_CHAR_STORE_SLOW 1 */
 #include <string.h>
 #include "mpi-priv.h"
-#include "mp_gf2m-priv.h"
 #include "mplogic.h"
 #include "mpprime.h"
 #ifdef MP_USING_MONT_MULF
 #include "montmulf.h"
 #endif
 #include <stddef.h> /* ptrdiff_t */
 
 /* need to know endianness of this platform. If we aren't told, get it from
  * nspr... */
 #ifdef MP_CHAR_STORE_SLOW
 #if !defined(IS_BIG_ENDIAN) && !defined(IS_LITTLE_ENDIAN)
 #include "prcpucfg.h"
 #endif
 #endif
 
 #define STATIC
-/* #define DEBUG 1  */
 
 #define MAX_WINDOW_BITS 6
 #define MAX_ODD_INTS    32   /* 2 ** (WINDOW_BITS - 1) */
 #define MAX_POWERS MAX_ODD_INTS*2
 #define MAX_MODULUS_BITS 8192
 #define MAX_MODULUS_BYTES (MAX_MODULUS_BITS/8)
 #define MAX_MODULUS_DIGITS (MAX_MODULUS_BYTES/sizeof(mp_digit))
 
@@ -527,32 +525,30 @@ CLEANUP:
 }
 #undef SQR
 #undef MUL
 
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
 unsigned int mp_using_cache_safe_exp = 1;
 #endif
 
-mp_err mp_set_modexp_mode(int value)
+mp_err mp_set_safe_modexp(int value) 
 {
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
  mp_using_cache_safe_exp = value;
  return MP_OKAY;
 #else
  if (value == 0) {
    return MP_OKAY;
  }
  return MP_BADARG;
 #endif
 }
 
-
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
-
 #ifndef MP_USING_WEAVE_COPY
 #ifndef MP_CHAR_STORE_SLOW
 #define WEAVE_BASE_INIT \
   unsigned char *_ptr;
 
 #define WEAVE_FIRST(bi,b,count) \
   _ptr = (unsigned char *)bi; \
   *_ptr++ = *b; b+= count;
@@ -583,18 +579,17 @@ mp_err mp_set_modexp_mode(int value)
   WEAVE_BASE_INIT
 
 #define WEAVE_FETCH(bi, b, count) \
   WEAVE_FIRST(bi,b,count) \
   WEAVE_MIDDLE(bi,b,count) \
   WEAVE_MIDDLE(bi,b,count) \
   WEAVE_LAST(bi,b,count)
 
-#else
-#ifdef MP_DIGIT_BITS == 64 
+#elif MP_DIGIT_BITS == 64 
 
 #define WEAVE_INIT  \
   WEAVE_BASE_INIT
 
 #define WEAVE_FETCH(bi, b, count) \
   WEAVE_FIRST(bi,b,count) \
   WEAVE_MIDDLE(bi,b,count) \
   WEAVE_MIDDLE(bi,b,count) \
@@ -608,24 +603,22 @@ mp_err mp_set_modexp_mode(int value)
 
 #define WEAVE_INIT \
   int _i; \
   WEAVE_BASE_INIT
 
   /* It would be nice to unroll this loop as well */
 #define WEAVE_FETCH(bi, b, count) \
   WEAVE_FIRST(bi,b,count) \
-  WEAVE_LAST(bi,b,count) \
   for (_i=1; _i < sizeof(mp_digit) -1 ; _i++) { \
     WEAVE_MIDDLE(bi,b,count) \
   } \
   WEAVE_LAST(bi,b,count)
 
 #endif
-#endif
 
 #if !defined(MP_MONT_USE_MP_MUL)
 /*
  * if count <= the minimum cache line size, it means we can fit at least
  * one element of each array on a cache line. We interleave the cache lines
  * to prevent attackers from getting information about the exponent by looking
  * at our cache usage.
  * offset is the first element of our array, b_size is the size of our 
@@ -726,16 +719,51 @@ mp_err   mp_mul_weave(const mp_int *a, c
 CLEANUP:
   return res;
 } /* end mp_mul() */
 #endif /* MP_USING_WEAVE_COPY */
 
 #define WEAVE_WORD_SIZE 4
 
 #ifndef MP_CHAR_STORE_SLOW
+/*
+ *  a is an array of WEAVE_WORD_SIZE mp_ints (that is 4). 
+ * It is a partial window into a logical array mp_int p[N] containing
+ * the base to the 0 through N powers. Ideally this code would be much more
+ * simple if we stored one element of that array at a time, but on some 
+ * platforms the cost of storing a byte incurs a full read modify write cycle
+ * and increases the memory bandwidth cost by a factor of 4 or 8. By collecting
+ * for mp_ints together, we can arrange to store all 4 values in a single
+ * word write. 
+ * 
+ * NOTE: this version of the function does not do word writes. The only reason 
+ * for passing in 4 mp_ints at a time is to make the caller less dependent on 
+ * the * variant we are using.
+ *
+ *  b is the targeted weaved location. b[0] points to the first byte where
+ *  a[0][0] needs to be stored.
+ * 
+ *  count is 2^window size. (same as N above and below).
+ *
+ *  b_size is the size in mp_digits of each mp_int in the array.
+ *
+ *
+ * Data is stored as follows :
+ * The mp_int array is treated as a byte array.
+ * we want to store the logical array mp_int p[N]. p[N].digit is treated as 
+ * a byte array (rather than * an mp_digit array) and n is b_size
+ * *sizeof(mp_digit):
+ *
+ * p[0].digit[0]   p[1].digit[0]   ...... p[N-1].digit[0]   p[N].digit[0]
+ * p[0].digit[1]   p[1].digit[1]   ...... p[N-1].digit[1]   p[N].digit[1]
+ *            .                                         .
+ *            .                                         .
+ * p[0].digit[n-1] p[1].digit[n-1] ...... p[N-1].digit[n-1] p[N].digit[n-1]
+ * p[0].digit[n]   p[1].digit[n]   ...... p[N-1].digit[n]   p[N].digit[n] 
+ */
 mp_err mpi_to_weave(const mp_int *a, unsigned char *b, 
 			             mp_size b_size,  mp_size count)
 {
   mp_size i, j;
   unsigned char *bsave = b;
 
   for (i=0; i < WEAVE_WORD_SIZE; i++) {
     unsigned char *pb = (unsigned char *)MP_DIGITS(&a[i]);
@@ -757,40 +785,65 @@ mp_err mpi_to_weave(const mp_int *a, uns
       b += count;
     }
   }
 
   return MP_OKAY;
 }
 #else
 /* Need a primitive that we know is 32 bits long... */
-#if UINT_MAX == MP_32BIT_MAX
+/* this is true on all modern processors we know of today*/
 typedef unsigned int mp_weave_word;
-#else
-#if ULONG_MAX == MP_32BIT_MAX
-typedef unsigned long mp_weave_word;
-#else
-#error "Can't find 32 bit primitive type for this platform"
-#endif
-#endif
 
 /*
  * on some platforms character stores into memory is very expensive since they
  * generate a read/modify/write operation on the bus. On those platforms
  * we need to do integer writes to the bus.
  *
  * The weave_to_mpi function in those cases expect the data to be laid out in 
  * big endian, interleaved. 
  * 
  * since we need to interleave on a byte by byte basis, we need to collect 
  * several mpi structures together into a single uint32 before we write. We
  * also need to make sure the uint32 is arranged so that the first value of 
  * the first array winds up in b[0]. This means construction of that uint32
  * is endian specific (even though the layout of the array is always big
- * endian.
+ * endian).
+ *
+ *  a is an array of WEAVE_WORD_SIZE mp_ints (that is 4). 
+ * It is a partial window into a logical array mp_int p[N] containing
+ * the base to the 0 through N powers. 
+ *
+ *  b is the targeted weaved location. b[0] points to the first byte where
+ *  a[0][0] needs to be stored.
+ * 
+ *  count is 2^window size. (same as N above and below).
+ *
+ *  b_size is the size in mp_digits of each mp_int in the array.
+ *
+ *
+ * Data is stored as follows :
+ *
+ * Our same logical array p array, m is sizeof(mp_digit),
+ * N is still count and n is now b_size. If we define p[i].digit[j]0 as the 
+ * most significant byte of the word p[i].digit[j], p[i].digit[j]1 as 
+ * the next most significant byte of p[i].digit[j], ...  and p[i].digit[j]m 
+ * is the least significant byte.our array would look like:
+
+ * p[0].digit[0]0   p[1].digit[0]0   ...... p[N-1].digit[0]0   p[N].digit[0]0
+ * p[0].digit[0]1   p[1].digit[0]1   ...... p[N-1].digit[0]1   p[N].digit[0]1
+ *                .                                         .
+ * p[0].digit[0]m   p[1].digit[0]m   ...... p[N-1].digit[0]m   p[N].digit[0]m
+ * p[0].digit[1]0   p[1].digit[1]0   ...... p[N-1].digit[1]0   p[N].digit[1]0
+ *                .                                         .
+ *                .                                         .
+ * p[0].digit[n]m-1 p[1].digit[n]m-1 ...... p[N-1].digit[n]m-1 p[N].digit[n]m-1
+ * p[0].digit[n]m   p[1].digit[n]m   ...... p[N-1].digit[n]m   p[N].digit[n]m 
+ */
+ *
  */
 mp_err mpi_to_weave(const mp_int *a, unsigned char *b, 
 					mp_size b_size, mp_size count)
 {
   mp_size i;
   mp_digit *digitsa0;
   mp_digit *digitsa1;
   mp_digit *digitsa2;
@@ -802,17 +855,20 @@ mp_err mpi_to_weave(const mp_int *a, uns
   mp_weave_word *weaved = (mp_weave_word *)b;
 #if MP_DIGIT_BITS != 32 && MP_DIGIT_BITS != 64
   mp_size j;
 #endif
 
   count = count/sizeof(mp_weave_word);
 
   /* this code pretty much depends on this ! */
-  /*assert(WEAVE_WORD_SIZE == 4); */
+#if MP_ARGCHK < 2
+  assert(WEAVE_WORD_SIZE == 4);
+  assert(sizeof(mp_weave_word) == 4);
+#endif
 
   digitsa0 = MP_DIGITS(&a[0]);
   digitsa1 = MP_DIGITS(&a[1]);
   digitsa2 = MP_DIGITS(&a[2]);
   digitsa3 = MP_DIGITS(&a[3]);
   useda0 = MP_USED(&a[0]);
   useda1 = MP_USED(&a[1]);
   useda2 = MP_USED(&a[2]);
@@ -856,17 +912,16 @@ mp_err mpi_to_weave(const mp_int *a, uns
 #ifdef IS_LITTLE_ENDIAN 
 #define MPI_WEAVE_ONE_STEP \
     acc  = (d0 >> (MP_DIGIT_BITS-8))  & 0xff      ; d0 <<= 8; /*b0*/ \
     acc |= (d1 >> (MP_DIGIT_BITS-16)) & 0xff00    ; d1 <<= 8; /*b1*/ \
     acc |= (d2 >> (MP_DIGIT_BITS-24)) & 0xff0000  ; d2 <<= 8; /*b2*/ \
     acc |= (d3 >> (MP_DIGIT_BITS-32)) & 0xff000000; d3 <<= 8; /*b3*/ \
     *weaved = acc; weaved += count;
 #else 
-#error "Intel is Little endian, but IS_LITTLE_ENDIAN is not defined!"
 #define MPI_WEAVE_ONE_STEP \
     acc  = (d0 >> (MP_DIGIT_BITS-32)) & 0xff000000; d0 <<= 8; /*b0*/ \
     acc |= (d1 >> (MP_DIGIT_BITS-24)) & 0xff0000  ; d1 <<= 8; /*b1*/ \
     acc |= (d2 >> (MP_DIGIT_BITS-16)) & 0xff00    ; d2 <<= 8; /*b2*/ \
     acc |= (d3 >> (MP_DIGIT_BITS-8))  & 0xff      ; d3 <<= 8; /*b3*/ \
     *weaved = acc; weaved += count;
 #endif 
 
@@ -1013,49 +1068,54 @@ mp_err mp_exptmod_safe_i(const mp_int * 
   MP_DIGITS(&accum[3]) = 0;
 
   /* grab the first window value. This allows us to preload accumulator1
    * and save a conversion, some squares and a multiple*/
   MP_CHECKOK( mpl_get_bits(exponent, 
 				bits_in_exponent-window_bits, window_bits) );
   first_window = (mp_size)res;
 
-  MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) );
-  MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) );
-  MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) );
-  MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) );
   MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
   MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
 #ifdef MP_USING_WEAVE_COPY
   MP_DIGITS(&tmp) = 0;
   MP_CHECKOK( mp_init_size(&tmp, 3 * nLen + 2) );
 #endif
 
   /* build the first 4 powers inline */
   if (num_powers > 2) {
+    MP_CHECKOK( mp_init_size(&accum[0], 3 * nLen + 2) );
+    MP_CHECKOK( mp_init_size(&accum[1], 3 * nLen + 2) );
+    MP_CHECKOK( mp_init_size(&accum[2], 3 * nLen + 2) );
+    MP_CHECKOK( mp_init_size(&accum[3], 3 * nLen + 2) );
     mp_set(&accum[0], 1);
     MP_CHECKOK( s_mp_to_mont(&accum[0], mmm, &accum[0]) );
     MP_CHECKOK( mp_copy(montBase, &accum[1]) );
     SQR(montBase, &accum[2]);
     MUL_NOWEAVE(montBase, &accum[2], &accum[3]);
     MP_CHECKOK( mpi_to_weave(accum, powers, nLen, num_powers) );
     if (first_window < 4) {
       MP_CHECKOK( mp_copy(&accum[first_window], &accum1) );
       first_window = num_powers;
     }
   } else {
-      /* assert first_window == 1? */
-      MP_CHECKOK( mp_copy(montBase, &accum1) );
+      if (first_window == 0) {
+        mp_set(&accum1, 1);
+        MP_CHECKOK( s_mp_to_mont(&accum1, mmm, &accum1) );
+      } else {
+        /* assert first_window == 1? */
+        MP_CHECKOK( mp_copy(montBase, &accum1) );
+      }
   }
 
 
   /* this adds 2**(k-1)-2 square operations over just calculating the
    * odd powers where k is the window size. We will get some of that
-   * back by not needing the first 'N' squares for the window (though
-   * squaring 1 is extremely fast, so it's not much savings) */ 
+   * back by not needing the first 'k' squares and one multiply for the 
+   * first window */ 
   for (i = 4; i < num_powers; i++) {
     int acc_index = i & 0x3; /* i % 4 */
     if ( i & 1 ) {
       MUL_NOWEAVE(montBase, &accum[acc_index-1] , &accum[acc_index]);
       /* we've filled the array do our 'per array' processing */
       if (acc_index == 3) {
         MP_CHECKOK( mpi_to_weave(accum, powers + i - 3, nLen, num_powers) );
 
@@ -1084,20 +1144,22 @@ mp_err mp_exptmod_safe_i(const mp_int * 
 	   MP_CHECKOK(mp_copy(&accum[half_power_index], &accum2));
 	   SQR(&accum2,&accum[acc_index]);
 	} else {
 	   SQR(&accum[half_power_index],&accum[acc_index]);
 	}
       }
     }
   }
-  /* if the accum1 isn't set, then either j was out of range, or our logic
-   * above does not populate all the powers values. either case it shouldn't
-   * happen and is an internal mpi programming error */
-  ARGCHK(MP_USED(&accum1) != 0, MP_PROGERR);
+  /* if the accum1 isn't set, or our logic above does not populate all the 
+   * powers values. either case it shouldn't happen and is an internal mpi 
+   * programming error */
+#if MP_ARGCHK == 2
+  assert(MP_USED(&accum1) != 0);
+#endif
 
   /* set accumulator to montgomery residue of 1 */
   pa1 = &accum1;
   pa2 = &accum2;
 
   for (expOff = bits_in_exponent - window_bits*2; expOff >= 0; expOff -= window_bits) {
     mp_size smallExp;
     MP_CHECKOK( mpl_get_bits(exponent, expOff, window_bits) );
@@ -1131,16 +1193,20 @@ mp_err mp_exptmod_safe_i(const mp_int * 
   }
 
   res = s_mp_redc(pa1, mmm);
   mp_exch(pa1, result);
 
 CLEANUP:
   mp_clear(&accum1);
   mp_clear(&accum2);
+  mp_clear(&accum[0]);
+  mp_clear(&accum[1]);
+  mp_clear(&accum[2]);
+  mp_clear(&accum[3]);
   /* PORT_Memset(powers,0,sizeof(powers)); */
   return res;
 }
 #undef SQR
 #undef MUL
 #endif
 
 mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent, 
@@ -1209,22 +1275,16 @@ mp_err mp_exptmod(const mp_int *inBase, 
   else if (bits_in_exponent > 20)
     window_bits = 4;
   /* RSA public key exponents are typically under 20 bits (common values 
    * are: 3, 17, 65537) and a 4-bit window is inefficient
    */
   else 
     window_bits = 1;
 
-  odd_ints = 1 << (window_bits - 1);
-  i = bits_in_exponent % window_bits;
-  if (i != 0) {
-    bits_in_exponent += window_bits - i;
-  } 
-
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
   /*
    * clamp the window size based on
    * the cache line size.
    */
   if (!max_window_bits) {
     unsigned long cache_size = s_mpi_getProcessorLineSize();
     /* processor has no cache, use 'fast' code always */
@@ -1234,30 +1294,40 @@ mp_err mp_exptmod(const mp_int *inBase, 
     if ((cache_size == 0) || (cache_size >= 64)) {
       max_window_bits = 6;
     } else if (cache_size >= 32) {
       max_window_bits = 5;
     } else if (cache_size >= 16) {
       max_window_bits = 4;
     } else max_window_bits = 1; /* should this be an assert? */
   }
+
+  /* clamp the window size down before we caclulate bits_in_exponent */
+  if (mp_using_cache_safe_exp) {
+    if (window_bits > max_window_bits) {
+      window_bits = max_window_bits;
+    }
+  }
 #endif
 
+  odd_ints = 1 << (window_bits - 1);
+  i = bits_in_exponent % window_bits;
+  if (i != 0) {
+    bits_in_exponent += window_bits - i;
+  } 
+
 #ifdef MP_USING_MONT_MULF
   if (mp_using_mont_mulf) {
     MP_CHECKOK( s_mp_pad(&montBase, nLen) );
     res = mp_exptmod_f(&montBase, exponent, modulus, result, &mmm, nLen, 
 		     bits_in_exponent, window_bits, odd_ints);
   } else
 #endif
 #ifdef MP_USING_CACHE_SAFE_MOD_EXP
   if (mp_using_cache_safe_exp) {
-    if (window_bits > max_window_bits) {
-      window_bits = max_window_bits;
-    }
     res = mp_exptmod_safe_i(&montBase, exponent, modulus, result, &mmm, nLen, 
 		     bits_in_exponent, window_bits, 1 << window_bits);
   } else
 #endif
   res = mp_exptmod_i(&montBase, exponent, modulus, result, &mmm, nLen, 
 		     bits_in_exponent, window_bits, odd_ints);
 
 CLEANUP: