Add faster, optional sqr routine for internal LibTomMath

At the cost of about 0.5 kB of additional binary size, the internal LibTomMath can be configured to include a faster sqr routine to speed up DH and RSA. This can be enabled with CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y in .config.
2008-06-05 18:33:46 +03:00 · 2008-06-05 18:33:46 +03:00 · c5f5c91aeb
commit c5f5c91aeb
parent 4fba48a5a7
3 changed files with 113 additions and 2 deletions
--- a/src/tls/libtommath.c
+++ b/src/tls/libtommath.c
@ -36,6 +36,11 @@
 #define BN_MP_MUL_2_C
 #endif /* LTM_FAST_EXPTMOD */

+#ifdef LTM_FAST_SQR
+/* Include faster sqr at the cost of about 0.5 kB in code */
+#define BN_FAST_S_MP_SQR_C
+#endif /* LTM_FAST_SQR */
+
 /* Current uses do not require support for negative exponent in exptmod, so we
 * can save about 1.5 kB in leaving out invmod. */
 #define LTM_NO_NEG_EXP
@ -153,6 +158,9 @@ static int mp_init_size(mp_int * a, int size);
 #ifdef BN_MP_EXPTMOD_FAST_C
 static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
 #endif /* BN_MP_EXPTMOD_FAST_C */
+#ifdef BN_FAST_S_MP_SQR_C
+static int fast_s_mp_sqr (mp_int * a, mp_int * b);
+#endif /* BN_FAST_S_MP_SQR_C */



@ -2988,3 +2996,99 @@ LBL_M:
  return err;
 }
 #endif
+
+
+#ifdef BN_FAST_S_MP_SQR_C
+/* fast_s_mp_sqr: compute b = a * a one output column at a time.
+ * The gist of squaring: you do like mult, except the offset of tmpx [the
+ * one that starts closer to zero] can't equal the offset of tmpy.  So you
+ * set up iy like before, then min it with (ty-tx+1)>>1 so that never
+ * happens.  The cross products summed in the inner loop are doubled, and
+ * after that loop the square term a->dp[ix/2]^2 is added on even columns.
+ * Returns MP_OKAY, or the mp_grow() error code if b cannot be grown.
+ * NOTE(review): W[] holds MP_WARRAY digits and 2 * a->used is not checked
+ * against it here; presumably the caller guarantees the bound - confirm. */
+
+static int fast_s_mp_sqr (mp_int * a, mp_int * b)
+{
+  int       olduse, res, pa, ix, iz;
+  mp_digit   W[MP_WARRAY], *tmpx;
+  mp_word   W1;
+
+  /* the result is written as 2 * a->used digits; grow b if it is smaller */
+  pa = a->used + a->used;
+  if (b->alloc < pa) {
+    if ((res = mp_grow (b, pa)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* produce pa output digits; W1 carries the overflow between columns */
+  W1 = 0;
+  for (ix = 0; ix < pa; ix++) { 
+      int      tx, ty, iy;
+      mp_word  _W;
+      mp_digit *tmpy;
+
+      /* clear the column accumulator */
+      _W = 0;
+
+      /* start offsets into a for this column; note that tx + ty == ix */
+      ty = MIN(a->used-1, ix);
+      tx = ix - ty;
+
+      /* tmpx walks upward from a->dp[tx], tmpy downward from a->dp[ty] */
+      tmpx = a->dp + tx;
+      tmpy = a->dp + ty;
+
+      /* this is the number of times the loop will iterate, essentially
+         while (tx++ < a->used && ty-- >= 0) { ... }
+       */
+      iy = MIN(a->used-tx, ty+1);
+
+      /* now for squaring tx can never equal ty
+       * we halve the distance since they approach at a rate of 2x
+       * and we have to round because odd cases need to be executed
+       */
+      iy = MIN(iy, (ty-tx+1)>>1);
+
+      /* sum the cross products a->dp[tx+iz] * a->dp[ty-iz] */
+      for (iz = 0; iz < iy; iz++) {
+         _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+      }
+
+      /* double the inner product and add the previous column's carry */
+      _W = _W + _W + W1;
+
+      /* even columns have the square term in them */
+      if ((ix&1) == 0) {
+         _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+      }
+
+      /* store the bits selected by MP_MASK as this column's digit */
+      W[ix] = (mp_digit)(_W & MP_MASK);
+
+      /* the bits above DIGIT_BIT become the next column's carry */
+      W1 = _W >> ((mp_word)DIGIT_BIT);
+  }
+
+  /* setup dest; keep the old digit count so stale digits can be zeroed */
+  olduse  = b->used;
+  b->used = a->used+a->used;
+
+  {
+    mp_digit *tmpb;
+    tmpb = b->dp;
+    for (ix = 0; ix < pa; ix++) {
+      *tmpb++ = W[ix] & MP_MASK;
+    }
+
+    /* clear unused digits [that existed in the old copy of b] */
+    for (; ix < olduse; ix++) {
+      *tmpb++ = 0;
+    }
+  }
+  mp_clamp (b);
+  return MP_OKAY;
+}
+#endif
--- a/wpa_supplicant/Makefile
+++ b/wpa_supplicant/Makefile
@ -624,6 +624,9 @@ CFLAGS += -DCONFIG_INTERNAL_LIBTOMMATH
 ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD
 CFLAGS += -DLTM_FAST_EXPTMOD
 endif
+ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR
+CFLAGS += -DLTM_FAST_SQR
+endif
 else
 LIBS += -ltommath
 LIBS_p += -ltommath
--- a/wpa_supplicant/defconfig
+++ b/wpa_supplicant/defconfig
@ -307,9 +307,13 @@ CONFIG_PEERKEY=y
 #LIBS += -L$(LTM_PATH)
 #LIBS_p += -L$(LTM_PATH)
 #endif
-# Add a cost of about 2.5 kB of additional cost, the internal LibTomMath can be
-# configured to include fast exptmod routine to speed up DH and RSA.
+# At the cost of about 2.5 kB of additional binary size, the internal LibTomMath
+# can be configured to include a fast exptmod routine to speed up DH and RSA.
 #CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD=y
+# At the cost of about 0.5 kB of additional binary size, the internal
+# LibTomMath can be configured to include a faster sqr routine to speed up DH
+# and RSA.
+#CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y

 # Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
 # This is only for Windows builds and requires WMI-related header files and