Add faster, optional sqr routine for internal LibTomMath

At the cost of about 0.5 kB of additional binary size, the internal
LibTomMath can be configured to include a faster sqr routine to speed up DH
and RSA. This can be enabled with CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y in
.config.
This commit is contained in:
Jouni Malinen 2008-06-05 18:33:46 +03:00
parent 4fba48a5a7
commit c5f5c91aeb
3 changed files with 113 additions and 2 deletions

View file

@ -36,6 +36,11 @@
#define BN_MP_MUL_2_C #define BN_MP_MUL_2_C
#endif /* LTM_FAST_EXPTMOD */ #endif /* LTM_FAST_EXPTMOD */
#ifdef LTM_FAST_SQR
/* Optional: compile in the faster squaring routine (fast_s_mp_sqr) at the
 * cost of about 0.5 kB of additional code size. */
#define BN_FAST_S_MP_SQR_C
#endif /* LTM_FAST_SQR */
/* Current uses do not require support for negative exponent in exptmod, so we /* Current uses do not require support for negative exponent in exptmod, so we
* can save about 1.5 kB in leaving out invmod. */ * can save about 1.5 kB in leaving out invmod. */
#define LTM_NO_NEG_EXP #define LTM_NO_NEG_EXP
@ -153,6 +158,9 @@ static int mp_init_size(mp_int * a, int size);
#ifdef BN_MP_EXPTMOD_FAST_C #ifdef BN_MP_EXPTMOD_FAST_C
static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode); static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
#endif /* BN_MP_EXPTMOD_FAST_C */ #endif /* BN_MP_EXPTMOD_FAST_C */
#ifdef BN_FAST_S_MP_SQR_C
/* Faster squaring routine (b = a * a); only built when BN_FAST_S_MP_SQR_C
 * is defined. */
static int fast_s_mp_sqr (mp_int * a, mp_int * b);
#endif /* BN_FAST_S_MP_SQR_C */
@ -2988,3 +2996,99 @@ LBL_M:
return err; return err;
} }
#endif #endif
#ifdef BN_FAST_S_MP_SQR_C
/* Column-wise (comba style) squaring: b = a * a.
 *
 * This works like the column-wise multiplier, except that inside a column
 * the low index (starting closer to zero) is never allowed to meet the high
 * index. Each column therefore sums only the cross products a[i] * a[j]
 * with i < j; that sum is doubled, and on even columns the square term
 * a[k] * a[k] (k = column / 2) is added afterwards.
 *
 * Column results are staged in the local array W (MP_WARRAY digits) and
 * b->dp is only written once every column has been computed.
 * Nothing in this function checks that 2 * a->used fits in W.
 * NOTE(review): presumably the caller only takes this path for operands
 * that fit - confirm.
 *
 * Returns MP_OKAY on success, or the error reported by mp_grow() when b
 * could not be enlarged.
 */
static int fast_s_mp_sqr (mp_int * a, mp_int * b)
{
  mp_digit W[MP_WARRAY];
  mp_word carry;
  int ndigits, prev_used, col, err;

  /* the square occupies up to twice as many digits as the input */
  ndigits = a->used + a->used;
  if (b->alloc < ndigits) {
    err = mp_grow (b, ndigits);
    if (err != MP_OKAY) {
      return err;
    }
  }

  /* produce the output one column at a time, carrying between columns */
  carry = 0;
  for (col = 0; col < ndigits; col++) {
    mp_word acc = 0;
    mp_digit *lo, *hi;
    int lo_idx, hi_idx, steps, limit;

    /* starting offsets of the two digits whose indices add up to col */
    hi_idx = a->used - 1;
    if (col < hi_idx) {
      hi_idx = col;
    }
    lo_idx = col - hi_idx;

    lo = a->dp + lo_idx;
    hi = a->dp + hi_idx;

    /* number of index pairs that stay inside the number, i.e. how often
     * "while (lo_idx++ < a->used && hi_idx-- >= 0)" would iterate
     */
    steps = a->used - lo_idx;
    if (hi_idx + 1 < steps) {
      steps = hi_idx + 1;
    }

    /* the two indices approach each other two positions per step and must
     * not meet, so only half the distance (rounded, for the odd cases) is
     * walked
     */
    limit = (hi_idx - lo_idx + 1) >> 1;
    if (limit < steps) {
      steps = limit;
    }

    /* accumulate the cross products of this column */
    while (steps-- > 0) {
      acc += ((mp_word)*lo++) * ((mp_word)*hi--);
    }

    /* every cross product occurs twice; then add the incoming carry */
    acc += acc;
    acc += carry;

    /* even columns additionally contain a square term */
    if ((col & 1) == 0) {
      acc += ((mp_word)a->dp[col >> 1]) * ((mp_word)a->dp[col >> 1]);
    }

    /* low digit goes to the staging array, the rest is the next carry */
    W[col] = (mp_digit)(acc & MP_MASK);
    carry = acc >> ((mp_word)DIGIT_BIT);
  }

  /* publish the result into b */
  prev_used = b->used;
  b->used = a->used + a->used;

  for (col = 0; col < ndigits; col++) {
    b->dp[col] = W[col] & MP_MASK;
  }

  /* zero any digits left over from the previous value of b */
  for (; col < prev_used; col++) {
    b->dp[col] = 0;
  }

  mp_clamp (b);
  return MP_OKAY;
}
#endif

View file

@ -624,6 +624,9 @@ CFLAGS += -DCONFIG_INTERNAL_LIBTOMMATH
ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD
CFLAGS += -DLTM_FAST_EXPTMOD CFLAGS += -DLTM_FAST_EXPTMOD
endif endif
# Optional faster sqr routine for the internal LibTomMath (about 0.5 kB of
# additional binary size); passes LTM_FAST_SQR to the C build.
ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR
CFLAGS += -DLTM_FAST_SQR
endif
else else
LIBS += -ltommath LIBS += -ltommath
LIBS_p += -ltommath LIBS_p += -ltommath

View file

@ -307,9 +307,13 @@ CONFIG_PEERKEY=y
#LIBS += -L$(LTM_PATH) #LIBS += -L$(LTM_PATH)
#LIBS_p += -L$(LTM_PATH) #LIBS_p += -L$(LTM_PATH)
#endif #endif
# Add a cost of about 2.5 kB of additional cost, the internal LibTomMath can be # At the cost of about 2.5 kB of additional binary size, the internal LibTomMath
# configured to include fast exptmod routine to speed up DH and RSA. # can be configured to include fast exptmod routine to speed up DH and RSA.
#CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD=y #CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD=y
# At the cost of about 0.5 kB of additional binary size, the internal
# LibTomMath can be configured to include a faster sqr routine to speed up DH and
# RSA.
#CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y
# Include NDIS event processing through WMI into wpa_supplicant/wpasvc. # Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
# This is only for Windows builds and requires WMI-related header files and # This is only for Windows builds and requires WMI-related header files and