diff --git a/src/tls/libtommath.c b/src/tls/libtommath.c index 374feb0ff..ea000f02f 100644 --- a/src/tls/libtommath.c +++ b/src/tls/libtommath.c @@ -36,6 +36,11 @@ #define BN_MP_MUL_2_C #endif /* LTM_FAST_EXPTMOD */ +#ifdef LTM_FAST_SQR +/* Include faster sqr at the cost of about 0.5 kB in code */ +#define BN_FAST_S_MP_SQR_C +#endif /* LTM_FAST_SQR */ + /* Current uses do not require support for negative exponent in exptmod, so we * can save about 1.5 kB in leaving out invmod. */ #define LTM_NO_NEG_EXP @@ -153,6 +158,9 @@ static int mp_init_size(mp_int * a, int size); #ifdef BN_MP_EXPTMOD_FAST_C static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode); #endif /* BN_MP_EXPTMOD_FAST_C */ +#ifdef BN_FAST_S_MP_SQR_C +static int fast_s_mp_sqr (mp_int * a, mp_int * b); +#endif /* BN_FAST_S_MP_SQR_C */ @@ -2988,3 +2996,99 @@ LBL_M: return err; } #endif + + +#ifdef BN_FAST_S_MP_SQR_C +/* the gist of squaring... (b receives a*a; returns MP_OKAY or the mp_grow error) + * you do like mult except the offset of the tmpx [one that + * starts closer to zero] can't equal the offset of tmpy. + * So basically you set up iy like before then you min it with + * (ty-tx+1)>>1 so that it never happens. You double all those + * you add in the inner loop + +After that loop you do the squares and add them in. 
+*/ + +static int fast_s_mp_sqr (mp_int * a, mp_int * b) +{ + int olduse, res, pa, ix, iz; + mp_digit W[MP_WARRAY], *tmpx; + mp_word W1; + + /* grow the destination as required; NOTE(review): pa is never checked against MP_WARRAY (the size of W) in this function - presumably the caller guarantees it; confirm */ + pa = a->used + a->used; + if (b->alloc < pa) { + if ((res = mp_grow (b, pa)) != MP_OKAY) { + return res; + } + } + + /* produce pa output digits; W1 is the carry passed from one column to the next */ + W1 = 0; + for (ix = 0; ix < pa; ix++) { + int tx, ty, iy; + mp_word _W; + mp_digit *tmpy; + + /* clear the column accumulator */ + _W = 0; + + /* get offsets into the two bignums */ + ty = MIN(a->used-1, ix); + tx = ix - ty; + + /* setup temp aliases */ + tmpx = a->dp + tx; + tmpy = a->dp + ty; + + /* this is the number of times the loop will iterate, essentially + while (tx++ < a->used && ty-- >= 0) { ... } + */ + iy = MIN(a->used-tx, ty+1); + + /* now for squaring tx can never equal ty + * we halve the distance since they approach at a rate of 2x + * and we have to round because odd cases need to be executed + */ + iy = MIN(iy, (ty-tx+1)>>1); + + /* execute loop */ + for (iz = 0; iz < iy; iz++) { + _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--); + } + + /* double the inner product and add carry */ + _W = _W + _W + W1; + + /* even columns have the square term in them */ + if ((ix&1) == 0) { + _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]); + } + + /* store it */ + W[ix] = (mp_digit)(_W & MP_MASK); + + /* make next carry */ + W1 = _W >> ((mp_word)DIGIT_BIT); + } + + /* setup dest */ + olduse = b->used; + b->used = a->used+a->used; + + { + mp_digit *tmpb; + tmpb = b->dp; + for (ix = 0; ix < pa; ix++) { + *tmpb++ = W[ix] & MP_MASK; + } + + /* clear unused digits [that existed in the old copy of b] */ + for (; ix < olduse; ix++) { + *tmpb++ = 0; + } + } + mp_clamp (b); + return MP_OKAY; +} +#endif /* BN_FAST_S_MP_SQR_C */ diff --git a/wpa_supplicant/Makefile b/wpa_supplicant/Makefile index d1c6a0883..53850d8f4 100644 --- a/wpa_supplicant/Makefile +++ b/wpa_supplicant/Makefile @@ -624,6 +624,9 @@ CFLAGS += -DCONFIG_INTERNAL_LIBTOMMATH ifdef 
CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD CFLAGS += -DLTM_FAST_EXPTMOD endif +ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR +CFLAGS += -DLTM_FAST_SQR +endif else LIBS += -ltommath LIBS_p += -ltommath diff --git a/wpa_supplicant/defconfig b/wpa_supplicant/defconfig index 80e5261bb..ad7cf7420 100644 --- a/wpa_supplicant/defconfig +++ b/wpa_supplicant/defconfig @@ -307,9 +307,13 @@ CONFIG_PEERKEY=y #LIBS += -L$(LTM_PATH) #LIBS_p += -L$(LTM_PATH) #endif -# Add a cost of about 2.5 kB of additional cost, the internal LibTomMath can be -# configured to include fast exptmod routine to speed up DH and RSA. +# At the cost of about 2.5 kB of additional binary size, the internal LibTomMath +# can be configured to include fast exptmod routine to speed up DH and RSA. #CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD=y +# At the cost of about 0.5 kB of additional binary size, the internal +# LibTomMath can be configured to include faster sqr routine to speed up DH and +# RSA. #CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y # Include NDIS event processing through WMI into wpa_supplicant/wpasvc. # This is only for Windows builds and requires WMI-related header files and