Add faster, optional sqr routine for internal LibTomMath
At the cost of about 0.5 kB of additional binary size, the internal LibTomMath can be configured to include a faster sqr routine to speed up DH and RSA. This can be enabled with CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y in .config.
This commit is contained in:
parent
4fba48a5a7
commit
c5f5c91aeb
3 changed files with 113 additions and 2 deletions
|
@ -36,6 +36,11 @@
|
||||||
#define BN_MP_MUL_2_C
|
#define BN_MP_MUL_2_C
|
||||||
#endif /* LTM_FAST_EXPTMOD */
|
#endif /* LTM_FAST_EXPTMOD */
|
||||||
|
|
||||||
|
#ifdef LTM_FAST_SQR
|
||||||
|
/* Include faster sqr at the cost of about 0.5 kB in code */
|
||||||
|
#define BN_FAST_S_MP_SQR_C
|
||||||
|
#endif /* LTM_FAST_SQR */
|
||||||
|
|
||||||
/* Current uses do not require support for negative exponent in exptmod, so we
|
/* Current uses do not require support for negative exponent in exptmod, so we
|
||||||
* can save about 1.5 kB in leaving out invmod. */
|
* can save about 1.5 kB in leaving out invmod. */
|
||||||
#define LTM_NO_NEG_EXP
|
#define LTM_NO_NEG_EXP
|
||||||
|
@ -153,6 +158,9 @@ static int mp_init_size(mp_int * a, int size);
|
||||||
#ifdef BN_MP_EXPTMOD_FAST_C
|
#ifdef BN_MP_EXPTMOD_FAST_C
|
||||||
static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
|
static int mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode);
|
||||||
#endif /* BN_MP_EXPTMOD_FAST_C */
|
#endif /* BN_MP_EXPTMOD_FAST_C */
|
||||||
|
#ifdef BN_FAST_S_MP_SQR_C
|
||||||
|
static int fast_s_mp_sqr (mp_int * a, mp_int * b);
|
||||||
|
#endif /* BN_FAST_S_MP_SQR_C */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -2988,3 +2996,99 @@ LBL_M:
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef BN_FAST_S_MP_SQR_C
|
||||||
|
/* the jist of squaring...
|
||||||
|
* you do like mult except the offset of the tmpx [one that
|
||||||
|
* starts closer to zero] can't equal the offset of tmpy.
|
||||||
|
* So basically you set up iy like before then you min it with
|
||||||
|
* (ty-tx) so that it never happens. You double all those
|
||||||
|
* you add in the inner loop
|
||||||
|
|
||||||
|
After that loop you do the squares and add them in.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static int fast_s_mp_sqr (mp_int * a, mp_int * b)
|
||||||
|
{
|
||||||
|
int olduse, res, pa, ix, iz;
|
||||||
|
mp_digit W[MP_WARRAY], *tmpx;
|
||||||
|
mp_word W1;
|
||||||
|
|
||||||
|
/* grow the destination as required */
|
||||||
|
pa = a->used + a->used;
|
||||||
|
if (b->alloc < pa) {
|
||||||
|
if ((res = mp_grow (b, pa)) != MP_OKAY) {
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* number of output digits to produce */
|
||||||
|
W1 = 0;
|
||||||
|
for (ix = 0; ix < pa; ix++) {
|
||||||
|
int tx, ty, iy;
|
||||||
|
mp_word _W;
|
||||||
|
mp_digit *tmpy;
|
||||||
|
|
||||||
|
/* clear counter */
|
||||||
|
_W = 0;
|
||||||
|
|
||||||
|
/* get offsets into the two bignums */
|
||||||
|
ty = MIN(a->used-1, ix);
|
||||||
|
tx = ix - ty;
|
||||||
|
|
||||||
|
/* setup temp aliases */
|
||||||
|
tmpx = a->dp + tx;
|
||||||
|
tmpy = a->dp + ty;
|
||||||
|
|
||||||
|
/* this is the number of times the loop will iterrate, essentially
|
||||||
|
while (tx++ < a->used && ty-- >= 0) { ... }
|
||||||
|
*/
|
||||||
|
iy = MIN(a->used-tx, ty+1);
|
||||||
|
|
||||||
|
/* now for squaring tx can never equal ty
|
||||||
|
* we halve the distance since they approach at a rate of 2x
|
||||||
|
* and we have to round because odd cases need to be executed
|
||||||
|
*/
|
||||||
|
iy = MIN(iy, (ty-tx+1)>>1);
|
||||||
|
|
||||||
|
/* execute loop */
|
||||||
|
for (iz = 0; iz < iy; iz++) {
|
||||||
|
_W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* double the inner product and add carry */
|
||||||
|
_W = _W + _W + W1;
|
||||||
|
|
||||||
|
/* even columns have the square term in them */
|
||||||
|
if ((ix&1) == 0) {
|
||||||
|
_W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* store it */
|
||||||
|
W[ix] = (mp_digit)(_W & MP_MASK);
|
||||||
|
|
||||||
|
/* make next carry */
|
||||||
|
W1 = _W >> ((mp_word)DIGIT_BIT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* setup dest */
|
||||||
|
olduse = b->used;
|
||||||
|
b->used = a->used+a->used;
|
||||||
|
|
||||||
|
{
|
||||||
|
mp_digit *tmpb;
|
||||||
|
tmpb = b->dp;
|
||||||
|
for (ix = 0; ix < pa; ix++) {
|
||||||
|
*tmpb++ = W[ix] & MP_MASK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* clear unused digits [that existed in the old copy of c] */
|
||||||
|
for (; ix < olduse; ix++) {
|
||||||
|
*tmpb++ = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mp_clamp (b);
|
||||||
|
return MP_OKAY;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
|
@ -624,6 +624,9 @@ CFLAGS += -DCONFIG_INTERNAL_LIBTOMMATH
|
||||||
ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD
|
ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD
|
||||||
CFLAGS += -DLTM_FAST_EXPTMOD
|
CFLAGS += -DLTM_FAST_EXPTMOD
|
||||||
endif
|
endif
|
||||||
|
ifdef CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR
|
||||||
|
CFLAGS += -DLTM_FAST_SQR
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
LIBS += -ltommath
|
LIBS += -ltommath
|
||||||
LIBS_p += -ltommath
|
LIBS_p += -ltommath
|
||||||
|
|
|
@ -307,9 +307,13 @@ CONFIG_PEERKEY=y
|
||||||
#LIBS += -L$(LTM_PATH)
|
#LIBS += -L$(LTM_PATH)
|
||||||
#LIBS_p += -L$(LTM_PATH)
|
#LIBS_p += -L$(LTM_PATH)
|
||||||
#endif
|
#endif
|
||||||
# Add a cost of about 2.5 kB of additional cost, the internal LibTomMath can be
|
# At the cost of about 2.5 kB of additional binary size, the internal LibTomMath
|
||||||
# configured to include fast exptmod routine to speed up DH and RSA.
|
# can be configured to include fast exptmod routine to speed up DH and RSA.
|
||||||
#CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD=y
|
#CONFIG_INTERNAL_LIBTOMMATH_FAST_EXPTMOD=y
|
||||||
|
# At the cost of about 0.5 kB of additional binary size, the internal
|
||||||
|
# LibTomMath can be configured to include a faster sqr routine to speed up DH and
|
||||||
|
# RSA.
|
||||||
|
#CONFIG_INTERNAL_LIBTOMMATH_FAST_SQR=y
|
||||||
|
|
||||||
# Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
|
# Include NDIS event processing through WMI into wpa_supplicant/wpasvc.
|
||||||
# This is only for Windows builds and requires WMI-related header files and
|
# This is only for Windows builds and requires WMI-related header files and
|
||||||
|
|
Loading…
Reference in a new issue