3 * Multi Precision Integer functions
5 * Copyright 2004 Michael Jung
6 * Based on public domain code by Tom St Denis (tomstdenis@iahu.ca)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
24 * This file contains code from the LibTomCrypt cryptographic
25 * library written by Tom St Denis (tomstdenis@iahu.ca). LibTomCrypt
26 * is in the public domain. The code in this file is tailored to
27 * special requirements. Take a look at http://libtomcrypt.org for the
34 /* computes the modular inverse via binary extended euclidean algorithm,
35 * that is c = 1/a mod b
37 * Based on slow invmod except this is optimized for the case where b is
38 * odd as per HAC Note 14.64 on pp. 610
41 fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
43 mp_int x, y, u, v, B, D;
46 /* 2. [modified] b must be odd */
47 if (mp_iseven (b) == 1) {
51 /* init all our temps */
52 if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
56 /* x == modulus, y == value to invert */
57 if ((res = mp_copy (b, &x)) != MP_OKAY) {
62 if ((res = mp_abs (a, &y)) != MP_OKAY) {
66 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
67 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
70 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
76 /* 4. while u is even do */
77 while (mp_iseven (&u) == 1) {
79 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
82 /* 4.2 if B is odd then */
83 if (mp_isodd (&B) == 1) {
84 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
89 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
94 /* 5. while v is even do */
95 while (mp_iseven (&v) == 1) {
97 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
100 /* 5.2 if D is odd then */
101 if (mp_isodd (&D) == 1) {
103 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
108 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
113 /* 6. if u >= v then */
114 if (mp_cmp (&u, &v) != MP_LT) {
115 /* u = u - v, B = B - D */
116 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
120 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
124 /* v - v - u, D = D - B */
125 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
129 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
134 /* if not zero goto step 4 */
135 if (mp_iszero (&u) == 0) {
139 /* now a = C, b = D, gcd == g*v */
141 /* if v != 1 then there is no inverse */
142 if (mp_cmp_d (&v, 1) != MP_EQ) {
147 /* b is now the inverse */
149 while (D.sign == MP_NEG) {
150 if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
158 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
162 /* computes xR**-1 == x (mod N) via Montgomery Reduction
164 * This is an optimized implementation of montgomery_reduce
165 * which uses the comba method to quickly calculate the columns of the
168 * Based on Algorithm 14.32 on pp.601 of HAC.
171 fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
174 mp_word W[MP_WARRAY];
176 /* get old used count */
179 /* grow a as required */
180 if (x->alloc < n->used + 1) {
181 if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
186 /* first we have to get the digits of the input into
187 * an array of double precision words W[...]
190 register mp_word *_W;
191 register mp_digit *tmpx;
193 /* alias for the W[] array */
196 /* alias for the digits of x*/
199 /* copy the digits of a into W[0..a->used-1] */
200 for (ix = 0; ix < x->used; ix++) {
204 /* zero the high words of W[a->used..m->used*2] */
205 for (; ix < n->used * 2 + 1; ix++) {
210 /* now we proceed to zero successive digits
211 * from the least significant upwards
213 for (ix = 0; ix < n->used; ix++) {
214 /* mu = ai * m' mod b
216 * We avoid a double precision multiplication (which isn't required)
217 * by casting the value down to a mp_digit. Note this requires
218 * that W[ix-1] have the carry cleared (see after the inner loop)
220 register mp_digit mu;
221 mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
223 /* a = a + mu * m * b**i
225 * This is computed in place and on the fly. The multiplication
226 * by b**i is handled by offseting which columns the results
229 * Note the comba method normally doesn't handle carries in the
230 * inner loop In this case we fix the carry from the previous
231 * column since the Montgomery reduction requires digits of the
232 * result (so far) [see above] to work. This is
233 * handled by fixing up one carry after the inner loop. The
234 * carry fixups are done in order so after these loops the
235 * first m->used words of W[] have the carries fixed
239 register mp_digit *tmpn;
240 register mp_word *_W;
242 /* alias for the digits of the modulus */
245 /* Alias for the columns set by an offset of ix */
249 for (iy = 0; iy < n->used; iy++) {
250 *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
254 /* now fix carry for next digit, W[ix+1] */
255 W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
258 /* now we have to propagate the carries and
259 * shift the words downward [all those least
260 * significant digits we zeroed].
263 register mp_digit *tmpx;
264 register mp_word *_W, *_W1;
266 /* nox fix rest of carries */
268 /* alias for current word */
271 /* alias for next word, where the carry goes */
274 for (; ix <= n->used * 2 + 1; ix++) {
275 *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
278 /* copy out, A = A/b**n
280 * The result is A/b**n but instead of converting from an
281 * array of mp_word to mp_digit than calling mp_rshd
282 * we just copy them in the right order
285 /* alias for destination word */
288 /* alias for shifted double precision result */
291 for (ix = 0; ix < n->used + 1; ix++) {
292 *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
295 /* zero oldused digits, if the input a was larger than
296 * m->used+1 we'll have to clear the digits
298 for (; ix < olduse; ix++) {
303 /* set the max used and clamp */
304 x->used = n->used + 1;
307 /* if A >= m then A = A - m */
308 if (mp_cmp_mag (x, n) != MP_LT) {
309 return s_mp_sub (x, n, x);
314 /* Fast (comba) multiplier
316 * This is the fast column-array [comba] multiplier. It is
317 * designed to compute the columns of the product first
318 * then handle the carries afterwards. This has the effect
319 * of making the nested loops that compute the columns very
320 * simple and schedulable on super-scalar processors.
322 * This has been modified to produce a variable number of
323 * digits of output so if say only a half-product is required
324 * you don't have to compute the upper half (a feature
325 * required for fast Barrett reduction).
327 * Based on Algorithm 14.12 on pp.595 of HAC.
331 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
333 int olduse, res, pa, ix, iz;
334 mp_digit W[MP_WARRAY];
337 /* grow the destination as required */
338 if (c->alloc < digs) {
339 if ((res = mp_grow (c, digs)) != MP_OKAY) {
344 /* number of output digits to produce */
345 pa = MIN(digs, a->used + b->used);
347 /* clear the carry */
349 for (ix = 0; ix <= pa; ix++) {
352 mp_digit *tmpx, *tmpy;
354 /* get offsets into the two bignums */
355 ty = MIN(b->used-1, ix);
358 /* setup temp aliases */
362 /* this is the number of times the loop will iterrate, essentially its
363 while (tx++ < a->used && ty-- >= 0) { ... }
365 iy = MIN(a->used-tx, ty+1);
368 for (iz = 0; iz < iy; ++iz) {
369 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
373 W[ix] = ((mp_digit)_W) & MP_MASK;
375 /* make next carry */
376 _W = _W >> ((mp_word)DIGIT_BIT);
384 register mp_digit *tmpc;
386 for (ix = 0; ix < digs; ix++) {
387 /* now extract the previous digit [below the carry] */
391 /* clear unused digits [that existed in the old copy of c] */
392 for (; ix < olduse; ix++) {
400 /* this is a modified version of fast_s_mul_digs that only produces
401 * output digits *above* digs. See the comments for fast_s_mul_digs
402 * to see how it works.
404 * This is used in the Barrett reduction since for one of the multiplications
405 * only the higher digits were needed. This essentially halves the work.
407 * Based on Algorithm 14.12 on pp.595 of HAC.
410 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
412 int olduse, res, pa, ix, iz;
413 mp_digit W[MP_WARRAY];
416 /* grow the destination as required */
417 pa = a->used + b->used;
419 if ((res = mp_grow (c, pa)) != MP_OKAY) {
424 /* number of output digits to produce */
425 pa = a->used + b->used;
427 for (ix = digs; ix <= pa; ix++) {
429 mp_digit *tmpx, *tmpy;
431 /* get offsets into the two bignums */
432 ty = MIN(b->used-1, ix);
435 /* setup temp aliases */
439 /* this is the number of times the loop will iterrate, essentially its
440 while (tx++ < a->used && ty-- >= 0) { ... }
442 iy = MIN(a->used-tx, ty+1);
445 for (iz = 0; iz < iy; iz++) {
446 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
450 W[ix] = ((mp_digit)_W) & MP_MASK;
452 /* make next carry */
453 _W = _W >> ((mp_word)DIGIT_BIT);
461 register mp_digit *tmpc;
464 for (ix = digs; ix <= pa; ix++) {
465 /* now extract the previous digit [below the carry] */
469 /* clear unused digits [that existed in the old copy of c] */
470 for (; ix < olduse; ix++) {
480 * This is the comba method where the columns of the product
481 * are computed first then the carries are computed. This
482 * has the effect of making a very simple inner loop that
483 * is executed the most
485 * W2 represents the outer products and W the inner.
487 * A further optimizations is made because the inner
488 * products are of the form "A * B * 2". The *2 part does
489 * not need to be computed until the end which is good
490 * because 64-bit shifts are slow!
492 * Based on Algorithm 14.16 on pp.597 of HAC.
495 /* the jist of squaring...
497 you do like mult except the offset of the tmpx [one that starts closer to zero]
498 can't equal the offset of tmpy. So basically you set up iy like before then you min it with
499 (ty-tx) so that it never happens. You double all those you add in the inner loop
501 After that loop you do the squares and add them in.
503 Remove W2 and don't memset W
507 int fast_s_mp_sqr (mp_int * a, mp_int * b)
509 int olduse, res, pa, ix, iz;
510 mp_digit W[MP_WARRAY], *tmpx;
513 /* grow the destination as required */
514 pa = a->used + a->used;
516 if ((res = mp_grow (b, pa)) != MP_OKAY) {
521 /* number of output digits to produce */
523 for (ix = 0; ix <= pa; ix++) {
531 /* get offsets into the two bignums */
532 ty = MIN(a->used-1, ix);
535 /* setup temp aliases */
539 /* this is the number of times the loop will iterrate, essentially its
540 while (tx++ < a->used && ty-- >= 0) { ... }
542 iy = MIN(a->used-tx, ty+1);
544 /* now for squaring tx can never equal ty
545 * we halve the distance since they approach at a rate of 2x
546 * and we have to round because odd cases need to be executed
548 iy = MIN(iy, (ty-tx+1)>>1);
551 for (iz = 0; iz < iy; iz++) {
552 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
555 /* double the inner product and add carry */
558 /* even columns have the square term in them */
560 _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
566 /* make next carry */
567 W1 = _W >> ((mp_word)DIGIT_BIT);
572 b->used = a->used+a->used;
577 for (ix = 0; ix < pa; ix++) {
578 *tmpb++ = W[ix] & MP_MASK;
581 /* clear unused digits [that existed in the old copy of c] */
582 for (; ix < olduse; ix++) {
592 * Simple algorithm which zeroes the int, grows it then just sets one bit
596 mp_2expt (mp_int * a, int b)
600 /* zero a as per default */
603 /* grow a to accommodate the single bit */
604 if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
608 /* set the used count of where the bit will go */
609 a->used = b / DIGIT_BIT + 1;
611 /* put the single bit in its place */
612 a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
619 * Simple function copies the input and fixes the sign to positive
622 mp_abs (mp_int * a, mp_int * b)
628 if ((res = mp_copy (a, b)) != MP_OKAY) {
633 /* force the sign of b to positive */
639 /* high level addition (handles signs) */
640 int mp_add (mp_int * a, mp_int * b, mp_int * c)
644 /* get sign of both inputs */
648 /* handle two cases, not four */
650 /* both positive or both negative */
651 /* add their magnitudes, copy the sign */
653 res = s_mp_add (a, b, c);
655 /* one positive, the other negative */
656 /* subtract the one with the greater magnitude from */
657 /* the one of the lesser magnitude. The result gets */
658 /* the sign of the one with the greater magnitude. */
659 if (mp_cmp_mag (a, b) == MP_LT) {
661 res = s_mp_sub (b, a, c);
664 res = s_mp_sub (a, b, c);
671 /* single digit addition */
673 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
675 int res, ix, oldused;
676 mp_digit *tmpa, *tmpc, mu;
678 /* grow c as required */
679 if (c->alloc < a->used + 1) {
680 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
685 /* if a is negative and |a| >= b, call c = |a| - b */
686 if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
687 /* temporarily fix sign of a */
691 res = mp_sub_d(a, b, c);
694 a->sign = c->sign = MP_NEG;
699 /* old number of used digits in c */
702 /* sign always positive */
708 /* destination alias */
711 /* if a is positive */
712 if (a->sign == MP_ZPOS) {
713 /* add digit, after this we're propagating
717 mu = *tmpc >> DIGIT_BIT;
720 /* now handle rest of the digits */
721 for (ix = 1; ix < a->used; ix++) {
722 *tmpc = *tmpa++ + mu;
723 mu = *tmpc >> DIGIT_BIT;
726 /* set final carry */
731 c->used = a->used + 1;
733 /* a was negative and |a| < b */
736 /* the result is a single digit */
738 *tmpc++ = b - a->dp[0];
743 /* setup count so the clearing of oldused
744 * can fall through correctly
749 /* now zero to oldused */
750 while (ix++ < oldused) {
758 /* trim unused digits
760 * This is used to ensure that leading zero digits are
761 * trimed and the leading "used" digit will be non-zero
762 * Typically very fast. Also fixes the sign if there
763 * are no more leading digits
766 mp_clamp (mp_int * a)
768 /* decrease used while the most significant digit is
771 while (a->used > 0 && a->dp[a->used - 1] == 0) {
775 /* reset the sign flag if used == 0 */
781 /* clear one (frees) */
783 mp_clear (mp_int * a)
787 /* only do anything if a hasn't been freed previously */
789 /* first zero the digits */
790 for (i = 0; i < a->used; i++) {
797 /* reset members to make debugging easier */
799 a->alloc = a->used = 0;
805 void mp_clear_multi(mp_int *mp, ...)
807 mp_int* next_mp = mp;
810 while (next_mp != NULL) {
812 next_mp = va_arg(args, mp_int*);
817 /* compare two ints (signed)*/
819 mp_cmp (mp_int * a, mp_int * b)
821 /* compare based on sign */
822 if (a->sign != b->sign) {
823 if (a->sign == MP_NEG) {
831 if (a->sign == MP_NEG) {
832 /* if negative compare opposite direction */
833 return mp_cmp_mag(b, a);
835 return mp_cmp_mag(a, b);
839 /* compare a digit */
840 int mp_cmp_d(mp_int * a, mp_digit b)
842 /* compare based on sign */
843 if (a->sign == MP_NEG) {
847 /* compare based on magnitude */
852 /* compare the only digit of a to b */
855 } else if (a->dp[0] < b) {
862 /* compare maginitude of two ints (unsigned) */
863 int mp_cmp_mag (mp_int * a, mp_int * b)
866 mp_digit *tmpa, *tmpb;
868 /* compare based on # of non-zero digits */
869 if (a->used > b->used) {
873 if (a->used < b->used) {
878 tmpa = a->dp + (a->used - 1);
881 tmpb = b->dp + (a->used - 1);
883 /* compare based on digits */
884 for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
896 static const int lnz[16] = {
897 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
900 /* Counts the number of lsbs which are zero before the first zero bit */
901 int mp_cnt_lsb(mp_int *a)
907 if (mp_iszero(a) == 1) {
911 /* scan lower digits until non-zero */
912 for (x = 0; x < a->used && a->dp[x] == 0; x++);
916 /* now scan this digit until a 1 is found */
929 mp_copy (const mp_int * a, mp_int * b)
933 /* if dst == src do nothing */
939 if (b->alloc < a->used) {
940 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
945 /* zero b and copy the parameters over */
947 register mp_digit *tmpa, *tmpb;
949 /* pointer aliases */
957 /* copy all the digits */
958 for (n = 0; n < a->used; n++) {
962 /* clear high digits */
963 for (; n < b->used; n++) {
968 /* copy used count and sign */
974 /* returns the number of bits in an int */
976 mp_count_bits (mp_int * a)
986 /* get number of digits and add that */
987 r = (a->used - 1) * DIGIT_BIT;
989 /* take the last digit and count the bits in it */
990 q = a->dp[a->used - 1];
991 while (q > ((mp_digit) 0)) {
993 q >>= ((mp_digit) 1);
998 /* integer signed division.
999 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1000 * HAC pp.598 Algorithm 14.20
1002 * Note that the description in HAC is horribly
1003 * incomplete. For example, it doesn't consider
1004 * the case where digits are removed from 'x' in
1005 * the inner loop. It also doesn't consider the
1006 * case that y has fewer than three digits, etc..
1008 * The overall algorithm is as described as
1009 * 14.20 from HAC but fixed to treat these cases.
1011 int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
1013 mp_int q, x, y, t1, t2;
1014 int res, n, t, i, norm, neg;
1016 /* is divisor zero ? */
1017 if (mp_iszero (b) == 1) {
1021 /* if a < b then q=0, r = a */
1022 if (mp_cmp_mag (a, b) == MP_LT) {
1024 res = mp_copy (a, d);
1034 if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1037 q.used = a->used + 2;
1039 if ((res = mp_init (&t1)) != MP_OKAY) {
1043 if ((res = mp_init (&t2)) != MP_OKAY) {
1047 if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1051 if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1056 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1057 x.sign = y.sign = MP_ZPOS;
1059 /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1060 norm = mp_count_bits(&y) % DIGIT_BIT;
1061 if (norm < (int)(DIGIT_BIT-1)) {
1062 norm = (DIGIT_BIT-1) - norm;
1063 if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1066 if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1073 /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1077 /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1078 if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1082 while (mp_cmp (&x, &y) != MP_LT) {
1084 if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1089 /* reset y by shifting it back down */
1090 mp_rshd (&y, n - t);
1092 /* step 3. for i from n down to (t + 1) */
1093 for (i = n; i >= (t + 1); i--) {
1098 /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1099 * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1100 if (x.dp[i] == y.dp[t]) {
1101 q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1104 tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1105 tmp |= ((mp_word) x.dp[i - 1]);
1106 tmp /= ((mp_word) y.dp[t]);
1107 if (tmp > (mp_word) MP_MASK)
1109 q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1112 /* while (q{i-t-1} * (yt * b + y{t-1})) >
1113 xi * b**2 + xi-1 * b + xi-2
1117 q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1119 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1121 /* find left hand */
1123 t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1126 if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1130 /* find right hand */
1131 t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1132 t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1135 } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1137 /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1138 if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1142 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1146 if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1150 /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1151 if (x.sign == MP_NEG) {
1152 if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1155 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1158 if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1162 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1166 /* now q is the quotient and x is the remainder
1167 * [which we have to normalize]
1170 /* get sign before writing to c */
1171 x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1180 mp_div_2d (&x, norm, &x, NULL);
1188 __T2:mp_clear (&t2);
1189 __T1:mp_clear (&t1);
1195 int mp_div_2(mp_int * a, mp_int * b)
1197 int x, res, oldused;
1200 if (b->alloc < a->used) {
1201 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1209 register mp_digit r, rr, *tmpa, *tmpb;
1212 tmpa = a->dp + b->used - 1;
1215 tmpb = b->dp + b->used - 1;
1219 for (x = b->used - 1; x >= 0; x--) {
1220 /* get the carry for the next iteration */
1223 /* shift the current digit, add in carry and store */
1224 *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1226 /* forward carry to next iteration */
1230 /* zero excess digits */
1231 tmpb = b->dp + b->used;
1232 for (x = b->used; x < oldused; x++) {
1241 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1242 int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
1249 /* if the shift count is <= 0 then we do no work */
1251 res = mp_copy (a, c);
1258 if ((res = mp_init (&t)) != MP_OKAY) {
1262 /* get the remainder */
1264 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1271 if ((res = mp_copy (a, c)) != MP_OKAY) {
1276 /* shift by as many digits in the bit count */
1277 if (b >= (int)DIGIT_BIT) {
1278 mp_rshd (c, b / DIGIT_BIT);
1281 /* shift any bit count < DIGIT_BIT */
1282 D = (mp_digit) (b % DIGIT_BIT);
1284 register mp_digit *tmpc, mask, shift;
1287 mask = (((mp_digit)1) << D) - 1;
1290 shift = DIGIT_BIT - D;
1293 tmpc = c->dp + (c->used - 1);
1297 for (x = c->used - 1; x >= 0; x--) {
1298 /* get the lower bits of this word in a temp */
1301 /* shift the current word and mix in the carry bits from the previous word */
1302 *tmpc = (*tmpc >> D) | (r << shift);
1305 /* set the carry to the carry bits of the current word found above */
1317 static int s_is_power_of_two(mp_digit b, int *p)
1321 for (x = 1; x < DIGIT_BIT; x++) {
1322 if (b == (((mp_digit)1)<<x)) {
1330 /* single digit division (based on routine from MPI) */
1331 int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
1338 /* cannot divide by zero */
1344 if (b == 1 || mp_iszero(a) == 1) {
1349 return mp_copy(a, c);
1354 /* power of two ? */
1355 if (s_is_power_of_two(b, &ix) == 1) {
1357 *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
1360 return mp_div_2d(a, ix, c, NULL);
1365 /* no easy answer [c'est la vie]. Just division */
1366 if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
1373 for (ix = a->used - 1; ix >= 0; ix--) {
1374 w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
1377 t = (mp_digit)(w / b);
1378 w -= ((mp_word)t) * ((mp_word)b);
1382 q.dp[ix] = (mp_digit)t;
1398 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
1400 * Based on algorithm from the paper
1402 * "Generating Efficient Primes for Discrete Log Cryptosystems"
1403 * Chae Hoon Lim, Pil Loong Lee,
1404 * POSTECH Information Research Laboratories
1406 * The modulus must be of a special format [see manual]
1408 * Has been modified to use algorithm 7.10 from the LTM book instead
1410 * Input x must be in the range 0 <= x <= (n-1)**2
1413 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
1417 mp_digit mu, *tmpx1, *tmpx2;
1419 /* m = digits in modulus */
1422 /* ensure that "x" has at least 2m digits */
1423 if (x->alloc < m + m) {
1424 if ((err = mp_grow (x, m + m)) != MP_OKAY) {
1429 /* top of loop, this is where the code resumes if
1430 * another reduction pass is required.
1433 /* aliases for digits */
1434 /* alias for lower half of x */
1437 /* alias for upper half of x, or x/B**m */
1440 /* set carry to zero */
1443 /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
1444 for (i = 0; i < m; i++) {
1445 r = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
1446 *tmpx1++ = (mp_digit)(r & MP_MASK);
1447 mu = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
1450 /* set final carry */
1453 /* zero words above m */
1454 for (i = m + 1; i < x->used; i++) {
1458 /* clamp, sub and return */
1461 /* if x >= n then subtract and reduce again
1462 * Each successive "recursion" makes the input smaller and smaller.
1464 if (mp_cmp_mag (x, n) != MP_LT) {
1471 /* determines the setup value */
1472 void mp_dr_setup(mp_int *a, mp_digit *d)
1474 /* the casts are required if DIGIT_BIT is one less than
1475 * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
1477 *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
1478 ((mp_word)a->dp[0]));
1481 /* swap the elements of two integers, for cases where you can't simply swap the
1482 * mp_int pointers around
1485 mp_exch (mp_int * a, mp_int * b)
1494 /* this is a shell function that calls either the normal or Montgomery
1495 * exptmod functions. Originally the call to the montgomery code was
1496 * embedded in the normal function but that wasted a lot of stack space
1497 * for nothing (since 99% of the time the Montgomery code would be called)
1499 int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
1503 /* modulus P must be positive */
1504 if (P->sign == MP_NEG) {
1508 /* if exponent X is negative we have to recurse */
1509 if (X->sign == MP_NEG) {
1513 /* first compute 1/G mod P */
1514 if ((err = mp_init(&tmpG)) != MP_OKAY) {
1517 if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
1523 if ((err = mp_init(&tmpX)) != MP_OKAY) {
1527 if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
1528 mp_clear_multi(&tmpG, &tmpX, NULL);
1532 /* and now compute (1/G)**|X| instead of G**X [X < 0] */
1533 err = mp_exptmod(&tmpG, &tmpX, P, Y);
1534 mp_clear_multi(&tmpG, &tmpX, NULL);
1540 /* if the modulus is odd or dr != 0 use the fast method */
1541 if (mp_isodd (P) == 1 || dr != 0) {
1542 return mp_exptmod_fast (G, X, P, Y, dr);
1544 /* otherwise use the generic Barrett reduction technique */
1545 return s_mp_exptmod (G, X, P, Y);
1549 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
1551 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
1552 * The value of k changes based on the size of the exponent.
1554 * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
1558 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
1562 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1564 /* use a pointer to the reduction algorithm. This allows us to use
1565 * one of many reduction algorithms without modding the guts of
1566 * the code with if statements everywhere.
1568 int (*redux)(mp_int*,mp_int*,mp_digit);
1570 /* find window size */
1571 x = mp_count_bits (X);
1574 } else if (x <= 36) {
1576 } else if (x <= 140) {
1578 } else if (x <= 450) {
1580 } else if (x <= 1303) {
1582 } else if (x <= 3529) {
1589 /* init first cell */
1590 if ((err = mp_init(&M[1])) != MP_OKAY) {
1594 /* now init the second half of the array */
1595 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1596 if ((err = mp_init(&M[x])) != MP_OKAY) {
1597 for (y = 1<<(winsize-1); y < x; y++) {
1605 /* determine and setup reduction code */
1607 /* now setup montgomery */
1608 if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
1612 /* automatically pick the comba one if available (saves quite a few calls/ifs) */
1613 if (((P->used * 2 + 1) < MP_WARRAY) &&
1614 P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
1615 redux = fast_mp_montgomery_reduce;
1617 /* use slower baseline Montgomery method */
1618 redux = mp_montgomery_reduce;
1620 } else if (redmode == 1) {
1621 /* setup DR reduction for moduli of the form B**k - b */
1622 mp_dr_setup(P, &mp);
1623 redux = mp_dr_reduce;
1625 /* setup DR reduction for moduli of the form 2**k - b */
1626 if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
1629 redux = mp_reduce_2k;
1633 if ((err = mp_init (&res)) != MP_OKAY) {
1641 * The first half of the table is not computed though accept for M[0] and M[1]
1645 /* now we need R mod m */
1646 if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
1650 /* now set M[1] to G * R mod m */
1651 if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
1656 if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
1661 /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
1662 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
1666 for (x = 0; x < (winsize - 1); x++) {
1667 if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
1670 if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
1675 /* create upper table */
1676 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1677 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
1680 if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
1685 /* set initial mode and bit cnt */
1689 digidx = X->used - 1;
1694 /* grab next digit as required */
1695 if (--bitcnt == 0) {
1696 /* if digidx == -1 we are out of digits so break */
1700 /* read next digit and reset bitcnt */
1701 buf = X->dp[digidx--];
1702 bitcnt = (int)DIGIT_BIT;
1705 /* grab the next msb from the exponent */
1706 y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
1707 buf <<= (mp_digit)1;
1709 /* if the bit is zero and mode == 0 then we ignore it
1710 * These represent the leading zero bits before the first 1 bit
1711 * in the exponent. Technically this opt is not required but it
1712 * does lower the # of trivial squaring/reductions used
1714 if (mode == 0 && y == 0) {
1718 /* if the bit is zero and mode == 1 then we square */
1719 if (mode == 1 && y == 0) {
1720 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1723 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1729 /* else we add it to the window */
1730 bitbuf |= (y << (winsize - ++bitcpy));
1733 if (bitcpy == winsize) {
1734 /* ok window is filled so square as required and multiply */
1736 for (x = 0; x < winsize; x++) {
1737 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1740 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1746 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
1749 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1753 /* empty window and reset */
1760 /* if bits remain then square/multiply */
1761 if (mode == 2 && bitcpy > 0) {
1762 /* square then multiply if the bit is set */
1763 for (x = 0; x < bitcpy; x++) {
1764 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1767 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1771 /* get next bit of the window */
1773 if ((bitbuf & (1 << winsize)) != 0) {
1775 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
1778 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1786 /* fixup result if Montgomery reduction is used
1787 * recall that any value in a Montgomery system is
1788 * actually multiplied by R mod n. So we have
1789 * to reduce one more time to cancel out the factor
1792 if ((err = redux(&res, P, mp)) != MP_OKAY) {
1797 /* swap res with Y */
1800 __RES:mp_clear (&res);
1803 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1809 /* Greatest Common Divisor using the binary method */
1810 int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
1813 int k, u_lsb, v_lsb, res;
1815 /* either zero than gcd is the largest */
1816 if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
1817 return mp_abs (b, c);
1819 if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
1820 return mp_abs (a, c);
1823 /* optimized. At this point if a == 0 then
1824 * b must equal zero too
1826 if (mp_iszero (a) == 1) {
1831 /* get copies of a and b we can modify */
1832 if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
1836 if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
1840 /* must be positive for the remainder of the algorithm */
1841 u.sign = v.sign = MP_ZPOS;
1843 /* B1. Find the common power of two for u and v */
1844 u_lsb = mp_cnt_lsb(&u);
1845 v_lsb = mp_cnt_lsb(&v);
1846 k = MIN(u_lsb, v_lsb);
1849 /* divide the power of two out */
1850 if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
1854 if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
1859 /* divide any remaining factors of two out */
1861 if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
1867 if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
1872 while (mp_iszero(&v) == 0) {
1873 /* make sure v is the largest */
1874 if (mp_cmp_mag(&u, &v) == MP_GT) {
1875 /* swap u and v to make sure v is >= u */
1879 /* subtract smallest from largest */
1880 if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
1884 /* Divide out all factors of two */
1885 if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
1890 /* multiply by 2**k which we divided out at the beginning */
1891 if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
1901 /* get the lower 32-bits of an mp_int */
1902 unsigned long mp_get_int(mp_int * a)
1911 /* get number of digits of the lsb we have to read */
1912 i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
1914 /* get most significant digit of result */
1918 res = (res << DIGIT_BIT) | DIGIT(a,i);
1921 /* force result to 32-bits always so it is consistent on non 32-bit platforms */
1922 return res & 0xFFFFFFFFUL;
1925 /* grow as required */
1926 int mp_grow (mp_int * a, int size)
1931 /* if the alloc size is smaller alloc more ram */
1932 if (a->alloc < size) {
1933 /* ensure there are always at least MP_PREC digits extra on top */
1934 size += (MP_PREC * 2) - (size % MP_PREC);
1936 /* reallocate the array a->dp
1938 * We store the return in a temporary variable
1939 * in case the operation failed we don't want
1940 * to overwrite the dp member of a.
1942 tmp = realloc (a->dp, sizeof (mp_digit) * size);
1944 /* reallocation failed but "a" is still valid [can be freed] */
1948 /* reallocation succeeded so set a->dp */
1951 /* zero excess digits */
1954 for (; i < a->alloc; i++) {
1961 /* init a new mp_int */
1962 int mp_init (mp_int * a)
1966 /* allocate memory required and clear it */
1967 a->dp = malloc (sizeof (mp_digit) * MP_PREC);
1968 if (a->dp == NULL) {
1972 /* set the digits to zero */
1973 for (i = 0; i < MP_PREC; i++) {
1977 /* set the used to zero, allocated digits to the default precision
1978 * and sign to positive */
1986 /* creates "a" then copies b into it */
1987 int mp_init_copy (mp_int * a, const mp_int * b)
1991 if ((res = mp_init (a)) != MP_OKAY) {
1994 return mp_copy (b, a);
1997 int mp_init_multi(mp_int *mp, ...)
1999 mp_err res = MP_OKAY; /* Assume ok until proven otherwise */
2000 int n = 0; /* Number of ok inits */
2001 mp_int* cur_arg = mp;
2004 va_start(args, mp); /* init args to next argument from caller */
2005 while (cur_arg != NULL) {
2006 if (mp_init(cur_arg) != MP_OKAY) {
2007 /* Oops - error! Back-track and mp_clear what we already
2008 succeeded in init-ing, then return error.
2012 /* end the current list */
2015 /* now start cleaning up */
2017 va_start(clean_args, mp);
2020 cur_arg = va_arg(clean_args, mp_int*);
2027 cur_arg = va_arg(args, mp_int*);
2030 return res; /* Assumed ok, if error flagged above. */
2033 /* init an mp_init for a given size */
2034 int mp_init_size (mp_int * a, int size)
2038 /* pad size so there are always extra digits */
2039 size += (MP_PREC * 2) - (size % MP_PREC);
2042 a->dp = malloc (sizeof (mp_digit) * size);
2043 if (a->dp == NULL) {
2047 /* set the members */
2052 /* zero the digits */
2053 for (x = 0; x < size; x++) {
2060 /* hac 14.61, pp608 */
2061 int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
2063 /* b cannot be negative */
2064 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2068 /* if the modulus is odd we can use a faster routine instead */
2069 if (mp_isodd (b) == 1) {
2070 return fast_mp_invmod (a, b, c);
2073 return mp_invmod_slow(a, b, c);
2076 /* hac 14.61, pp608 */
2077 int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
2079 mp_int x, y, u, v, A, B, C, D;
2082 /* b cannot be negative */
2083 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2088 if ((res = mp_init_multi(&x, &y, &u, &v,
2089 &A, &B, &C, &D, NULL)) != MP_OKAY) {
2094 if ((res = mp_copy (a, &x)) != MP_OKAY) {
2097 if ((res = mp_copy (b, &y)) != MP_OKAY) {
2101 /* 2. [modified] if x,y are both even then return an error! */
2102 if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
2107 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
2108 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
2111 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
2118 /* 4. while u is even do */
2119 while (mp_iseven (&u) == 1) {
2121 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
2124 /* 4.2 if A or B is odd then */
2125 if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
2126 /* A = (A+y)/2, B = (B-x)/2 */
2127 if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
2130 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
2134 /* A = A/2, B = B/2 */
2135 if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
2138 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
2143 /* 5. while v is even do */
2144 while (mp_iseven (&v) == 1) {
2146 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
2149 /* 5.2 if C or D is odd then */
2150 if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
2151 /* C = (C+y)/2, D = (D-x)/2 */
2152 if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
2155 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
2159 /* C = C/2, D = D/2 */
2160 if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
2163 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
2168 /* 6. if u >= v then */
2169 if (mp_cmp (&u, &v) != MP_LT) {
2170 /* u = u - v, A = A - C, B = B - D */
2171 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
2175 if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
2179 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
2183 /* v - v - u, C = C - A, D = D - B */
2184 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
2188 if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
2192 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
2197 /* if not zero goto step 4 */
2198 if (mp_iszero (&u) == 0)
2201 /* now a = C, b = D, gcd == g*v */
2203 /* if v != 1 then there is no inverse */
2204 if (mp_cmp_d (&v, 1) != MP_EQ) {
2209 /* if its too low */
2210 while (mp_cmp_d(&C, 0) == MP_LT) {
2211 if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
2217 while (mp_cmp_mag(&C, b) != MP_LT) {
2218 if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
2223 /* C is now the inverse */
2226 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
2230 /* c = |a| * |b| using Karatsuba Multiplication using
2231 * three half size multiplications
2233 * Let B represent the radix [e.g. 2**DIGIT_BIT] and
2234 * let n represent half of the number of digits in
2237 * a = a1 * B**n + a0
2238 * b = b1 * B**n + b0
2241 a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
2243 * Note that a1b1 and a0b0 are used twice and only need to be
2244 * computed once. So in total three half size (half # of
2245 * digit) multiplications are performed, a0b0, a1b1 and
2248 * Note that a multiplication of half the digits requires
2249 * 1/4th the number of single precision multiplications so in
2250 * total after one call 25% of the single precision multiplications
2251 * are saved. Note also that the call to mp_mul can end up back
2252 * in this function if the a0, a1, b0, or b1 are above the threshold.
2253 * This is known as divide-and-conquer and leads to the famous
2254 * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than
2255 * the standard O(N**2) that the baseline/comba methods use.
2256 * Generally though the overhead of this method doesn't pay off
2257 * until a certain size (N ~ 80) is reached.
2259 int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
2261 mp_int x0, x1, y0, y1, t1, x0y0, x1y1;
2264 /* default the return code to an error */
2267 /* min # of digits */
2268 B = MIN (a->used, b->used);
2270 /* now divide in two */
2273 /* init copy all the temps */
2274 if (mp_init_size (&x0, B) != MP_OKAY)
2276 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2278 if (mp_init_size (&y0, B) != MP_OKAY)
2280 if (mp_init_size (&y1, b->used - B) != MP_OKAY)
2284 if (mp_init_size (&t1, B * 2) != MP_OKAY)
2286 if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
2288 if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
2291 /* now shift the digits */
2292 x0.used = y0.used = B;
2293 x1.used = a->used - B;
2294 y1.used = b->used - B;
2298 register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
2300 /* we copy the digits directly instead of using higher level functions
2301 * since we also need to shift the digits
2308 for (x = 0; x < B; x++) {
2314 for (x = B; x < a->used; x++) {
2319 for (x = B; x < b->used; x++) {
2324 /* only need to clamp the lower words since by definition the
2325 * upper words x1/y1 must have a known number of digits
2330 /* now calc the products x0y0 and x1y1 */
2331 /* after this x0 is no longer required, free temp [x0==t2]! */
2332 if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
2333 goto X1Y1; /* x0y0 = x0*y0 */
2334 if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
2335 goto X1Y1; /* x1y1 = x1*y1 */
2337 /* now calc x1-x0 and y1-y0 */
2338 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2339 goto X1Y1; /* t1 = x1 - x0 */
2340 if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
2341 goto X1Y1; /* t2 = y1 - y0 */
2342 if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
2343 goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */
2346 if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
2347 goto X1Y1; /* t2 = x0y0 + x1y1 */
2348 if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
2349 goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
2352 if (mp_lshd (&t1, B) != MP_OKAY)
2353 goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
2354 if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
2355 goto X1Y1; /* x1y1 = x1y1 << 2*B */
2357 if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
2358 goto X1Y1; /* t1 = x0y0 + t1 */
2359 if (mp_add (&t1, &x1y1, c) != MP_OKAY)
2360 goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
2362 /* Algorithm succeeded set the return code to MP_OKAY */
2365 X1Y1:mp_clear (&x1y1);
2366 X0Y0:mp_clear (&x0y0);
2376 /* Karatsuba squaring, computes b = a*a using three
2377 * half size squarings
2379 * See comments of karatsuba_mul for details. It
2380 * is essentially the same algorithm but merely
2381 * tuned to perform recursive squarings.
2383 int mp_karatsuba_sqr (mp_int * a, mp_int * b)
2385 mp_int x0, x1, t1, t2, x0x0, x1x1;
2390 /* min # of digits */
2393 /* now divide in two */
2396 /* init copy all the temps */
2397 if (mp_init_size (&x0, B) != MP_OKAY)
2399 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2403 if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
2405 if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
2407 if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
2409 if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
2414 register mp_digit *dst, *src;
2418 /* now shift the digits */
2420 for (x = 0; x < B; x++) {
2425 for (x = B; x < a->used; x++) {
2431 x1.used = a->used - B;
2435 /* now calc the products x0*x0 and x1*x1 */
2436 if (mp_sqr (&x0, &x0x0) != MP_OKAY)
2437 goto X1X1; /* x0x0 = x0*x0 */
2438 if (mp_sqr (&x1, &x1x1) != MP_OKAY)
2439 goto X1X1; /* x1x1 = x1*x1 */
2441 /* now calc (x1-x0)**2 */
2442 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2443 goto X1X1; /* t1 = x1 - x0 */
2444 if (mp_sqr (&t1, &t1) != MP_OKAY)
2445 goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */
2448 if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
2449 goto X1X1; /* t2 = x0x0 + x1x1 */
2450 if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
2451 goto X1X1; /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
2454 if (mp_lshd (&t1, B) != MP_OKAY)
2455 goto X1X1; /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
2456 if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
2457 goto X1X1; /* x1x1 = x1x1 << 2*B */
2459 if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
2460 goto X1X1; /* t1 = x0x0 + t1 */
2461 if (mp_add (&t1, &x1x1, b) != MP_OKAY)
2462 goto X1X1; /* t1 = x0x0 + t1 + x1x1 */
2466 X1X1:mp_clear (&x1x1);
2467 X0X0:mp_clear (&x0x0);
2476 /* computes least common multiple as |a*b|/(a, b) */
2477 int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
2483 if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
2487 /* t1 = get the GCD of the two inputs */
2488 if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
2492 /* divide the smallest by the GCD */
2493 if (mp_cmp_mag(a, b) == MP_LT) {
2494 /* store quotient in t2 such that t2 * b is the LCM */
2495 if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
2498 res = mp_mul(b, &t2, c);
2500 /* store quotient in t2 such that t2 * a is the LCM */
2501 if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
2504 res = mp_mul(a, &t2, c);
2507 /* fix the sign to positive */
2511 mp_clear_multi (&t1, &t2, NULL);
2515 /* shift left a certain amount of digits */
2516 int mp_lshd (mp_int * a, int b)
2520 /* if its less than zero return */
2525 /* grow to fit the new digits */
2526 if (a->alloc < a->used + b) {
2527 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
2533 register mp_digit *top, *bottom;
2535 /* increment the used by the shift amount then copy upwards */
2539 top = a->dp + a->used - 1;
2542 bottom = a->dp + a->used - 1 - b;
2544 /* much like mp_rshd this is implemented using a sliding window
2545 * except the window goes the otherway around. Copying from
2546 * the bottom to the top. see bn_mp_rshd.c for more info.
2548 for (x = a->used - 1; x >= b; x--) {
2552 /* zero the lower digits */
2554 for (x = 0; x < b; x++) {
2561 /* c = a mod b, 0 <= c < b */
2563 mp_mod (mp_int * a, mp_int * b, mp_int * c)
2568 if ((res = mp_init (&t)) != MP_OKAY) {
2572 if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
2577 if (t.sign != b->sign) {
2578 res = mp_add (b, &t, c);
2588 /* calc a value mod 2**b */
2590 mp_mod_2d (mp_int * a, int b, mp_int * c)
2594 /* if b is <= 0 then zero the int */
2600 /* if the modulus is larger than the value than return */
2601 if (b > (int) (a->used * DIGIT_BIT)) {
2602 res = mp_copy (a, c);
2607 if ((res = mp_copy (a, c)) != MP_OKAY) {
2611 /* zero digits above the last digit of the modulus */
2612 for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
2615 /* clear the digit that is not completely outside/inside the modulus */
2616 c->dp[b / DIGIT_BIT] &=
2617 (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
2623 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
2625 return mp_div_d(a, b, NULL, c);
2629 * shifts with subtractions when the result is greater than b.
2631 * The method is slightly modified to shift B unconditionally up to just under
2632 * the leading bit of b. This saves a lot of multiple precision shifting.
2634 int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
2638 /* how many bits of last digit does b use */
2639 bits = mp_count_bits (b) % DIGIT_BIT;
2643 if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2652 /* now compute C = A * B mod b */
2653 for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
2654 if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2657 if (mp_cmp_mag (a, b) != MP_LT) {
2658 if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2667 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
2669 mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
2674 /* can the fast reduction [comba] method be used?
2676 * Note that unlike in mul you're safely allowed *less*
2677 * than the available columns [255 per default] since carries
2678 * are fixed up in the inner loop.
2680 digs = n->used * 2 + 1;
2681 if ((digs < MP_WARRAY) &&
2683 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2684 return fast_mp_montgomery_reduce (x, n, rho);
2687 /* grow the input as required */
2688 if (x->alloc < digs) {
2689 if ((res = mp_grow (x, digs)) != MP_OKAY) {
2695 for (ix = 0; ix < n->used; ix++) {
2696 /* mu = ai * rho mod b
2698 * The value of rho must be precalculated via
2699 * montgomery_setup() such that
2700 * it equals -1/n0 mod b this allows the
2701 * following inner loop to reduce the
2702 * input one digit at a time
2704 mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
2706 /* a = a + mu * m * b**i */
2709 register mp_digit *tmpn, *tmpx, u;
2712 /* alias for digits of the modulus */
2715 /* alias for the digits of x [the input] */
2718 /* set the carry to zero */
2721 /* Multiply and add in place */
2722 for (iy = 0; iy < n->used; iy++) {
2723 /* compute product and sum */
2724 r = ((mp_word)mu) * ((mp_word)*tmpn++) +
2725 ((mp_word) u) + ((mp_word) * tmpx);
2728 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2731 *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
2733 /* At this point the ix'th digit of x should be zero */
2736 /* propagate carries upwards as required*/
2739 u = *tmpx >> DIGIT_BIT;
2745 /* at this point the n.used'th least
2746 * significant digits of x are all zero
2747 * which means we can shift x to the
2748 * right by n.used digits and the
2749 * residue is unchanged.
2752 /* x = x/b**n.used */
2754 mp_rshd (x, n->used);
2756 /* if x >= n then x = x - n */
2757 if (mp_cmp_mag (x, n) != MP_LT) {
2758 return s_mp_sub (x, n, x);
2764 /* setups the montgomery reduction stuff */
2766 mp_montgomery_setup (mp_int * n, mp_digit * rho)
2770 /* fast inversion mod 2**k
2772 * Based on the fact that
2774 * XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
2775 * => 2*X*A - X*X*A*A = 1
2776 * => 2*(1) - (1) = 1
2784 x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
2785 x *= 2 - b * x; /* here x*a==1 mod 2**8 */
2786 x *= 2 - b * x; /* here x*a==1 mod 2**16 */
2787 x *= 2 - b * x; /* here x*a==1 mod 2**32 */
2789 /* rho = -1/m mod b */
2790 *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
2795 /* high level multiplication (handles sign) */
2796 int mp_mul (mp_int * a, mp_int * b, mp_int * c)
2799 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2801 /* use Karatsuba? */
2802 if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
2803 res = mp_karatsuba_mul (a, b, c);
2806 /* can we use the fast multiplier?
2808 * The fast multiplier can be used if the output will
2809 * have less than MP_WARRAY digits and the number of
2810 * digits won't affect carry propagation
2812 int digs = a->used + b->used + 1;
2814 if ((digs < MP_WARRAY) &&
2815 MIN(a->used, b->used) <=
2816 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2817 res = fast_s_mp_mul_digs (a, b, c, digs);
2819 res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
2821 c->sign = (c->used > 0) ? neg : MP_ZPOS;
2826 int mp_mul_2(mp_int * a, mp_int * b)
2828 int x, res, oldused;
2830 /* grow to accommodate result */
2831 if (b->alloc < a->used + 1) {
2832 if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2841 register mp_digit r, rr, *tmpa, *tmpb;
2843 /* alias for source */
2846 /* alias for dest */
2851 for (x = 0; x < a->used; x++) {
2853 /* get what will be the *next* carry bit from the
2854 * MSB of the current digit
2856 rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2858 /* now shift up this digit, add in the carry [from the previous] */
2859 *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2861 /* copy the carry that would be from the source
2862 * digit into the next iteration
2867 /* new leading digit? */
2869 /* add a MSB which is always 1 at this point */
2874 /* now zero any excess digits on the destination
2875 * that we didn't write to
2877 tmpb = b->dp + b->used;
2878 for (x = b->used; x < oldused; x++) {
2886 /* shift left by a certain bit count */
2887 int mp_mul_2d (mp_int * a, int b, mp_int * c)
2894 if ((res = mp_copy (a, c)) != MP_OKAY) {
2899 if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
2900 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
2905 /* shift by as many digits in the bit count */
2906 if (b >= (int)DIGIT_BIT) {
2907 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
2912 /* shift any bit count < DIGIT_BIT */
2913 d = (mp_digit) (b % DIGIT_BIT);
2915 register mp_digit *tmpc, shift, mask, r, rr;
2918 /* bitmask for carries */
2919 mask = (((mp_digit)1) << d) - 1;
2921 /* shift for msbs */
2922 shift = DIGIT_BIT - d;
2929 for (x = 0; x < c->used; x++) {
2930 /* get the higher bits of the current word */
2931 rr = (*tmpc >> shift) & mask;
2933 /* shift the current word and OR in the carry */
2934 *tmpc = ((*tmpc << d) | r) & MP_MASK;
2937 /* set the carry to the carry bits of the current word */
2941 /* set final carry */
2943 c->dp[(c->used)++] = r;
2950 /* multiply by a digit */
2952 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
2954 mp_digit u, *tmpa, *tmpc;
2956 int ix, res, olduse;
2958 /* make sure c is big enough to hold a*b */
2959 if (c->alloc < a->used + 1) {
2960 if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
2965 /* get the original destinations used count */
2971 /* alias for a->dp [source] */
2974 /* alias for c->dp [dest] */
2980 /* compute columns */
2981 for (ix = 0; ix < a->used; ix++) {
2982 /* compute product and carry sum for this term */
2983 r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
2985 /* mask off higher bits to get a single digit */
2986 *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
2988 /* send carry into next iteration */
2989 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
2992 /* store final carry [if any] */
2995 /* now zero digits above the top */
2996 while (ix++ < olduse) {
3000 /* set used count */
3001 c->used = a->used + 1;
3007 /* d = a * b (mod c) */
3009 mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
3014 if ((res = mp_init (&t)) != MP_OKAY) {
3018 if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
3022 res = mp_mod (&t, c, d);
3027 /* determines if an integers is divisible by one
3028 * of the first PRIME_SIZE primes or not
3030 * sets result to 0 if not, 1 if yes
3032 int mp_prime_is_divisible (mp_int * a, int *result)
3037 /* default to not */
3040 for (ix = 0; ix < PRIME_SIZE; ix++) {
3041 /* what is a mod __prime_tab[ix] */
3042 if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
3046 /* is the residue zero? */
3056 /* performs a variable number of rounds of Miller-Rabin
3058 * Probability of error after t rounds is no more than
3061 * Sets result to 1 if probably prime, 0 otherwise
3063 int mp_prime_is_prime (mp_int * a, int t, int *result)
3071 /* valid value of t? */
3072 if (t <= 0 || t > PRIME_SIZE) {
3076 /* is the input equal to one of the primes in the table? */
3077 for (ix = 0; ix < PRIME_SIZE; ix++) {
3078 if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
3084 /* first perform trial division */
3085 if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
3089 /* return if it was trivially divisible */
3090 if (res == MP_YES) {
3094 /* now perform the miller-rabin rounds */
3095 if ((err = mp_init (&b)) != MP_OKAY) {
3099 for (ix = 0; ix < t; ix++) {
3101 mp_set (&b, __prime_tab[ix]);
3103 if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
3112 /* passed the test */
3118 /* Miller-Rabin test of "a" to the base of "b" as described in
3119 * HAC pp. 139 Algorithm 4.24
3121 * Sets result to 0 if definitely composite or 1 if probably prime.
3122 * Randomly the chance of error is no more than 1/4 and often
3125 int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
3134 if (mp_cmp_d(b, 1) != MP_GT) {
3138 /* get n1 = a - 1 */
3139 if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
3142 if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
3146 /* set 2**s * r = n1 */
3147 if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
3151 /* count the number of least significant bits
3156 /* now divide n - 1 by 2**s */
3157 if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
3161 /* compute y = b**r mod a */
3162 if ((err = mp_init (&y)) != MP_OKAY) {
3165 if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
3169 /* if y != 1 and y != n1 do */
3170 if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
3172 /* while j <= s-1 and y != n1 */
3173 while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
3174 if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
3178 /* if y == 1 then composite */
3179 if (mp_cmp_d (&y, 1) == MP_EQ) {
3186 /* if y != n1 then composite */
3187 if (mp_cmp (&y, &n1) != MP_EQ) {
3192 /* probably prime now */
3196 __N1:mp_clear (&n1);
3200 static const struct {
3213 /* returns # of RM trials required for a given bit size */
3214 int mp_prime_rabin_miller_trials(int size)
3218 for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
3219 if (sizes[x].k == size) {
3221 } else if (sizes[x].k > size) {
3222 return (x == 0) ? sizes[0].t : sizes[x - 1].t;
3225 return sizes[x-1].t + 1;
3228 /* makes a truly random prime of a given size (bits),
3230 * Flags are as follows:
3232 * LTM_PRIME_BBS - make prime congruent to 3 mod 4
3233 * LTM_PRIME_SAFE - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
3234 * LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
3235 * LTM_PRIME_2MSB_ON - make the 2nd highest bit one
3237 * You have to supply a callback which fills in a buffer with random bytes. "dat" is a parameter you can
3238 * have passed to the callback (e.g. a state or something). This function doesn't use "dat" itself
3243 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
3244 int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
3246 unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
3247 int res, err, bsize, maskOR_msb_offset;
3249 /* sanity check the input */
3250 if (size <= 1 || t <= 0) {
3254 /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
3255 if (flags & LTM_PRIME_SAFE) {
3256 flags |= LTM_PRIME_BBS;
3259 /* calc the byte size */
3260 bsize = (size>>3)+((size&7)?1:0);
3262 /* we need a buffer of bsize bytes */
3263 tmp = malloc(bsize);
3268 /* calc the maskAND value for the MSbyte*/
3269 maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
3271 /* calc the maskOR_msb */
3273 maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
3274 if (flags & LTM_PRIME_2MSB_ON) {
3275 maskOR_msb |= 1 << ((size - 2) & 7);
3276 } else if (flags & LTM_PRIME_2MSB_OFF) {
3277 maskAND &= ~(1 << ((size - 2) & 7));
3280 /* get the maskOR_lsb */
3282 if (flags & LTM_PRIME_BBS) {
3287 /* read the bytes */
3288 if (cb(tmp, bsize, dat) != bsize) {
3293 /* work over the MSbyte */
3295 tmp[0] |= 1 << ((size - 1) & 7);
3297 /* mix in the maskORs */
3298 tmp[maskOR_msb_offset] |= maskOR_msb;
3299 tmp[bsize-1] |= maskOR_lsb;
3302 if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY) { goto error; }
3305 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3310 if (flags & LTM_PRIME_SAFE) {
3311 /* see if (a-1)/2 is prime */
3312 if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) { goto error; }
3313 if ((err = mp_div_2(a, a)) != MP_OKAY) { goto error; }
3316 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3318 } while (res == MP_NO);
3320 if (flags & LTM_PRIME_SAFE) {
3321 /* restore a to the original value */
3322 if ((err = mp_mul_2(a, a)) != MP_OKAY) { goto error; }
3323 if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { goto error; }
3332 /* reads an unsigned char array, assumes the msb is stored first [big endian] */
3334 mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
3338 /* make sure there are at least two digits */
3340 if ((res = mp_grow(a, 2)) != MP_OKAY) {
3348 /* read the bytes in */
3350 if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
3361 /* reduces x mod m, assumes 0 < x < m**2, mu is
3362 * precomputed via mp_reduce_setup.
3363 * From HAC pp.604 Algorithm 14.42
3366 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
3369 int res, um = m->used;
3372 if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
3376 /* q1 = x / b**(k-1) */
3377 mp_rshd (&q, um - 1);
3379 /* according to HAC this optimization is ok */
3380 if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
3381 if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
3385 if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
3390 /* q3 = q2 / b**(k+1) */
3391 mp_rshd (&q, um + 1);
3393 /* x = x mod b**(k+1), quick (no division) */
3394 if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
3398 /* q = q * m mod b**(k+1), quick (no division) */
3399 if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
3404 if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
3408 /* If x < 0, add b**(k+1) to it */
3409 if (mp_cmp_d (x, 0) == MP_LT) {
3411 if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
3413 if ((res = mp_add (x, &q, x)) != MP_OKAY)
3417 /* Back off if it's too big */
3418 while (mp_cmp (x, m) != MP_LT) {
3419 if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
3430 /* reduces a modulo n where n is of the form 2**p - d */
3432 mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
3437 if ((res = mp_init(&q)) != MP_OKAY) {
3441 p = mp_count_bits(n);
3443 /* q = a/2**p, a = a mod 2**p */
3444 if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
3450 if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
3456 if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
3460 if (mp_cmp_mag(a, n) != MP_LT) {
3470 /* determines the setup value */
3472 mp_reduce_2k_setup(mp_int *a, mp_digit *d)
3477 if ((res = mp_init(&tmp)) != MP_OKAY) {
3481 p = mp_count_bits(a);
3482 if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
3487 if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
3497 /* pre-calculate the value required for Barrett reduction
3498 * For a given modulus "b" it calulates the value required in "a"
3500 int mp_reduce_setup (mp_int * a, mp_int * b)
3504 if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
3507 return mp_div (a, b, a, NULL);
3510 /* shift right a certain amount of digits */
3511 void mp_rshd (mp_int * a, int b)
3515 /* if b <= 0 then ignore it */
3520 /* if b > used then simply zero it and return */
3527 register mp_digit *bottom, *top;
3529 /* shift the digits down */
3534 /* top [offset into digits] */
3537 /* this is implemented as a sliding window where
3538 * the window is b-digits long and digits from
3539 * the top of the window are copied to the bottom
3543 b-2 | b-1 | b0 | b1 | b2 | ... | bb | ---->
3545 \-------------------/ ---->
3547 for (x = 0; x < (a->used - b); x++) {
3551 /* zero the top digits */
3552 for (; x < a->used; x++) {
3557 /* remove excess digits */
3561 /* set to a digit */
3562 void mp_set (mp_int * a, mp_digit b)
3565 a->dp[0] = b & MP_MASK;
3566 a->used = (a->dp[0] != 0) ? 1 : 0;
3569 /* set a 32-bit const */
3570 int mp_set_int (mp_int * a, unsigned long b)
3576 /* set four bits at a time */
3577 for (x = 0; x < 8; x++) {
3578 /* shift the number up four bits */
3579 if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
3583 /* OR in the top four bits of the source */
3584 a->dp[0] |= (b >> 28) & 15;
3586 /* shift the source up to the next four bits */
3589 /* ensure that digits are not clamped off */
3596 /* shrink a bignum */
3597 int mp_shrink (mp_int * a)
3600 if (a->alloc != a->used && a->used > 0) {
3601 if ((tmp = realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
3610 /* get the size for an signed equivalent */
3611 int mp_signed_bin_size (mp_int * a)
3613 return 1 + mp_unsigned_bin_size (a);
3616 /* computes b = a*a */
3618 mp_sqr (mp_int * a, mp_int * b)
3622 if (a->used >= KARATSUBA_SQR_CUTOFF) {
3623 res = mp_karatsuba_sqr (a, b);
3626 /* can we use the fast comba multiplier? */
3627 if ((a->used * 2 + 1) < MP_WARRAY &&
3629 (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
3630 res = fast_s_mp_sqr (a, b);
3632 res = s_mp_sqr (a, b);
3638 /* c = a * a (mod b) */
3640 mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
3645 if ((res = mp_init (&t)) != MP_OKAY) {
3649 if ((res = mp_sqr (a, &t)) != MP_OKAY) {
3653 res = mp_mod (&t, b, c);
3658 /* high level subtraction (handles signs) */
3660 mp_sub (mp_int * a, mp_int * b, mp_int * c)
3668 /* subtract a negative from a positive, OR */
3669 /* subtract a positive from a negative. */
3670 /* In either case, ADD their magnitudes, */
3671 /* and use the sign of the first number. */
3673 res = s_mp_add (a, b, c);
3675 /* subtract a positive from a positive, OR */
3676 /* subtract a negative from a negative. */
3677 /* First, take the difference between their */
3678 /* magnitudes, then... */
3679 if (mp_cmp_mag (a, b) != MP_LT) {
3680 /* Copy the sign from the first */
3682 /* The first has a larger or equal magnitude */
3683 res = s_mp_sub (a, b, c);
3685 /* The result has the *opposite* sign from */
3686 /* the first number. */
3687 c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
3688 /* The second has a larger magnitude */
3689 res = s_mp_sub (b, a, c);
3695 /* single digit subtraction */
3697 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
3699 mp_digit *tmpa, *tmpc, mu;
3700 int res, ix, oldused;
3702 /* grow c as required */
3703 if (c->alloc < a->used + 1) {
3704 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
3709 /* if a is negative just do an unsigned
3710 * addition [with fudged signs]
3712 if (a->sign == MP_NEG) {
3714 res = mp_add_d(a, b, c);
3715 a->sign = c->sign = MP_NEG;
3724 /* if a <= b simply fix the single digit */
3725 if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
3727 *tmpc++ = b - *tmpa;
3733 /* negative/1digit */
3741 /* subtract first digit */
3742 *tmpc = *tmpa++ - b;
3743 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3746 /* handle rest of the digits */
3747 for (ix = 1; ix < a->used; ix++) {
3748 *tmpc = *tmpa++ - mu;
3749 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3754 /* zero excess digits */
3755 while (ix++ < oldused) {
3762 /* store in unsigned [big endian] format */
3764 mp_to_unsigned_bin (mp_int * a, unsigned char *b)
3769 if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
3774 while (mp_iszero (&t) == 0) {
3775 b[x++] = (unsigned char) (t.dp[0] & 255);
3776 if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
3786 /* get the size for an unsigned equivalent */
3788 mp_unsigned_bin_size (mp_int * a)
3790 int size = mp_count_bits (a);
3791 return (size / 8 + ((size & 7) != 0 ? 1 : 0));
3796 mp_zero (mp_int * a)
3800 memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
3803 const mp_digit __prime_tab[] = {
3804 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
3805 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
3806 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
3807 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
3808 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
3809 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
3810 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
3811 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
3813 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
3814 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
3815 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
3816 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
3817 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
3818 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
3819 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
3820 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
3822 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
3823 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
3824 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
3825 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
3826 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
3827 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
3828 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
3829 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
3831 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
3832 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
3833 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
3834 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
3835 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
3836 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
3837 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
3838 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
3841 /* reverse an array, used for radix code */
3843 bn_reverse (unsigned char *s, int len)
3859 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
3861 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
3864 int olduse, res, min, max;
3866 /* find sizes, we let |a| <= |b| which means we have to sort
3867 * them. "x" will point to the input with the most digits
3869 if (a->used > b->used) {
3880 if (c->alloc < max + 1) {
3881 if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
3886 /* get old used digit count and set new one */
3891 register mp_digit u, *tmpa, *tmpb, *tmpc;
3894 /* alias for digit pointers */
3905 /* zero the carry */
3907 for (i = 0; i < min; i++) {
3908 /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
3909 *tmpc = *tmpa++ + *tmpb++ + u;
3911 /* U = carry bit of T[i] */
3912 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3914 /* take away carry bit from T[i] */
3918 /* now copy higher words if any, that is in A+B
3919 * if A or B has more digits add those in
3922 for (; i < max; i++) {
3923 /* T[i] = X[i] + U */
3924 *tmpc = x->dp[i] + u;
3926 /* U = carry bit of T[i] */
3927 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3929 /* take away carry bit from T[i] */
3937 /* clear digits above oldused */
3938 for (i = c->used; i < olduse; i++) {
3947 int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
3949 mp_int M[256], res, mu;
3951 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
3953 /* find window size */
3954 x = mp_count_bits (X);
3957 } else if (x <= 36) {
3959 } else if (x <= 140) {
3961 } else if (x <= 450) {
3963 } else if (x <= 1303) {
3965 } else if (x <= 3529) {
3972 /* init first cell */
3973 if ((err = mp_init(&M[1])) != MP_OKAY) {
3977 /* now init the second half of the array */
3978 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3979 if ((err = mp_init(&M[x])) != MP_OKAY) {
3980 for (y = 1<<(winsize-1); y < x; y++) {
3988 /* create mu, used for Barrett reduction */
3989 if ((err = mp_init (&mu)) != MP_OKAY) {
3992 if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
3998 * The M table contains powers of the base,
3999 * e.g. M[x] = G**x mod P
4001 * The first half of the table is not
4002 * computed though accept for M[0] and M[1]
4004 if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
4008 /* compute the value at M[1<<(winsize-1)] by squaring
4009 * M[1] (winsize-1) times
4011 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
4015 for (x = 0; x < (winsize - 1); x++) {
4016 if ((err = mp_sqr (&M[1 << (winsize - 1)],
4017 &M[1 << (winsize - 1)])) != MP_OKAY) {
4020 if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
4025 /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
4026 * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
4028 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
4029 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
4032 if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
4038 if ((err = mp_init (&res)) != MP_OKAY) {
4043 /* set initial mode and bit cnt */
4047 digidx = X->used - 1;
4052 /* grab next digit as required */
4053 if (--bitcnt == 0) {
4054 /* if digidx == -1 we are out of digits */
4058 /* read next digit and reset the bitcnt */
4059 buf = X->dp[digidx--];
4060 bitcnt = (int) DIGIT_BIT;
4063 /* grab the next msb from the exponent */
4064 y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
4065 buf <<= (mp_digit)1;
4067 /* if the bit is zero and mode == 0 then we ignore it
4068 * These represent the leading zero bits before the first 1 bit
4069 * in the exponent. Technically this opt is not required but it
4070 * does lower the # of trivial squaring/reductions used
4072 if (mode == 0 && y == 0) {
4076 /* if the bit is zero and mode == 1 then we square */
4077 if (mode == 1 && y == 0) {
4078 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4081 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4087 /* else we add it to the window */
4088 bitbuf |= (y << (winsize - ++bitcpy));
4091 if (bitcpy == winsize) {
4092 /* ok window is filled so square as required and multiply */
4094 for (x = 0; x < winsize; x++) {
4095 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4098 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4104 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
4107 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4111 /* empty window and reset */
4118 /* if bits remain then square/multiply */
4119 if (mode == 2 && bitcpy > 0) {
4120 /* square then multiply if the bit is set */
4121 for (x = 0; x < bitcpy; x++) {
4122 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4125 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4130 if ((bitbuf & (1 << winsize)) != 0) {
4132 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
4135 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4144 __RES:mp_clear (&res);
4145 __MU:mp_clear (&mu);
4148 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4154 /* multiplies |a| * |b| and only computes up to digs digits of result
4155 * HAC pp. 595, Algorithm 14.12 Modified so you can control how
4156 * many digits of output are created.
4159 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
4162 int res, pa, pb, ix, iy;
4165 mp_digit tmpx, *tmpt, *tmpy;
4167 /* can we use the fast multiplier? */
4168 if (((digs) < MP_WARRAY) &&
4169 MIN (a->used, b->used) <
4170 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4171 return fast_s_mp_mul_digs (a, b, c, digs);
4174 if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
4179 /* compute the digits of the product directly */
4181 for (ix = 0; ix < pa; ix++) {
4182 /* set the carry to zero */
4185 /* limit ourselves to making digs digits of output */
4186 pb = MIN (b->used, digs - ix);
4188 /* setup some aliases */
4189 /* copy of the digit from a used within the nested loop */
4192 /* an alias for the destination shifted ix places */
4195 /* an alias for the digits of b */
4198 /* compute the columns of the output and propagate the carry */
4199 for (iy = 0; iy < pb; iy++) {
4200 /* compute the column as a mp_word */
4201 r = ((mp_word)*tmpt) +
4202 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4205 /* the new column is the lower part of the result */
4206 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4208 /* get the carry word from the result */
4209 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4211 /* set carry if it is placed below digs */
4212 if (ix + iy < digs) {
4224 /* multiplies |a| * |b| and does not compute the lower digs digits
4225 * [meant to get the higher part of the product]
4228 s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
4231 int res, pa, pb, ix, iy;
4234 mp_digit tmpx, *tmpt, *tmpy;
4236 /* can we use the fast multiplier? */
4237 if (((a->used + b->used + 1) < MP_WARRAY)
4238 && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4239 return fast_s_mp_mul_high_digs (a, b, c, digs);
4242 if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
4245 t.used = a->used + b->used + 1;
4249 for (ix = 0; ix < pa; ix++) {
4250 /* clear the carry */
4253 /* left hand side of A[ix] * B[iy] */
4256 /* alias to the address of where the digits will be stored */
4257 tmpt = &(t.dp[digs]);
4259 /* alias for where to read the right hand side from */
4260 tmpy = b->dp + (digs - ix);
4262 for (iy = digs - ix; iy < pb; iy++) {
4263 /* calculate the double precision result */
4264 r = ((mp_word)*tmpt) +
4265 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4268 /* get the lower part */
4269 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4271 /* carry the carry */
4272 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4282 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
4284 s_mp_sqr (mp_int * a, mp_int * b)
4287 int res, ix, iy, pa;
4289 mp_digit u, tmpx, *tmpt;
4292 if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
4296 /* default used is maximum possible size */
4299 for (ix = 0; ix < pa; ix++) {
4300 /* first calculate the digit at 2*ix */
4301 /* calculate double precision result */
4302 r = ((mp_word) t.dp[2*ix]) +
4303 ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
4305 /* store lower part in result */
4306 t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
4309 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4311 /* left hand side of A[ix] * A[iy] */
4314 /* alias for where to store the results */
4315 tmpt = t.dp + (2*ix + 1);
4317 for (iy = ix + 1; iy < pa; iy++) {
4318 /* first calculate the product */
4319 r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
4321 /* now calculate the double precision result, note we use
4322 * addition instead of *2 since it's easier to optimize
4324 r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
4326 /* store lower part */
4327 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4330 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4332 /* propagate upwards */
4333 while (u != ((mp_digit) 0)) {
4334 r = ((mp_word) *tmpt) + ((mp_word) u);
4335 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4336 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4346 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
4348 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
4350 int olduse, res, min, max;
4357 if (c->alloc < max) {
4358 if ((res = mp_grow (c, max)) != MP_OKAY) {
4366 register mp_digit u, *tmpa, *tmpb, *tmpc;
4369 /* alias for digit pointers */
4374 /* set carry to zero */
4376 for (i = 0; i < min; i++) {
4377 /* T[i] = A[i] - B[i] - U */
4378 *tmpc = *tmpa++ - *tmpb++ - u;
4380 /* U = carry bit of T[i]
4381 * Note this saves performing an AND operation since
4382 * if a carry does occur it will propagate all the way to the
4383 * MSB. As a result a single shift is enough to get the carry
4385 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4387 /* Clear carry from T[i] */
4391 /* now copy higher words if any, e.g. if A has more digits than B */
4392 for (; i < max; i++) {
4393 /* T[i] = A[i] - U */
4394 *tmpc = *tmpa++ - u;
4396 /* U = carry bit of T[i] */
4397 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4399 /* Clear carry from T[i] */
4403 /* clear digits above used (since we may not have grown result above) */
4404 for (i = c->used; i < olduse; i++) {
4413 /* Known optimal configurations
4415 CPU /Compiler /MUL CUTOFF/SQR CUTOFF
4416 -------------------------------------------------------------
4417 Intel P4 Northwood /GCC v3.4.1 / 88/ 128/LTM 0.32 ;-)
4421 int KARATSUBA_MUL_CUTOFF = 88, /* Min. number of digits before Karatsuba multiplication is used. */
4422 KARATSUBA_SQR_CUTOFF = 128; /* Min. number of digits before Karatsuba squaring is used. */