3 * Multi Precision Integer functions
5 * Copyright 2004 Michael Jung
6 * Based on public domain code by Tom St Denis (tomstdenis@iahu.ca)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
24 * This file contains code from the LibTomCrypt cryptographic
25 * library written by Tom St Denis (tomstdenis@iahu.ca). LibTomCrypt
26 * is in the public domain. The code in this file is tailored to
27 * special requirements. Take a look at http://libtomcrypt.org for the
34 /* Known optimal configurations
35 CPU /Compiler /MUL CUTOFF/SQR CUTOFF
36 -------------------------------------------------------------
37 Intel P4 Northwood /GCC v3.4.1 / 88/ 128/LTM 0.32 ;-)
39 static const int KARATSUBA_MUL_CUTOFF = 88, /* Min. number of digits before Karatsuba multiplication is used. */
40 KARATSUBA_SQR_CUTOFF = 128; /* Min. number of digits before Karatsuba squaring is used. */
42 static void bn_reverse(unsigned char *s, int len);
43 static int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
44 static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y);
45 #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
46 static int s_mp_mul_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs);
47 static int s_mp_mul_high_digs(const mp_int *a, const mp_int *b, mp_int *c, int digs);
48 static int s_mp_sqr(const mp_int *a, mp_int *b);
49 static int s_mp_sub(const mp_int *a, const mp_int *b, mp_int *c);
50 static int mp_exptmod_fast(const mp_int *G, const mp_int *X, mp_int *P, mp_int *Y, int mode);
51 static int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c);
52 static int mp_karatsuba_mul(const mp_int *a, const mp_int *b, mp_int *c);
53 static int mp_karatsuba_sqr(const mp_int *a, mp_int *b);
55 /* computes the modular inverse via binary extended euclidean algorithm,
56 * that is c = 1/a mod b
58 * Based on slow invmod except this is optimized for the case where b is
59 * odd as per HAC Note 14.64 on pp. 610
62 fast_mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
64 mp_int x, y, u, v, B, D;
67 /* 2. [modified] b must be odd */
68 if (mp_iseven (b) == 1) {
72 /* init all our temps */
73 if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
77 /* x == modulus, y == value to invert */
78 if ((res = mp_copy (b, &x)) != MP_OKAY) {
83 if ((res = mp_abs (a, &y)) != MP_OKAY) {
87 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
88 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
91 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
97 /* 4. while u is even do */
98 while (mp_iseven (&u) == 1) {
100 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
103 /* 4.2 if B is odd then */
104 if (mp_isodd (&B) == 1) {
105 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
110 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
115 /* 5. while v is even do */
116 while (mp_iseven (&v) == 1) {
118 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
121 /* 5.2 if D is odd then */
122 if (mp_isodd (&D) == 1) {
124 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
129 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
134 /* 6. if u >= v then */
135 if (mp_cmp (&u, &v) != MP_LT) {
136 /* u = u - v, B = B - D */
137 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
141 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
145 /* v - v - u, D = D - B */
146 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
150 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
155 /* if not zero goto step 4 */
156 if (mp_iszero (&u) == 0) {
160 /* now a = C, b = D, gcd == g*v */
162 /* if v != 1 then there is no inverse */
163 if (mp_cmp_d (&v, 1) != MP_EQ) {
168 /* b is now the inverse */
170 while (D.sign == MP_NEG) {
171 if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
179 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
183 /* computes xR**-1 == x (mod N) via Montgomery Reduction
185 * This is an optimized implementation of montgomery_reduce
186 * which uses the comba method to quickly calculate the columns of the
189 * Based on Algorithm 14.32 on pp.601 of HAC.
192 fast_mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
195 mp_word W[MP_WARRAY];
197 /* get old used count */
200 /* grow a as required */
201 if (x->alloc < n->used + 1) {
202 if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
207 /* first we have to get the digits of the input into
208 * an array of double precision words W[...]
211 register mp_word *_W;
212 register mp_digit *tmpx;
214 /* alias for the W[] array */
217 /* alias for the digits of x*/
220 /* copy the digits of a into W[0..a->used-1] */
221 for (ix = 0; ix < x->used; ix++) {
225 /* zero the high words of W[a->used..m->used*2] */
226 for (; ix < n->used * 2 + 1; ix++) {
231 /* now we proceed to zero successive digits
232 * from the least significant upwards
234 for (ix = 0; ix < n->used; ix++) {
235 /* mu = ai * m' mod b
237 * We avoid a double precision multiplication (which isn't required)
238 * by casting the value down to a mp_digit. Note this requires
239 * that W[ix-1] have the carry cleared (see after the inner loop)
241 register mp_digit mu;
242 mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
244 /* a = a + mu * m * b**i
246 * This is computed in place and on the fly. The multiplication
247 * by b**i is handled by offsetting which columns the results
250 * Note the comba method normally doesn't handle carries in the
251 * inner loop In this case we fix the carry from the previous
252 * column since the Montgomery reduction requires digits of the
253 * result (so far) [see above] to work. This is
254 * handled by fixing up one carry after the inner loop. The
255 * carry fixups are done in order so after these loops the
256 * first m->used words of W[] have the carries fixed
260 register mp_digit *tmpn;
261 register mp_word *_W;
263 /* alias for the digits of the modulus */
266 /* Alias for the columns set by an offset of ix */
270 for (iy = 0; iy < n->used; iy++) {
271 *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
275 /* now fix carry for next digit, W[ix+1] */
276 W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
279 /* now we have to propagate the carries and
280 * shift the words downward [all those least
281 * significant digits we zeroed].
284 register mp_digit *tmpx;
285 register mp_word *_W, *_W1;
287 /* nox fix rest of carries */
289 /* alias for current word */
292 /* alias for next word, where the carry goes */
295 for (; ix <= n->used * 2 + 1; ix++) {
296 *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
299 /* copy out, A = A/b**n
301 * The result is A/b**n but instead of converting from an
302 * array of mp_word to mp_digit than calling mp_rshd
303 * we just copy them in the right order
306 /* alias for destination word */
309 /* alias for shifted double precision result */
312 for (ix = 0; ix < n->used + 1; ix++) {
313 *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
316 /* zero oldused digits, if the input a was larger than
317 * m->used+1 we'll have to clear the digits
319 for (; ix < olduse; ix++) {
324 /* set the max used and clamp */
325 x->used = n->used + 1;
328 /* if A >= m then A = A - m */
329 if (mp_cmp_mag (x, n) != MP_LT) {
330 return s_mp_sub (x, n, x);
335 /* Fast (comba) multiplier
337 * This is the fast column-array [comba] multiplier. It is
338 * designed to compute the columns of the product first
339 * then handle the carries afterwards. This has the effect
340 * of making the nested loops that compute the columns very
341 * simple and schedulable on super-scalar processors.
343 * This has been modified to produce a variable number of
344 * digits of output so if say only a half-product is required
345 * you don't have to compute the upper half (a feature
346 * required for fast Barrett reduction).
348 * Based on Algorithm 14.12 on pp.595 of HAC.
352 fast_s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
354 int olduse, res, pa, ix, iz;
355 mp_digit W[MP_WARRAY];
358 /* grow the destination as required */
359 if (c->alloc < digs) {
360 if ((res = mp_grow (c, digs)) != MP_OKAY) {
365 /* number of output digits to produce */
366 pa = MIN(digs, a->used + b->used);
368 /* clear the carry */
370 for (ix = 0; ix <= pa; ix++) {
373 mp_digit *tmpx, *tmpy;
375 /* get offsets into the two bignums */
376 ty = MIN(b->used-1, ix);
379 /* setup temp aliases */
383 /* This is the number of times the loop will iterate, essentially it's
384 while (tx++ < a->used && ty-- >= 0) { ... }
386 iy = MIN(a->used-tx, ty+1);
389 for (iz = 0; iz < iy; ++iz) {
390 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
394 W[ix] = ((mp_digit)_W) & MP_MASK;
396 /* make next carry */
397 _W = _W >> ((mp_word)DIGIT_BIT);
405 register mp_digit *tmpc;
407 for (ix = 0; ix < digs; ix++) {
408 /* now extract the previous digit [below the carry] */
412 /* clear unused digits [that existed in the old copy of c] */
413 for (; ix < olduse; ix++) {
421 /* this is a modified version of fast_s_mul_digs that only produces
422 * output digits *above* digs. See the comments for fast_s_mul_digs
423 * to see how it works.
425 * This is used in the Barrett reduction since for one of the multiplications
426 * only the higher digits were needed. This essentially halves the work.
428 * Based on Algorithm 14.12 on pp.595 of HAC.
431 fast_s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
433 int olduse, res, pa, ix, iz;
434 mp_digit W[MP_WARRAY];
437 /* grow the destination as required */
438 pa = a->used + b->used;
440 if ((res = mp_grow (c, pa)) != MP_OKAY) {
445 /* number of output digits to produce */
446 pa = a->used + b->used;
448 for (ix = digs; ix <= pa; ix++) {
450 mp_digit *tmpx, *tmpy;
452 /* get offsets into the two bignums */
453 ty = MIN(b->used-1, ix);
456 /* setup temp aliases */
460 /* This is the number of times the loop will iterate, essentially it's
461 while (tx++ < a->used && ty-- >= 0) { ... }
463 iy = MIN(a->used-tx, ty+1);
466 for (iz = 0; iz < iy; iz++) {
467 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
471 W[ix] = ((mp_digit)_W) & MP_MASK;
473 /* make next carry */
474 _W = _W >> ((mp_word)DIGIT_BIT);
482 register mp_digit *tmpc;
485 for (ix = digs; ix <= pa; ix++) {
486 /* now extract the previous digit [below the carry] */
490 /* clear unused digits [that existed in the old copy of c] */
491 for (; ix < olduse; ix++) {
501 * This is the comba method where the columns of the product
502 * are computed first then the carries are computed. This
503 * has the effect of making a very simple inner loop that
504 * is executed the most
506 * W2 represents the outer products and W the inner.
508 * A further optimizations is made because the inner
509 * products are of the form "A * B * 2". The *2 part does
510 * not need to be computed until the end which is good
511 * because 64-bit shifts are slow!
513 * Based on Algorithm 14.16 on pp.597 of HAC.
516 /* the jist of squaring...
518 you do like mult except the offset of the tmpx [one that starts closer to zero]
519 can't equal the offset of tmpy. So basically you set up iy like before then you min it with
520 (ty-tx) so that it never happens. You double all those you add in the inner loop
522 After that loop you do the squares and add them in.
524 Remove W2 and don't memset W
528 static int fast_s_mp_sqr (const mp_int * a, mp_int * b)
530 int olduse, res, pa, ix, iz;
531 mp_digit W[MP_WARRAY], *tmpx;
534 /* grow the destination as required */
535 pa = a->used + a->used;
537 if ((res = mp_grow (b, pa)) != MP_OKAY) {
542 /* number of output digits to produce */
544 for (ix = 0; ix <= pa; ix++) {
552 /* get offsets into the two bignums */
553 ty = MIN(a->used-1, ix);
556 /* setup temp aliases */
560 /* This is the number of times the loop will iterate, essentially it's
561 while (tx++ < a->used && ty-- >= 0) { ... }
563 iy = MIN(a->used-tx, ty+1);
565 /* now for squaring tx can never equal ty
566 * we halve the distance since they approach at a rate of 2x
567 * and we have to round because odd cases need to be executed
569 iy = MIN(iy, (ty-tx+1)>>1);
572 for (iz = 0; iz < iy; iz++) {
573 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
576 /* double the inner product and add carry */
579 /* even columns have the square term in them */
581 _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
587 /* make next carry */
588 W1 = _W >> ((mp_word)DIGIT_BIT);
593 b->used = a->used+a->used;
598 for (ix = 0; ix < pa; ix++) {
599 *tmpb++ = W[ix] & MP_MASK;
602 /* clear unused digits [that existed in the old copy of c] */
603 for (; ix < olduse; ix++) {
613 * Simple algorithm which zeroes the int, grows it then just sets one bit
617 mp_2expt (mp_int * a, int b)
621 /* zero a as per default */
624 /* grow a to accommodate the single bit */
625 if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
629 /* set the used count of where the bit will go */
630 a->used = b / DIGIT_BIT + 1;
632 /* put the single bit in its place */
633 a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
640 * Simple function copies the input and fixes the sign to positive
643 mp_abs (const mp_int * a, mp_int * b)
649 if ((res = mp_copy (a, b)) != MP_OKAY) {
654 /* force the sign of b to positive */
660 /* high level addition (handles signs) */
661 int mp_add (mp_int * a, mp_int * b, mp_int * c)
665 /* get sign of both inputs */
669 /* handle two cases, not four */
671 /* both positive or both negative */
672 /* add their magnitudes, copy the sign */
674 res = s_mp_add (a, b, c);
676 /* one positive, the other negative */
677 /* subtract the one with the greater magnitude from */
678 /* the one of the lesser magnitude. The result gets */
679 /* the sign of the one with the greater magnitude. */
680 if (mp_cmp_mag (a, b) == MP_LT) {
682 res = s_mp_sub (b, a, c);
685 res = s_mp_sub (a, b, c);
692 /* single digit addition */
694 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
696 int res, ix, oldused;
697 mp_digit *tmpa, *tmpc, mu;
699 /* grow c as required */
700 if (c->alloc < a->used + 1) {
701 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
706 /* if a is negative and |a| >= b, call c = |a| - b */
707 if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
708 /* temporarily fix sign of a */
712 res = mp_sub_d(a, b, c);
715 a->sign = c->sign = MP_NEG;
720 /* old number of used digits in c */
723 /* sign always positive */
729 /* destination alias */
732 /* if a is positive */
733 if (a->sign == MP_ZPOS) {
734 /* add digit, after this we're propagating
738 mu = *tmpc >> DIGIT_BIT;
741 /* now handle rest of the digits */
742 for (ix = 1; ix < a->used; ix++) {
743 *tmpc = *tmpa++ + mu;
744 mu = *tmpc >> DIGIT_BIT;
747 /* set final carry */
752 c->used = a->used + 1;
754 /* a was negative and |a| < b */
757 /* the result is a single digit */
759 *tmpc++ = b - a->dp[0];
764 /* setup count so the clearing of oldused
765 * can fall through correctly
770 /* now zero to oldused */
771 while (ix++ < oldused) {
779 /* trim unused digits
781 * This is used to ensure that leading zero digits are
782 * trimed and the leading "used" digit will be non-zero
783 * Typically very fast. Also fixes the sign if there
784 * are no more leading digits
787 mp_clamp (mp_int * a)
789 /* decrease used while the most significant digit is
792 while (a->used > 0 && a->dp[a->used - 1] == 0) {
796 /* reset the sign flag if used == 0 */
802 /* clear one (frees) */
804 mp_clear (mp_int * a)
808 /* only do anything if a hasn't been freed previously */
810 /* first zero the digits */
811 for (i = 0; i < a->used; i++) {
818 /* reset members to make debugging easier */
820 a->alloc = a->used = 0;
826 void mp_clear_multi(mp_int *mp, ...)
828 mp_int* next_mp = mp;
831 while (next_mp != NULL) {
833 next_mp = va_arg(args, mp_int*);
838 /* compare two ints (signed)*/
840 mp_cmp (const mp_int * a, const mp_int * b)
842 /* compare based on sign */
843 if (a->sign != b->sign) {
844 if (a->sign == MP_NEG) {
852 if (a->sign == MP_NEG) {
853 /* if negative compare opposite direction */
854 return mp_cmp_mag(b, a);
856 return mp_cmp_mag(a, b);
860 /* compare a digit */
861 int mp_cmp_d(const mp_int * a, mp_digit b)
863 /* compare based on sign */
864 if (a->sign == MP_NEG) {
868 /* compare based on magnitude */
873 /* compare the only digit of a to b */
876 } else if (a->dp[0] < b) {
883 /* compare maginitude of two ints (unsigned) */
884 int mp_cmp_mag (const mp_int * a, const mp_int * b)
887 mp_digit *tmpa, *tmpb;
889 /* compare based on # of non-zero digits */
890 if (a->used > b->used) {
894 if (a->used < b->used) {
899 tmpa = a->dp + (a->used - 1);
902 tmpb = b->dp + (a->used - 1);
904 /* compare based on digits */
905 for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
917 static const int lnz[16] = {
918 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
921 /* Counts the number of lsbs which are zero before the first zero bit */
922 int mp_cnt_lsb(const mp_int *a)
928 if (mp_iszero(a) == 1) {
932 /* scan lower digits until non-zero */
933 for (x = 0; x < a->used && a->dp[x] == 0; x++);
937 /* now scan this digit until a 1 is found */
950 mp_copy (const mp_int * a, mp_int * b)
954 /* if dst == src do nothing */
960 if (b->alloc < a->used) {
961 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
966 /* zero b and copy the parameters over */
968 register mp_digit *tmpa, *tmpb;
970 /* pointer aliases */
978 /* copy all the digits */
979 for (n = 0; n < a->used; n++) {
983 /* clear high digits */
984 for (; n < b->used; n++) {
989 /* copy used count and sign */
995 /* returns the number of bits in an int */
997 mp_count_bits (const mp_int * a)
1007 /* get number of digits and add that */
1008 r = (a->used - 1) * DIGIT_BIT;
1010 /* take the last digit and count the bits in it */
1011 q = a->dp[a->used - 1];
1012 while (q > ((mp_digit) 0)) {
1014 q >>= ((mp_digit) 1);
1019 /* integer signed division.
1020 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1021 * HAC pp.598 Algorithm 14.20
1023 * Note that the description in HAC is horribly
1024 * incomplete. For example, it doesn't consider
1025 * the case where digits are removed from 'x' in
1026 * the inner loop. It also doesn't consider the
1027 * case that y has fewer than three digits, etc..
1029 * The overall algorithm is as described as
1030 * 14.20 from HAC but fixed to treat these cases.
1032 int mp_div (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
1034 mp_int q, x, y, t1, t2;
1035 int res, n, t, i, norm, neg;
1037 /* is divisor zero ? */
1038 if (mp_iszero (b) == 1) {
1042 /* if a < b then q=0, r = a */
1043 if (mp_cmp_mag (a, b) == MP_LT) {
1045 res = mp_copy (a, d);
1055 if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1058 q.used = a->used + 2;
1060 if ((res = mp_init (&t1)) != MP_OKAY) {
1064 if ((res = mp_init (&t2)) != MP_OKAY) {
1068 if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1072 if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1077 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1078 x.sign = y.sign = MP_ZPOS;
1080 /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1081 norm = mp_count_bits(&y) % DIGIT_BIT;
1082 if (norm < DIGIT_BIT-1) {
1083 norm = (DIGIT_BIT-1) - norm;
1084 if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1087 if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1094 /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1098 /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1099 if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1103 while (mp_cmp (&x, &y) != MP_LT) {
1105 if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1110 /* reset y by shifting it back down */
1111 mp_rshd (&y, n - t);
1113 /* step 3. for i from n down to (t + 1) */
1114 for (i = n; i >= (t + 1); i--) {
1119 /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1120 * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1121 if (x.dp[i] == y.dp[t]) {
1122 q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1125 tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1126 tmp |= ((mp_word) x.dp[i - 1]);
1127 tmp /= ((mp_word) y.dp[t]);
1128 if (tmp > (mp_word) MP_MASK)
1130 q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1133 /* while (q{i-t-1} * (yt * b + y{t-1})) >
1134 xi * b**2 + xi-1 * b + xi-2
1138 q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1140 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1142 /* find left hand */
1144 t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1147 if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1151 /* find right hand */
1152 t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1153 t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1156 } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1158 /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1159 if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1163 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1167 if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1171 /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1172 if (x.sign == MP_NEG) {
1173 if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1176 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1179 if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1183 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1187 /* now q is the quotient and x is the remainder
1188 * [which we have to normalize]
1191 /* get sign before writing to c */
1192 x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1201 mp_div_2d (&x, norm, &x, NULL);
1209 __T2:mp_clear (&t2);
1210 __T1:mp_clear (&t1);
1216 int mp_div_2(const mp_int * a, mp_int * b)
1218 int x, res, oldused;
1221 if (b->alloc < a->used) {
1222 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1230 register mp_digit r, rr, *tmpa, *tmpb;
1233 tmpa = a->dp + b->used - 1;
1236 tmpb = b->dp + b->used - 1;
1240 for (x = b->used - 1; x >= 0; x--) {
1241 /* get the carry for the next iteration */
1244 /* shift the current digit, add in carry and store */
1245 *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1247 /* forward carry to next iteration */
1251 /* zero excess digits */
1252 tmpb = b->dp + b->used;
1253 for (x = b->used; x < oldused; x++) {
1262 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1263 int mp_div_2d (const mp_int * a, int b, mp_int * c, mp_int * d)
1270 /* if the shift count is <= 0 then we do no work */
1272 res = mp_copy (a, c);
1279 if ((res = mp_init (&t)) != MP_OKAY) {
1283 /* get the remainder */
1285 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1292 if ((res = mp_copy (a, c)) != MP_OKAY) {
1297 /* shift by as many digits in the bit count */
1298 if (b >= DIGIT_BIT) {
1299 mp_rshd (c, b / DIGIT_BIT);
1302 /* shift any bit count < DIGIT_BIT */
1303 D = (mp_digit) (b % DIGIT_BIT);
1305 register mp_digit *tmpc, mask, shift;
1308 mask = (((mp_digit)1) << D) - 1;
1311 shift = DIGIT_BIT - D;
1314 tmpc = c->dp + (c->used - 1);
1318 for (x = c->used - 1; x >= 0; x--) {
1319 /* get the lower bits of this word in a temp */
1322 /* shift the current word and mix in the carry bits from the previous word */
1323 *tmpc = (*tmpc >> D) | (r << shift);
1326 /* set the carry to the carry bits of the current word found above */
1338 static int s_is_power_of_two(mp_digit b, int *p)
1342 for (x = 1; x < DIGIT_BIT; x++) {
1343 if (b == (((mp_digit)1)<<x)) {
1351 /* single digit division (based on routine from MPI) */
1352 int mp_div_d (const mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
1359 /* cannot divide by zero */
1365 if (b == 1 || mp_iszero(a) == 1) {
1370 return mp_copy(a, c);
1375 /* power of two ? */
1376 if (s_is_power_of_two(b, &ix) == 1) {
1378 *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
1381 return mp_div_2d(a, ix, c, NULL);
1386 /* no easy answer [c'est la vie]. Just division */
1387 if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
1394 for (ix = a->used - 1; ix >= 0; ix--) {
1395 w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
1398 t = (mp_digit)(w / b);
1399 w -= ((mp_word)t) * ((mp_word)b);
1419 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
1421 * Based on algorithm from the paper
1423 * "Generating Efficient Primes for Discrete Log Cryptosystems"
1424 * Chae Hoon Lim, Pil Loong Lee,
1425 * POSTECH Information Research Laboratories
1427 * The modulus must be of a special format [see manual]
1429 * Has been modified to use algorithm 7.10 from the LTM book instead
1431 * Input x must be in the range 0 <= x <= (n-1)**2
1434 mp_dr_reduce (mp_int * x, const mp_int * n, mp_digit k)
1438 mp_digit mu, *tmpx1, *tmpx2;
1440 /* m = digits in modulus */
1443 /* ensure that "x" has at least 2m digits */
1444 if (x->alloc < m + m) {
1445 if ((err = mp_grow (x, m + m)) != MP_OKAY) {
1450 /* top of loop, this is where the code resumes if
1451 * another reduction pass is required.
1454 /* aliases for digits */
1455 /* alias for lower half of x */
1458 /* alias for upper half of x, or x/B**m */
1461 /* set carry to zero */
1464 /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
1465 for (i = 0; i < m; i++) {
1466 r = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
1467 *tmpx1++ = (mp_digit)(r & MP_MASK);
1468 mu = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
1471 /* set final carry */
1474 /* zero words above m */
1475 for (i = m + 1; i < x->used; i++) {
1479 /* clamp, sub and return */
1482 /* if x >= n then subtract and reduce again
1483 * Each successive "recursion" makes the input smaller and smaller.
1485 if (mp_cmp_mag (x, n) != MP_LT) {
1492 /* determines the setup value */
1493 void mp_dr_setup(const mp_int *a, mp_digit *d)
1495 /* the casts are required if DIGIT_BIT is one less than
1496 * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
1498 *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
1499 ((mp_word)a->dp[0]));
1502 /* swap the elements of two integers, for cases where you can't simply swap the
1503 * mp_int pointers around
1506 mp_exch (mp_int * a, mp_int * b)
1515 /* this is a shell function that calls either the normal or Montgomery
1516 * exptmod functions. Originally the call to the montgomery code was
1517 * embedded in the normal function but that wasted a lot of stack space
1518 * for nothing (since 99% of the time the Montgomery code would be called)
1520 int mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
1524 /* modulus P must be positive */
1525 if (P->sign == MP_NEG) {
1529 /* if exponent X is negative we have to recurse */
1530 if (X->sign == MP_NEG) {
1534 /* first compute 1/G mod P */
1535 if ((err = mp_init(&tmpG)) != MP_OKAY) {
1538 if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
1544 if ((err = mp_init(&tmpX)) != MP_OKAY) {
1548 if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
1549 mp_clear_multi(&tmpG, &tmpX, NULL);
1553 /* and now compute (1/G)**|X| instead of G**X [X < 0] */
1554 err = mp_exptmod(&tmpG, &tmpX, P, Y);
1555 mp_clear_multi(&tmpG, &tmpX, NULL);
1561 /* if the modulus is odd or dr != 0 use the fast method */
1562 if (mp_isodd (P) == 1 || dr != 0) {
1563 return mp_exptmod_fast (G, X, P, Y, dr);
1565 /* otherwise use the generic Barrett reduction technique */
1566 return s_mp_exptmod (G, X, P, Y);
1570 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
1572 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
1573 * The value of k changes based on the size of the exponent.
1575 * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
1579 mp_exptmod_fast (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y, int redmode)
1583 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1585 /* use a pointer to the reduction algorithm. This allows us to use
1586 * one of many reduction algorithms without modding the guts of
1587 * the code with if statements everywhere.
1589 int (*redux)(mp_int*,const mp_int*,mp_digit);
1591 /* find window size */
1592 x = mp_count_bits (X);
1595 } else if (x <= 36) {
1597 } else if (x <= 140) {
1599 } else if (x <= 450) {
1601 } else if (x <= 1303) {
1603 } else if (x <= 3529) {
1610 /* init first cell */
1611 if ((err = mp_init(&M[1])) != MP_OKAY) {
1615 /* now init the second half of the array */
1616 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1617 if ((err = mp_init(&M[x])) != MP_OKAY) {
1618 for (y = 1<<(winsize-1); y < x; y++) {
1626 /* determine and setup reduction code */
1628 /* now setup montgomery */
1629 if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
1633 /* automatically pick the comba one if available (saves quite a few calls/ifs) */
1634 if (((P->used * 2 + 1) < MP_WARRAY) &&
1635 P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
1636 redux = fast_mp_montgomery_reduce;
1638 /* use slower baseline Montgomery method */
1639 redux = mp_montgomery_reduce;
1641 } else if (redmode == 1) {
1642 /* setup DR reduction for moduli of the form B**k - b */
1643 mp_dr_setup(P, &mp);
1644 redux = mp_dr_reduce;
1646 /* setup DR reduction for moduli of the form 2**k - b */
1647 if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
1650 redux = mp_reduce_2k;
1654 if ((err = mp_init (&res)) != MP_OKAY) {
1662 * The first half of the table is not computed though accept for M[0] and M[1]
1666 /* now we need R mod m */
1667 if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
1671 /* now set M[1] to G * R mod m */
1672 if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
1677 if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
1682 /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
1683 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
1687 for (x = 0; x < (winsize - 1); x++) {
1688 if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
1691 if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
1696 /* create upper table */
1697 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1698 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
1701 if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
1706 /* set initial mode and bit cnt */
1710 digidx = X->used - 1;
1715 /* grab next digit as required */
1716 if (--bitcnt == 0) {
1717 /* if digidx == -1 we are out of digits so break */
1721 /* read next digit and reset bitcnt */
1722 buf = X->dp[digidx--];
1726 /* grab the next msb from the exponent */
1727 y = (buf >> (DIGIT_BIT - 1)) & 1;
1728 buf <<= (mp_digit)1;
1730 /* if the bit is zero and mode == 0 then we ignore it
1731 * These represent the leading zero bits before the first 1 bit
1732 * in the exponent. Technically this opt is not required but it
1733 * does lower the # of trivial squaring/reductions used
1735 if (mode == 0 && y == 0) {
1739 /* if the bit is zero and mode == 1 then we square */
1740 if (mode == 1 && y == 0) {
1741 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1744 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1750 /* else we add it to the window */
1751 bitbuf |= (y << (winsize - ++bitcpy));
1754 if (bitcpy == winsize) {
1755 /* ok window is filled so square as required and multiply */
1757 for (x = 0; x < winsize; x++) {
1758 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1761 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1767 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
1770 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1774 /* empty window and reset */
1781 /* if bits remain then square/multiply */
1782 if (mode == 2 && bitcpy > 0) {
1783 /* square then multiply if the bit is set */
1784 for (x = 0; x < bitcpy; x++) {
1785 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1788 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1792 /* get next bit of the window */
1794 if ((bitbuf & (1 << winsize)) != 0) {
1796 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
1799 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1807 /* fixup result if Montgomery reduction is used
1808 * recall that any value in a Montgomery system is
1809 * actually multiplied by R mod n. So we have
1810 * to reduce one more time to cancel out the factor
1813 if ((err = redux(&res, P, mp)) != MP_OKAY) {
1818 /* swap res with Y */
1821 __RES:mp_clear (&res);
1824 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1830 /* Greatest Common Divisor using the binary method */
1831 int mp_gcd (const mp_int * a, const mp_int * b, mp_int * c)
1834 int k, u_lsb, v_lsb, res;
1836 /* either zero than gcd is the largest */
1837 if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
1838 return mp_abs (b, c);
1840 if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
1841 return mp_abs (a, c);
1844 /* optimized. At this point if a == 0 then
1845 * b must equal zero too
1847 if (mp_iszero (a) == 1) {
1852 /* get copies of a and b we can modify */
1853 if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
1857 if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
1861 /* must be positive for the remainder of the algorithm */
1862 u.sign = v.sign = MP_ZPOS;
1864 /* B1. Find the common power of two for u and v */
1865 u_lsb = mp_cnt_lsb(&u);
1866 v_lsb = mp_cnt_lsb(&v);
1867 k = MIN(u_lsb, v_lsb);
1870 /* divide the power of two out */
1871 if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
1875 if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
1880 /* divide any remaining factors of two out */
1882 if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
1888 if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
1893 while (mp_iszero(&v) == 0) {
1894 /* make sure v is the largest */
1895 if (mp_cmp_mag(&u, &v) == MP_GT) {
1896 /* swap u and v to make sure v is >= u */
1900 /* subtract smallest from largest */
1901 if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
1905 /* Divide out all factors of two */
1906 if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
1911 /* multiply by 2**k which we divided out at the beginning */
1912 if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
1922 /* get the lower 32-bits of an mp_int */
1923 unsigned long mp_get_int(const mp_int * a)
1932 /* get number of digits of the lsb we have to read */
1933 i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
1935 /* get most significant digit of result */
1939 res = (res << DIGIT_BIT) | DIGIT(a,i);
1942 /* force result to 32-bits always so it is consistent on non 32-bit platforms */
1943 return res & 0xFFFFFFFFUL;
1946 /* grow as required */
1947 int mp_grow (mp_int * a, int size)
1952 /* if the alloc size is smaller alloc more ram */
1953 if (a->alloc < size) {
1954 /* ensure there are always at least MP_PREC digits extra on top */
1955 size += (MP_PREC * 2) - (size % MP_PREC);
1957 /* reallocate the array a->dp
1959 * We store the return in a temporary variable
1960 * in case the operation failed we don't want
1961 * to overwrite the dp member of a.
1963 tmp = realloc (a->dp, sizeof (mp_digit) * size);
1965 /* reallocation failed but "a" is still valid [can be freed] */
1969 /* reallocation succeeded so set a->dp */
1972 /* zero excess digits */
1975 for (; i < a->alloc; i++) {
1982 /* init a new mp_int */
1983 int mp_init (mp_int * a)
1987 /* allocate memory required and clear it */
1988 a->dp = malloc (sizeof (mp_digit) * MP_PREC);
1989 if (a->dp == NULL) {
1993 /* set the digits to zero */
1994 for (i = 0; i < MP_PREC; i++) {
1998 /* set the used to zero, allocated digits to the default precision
1999 * and sign to positive */
2007 /* creates "a" then copies b into it */
2008 int mp_init_copy (mp_int * a, const mp_int * b)
2012 if ((res = mp_init (a)) != MP_OKAY) {
2015 return mp_copy (b, a);
2018 int mp_init_multi(mp_int *mp, ...)
2020 mp_err res = MP_OKAY; /* Assume ok until proven otherwise */
2021 int n = 0; /* Number of ok inits */
2022 mp_int* cur_arg = mp;
2025 va_start(args, mp); /* init args to next argument from caller */
2026 while (cur_arg != NULL) {
2027 if (mp_init(cur_arg) != MP_OKAY) {
2028 /* Oops - error! Back-track and mp_clear what we already
2029 succeeded in init-ing, then return error.
2033 /* end the current list */
2036 /* now start cleaning up */
2038 va_start(clean_args, mp);
2041 cur_arg = va_arg(clean_args, mp_int*);
2048 cur_arg = va_arg(args, mp_int*);
2051 return res; /* Assumed ok, if error flagged above. */
2054 /* init an mp_init for a given size */
2055 int mp_init_size (mp_int * a, int size)
2059 /* pad size so there are always extra digits */
2060 size += (MP_PREC * 2) - (size % MP_PREC);
2063 a->dp = malloc (sizeof (mp_digit) * size);
2064 if (a->dp == NULL) {
2068 /* set the members */
2073 /* zero the digits */
2074 for (x = 0; x < size; x++) {
2081 /* hac 14.61, pp608 */
2082 int mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
2084 /* b cannot be negative */
2085 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2089 /* if the modulus is odd we can use a faster routine instead */
2090 if (mp_isodd (b) == 1) {
2091 return fast_mp_invmod (a, b, c);
2094 return mp_invmod_slow(a, b, c);
2097 /* hac 14.61, pp608 */
2098 int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c)
2100 mp_int x, y, u, v, A, B, C, D;
2103 /* b cannot be negative */
2104 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2109 if ((res = mp_init_multi(&x, &y, &u, &v,
2110 &A, &B, &C, &D, NULL)) != MP_OKAY) {
2115 if ((res = mp_copy (a, &x)) != MP_OKAY) {
2118 if ((res = mp_copy (b, &y)) != MP_OKAY) {
2122 /* 2. [modified] if x,y are both even then return an error! */
2123 if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
2128 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
2129 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
2132 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
2139 /* 4. while u is even do */
2140 while (mp_iseven (&u) == 1) {
2142 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
2145 /* 4.2 if A or B is odd then */
2146 if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
2147 /* A = (A+y)/2, B = (B-x)/2 */
2148 if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
2151 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
2155 /* A = A/2, B = B/2 */
2156 if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
2159 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
2164 /* 5. while v is even do */
2165 while (mp_iseven (&v) == 1) {
2167 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
2170 /* 5.2 if C or D is odd then */
2171 if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
2172 /* C = (C+y)/2, D = (D-x)/2 */
2173 if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
2176 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
2180 /* C = C/2, D = D/2 */
2181 if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
2184 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
2189 /* 6. if u >= v then */
2190 if (mp_cmp (&u, &v) != MP_LT) {
2191 /* u = u - v, A = A - C, B = B - D */
2192 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
2196 if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
2200 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
2204 /* v - v - u, C = C - A, D = D - B */
2205 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
2209 if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
2213 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
2218 /* if not zero goto step 4 */
2219 if (mp_iszero (&u) == 0)
2222 /* now a = C, b = D, gcd == g*v */
2224 /* if v != 1 then there is no inverse */
2225 if (mp_cmp_d (&v, 1) != MP_EQ) {
2230 /* if its too low */
2231 while (mp_cmp_d(&C, 0) == MP_LT) {
2232 if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
2238 while (mp_cmp_mag(&C, b) != MP_LT) {
2239 if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
2244 /* C is now the inverse */
2247 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
2251 /* c = |a| * |b| using Karatsuba Multiplication using
2252 * three half size multiplications
2254 * Let B represent the radix [e.g. 2**DIGIT_BIT] and
2255 * let n represent half of the number of digits in
2258 * a = a1 * B**n + a0
2259 * b = b1 * B**n + b0
2262 a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
2264 * Note that a1b1 and a0b0 are used twice and only need to be
2265 * computed once. So in total three half size (half # of
2266 * digit) multiplications are performed, a0b0, a1b1 and
2269 * Note that a multiplication of half the digits requires
2270 * 1/4th the number of single precision multiplications so in
2271 * total after one call 25% of the single precision multiplications
2272 * are saved. Note also that the call to mp_mul can end up back
2273 * in this function if the a0, a1, b0, or b1 are above the threshold.
2274 * This is known as divide-and-conquer and leads to the famous
2275 * O(N**lg(3)) or O(N**1.584) work which is asymptotically lower than
2276 * the standard O(N**2) that the baseline/comba methods use.
2277 * Generally though the overhead of this method doesn't pay off
2278 * until a certain size (N ~ 80) is reached.
2280 int mp_karatsuba_mul (const mp_int * a, const mp_int * b, mp_int * c)
2282 mp_int x0, x1, y0, y1, t1, x0y0, x1y1;
2285 /* default the return code to an error */
2288 /* min # of digits */
2289 B = MIN (a->used, b->used);
2291 /* now divide in two */
2294 /* init copy all the temps */
2295 if (mp_init_size (&x0, B) != MP_OKAY)
2297 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2299 if (mp_init_size (&y0, B) != MP_OKAY)
2301 if (mp_init_size (&y1, b->used - B) != MP_OKAY)
2305 if (mp_init_size (&t1, B * 2) != MP_OKAY)
2307 if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
2309 if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
2312 /* now shift the digits */
2313 x0.used = y0.used = B;
2314 x1.used = a->used - B;
2315 y1.used = b->used - B;
2319 register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
2321 /* we copy the digits directly instead of using higher level functions
2322 * since we also need to shift the digits
2329 for (x = 0; x < B; x++) {
2335 for (x = B; x < a->used; x++) {
2340 for (x = B; x < b->used; x++) {
2345 /* only need to clamp the lower words since by definition the
2346 * upper words x1/y1 must have a known number of digits
2351 /* now calc the products x0y0 and x1y1 */
2352 /* after this x0 is no longer required, free temp [x0==t2]! */
2353 if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
2354 goto X1Y1; /* x0y0 = x0*y0 */
2355 if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
2356 goto X1Y1; /* x1y1 = x1*y1 */
2358 /* now calc x1-x0 and y1-y0 */
2359 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2360 goto X1Y1; /* t1 = x1 - x0 */
2361 if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
2362 goto X1Y1; /* t2 = y1 - y0 */
2363 if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
2364 goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */
2367 if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
2368 goto X1Y1; /* t2 = x0y0 + x1y1 */
2369 if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
2370 goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
2373 if (mp_lshd (&t1, B) != MP_OKAY)
2374 goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
2375 if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
2376 goto X1Y1; /* x1y1 = x1y1 << 2*B */
2378 if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
2379 goto X1Y1; /* t1 = x0y0 + t1 */
2380 if (mp_add (&t1, &x1y1, c) != MP_OKAY)
2381 goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
2383 /* Algorithm succeeded set the return code to MP_OKAY */
2386 X1Y1:mp_clear (&x1y1);
2387 X0Y0:mp_clear (&x0y0);
2397 /* Karatsuba squaring, computes b = a*a using three
2398 * half size squarings
2400 * See comments of karatsuba_mul for details. It
2401 * is essentially the same algorithm but merely
2402 * tuned to perform recursive squarings.
2404 int mp_karatsuba_sqr (const mp_int * a, mp_int * b)
2406 mp_int x0, x1, t1, t2, x0x0, x1x1;
2411 /* min # of digits */
2414 /* now divide in two */
2417 /* init copy all the temps */
2418 if (mp_init_size (&x0, B) != MP_OKAY)
2420 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2424 if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
2426 if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
2428 if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
2430 if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
2435 register mp_digit *dst, *src;
2439 /* now shift the digits */
2441 for (x = 0; x < B; x++) {
2446 for (x = B; x < a->used; x++) {
2452 x1.used = a->used - B;
2456 /* now calc the products x0*x0 and x1*x1 */
2457 if (mp_sqr (&x0, &x0x0) != MP_OKAY)
2458 goto X1X1; /* x0x0 = x0*x0 */
2459 if (mp_sqr (&x1, &x1x1) != MP_OKAY)
2460 goto X1X1; /* x1x1 = x1*x1 */
2462 /* now calc (x1-x0)**2 */
2463 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2464 goto X1X1; /* t1 = x1 - x0 */
2465 if (mp_sqr (&t1, &t1) != MP_OKAY)
2466 goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */
2469 if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
2470 goto X1X1; /* t2 = x0x0 + x1x1 */
2471 if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
2472 goto X1X1; /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
2475 if (mp_lshd (&t1, B) != MP_OKAY)
2476 goto X1X1; /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
2477 if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
2478 goto X1X1; /* x1x1 = x1x1 << 2*B */
2480 if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
2481 goto X1X1; /* t1 = x0x0 + t1 */
2482 if (mp_add (&t1, &x1x1, b) != MP_OKAY)
2483 goto X1X1; /* t1 = x0x0 + t1 + x1x1 */
2487 X1X1:mp_clear (&x1x1);
2488 X0X0:mp_clear (&x0x0);
2497 /* computes least common multiple as |a*b|/(a, b) */
2498 int mp_lcm (const mp_int * a, const mp_int * b, mp_int * c)
2504 if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
2508 /* t1 = get the GCD of the two inputs */
2509 if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
2513 /* divide the smallest by the GCD */
2514 if (mp_cmp_mag(a, b) == MP_LT) {
2515 /* store quotient in t2 such that t2 * b is the LCM */
2516 if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
2519 res = mp_mul(b, &t2, c);
2521 /* store quotient in t2 such that t2 * a is the LCM */
2522 if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
2525 res = mp_mul(a, &t2, c);
2528 /* fix the sign to positive */
2532 mp_clear_multi (&t1, &t2, NULL);
2536 /* shift left a certain amount of digits */
2537 int mp_lshd (mp_int * a, int b)
2541 /* if its less than zero return */
2546 /* grow to fit the new digits */
2547 if (a->alloc < a->used + b) {
2548 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
2554 register mp_digit *top, *bottom;
2556 /* increment the used by the shift amount then copy upwards */
2560 top = a->dp + a->used - 1;
2563 bottom = a->dp + a->used - 1 - b;
2565 /* much like mp_rshd this is implemented using a sliding window
2566 * except the window goes the otherway around. Copying from
2567 * the bottom to the top. see bn_mp_rshd.c for more info.
2569 for (x = a->used - 1; x >= b; x--) {
2573 /* zero the lower digits */
2575 for (x = 0; x < b; x++) {
2582 /* c = a mod b, 0 <= c < b */
2584 mp_mod (const mp_int * a, mp_int * b, mp_int * c)
2589 if ((res = mp_init (&t)) != MP_OKAY) {
2593 if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
2598 if (t.sign != b->sign) {
2599 res = mp_add (b, &t, c);
2609 /* calc a value mod 2**b */
2611 mp_mod_2d (const mp_int * a, int b, mp_int * c)
2615 /* if b is <= 0 then zero the int */
2621 /* if the modulus is larger than the value than return */
2622 if (b > a->used * DIGIT_BIT) {
2623 res = mp_copy (a, c);
2628 if ((res = mp_copy (a, c)) != MP_OKAY) {
2632 /* zero digits above the last digit of the modulus */
2633 for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
2636 /* clear the digit that is not completely outside/inside the modulus */
2637 c->dp[b / DIGIT_BIT] &= (1 << ((mp_digit)b % DIGIT_BIT)) - 1;
2643 mp_mod_d (const mp_int * a, mp_digit b, mp_digit * c)
2645 return mp_div_d(a, b, NULL, c);
2649 * shifts with subtractions when the result is greater than b.
2651 * The method is slightly modified to shift B unconditionally up to just under
2652 * the leading bit of b. This saves a lot of multiple precision shifting.
2654 int mp_montgomery_calc_normalization (mp_int * a, const mp_int * b)
2658 /* how many bits of last digit does b use */
2659 bits = mp_count_bits (b) % DIGIT_BIT;
2663 if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2672 /* now compute C = A * B mod b */
2673 for (x = bits - 1; x < DIGIT_BIT; x++) {
2674 if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2677 if (mp_cmp_mag (a, b) != MP_LT) {
2678 if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2687 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
2689 mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
2694 /* can the fast reduction [comba] method be used?
2696 * Note that unlike in mul you're safely allowed *less*
2697 * than the available columns [255 per default] since carries
2698 * are fixed up in the inner loop.
2700 digs = n->used * 2 + 1;
2701 if ((digs < MP_WARRAY) &&
2703 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2704 return fast_mp_montgomery_reduce (x, n, rho);
2707 /* grow the input as required */
2708 if (x->alloc < digs) {
2709 if ((res = mp_grow (x, digs)) != MP_OKAY) {
2715 for (ix = 0; ix < n->used; ix++) {
2716 /* mu = ai * rho mod b
2718 * The value of rho must be precalculated via
2719 * montgomery_setup() such that
2720 * it equals -1/n0 mod b this allows the
2721 * following inner loop to reduce the
2722 * input one digit at a time
2724 mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
2726 /* a = a + mu * m * b**i */
2729 register mp_digit *tmpn, *tmpx, u;
2732 /* alias for digits of the modulus */
2735 /* alias for the digits of x [the input] */
2738 /* set the carry to zero */
2741 /* Multiply and add in place */
2742 for (iy = 0; iy < n->used; iy++) {
2743 /* compute product and sum */
2744 r = ((mp_word)mu) * ((mp_word)*tmpn++) +
2745 ((mp_word) u) + ((mp_word) * tmpx);
2748 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2751 *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
2753 /* At this point the ix'th digit of x should be zero */
2756 /* propagate carries upwards as required*/
2759 u = *tmpx >> DIGIT_BIT;
2765 /* at this point the n.used'th least
2766 * significant digits of x are all zero
2767 * which means we can shift x to the
2768 * right by n.used digits and the
2769 * residue is unchanged.
2772 /* x = x/b**n.used */
2774 mp_rshd (x, n->used);
2776 /* if x >= n then x = x - n */
2777 if (mp_cmp_mag (x, n) != MP_LT) {
2778 return s_mp_sub (x, n, x);
2784 /* setups the montgomery reduction stuff */
2786 mp_montgomery_setup (const mp_int * n, mp_digit * rho)
2790 /* fast inversion mod 2**k
2792 * Based on the fact that
2794 * XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
2795 * => 2*X*A - X*X*A*A = 1
2796 * => 2*(1) - (1) = 1
2804 x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
2805 x *= 2 - b * x; /* here x*a==1 mod 2**8 */
2806 x *= 2 - b * x; /* here x*a==1 mod 2**16 */
2807 x *= 2 - b * x; /* here x*a==1 mod 2**32 */
2809 /* rho = -1/m mod b */
2810 *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
2815 /* high level multiplication (handles sign) */
2816 int mp_mul (const mp_int * a, const mp_int * b, mp_int * c)
2819 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2821 /* use Karatsuba? */
2822 if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
2823 res = mp_karatsuba_mul (a, b, c);
2826 /* can we use the fast multiplier?
2828 * The fast multiplier can be used if the output will
2829 * have less than MP_WARRAY digits and the number of
2830 * digits won't affect carry propagation
2832 int digs = a->used + b->used + 1;
2834 if ((digs < MP_WARRAY) &&
2835 MIN(a->used, b->used) <=
2836 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2837 res = fast_s_mp_mul_digs (a, b, c, digs);
2839 res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
2841 c->sign = (c->used > 0) ? neg : MP_ZPOS;
2846 int mp_mul_2(const mp_int * a, mp_int * b)
2848 int x, res, oldused;
2850 /* grow to accommodate result */
2851 if (b->alloc < a->used + 1) {
2852 if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2861 register mp_digit r, rr, *tmpa, *tmpb;
2863 /* alias for source */
2866 /* alias for dest */
2871 for (x = 0; x < a->used; x++) {
2873 /* get what will be the *next* carry bit from the
2874 * MSB of the current digit
2876 rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2878 /* now shift up this digit, add in the carry [from the previous] */
2879 *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2881 /* copy the carry that would be from the source
2882 * digit into the next iteration
2887 /* new leading digit? */
2889 /* add a MSB which is always 1 at this point */
2894 /* now zero any excess digits on the destination
2895 * that we didn't write to
2897 tmpb = b->dp + b->used;
2898 for (x = b->used; x < oldused; x++) {
2906 /* shift left by a certain bit count */
2907 int mp_mul_2d (const mp_int * a, int b, mp_int * c)
2914 if ((res = mp_copy (a, c)) != MP_OKAY) {
2919 if (c->alloc < c->used + b/DIGIT_BIT + 1) {
2920 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
2925 /* shift by as many digits in the bit count */
2926 if (b >= DIGIT_BIT) {
2927 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
2932 /* shift any bit count < DIGIT_BIT */
2933 d = (mp_digit) (b % DIGIT_BIT);
2935 register mp_digit *tmpc, shift, mask, r, rr;
2938 /* bitmask for carries */
2939 mask = (((mp_digit)1) << d) - 1;
2941 /* shift for msbs */
2942 shift = DIGIT_BIT - d;
2949 for (x = 0; x < c->used; x++) {
2950 /* get the higher bits of the current word */
2951 rr = (*tmpc >> shift) & mask;
2953 /* shift the current word and OR in the carry */
2954 *tmpc = ((*tmpc << d) | r) & MP_MASK;
2957 /* set the carry to the carry bits of the current word */
2961 /* set final carry */
2963 c->dp[(c->used)++] = r;
2970 /* multiply by a digit */
2972 mp_mul_d (const mp_int * a, mp_digit b, mp_int * c)
2974 mp_digit u, *tmpa, *tmpc;
2976 int ix, res, olduse;
2978 /* make sure c is big enough to hold a*b */
2979 if (c->alloc < a->used + 1) {
2980 if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
2985 /* get the original destinations used count */
2991 /* alias for a->dp [source] */
2994 /* alias for c->dp [dest] */
3000 /* compute columns */
3001 for (ix = 0; ix < a->used; ix++) {
3002 /* compute product and carry sum for this term */
3003 r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
3005 /* mask off higher bits to get a single digit */
3006 *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
3008 /* send carry into next iteration */
3009 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
3012 /* store final carry [if any] */
3015 /* now zero digits above the top */
3016 while (ix++ < olduse) {
3020 /* set used count */
3021 c->used = a->used + 1;
3027 /* d = a * b (mod c) */
3029 mp_mulmod (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
3034 if ((res = mp_init (&t)) != MP_OKAY) {
3038 if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
3042 res = mp_mod (&t, c, d);
3047 /* table of first PRIME_SIZE primes */
3048 static const mp_digit __prime_tab[] = {
3049 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
3050 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
3051 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
3052 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
3053 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
3054 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
3055 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
3056 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
3058 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
3059 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
3060 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
3061 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
3062 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
3063 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
3064 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
3065 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
3067 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
3068 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
3069 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
3070 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
3071 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
3072 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
3073 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
3074 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
3076 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
3077 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
3078 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
3079 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
3080 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
3081 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
3082 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
3083 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
3086 /* determines if an integers is divisible by one
3087 * of the first PRIME_SIZE primes or not
3089 * sets result to 0 if not, 1 if yes
3091 int mp_prime_is_divisible (const mp_int * a, int *result)
3096 /* default to not */
3099 for (ix = 0; ix < PRIME_SIZE; ix++) {
3100 /* what is a mod __prime_tab[ix] */
3101 if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
3105 /* is the residue zero? */
3115 /* performs a variable number of rounds of Miller-Rabin
3117 * Probability of error after t rounds is no more than
3120 * Sets result to 1 if probably prime, 0 otherwise
3122 int mp_prime_is_prime (mp_int * a, int t, int *result)
3130 /* valid value of t? */
3131 if (t <= 0 || t > PRIME_SIZE) {
3135 /* is the input equal to one of the primes in the table? */
3136 for (ix = 0; ix < PRIME_SIZE; ix++) {
3137 if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
3143 /* first perform trial division */
3144 if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
3148 /* return if it was trivially divisible */
3149 if (res == MP_YES) {
3153 /* now perform the miller-rabin rounds */
3154 if ((err = mp_init (&b)) != MP_OKAY) {
3158 for (ix = 0; ix < t; ix++) {
3160 mp_set (&b, __prime_tab[ix]);
3162 if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
3171 /* passed the test */
3177 /* Miller-Rabin test of "a" to the base of "b" as described in
3178 * HAC pp. 139 Algorithm 4.24
3180 * Sets result to 0 if definitely composite or 1 if probably prime.
3181 * Randomly the chance of error is no more than 1/4 and often
3184 int mp_prime_miller_rabin (mp_int * a, const mp_int * b, int *result)
3193 if (mp_cmp_d(b, 1) != MP_GT) {
3197 /* get n1 = a - 1 */
3198 if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
3201 if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
3205 /* set 2**s * r = n1 */
3206 if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
3210 /* count the number of least significant bits
3215 /* now divide n - 1 by 2**s */
3216 if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
3220 /* compute y = b**r mod a */
3221 if ((err = mp_init (&y)) != MP_OKAY) {
3224 if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
3228 /* if y != 1 and y != n1 do */
3229 if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
3231 /* while j <= s-1 and y != n1 */
3232 while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
3233 if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
3237 /* if y == 1 then composite */
3238 if (mp_cmp_d (&y, 1) == MP_EQ) {
3245 /* if y != n1 then composite */
3246 if (mp_cmp (&y, &n1) != MP_EQ) {
3251 /* probably prime now */
3255 __N1:mp_clear (&n1);
3259 static const struct {
3272 /* returns # of RM trials required for a given bit size */
3273 int mp_prime_rabin_miller_trials(int size)
3277 for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
3278 if (sizes[x].k == size) {
3280 } else if (sizes[x].k > size) {
3281 return (x == 0) ? sizes[0].t : sizes[x - 1].t;
3284 return sizes[x-1].t + 1;
3287 /* makes a truly random prime of a given size (bits),
3289 * Flags are as follows:
3291 * LTM_PRIME_BBS - make prime congruent to 3 mod 4
3292 * LTM_PRIME_SAFE - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
3293 * LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
3294 * LTM_PRIME_2MSB_ON - make the 2nd highest bit one
3296 * You have to supply a callback which fills in a buffer with random bytes. "dat" is a parameter you can
3297 * have passed to the callback (e.g. a state or something). This function doesn't use "dat" itself
3302 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
3303 int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
3305 unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
3306 int res, err, bsize, maskOR_msb_offset;
3308 /* sanity check the input */
3309 if (size <= 1 || t <= 0) {
3313 /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
3314 if (flags & LTM_PRIME_SAFE) {
3315 flags |= LTM_PRIME_BBS;
3318 /* calc the byte size */
3319 bsize = (size>>3)+((size&7)?1:0);
3321 /* we need a buffer of bsize bytes */
3322 tmp = malloc(bsize);
3327 /* calc the maskAND value for the MSbyte*/
3328 maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
3330 /* calc the maskOR_msb */
3332 maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
3333 if (flags & LTM_PRIME_2MSB_ON) {
3334 maskOR_msb |= 1 << ((size - 2) & 7);
3335 } else if (flags & LTM_PRIME_2MSB_OFF) {
3336 maskAND &= ~(1 << ((size - 2) & 7));
3339 /* get the maskOR_lsb */
3341 if (flags & LTM_PRIME_BBS) {
3346 /* read the bytes */
3347 if (cb(tmp, bsize, dat) != bsize) {
3352 /* work over the MSbyte */
3354 tmp[0] |= 1 << ((size - 1) & 7);
3356 /* mix in the maskORs */
3357 tmp[maskOR_msb_offset] |= maskOR_msb;
3358 tmp[bsize-1] |= maskOR_lsb;
3361 if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY) { goto error; }
3364 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3369 if (flags & LTM_PRIME_SAFE) {
3370 /* see if (a-1)/2 is prime */
3371 if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) { goto error; }
3372 if ((err = mp_div_2(a, a)) != MP_OKAY) { goto error; }
3375 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3377 } while (res == MP_NO);
3379 if (flags & LTM_PRIME_SAFE) {
3380 /* restore a to the original value */
3381 if ((err = mp_mul_2(a, a)) != MP_OKAY) { goto error; }
3382 if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { goto error; }
3391 /* reads an unsigned char array, assumes the msb is stored first [big endian] */
3393 mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
3397 /* make sure there are at least two digits */
3399 if ((res = mp_grow(a, 2)) != MP_OKAY) {
3407 /* read the bytes in */
3409 if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
3420 /* reduces x mod m, assumes 0 < x < m**2, mu is
3421 * precomputed via mp_reduce_setup.
3422 * From HAC pp.604 Algorithm 14.42
3425 mp_reduce (mp_int * x, const mp_int * m, const mp_int * mu)
3428 int res, um = m->used;
3431 if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
3435 /* q1 = x / b**(k-1) */
3436 mp_rshd (&q, um - 1);
3438 /* according to HAC this optimization is ok */
3439 if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
3440 if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
3444 if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
3449 /* q3 = q2 / b**(k+1) */
3450 mp_rshd (&q, um + 1);
3452 /* x = x mod b**(k+1), quick (no division) */
3453 if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
3457 /* q = q * m mod b**(k+1), quick (no division) */
3458 if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
3463 if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
3467 /* If x < 0, add b**(k+1) to it */
3468 if (mp_cmp_d (x, 0) == MP_LT) {
3470 if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
3472 if ((res = mp_add (x, &q, x)) != MP_OKAY)
3476 /* Back off if it's too big */
3477 while (mp_cmp (x, m) != MP_LT) {
3478 if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
3489 /* reduces a modulo n where n is of the form 2**p - d */
3491 mp_reduce_2k(mp_int *a, const mp_int *n, mp_digit d)
3496 if ((res = mp_init(&q)) != MP_OKAY) {
3500 p = mp_count_bits(n);
3502 /* q = a/2**p, a = a mod 2**p */
3503 if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
3509 if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
3515 if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
3519 if (mp_cmp_mag(a, n) != MP_LT) {
3529 /* determines the setup value */
3531 mp_reduce_2k_setup(const mp_int *a, mp_digit *d)
3536 if ((res = mp_init(&tmp)) != MP_OKAY) {
3540 p = mp_count_bits(a);
3541 if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
3546 if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
3556 /* pre-calculate the value required for Barrett reduction
3557 * For a given modulus "b" it calulates the value required in "a"
3559 int mp_reduce_setup (mp_int * a, const mp_int * b)
3563 if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
3566 return mp_div (a, b, a, NULL);
3569 /* shift right a certain amount of digits */
3570 void mp_rshd (mp_int * a, int b)
3574 /* if b <= 0 then ignore it */
3579 /* if b > used then simply zero it and return */
3586 register mp_digit *bottom, *top;
3588 /* shift the digits down */
3593 /* top [offset into digits] */
3596 /* this is implemented as a sliding window where
3597 * the window is b-digits long and digits from
3598 * the top of the window are copied to the bottom
3602 b-2 | b-1 | b0 | b1 | b2 | ... | bb | ---->
3604 \-------------------/ ---->
3606 for (x = 0; x < (a->used - b); x++) {
3610 /* zero the top digits */
3611 for (; x < a->used; x++) {
3616 /* remove excess digits */
3620 /* set to a digit */
3621 void mp_set (mp_int * a, mp_digit b)
3624 a->dp[0] = b & MP_MASK;
3625 a->used = (a->dp[0] != 0) ? 1 : 0;
3628 /* set a 32-bit const */
3629 int mp_set_int (mp_int * a, unsigned long b)
3635 /* set four bits at a time */
3636 for (x = 0; x < 8; x++) {
3637 /* shift the number up four bits */
3638 if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
3642 /* OR in the top four bits of the source */
3643 a->dp[0] |= (b >> 28) & 15;
3645 /* shift the source up to the next four bits */
3648 /* ensure that digits are not clamped off */
3655 /* shrink a bignum */
3656 int mp_shrink (mp_int * a)
3659 if (a->alloc != a->used && a->used > 0) {
3660 if ((tmp = realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
3669 /* get the size for an signed equivalent */
3670 int mp_signed_bin_size (const mp_int * a)
3672 return 1 + mp_unsigned_bin_size (a);
3675 /* computes b = a*a */
3677 mp_sqr (const mp_int * a, mp_int * b)
3681 if (a->used >= KARATSUBA_SQR_CUTOFF) {
3682 res = mp_karatsuba_sqr (a, b);
3685 /* can we use the fast comba multiplier? */
3686 if ((a->used * 2 + 1) < MP_WARRAY &&
3688 (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
3689 res = fast_s_mp_sqr (a, b);
3691 res = s_mp_sqr (a, b);
3697 /* c = a * a (mod b) */
3699 mp_sqrmod (const mp_int * a, mp_int * b, mp_int * c)
3704 if ((res = mp_init (&t)) != MP_OKAY) {
3708 if ((res = mp_sqr (a, &t)) != MP_OKAY) {
3712 res = mp_mod (&t, b, c);
3717 /* high level subtraction (handles signs) */
3719 mp_sub (mp_int * a, mp_int * b, mp_int * c)
3727 /* subtract a negative from a positive, OR */
3728 /* subtract a positive from a negative. */
3729 /* In either case, ADD their magnitudes, */
3730 /* and use the sign of the first number. */
3732 res = s_mp_add (a, b, c);
3734 /* subtract a positive from a positive, OR */
3735 /* subtract a negative from a negative. */
3736 /* First, take the difference between their */
3737 /* magnitudes, then... */
3738 if (mp_cmp_mag (a, b) != MP_LT) {
3739 /* Copy the sign from the first */
3741 /* The first has a larger or equal magnitude */
3742 res = s_mp_sub (a, b, c);
3744 /* The result has the *opposite* sign from */
3745 /* the first number. */
3746 c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
3747 /* The second has a larger magnitude */
3748 res = s_mp_sub (b, a, c);
3754 /* single digit subtraction */
3756 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
3758 mp_digit *tmpa, *tmpc, mu;
3759 int res, ix, oldused;
3761 /* grow c as required */
3762 if (c->alloc < a->used + 1) {
3763 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
3768 /* if a is negative just do an unsigned
3769 * addition [with fudged signs]
3771 if (a->sign == MP_NEG) {
3773 res = mp_add_d(a, b, c);
3774 a->sign = c->sign = MP_NEG;
3783 /* if a <= b simply fix the single digit */
3784 if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
3786 *tmpc++ = b - *tmpa;
3792 /* negative/1digit */
3800 /* subtract first digit */
3801 *tmpc = *tmpa++ - b;
3802 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3805 /* handle rest of the digits */
3806 for (ix = 1; ix < a->used; ix++) {
3807 *tmpc = *tmpa++ - mu;
3808 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3813 /* zero excess digits */
3814 while (ix++ < oldused) {
3821 /* store in unsigned [big endian] format */
3823 mp_to_unsigned_bin (const mp_int * a, unsigned char *b)
3828 if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
3833 while (mp_iszero (&t) == 0) {
3834 b[x++] = (unsigned char) (t.dp[0] & 255);
3835 if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
3845 /* get the size for an unsigned equivalent */
3847 mp_unsigned_bin_size (const mp_int * a)
3849 int size = mp_count_bits (a);
3850 return (size / 8 + ((size & 7) != 0 ? 1 : 0));
3855 mp_zero (mp_int * a)
3859 memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
3862 /* reverse an array, used for radix code */
3864 bn_reverse (unsigned char *s, int len)
3880 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
3882 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
3885 int olduse, res, min, max;
3887 /* find sizes, we let |a| <= |b| which means we have to sort
3888 * them. "x" will point to the input with the most digits
3890 if (a->used > b->used) {
3901 if (c->alloc < max + 1) {
3902 if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
3907 /* get old used digit count and set new one */
3912 register mp_digit u, *tmpa, *tmpb, *tmpc;
3915 /* alias for digit pointers */
3926 /* zero the carry */
3928 for (i = 0; i < min; i++) {
3929 /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
3930 *tmpc = *tmpa++ + *tmpb++ + u;
3932 /* U = carry bit of T[i] */
3933 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3935 /* take away carry bit from T[i] */
3939 /* now copy higher words if any, that is in A+B
3940 * if A or B has more digits add those in
3943 for (; i < max; i++) {
3944 /* T[i] = X[i] + U */
3945 *tmpc = x->dp[i] + u;
3947 /* U = carry bit of T[i] */
3948 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3950 /* take away carry bit from T[i] */
3958 /* clear digits above oldused */
3959 for (i = c->used; i < olduse; i++) {
3968 static int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
3970 mp_int M[256], res, mu;
3972 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
3974 /* find window size */
3975 x = mp_count_bits (X);
3978 } else if (x <= 36) {
3980 } else if (x <= 140) {
3982 } else if (x <= 450) {
3984 } else if (x <= 1303) {
3986 } else if (x <= 3529) {
3993 /* init first cell */
3994 if ((err = mp_init(&M[1])) != MP_OKAY) {
3998 /* now init the second half of the array */
3999 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4000 if ((err = mp_init(&M[x])) != MP_OKAY) {
4001 for (y = 1<<(winsize-1); y < x; y++) {
4009 /* create mu, used for Barrett reduction */
4010 if ((err = mp_init (&mu)) != MP_OKAY) {
4013 if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
4019 * The M table contains powers of the base,
4020 * e.g. M[x] = G**x mod P
4022 * The first half of the table is not
4023 * computed though accept for M[0] and M[1]
4025 if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
4029 /* compute the value at M[1<<(winsize-1)] by squaring
4030 * M[1] (winsize-1) times
4032 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
4036 for (x = 0; x < (winsize - 1); x++) {
4037 if ((err = mp_sqr (&M[1 << (winsize - 1)],
4038 &M[1 << (winsize - 1)])) != MP_OKAY) {
4041 if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
4046 /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
4047 * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
4049 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
4050 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
4053 if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
4059 if ((err = mp_init (&res)) != MP_OKAY) {
4064 /* set initial mode and bit cnt */
4068 digidx = X->used - 1;
4073 /* grab next digit as required */
4074 if (--bitcnt == 0) {
4075 /* if digidx == -1 we are out of digits */
4079 /* read next digit and reset the bitcnt */
4080 buf = X->dp[digidx--];
4084 /* grab the next msb from the exponent */
4085 y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
4086 buf <<= (mp_digit)1;
4088 /* if the bit is zero and mode == 0 then we ignore it
4089 * These represent the leading zero bits before the first 1 bit
4090 * in the exponent. Technically this opt is not required but it
4091 * does lower the # of trivial squaring/reductions used
4093 if (mode == 0 && y == 0) {
4097 /* if the bit is zero and mode == 1 then we square */
4098 if (mode == 1 && y == 0) {
4099 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4102 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4108 /* else we add it to the window */
4109 bitbuf |= (y << (winsize - ++bitcpy));
4112 if (bitcpy == winsize) {
4113 /* ok window is filled so square as required and multiply */
4115 for (x = 0; x < winsize; x++) {
4116 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4119 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4125 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
4128 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4132 /* empty window and reset */
4139 /* if bits remain then square/multiply */
4140 if (mode == 2 && bitcpy > 0) {
4141 /* square then multiply if the bit is set */
4142 for (x = 0; x < bitcpy; x++) {
4143 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4146 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4151 if ((bitbuf & (1 << winsize)) != 0) {
4153 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
4156 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4165 __RES:mp_clear (&res);
4166 __MU:mp_clear (&mu);
4169 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4175 /* multiplies |a| * |b| and only computes up to digs digits of result
4176 * HAC pp. 595, Algorithm 14.12 Modified so you can control how
4177 * many digits of output are created.
4180 s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
4183 int res, pa, pb, ix, iy;
4186 mp_digit tmpx, *tmpt, *tmpy;
4188 /* can we use the fast multiplier? */
4189 if (((digs) < MP_WARRAY) &&
4190 MIN (a->used, b->used) <
4191 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4192 return fast_s_mp_mul_digs (a, b, c, digs);
4195 if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
4200 /* compute the digits of the product directly */
4202 for (ix = 0; ix < pa; ix++) {
4203 /* set the carry to zero */
4206 /* limit ourselves to making digs digits of output */
4207 pb = MIN (b->used, digs - ix);
4209 /* setup some aliases */
4210 /* copy of the digit from a used within the nested loop */
4213 /* an alias for the destination shifted ix places */
4216 /* an alias for the digits of b */
4219 /* compute the columns of the output and propagate the carry */
4220 for (iy = 0; iy < pb; iy++) {
4221 /* compute the column as a mp_word */
4222 r = ((mp_word)*tmpt) +
4223 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4226 /* the new column is the lower part of the result */
4227 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4229 /* get the carry word from the result */
4230 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4232 /* set carry if it is placed below digs */
4233 if (ix + iy < digs) {
4245 /* multiplies |a| * |b| and does not compute the lower digs digits
4246 * [meant to get the higher part of the product]
4249 s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
4252 int res, pa, pb, ix, iy;
4255 mp_digit tmpx, *tmpt, *tmpy;
4257 /* can we use the fast multiplier? */
4258 if (((a->used + b->used + 1) < MP_WARRAY)
4259 && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4260 return fast_s_mp_mul_high_digs (a, b, c, digs);
4263 if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
4266 t.used = a->used + b->used + 1;
4270 for (ix = 0; ix < pa; ix++) {
4271 /* clear the carry */
4274 /* left hand side of A[ix] * B[iy] */
4277 /* alias to the address of where the digits will be stored */
4278 tmpt = &(t.dp[digs]);
4280 /* alias for where to read the right hand side from */
4281 tmpy = b->dp + (digs - ix);
4283 for (iy = digs - ix; iy < pb; iy++) {
4284 /* calculate the double precision result */
4285 r = ((mp_word)*tmpt) +
4286 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4289 /* get the lower part */
4290 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4292 /* carry the carry */
4293 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4303 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
4305 s_mp_sqr (const mp_int * a, mp_int * b)
4308 int res, ix, iy, pa;
4310 mp_digit u, tmpx, *tmpt;
4313 if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
4317 /* default used is maximum possible size */
4320 for (ix = 0; ix < pa; ix++) {
4321 /* first calculate the digit at 2*ix */
4322 /* calculate double precision result */
4323 r = ((mp_word) t.dp[2*ix]) +
4324 ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
4326 /* store lower part in result */
4327 t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
4330 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4332 /* left hand side of A[ix] * A[iy] */
4335 /* alias for where to store the results */
4336 tmpt = t.dp + (2*ix + 1);
4338 for (iy = ix + 1; iy < pa; iy++) {
4339 /* first calculate the product */
4340 r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
4342 /* now calculate the double precision result, note we use
4343 * addition instead of *2 since it's easier to optimize
4345 r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
4347 /* store lower part */
4348 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4351 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4353 /* propagate upwards */
4354 while (u != ((mp_digit) 0)) {
4355 r = ((mp_word) *tmpt) + ((mp_word) u);
4356 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4357 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4367 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
4369 s_mp_sub (const mp_int * a, const mp_int * b, mp_int * c)
4371 int olduse, res, min, max;
4378 if (c->alloc < max) {
4379 if ((res = mp_grow (c, max)) != MP_OKAY) {
4387 register mp_digit u, *tmpa, *tmpb, *tmpc;
4390 /* alias for digit pointers */
4395 /* set carry to zero */
4397 for (i = 0; i < min; i++) {
4398 /* T[i] = A[i] - B[i] - U */
4399 *tmpc = *tmpa++ - *tmpb++ - u;
4401 /* U = carry bit of T[i]
4402 * Note this saves performing an AND operation since
4403 * if a carry does occur it will propagate all the way to the
4404 * MSB. As a result a single shift is enough to get the carry
4406 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4408 /* Clear carry from T[i] */
4412 /* now copy higher words if any, e.g. if A has more digits than B */
4413 for (; i < max; i++) {
4414 /* T[i] = A[i] - U */
4415 *tmpc = *tmpa++ - u;
4417 /* U = carry bit of T[i] */
4418 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4420 /* Clear carry from T[i] */
4424 /* clear digits above used (since we may not have grown result above) */
4425 for (i = c->used; i < olduse; i++) {