3 * Multi Precision Integer functions
5 * Copyright 2004 Michael Jung
6 * Based on public domain code by Tom St Denis (tomstdenis@iahu.ca)
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with this library; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
24 * This file contains code from the LibTomCrypt cryptographic
25 * library written by Tom St Denis (tomstdenis@iahu.ca). LibTomCrypt
26 * is in the public domain. The code in this file is tailored to
27 * special requirements. Take a look at http://libtomcrypt.org for the
34 /* table of first PRIME_SIZE primes */
35 static const mp_digit __prime_tab[];
37 /* Known optimal configurations
38 CPU /Compiler /MUL CUTOFF/SQR CUTOFF
39 -------------------------------------------------------------
40 Intel P4 Northwood /GCC v3.4.1 / 88/ 128/LTM 0.32 ;-)
42 static const int KARATSUBA_MUL_CUTOFF = 88, /* Min. number of digits before Karatsuba multiplication is used. */
43 KARATSUBA_SQR_CUTOFF = 128; /* Min. number of digits before Karatsuba squaring is used. */
45 /* computes the modular inverse via binary extended euclidean algorithm,
46 * that is c = 1/a mod b
48 * Based on slow invmod except this is optimized for the case where b is
49 * odd as per HAC Note 14.64 on pp. 610
52 fast_mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
54 mp_int x, y, u, v, B, D;
57 /* 2. [modified] b must be odd */
58 if (mp_iseven (b) == 1) {
62 /* init all our temps */
63 if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
67 /* x == modulus, y == value to invert */
68 if ((res = mp_copy (b, &x)) != MP_OKAY) {
73 if ((res = mp_abs (a, &y)) != MP_OKAY) {
77 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
78 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
81 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
87 /* 4. while u is even do */
88 while (mp_iseven (&u) == 1) {
90 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
93 /* 4.2 if B is odd then */
94 if (mp_isodd (&B) == 1) {
95 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
100 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
105 /* 5. while v is even do */
106 while (mp_iseven (&v) == 1) {
108 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
111 /* 5.2 if D is odd then */
112 if (mp_isodd (&D) == 1) {
114 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
119 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
124 /* 6. if u >= v then */
125 if (mp_cmp (&u, &v) != MP_LT) {
126 /* u = u - v, B = B - D */
127 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
131 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
135 /* v - v - u, D = D - B */
136 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
140 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
145 /* if not zero goto step 4 */
146 if (mp_iszero (&u) == 0) {
150 /* now a = C, b = D, gcd == g*v */
152 /* if v != 1 then there is no inverse */
153 if (mp_cmp_d (&v, 1) != MP_EQ) {
158 /* b is now the inverse */
160 while (D.sign == MP_NEG) {
161 if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
169 __ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
173 /* computes xR**-1 == x (mod N) via Montgomery Reduction
175 * This is an optimized implementation of montgomery_reduce
176 * which uses the comba method to quickly calculate the columns of the
179 * Based on Algorithm 14.32 on pp.601 of HAC.
182 fast_mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
185 mp_word W[MP_WARRAY];
187 /* get old used count */
190 /* grow a as required */
191 if (x->alloc < n->used + 1) {
192 if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
197 /* first we have to get the digits of the input into
198 * an array of double precision words W[...]
201 register mp_word *_W;
202 register mp_digit *tmpx;
204 /* alias for the W[] array */
207 /* alias for the digits of x*/
210 /* copy the digits of a into W[0..a->used-1] */
211 for (ix = 0; ix < x->used; ix++) {
215 /* zero the high words of W[a->used..m->used*2] */
216 for (; ix < n->used * 2 + 1; ix++) {
221 /* now we proceed to zero successive digits
222 * from the least significant upwards
224 for (ix = 0; ix < n->used; ix++) {
225 /* mu = ai * m' mod b
227 * We avoid a double precision multiplication (which isn't required)
228 * by casting the value down to a mp_digit. Note this requires
229 * that W[ix-1] have the carry cleared (see after the inner loop)
231 register mp_digit mu;
232 mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
234 /* a = a + mu * m * b**i
236 * This is computed in place and on the fly. The multiplication
237 * by b**i is handled by offseting which columns the results
240 * Note the comba method normally doesn't handle carries in the
241 * inner loop In this case we fix the carry from the previous
242 * column since the Montgomery reduction requires digits of the
243 * result (so far) [see above] to work. This is
244 * handled by fixing up one carry after the inner loop. The
245 * carry fixups are done in order so after these loops the
246 * first m->used words of W[] have the carries fixed
250 register mp_digit *tmpn;
251 register mp_word *_W;
253 /* alias for the digits of the modulus */
256 /* Alias for the columns set by an offset of ix */
260 for (iy = 0; iy < n->used; iy++) {
261 *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
265 /* now fix carry for next digit, W[ix+1] */
266 W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
269 /* now we have to propagate the carries and
270 * shift the words downward [all those least
271 * significant digits we zeroed].
274 register mp_digit *tmpx;
275 register mp_word *_W, *_W1;
277 /* nox fix rest of carries */
279 /* alias for current word */
282 /* alias for next word, where the carry goes */
285 for (; ix <= n->used * 2 + 1; ix++) {
286 *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
289 /* copy out, A = A/b**n
291 * The result is A/b**n but instead of converting from an
292 * array of mp_word to mp_digit than calling mp_rshd
293 * we just copy them in the right order
296 /* alias for destination word */
299 /* alias for shifted double precision result */
302 for (ix = 0; ix < n->used + 1; ix++) {
303 *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
306 /* zero oldused digits, if the input a was larger than
307 * m->used+1 we'll have to clear the digits
309 for (; ix < olduse; ix++) {
314 /* set the max used and clamp */
315 x->used = n->used + 1;
318 /* if A >= m then A = A - m */
319 if (mp_cmp_mag (x, n) != MP_LT) {
320 return s_mp_sub (x, n, x);
325 /* Fast (comba) multiplier
327 * This is the fast column-array [comba] multiplier. It is
328 * designed to compute the columns of the product first
329 * then handle the carries afterwards. This has the effect
330 * of making the nested loops that compute the columns very
331 * simple and schedulable on super-scalar processors.
333 * This has been modified to produce a variable number of
334 * digits of output so if say only a half-product is required
335 * you don't have to compute the upper half (a feature
336 * required for fast Barrett reduction).
338 * Based on Algorithm 14.12 on pp.595 of HAC.
342 fast_s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
344 int olduse, res, pa, ix, iz;
345 mp_digit W[MP_WARRAY];
348 /* grow the destination as required */
349 if (c->alloc < digs) {
350 if ((res = mp_grow (c, digs)) != MP_OKAY) {
355 /* number of output digits to produce */
356 pa = MIN(digs, a->used + b->used);
358 /* clear the carry */
360 for (ix = 0; ix <= pa; ix++) {
363 mp_digit *tmpx, *tmpy;
365 /* get offsets into the two bignums */
366 ty = MIN(b->used-1, ix);
369 /* setup temp aliases */
373 /* this is the number of times the loop will iterrate, essentially its
374 while (tx++ < a->used && ty-- >= 0) { ... }
376 iy = MIN(a->used-tx, ty+1);
379 for (iz = 0; iz < iy; ++iz) {
380 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
384 W[ix] = ((mp_digit)_W) & MP_MASK;
386 /* make next carry */
387 _W = _W >> ((mp_word)DIGIT_BIT);
395 register mp_digit *tmpc;
397 for (ix = 0; ix < digs; ix++) {
398 /* now extract the previous digit [below the carry] */
402 /* clear unused digits [that existed in the old copy of c] */
403 for (; ix < olduse; ix++) {
411 /* this is a modified version of fast_s_mul_digs that only produces
412 * output digits *above* digs. See the comments for fast_s_mul_digs
413 * to see how it works.
415 * This is used in the Barrett reduction since for one of the multiplications
416 * only the higher digits were needed. This essentially halves the work.
418 * Based on Algorithm 14.12 on pp.595 of HAC.
421 fast_s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
423 int olduse, res, pa, ix, iz;
424 mp_digit W[MP_WARRAY];
427 /* grow the destination as required */
428 pa = a->used + b->used;
430 if ((res = mp_grow (c, pa)) != MP_OKAY) {
435 /* number of output digits to produce */
436 pa = a->used + b->used;
438 for (ix = digs; ix <= pa; ix++) {
440 mp_digit *tmpx, *tmpy;
442 /* get offsets into the two bignums */
443 ty = MIN(b->used-1, ix);
446 /* setup temp aliases */
450 /* this is the number of times the loop will iterrate, essentially its
451 while (tx++ < a->used && ty-- >= 0) { ... }
453 iy = MIN(a->used-tx, ty+1);
456 for (iz = 0; iz < iy; iz++) {
457 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
461 W[ix] = ((mp_digit)_W) & MP_MASK;
463 /* make next carry */
464 _W = _W >> ((mp_word)DIGIT_BIT);
472 register mp_digit *tmpc;
475 for (ix = digs; ix <= pa; ix++) {
476 /* now extract the previous digit [below the carry] */
480 /* clear unused digits [that existed in the old copy of c] */
481 for (; ix < olduse; ix++) {
491 * This is the comba method where the columns of the product
492 * are computed first then the carries are computed. This
493 * has the effect of making a very simple inner loop that
494 * is executed the most
496 * W2 represents the outer products and W the inner.
498 * A further optimizations is made because the inner
499 * products are of the form "A * B * 2". The *2 part does
500 * not need to be computed until the end which is good
501 * because 64-bit shifts are slow!
503 * Based on Algorithm 14.16 on pp.597 of HAC.
506 /* the jist of squaring...
508 you do like mult except the offset of the tmpx [one that starts closer to zero]
509 can't equal the offset of tmpy. So basically you set up iy like before then you min it with
510 (ty-tx) so that it never happens. You double all those you add in the inner loop
512 After that loop you do the squares and add them in.
514 Remove W2 and don't memset W
518 int fast_s_mp_sqr (const mp_int * a, mp_int * b)
520 int olduse, res, pa, ix, iz;
521 mp_digit W[MP_WARRAY], *tmpx;
524 /* grow the destination as required */
525 pa = a->used + a->used;
527 if ((res = mp_grow (b, pa)) != MP_OKAY) {
532 /* number of output digits to produce */
534 for (ix = 0; ix <= pa; ix++) {
542 /* get offsets into the two bignums */
543 ty = MIN(a->used-1, ix);
546 /* setup temp aliases */
550 /* this is the number of times the loop will iterrate, essentially its
551 while (tx++ < a->used && ty-- >= 0) { ... }
553 iy = MIN(a->used-tx, ty+1);
555 /* now for squaring tx can never equal ty
556 * we halve the distance since they approach at a rate of 2x
557 * and we have to round because odd cases need to be executed
559 iy = MIN(iy, (ty-tx+1)>>1);
562 for (iz = 0; iz < iy; iz++) {
563 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
566 /* double the inner product and add carry */
569 /* even columns have the square term in them */
571 _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
577 /* make next carry */
578 W1 = _W >> ((mp_word)DIGIT_BIT);
583 b->used = a->used+a->used;
588 for (ix = 0; ix < pa; ix++) {
589 *tmpb++ = W[ix] & MP_MASK;
592 /* clear unused digits [that existed in the old copy of c] */
593 for (; ix < olduse; ix++) {
603 * Simple algorithm which zeroes the int, grows it then just sets one bit
607 mp_2expt (mp_int * a, int b)
611 /* zero a as per default */
614 /* grow a to accommodate the single bit */
615 if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
619 /* set the used count of where the bit will go */
620 a->used = b / DIGIT_BIT + 1;
622 /* put the single bit in its place */
623 a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
630 * Simple function copies the input and fixes the sign to positive
633 mp_abs (const mp_int * a, mp_int * b)
639 if ((res = mp_copy (a, b)) != MP_OKAY) {
644 /* force the sign of b to positive */
650 /* high level addition (handles signs) */
651 int mp_add (mp_int * a, mp_int * b, mp_int * c)
655 /* get sign of both inputs */
659 /* handle two cases, not four */
661 /* both positive or both negative */
662 /* add their magnitudes, copy the sign */
664 res = s_mp_add (a, b, c);
666 /* one positive, the other negative */
667 /* subtract the one with the greater magnitude from */
668 /* the one of the lesser magnitude. The result gets */
669 /* the sign of the one with the greater magnitude. */
670 if (mp_cmp_mag (a, b) == MP_LT) {
672 res = s_mp_sub (b, a, c);
675 res = s_mp_sub (a, b, c);
682 /* single digit addition */
684 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
686 int res, ix, oldused;
687 mp_digit *tmpa, *tmpc, mu;
689 /* grow c as required */
690 if (c->alloc < a->used + 1) {
691 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
696 /* if a is negative and |a| >= b, call c = |a| - b */
697 if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) {
698 /* temporarily fix sign of a */
702 res = mp_sub_d(a, b, c);
705 a->sign = c->sign = MP_NEG;
710 /* old number of used digits in c */
713 /* sign always positive */
719 /* destination alias */
722 /* if a is positive */
723 if (a->sign == MP_ZPOS) {
724 /* add digit, after this we're propagating
728 mu = *tmpc >> DIGIT_BIT;
731 /* now handle rest of the digits */
732 for (ix = 1; ix < a->used; ix++) {
733 *tmpc = *tmpa++ + mu;
734 mu = *tmpc >> DIGIT_BIT;
737 /* set final carry */
742 c->used = a->used + 1;
744 /* a was negative and |a| < b */
747 /* the result is a single digit */
749 *tmpc++ = b - a->dp[0];
754 /* setup count so the clearing of oldused
755 * can fall through correctly
760 /* now zero to oldused */
761 while (ix++ < oldused) {
769 /* trim unused digits
771 * This is used to ensure that leading zero digits are
772 * trimed and the leading "used" digit will be non-zero
773 * Typically very fast. Also fixes the sign if there
774 * are no more leading digits
777 mp_clamp (mp_int * a)
779 /* decrease used while the most significant digit is
782 while (a->used > 0 && a->dp[a->used - 1] == 0) {
786 /* reset the sign flag if used == 0 */
792 /* clear one (frees) */
794 mp_clear (mp_int * a)
798 /* only do anything if a hasn't been freed previously */
800 /* first zero the digits */
801 for (i = 0; i < a->used; i++) {
808 /* reset members to make debugging easier */
810 a->alloc = a->used = 0;
816 void mp_clear_multi(mp_int *mp, ...)
818 mp_int* next_mp = mp;
821 while (next_mp != NULL) {
823 next_mp = va_arg(args, mp_int*);
828 /* compare two ints (signed)*/
830 mp_cmp (const mp_int * a, const mp_int * b)
832 /* compare based on sign */
833 if (a->sign != b->sign) {
834 if (a->sign == MP_NEG) {
842 if (a->sign == MP_NEG) {
843 /* if negative compare opposite direction */
844 return mp_cmp_mag(b, a);
846 return mp_cmp_mag(a, b);
850 /* compare a digit */
851 int mp_cmp_d(const mp_int * a, mp_digit b)
853 /* compare based on sign */
854 if (a->sign == MP_NEG) {
858 /* compare based on magnitude */
863 /* compare the only digit of a to b */
866 } else if (a->dp[0] < b) {
873 /* compare maginitude of two ints (unsigned) */
874 int mp_cmp_mag (const mp_int * a, const mp_int * b)
877 mp_digit *tmpa, *tmpb;
879 /* compare based on # of non-zero digits */
880 if (a->used > b->used) {
884 if (a->used < b->used) {
889 tmpa = a->dp + (a->used - 1);
892 tmpb = b->dp + (a->used - 1);
894 /* compare based on digits */
895 for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
907 static const int lnz[16] = {
908 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
911 /* Counts the number of lsbs which are zero before the first zero bit */
912 int mp_cnt_lsb(const mp_int *a)
918 if (mp_iszero(a) == 1) {
922 /* scan lower digits until non-zero */
923 for (x = 0; x < a->used && a->dp[x] == 0; x++);
927 /* now scan this digit until a 1 is found */
940 mp_copy (const mp_int * a, mp_int * b)
944 /* if dst == src do nothing */
950 if (b->alloc < a->used) {
951 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
956 /* zero b and copy the parameters over */
958 register mp_digit *tmpa, *tmpb;
960 /* pointer aliases */
968 /* copy all the digits */
969 for (n = 0; n < a->used; n++) {
973 /* clear high digits */
974 for (; n < b->used; n++) {
979 /* copy used count and sign */
985 /* returns the number of bits in an int */
987 mp_count_bits (const mp_int * a)
997 /* get number of digits and add that */
998 r = (a->used - 1) * DIGIT_BIT;
1000 /* take the last digit and count the bits in it */
1001 q = a->dp[a->used - 1];
1002 while (q > ((mp_digit) 0)) {
1004 q >>= ((mp_digit) 1);
1009 /* integer signed division.
1010 * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
1011 * HAC pp.598 Algorithm 14.20
1013 * Note that the description in HAC is horribly
1014 * incomplete. For example, it doesn't consider
1015 * the case where digits are removed from 'x' in
1016 * the inner loop. It also doesn't consider the
1017 * case that y has fewer than three digits, etc..
1019 * The overall algorithm is as described as
1020 * 14.20 from HAC but fixed to treat these cases.
1022 int mp_div (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
1024 mp_int q, x, y, t1, t2;
1025 int res, n, t, i, norm, neg;
1027 /* is divisor zero ? */
1028 if (mp_iszero (b) == 1) {
1032 /* if a < b then q=0, r = a */
1033 if (mp_cmp_mag (a, b) == MP_LT) {
1035 res = mp_copy (a, d);
1045 if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
1048 q.used = a->used + 2;
1050 if ((res = mp_init (&t1)) != MP_OKAY) {
1054 if ((res = mp_init (&t2)) != MP_OKAY) {
1058 if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
1062 if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
1067 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
1068 x.sign = y.sign = MP_ZPOS;
1070 /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
1071 norm = mp_count_bits(&y) % DIGIT_BIT;
1072 if (norm < DIGIT_BIT-1) {
1073 norm = (DIGIT_BIT-1) - norm;
1074 if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
1077 if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
1084 /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
1088 /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
1089 if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
1093 while (mp_cmp (&x, &y) != MP_LT) {
1095 if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
1100 /* reset y by shifting it back down */
1101 mp_rshd (&y, n - t);
1103 /* step 3. for i from n down to (t + 1) */
1104 for (i = n; i >= (t + 1); i--) {
1109 /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
1110 * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
1111 if (x.dp[i] == y.dp[t]) {
1112 q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
1115 tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
1116 tmp |= ((mp_word) x.dp[i - 1]);
1117 tmp /= ((mp_word) y.dp[t]);
1118 if (tmp > (mp_word) MP_MASK)
1120 q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
1123 /* while (q{i-t-1} * (yt * b + y{t-1})) >
1124 xi * b**2 + xi-1 * b + xi-2
1128 q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
1130 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
1132 /* find left hand */
1134 t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
1137 if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1141 /* find right hand */
1142 t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
1143 t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
1146 } while (mp_cmp_mag(&t1, &t2) == MP_GT);
1148 /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
1149 if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
1153 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1157 if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
1161 /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
1162 if (x.sign == MP_NEG) {
1163 if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
1166 if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
1169 if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
1173 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
1177 /* now q is the quotient and x is the remainder
1178 * [which we have to normalize]
1181 /* get sign before writing to c */
1182 x.sign = x.used == 0 ? MP_ZPOS : a->sign;
1191 mp_div_2d (&x, norm, &x, NULL);
1199 __T2:mp_clear (&t2);
1200 __T1:mp_clear (&t1);
1206 int mp_div_2(const mp_int * a, mp_int * b)
1208 int x, res, oldused;
1211 if (b->alloc < a->used) {
1212 if ((res = mp_grow (b, a->used)) != MP_OKAY) {
1220 register mp_digit r, rr, *tmpa, *tmpb;
1223 tmpa = a->dp + b->used - 1;
1226 tmpb = b->dp + b->used - 1;
1230 for (x = b->used - 1; x >= 0; x--) {
1231 /* get the carry for the next iteration */
1234 /* shift the current digit, add in carry and store */
1235 *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
1237 /* forward carry to next iteration */
1241 /* zero excess digits */
1242 tmpb = b->dp + b->used;
1243 for (x = b->used; x < oldused; x++) {
1252 /* shift right by a certain bit count (store quotient in c, optional remainder in d) */
1253 int mp_div_2d (const mp_int * a, int b, mp_int * c, mp_int * d)
1260 /* if the shift count is <= 0 then we do no work */
1262 res = mp_copy (a, c);
1269 if ((res = mp_init (&t)) != MP_OKAY) {
1273 /* get the remainder */
1275 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
1282 if ((res = mp_copy (a, c)) != MP_OKAY) {
1287 /* shift by as many digits in the bit count */
1288 if (b >= DIGIT_BIT) {
1289 mp_rshd (c, b / DIGIT_BIT);
1292 /* shift any bit count < DIGIT_BIT */
1293 D = (mp_digit) (b % DIGIT_BIT);
1295 register mp_digit *tmpc, mask, shift;
1298 mask = (((mp_digit)1) << D) - 1;
1301 shift = DIGIT_BIT - D;
1304 tmpc = c->dp + (c->used - 1);
1308 for (x = c->used - 1; x >= 0; x--) {
1309 /* get the lower bits of this word in a temp */
1312 /* shift the current word and mix in the carry bits from the previous word */
1313 *tmpc = (*tmpc >> D) | (r << shift);
1316 /* set the carry to the carry bits of the current word found above */
1328 static int s_is_power_of_two(mp_digit b, int *p)
1332 for (x = 1; x < DIGIT_BIT; x++) {
1333 if (b == (((mp_digit)1)<<x)) {
1341 /* single digit division (based on routine from MPI) */
1342 int mp_div_d (const mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
1349 /* cannot divide by zero */
1355 if (b == 1 || mp_iszero(a) == 1) {
1360 return mp_copy(a, c);
1365 /* power of two ? */
1366 if (s_is_power_of_two(b, &ix) == 1) {
1368 *d = a->dp[0] & ((((mp_digit)1)<<ix) - 1);
1371 return mp_div_2d(a, ix, c, NULL);
1376 /* no easy answer [c'est la vie]. Just division */
1377 if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
1384 for (ix = a->used - 1; ix >= 0; ix--) {
1385 w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
1388 t = (mp_digit)(w / b);
1389 w -= ((mp_word)t) * ((mp_word)b);
1409 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
1411 * Based on algorithm from the paper
1413 * "Generating Efficient Primes for Discrete Log Cryptosystems"
1414 * Chae Hoon Lim, Pil Loong Lee,
1415 * POSTECH Information Research Laboratories
1417 * The modulus must be of a special format [see manual]
1419 * Has been modified to use algorithm 7.10 from the LTM book instead
1421 * Input x must be in the range 0 <= x <= (n-1)**2
1424 mp_dr_reduce (mp_int * x, const mp_int * n, mp_digit k)
1428 mp_digit mu, *tmpx1, *tmpx2;
1430 /* m = digits in modulus */
1433 /* ensure that "x" has at least 2m digits */
1434 if (x->alloc < m + m) {
1435 if ((err = mp_grow (x, m + m)) != MP_OKAY) {
1440 /* top of loop, this is where the code resumes if
1441 * another reduction pass is required.
1444 /* aliases for digits */
1445 /* alias for lower half of x */
1448 /* alias for upper half of x, or x/B**m */
1451 /* set carry to zero */
1454 /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
1455 for (i = 0; i < m; i++) {
1456 r = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
1457 *tmpx1++ = (mp_digit)(r & MP_MASK);
1458 mu = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
1461 /* set final carry */
1464 /* zero words above m */
1465 for (i = m + 1; i < x->used; i++) {
1469 /* clamp, sub and return */
1472 /* if x >= n then subtract and reduce again
1473 * Each successive "recursion" makes the input smaller and smaller.
1475 if (mp_cmp_mag (x, n) != MP_LT) {
1482 /* determines the setup value */
1483 void mp_dr_setup(const mp_int *a, mp_digit *d)
1485 /* the casts are required if DIGIT_BIT is one less than
1486 * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
1488 *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
1489 ((mp_word)a->dp[0]));
1492 /* swap the elements of two integers, for cases where you can't simply swap the
1493 * mp_int pointers around
1496 mp_exch (mp_int * a, mp_int * b)
1505 /* this is a shell function that calls either the normal or Montgomery
1506 * exptmod functions. Originally the call to the montgomery code was
1507 * embedded in the normal function but that wasted a lot of stack space
1508 * for nothing (since 99% of the time the Montgomery code would be called)
1510 int mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
1514 /* modulus P must be positive */
1515 if (P->sign == MP_NEG) {
1519 /* if exponent X is negative we have to recurse */
1520 if (X->sign == MP_NEG) {
1524 /* first compute 1/G mod P */
1525 if ((err = mp_init(&tmpG)) != MP_OKAY) {
1528 if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
1534 if ((err = mp_init(&tmpX)) != MP_OKAY) {
1538 if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
1539 mp_clear_multi(&tmpG, &tmpX, NULL);
1543 /* and now compute (1/G)**|X| instead of G**X [X < 0] */
1544 err = mp_exptmod(&tmpG, &tmpX, P, Y);
1545 mp_clear_multi(&tmpG, &tmpX, NULL);
1551 /* if the modulus is odd or dr != 0 use the fast method */
1552 if (mp_isodd (P) == 1 || dr != 0) {
1553 return mp_exptmod_fast (G, X, P, Y, dr);
1555 /* otherwise use the generic Barrett reduction technique */
1556 return s_mp_exptmod (G, X, P, Y);
1560 /* computes Y == G**X mod P, HAC pp.616, Algorithm 14.85
1562 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
1563 * The value of k changes based on the size of the exponent.
1565 * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
1569 mp_exptmod_fast (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y, int redmode)
1573 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
1575 /* use a pointer to the reduction algorithm. This allows us to use
1576 * one of many reduction algorithms without modding the guts of
1577 * the code with if statements everywhere.
1579 int (*redux)(mp_int*,const mp_int*,mp_digit);
1581 /* find window size */
1582 x = mp_count_bits (X);
1585 } else if (x <= 36) {
1587 } else if (x <= 140) {
1589 } else if (x <= 450) {
1591 } else if (x <= 1303) {
1593 } else if (x <= 3529) {
1600 /* init first cell */
1601 if ((err = mp_init(&M[1])) != MP_OKAY) {
1605 /* now init the second half of the array */
1606 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1607 if ((err = mp_init(&M[x])) != MP_OKAY) {
1608 for (y = 1<<(winsize-1); y < x; y++) {
1616 /* determine and setup reduction code */
1618 /* now setup montgomery */
1619 if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
1623 /* automatically pick the comba one if available (saves quite a few calls/ifs) */
1624 if (((P->used * 2 + 1) < MP_WARRAY) &&
1625 P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
1626 redux = fast_mp_montgomery_reduce;
1628 /* use slower baseline Montgomery method */
1629 redux = mp_montgomery_reduce;
1631 } else if (redmode == 1) {
1632 /* setup DR reduction for moduli of the form B**k - b */
1633 mp_dr_setup(P, &mp);
1634 redux = mp_dr_reduce;
1636 /* setup DR reduction for moduli of the form 2**k - b */
1637 if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
1640 redux = mp_reduce_2k;
1644 if ((err = mp_init (&res)) != MP_OKAY) {
1652 * The first half of the table is not computed though accept for M[0] and M[1]
1656 /* now we need R mod m */
1657 if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
1661 /* now set M[1] to G * R mod m */
1662 if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
1667 if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
1672 /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
1673 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
1677 for (x = 0; x < (winsize - 1); x++) {
1678 if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
1681 if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
1686 /* create upper table */
1687 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
1688 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
1691 if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
1696 /* set initial mode and bit cnt */
1700 digidx = X->used - 1;
1705 /* grab next digit as required */
1706 if (--bitcnt == 0) {
1707 /* if digidx == -1 we are out of digits so break */
1711 /* read next digit and reset bitcnt */
1712 buf = X->dp[digidx--];
1716 /* grab the next msb from the exponent */
1717 y = (buf >> (DIGIT_BIT - 1)) & 1;
1718 buf <<= (mp_digit)1;
1720 /* if the bit is zero and mode == 0 then we ignore it
1721 * These represent the leading zero bits before the first 1 bit
1722 * in the exponent. Technically this opt is not required but it
1723 * does lower the # of trivial squaring/reductions used
1725 if (mode == 0 && y == 0) {
1729 /* if the bit is zero and mode == 1 then we square */
1730 if (mode == 1 && y == 0) {
1731 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1734 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1740 /* else we add it to the window */
1741 bitbuf |= (y << (winsize - ++bitcpy));
1744 if (bitcpy == winsize) {
1745 /* ok window is filled so square as required and multiply */
1747 for (x = 0; x < winsize; x++) {
1748 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1751 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1757 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
1760 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1764 /* empty window and reset */
1771 /* if bits remain then square/multiply */
1772 if (mode == 2 && bitcpy > 0) {
1773 /* square then multiply if the bit is set */
1774 for (x = 0; x < bitcpy; x++) {
1775 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
1778 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1782 /* get next bit of the window */
1784 if ((bitbuf & (1 << winsize)) != 0) {
1786 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
1789 if ((err = redux (&res, P, mp)) != MP_OKAY) {
1797 /* fixup result if Montgomery reduction is used
1798 * recall that any value in a Montgomery system is
1799 * actually multiplied by R mod n. So we have
1800 * to reduce one more time to cancel out the factor
1803 if ((err = redux(&res, P, mp)) != MP_OKAY) {
1808 /* swap res with Y */
1811 __RES:mp_clear (&res);
1814 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
1820 /* Greatest Common Divisor using the binary method */
1821 int mp_gcd (const mp_int * a, const mp_int * b, mp_int * c)
1824 int k, u_lsb, v_lsb, res;
1826 /* either zero than gcd is the largest */
1827 if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
1828 return mp_abs (b, c);
1830 if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
1831 return mp_abs (a, c);
1834 /* optimized. At this point if a == 0 then
1835 * b must equal zero too
1837 if (mp_iszero (a) == 1) {
1842 /* get copies of a and b we can modify */
1843 if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
1847 if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
1851 /* must be positive for the remainder of the algorithm */
1852 u.sign = v.sign = MP_ZPOS;
1854 /* B1. Find the common power of two for u and v */
1855 u_lsb = mp_cnt_lsb(&u);
1856 v_lsb = mp_cnt_lsb(&v);
1857 k = MIN(u_lsb, v_lsb);
1860 /* divide the power of two out */
1861 if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
1865 if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
1870 /* divide any remaining factors of two out */
1872 if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
1878 if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
1883 while (mp_iszero(&v) == 0) {
1884 /* make sure v is the largest */
1885 if (mp_cmp_mag(&u, &v) == MP_GT) {
1886 /* swap u and v to make sure v is >= u */
1890 /* subtract smallest from largest */
1891 if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
1895 /* Divide out all factors of two */
1896 if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
1901 /* multiply by 2**k which we divided out at the beginning */
1902 if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
1912 /* get the lower 32-bits of an mp_int */
1913 unsigned long mp_get_int(const mp_int * a)
1922 /* get number of digits of the lsb we have to read */
1923 i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
1925 /* get most significant digit of result */
1929 res = (res << DIGIT_BIT) | DIGIT(a,i);
1932 /* force result to 32-bits always so it is consistent on non 32-bit platforms */
1933 return res & 0xFFFFFFFFUL;
1936 /* grow as required */
1937 int mp_grow (mp_int * a, int size)
1942 /* if the alloc size is smaller alloc more ram */
1943 if (a->alloc < size) {
1944 /* ensure there are always at least MP_PREC digits extra on top */
1945 size += (MP_PREC * 2) - (size % MP_PREC);
1947 /* reallocate the array a->dp
1949 * We store the return in a temporary variable
1950 * in case the operation failed we don't want
1951 * to overwrite the dp member of a.
1953 tmp = realloc (a->dp, sizeof (mp_digit) * size);
1955 /* reallocation failed but "a" is still valid [can be freed] */
1959 /* reallocation succeeded so set a->dp */
1962 /* zero excess digits */
1965 for (; i < a->alloc; i++) {
1972 /* init a new mp_int */
1973 int mp_init (mp_int * a)
1977 /* allocate memory required and clear it */
1978 a->dp = malloc (sizeof (mp_digit) * MP_PREC);
1979 if (a->dp == NULL) {
1983 /* set the digits to zero */
1984 for (i = 0; i < MP_PREC; i++) {
1988 /* set the used to zero, allocated digits to the default precision
1989 * and sign to positive */
1997 /* creates "a" then copies b into it */
1998 int mp_init_copy (mp_int * a, const mp_int * b)
2002 if ((res = mp_init (a)) != MP_OKAY) {
2005 return mp_copy (b, a);
2008 int mp_init_multi(mp_int *mp, ...)
2010 mp_err res = MP_OKAY; /* Assume ok until proven otherwise */
2011 int n = 0; /* Number of ok inits */
2012 mp_int* cur_arg = mp;
2015 va_start(args, mp); /* init args to next argument from caller */
2016 while (cur_arg != NULL) {
2017 if (mp_init(cur_arg) != MP_OKAY) {
2018 /* Oops - error! Back-track and mp_clear what we already
2019 succeeded in init-ing, then return error.
2023 /* end the current list */
2026 /* now start cleaning up */
2028 va_start(clean_args, mp);
2031 cur_arg = va_arg(clean_args, mp_int*);
2038 cur_arg = va_arg(args, mp_int*);
2041 return res; /* Assumed ok, if error flagged above. */
2044 /* init an mp_init for a given size */
2045 int mp_init_size (mp_int * a, int size)
2049 /* pad size so there are always extra digits */
2050 size += (MP_PREC * 2) - (size % MP_PREC);
2053 a->dp = malloc (sizeof (mp_digit) * size);
2054 if (a->dp == NULL) {
2058 /* set the members */
2063 /* zero the digits */
2064 for (x = 0; x < size; x++) {
2071 /* hac 14.61, pp608 */
2072 int mp_invmod (const mp_int * a, mp_int * b, mp_int * c)
2074 /* b cannot be negative */
2075 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2079 /* if the modulus is odd we can use a faster routine instead */
2080 if (mp_isodd (b) == 1) {
2081 return fast_mp_invmod (a, b, c);
2084 return mp_invmod_slow(a, b, c);
2087 /* hac 14.61, pp608 */
2088 int mp_invmod_slow (const mp_int * a, mp_int * b, mp_int * c)
2090 mp_int x, y, u, v, A, B, C, D;
2093 /* b cannot be negative */
2094 if (b->sign == MP_NEG || mp_iszero(b) == 1) {
2099 if ((res = mp_init_multi(&x, &y, &u, &v,
2100 &A, &B, &C, &D, NULL)) != MP_OKAY) {
2105 if ((res = mp_copy (a, &x)) != MP_OKAY) {
2108 if ((res = mp_copy (b, &y)) != MP_OKAY) {
2112 /* 2. [modified] if x,y are both even then return an error! */
2113 if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
2118 /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
2119 if ((res = mp_copy (&x, &u)) != MP_OKAY) {
2122 if ((res = mp_copy (&y, &v)) != MP_OKAY) {
2129 /* 4. while u is even do */
2130 while (mp_iseven (&u) == 1) {
2132 if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
2135 /* 4.2 if A or B is odd then */
2136 if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
2137 /* A = (A+y)/2, B = (B-x)/2 */
2138 if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
2141 if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
2145 /* A = A/2, B = B/2 */
2146 if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
2149 if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
2154 /* 5. while v is even do */
2155 while (mp_iseven (&v) == 1) {
2157 if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
2160 /* 5.2 if C or D is odd then */
2161 if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
2162 /* C = (C+y)/2, D = (D-x)/2 */
2163 if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
2166 if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
2170 /* C = C/2, D = D/2 */
2171 if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
2174 if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
2179 /* 6. if u >= v then */
2180 if (mp_cmp (&u, &v) != MP_LT) {
2181 /* u = u - v, A = A - C, B = B - D */
2182 if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
2186 if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
2190 if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
2194 /* v - v - u, C = C - A, D = D - B */
2195 if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
2199 if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
2203 if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
2208 /* if not zero goto step 4 */
2209 if (mp_iszero (&u) == 0)
2212 /* now a = C, b = D, gcd == g*v */
2214 /* if v != 1 then there is no inverse */
2215 if (mp_cmp_d (&v, 1) != MP_EQ) {
2220 /* if its too low */
2221 while (mp_cmp_d(&C, 0) == MP_LT) {
2222 if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
2228 while (mp_cmp_mag(&C, b) != MP_LT) {
2229 if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
2234 /* C is now the inverse */
2237 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
2241 /* c = |a| * |b| using Karatsuba Multiplication using
2242 * three half size multiplications
2244 * Let B represent the radix [e.g. 2**DIGIT_BIT] and
2245 * let n represent half of the number of digits in
2248 * a = a1 * B**n + a0
2249 * b = b1 * B**n + b0
2252 a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
2254 * Note that a1b1 and a0b0 are used twice and only need to be
2255 * computed once. So in total three half size (half # of
2256 * digit) multiplications are performed, a0b0, a1b1 and
2259 * Note that a multiplication of half the digits requires
2260 * 1/4th the number of single precision multiplications so in
2261 * total after one call 25% of the single precision multiplications
2262 * are saved. Note also that the call to mp_mul can end up back
2263 * in this function if the a0, a1, b0, or b1 are above the threshold.
2264 * This is known as divide-and-conquer and leads to the famous
2265 * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than
2266 * the standard O(N**2) that the baseline/comba methods use.
2267 * Generally though the overhead of this method doesn't pay off
2268 * until a certain size (N ~ 80) is reached.
2270 int mp_karatsuba_mul (const mp_int * a, const mp_int * b, mp_int * c)
2272 mp_int x0, x1, y0, y1, t1, x0y0, x1y1;
2275 /* default the return code to an error */
2278 /* min # of digits */
2279 B = MIN (a->used, b->used);
2281 /* now divide in two */
2284 /* init copy all the temps */
2285 if (mp_init_size (&x0, B) != MP_OKAY)
2287 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2289 if (mp_init_size (&y0, B) != MP_OKAY)
2291 if (mp_init_size (&y1, b->used - B) != MP_OKAY)
2295 if (mp_init_size (&t1, B * 2) != MP_OKAY)
2297 if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
2299 if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
2302 /* now shift the digits */
2303 x0.used = y0.used = B;
2304 x1.used = a->used - B;
2305 y1.used = b->used - B;
2309 register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
2311 /* we copy the digits directly instead of using higher level functions
2312 * since we also need to shift the digits
2319 for (x = 0; x < B; x++) {
2325 for (x = B; x < a->used; x++) {
2330 for (x = B; x < b->used; x++) {
2335 /* only need to clamp the lower words since by definition the
2336 * upper words x1/y1 must have a known number of digits
2341 /* now calc the products x0y0 and x1y1 */
2342 /* after this x0 is no longer required, free temp [x0==t2]! */
2343 if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
2344 goto X1Y1; /* x0y0 = x0*y0 */
2345 if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
2346 goto X1Y1; /* x1y1 = x1*y1 */
2348 /* now calc x1-x0 and y1-y0 */
2349 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2350 goto X1Y1; /* t1 = x1 - x0 */
2351 if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
2352 goto X1Y1; /* t2 = y1 - y0 */
2353 if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
2354 goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */
2357 if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
2358 goto X1Y1; /* t2 = x0y0 + x1y1 */
2359 if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
2360 goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
2363 if (mp_lshd (&t1, B) != MP_OKAY)
2364 goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
2365 if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
2366 goto X1Y1; /* x1y1 = x1y1 << 2*B */
2368 if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
2369 goto X1Y1; /* t1 = x0y0 + t1 */
2370 if (mp_add (&t1, &x1y1, c) != MP_OKAY)
2371 goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */
2373 /* Algorithm succeeded set the return code to MP_OKAY */
2376 X1Y1:mp_clear (&x1y1);
2377 X0Y0:mp_clear (&x0y0);
2387 /* Karatsuba squaring, computes b = a*a using three
2388 * half size squarings
2390 * See comments of karatsuba_mul for details. It
2391 * is essentially the same algorithm but merely
2392 * tuned to perform recursive squarings.
2394 int mp_karatsuba_sqr (const mp_int * a, mp_int * b)
2396 mp_int x0, x1, t1, t2, x0x0, x1x1;
2401 /* min # of digits */
2404 /* now divide in two */
2407 /* init copy all the temps */
2408 if (mp_init_size (&x0, B) != MP_OKAY)
2410 if (mp_init_size (&x1, a->used - B) != MP_OKAY)
2414 if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
2416 if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
2418 if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
2420 if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
2425 register mp_digit *dst, *src;
2429 /* now shift the digits */
2431 for (x = 0; x < B; x++) {
2436 for (x = B; x < a->used; x++) {
2442 x1.used = a->used - B;
2446 /* now calc the products x0*x0 and x1*x1 */
2447 if (mp_sqr (&x0, &x0x0) != MP_OKAY)
2448 goto X1X1; /* x0x0 = x0*x0 */
2449 if (mp_sqr (&x1, &x1x1) != MP_OKAY)
2450 goto X1X1; /* x1x1 = x1*x1 */
2452 /* now calc (x1-x0)**2 */
2453 if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
2454 goto X1X1; /* t1 = x1 - x0 */
2455 if (mp_sqr (&t1, &t1) != MP_OKAY)
2456 goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */
2459 if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
2460 goto X1X1; /* t2 = x0x0 + x1x1 */
2461 if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
2462 goto X1X1; /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
2465 if (mp_lshd (&t1, B) != MP_OKAY)
2466 goto X1X1; /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
2467 if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
2468 goto X1X1; /* x1x1 = x1x1 << 2*B */
2470 if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
2471 goto X1X1; /* t1 = x0x0 + t1 */
2472 if (mp_add (&t1, &x1x1, b) != MP_OKAY)
2473 goto X1X1; /* t1 = x0x0 + t1 + x1x1 */
2477 X1X1:mp_clear (&x1x1);
2478 X0X0:mp_clear (&x0x0);
2487 /* computes least common multiple as |a*b|/(a, b) */
2488 int mp_lcm (const mp_int * a, const mp_int * b, mp_int * c)
2494 if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
2498 /* t1 = get the GCD of the two inputs */
2499 if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
2503 /* divide the smallest by the GCD */
2504 if (mp_cmp_mag(a, b) == MP_LT) {
2505 /* store quotient in t2 such that t2 * b is the LCM */
2506 if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
2509 res = mp_mul(b, &t2, c);
2511 /* store quotient in t2 such that t2 * a is the LCM */
2512 if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
2515 res = mp_mul(a, &t2, c);
2518 /* fix the sign to positive */
2522 mp_clear_multi (&t1, &t2, NULL);
2526 /* shift left a certain amount of digits */
2527 int mp_lshd (mp_int * a, int b)
2531 /* if its less than zero return */
2536 /* grow to fit the new digits */
2537 if (a->alloc < a->used + b) {
2538 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
2544 register mp_digit *top, *bottom;
2546 /* increment the used by the shift amount then copy upwards */
2550 top = a->dp + a->used - 1;
2553 bottom = a->dp + a->used - 1 - b;
2555 /* much like mp_rshd this is implemented using a sliding window
2556 * except the window goes the otherway around. Copying from
2557 * the bottom to the top. see bn_mp_rshd.c for more info.
2559 for (x = a->used - 1; x >= b; x--) {
2563 /* zero the lower digits */
2565 for (x = 0; x < b; x++) {
2572 /* c = a mod b, 0 <= c < b */
2574 mp_mod (const mp_int * a, mp_int * b, mp_int * c)
2579 if ((res = mp_init (&t)) != MP_OKAY) {
2583 if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
2588 if (t.sign != b->sign) {
2589 res = mp_add (b, &t, c);
2599 /* calc a value mod 2**b */
2601 mp_mod_2d (const mp_int * a, int b, mp_int * c)
2605 /* if b is <= 0 then zero the int */
2611 /* if the modulus is larger than the value than return */
2612 if (b > a->used * DIGIT_BIT) {
2613 res = mp_copy (a, c);
2618 if ((res = mp_copy (a, c)) != MP_OKAY) {
2622 /* zero digits above the last digit of the modulus */
2623 for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
2626 /* clear the digit that is not completely outside/inside the modulus */
2627 c->dp[b / DIGIT_BIT] &= (1 << ((mp_digit)b % DIGIT_BIT)) - 1;
2633 mp_mod_d (const mp_int * a, mp_digit b, mp_digit * c)
2635 return mp_div_d(a, b, NULL, c);
2639 * shifts with subtractions when the result is greater than b.
2641 * The method is slightly modified to shift B unconditionally up to just under
2642 * the leading bit of b. This saves a lot of multiple precision shifting.
2644 int mp_montgomery_calc_normalization (mp_int * a, const mp_int * b)
2648 /* how many bits of last digit does b use */
2649 bits = mp_count_bits (b) % DIGIT_BIT;
2653 if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
2662 /* now compute C = A * B mod b */
2663 for (x = bits - 1; x < DIGIT_BIT; x++) {
2664 if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
2667 if (mp_cmp_mag (a, b) != MP_LT) {
2668 if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
2677 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
2679 mp_montgomery_reduce (mp_int * x, const mp_int * n, mp_digit rho)
2684 /* can the fast reduction [comba] method be used?
2686 * Note that unlike in mul you're safely allowed *less*
2687 * than the available columns [255 per default] since carries
2688 * are fixed up in the inner loop.
2690 digs = n->used * 2 + 1;
2691 if ((digs < MP_WARRAY) &&
2693 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2694 return fast_mp_montgomery_reduce (x, n, rho);
2697 /* grow the input as required */
2698 if (x->alloc < digs) {
2699 if ((res = mp_grow (x, digs)) != MP_OKAY) {
2705 for (ix = 0; ix < n->used; ix++) {
2706 /* mu = ai * rho mod b
2708 * The value of rho must be precalculated via
2709 * montgomery_setup() such that
2710 * it equals -1/n0 mod b this allows the
2711 * following inner loop to reduce the
2712 * input one digit at a time
2714 mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
2716 /* a = a + mu * m * b**i */
2719 register mp_digit *tmpn, *tmpx, u;
2722 /* alias for digits of the modulus */
2725 /* alias for the digits of x [the input] */
2728 /* set the carry to zero */
2731 /* Multiply and add in place */
2732 for (iy = 0; iy < n->used; iy++) {
2733 /* compute product and sum */
2734 r = ((mp_word)mu) * ((mp_word)*tmpn++) +
2735 ((mp_word) u) + ((mp_word) * tmpx);
2738 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
2741 *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
2743 /* At this point the ix'th digit of x should be zero */
2746 /* propagate carries upwards as required*/
2749 u = *tmpx >> DIGIT_BIT;
2755 /* at this point the n.used'th least
2756 * significant digits of x are all zero
2757 * which means we can shift x to the
2758 * right by n.used digits and the
2759 * residue is unchanged.
2762 /* x = x/b**n.used */
2764 mp_rshd (x, n->used);
2766 /* if x >= n then x = x - n */
2767 if (mp_cmp_mag (x, n) != MP_LT) {
2768 return s_mp_sub (x, n, x);
2774 /* setups the montgomery reduction stuff */
2776 mp_montgomery_setup (const mp_int * n, mp_digit * rho)
2780 /* fast inversion mod 2**k
2782 * Based on the fact that
2784 * XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n)
2785 * => 2*X*A - X*X*A*A = 1
2786 * => 2*(1) - (1) = 1
2794 x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
2795 x *= 2 - b * x; /* here x*a==1 mod 2**8 */
2796 x *= 2 - b * x; /* here x*a==1 mod 2**16 */
2797 x *= 2 - b * x; /* here x*a==1 mod 2**32 */
2799 /* rho = -1/m mod b */
2800 *rho = (((mp_word)1 << ((mp_word) DIGIT_BIT)) - x) & MP_MASK;
2805 /* high level multiplication (handles sign) */
2806 int mp_mul (const mp_int * a, const mp_int * b, mp_int * c)
2809 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
2811 /* use Karatsuba? */
2812 if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
2813 res = mp_karatsuba_mul (a, b, c);
2816 /* can we use the fast multiplier?
2818 * The fast multiplier can be used if the output will
2819 * have less than MP_WARRAY digits and the number of
2820 * digits won't affect carry propagation
2822 int digs = a->used + b->used + 1;
2824 if ((digs < MP_WARRAY) &&
2825 MIN(a->used, b->used) <=
2826 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
2827 res = fast_s_mp_mul_digs (a, b, c, digs);
2829 res = s_mp_mul (a, b, c); /* uses s_mp_mul_digs */
2831 c->sign = (c->used > 0) ? neg : MP_ZPOS;
2836 int mp_mul_2(const mp_int * a, mp_int * b)
2838 int x, res, oldused;
2840 /* grow to accommodate result */
2841 if (b->alloc < a->used + 1) {
2842 if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
2851 register mp_digit r, rr, *tmpa, *tmpb;
2853 /* alias for source */
2856 /* alias for dest */
2861 for (x = 0; x < a->used; x++) {
2863 /* get what will be the *next* carry bit from the
2864 * MSB of the current digit
2866 rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
2868 /* now shift up this digit, add in the carry [from the previous] */
2869 *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
2871 /* copy the carry that would be from the source
2872 * digit into the next iteration
2877 /* new leading digit? */
2879 /* add a MSB which is always 1 at this point */
2884 /* now zero any excess digits on the destination
2885 * that we didn't write to
2887 tmpb = b->dp + b->used;
2888 for (x = b->used; x < oldused; x++) {
2896 /* shift left by a certain bit count */
2897 int mp_mul_2d (const mp_int * a, int b, mp_int * c)
2904 if ((res = mp_copy (a, c)) != MP_OKAY) {
2909 if (c->alloc < c->used + b/DIGIT_BIT + 1) {
2910 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
2915 /* shift by as many digits in the bit count */
2916 if (b >= DIGIT_BIT) {
2917 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
2922 /* shift any bit count < DIGIT_BIT */
2923 d = (mp_digit) (b % DIGIT_BIT);
2925 register mp_digit *tmpc, shift, mask, r, rr;
2928 /* bitmask for carries */
2929 mask = (((mp_digit)1) << d) - 1;
2931 /* shift for msbs */
2932 shift = DIGIT_BIT - d;
2939 for (x = 0; x < c->used; x++) {
2940 /* get the higher bits of the current word */
2941 rr = (*tmpc >> shift) & mask;
2943 /* shift the current word and OR in the carry */
2944 *tmpc = ((*tmpc << d) | r) & MP_MASK;
2947 /* set the carry to the carry bits of the current word */
2951 /* set final carry */
2953 c->dp[(c->used)++] = r;
2960 /* multiply by a digit */
2962 mp_mul_d (const mp_int * a, mp_digit b, mp_int * c)
2964 mp_digit u, *tmpa, *tmpc;
2966 int ix, res, olduse;
2968 /* make sure c is big enough to hold a*b */
2969 if (c->alloc < a->used + 1) {
2970 if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) {
2975 /* get the original destinations used count */
2981 /* alias for a->dp [source] */
2984 /* alias for c->dp [dest] */
2990 /* compute columns */
2991 for (ix = 0; ix < a->used; ix++) {
2992 /* compute product and carry sum for this term */
2993 r = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
2995 /* mask off higher bits to get a single digit */
2996 *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
2998 /* send carry into next iteration */
2999 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
3002 /* store final carry [if any] */
3005 /* now zero digits above the top */
3006 while (ix++ < olduse) {
3010 /* set used count */
3011 c->used = a->used + 1;
3017 /* d = a * b (mod c) */
3019 mp_mulmod (const mp_int * a, const mp_int * b, mp_int * c, mp_int * d)
3024 if ((res = mp_init (&t)) != MP_OKAY) {
3028 if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
3032 res = mp_mod (&t, c, d);
3037 /* determines if an integers is divisible by one
3038 * of the first PRIME_SIZE primes or not
3040 * sets result to 0 if not, 1 if yes
3042 int mp_prime_is_divisible (const mp_int * a, int *result)
3047 /* default to not */
3050 for (ix = 0; ix < PRIME_SIZE; ix++) {
3051 /* what is a mod __prime_tab[ix] */
3052 if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
3056 /* is the residue zero? */
3066 /* performs a variable number of rounds of Miller-Rabin
3068 * Probability of error after t rounds is no more than
3071 * Sets result to 1 if probably prime, 0 otherwise
3073 int mp_prime_is_prime (mp_int * a, int t, int *result)
3081 /* valid value of t? */
3082 if (t <= 0 || t > PRIME_SIZE) {
3086 /* is the input equal to one of the primes in the table? */
3087 for (ix = 0; ix < PRIME_SIZE; ix++) {
3088 if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
3094 /* first perform trial division */
3095 if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
3099 /* return if it was trivially divisible */
3100 if (res == MP_YES) {
3104 /* now perform the miller-rabin rounds */
3105 if ((err = mp_init (&b)) != MP_OKAY) {
3109 for (ix = 0; ix < t; ix++) {
3111 mp_set (&b, __prime_tab[ix]);
3113 if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
3122 /* passed the test */
3128 /* Miller-Rabin test of "a" to the base of "b" as described in
3129 * HAC pp. 139 Algorithm 4.24
3131 * Sets result to 0 if definitely composite or 1 if probably prime.
3132 * Randomly the chance of error is no more than 1/4 and often
3135 int mp_prime_miller_rabin (mp_int * a, const mp_int * b, int *result)
3144 if (mp_cmp_d(b, 1) != MP_GT) {
3148 /* get n1 = a - 1 */
3149 if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
3152 if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
3156 /* set 2**s * r = n1 */
3157 if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
3161 /* count the number of least significant bits
3166 /* now divide n - 1 by 2**s */
3167 if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
3171 /* compute y = b**r mod a */
3172 if ((err = mp_init (&y)) != MP_OKAY) {
3175 if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
3179 /* if y != 1 and y != n1 do */
3180 if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
3182 /* while j <= s-1 and y != n1 */
3183 while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
3184 if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
3188 /* if y == 1 then composite */
3189 if (mp_cmp_d (&y, 1) == MP_EQ) {
3196 /* if y != n1 then composite */
3197 if (mp_cmp (&y, &n1) != MP_EQ) {
3202 /* probably prime now */
3206 __N1:mp_clear (&n1);
3210 static const struct {
3223 /* returns # of RM trials required for a given bit size */
3224 int mp_prime_rabin_miller_trials(int size)
3228 for (x = 0; x < (int)(sizeof(sizes)/(sizeof(sizes[0]))); x++) {
3229 if (sizes[x].k == size) {
3231 } else if (sizes[x].k > size) {
3232 return (x == 0) ? sizes[0].t : sizes[x - 1].t;
3235 return sizes[x-1].t + 1;
3238 /* makes a truly random prime of a given size (bits),
3240 * Flags are as follows:
3242 * LTM_PRIME_BBS - make prime congruent to 3 mod 4
3243 * LTM_PRIME_SAFE - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
3244 * LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
3245 * LTM_PRIME_2MSB_ON - make the 2nd highest bit one
3247 * You have to supply a callback which fills in a buffer with random bytes. "dat" is a parameter you can
3248 * have passed to the callback (e.g. a state or something). This function doesn't use "dat" itself
3253 /* This is possibly the mother of all prime generation functions, muahahahahaha! */
3254 int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
3256 unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
3257 int res, err, bsize, maskOR_msb_offset;
3259 /* sanity check the input */
3260 if (size <= 1 || t <= 0) {
3264 /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
3265 if (flags & LTM_PRIME_SAFE) {
3266 flags |= LTM_PRIME_BBS;
3269 /* calc the byte size */
3270 bsize = (size>>3)+((size&7)?1:0);
3272 /* we need a buffer of bsize bytes */
3273 tmp = malloc(bsize);
3278 /* calc the maskAND value for the MSbyte*/
3279 maskAND = ((size&7) == 0) ? 0xFF : (0xFF >> (8 - (size & 7)));
3281 /* calc the maskOR_msb */
3283 maskOR_msb_offset = ((size & 7) == 1) ? 1 : 0;
3284 if (flags & LTM_PRIME_2MSB_ON) {
3285 maskOR_msb |= 1 << ((size - 2) & 7);
3286 } else if (flags & LTM_PRIME_2MSB_OFF) {
3287 maskAND &= ~(1 << ((size - 2) & 7));
3290 /* get the maskOR_lsb */
3292 if (flags & LTM_PRIME_BBS) {
3297 /* read the bytes */
3298 if (cb(tmp, bsize, dat) != bsize) {
3303 /* work over the MSbyte */
3305 tmp[0] |= 1 << ((size - 1) & 7);
3307 /* mix in the maskORs */
3308 tmp[maskOR_msb_offset] |= maskOR_msb;
3309 tmp[bsize-1] |= maskOR_lsb;
3312 if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY) { goto error; }
3315 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3320 if (flags & LTM_PRIME_SAFE) {
3321 /* see if (a-1)/2 is prime */
3322 if ((err = mp_sub_d(a, 1, a)) != MP_OKAY) { goto error; }
3323 if ((err = mp_div_2(a, a)) != MP_OKAY) { goto error; }
3326 if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) { goto error; }
3328 } while (res == MP_NO);
3330 if (flags & LTM_PRIME_SAFE) {
3331 /* restore a to the original value */
3332 if ((err = mp_mul_2(a, a)) != MP_OKAY) { goto error; }
3333 if ((err = mp_add_d(a, 1, a)) != MP_OKAY) { goto error; }
3342 /* reads an unsigned char array, assumes the msb is stored first [big endian] */
3344 mp_read_unsigned_bin (mp_int * a, const unsigned char *b, int c)
3348 /* make sure there are at least two digits */
3350 if ((res = mp_grow(a, 2)) != MP_OKAY) {
3358 /* read the bytes in */
3360 if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
3371 /* reduces x mod m, assumes 0 < x < m**2, mu is
3372 * precomputed via mp_reduce_setup.
3373 * From HAC pp.604 Algorithm 14.42
3376 mp_reduce (mp_int * x, const mp_int * m, const mp_int * mu)
3379 int res, um = m->used;
3382 if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
3386 /* q1 = x / b**(k-1) */
3387 mp_rshd (&q, um - 1);
3389 /* according to HAC this optimization is ok */
3390 if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
3391 if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
3395 if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
3400 /* q3 = q2 / b**(k+1) */
3401 mp_rshd (&q, um + 1);
3403 /* x = x mod b**(k+1), quick (no division) */
3404 if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
3408 /* q = q * m mod b**(k+1), quick (no division) */
3409 if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
3414 if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
3418 /* If x < 0, add b**(k+1) to it */
3419 if (mp_cmp_d (x, 0) == MP_LT) {
3421 if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
3423 if ((res = mp_add (x, &q, x)) != MP_OKAY)
3427 /* Back off if it's too big */
3428 while (mp_cmp (x, m) != MP_LT) {
3429 if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
3440 /* reduces a modulo n where n is of the form 2**p - d */
3442 mp_reduce_2k(mp_int *a, const mp_int *n, mp_digit d)
3447 if ((res = mp_init(&q)) != MP_OKAY) {
3451 p = mp_count_bits(n);
3453 /* q = a/2**p, a = a mod 2**p */
3454 if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
3460 if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) {
3466 if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
3470 if (mp_cmp_mag(a, n) != MP_LT) {
3480 /* determines the setup value */
3482 mp_reduce_2k_setup(const mp_int *a, mp_digit *d)
3487 if ((res = mp_init(&tmp)) != MP_OKAY) {
3491 p = mp_count_bits(a);
3492 if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
3497 if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
3507 /* pre-calculate the value required for Barrett reduction
3508 * For a given modulus "b" it calulates the value required in "a"
3510 int mp_reduce_setup (mp_int * a, const mp_int * b)
3514 if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
3517 return mp_div (a, b, a, NULL);
3520 /* shift right a certain amount of digits */
3521 void mp_rshd (mp_int * a, int b)
3525 /* if b <= 0 then ignore it */
3530 /* if b > used then simply zero it and return */
3537 register mp_digit *bottom, *top;
3539 /* shift the digits down */
3544 /* top [offset into digits] */
3547 /* this is implemented as a sliding window where
3548 * the window is b-digits long and digits from
3549 * the top of the window are copied to the bottom
3553 b-2 | b-1 | b0 | b1 | b2 | ... | bb | ---->
3555 \-------------------/ ---->
3557 for (x = 0; x < (a->used - b); x++) {
3561 /* zero the top digits */
3562 for (; x < a->used; x++) {
3567 /* remove excess digits */
3571 /* set to a digit */
3572 void mp_set (mp_int * a, mp_digit b)
3575 a->dp[0] = b & MP_MASK;
3576 a->used = (a->dp[0] != 0) ? 1 : 0;
3579 /* set a 32-bit const */
3580 int mp_set_int (mp_int * a, unsigned long b)
3586 /* set four bits at a time */
3587 for (x = 0; x < 8; x++) {
3588 /* shift the number up four bits */
3589 if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
3593 /* OR in the top four bits of the source */
3594 a->dp[0] |= (b >> 28) & 15;
3596 /* shift the source up to the next four bits */
3599 /* ensure that digits are not clamped off */
3606 /* shrink a bignum */
3607 int mp_shrink (mp_int * a)
3610 if (a->alloc != a->used && a->used > 0) {
3611 if ((tmp = realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
3620 /* get the size for an signed equivalent */
3621 int mp_signed_bin_size (const mp_int * a)
3623 return 1 + mp_unsigned_bin_size (a);
3626 /* computes b = a*a */
3628 mp_sqr (const mp_int * a, mp_int * b)
3632 if (a->used >= KARATSUBA_SQR_CUTOFF) {
3633 res = mp_karatsuba_sqr (a, b);
3636 /* can we use the fast comba multiplier? */
3637 if ((a->used * 2 + 1) < MP_WARRAY &&
3639 (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
3640 res = fast_s_mp_sqr (a, b);
3642 res = s_mp_sqr (a, b);
3648 /* c = a * a (mod b) */
3650 mp_sqrmod (const mp_int * a, mp_int * b, mp_int * c)
3655 if ((res = mp_init (&t)) != MP_OKAY) {
3659 if ((res = mp_sqr (a, &t)) != MP_OKAY) {
3663 res = mp_mod (&t, b, c);
3668 /* high level subtraction (handles signs) */
3670 mp_sub (mp_int * a, mp_int * b, mp_int * c)
3678 /* subtract a negative from a positive, OR */
3679 /* subtract a positive from a negative. */
3680 /* In either case, ADD their magnitudes, */
3681 /* and use the sign of the first number. */
3683 res = s_mp_add (a, b, c);
3685 /* subtract a positive from a positive, OR */
3686 /* subtract a negative from a negative. */
3687 /* First, take the difference between their */
3688 /* magnitudes, then... */
3689 if (mp_cmp_mag (a, b) != MP_LT) {
3690 /* Copy the sign from the first */
3692 /* The first has a larger or equal magnitude */
3693 res = s_mp_sub (a, b, c);
3695 /* The result has the *opposite* sign from */
3696 /* the first number. */
3697 c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
3698 /* The second has a larger magnitude */
3699 res = s_mp_sub (b, a, c);
3705 /* single digit subtraction */
3707 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
3709 mp_digit *tmpa, *tmpc, mu;
3710 int res, ix, oldused;
3712 /* grow c as required */
3713 if (c->alloc < a->used + 1) {
3714 if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) {
3719 /* if a is negative just do an unsigned
3720 * addition [with fudged signs]
3722 if (a->sign == MP_NEG) {
3724 res = mp_add_d(a, b, c);
3725 a->sign = c->sign = MP_NEG;
3734 /* if a <= b simply fix the single digit */
3735 if ((a->used == 1 && a->dp[0] <= b) || a->used == 0) {
3737 *tmpc++ = b - *tmpa;
3743 /* negative/1digit */
3751 /* subtract first digit */
3752 *tmpc = *tmpa++ - b;
3753 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3756 /* handle rest of the digits */
3757 for (ix = 1; ix < a->used; ix++) {
3758 *tmpc = *tmpa++ - mu;
3759 mu = *tmpc >> (sizeof(mp_digit) * CHAR_BIT - 1);
3764 /* zero excess digits */
3765 while (ix++ < oldused) {
3772 /* store in unsigned [big endian] format */
3774 mp_to_unsigned_bin (const mp_int * a, unsigned char *b)
3779 if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
3784 while (mp_iszero (&t) == 0) {
3785 b[x++] = (unsigned char) (t.dp[0] & 255);
3786 if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
3796 /* get the size for an unsigned equivalent */
3798 mp_unsigned_bin_size (const mp_int * a)
3800 int size = mp_count_bits (a);
3801 return (size / 8 + ((size & 7) != 0 ? 1 : 0));
3806 mp_zero (mp_int * a)
3810 memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
3813 static const mp_digit __prime_tab[] = {
3814 0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
3815 0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
3816 0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
3817 0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
3818 0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
3819 0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
3820 0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
3821 0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
3823 0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
3824 0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
3825 0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
3826 0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
3827 0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
3828 0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
3829 0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
3830 0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
3832 0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
3833 0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
3834 0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
3835 0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
3836 0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
3837 0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
3838 0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
3839 0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
3841 0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
3842 0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
3843 0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
3844 0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
3845 0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
3846 0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
3847 0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
3848 0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
3851 /* reverse an array, used for radix code */
3853 bn_reverse (unsigned char *s, int len)
3869 /* low level addition, based on HAC pp.594, Algorithm 14.7 */
3871 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
3874 int olduse, res, min, max;
3876 /* find sizes, we let |a| <= |b| which means we have to sort
3877 * them. "x" will point to the input with the most digits
3879 if (a->used > b->used) {
3890 if (c->alloc < max + 1) {
3891 if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
3896 /* get old used digit count and set new one */
3901 register mp_digit u, *tmpa, *tmpb, *tmpc;
3904 /* alias for digit pointers */
3915 /* zero the carry */
3917 for (i = 0; i < min; i++) {
3918 /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
3919 *tmpc = *tmpa++ + *tmpb++ + u;
3921 /* U = carry bit of T[i] */
3922 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3924 /* take away carry bit from T[i] */
3928 /* now copy higher words if any, that is in A+B
3929 * if A or B has more digits add those in
3932 for (; i < max; i++) {
3933 /* T[i] = X[i] + U */
3934 *tmpc = x->dp[i] + u;
3936 /* U = carry bit of T[i] */
3937 u = *tmpc >> ((mp_digit)DIGIT_BIT);
3939 /* take away carry bit from T[i] */
3947 /* clear digits above oldused */
3948 for (i = c->used; i < olduse; i++) {
3957 int s_mp_exptmod (const mp_int * G, const mp_int * X, mp_int * P, mp_int * Y)
3959 mp_int M[256], res, mu;
3961 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
3963 /* find window size */
3964 x = mp_count_bits (X);
3967 } else if (x <= 36) {
3969 } else if (x <= 140) {
3971 } else if (x <= 450) {
3973 } else if (x <= 1303) {
3975 } else if (x <= 3529) {
3982 /* init first cell */
3983 if ((err = mp_init(&M[1])) != MP_OKAY) {
3987 /* now init the second half of the array */
3988 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
3989 if ((err = mp_init(&M[x])) != MP_OKAY) {
3990 for (y = 1<<(winsize-1); y < x; y++) {
3998 /* create mu, used for Barrett reduction */
3999 if ((err = mp_init (&mu)) != MP_OKAY) {
4002 if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
4008 * The M table contains powers of the base,
4009 * e.g. M[x] = G**x mod P
4011 * The first half of the table is not
4012 * computed though accept for M[0] and M[1]
4014 if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
4018 /* compute the value at M[1<<(winsize-1)] by squaring
4019 * M[1] (winsize-1) times
4021 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
4025 for (x = 0; x < (winsize - 1); x++) {
4026 if ((err = mp_sqr (&M[1 << (winsize - 1)],
4027 &M[1 << (winsize - 1)])) != MP_OKAY) {
4030 if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
4035 /* create upper table, that is M[x] = M[x-1] * M[1] (mod P)
4036 * for x = (2**(winsize - 1) + 1) to (2**winsize - 1)
4038 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
4039 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
4042 if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
4048 if ((err = mp_init (&res)) != MP_OKAY) {
4053 /* set initial mode and bit cnt */
4057 digidx = X->used - 1;
4062 /* grab next digit as required */
4063 if (--bitcnt == 0) {
4064 /* if digidx == -1 we are out of digits */
4068 /* read next digit and reset the bitcnt */
4069 buf = X->dp[digidx--];
4073 /* grab the next msb from the exponent */
4074 y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
4075 buf <<= (mp_digit)1;
4077 /* if the bit is zero and mode == 0 then we ignore it
4078 * These represent the leading zero bits before the first 1 bit
4079 * in the exponent. Technically this opt is not required but it
4080 * does lower the # of trivial squaring/reductions used
4082 if (mode == 0 && y == 0) {
4086 /* if the bit is zero and mode == 1 then we square */
4087 if (mode == 1 && y == 0) {
4088 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4091 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4097 /* else we add it to the window */
4098 bitbuf |= (y << (winsize - ++bitcpy));
4101 if (bitcpy == winsize) {
4102 /* ok window is filled so square as required and multiply */
4104 for (x = 0; x < winsize; x++) {
4105 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4108 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4114 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
4117 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4121 /* empty window and reset */
4128 /* if bits remain then square/multiply */
4129 if (mode == 2 && bitcpy > 0) {
4130 /* square then multiply if the bit is set */
4131 for (x = 0; x < bitcpy; x++) {
4132 if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
4135 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4140 if ((bitbuf & (1 << winsize)) != 0) {
4142 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
4145 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
4154 __RES:mp_clear (&res);
4155 __MU:mp_clear (&mu);
4158 for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
4164 /* multiplies |a| * |b| and only computes up to digs digits of result
4165 * HAC pp. 595, Algorithm 14.12 Modified so you can control how
4166 * many digits of output are created.
4169 s_mp_mul_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
4172 int res, pa, pb, ix, iy;
4175 mp_digit tmpx, *tmpt, *tmpy;
4177 /* can we use the fast multiplier? */
4178 if (((digs) < MP_WARRAY) &&
4179 MIN (a->used, b->used) <
4180 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4181 return fast_s_mp_mul_digs (a, b, c, digs);
4184 if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
4189 /* compute the digits of the product directly */
4191 for (ix = 0; ix < pa; ix++) {
4192 /* set the carry to zero */
4195 /* limit ourselves to making digs digits of output */
4196 pb = MIN (b->used, digs - ix);
4198 /* setup some aliases */
4199 /* copy of the digit from a used within the nested loop */
4202 /* an alias for the destination shifted ix places */
4205 /* an alias for the digits of b */
4208 /* compute the columns of the output and propagate the carry */
4209 for (iy = 0; iy < pb; iy++) {
4210 /* compute the column as a mp_word */
4211 r = ((mp_word)*tmpt) +
4212 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4215 /* the new column is the lower part of the result */
4216 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4218 /* get the carry word from the result */
4219 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4221 /* set carry if it is placed below digs */
4222 if (ix + iy < digs) {
4234 /* multiplies |a| * |b| and does not compute the lower digs digits
4235 * [meant to get the higher part of the product]
4238 s_mp_mul_high_digs (const mp_int * a, const mp_int * b, mp_int * c, int digs)
4241 int res, pa, pb, ix, iy;
4244 mp_digit tmpx, *tmpt, *tmpy;
4246 /* can we use the fast multiplier? */
4247 if (((a->used + b->used + 1) < MP_WARRAY)
4248 && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
4249 return fast_s_mp_mul_high_digs (a, b, c, digs);
4252 if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
4255 t.used = a->used + b->used + 1;
4259 for (ix = 0; ix < pa; ix++) {
4260 /* clear the carry */
4263 /* left hand side of A[ix] * B[iy] */
4266 /* alias to the address of where the digits will be stored */
4267 tmpt = &(t.dp[digs]);
4269 /* alias for where to read the right hand side from */
4270 tmpy = b->dp + (digs - ix);
4272 for (iy = digs - ix; iy < pb; iy++) {
4273 /* calculate the double precision result */
4274 r = ((mp_word)*tmpt) +
4275 ((mp_word)tmpx) * ((mp_word)*tmpy++) +
4278 /* get the lower part */
4279 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4281 /* carry the carry */
4282 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
4292 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
4294 s_mp_sqr (const mp_int * a, mp_int * b)
4297 int res, ix, iy, pa;
4299 mp_digit u, tmpx, *tmpt;
4302 if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
4306 /* default used is maximum possible size */
4309 for (ix = 0; ix < pa; ix++) {
4310 /* first calculate the digit at 2*ix */
4311 /* calculate double precision result */
4312 r = ((mp_word) t.dp[2*ix]) +
4313 ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
4315 /* store lower part in result */
4316 t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
4319 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4321 /* left hand side of A[ix] * A[iy] */
4324 /* alias for where to store the results */
4325 tmpt = t.dp + (2*ix + 1);
4327 for (iy = ix + 1; iy < pa; iy++) {
4328 /* first calculate the product */
4329 r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
4331 /* now calculate the double precision result, note we use
4332 * addition instead of *2 since it's easier to optimize
4334 r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
4336 /* store lower part */
4337 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4340 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4342 /* propagate upwards */
4343 while (u != ((mp_digit) 0)) {
4344 r = ((mp_word) *tmpt) + ((mp_word) u);
4345 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
4346 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
4356 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
4358 s_mp_sub (const mp_int * a, const mp_int * b, mp_int * c)
4360 int olduse, res, min, max;
4367 if (c->alloc < max) {
4368 if ((res = mp_grow (c, max)) != MP_OKAY) {
4376 register mp_digit u, *tmpa, *tmpb, *tmpc;
4379 /* alias for digit pointers */
4384 /* set carry to zero */
4386 for (i = 0; i < min; i++) {
4387 /* T[i] = A[i] - B[i] - U */
4388 *tmpc = *tmpa++ - *tmpb++ - u;
4390 /* U = carry bit of T[i]
4391 * Note this saves performing an AND operation since
4392 * if a carry does occur it will propagate all the way to the
4393 * MSB. As a result a single shift is enough to get the carry
4395 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4397 /* Clear carry from T[i] */
4401 /* now copy higher words if any, e.g. if A has more digits than B */
4402 for (; i < max; i++) {
4403 /* T[i] = A[i] - U */
4404 *tmpc = *tmpa++ - u;
4406 /* U = carry bit of T[i] */
4407 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
4409 /* Clear carry from T[i] */
4413 /* clear digits above used (since we may not have grown result above) */
4414 for (i = c->used; i < olduse; i++) {