home *** CD-ROM | disk | FTP | other *** search
- /* C source code for multiprecision arithmetic library routines.
- Implemented Nov 86 by Philip Zimmermann
- Last revised 27 Nov 91 by PRZ
-
- Boulder Software Engineering
- 3021 Eleventh Street
- Boulder, CO 80304
- (303) 541-0140
-
- (c) Copyright 1986-92 by Philip Zimmermann. All rights reserved.
- The author assumes no liability for damages resulting from the use
- of this software, even if the damage results from defects in this
- software. No warranty is expressed or implied. The use of this
- cryptographic software for developing weapon systems is expressly
- forbidden.
-
- These routines implement all of the multiprecision arithmetic
- necessary for number-theoretic cryptographic algorithms such as
- ElGamal, Diffie-Hellman, Rabin, or factoring studies for large
- composite numbers, as well as Rivest-Shamir-Adleman (RSA) public
- key cryptography.
-
- Although originally developed in Microsoft C for the IBM PC, this code
- contains few machine dependencies. It assumes 2's complement
- arithmetic. It can be adapted to 8-bit, 16-bit, or 32-bit machines,
- lowbyte-highbyte order or highbyte-lowbyte order. This version
- has been converted to ANSI C.
-
-
- The internal representation for these extended precision integer
- "registers" is an array of "units". A unit is a machine word, which
- is either an 8-bit byte, a 16-bit unsigned integer, or a 32-bit
- unsigned integer, depending on the machine's word size. For example,
- an IBM PC or AT uses a unit size of 16 bits. To perform arithmetic
- on these huge precision integers, we pass pointers to these unit
- arrays to various subroutines. A pointer to an array of units is of
- type unitptr. This is a pointer to a huge integer "register".
-
- When calling a subroutine, we always pass a pointer to the BEGINNING
- of the array of units, regardless of the byte order of the machine.
- On a lowbyte-first machine, such as the Intel 80x86, this unitptr
- points to the LEAST significant unit, and the array of units increases
- significance to the right. On a highbyte-first machine, such as the
- Motorola 680x0, this unitptr points to the MOST significant unit, and
- the array of units decreases significance to the right.
-
- Modified 8 Apr 92 - HAJK
- Implement new VAX/VMS primitive support.
-
- Modified 30 Sep 92 -Castor Fu <castor@drizzle.stanford.edu>
- Upgraded PORTABLE support to allow sizeof(unit) == sizeof(long)
-
- Modified 28 Nov 92 - Thad Smith
- Added Smith modmult, generalized non-portable support.
- */
-
- /* #define COUNTMULTS */ /* count modmults for performance studies */
-
- #ifdef DEBUG
- #ifdef MSDOS
- #ifdef __GO32__ /* DJGPP */
- #include <pc.h>
- #else
- #include <conio.h>
- #endif /* __GO32__ */
- #define poll_for_break() {while (kbhit()) getch();}
- #endif /* MSDOS */
- #endif /* DEBUG */
-
- #ifndef poll_for_break
- #define poll_for_break() /* stub */
- #endif
-
- #include "mpilib.h"
-
- #ifdef mp_smula
- #ifdef mp_smul
- Error: Both mp_smula and mp_smul cannot be defined.
- #else
- #define mp_smul mp_smula
- #endif
- #endif
-
- /* set macros for MULTUNIT */
- #ifdef MUNIT8
- #define MULTUNITSIZE 8
- typedef unsigned char MULTUNIT;
- #ifdef UNIT8
- #define MULTUNIT_SIZE_SAME
- #endif
- #else /* not MUNIT8 */
- #ifdef MUNIT32
- #define MULTUNITSIZE 32
- typedef unsigned long MULTUNIT;
- #ifdef UNIT32
- #define MULTUNIT_SIZE_SAME
- #else
- /* #error is not portable, this has the same effect */
- #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
- #endif
- #else /* assume MUNIT16 */
- #define MULTUNITSIZE 16
- typedef unsigned short MULTUNIT;
- #ifdef UNIT16
- #define MULTUNIT_SIZE_SAME
- #endif /* UNIT16 */
- #ifdef UNIT8
- #include "UNITSIZE cannot be smaller than MULTUNITSIZE"
- #endif /* UNIT8 */
- #endif /* MUNIT32 */
- #endif /* MUNIT8 */
-
- #define MULTUNIT_msb ((MULTUNIT)1 << (MULTUNITSIZE-1)) /* msb of MULTUNIT */
- #define DMULTUNIT_msb (1L << (2*MULTUNITSIZE-1))
- #define MULTUNIT_mask ((MULTUNIT)((1L << MULTUNITSIZE)-1))
- #define MULTUNITs_perunit (UNITSIZE/MULTUNITSIZE)
-
-
- void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier);
- void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier);
-
- short global_precision = 0; /* units of precision for all routines */
- /* global_precision is the unit precision last set by set_precision.
- Initially, set_precision() should be called to define global_precision
- before using any of these other multiprecision library routines.
- i.e.: set_precision(MAX_UNIT_PRECISION);
- */
-
- /*************** multiprecision library primitives ****************/
- /* The following portable C primitives should be recoded in assembly.
- The entry point name should be defined, in "mpilib.h" to the external
- entry point name. If undefined, the C version will be used.
- */
-
- typedef unsigned long int ulint;
-
- #ifndef mp_addc
- boolean mp_addc
- (register unitptr r1,register unitptr r2,register boolean carry)
- /* multiprecision add with carry r2 to r1, result in r1 */
- /* carry is incoming carry flag-- value should be 0 or 1 */
- { register unit x;
- short precision; /* number of units to add */
- precision = global_precision;
- make_lsbptr(r1,precision);
- make_lsbptr(r2,precision);
- while (precision--)
- {
- if (carry)
- { x = *r1 + *r2 + 1;
- carry = (*r2 >= (unit)(~ *r1));
- } else
- { x = *r1 + *r2;
- carry = (x < *r1) ;
- }
- post_higherunit(r2);
- *post_higherunit(r1) = x;
- }
- return(carry); /* return the final carry flag bit */
- } /* mp_addc */
- #endif /* mp_addc */
-
- #ifndef mp_subb
- boolean mp_subb
- (register unitptr r1,register unitptr r2,register boolean borrow)
- /* multiprecision subtract with borrow, r2 from r1, result in r1 */
- /* borrow is incoming borrow flag-- value should be 0 or 1 */
- { register unit x;
- short precision; /* number of units to subtract */
- precision = global_precision;
- make_lsbptr(r1,precision);
- make_lsbptr(r2,precision);
- while (precision--)
- { if (borrow)
- { x = *r1 - *r2 - 1;
- borrow = (*r1 <= *r2);
- } else
- { x = *r1 - *r2;
- borrow = (*r1 < *r2);
- }
- post_higherunit(r2);
- *post_higherunit(r1) = x;
- }
- return(borrow); /* return the final carry/borrow flag bit */
- } /* mp_subb */
- #endif /* mp_subb */
-
- #ifndef mp_rotate_left
- boolean mp_rotate_left(register unitptr r1,register boolean carry)
- /* multiprecision rotate left 1 bit with carry, result in r1. */
- /* carry is incoming carry flag-- value should be 0 or 1 */
- { register int precision; /* number of units to rotate */
- unsigned int mcarry = carry, nextcarry; /* int is supposed to be
- * the efficient size for ops*/
- precision = global_precision;
- make_lsbptr(r1,precision);
- while (precision--)
- {
- nextcarry = (((signedunit) *r1) < 0);
- *r1 = (*r1 << 1) | mcarry;
- mcarry = nextcarry;
- pre_higherunit(r1);
- }
- return(nextcarry); /* return the final carry flag bit */
- } /* mp_rotate_left */
- #endif /* mp_rotate_left */
-
- /************** end of primitives that should be in assembly *************/
-
-
- /* The mp_shift_right_bits function is not called in any time-critical
- situations in public-key cryptographic functions, so it doesn't
- need to be coded in assembly language.
- */
- void mp_shift_right_bits(register unitptr r1,register short bits)
- /* multiprecision shift right bits, result in r1.
- bits is how many bits to shift, must be <= UNITSIZE.
- */
- { register short precision; /* number of units to shift */
- register unit carry,nextcarry,bitmask;
- register short unbits;
- if (bits==0) return; /* shift zero bits is a no-op */
- carry = 0;
- bitmask = power_of_2(bits)-1;
- unbits = UNITSIZE-bits; /* shift bits must be <= UNITSIZE */
- precision = global_precision;
- make_msbptr(r1,precision);
- if (bits == UNITSIZE) {
- while (precision--) {
- nextcarry = *r1;
- *r1 = carry;
- carry = nextcarry;
- pre_lowerunit(r1);
- }
- } else {
- while (precision--)
- { nextcarry = *r1 & bitmask;
- *r1 >>= bits;
- *r1 |= carry << unbits;
- carry = nextcarry;
- pre_lowerunit(r1);
- }
- }
- } /* mp_shift_right_bits */
-
-
- #ifndef mp_compare
- short mp_compare(register unitptr r1,register unitptr r2)
- /* Compares multiprecision integers *r1, *r2, and returns:
- -1 iff *r1 < *r2
- 0 iff *r1 == *r2
- +1 iff *r1 > *r2
- */
- { register short precision; /* number of units to compare */
-
- precision = global_precision;
- make_msbptr(r1,precision);
- make_msbptr(r2,precision);
- do
- { if (*r1 < *r2)
- return(-1);
- if (*post_lowerunit(r1) > *post_lowerunit(r2))
- return(1);
- } while (--precision);
- return(0); /* *r1 == *r2 */
- } /* mp_compare */
- #endif /* mp_compare */
-
-
- boolean mp_inc(register unitptr r)
- /* Increment multiprecision integer r. */
- { register short precision;
- precision = global_precision;
- make_lsbptr(r,precision);
- do
- { if ( ++(*r) ) return(0); /* no carry */
- post_higherunit(r);
- } while (--precision);
- return(1); /* return carry set */
- } /* mp_inc */
-
-
- boolean mp_dec(register unitptr r)
- /* Decrement multiprecision integer r. */
- { register short precision;
- precision = global_precision;
- make_lsbptr(r,precision);
- do
- { if ( (signedunit) (--(*r)) != (signedunit) -1 )
- return(0); /* no borrow */
- post_higherunit(r);
- } while (--precision);
- return(1); /* return borrow set */
- } /* mp_dec */
-
-
- void mp_neg(register unitptr r)
- /* Compute 2's complement, the arithmetic negative, of r */
- { register short precision; /* number of units to negate */
- precision = global_precision;
- mp_dec(r); /* 2's complement is 1's complement plus 1 */
- do /* now do 1's complement */
- { *r = ~(*r);
- r++;
- } while (--precision);
- } /* mp_neg */
-
- #ifndef mp_move
- void mp_move(register unitptr dst,register unitptr src)
- { register short precision; /* number of units to move */
- precision = global_precision;
- do { *dst++ = *src++; } while (--precision);
- } /* mp_move */
- #endif /* mp_move */
-
- void mp_init(register unitptr r, word16 value)
- /* Init multiprecision register r with short value. */
- { /* Note that mp_init doesn't extend sign bit for >32767 */
-
- unitfill0( r, global_precision);
- make_lsbptr(r,global_precision);
- *post_higherunit(r) = value;
- #ifdef UNIT8
- *post_higherunit(r) = value >> UNITSIZE;
- #endif
- } /* mp_init */
-
-
- short significance(register unitptr r)
- /* Returns number of significant units in r */
- { register short precision;
- precision = global_precision;
- make_msbptr(r,precision);
- do
- { if (*post_lowerunit(r))
- return(precision);
- } while (--precision);
- return(precision);
- } /* significance */
-
-
- #ifndef unitfill0
- void unitfill0(unitptr r,word16 unitcount)
- /* Zero-fill the unit buffer r. */
- { while (unitcount--) *r++ = 0;
- } /* unitfill0 */
- #endif /* unitfill0 */
-
-
- int mp_udiv(register unitptr remainder,register unitptr quotient,
- register unitptr dividend,register unitptr divisor)
- /* Unsigned divide, treats both operands as positive. */
- { int bits;
- short dprec;
- register unit bitmask;
- if (testeq(divisor,0))
- return(-1); /* zero divisor means divide error */
- mp_init0(remainder);
- mp_init0(quotient);
- /* normalize and compute number of bits in dividend first */
- init_bitsniffer(dividend,bitmask,dprec,bits);
- /* rescale quotient to same precision (dprec) as dividend */
- rescale(quotient,global_precision,dprec);
- make_msbptr(quotient,dprec);
-
- while (bits--)
- { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
- if (mp_compare(remainder,divisor) >= 0)
- { mp_sub(remainder,divisor);
- stuff_bit(quotient,bitmask);
- }
- bump_2bitsniffers(dividend,quotient,bitmask);
- }
- return(0);
- } /* mp_udiv */
-
-
- #ifdef UPTON_OR_SMITH
- #define RECIPMARGIN 0 /* extra margin bits used by mp_recip() */
-
- int mp_recip(register unitptr quotient,register unitptr divisor)
- /* Compute reciprocal (quotient) as 1/divisor. Used by faster modmult. */
- { int bits;
- short qprec;
- register unit bitmask;
- unit remainder[MAX_UNIT_PRECISION];
- if (testeq(divisor,0))
- return(-1); /* zero divisor means divide error */
- mp_init0(remainder);
- mp_init0(quotient);
-
- /* normalize and compute number of bits in quotient first */
- bits = countbits(divisor) + RECIPMARGIN;
- bitmask = bitmsk(bits); /* bitmask within a single unit */
- qprec = bits2units(bits+1);
- mp_setbit(remainder,(bits-RECIPMARGIN)-1);
- /* rescale quotient to precision of divisor + RECIPMARGIN bits */
- rescale(quotient,global_precision,qprec);
- make_msbptr(quotient,qprec);
-
- while (bits--)
- { mp_shift_left(remainder);
- if (mp_compare(remainder,divisor) >= 0)
- { mp_sub(remainder,divisor);
- stuff_bit(quotient,bitmask);
- }
- bump_bitsniffer(quotient,bitmask);
- }
- mp_init0(remainder); /* burn sensitive data left on stack */
- return(0);
- } /* mp_recip */
- #endif /* UPTON_OR_SMITH */
-
-
- int mp_div(register unitptr remainder,register unitptr quotient,
- register unitptr dividend,register unitptr divisor)
- /* Signed divide, either or both operands may be negative. */
- { boolean dvdsign,dsign;
- int status;
- dvdsign = (boolean)(mp_tstminus(dividend)!=0);
- dsign = (boolean)(mp_tstminus(divisor)!=0);
- if (dvdsign) mp_neg(dividend);
- if (dsign) mp_neg(divisor);
- status = mp_udiv(remainder,quotient,dividend,divisor);
- if (dvdsign) mp_neg(dividend); /* repair caller's dividend */
- if (dsign) mp_neg(divisor); /* repair caller's divisor */
- if (status<0) return(status); /* divide error? */
- if (dvdsign) mp_neg(remainder);
- if (dvdsign ^ dsign) mp_neg(quotient);
- return(status);
- } /* mp_div */
-
-
- word16 mp_shortdiv(register unitptr quotient,
- register unitptr dividend,register word16 divisor)
- /* This function does a fast divide and mod on a multiprecision dividend
- using a short integer divisor returning a short integer remainder.
- This is an unsigned divide. It treats both operands as positive.
- It is used mainly for faster printing of large numbers in base 10.
- */
- { int bits;
- short dprec;
- register unit bitmask;
- register word16 remainder;
- if (!divisor) /* if divisor == 0 */
- return(-1); /* zero divisor means divide error */
- remainder=0;
- mp_init0(quotient);
- /* normalize and compute number of bits in dividend first */
- init_bitsniffer(dividend,bitmask,dprec,bits);
- /* rescale quotient to same precision (dprec) as dividend */
- rescale(quotient,global_precision,dprec);
- make_msbptr(quotient,dprec);
-
- while (bits--)
- { remainder <<= 1;
- if (sniff_bit(dividend,bitmask))
- remainder++;
- if (remainder >= divisor)
- { remainder -= divisor;
- stuff_bit(quotient,bitmask);
- }
- bump_2bitsniffers(dividend,quotient,bitmask);
- }
- return(remainder);
- } /* mp_shortdiv */
-
-
- int mp_mod(register unitptr remainder,
- register unitptr dividend,register unitptr divisor)
- /* Unsigned divide, treats both operands as positive. */
- { int bits;
- short dprec;
- register unit bitmask;
- if (testeq(divisor,0))
- return(-1); /* zero divisor means divide error */
- mp_init0(remainder);
- /* normalize and compute number of bits in dividend first */
- init_bitsniffer(dividend,bitmask,dprec,bits);
-
- while (bits--)
- { mp_rotate_left(remainder,(boolean)(sniff_bit(dividend,bitmask)!=0));
- msub(remainder,divisor);
- bump_bitsniffer(dividend,bitmask);
- }
- return(0);
- } /* mp_mod */
-
-
- word16 mp_shortmod(register unitptr dividend,register word16 divisor)
- /* This function does a fast mod operation on a multiprecision dividend
- using a short integer modulus returning a short integer remainder.
- This is an unsigned divide. It treats both operands as positive.
- It is used mainly for fast sieve searches for large primes.
- */
- { int bits;
- short dprec;
- register unit bitmask;
- register word16 remainder;
- if (!divisor) /* if divisor == 0 */
- return(-1); /* zero divisor means divide error */
- remainder=0;
- /* normalize and compute number of bits in dividend first */
- init_bitsniffer(dividend,bitmask,dprec,bits);
-
- while (bits--)
- { remainder <<= 1;
- if (sniff_bit(dividend,bitmask))
- remainder++;
- if (remainder >= divisor) remainder -= divisor;
- bump_bitsniffer(dividend,bitmask);
- }
- return(remainder);
- } /* mp_shortmod */
-
-
-
- #ifdef COMB_MULT /* use faster "comb" multiply algorithm */
- /* We are skipping this code because it has a bug... */
-
- int mp_mult(register unitptr prod,
- register unitptr multiplicand, register unitptr multiplier)
- /* Computes multiprecision prod = multiplicand * multiplier */
- { /* Uses interleaved comb multiply algorithm.
- This improved multiply more than twice as fast as a Russian
- peasant multiply, because it does a lot fewer shifts.
- Must have global_precision set to the size of the multiplicand
- plus UNITSIZE-1 SLOP_BITS. Produces a product that is the sum
- of the lengths of the multiplier and multiplicand.
-
- BUG ALERT: Unfortunately, this code has a bug. It fails for
- some numbers. One such example:
- x= 59DE 60CE 2345 8091 A02B 2A1C DBC3 8BE5
- x*x= 59DE 60CE 2345 26B3 993B 67A5 2499 0B7D
- 52C8 CDC7 AFB3 61C8 243C 741B
- --which is obviously wrong. The answer should be:
- x*x= 1F8C 607B 5EA6 C061 2714 04A9 A0C6 A17A
- C9AB 6095 C62F 3756 3843 E4D0 3950 7AD9
- We'll have to fix this some day. In the meantime, we'll
- just have the compiler skip over this code.
-
- BUG NOTE: Possibly fixed. Needs testing.
- */
- int bits;
- register unit bitmask;
- unitptr product, mplier, temp;
- short mprec,mprec2;
- unit mplicand[MAX_UNIT_PRECISION];
-
- /* better clear full width--double precision */
- mp_init(prod+tohigher(global_precision),0);
-
- if (testeq(multiplicand,0))
- return(0); /* zero multiplicand means zero product */
-
- mp_move(mplicand,multiplicand); /* save it from damage */
-
- normalize(multiplier,mprec);
- if (!mprec)
- return(0);
-
- make_lsbptr(multiplier,mprec);
- bitmask = 1; /* start scan at LSB of multiplier */
-
- do /* UNITSIZE times */
- { /* do for bits 0-15 */
- product = prod;
- mplier = multiplier;
- mprec2 = mprec;
- while (mprec2--) /* do for each word in multiplier */
- {
- if (sniff_bit(mplier,bitmask))
- { if (mp_addc(product,multiplicand,0)) /* ripple carry */
- { /* After 1st time thru, this is rarely encountered. */
- temp = msbptr(product,global_precision);
- pre_higherunit(temp);
- /* temp now points to LSU of carry region. */
- unmake_lsbptr(temp,global_precision);
- mp_inc(temp);
- } /* ripple carry */
- }
- pre_higherunit(mplier);
- pre_higherunit(product);
- }
- if (!(bitmask <<= 1))
- break;
- mp_shift_left(multiplicand);
-
- } while (TRUE);
-
- mp_move(multiplicand,mplicand); /* recover */
-
- return(0); /* normal return */
- } /* mp_mult */
-
- #endif /* COMB_MULT */
-
-
- /* Because the "comb" multiply has a bug, we will use the slower
- Russian peasant multiply instead. Fortunately, the mp_mult
- function is not called from any time-critical code.
- */
-
- int mp_mult(register unitptr prod,
- register unitptr multiplicand,register unitptr multiplier)
- /* Computes multiprecision prod = multiplicand * multiplier */
- { /* Uses "Russian peasant" multiply algorithm. */
- int bits;
- register unit bitmask;
- short mprec;
- mp_init(prod,0);
- if (testeq(multiplicand,0))
- return(0); /* zero multiplicand means zero product */
- /* normalize and compute number of bits in multiplier first */
- init_bitsniffer(multiplier,bitmask,mprec,bits);
-
- while (bits--)
- { mp_shift_left(prod);
- if (sniff_bit(multiplier,bitmask))
- mp_add(prod,multiplicand);
- bump_bitsniffer(multiplier,bitmask);
- }
- return(0);
- } /* mp_mult */
-
-
-
- /* mp_modmult computes a multiprecision multiply combined with a
- modulo operation. This is the most time-critical function in
- this multiprecision arithmetic library for performing modulo
- exponentiation. We experimented with different versions of modmult,
- depending on the machine architecture and performance requirements.
- We will either use a Russian Peasant modmult (peasant_modmult),
- Charlie Merritt's modmult (merritt_modmult), Jimmy Upton's
- modmult (upton_modmult), or Thad Smith's modmult (smith_modmult).
- On machines with a hardware atomic multiply instruction,
- Smith's modmult is fastest. It can utilize assembly subroutines to
- speed up the hardware multiply logic and trial quotient calculation.
- If the machine lacks a fast hardware multiply, Merritt's modmult
- is preferred, which doesn't call any assembly multiply routine.
- We use the alias names mp_modmult, stage_modulus, and modmult_burn
- for the corresponding true names, which depend on what flavor of
- modmult we are using.
-
- Before making the first call to mp_modmult, you must set up the
- modulus-dependant precomputated tables by calling stage_modulus.
- After making all the calls to mp_modmult, you call modmult_burn to
- erase the tables created by stage_modulus that were left in memory.
- */
-
- #ifdef COUNTMULTS
- /* "number of modmults" counters, used for performance studies. */
- static unsigned int tally_modmults = 0;
- static unsigned int tally_modsquares = 0;
- #endif /* COUNTMULTS */
-
-
- #ifdef PEASANT
- /* Conventional Russian peasant multiply with modulo algorithm. */
-
- static unitptr pmodulus = 0; /* used only by mp_modmult */
-
- int stage_peasant_modulus(unitptr n)
- /* Must pass modulus to stage_modulus before calling modmult.
- Assumes that global_precision has already been adjusted to the
- size of the modulus, plus SLOP_BITS.
- */
- { /* For this simple version of modmult, just copy unit pointer. */
- pmodulus = n;
- return(0); /* normal return */
- } /* stage_peasant_modulus */
-
-
- int peasant_modmult(register unitptr prod,
- unitptr multiplicand,register unitptr multiplier)
- { /* "Russian peasant" multiply algorithm, combined with a modulo
- operation. This is a simple naive replacement for Merritt's
- faster modmult algorithm. References global unitptr "modulus".
- Computes: prod = (multiplicand*multiplier) mod modulus
- WARNING: All the arguments must be less than the modulus!
- */
- int bits;
- register unit bitmask;
- short mprec;
- mp_init(prod,0);
- /* if (testeq(multiplicand,0))
- return(0); */ /* zero multiplicand means zero product */
- /* normalize and compute number of bits in multiplier first */
- init_bitsniffer(multiplier,bitmask,mprec,bits);
-
- while (bits--)
- { mp_shift_left(prod);
- msub(prod,pmodulus); /* turns mult into modmult */
- if (sniff_bit(multiplier,bitmask))
- { mp_add(prod,multiplicand);
- msub(prod,pmodulus); /* turns mult into modmult */
- }
- bump_bitsniffer(multiplier,bitmask);
- }
- return(0);
- } /* peasant_modmult */
-
-
- /* If we are using a version of mp_modmult that uses internal tables
- in memory, we have to call modmult_burn() at the end of mp_modexp.
- This is so that no sensitive data is left in memory after the program
- exits. The Russian peasant method doesn't use any such tables.
- */
- void peasant_burn(void)
- /* Alias for modmult_burn, called only from mp_modexp(). Destroys
- internal modmult tables. This version does nothing because no
- tables are used by the Russian peasant modmult. */
- { } /* peasant_burn */
-
- #endif /* PEASANT */
-
-
- #ifdef MERRITT
- /*=========================================================================*/
- /*
- This is Charlie Merritt's MODMULT algorithm, implemented in C by PRZ.
- Also refined by Zhahai Stewart to reduce the number of subtracts.
- Modified by Raymond Brand to reduce the number of SLOP_BITS by 1.
- It performs a multiply combined with a modulo operation, without
- going into "double precision". It is faster than the Russian peasant
- method, and still works well on machines that lack a fast hardware
- multiply instruction.
- */
-
- /* The following support functions, macros, and data structures
- are used only by Merritt's modmult algorithm... */
-
- static void mp_lshift_unit(register unitptr r1)
- /* Shift r1 1 whole unit to the left. Used only by modmult function. */
- { register short precision;
- register unitptr r2;
- precision = global_precision;
- make_msbptr(r1,precision);
- r2 = r1;
- while (--precision)
- *post_lowerunit(r1) = *pre_lowerunit(r2);
- *r1 = 0;
- } /* mp_lshift_unit */
-
-
- /* moduli_buf contains shifted images of the modulus, set by stage_modulus */
- static unit moduli_buf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
- static unitptr moduli[UNITSIZE] = /* contains pointers into moduli_buf */
- { 0
- ,&moduli_buf[ 0][0], &moduli_buf[ 1][0], &moduli_buf[ 2][0], &moduli_buf[ 3][0],
- &moduli_buf[ 4][0], &moduli_buf[ 5][0], &moduli_buf[ 6][0]
- #ifndef UNIT8
- ,&moduli_buf[ 7][0]
- ,&moduli_buf[ 8][0], &moduli_buf[ 9][0], &moduli_buf[10][0], &moduli_buf[11][0],
- &moduli_buf[12][0], &moduli_buf[13][0], &moduli_buf[14][0]
- #ifndef UNIT16 /* and not UNIT8 */
- ,&moduli_buf[15][0]
- ,&moduli_buf[16][0], &moduli_buf[17][0], &moduli_buf[18][0], &moduli_buf[19][0],
- &moduli_buf[20][0], &moduli_buf[21][0], &moduli_buf[22][0], &moduli_buf[23][0],
- &moduli_buf[24][0], &moduli_buf[25][0], &moduli_buf[26][0], &moduli_buf[27][0],
- &moduli_buf[28][0], &moduli_buf[29][0], &moduli_buf[30][0]
- #endif /* UNIT16 and UNIT8 not defined */
- #endif /* UNIT8 not defined */
- };
-
- /* To optimize msubs, need following 2 unit arrays, each filled
- with the most significant unit and next-to-most significant unit
- of the preshifted images of the modulus. */
- static unit msu_moduli[UNITSIZE] = {0}; /* most signif. unit */
- static unit nmsu_moduli[UNITSIZE] = {0}; /* next-most signif. unit */
-
- /* mpdbuf contains preshifted images of the multiplicand, mod n.
- It is used only by mp_modmult. It could be staticly declared
- inside of mp_modmult, but we put it outside mp_modmult so that
- it can be wiped clean by modmult_burn(), which is called at the
- end of mp_modexp. This is so that no sensitive data is left in
- memory after the program exits.
- */
- static unit mpdbuf[UNITSIZE-1][MAX_UNIT_PRECISION] = {0};
-
-
- static void stage_mp_images(unitptr images[UNITSIZE],unitptr r)
- /* Computes UNITSIZE images of r, each shifted left 1 more bit.
- Used only by modmult function.
- */
- { short int i;
- images[0] = r; /* no need to move the first image, just copy ptr */
- for (i=1; i<UNITSIZE; i++)
- { mp_move(images[i],images[i-1]);
- mp_shift_left(images[i]);
- msub(images[i],moduli[0]);
- }
- } /* stage_mp_images */
-
-
- int stage_merritt_modulus(unitptr n)
- /* Computes UNITSIZE images of modulus, each shifted left 1 more bit.
- Before calling mp_modmult, you must first stage the moduli images by
- calling stage_modulus. n is the pointer to the modulus.
- Assumes that global_precision has already been adjusted to the
- size of the modulus, plus SLOP_BITS.
- */
- { short int i;
- unitptr msu; /* ptr to most significant unit, for faster msubs */
- moduli[0] = n; /* no need to move the first image, just copy ptr */
-
- /* used by optimized msubs macro... */
- msu = msbptr(n,global_precision); /* needed by msubs */
- msu_moduli[0] = *post_lowerunit(msu);
- nmsu_moduli[0] = *msu;
-
- for (i=1; i<UNITSIZE; i++)
- { mp_move(moduli[i],moduli[i-1]);
- mp_shift_left(moduli[i]);
-
- /* used by optimized msubs macro... */
- msu = msbptr(moduli[i],global_precision); /* needed by msubs */
- msu_moduli[i] = *post_lowerunit(msu); /* for faster msubs */
- nmsu_moduli[i] = *msu;
- }
- return(0); /* normal return */
- } /* stage_merritt_modulus */
-
-
- /* The following macros, sniffadd and msubs, are used by modmult... */
- #define sniffadd(i) if (*multiplier & power_of_2(i)) mp_add(prod,mpd[i])
-
- /* Unoptimized msubs macro (msubs0) follows... */
- /* #define msubs0(i) msub(prod,moduli[i])
- */
-
- /* To optimize msubs, compare the most significant units of the
- product and the shifted modulus before deciding to call the full
- mp_compare routine. Better still, compare the upper two units
- before deciding to call mp_compare.
- Optimization of msubs relies heavily on standard C left-to-right
- parsimonious evaluation of logical expressions.
- */
-
- /* Partially-optimized msubs macro (msubs1) follows... */
- /* #define msubs1(i) if ( \
- ((p_m = (*msu_prod-msu_moduli[i])) >= 0) && \
- (p_m || (mp_compare(prod,moduli[i]) >= 0) ) \
- ) mp_sub(prod,moduli[i])
- */
-
- /* Fully-optimized msubs macro (msubs2) follows... */
- #define msubs(i) if (((p_m = *msu_prod-msu_moduli[i])>0) || ( \
- (p_m==0) && ( (*nmsu_prod>nmsu_moduli[i]) || ( \
- (*nmsu_prod==nmsu_moduli[i]) && ((mp_compare(prod,moduli[i]) >= 0)) ))) ) \
- mp_sub(prod,moduli[i])
-
-
- int merritt_modmult(register unitptr prod,
- unitptr multiplicand,register unitptr multiplier)
- /* Performs combined multiply/modulo operation.
- Computes: prod = (multiplicand*multiplier) mod modulus
- WARNING: All the arguments must be less than the modulus!
- Assumes the modulus has been predefined by first calling
- stage_modulus.
- */
- {
- /* p_m, msu_prod, and nmsu_prod are used by the optimized msubs macro...*/
- register signedunit p_m;
- register unitptr msu_prod; /* ptr to most significant unit of product */
- register unitptr nmsu_prod; /* next-most signif. unit of product */
- short mprec; /* precision of multiplier, in units */
- /* Array mpd contains a list of pointers to preshifted images of
- the multiplicand: */
- static unitptr mpd[UNITSIZE] =
- { 0, &mpdbuf[ 0][0], &mpdbuf[ 1][0], &mpdbuf[ 2][0],
- &mpdbuf[ 3][0], &mpdbuf[ 4][0], &mpdbuf[ 5][0], &mpdbuf[ 6][0]
- #ifndef UNIT8
- ,&mpdbuf[ 7][0], &mpdbuf[ 8][0], &mpdbuf[ 9][0], &mpdbuf[10][0],
- &mpdbuf[11][0], &mpdbuf[12][0], &mpdbuf[13][0], &mpdbuf[14][0]
- #ifndef UNIT16 /* and not UNIT8 */
- ,&mpdbuf[15][0], &mpdbuf[16][0], &mpdbuf[17][0], &mpdbuf[18][0],
- &mpdbuf[19][0], &mpdbuf[20][0], &mpdbuf[21][0], &mpdbuf[22][0],
- &mpdbuf[23][0], &mpdbuf[24][0], &mpdbuf[25][0], &mpdbuf[26][0],
- &mpdbuf[27][0], &mpdbuf[28][0], &mpdbuf[29][0], &mpdbuf[30][0]
- #endif /* UNIT16 and UNIT8 not defined */
- #endif /* UNIT8 not defined */
- };
-
- /* Compute preshifted images of multiplicand, mod n: */
- stage_mp_images(mpd,multiplicand);
-
- /* To optimize msubs, set up msu_prod and nmsu_prod: */
- msu_prod = msbptr(prod,global_precision); /* Get ptr to MSU of prod */
- nmsu_prod = msu_prod;
- post_lowerunit(nmsu_prod); /* Get next-MSU of prod */
-
- /* To understand this algorithm, it would be helpful to first
- study the conventional Russian peasant modmult algorithm.
- This one does about the same thing as Russian peasant, but
- is organized differently to save some steps. It loops
- through the multiplier a word (unit) at a time, instead of
- a bit at a time. It word-shifts the product instead of
- bit-shifting it, so it should be faster. It also does about
- half as many subtracts as Russian peasant.
- */
-
- mp_init(prod,0); /* Initialize product to 0. */
-
- /* The way mp_modmult is actually used in cryptographic
- applications, there will NEVER be a zero multiplier or
- multiplicand. So there is no need to optimize for that
- condition.
- */
- /* if (testeq(multiplicand,0))
- return(0); */ /* zero multiplicand means zero product */
- /* Normalize and compute number of units in multiplier first: */
- normalize(multiplier,mprec);
- if (mprec==0) /* if precision of multiplier is 0 */
- return(0); /* zero multiplier means zero product */
- make_msbptr(multiplier,mprec); /* start at MSU of multiplier */
-
- while (mprec--) /* Loop for the number of units in the multiplier */
- {
- /* Shift the product one whole unit to the left.
- This is part of the multiply phase of modmult.
- */
-
- mp_lshift_unit(prod);
-
- /* The product may have grown by as many as UNITSIZE
- bits. That's why we have global_precision set to the
- size of the modulus plus UNITSIZE slop bits.
- Now reduce the product back down by conditionally
- subtracting the preshifted images of the modulus.
- This is part of the modulo reduction phase of modmult.
- The following loop is unrolled for speed, using macros...
-
- for (i=UNITSIZE-1; i>=LOG_UNITSIZE; i--)
- if (mp_compare(prod,moduli[i]) >= 0)
- mp_sub(prod,moduli[i]);
- */
-
- #ifndef UNIT8
- #ifndef UNIT16 /* and not UNIT8 */
- msubs(31);
- msubs(30);
- msubs(29);
- msubs(28);
- msubs(27);
- msubs(26);
- msubs(25);
- msubs(24);
- msubs(23);
- msubs(22);
- msubs(21);
- msubs(20);
- msubs(19);
- msubs(18);
- msubs(17);
- msubs(16);
- #endif /* not UNIT16 and not UNIT8 */
- msubs(15);
- msubs(14);
- msubs(13);
- msubs(12);
- msubs(11);
- msubs(10);
- msubs(9);
- msubs(8);
- #endif /* not UNIT8 */
- msubs(7);
- msubs(6);
- msubs(5);
- #ifndef UNIT32
- msubs(4);
- #ifndef UNIT16
- msubs(3);
- #endif
- #endif
-
- /* Sniff each bit in the current unit of the multiplier,
- and conditionally add the corresponding preshifted
- image of the multiplicand to the product.
- This is also part of the multiply phase of modmult.
-
- The following loop is unrolled for speed, using macros...
-
- for (i=UNITSIZE-1; i>=0; i--)
- if (*multiplier & power_of_2(i))
- mp_add(prod,mpd[i]);
- */
- #ifndef UNIT8
- #ifndef UNIT16 /* and not UNIT8 */
- sniffadd(31);
- sniffadd(30);
- sniffadd(29);
- sniffadd(28);
- sniffadd(27);
- sniffadd(26);
- sniffadd(25);
- sniffadd(24);
- sniffadd(23);
- sniffadd(22);
- sniffadd(21);
- sniffadd(20);
- sniffadd(19);
- sniffadd(18);
- sniffadd(17);
- sniffadd(16);
- #endif /* not UNIT16 and not UNIT8 */
- sniffadd(15);
- sniffadd(14);
- sniffadd(13);
- sniffadd(12);
- sniffadd(11);
- sniffadd(10);
- sniffadd(9);
- sniffadd(8);
- #endif /* not UNIT8 */
- sniffadd(7);
- sniffadd(6);
- sniffadd(5);
- sniffadd(4);
- sniffadd(3);
- sniffadd(2);
- sniffadd(1);
- sniffadd(0);
-
- /* The product may have grown by as many as LOG_UNITSIZE+1
- bits.
- Now reduce the product back down by conditionally
- subtracting LOG_UNITSIZE+1 preshifted images of the
- modulus. This is the modulo reduction phase of modmult.
-
- The following loop is unrolled for speed, using macros...
-
- for (i=LOG_UNITSIZE; i>=0; i--)
- if (mp_compare(prod,moduli[i]) >= 0)
- mp_sub(prod,moduli[i]);
- */
-
- #ifndef UNIT8
- #ifndef UNIT16
- msubs(5);
- #endif
- msubs(4);
- #endif
- msubs(3);
- msubs(2);
- msubs(1);
- msubs(0);
-
- /* Bump pointer to next lower unit of multiplier: */
- post_lowerunit(multiplier);
-
- } /* Loop for the number of units in the multiplier */
-
- return(0); /* normal return */
-
- } /* merritt_modmult */
-
-
- #undef msubs
- #undef sniffadd
-
-
- /* Merritt's mp_modmult function leaves some internal tables in memory,
- so we have to call modmult_burn() at the end of mp_modexp.
- This is so that no cryptographically sensitive data is left in memory
- after the program exits.
- */
- void merritt_burn(void)
- /* Alias for modmult_burn, merritt_burn() is called only by mp_modexp. */
- { unitfill0(&(mpdbuf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
- unitfill0(&(moduli_buf[0][0]),(UNITSIZE-1)*MAX_UNIT_PRECISION);
- unitfill0(msu_moduli,UNITSIZE);
- unitfill0(nmsu_moduli,UNITSIZE);
- } /* merritt_burn() */
-
- /******* end of Merritt's MODMULT stuff. *******/
- /*=========================================================================*/
- #endif /* MERRITT */
-
-
- #ifdef UPTON_OR_SMITH /* Used by Upton's and Smith's modmult algorithms */
-
- /* Jimmy Upton's multiprecision modmult algorithm in C.
- Performs a multiply combined with a modulo operation.
-
- The following support functions and data structures
- are used only by Upton's modmult algorithm...
- */
-
- short munit_prec; /* global_precision expressed in MULTUNITs */
-
- /* Note that since the SPARC CPU has no hardware integer multiply
- instruction, there is not that much advantage in having an
- assembly version of mp_smul on that machine. It might be faster
- to use Merritt's modmult instead of Upton's modmult on the SPARC.
- */
-
- /*
- Multiply the single-word multiplier times the multiprecision integer
- in multiplicand, accumulating result in prod. The resulting
- multiprecision prod will be 1 word longer than the multiplicand.
- multiplicand is munit_prec words long. We add into prod, so caller
- should zero it out first. For best results, this time-critical
- function should be implemented in assembly.
- NOTE: Unlike other functions in the multiprecision arithmetic
- library, both multiplicand and prod are pointing at the LSB,
- regardless of byte order of the machine. On an 80x86, this makes
- no difference. But if this assembly function is implemented
- on a 680x0, it becomes important.
- */
- /* Note that this has been modified from the previous version to allow
- better support for Smith's modmult:
- The final carry bit is added to the existing product
- array, rather than simply stored.
- */
-
- #ifndef mp_smul
- void mp_smul (MULTUNIT *prod, MULTUNIT *multiplicand, MULTUNIT multiplier)
- {
- short i;
- unsigned long p, carry;
-
- carry = 0;
- for (i=0; i<munit_prec; ++i)
- { p = (unsigned long)multiplier * *post_higherunit(multiplicand);
- p += *prod + carry;
- *post_higherunit(prod) = (MULTUNIT)p;
- carry = p >> MULTUNITSIZE;
- }
- /* Add carry to the next higher word of product / dividend */
- *prod += (MULTUNIT)carry;
- } /* mp_smul */
- #endif
-
- /* mp_dmul is a double-precision multiply multiplicand times multiplier,
- result into prod. prod must be pointing at a "double precision"
- buffer. E.g. If global_precision is 10 words, prod must be
- pointing at a 20-word buffer.
- */
- #ifndef mp_dmul
- void mp_dmul (unitptr prod, unitptr multiplicand, unitptr multiplier)
- {
- register int i;
- register MULTUNIT *p_multiplicand, *p_multiplier;
- register MULTUNIT *prodp;
-
-
- unitfill0(prod,global_precision*2); /* Pre-zero prod */
- /* Calculate precision in units of MULTUNIT */
- munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
- p_multiplicand = (MULTUNIT *)multiplicand;
- p_multiplier = (MULTUNIT *)multiplier;
- prodp = (MULTUNIT *)prod;
- make_lsbptr(p_multiplicand,munit_prec);
- make_lsbptr(p_multiplier,munit_prec);
- make_lsbptr(prodp,munit_prec*2);
- /* Multiply multiplicand by each word in multiplier, accumulating prod: */
- for (i=0; i<munit_prec; ++i)
- mp_smul (post_higherunit(prodp),
- p_multiplicand, *post_higherunit(p_multiplier));
- } /* mp_dmul */
- #endif /* mp_dmul */
-
- static unit ALIGN modulus[MAX_UNIT_PRECISION];
- static short nbits; /* number of modulus significant bits */
- #endif /* UPTON_OR_SMITH */
-
- #ifdef UPTON
-
- /* These scratchpad arrays are used only by upton_modmult (mp_modmult).
- Some of them could be staticly declared inside of mp_modmult, but we
- put them outside mp_modmult so that they can be wiped clean by
- modmult_burn(), which is called at the end of mp_modexp. This is
- so that no sensitive data is left in memory after the program exits.
- */
-
- static unit ALIGN reciprocal[MAX_UNIT_PRECISION];
- static unit ALIGN dhi[MAX_UNIT_PRECISION];
- static unit ALIGN d_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
- static unit ALIGN e_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
- static unit ALIGN f_data[MAX_UNIT_PRECISION*2]; /* double-wide register */
-
- static short nbitsDivUNITSIZE;
- static short nbitsModUNITSIZE;
-
- /* stage_upton_modulus() is aliased to stage_modulus().
- Prepare for an Upton modmult. Calculate the reciprocal of modulus,
- and save both. Note that reciprocal will have one more bit than
- modulus.
- Assumes that global_precision has already been adjusted to the
- size of the modulus, plus SLOP_BITS.
- */
- int stage_upton_modulus(unitptr n)
- {
- mp_move(modulus, n);
- mp_recip(reciprocal, modulus);
- nbits = countbits(modulus);
- nbitsDivUNITSIZE = nbits / UNITSIZE;
- nbitsModUNITSIZE = nbits % UNITSIZE;
- return(0); /* normal return */
- } /* stage_upton_modulus */
-
-
- /* Upton's algorithm performs a multiply combined with a modulo operation.
- Computes: prod = (multiplicand*multiplier) mod modulus
- WARNING: All the arguments must be less than the modulus!
- References global unitptr modulus and reciprocal.
- The reciprocal of modulus is 1 bit longer than the modulus.
- upton_modmult() is aliased to mp_modmult().
- */
- int upton_modmult (unitptr prod, unitptr multiplicand, unitptr multiplier)
- {
- unitptr d = d_data;
- unitptr d1 = d_data;
- unitptr e = e_data;
- unitptr f = f_data;
- short orig_precision;
-
- orig_precision = global_precision;
- mp_dmul (d,multiplicand,multiplier);
-
- /* Throw off low nbits of d */
- #ifndef HIGHFIRST
- d1 = d + nbitsDivUNITSIZE;
- #else
- d1 = d + orig_precision - nbitsDivUNITSIZE;
- #endif
- mp_move(dhi, d1); /* Don't screw up d, we need it later */
- mp_shift_right_bits(dhi, nbitsModUNITSIZE);
-
- mp_dmul (e,dhi,reciprocal); /* Note - reciprocal has nbits+1 bits */
-
- #ifndef HIGHFIRST
- e += nbitsDivUNITSIZE;
- #else
- e += orig_precision - nbitsDivUNITSIZE;
- #endif
- mp_shift_right_bits(e, nbitsModUNITSIZE);
-
- mp_dmul (f,e,modulus);
-
- /* Now for the only double-precision call to mpilib: */
- set_precision (orig_precision * 2);
- mp_sub (d,f);
-
- /* d's precision should be <= orig_precision */
- rescale (d, orig_precision*2, orig_precision);
- set_precision (orig_precision);
-
- /* Should never have to do this final subtract more than twice: */
- while (mp_compare(d,modulus) > 0)
- mp_sub (d,modulus);
-
- mp_move(prod,d);
- return(0); /* normal return */
- } /* upton_modmult */
-
-
- /* Upton's mp_modmult function leaves some internal arrays in memory,
- so we have to call modmult_burn() at the end of mp_modexp.
- This is so that no cryptographically sensitive data is left in memory
- after the program exits.
- upton_burn() is aliased to modmult_burn().
- */
- void upton_burn(void)
- {
- unitfill0(modulus,MAX_UNIT_PRECISION);
- unitfill0(reciprocal,MAX_UNIT_PRECISION);
- unitfill0(dhi,MAX_UNIT_PRECISION);
- unitfill0(d_data,MAX_UNIT_PRECISION*2);
- unitfill0(e_data,MAX_UNIT_PRECISION*2);
- unitfill0(f_data,MAX_UNIT_PRECISION*2);
- nbits = nbitsDivUNITSIZE = nbitsModUNITSIZE = 0;
- } /* upton_burn */
-
- /******* end of Upton's MODMULT stuff. *******/
- /*=========================================================================*/
- #endif /* UPTON */
-
- #ifdef SMITH /* using Thad Smith's modmult algorithm */
-
- /* Thad Smith's implementation of multiprecision modmult algorithm in C.
- Performs a multiply combined with a modulo operation.
- The multiplication is done with mp_dmul, the same as for Upton's
- modmult. The modulus reduction is done by long division, in
- which a trial quotient "digit" is determined, then the product of
- that digit and the divisor are subtracted from the dividend.
-
- In this case, the digit is MULTUNIT in size and the subtraction
- is done by adding the product to the one's complement of the
- dividend, which allows use of the existing mp_smul routine.
-
- The following support functions and data structures
- are used only by Smith's modmult algorithm...
- */
-
- /* These scratchpad arrays are used only by smith_modmult (mp_modmult).
- Some of them could be statically declared inside of mp_modmult, but we
- put them outside mp_modmult so that they can be wiped clean by
- modmult_burn(), which is called at the end of mp_modexp. This is
- so that no sensitive data is left in memory after the program exits.
- */
-
- static unit ALIGN ds_data[MAX_UNIT_PRECISION*2+2];
-
- static unit mod_quotient [4];
- static unit mod_divisor [4]; /* 2 most signif. MULTUNITs of modulus */
-
- static MULTUNIT *modmpl; /* ptr to modulus least significant
- ** MULTUNIT */
- static int mshift; /* number of bits for
- ** recip scaling */
- static MULTUNIT reciph; /* MSunit of scaled recip */
- static MULTUNIT recipl; /* LSunit of scaled recip */
-
- static short modlenMULTUNITS; /* length of modulus in MULTUNITs */
- static MULTUNIT mutemp; /* temporary */
-
- /* The routines mp_smul and mp_dmul are the same as for UPTON and
- should be coded in assembly. Note, however, that the previous
- Upton's mp_smul version has been modified to compatible with
- Smith's modmult. The new version also still works for Upton's
- modmult.
- */
-
- #ifndef mp_set_recip
- #define mp_set_recip(rh,rl,m) /* null */
- #else
- /* setup routine for external mp_quo_digit */
- void mp_set_recip(MULTUNIT rh, MULTUNIT rl, int m);
- #endif
- MULTUNIT mp_quo_digit (MULTUNIT *dividend);
-
- #ifdef MULTUNIT_SIZE_SAME
- #define mp_musubb mp_subb /* use existing routine */
- #else /* ! MULTUNIT_SIZE_SAME */
-
- /* This performs the same function as mp_subb, but with MULTUNITs.
- Note: Processors without alignment requirements may be able to use
- mp_subb, even though MULTUNITs are smaller than units. In that case,
- use mp_subb, since it would be faster if coded in assembly. Note that
- this implementation won't work for MULTUNITs which are long -- use
- mp_subb in that case.
- */
- #ifndef mp_musubb
- boolean mp_musubb
- (register MULTUNIT* r1,register MULTUNIT* r2,register boolean borrow)
- /* multiprecision subtract of MULTUNITs with borrow, r2 from r1,
- ** result in r1 */
- /* borrow is incoming borrow flag-- value should be 0 or 1 */
- { register ulint x; /* won't work if sizeof(MULTUNIT)==
- sizeof(long) */
- short precision; /* number of MULTUNITs to subtract */
- precision = global_precision * MULTUNITs_perunit;
- make_lsbptr(r1,precision);
- make_lsbptr(r2,precision);
- while (precision--)
- { x = (ulint) *r1 - (ulint) *post_higherunit(r2) - (ulint) borrow;
- *post_higherunit(r1) = x;
- borrow = (((1L << MULTUNITSIZE) & x) != 0L);
- }
- return (borrow);
- } /* mp_musubb */
- #endif /* mp_musubb */
- #endif /* MULTUNIT_SIZE_SAME */
-
- /* The function mp_quo_digit is the heart of Smith's modulo reduction,
- which uses a form of long division. It computes a trial quotient
- "digit" (MULTUNIT-sized digit) by multiplying the three most
- significant MULTUNITs of the dividend by the two most significant
- MULTUNITs of the reciprocal of the modulus. Note that this function
- requires that MULTUNITSIZE * 2 <= sizeof(unsigned long).
-
- An important part of this technique is that the quotient never be
- too small, although it may occasionally be too large. This was
- done to eliminate the need to check and correct for a remainder
- exceeding the divisor. It is easier to check for a negative
- remainder. The following technique rarely needs correction for
- MULTUNITs of at least 16 bits.
-
- The following routine has two implementations:
-
- ASM_PROTOTYPE defined: written to be an executable prototype for
- an efficient assembly language implementation. Note that several
- of the following masks and shifts can be done by directly
- manipulating single precision registers on some architectures.
-
- ASM_PROTOTYPE undefined: a slightly more efficient implementation
- in C. Although this version returns a result larger than the
- optimum (which is corrected in smith_modmult) more often than the
- prototype version, the faster execution in C more than makes up
- for the difference.
-
- Parameter: dividend - points to the most significant MULTUNIT
- of the dividend. Note that dividend actually contains the
- one's complement of the actual dividend value (see comments for
- smith_modmult).
-
- Return: the trial quotient digit resulting from dividing the first
- three MULTUNITs at dividend by the upper two MULTUNITs of the
- modulus.
- */
-
- /* #define ASM_PROTOTYPE */ /* undefined: use C-optimized version */
-
- #ifndef mp_quo_digit
- MULTUNIT mp_quo_digit (MULTUNIT *dividend) {
- unsigned long q, q0, q1, q2;
- unsigned short lsb_factor;
-
- /* Compute the least significant product group.
- The last terms of q1 and q2 perform upward rounding, which is
- needed to guarantee that the result not be too small.
- */
- q1 = (dividend[tohigher(-2)] ^ MULTUNIT_mask) * (unsigned long)reciph
- + reciph;
- q2 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)recipl
- + (1L << MULTUNITSIZE) ;
- #ifdef ASM_PROTOTYPE
- lsb_factor = 1 & (q1>>MULTUNITSIZE) & (q2>>MULTUNITSIZE);
- q = q1 + q2;
-
- /* The following statement is equivalent to shifting the sum right
- one bit while shifting in the carry bit.
- */
- q0 = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
- #else /* optimized C version */
- q0 = (q1>>1) + (q2>>1) + 1;
- #endif
-
- /* Compute the middle significant product group. */
-
- q1 = (dividend[tohigher(-1)] ^ MULTUNIT_mask) * (unsigned long)reciph;
- q2 = (dividend[ 0] ^ MULTUNIT_mask) * (unsigned long)recipl;
- #ifdef ASM_PROTOTYPE
- q = q1 + q2;
- q = (q1 > ~q2 ? DMULTUNIT_msb : 0) | (q >> 1);
-
- /* Add in the most significant word of the first group.
- The last term takes care of the carry from adding the lsb's
- that were shifted out prior to addition.
- */
- q = (q0 >> MULTUNITSIZE)+ q + (lsb_factor & (q1 ^ q2));
- #else /* optimized C version */
- q = (q0 >> MULTUNITSIZE)+ (q1>>1) + (q2>>1) + 1;
- #endif
-
- /* Compute the most significant term and add in the others */
-
- q = (q >> (MULTUNITSIZE-2)) +
- (((dividend[0] ^ MULTUNIT_mask) * (unsigned long)reciph) << 1);
- q >>= mshift;
-
- /* Prevent overflow and then wipe out the intermediate results. */
-
- mutemp = (MULTUNIT)min(q, (1L << MULTUNITSIZE) -1);
- q= q0 = q1 = q2 = 0; lsb_factor = 0; (void)lsb_factor;
- return mutemp;
- }
- #endif /* mp_quo_digit */
-
- /* stage_smith_modulus() - Prepare for a Smith modmult.
-
- Calculate the reciprocal of modulus with a precision of two MULTUNITs.
- Assumes that global_precision has already been adjusted to the
- size of the modulus, plus SLOP_BITS.
-
- Note: This routine was designed to work with large values and
- doesn't have the necessary testing or handling to work with a
- modulus having less than three significant units. For such cases,
- the separate multiply and modulus routines can be used.
-
- stage_smith_modulus() is aliased to stage_modulus().
- */
-
- int stage_smith_modulus(unitptr n_modulus)
- {
- int original_precision;
- int sigmod; /* significant units in modulus */
- unitptr mp; /* modulus most significant pointer */
- MULTUNIT *mpm; /* reciprocal pointer */
- int prec; /* precision of reciprocal calc in units */
-
- mp_move(modulus, n_modulus);
- modmpl = (MULTUNIT*) modulus;
- modmpl = lsbptr (modmpl, global_precision * MULTUNITs_perunit);
- nbits = countbits(modulus);
- modlenMULTUNITS = (nbits+ MULTUNITSIZE-1) / MULTUNITSIZE;
-
- original_precision = global_precision;
-
- /* The following code copies the three most significant units of
- * modulus to mod_divisor.
- */
- mp = modulus;
- sigmod = significance (modulus);
- rescale (mp, original_precision, sigmod);
- /* prec is the unit precision required for 3 MULTUNITs */
- prec = (3 +(MULTUNITs_perunit-1)) / MULTUNITs_perunit;
- set_precision (prec);
-
- /* set mp = ptr to most significant units of modulus, then move
- * the most significant units to mp_divisor
- */
- mp = msbptr(mp,sigmod) -tohigher(prec-1);
- unmake_lsbptr (mp, prec);
- mp_move (mod_divisor, mp);
-
- /* Keep 2*MULTUNITSIZE bits in mod_divisor.
- * This will (normally) result in a reciprocal of 2*MULTUNITSIZE+1 bits.
- */
- mshift = countbits (mod_divisor) - 2*MULTUNITSIZE;
- mp_shift_right_bits (mod_divisor, mshift);
- mp_recip(mod_quotient,mod_divisor);
- mp_shift_right_bits (mod_quotient,1);
-
- /* Reduce to: 0 < mshift <= MULTUNITSIZE */
- mshift = ((mshift + (MULTUNITSIZE-1)) % MULTUNITSIZE) +1;
- /* round up, rescaling if necessary */
- mp_inc (mod_quotient);
- if (countbits (mod_quotient) > 2*MULTUNITSIZE) {
- mp_shift_right_bits (mod_quotient,1);
- mshift--; /* now 0 <= mshift <= MULTUNITSIZE */
- }
- mpm = lsbptr ((MULTUNIT*)mod_quotient, prec*MULTUNITs_perunit);
- recipl = *post_higherunit (mpm);
- reciph = *mpm;
- mp_set_recip (reciph, recipl, mshift);
- set_precision (original_precision);
- return(0); /* normal return */
- } /* stage_smith_modulus */
-
- /* Smith's algorithm performs a multiply combined with a modulo operation.
- Computes: prod = (multiplicand*multiplier) mod modulus
- The modulus must first be set by stage_smith_modulus().
- smith_modmult() is aliased to mp_modmult().
- */
-
- int
- smith_modmult(unitptr prod, unitptr multiplicand, unitptr multiplier)
- {
- unitptr d; /* ptr to product */
- MULTUNIT *dmph, *dmpl, *dmp; /* ptrs to dividend (high, low, first)
- * aligned for subtraction */
- /* Note that dmph points one MULTUNIT higher than indicated by
- global precision. This allows us to zero out a word one higher than
- the normal precision.
- */
- short orig_precision;
- short nqd; /* number of quotient digits remaining to
- * be generated */
- short dmi; /* number of significant MULTUNITs in product */
-
- d = ds_data + 1; /* room for leading MSB if HIGHFIRST */
- orig_precision = global_precision;
- mp_dmul(d, multiplicand, multiplier);
-
- rescale(d, orig_precision * 2, orig_precision * 2 + 1);
- set_precision(orig_precision * 2 + 1);
- *msbptr(d, global_precision) = 0; /* leading 0 unit */
-
- /* We now start working with MULTUNITs.
- Determine the most significant MULTUNIT of the product so we don't
- have to process leading zeros in our divide loop.
- */
- dmi = significance(d) * MULTUNITs_perunit;
- if (dmi >= modlenMULTUNITS)
- { /* Make dividend negative. This allows the use of mp_smul to
- * "subtract" the product of the modulus and the trial divisor
- * by actually adding to a negative dividend.
- * The one's complement of the dividend is used, since it causes
- * a zero value to be represented as all ones. This facilitates
- * testing the result for possible overflow, since a sign bit
- * indicates that no adjustment is necessary, and we should not
- * attempt to adjust if the result of the addition is zero.
- */
- mp_inc(d);
- mp_neg(d);
- set_precision(orig_precision);
- munit_prec = global_precision * UNITSIZE / MULTUNITSIZE;
-
- /* Set msb, lsb, and normal ptrs of dividend */
- dmph = lsbptr((MULTUNIT *) d, (orig_precision * 2 + 1) *
- MULTUNITs_perunit) + tohigher(dmi);
- nqd = dmi + 1 - modlenMULTUNITS;
- dmpl = dmph - tohigher(modlenMULTUNITS);
-
- /* Divide loop.
- Each iteration computes the next quotient MULTUNIT digit, then
- multiplies the divisor (modulus) by the quotient digit and adds
- it to the one's complement of the dividend (equivalent to
- subtracting). If the product was greater than the remaining dividend,
- we get a non-negative result, in which case we subtract off the
- modulus to get the proper negative remainder.
- */
- for (; nqd; nqd--)
- { MULTUNIT q; /* quotient trial digit */
-
- q = mp_quo_digit(dmph);
- if (q > 0)
- { mp_smul(dmpl, modmpl, q);
-
- /* Perform correction if q too large.
- * This rarely occurs.
- */
- if (!(*dmph & MULTUNIT_msb))
- { dmp = dmpl;
- unmake_lsbptr(dmp, orig_precision *
- MULTUNITs_perunit);
- if (mp_musubb(dmp,
- (MULTUNIT *) modulus, 0))
- (*dmph) --;
- }
- }
- pre_lowerunit(dmph);
- pre_lowerunit(dmpl);
- }
- /* d contains the one's complement of the remainder. */
- rescale(d, orig_precision * 2 + 1, orig_precision);
- set_precision(orig_precision);
- mp_neg(d);
- mp_dec(d);
- } else
- { /* Product was less than modulus. Return it. */
- rescale(d, orig_precision * 2 + 1, orig_precision);
- set_precision(orig_precision);
- }
- mp_move(prod, d);
- return (0); /* normal return */
- } /* smith_modmult */
-
-
- /* Smith's mp_modmult function leaves some internal arrays in memory,
- so we have to call modmult_burn() at the end of mp_modexp.
- This is so that no cryptographically sensitive data is left in memory
- after the program exits.
- smith_burn() is aliased to modmult_burn().
- */
- void smith_burn(void)
- {
- empty_array (modulus);
- empty_array (ds_data);
- empty_array (mod_quotient);
- empty_array (mod_divisor);
- modmpl = 0;
- mshift =nbits = 0;
- reciph = recipl = 0;
- modlenMULTUNITS = mutemp = 0;
- mp_set_recip (0,0,0);
- } /* smith_burn */
-
- /* End of Thad Smith's implementation of modmult. */
-
- #endif /* SMITH */
-
-
- int countbits(unitptr r)
- /* Returns number of significant bits in r */
- { int bits;
- short prec;
- register unit bitmask;
- init_bitsniffer(r,bitmask,prec,bits);
- return(bits);
- } /* countbits */
-
-
- char *copyright_notice(void)
- /* force linker to include copyright notice in the executable object image. */
- { return ("(c)1986 Philip Zimmermann"); } /* copyright_notice */
-
-
- int mp_modexp(register unitptr expout,register unitptr expin,
- register unitptr exponent,register unitptr modulus)
- { /* Russian peasant combined exponentiation/modulo algorithm.
- Calls modmult instead of mult.
- Computes: expout = (expin**exponent) mod modulus
- WARNING: All the arguments must be less than the modulus!
- */
- int bits;
- short oldprecision;
- register unit bitmask;
- unit product[MAX_UNIT_PRECISION];
- short eprec;
-
- #ifdef COUNTMULTS
- tally_modmults = 0; /* clear "number of modmults" counter */
- tally_modsquares = 0; /* clear "number of modsquares" counter */
- #endif /* COUNTMULTS */
- mp_init(expout,1);
- if (testeq(exponent,0))
- { if (testeq(expin,0))
- return(-1); /* 0 to the 0th power means return error */
- return(0); /* otherwise, zero exponent means expout is 1 */
- }
- if (testeq(modulus,0))
- return(-2); /* zero modulus means error */
- #if SLOP_BITS > 0 /* if there's room for sign bits */
- if (mp_tstminus(modulus))
- return(-2); /* negative modulus means error */
- #endif /* SLOP_BITS > 0 */
- if (mp_compare(expin,modulus) >= 0)
- return(-3); /* if expin >= modulus, return error */
- if (mp_compare(exponent,modulus) >= 0)
- return(-4); /* if exponent >= modulus, return error */
-
- oldprecision = global_precision; /* save global_precision */
- /* set smallest optimum precision for this modulus */
- set_precision(bits2units(countbits(modulus)+SLOP_BITS));
- /* rescale all these registers to global_precision we just defined */
- rescale(modulus,oldprecision,global_precision);
- rescale(expin,oldprecision,global_precision);
- rescale(exponent,oldprecision,global_precision);
- rescale(expout,oldprecision,global_precision);
-
- if (stage_modulus(modulus))
- { set_precision(oldprecision); /* restore original precision */
- return(-5); /* unstageable modulus (STEWART algorithm) */
- }
-
- /* normalize and compute number of bits in exponent first */
- init_bitsniffer(exponent,bitmask,eprec,bits);
-
- /* We can "optimize out" the first modsquare and modmult: */
- bits--; /* We know for sure at this point that bits>0 */
- mp_move(expout,expin); /* expout = (1*1)*expin; */
- bump_bitsniffer(exponent,bitmask);
-
- while (bits--)
- {
- poll_for_break(); /* polls keyboard, allows ctrl-C to abort program */
- #ifdef COUNTMULTS
- tally_modsquares++; /* bump "number of modsquares" counter */
- #endif /* COUNTMULTS */
- mp_modsquare(product,expout);
- if (sniff_bit(exponent,bitmask))
- { mp_modmult(expout,product,expin);
- #ifdef COUNTMULTS
- tally_modmults++; /* bump "number of modmults" counter */
- #endif /* COUNTMULTS */
- } else
- {
- mp_move(expout,product);
- }
- bump_bitsniffer(exponent,bitmask);
- } /* while bits-- */
- mp_burn(product); /* burn the evidence on the stack */
- modmult_burn(); /* ask mp_modmult to also burn its own evidence */
-
- #ifdef COUNTMULTS /* diagnostic analysis */
- { long atomic_mults;
- unsigned int unitcount,totalmults;
- unitcount = bits2units(countbits(modulus));
- /* calculation assumes modsquare takes as long as a modmult: */
- atomic_mults = (long) tally_modmults * (unitcount * unitcount);
- atomic_mults += (long) tally_modsquares * (unitcount * unitcount);
- printf("%ld atomic mults for ",atomic_mults);
- printf("%d+%d = %d modsqr+modmlt, at %d bits, %d words.\n",
- tally_modsquares,tally_modmults,
- tally_modsquares+tally_modmults,
- countbits(modulus), unitcount);
- }
- #endif /* COUNTMULTS */
-
- set_precision(oldprecision); /* restore original precision */
-
- /* Do an explicit reference to the copyright notice so that the linker
- will be forced to include it in the executable object image... */
- copyright_notice(); /* has no real effect at run time */
-
- return(0); /* normal return */
- } /* mp_modexp */
-
- int mp_modexp_crt(unitptr expout, unitptr expin,
- unitptr p, unitptr q, unitptr ep, unitptr eq, unitptr u)
- /* This is a faster modexp for moduli with a known
- factorisation into two relatively prime factors p and q,
- and an input relatively prime to the modulus,
- the Chinese Remainder Theorem to do the computation
- mod p and mod q, and then combine the results. This
- relies on a number of precomputed values, but does not
- actually require the modulus n or the exponent e.
-
- expout = expin ^ e mod (p*q).
- We form this by evaluating
- p2 = (expin ^ e) mod p and
- q2 = (expin ^ e) mod q
- and then combining the two by the CRT.
-
- Two optimisations of this are possible. First, we can
- reduce expin modulo p and q before starting.
-
- Second, since we know the factorisation of p and q
- (trivially derived from the factorisation of n = p*q),
- and expin is relatively prime to both p and q,
- we can use Euler's theorem, expin^phi(m) = 1 (mod m),
- to throw away multiples of phi(p) or phi(q) in e.
- Letting ep = e mod phi(p) and
- eq = e mod phi(q)
- then combining these two speedups, we only need to evaluate
- p2 = ((expin mod p) ^ ep) mod p and
- q2 = ((expin mod q) ^ eq) mod q.
-
- Now we need to apply the CRT. Starting with
- expout = p2 (mod p) and
- expout = q2 (mod q)
- we can say that expout = p2 + p * k, and if we assume that
- 0 <= p2 < p, then 0 <= expout < p*q for some 0 <= k < q.
- Since we want expout = q2 (mod q), then
- p*k = q2-p2 (mod q). Since p and q are relatively
- prime, p has a multiplicative inverse u mod q. In other
- words,
- u = 1/p (mod q).
- Multiplying by u on both sides gives k = u*(q2-p2) (mod q).
- Since we want 0 <= k < q, we can thus find k as
- k = (u * (q2-p2)) mod q.
-
- Once we have k, evaluating p2 + p * k is easy, and
- that gives us the result.
-
- In the detailed implementation, there is a temporary, temp,
- used to hold intermediate results, p2 is held in expout,
- and q2 is used as a temporary in the final step when it is
- no longer needed. With that, you should be able to
- understand the code below.
- */
- {
- unit q2[MAX_UNIT_PRECISION];
- unit temp[MAX_UNIT_PRECISION];
- int status;
-
- /* First, compiute p2 (physically held in M) */
-
- /* p2 = [ (expin mod p) ^ ep ] mod p */
- mp_mod(temp,expin,p); /* temp = expin mod p */
- status = mp_modexp(expout,temp,ep,p);
- if (status < 0) /* mp_modexp returned an error. */
- { mp_init(expout,1);
- return(status); /* error return */
- }
-
- /* And the same thing for q2 */
-
- /* q2 = [ (expin mod q) ^ eq ] mod q */
- mp_mod(temp,expin,q); /* temp = expin mod q */
- status = mp_modexp(q2,temp,eq,q);
- if (status < 0) /* mp_modexp returned an error. */
- { mp_init(expout,1);
- return(status); /* error return */
- }
-
- /* Now use the multiplicative inverse u to glue together the
- two halves.
- */
- #if 0
- /* This optimisation is useful if you commonly get small results,
- but for random results and large q, the odds of (1/q) of it
- being useful do not warrant the test.
- */
- if (mp_compare(expout,q2) != 0)
- {
- #endif
- /* Find q2-p2 mod q */
- if (mp_sub(q2,expout)) /* if the result went negative */
- mp_add(q2,q); /* add q to q2 */
-
- /* expout = p2 + ( p * [(q2*u) mod q] ) */
- mp_mult(temp,q2,u); /* q2*u */
- mp_mod(q2,temp,q); /* (q2*u) mod q */
- mp_mult(temp,p,q2); /* p * [(q2*u) mod q] */
- mp_add(expout,temp); /* expout = p2 + p * [...] */
- #if 0
- }
- #endif
-
- mp_burn(q2); /* burn the evidence on the stack...*/
- mp_burn(temp);
- return(0); /* normal return */
- } /* mp_modexp_crt */
-
-
- /****************** end of MPI library ****************************/
-