home *** CD-ROM | disk | FTP | other *** search
- *
- * $VER: tan.s 33.1 (22.1.97)
- *
- * Calculates the tangent or the cotangent of the source operand
- *
- * Version history:
- *
- * 33.1 22.1.97 (c) Motorola
- *
- * - snipped from M68060SP
- *
- * 33.2 23.1.97 laukkanen
- *
- * - added cot(), well that was the second solution I thought
- * after 1/tan() which didn't seem very fast but I'm no
- * "mathemagician" so if anybody wants to object, please do.
- *
- * 33.3 24.1.97 laukkanen
- *
- * - the reduce part of tan() was broken, fixed.
- *
-
- machine 68040
- fpu 1
-
- XDEF _tan
- XDEF @tan
- XDEF _cot
- XDEF @cot
-
- XREF PITBL
-
- *************************************************************************
- * tan(): computes the tangent of a normalized input *
- * cot(): computes the cotangent of a normalized input *
- * *
- * INPUT *************************************************************** *
- * fp0 = pointer to extended precision input *
- * *
- * OUTPUT ************************************************************** *
- * fp0 = tan(X) or fp0 = cot(X) *
- * *
- * ACCURACY and MONOTONICITY ******************************************* *
- * The returned result is within 3 ulp in 64 significant bit, i.e. *
- * within 0.5001 ulp to 53 bits if the result is subsequently *
- * rounded to double precision. The result is provably monotonic *
- * in double precision. *
- * *
- * ALGORITHM *********************************************************** *
- * *
- * 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6. *
- * *
- * 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let *
- * k = N mod 2, so in particular, k = 0 or 1. *
- * *
- * 3. If k is odd, go to 5. *
- * *
- * 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a *
- * rational function U/V where *
- * U = r + r*s*(P1 + s*(P2 + s*P3)), and *
- * V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r. *
- * Exit. *
- * *
- * 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by *
- * a rational function U/V where *
- * U = r + r*s*(P1 + s*(P2 + s*P3)), and *
- * V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r, *
- * -Cot(r) = -V/U. Exit. *
- * *
- * 6. If |X| > 1, go to 8. *
- * *
- * 7. (|X|<2**(-40)) Tan(X) = X. Exit. *
- * *
- * 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back *
- * to 2. *
- * *
- *************************************************************************
-
- TANQ4 dc.l $3EA0B759,$F50F8688
- TANP3 dc.l $BEF2BAA5,$A8924F04
- TANQ3 dc.l $BF346F59,$B39BA65F,$00000000,$00000000
- TANP2 dc.l $3FF60000,$E073D3FC,$199C4A00,$00000000
- TANQ2 dc.l $3FF90000,$D23CD684,$15D95FA1,$00000000
- TANP1 dc.l $BFFC0000,$8895A6C5,$FB423BCA,$00000000
- TANQ1 dc.l $BFFD0000,$EEF57E0D,$A84BC8CE,$00000000
- INVTWOPI dc.l $3FFC0000,$A2F9836E,$4E44152A,$00000000
- TWOPI1 dc.l $40010000,$C90FDAA2,$00000000,$00000000
- TWOPI2 dc.l $3FDF0000,$85A308D4,$00000000,$00000000
- TWOBYPI dc.l $3FE45F30,$6DC9C883
- PID2 dc.d 1.57079632679489661923
-
- _tan
- fmove.d (4,sp),fp0
- @tan
- fmove.x fp0,-(sp)
- move.l (sp),d1
- move.w (4,sp),d1
- and.l #$7FFFFFFF,d1
- lea (12,sp),sp
-
- cmp.l #$3FD78000,d1 ; |X| >= 2**(-40)?
- bge.b .TANOK1
- bra.w .TANSM
- .TANOK1
- cmp.l #$4004BC7E,d1 ; |X| < 15 PI?
- blt.b .TANMAIN
- bra.w .REDUCEX
-
- .TANMAIN
- ;--THIS IS THE USUAL CASE, |X| <= 15 PI.
- ;--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
- fmove.x fp0,fp1
- fmul.d (TWOBYPI,pc),fp1 ; X*2/PI
-
- lea (PITBL+$200,pc),a1 ; TABLE OF N*PI/2, N = -32,...,32
-
- fmove.l fp1,d1 ; CONVERT TO INTEGER
-
- asl.l #4,d1
- add.l d1,a1 ; ADDRESS N*PIBY2 IN Y1, Y2
-
- fsub.x (a1)+,fp0 ; X-Y1
-
- fsub.s (a1),fp0 ; FP0 IS R = (X-Y1)-Y2
-
- ror.l #5,d1
- and.l #$80000000,d1 ; D0 WAS ODD IFF D0 < 0
-
- .TANCONT
- fmovem.x fp2/fp3,-(sp) ; save fp2,fp3
-
- tst.l d1
- blt.w .NODD
-
- fmove.x fp0,fp1
- fmul.x fp1,fp1 ; S = R*R
-
- fmove.d (TANQ4,pc),fp3
- fmove.d (TANP3,pc),fp2
- fmul.x fp1,fp3 ; SQ4
- fmul.x fp1,fp2 ; SP3
-
- fadd.d (TANQ3,pc),fp3 ; Q3+SQ4
- fadd.x (TANP2,pc),fp2 ; P2+SP3
-
- fmul.x fp1,fp3 ; S(Q3+SQ4)
- fmul.x fp1,fp2 ; S(P2+SP3)
-
- fadd.x (TANQ2,pc),fp3 ; Q2+S(Q3+SQ4)
- fadd.x (TANP1,pc),fp2 ; P1+S(P2+SP3)
-
- fmul.x fp1,fp3 ; S(Q2+S(Q3+SQ4))
- fmul.x fp1,fp2 ; S(P1+S(P2+SP3))
-
- fadd.x (TANQ1,pc),fp3 ; Q1+S(Q2+S(Q3+SQ4))
- fmul.x fp0,fp2 ; RS(P1+S(P2+SP3))
-
- fmul.x fp3,fp1 ; S(Q1+S(Q2+S(Q3+SQ4)))
-
- fadd.x fp2,fp0 ; R+RS(P1+S(P2+SP3))
-
- fadd.s #$3F800000,fp1 ; 1+S(Q1+...)
-
- fmovem.x (sp)+,fp2/fp3 ; restore fp2,fp3
-
- fdiv.x fp1,fp0 ; last inst - possible exception set
- rts
-
- .NODD
- fmove.x fp0,fp1
- fmul.x fp0,fp0 ; S = R*R
-
- fmove.d (TANQ4,pc),fp3
- fmove.d (TANP3,pc),fp2
-
- fmul.x fp0,fp3 ; SQ4
- fmul.x fp0,fp2 ; SP3
-
- fadd.d (TANQ3,pc),fp3 ; Q3+SQ4
- fadd.x (TANP2,pc),fp2 ; P2+SP3
-
- fmul.x fp0,fp3 ; S(Q3+SQ4)
- fmul.x fp0,fp2 ; S(P2+SP3)
-
- fadd.x (TANQ2,pc),fp3 ; Q2+S(Q3+SQ4)
- fadd.x (TANP1,pc),fp2 ; P1+S(P2+SP3)
- fmul.x fp0,fp3 ; S(Q2+S(Q3+SQ4))
- fmul.x fp0,fp2 ; S(P1+S(P2+SP3))
-
- fadd.x (TANQ1,pc),fp3 ; Q1+S(Q2+S(Q3+SQ4))
- fmul.x fp1,fp2 ; RS(P1+S(P2+SP3))
-
- fmul.x fp3,fp0 ; S(Q1+S(Q2+S(Q3+SQ4)))
-
- fadd.x fp2,fp1 ; R+RS(P1+S(P2+SP3))
- fadd.s #$3F800000,fp0 ; 1+S(Q1+...)
-
- fmovem.x (sp)+,fp2/fp3 ; restore fp2,fp3
-
- fmove.x fp1,-(sp)
- eor.l #$80000000,(sp)
-
- fdiv.x (sp)+,fp0 ; last inst - possible exception set
- rts
-
- .TANSM
- rts
-
- FP_SCR0 EQU -12
- FP_SCR0_EX EQU -12
- FP_SCR0_HI EQU -8
- FP_SCR0_LO EQU -4
- FP_SCR1 EQU -24
- FP_SCR1_EX EQU -24
- FP_SCR1_HI EQU -20
- FP_SCR1_LO EQU -16
- INARG EQU -12
- ENDFLAG EQU -28
- TWOTO63 EQU -32
- INT EQU -28
-
- REDUCE_SIZE EQU 32
-
- ;--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
- ;--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
- ;--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
- .REDUCEX
- fmovem.x fp2-fp5,-(sp) ; save {fp2-fp5}
- move.l d2,-(sp) ; save d2
- link a0,#-REDUCE_SIZE
- fmove.s #$00000000,fp1 ; fp1 = 0
-
- ;--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
- ;--there is a danger of unwanted overflow in first LOOP iteration. In this
- ;--case, reduce argument by one remainder step to make subsequent reduction
- ;--safe.
- cmp.l #$7ffeffff,d1 ; is arg dangerously large?
- bne.b .LOOP ; no
-
- ; yes; create 2**16383*PI/2
- move.w #$7ffe,(FP_SCR0_EX,a0)
- move.l #$c90fdaa2,(FP_SCR0_HI,a0)
- clr.l (FP_SCR0_LO,a0)
-
- ; create low half of 2**16383*PI/2 at FP_SCR1
- move.w #$7fdc,(FP_SCR1_EX,a0)
- move.l #$85a308d3,(FP_SCR1_HI,a0)
- clr.l (FP_SCR1_LO,a0)
-
- fcmp.s #0,fp0 ; test sign of argument
- fblt.w .red_neg
-
- or.b #$80,(FP_SCR0_EX,a0) ; positive arg
- or.b #$80,(FP_SCR1_EX,a0)
- .red_neg
- fadd.x (FP_SCR0,a0),fp0 ; high part of reduction is exact
- fmove.x fp0,fp1 ; save high result in fp1
- fadd.x (FP_SCR1,a0),fp0 ; low part of reduction
- fsub.x fp0,fp1 ; determine low component of result
- fadd.x (FP_SCR1,a0),fp1 ; fp0/fp1 are reduced argument.
-
- ;--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
- ;--integer quotient will be stored in N
- ;--Intermeditate remainder is 66-bit dc.l; (R,r) in (FP0,FP1)
- .LOOP
- fmove.x fp0,(INARG,a0) ; +-2**K * F, 1 <= F < 2
- move.w (INARG,a0),d1
- move.l d1,a1 ; save a copy of D0
- and.l #$00007FFF,d1
- sub.l #$00003FFF,d1 ; d0 = K
- cmp.l #28,d1
- ble.b .LASTLOOP
- .CONTLOOP
- sub.l #27,d1 ; d0 = L := K-27
- clr.b (ENDFLAG,a0)
- bra.b .WORK
- .LASTLOOP
- clr.l d1 ; d0 = L := 0
- move.b #1,(ENDFLAG,a0)
-
- .WORK
- ;--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN
- ;--THAT INT( X * (2/PI) / 2**(L) ) < 2**29.
-
- ;--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
- ;--2**L * (PIby2_1), 2**L * (PIby2_2)
-
- move.l #$00003FFE,d2 ; BIASED EXP OF 2/PI
- sub.l d1,d2 ; BIASED EXP OF 2**(-L)*(2/PI)
-
- move.l #$A2F9836E,(FP_SCR0_HI,a0)
- move.l #$4E44152A,(FP_SCR0_LO,a0)
- move.w d2,(FP_SCR0_EX,a0) ; FP_SCR0 = 2**(-L)*(2/PI)
-
- fmove.x fp0,fp2
- fmul.x (FP_SCR0,a0),fp2 ; fp2 = X * 2**(-L)*(2/PI)
-
- ;--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
- ;--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N
- ;--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
- ;--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE
- ;--US THE DESIRED VALUE IN FLOATING POINT.
- move.l a1,d2
- swap d2
- and.l #$80000000,d2
- or.l #$5F000000,d2 ; d2 = SIGN(INARG)*2**63 IN SGL
- move.l d2,(TWOTO63,a0)
- fadd.s (TWOTO63,a0),fp2 ; THE FRACTIONAL PART OF FP1 IS ROUNDED
- fsub.s (TWOTO63,a0),fp2 ; fp2 = N
- ; fintrz.x fp2,fp2
-
- ;--CREATING 2**(L)*Piby2_1 and 2**(L)*Piby2_2
- move.l d1,d2 ; d2 = L
-
- add.l #$00003FFF,d2 ; BIASED EXP OF 2**L * (PI/2)
- move.w d2,(FP_SCR0_EX,a0)
- move.l #$C90FDAA2,(FP_SCR0_HI,a0)
- clr.l (FP_SCR0_LO,a0) ; FP_SCR0 = 2**(L) * Piby2_1
-
- add.l #$00003FDD,d1
- move.w d1,(FP_SCR1_EX,a0)
- move.l #$85A308D3,(FP_SCR1_HI,a0)
- clr.l (FP_SCR1_LO,a0) ; FP_SCR1 = 2**(L) * Piby2_2
-
- move.b (ENDFLAG,a0),d1
-
- ;--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
- ;--P2 = 2**(L) * Piby2_2
- fmove.x fp2,fp4 ; fp4 = N
- fmul.x (FP_SCR0,a0),fp4 ; fp4 = W = N*P1
- fmove.x fp2,fp5 ; fp5 = N
- fmul.x (FP_SCR1,a0),fp5 ; fp5 = w = N*P2
- fmove.x fp4,fp3 ; fp3 = W = N*P1
-
- ;--we want P+p = W+w but |p| <= half ulp of P
- ;--Then, we need to compute A := R-P and a := r-p
- fadd.x fp5,fp3 ; fp3 = P
- fsub.x fp3,fp4 ; fp4 = W-P
-
- fsub.x fp3,fp0 ; fp0 = A := R - P
- fadd.x fp5,fp4 ; fp4 = p = (W-P)+w
-
- fmove.x fp0,fp3 ; fp3 = A
- fsub.x fp4,fp1 ; fp1 = a := r - p
-
- ;--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but
- ;--|r| <= half ulp of R.
- fadd.x fp1,fp0 ; fp0 = R := A+a
- ;--No need to calculate r if this is the last loop
- tst.b d1
- bgt.w .RESTORE
-
- ;--Need to calculate r
- fsub.x fp0,fp3 ; fp3 = A-R
- fadd.x fp3,fp1 ; fp1 = r := (A-R)+a
- bra.w .LOOP
-
- .RESTORE
- fmove.l fp2,(INT,a0)
- move.l (INT,a0),d1
- unlk a0
- move.l (sp)+,d2 ; restore d2
- fmovem.x (sp)+,fp2-fp5 ; restore {fp2-fp5}
- ror.l #1,d1
- bra.w .TANCONT
-
- _cot
- fmove.d (4,sp),fp0
- @cot
- fadd.d (PID2,pc),fp0
- bsr.w @tan
- fneg.x fp0
- rts
-