1 | 2 | stan.sa 3.3 7/29/91 3 | 4 | The entry point stan computes the tangent of 5 | an input argument; 6 | stand does the same except for denormalized input. 7 | 8 | Input: Double-extended number X in location pointed to 9 | by address register a0. 10 | 11 | Output: The value tan(X) returned in floating-point register Fp0. 12 | 13 | Accuracy and Monotonicity: The returned result is within 3 ulp in 14 | 64 significant bit, i.e. within 0.5001 ulp to 53 bits if the 15 | result is subsequently rounded to double precision. The 16 | result is provably monotonic in double precision. 17 | 18 | Speed: The program sTAN takes approximately 170 cycles for 19 | input argument X such that |X| < 15Pi, which is the the usual 20 | situation. 21 | 22 | Algorithm: 23 | 24 | 1. If |X| >= 15Pi or |X| < 2**(-40), go to 6. 25 | 26 | 2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let 27 | k = N mod 2, so in particular, k = 0 or 1. 28 | 29 | 3. If k is odd, go to 5. 30 | 31 | 4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a 32 | rational function U/V where 33 | U = r + r*s*(P1 + s*(P2 + s*P3)), and 34 | V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r. 35 | Exit. 36 | 37 | 4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a 38 | rational function U/V where 39 | U = r + r*s*(P1 + s*(P2 + s*P3)), and 40 | V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r, 41 | -Cot(r) = -V/U. Exit. 42 | 43 | 6. If |X| > 1, go to 8. 44 | 45 | 7. (|X|<2**(-40)) Tan(X) = X. Exit. 46 | 47 | 8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2. 48 | 49 50 | Copyright (C) Motorola, Inc. 1990 51 | All Rights Reserved 52 | 53 | THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA 54 | The copyright notice above does not evidence any 55 | actual or intended publication of such source code. 56 57 |STAN idnt 2,1 | Motorola 040 Floating Point Software Package 58 59 |section 8 60 61 .include "fpsp.h" 62 63 BOUNDS1: .long 0x3FD78000,0x4004BC7E 64 TWOBYPI: .long 0x3FE45F30,0x6DC9C883 65 66 TANQ4: .long 0x3EA0B759,0xF50F8688 67 TANP3: .long 0xBEF2BAA5,0xA8924F04 68 69 TANQ3: .long 0xBF346F59,0xB39BA65F,0x00000000,0x00000000 70 71 TANP2: .long 0x3FF60000,0xE073D3FC,0x199C4A00,0x00000000 72 73 TANQ2: .long 0x3FF90000,0xD23CD684,0x15D95FA1,0x00000000 74 75 TANP1: .long 0xBFFC0000,0x8895A6C5,0xFB423BCA,0x00000000 76 77 TANQ1: .long 0xBFFD0000,0xEEF57E0D,0xA84BC8CE,0x00000000 78 79 INVTWOPI: .long 0x3FFC0000,0xA2F9836E,0x4E44152A,0x00000000 80 81 TWOPI1: .long 0x40010000,0xC90FDAA2,0x00000000,0x00000000 82 TWOPI2: .long 0x3FDF0000,0x85A308D4,0x00000000,0x00000000 83 84 |--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING 85 |--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT 86 |--MOST 69 BITS LONG. 87 .global PITBL 88 PITBL: 89 .long 0xC0040000,0xC90FDAA2,0x2168C235,0x21800000 90 .long 0xC0040000,0xC2C75BCD,0x105D7C23,0xA0D00000 91 .long 0xC0040000,0xBC7EDCF7,0xFF523611,0xA1E80000 92 .long 0xC0040000,0xB6365E22,0xEE46F000,0x21480000 93 .long 0xC0040000,0xAFEDDF4D,0xDD3BA9EE,0xA1200000 94 .long 0xC0040000,0xA9A56078,0xCC3063DD,0x21FC0000 95 .long 0xC0040000,0xA35CE1A3,0xBB251DCB,0x21100000 96 .long 0xC0040000,0x9D1462CE,0xAA19D7B9,0xA1580000 97 .long 0xC0040000,0x96CBE3F9,0x990E91A8,0x21E00000 98 .long 0xC0040000,0x90836524,0x88034B96,0x20B00000 99 .long 0xC0040000,0x8A3AE64F,0x76F80584,0xA1880000 100 .long 0xC0040000,0x83F2677A,0x65ECBF73,0x21C40000 101 .long 0xC0030000,0xFB53D14A,0xA9C2F2C2,0x20000000 102 .long 0xC0030000,0xEEC2D3A0,0x87AC669F,0x21380000 103 .long 0xC0030000,0xE231D5F6,0x6595DA7B,0xA1300000 104 .long 0xC0030000,0xD5A0D84C,0x437F4E58,0x9FC00000 105 .long 0xC0030000,0xC90FDAA2,0x2168C235,0x21000000 106 .long 0xC0030000,0xBC7EDCF7,0xFF523611,0xA1680000 107 .long 0xC0030000,0xAFEDDF4D,0xDD3BA9EE,0xA0A00000 108 .long 0xC0030000,0xA35CE1A3,0xBB251DCB,0x20900000 109 .long 0xC0030000,0x96CBE3F9,0x990E91A8,0x21600000 110 .long 0xC0030000,0x8A3AE64F,0x76F80584,0xA1080000 111 .long 0xC0020000,0xFB53D14A,0xA9C2F2C2,0x1F800000 112 .long 0xC0020000,0xE231D5F6,0x6595DA7B,0xA0B00000 113 .long 0xC0020000,0xC90FDAA2,0x2168C235,0x20800000 114 .long 0xC0020000,0xAFEDDF4D,0xDD3BA9EE,0xA0200000 115 .long 0xC0020000,0x96CBE3F9,0x990E91A8,0x20E00000 116 .long 0xC0010000,0xFB53D14A,0xA9C2F2C2,0x1F000000 117 .long 0xC0010000,0xC90FDAA2,0x2168C235,0x20000000 118 .long 0xC0010000,0x96CBE3F9,0x990E91A8,0x20600000 119 .long 0xC0000000,0xC90FDAA2,0x2168C235,0x1F800000 120 .long 0xBFFF0000,0xC90FDAA2,0x2168C235,0x1F000000 121 .long 0x00000000,0x00000000,0x00000000,0x00000000 122 .long 0x3FFF0000,0xC90FDAA2,0x2168C235,0x9F000000 123 .long 0x40000000,0xC90FDAA2,0x2168C235,0x9F800000 124 .long 0x40010000,0x96CBE3F9,0x990E91A8,0xA0600000 125 .long 0x40010000,0xC90FDAA2,0x2168C235,0xA0000000 126 .long 0x40010000,0xFB53D14A,0xA9C2F2C2,0x9F000000 127 .long 0x40020000,0x96CBE3F9,0x990E91A8,0xA0E00000 128 .long 0x40020000,0xAFEDDF4D,0xDD3BA9EE,0x20200000 129 .long 0x40020000,0xC90FDAA2,0x2168C235,0xA0800000 130 .long 0x40020000,0xE231D5F6,0x6595DA7B,0x20B00000 131 .long 0x40020000,0xFB53D14A,0xA9C2F2C2,0x9F800000 132 .long 0x40030000,0x8A3AE64F,0x76F80584,0x21080000 133 .long 0x40030000,0x96CBE3F9,0x990E91A8,0xA1600000 134 .long 0x40030000,0xA35CE1A3,0xBB251DCB,0xA0900000 135 .long 0x40030000,0xAFEDDF4D,0xDD3BA9EE,0x20A00000 136 .long 0x40030000,0xBC7EDCF7,0xFF523611,0x21680000 137 .long 0x40030000,0xC90FDAA2,0x2168C235,0xA1000000 138 .long 0x40030000,0xD5A0D84C,0x437F4E58,0x1FC00000 139 .long 0x40030000,0xE231D5F6,0x6595DA7B,0x21300000 140 .long 0x40030000,0xEEC2D3A0,0x87AC669F,0xA1380000 141 .long 0x40030000,0xFB53D14A,0xA9C2F2C2,0xA0000000 142 .long 0x40040000,0x83F2677A,0x65ECBF73,0xA1C40000 143 .long 0x40040000,0x8A3AE64F,0x76F80584,0x21880000 144 .long 0x40040000,0x90836524,0x88034B96,0xA0B00000 145 .long 0x40040000,0x96CBE3F9,0x990E91A8,0xA1E00000 146 .long 0x40040000,0x9D1462CE,0xAA19D7B9,0x21580000 147 .long 0x40040000,0xA35CE1A3,0xBB251DCB,0xA1100000 148 .long 0x40040000,0xA9A56078,0xCC3063DD,0xA1FC0000 149 .long 0x40040000,0xAFEDDF4D,0xDD3BA9EE,0x21200000 150 .long 0x40040000,0xB6365E22,0xEE46F000,0xA1480000 151 .long 0x40040000,0xBC7EDCF7,0xFF523611,0x21E80000 152 .long 0x40040000,0xC2C75BCD,0x105D7C23,0x20D00000 153 .long 0x40040000,0xC90FDAA2,0x2168C235,0xA1800000 154 155 .set INARG,FP_SCR4 156 157 .set TWOTO63,L_SCR1 158 .set ENDFLAG,L_SCR2 159 .set N,L_SCR3 160 161 | xref t_frcinx 162 |xref t_extdnrm 163 164 .global stand 165 stand: 166 |--TAN(X) = X FOR DENORMALIZED X 167 168 bra t_extdnrm 169 170 .global stan 171 stan: 172 fmovex (%a0),%fp0 | ...LOAD INPUT 173 174 movel (%a0),%d0 175 movew 4(%a0),%d0 176 andil #0x7FFFFFFF,%d0 177 178 cmpil #0x3FD78000,%d0 | ...|X| >= 2**(-40)? 179 bges TANOK1 180 bra TANSM 181 TANOK1: 182 cmpil #0x4004BC7E,%d0 | ...|X| < 15 PI? 183 blts TANMAIN 184 bra REDUCEX 185 186 187 TANMAIN: 188 |--THIS IS THE USUAL CASE, |X| <= 15 PI. 189 |--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP. 190 fmovex %fp0,%fp1 191 fmuld TWOBYPI,%fp1 | ...X*2/PI 192 193 |--HIDE THE NEXT TWO INSTRUCTIONS 194 leal PITBL+0x200,%a1 | ...TABLE OF N*PI/2, N = -32,...,32 195 196 |--FP1 IS NOW READY 197 fmovel %fp1,%d0 | ...CONVERT TO INTEGER 198 199 asll #4,%d0 200 addal %d0,%a1 | ...ADDRESS N*PIBY2 IN Y1, Y2 201 202 fsubx (%a1)+,%fp0 | ...X-Y1 203 |--HIDE THE NEXT ONE 204 205 fsubs (%a1),%fp0 | ...FP0 IS R = (X-Y1)-Y2 206 207 rorl #5,%d0 208 andil #0x80000000,%d0 | ...D0 WAS ODD IFF D0 < 0 209 210 TANCONT: 211 212 cmpil #0,%d0 213 blt NODD 214 215 fmovex %fp0,%fp1 216 fmulx %fp1,%fp1 | ...S = R*R 217 218 fmoved TANQ4,%fp3 219 fmoved TANP3,%fp2 220 221 fmulx %fp1,%fp3 | ...SQ4 222 fmulx %fp1,%fp2 | ...SP3 223 224 faddd TANQ3,%fp3 | ...Q3+SQ4 225 faddx TANP2,%fp2 | ...P2+SP3 226 227 fmulx %fp1,%fp3 | ...S(Q3+SQ4) 228 fmulx %fp1,%fp2 | ...S(P2+SP3) 229 230 faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4) 231 faddx TANP1,%fp2 | ...P1+S(P2+SP3) 232 233 fmulx %fp1,%fp3 | ...S(Q2+S(Q3+SQ4)) 234 fmulx %fp1,%fp2 | ...S(P1+S(P2+SP3)) 235 236 faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4)) 237 fmulx %fp0,%fp2 | ...RS(P1+S(P2+SP3)) 238 239 fmulx %fp3,%fp1 | ...S(Q1+S(Q2+S(Q3+SQ4))) 240 241 242 faddx %fp2,%fp0 | ...R+RS(P1+S(P2+SP3)) 243 244 245 fadds #0x3F800000,%fp1 | ...1+S(Q1+...) 246 247 fmovel %d1,%fpcr |restore users exceptions 248 fdivx %fp1,%fp0 |last inst - possible exception set 249 250 bra t_frcinx 251 252 NODD: 253 fmovex %fp0,%fp1 254 fmulx %fp0,%fp0 | ...S = R*R 255 256 fmoved TANQ4,%fp3 257 fmoved TANP3,%fp2 258 259 fmulx %fp0,%fp3 | ...SQ4 260 fmulx %fp0,%fp2 | ...SP3 261 262 faddd TANQ3,%fp3 | ...Q3+SQ4 263 faddx TANP2,%fp2 | ...P2+SP3 264 265 fmulx %fp0,%fp3 | ...S(Q3+SQ4) 266 fmulx %fp0,%fp2 | ...S(P2+SP3) 267 268 faddx TANQ2,%fp3 | ...Q2+S(Q3+SQ4) 269 faddx TANP1,%fp2 | ...P1+S(P2+SP3) 270 271 fmulx %fp0,%fp3 | ...S(Q2+S(Q3+SQ4)) 272 fmulx %fp0,%fp2 | ...S(P1+S(P2+SP3)) 273 274 faddx TANQ1,%fp3 | ...Q1+S(Q2+S(Q3+SQ4)) 275 fmulx %fp1,%fp2 | ...RS(P1+S(P2+SP3)) 276 277 fmulx %fp3,%fp0 | ...S(Q1+S(Q2+S(Q3+SQ4))) 278 279 280 faddx %fp2,%fp1 | ...R+RS(P1+S(P2+SP3)) 281 fadds #0x3F800000,%fp0 | ...1+S(Q1+...) 282 283 284 fmovex %fp1,-(%sp) 285 eoril #0x80000000,(%sp) 286 287 fmovel %d1,%fpcr |restore users exceptions 288 fdivx (%sp)+,%fp0 |last inst - possible exception set 289 290 bra t_frcinx 291 292 TANBORS: 293 |--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION. 294 |--IF |X| < 2**(-40), RETURN X OR 1. 295 cmpil #0x3FFF8000,%d0 296 bgts REDUCEX 297 298 TANSM: 299 300 fmovex %fp0,-(%sp) 301 fmovel %d1,%fpcr |restore users exceptions 302 fmovex (%sp)+,%fp0 |last inst - possible exception set 303 304 bra t_frcinx 305 306 307 REDUCEX: 308 |--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW. 309 |--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING 310 |--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE. 311 312 fmovemx %fp2-%fp5,-(%a7) | ...save FP2 through FP5 313 movel %d2,-(%a7) 314 fmoves #0x00000000,%fp1 315 316 |--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that 317 |--there is a danger of unwanted overflow in first LOOP iteration. In this 318 |--case, reduce argument by one remainder step to make subsequent reduction 319 |--safe. 320 cmpil #0x7ffeffff,%d0 |is argument dangerously large? 321 bnes LOOP 322 movel #0x7ffe0000,FP_SCR2(%a6) |yes 323 | ;create 2**16383*PI/2 324 movel #0xc90fdaa2,FP_SCR2+4(%a6) 325 clrl FP_SCR2+8(%a6) 326 ftstx %fp0 |test sign of argument 327 movel #0x7fdc0000,FP_SCR3(%a6) |create low half of 2**16383* 328 | ;PI/2 at FP_SCR3 329 movel #0x85a308d3,FP_SCR3+4(%a6) 330 clrl FP_SCR3+8(%a6) 331 fblt red_neg 332 orw #0x8000,FP_SCR2(%a6) |positive arg 333 orw #0x8000,FP_SCR3(%a6) 334 red_neg: 335 faddx FP_SCR2(%a6),%fp0 |high part of reduction is exact 336 fmovex %fp0,%fp1 |save high result in fp1 337 faddx FP_SCR3(%a6),%fp0 |low part of reduction 338 fsubx %fp0,%fp1 |determine low component of result 339 faddx FP_SCR3(%a6),%fp1 |fp0/fp1 are reduced argument. 340 341 |--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4. 342 |--integer quotient will be stored in N 343 |--Intermediate remainder is 66-bit long; (R,r) in (FP0,FP1) 344 345 LOOP: 346 fmovex %fp0,INARG(%a6) | ...+-2**K * F, 1 <= F < 2 347 movew INARG(%a6),%d0 348 movel %d0,%a1 | ...save a copy of D0 349 andil #0x00007FFF,%d0 350 subil #0x00003FFF,%d0 | ...D0 IS K 351 cmpil #28,%d0 352 bles LASTLOOP 353 CONTLOOP: 354 subil #27,%d0 | ...D0 IS L := K-27 355 movel #0,ENDFLAG(%a6) 356 bras WORK 357 LASTLOOP: 358 clrl %d0 | ...D0 IS L := 0 359 movel #1,ENDFLAG(%a6) 360 361 WORK: 362 |--FIND THE REMAINDER OF (R,r) W.R.T. 2**L * (PI/2). L IS SO CHOSEN 363 |--THAT INT( X * (2/PI) / 2**(L) ) < 2**29. 364 365 |--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63), 366 |--2**L * (PIby2_1), 2**L * (PIby2_2) 367 368 movel #0x00003FFE,%d2 | ...BIASED EXPO OF 2/PI 369 subl %d0,%d2 | ...BIASED EXPO OF 2**(-L)*(2/PI) 370 371 movel #0xA2F9836E,FP_SCR1+4(%a6) 372 movel #0x4E44152A,FP_SCR1+8(%a6) 373 movew %d2,FP_SCR1(%a6) | ...FP_SCR1 is 2**(-L)*(2/PI) 374 375 fmovex %fp0,%fp2 376 fmulx FP_SCR1(%a6),%fp2 377 |--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN 378 |--FLOATING POINT FORMAT, THE TWO FMOVE'S FMOVE.L FP <--> N 379 |--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT 380 |--(SIGN(INARG)*2**63 + FP2) - SIGN(INARG)*2**63 WILL GIVE 381 |--US THE DESIRED VALUE IN FLOATING POINT. 382 383 |--HIDE SIX CYCLES OF INSTRUCTION 384 movel %a1,%d2 385 swap %d2 386 andil #0x80000000,%d2 387 oril #0x5F000000,%d2 | ...D2 IS SIGN(INARG)*2**63 IN SGL 388 movel %d2,TWOTO63(%a6) 389 390 movel %d0,%d2 391 addil #0x00003FFF,%d2 | ...BIASED EXPO OF 2**L * (PI/2) 392 393 |--FP2 IS READY 394 fadds TWOTO63(%a6),%fp2 | ...THE FRACTIONAL PART OF FP1 IS ROUNDED 395 396 |--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1 and 2**(L)*Piby2_2 397 movew %d2,FP_SCR2(%a6) 398 clrw FP_SCR2+2(%a6) 399 movel #0xC90FDAA2,FP_SCR2+4(%a6) 400 clrl FP_SCR2+8(%a6) | ...FP_SCR2 is 2**(L) * Piby2_1 401 402 |--FP2 IS READY 403 fsubs TWOTO63(%a6),%fp2 | ...FP2 is N 404 405 addil #0x00003FDD,%d0 406 movew %d0,FP_SCR3(%a6) 407 clrw FP_SCR3+2(%a6) 408 movel #0x85A308D3,FP_SCR3+4(%a6) 409 clrl FP_SCR3+8(%a6) | ...FP_SCR3 is 2**(L) * Piby2_2 410 411 movel ENDFLAG(%a6),%d0 412 413 |--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and 414 |--P2 = 2**(L) * Piby2_2 415 fmovex %fp2,%fp4 416 fmulx FP_SCR2(%a6),%fp4 | ...W = N*P1 417 fmovex %fp2,%fp5 418 fmulx FP_SCR3(%a6),%fp5 | ...w = N*P2 419 fmovex %fp4,%fp3 420 |--we want P+p = W+w but |p| <= half ulp of P 421 |--Then, we need to compute A := R-P and a := r-p 422 faddx %fp5,%fp3 | ...FP3 is P 423 fsubx %fp3,%fp4 | ...W-P 424 425 fsubx %fp3,%fp0 | ...FP0 is A := R - P 426 faddx %fp5,%fp4 | ...FP4 is p = (W-P)+w 427 428 fmovex %fp0,%fp3 | ...FP3 A 429 fsubx %fp4,%fp1 | ...FP1 is a := r - p 430 431 |--Now we need to normalize (A,a) to "new (R,r)" where R+r = A+a but 432 |--|r| <= half ulp of R. 433 faddx %fp1,%fp0 | ...FP0 is R := A+a 434 |--No need to calculate r if this is the last loop 435 cmpil #0,%d0 436 bgt RESTORE 437 438 |--Need to calculate r 439 fsubx %fp0,%fp3 | ...A-R 440 faddx %fp3,%fp1 | ...FP1 is r := (A-R)+a 441 bra LOOP 442 443 RESTORE: 444 fmovel %fp2,N(%a6) 445 movel (%a7)+,%d2 446 fmovemx (%a7)+,%fp2-%fp5 447 448 449 movel N(%a6),%d0 450 rorl #1,%d0 451 452 453 bra TANCONT 454 455 |end