1# $NetBSD: bn_asm_vax.S,v 1.1 2009/07/19 23:30:47 christos Exp $ 2# 3# w.j.m. 15-jan-1999 4# 5# it's magic ... 6# 7# ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) { 8# ULONG c = 0; 9# int i; 10# for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ; 11# return c; 12# } 13 14 .globl bn_mul_add_words 15 .type bn_mul_add_words@function 16 17bn_mul_add_words: 18 .word 0x40 19 20 movl 4(%ap),%r2 # *r 21 movl 8(%ap),%r3 # *a 22 movl 12(%ap),%r4 # n 23 movl 16(%ap),%r5 # w 24 clrl %r6 # return value ("carry") 25 260: emul %r5,(%r3),(%r2),%r0 # w * a[0] + r[0] -> r0 27 28 # fixup for "negative" r[] 29 tstl (%r2) 30 bgeq 1f 31 incl %r1 # add 1 to highword 32 331: # add saved carry to result 34 addl2 %r6,%r0 35 adwc $0,%r1 36 37 # combined fixup for "negative" w, a[] 38 tstl %r5 # if w is negative... 39 bgeq 1f 40 addl2 (%r3),%r1 # ...add a[0] again to highword 411: tstl (%r3) # if a[0] is negative... 42 bgeq 1f 43 addl2 %r5,%r1 # ...add w again to highword 441: 45 movl %r0,(%r2)+ # save low word in dest & advance *r 46 addl2 $4,%r3 # advance *a 47 movl %r1,%r6 # high word in r6 for return value 48 49 sobgtr %r4,0b # loop? 50 51 movl %r6,%r0 52 ret 53 54# .title vax_bn_mul_words unsigned multiply & add, 32*32+32=>64 55#; 56#; w.j.m. 15-jan-1999 57#; 58#; it's magic ... 59#; 60#; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) { 61#; ULONG c = 0; 62#; int i; 63#; for(i = 0; i < num; i++) <c,r[i]> := a[i] * w + c ; 64#; return(c); 65#; } 66# 67 .globl bn_mul_words 68 .type bn_mul_words@function 69bn_mul_words: 70 .word 0x40 71 72 movl 4(%ap),%r2 # *r 73 movl 8(%ap),%r3 # *a 74 movl 12(%ap),%r4 # n 75 movl 16(%ap),%r5 # w 76 clrl %r6 # carry 77 780: emul %r5,(%r3),%r6,%r0 # w * a[0] + carry -> r0 79 80 # fixup for "negative" carry 81 tstl %r6 82 bgeq 1f 83 incl %r1 84 851: # combined fixup for "negative" w, a[] 86 tstl %r5 87 bgeq 1f 88 addl2 (%r3),%r1 891: tstl (%r3) 90 bgeq 1f 91 addl2 %r5,%r1 92 931: movl %r0,(%r2)+ 94 addl2 $4,%r3 95 movl %r1,%r6 96 97 sobgtr %r4,0b 98 99 movl %r6,%r0 100 ret 101 102 103 104# .title vax_bn_sqr_words unsigned square, 32*32=>64 105#; 106#; w.j.m. 15-jan-1999 107#; 108#; it's magic ... 109#; 110#; void bn_sqr_words(ULONG r[],ULONG a[],int n) { 111#; int i; 112#; for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ; 113#; } 114# 115 .globl bn_sqr_words 116 .type bn_sqr_words@function 117bn_sqr_words: 118 .word 0 119 120 movl 4(%ap),%r2 # r 121 movl 8(%ap),%r3 # a 122 movl 12(%ap),%r4 # n 123 1240: movl (%r3)+,%r5 # r5 = a[] & advance 125 126 emul %r5,%r5,$0,%r0 # a[0] * a[0] + 0 -> r0 127 128 # fixup for "negative" a[] 129 tstl %r5 130 bgeq 1f 131 addl2 %r5,%r1 132 addl2 %r5,%r1 133 1341: movq %r0,(%r2)+ # store 64-bit result 135 136 sobgtr %r4,0b # loop 137 138 ret 139 140 141# .title vax_bn_div_words unsigned divide 142#; 143#; Richard Levitte 20-Nov-2000 144#; 145#; ULONG bn_div_words(ULONG h, ULONG l, ULONG d) 146#; { 147#; return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d); 148#; } 149#; 150#; Using EDIV would be very easy, if it didn't do signed calculations. 151#; Any time any of the input numbers are signed, there are problems, 152#; usually with integer overflow, at which point it returns useless 153#; data (the quotient gets the value of l, and the remainder becomes 0). 154#; 155#; If it was just for the dividend, it would be very easy, just divide 156#; it by 2 (unsigned), do the division, multiply the resulting quotient 157#; and remainder by 2, add the bit that was dropped when dividing by 2 158#; to the remainder, and do some adjustment so the remainder doesn't 159#; end up larger than the divisor. For some cases when the divisor is 160#; negative (from EDIV's point of view, i.e. when the highest bit is set), 161#; dividing the dividend by 2 isn't enough, and since some operations 162#; might generate integer overflows even when the dividend is divided by 163#; 4 (when the high part of the shifted down dividend ends up being exactly 164#; half of the divisor, the result is the quotient 0x80000000, which is 165#; negative...) it needs to be divided by 8. Furthermore, the divisor needs 166#; to be divided by 2 (unsigned) as well, to avoid more problems with the sign. 167#; In this case, a little extra fiddling with the remainder is required. 168#; 169#; So, the simplest way to handle this is always to divide the dividend 170#; by 8, and to divide the divisor by 2 if it's highest bit is set. 171#; After EDIV has been used, the quotient gets multiplied by 8 if the 172#; original divisor was positive, otherwise 4. The remainder, oddly 173#; enough, is *always* multiplied by 8. 174#; NOTE: in the case mentioned above, where the high part of the shifted 175#; down dividend ends up being exactly half the shifted down divisor, we 176#; end up with a 33 bit quotient. That's no problem however, it usually 177#; means we have ended up with a too large remainder as well, and the 178#; problem is fixed by the last part of the algorithm (next paragraph). 179#; 180#; The routine ends with comparing the resulting remainder with the 181#; original divisor and if the remainder is larger, subtract the 182#; original divisor from it, and increase the quotient by 1. This is 183#; done until the remainder is smaller than the divisor. 184#; 185#; The complete algorithm looks like this: 186#; 187#; d' = d 188#; l' = l & 7 189#; [h,l] = [h,l] >> 3 190#; [q,r] = floor([h,l] / d) # This is the EDIV operation 191#; if (q < 0) q = -q # I doubt this is necessary any more 192#; 193#; r' = r >> 29 194#; if (d' >= 0) 195#; q' = q >> 29 196#; q = q << 3 197#; else 198#; q' = q >> 30 199#; q = q << 2 200#; r = (r << 3) + l' 201#; 202#; if (d' < 0) 203#; { 204#; [r',r] = [r',r] - q 205#; while ([r',r] < 0) 206#; { 207#; [r',r] = [r',r] + d 208#; [q',q] = [q',q] - 1 209#; } 210#; } 211#; 212#; while ([r',r] >= d') 213#; { 214#; [r',r] = [r',r] - d' 215#; [q',q] = [q',q] + 1 216#; } 217#; 218#; return q 219# 220#;r2 = l, q 221#;r3 = h, r 222#;r4 = d 223#;r5 = l' 224#;r6 = r' 225#;r7 = d' 226#;r8 = q' 227# 228 .globl bn_div_words 229 .type bn_div_words@function 230bn_div_words: 231 .word 0x1c0 232 233 movl 4(%ap),%r3 # h 234 movl 8(%ap),%r2 # l 235 movl 12(%ap),%r4 # d 236 237 bicl3 $-8,%r2,%r5 # l' = l & 7 238 bicl3 $7,%r2,%r2 239 240 bicl3 $-8,%r3,%r6 241 bicl3 $7,%r3,%r3 242 243 addl2 %r6,%r2 244 245 rotl $-3,%r2,%r2 # l = l >> 3 246 rotl $-3,%r3,%r3 # h = h >> 3 247 248 movl %r4,%r7 # d' = d 249 250 clrl %r6 # r' = 0 251 clrl %r8 # q' = 0 252 253 tstl %r4 254 beql 0f # Uh-oh, the divisor is 0... 255 bgtr 1f 256 rotl $-1,%r4,%r4 # If d is negative, shift it right. 257 bicl2 $0x80000000,%r4 # Since d is then a large number, the 258 # lowest bit is insignificant 259 # (contradict that, and I'll fix the problem!) 2601: 261 ediv %r4,%r2,%r2,%r3 # Do the actual division 262 263 tstl %r2 264 bgeq 1f 265 mnegl %r2,%r2 # if q < 0, negate it 2661: 267 tstl %r7 268 blss 1f 269 rotl $3,%r2,%r2 # q = q << 3 270 bicl3 $-8,%r2,%r8 # q' gets the high bits from q 271 bicl3 $7,%r2,%r2 272 brb 2f 273 2741: # else 275 rotl $2,%r2,%r2 # q = q << 2 276 bicl3 $-4,%r2,%r8 # q' gets the high bits from q 277 bicl3 $3,%r2,%r2 2782: 279 rotl $3,%r3,%r3 # r = r << 3 280 bicl3 $-8,%r3,%r6 # r' gets the high bits from r 281 bicl3 $7,%r3,%r3 282 addl2 %r5,%r3 # r = r + l' 283 284 tstl %r7 285 bgeq 5f 286 bitl $1,%r7 287 beql 5f # if d' < 0 && d' & 1 288 subl2 %r2,%r3 # [r',r] = [r',r] - [q',q] 289 sbwc %r8,%r6 2903: 291 bgeq 5f # while r < 0 292 decl %r2 # [q',q] = [q',q] - 1 293 sbwc $0,%r8 294 addl2 %r7,%r3 # [r',r] = [r',r] + d' 295 adwc $0,%r6 296 brb 3b 297 298# The return points are placed in the middle to keep a short distance from 299# all the branch points 3001: 301# movl %r3,%r1 302 movl %r2,%r0 303 ret 3040: 305 movl $-1,%r0 306 ret 3075: 308 tstl %r6 309 bneq 6f 310 cmpl %r3,%r7 311 blssu 1b # while [r',r] >= d' 3126: 313 subl2 %r7,%r3 # [r',r] = [r',r] - d' 314 sbwc $0,%r6 315 incl %r2 # [q',q] = [q',q] + 1 316 adwc $0,%r8 317 brb 5b 318 319 320 321# .title vax_bn_add_words unsigned add of two arrays 322#; 323#; Richard Levitte 20-Nov-2000 324#; 325#; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) { 326#; ULONG c = 0; 327#; int i; 328#; for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c; 329#; return(c); 330#; } 331# 332 333 .globl bn_add_words 334 .type bn_add_words@function 335bn_add_words: 336 .word 0 337 338 movl 4(%ap),%r2 # r 339 movl 8(%ap),%r3 # a 340 movl 12(%ap),%r4 # b 341 movl 16(%ap),%r5 # n 342 clrl %r0 343 344 tstl %r5 345 bleq 1f 346 3470: movl (%r3)+,%r1 # carry untouched 348 adwc (%r4)+,%r1 # carry used and touched 349 movl %r1,(%r2)+ # carry untouched 350 sobgtr %r5,0b # carry untouched 351 352 adwc $0,%r0 3531: ret 354 355#; 356#; Richard Levitte 20-Nov-2000 357#; 358#; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) { 359#; ULONG c = 0; 360#; int i; 361#; for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c; 362#; return(c); 363#; } 364# 365 .globl bn_sub_words 366 .type bn_sub_words@function 367bn_sub_words: 368 .word 0x40 369 370 movl 4(%ap),%r2 # r 371 movl 8(%ap),%r3 # a 372 movl 12(%ap),%r4 # b 373 movl 16(%ap),%r5 # n 374 clrl %r0 375 376 tstl %r5 377 bleq 1f 378 3790: movl (%r3)+,%r6 # carry untouched 380 sbwc (%r4)+,%r6 # carry used and touched 381 movl %r6,(%r2)+ # carry untouched 382 sobgtr %r5,0b # carry untouched 383 3841: adwc $0,%r0 385 ret 386 387# 388# Ragge 20-Sep-2003 389# 390# Multiply a vector of 4/8 longword by another. 391# Uses two loops and 16/64 emuls. 392# 393 .globl bn_mul_comba4 394 .type bn_mul_comba4@function 395bn_mul_comba4: 396 .word 0x3c0 397 movl $4,%r9 # 4*4 398 brb 6f 399 400 .globl bn_mul_comba8 401 .type bn_mul_comba8@function 402bn_mul_comba8: 403 .word 0x3c0 404 movl $8,%r9 # 8*8 405 4066: movl 8(%ap),%r3 # a[] 407 movl 12(%ap),%r7 # b[] 408 brb 5f 409 410 .globl bn_sqr_comba4 411 .type bn_sqr_comba4@function 412bn_sqr_comba4: 413 .word 0x3c0 414 movl $4,%r9 # 4*4 415 brb 0f 416 417 .globl bn_sqr_comba8 418 .type bn_sqr_comba8@function 419bn_sqr_comba8: 420 .word 0x3c0 421 movl $8,%r9 # 8*8 422 4230: 424 movl 8(%ap),%r3 # a[] 425 movl %r3,%r7 # a[] 426 4275: movl 4(%ap),%r5 # r[] 428 movl %r9,%r8 429 430 clrq (%r5) # clear destinatino, for add. 431 clrq 8(%r5) 432 clrq 16(%r5) # these only needed for comba8 433 clrq 24(%r5) 434 4352: clrl %r4 # carry 436 movl %r9,%r6 # inner loop count 437 movl (%r7)+,%r2 # value to multiply with 438 4391: emul %r2,(%r3),%r4,%r0 440 tstl %r4 441 bgeq 3f 442 incl %r1 4433: tstl %r2 444 bgeq 3f 445 addl2 (%r3),%r1 4463: tstl (%r3) 447 bgeq 3f 448 addl2 %r2,%r1 449 4503: addl2 %r0,(%r5)+ # add to destination 451 adwc $0,%r1 # remember carry 452 movl %r1,%r4 # add carry in next emul 453 addl2 $4,%r3 454 sobgtr %r6,1b 455 456 movl %r4,(%r5) # save highest add result 457 458 ashl $2,%r9,%r4 459 subl2 %r4,%r3 460 subl2 $4,%r4 461 subl2 %r4,%r5 462 463 sobgtr %r8,2b 464 465 ret 466