# $NetBSD: bn_asm_vax.S,v 1.1.1.2 2023/04/18 14:19:11 christos Exp $
#
# Bignum primitives for VAX, GNU as syntax (source operands first,
# %-prefixed registers, $-prefixed immediates).
#
# Every routine is entered through the VAX procedure-call mechanism: the
# first word at the entry label is the register save mask (bit N set means
# rN is saved on entry and restored by "ret"), arguments are read as
# longwords at 4(%ap), 8(%ap), ..., and the result is returned in r0.
# r0-r5 are used as scratch without being saved; anything from r6 upwards
# that a routine touches is listed in its entry mask.
#
# EMUL and EDIV are signed instructions; the "fixup" sequences below turn
# their results into the unsigned values the C prototypes call for.
#
# w.j.m. 15-jan-1999
#
# it's magic ...
#
# ULONG bn_mul_add_words(ULONG r[],ULONG a[],int n,ULONG w) {
#	ULONG c = 0;
#	int i;
#	for(i = 0; i < n; i++) <c,r[i]> := r[i] + c + a[i] * w ;
#	return c;
# }
#
# Registers: r2 = r cursor, r3 = a cursor, r4 = remaining count, r5 = w,
#            r6 = running carry, r1:r0 = 64-bit EMUL result.
# Returns 0 and touches no memory when n <= 0.

	.globl	bn_mul_add_words
	.type	bn_mul_add_words@function

bn_mul_add_words:
	.word	0x40			# entry mask: save r6

	movl	4(%ap),%r2		# *r
	movl	8(%ap),%r3		# *a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# return value ("carry")

	# sobgtr tests at the bottom of the loop, so without this guard
	# n <= 0 would still process (and store) one word.
	tstl	%r4
	bleq	2f

0:	emul	%r5,(%r3),(%r2),%r0	# w * a[0] + r[0] -> r0

	# fixup for "negative" r[]: EMUL sign-extended the addend
	tstl	(%r2)
	bgeq	1f
	incl	%r1			# add 1 to highword

1:	# add saved carry to result
	addl2	%r6,%r0
	adwc	$0,%r1

	# combined fixup for "negative" w, a[]
	tstl	%r5			# if w is negative...
	bgeq	1f
	addl2	(%r3),%r1		# ...add a[0] again to highword
1:	tstl	(%r3)			# if a[0] is negative...
	bgeq	1f
	addl2	%r5,%r1			# ...add w again to highword
1:
	movl	%r0,(%r2)+		# save low word in dest & advance *r
	addl2	$4,%r3			# advance *a
	movl	%r1,%r6			# high word in r6 for return value

	sobgtr	%r4,0b			# loop while --n > 0

2:	movl	%r6,%r0
	ret
	.size	bn_mul_add_words, .-bn_mul_add_words

#	.title	vax_bn_mul_words  unsigned multiply & add, 32*32+32=>64
#;
#; w.j.m. 15-jan-1999
#;
#; it's magic ...
#;
#; ULONG bn_mul_words(ULONG r[],ULONG a[],int n,ULONG w) {
#;	ULONG c = 0;
#;	int i;
#;	for(i = 0; i < n; i++) <c,r[i]> := a[i] * w + c ;
#;	return(c);
#; }
#;
#; Registers: r2 = r cursor, r3 = a cursor, r4 = remaining count, r5 = w,
#;            r6 = running carry, r1:r0 = 64-bit EMUL result.
#; Returns 0 and touches no memory when n <= 0.
#
	.globl	bn_mul_words
	.type	bn_mul_words@function
bn_mul_words:
	.word	0x40			# entry mask: save r6

	movl	4(%ap),%r2		# *r
	movl	8(%ap),%r3		# *a
	movl	12(%ap),%r4		# n
	movl	16(%ap),%r5		# w
	clrl	%r6			# carry

	# bottom-tested loop: skip it entirely for n <= 0
	tstl	%r4
	bleq	2f

0:	emul	%r5,(%r3),%r6,%r0	# w * a[0] + carry -> r0

	# fixup for "negative" carry: EMUL sign-extended the addend
	tstl	%r6
	bgeq	1f
	incl	%r1

1:	# combined fixup for "negative" w, a[]
	tstl	%r5			# if w is negative...
	bgeq	1f
	addl2	(%r3),%r1		# ...add a[0] again to highword
1:	tstl	(%r3)			# if a[0] is negative...
	bgeq	1f
	addl2	%r5,%r1			# ...add w again to highword

1:	movl	%r0,(%r2)+		# store low word & advance *r
	addl2	$4,%r3			# advance *a
	movl	%r1,%r6			# high word is the next carry

	sobgtr	%r4,0b			# loop while --n > 0

2:	movl	%r6,%r0
	ret
	.size	bn_mul_words, .-bn_mul_words



#	.title	vax_bn_sqr_words	unsigned square, 32*32=>64
#;
#; w.j.m. 15-jan-1999
#;
#; it's magic ...
#;
#; void bn_sqr_words(ULONG r[],ULONG a[],int n) {
#;	int i;
#;	for(i = 0; i < n; i++) <r[2*i+1],r[2*i]> := a[i] * a[i] ;
#; }
#;
#; Registers: r2 = r cursor, r3 = a cursor, r4 = remaining count,
#;            r5 = current a[i], r1:r0 = 64-bit EMUL result.
#; Touches no memory when n <= 0.
#
	.globl	bn_sqr_words
	.type	bn_sqr_words@function
bn_sqr_words:
	.word	0			# entry mask: only r0-r5 are used

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# n

	# bottom-tested loop: skip it entirely for n <= 0
	tstl	%r4
	bleq	2f

0:	movl	(%r3)+,%r5		# r5 = a[] & advance

	emul	%r5,%r5,$0,%r0		# a[0] * a[0] + 0 -> r0

	# fixup for "negative" a[]: both factors are the same value, so
	# the highword correction is applied twice
	tstl	%r5
	bgeq	1f
	addl2	%r5,%r1
	addl2	%r5,%r1

1:	movq	%r0,(%r2)+		# store 64-bit result

	sobgtr	%r4,0b			# loop while --n > 0

2:	ret
	.size	bn_sqr_words, .-bn_sqr_words


#	.title	vax_bn_div_words	unsigned divide
#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_div_words(ULONG h, ULONG l, ULONG d)
#; {
#;	return ((ULONG)((((ULLONG)h)<<32)|l) / (ULLONG)d);
#; }
#;
#; Using EDIV would be very easy, if it didn't do signed calculations.
#; Any time any of the input numbers are signed, there are problems,
#; usually with integer overflow, at which point it returns useless
#; data (the quotient gets the value of l, and the remainder becomes 0).
#;
#; If it was just for the dividend, it would be very easy, just divide
#; it by 2 (unsigned), do the division, multiply the resulting quotient
#; and remainder by 2, add the bit that was dropped when dividing by 2
#; to the remainder, and do some adjustment so the remainder doesn't
#; end up larger than the divisor.  For some cases when the divisor is
#; negative (from EDIV's point of view, i.e. when the highest bit is set),
#; dividing the dividend by 2 isn't enough, and since some operations
#; might generate integer overflows even when the dividend is divided by
#; 4 (when the high part of the shifted down dividend ends up being exactly
#; half of the divisor, the result is the quotient 0x80000000, which is
#; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
#; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
#; In this case, a little extra fiddling with the remainder is required.
#;
#; So, the simplest way to handle this is always to divide the dividend
#; by 8, and to divide the divisor by 2 if its highest bit is set.
#; After EDIV has been used, the quotient gets multiplied by 8 if the
#; original divisor was positive, otherwise 4.  The remainder, oddly
#; enough, is *always* multiplied by 8.
#; NOTE: in the case mentioned above, where the high part of the shifted
#; down dividend ends up being exactly half the shifted down divisor, we
#; end up with a 33 bit quotient.  That's no problem however, it usually
#; means we have ended up with a too large remainder as well, and the
#; problem is fixed by the last part of the algorithm (next paragraph).
#;
#; The routine ends with comparing the resulting remainder with the
#; original divisor and if the remainder is larger, subtract the
#; original divisor from it, and increase the quotient by 1.  This is
#; done until the remainder is smaller than the divisor.
#;
#; The complete algorithm looks like this:
#;
#; d'     = d
#; l'     = l & 7
#; [h,l]  = [h,l] >> 3
#; [q,r]  = floor([h,l] / d)	# This is the EDIV operation
#; if (q < 0) q = -q		# I doubt this is necessary any more
#;
#; r'     = r >> 29
#; if (d' >= 0)
#;   q'   = q >> 29
#;   q    = q << 3
#; else
#;   q'   = q >> 30
#;   q    = q << 2
#; r      = (r << 3) + l'
#;
#; if (d' < 0)
#;   {
#;     [r',r] = [r',r] - q
#;     while ([r',r] < 0)
#;       {
#;         [r',r] = [r',r] + d
#;         [q',q] = [q',q] - 1
#;       }
#;   }
#;
#; while ([r',r] >= d')
#;   {
#;     [r',r] = [r',r] - d'
#;     [q',q] = [q',q] + 1
#;   }
#;
#; return q
#;
#; Notes on how the code below differs from the sketch above:
#;  - for a "negative" d the value handed to EDIV is d >> 1 (unsigned);
#;    d' keeps the original divisor;
#;  - the "if (d' < 0)" correction is only executed when d' is also odd,
#;    it subtracts the full [q',q], and its loop adds d' back;
#;  - a divisor of 0 returns -1 (all bits set) without dividing.
#
#;r2 = l, q
#;r3 = h, r
#;r4 = d
#;r5 = l'
#;r6 = r'
#;r7 = d'
#;r8 = q'
#
	.globl	bn_div_words
	.type	bn_div_words@function
bn_div_words:
	.word	0x1c0			# entry mask: save r6, r7, r8

	movl	4(%ap),%r3		# h
	movl	8(%ap),%r2		# l
	movl	12(%ap),%r4		# d

	bicl3	$-8,%r2,%r5		# l' = l & 7
	bicl3	$7,%r2,%r2		# l with its low 3 bits cleared

	bicl3	$-8,%r3,%r6		# r6 = h & 7 (bits that move into l)
	bicl3	$7,%r3,%r3		# h with its low 3 bits cleared

	addl2	%r6,%r2			# park h's low bits in l's free low bits

	rotl	$-3,%r2,%r2		# l = l >> 3
	rotl	$-3,%r3,%r3		# h = h >> 3

	movl	%r4,%r7			# d' = d

	clrl	%r6			# r' = 0
	clrl	%r8			# q' = 0

	tstl	%r4
	beql	0f			# Uh-oh, the divisor is 0...
	bgtr	1f
	rotl	$-1,%r4,%r4		# If d is negative, shift it right.
	bicl2	$0x80000000,%r4		# Since d is then a large number, the
					# lowest bit is insignificant
					# (contradict that, and I'll fix the problem!)
1:
	ediv	%r4,%r2,%r2,%r3		# Do the actual division

	tstl	%r2
	bgeq	1f
	mnegl	%r2,%r2			# if q < 0, negate it
1:
	tstl	%r7
	blss	1f
	rotl	$3,%r2,%r2		# q = q << 3
	bicl3	$-8,%r2,%r8		# q' gets the high bits from q
	bicl3	$7,%r2,%r2
	brb	2f

1:					# else
	rotl	$2,%r2,%r2		# q = q << 2
	bicl3	$-4,%r2,%r8		# q' gets the high bits from q
	bicl3	$3,%r2,%r2
2:
	rotl	$3,%r3,%r3		# r = r << 3
	bicl3	$-8,%r3,%r6		# r' gets the high bits from r
	bicl3	$7,%r3,%r3
	addl2	%r5,%r3			# r = r + l'

	tstl	%r7
	bgeq	5f
	bitl	$1,%r7
	beql	5f			# if d' < 0 && d' & 1
	subl2	%r2,%r3			#   [r',r] = [r',r] - [q',q]
	sbwc	%r8,%r6
3:
	bgeq	5f			#   while r < 0
	decl	%r2			#     [q',q] = [q',q] - 1
	sbwc	$0,%r8
	addl2	%r7,%r3			#     [r',r] = [r',r] + d'
	adwc	$0,%r6
	brb	3b

# The return points are placed in the middle to keep a short distance from
# all the branch points
1:
#	movl	%r3,%r1
	movl	%r2,%r0			# return the low 32 bits of the quotient
	ret
0:
	movl	$-1,%r0			# division by zero: return all ones
	ret
5:
	tstl	%r6
	bneq	6f
	cmpl	%r3,%r7
	blssu	1b			# while [r',r] >= d'
6:
	subl2	%r7,%r3			#   [r',r] = [r',r] - d'
	sbwc	$0,%r6
	incl	%r2			#   [q',q] = [q',q] + 1
	adwc	$0,%r8
	brb	5b
	.size	bn_div_words, .-bn_div_words



#	.title	vax_bn_add_words	unsigned add of two arrays
#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_add_words(ULONG r[], ULONG a[], ULONG b[], int n) {
#;	ULONG c = 0;
#;	int i;
#;	for (i = 0; i < n; i++) <c,r[i]> = a[i] + b[i] + c;
#;	return(c);
#; }
#;
#; The carry lives in the processor's C flag for the whole loop; only
#; adwc reads or writes it there.  Returns 0 when n <= 0.
#

	.globl	bn_add_words
	.type	bn_add_words@function
bn_add_words:
	.word	0			# entry mask: only r0-r5 are used

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# b
	movl	16(%ap),%r5		# n
	clrl	%r0

	tstl	%r5			# n <= 0: nothing to do
	bleq	1f

0:	movl	(%r3)+,%r1		# carry untouched
	adwc	(%r4)+,%r1		# carry used and touched
	movl	%r1,(%r2)+		# carry untouched
	sobgtr	%r5,0b			# carry untouched

	adwc	$0,%r0			# r0 = final carry
1:	ret
	.size	bn_add_words, .-bn_add_words

#;
#; Richard Levitte 20-Nov-2000
#;
#; ULONG bn_sub_words(ULONG r[], ULONG a[], ULONG b[], int n) {
#;	ULONG c = 0;
#;	int i;
#;	for (i = 0; i < n; i++) <c,r[i]> = a[i] - b[i] - c;
#;	return(c);
#; }
#;
#; The borrow lives in the processor's C flag for the whole loop; only
#; sbwc reads or writes it there.
#
	.globl	bn_sub_words
	.type	bn_sub_words@function
bn_sub_words:
	.word	0x40			# entry mask: save r6

	movl	4(%ap),%r2		# r
	movl	8(%ap),%r3		# a
	movl	12(%ap),%r4		# b
	movl	16(%ap),%r5		# n
	clrl	%r0

	tstl	%r5			# n <= 0: nothing to do
	bleq	1f

0:	movl	(%r3)+,%r6		# carry untouched
	sbwc	(%r4)+,%r6		# carry used and touched
	movl	%r6,(%r2)+		# carry untouched
	sobgtr	%r5,0b			# carry untouched

1:	adwc	$0,%r0			# r0 = final borrow
	ret
	.size	bn_sub_words, .-bn_sub_words

#
#	Ragge 20-Sep-2003
#
#	Multiply a vector of 4/8 longword by another.
#	Uses two loops and 16/64 emuls.
#
#	Four entry points share one body.  Each sets r9 to the operand
#	length in longwords (4 or 8); the multiply entries read a[] and b[]
#	from the second and third arguments, the square entries use a[]
#	for both factors.  r[] (first argument) receives 2*r9 longwords.
#
#	Registers: r2 = current multiplier word, r3 = a cursor,
#	           r4 = carry, r5 = r cursor, r6 = inner count,
#	           r7 = multiplier cursor, r8 = outer count, r9 = length.
#
	.globl	bn_mul_comba4
	.type	bn_mul_comba4@function
bn_mul_comba4:
	.word	0x3c0			# entry mask: save r6-r9
	movl	$4,%r9			# 4*4
	brb	6f

	.globl	bn_mul_comba8
	.type	bn_mul_comba8@function
bn_mul_comba8:
	.word	0x3c0			# entry mask: save r6-r9
	movl	$8,%r9			# 8*8

6:	movl	8(%ap),%r3		# a[]
	movl	12(%ap),%r7		# b[]
	brb	5f

	.globl	bn_sqr_comba4
	.type	bn_sqr_comba4@function
bn_sqr_comba4:
	.word	0x3c0			# entry mask: save r6-r9
	movl	$4,%r9			# 4*4
	brb	0f

	.globl	bn_sqr_comba8
	.type	bn_sqr_comba8@function
bn_sqr_comba8:
	.word	0x3c0			# entry mask: save r6-r9
	movl	$8,%r9			# 8*8

0:
	movl	8(%ap),%r3		# a[]
	movl	%r3,%r7			# a[]

5:	movl	4(%ap),%r5		# r[]
	movl	%r9,%r8			# outer loop count

	# Only the low half of r[] is accumulated into before being
	# written; the upper words are stored with movl further down.
	clrq	(%r5)			# clear destination, for add.
	clrq	8(%r5)
	clrq	16(%r5)			# these only needed for comba8
	clrq	24(%r5)

2:	clrl	%r4			# carry
	movl	%r9,%r6			# inner loop count
	movl	(%r7)+,%r2		# value to multiply with

1:	emul	%r2,(%r3),%r4,%r0	# r2 * a[i] + carry -> r1:r0
	tstl	%r4			# fixup for "negative" carry
	bgeq	3f
	incl	%r1
3:	tstl	%r2			# fixup for "negative" multiplier
	bgeq	3f
	addl2	(%r3),%r1
3:	tstl	(%r3)			# fixup for "negative" a[i]
	bgeq	3f
	addl2	%r2,%r1

3:	addl2	%r0,(%r5)+		# add to destination
	adwc	$0,%r1			# remember carry
	movl	%r1,%r4			# add carry in next emul
	addl2	$4,%r3
	sobgtr	%r6,1b

	movl	%r4,(%r5)		# save highest add result

	ashl	$2,%r9,%r4		# r4 = length in bytes
	subl2	%r4,%r3			# rewind a[] to its start
	subl2	$4,%r4
	subl2	%r4,%r5			# r[] cursor: one word past previous row start

	sobgtr	%r8,2b

	ret
	.size	bn_mul_comba4, .-bn_mul_comba4
	.size	bn_mul_comba8, .-bn_mul_comba8
	.size	bn_sqr_comba4, .-bn_sqr_comba4
	.size	bn_sqr_comba8, .-bn_sqr_comba8