/*	$OpenBSD: vecast.S,v 1.1 2022/10/22 00:58:56 gkoehler Exp $	*/

/*
 * Copyright (c) 2022 George Koehler <gkoehler@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <machine/asm.h>
#include <machine/psl.h>

/*
 * To load or store an arbitrary AltiVec register, we extract its
 * number from the instruction and multiply it by 8. We do both using
 * rlwinm to rotate it left into bits 24 to 28.
 *
 *  0         10   15   20  24  28
 *  |         |    |    |   |   |
 *  000100dddddaaaaabbbbbcccccxxxxxx
 */
#define	VD_ROTATE	14, 24, 28
#define	VA_ROTATE	19, 24, 28
#define	VB_ROTATE	24, 24, 28
#define	VC_ROTATE	29, 24, 28

/*
 * vctuxs, vctsxs have an unsigned immediate UI in bits 11 to 15. We
 * extract it into bits 4 to 8, then add FLOAT_1_IS to make 2**UI.
 */
#define	UI_ROTATE	7, 4, 8
#define	FLOAT_1_IS	0x3f80		/* (float 1) >> 16 */

	.rodata
	.balign 4
.Lzero:		.float	0
.Lone:		.float	1
.Ln126:		.float	126
.Ltwo63:	.float	0x1p63
.Ltwo126:	.float	0x1p126
.Lmin:		.float	0x1p-126	/* FLT_MIN */

	.text

/* This is the stack frame for vecast_asm. */
#define	s_size	128
#define	s_f31	120
#define	s_f30	112
#define	s_f29	104
#define	s_f28	96
#define	s_f27	88
#define	s_f26	80
#define	s_f25	72
#define	s_f24	64
#define	s_vc	48
#define	s_vb	32
#define	s_va	16

/*
 * vecast_asm(insn r3, label r4) emulates an AltiVec instruction when
 * it traps on a denormal or subnormal float (with an AltiVec assist
 * exception). Such a float f has 0 < |f| < FLT_MIN = 2**-126.
 *
 * MPC7450 RISC Microprocessor Family Reference Manual, 7.1.2.5 Java
 * Mode, NaNs, Denormalized Numbers, and Zeros, has a list of trapping
 * instructions: vaddfp, vsubfp, vmaddfp, vnmsubfp, vrefp, vrsqrtefp,
 * vlogefp, vexptefp, vctsxs, vctuxs.
 */
ENTRY(vecast_asm)
	mflr	%r0			/* r0 = return address */
	RETGUARD_SETUP_LATE(vecast_asm, %r9, %r0)
	stwu	%r1, -s_size(%r1)
	mfmsr	%r5			/* r5 = old msr */

	/*
	 * Borrow the vector and floating-point units. We must
	 * preserve all float and most vector registers.
	 */
	rlwinm	%r6, %r5, 0, 17, 15	/* r6 = r5 & ~PSL_EE */
	oris	%r6, %r6, PSL_VEC >> 16
	ori	%r6, %r6, PSL_FP
	mtmsr	%r6
	isync

	stfd	%f31, s_f31(%r1)
	stfd	%f30, s_f30(%r1)
	stfd	%f29, s_f29(%r1)
	stfd	%f28, s_f28(%r1)
	stfd	%f27, s_f27(%r1)
	stfd	%f26, s_f26(%r1)
	stfd	%f25, s_f25(%r1)
	stfd	%f24, s_f24(%r1)
	mffs	%f31			/* f31 = old fpscr */

	lis	%r6, .Lzero@ha
	la	%r6, .Lzero@l(%r6)	/* r6 = address of .Lzero */

	/* fpscr = zero (round to nearest, no traps) */
	lfs	%f30, 0(%r6)		/* f30 = zero */
	mtfsf	255, %f30

	/* All instructions do s_vb = VB now; VD = s_va at finish. */
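	/*
	 * A worked example of the field extraction, with operands
	 * chosen only for illustration: for "vaddfp %v3, %v7, %v9"
	 * the vB field (bits 16 to 20) holds 9. rlwinm with
	 * VB_ROTATE rotates the instruction left by 24 and keeps
	 * bits 24 to 28, so r7 becomes 9 * 8 = 72: the byte offset
	 * of v9's entry in vecast_store_vector's table below, where
	 * each entry is a 4-byte stvx plus a 4-byte branch.
	 */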
	rlwinm	%r7, %r3, VB_ROTATE
	la	%r8, s_vb(%r1)
	bl	vecast_store_vector

	mtctr	%r4
	li	%r4, 4			/* r4 = 4 loop iterations */
	bctr				/* Branch to our instruction's label. */

/*
 * vaddfp: d = a + b
 */
	.globl	vecast_vaddfp
vecast_vaddfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector

	/* s_va = s_va + s_vb */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)		/* r7 += 4, then load (r7). */
	lfs	%f29, (s_vb - s_va)(%r7)
	fadds	%f30, %f30, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b			/* Loop 4 times. */
	b	vecast_finish

/*
 * vsubfp: d = a - b
 */
	.globl	vecast_vsubfp
vecast_vsubfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector

	/* s_va = s_va - s_vb */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fsubs	%f30, %f30, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vmaddfp: d = a * c + b
 */
	.globl	vecast_vmaddfp
vecast_vmaddfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector
	rlwinm	%r7, %r3, VC_ROTATE
	la	%r8, s_vc(%r1)
	bl	vecast_store_vector

	/* s_va = s_va * s_vc + s_vb */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	lfs	%f28, (s_vc - s_va)(%r7)
	fmadds	%f30, %f30, %f28, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vnmsubfp: d = b - a * c
 */
	.globl	vecast_vnmsubfp
vecast_vnmsubfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector
	rlwinm	%r7, %r3, VC_ROTATE
	la	%r8, s_vc(%r1)
	bl	vecast_store_vector

	/* s_va = -(s_va * s_vc - s_vb) */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	lfs	%f28, (s_vc - s_va)(%r7)
	fnmsubs	%f30, %f30, %f28, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vrefp: d = estimate 1 / b
 */
	.globl	vecast_vrefp
vecast_vrefp:
	/* s_va = estimate 1 / s_vb */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	fres	%f30, %f30
	stfs	%f30, (s_va - s_vb)(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vrsqrtefp: d = estimate 1 / sqrt(b)
 * 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63 when b < 2**-126
 *
 * MPC7455's frsqrte does 1 / sqrt(1) = 0.984375, relative error 1/64.
 * AltiVec must not err over 1/4096, so avoid frsqrte.
 */
	.globl	vecast_vrsqrtefp
vecast_vrsqrtefp:
	/* f30 = 1; f29 = 2**63; f28 = 2**126; f27 = 2**-126 */
	lfs	%f30, (.Lone - .Lzero)(%r6)
	lfs	%f29, (.Ltwo63 - .Lzero)(%r6)
	lfs	%f28, (.Ltwo126 - .Lzero)(%r6)
	lfs	%f27, (.Lmin - .Lzero)(%r6)

	/*
	 * s_vb = s_vb * 2**126, s_va = 2**63 when b < 2**-126
	 * s_va = 1 when b >= 2**-126
	 */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f26, 4(%r7)
	fmuls	%f25, %f26, %f28
	fsubs	%f24, %f26, %f27	/* f24 selects b >= 2**-126 */
	fsel	%f26, %f24, %f26, %f25	/* f26 = b or b * 2**126 */
	stfs	%f26, 0(%r7)
	fsel	%f25, %f24, %f30, %f29	/* f25 = 1 or 2**63 */
	stfs	%f25, (s_va - s_vb)(%r7)
	bdnz	1b
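
	/*
	 * A sanity check of the scaling above, using an input chosen
	 * only for illustration: fsel takes the scaled pair exactly
	 * when f24 = b - 2**-126 is negative, that is, when
	 * b < 2**-126. With b = 2**-130:
	 * 1 / sqrt(2**-130 * 2**126) * 2**63 = 1 / sqrt(2**-4) * 2**63
	 * = 2**2 * 2**63 = 2**65, which is indeed 1 / sqrt(2**-130).
	 */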

	/* s_vb = estimate 1 / sqrt(s_vb) */
	la	%r7, s_vc(%r1)
	la	%r8, s_vb(%r1)
	stvx	%v31, 0, %r7		/* Save v31 in s_vc. */
	lvx	%v31, 0, %r8
	vrsqrtefp %v31, %v31
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7

	/* s_va = s_vb * s_va */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fmuls	%f30, %f29, %f30
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vlogefp: d = estimate log2(b)
 * log2(b) = log2(b * 2**126) - 126 when b < 2**-126
 */
	.globl	vecast_vlogefp
vecast_vlogefp:
	/* f30 = 0; f29 = 126; f28 = 2**126; f27 = 2**-126 */
	lfs	%f29, (.Ln126 - .Lzero)(%r6)
	lfs	%f28, (.Ltwo126 - .Lzero)(%r6)
	lfs	%f27, (.Lmin - .Lzero)(%r6)

	/*
	 * s_vb = s_vb * 2**126, s_va = 126 when s_vb < 2**-126
	 * s_va = 0 when s_vb >= 2**-126
	 */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f26, 4(%r7)
	fmuls	%f25, %f26, %f28
	fsubs	%f24, %f26, %f27	/* f24 selects b >= 2**-126 */
	fsel	%f26, %f24, %f26, %f25	/* f26 = b or b * 2**126 */
	stfs	%f26, 0(%r7)
	fsel	%f25, %f24, %f30, %f29	/* f25 = 0 or 126 */
	stfs	%f25, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_vb = estimate log2(s_vb) */
	la	%r7, s_vc(%r1)
	la	%r8, s_vb(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vlogefp	%v31, %v31
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7

	/* s_va = s_vb - s_va */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fsubs	%f30, %f29, %f30
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vexptefp: d = estimate 2**b
 * 2**b = 2**(b + 126) * 2**-126 when -252 <= b < -126
 */
	.globl	vecast_vexptefp
vecast_vexptefp:
	/* f30 = 1; f29 = 126; f28 = 2**-126 */
	lfs	%f30, (.Lone - .Lzero)(%r6)
	lfs	%f29, (.Ln126 - .Lzero)(%r6)
	lfs	%f28, (.Lmin - .Lzero)(%r6)

	/*
	 * s_vb = s_vb + 126 when -252 <= b < -126
	 * s_va = 2**-126 when b < -126
	 * s_va = 1 when b >= -126
	 *
	 * If b < -252, we avoid a possibly subnormal 2**(b + 126)
	 * by calculating 2**b * 2**-126 = 0 * 2**-126 = 0.
	 */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f27, 4(%r7)
	fadds	%f26, %f27, %f29	/* f26 selects b >= -126 */
	fadds	%f25, %f26, %f29	/* f25 selects b >= -252 */
	fsel	%f24, %f26, %f27, %f26
	fsel	%f24, %f25, %f24, %f27	/* f24 = b or b + 126 */
	stfs	%f24, 0(%r7)
	fsel	%f27, %f26, %f30, %f28	/* f27 = 1 or 2**-126 */
	stfs	%f27, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_vb = estimate 2**s_vb */
	la	%r7, s_vc(%r1)
	la	%r8, s_vb(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vexptefp %v31, %v31
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7

	/* s_va = s_vb * s_va */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fmuls	%f30, %f29, %f30
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vctsxs: d = (int32_t)(b * 2**u) where 0 <= u < 32
 * d = 0 when |b| < 2**-126
 */
	.globl	vecast_vctsxs
vecast_vctsxs:
	/* f30 = 0; f29 = 2**-126; f28 = 2**u */
	lfs	%f29, (.Lmin - .Lzero)(%r6)
	rlwinm	%r7, %r3, UI_ROTATE
	addis	%r7, %r7, FLOAT_1_IS
	stw	%r7, s_va(%r1)
	lfs	%f28, s_va(%r1)
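
	/*
	 * How the rlwinm/addis/stw/lfs sequence above builds 2**u in
	 * f28: UI_ROTATE leaves u in bits 4 to 8 of r7, which as an
	 * integer is u << 23, and addis adds FLOAT_1_IS << 16 =
	 * 0x3f800000, the single-precision bit pattern of 1.0.
	 * Adding u << 23 raises the biased exponent by u, so stw/lfs
	 * reinterpret r7 as the float 2**u; for example u = 3 gives
	 * 0x41000000 = 8.0. vctuxs below does the same.
	 */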
	/* s_va = s_vb * 2**u, unless b is tiny. */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f27, 4(%r7)
	fmuls	%f26, %f27, %f28
	fabs	%f27, %f27
	fsubs	%f27, %f27, %f29	/* f27 selects |b| >= 2**-126 */
	fsel	%f26, %f27, %f26, %f30	/* f26 = b * 2**u or 0 */
	stfs	%f26, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_va = (int32_t)b */
	la	%r7, s_vc(%r1)
	la	%r8, s_va(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vctsxs	%v31, %v31, 0		/* May set SAT in vscr. */
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7
	b	vecast_finish

/*
 * vctuxs: d = (uint32_t)(b * 2**u) where 0 <= u < 32
 * d = 0 when |b| < 2**-126
 */
	.globl	vecast_vctuxs
vecast_vctuxs:
	/* f30 = 0; f29 = 2**-126; f28 = 2**u */
	lfs	%f29, (.Lmin - .Lzero)(%r6)
	rlwinm	%r7, %r3, UI_ROTATE
	addis	%r7, %r7, FLOAT_1_IS
	stw	%r7, s_va(%r1)
	lfs	%f28, s_va(%r1)

	/* s_va = s_vb * 2**u, unless b is tiny. */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f27, 4(%r7)
	fmuls	%f26, %f27, %f28
	fabs	%f27, %f27
	fsubs	%f27, %f27, %f29	/* f27 selects |b| >= 2**-126 */
	fsel	%f26, %f27, %f26, %f30	/* f26 = b * 2**u or 0 */
	stfs	%f26, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_va = (uint32_t)b */
	la	%r7, s_vc(%r1)
	la	%r8, s_va(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vctuxs	%v31, %v31, 0		/* May set SAT in vscr. */
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7
	/* b	vecast_finish */

vecast_finish:
	/* VD = s_va */
	rlwinm	%r7, %r3, VD_ROTATE
	addis	%r7, %r7, 1f@ha
	addi	%r7, %r7, 1f@l
	mtctr	%r7
	la	%r8, s_va(%r1)
	bctr
#define	M(n)	lvx %v##n, 0, %r8; b 2f
1:	M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
	M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
	M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
	M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
#undef	M
2:	mtlr	%r0
	mtfsf	255, %f31		/* Restore old fpscr. */
	lfd	%f24, s_f24(%r1)
	lfd	%f25, s_f25(%r1)
	lfd	%f26, s_f26(%r1)
	lfd	%f27, s_f27(%r1)
	lfd	%f28, s_f28(%r1)
	lfd	%f29, s_f29(%r1)
	lfd	%f30, s_f30(%r1)
	lfd	%f31, s_f31(%r1)
	mtmsr	%r5			/* Restore old msr. */
	isync
	addi	%r1, %r1, s_size
	RETGUARD_CHECK(vecast_asm, %r9, %r0)
	blr

/*
 * Stores vector v(r7 / 8) to address r8.
 */
vecast_store_vector:
	RETGUARD_SETUP(vecast_store_vector, %r11, %r12)
	addis	%r7, %r7, 1f@ha
	addi	%r7, %r7, 1f@l
	mtctr	%r7
	bctr
#define	M(n)	stvx %v##n, 0, %r8; b 2f
1:	M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
	M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
	M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
	M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
#undef	M
2:	RETGUARD_CHECK(vecast_store_vector, %r11, %r12)
	blr