/*	$OpenBSD: vecast.S,v 1.1 2022/10/22 00:58:56 gkoehler Exp $	*/

/*
 * Copyright (c) 2022 George Koehler <gkoehler@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <machine/asm.h>
#include <machine/psl.h>

/*
 * To load or store an arbitrary AltiVec register, we extract its
 * number from the instruction and multiply it by 8.  We do both using
 * rlwinm to rotate it left into bits 24 to 28.
 *
 * 0        10   15   20  24  28
 * |         |    |    |   |   |
 * 000100dddddaaaaabbbbbcccccxxxxxx
 */
#define VD_ROTATE	14, 24, 28
#define VA_ROTATE	19, 24, 28
#define VB_ROTATE	24, 24, 28
#define VC_ROTATE	29, 24, 28
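/*
 * For example, a vaddfp with VB = 9 has 01001 in bits 16 to 20;
 * VB_ROTATE rotates the word left by 24 so that field lands in bits
 * 24 to 28, giving r7 = 9 * 8 = 72.  The jump tables at the end of
 * this file use 8-byte entries, so 72 is the byte offset of the
 * lvx/stvx pair for v9.
 */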

/*
 * vctuxs, vctsxs have an unsigned immediate UI in bits 11 to 15.  We
 * extract it into bits 4 to 8, then add FLOAT_1_IS to make 2**UI.
 */
#define UI_ROTATE	7, 4, 8
#define FLOAT_1_IS	0x3f80		/* (float 1) >> 16 */
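/*
 * For example, with UI = 4, UI_ROTATE leaves 4 << 23 = 0x02000000 in
 * r7, and addis r7, r7, FLOAT_1_IS adds 0x3f800000 (float 1).  The
 * sum 0x41800000 is the float 16 = 2**4, because adding UI << 23
 * bumps the biased exponent of 1.0 by UI.
 */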

	.rodata
	.balign 4
.Lzero:		.float	0
.Lone:		.float	1
.Ln126:		.float	126
.Ltwo63:	.float	0x1p63
.Ltwo126:	.float	0x1p126
.Lmin:		.float	0x1p-126	/* FLT_MIN */

	.text

/* This is the stack frame for vecast_asm. */
#define s_size		128
#define s_f31		120
#define s_f30		112
#define s_f29		104
#define s_f28		96
#define s_f27		88
#define s_f26		80
#define s_f25		72
#define s_f24		64
#define s_vc		48
#define s_vb		32
#define s_va		16
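/*
 * Frame sketch: the back chain saved by stwu sits at 0(r1); s_va,
 * s_vb, and s_vc are 16-byte slots at 16, 32, and 48; f24 to f31 are
 * saved in the 8-byte slots from 64 to 120.  lvx and stvx ignore the
 * low 4 address bits, so the vector slots count on r1 staying
 * 16-byte aligned, as the ABI requires.
 */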

/*
 * vecast_asm(insn r3, label r4) emulates an AltiVec instruction when
 * it traps a denormal or subnormal float (with an AltiVec assist
 * exception).  Such a float f has 0 < |f| < FLT_MIN = 2**-126.
 *
 * MPC7450 RISC Microprocessor Family Reference Manual, 7.1.2.5 Java
 * Mode, NaNs, Denormalized Numbers, and Zeros, has a list of trapping
 * instructions: vaddfp, vsubfp, vmaddfp, vnmsubfp, vrefp, vrsqrtefp,
 * vlogefp, vexptefp, vctsxs, vctuxs.
 */
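/*
 * For example, a vaddfp whose VB holds the subnormal 0x1p-140 raises
 * the assist exception; the caller passes the trapping instruction
 * word in r3 and the matching vecast_* label in r4, the sums are
 * redone below with scalar fadds (which handles subnormals once
 * fpscr is cleared), and vecast_finish copies the result into VD.
 */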
ENTRY(vecast_asm)
	mflr	%r0			/* r0 = return address */
	RETGUARD_SETUP_LATE(vecast_asm, %r9, %r0)
	stwu	%r1, -s_size(%r1)
	mfmsr	%r5			/* r5 = old msr */

	/*
	 * Borrow the vector and floating-point units.  We must
	 * preserve all float and most vector registers.
	 */
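	/*
	 * In the rlwinm below, MB (17) > ME (15), so the mask wraps
	 * around and covers every bit except bit 16, which is PSL_EE;
	 * external interrupts stay disabled while the units are
	 * borrowed.
	 */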
	rlwinm	%r6, %r5, 0, 17, 15	/* r6 = r5 & ~PSL_EE */
	oris	%r6, %r6, PSL_VEC >> 16
	ori	%r6, %r6, PSL_FP
	mtmsr	%r6
	isync

	stfd	%f31, s_f31(%r1)
	stfd	%f30, s_f30(%r1)
	stfd	%f29, s_f29(%r1)
	stfd	%f28, s_f28(%r1)
	stfd	%f27, s_f27(%r1)
	stfd	%f26, s_f26(%r1)
	stfd	%f25, s_f25(%r1)
	stfd	%f24, s_f24(%r1)
	mffs	%f31			/* f31 = old fpscr */

	lis	%r6, .Lzero@ha
	la	%r6, .Lzero@l(%r6)	/* r6 = address of .Lzero */

	/* fpscr = zero (round to nearest, no traps) */
	lfs	%f30, 0(%r6)		/* f30 = zero */
	mtfsf	255, %f30

	/* All instructions do s_vb = VB now; VD = s_va at finish. */
	rlwinm	%r7, %r3, VB_ROTATE
	la	%r8, s_vb(%r1)
	bl	vecast_store_vector

	mtctr	%r4
	li	%r4, 4		/* r4 = 4 loop iterations */
	bctr			/* Branch to our instruction's label. */

/*
 * vaddfp: d = a + b
 */
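/*
 * Each element-wise loop below runs CTR = r4 = 4 times, once per
 * 32-bit float.  r7 starts at s_va - 4 (or s_vb - 4); lfsu adds 4 to
 * r7 before loading, so r7 walks the 4 elements, and the matching
 * elements of the other operands sit at the fixed offsets s_vb - s_va
 * and s_vc - s_va from the same r7.
 */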
	.globl vecast_vaddfp
vecast_vaddfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector

	/* s_va = s_va + s_vb */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)		/* r7 += 4, then load (r7). */
	lfs	%f29, (s_vb - s_va)(%r7)
	fadds	%f30, %f30, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b			/* Loop 4 times. */
	b	vecast_finish

/*
 * vsubfp: d = a - b
 */
	.globl vecast_vsubfp
vecast_vsubfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector

	/* s_va = s_va - s_vb */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fsubs	%f30, %f30, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vmaddfp: d = a * c + b
 */
	.globl vecast_vmaddfp
vecast_vmaddfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector
	rlwinm	%r7, %r3, VC_ROTATE
	la	%r8, s_vc(%r1)
	bl	vecast_store_vector

	/* s_va = s_va * s_vc + s_vb */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	lfs	%f28, (s_vc - s_va)(%r7)
	fmadds	%f30, %f30, %f28, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vnmsubfp: d = b - a * c
 */
	.globl vecast_vnmsubfp
vecast_vnmsubfp:
	rlwinm	%r7, %r3, VA_ROTATE
	la	%r8, s_va(%r1)
	bl	vecast_store_vector
	rlwinm	%r7, %r3, VC_ROTATE
	la	%r8, s_vc(%r1)
	bl	vecast_store_vector

	/* s_va = -(s_va * s_vc - s_vb) */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	lfs	%f28, (s_vc - s_va)(%r7)
	fnmsubs	%f30, %f30, %f28, %f29
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vrefp: d = estimate 1 / b
 */
	.globl vecast_vrefp
vecast_vrefp:
	/* s_va = estimate 1 / s_vb */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	fres	%f30, %f30
	stfs	%f30, (s_va - s_vb)(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vrsqrtefp: d = estimate 1 / sqrt(b)
 * 1 / sqrt(b) = 1 / sqrt(b * 2**126) * 2**63 when b < 2**-126
 *
 * MPC7455's frsqrte does 1 / sqrt(1) = 0.984375, relative error 1/64.
 * AltiVec must not err over 1/4096, so avoid frsqrte.
 */
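/*
 * For example, b = 0x1p-140 is first scaled to b * 2**126 = 0x1p-14,
 * a normal float that vrsqrtefp accepts; its estimate is about 2**7,
 * and the final multiply by 2**63 gives about 2**70 = 1 / sqrt(2**-140).
 * Elements with b >= 2**-126 get a multiplier of 1 and pass through
 * the estimate unchanged.
 */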
	.globl vecast_vrsqrtefp
vecast_vrsqrtefp:
	/* f30 = 1; f29 = 2**63, f28 = 2**126; f27 = 2**-126 */
	lfs	%f30, (.Lone - .Lzero)(%r6)
	lfs	%f29, (.Ltwo63 - .Lzero)(%r6)
	lfs	%f28, (.Ltwo126 - .Lzero)(%r6)
	lfs	%f27, (.Lmin - .Lzero)(%r6)

	/*
	 * s_vb = s_vb * 2**126, s_va = 2**63 when b < 2**-126
	 * s_va = 1 when b >= 2**-126
	 */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f26, 4(%r7)
	fmuls	%f25, %f26, %f28
	fsubs	%f24, %f26, %f27	/* f24 selects b >= 2**-126 */
	fsel	%f26, %f24, %f26, %f25	/* f26 = b or b * 2**126 */
	stfs	%f26, 0(%r7)
	fsel	%f25, %f24, %f30, %f29	/* f25 = 1 or 2**63 */
	stfs	%f25, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_vb = estimate 1 / sqrt(s_vb) */
	la	%r7, s_vc(%r1)
	la	%r8, s_vb(%r1)
	stvx	%v31, 0, %r7		/* Save v31 in s_vc. */
	lvx	%v31, 0, %r8
	vrsqrtefp %v31, %v31
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7

	/* s_va = s_vb * s_va */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fmuls	%f30, %f29, %f30
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vlogefp: d = estimate log2(b)
 * log2(b) = log2(b * 2**126) - 126 when b < 2**-126
 */
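/*
 * For example, b = 0x1p-130 is scaled to b * 2**126 = 0x1p-4;
 * vlogefp estimates log2(0x1p-4) = -4, and subtracting 126 recovers
 * log2(b) = -130.  Elements with b >= 2**-126 subtract 0 instead.
 */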
	.globl	vecast_vlogefp
vecast_vlogefp:
	/* f30 = 0; f29 = 126; f28 = 2**126; f27 = 2**-126 */
	lfs	%f29, (.Ln126 - .Lzero)(%r6)
	lfs	%f28, (.Ltwo126 - .Lzero)(%r6)
	lfs	%f27, (.Lmin - .Lzero)(%r6)

	/*
	 * s_vb = s_vb * 2**126, s_va = 126 when s_vb < 2**-126
	 * s_va = 0 when s_vb >= 2**-126
	 */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f26, 4(%r7)
	fmuls	%f25, %f26, %f28
	fsubs	%f24, %f26, %f27	/* f24 selects b >= 2**-126 */
	fsel	%f26, %f24, %f26, %f25	/* f26 = b or b * 2**126 */
	stfs	%f26, 0(%r7)
	fsel	%f25, %f24, %f30, %f29	/* f25 = 0 or 126 */
	stfs	%f25, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_vb = estimate log2(s_vb) */
	la	%r7, s_vc(%r1)
	la	%r8, s_vb(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vlogefp	%v31, %v31
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7

	/* s_va = s_vb - s_va */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fsubs	%f30, %f29, %f30
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vexptefp: d = estimate 2**b
 * 2**b = 2**(b + 126) * 2**-126 when -252 <= b < -126
 */
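/*
 * For example, b = -130 becomes b + 126 = -4; vexptefp estimates
 * 2**-4, and the final multiply by 2**-126 recovers 2**-130.
 * Elements with b >= -126 keep b and multiply by 1; elements with
 * b < -252 also keep b, so their estimate underflows to 0 before
 * the multiply.
 */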
	.globl	vecast_vexptefp
vecast_vexptefp:
	/* f30 = 1; f29 = 126; f28 = 2**-126 */
	lfs	%f30, (.Lone - .Lzero)(%r6)
	lfs	%f29, (.Ln126 - .Lzero)(%r6)
	lfs	%f28, (.Lmin - .Lzero)(%r6)

	/*
	 * s_vb = s_vb + 126 when -252 <= b < -126
	 * s_va = 2**-126 when b < -126
	 * s_va = 1 when b >= -126
	 *
	 * If b < -252, we avoid a possibly subnormal 2**(b + 126)
	 * by calculating 2**b * 2**-126 = 0 * 2**-126 = 0.
	 */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f27, 4(%r7)
	fadds	%f26, %f27, %f29	/* f26 selects b >= -126 */
	fadds	%f25, %f26, %f29	/* f25 selects b >= -252 */
	fsel	%f24, %f26, %f27, %f26
	fsel	%f24, %f25, %f24, %f27	/* f24 = b or b + 126 */
	stfs	%f24, 0(%r7)
	fsel	%f27, %f26, %f30, %f28	/* f27 = 1 or 2**-126 */
	stfs	%f27, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_vb = estimate 2**s_vb */
	la	%r7, s_vc(%r1)
	la	%r8, s_vb(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vexptefp %v31, %v31
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7

	/* s_va = s_vb * s_va */
	mtctr	%r4
	la	%r7, (s_va - 4)(%r1)
1:	lfsu	%f30, 4(%r7)
	lfs	%f29, (s_vb - s_va)(%r7)
	fmuls	%f30, %f29, %f30
	stfs	%f30, 0(%r7)
	bdnz	1b
	b	vecast_finish

/*
 * vctsxs: d = (int32_t)(b * 2**u) where 0 <= u < 32
 * d = 0 when |b| < 2**-126
 */
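/*
 * For example, with u = 1 and b = 3.75, the scalar loop stores
 * b * 2**u = 7.5 into s_va and the 0-immediate vctsxs below converts
 * it to 7.  Elements with |b| < 2**-126 are forced to 0 first,
 * matching the d = 0 rule above.
 */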
	.globl	vecast_vctsxs
vecast_vctsxs:
	/* f30 = 0; f29 = 2**-126; f28 = 2**u */
	lfs	%f29, (.Lmin - .Lzero)(%r6)
	rlwinm	%r7, %r3, UI_ROTATE
	addis	%r7, %r7, FLOAT_1_IS
	stw	%r7, s_va(%r1)
	lfs	%f28, s_va(%r1)

	/* s_va = s_vb * 2**u, unless b is tiny. */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f27, 4(%r7)
	fmuls	%f26, %f27, %f28
	fabs	%f27, %f27
	fsubs	%f27, %f27, %f29	/* f27 selects |b| >= 2**-126 */
	fsel	%f26, %f27, %f26, %f30	/* f26 = b * 2**u or 0 */
	stfs	%f26, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_va = (int32_t)b */
	la	%r7, s_vc(%r1)
	la	%r8, s_va(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vctsxs	%v31, %v31, 0		/* May set SAT in vscr. */
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7
	b	vecast_finish

/*
 * vctuxs: d = (uint32_t)(b * 2**u) where 0 <= u < 32
 * d = 0 when |b| < 2**-126
 */
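/*
 * For example, with u = 2 and b = 2.5, s_va gets b * 2**u = 10.0 and
 * vctuxs converts it to 10; negative elements saturate to 0 and may
 * set SAT, as noted below.
 */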
	.globl	vecast_vctuxs
vecast_vctuxs:
	/* f30 = 0; f29 = 2**-126; f28 = 2**u */
	lfs	%f29, (.Lmin - .Lzero)(%r6)
	rlwinm	%r7, %r3, UI_ROTATE
	addis	%r7, %r7, FLOAT_1_IS
	stw	%r7, s_va(%r1)
	lfs	%f28, s_va(%r1)

	/* s_va = s_vb * 2**u, unless b is tiny. */
	mtctr	%r4
	la	%r7, (s_vb - 4)(%r1)
1:	lfsu	%f27, 4(%r7)
	fmuls	%f26, %f27, %f28
	fabs	%f27, %f27
	fsubs	%f27, %f27, %f29	/* f27 selects |b| >= 2**-126 */
	fsel	%f26, %f27, %f26, %f30	/* f26 = b * 2**u or 0 */
	stfs	%f26, (s_va - s_vb)(%r7)
	bdnz	1b

	/* s_va = (uint32_t)b */
	la	%r7, s_vc(%r1)
	la	%r8, s_va(%r1)
	stvx	%v31, 0, %r7
	lvx	%v31, 0, %r8
	vctuxs	%v31, %v31, 0		/* May set SAT in vscr. */
	stvx	%v31, 0, %r8
	lvx	%v31, 0, %r7
	/* b	vecast_finish */

vecast_finish:
	/* VD = s_va */
	rlwinm	%r7, %r3, VD_ROTATE
	addis	%r7, %r7, 1f@ha
	addi	%r7, %r7, 1f@l
	mtctr	%r7
	la	%r8, s_va(%r1)
	bctr
#define M(n) lvx %v##n, 0, %r8; b 2f
1:	M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
	M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
	M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
	M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
#undef M
2:	mtlr	%r0
	mtfsf	255, %f31		/* Restore old fpscr. */
	lfd	%f24, s_f24(%r1)
	lfd	%f25, s_f25(%r1)
	lfd	%f26, s_f26(%r1)
	lfd	%f27, s_f27(%r1)
	lfd	%f28, s_f28(%r1)
	lfd	%f29, s_f29(%r1)
	lfd	%f30, s_f30(%r1)
	lfd	%f31, s_f31(%r1)
	mtmsr	%r5			/* Restore old msr. */
	isync
	addi	%r1, %r1, s_size
	RETGUARD_CHECK(vecast_asm, %r9, %r0)
	blr

/*
 * Stores vector v(r7 / 8) to address r8.
 */
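/*
 * r7 arrives holding the vector register number times 8 (from one of
 * the *_ROTATE extractions).  Each M(n) below expands to two 4-byte
 * instructions, stvx then b 2f, so adding r7 to the address of label
 * 1 lands on the 8-byte entry for that register; r7 = 72 runs the
 * stvx for v9.  vecast_finish above uses the same layout with lvx.
 */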
vecast_store_vector:
	RETGUARD_SETUP(vecast_store_vector, %r11, %r12)
	addis	%r7, %r7, 1f@ha
	addi	%r7, %r7, 1f@l
	mtctr	%r7
	bctr
#define M(n)	stvx	%v##n, 0, %r8; b 2f
1:	M( 0); M( 1); M( 2); M( 3); M( 4); M( 5); M( 6); M( 7)
	M( 8); M( 9); M(10); M(11); M(12); M(13); M(14); M(15)
	M(16); M(17); M(18); M(19); M(20); M(21); M(22); M(23)
	M(24); M(25); M(26); M(27); M(28); M(29); M(30); M(31)
#undef M
2:	RETGUARD_CHECK(vecast_store_vector, %r11, %r12)
	blr