1 /* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
2 * Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT)
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "asm/assembler.hpp"
27 #include "asm/assembler.inline.hpp"
28 #include "runtime/stubRoutines.hpp"
29 #include "macroAssembler_aarch64.hpp"
30
31 // The following code is a optimized version of fdlibm sin/cos implementation
32 // (C code is in share/runtime/sharedRuntimeTrig.cpp) adapted for AARCH64.
33
34 // Please refer to sin/cos approximation via polynomial and
35 // trigonometric argument reduction techniques to the following literature:
36 //
37 // [1] Muller, Jean-Michel, Nicolas Brisebarre, Florent De Dinechin,
38 // Claude-Pierre Jeannerod, Vincent Lefevre, Guillaume Melquiond,
39 // Nathalie Revol, Damien Stehlé, and Serge Torres:
40 // Handbook of floating-point arithmetic.
41 // Springer Science & Business Media, 2009.
42 // [2] K. C. Ng
43 // Argument Reduction for Huge Arguments: Good to the Last Bit
44 // July 13, 1992, SunPro
45 //
46 // HOW TO READ THIS CODE:
47 // This code consists of several functions. Each function has following header:
48 // 1) Description
49 // 2) C-pseudo code with differences from fdlibm marked by comments starting
50 // with "NOTE". Check unmodified fdlibm code in
51 // share/runtime/SharedRuntimeTrig.cpp
52 // 3) Brief textual description of changes between fdlibm and current
53 // implementation along with optimization notes (if applicable)
54 // 4) Assumptions, input and output
55 // 5) (Optional) additional notes about intrinsic implementation
56 // Each function is separated in blocks which follow the pseudo-code structure
57 //
58 // HIGH-LEVEL ALGORITHM DESCRIPTION:
59 // - entry point: generate_dsin_dcos(...);
60 // - check corner cases: NaN, INF, tiny argument.
61 // - check if |x| < Pi/4. Then approximate sin/cos via polynomial (kernel_sin/kernel_cos)
62 // -- else proceed to argument reduction routine (__ieee754_rem_pio2) and
63 // use reduced argument to get result via kernel_sin/kernel_cos
64 //
65 // HIGH-LEVEL CHANGES BETWEEN INTRINSICS AND FDLIBM:
66 // 1) two_over_pi table fdlibm representation is int[], while intrinsic version
67 // has these int values converted to double representation to load converted
68 // double values directly (see stubRoutines_aarch4::_two_over_pi)
69 // 2) Several loops are unrolled and vectorized: see comments in code after
70 // labels: SKIP_F_LOAD, RECOMP_FOR1_CHECK, RECOMP_FOR2
71 // 3) fdlibm npio2_hw table now has "prefix" with constants used in
72 // calculation. These constants are loaded from npio2_hw table instead of
73 // constructing it in code (see stubRoutines_aarch64.cpp)
74 // 4) Polynomial coefficients for sin and cos are moved to table sin_coef
75 // and cos_coef to use the same optimization as in 3). It allows to load most of
76 // required constants via single instruction
77 //
78 //
79 //
80 ///* __ieee754_rem_pio2(x,y)
81 // *
82 // * returns the remainder of x rem pi/2 in y[0]+y[1] (i.e. like x div pi/2)
83 // * x is input argument, y[] is hi and low parts of reduced argument (x)
84 // * uses __kernel_rem_pio2()
85 // */
86 // // use tables(see stubRoutines_aarch64.cpp): two_over_pi and modified npio2_hw
87 //
88 // BEGIN __ieee754_rem_pio2 PSEUDO CODE
89 //
90 //static int __ieee754_rem_pio2(double x, double *y) {
91 // double z,w,t,r,fn;
92 // double tx[3];
93 // int e0,i,j,nx,n,ix,hx,i0;
94 //
95 // i0 = ((*(int*)&two24A)>>30)^1; /* high word index */
96 // hx = *(i0+(int*)&x); /* high word of x */
97 // ix = hx&0x7fffffff;
98 // if(ix<0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */
99 // if(hx>0) {
100 // z = x - pio2_1;
101 // if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
102 // y[0] = z - pio2_1t;
103 // y[1] = (z-y[0])-pio2_1t;
104 // } else { /* near pi/2, use 33+33+53 bit pi */
105 // z -= pio2_2;
106 // y[0] = z - pio2_2t;
107 // y[1] = (z-y[0])-pio2_2t;
108 // }
109 // return 1;
110 // } else { /* negative x */
111 // z = x + pio2_1;
112 // if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
113 // y[0] = z + pio2_1t;
114 // y[1] = (z-y[0])+pio2_1t;
115 // } else { /* near pi/2, use 33+33+53 bit pi */
116 // z += pio2_2;
117 // y[0] = z + pio2_2t;
118 // y[1] = (z-y[0])+pio2_2t;
119 // }
120 // return -1;
121 // }
122 // }
123 // if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
124 // t = fabsd(x);
125 // n = (int) (t*invpio2+half);
126 // fn = (double)n;
127 // r = t-fn*pio2_1;
128 // w = fn*pio2_1t; /* 1st round good to 85 bit */
129 // // NOTE: y[0] = r-w; is moved from if/else below to be before "if"
130 // y[0] = r-w;
131 // if(n<32&&ix!=npio2_hw[n-1]) {
132 // // y[0] = r-w; /* quick check no cancellation */ // NOTE: moved earlier
133 // } else {
134 // j = ix>>20;
135 // // y[0] = r-w; // NOTE: moved earlier
136 // i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
137 // if(i>16) { /* 2nd iteration needed, good to 118 */
138 // t = r;
139 // w = fn*pio2_2;
140 // r = t-w;
141 // w = fn*pio2_2t-((t-r)-w);
142 // y[0] = r-w;
143 // i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
144 // if(i>49) { /* 3rd iteration need, 151 bits acc */
145 // t = r; /* will cover all possible cases */
146 // w = fn*pio2_3;
147 // r = t-w;
148 // w = fn*pio2_3t-((t-r)-w);
149 // y[0] = r-w;
150 // }
151 // }
152 // }
153 // y[1] = (r-y[0])-w;
154 // if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
155 // else return n;
156 // }
157 // /*
158 // * all other (large) arguments
159 // */
160 // // NOTE: this check is removed, because it was checked in dsin/dcos
161 // // if(ix>=0x7ff00000) { /* x is inf or NaN */
162 // // y[0]=y[1]=x-x; return 0;
163 // // }
164 // /* set z = scalbn(|x|,ilogb(x)-23) */
165 // *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
166 // e0 = (ix>>20)-1046; /* e0 = ilogb(z)-23; */
167 // *(i0+(int*)&z) = ix - (e0<<20);
168 //
169 // // NOTE: "for" loop below in unrolled. See comments in asm code
170 // for(i=0;i<2;i++) {
171 // tx[i] = (double)((int)(z));
172 // z = (z-tx[i])*two24A;
173 // }
174 //
175 // tx[2] = z;
176 // nx = 3;
177 //
178 // // NOTE: while(tx[nx-1]==zeroA) nx--; is unrolled. See comments in asm code
179 // while(tx[nx-1]==zeroA) nx--; /* skip zero term */
180 //
181 // n = __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
182 // if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
183 // return n;
184 //}
185 //
186 // END __ieee754_rem_pio2 PSEUDO CODE
187 //
188 // Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
189 // 1. INF/NaN check for huge argument is removed in comparison with fdlibm
190 // code, because this check is already done in dcos/dsin code
191 // 2. Most constants are now loaded from table instead of direct initialization
192 // 3. Two loops are unrolled
193 // Assumptions:
194 // 1. Assume |X| >= PI/4
195 // 2. Assume rscratch1 = 0x3fe921fb00000000 (~ PI/4)
196 // 3. Assume ix = r3
197 // Input and output:
198 // 1. Input: X = r0
199 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
200 // NOTE: general purpose register names match local variable names in C code
201 // NOTE: fpu registers are actively reused. See comments in code about their usage
generate__ieee754_rem_pio2(address npio2_hw,address two_over_pi,address pio2)202 void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
203 address two_over_pi, address pio2) {
204 const long PIO2_1t = 0x3DD0B4611A626331UL;
205 const long PIO2_2 = 0x3DD0B4611A600000UL;
206 const long PIO2_2t = 0x3BA3198A2E037073UL;
207 Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
208 REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
209 X_IS_NEGATIVE_LONG_PI;
210 Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
211 tmp3 = r7, iqBase = r10, ih = r11, i = r17;
212 // initializing constants first
213 // rscratch1 = 0x3fe921fb00000000 (see assumptions)
214 movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
215 mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
216 movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
217 fmovd(v1, rscratch1); // v1 = PIO2_1
218 cmp(rscratch2, ix);
219 br(LE, X_IS_MEDIUM_OR_LARGE);
220
221 block_comment("if(ix<0x4002d97c) {... /* |x| ~< 3pi/4 */ "); {
222 cmp(X, zr);
223 br(LT, X_IS_NEGATIVE);
224
225 block_comment("if(hx>0) {"); {
226 fsubd(v2, v0, v1); // v2 = z = x - pio2_1
227 cmp(ix, rscratch1, LSR, 32);
228 mov(n, 1);
229 br(EQ, X_IS_POSITIVE_LONG_PI);
230
231 block_comment("case: hx > 0 && ix!=0x3ff921fb {"); { /* 33+53 bit pi is good enough */
232 mov(rscratch2, PIO2_1t);
233 fmovd(v27, rscratch2);
234 fsubd(v4, v2, v27); // v4 = y[0] = z - pio2_1t;
235 fsubd(v5, v2, v4);
236 fsubd(v5, v5, v27); // v5 = y[1] = (z-y[0])-pio2_1t
237 b(REDUCTION_DONE);
238 }
239
240 block_comment("case: hx > 0 &*& ix==0x3ff921fb {"); { /* near pi/2, use 33+33+53 bit pi */
241 bind(X_IS_POSITIVE_LONG_PI);
242 mov(rscratch1, PIO2_2);
243 mov(rscratch2, PIO2_2t);
244 fmovd(v27, rscratch1);
245 fmovd(v6, rscratch2);
246 fsubd(v2, v2, v27); // z-= pio2_2
247 fsubd(v4, v2, v6); // y[0] = z - pio2_2t
248 fsubd(v5, v2, v4);
249 fsubd(v5, v5, v6); // v5 = (z - y[0]) - pio2_2t
250 b(REDUCTION_DONE);
251 }
252 }
253
254 block_comment("case: hx <= 0)"); {
255 bind(X_IS_NEGATIVE);
256 faddd(v2, v0, v1); // v2 = z = x + pio2_1
257 cmp(ix, rscratch1, LSR, 32);
258 mov(n, -1);
259 br(EQ, X_IS_NEGATIVE_LONG_PI);
260
261 block_comment("case: hx <= 0 && ix!=0x3ff921fb) {"); { /* 33+53 bit pi is good enough */
262 mov(rscratch2, PIO2_1t);
263 fmovd(v27, rscratch2);
264 faddd(v4, v2, v27); // v4 = y[0] = z + pio2_1t;
265 fsubd(v5, v2, v4);
266 faddd(v5, v5, v27); // v5 = y[1] = (z-y[0]) + pio2_1t
267 b(REDUCTION_DONE);
268 }
269
270 block_comment("case: hx <= 0 && ix==0x3ff921fb"); { /* near pi/2, use 33+33+53 bit pi */
271 bind(X_IS_NEGATIVE_LONG_PI);
272 mov(rscratch1, PIO2_2);
273 mov(rscratch2, PIO2_2t);
274 fmovd(v27, rscratch1);
275 fmovd(v6, rscratch2);
276 faddd(v2, v2, v27); // z += pio2_2
277 faddd(v4, v2, v6); // y[0] = z + pio2_2t
278 fsubd(v5, v2, v4);
279 faddd(v5, v5, v6); // v5 = (z - y[0]) + pio2_2t
280 b(REDUCTION_DONE);
281 }
282 }
283 }
284 bind(X_IS_MEDIUM_OR_LARGE);
285 mov(rscratch1, 0x413921fb);
286 cmp(ix, rscratch1); // ix < = 0x413921fb ?
287 br(GT, X_IS_LARGE);
288
289 block_comment("|x| ~<= 2^19*(pi/2), medium size"); {
290 lea(ih, ExternalAddress(npio2_hw));
291 ld1(v4, v5, v6, v7, T1D, ih);
292 fabsd(v31, v0); // v31 = t = |x|
293 add(ih, ih, 64);
294 fmaddd(v2, v31, v5, v4); // v2 = t * invpio2 + half (invpio2 = 53 bits of 2/pi, half = 0.5)
295 fcvtzdw(n, v2); // n = (int) v2
296 frintzd(v2, v2);
297 fmsubd(v3, v2, v6, v31); // v3 = r = t - fn * pio2_1
298 fmuld(v26, v2, v7); // v26 = w = fn * pio2_1t
299 fsubd(v4, v3, v26); // y[0] = r - w. Calculated before branch
300 cmp(n, (u1)32);
301 br(GT, LARGE_ELSE);
302 subw(tmp5, n, 1); // tmp5 = n - 1
303 ldrw(jv, Address(ih, tmp5, Address::lsl(2)));
304 cmp(ix, jv);
305 br(NE, X_IS_MEDIUM_BRANCH_DONE);
306
307 block_comment("else block for if(n<32&&ix!=npio2_hw[n-1])"); {
308 bind(LARGE_ELSE);
309 fmovd(jx, v4);
310 lsr(tmp5, ix, 20); // j = ix >> 20
311 lsl(jx, jx, 1);
312 sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
313
314 block_comment("if(i>16)"); {
315 cmp(tmp3, (u1)16);
316 br(LE, X_IS_MEDIUM_BRANCH_DONE);
317 // i > 16. 2nd iteration needed
318 ldpd(v6, v7, Address(ih, -32));
319 fmovd(v28, v3); // t = r
320 fmuld(v29, v2, v6); // w = v29 = fn * pio2_2
321 fsubd(v3, v28, v29); // r = t - w
322 fsubd(v31, v28, v3); // v31 = (t - r)
323 fsubd(v31, v29, v31); // v31 = w - (t - r) = - ((t - r) - w)
324 fmaddd(v26, v2, v7, v31); // v26 = w = fn*pio2_2t - ((t - r) - w)
325 fsubd(v4, v3, v26); // y[0] = r - w
326 fmovd(jx, v4);
327 lsl(jx, jx, 1);
328 sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
329
330 block_comment("if(i>49)"); {
331 cmp(tmp3, (u1)49);
332 br(LE, X_IS_MEDIUM_BRANCH_DONE);
333 // 3rd iteration need, 151 bits acc
334 ldpd(v6, v7, Address(ih, -16));
335 fmovd(v28, v3); // save "r"
336 fmuld(v29, v2, v6); // v29 = fn * pio2_3
337 fsubd(v3, v28, v29); // r = r - w
338 fsubd(v31, v28, v3); // v31 = (t - r)
339 fsubd(v31, v29, v31); // v31 = w - (t - r) = - ((t - r) - w)
340 fmaddd(v26, v2, v7, v31); // v26 = w = fn*pio2_3t - ((t - r) - w)
341 fsubd(v4, v3, v26); // y[0] = r - w
342 }
343 }
344 }
345 block_comment("medium x tail"); {
346 bind(X_IS_MEDIUM_BRANCH_DONE);
347 fsubd(v5, v3, v4); // v5 = y[1] = (r - y[0])
348 fsubd(v5, v5, v26); // v5 = y[1] = (r - y[0]) - w
349 cmp(X, zr);
350 br(GT, REDUCTION_DONE);
351 fnegd(v4, v4);
352 negw(n, n);
353 fnegd(v5, v5);
354 b(REDUCTION_DONE);
355 }
356 }
357
358 block_comment("all other (large) arguments"); {
359 bind(X_IS_LARGE);
360 lsr(rscratch1, ix, 20); // ix >> 20
361 movz(tmp5, 0x4170, 48);
362 subw(rscratch1, rscratch1, 1046); // e0
363 fmovd(v10, tmp5); // init two24A value
364 subw(jv, ix, rscratch1, LSL, 20); // ix - (e0<<20)
365 lsl(jv, jv, 32);
366 subw(rscratch2, rscratch1, 3);
367 bfm(jv, X, 0, 31); // jv = z
368 movw(i, 24);
369 fmovd(v26, jv); // v26 = z
370
371 block_comment("unrolled for(i=0;i<2;i++) {tx[i] = (double)((int)(z));z = (z-tx[i])*two24A;}"); {
372 // tx[0,1,2] = v6,v7,v26
373 frintzd(v6, v26); // v6 = (double)((int)v26)
374 sdivw(jv, rscratch2, i); // jv = (e0 - 3)/24
375 fsubd(v26, v26, v6);
376 sub(sp, sp, 560);
377 fmuld(v26, v26, v10);
378 frintzd(v7, v26); // v7 = (double)((int)v26)
379 movw(jx, 2); // calculate jx as nx - 1, which is initially 2. Not a part of unrolled loop
380 fsubd(v26, v26, v7);
381 }
382
383 block_comment("nx calculation with unrolled while(tx[nx-1]==zeroA) nx--;"); {
384 fcmpd(v26, 0.0); // if NE then jx == 2. else it's 1 or 0
385 add(iqBase, sp, 480); // base of iq[]
386 fmuld(v3, v26, v10);
387 br(NE, NX_SET);
388 fcmpd(v7, 0.0); // v7 == 0 => jx = 0. Else jx = 1
389 csetw(jx, NE);
390 }
391 bind(NX_SET);
392 generate__kernel_rem_pio2(two_over_pi, pio2);
393 // now we have y[0] = v4, y[1] = v5 and n = r2
394 cmp(X, zr);
395 br(GE, REDUCTION_DONE);
396 fnegd(v4, v4);
397 fnegd(v5, v5);
398 negw(n, n);
399 }
400 bind(REDUCTION_DONE);
401 }
402
403 ///*
404 // * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
405 // * double x[],y[]; int e0,nx,prec; int ipio2[];
406 // *
407 // * __kernel_rem_pio2 return the last three digits of N with
408 // * y = x - N*pi/2
409 // * so that |y| < pi/2.
410 // *
411 // * The method is to compute the integer (mod 8) and fraction parts of
412 // * (2/pi)*x without doing the full multiplication. In general we
413 // * skip the part of the product that are known to be a huge integer (
414 // * more accurately, = 0 mod 8 ). Thus the number of operations are
415 // * independent of the exponent of the input.
416 // *
417 // * NOTE: 2/pi int representation is converted to double
418 // * // (2/pi) is represented by an array of 24-bit integers in ipio2[].
419 // *
420 // * Input parameters:
421 // * x[] The input value (must be positive) is broken into nx
422 // * pieces of 24-bit integers in double precision format.
423 // * x[i] will be the i-th 24 bit of x. The scaled exponent
424 // * of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
425 // * match x's up to 24 bits.
426 // *
427 // * Example of breaking a double positive z into x[0]+x[1]+x[2]:
428 // * e0 = ilogb(z)-23
429 // * z = scalbn(z,-e0)
430 // * for i = 0,1,2
431 // * x[i] = floor(z)
432 // * z = (z-x[i])*2**24
433 // *
434 // *
435 // * y[] ouput result in an array of double precision numbers.
436 // * The dimension of y[] is:
437 // * 24-bit precision 1
438 // * 53-bit precision 2
439 // * 64-bit precision 2
440 // * 113-bit precision 3
441 // * The actual value is the sum of them. Thus for 113-bit
442 // * precsion, one may have to do something like:
443 // *
444 // * long double t,w,r_head, r_tail;
445 // * t = (long double)y[2] + (long double)y[1];
446 // * w = (long double)y[0];
447 // * r_head = t+w;
448 // * r_tail = w - (r_head - t);
449 // *
450 // * e0 The exponent of x[0]
451 // *
452 // * nx dimension of x[]
453 // *
454 // * prec an interger indicating the precision:
455 // * 0 24 bits (single)
456 // * 1 53 bits (double)
457 // * 2 64 bits (extended)
458 // * 3 113 bits (quad)
459 // *
460 // * NOTE: ipio2[] array below is converted to double representation
461 // * //ipio2[]
462 // * // integer array, contains the (24*i)-th to (24*i+23)-th
463 // * // bit of 2/pi after binary point. The corresponding
464 // * // floating value is
465 // *
466 // * ipio2[i] * 2^(-24(i+1)).
467 // *
468 // * Here is the description of some local variables:
469 // *
470 // * jk jk+1 is the initial number of terms of ipio2[] needed
471 // * in the computation. The recommended value is 2,3,4,
472 // * 6 for single, double, extended,and quad.
473 // *
474 // * jz local integer variable indicating the number of
475 // * terms of ipio2[] used.
476 // *
477 // * jx nx - 1
478 // *
479 // * jv index for pointing to the suitable ipio2[] for the
480 // * computation. In general, we want
481 // * ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
482 // * is an integer. Thus
483 // * e0-3-24*jv >= 0 or (e0-3)/24 >= jv
484 // * Hence jv = max(0,(e0-3)/24).
485 // *
486 // * jp jp+1 is the number of terms in PIo2[] needed, jp = jk.
487 // *
488 // * q[] double array with integral value, representing the
489 // * 24-bits chunk of the product of x and 2/pi.
490 // *
491 // * q0 the corresponding exponent of q[0]. Note that the
492 // * exponent for q[i] would be q0-24*i.
493 // *
494 // * PIo2[] double precision array, obtained by cutting pi/2
495 // * into 24 bits chunks.
496 // *
497 // * f[] ipio2[] in floating point
498 // *
499 // * iq[] integer array by breaking up q[] in 24-bits chunk.
500 // *
501 // * fq[] final product of x*(2/pi) in fq[0],..,fq[jk]
502 // *
503 // * ih integer. If >0 it indicates q[] is >= 0.5, hence
504 // * it also indicates the *sign* of the result.
505 // *
506 // */
507 //
508 // Use PIo2 table(see stubRoutines_aarch64.cpp)
509 //
510 // BEGIN __kernel_rem_pio2 PSEUDO CODE
511 //
512 //static int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, /* NOTE: converted to double */ const double *ipio2 // const int *ipio2) {
513 // int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
514 // double z,fw,f[20],fq[20],q[20];
515 //
516 // /* initialize jk*/
517 // // jk = init_jk[prec]; // NOTE: prec==2 for double. jk is always 4.
518 // jp = jk; // NOTE: always 4
519 //
520 // /* determine jx,jv,q0, note that 3>q0 */
521 // jx = nx-1;
522 // jv = (e0-3)/24; if(jv<0) jv=0;
523 // q0 = e0-24*(jv+1);
524 //
525 // /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
526 // j = jv-jx; m = jx+jk;
527 //
528 // // NOTE: split into two for-loops: one with zeroB and one with ipio2[j]. It
529 // // allows the use of wider loads/stores
530 // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; //(double) ipio2[j];
531 //
532 // // NOTE: unrolled and vectorized "for". See comments in asm code
533 // /* compute q[0],q[1],...q[jk] */
534 // for (i=0;i<=jk;i++) {
535 // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
536 // }
537 //
538 // jz = jk;
539 //recompute:
540 // /* distill q[] into iq[] reversingly */
541 // for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
542 // fw = (double)((int)(twon24* z));
543 // iq[i] = (int)(z-two24B*fw);
544 // z = q[j-1]+fw;
545 // }
546 //
547 // /* compute n */
548 // z = scalbnA(z,q0); /* actual value of z */
549 // z -= 8.0*floor(z*0.125); /* trim off integer >= 8 */
550 // n = (int) z;
551 // z -= (double)n;
552 // ih = 0;
553 // if(q0>0) { /* need iq[jz-1] to determine n */
554 // i = (iq[jz-1]>>(24-q0)); n += i;
555 // iq[jz-1] -= i<<(24-q0);
556 // ih = iq[jz-1]>>(23-q0);
557 // }
558 // else if(q0==0) ih = iq[jz-1]>>23;
559 // else if(z>=0.5) ih=2;
560 //
561 // if(ih>0) { /* q > 0.5 */
562 // n += 1; carry = 0;
563 // for(i=0;i<jz ;i++) { /* compute 1-q */
564 // j = iq[i];
565 // if(carry==0) {
566 // if(j!=0) {
567 // carry = 1; iq[i] = 0x1000000- j;
568 // }
569 // } else iq[i] = 0xffffff - j;
570 // }
571 // if(q0>0) { /* rare case: chance is 1 in 12 */
572 // switch(q0) {
573 // case 1:
574 // iq[jz-1] &= 0x7fffff; break;
575 // case 2:
576 // iq[jz-1] &= 0x3fffff; break;
577 // }
578 // }
579 // if(ih==2) {
580 // z = one - z;
581 // if(carry!=0) z -= scalbnA(one,q0);
582 // }
583 // }
584 //
585 // /* check if recomputation is needed */
586 // if(z==zeroB) {
587 // j = 0;
588 // for (i=jz-1;i>=jk;i--) j |= iq[i];
589 // if(j==0) { /* need recomputation */
590 // for(k=1;iq[jk-k]==0;k++); /* k = no. of terms needed */
591 //
592 // for(i=jz+1;i<=jz+k;i++) { /* add q[jz+1] to q[jz+k] */
593 // f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
594 // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
595 // q[i] = fw;
596 // }
597 // jz += k;
598 // goto recompute;
599 // }
600 // }
601 //
602 // /* chop off zero terms */
603 // if(z==0.0) {
604 // jz -= 1; q0 -= 24;
605 // while(iq[jz]==0) { jz--; q0-=24;}
606 // } else { /* break z into 24-bit if necessary */
607 // z = scalbnA(z,-q0);
608 // if(z>=two24B) {
609 // fw = (double)((int)(twon24*z));
610 // iq[jz] = (int)(z-two24B*fw);
611 // jz += 1; q0 += 24;
612 // iq[jz] = (int) fw;
613 // } else iq[jz] = (int) z ;
614 // }
615 //
616 // /* convert integer "bit" chunk to floating-point value */
617 // fw = scalbnA(one,q0);
618 // for(i=jz;i>=0;i--) {
619 // q[i] = fw*(double)iq[i]; fw*=twon24;
620 // }
621 //
622 // /* compute PIo2[0,...,jp]*q[jz,...,0] */
623 // for(i=jz;i>=0;i--) {
624 // for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
625 // fq[jz-i] = fw;
626 // }
627 //
628 // // NOTE: switch below is eliminated, because prec is always 2 for doubles
629 // /* compress fq[] into y[] */
630 // //switch(prec) {
631 // //case 0:
632 // // fw = 0.0;
633 // // for (i=jz;i>=0;i--) fw += fq[i];
634 // // y[0] = (ih==0)? fw: -fw;
635 // // break;
636 // //case 1:
637 // //case 2:
638 // fw = 0.0;
639 // for (i=jz;i>=0;i--) fw += fq[i];
640 // y[0] = (ih==0)? fw: -fw;
641 // fw = fq[0]-fw;
642 // for (i=1;i<=jz;i++) fw += fq[i];
643 // y[1] = (ih==0)? fw: -fw;
644 // // break;
645 // //case 3: /* painful */
646 // // for (i=jz;i>0;i--) {
647 // // fw = fq[i-1]+fq[i];
648 // // fq[i] += fq[i-1]-fw;
649 // // fq[i-1] = fw;
650 // // }
651 // // for (i=jz;i>1;i--) {
652 // // fw = fq[i-1]+fq[i];
653 // // fq[i] += fq[i-1]-fw;
654 // // fq[i-1] = fw;
655 // // }
656 // // for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
657 // // if(ih==0) {
658 // // y[0] = fq[0]; y[1] = fq[1]; y[2] = fw;
659 // // } else {
660 // // y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
661 // // }
662 // //}
663 // return n&7;
664 //}
665 //
666 // END __kernel_rem_pio2 PSEUDO CODE
667 //
668 // Changes between fdlibm and intrinsic:
669 // 1. One loop is unrolled and vectorized (see comments in code)
670 // 2. One loop is split into 2 loops (see comments in code)
671 // 3. Non-double code is removed(last switch). Sevaral variables became
672 // constants because of that (see comments in code)
673 // 4. Use of jx, which is nx-1 instead of nx
674 // Assumptions:
675 // 1. Assume |X| >= PI/4
676 // Input and output:
677 // 1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
678 // 2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
679 // NOTE: general purpose register names match local variable names in C code
680 // NOTE: fpu registers are actively reused. See comments in code about their usage
generate__kernel_rem_pio2(address two_over_pi,address pio2)681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
682 Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
683 RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
684 INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
685 Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
686 RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
687 CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
688 IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
689 RECOMP_FOR1_CHECK;
690 Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
691 tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
692 jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
693 // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
694 // jx = nx - 1
695 lea(twoOverPiBase, ExternalAddress(two_over_pi));
696 cmpw(jv, zr);
697 addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
698 cselw(jv, jv, zr, GE);
699 fmovd(v26, 0.0);
700 addw(tmp5, jv, 1); // jv+1
701 subsw(j, jv, jx);
702 add(qBase, sp, 320); // base of q[]
703 msubw(rscratch1, i, tmp5, rscratch1); // q0 = e0-24*(jv+1)
704 // use double f[20], fq[20], q[20], iq[20] on stack, which is
705 // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
706 // will contain f[20], fq[20], q[20], iq[20]
707 // now initialize f[20] indexes 0..m (inclusive)
708 // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
709 mov(tmp5, sp);
710
711 block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
712 eorw(i, i, i);
713 br(GE, INIT_F_COPY);
714 bind(INIT_F_ZERO);
715 stpq(v26, v26, Address(post(tmp5, 32)));
716 addw(i, i, 4);
717 addsw(j, j, 4);
718 br(LT, INIT_F_ZERO);
719 subw(i, i, j);
720 movw(j, zr);
721 bind(INIT_F_COPY);
722 add(tmp1, twoOverPiBase, j, LSL, 3); // ipio2[j] start address
723 ld1(v18, v19, v20, v21, T16B, tmp1);
724 add(tmp5, sp, i, ext::uxtx, 3);
725 st1(v18, v19, v20, v21, T16B, tmp5);
726 }
727 // v18..v21 can actually contain f[0..7]
728 cbz(i, SKIP_F_LOAD); // i == 0 => f[i] == f[0] => already loaded
729 ld1(v18, v19, v20, v21, T2D, Address(sp)); // load f[0..7]
730 bind(SKIP_F_LOAD);
731 // calculate 2^q0 and 2^-q0, which we'll need further.
732 // q0 is exponent. So, calculate biased exponent(q0+1023)
733 negw(tmp4, rscratch1);
734 addw(tmp5, rscratch1, 1023);
735 addw(tmp4, tmp4, 1023);
736 // Unroll following for(s) depending on jx in [0,1,2]
737 // for (i=0;i<=jk;i++) {
738 // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
739 // }
740 // Unrolling for jx == 0 case:
741 // q[0] = x[0] * f[0]
742 // q[1] = x[0] * f[1]
743 // q[2] = x[0] * f[2]
744 // q[3] = x[0] * f[3]
745 // q[4] = x[0] * f[4]
746 //
747 // Vectorization for unrolled jx == 0 case:
748 // {q[0], q[1]} = {f[0], f[1]} * x[0]
749 // {q[2], q[3]} = {f[2], f[3]} * x[0]
750 // q[4] = f[4] * x[0]
751 //
752 // Unrolling for jx == 1 case:
753 // q[0] = x[0] * f[1] + x[1] * f[0]
754 // q[1] = x[0] * f[2] + x[1] * f[1]
755 // q[2] = x[0] * f[3] + x[1] * f[2]
756 // q[3] = x[0] * f[4] + x[1] * f[3]
757 // q[4] = x[0] * f[5] + x[1] * f[4]
758 //
759 // Vectorization for unrolled jx == 1 case:
760 // {q[0], q[1]} = {f[0], f[1]} * x[1]
761 // {q[2], q[3]} = {f[2], f[3]} * x[1]
762 // q[4] = f[4] * x[1]
763 // {q[0], q[1]} += {f[1], f[2]} * x[0]
764 // {q[2], q[3]} += {f[3], f[4]} * x[0]
765 // q[4] += f[5] * x[0]
766 //
767 // Unrolling for jx == 2 case:
768 // q[0] = x[0] * f[2] + x[1] * f[1] + x[2] * f[0]
769 // q[1] = x[0] * f[3] + x[1] * f[2] + x[2] * f[1]
770 // q[2] = x[0] * f[4] + x[1] * f[3] + x[2] * f[2]
771 // q[3] = x[0] * f[5] + x[1] * f[4] + x[2] * f[3]
772 // q[4] = x[0] * f[6] + x[1] * f[5] + x[2] * f[4]
773 //
774 // Vectorization for unrolled jx == 2 case:
775 // {q[0], q[1]} = {f[0], f[1]} * x[2]
776 // {q[2], q[3]} = {f[2], f[3]} * x[2]
777 // q[4] = f[4] * x[2]
778 // {q[0], q[1]} += {f[1], f[2]} * x[1]
779 // {q[2], q[3]} += {f[3], f[4]} * x[1]
780 // q[4] += f[5] * x[1]
781 // {q[0], q[1]} += {f[2], f[3]} * x[0]
782 // {q[2], q[3]} += {f[4], f[5]} * x[0]
783 // q[4] += f[6] * x[0]
784 block_comment("unrolled and vectorized computation of q[0]..q[jk]"); {
785 cmpw(jx, 1);
786 lsl(tmp5, tmp5, 52); // now it's 2^q0 double value
787 lsl(tmp4, tmp4, 52); // now it's 2^-q0 double value
788 br(LT, JX_IS_0);
789 add(i, sp, 8);
790 ldpq(v26, v27, i); // load f[1..4]
791 br(GT, JX_IS_2);
792 // jx == 1
793 fmulxvs(v28, T2D, v18, v7); // f[0,1] * x[1]
794 fmulxvs(v29, T2D, v19, v7); // f[2,3] * x[1]
795 fmuld(v30, v20, v7); // f[4] * x[1]
796 fmlavs(v28, T2D, v26, v6, 0);
797 fmlavs(v29, T2D, v27, v6, 0);
798 fmlavs(v30, T2D, v6, v20, 1); // v30 += f[5] * x[0]
799 b(Q_DONE);
800 bind(JX_IS_2);
801 fmulxvs(v28, T2D, v18, v3); // f[0,1] * x[2]
802 fmulxvs(v29, T2D, v19, v3); // f[2,3] * x[2]
803 fmuld(v30, v20, v3); // f[4] * x[2]
804 fmlavs(v28, T2D, v26, v7, 0);
805 fmlavs(v29, T2D, v27, v7, 0);
806 fmlavs(v30, T2D, v7, v20, 1); // v30 += f[5] * x[1]
807 fmlavs(v28, T2D, v19, v6, 0);
808 fmlavs(v29, T2D, v20, v6, 0);
809 fmlavs(v30, T2D, v6, v21, 0); // v30 += f[6] * x[0]
810 b(Q_DONE);
811 bind(JX_IS_0);
812 fmulxvs(v28, T2D, v18, v6); // f[0,1] * x[0]
813 fmulxvs(v29, T2D, v19, v6); // f[2,3] * x[0]
814 fmuld(v30, v20, v6); // f[4] * x[0]
815 bind(Q_DONE);
816 st1(v28, v29, v30, T2D, Address(qBase)); // save calculated q[0]...q[jk]
817 }
818 movz(i, 0x3E70, 48);
819 movw(jz, 4);
820 fmovd(v17, i); // v17 = twon24
821 fmovd(v30, tmp5); // 2^q0
822 fmovd(v21, 0.125);
823 fmovd(v20, 8.0);
824 fmovd(v22, tmp4); // 2^-q0
825
826 block_comment("recompute loop"); {
827 bind(RECOMPUTE);
828 // for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
829 // fw = (double)((int)(twon24* z));
830 // iq[i] = (int)(z-two24A*fw);
831 // z = q[j-1]+fw;
832 // }
833 block_comment("distill q[] into iq[] reversingly"); {
834 eorw(i, i, i);
835 movw(j, jz);
836 add(tmp2, qBase, jz, LSL, 3); // q[jz] address
837 ldrd(v18, post(tmp2, -8)); // z = q[j] and moving address to q[j-1]
838 bind(RECOMP_FIRST_FOR);
839 ldrd(v27, post(tmp2, -8));
840 fmuld(v29, v17, v18); // twon24*z
841 frintzd(v29, v29); // (double)(int)
842 fmsubd(v28, v10, v29, v18); // v28 = z-two24A*fw
843 fcvtzdw(tmp1, v28); // (int)(z-two24A*fw)
844 strw(tmp1, Address(iqBase, i, Address::lsl(2)));
845 faddd(v18, v27, v29);
846 add(i, i, 1);
847 subs(j, j, 1);
848 br(GT, RECOMP_FIRST_FOR);
849 }
850 // compute n
851 fmuld(v18, v18, v30);
852 fmuld(v2, v18, v21);
853 frintmd(v2, v2); // v2 = floor(v2) == rounding towards -inf
854 fmsubd(v18, v2, v20, v18); // z -= 8.0*floor(z*0.125);
855 movw(ih, 2);
856 frintzd(v2, v18); // v2 = (double)((int)z)
857 fcvtzdw(n, v18); // n = (int) z;
858 fsubd(v18, v18, v2); // z -= (double)n;
859
860 block_comment("q0-dependent initialization"); {
861 cmpw(rscratch1, 0); // if (q0 > 0)
862 br(LT, Q0_ZERO_CMP_LT);
863 subw(j, jz, 1); // j = jz - 1
864 ldrw(tmp2, Address(iqBase, j, Address::lsl(2))); // tmp2 = iq[jz-1]
865 br(EQ, Q0_ZERO_CMP_EQ);
866 movw(tmp4, 24);
867 subw(tmp4, tmp4, rscratch1); // == 24 - q0
868 lsrvw(i, tmp2, tmp4); // i = iq[jz-1] >> (24-q0)
869 lslvw(tmp5, i, tmp4);
870 subw(tmp2, tmp2, tmp5); // iq[jz-1] -= i<<(24-q0);
871 strw(tmp2, Address(iqBase, j, Address::lsl(2))); // store iq[jz-1]
872 subw(rscratch2, tmp4, 1); // == 23 - q0
873 addw(n, n, i); // n+=i
874 lsrvw(ih, tmp2, rscratch2); // ih = iq[jz-1] >> (23-q0)
875 b(Q0_ZERO_CMP_DONE);
876 bind(Q0_ZERO_CMP_EQ);
877 lsr(ih, tmp2, 23); // ih = iq[z-1] >> 23
878 b(Q0_ZERO_CMP_DONE);
879 bind(Q0_ZERO_CMP_LT);
880 fmovd(v4, 0.5);
881 fcmpd(v18, v4);
882 cselw(ih, zr, ih, LT); // if (z<0.5) ih = 0
883 }
884 bind(Q0_ZERO_CMP_DONE);
885 cmpw(ih, zr);
886 br(LE, IH_HANDLED);
887
888 block_comment("if(ih>) {"); {
889 // use rscratch2 as carry
890
891 block_comment("for(i=0;i<jz ;i++) {...}"); {
892 addw(n, n, 1);
893 eorw(i, i, i);
894 eorw(rscratch2, rscratch2, rscratch2);
895 bind(IH_FOR);
896 ldrw(j, Address(iqBase, i, Address::lsl(2))); // j = iq[i]
897 movw(tmp3, 0x1000000);
898 subw(tmp3, tmp3, rscratch2);
899 cbnzw(rscratch2, IH_FOR_STORE);
900 cbzw(j, IH_FOR_INCREMENT);
901 movw(rscratch2, 1);
902 bind(IH_FOR_STORE);
903 subw(tmp3, tmp3, j);
904 strw(tmp3, Address(iqBase, i, Address::lsl(2))); // iq[i] = 0xffffff - j
905 bind(IH_FOR_INCREMENT);
906 addw(i, i, 1);
907 cmpw(i, jz);
908 br(LT, IH_FOR);
909 }
910
911 block_comment("if(q0>0) {"); {
912 cmpw(rscratch1, zr);
913 br(LE, IH_AFTER_SWITCH);
914 // tmp3 still has iq[jz-1] value. no need to reload
915 // now, zero high tmp3 bits (rscratch1 number of bits)
916 movw(j, -1);
917 subw(i, jz, 1); // set i to jz-1
918 lsrv(j, j, rscratch1);
919 andw(tmp3, tmp3, j, LSR, 8); // we have 24-bit-based constants
920 strw(tmp3, Address(iqBase, i, Address::lsl(2))); // save iq[jz-1]
921 }
922 bind(IH_AFTER_SWITCH);
923 cmpw(ih, 2);
924 br(NE, IH_HANDLED);
925
926 block_comment("if(ih==2) {"); {
927 fmovd(v25, 1.0);
928 fsubd(v18, v25, v18); // z = one - z;
929 cbzw(rscratch2, IH_HANDLED);
930 fsubd(v18, v18, v30); // z -= scalbnA(one,q0);
931 }
932 }
933 bind(IH_HANDLED);
934 // check if recomputation is needed
935 fcmpd(v18, 0.0);
936 br(NE, RECOMP_CHECK_DONE_NOT_ZERO);
937
938 block_comment("if(z==zeroB) {"); {
939
940 block_comment("for (i=jz-1;i>=jk;i--) j |= iq[i];"); {
941 subw(i, jz, 1);
942 eorw(j, j, j);
943 b(RECOMP_FOR1_CHECK);
944 bind(RECOMP_FOR1);
945 ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
946 orrw(j, j, tmp1);
947 subw(i, i, 1);
948 bind(RECOMP_FOR1_CHECK);
949 cmpw(i, 4);
950 br(GE, RECOMP_FOR1);
951 }
952 cbnzw(j, RECOMP_CHECK_DONE);
953
954 block_comment("if(j==0) {"); {
955 // for(k=1;iq[jk-k]==0;k++); // let's unroll it. jk == 4. So, read
956 // iq[3], iq[2], iq[1], iq[0] until non-zero value
957 ldp(tmp1, tmp3, iqBase); // iq[0..3]
958 movw(j, 2);
959 cmp(tmp3, zr);
960 csel(tmp1, tmp1, tmp3, EQ); // set register for further consideration
961 cselw(j, j, zr, EQ); // set initial k. Use j as k
962 cmp(zr, tmp1, LSR, 32);
963 addw(i, jz, 1);
964 csincw(j, j, j, NE);
965
966 block_comment("for(i=jz+1;i<=jz+k;i++) {...}"); {
967 addw(jz, i, j); // i = jz+1, j = k-1. j+i = jz+k (which is a new jz)
968 bind(RECOMP_FOR2);
969 addw(tmp1, jv, i);
970 ldrd(v29, Address(twoOverPiBase, tmp1, Address::lsl(3)));
971 addw(tmp2, jx, i);
972 strd(v29, Address(sp, tmp2, Address::lsl(3)));
973 // f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
974 // since jx = 0, 1 or 2 we can unroll it:
975 // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
976 // f[jx+i-j] == (for first iteration) f[jx+i], which is already v29
977 add(tmp2, sp, tmp2, ext::uxtx, 3); // address of f[jx+i]
978 ldpd(v4, v5, Address(tmp2, -16)); // load f[jx+i-2] and f[jx+i-1]
979 fmuld(v26, v6, v29); // initial fw
980 cbzw(jx, RECOMP_FW_UPDATED);
981 fmaddd(v26, v7, v5, v26);
982 cmpw(jx, 1);
983 br(EQ, RECOMP_FW_UPDATED);
984 fmaddd(v26, v3, v4, v26);
985 bind(RECOMP_FW_UPDATED);
986 strd(v26, Address(qBase, i, Address::lsl(3))); // q[i] = fw;
987 addw(i, i, 1);
988 cmpw(i, jz); // jz here is "old jz" + k
989 br(LE, RECOMP_FOR2);
990 }
991 b(RECOMPUTE);
992 }
993 }
994 }
995 bind(RECOMP_CHECK_DONE);
996 // chop off zero terms
997 fcmpd(v18, 0.0);
998 br(EQ, Z_IS_ZERO);
999
1000 block_comment("else block of if(z==0.0) {"); {
1001 bind(RECOMP_CHECK_DONE_NOT_ZERO);
1002 fmuld(v18, v18, v22);
1003 fcmpd(v18, v10); // v10 is stil two24A
1004 br(LT, Z_IS_LESS_THAN_TWO24B);
1005 fmuld(v1, v18, v17); // twon24*z
1006 frintzd(v1, v1); // v1 = (double)(int)(v1)
1007 fmsubd(v2, v10, v1, v18);
1008 fcvtzdw(tmp3, v1); // (int)fw
1009 fcvtzdw(tmp2, v2); // double to int
1010 strw(tmp2, Address(iqBase, jz, Address::lsl(2)));
1011 addw(rscratch1, rscratch1, 24);
1012 addw(jz, jz, 1);
1013 strw(tmp3, Address(iqBase, jz, Address::lsl(2))); // iq[jz] = (int) fw
1014 b(Z_ZERO_CHECK_DONE);
1015 bind(Z_IS_LESS_THAN_TWO24B);
1016 fcvtzdw(tmp3, v18); // (int)z
1017 strw(tmp3, Address(iqBase, jz, Address::lsl(2))); // iq[jz] = (int) z
1018 b(Z_ZERO_CHECK_DONE);
1019 }
1020
1021 block_comment("if(z==0.0) {"); {
1022 bind(Z_IS_ZERO);
1023 subw(jz, jz, 1);
1024 ldrw(tmp1, Address(iqBase, jz, Address::lsl(2)));
1025 subw(rscratch1, rscratch1, 24);
1026 cbz(tmp1, Z_IS_ZERO);
1027 }
1028 bind(Z_ZERO_CHECK_DONE);
1029 // convert integer "bit" chunk to floating-point value
1030 // v17 = twon24
1031 // update v30, which was scalbnA(1.0, <old q0>);
1032 addw(tmp2, rscratch1, 1023); // biased exponent
1033 lsl(tmp2, tmp2, 52); // put at correct position
1034 mov(i, jz);
1035 fmovd(v30, tmp2);
1036
1037 block_comment("for(i=jz;i>=0;i--) {q[i] = fw*(double)iq[i]; fw*=twon24;}"); {
1038 bind(CONVERTION_FOR);
1039 ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
1040 scvtfwd(v31, tmp1);
1041 fmuld(v31, v31, v30);
1042 strd(v31, Address(qBase, i, Address::lsl(3)));
1043 fmuld(v30, v30, v17);
1044 subsw(i, i, 1);
1045 br(GE, CONVERTION_FOR);
1046 }
1047 add(rscratch2, sp, 160); // base for fq
1048 // reusing twoOverPiBase
1049 lea(twoOverPiBase, ExternalAddress(pio2));
1050
1051 block_comment("compute PIo2[0,...,jp]*q[jz,...,0]. for(i=jz;i>=0;i--) {...}"); {
1052 movw(i, jz);
1053 movw(tmp2, zr); // tmp2 will keep jz - i == 0 at start
1054 bind(COMP_FOR);
1055 // for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
1056 fmovd(v30, 0.0);
1057 add(tmp5, qBase, i, LSL, 3); // address of q[i+k] for k==0
1058 movw(tmp3, 4);
1059 movw(tmp4, zr); // used as k
1060 cmpw(tmp2, 4);
1061 add(tmp1, qBase, i, LSL, 3); // used as q[i] address
1062 cselw(tmp3, tmp2, tmp3, LE); // min(jz - i, jp)
1063
1064 block_comment("for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];"); {
1065 bind(COMP_INNER_LOOP);
1066 ldrd(v18, Address(tmp1, tmp4, Address::lsl(3))); // q[i+k]
1067 ldrd(v19, Address(twoOverPiBase, tmp4, Address::lsl(3))); // PIo2[k]
1068 fmaddd(v30, v18, v19, v30); // fw += PIo2[k]*q[i+k];
1069 addw(tmp4, tmp4, 1); // k++
1070 cmpw(tmp4, tmp3);
1071 br(LE, COMP_INNER_LOOP);
1072 }
1073 strd(v30, Address(rscratch2, tmp2, Address::lsl(3))); // fq[jz-i]
1074 add(tmp2, tmp2, 1);
1075 subsw(i, i, 1);
1076 br(GE, COMP_FOR);
1077 }
1078
1079 block_comment("switch(prec) {...}. case 2:"); {
1080 // compress fq into y[]
1081 // remember prec == 2
1082
1083 block_comment("for (i=jz;i>=0;i--) fw += fq[i];"); {
1084 fmovd(v4, 0.0);
1085 mov(i, jz);
1086 bind(FW_FOR1);
1087 ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
1088 subsw(i, i, 1);
1089 faddd(v4, v4, v1);
1090 br(GE, FW_FOR1);
1091 }
1092 bind(FW_FOR1_DONE);
1093 // v1 contains fq[0]. so, keep it so far
1094 fsubd(v5, v1, v4); // fw = fq[0] - fw
1095 cbzw(ih, FW_Y0_NO_NEGATION);
1096 fnegd(v4, v4);
1097 bind(FW_Y0_NO_NEGATION);
1098
1099 block_comment("for (i=1;i<=jz;i++) fw += fq[i];"); {
1100 movw(i, 1);
1101 cmpw(jz, 1);
1102 br(LT, FW_FOR2_DONE);
1103 bind(FW_FOR2);
1104 ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
1105 addw(i, i, 1);
1106 cmp(i, jz);
1107 faddd(v5, v5, v1);
1108 br(LE, FW_FOR2);
1109 }
1110 bind(FW_FOR2_DONE);
1111 cbz(ih, FW_Y1_NO_NEGATION);
1112 fnegd(v5, v5);
1113 bind(FW_Y1_NO_NEGATION);
1114 add(sp, sp, 560);
1115 }
1116 }
1117
1118 ///* __kernel_sin( x, y, iy)
1119 // * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
1120 // * Input x is assumed to be bounded by ~pi/4 in magnitude.
1121 // * Input y is the tail of x.
1122 // * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
1123 // *
1124 // * Algorithm
1125 // * 1. Since sin(-x) = -sin(x), we need only to consider positive x.
1126 // * 2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
1127 // * 3. sin(x) is approximated by a polynomial of degree 13 on
1128 // * [0,pi/4]
1129 // * 3 13
1130 // * sin(x) ~ x + S1*x + ... + S6*x
1131 // * where
1132 // *
1133 // * |sin(x) 2 4 6 8 10 12 | -58
1134 // * |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x +S6*x )| <= 2
1135 // * | x |
1136 // *
1137 // * 4. sin(x+y) = sin(x) + sin'(x')*y
1138 // * ~ sin(x) + (1-x*x/2)*y
1139 // * For better accuracy, let
1140 // * 3 2 2 2 2
1141 // * r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
1142 // * then 3 2
1143 // * sin(x) = x + (S1*x + (x *(r-y/2)+y))
1144 // */
1145 //static const double
1146 //S1 = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
1147 //S2 = 8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
1148 //S3 = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
1149 //S4 = 2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
1150 //S5 = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
1151 //S6 = 1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
1152 //
1153 // NOTE: S1..S6 were moved into a table: StubRoutines::aarch64::_dsin_coef
1154 //
1155 // BEGIN __kernel_sin PSEUDO CODE
1156 //
1157 //static double __kernel_sin(double x, double y, bool iy)
1158 //{
1159 // double z,r,v;
1160 //
1161 // // NOTE: not needed. moved to dsin/dcos
1162 // //int ix;
1163 // //ix = high(x)&0x7fffffff; /* high word of x */
1164 //
1165 // // NOTE: moved to dsin/dcos
1166 // //if(ix<0x3e400000) /* |x| < 2**-27 */
1167 // // {if((int)x==0) return x;} /* generate inexact */
1168 //
1169 // z = x*x;
1170 // v = z*x;
1171 // r = S2+z*(S3+z*(S4+z*(S5+z*S6)));
1172 // if(iy==0) return x+v*(S1+z*r);
1173 // else return x-((z*(half*y-v*r)-y)-v*S1);
1174 //}
1175 //
1176 // END __kernel_sin PSEUDO CODE
1177 //
1178 // Changes between fdlibm and intrinsic:
1179 // 1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos
1180 // 2. Constants are now loaded from table dsin_coef
1181 // 3. C code parameter "int iy" was modified to "bool iyIsOne", because
1182 // iy is always 0 or 1. Also, iyIsOne branch was moved into
1183 // generation phase instead of taking it during code execution
1184 // Input ans output:
1185 // 1. Input for generated function: X argument = x
1186 // 2. Input for generator: x = register to read argument from, iyIsOne
1187 // = flag to use low argument low part or not, dsin_coef = coefficients
1188 // table address
1189 // 3. Return sin(x) value in v0
generate_kernel_sin(FloatRegister x,bool iyIsOne,address dsin_coef)1190 void MacroAssembler::generate_kernel_sin(FloatRegister x, bool iyIsOne,
1191 address dsin_coef) {
1192 FloatRegister y = v5, z = v6, v = v7, r = v16, S1 = v17, S2 = v18,
1193 S3 = v19, S4 = v20, S5 = v21, S6 = v22, half = v23;
1194 lea(rscratch2, ExternalAddress(dsin_coef));
1195 ldpd(S5, S6, Address(rscratch2, 32));
1196 fmuld(z, x, x); // z = x*x;
1197 ld1(S1, S2, S3, S4, T1D, Address(rscratch2));
1198 fmuld(v, z, x); // v = z*x;
1199
1200 block_comment("calculate r = S2+z*(S3+z*(S4+z*(S5+z*S6)))"); {
1201 fmaddd(r, z, S6, S5);
1202 // initialize "half" in current block to utilize 2nd FPU. However, it's
1203 // not a part of this block
1204 fmovd(half, 0.5);
1205 fmaddd(r, z, r, S4);
1206 fmaddd(r, z, r, S3);
1207 fmaddd(r, z, r, S2);
1208 }
1209
1210 if (!iyIsOne) {
1211 // return x+v*(S1+z*r);
1212 fmaddd(S1, z, r, S1);
1213 fmaddd(v0, v, S1, x);
1214 } else {
1215 // return x-((z*(half*y-v*r)-y)-v*S1);
1216 fmuld(S6, half, y); // half*y
1217 fmsubd(S6, v, r, S6); // half*y-v*r
1218 fmsubd(S6, z, S6, y); // y - z*(half*y-v*r) = - (z*(half*y-v*r)-y)
1219 fmaddd(S6, v, S1, S6); // - (z*(half*y-v*r)-y) + v*S1 == -((z*(half*y-v*r)-y)-v*S1)
1220 faddd(v0, x, S6);
1221 }
1222 }
1223
1224 ///*
1225 // * __kernel_cos( x, y )
1226 // * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
1227 // * Input x is assumed to be bounded by ~pi/4 in magnitude.
1228 // * Input y is the tail of x.
1229 // *
1230 // * Algorithm
1231 // * 1. Since cos(-x) = cos(x), we need only to consider positive x.
1232 // * 2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
1233 // * 3. cos(x) is approximated by a polynomial of degree 14 on
1234 // * [0,pi/4]
1235 // * 4 14
1236 // * cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
1237 // * where the remez error is
1238 // *
1239 // * | 2 4 6 8 10 12 14 | -58
1240 // * |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x +C6*x )| <= 2
1241 // * | |
1242 // *
1243 // * 4 6 8 10 12 14
1244 // * 4. let r = C1*x +C2*x +C3*x +C4*x +C5*x +C6*x , then
1245 // * cos(x) = 1 - x*x/2 + r
1246 // * since cos(x+y) ~ cos(x) - sin(x)*y
1247 // * ~ cos(x) - x*y,
1248 // * a correction term is necessary in cos(x) and hence
1249 // * cos(x+y) = 1 - (x*x/2 - (r - x*y))
1250 // * For better accuracy when x > 0.3, let qx = |x|/4 with
1251 // * the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
1252 // * Then
1253 // * cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
1254 // * Note that 1-qx and (x*x/2-qx) is EXACT here, and the
1255 // * magnitude of the latter is at least a quarter of x*x/2,
1256 // * thus, reducing the rounding error in the subtraction.
1257 // */
1258 //
1259 //static const double
1260 //C1 = 4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
1261 //C2 = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
1262 //C3 = 2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
1263 //C4 = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
1264 //C5 = 2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
1265 //C6 = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
1266 //
1267 // NOTE: C1..C6 were moved into a table: StubRoutines::aarch64::_dcos_coef
1268 //
1269 // BEGIN __kernel_cos PSEUDO CODE
1270 //
1271 //static double __kernel_cos(double x, double y)
1272 //{
1273 // double a,h,z,r,qx=0;
1274 //
1275 // // NOTE: ix is already initialized in dsin/dcos. Reuse value from register
1276 // //int ix;
1277 // //ix = high(x)&0x7fffffff; /* ix = |x|'s high word*/
1278 //
1279 // // NOTE: moved to dsin/dcos
1280 // //if(ix<0x3e400000) { /* if x < 2**27 */
1281 // // if(((int)x)==0) return one; /* generate inexact */
1282 // //}
1283 //
1284 // z = x*x;
1285 // r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
1286 // if(ix < 0x3FD33333) /* if |x| < 0.3 */
1287 // return one - (0.5*z - (z*r - x*y));
1288 // else {
1289 // if(ix > 0x3fe90000) { /* x > 0.78125 */
1290 // qx = 0.28125;
1291 // } else {
1292 // set_high(&qx, ix-0x00200000); /* x/4 */
1293 // set_low(&qx, 0);
1294 // }
1295 // h = 0.5*z-qx;
1296 // a = one-qx;
1297 // return a - (h - (z*r-x*y));
1298 // }
1299 //}
1300 //
1301 // END __kernel_cos PSEUDO CODE
1302 //
1303 // Changes between fdlibm and intrinsic:
1304 // 1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos
1305 // 2. Constants are now loaded from table dcos_coef
1306 // Input and output:
1307 // 1. Input for generated function: X argument = x
1308 // 2. Input for generator: x = register to read argument from, dcos_coef
1309 // = coefficients table address
1310 // 2. Return cos(x) value in v0
generate_kernel_cos(FloatRegister x,address dcos_coef)1311 void MacroAssembler::generate_kernel_cos(FloatRegister x, address dcos_coef) {
1312 Register ix = r3;
1313 FloatRegister qx = v1, h = v2, a = v3, y = v5, z = v6, r = v7, C1 = v18,
1314 C2 = v19, C3 = v20, C4 = v21, C5 = v22, C6 = v23, one = v25, half = v26;
1315 Label IX_IS_LARGE, SET_QX_CONST, DONE, QX_SET;
1316 lea(rscratch2, ExternalAddress(dcos_coef));
1317 ldpd(C5, C6, Address(rscratch2, 32)); // load C5, C6
1318 fmuld(z, x, x); // z=x^2
1319 ld1(C1, C2, C3, C4, T1D, Address(rscratch2)); // load C1..C3\4
1320 block_comment("calculate r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))))"); {
1321 fmaddd(r, z, C6, C5);
1322 fmovd(half, 0.5);
1323 fmaddd(r, z, r, C4);
1324 fmuld(y, x, y);
1325 fmaddd(r, z, r, C3);
1326 mov(rscratch1, 0x3FD33333);
1327 fmaddd(r, z, r, C2);
1328 fmuld(x, z, z); // x = z^2
1329 fmaddd(r, z, r, C1); // r = C1+z(C2+z(C4+z(C5+z*C6)))
1330 }
1331 // need to multiply r by z to have "final" r value
1332 fmovd(one, 1.0);
1333 cmp(ix, rscratch1);
1334 br(GT, IX_IS_LARGE);
1335 block_comment("if(ix < 0x3FD33333) return one - (0.5*z - (z*r - x*y))"); {
1336 // return 1.0 - (0.5*z - (z*r - x*y)) = 1.0 - (0.5*z + (x*y - z*r))
1337 fmsubd(v0, x, r, y);
1338 fmaddd(v0, half, z, v0);
1339 fsubd(v0, one, v0);
1340 b(DONE);
1341 }
1342 block_comment("if(ix >= 0x3FD33333)"); {
1343 bind(IX_IS_LARGE);
1344 movz(rscratch2, 0x3FE9, 16);
1345 cmp(ix, rscratch2);
1346 br(GT, SET_QX_CONST);
1347 block_comment("set_high(&qx, ix-0x00200000); set_low(&qx, 0);"); {
1348 subw(rscratch2, ix, 0x00200000);
1349 lsl(rscratch2, rscratch2, 32);
1350 fmovd(qx, rscratch2);
1351 }
1352 b(QX_SET);
1353 bind(SET_QX_CONST);
1354 block_comment("if(ix > 0x3fe90000) qx = 0.28125;"); {
1355 fmovd(qx, 0.28125);
1356 }
1357 bind(QX_SET);
1358 fnmsub(C6, x, r, y); // z*r - xy
1359 fnmsub(h, half, z, qx); // h = 0.5*z - qx
1360 fsubd(a, one, qx); // a = 1-qx
1361 fsubd(C6, h, C6); // = h - (z*r - x*y)
1362 fsubd(v0, a, C6);
1363 }
1364 bind(DONE);
1365 }
1366
1367 // generate_dsin_dcos creates stub for dsin and dcos
1368 // Generation is done via single call because dsin and dcos code is almost the
1369 // same(see C code below). These functions work as follows:
1370 // 1) handle corner cases: |x| ~< pi/4, x is NaN or INF, |x| < 2**-27
1371 // 2) perform argument reduction if required
1372 // 3) call kernel_sin or kernel_cos which approximate sin/cos via polynomial
1373 //
1374 // BEGIN dsin/dcos PSEUDO CODE
1375 //
1376 //dsin_dcos(jdouble x, bool isCos) {
1377 // double y[2],z=0.0;
1378 // int n, ix;
1379 //
1380 // /* High word of x. */
1381 // ix = high(x);
1382 //
1383 // /* |x| ~< pi/4 */
1384 // ix &= 0x7fffffff;
1385 // if(ix <= 0x3fe921fb) return isCos ? __kernel_cos : __kernel_sin(x,z,0);
1386 //
1387 // /* sin/cos(Inf or NaN) is NaN */
1388 // else if (ix>=0x7ff00000) return x-x;
1389 // else if (ix<0x3e400000) { /* if ix < 2**27 */
1390 // if(((int)x)==0) return isCos ? one : x; /* generate inexact */
1391 // }
1392 // /* argument reduction needed */
1393 // else {
1394 // n = __ieee754_rem_pio2(x,y);
1395 // switch(n&3) {
1396 // case 0: return isCos ? __kernel_cos(y[0],y[1]) : __kernel_sin(y[0],y[1], true);
1397 // case 1: return isCos ? -__kernel_sin(y[0],y[1],true) : __kernel_cos(y[0],y[1]);
1398 // case 2: return isCos ? -__kernel_cos(y[0],y[1]) : -__kernel_sin(y[0],y[1], true);
1399 // default:
1400 // return isCos ? __kernel_sin(y[0],y[1],1) : -__kernel_cos(y[0],y[1]);
1401 // }
1402 // }
1403 //}
1404 // END dsin/dcos PSEUDO CODE
1405 //
1406 // Changes between fdlibm and intrinsic:
1407 // 1. Moved ix < 2**27 from kernel_sin/kernel_cos into dsin/dcos
1408 // 2. Final switch use equivalent bit checks(tbz/tbnz)
1409 // Input ans output:
1410 // 1. Input for generated function: X = r0
1411 // 2. Input for generator: isCos = generate sin or cos, npio2_hw = address
1412 // of npio2_hw table, two_over_pi = address of two_over_pi table,
1413 // pio2 = address if pio2 table, dsin_coef = address if dsin_coef table,
1414 // dcos_coef = address of dcos_coef table
1415 // 3. Return result in v0
1416 // NOTE: general purpose register names match local variable names in C code
generate_dsin_dcos(bool isCos,address npio2_hw,address two_over_pi,address pio2,address dsin_coef,address dcos_coef)1417 void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
1418 address two_over_pi, address pio2, address dsin_coef, address dcos_coef) {
1419 const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0;
1420
1421 Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
1422 Register X = r0, absX = r1, n = r2, ix = r3;
1423 FloatRegister y0 = v4, y1 = v5;
1424 block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
1425 fmovd(X, v0);
1426 mov(rscratch2, 0x3e400000);
1427 mov(rscratch1, 0x3fe921fb00000000); // pi/4. shifted to reuse later
1428 ubfm(absX, X, 0, 62); // absX
1429 movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48);
1430 cmp(rscratch2, absX, LSR, 32);
1431 lsr(ix, absX, 32); // set ix
1432 br(GT, TINY_X); // handle tiny x (|x| < 2^-27)
1433 cmp(ix, rscratch1, LSR, 32);
1434 br(LE, EARLY_CASE); // if(ix <= 0x3fe921fb) return
1435 cmp(absX, r10);
1436 br(LT, ARG_REDUCTION);
1437 // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
1438 // Set last bit unconditionally to make it NaN
1439 orr(r10, r10, 1);
1440 fmovd(v0, r10);
1441 ret(lr);
1442 }
1443 block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
1444 bind(TINY_X);
1445 if (isCos) {
1446 fmovd(v0, 1.0);
1447 }
1448 ret(lr);
1449 }
1450 bind(ARG_REDUCTION); /* argument reduction needed */
1451 block_comment("n = __ieee754_rem_pio2(x,y);"); {
1452 generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
1453 }
1454 block_comment("switch(n&3) {case ... }"); {
1455 if (isCos) {
1456 eorw(absX, n, n, LSR, 1);
1457 tbnz(n, 0, RETURN_SIN);
1458 } else {
1459 tbz(n, 0, RETURN_SIN);
1460 }
1461 generate_kernel_cos(y0, dcos_coef);
1462 if (isCos) {
1463 tbz(absX, 0, DONE);
1464 } else {
1465 tbz(n, 1, DONE);
1466 }
1467 fnegd(v0, v0);
1468 ret(lr);
1469 bind(RETURN_SIN);
1470 generate_kernel_sin(y0, true, dsin_coef);
1471 if (isCos) {
1472 tbz(absX, 0, DONE);
1473 } else {
1474 tbz(n, 1, DONE);
1475 }
1476 fnegd(v0, v0);
1477 ret(lr);
1478 }
1479 bind(EARLY_CASE);
1480 eor(y1, T8B, y1, y1);
1481 if (isCos) {
1482 generate_kernel_cos(v0, dcos_coef);
1483 } else {
1484 generate_kernel_sin(v0, false, dsin_coef);
1485 }
1486 bind(DONE);
1487 ret(lr);
1488 }
1489