1;
2;
3; MIT License
4; -----------
5;
6; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
7;
8; Permission is hereby granted, free of charge, to any person obtaining a copy
9; of this Software and associated documentaon files (the "Software"), to deal
10; in the Software without restriction, including without limitation the rights
11; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12; copies of the Software, and to permit persons to whom the Software is
13; furnished to do so, subject to the following conditions:
14;
15; The above copyright notice and this permission notice shall be included in
16; all copies or substantial portions of the Software.
17;
18; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24; THE SOFTWARE.
25;
26; An implementation of the remainder by pi/2 function
27; This is a service routine for use by trig functions coded in asm
28;
29; On input,
30;   xmm0 = x;
31; On ouput
32;   xmm0 = r
33;   xmm1 = rr
34;   xmm2 = region
35
36.const
37ALIGN 16
38L__piby2_part3_piby2_lead DQ 03ff921fb54442d18h, 03c91a62633145c06h
39L__piby2_part1            DQ 03ff921fb50000000h, 03ff921fb50000000h
40L__piby2_part2            DQ 03e5110b460000000h, 03e5110b460000000h
41;; constants for CW reduction
42L_piby2_1      DQ 03FF921FB54400000h, 03FF921FB54400000h
43L_piby2_2      DQ 03DD0B4611A600000h, 03DD0B4611A600000h
44L_piby2_3      DQ 03BA3198A2E000000h, 03BA3198A2E000000h
45L_piby2_1tail  DQ 03DD0B4611A626331h, 03DD0B4611A626331h
46L_piby2_2tail  DQ 03BA3198A2E037073h, 03BA3198A2E037073h
47L_piby2_3tail  DQ 0397B839A252049C1h, 0397B839A252049C1h
48L_twobypi      DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h
49L_point_five   DQ 03FE0000000000000h, 03FE0000000000000h
50L_int_three    DQ 00000000000000003h, 00000000000000003h
51L_inf_mask_64  DQ 07FF0000000000000h, 07FF0000000000000h
52L_signbit      DQ 08000000000000000h, 08000000000000000h
53L_int_1        DQ 00000000000000001h, 00000000000000001h
54L_int_15       DQ 0000000000000000Fh
55L_int_48       DQ 00000000000000030h
56L_3pio4        DQ 04002D97C7F3321D2h
57L_5pio4        DQ 0400F6A7A2955385Eh
58L_7pio4        DQ 04015FDBBE9BBA775h
59L_9pio4        DQ 0401c463abeccb2bbh
60ALIGN 16
61L__2_by_pi_bits DB 224, 241,  27, 193,  12,  88,  33, 116
62                DB  53, 126, 196, 126, 237, 175, 169,  75
63                DB  74,  41, 222, 231,  28, 244, 236, 197
64                DB 151, 175,  31, 235, 158, 212, 181, 168
65                DB 127, 121, 154, 253,  24,  61, 221,  38
66                DB  44, 159,  60, 251, 217, 180, 125, 180
67                DB  41, 104,  45,  70, 188, 188,  63,  96
68                DB  22, 120, 255,  95, 226, 127, 236, 160
69                DB 228, 247,  46, 126,  17, 114, 210, 231
70                DB  76,  13, 230,  88,  71, 230,   4, 249
71                DB 125, 209, 154, 192, 113, 166,  19,  18
72                DB 237, 186, 212, 215,   8, 162, 251, 156
73                DB 166, 196, 114, 172, 119, 248, 115,  72
74                DB  70,  39, 168, 187,  36,  25, 128,  75
75                DB  55,   9, 233, 184, 145, 220, 134,  21
76                DB 239, 122, 175, 142,  69, 249,   7,  65
77                DB  14, 241, 100,  86, 138, 109,   3, 119
78                DB 211, 212,  71,  95, 157, 240, 167,  84
79                DB  16,  57, 185,  13, 230, 139,   2,   0
80                DB   0,   0,   0,   0,   0,   0
81
82
83; local storage offsets
84region          EQU 000h
85stack_size      EQU 018h
86sstack_size     EQU 000h   ; no stack for fsname
87
88include fm.inc
89
90fname TEXTEQU <__remainder_piby2_forAsm>
91fsname TEXTEQU <__remainder_piby2_cw_forAsm>
92
93
94.code
95
96; xmm0l has |x|
97PUBLIC fname
98fname PROC FRAME
99    StackAllocate stack_size
100    .ENDPROLOG
101
102    ; This function is not using rdx, r8, and r9 as pointers;
103    ; all returns are in registers
104
105    ; get the unbiased exponent and the mantissa part of x
106    lea       r9,L__2_by_pi_bits
107
108 ;xexp = (x >> 52) - 1023
109    movd      r11,xmm0
110    mov       rcx,r11
111    shr       r11,52
112    sub       r11,1023                 ; r11 <-- xexp = exponent of input x
113
114    ;calculate the last byte from which to start multiplication
115    ;last = 134 - (xexp >> 3)
116    mov       r10,r11
117    shr       r10,3
118    sub       r10,134                  ; r10 <-- -last
119    neg       r10                      ; r10 <-- last
120
121    ; load 64 bits of 2_by_pi
122    mov       rax,[r9 + r10]
123
124    ; mantissa of x = ((x << 12) >> 12) | implied bit
125    shl       rcx,12
126    shr       rcx,12                   ; rcx <-- mantissa part of input x
127    bts       rcx,52                   ; add the implied bit as well
128
129    ; load next 128 bits of 2_by_pi
130    add       r10,8 ;increment to next 8 bytes of 2_by_pi
131    movdqu    xmm0,[r9 + r10]
132
133    ; do three 64-bit multiplications with mant of x
134    mul rcx
135    mov       r8,rax                   ; r8 <-- last 64 bits of mul = res1[2]
136    mov       r10,rdx                  ; r10 <-- carry
137    movd      rax,xmm0
138    mul       rcx
139    ; resexp = xexp & 7
140    and       r11,7                    ; r11 <-- resexp = xexp & 7 = last 3 bits
141    psrldq    xmm0,8
142    add       rax,r10                  ; add the previous carry
143    adc       rdx,0
144    mov       r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
145    mov       r10,rdx                  ; r10 <-- carry
146    movd      rax,xmm0
147    mul       rcx
148    add       r10,rax                  ; r10 <-- most sig. 64 bits = res1[0]
149    ; find the region
150    ; last three bits ltb = most sig bits >> (54 - resexp));
151    ;   decimal point in last 18 bits ==> 8 lsb's in first 64 bits
152    ;   and 8 msb's in next 64 bits
153    ; point_five = ltb & 01h;
154    ; region = ((ltb >> 1) + point_five) & 3;
155    mov       rcx,54
156    mov       rax,r10
157    sub       rcx,r11
158    xor       rdx,rdx                  ; rdx <-- sign of x
159    shr       rax,cl
160    jnc       L__no_point_five
161    ; if there is carry then negate the result of multiplication
162    not       r10
163    not       r9
164    not       r8
165    mov       rdx,08000000000000000h
166
167ALIGN  16
168L__no_point_five:
169    adc       rax,0
170    and       rax,3                    ; rax now has region
171    mov       QWORD PTR [region+rsp],rax
172
173    ; calculate the number of integer bits and zero them out
174    mov       rcx,r11
175    add       rcx,10                   ; rcx = no. of integer bits
176    shl       r10,cl
177    shr       r10,cl                   ; r10 contains only mant bits
178    sub       rcx,64                   ; form the exponent
179    mov       r11,rcx
180
181 ;find the highest set bit
182    bsr       rcx,r10
183    jnz       L__form_mantissa
184    mov       r10,r9
185    mov       r9,r8
186    mov       r8,0
187    bsr       rcx,r10                  ; rcx = hsb
188    sub       r11,64
189
190
191ALIGN  16
192L__form_mantissa:
193    add       r11,rcx                  ; for exp of x
194    sub       rcx,52                   ; rcx = no. of bits to shift in r10
195    cmp       rcx,0
196    jl        L__hsb_below_52
197    je        L__form_numbers
198    ; hsb above 52
199    mov       r8,r10                   ; previous contents of r8 not required
200    shr       r10,cl                   ; r10 = mantissa of x with hsb at 52
201    shr       r9,cl                    ; make space for bits from r10
202    sub       rcx,64
203    neg       rcx
204    ; rcx <-- no of bits to shift r10 to move those bits to r9
205    shl       r8,cl
206    or        r9,r8                    ; r9 = mantissa bits of xx
207    jmp       L__form_numbers
208
209ALIGN  16
210L__hsb_below_52:
211    neg       rcx
212    mov       rax,r9
213    shl       r10,cl
214    shl       r9,cl
215    sub       rcx,64
216    neg       rcx
217    shr       rax,cl
218    or        r10,rax
219    shr       r8,cl
220    or        r9,r8
221
222ALIGN  16
223L__form_numbers:
224    add       r11,1023
225    btr       r10,52                   ; remove the implicit bit
226    mov       rcx,r11
227    or        r10,rdx                  ; put the sign
228    shl       rcx,52
229    or        r10,rcx                  ; r10 <-- x
230
231    movd      xmm0,r10                 ; xmm0 <-- x
232    movdqa    xmm1,xmm0                ; xmm1 <-- x
233    psrlq     xmm1,27
234    psllq     xmm1,27                  ; xmm1 <-- hx
235    movdqa    xmm2,xmm0                ; xmm2 <-- x
236    subsd     xmm2,xmm1                ; xmm2 <-- tx
237    movlhps   xmm0,xmm0                ; xmm0 <-- x,x
238    movlhps   xmm2,xmm1                ; xmm2 <-- hx,tx
239
240    movdqa    xmm1,XMMWORD PTR L__piby2_part3_piby2_lead
241    movdqa    xmm3,XMMWORD PTR L__piby2_part1
242    movdqa    xmm4,XMMWORD PTR L__piby2_part2
243
244    ; form xx
245    xor       rcx,rcx
246    bsr       rcx,r9
247    sub       rcx,64                   ; to shift the implicit bit as well
248    neg       rcx
249    shl       r9,cl
250    shr       r9,12
251    add       rcx,52
252    sub       r11,rcx
253    shl       r11,52
254    or        r9,rdx
255    or        r9,r11
256    movd      xmm5,r9                  ; xmm5 <-- xx
257
258    mulpd     xmm0,xmm1 ; xmm0 <-- piby2_part3 * x,piby2_lead * x = c
259    mulpd     xmm5,xmm1 ; xmm5 <-- piby2_lead * xx
260    mulpd     xmm3,xmm2 ; xmm3 <-- piby2_part1 * hx,piby2_part1 * tx
261    mulpd     xmm4,xmm2 ; xmm4 <-- piby2_part2 * hx,piby2_part2 * tx
262
263    ; cc = (piby2_part1 * hx - c) + (piby2_part1 * tx) +
264    ;   (piby2_part2 * hx) + (piby2_part2 * tx) +
265    ;   (piby2_lead * xx + piby2_part3 * x)
266    movhlps   xmm1,xmm3 ; xmm1 = piby2_part1 * hx
267    movhlps   xmm2,xmm4 ; xmm2 = piby2_part2 * hx
268    subsd     xmm1,xmm0 ; xmm1 = (piby2_part1 * hx - c)
269    addsd     xmm1,xmm3 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx)
270    movhlps   xmm3,xmm0 ; xmm3 = piby2_part3 * x
271    addsd     xmm1,xmm2
272    ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + (piby2_part2 * hx)
273    addsd     xmm3,xmm5 ; xmm3 = (piby2_lead * xx + piby2_part3 * x)
274    addsd     xmm1,xmm4
275    ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) +
276    ;    (piby2_part2 * hx) + (piby2_part2 * tx)
277    addsd     xmm1,xmm3                ; xmm1 = cc
278
279    ; xmm0 <-- c, xmm1 <-- cc
280    ; r = c + cc
281    ; rr = (c - r) + cc
282
283    movdqa    xmm2,xmm0                ; xmm2 <-- copy of c
284    addsd     xmm0,xmm1                ; xmm0 <-- r = c + cc
285    subsd     xmm2,xmm0                ; xmm2 <-- c - r
286    addsd     xmm1,xmm2                ; xmm1 <-- rr = cc + (c - r)
287    mov       rax, QWORD PTR[region+rsp] ; rax <-- region
288
289    StackDeallocate stack_size
290    ret
291
292fname        endp
293
294; NOTE: If this is not going to be used, should probably remove it. - WAT
295ALIGN 16
296PUBLIC fsname
297fsname PROC FRAME
298    StackAllocate sstack_size
299    .ENDPROLOG
300
301; xmm0l has |x|
302; r9 also has |x|
303; ASSUMPTION: if we call this function, |x| > pi/4
304
305    xor       r8d,r8d
306    cmp       r9, QWORD PTR L_5pio4
307    ja        Lax_gt_5pio4
308    cmp       r9, QWORD PTR L_3pio4
309    seta      r8b
310    inc       r8d
311    jmp       Lstage_npi2
312Lax_gt_5pio4:
313    cmp       r9, QWORD PTR L_9pio4
314    ja        Lnpi2_full_computation
315    cmp       r9, QWORD PTR L_7pio4
316    seta      r8b
317    add       r8d,3
318Lstage_npi2:
319    movd      xmm2, r8d
320    cvtdq2pd  xmm4, xmm2
321    jmp       Lnpi2_known
322
323Lnpi2_full_computation:
324;   movapd    xmm1, L_twobypi
325;   movapd    xmm3, L_point_five
326    movapd    xmm5,xmm0
327;   mulsd     xmm5,xmm1
328;   addsd     xmm5,xmm3                   ; xmm5 <-- |x|*2/pi + .5
329    mulsd     xmm5, L_twobypi
330    addsd     xmm5, L_point_five
331
332    cvttpd2dq xmm5,xmm5                   ; xmm5 < npi2 = int part
333    movapd    xmm2,xmm5
334    andpd     xmm2,L_int_three
335    cvtdq2pd  xmm4,xmm5
336
337Lnpi2_known:
338    movapd    xmm5,xmm4
339    mulsd     xmm5,QWORD PTR L_piby2_1    ; xmm5 <-- npi2*piby2_1
340    xorpd     xmm5,L_signbit              ; xmm5 <-- -npi2*piby2_1
341    addpd     xmm5,xmm0                   ; xmm5 <-- rhead = x - npi2*piby2_1
342    movapd    xmm3,xmm4
343    mulsd     xmm3,QWORD PTR L_piby2_1tail ; xmm3 <-- rtail = npi2*piby2_1tail
344
345    ; If x is nearly a multiple of pi/2, rhead will be small compared to |x|
346    ; we check this by checking exponent difference.
347
348    ; Note that both the unbiased exponents are positive, and that of rhead
349    ; must be <= that of |x|
350    movapd    xmm1,xmm5                   ; xmm1l <-- rhead
351    subpd     xmm1,xmm3                   ; xmm1l <-- r = rhead - rtail
352    andpd     xmm1,L_inf_mask_64
353    psubq     xmm0,xmm1                   ; xmm0 <-- |x| - r
354    psrlq     xmm0,52
355    comisd    xmm0,L_int_15
356
357;   movd      rax, xmm5                   ; really a movq
358;   shr       rax, 52
359;   shr       rdx, 52                     ; get exponent of |x| (no and needed)
360;   sub       rdx, rax
361;   cmp       rdx, 15
362    jbe       Lcw_get_r_rr
363
364    ; here expdiff > 15, so x is nearly a multiple of pi/2 and things are hard
365    ; we use another piece of pi/2 in the reduction
366
367    movapd    xmm1,xmm5
368    movapd    xmm3,xmm4
369    mulsd     xmm3,QWORD PTR L_piby2_2 ; xmm3 <--- rtail = npi2*piby2_2
370    subsd     xmm5,xmm3 ; xmm5 <-- rhead = t - rtail
371
372    ; now rtail = npi2*piby2_2tail - ((t-rhead) - rtail)
373    subsd     xmm1,xmm5
374    subsd     xmm1,xmm3
375    movapd    xmm3,xmm4
376    mulsd     xmm3,QWORD PTR L_piby2_2tail
377    subsd     xmm3,xmm1 ; xmm3 <-- rtail
378
379    comisd    xmm0,L_int_48
380;   cmp       rdx, 48
381    jbe       Lcw_get_r_rr
382
383    ; here expdiff > 48, so x is REALLY close to a multiple of pi/2
384    ; and we use yet another piece of pi/2 in the reduction
385
386    movapd    xmm0,xmm5 ; xmm0 <-- t = rhead
387    movapd    xmm3,xmm4
388    mulsd     xmm3,QWORD PTR L_piby2_3 ; xmm3 <-- rtail = npi2 * piby2_3
389    movapd    xmm5,xmm0
390    subsd     xmm5,xmm3 ; xmm5 <-- rhead = t - rtail
391
392    ; now rtail = npi2 * piby2_3tail - ((t - rhead) - rtail)
393    movapd    xmm1,xmm0
394    subsd     xmm1,xmm5
395    subsd     xmm1,xmm3
396    movapd    xmm3,xmm4
397    mulsd     xmm3,QWORD PTR L_piby2_3tail
398    subsd     xmm3,xmm1 ; xmm3 <-- rtail
399
400Lcw_get_r_rr:
401    ; We have a satisfactory rhead in xmm5 and rtail in xmm3
402    ; We now produce r in xmm0 and rr in xmm1, where the actual reduced argument
403    ; is the sum of r and rr, and rr is insignificant
404    ; with respect to r under addition (i.e., r + rr == r).
405    movapd    xmm0,xmm5 ; xmm0 <-- rhead
406    subsd     xmm0,xmm3 ; xmm0 <-- r = rhead - rtail
407    movapd    xmm1,xmm5 ; xmm1 <-- rhead
408    subsd     xmm1,xmm0 ; xmm1 <-- (rhead - r)
409    subsd     xmm1,xmm3 ; xmm1 <-- rr = (rhead - r) - rtail
410    movd      rax,xmm2  ; rax <-- region
411    StackDeallocate sstack_size
412    ret
413fsname        endp
414
415END
416