1#
2# Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16
# Emit the fixed preamble of the generated C file.
#
# The preamble consists of, in order:
#   - a copyright banner comment;
#   - an extern "C" region (C++ only) that includes mth_intrinsics.h and
#     mth_tbldefs.h and defines the all-ones constant vectors Csp1_<n> and
#     Cdp1_<n> for every supported vector length;
#   - the target specific intrinsic headers, selected by the TARGET_* macros
#     of the C compilation (not by the awk TARGET variable);
#   - for targets other than TARGET_X8664 and TARGET_LINUX_POWER, the helpers
#     u8nonzero() and is_zero(), which use lfind() to test whether a block of
#     bytes is entirely zero.  is_zero() is what the fallback mask_all_zero
#     expression in init_target_arrays() calls.
#
# NOTE(review): the TARGET_LINUX_ARM64 branch includes only arm64intrin.h and
# <search.h>, yet u8nonzero() uses assert() and uint8_t; presumably those come
# in through one of the project headers -- confirm.
function print_hdrs()
{
  # The whole preamble is a single string: every source line ends in
  # backslash-newline so the literal continues, and the explicit \n escapes
  # supply the actual line breaks of the output.
  # NOTE(review): backslash-newline inside a string literal is relied upon
  # here; verify the awk implementation in use accepts it.
  print "\
/*\n\
 *     Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n\
 *\n\
 * NVIDIA CORPORATION and its licensors retain all intellectual property\n\
 * and proprietary rights in and to this software, related documentation\n\
 * and any modifications thereto.  Any use, reproduction, disclosure or\n\
 * distribution of this software and related documentation without an express\n\
 * license agreement from NVIDIA CORPORATION is strictly prohibited.\n\
 *\n\
 */\n\
\n\n\
#ifdef __cplusplus\n\
extern \"C\" {\n\
#endif\n\n\
#include \"mth_intrinsics.h\" \n\
#include \"mth_tbldefs.h\" \n\
\n\n\
static const vrs4_t Csp1_4={1.0, 1.0, 1.0, 1.0}; \n\
static const vrd2_t Cdp1_2={1.0, 1.0}; \n\
static const vrs8_t Csp1_8={1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; \n\
static const vrd4_t Cdp1_4={1.0, 1.0, 1.0, 1.0}; \n\
static const vrs16_t Csp1_16={1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, \n\
                             1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; \n\
static const vrd8_t Cdp1_8={1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; \n\
\n\n\
#ifdef __cplusplus\n\
}\n\
#endif\n\
#if defined (TARGET_X8664) \n\
#include \"immintrin.h\" \n\
#elif defined (TARGET_LINUX_POWER) \n\
#include \"altivec.h\" \n\
#elif defined(TARGET_LINUX_ARM64) \n\
#include \"arm64intrin.h\" \n\
#include <search.h> \n\
#else \n\
#include <stddef.h> \n\
#include <stdint.h> \n\
#include <search.h> \n\
#include <assert.h> \n\
#endif\n\
\n\
#if !defined(TARGET_X8664) && !defined(TARGET_LINUX_POWER)\n\
static int u8nonzero(const void *a, const void *b) \n\
{ \n\
  assert(!a); \n\
  assert(b); \n\
  return !(*((uint8_t *)b)); \n\
} \n\
\n\
static inline int is_zero(const void *val, size_t nmemb) \n\
{ \n\
  return !lfind(NULL, val, &nmemb, sizeof(uint8_t), u8nonzero); \n\
} \n\
\n\
#endif \n\
\n\
"
}
79
# Open a C++-only extern "C" region in the generated output.
# Only emitted when TARGET is "ARM64"; every other target prints nothing.
function extern_c_begin()
{
  if (TARGET != "ARM64")
    return

  # Leading \n puts a blank line in front of the guard.
  print "\n#ifdef __cplusplus"
  print "extern \"C\" {"
  print "#endif"
}
88
# Close the C++-only extern "C" region opened by extern_c_begin().
# Only emitted when TARGET is "ARM64"; every other target prints nothing.
function extern_c_end()
{
  if (TARGET != "ARM64")
    return

  # Leading \n puts a blank line in front of the guard.
  print "\n#ifdef __cplusplus"
  print "}"
  print "#endif"
}
97
# Fill in the target dependent lookup tables used by the code generators.
#
# Globals written:
#   divsd[frp sd]   C expression that divides vector x by vector y
#   sqrtsd[frp sd]  C expression that takes the square root of vector x
#   mask_all_zero   C expression that is true when "mask" is entirely zero
#   _mm, __m, _si   (X8664 only) intrinsic name fragments for the vector width
#   frps, sds, iks  key sets walked by the do_all_*() drivers
#
# The table keys are the concatenation of a variant letter (f, r, p) and a
# precision letter (s, d).  VLS and VLD must be set before this is called.
function init_target_arrays()
{
  if (TARGET == "POWER") {
    divsd["fs"] = divsd["fd"] = divsd["ps"] = divsd["pd"] = "vec_div(x, y)"
    # The native and llvm compilers implement relaxed divide on POWER as a
    # reciprocal followed by a multiply, so the "r" variants do the same.
    divsd["rs"] = "vec_mul(x, vec_div(Csp1_" VL("s") ", y))"
    divsd["rd"] = "vec_mul(x, vec_div(Cdp1_" VL("d") ", y))"

    sqrtsd["fs"] = sqrtsd["rs"] = sqrtsd["ps"] = "vec_sqrt(x)"
    sqrtsd["fd"] = sqrtsd["rd"] = sqrtsd["pd"] = "vec_sqrt(x)"

    mask_all_zero = "(vec_all_eq(mask, vec_xor(mask,mask)) == 1)"
  } else if (TARGET == "X8664") {
    # Name fragments: 4 floats -> 128 bit, 8 floats -> 256 bit, else 512 bit.
    _mm = "_mm" (VLS == 4 ? ""    : (VLS == 8 ? "256" : "512"))
    __m = "__m" (VLS == 4 ? "128" : (VLS == 8 ? "256" : "512"))
    _si = "_si" (VLS == 4 ? "128" : (VLS == 8 ? "256" : "512"))

    divsd["fs"] = divsd["ps"] = _mm "_div_ps((" __m ")x, (" __m ")y)"
    divsd["fd"] = divsd["pd"] = _mm "_div_pd((" __m "d)x, (" __m "d)y)"
    # The native and llvm compilers implement relaxed divide on X86-64 as a
    # reciprocal followed by a multiply, so the "r" variants do the same.
    divsd["rs"] = _mm "_mul_ps((" __m ")x, " _mm "_div_ps(Csp1_" VL("s") ", ( " __m ")y))"
    divsd["rd"] = _mm "_mul_pd((" __m "d)x, " _mm "_div_pd(Cdp1_" VL("d") ", ( " __m "d)y))"

    sqrtsd["fs"] = sqrtsd["rs"] = sqrtsd["ps"] = _mm "_sqrt_ps((" __m ")x)"
    sqrtsd["fd"] = sqrtsd["rd"] = sqrtsd["pd"] = _mm "_sqrt_pd((" __m "d)x)"

    # For 128-bit vectors _mm_testz_si128() would be the faster test, but
    # mth_128mask.c is compiled for core2 processors (gcc -march=core2) and
    # the ptest instruction behind it is not available until SSE4.1, hence
    # the compare + movemask sequence for that width.
    if (VLS == 4) {
      mask_all_zero = \
        "(_mm_movemask_ps((__m128) _mm_cmpeq_epi32((__m128i)mask, " \
        "_mm_xor_si128((__m128i)mask,(__m128i)mask))) == 15)"
    } else if (VLS == 8) {
      mask_all_zero = \
        "(_mm256_testz_si256((__m256i)mask, _mm256_set1_epi32(-1)) == 1)"
    } else {
      mask_all_zero = \
        "(_mm512_test_epi32_mask((__m512i)mask, _mm512_set1_epi32(-1)) == 0)"
    }
  } else {
    # Any other target: plain C division on the vector operands, no vector
    # square root (the generated expression asserts), byte-wise mask test.
    divsd["fs"] = divsd["fd"] = divsd["rs"] = "((x) / (y))"
    divsd["rd"] = divsd["ps"] = divsd["pd"] = "((x) / (y))"

    sqrtsd["fs"] = sqrtsd["fd"] = sqrtsd["rs"] = "(assert(!\"vsqrt\"), x)"
    sqrtsd["rd"] = sqrtsd["ps"] = sqrtsd["pd"] = "(assert(!\"vsqrt\"), x)"

    mask_all_zero = "(is_zero(&mask, sizeof mask))"
  }

  # Key sets only; the values are never read.  Insertion order is kept as
  # before because "for (k in array)" order may depend on it.
  frps["f"] = ""
  frps["r"] = ""
  frps["p"] = ""
  sds["s"] = ""
  sds["d"] = ""
  iks["i"] = ""
  iks["k"] = ""
}
200
# Vector length for a precision key: the global VLS when sd is "s",
# the global VLD for any other value.
function VL(sd)
{
  if (sd == "s")
    return VLS
  return VLD
}
205
# Name of the real vector type for precision key sd, e.g. "vrs4_t" or
# "vrd2_t" when VLS is 4 and VLD is 2.
function VR_T(sd) {
  if (sd == "s")
    return "vr" sd VLS "_t"
  return "vr" sd VLD "_t"
}
209
# Name of the integer vector type for precision key sd, e.g. "vis4_t" or
# "vid2_t" when VLS is 4 and VLD is 2.
function VI_T(sd) {
  if (sd == "s")
    return "vi" sd VLS "_t"
  return "vi" sd VLD "_t"
}
213
# Two-way selector: yields a when yarg compares unequal to 0, otherwise b.
# Both a and b are ordinary arguments, so the caller has already evaluated
# them by the time the choice is made.
function arg_ne_0(yarg, a, b)
{
  if (yarg != 0)
    return a
  return b
}
218
# Print the return type and declarator of a masked real-vector function:
#
#   <VR_T>
#   __<frp><sd>_<name>_<VL>_mn(<VR_T> x[, <VR_T> y], <VI_T> mask)
#
# name  math function name
# frp   variant letter used in the symbol name
# sd    precision key ("s" or "d")
# yarg  non-zero when the function takes a second vector argument y
#
# The opening extern "C" guard (if any) is emitted first; the caller prints
# the body and calls extern_c_end().
function func_r_decl(name, frp, sd, yarg)
{
  extern_c_begin()

  print "\n" VR_T(sd)
  if (yarg != 0) {
    print "__" frp sd "_" name "_" VL(sd) "_mn(" VR_T(sd) " x, " VR_T(sd) " y, " VI_T(sd) " mask)"
  } else {
    print "__" frp sd "_" name "_" VL(sd) "_mn(" VR_T(sd) " x, " VI_T(sd) " mask)"
  }
}
229
# Print the complete definition of one masked real-vector function.
#
# name     math function name; "div" and "sqrt" are expanded inline from the
#          divsd/sqrtsd tables, everything else goes through MTH_DISPATCH_TBL
# frp      variant letter (selects the symbol name and the table column)
# sd       precision key ("s" or "d")
# safeval  non-zero: lanes that are masked off are filled with 1.0 (the
#          C?p1_<n> constant) before the call; zero: they are left as 0
# yarg     non-zero when the function takes a second vector argument y
function func_rr_def(name, frp, sd, safeval, yarg) {
  func_r_decl(name, frp, sd, yarg)

  print "{"
  if (yarg != 0) {
    print "  " VR_T(sd) " (*fptr) (" VR_T(sd) ", " VR_T(sd) ");"
  } else {
    print "  " VR_T(sd) " (*fptr) (" VR_T(sd) ");"
  }
  # fptr stays unused in the div/sqrt bodies; the cast silences the warning.
  print "  (void) fptr;"

  # X86-64 tests assume the input vector is returned when the mask is all
  # zero, so x is handed back unchanged in that case.
  print "  if(" mask_all_zero ") return x;"

  # Clear the masked-off lanes of the argument(s), optionally replacing
  # them with 1.0.
  if (safeval != 0) {
    print "  x = (" VR_T(sd) ")((((" VI_T(sd) ")x & mask)) | ((" VI_T(sd) ")C" sd "p1_" VL(sd) " & ~mask));"
    if (yarg != 0) {
      print "  y = (" VR_T(sd) ")((((" VI_T(sd) ")y & mask)) | ((" VI_T(sd) ")C" sd "p1_" VL(sd) " & ~mask));"
    }
  } else {
    print "  x = (" VR_T(sd) ")((((" VI_T(sd) ")x & mask)));"
    if (yarg != 0) {
      print "  y = (" VR_T(sd) ")((((" VI_T(sd) ")y & mask)));"
    }
  }

  # The comma separated print arguments below are deliberate: they keep the
  # OFS separators of the generated text exactly as they have always been.
  if (name == "div") {
    print "  return (", divsd[frp sd], ");"
  } else if (name == "sqrt") {
    print "  return (", sqrtsd[frp sd], ");"
  } else if (yarg != 0) {
    print "  fptr = (" VR_T(sd) "(*) (" VR_T(sd), \
          ", " VR_T(sd), \
          ")) MTH_DISPATCH_TBL[func_" name "][sv_" sd "v" VL(sd) "][frp_" frp "];"
    print "  return (fptr(x", ", y", "));"
  } else {
    print "  fptr = (" VR_T(sd) "(*) (" VR_T(sd), \
          "", \
          ")) MTH_DISPATCH_TBL[func_" name "][sv_" sd "v" VL(sd) "][frp_" frp "];"
    print "  return (fptr(x", "", "));"
  }

  print "}\n"
  extern_c_end()
}
262
# Build the parameter list (without the trailing mask) of a
# pow-with-integer-exponent function and return it as a string.
#
# sd         precision key of the base vector ("s" or "d")
# is_scalar  non-zero: the exponent is one scalar, int32_t for ik == "i" and
#            int64_t otherwise; zero: the exponent is an integer vector
# ik         exponent width key ("i" or "k")
# with_vars  non-zero: append the parameter names (x, iy, iyu, iyl);
#            zero: types only, as needed for a function pointer type
# args       local accumulator -- callers must not pass it
#
# A single-precision base with "k" exponents takes two VI_T("d") vectors
# (iyu, iyl); every other vector case takes one vector, VI_T("s") for
# ik == "i" and VI_T("d") otherwise.
function func_pow_args_nomask(sd, is_scalar, ik, with_vars,    args)
{
  args = VR_T(sd) arg_ne_0(with_vars, " x", "") ", "
  if (is_scalar) {
    args = args ((ik == "i") ? "int32_t" : "int64_t") arg_ne_0(with_vars, " iy", "")
  } else {
    if (sd == "s" && ik == "k") {
      args = args VI_T("d") arg_ne_0(with_vars, " iyu", "") ", " \
             VI_T("d") arg_ne_0(with_vars, " iyl", "")
    } else {
      args = args VI_T(ik == "i" ? "s" : "d") arg_ne_0(with_vars, " iy", "")
    }
  }

  return args
}
279
# Print the return type and declarator of a masked pow-with-integer-exponent
# function:
#
#   <VR_T>
#   __<frp><sd>_<name><ik>[1]_<VL>_mn(<args>, <VI_T> mask)
#
# name       base function name
# frp        variant letter used in the symbol name
# sd         precision key of the base vector ("s" or "d")
# is_scalar  non-zero appends "1" after ik in the symbol name (scalar
#            exponent form)
# ik         exponent width key ("i" or "k")
# sig        local accumulator -- callers must not pass it
#
# The opening extern "C" guard (if any) is emitted first; the caller prints
# the body and calls extern_c_end().
function func_pow_decl(name, frp, sd, is_scalar, ik,    sig)
{
  extern_c_begin()
  print "\n" VR_T(sd)
  sig = "__" frp sd "_" name arg_ne_0(is_scalar, ik "1", ik) "_" VL(sd) "_mn" "("
  sig = sig func_pow_args_nomask(sd, is_scalar, ik, 1)
  sig = sig ", " VI_T(sd) " mask)"
  print sig
}
290
# Print the complete definition of one masked pow-with-integer-exponent
# function.
#
# name       base function name (used for the symbol and the table index)
# frp        variant letter (selects the symbol name and the table column)
# sd         precision key of the base vector ("s" or "d")
# is_scalar  non-zero: scalar exponent, passed through untouched;
#            zero: vector exponent whose masked-off lanes are zeroed
# ik         exponent width key ("i" or "k")
function func_pow_def(name, frp, sd, is_scalar, ik)
{
  func_pow_decl(name, frp, sd, is_scalar, ik)
  print "{"
  print "  "\
        VR_T(sd) " (*fptr) (" func_pow_args_nomask(sd, is_scalar, ik, 0) ");"
  # X86-64 tests assume the input vector is returned when the mask is all
  # zero, so x is handed back unchanged in that case.
  print "  if(" mask_all_zero ") return x;"
  print "  x = ("VR_T(sd) ")((" VI_T(sd) ")x & mask);"
  if (is_scalar == 0) {
    if ((sd == "s" && ik == "i") || (sd == "d" && ik == "k")) {
      # Exponent vector has the same element layout as the mask: a single
      # bitwise AND clears the masked-off lanes.
      print "  iy = iy & mask;"
    } else {
      # Element widths differ, so the generated code walks the mask lane by
      # lane and zeroes the matching exponent element.
      print "  {\n"\
            "    int i;\n"\
            "    for (i = 0 ; i < " VL(sd) "; i++) {\n"\
            "      if (mask[i] == 0) {"
      if (sd == "s") {
        # Exponents arrive as two double-length vectors: the first VL("d")
        # lanes live in iyu, the remainder in iyl.
        print "        if(i < " VL("d") ") {\n"\
              "          iyu[i] = 0;\n"\
              "        } else {\n"\
              "          iyl[i-" VL("d") "] = 0;\n"\
              "        }"
      } else {
        print "        iy[i] = 0;"
      }
      print "      }\n    }\n  }"
    }
  }
  print "  fptr = (" VR_T(sd) "(*) (" \
        func_pow_args_nomask(sd, is_scalar, ik, 0) \
        ")) MTH_DISPATCH_TBL[func_" name arg_ne_0(is_scalar, ik "1", ik) \
        "][sv_" sd "v" VL(sd) "][frp_" frp "];"
  # The comma (OFS) between the two print arguments is kept so the generated
  # text stays byte-for-byte what it was.
  print "  return (fptr(x, ", \
        arg_ne_0(is_scalar == 0 && sd == "s" && ik == "k", "iyu, iyl", "iy") \
        "));"

  print "}"
  extern_c_end()
}
332
# Generate the masked wrapper for one math function in every variant/
# precision combination, i.e. one func_rr_def() per (frps key, sds key) pair.
#
# name     math function name
# safeval  forwarded to func_rr_def(): non-zero fills masked-off lanes
#          with 1.0 instead of 0
# yarg     non-zero when the function takes a second vector argument
# frp, sd  local loop variables -- callers must not pass them
function do_all_rr(name, safeval, yarg,    frp, sd)
{
  for (frp in frps) {
    for (sd in sds) {
      func_rr_def(name, frp, sd, safeval, yarg)
    }
  }
}
342
# Generate every masked pow-with-integer-exponent wrapper: for each
# (frps key, sds key, iks key) triple emit the scalar-exponent form followed
# by the vector-exponent form.
#
# frp, sd, ik  local loop variables -- callers must not pass them
function do_all_pow_r2i(    frp, sd, ik)
{
  for (frp in frps) {
    for (sd in sds) {
      for (ik in iks) {
        func_pow_def("pow", frp, sd, 1, ik)
        func_pow_def("pow", frp, sd, 0, ik)
      }
    }
  }
}
354
# Driver: validate the configuration, derive the vector lengths, then write
# the whole generated C file to standard output.
#
# Inputs (awk variables, normally supplied with -v):
#   TARGET         "POWER", "ARM64", or anything else (handled as X8664 by
#                  the size check below)
#   MAX_VREG_SIZE  vector register width in bits: 128, 256 or 512
BEGIN {
  # Configuration errors go to stderr: stdout carries the generated C source
  # and must not receive diagnostics.
  if (TARGET == "POWER" || TARGET == "ARM64") {
    if (MAX_VREG_SIZE != 128) {
      print ("TARGET == " TARGET ", MAX_VREG_SIZE must be 128") > "/dev/stderr"
      exit(1)
    }
  } else if (MAX_VREG_SIZE != 128 && MAX_VREG_SIZE != 256 && MAX_VREG_SIZE != 512) {
    print "TARGET == X8664, MAX_VREG_SIZE must be either 128, 256, or 512" > "/dev/stderr"
    exit(1)
  }

  # Vector lengths: VLS single-precision, VLD double-precision elements.
  if (MAX_VREG_SIZE == 128) {
    VLS = 4
    VLD = 2
  } else if (MAX_VREG_SIZE == 256) {
    VLS = 8
    VLD = 4
  } else {
    VLS = 16
    VLD = 8
  }

  # Initialize the associative arrays (needs VLS/VLD from above).
  init_target_arrays()

  print_hdrs()
  one_arg = 0
  two_args = 1

  # do_all_rr(name, safeval, yarg): safeval == 1 makes the generated code
  # put 1.0 into masked-off lanes before calling the function.
  do_all_rr("acos", 0, one_arg)
  do_all_rr("asin", 0, one_arg)
  do_all_rr("atan", 0, one_arg)
  do_all_rr("atan2", 1, two_args)
  do_all_rr("cos", 0, one_arg)
  do_all_rr("sin", 0, one_arg)
  do_all_rr("tan", 0, one_arg)
  do_all_rr("sincos", 0, one_arg)
  do_all_rr("cosh", 0, one_arg)
  do_all_rr("sinh", 0, one_arg)
  do_all_rr("tanh", 0, one_arg)
  do_all_rr("exp", 0, one_arg)
  do_all_rr("log", 1, one_arg)
  do_all_rr("log10", 1, one_arg)
  do_all_rr("pow", 0, two_args)
  do_all_rr("div", 1, two_args)
  do_all_rr("sqrt", 0, one_arg)
  do_all_rr("mod", 1, two_args)
  do_all_rr("aint", 0, one_arg)
  do_all_rr("ceil", 0, one_arg)
  do_all_rr("floor", 0, one_arg)

  do_all_pow_r2i()
}
414