1 /*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 * 9 * This header provides in-header implmentations for NVCC's built-in 10 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The 11 * built-in is unusual as it's actually a set of function overloads that use the 12 * first string literal argument as one of the overload parameters. 13 */ 14 #ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__ 15 #define __CLANG_CUDA_TEXTURE_INTRINSICS_H__ 16 #ifndef __CUDA__ 17 #error "This file is for CUDA compilation only." 18 #endif 19 20 // __nv_tex_surf_handler() provided by this header as a macro. 21 #define __nv_tex_surf_handler(__op, __ptr, ...) \ 22 ::__cuda_tex::__tex_fetch< \ 23 ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \ 24 __VA_ARGS__) 25 26 #pragma push_macro("__ASM_OUT") 27 #pragma push_macro("__ASM_OUTP") 28 #pragma push_macro("__Args") 29 #pragma push_macro("__ID") 30 #pragma push_macro("__IDV") 31 #pragma push_macro("__IMPL_2DGATHER") 32 #pragma push_macro("__IMPL_ALIAS") 33 #pragma push_macro("__IMPL_ALIASI") 34 #pragma push_macro("__IMPL_F1") 35 #pragma push_macro("__IMPL_F3") 36 #pragma push_macro("__IMPL_F3N") 37 #pragma push_macro("__IMPL_F3S") 38 #pragma push_macro("__IMPL_S") 39 #pragma push_macro("__IMPL_S3") 40 #pragma push_macro("__IMPL_S3I") 41 #pragma push_macro("__IMPL_S3N") 42 #pragma push_macro("__IMPL_S3NI") 43 #pragma push_macro("__IMPL_S3S") 44 #pragma push_macro("__IMPL_S3SI") 45 #pragma push_macro("__IMPL_SI") 46 #pragma push_macro("__L") 47 #pragma push_macro("__STRIP_PARENS") 48 49 // Put all functions into anonymous namespace so they have internal linkage. 50 // The device-only function here must be internal in order to avoid ODR 51 // violations in case they are used from the files compiled with 52 // -fgpu-rdc. E.g. a library and an app using it may be built with a different 53 // version of this header file. 54 namespace { 55 56 // Put the implmentation into its own namespace so we don't pollute the TU. 57 namespace __cuda_tex { 58 59 // First, we need a perfect hash function and a few constexpr helper functions 60 // for converting a string literal into a numeric value which can be used to 61 // parametrize a template. We can not use string literals for that as that would 62 // require C++20. 63 // 64 // The hash function was generated with 'gperf' and then manually converted into 65 // its constexpr equivalent. 66 // 67 // NOTE: the perfect hashing scheme comes with inherent self-test. If the hash 68 // function has a collision for any of the texture operations, the compilation 69 // will fail due to an attempt to redefine a tag with the same value. If the 70 // header compiles, then the hash function is good enough for the job. 71 72 constexpr int __tex_len(const char *s) { 73 return (s[0] == 0) ? 0 74 : (s[1] == 0) ? 1 75 : (s[2] == 0) ? 2 76 : (s[3] == 0) ? 3 77 : (s[4] == 0) ? 4 78 : (s[5] == 0) ? 5 79 : (s[6] == 0) ? 6 80 : (s[7] == 0) ? 7 81 : (s[8] == 0) ? 8 82 : (s[9] == 0) ? 9 83 : (s[10] == 0) ? 10 84 : (s[11] == 0) ? 11 85 : (s[12] == 0) ? 12 86 : (s[13] == 0) ? 13 87 : (s[14] == 0) ? 14 88 : (s[15] == 0) ? 15 89 : (s[16] == 0) ? 16 90 : (s[17] == 0) ? 17 91 : (s[18] == 0) ? 18 92 : (s[19] == 0) ? 19 93 : (s[20] == 0) ? 20 94 : (s[21] == 0) ? 21 95 : (s[22] == 0) ? 22 96 : (s[23] == 0) ? 23 97 : (s[24] == 0) ? 24 98 : (s[25] == 0) ? 25 99 : (s[26] == 0) ? 26 100 : (s[27] == 0) ? 27 101 : (s[28] == 0) ? 28 102 : (s[29] == 0) ? 29 103 : (s[30] == 0) ? 30 104 : (s[31] == 0) ? 31 105 : 32; 106 } 107 108 constexpr int __tex_hash_map(int c) { 109 return (c == 49) ? 10 110 : (c == 50) ? 0 111 : (c == 51) ? 100 112 : (c == 52) ? 30 113 : (c == 67) ? 10 114 : (c == 68) ? 0 115 : (c == 69) ? 25 116 : (c == 72) ? 70 117 : (c == 77) ? 0 118 : (c == 96) ? 44 119 : (c == 99) ? 10 120 : (c == 100) ? 5 121 : (c == 101) ? 60 122 : (c == 102) ? 40 123 : (c == 103) ? 70 124 : (c == 104) ? 25 125 : (c == 112) ? 0 126 : (c == 114) ? 45 127 : (c == 117) ? 5 128 : (c == 118) ? 85 129 : (c == 120) ? 20 130 : 225; 131 } 132 133 constexpr int __tex_op_hash(const char *str) { 134 return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) + 135 __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]); 136 } 137 138 // Tag type to identify particular texture operation. 139 template <int N> struct __Tag; 140 #define __ID(__op) __Tag<__tex_op_hash(__op)> 141 // Tags for variants of particular operation. E.g. tex2Dgather can translate 142 // into 4 different instructions. 143 #define __IDV(__op, __variant) \ 144 __Tag<10000 + __tex_op_hash(__op) * 100 + __variant> 145 146 // Helper classes for figuring out key data types for derived types. 147 // E.g. char2 has __base_t = char, __fetch_t = char4 148 template <class> struct __TypeInfoT; 149 // Type info for the fundamental types. 150 template <> struct __TypeInfoT<float> { 151 using __base_t = float; 152 using __fetch_t = float4; 153 }; 154 template <> struct __TypeInfoT<char> { 155 using __base_t = char; 156 using __fetch_t = int4; 157 }; 158 template <> struct __TypeInfoT<signed char> { 159 using __base_t = signed char; 160 using __fetch_t = int4; 161 }; 162 template <> struct __TypeInfoT<unsigned char> { 163 using __base_t = unsigned char; 164 using __fetch_t = uint4; 165 }; 166 template <> struct __TypeInfoT<short> { 167 using __base_t = short; 168 using __fetch_t = int4; 169 }; 170 template <> struct __TypeInfoT<unsigned short> { 171 using __base_t = unsigned short; 172 using __fetch_t = uint4; 173 }; 174 template <> struct __TypeInfoT<int> { 175 using __base_t = int; 176 using __fetch_t = int4; 177 }; 178 template <> struct __TypeInfoT<unsigned int> { 179 using __base_t = unsigned int; 180 using __fetch_t = uint4; 181 }; 182 183 // Derived base/fetch types for N-element vectors. 184 template <class __T> struct __TypeInfoT { 185 using __base_t = decltype(__T::x); 186 using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t; 187 }; 188 189 // Classes that implement specific texture ops. 190 template <class __op> struct __tex_fetch_v4; 191 192 // Helper macros to strip parens from a macro argument. 193 #define __Args(...) __VA_ARGS__ 194 #define __STRIP_PARENS(__X) __X 195 #define __L(__X) __STRIP_PARENS(__Args __X) 196 197 // Construct inline assembly output args. 198 // Results are stored in a temp var __r. 199 // isResident bool is pointed to by __ir 200 // Asm args for return values. It's a 4-element vector 201 #define __ASM_OUT(__t) \ 202 ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w)) 203 // .. possibly combined with a predicate. 204 #define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir)) 205 206 // Implements a single variant of texture fetch instruction. 207 #define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \ 208 template <> \ 209 __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \ 210 __rt __r; \ 211 asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \ 212 return __r; \ 213 } 214 215 // Implements texture fetch instructions for int4/uint4/float4 data types. 216 #define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 217 __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \ 218 __ASM_OUT("r"), __asm_args) \ 219 __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \ 220 __ASM_OUT("r"), __asm_args) \ 221 __IMPL_F1(float4, float4, __args, \ 222 __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \ 223 __asm_args) 224 // Implements 'sparse' texture fetch instructions for int4/uint4/float4 data 225 // types. Similar to above, but returns a boolean 'isPresent' value in addition 226 // to texture data, 227 #define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 228 __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \ 229 __ASM_OUTP("r"), __asm_args) \ 230 __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \ 231 __ASM_OUTP("r"), __asm_args) \ 232 __IMPL_F1(float4, float4, __args, \ 233 __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \ 234 __asm_args) 235 236 // Similar to F3, but for integer data which is returned as normalized floats. 237 // Only instantiates fetch functions for int4/uint4. 238 #define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 239 __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \ 240 __ASM_OUT("r"), __asm_args) \ 241 __IMPL_F1(float4, uint4, __args, \ 242 __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \ 243 __asm_args) 244 245 // Instantiates __tex_fetch_v4 with regular fetch functions. 246 #define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 247 template <> struct __tex_fetch_v4<__op> { \ 248 template <class T> \ 249 __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \ 250 __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 251 } 252 253 // Same, but for sparse ops. Only available on sm_60+ 254 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) 255 #define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \ 256 __asm_args) \ 257 template <> struct __tex_fetch_v4<__op> { \ 258 template <class T> \ 259 __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \ 260 __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 261 } 262 #else 263 #define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) 264 #endif 265 266 // Same, but for normalized float ops. 267 #define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \ 268 __asm_args) \ 269 template <> struct __tex_fetch_v4<__op> { \ 270 template <class T> \ 271 __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \ 272 __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 273 } 274 275 // Regular and normalized float ops share a lot of similarities. This macro 276 // instantiates both variants -- normal for __op and normalized for __opn. 277 #define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \ 278 __asm_args) \ 279 __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \ 280 __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args) 281 282 // Convenience macros which converts string literal __op into a __Tag, 283 #define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 284 __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) 285 #define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 286 __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) 287 #define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ 288 __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) 289 #define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \ 290 __asm_args) \ 291 __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \ 292 __asm_args) 293 294 // CUDA headers have some 'legacy' texture oprerations that duplicate 295 // functionality. So, we just inherit it, instead of refining a copy. 296 #define __IMPL_ALIASI(__op, __opn) \ 297 template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {} 298 #define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn)) 299 300 // Now we can instantiate everything we need for each specific texture fetch 301 // variant. 302 __IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32", 303 "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x))); 304 __IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4", 305 "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x))); 306 __IMPL_ALIAS("__itex1D", "__tex1D_v2"); 307 __IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2"); 308 309 __IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2", 310 (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32", 311 "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};", 312 ("f"(__x), "f"(__dPdx), "f"(__dPdy))); 313 __IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2"); 314 315 __IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2", 316 (float __x, int __layer), "tex.a1d.v4", "f32", 317 "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x))); 318 __IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2"); 319 320 __IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2", 321 (float __x, int __layer, float __dPdx, float __dPdy), 322 "tex.grad.a1d.v4", "f32", 323 "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};", 324 ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy))); 325 __IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2"); 326 327 __IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2", 328 (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32", 329 "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;", 330 ("r"(__layer), "f"(__x), "f"(__level))); 331 __IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2"); 332 333 __IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level), 334 "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;", 335 ("f"(__x), "f"(__level))); 336 __IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2"); 337 338 // 2D 339 __IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4", 340 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); 341 __IMPL_ALIAS("__itex2D", "__tex2D_v2"); 342 343 __IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir), 344 "{.reg .pred %%p0;\n\t" 345 "tex.2d.v4", 346 "f32", 347 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" 348 " selp.u16 %4, 1, 0, %%p0; }", 349 ("f"(__x), "f"(__y))); 350 351 __IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2", 352 (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy), 353 "tex.grad.2d.v4", "f32", 354 "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};", 355 ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x), 356 "f"(__dPdy->y))); 357 __IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2"); 358 359 __IMPL_S3S("__itex2DGrad_sparse", 360 (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy, 361 unsigned char *__ir), 362 "{.reg .pred %%p0;\n\t" 363 "tex.grad.2d.v4", 364 "f32", 365 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t" 366 "selp.u16 %4, 1, 0, %%p0; }", 367 ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x), 368 "f"(__dPdy->y))); 369 370 __IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2", 371 (float __x, float __y, int __layer), "tex.a2d.v4", "f32", 372 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];", 373 ("r"(__layer), "f"(__x), "f"(__y))); 374 __IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2"); 375 376 __IMPL_S3S("__itex2DLayered_sparse", 377 (float __x, float __y, int __layer, unsigned char *__ir), 378 "{.reg .pred %%p0;\n\t" 379 "tex.a2d.v4", 380 "f32", 381 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t" 382 "selp.u16 %4, 1, 0, %%p0; }", 383 ("r"(__layer), "f"(__x), "f"(__y))); 384 385 __IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2", 386 (float __x, float __y, int __layer, const float2 *__dPdx, 387 const float2 *__dPdy), 388 "tex.grad.a2d.v4", "f32", 389 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};", 390 ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), 391 "f"(__dPdy->x), "f"(__dPdy->y))); 392 __IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2"); 393 394 __IMPL_S3S( 395 "__itex2DLayeredGrad_sparse", 396 (float __x, float __y, int __layer, const float2 *__dPdx, 397 const float2 *__dPdy, unsigned char *__ir), 398 "{.reg .pred %%p0;\n\t" 399 "tex.grad.a2d.v4", 400 "f32", 401 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t" 402 "selp.u16 %4, 1, 0, %%p0; }", 403 ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), 404 "f"(__dPdy->x), "f"(__dPdy->y))); 405 406 __IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2", 407 (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4", 408 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", 409 ("r"(__layer), "f"(__x), "f"(__y), "f"(__level))); 410 __IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2"); 411 412 __IMPL_S3S("__itex2DLayeredLod_sparse", 413 (float __x, float __y, int __layer, float __level, 414 unsigned char *__ir), 415 "{.reg .pred %%p0;\n\t" 416 "tex.level.a2d.v4", 417 "f32", 418 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t" 419 "selp.u16 %4, 1, 0, %%p0; }", 420 ("r"(__layer), "f"(__x), "f"(__y), "f"(__level))); 421 422 __IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2", 423 (float __x, float __y, float __level), "tex.level.2d.v4", "f32", 424 "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;", 425 ("f"(__x), "f"(__y), "f"(__level))); 426 __IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2"); 427 428 __IMPL_S3S("__itex2DLod_sparse", 429 (float __x, float __y, float __level, unsigned char *__ir), 430 "{.reg .pred %%p0;\n\t" 431 "tex.level.2d.v4", 432 "f32", 433 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t" 434 "selp.u16 %4, 1, 0, %%p0; }", 435 ("f"(__x), "f"(__y), "f"(__level))); 436 437 // 2D gather is special. Unlike other variants that translate into exactly one 438 // asm instruction, it uses one of the four different instructions selected by 439 // __comp. We implement each instruction variant separately, and dispatch the 440 // right one from the manually implemented 'umbrella' fetch. 441 #define __IMPL_2DGATHER(variant, instr) \ 442 __IMPL_SI(__IDV("__tex2Dgather_v2", variant), \ 443 __IDV("__tex2Dgather_rmnf_v2", variant), \ 444 (float __x, float __y, int __comp), instr, "f32", \ 445 "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \ 446 __IMPL_ALIASI(__IDV("__itex2Dgather", variant), \ 447 __IDV("__tex2Dgather_v2", variant)); \ 448 __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \ 449 (float __x, float __y, unsigned char *__ir, int __comp), \ 450 "{.reg .pred %%p0;\n\t" instr, "f32", \ 451 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \ 452 "selp.u16 %4, 1, 0, %%p0; }", \ 453 ("f"(__x), "f"(__y))); 454 __IMPL_2DGATHER(0, "tld4.r.2d.v4"); 455 __IMPL_2DGATHER(1, "tld4.g.2d.v4"); 456 __IMPL_2DGATHER(2, "tld4.b.2d.v4"); 457 __IMPL_2DGATHER(3, "tld4.a.2d.v4"); 458 459 // Umbrella dispatcher -- calls into specific 2Dgather variant. 460 template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> { 461 template <class __T> 462 __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y, 463 int __comp) { 464 switch (__comp) { 465 case 0: 466 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>( 467 __obj, __x, __y, __comp); 468 case 1: 469 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>( 470 __obj, __x, __y, __comp); 471 case 2: 472 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>( 473 __obj, __x, __y, __comp); 474 case 3: 475 return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>( 476 __obj, __x, __y, __comp); 477 } 478 } 479 }; 480 __IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2"); 481 482 template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> { 483 template <class __T> 484 __device__ static float4 __run(cudaTextureObject_t __obj, float __x, 485 float __y, int __comp) { 486 switch (__comp) { 487 case 0: 488 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>( 489 __obj, __x, __y, __comp); 490 case 1: 491 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>( 492 __obj, __x, __y, __comp); 493 case 2: 494 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>( 495 __obj, __x, __y, __comp); 496 case 3: 497 return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>( 498 __obj, __x, __y, __comp); 499 } 500 } 501 }; 502 503 #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) 504 template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> { 505 template <class __T> 506 __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y, 507 unsigned char *__ir, int __comp) { 508 switch (__comp) { 509 case 0: 510 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>( 511 __obj, __x, __y, __ir, __comp); 512 case 1: 513 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>( 514 __obj, __x, __y, __ir, __comp); 515 case 2: 516 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>( 517 __obj, __x, __y, __ir, __comp); 518 case 3: 519 return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>( 520 __obj, __x, __y, __ir, __comp); 521 } 522 } 523 }; 524 #endif 525 526 // 3D 527 __IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z), 528 "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];", 529 ("f"(__x), "f"(__y), "f"(__z))); 530 __IMPL_ALIAS("__itex3D", "__tex3D_v2"); 531 532 __IMPL_S3S("__itex3D_sparse", 533 (float __x, float __y, float __z, unsigned char *__ir), 534 "{.reg .pred %%p0;\n\t" 535 "tex.3d.v4", 536 "f32", 537 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t" 538 "selp.u16 %4, 1, 0, %%p0; }", 539 ("f"(__x), "f"(__y), "f"(__z))); 540 541 __IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2", 542 (float __x, float __y, float __z, const float4 *__dPdx, 543 const float4 *__dPdy), 544 "tex.grad.3d.v4", "f32", 545 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], " 546 "{%8, %9, %10, %10}, {%11, %12, %13, %13};", 547 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y), 548 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z))); 549 __IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2"); 550 551 __IMPL_S3S("__itex3DGrad_sparse", 552 (float __x, float __y, float __z, const float4 *__dPdx, 553 const float4 *__dPdy, unsigned char *__ir), 554 "{.reg .pred %%p0;\n\t" 555 "tex.grad.3d.v4", 556 "f32", 557 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], " 558 "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t" 559 "selp.u16 %4, 1, 0, %%p0; }", 560 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y), 561 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z))); 562 563 __IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2", 564 (float __x, float __y, float __z, float __level), "tex.level.3d.v4", 565 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", 566 ("f"(__x), "f"(__y), "f"(__z), "f"(__level))); 567 __IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2"); 568 569 __IMPL_S3S("__itex3DLod_sparse", 570 (float __x, float __y, float __z, float __level, 571 unsigned char *__ir), 572 "{.reg .pred %%p0;\n\t" 573 "tex.level.3d.v4", 574 "f32", 575 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t" 576 "selp.u16 %4, 1, 0, %%p0; }", 577 ("f"(__x), "f"(__y), "f"(__z), "f"(__level))); 578 579 // Cubemap 580 __IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2", 581 (float __x, float __y, float __z), "tex.cube.v4", "f32", 582 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];", 583 ("f"(__x), "f"(__y), "f"(__z))); 584 __IMPL_ALIAS("__itexCubemap", "__texCubemap_v2"); 585 586 __IMPL_S3S("__itexCubemap_sparse", 587 (float __x, float __y, float __z, unsigned char *__ir), 588 "{.reg .pred %%p0;\n\t" 589 "tex.cube.v4", 590 "f32", 591 "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t" 592 "selp.u16 %4, 1, 0, %%p0; }", 593 ("f"(__x), "f"(__y), "f"(__z))); 594 595 __IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2", 596 (float __x, float __y, float __z, const float4 *__dPdx, 597 const float4 *__dPdy), 598 "tex.grad.cube.v4", "f32", 599 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], " 600 "{%8, %9, %10, %10}, {%11, %12, %13, %13};", 601 ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y), 602 "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z))); 603 __IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2"); 604 605 __IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2", 606 (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32", 607 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];", 608 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z))); 609 __IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2"); 610 611 __IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2", 612 (float __x, float __y, float __z, int __layer, const float4 *__dPdx, 613 const float4 *__dPdy), 614 "tex.grad.acube.v4", "f32", 615 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], " 616 "{%9, %10, %11, %11}, {%12, %13, %14, %14};", 617 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), 618 "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), 619 "f"(__dPdy->z))); 620 __IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2"); 621 622 __IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2", 623 (float __x, float __y, float __z, int __layer, float __level), 624 "tex.level.acube.v4", "f32", 625 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;", 626 ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level))); 627 __IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2"); 628 629 __IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2", 630 (float __x, float __y, float __z, float __level), "tex.level.cube.v4", 631 "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", 632 ("f"(__x), "f"(__y), "f"(__z), "f"(__level))); 633 __IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2"); 634 635 // Helper class for extracting slice of data from V4 fetch results. 636 template <class __DestT, class __SrcT> struct __convert { 637 template <int __NElements = sizeof(__DestT) / 638 sizeof(typename __TypeInfoT<__DestT>::__base_t)> 639 __device__ static __DestT __run(__SrcT __v); 640 template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; } 641 template <> __device__ static __DestT __run<2>(__SrcT __v) { 642 return {__v.x, __v.y}; 643 } 644 template <> __device__ static __DestT __run<3>(__SrcT __v) { 645 return {__v.x, __v.y, __v.z}; 646 } 647 template <> __device__ static __DestT __run<4>(__SrcT __v) { 648 return {__v.x, __v.y, __v.z, __v.w}; 649 } 650 }; 651 652 // These are the top-level function overloads the __nv_tex_surf_handler expands 653 // to. Each overload deals with one of the several ways __nv_tex_surf_handler 654 // is called by CUDA headers. In the end, each of the overloads does the same 655 // job -- it figures out which `__tex_fetch_v4::run` variant should be used to 656 // fetch texture data and which `__convert::run` is needed to convert it into 657 // appropriate return type. 658 659 // __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...); 660 // Data type and return type are based on ret. 661 template <class __op, class __T, class... __Args> 662 __device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle, 663 __Args... __args) { 664 using __FetchT = typename __TypeInfoT<__T>::__fetch_t; 665 *__ptr = __convert<__T, __FetchT>::__run( 666 __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...)); 667 } 668 669 // texture<> objects get magically converted into a texture reference. However, 670 // there's no way to convert them to cudaTextureObject_t on C++ level. So, we 671 // cheat a bit and use inline assembly to do it. It costs us an extra register 672 // and a move, but that is easy for ptxas to optimize away. 673 template <class __T> 674 __device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) { 675 cudaTextureObject_t __obj; 676 asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle)); 677 return __obj; 678 } 679 680 // __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...); 681 // Data type and return type is based on ret. 682 template <class __op, class __T, class __HandleT, class... __Args> 683 __device__ static void __tex_fetch(__T *__ptr, __HandleT __handle, 684 __Args... __args) { 685 using __FetchT = typename __TypeInfoT<__T>::__fetch_t; 686 *__ptr = __convert<__T, __FetchT>::__run( 687 __tex_fetch_v4<__op>::template __run<__FetchT>( 688 __tex_handle_to_obj(__handle), __args...)); 689 } 690 691 // __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...); 692 // cudaReadModeNormalizedFloat fetches always return float4. 693 template <class __op, class __DataT, class __RetT, int __TexT, class... __Args> 694 __device__ static void 695 __tex_fetch(__DataT *, __RetT *__ptr, 696 texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle, 697 __Args... __args) { 698 using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t; 699 *__ptr = __convert<__RetT, float4>::__run( 700 __tex_fetch_v4<__op>::template __run<__FetchT>( 701 __tex_handle_to_obj(__handle), __args...)); 702 } 703 704 // __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...); 705 // For cudaReadModeElementType fetch return type is based on type_dummy. 706 template <class __op, class __DataT, class __RetT, int __TexT, class... __Args> 707 __device__ static void 708 __tex_fetch(__DataT *, __RetT *__ptr, 709 texture<__DataT, __TexT, cudaReadModeElementType> __handle, 710 __Args... __args) { 711 using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t; 712 *__ptr = __convert<__RetT, __FetchT>::__run( 713 __tex_fetch_v4<__op>::template __run<__FetchT>( 714 __tex_handle_to_obj(__handle), __args...)); 715 } 716 } // namespace __cuda_tex 717 } // namespace 718 #pragma pop_macro("__ASM_OUT") 719 #pragma pop_macro("__ASM_OUTP") 720 #pragma pop_macro("__Args") 721 #pragma pop_macro("__ID") 722 #pragma pop_macro("__IDV") 723 #pragma pop_macro("__IMPL_2DGATHER") 724 #pragma pop_macro("__IMPL_ALIAS") 725 #pragma pop_macro("__IMPL_ALIASI") 726 #pragma pop_macro("__IMPL_F1") 727 #pragma pop_macro("__IMPL_F3") 728 #pragma pop_macro("__IMPL_F3N") 729 #pragma pop_macro("__IMPL_F3S") 730 #pragma pop_macro("__IMPL_S") 731 #pragma pop_macro("__IMPL_S3") 732 #pragma pop_macro("__IMPL_S3I") 733 #pragma pop_macro("__IMPL_S3N") 734 #pragma pop_macro("__IMPL_S3NI") 735 #pragma pop_macro("__IMPL_S3S") 736 #pragma pop_macro("__IMPL_S3SI") 737 #pragma pop_macro("__IMPL_SI") 738 #pragma pop_macro("__L") 739 #pragma pop_macro("__STRIP_PARENS") 740 #endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__ 741