1 /**
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 *
4 * This source code is licensed under the MIT license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7
8 #pragma once
9
10 #include <cuda.h>
11
12 namespace faiss {
13 namespace gpu {
14
15 // defines to simplify the SASS assembly structure file/line in the profiler
16 #define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \
17 asm("bfe.u32 %0, %1, %2, %3;" : "=r"(OUT) : "r"(VAL), "r"(POS), "r"(LEN));
18
19 #define GET_BITFIELD_U64(OUT, VAL, POS, LEN) \
20 asm("bfe.u64 %0, %1, %2, %3;" : "=l"(OUT) : "l"(VAL), "r"(POS), "r"(LEN));
21
getBitfield(unsigned int val,int pos,int len)22 __device__ __forceinline__ unsigned int getBitfield(
23 unsigned int val,
24 int pos,
25 int len) {
26 unsigned int ret;
27 asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
28 return ret;
29 }
30
31 __device__ __forceinline__ uint64_t
getBitfield(uint64_t val,int pos,int len)32 getBitfield(uint64_t val, int pos, int len) {
33 uint64_t ret;
34 asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
35 return ret;
36 }
37
setBitfield(unsigned int val,unsigned int toInsert,int pos,int len)38 __device__ __forceinline__ unsigned int setBitfield(
39 unsigned int val,
40 unsigned int toInsert,
41 int pos,
42 int len) {
43 unsigned int ret;
44 asm("bfi.b32 %0, %1, %2, %3, %4;"
45 : "=r"(ret)
46 : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
47 return ret;
48 }
49
getLaneId()50 __device__ __forceinline__ int getLaneId() {
51 int laneId;
52 asm("mov.u32 %0, %%laneid;" : "=r"(laneId));
53 return laneId;
54 }
55
getLaneMaskLt()56 __device__ __forceinline__ unsigned getLaneMaskLt() {
57 unsigned mask;
58 asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
59 return mask;
60 }
61
getLaneMaskLe()62 __device__ __forceinline__ unsigned getLaneMaskLe() {
63 unsigned mask;
64 asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
65 return mask;
66 }
67
getLaneMaskGt()68 __device__ __forceinline__ unsigned getLaneMaskGt() {
69 unsigned mask;
70 asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
71 return mask;
72 }
73
getLaneMaskGe()74 __device__ __forceinline__ unsigned getLaneMaskGe() {
75 unsigned mask;
76 asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
77 return mask;
78 }
79
namedBarrierWait(int name,int numThreads)80 __device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
81 asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
82 }
83
namedBarrierArrived(int name,int numThreads)84 __device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
85 asm volatile("bar.arrive %0, %1;"
86 :
87 : "r"(name), "r"(numThreads)
88 : "memory");
89 }
90
91 } // namespace gpu
92 } // namespace faiss
93