1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #pragma once
9 
10 #include <cuda.h>
11 
12 namespace faiss {
13 namespace gpu {
14 
15 // defines to simplify the SASS assembly structure file/line in the profiler
16 #define GET_BITFIELD_U32(OUT, VAL, POS, LEN) \
17     asm("bfe.u32 %0, %1, %2, %3;" : "=r"(OUT) : "r"(VAL), "r"(POS), "r"(LEN));
18 
19 #define GET_BITFIELD_U64(OUT, VAL, POS, LEN) \
20     asm("bfe.u64 %0, %1, %2, %3;" : "=l"(OUT) : "l"(VAL), "r"(POS), "r"(LEN));
21 
getBitfield(unsigned int val,int pos,int len)22 __device__ __forceinline__ unsigned int getBitfield(
23         unsigned int val,
24         int pos,
25         int len) {
26     unsigned int ret;
27     asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
28     return ret;
29 }
30 
31 __device__ __forceinline__ uint64_t
getBitfield(uint64_t val,int pos,int len)32 getBitfield(uint64_t val, int pos, int len) {
33     uint64_t ret;
34     asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
35     return ret;
36 }
37 
setBitfield(unsigned int val,unsigned int toInsert,int pos,int len)38 __device__ __forceinline__ unsigned int setBitfield(
39         unsigned int val,
40         unsigned int toInsert,
41         int pos,
42         int len) {
43     unsigned int ret;
44     asm("bfi.b32 %0, %1, %2, %3, %4;"
45         : "=r"(ret)
46         : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
47     return ret;
48 }
49 
getLaneId()50 __device__ __forceinline__ int getLaneId() {
51     int laneId;
52     asm("mov.u32 %0, %%laneid;" : "=r"(laneId));
53     return laneId;
54 }
55 
getLaneMaskLt()56 __device__ __forceinline__ unsigned getLaneMaskLt() {
57     unsigned mask;
58     asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
59     return mask;
60 }
61 
getLaneMaskLe()62 __device__ __forceinline__ unsigned getLaneMaskLe() {
63     unsigned mask;
64     asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
65     return mask;
66 }
67 
getLaneMaskGt()68 __device__ __forceinline__ unsigned getLaneMaskGt() {
69     unsigned mask;
70     asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
71     return mask;
72 }
73 
getLaneMaskGe()74 __device__ __forceinline__ unsigned getLaneMaskGe() {
75     unsigned mask;
76     asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
77     return mask;
78 }
79 
namedBarrierWait(int name,int numThreads)80 __device__ __forceinline__ void namedBarrierWait(int name, int numThreads) {
81     asm volatile("bar.sync %0, %1;" : : "r"(name), "r"(numThreads) : "memory");
82 }
83 
namedBarrierArrived(int name,int numThreads)84 __device__ __forceinline__ void namedBarrierArrived(int name, int numThreads) {
85     asm volatile("bar.arrive %0, %1;"
86                  :
87                  : "r"(name), "r"(numThreads)
88                  : "memory");
89 }
90 
91 } // namespace gpu
92 } // namespace faiss
93