/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
#include <faiss/gpu/impl/IVFUtils.cuh>
#include <faiss/gpu/utils/Tensor.cuh>
#include <faiss/gpu/utils/ThrustAllocator.cuh>

#include <algorithm>

namespace faiss {
namespace gpu {

// Calculates the total number of intermediate distances to consider
// for all queries
__global__ void getResultLengths(
        Tensor<int, 2, true> topQueryToCentroid,
        int* listLengths,
        int totalSize,
        Tensor<int, 2, true> length) {
    int linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    if (linearThreadId >= totalSize) {
        return;
    }

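    // One thread per (query, probe) pair: decompose the linear thread id
    // into the query index and the probe slot within that query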
    int nprobe = topQueryToCentroid.getSize(1);
    int queryId = linearThreadId / nprobe;
    int listId = linearThreadId % nprobe;

    int centroidId = topQueryToCentroid[queryId][listId];

    // Safety guard in case NaNs in input cause no list ID to be generated
    length[queryId][listId] = (centroidId != -1) ? listLengths[centroidId] : 0;
}

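// Gathers the inverted list length for each (query, probe) pair and runs a
// prefix sum over them, producing per-pair offsets into the flattened
// intermediate results buffer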
void runCalcListOffsets(
        GpuResources* res,
        Tensor<int, 2, true>& topQueryToCentroid,
        thrust::device_vector<int>& listLengths,
        Tensor<int, 2, true>& prefixSumOffsets,
        Tensor<char, 1, true>& thrustMem,
        cudaStream_t stream) {
    FAISS_ASSERT(topQueryToCentroid.getSize(0) == prefixSumOffsets.getSize(0));
    FAISS_ASSERT(topQueryToCentroid.getSize(1) == prefixSumOffsets.getSize(1));

    int totalSize = topQueryToCentroid.numElements();

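    // Launch one thread per (query, probe) pair, capping the block size at
    // the current device's maximum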
    int numThreads = std::min(totalSize, getMaxThreadsCurrentDevice());
    int numBlocks = utils::divUp(totalSize, numThreads);

    auto grid = dim3(numBlocks);
    auto block = dim3(numThreads);

    getResultLengths<<<grid, block, 0, stream>>>(
            topQueryToCentroid,
            listLengths.data().get(),
            totalSize,
            prefixSumOffsets);
    CUDA_TEST_ERROR();

    // Prefix sum of the list lengths, so we know where the intermediate
    // results for each (query, probe) pair should be stored
    // Thrust wants a place for its temporary allocations, so provide one;
    // as long as thrustMem is sized sufficiently, Thrust won't fall back to
    // cudaMalloc/cudaFree
    GpuResourcesThrustAllocator alloc(
            res, stream, thrustMem.data(), thrustMem.getSizeInBytes());

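    // The scan runs over the flattened, row-major view of prefixSumOffsets,
    // so each entry ends up holding the end offset of its (query, probe)
    // segment within the single buffer shared by all queries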
    thrust::inclusive_scan(
            thrust::cuda::par(alloc).on(stream),
            prefixSumOffsets.data(),
            prefixSumOffsets.data() + totalSize,
            prefixSumOffsets.data());
    CUDA_TEST_ERROR();
}

} // namespace gpu
} // namespace faiss