1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #include <faiss/IndexFlat.h>
9 #include <faiss/gpu/GpuIndexFlat.h>
10 #include <faiss/gpu/perf/IndexWrapper.h>
11 #include <faiss/gpu/test/TestUtils.h>
12 #include <faiss/gpu/utils/DeviceUtils.h>
13 #include <faiss/gpu/utils/Timer.h>
14 #include <faiss/utils/random.h>
15 #include <gflags/gflags.h>
16 #include <faiss/gpu/utils/DeviceTensor.cuh>
17 #include <faiss/gpu/utils/HostTensor.cuh>
18 #include <map>
19 #include <memory>
20 #include <vector>
21 
22 #include <cuda_profiler_api.h>
23 
24 DEFINE_bool(l2, true, "L2 or inner product");
25 DEFINE_int32(k, 3, "final number of closest results returned");
26 DEFINE_int32(num, 128, "# of vecs");
27 DEFINE_int32(dim, 128, "# of dimensions");
28 DEFINE_int32(num_queries, 3, "number of query vectors");
29 DEFINE_bool(diff, true, "show exact distance + index output discrepancies");
30 DEFINE_bool(use_float16, false, "use encodings in float16");
31 DEFINE_bool(use_float16_math, false, "perform math in float16");
32 DEFINE_bool(transposed, false, "store vectors transposed");
33 DEFINE_int64(seed, -1, "specify random seed");
34 DEFINE_int32(num_gpus, 1, "number of gpus to use");
35 DEFINE_int64(pinned_mem, 0, "pinned memory allocation to use");
36 DEFINE_bool(cpu, true, "run the CPU code for timing and comparison");
37 DEFINE_bool(use_unified_mem, false, "use Pascal unified memory for the index");
38 
39 using namespace faiss::gpu;
40 
main(int argc,char ** argv)41 int main(int argc, char** argv) {
42     gflags::ParseCommandLineFlags(&argc, &argv, true);
43 
44     cudaProfilerStop();
45 
46     auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr);
47     printf("using seed %ld\n", seed);
48 
49     auto numQueries = FLAGS_num_queries;
50 
51     auto index = std::unique_ptr<faiss::IndexFlat>(new faiss::IndexFlat(
52             FLAGS_dim,
53             FLAGS_l2 ? faiss::METRIC_L2 : faiss::METRIC_INNER_PRODUCT));
54 
55     HostTensor<float, 2, true> vecs({FLAGS_num, FLAGS_dim});
56     faiss::float_rand(vecs.data(), vecs.numElements(), seed);
57 
58     index->add(FLAGS_num, vecs.data());
59 
60     printf("Database: dim %d num vecs %d\n", FLAGS_dim, FLAGS_num);
61     printf("%s lookup: %d queries, total k %d\n",
62            FLAGS_l2 ? "L2" : "IP",
63            numQueries,
64            FLAGS_k);
65     printf("float16 encoding %s\n", FLAGS_use_float16 ? "enabled" : "disabled");
66     printf("transposed storage %s\n",
67            FLAGS_transposed ? "enabled" : "disabled");
68 
69     // Convert to GPU index
70     printf("Copying index to %d GPU(s)...\n", FLAGS_num_gpus);
71 
72     auto initFn = [&index](faiss::gpu::GpuResourcesProvider* res, int dev)
73             -> std::unique_ptr<faiss::gpu::GpuIndexFlat> {
74         ((faiss::gpu::StandardGpuResources*)res)
75                 ->setPinnedMemory(FLAGS_pinned_mem);
76 
77         GpuIndexFlatConfig config;
78         config.device = dev;
79         config.useFloat16 = FLAGS_use_float16;
80         config.storeTransposed = FLAGS_transposed;
81         config.memorySpace = FLAGS_use_unified_mem ? MemorySpace::Unified
82                                                    : MemorySpace::Device;
83 
84         auto p = std::unique_ptr<faiss::gpu::GpuIndexFlat>(
85                 new faiss::gpu::GpuIndexFlat(res, index.get(), config));
86         return p;
87     };
88 
89     IndexWrapper<faiss::gpu::GpuIndexFlat> gpuIndex(FLAGS_num_gpus, initFn);
90     printf("copy done\n");
91 
92     // Build query vectors
93     HostTensor<float, 2, true> cpuQuery({numQueries, FLAGS_dim});
94     faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed);
95 
96     // Time faiss CPU
97     HostTensor<float, 2, true> cpuDistances({numQueries, FLAGS_k});
98     HostTensor<faiss::Index::idx_t, 2, true> cpuIndices({numQueries, FLAGS_k});
99 
100     if (FLAGS_cpu) {
101         float cpuTime = 0.0f;
102 
103         CpuTimer timer;
104         index->search(
105                 numQueries,
106                 cpuQuery.data(),
107                 FLAGS_k,
108                 cpuDistances.data(),
109                 cpuIndices.data());
110 
111         cpuTime = timer.elapsedMilliseconds();
112         printf("CPU time %.3f ms\n", cpuTime);
113     }
114 
115     HostTensor<float, 2, true> gpuDistances({numQueries, FLAGS_k});
116     HostTensor<faiss::Index::idx_t, 2, true> gpuIndices({numQueries, FLAGS_k});
117 
118     CUDA_VERIFY(cudaProfilerStart());
119     faiss::gpu::synchronizeAllDevices();
120 
121     float gpuTime = 0.0f;
122 
123     // Time GPU
124     {
125         CpuTimer timer;
126 
127         gpuIndex.getIndex()->search(
128                 cpuQuery.getSize(0),
129                 cpuQuery.data(),
130                 FLAGS_k,
131                 gpuDistances.data(),
132                 gpuIndices.data());
133 
134         // There is a device -> host copy above, so no need to time
135         // additional synchronization with the GPU
136         gpuTime = timer.elapsedMilliseconds();
137     }
138 
139     CUDA_VERIFY(cudaProfilerStop());
140     printf("GPU time %.3f ms\n", gpuTime);
141 
142     if (FLAGS_cpu) {
143         compareLists(
144                 cpuDistances.data(),
145                 cpuIndices.data(),
146                 gpuDistances.data(),
147                 gpuIndices.data(),
148                 numQueries,
149                 FLAGS_k,
150                 "",
151                 true,
152                 FLAGS_diff,
153                 false);
154     }
155 
156     CUDA_VERIFY(cudaDeviceSynchronize());
157 
158     return 0;
159 }
160