1 /**
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * This source code is licensed under the MIT license found in the
5  * LICENSE file in the root directory of this source tree.
6  */
7 
8 #include <faiss/gpu/test/TestUtils.h>
9 #include <faiss/utils/random.h>
10 #include <gtest/gtest.h>
11 #include <time.h>
12 #include <cmath>
13 #include <set>
14 #include <sstream>
15 #include <unordered_map>
16 
17 namespace faiss {
18 namespace gpu {
19 
relativeError(float a,float b)20 inline float relativeError(float a, float b) {
21     return std::abs(a - b) / (0.5f * (std::abs(a) + std::abs(b)));
22 }
23 
24 // This seed is also used for the faiss float_rand API; in a test it
25 // is all within a single thread, so it is ok
26 long s_seed = 1;
27 std::mt19937 rng(1);
28 std::uniform_int_distribution<> distrib;
29 
newTestSeed()30 void newTestSeed() {
31     struct timespec t;
32     clock_gettime(CLOCK_REALTIME, &t);
33 
34     setTestSeed(t.tv_nsec);
35 }
36 
setTestSeed(long seed)37 void setTestSeed(long seed) {
38     printf("testing with random seed %ld\n", seed);
39 
40     rng = std::mt19937(seed);
41     s_seed = seed;
42 }
43 
randVal(int a,int b)44 int randVal(int a, int b) {
45     EXPECT_GE(a, 0);
46     EXPECT_LE(a, b);
47 
48     return a + (distrib(rng) % (b + 1 - a));
49 }
50 
randBool()51 bool randBool() {
52     return randSelect<bool>({true, false});
53 }
54 
randVecs(size_t num,size_t dim)55 std::vector<float> randVecs(size_t num, size_t dim) {
56     std::vector<float> v(num * dim);
57 
58     faiss::float_rand(v.data(), v.size(), s_seed);
59     // unfortunately we generate separate sets of vectors, and don't
60     // want the same values
61     ++s_seed;
62 
63     return v;
64 }
65 
randBinaryVecs(size_t num,size_t dim)66 std::vector<unsigned char> randBinaryVecs(size_t num, size_t dim) {
67     std::vector<unsigned char> v(num * (dim / 8));
68 
69     faiss::byte_rand(v.data(), v.size(), s_seed);
70     // unfortunately we generate separate sets of vectors, and don't
71     // want the same values
72     ++s_seed;
73 
74     return v;
75 }
76 
compareIndices(const std::vector<float> & queryVecs,faiss::Index & refIndex,faiss::Index & testIndex,int numQuery,int,int k,const std::string & configMsg,float maxRelativeError,float pctMaxDiff1,float pctMaxDiffN)77 void compareIndices(
78         const std::vector<float>& queryVecs,
79         faiss::Index& refIndex,
80         faiss::Index& testIndex,
81         int numQuery,
82         int /*dim*/,
83         int k,
84         const std::string& configMsg,
85         float maxRelativeError,
86         float pctMaxDiff1,
87         float pctMaxDiffN) {
88     // Compare
89     std::vector<float> refDistance(numQuery * k, 0);
90     std::vector<faiss::Index::idx_t> refIndices(numQuery * k, -1);
91     refIndex.search(
92             numQuery,
93             queryVecs.data(),
94             k,
95             refDistance.data(),
96             refIndices.data());
97 
98     std::vector<float> testDistance(numQuery * k, 0);
99     std::vector<faiss::Index::idx_t> testIndices(numQuery * k, -1);
100     testIndex.search(
101             numQuery,
102             queryVecs.data(),
103             k,
104             testDistance.data(),
105             testIndices.data());
106 
107     faiss::gpu::compareLists(
108             refDistance.data(),
109             refIndices.data(),
110             testDistance.data(),
111             testIndices.data(),
112             numQuery,
113             k,
114             configMsg,
115             true,
116             false,
117             true,
118             maxRelativeError,
119             pctMaxDiff1,
120             pctMaxDiffN);
121 }
122 
compareIndices(faiss::Index & refIndex,faiss::Index & testIndex,int numQuery,int dim,int k,const std::string & configMsg,float maxRelativeError,float pctMaxDiff1,float pctMaxDiffN)123 void compareIndices(
124         faiss::Index& refIndex,
125         faiss::Index& testIndex,
126         int numQuery,
127         int dim,
128         int k,
129         const std::string& configMsg,
130         float maxRelativeError,
131         float pctMaxDiff1,
132         float pctMaxDiffN) {
133     auto queryVecs = faiss::gpu::randVecs(numQuery, dim);
134 
135     compareIndices(
136             queryVecs,
137             refIndex,
138             testIndex,
139             numQuery,
140             dim,
141             k,
142             configMsg,
143             maxRelativeError,
144             pctMaxDiff1,
145             pctMaxDiffN);
146 }
147 
148 template <typename T>
lookup(const T * p,int i,int j,int,int dim2)149 inline T lookup(const T* p, int i, int j, int /*dim1*/, int dim2) {
150     return p[i * dim2 + j];
151 }
152 
compareLists(const float * refDist,const faiss::Index::idx_t * refInd,const float * testDist,const faiss::Index::idx_t * testInd,int dim1,int dim2,const std::string & configMsg,bool printBasicStats,bool printDiffs,bool assertOnErr,float maxRelativeError,float pctMaxDiff1,float pctMaxDiffN)153 void compareLists(
154         const float* refDist,
155         const faiss::Index::idx_t* refInd,
156         const float* testDist,
157         const faiss::Index::idx_t* testInd,
158         int dim1,
159         int dim2,
160         const std::string& configMsg,
161         bool printBasicStats,
162         bool printDiffs,
163         bool assertOnErr,
164         float maxRelativeError,
165         float pctMaxDiff1,
166         float pctMaxDiffN) {
167     float maxAbsErr = 0.0f;
168     for (int i = 0; i < dim1 * dim2; ++i) {
169         maxAbsErr = std::max(maxAbsErr, std::abs(refDist[i] - testDist[i]));
170     }
171     int numResults = dim1 * dim2;
172 
173     // query -> {index -> result position}
174     std::vector<std::unordered_map<faiss::Index::idx_t, int>> refIndexMap;
175 
176     for (int query = 0; query < dim1; ++query) {
177         std::unordered_map<faiss::Index::idx_t, int> indices;
178 
179         for (int result = 0; result < dim2; ++result) {
180             indices[lookup(refInd, query, result, dim1, dim2)] = result;
181         }
182 
183         refIndexMap.emplace_back(std::move(indices));
184     }
185 
186     // See how far off the indices are
187     // Keep track of the difference for each entry
188     std::vector<std::vector<int>> indexDiffs;
189 
190     int diff1 = 0;   // index differs by 1
191     int diffN = 0;   // index differs by >1
192     int diffInf = 0; // index not found in the other
193     int nonUniqueIndices = 0;
194 
195     double avgDiff = 0.0;
196     int maxDiff = 0;
197     float maxRelErr = 0.0f;
198 
199     for (int query = 0; query < dim1; ++query) {
200         std::vector<int> diffs;
201         std::set<faiss::Index::idx_t> uniqueIndices;
202 
203         auto& indices = refIndexMap[query];
204 
205         for (int result = 0; result < dim2; ++result) {
206             auto t = lookup(testInd, query, result, dim1, dim2);
207 
208             // All indices reported within a query should be unique; this is
209             // a serious error if is otherwise the case.
210             // If -1 is reported (no result due to IVF partitioning or not
211             // enough entries in the index), then duplicates are allowed, but
212             // both the reference and test must have -1 in the same position.
213             if (t == -1) {
214                 EXPECT_EQ(lookup(refInd, query, result, dim1, dim2), t);
215             } else {
216                 bool uniqueIndex = uniqueIndices.count(t) == 0;
217                 if (assertOnErr) {
218                     EXPECT_TRUE(uniqueIndex) << configMsg << " " << query << " "
219                                              << result << " " << t;
220                 }
221 
222                 if (!uniqueIndex) {
223                     ++nonUniqueIndices;
224                 } else {
225                     uniqueIndices.insert(t);
226                 }
227 
228                 auto it = indices.find(t);
229                 if (it != indices.end()) {
230                     int diff = std::abs(result - it->second);
231                     diffs.push_back(diff);
232 
233                     if (diff == 1) {
234                         ++diff1;
235                         maxDiff = std::max(diff, maxDiff);
236                     } else if (diff > 1) {
237                         ++diffN;
238                         maxDiff = std::max(diff, maxDiff);
239                     }
240 
241                     avgDiff += (double)diff;
242                 } else {
243                     ++diffInf;
244                     diffs.push_back(-1);
245                     // don't count this for maxDiff
246                 }
247             }
248 
249             auto refD = lookup(refDist, query, result, dim1, dim2);
250             auto testD = lookup(testDist, query, result, dim1, dim2);
251 
252             float relErr = relativeError(refD, testD);
253 
254             if (assertOnErr) {
255                 EXPECT_LE(relErr, maxRelativeError)
256                         << configMsg << " (" << query << ", " << result
257                         << ") refD: " << refD << " testD: " << testD;
258             }
259 
260             maxRelErr = std::max(maxRelErr, relErr);
261         }
262 
263         indexDiffs.emplace_back(std::move(diffs));
264     }
265 
266     if (assertOnErr) {
267         EXPECT_LE(
268                 (float)(diff1 + diffN + diffInf),
269                 (float)numResults * pctMaxDiff1)
270                 << configMsg;
271 
272         // Don't count diffInf because that could be diff1 as far as we
273         // know
274         EXPECT_LE((float)diffN, (float)numResults * pctMaxDiffN) << configMsg;
275     }
276 
277     avgDiff /= (double)numResults;
278 
279     if (printBasicStats) {
280         if (!configMsg.empty()) {
281             printf("Config\n"
282                    "----------------------------\n"
283                    "%s\n",
284                    configMsg.c_str());
285         }
286 
287         printf("Result error and differences\n"
288                "----------------------------\n"
289                "max abs diff %.7f rel diff %.7f\n"
290                "idx diff avg: %.5g max: %d\n"
291                "idx diff of 1:      %d (%.3f%% of queries)\n"
292                "idx diff of >1:     %d (%.3f%% of queries)\n"
293                "idx diff not found: %d (%.3f%% of queries)"
294                " [typically a last element inversion]\n"
295                "non-unique indices: %d (a serious error if >0)\n",
296                maxAbsErr,
297                maxRelErr,
298                avgDiff,
299                maxDiff,
300                diff1,
301                100.0f * (float)diff1 / (float)numResults,
302                diffN,
303                100.0f * (float)diffN / (float)numResults,
304                diffInf,
305                100.0f * (float)diffInf / (float)numResults,
306                nonUniqueIndices);
307     }
308 
309     if (printDiffs) {
310         printf("differences:\n");
311         printf("==================\n");
312         for (int query = 0; query < dim1; ++query) {
313             for (int result = 0; result < dim2; ++result) {
314                 long refI = lookup(refInd, query, result, dim1, dim2);
315                 long testI = lookup(testInd, query, result, dim1, dim2);
316 
317                 if (refI != testI) {
318                     float refD = lookup(refDist, query, result, dim1, dim2);
319                     float testD = lookup(testDist, query, result, dim1, dim2);
320 
321                     float maxDist = std::max(refD, testD);
322                     float delta = std::abs(refD - testD);
323 
324                     float relErr = delta / maxDist;
325 
326                     if (refD == testD) {
327                         printf("(%d, %d [%d]) (ref %ld tst %ld dist ==)\n",
328                                query,
329                                result,
330                                indexDiffs[query][result],
331                                refI,
332                                testI);
333                     } else {
334                         printf("(%d, %d [%d]) (ref %ld tst %ld abs %.8f "
335                                "rel %.8f ref %a tst %a)\n",
336                                query,
337                                result,
338                                indexDiffs[query][result],
339                                refI,
340                                testI,
341                                delta,
342                                relErr,
343                                refD,
344                                testD);
345                     }
346                 }
347             }
348         }
349     }
350 }
351 
352 } // namespace gpu
353 } // namespace faiss
354