/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#include <faiss/utils/utils.h>

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>

#include <sys/types.h>

#ifdef _MSC_VER
#define NOMINMAX
#include <windows.h>
#undef NOMINMAX
#else
#include <sys/time.h>
#include <unistd.h>
#endif // !_MSC_VER

#include <omp.h>

#include <algorithm>
#include <vector>

#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
#include <faiss/impl/platform_macros.h>
#include <faiss/utils/random.h>

#ifndef FINTEGER
#define FINTEGER long
#endif

extern "C" {

/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */

int sgemm_(
        const char* transa,
        const char* transb,
        FINTEGER* m,
        FINTEGER* n,
        FINTEGER* k,
        const float* alpha,
        const float* a,
        FINTEGER* lda,
        const float* b,
        FINTEGER* ldb,
        float* beta,
        float* c,
        FINTEGER* ldc);

/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */

int sgeqrf_(
        FINTEGER* m,
        FINTEGER* n,
        float* a,
        FINTEGER* lda,
        float* tau,
        float* work,
        FINTEGER* lwork,
        FINTEGER* info);

int sorgqr_(
        FINTEGER* m,
        FINTEGER* n,
        FINTEGER* k,
        float* a,
        FINTEGER* lda,
        float* tau,
        float* work,
        FINTEGER* lwork,
        FINTEGER* info);

int sgemv_(
        const char* trans,
        FINTEGER* m,
        FINTEGER* n,
        float* alpha,
        const float* a,
        FINTEGER* lda,
        const float* x,
        FINTEGER* incx,
        float* beta,
        float* y,
        FINTEGER* incy);
}

/**************************************************
 * Get some stats about the system
 **************************************************/

namespace faiss {

std::string get_compile_options() {
    std::string options;

    // this flag is set by GCC and Clang
#ifdef __OPTIMIZE__
    options += "OPTIMIZE ";
#endif

#ifdef __AVX2__
    options += "AVX2";
#elif defined(__aarch64__)
    options += "NEON";
#else
    options += "GENERIC";
#endif

    return options;
}

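// Wall-clock timestamp in milliseconds. The reference point differs
// between the Windows and POSIX implementations, so only differences
// between two calls are meaningful.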
#ifdef _MSC_VER
double getmillisecs() {
    LARGE_INTEGER ts;
    LARGE_INTEGER freq;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&ts);

    return (ts.QuadPart * 1e3) / freq.QuadPart;
}
#else  // _MSC_VER
double getmillisecs() {
    struct timeval tv;
    gettimeofday(&tv, nullptr);
    return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
}
#endif // _MSC_VER

uint64_t get_cycles() {
#ifdef __x86_64__
    uint32_t high, low;
    asm volatile("rdtsc \n\t" : "=a"(low), "=d"(high));
    return ((uint64_t)high << 32) | (low);
#else
    return 0;
#endif
}

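// Resident set size of the current process in kB, parsed from
// /proc/<pid>/status. Only implemented on Linux; other platforms
// return 0 and print a warning.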
#ifdef __linux__

size_t get_mem_usage_kb() {
    int pid = getpid();
    char fname[256];
    snprintf(fname, 256, "/proc/%d/status", pid);
    FILE* f = fopen(fname, "r");
    FAISS_THROW_IF_NOT_MSG(f, "cannot open proc status file");
    size_t sz = 0;
    for (;;) {
        char buf[256];
        if (!fgets(buf, 256, f))
            break;
        if (sscanf(buf, "VmRSS: %zu kB", &sz) == 1)
            break;
    }
    fclose(f);
    return sz;
}

#else

size_t get_mem_usage_kb() {
    fprintf(stderr,
            "WARN: get_mem_usage_kb not implemented on current architecture\n");
    return 0;
}

#endif

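// In-place reflection of the n vectors x (each of dimension d) by the nu
// reflection directions stored row-wise in u:
//     x <- x - 2 <u_l, x> u_l    for l = 0..nu-1
// The inner-product loop is unrolled by two, so d is assumed to be even.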
void reflection(
        const float* __restrict u,
        float* __restrict x,
        size_t n,
        size_t d,
        size_t nu) {
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float* up = u;
        for (l = 0; l < nu; l++) {
            float ip1 = 0, ip2 = 0;

            for (j = 0; j < d; j += 2) {
                ip1 += up[j] * x[j];
                ip2 += up[j + 1] * x[j + 1];
            }
            float ip = 2 * (ip1 + ip2);

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];
            up += d;
        }
        x += d;
    }
}

/* Reference implementation (slower) */
void reflection_ref(const float* u, float* x, size_t n, size_t d, size_t nu) {
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float* up = u;
        for (l = 0; l < nu; l++) {
            double ip = 0;

            for (j = 0; j < d; j++)
                ip += up[j] * x[j];
            ip *= 2;

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];

            up += d;
        }
        x += d;
    }
}

/***************************************************************************
 * Some matrix manipulation functions
 ***************************************************************************/

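// QR factorization via LAPACK: sgeqrf_ computes the factorization (after a
// workspace-size query with lwork = -1) and sorgqr_ forms the orthonormal
// factor explicitly. On input, a holds an m x n column-major matrix with
// m >= n; on output its n columns are orthonormal. A typical use (sketch):
// fill a with float_randn() and call matrix_qr(m, n, a) to obtain a random
// orthonormal basis.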
void matrix_qr(int m, int n, float* a) {
    FAISS_THROW_IF_NOT(m >= n);
    FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
    std::vector<float> tau(ki);
    FINTEGER lwork = -1, info;
    float work_size;

    sgeqrf_(&mi, &ni, a, &mi, tau.data(), &work_size, &lwork, &info);
    lwork = size_t(work_size);
    std::vector<float> work(lwork);

    sgeqrf_(&mi, &ni, a, &mi, tau.data(), work.data(), &lwork, &info);

    sorgqr_(&mi, &ni, &ki, a, &mi, tau.data(), work.data(), &lwork, &info);
}

/***************************************************************************
 * Result list routines
 ***************************************************************************/

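// Within a k-element result list whose distances are sorted, re-sort the ids
// inside each run of equal distances (a run that extends to the end of the
// list is left untouched) so that ties come out in a deterministic order.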
void ranklist_handle_ties(int k, int64_t* idx, const float* dis) {
    float prev_dis = -1e38;
    int prev_i = -1;
    for (int i = 0; i < k; i++) {
        if (dis[i] != prev_dis) {
            if (i > prev_i + 1) {
                // sort between prev_i and i - 1
                std::sort(idx + prev_i, idx + i);
            }
            prev_i = i;
            prev_dis = dis[i];
        }
    }
}

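// Merge the n*k result table (D1, I1) into (D0, I0) per query, keeping the
// k best entries (smallest distances if keep_min, largest otherwise). Ids
// taken from the second table are shifted by `translation`; empty slots are
// marked with id -1 and NaN distance. Returns the total number of entries
// that came from the second table.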
size_t merge_result_table_with(
        size_t n,
        size_t k,
        int64_t* I0,
        float* D0,
        const int64_t* I1,
        const float* D1,
        bool keep_min,
        int64_t translation) {
    size_t n1 = 0;

#pragma omp parallel reduction(+ : n1)
    {
        std::vector<int64_t> tmpI(k);
        std::vector<float> tmpD(k);

#pragma omp for
        for (int64_t i = 0; i < n; i++) {
            int64_t* lI0 = I0 + i * k;
            float* lD0 = D0 + i * k;
            const int64_t* lI1 = I1 + i * k;
            const float* lD1 = D1 + i * k;
            size_t r0 = 0;
            size_t r1 = 0;

            if (keep_min) {
                for (size_t j = 0; j < k; j++) {
                    if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            } else {
                for (size_t j = 0; j < k; j++) {
                    if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            }
            n1 += r1;
            memcpy(lD0, tmpD.data(), sizeof(lD0[0]) * k);
            memcpy(lI0, tmpI.data(), sizeof(lI0[0]) * k);
        }
    }

    return n1;
}

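// Size of the intersection between the id lists v1 (size k1) and v2_in
// (size k2). The shorter list is copied, sorted and de-duplicated, then each
// element of the other list is looked up by binary search; bit 60 is used as
// a "seen" flag so that an entry is counted at most once.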
size_t ranklist_intersection_size(
        size_t k1,
        const int64_t* v1,
        size_t k2,
        const int64_t* v2_in) {
    if (k2 > k1)
        return ranklist_intersection_size(k2, v2_in, k1, v1);
    int64_t* v2 = new int64_t[k2];
    memcpy(v2, v2_in, sizeof(int64_t) * k2);
    std::sort(v2, v2 + k2);
    { // de-dup v2
        int64_t prev = -1;
        size_t wp = 0;
        for (size_t i = 0; i < k2; i++) {
            if (v2[i] != prev) {
                v2[wp++] = prev = v2[i];
            }
        }
        k2 = wp;
    }
    const int64_t seen_flag = int64_t{1} << 60;
    size_t count = 0;
    for (size_t i = 0; i < k1; i++) {
        int64_t q = v1[i];
        size_t i0 = 0, i1 = k2;
        while (i0 + 1 < i1) {
            size_t imed = (i1 + i0) / 2;
            int64_t piv = v2[imed] & ~seen_flag;
            if (piv <= q)
                i0 = imed;
            else
                i1 = imed;
        }
        if (v2[i0] == q) {
            count++;
            v2[i0] |= seen_flag;
        }
    }
    delete[] v2;

    return count;
}

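// Imbalance of a histogram of k bucket sizes: k * sum(hist^2) / (sum hist)^2.
// Equals 1.0 for perfectly balanced buckets and grows as the distribution
// becomes more skewed.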
double imbalance_factor(int k, const int* hist) {
    double tot = 0, uf = 0;

    for (int i = 0; i < k; i++) {
        tot += hist[i];
        uf += hist[i] * (double)hist[i];
    }
    uf = uf * k / (tot * tot);

    return uf;
}

double imbalance_factor(int n, int k, const int64_t* assign) {
    std::vector<int> hist(k, 0);
    for (int i = 0; i < n; i++) {
        hist[assign[i]]++;
    }

    return imbalance_factor(k, hist.data());
}

int ivec_hist(size_t n, const int* v, int vmax, int* hist) {
    memset(hist, 0, sizeof(hist[0]) * vmax);
    int nout = 0;
    while (n--) {
        if (v[n] < 0 || v[n] >= vmax)
            nout++;
        else
            hist[v[n]]++;
    }
    return nout;
}

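// Per-bit histogram of n binary codes of nbits bits each (nbits must be a
// multiple of 8): hist[b] receives the number of codes whose bit b is set.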
void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
    FAISS_THROW_IF_NOT(nbits % 8 == 0);
    size_t d = nbits / 8;
    std::vector<int> accu(d * 256);
    const uint8_t* c = codes;
    for (size_t i = 0; i < n; i++)
        for (int j = 0; j < d; j++)
            accu[j * 256 + *c++]++;
    memset(hist, 0, sizeof(*hist) * nbits);
    for (int i = 0; i < d; i++) {
        const int* ai = accu.data() + i * 256;
        int* hi = hist + i * 8;
        for (int j = 0; j < 256; j++)
            for (int k = 0; k < 8; k++)
                if ((j >> k) & 1)
                    hi[k] += ai[j];
    }
}

size_t ivec_checksum(size_t n, const int* a) {
    size_t cs = 112909;
    while (n--)
        cs = cs * 65713 + a[n] * 1686049;
    return cs;
}

namespace {
struct ArgsortComparator {
    const float* vals;
    bool operator()(const size_t a, const size_t b) const {
        return vals[a] < vals[b];
    }
};

struct SegmentS {
    size_t i0; // begin pointer in the permutation array
    size_t i1; // end
    size_t len() const {
        return i1 - i0;
    }
};

// see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
// extended to > 1 merge thread

// merges 2 ranges that should be consecutive on the source into
// the union of the two on the destination
template <typename T>
void parallel_merge(
        const T* src,
        T* dst,
        SegmentS& s1,
        SegmentS& s2,
        int nt,
        const ArgsortComparator& comp) {
    if (s2.len() > s1.len()) { // make sure that s1 larger than s2
        std::swap(s1, s2);
    }

    // compute sub-ranges for each thread
    std::vector<SegmentS> s1s(nt), s2s(nt), sws(nt);
    s2s[0].i0 = s2.i0;
    s2s[nt - 1].i1 = s2.i1;

    // not sure parallel actually helps here
#pragma omp parallel for num_threads(nt)
    for (int t = 0; t < nt; t++) {
        s1s[t].i0 = s1.i0 + s1.len() * t / nt;
        s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;

        if (t + 1 < nt) {
            T pivot = src[s1s[t].i1];
            size_t i0 = s2.i0, i1 = s2.i1;
            while (i0 + 1 < i1) {
                size_t imed = (i1 + i0) / 2;
                if (comp(pivot, src[imed])) {
                    i1 = imed;
                } else {
                    i0 = imed;
                }
            }
            s2s[t].i1 = s2s[t + 1].i0 = i1;
        }
    }
    s1.i0 = std::min(s1.i0, s2.i0);
    s1.i1 = std::max(s1.i1, s2.i1);
    s2 = s1;
    sws[0].i0 = s1.i0;
    for (int t = 0; t < nt; t++) {
        sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
        if (t + 1 < nt) {
            sws[t + 1].i0 = sws[t].i1;
        }
    }
    assert(sws[nt - 1].i1 == s1.i1);

    // do the actual merging
#pragma omp parallel for num_threads(nt)
    for (int t = 0; t < nt; t++) {
        SegmentS sw = sws[t];
        SegmentS s1t = s1s[t];
        SegmentS s2t = s2s[t];
        if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
            for (;;) {
                // assert (sw.len() == s1t.len() + s2t.len());
                if (comp(src[s1t.i0], src[s2t.i0])) {
                    dst[sw.i0++] = src[s1t.i0++];
                    if (s1t.i0 == s1t.i1)
                        break;
                } else {
                    dst[sw.i0++] = src[s2t.i0++];
                    if (s2t.i0 == s2t.i1)
                        break;
                }
            }
        }
        if (s1t.len() > 0) {
            assert(s1t.len() == sw.len());
            memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
        } else if (s2t.len() > 0) {
            assert(s2t.len() == sw.len());
            memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
        }
    }
}

} // namespace

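// Fill perm with the permutation that sorts vals in ascending order
// (perm[0] is the index of the smallest value).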
void fvec_argsort(size_t n, const float* vals, size_t* perm) {
    for (size_t i = 0; i < n; i++)
        perm[i] = i;
    ArgsortComparator comp = {vals};
    std::sort(perm, perm + n, comp);
}

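// Same result as fvec_argsort, computed as a parallel merge sort: each
// OpenMP thread sorts one chunk of the permutation, then the sorted chunks
// are merged pairwise (see parallel_merge above).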
void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
    size_t* perm2 = new size_t[n];
    // 2 result tables, during merging, flip between them
    size_t *permB = perm2, *permA = perm;

    int nt = omp_get_max_threads();
    { // prepare correct permutation so that the result ends in perm
      // at final iteration
        int nseg = nt;
        while (nseg > 1) {
            nseg = (nseg + 1) / 2;
            std::swap(permA, permB);
        }
    }

#pragma omp parallel
    for (size_t i = 0; i < n; i++)
        permA[i] = i;

    ArgsortComparator comp = {vals};

    std::vector<SegmentS> segs(nt);

    // independent sorts
#pragma omp parallel for
    for (int t = 0; t < nt; t++) {
        size_t i0 = t * n / nt;
        size_t i1 = (t + 1) * n / nt;
        SegmentS seg = {i0, i1};
        std::sort(permA + seg.i0, permA + seg.i1, comp);
        segs[t] = seg;
    }
    int prev_nested = omp_get_nested();
    omp_set_nested(1);

    int nseg = nt;
    while (nseg > 1) {
        int nseg1 = (nseg + 1) / 2;
        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
        int sub_nseg1 = nseg / 2;

#pragma omp parallel for num_threads(nseg1)
        for (int s = 0; s < nseg; s += 2) {
            if (s + 1 == nseg) { // otherwise isolated segment
                memcpy(permB + segs[s].i0,
                       permA + segs[s].i0,
                       segs[s].len() * sizeof(size_t));
            } else {
                int t0 = s * sub_nt / sub_nseg1;
                int t1 = (s + 1) * sub_nt / sub_nseg1;
                printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
                parallel_merge(
                        permA, permB, segs[s], segs[s + 1], t1 - t0, comp);
            }
        }
        for (int s = 0; s < nseg; s += 2)
            segs[s / 2] = segs[s];
        nseg = nseg1;
        std::swap(permA, permB);
    }
    assert(permA == perm);
    omp_set_nested(prev_nested);
    delete[] perm2;
}

const float* fvecs_maybe_subsample(
        size_t d,
        size_t* n,
        size_t nmax,
        const float* x,
        bool verbose,
        int64_t seed) {
    if (*n <= nmax)
        return x; // nothing to do

    size_t n2 = nmax;
    if (verbose) {
        printf("  Input training set too big (max size is %zd), sampling "
               "%zd / %zd vectors\n",
               nmax,
               n2,
               *n);
    }
    std::vector<int> subset(*n);
    rand_perm(subset.data(), *n, seed);
    float* x_subset = new float[n2 * d];
    for (int64_t i = 0; i < n2; i++)
        memcpy(&x_subset[i * d], &x[subset[i] * size_t(d)], sizeof(x[0]) * d);
    *n = n2;
    return x_subset;
}

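// binary_to_real expands d bits packed in x_in into d floats in {-1, +1};
// real_to_binary packs d floats (d must be a multiple of 8) into d/8 bytes,
// setting a bit when the corresponding value is > 0.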
void binary_to_real(size_t d, const uint8_t* x_in, float* x_out) {
    for (size_t i = 0; i < d; ++i) {
        x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
    }
}

void real_to_binary(size_t d, const float* x_in, uint8_t* x_out) {
    for (size_t i = 0; i < d / 8; ++i) {
        uint8_t b = 0;
        for (int j = 0; j < 8; ++j) {
            if (x_in[8 * i + j] > 0) {
                b |= (1 << j);
            }
        }
        x_out[i] = b;
    }
}

// from Python's stringobject.c
uint64_t hash_bytes(const uint8_t* bytes, int64_t n) {
    const uint8_t* p = bytes;
    uint64_t x = (uint64_t)(*p) << 7;
    int64_t len = n;
    while (--len >= 0) {
        x = (1000003 * x) ^ *p++;
    }
    x ^= n;
    return x;
}

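// Sanity check that OpenMP is functional: request 10 threads and run a
// parallel reduction, then verify that the region really executed in
// parallel with the requested number of threads.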
bool check_openmp() {
    omp_set_num_threads(10);

    if (omp_get_max_threads() != 10) {
        return false;
    }

    std::vector<int> nt_per_thread(10);
    size_t sum = 0;
    bool in_parallel = true;
#pragma omp parallel reduction(+ : sum)
    {
        if (!omp_in_parallel()) {
            in_parallel = false;
        }

        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

        nt_per_thread[rank] = nt;
#pragma omp for
        for (int i = 0; i < 1000 * 1000 * 10; i++) {
            sum += i;
        }
    }

    if (!in_parallel) {
        return false;
    }
    if (nt_per_thread[0] != 10) {
        return false;
    }
    if (sum == 0) {
        return false;
    }

    return true;
}

} // namespace faiss