#ifdef _WIN32
  #define NOMINMAX
  #include <windows.h>
#elif defined __linux__
  #define _GNU_SOURCE
  #include <sched.h>
#elif defined __FreeBSD__
  #include <sys/param.h>
  #include <sys/cpuset.h>
#elif defined __APPLE__
  #define UNUSED(x) (void)(x)
#endif

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <errno.h>

#include "apic.h"
#include "cpuid_asm.h"
#include "../common/global.h"

/*
 * bit_scan_reverse and create_mask code taken from:
 * https://software.intel.com/content/www/us/en/develop/articles/intel-64-architecture-processor-topology-enumeration.html
 */
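/* Store in *index the position of the most significant set bit of mask.
 * Returns 0 (leaving *index untouched) when mask is 0, nonzero otherwise. */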
unsigned char bit_scan_reverse(uint32_t* index, uint64_t mask) {
  for(uint64_t i = (8 * sizeof(uint64_t)); i > 0; i--) {
    if((mask & (1ULL << (i-1))) != 0) {
      *index = (uint32_t) (i-1);
      break;
    }
  }
  return (unsigned char) (mask != 0);
}

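/* Build a bit mask wide enough to hold num_entries distinct values, i.e.,
 * a mask of ceil(log2(num_entries)) set bits; the width is returned in
 * *mask_width. */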
uint32_t create_mask(uint32_t num_entries, uint32_t *mask_width) {
  uint32_t i = 0;
  uint64_t k = 0;

  // NearestPo2(num_entries) is the nearest power of 2 that is not less than
  // num_entries; the most significant bit of (num_entries * 2 - 1) gives its position

  k = (uint64_t)(num_entries) * 2 - 1;

  if (bit_scan_reverse(&i, k) == 0) {
    if (mask_width) *mask_width = 0;
    return 0;
  }

  if (mask_width) *mask_width = i;
  if (i == 31) return (uint32_t) -1;

  return (1U << i) - 1;
}

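/* Read the APIC ID of the core we are currently running on: the 32-bit
 * x2APIC ID from CPUID leaf 0xB (EDX), or the legacy 8-bit initial APIC ID
 * from leaf 0x1 (EBX[31:24]). */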
uint32_t get_apic_id(bool x2apic_id) {
  uint32_t eax = 0;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  if(x2apic_id) {
    eax = 0x0000000B;
    cpuid(&eax, &ebx, &ecx, &edx);
    return edx;
  }
  else {
    eax = 0x00000001;
    cpuid(&eax, &ebx, &ecx, &edx);
    return (ebx >> 24);
  }
}

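/* Pin the current process (Windows) or thread (Linux/FreeBSD) to cpu_id so
 * that subsequent CPUID calls run on that core. macOS has no public affinity
 * API, so this function is not compiled there. */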
#ifndef __APPLE__
bool bind_to_cpu(int cpu_id) {
  #ifdef _WIN32
    HANDLE process = GetCurrentProcess();
    // Cast before shifting: a plain (1 << cpu_id) is a 32-bit shift and
    // overflows for cpu_id >= 31 on 64-bit Windows
    DWORD_PTR processAffinityMask = (DWORD_PTR) 1 << cpu_id;
    return SetProcessAffinityMask(process, processAffinityMask) != 0;
  #elif defined __linux__
    cpu_set_t currentCPU;
    CPU_ZERO(&currentCPU);
    CPU_SET(cpu_id, &currentCPU);
    if (sched_setaffinity(0, sizeof(currentCPU), &currentCPU) == -1) {
      printWarn("sched_setaffinity: %s", strerror(errno));
      return false;
    }
    return true;
  #elif defined __FreeBSD__
    cpuset_t currentCPU;
    CPU_ZERO(&currentCPU);
    CPU_SET(cpu_id, &currentCPU);
    if(cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &currentCPU) == -1) {
      printWarn("cpuset_setaffinity: %s", strerror(errno));
      return false;
    }
    return true;
  #endif
}
#endif

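/* Legacy topology enumeration (used when leaf 0xB is unavailable): derive
 * the SMT/core/package masks from CPUID leaf 0x1 (EBX[23:16], maximum
 * addressable logical processors per package) and leaf 0x4 (EAX[31:26],
 * maximum addressable cores per package). */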
bool fill_topo_masks_apic(struct topology* topo) {
  uint32_t eax = 0x00000001;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;
  uint32_t core_plus_smt_id_max_cnt;
  uint32_t core_id_max_cnt;
  uint32_t smt_id_per_core_max_cnt;

  cpuid(&eax, &ebx, &ecx, &edx);

  core_plus_smt_id_max_cnt = (ebx >> 16) & 0xFF;

  eax = 0x00000004;
  ecx = 0;
  cpuid(&eax, &ebx, &ecx, &edx);

  core_id_max_cnt = (eax >> 26) + 1;
  smt_id_per_core_max_cnt = core_plus_smt_id_max_cnt / core_id_max_cnt;

  topo->apic->smt_mask = create_mask(smt_id_per_core_max_cnt, &(topo->apic->smt_mask_width));
  topo->apic->core_mask = create_mask(core_id_max_cnt, &(topo->apic->pkg_mask_shift));
  topo->apic->pkg_mask_shift += topo->apic->smt_mask_width;
  topo->apic->core_mask <<= topo->apic->smt_mask_width;
  topo->apic->pkg_mask = (-1) ^ (topo->apic->core_mask | topo->apic->smt_mask);

  return true;
}

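/* Modern topology enumeration via CPUID leaf 0xB: walk the enumeration
 * sub-leaves (ECX = 0, 1, ...) and build the SMT/core/package masks from
 * each level's shift width (EAX) and level type (ECX[15:8]). */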
bool fill_topo_masks_x2apic(struct topology* topo) {
  int32_t level_type;
  int32_t level_shift;

  uint32_t coreplus_smt_mask = 0;
  bool level2 = false;
  bool level1 = false;

  uint32_t eax = 0;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;
  uint32_t i = 0;

  while(true) {
    eax = 0x0000000B;
    ecx = i;
    cpuid(&eax, &ebx, &ecx, &edx);
    if(ebx == 0) break; // no more levels to enumerate

    level_type = (ecx >> 8) & 0xFF;
    level_shift = eax & 0xFFF;

    switch(level_type) {
      case 1: // SMT
        topo->apic->smt_mask = ~(0xFFFFFFFF << level_shift);
        topo->apic->smt_mask_width = level_shift;
        topo->smt_supported = ebx & 0xFFFF;
        level1 = true;
        break;
      case 2: // Core
        coreplus_smt_mask = ~(0xFFFFFFFF << level_shift);
        topo->apic->pkg_mask_shift = level_shift;
        topo->apic->pkg_mask = (-1) ^ coreplus_smt_mask;
        level2 = true;
        break;
      default:
        printErr("Found invalid level when querying topology: %d", level_type);
        break;
    }

    i++; // next sub-leaf to query
  }

  if (level1 && level2) {
    topo->apic->core_mask = coreplus_smt_mask ^ topo->apic->smt_mask;
  }
  else if (!level2 && level1) {
    // No core level reported: the SMT mask covers everything below the package
    topo->apic->core_mask = 0;
    topo->apic->pkg_mask_shift = topo->apic->smt_mask_width;
    topo->apic->pkg_mask = (-1) ^ topo->apic->smt_mask;
  }
  else {
    printErr("SMT level was not found when querying topology");
    return false;
  }

  return true;
}

// Not a very elegant solution. The array size should normally equal the
// number of cores, but on some CPUs (e.g., Xeon Phi KNL) cache IDs can
// exceed the core count, so return the maximum of both
uint32_t max_apic_id_size(uint32_t** cache_id_apic, struct topology* topo) {
  uint32_t max = 0;

  for(int i=0; i < topo->cach->max_cache_level; i++) {
    for(int j=0; j < topo->total_cores; j++) {
      if(cache_id_apic[j][i] > max) max = cache_id_apic[j][i];
    }
  }

  max++;
  if(max > (uint32_t) topo->total_cores) return max;
  return topo->total_cores;
}

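/* Count sockets, SMT threads per core and per-level cache instances from the
 * decoded APIC ID fields, and fill the corresponding topology counters. */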
bool build_topo_from_apic(uint32_t* apic_pkg, uint32_t* apic_smt, uint32_t** cache_id_apic, struct topology* topo) {
  uint32_t size = max_apic_id_size(cache_id_apic, topo);
  uint32_t* sockets = emalloc(sizeof(uint32_t) * size);
  uint32_t* smt = emalloc(sizeof(uint32_t) * size);
  uint32_t* apic_id = emalloc(sizeof(uint32_t) * size);
  uint32_t num_caches = 0;

  memset(sockets, 0, sizeof(uint32_t) * size);
  memset(smt, 0, sizeof(uint32_t) * size);
  memset(apic_id, 0, sizeof(uint32_t) * size);

  // System topology
  for(int i=0; i < topo->total_cores; i++) {
    sockets[apic_pkg[i]] = 1;
    smt[apic_smt[i]] = 1;
  }
  for(int i=0; i < topo->total_cores; i++) {
    if(sockets[i] != 0)
      topo->sockets++;
    if(smt[i] != 0)
      topo->smt_available++;
  }

  topo->logical_cores = topo->total_cores / topo->sockets;
  topo->physical_cores = topo->logical_cores / topo->smt_available;

  // Cache topology
  for(int i=0; i < topo->cach->max_cache_level; i++) {
    num_caches = 0;
    memset(apic_id, 0, sizeof(uint32_t) * size);

    for(int c=0; c < topo->total_cores; c++) {
      apic_id[cache_id_apic[c][i]]++;
    }
    for(uint32_t c=0; c < size; c++) {
      if(apic_id[c] > 0) num_caches++;
    }

    topo->cach->cach_arr[i]->num_caches = num_caches;
  }

  free(sockets);
  free(smt);
  free(apic_id);

  return true;
}

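/* For each cache level, read CPUID leaf 0x4 (EAX[25:14]: maximum number of
 * addressable logical processors sharing that cache, minus one) and build
 * the mask selecting the cache-sharing bits of the APIC ID. */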
void get_cache_topology_from_apic(struct topology* topo) {
  uint32_t eax = 0x00000004;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  for(int i=0; i < topo->cach->max_cache_level; i++) {
    eax = 0x00000004;
    ecx = i;

    cpuid(&eax, &ebx, &ecx, &edx);

    // EAX[25:14] is a 12-bit field, so mask with 0xFFF (not 0x7FF)
    uint32_t SMTMaxCntPerEachCache = ((eax >> 14) & 0xFFF) + 1;
    uint32_t dummy;
    topo->apic->cache_select_mask[i] = create_mask(SMTMaxCntPerEachCache, &dummy);
  }
}

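/* Helpers for the macOS path of fill_apic_ids: apic_ids entries start out as
 * (uint32_t)-1 and are filled in as new APIC IDs are observed. */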
bool apic_array_full(uint32_t* apic_ids, int n) {
  for(int i=0; i < n; i++) {
    if(apic_ids[i] == (uint32_t) -1) return false;
  }
  return true;
}

void add_apic_to_array(uint32_t apic, uint32_t* apic_ids, int n) {
  int i=0;
  int last=0;
  bool found = false;

  while(!found && i < n) {
    if(apic_ids[i] == apic) found = true;
    if(apic_ids[i] != (uint32_t) -1) last = i+1;
    i++;
  }

  if(!found) {
    apic_ids[last] = apic;
    //printf("Added %d\n", apic);
  }
}

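/* Collect one APIC ID per logical CPU. On systems with an affinity API the
 * process is pinned to each CPU in turn; on macOS we repeatedly sample the
 * current core's APIC ID until all n distinct IDs have been observed. */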
bool fill_apic_ids(uint32_t* apic_ids, int n, bool x2apic_id) {
#ifdef __APPLE__
  // macOS has no affinity API, so resort to an extremely dirty approach:
  // keep sampling until the scheduler has run us on every core
  printf("cpufetch is computing APIC IDs, please wait...\n");
  bool end = false;
  uint32_t apic;
  for(int i=0; i < n; i++) apic_ids[i] = (uint32_t) -1;

  while(!end) {
    apic = get_apic_id(x2apic_id);

    add_apic_to_array(apic, apic_ids, n);
    end = apic_array_full(apic_ids, n);
    usleep(1000);
  }
#else
  for(int i=0; i < n; i++) {
    if(!bind_to_cpu(i)) {
      printErr("Failed binding to CPU %d", i);
      return false;
    }
    apic_ids[i] = get_apic_id(x2apic_id);
  }
#endif
  return true;
}

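/* Main entry point: choose between x2APIC (leaf 0xB) and legacy APIC
 * enumeration, gather every core's APIC ID, decode the package/core/SMT
 * fields with the computed masks, and build the final topology. */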
bool get_topology_from_apic(struct cpuInfo* cpu, struct topology* topo) {
  uint32_t apic_id;
  uint32_t* apic_ids = emalloc(sizeof(uint32_t) * topo->total_cores);
  uint32_t* apic_pkg = emalloc(sizeof(uint32_t) * topo->total_cores);
  uint32_t* apic_core = emalloc(sizeof(uint32_t) * topo->total_cores);
  uint32_t* apic_smt = emalloc(sizeof(uint32_t) * topo->total_cores);
  uint32_t** cache_smt_id_apic = emalloc(sizeof(uint32_t*) * topo->total_cores);
  uint32_t** cache_id_apic = emalloc(sizeof(uint32_t*) * topo->total_cores);
  bool x2apic_id;

  // x2APIC IDs are available when leaf 0xB exists and reports a nonzero EBX
  if(cpu->maxLevels >= 0x0000000B) {
    uint32_t eax = 0x0000000B;
    uint32_t ebx = 0;
    uint32_t ecx = 0;
    uint32_t edx = 0;

    cpuid(&eax, &ebx, &ecx, &edx);

    x2apic_id = (ebx != 0);
  }
  else {
    x2apic_id = false;
  }

  for(int i=0; i < topo->total_cores; i++) {
    cache_smt_id_apic[i] = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));
    cache_id_apic[i] = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));
  }
  topo->apic->cache_select_mask = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));
  topo->apic->cache_id_apic = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));

  if(x2apic_id) {
    if(!fill_topo_masks_x2apic(topo))
      return false;
  }
  else {
    if(!fill_topo_masks_apic(topo))
      return false;
  }

  get_cache_topology_from_apic(topo);

  if(!fill_apic_ids(apic_ids, topo->total_cores, x2apic_id))
    return false;

  for(int i=0; i < topo->total_cores; i++) {
    apic_id = apic_ids[i];

    apic_pkg[i] = (apic_id & topo->apic->pkg_mask) >> topo->apic->pkg_mask_shift;
    apic_core[i] = (apic_id & topo->apic->core_mask) >> topo->apic->smt_mask_width;
    apic_smt[i] = apic_id & topo->apic->smt_mask;

    for(int c=0; c < topo->cach->max_cache_level; c++) {
      cache_smt_id_apic[i][c] = apic_id & topo->apic->cache_select_mask[c];
      cache_id_apic[i][c] = apic_id & ~(topo->apic->cache_select_mask[c]);
    }
  }

  /* DEBUG
  for(int i=0; i < topo->cach->max_cache_level; i++) {
    printf("[CACH %1d]", i);
    for(int j=0; j < topo->total_cores; j++)
      printf("[%03d]", cache_id_apic[j][i]);
    printf("\n");
  }
  for(int i=0; i < topo->total_cores; i++)
    printf("[%2d] 0x%.8X\n", i, apic_pkg[i]);
  printf("\n");
  for(int i=0; i < topo->total_cores; i++)
    printf("[%2d] 0x%.8X\n", i, apic_core[i]);
  printf("\n");
  for(int i=0; i < topo->total_cores; i++)
    printf("[%2d] 0x%.8X\n", i, apic_smt[i]);*/


  bool ret = build_topo_from_apic(apic_pkg, apic_smt, cache_id_apic, topo);

  // Assumption: if we can't get smt_supported (requires leaf 0xB), we assume
  // it is equal to smt_available...
  if (!x2apic_id) {
    printWarn("Can't read SMT from cpuid (needed level is 0x%.8X, max is 0x%.8X)", 0x0000000B, cpu->maxLevels);
    topo->smt_supported = topo->smt_available;
  }

  free(apic_ids);
  free(apic_pkg);
  free(apic_core);
  free(apic_smt);
  for(int i=0; i < topo->total_cores; i++) {
    free(cache_smt_id_apic[i]);
    free(cache_id_apic[i]);
  }
  free(cache_smt_id_apic);
  free(cache_id_apic);

  return ret;
}

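/* AMD SMT detection: with SMT enabled, the lowest APIC ID bit distinguishes
 * the two threads of a core, so any core reporting a set bit implies SMT. */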
uint32_t is_smt_enabled_amd(struct topology* topo) {
#ifdef __APPLE__
  UNUSED(topo);
  return 1;
#else
  uint32_t id;

  for(int i = 0; i < topo->total_cores; i++) {
    if(!bind_to_cpu(i)) {
      printErr("Failed binding to CPU %d", i);
      return 0;
    }
    id = get_apic_id(false) & 1; // get the last bit
    if(id == 1) return 2; // We assume no AMD CPU has more than 2 threads per core
  }

  return 1;
#endif
}