1 #ifdef _WIN32
2 #define NOMINMAX
3 #include <windows.h>
4 #elif defined __linux__
5 #define _GNU_SOURCE
6 #include <sched.h>
7 #elif defined __FreeBSD__
8 #include <sys/param.h>
9 #include <sys/cpuset.h>
10 #elif defined __APPLE__
11 #define UNUSED(x) (void)(x)
12 #endif
13
14 #include <stdlib.h>
15 #include <stdint.h>
16 #include <string.h>
17 #include <stdio.h>
18 #include <unistd.h>
19 #include <errno.h>
20
21 #include "apic.h"
22 #include "cpuid_asm.h"
23 #include "../common/global.h"
24
25 /*
26 * bit_scan_reverse and create_mask code taken from:
27 * https://software.intel.com/content/www/us/en/develop/articles/intel-64-architecture-processor-topology-enumeration.html
28 */
/*
 * Find the index of the most significant set bit of mask.
 * On success, stores the bit index (0..63) in *index and returns 1.
 * Returns 0 when mask == 0; *index is left untouched in that case.
 */
unsigned char bit_scan_reverse(uint32_t* index, uint64_t mask) {
  for(uint64_t i = (8 * sizeof(uint64_t)); i > 0; i--) {
    if((mask & (1ULL << (i-1))) != 0) {
      // BUGFIX: the cast was (uint64_t), but *index is a uint32_t
      *index = (uint32_t) (i-1);
      break;
    }
  }
  return (unsigned char) (mask != 0);
}
38
create_mask(uint32_t num_entries,uint32_t * mask_width)39 uint32_t create_mask(uint32_t num_entries, uint32_t *mask_width) {
40 uint32_t i = 0;
41 uint64_t k = 0;
42
43 // NearestPo2(numEntries) is the nearest power of 2 integer that is not less than numEntries
44 // The most significant bit of (numEntries * 2 -1) matches the above definition
45
46 k = (uint64_t)(num_entries) * 2 -1;
47
48 if (bit_scan_reverse(&i, k) == 0) {
49 if (mask_width) *mask_width = 0;
50 return 0;
51 }
52
53 if (mask_width) *mask_width = i;
54 if (i == 31) return (uint32_t ) -1;
55
56 return (1 << i) -1;
57 }
58
/*
 * Read this logical CPU's APIC id via cpuid.
 * x2apic_id == true  -> leaf 0Bh, full 32-bit x2APIC id (EDX).
 * x2apic_id == false -> leaf 01h, legacy 8-bit initial APIC id (EBX[31:24]).
 */
uint32_t get_apic_id(bool x2apic_id) {
  uint32_t eax, ebx, ecx, edx;
  ebx = ecx = edx = 0;

  if(x2apic_id) {
    eax = 0x0000000B;
    cpuid(&eax, &ebx, &ecx, &edx);
    return edx;
  }

  eax = 0x00000001;
  cpuid(&eax, &ebx, &ecx, &edx);
  return ebx >> 24;
}
76
77 #ifndef __APPLE__
bind_to_cpu(int cpu_id)78 bool bind_to_cpu(int cpu_id) {
79 #ifdef _WIN32
80 HANDLE process = GetCurrentProcess();
81 DWORD_PTR processAffinityMask = 1 << cpu_id;
82 return SetProcessAffinityMask(process, processAffinityMask);
83 #elif defined __linux__
84 cpu_set_t currentCPU;
85 CPU_ZERO(¤tCPU);
86 CPU_SET(cpu_id, ¤tCPU);
87 if (sched_setaffinity (0, sizeof(currentCPU), ¤tCPU) == -1) {
88 printWarn("sched_setaffinity: %s", strerror(errno));
89 return false;
90 }
91 return true;
92 #elif defined __FreeBSD__
93 cpuset_t currentCPU;
94 CPU_ZERO(¤tCPU);
95 CPU_SET(cpu_id, ¤tCPU);
96 if(cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), ¤tCPU) == -1) {
97 printWarn("cpuset_setaffinity: %s", strerror(errno));
98 return false;
99 }
100 return true;
101 #endif
102 }
103 #endif
104
fill_topo_masks_apic(struct topology * topo)105 bool fill_topo_masks_apic(struct topology* topo) {
106 uint32_t eax = 0x00000001;
107 uint32_t ebx = 0;
108 uint32_t ecx = 0;
109 uint32_t edx = 0;
110 uint32_t core_plus_smt_id_max_cnt;
111 uint32_t core_id_max_cnt;
112 uint32_t smt_id_per_core_max_cnt;
113
114 cpuid(&eax, &ebx, &ecx, &edx);
115
116 core_plus_smt_id_max_cnt = (ebx >> 16) & 0xFF;
117
118 eax = 0x00000004;
119 ecx = 0;
120 cpuid(&eax, &ebx, &ecx, &edx);
121
122 core_id_max_cnt = (eax >> 26) + 1;
123 smt_id_per_core_max_cnt = core_plus_smt_id_max_cnt / core_id_max_cnt;
124
125 topo->apic->smt_mask = create_mask(smt_id_per_core_max_cnt, &(topo->apic->smt_mask_width));
126 topo->apic->core_mask = create_mask(core_id_max_cnt,&(topo->apic->pkg_mask_shift));
127 topo->apic->pkg_mask_shift += topo->apic->smt_mask_width;
128 topo->apic->core_mask <<= topo->apic->smt_mask_width;
129 topo->apic->pkg_mask = (-1) ^ (topo->apic->core_mask | topo->apic->smt_mask);
130
131 return true;
132 }
133
fill_topo_masks_x2apic(struct topology * topo)134 bool fill_topo_masks_x2apic(struct topology* topo) {
135 int32_t level_type;
136 int32_t level_shift;
137
138 int32_t coreplus_smt_mask = 0;
139 bool level2 = false;
140 bool level1 = false;
141
142 uint32_t eax = 0;
143 uint32_t ebx = 0;
144 uint32_t ecx = 0;
145 uint32_t edx = 0;
146 uint32_t i = 0;
147
148 while(true) {
149 eax = 0x0000000B;
150 ecx = i;
151 cpuid(&eax, &ebx, &ecx, &edx);
152 if(ebx == 0) break;
153
154 level_type = (ecx >> 8) & 0xFF;
155 level_shift = eax & 0xFFF;
156
157 switch(level_type) {
158 case 1: // SMT
159 topo->apic->smt_mask = ~(0xFFFFFFFF << level_shift);
160 topo->apic->smt_mask_width = level_shift;
161 topo->smt_supported = ebx & 0xFFFF;
162 level1 = true;
163 break;
164 case 2: // Core
165 coreplus_smt_mask = ~(0xFFFFFFFF << level_shift);
166 topo->apic->pkg_mask_shift = level_shift;
167 topo->apic->pkg_mask = (-1) ^ coreplus_smt_mask;
168 level2 = true;
169 break;
170 default:
171 printErr("Found invalid level when querying topology: %d", level_type);
172 break;
173 }
174
175 i++; // sublevel to query
176 }
177
178 if (level1 && level2) {
179 topo->apic->core_mask = coreplus_smt_mask ^ topo->apic->smt_mask;
180 }
181 else if (!level2 && level1) {
182 topo->apic->core_mask = 0;
183 topo->apic->pkg_mask_shift = topo->apic->smt_mask_width;
184 topo->apic->pkg_mask = (-1) ^ topo->apic->smt_mask;
185 }
186 else {
187 printErr("SMT level was not found when querying topology");
188 return false;
189 }
190
191 return true;
192 }
193
194 // Not a very elegant solution. The width should always be as long
195 // as the number of cores, but in the case of Xeon Phi KNL it is not
max_apic_id_size(uint32_t ** cache_id_apic,struct topology * topo)196 uint32_t max_apic_id_size(uint32_t** cache_id_apic, struct topology* topo) {
197 uint32_t max = 0;
198
199 for(int i=0; i < topo->cach->max_cache_level; i++) {
200 for(int j=0; j < topo->total_cores; j++) {
201 if(cache_id_apic[j][i] > max) max = cache_id_apic[j][i];
202 }
203 }
204
205 max++;
206 if(max > (uint32_t) topo->total_cores) return max;
207 return topo->total_cores;
208 }
209
build_topo_from_apic(uint32_t * apic_pkg,uint32_t * apic_smt,uint32_t ** cache_id_apic,struct topology * topo)210 bool build_topo_from_apic(uint32_t* apic_pkg, uint32_t* apic_smt, uint32_t** cache_id_apic, struct topology* topo) {
211 uint32_t size = max_apic_id_size(cache_id_apic, topo);
212 uint32_t* sockets = emalloc(sizeof(uint32_t) * size);
213 uint32_t* smt = emalloc(sizeof(uint32_t) * size);
214 uint32_t* apic_id = emalloc(sizeof(uint32_t) * size);
215 uint32_t num_caches = 0;
216
217 memset(sockets, 0, sizeof(uint32_t) * size);
218 memset(smt, 0, sizeof(uint32_t) * size);
219 memset(apic_id, 0, sizeof(uint32_t) * size);
220
221 // System topology
222 for(int i=0; i < topo->total_cores; i++) {
223 sockets[apic_pkg[i]] = 1;
224 smt[apic_smt[i]] = 1;
225 }
226 for(int i=0; i < topo->total_cores; i++) {
227 if(sockets[i] != 0)
228 topo->sockets++;
229 if(smt[i] != 0)
230 topo->smt_available++;
231 }
232
233 topo->logical_cores = topo->total_cores / topo->sockets;
234 topo->physical_cores = topo->logical_cores / topo->smt_available;
235
236 // Cache topology
237 for(int i=0; i < topo->cach->max_cache_level; i++) {
238 num_caches = 0;
239 memset(apic_id, 0, sizeof(uint32_t) * size);
240
241 for(int c=0; c < topo->total_cores; c++) {
242 apic_id[cache_id_apic[c][i]]++;
243 }
244 for(uint32_t c=0; c < size; c++) {
245 if(apic_id[c] > 0) num_caches++;
246 }
247
248 topo->cach->cach_arr[i]->num_caches = num_caches;
249 }
250
251 free(sockets);
252 free(smt);
253 free(apic_id);
254
255 return true;
256 }
257
get_cache_topology_from_apic(struct topology * topo)258 void get_cache_topology_from_apic(struct topology* topo) {
259 uint32_t eax = 0x00000004;
260 uint32_t ebx = 0;
261 uint32_t ecx = 0;
262 uint32_t edx = 0;
263
264 for(int i=0; i < topo->cach->max_cache_level; i++) {
265 eax = 0x00000004;
266 ecx = i;
267
268 cpuid(&eax, &ebx, &ecx, &edx);
269
270 uint32_t SMTMaxCntPerEachCache = ((eax >> 14) & 0x7FF) + 1;
271 uint32_t dummy;
272 topo->apic->cache_select_mask[i] = create_mask(SMTMaxCntPerEachCache,&dummy);
273 }
274 }
275
/*
 * Returns true when every one of the n slots of apic_ids has been filled,
 * i.e. no slot still holds the (uint32_t) -1 "empty" sentinel.
 */
bool apic_array_full(uint32_t* apic_ids, int n) {
  bool full = true;
  for(int i = 0; full && i < n; i++) {
    full = (apic_ids[i] != (uint32_t) -1);
  }
  return full;
}
282
/*
 * Insert apic into the first free slot of apic_ids unless it is already
 * present. Free slots hold the (uint32_t) -1 sentinel; used slots are
 * assumed to be packed at the front of the array.
 */
void add_apic_to_array(uint32_t apic, uint32_t* apic_ids, int n) {
  int last = 0;       // index one past the last used slot
  bool found = false;

  for(int i = 0; !found && i < n; i++) {
    if(apic_ids[i] == apic) found = true;
    if(apic_ids[i] != (uint32_t) -1) last = i+1;
  }

  // BUGFIX: if the array was already full and apic was new, the original
  // wrote to apic_ids[n], out of bounds. Drop the value instead.
  if(!found && last < n) {
    apic_ids[last] = apic;
  }
}
299
/*
 * Collect the APIC id of every one of the n logical CPUs into apic_ids.
 * On platforms with an affinity API we pin to each CPU and read its id;
 * on macOS we repeatedly sample and rely on the scheduler migrating us.
 * Returns false if binding to a CPU fails.
 */
bool fill_apic_ids(uint32_t* apic_ids, int n, bool x2apic_id) {
#ifdef __APPLE__
  // macOS extremely dirty approach...
  printf("cpufetch is computing APIC IDs, please wait...\n");
  for(int i = 0; i < n; i++) apic_ids[i] = (uint32_t) -1;

  bool done = false;
  while(!done) {
    uint32_t apic = get_apic_id(x2apic_id);
    add_apic_to_array(apic, apic_ids, n);
    done = apic_array_full(apic_ids, n);
    usleep(1000);
  }
#else
  for(int cpu = 0; cpu < n; cpu++) {
    if(!bind_to_cpu(cpu)) {
      printErr("Failed binding to CPU %d", cpu);
      return false;
    }
    apic_ids[cpu] = get_apic_id(x2apic_id);
  }
#endif
  return true;
}
326
get_topology_from_apic(struct cpuInfo * cpu,struct topology * topo)327 bool get_topology_from_apic(struct cpuInfo* cpu, struct topology* topo) {
328 uint32_t apic_id;
329 uint32_t* apic_ids = emalloc(sizeof(uint32_t) * topo->total_cores);
330 uint32_t* apic_pkg = emalloc(sizeof(uint32_t) * topo->total_cores);
331 uint32_t* apic_core = emalloc(sizeof(uint32_t) * topo->total_cores);
332 uint32_t* apic_smt = emalloc(sizeof(uint32_t) * topo->total_cores);
333 uint32_t** cache_smt_id_apic = emalloc(sizeof(uint32_t*) * topo->total_cores);
334 uint32_t** cache_id_apic = emalloc(sizeof(uint32_t*) * topo->total_cores);
335 bool x2apic_id;
336
337 if(cpu->maxLevels >= 0x0000000B) {
338 uint32_t eax = 0x0000000B;
339 uint32_t ebx = 0;
340 uint32_t ecx = 0;
341 uint32_t edx = 0;
342
343 cpuid(&eax, &ebx, &ecx, &edx);
344
345 if(ebx == 0) x2apic_id = false;
346 else x2apic_id = true;
347 }
348 else {
349 x2apic_id = false;
350 }
351
352 for(int i=0; i < topo->total_cores; i++) {
353 cache_smt_id_apic[i] = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));
354 cache_id_apic[i] = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));
355 }
356 topo->apic->cache_select_mask = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));
357 topo->apic->cache_id_apic = emalloc(sizeof(uint32_t) * (topo->cach->max_cache_level));
358
359 if(x2apic_id) {
360 if(!fill_topo_masks_x2apic(topo))
361 return false;
362 }
363 else {
364 if(!fill_topo_masks_apic(topo))
365 return false;
366 }
367
368 get_cache_topology_from_apic(topo);
369
370 if(!fill_apic_ids(apic_ids, topo->total_cores, x2apic_id))
371 return false;
372
373 for(int i=0; i < topo->total_cores; i++) {
374 apic_id = apic_ids[i];
375
376 apic_pkg[i] = (apic_id & topo->apic->pkg_mask) >> topo->apic->pkg_mask_shift;
377 apic_core[i] = (apic_id & topo->apic->core_mask) >> topo->apic->smt_mask_width;
378 apic_smt[i] = apic_id & topo->apic->smt_mask;
379
380 for(int c=0; c < topo->cach->max_cache_level; c++) {
381 cache_smt_id_apic[i][c] = apic_id & topo->apic->cache_select_mask[c];
382 cache_id_apic[i][c] = apic_id & (-1 ^ topo->apic->cache_select_mask[c]);
383 }
384 }
385
386 /* DEBUG
387 for(int i=0; i < topo->cach->max_cache_level; i++) {
388 printf("[CACH %1d]", i);
389 for(int j=0; j < topo->total_cores; j++)
390 printf("[%03d]", cache_id_apic[j][i]);
391 printf("\n");
392 }
393 for(int i=0; i < topo->total_cores; i++)
394 printf("[%2d] 0x%.8X\n", i, apic_pkg[i]);
395 printf("\n");
396 for(int i=0; i < topo->total_cores; i++)
397 printf("[%2d] 0x%.8X\n", i, apic_core[i]);
398 printf("\n");
399 for(int i=0; i < topo->total_cores; i++)
400 printf("[%2d] 0x%.8X\n", i, apic_smt[i]);*/
401
402
403 bool ret = build_topo_from_apic(apic_pkg, apic_smt, cache_id_apic, topo);
404
405 // Assumption: If we cant get smt_available, we assume it is equal to smt_supported...
406 if (!x2apic_id) {
407 printWarn("Can't read SMT from cpuid (needed level is 0x%.8X, max is 0x%.8X)", 0x0000000B, cpu->maxLevels);
408 topo->smt_supported = topo->smt_available;
409 }
410
411 free(apic_pkg);
412 free(apic_core);
413 free(apic_smt);
414 for(int i=0; i < topo->total_cores; i++) {
415 free(cache_smt_id_apic[i]);
416 free(cache_id_apic[i]);
417 }
418 free(cache_smt_id_apic);
419 free(cache_id_apic);
420
421 return ret;
422 }
423
is_smt_enabled_amd(struct topology * topo)424 uint32_t is_smt_enabled_amd(struct topology* topo) {
425 #ifdef __APPLE__
426 UNUSED(topo);
427 return 1;
428 #else
429 uint32_t id;
430
431 for(int i = 0; i < topo->total_cores; i++) {
432 if(!bind_to_cpu(i)) {
433 printErr("Failed binding to CPU %d", i);
434 return false;
435 }
436 id = get_apic_id(false) & 1; // get the last bit
437 if(id == 1) return 2; // We assume there isn't any AMD CPU with more than 2th per core.
438 }
439
440 return 1;
441 #endif
442 }
443