1 /*
2 * %CopyrightBegin%
3 *
4 * Copyright Ericsson AB 2010-2020. All Rights Reserved.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 * %CopyrightEnd%
19 */
20
21 /*
22 * Description: CPU topology and related functionality
23 *
24 * Author: Rickard Green
25 */
26
27 #ifdef HAVE_CONFIG_H
28 # include "config.h"
29 #endif
30
31 #include <ctype.h>
32
33 #include "global.h"
34 #include "error.h"
35 #include "bif.h"
36 #include "erl_cpu_topology.h"
37 #include "erl_flxctr.h"
38
39 #define ERTS_MAX_READER_GROUPS 64
40
41 /*
42 * Cpu topology hierarchy.
43 */
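/*
 * The levels are ordered from the outermost entity (NUMA node) to the
 * innermost (logical processor id); a smaller value means a position
 * closer to the root of the hierarchy. The values are used both as
 * indices into erts_avail_cput.level[] and for ordering checks when
 * parsing a user supplied topology string.
 */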
44 #define ERTS_TOPOLOGY_NODE 0
45 #define ERTS_TOPOLOGY_PROCESSOR 1
46 #define ERTS_TOPOLOGY_PROCESSOR_NODE 2
47 #define ERTS_TOPOLOGY_CORE 3
48 #define ERTS_TOPOLOGY_THREAD 4
49 #define ERTS_TOPOLOGY_LOGICAL 5
50
51 #define ERTS_TOPOLOGY_MAX_DEPTH 6
52
53 typedef struct {
54 int bind_id;
55 int bound_id;
56 } ErtsCpuBindData;
57
58 static erts_cpu_info_t *cpuinfo;
59
60 static int max_main_threads;
61 static int reader_groups;
62 static int decentralized_counter_groups;
63
64 static ErtsCpuBindData *scheduler2cpu_map;
65 static erts_rwmtx_t cpuinfo_rwmtx;
66
67 typedef enum {
68 ERTS_CPU_BIND_UNDEFINED,
69 ERTS_CPU_BIND_SPREAD,
70 ERTS_CPU_BIND_PROCESSOR_SPREAD,
71 ERTS_CPU_BIND_THREAD_SPREAD,
72 ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD,
73 ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD,
74 ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD,
75 ERTS_CPU_BIND_NO_SPREAD,
76 ERTS_CPU_BIND_NONE
77 } ErtsCpuBindOrder;
78
79 #define ERTS_CPU_BIND_DEFAULT_BIND \
80 ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD
81
82 static int no_cpu_groups_callbacks;
83 static ErtsCpuBindOrder cpu_bind_order;
84
85 static erts_cpu_topology_t *user_cpudata;
86 static int user_cpudata_size;
87 static erts_cpu_topology_t *system_cpudata;
88 static int system_cpudata_size;
89
90 typedef struct {
91 int level[ERTS_TOPOLOGY_MAX_DEPTH+1];
92 } erts_avail_cput;
93
94 typedef struct {
95 int id;
96 int sub_levels;
97 int cpu_groups;
98 } erts_cpu_groups_count_t;
99
100 typedef struct {
101 int logical;
102 int cpu_group;
103 } erts_cpu_groups_map_array_t;
104
105 typedef struct erts_cpu_groups_callback_list_t_ erts_cpu_groups_callback_list_t;
106 struct erts_cpu_groups_callback_list_t_ {
107 erts_cpu_groups_callback_list_t *next;
108 erts_cpu_groups_callback_t callback;
109 void *arg;
110 };
111
112 typedef struct erts_cpu_groups_map_t_ erts_cpu_groups_map_t;
113 struct erts_cpu_groups_map_t_ {
114 erts_cpu_groups_map_t *next;
115 int groups;
116 erts_cpu_groups_map_array_t *array;
117 int size;
118 int logical_processors;
119 erts_cpu_groups_callback_list_t *callback_list;
120 };
121
122 typedef struct {
123 erts_cpu_groups_callback_t callback;
124 int ix;
125 void *arg;
126 } erts_cpu_groups_callback_call_t;
127
128 static erts_cpu_groups_map_t *cpu_groups_maps;
129
130 static erts_cpu_groups_map_t *reader_groups_map;
131
132 static erts_cpu_groups_map_t *decentralized_counter_groups_map;
133
134 #define ERTS_TOPOLOGY_CG ERTS_TOPOLOGY_MAX_DEPTH
135
136 #define ERTS_MAX_CPU_TOPOLOGY_ID ((int) 0xffff)
137
138 static void cpu_bind_order_sort(erts_cpu_topology_t *cpudata,
139 int size,
140 ErtsCpuBindOrder bind_order,
141 int mk_seq);
142 static void write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size);
143
144 static void reader_groups_callback(int, ErtsSchedulerData *, int, void *);
145 static void flxctr_groups_callback(int, ErtsSchedulerData *, int, void *);
146 static erts_cpu_groups_map_t *add_cpu_groups(int groups,
147 erts_cpu_groups_callback_t callback,
148 void *arg);
149 static void update_cpu_groups_maps(void);
150 static void make_cpu_groups_map(erts_cpu_groups_map_t *map, int test);
151 static int cpu_groups_lookup(erts_cpu_groups_map_t *map,
152 ErtsSchedulerData *esdp);
153
154 static void create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata,
155 int *cpudata_size);
156 static void destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata);
157
158 static int
159 int_cmp(const void *vx, const void *vy)
160 {
161 return *((int *) vx) - *((int *) vy);
162 }
163
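/*
 * Each comparator below realizes one ErtsCpuBindOrder spread strategy
 * by ranking the topology levels in a different order of significance.
 * The topology is sorted with the chosen comparator and schedulers are
 * then assigned to logical cpus in the resulting order.
 */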
164 static int
165 cpu_spread_order_cmp(const void *vx, const void *vy)
166 {
167 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
168 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
169
170 if (x->thread != y->thread)
171 return x->thread - y->thread;
172 if (x->core != y->core)
173 return x->core - y->core;
174 if (x->processor_node != y->processor_node)
175 return x->processor_node - y->processor_node;
176 if (x->processor != y->processor)
177 return x->processor - y->processor;
178 if (x->node != y->node)
179 return x->node - y->node;
180 return 0;
181 }
182
183 static int
184 cpu_processor_spread_order_cmp(const void *vx, const void *vy)
185 {
186 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
187 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
188
189 if (x->thread != y->thread)
190 return x->thread - y->thread;
191 if (x->processor_node != y->processor_node)
192 return x->processor_node - y->processor_node;
193 if (x->core != y->core)
194 return x->core - y->core;
195 if (x->node != y->node)
196 return x->node - y->node;
197 if (x->processor != y->processor)
198 return x->processor - y->processor;
199 return 0;
200 }
201
202 static int
203 cpu_thread_spread_order_cmp(const void *vx, const void *vy)
204 {
205 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
206 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
207
208 if (x->thread != y->thread)
209 return x->thread - y->thread;
210 if (x->node != y->node)
211 return x->node - y->node;
212 if (x->processor != y->processor)
213 return x->processor - y->processor;
214 if (x->processor_node != y->processor_node)
215 return x->processor_node - y->processor_node;
216 if (x->core != y->core)
217 return x->core - y->core;
218 return 0;
219 }
220
221 static int
222 cpu_thread_no_node_processor_spread_order_cmp(const void *vx, const void *vy)
223 {
224 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
225 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
226
227 if (x->thread != y->thread)
228 return x->thread - y->thread;
229 if (x->node != y->node)
230 return x->node - y->node;
231 if (x->core != y->core)
232 return x->core - y->core;
233 if (x->processor != y->processor)
234 return x->processor - y->processor;
235 return 0;
236 }
237
238 static int
239 cpu_no_node_processor_spread_order_cmp(const void *vx, const void *vy)
240 {
241 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
242 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
243
244 if (x->node != y->node)
245 return x->node - y->node;
246 if (x->thread != y->thread)
247 return x->thread - y->thread;
248 if (x->core != y->core)
249 return x->core - y->core;
250 if (x->processor != y->processor)
251 return x->processor - y->processor;
252 return 0;
253 }
254
255 static int
256 cpu_no_node_thread_spread_order_cmp(const void *vx, const void *vy)
257 {
258 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
259 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
260
261 if (x->node != y->node)
262 return x->node - y->node;
263 if (x->thread != y->thread)
264 return x->thread - y->thread;
265 if (x->processor != y->processor)
266 return x->processor - y->processor;
267 if (x->core != y->core)
268 return x->core - y->core;
269 return 0;
270 }
271
272 static int
273 cpu_no_spread_order_cmp(const void *vx, const void *vy)
274 {
275 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
276 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
277
278 if (x->node != y->node)
279 return x->node - y->node;
280 if (x->processor != y->processor)
281 return x->processor - y->processor;
282 if (x->processor_node != y->processor_node)
283 return x->processor_node - y->processor_node;
284 if (x->core != y->core)
285 return x->core - y->core;
286 if (x->thread != y->thread)
287 return x->thread - y->thread;
288 return 0;
289 }
290
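/*
 * Renumber node, processor, core and thread ids into consecutive
 * sequences starting at zero, restarting the numbering of each inner
 * level whenever its enclosing entity changes. When 'no_node' is set,
 * processor node ids are folded into the node ids. This normalization
 * is done before sorting so that the spread comparators operate on
 * comparable id sequences.
 */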
291 static ERTS_INLINE void
292 make_cpudata_id_seq(erts_cpu_topology_t *cpudata, int size, int no_node)
293 {
294 int ix;
295 int node = -1;
296 int processor = -1;
297 int processor_node = -1;
298 int processor_node_node = -1;
299 int core = -1;
300 int thread = -1;
301 int old_node = -1;
302 int old_processor = -1;
303 int old_processor_node = -1;
304 int old_core = -1;
305 int old_thread = -1;
306
307 for (ix = 0; ix < size; ix++) {
308 if (!no_node || cpudata[ix].node >= 0) {
309 if (old_node == cpudata[ix].node)
310 cpudata[ix].node = node;
311 else {
312 old_node = cpudata[ix].node;
313 old_processor = processor = -1;
314 if (!no_node)
315 old_processor_node = processor_node = -1;
316 old_core = core = -1;
317 old_thread = thread = -1;
318 if (no_node || cpudata[ix].node >= 0)
319 cpudata[ix].node = ++node;
320 }
321 }
322 if (old_processor == cpudata[ix].processor)
323 cpudata[ix].processor = processor;
324 else {
325 old_processor = cpudata[ix].processor;
326 if (!no_node)
327 processor_node_node = old_processor_node = processor_node = -1;
328 old_core = core = -1;
329 old_thread = thread = -1;
330 cpudata[ix].processor = ++processor;
331 }
332 if (no_node && cpudata[ix].processor_node < 0)
333 old_processor_node = -1;
334 else {
335 if (old_processor_node == cpudata[ix].processor_node) {
336 if (no_node)
337 cpudata[ix].node = cpudata[ix].processor_node = node;
338 else {
339 if (processor_node_node >= 0)
340 cpudata[ix].node = processor_node_node;
341 cpudata[ix].processor_node = processor_node;
342 }
343 }
344 else {
345 old_processor_node = cpudata[ix].processor_node;
346 old_core = core = -1;
347 old_thread = thread = -1;
348 if (no_node)
349 cpudata[ix].node = cpudata[ix].processor_node = ++node;
350 else {
351 cpudata[ix].node = processor_node_node = ++node;
352 cpudata[ix].processor_node = ++processor_node;
353 }
354 }
355 }
356 if (!no_node && cpudata[ix].processor_node < 0)
357 cpudata[ix].processor_node = 0;
358 if (old_core == cpudata[ix].core)
359 cpudata[ix].core = core;
360 else {
361 old_core = cpudata[ix].core;
362 old_thread = thread = -1;
363 cpudata[ix].core = ++core;
364 }
365 if (old_thread == cpudata[ix].thread)
366 cpudata[ix].thread = thread;
367 else
368 old_thread = cpudata[ix].thread = ++thread;
369 }
370 }
371
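/*
 * Sort the topology according to the requested bind order. If 'mk_seq'
 * is set, the ids are first normalized by make_cpudata_id_seq().
 * Aborts on an unknown bind order.
 */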
372 static void
373 cpu_bind_order_sort(erts_cpu_topology_t *cpudata,
374 int size,
375 ErtsCpuBindOrder bind_order,
376 int mk_seq)
377 {
378 if (size > 1) {
379 int no_node = 0;
380 int (*cmp_func)(const void *, const void *);
381 switch (bind_order) {
382 case ERTS_CPU_BIND_SPREAD:
383 cmp_func = cpu_spread_order_cmp;
384 break;
385 case ERTS_CPU_BIND_PROCESSOR_SPREAD:
386 cmp_func = cpu_processor_spread_order_cmp;
387 break;
388 case ERTS_CPU_BIND_THREAD_SPREAD:
389 cmp_func = cpu_thread_spread_order_cmp;
390 break;
391 case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD:
392 no_node = 1;
393 cmp_func = cpu_thread_no_node_processor_spread_order_cmp;
394 break;
395 case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD:
396 no_node = 1;
397 cmp_func = cpu_no_node_processor_spread_order_cmp;
398 break;
399 case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD:
400 no_node = 1;
401 cmp_func = cpu_no_node_thread_spread_order_cmp;
402 break;
403 case ERTS_CPU_BIND_NO_SPREAD:
404 cmp_func = cpu_no_spread_order_cmp;
405 break;
406 default:
407 cmp_func = NULL;
408 erts_exit(ERTS_ABORT_EXIT,
409 "Bad cpu bind type: %d\n",
410 (int) cpu_bind_order);
411 break;
412 }
413
414 if (mk_seq)
415 make_cpudata_id_seq(cpudata, size, no_node);
416
417 qsort(cpudata, size, sizeof(erts_cpu_topology_t), cmp_func);
418 }
419 }
420
421 static int
422 processor_order_cmp(const void *vx, const void *vy)
423 {
424 erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx;
425 erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy;
426
427 if (x->processor != y->processor)
428 return x->processor - y->processor;
429 if (x->node != y->node)
430 return x->node - y->node;
431 if (x->processor_node != y->processor_node)
432 return x->processor_node - y->processor_node;
433 if (x->core != y->core)
434 return x->core - y->core;
435 if (x->thread != y->thread)
436 return x->thread - y->thread;
437 return 0;
438 }
439
440 void
441 erts_sched_check_cpu_bind_prep_suspend(ErtsSchedulerData *esdp)
442 {
443 erts_cpu_groups_map_t *cgm;
444 erts_cpu_groups_callback_list_t *cgcl;
445 erts_cpu_groups_callback_call_t *cgcc;
446 int cgcc_ix;
447
448 /* Unbind from cpu */
449 erts_rwmtx_rwlock(&cpuinfo_rwmtx);
450 if (scheduler2cpu_map[esdp->no].bound_id >= 0
451 && erts_unbind_from_cpu(cpuinfo) == 0) {
452 esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1;
453 }
454
455 cgcc = erts_alloc(ERTS_ALC_T_TMP,
456 (no_cpu_groups_callbacks
457 * sizeof(erts_cpu_groups_callback_call_t)));
458 cgcc_ix = 0;
459 for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) {
460 for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) {
461 cgcc[cgcc_ix].callback = cgcl->callback;
462 cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp);
463 cgcc[cgcc_ix].arg = cgcl->arg;
464 cgcc_ix++;
465 }
466 }
467 ASSERT(no_cpu_groups_callbacks == cgcc_ix);
468 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
469
470 for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++)
471 cgcc[cgcc_ix].callback(1,
472 esdp,
473 cgcc[cgcc_ix].ix,
474 cgcc[cgcc_ix].arg);
475
476 erts_free(ERTS_ALC_T_TMP, cgcc);
477
478 if (esdp->no <= max_main_threads)
479 erts_thr_set_main_status(0, 0);
480
481 }
482
483 void
484 erts_sched_check_cpu_bind_post_suspend(ErtsSchedulerData *esdp)
485 {
486 ERTS_LC_ASSERT(erts_lc_runq_is_locked(esdp->run_queue));
487
488 if (esdp->no <= max_main_threads)
489 erts_thr_set_main_status(1, (int) esdp->no);
490
491 /* Make sure we check if we should bind to a cpu or not... */
492 (void) ERTS_RUNQ_FLGS_SET(esdp->run_queue, ERTS_RUNQ_FLG_CHK_CPU_BIND);
493 }
494
495
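/*
 * Called by a scheduler to (re)check its cpu binding (triggered via
 * the ERTS_RUNQ_FLG_CHK_CPU_BIND run queue flag). The scheduler binds
 * to, or unbinds from, the cpu selected in scheduler2cpu_map[] and
 * then notifies all registered cpu groups callbacks of its (possibly
 * new) cpu group. The run queue lock is released while doing so.
 */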
496 void
497 erts_sched_check_cpu_bind(ErtsSchedulerData *esdp)
498 {
499 int res, cpu_id, cgcc_ix;
500 erts_cpu_groups_map_t *cgm;
501 erts_cpu_groups_callback_list_t *cgcl;
502 erts_cpu_groups_callback_call_t *cgcc;
503 erts_runq_unlock(esdp->run_queue);
504 erts_rwmtx_rwlock(&cpuinfo_rwmtx);
505 cpu_id = scheduler2cpu_map[esdp->no].bind_id;
506 if (cpu_id >= 0 && cpu_id != scheduler2cpu_map[esdp->no].bound_id) {
507 res = erts_bind_to_cpu(cpuinfo, cpu_id);
508 if (res == 0)
509 esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = cpu_id;
510 else {
511 erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf();
512 erts_dsprintf(dsbufp, "Scheduler %d failed to bind to cpu %d: %s\n",
513 (int) esdp->no, cpu_id, erl_errno_id(-res));
514 erts_send_error_to_logger_nogl(dsbufp);
515 if (scheduler2cpu_map[esdp->no].bound_id >= 0)
516 goto unbind;
517 }
518 }
519 else if (cpu_id < 0) {
520 unbind:
521 /* Get rid of old binding */
522 res = erts_unbind_from_cpu(cpuinfo);
523 if (res == 0)
524 esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1;
525 else if (res != -ENOTSUP) {
526 erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf();
527 erts_dsprintf(dsbufp, "Scheduler %d failed to unbind from cpu %d: %s\n",
528 (int) esdp->no, cpu_id, erl_errno_id(-res));
529 erts_send_error_to_logger_nogl(dsbufp);
530 }
531 }
532
533 cgcc = erts_alloc(ERTS_ALC_T_TMP,
534 (no_cpu_groups_callbacks
535 * sizeof(erts_cpu_groups_callback_call_t)));
536 cgcc_ix = 0;
537 for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) {
538 for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) {
539 cgcc[cgcc_ix].callback = cgcl->callback;
540 cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp);
541 cgcc[cgcc_ix].arg = cgcl->arg;
542 cgcc_ix++;
543 }
544 }
545
546 ASSERT(no_cpu_groups_callbacks == cgcc_ix);
547 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
548
549 for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++)
550 cgcc[cgcc_ix].callback(0,
551 esdp,
552 cgcc[cgcc_ix].ix,
553 cgcc[cgcc_ix].arg);
554
555 erts_free(ERTS_ALC_T_TMP, cgcc);
556
557 erts_runq_lock(esdp->run_queue);
558 }
559
560 void
561 erts_sched_init_check_cpu_bind(ErtsSchedulerData *esdp)
562 {
563 int cgcc_ix;
564 erts_cpu_groups_map_t *cgm;
565 erts_cpu_groups_callback_list_t *cgcl;
566 erts_cpu_groups_callback_call_t *cgcc;
567
568 erts_rwmtx_rlock(&cpuinfo_rwmtx);
569
570 cgcc = erts_alloc(ERTS_ALC_T_TMP,
571 (no_cpu_groups_callbacks
572 * sizeof(erts_cpu_groups_callback_call_t)));
573 cgcc_ix = 0;
574 for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) {
575 for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) {
576 cgcc[cgcc_ix].callback = cgcl->callback;
577 cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp);
578 cgcc[cgcc_ix].arg = cgcl->arg;
579 cgcc_ix++;
580 }
581 }
582
583 ASSERT(no_cpu_groups_callbacks == cgcc_ix);
584 erts_rwmtx_runlock(&cpuinfo_rwmtx);
585
586 for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++)
587 cgcc[cgcc_ix].callback(0,
588 esdp,
589 cgcc[cgcc_ix].ix,
590 cgcc[cgcc_ix].arg);
591
592 erts_free(ERTS_ALC_T_TMP, cgcc);
593
594 if (esdp->no <= max_main_threads)
595 erts_thr_set_main_status(1, (int) esdp->no);
596 }
597
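/*
 * Recompute the scheduler to cpu assignment: sort the topology in the
 * configured bind order and hand out the available logical cpus to
 * schedulers 1..N in that order. Schedulers that do not get a cpu, or
 * all of them when binding is disabled, get bind_id -1 (unbound). The
 * caller must hold cpuinfo_rwmtx in write mode.
 */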
598 static void
599 write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size)
600 {
601 int s_ix = 1;
602 int cpu_ix;
603
604 ERTS_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx));
605
606 if (cpu_bind_order != ERTS_CPU_BIND_NONE && size) {
607
608 cpu_bind_order_sort(cpudata, size, cpu_bind_order, 1);
609
610 for (cpu_ix = 0; cpu_ix < size && s_ix <= erts_no_schedulers; cpu_ix++)
611 if (erts_is_cpu_available(cpuinfo, cpudata[cpu_ix].logical))
612 scheduler2cpu_map[s_ix++].bind_id = cpudata[cpu_ix].logical;
613 }
614
615 if (s_ix <= erts_no_schedulers)
616 for (; s_ix <= erts_no_schedulers; s_ix++)
617 scheduler2cpu_map[s_ix].bind_id = -1;
618 }
619
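/*
 * Parse a scheduler bind type given as a string (as passed with the
 * +sbt emulator flag): "u" = unbound, "db" = default bind, "s" =
 * spread, "ps" = processor spread, "ts" = thread spread, "tnnps" =
 * thread no node processor spread, "nnps" = no node processor spread,
 * "nnts" = no node thread spread, "ns" = no spread.
 */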
620 int
621 erts_init_scheduler_bind_type_string(char *how)
622 {
623 ErtsCpuBindOrder order;
624
625 if (sys_strcmp(how, "u") == 0)
626 order = ERTS_CPU_BIND_NONE;
627 else if (sys_strcmp(how, "db") == 0)
628 order = ERTS_CPU_BIND_DEFAULT_BIND;
629 else if (sys_strcmp(how, "s") == 0)
630 order = ERTS_CPU_BIND_SPREAD;
631 else if (sys_strcmp(how, "ps") == 0)
632 order = ERTS_CPU_BIND_PROCESSOR_SPREAD;
633 else if (sys_strcmp(how, "ts") == 0)
634 order = ERTS_CPU_BIND_THREAD_SPREAD;
635 else if (sys_strcmp(how, "tnnps") == 0)
636 order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD;
637 else if (sys_strcmp(how, "nnps") == 0)
638 order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD;
639 else if (sys_strcmp(how, "nnts") == 0)
640 order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD;
641 else if (sys_strcmp(how, "ns") == 0)
642 order = ERTS_CPU_BIND_NO_SPREAD;
643 else
644 return ERTS_INIT_SCHED_BIND_TYPE_ERROR_BAD_TYPE;
645
646 if (order != ERTS_CPU_BIND_NONE) {
647 if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP)
648 return ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED;
649 else if (!system_cpudata && !user_cpudata)
650 return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY;
651 }
652
653 cpu_bind_order = order;
654
655 return ERTS_INIT_SCHED_BIND_TYPE_SUCCESS;
656 }
657
658 static Eterm
659 bound_schedulers_term(ErtsCpuBindOrder order)
660 {
661 switch (order) {
662 case ERTS_CPU_BIND_SPREAD: {
663 ERTS_DECL_AM(spread);
664 return AM_spread;
665 }
666 case ERTS_CPU_BIND_PROCESSOR_SPREAD: {
667 ERTS_DECL_AM(processor_spread);
668 return AM_processor_spread;
669 }
670 case ERTS_CPU_BIND_THREAD_SPREAD: {
671 ERTS_DECL_AM(thread_spread);
672 return AM_thread_spread;
673 }
674 case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: {
675 ERTS_DECL_AM(thread_no_node_processor_spread);
676 return AM_thread_no_node_processor_spread;
677 }
678 case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: {
679 ERTS_DECL_AM(no_node_processor_spread);
680 return AM_no_node_processor_spread;
681 }
682 case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: {
683 ERTS_DECL_AM(no_node_thread_spread);
684 return AM_no_node_thread_spread;
685 }
686 case ERTS_CPU_BIND_NO_SPREAD: {
687 ERTS_DECL_AM(no_spread);
688 return AM_no_spread;
689 }
690 case ERTS_CPU_BIND_NONE: {
691 ERTS_DECL_AM(unbound);
692 return AM_unbound;
693 }
694 default:
695 ASSERT(0);
696 return THE_NON_VALUE;
697 }
698 }
699
700 Eterm
701 erts_bound_schedulers_term(Process *c_p)
702 {
703 ErtsCpuBindOrder order;
704 erts_rwmtx_rlock(&cpuinfo_rwmtx);
705 order = cpu_bind_order;
706 erts_rwmtx_runlock(&cpuinfo_rwmtx);
707 return bound_schedulers_term(order);
708 }
709
710 Eterm
711 erts_bind_schedulers(Process *c_p, Eterm how)
712 {
713 int notify = 0;
714 Eterm res;
715 erts_cpu_topology_t *cpudata;
716 int cpudata_size;
717 ErtsCpuBindOrder old_cpu_bind_order;
718
719 erts_rwmtx_rwlock(&cpuinfo_rwmtx);
720
721 if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP) {
722 if (cpu_bind_order == ERTS_CPU_BIND_NONE
723 && ERTS_IS_ATOM_STR("unbound", how)) {
724 res = bound_schedulers_term(ERTS_CPU_BIND_NONE);
725 goto done;
726 }
727 ERTS_BIF_PREP_ERROR(res, c_p, EXC_NOTSUP);
728 }
729 else {
730
731 old_cpu_bind_order = cpu_bind_order;
732
733 if (ERTS_IS_ATOM_STR("default_bind", how))
734 cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND;
735 else if (ERTS_IS_ATOM_STR("spread", how))
736 cpu_bind_order = ERTS_CPU_BIND_SPREAD;
737 else if (ERTS_IS_ATOM_STR("processor_spread", how))
738 cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD;
739 else if (ERTS_IS_ATOM_STR("thread_spread", how))
740 cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD;
741 else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how))
742 cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD;
743 else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how))
744 cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD;
745 else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how))
746 cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD;
747 else if (ERTS_IS_ATOM_STR("no_spread", how))
748 cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD;
749 else if (ERTS_IS_ATOM_STR("unbound", how))
750 cpu_bind_order = ERTS_CPU_BIND_NONE;
751 else {
752 cpu_bind_order = old_cpu_bind_order;
753 ERTS_BIF_PREP_ERROR(res, c_p, BADARG);
754 goto done;
755 }
756
757 create_tmp_cpu_topology_copy(&cpudata, &cpudata_size);
758
759 if (!cpudata) {
760 cpu_bind_order = old_cpu_bind_order;
761 ERTS_BIF_PREP_ERROR(res, c_p, BADARG);
762 goto done;
763 }
764
765 write_schedulers_bind_change(cpudata, cpudata_size);
766 notify = 1;
767
768 destroy_tmp_cpu_topology_copy(cpudata);
769
770 res = bound_schedulers_term(old_cpu_bind_order);
771 }
772
773 done:
774
775 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
776
777 if (notify)
778 erts_sched_notify_check_cpu_bind();
779
780 return res;
781 }
782
783 int
784 erts_sched_bind_atthrcreate_prepare(void)
785 {
786 ErtsSchedulerData *esdp = erts_get_scheduler_data();
787 return esdp != NULL && erts_is_scheduler_bound(esdp);
788 }
789
790 int
791 erts_sched_bind_atthrcreate_child(int unbind)
792 {
793 int res = 0;
794 if (unbind) {
795 erts_rwmtx_rlock(&cpuinfo_rwmtx);
796 res = erts_unbind_from_cpu(cpuinfo);
797 erts_rwmtx_runlock(&cpuinfo_rwmtx);
798 }
799 return res;
800 }
801
802 void
803 erts_sched_bind_atthrcreate_parent(int unbind)
804 {
805
806 }
807
808 int
809 erts_sched_bind_atfork_prepare(void)
810 {
811 ErtsSchedulerData *esdp = erts_get_scheduler_data();
812 int unbind = esdp != NULL && erts_is_scheduler_bound(esdp);
813 if (unbind)
814 erts_rwmtx_rlock(&cpuinfo_rwmtx);
815 return unbind;
816 }
817
818 int
819 erts_sched_bind_atfork_child(int unbind)
820 {
821 if (unbind) {
822 ERTS_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx)
823 || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx));
824 return erts_unbind_from_cpu(cpuinfo);
825 }
826 return 0;
827 }
828
829 void
830 erts_sched_bind_atfork_parent(int unbind)
831 {
832 if (unbind)
833 erts_rwmtx_runlock(&cpuinfo_rwmtx);
834 }
835
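/*
 * Compute, but do not apply, the scheduler binding that the given bind
 * order would produce. Returns a tuple of logical cpu ids in binding
 * order, or 'false' if no topology is available or 'unbound' was
 * requested.
 */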
836 Eterm
837 erts_fake_scheduler_bindings(Process *p, Eterm how)
838 {
839 ErtsCpuBindOrder fake_cpu_bind_order;
840 erts_cpu_topology_t *cpudata;
841 int cpudata_size;
842 Eterm res;
843
844 if (ERTS_IS_ATOM_STR("default_bind", how))
845 fake_cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND;
846 else if (ERTS_IS_ATOM_STR("spread", how))
847 fake_cpu_bind_order = ERTS_CPU_BIND_SPREAD;
848 else if (ERTS_IS_ATOM_STR("processor_spread", how))
849 fake_cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD;
850 else if (ERTS_IS_ATOM_STR("thread_spread", how))
851 fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD;
852 else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how))
853 fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD;
854 else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how))
855 fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD;
856 else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how))
857 fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD;
858 else if (ERTS_IS_ATOM_STR("no_spread", how))
859 fake_cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD;
860 else if (ERTS_IS_ATOM_STR("unbound", how))
861 fake_cpu_bind_order = ERTS_CPU_BIND_NONE;
862 else {
863 ERTS_BIF_PREP_ERROR(res, p, BADARG);
864 return res;
865 }
866
867 erts_rwmtx_rlock(&cpuinfo_rwmtx);
868 create_tmp_cpu_topology_copy(&cpudata, &cpudata_size);
869 erts_rwmtx_runlock(&cpuinfo_rwmtx);
870
871 if (!cpudata || fake_cpu_bind_order == ERTS_CPU_BIND_NONE)
872 ERTS_BIF_PREP_RET(res, am_false);
873 else {
874 int i;
875 Eterm *hp;
876
877 cpu_bind_order_sort(cpudata, cpudata_size, fake_cpu_bind_order, 1);
878
879 #ifdef ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA
880
881 erts_fprintf(stderr, "node: ");
882 for (i = 0; i < cpudata_size; i++)
883 erts_fprintf(stderr, " %2d", cpudata[i].node);
884 erts_fprintf(stderr, "\n");
885 erts_fprintf(stderr, "processor: ");
886 for (i = 0; i < cpudata_size; i++)
887 erts_fprintf(stderr, " %2d", cpudata[i].processor);
888 erts_fprintf(stderr, "\n");
889 if (fake_cpu_bind_order != ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD
890 && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD
891 && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD) {
892 erts_fprintf(stderr, "processor_node:");
893 for (i = 0; i < cpudata_size; i++)
894 erts_fprintf(stderr, " %2d", cpudata[i].processor_node);
895 erts_fprintf(stderr, "\n");
896 }
897 erts_fprintf(stderr, "core: ");
898 for (i = 0; i < cpudata_size; i++)
899 erts_fprintf(stderr, " %2d", cpudata[i].core);
900 erts_fprintf(stderr, "\n");
901 erts_fprintf(stderr, "thread: ");
902 for (i = 0; i < cpudata_size; i++)
903 erts_fprintf(stderr, " %2d", cpudata[i].thread);
904 erts_fprintf(stderr, "\n");
905 erts_fprintf(stderr, "logical: ");
906 for (i = 0; i < cpudata_size; i++)
907 erts_fprintf(stderr, " %2d", cpudata[i].logical);
908 erts_fprintf(stderr, "\n");
909 #endif
910
911 hp = HAlloc(p, cpudata_size+1);
912 ERTS_BIF_PREP_RET(res, make_tuple(hp));
913 *hp++ = make_arityval((Uint) cpudata_size);
914 for (i = 0; i < cpudata_size; i++)
915 *hp++ = make_small((Uint) cpudata[i].logical);
916 }
917
918 destroy_tmp_cpu_topology_copy(cpudata);
919
920 return res;
921 }
922
923 Eterm
924 erts_get_schedulers_binds(Process *c_p)
925 {
926 int ix;
927 ERTS_DECL_AM(unbound);
928 Eterm *hp = HAlloc(c_p, erts_no_schedulers+1);
929 Eterm res = make_tuple(hp);
930
931 *(hp++) = make_arityval(erts_no_schedulers);
932 erts_rwmtx_rlock(&cpuinfo_rwmtx);
933 for (ix = 1; ix <= erts_no_schedulers; ix++)
934 *(hp++) = (scheduler2cpu_map[ix].bound_id >= 0
935 ? make_small(scheduler2cpu_map[ix].bound_id)
936 : AM_unbound);
937 erts_rwmtx_runlock(&cpuinfo_rwmtx);
938 return res;
939 }
940
941 /*
942 * CPU topology
943 */
944
945 typedef struct {
946 int *id;
947 int used;
948 int size;
949 } ErtsCpuTopIdSeq;
950
951 typedef struct {
952 ErtsCpuTopIdSeq logical;
953 ErtsCpuTopIdSeq thread;
954 ErtsCpuTopIdSeq core;
955 ErtsCpuTopIdSeq processor_node;
956 ErtsCpuTopIdSeq processor;
957 ErtsCpuTopIdSeq node;
958 } ErtsCpuTopEntry;
959
960 static void
961 init_cpu_top_entry(ErtsCpuTopEntry *cte)
962 {
963 int size = 10;
964 cte->logical.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS,
965 sizeof(int)*size);
966 cte->logical.size = size;
967 cte->thread.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS,
968 sizeof(int)*size);
969 cte->thread.size = size;
970 cte->core.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS,
971 sizeof(int)*size);
972 cte->core.size = size;
973 cte->processor_node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS,
974 sizeof(int)*size);
975 cte->processor_node.size = size;
976 cte->processor.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS,
977 sizeof(int)*size);
978 cte->processor.size = size;
979 cte->node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS,
980 sizeof(int)*size);
981 cte->node.size = size;
982 }
983
984 static void
985 destroy_cpu_top_entry(ErtsCpuTopEntry *cte)
986 {
987 erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->logical.id);
988 erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->thread.id);
989 erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->core.id);
990 erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor_node.id);
991 erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor.id);
992 erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->node.id);
993 }
994
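/*
 * Parsers for user supplied cpu topology strings (the +sct emulator
 * flag). A topology string consists of entries separated by ':'. Each
 * entry is a sequence of <type letter><ids> pairs where the type
 * letter is 'L' (logical), 't'/'T' (thread), 'c'/'C' (core), 'p'/'P'
 * (processor) or 'n'/'N' (node/processor node), and <ids> is a comma
 * separated list of ids and id ranges such as "0-3". Logical ids are
 * mandatory; other levels get defaults as set up in get_cput_entry().
 */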
995 static int
996 get_cput_value_or_range(int *v, int *vr, char **str)
997 {
998 long l;
999 char *c = *str;
1000 errno = 0;
1001 if (!isdigit((unsigned char)*c))
1002 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID;
1003 l = strtol(c, &c, 10);
1004 if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l)
1005 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID;
1006 *v = (int) l;
1007 if (*c == '-') {
1008 c++;
1009 if (!isdigit((unsigned char)*c))
1010 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE;
1011 l = strtol(c, &c, 10);
1012 if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l)
1013 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE;
1014 *vr = (int) l;
1015 }
1016 *str = c;
1017 return ERTS_INIT_CPU_TOPOLOGY_OK;
1018 }
1019
1020 static int
1021 get_cput_id_seq(ErtsCpuTopIdSeq *idseq, char **str)
1022 {
1023 int ix = 0;
1024 int need_size = 0;
1025 char *c = *str;
1026
1027 while (1) {
1028 int res;
1029 int val;
1030 int nids;
1031 int val_range = -1;
1032 res = get_cput_value_or_range(&val, &val_range, &c);
1033 if (res != ERTS_INIT_CPU_TOPOLOGY_OK)
1034 return res;
1035 if (val_range < 0 || val_range == val)
1036 nids = 1;
1037 else {
1038 if (val_range > val)
1039 nids = val_range - val + 1;
1040 else
1041 nids = val - val_range + 1;
1042 }
1043 need_size += nids;
1044 if (need_size > idseq->size) {
1045 idseq->size = need_size + 10;
1046 idseq->id = erts_realloc(ERTS_ALC_T_TMP_CPU_IDS,
1047 idseq->id,
1048 sizeof(int)*idseq->size);
1049 }
1050 if (nids == 1)
1051 idseq->id[ix++] = val;
1052 else if (val_range > val) {
1053 for (; val <= val_range; val++)
1054 idseq->id[ix++] = val;
1055 }
1056 else {
1057 for (; val >= val_range; val--)
1058 idseq->id[ix++] = val;
1059 }
1060 if (*c != ',')
1061 break;
1062 c++;
1063 }
1064 *str = c;
1065 idseq->used = ix;
1066 return ERTS_INIT_CPU_TOPOLOGY_OK;
1067 }
1068
1069 static int
1070 get_cput_entry(ErtsCpuTopEntry *cput, char **str)
1071 {
1072 int h;
1073 char *c = *str;
1074
1075 cput->logical.used = 0;
1076 cput->thread.id[0] = 0;
1077 cput->thread.used = 1;
1078 cput->core.id[0] = 0;
1079 cput->core.used = 1;
1080 cput->processor_node.id[0] = -1;
1081 cput->processor_node.used = 1;
1082 cput->processor.id[0] = 0;
1083 cput->processor.used = 1;
1084 cput->node.id[0] = -1;
1085 cput->node.used = 1;
1086
1087 h = ERTS_TOPOLOGY_MAX_DEPTH;
1088 while (*c != ':' && *c != '\0') {
1089 int res;
1090 ErtsCpuTopIdSeq *idseqp;
1091 switch (*c++) {
1092 case 'L':
1093 if (h <= ERTS_TOPOLOGY_LOGICAL)
1094 return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY;
1095 idseqp = &cput->logical;
1096 h = ERTS_TOPOLOGY_LOGICAL;
1097 break;
1098 case 't':
1099 case 'T':
1100 if (h <= ERTS_TOPOLOGY_THREAD)
1101 return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY;
1102 idseqp = &cput->thread;
1103 h = ERTS_TOPOLOGY_THREAD;
1104 break;
1105 case 'c':
1106 case 'C':
1107 if (h <= ERTS_TOPOLOGY_CORE)
1108 return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY;
1109 idseqp = &cput->core;
1110 h = ERTS_TOPOLOGY_CORE;
1111 break;
1112 case 'p':
1113 case 'P':
1114 if (h <= ERTS_TOPOLOGY_PROCESSOR)
1115 return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY;
1116 idseqp = &cput->processor;
1117 h = ERTS_TOPOLOGY_PROCESSOR;
1118 break;
1119 case 'n':
1120 case 'N':
1121 if (h <= ERTS_TOPOLOGY_PROCESSOR) {
1122 do_node:
1123 if (h <= ERTS_TOPOLOGY_NODE)
1124 return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY;
1125 idseqp = &cput->node;
1126 h = ERTS_TOPOLOGY_NODE;
1127 }
1128 else {
1129 int p_node = 0;
1130 char *p_chk = c;
1131 while (*p_chk != '\0' && *p_chk != ':') {
1132 if (*p_chk == 'p' || *p_chk == 'P') {
1133 p_node = 1;
1134 break;
1135 }
1136 p_chk++;
1137 }
1138 if (!p_node)
1139 goto do_node;
1140 if (h <= ERTS_TOPOLOGY_PROCESSOR_NODE)
1141 return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY;
1142 idseqp = &cput->processor_node;
1143 h = ERTS_TOPOLOGY_PROCESSOR_NODE;
1144 }
1145 break;
1146 default:
1147 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE;
1148 }
1149 res = get_cput_id_seq(idseqp, &c);
1150 if (res != ERTS_INIT_CPU_TOPOLOGY_OK)
1151 return res;
1152 }
1153
1154 if (cput->logical.used < 1)
1155 return ERTS_INIT_CPU_TOPOLOGY_MISSING_LID;
1156
1157 if (*c == ':') {
1158 c++;
1159 }
1160
1161 if (cput->thread.used != 1
1162 && cput->thread.used != cput->logical.used)
1163 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE;
1164 if (cput->core.used != 1
1165 && cput->core.used != cput->logical.used)
1166 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE;
1167 if (cput->processor_node.used != 1
1168 && cput->processor_node.used != cput->logical.used)
1169 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE;
1170 if (cput->processor.used != 1
1171 && cput->processor.used != cput->logical.used)
1172 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE;
1173 if (cput->node.used != 1
1174 && cput->node.used != cput->logical.used)
1175 return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE;
1176
1177 *str = c;
1178 return ERTS_INIT_CPU_TOPOLOGY_OK;
1179 }
1180
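/*
 * Validate a cpu topology: logical ids must be unique, each
 * (node, processor, processor_node, core, thread) combination must be
 * unique, and NUMA information must be consistent (either no entry has
 * node/processor node ids, or every entry has exactly one of them and
 * all cpus of a processor belong to the same node). Note that the
 * topology is sorted in processor order as a side effect.
 */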
1181 static int
1182 verify_topology(erts_cpu_topology_t *cpudata, int size)
1183 {
1184 if (size > 0) {
1185 int *logical;
1186 int node, processor, no_nodes, i;
1187
1188 /* Verify logical ids */
1189 logical = erts_alloc(ERTS_ALC_T_TMP, sizeof(int)*size);
1190
1191 for (i = 0; i < size; i++)
1192 logical[i] = cpudata[i].logical;
1193
1194 qsort(logical, size, sizeof(int), int_cmp);
1195 for (i = 0; i < size-1; i++) {
1196 if (logical[i] == logical[i+1]) {
1197 erts_free(ERTS_ALC_T_TMP, logical);
1198 return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS;
1199 }
1200 }
1201
1202 erts_free(ERTS_ALC_T_TMP, logical);
1203
1204 qsort(cpudata, size, sizeof(erts_cpu_topology_t), processor_order_cmp);
1205
1206 /* Verify unique entities */
1207
1208 for (i = 1; i < size; i++) {
1209 if (cpudata[i-1].processor == cpudata[i].processor
1210 && cpudata[i-1].node == cpudata[i].node
1211 && (cpudata[i-1].processor_node
1212 == cpudata[i].processor_node)
1213 && cpudata[i-1].core == cpudata[i].core
1214 && cpudata[i-1].thread == cpudata[i].thread) {
1215 return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES;
1216 }
1217 }
1218
1219 /* Verify numa nodes */
1220 node = cpudata[0].node;
1221 processor = cpudata[0].processor;
1222 no_nodes = cpudata[0].node < 0 && cpudata[0].processor_node < 0;
1223 for (i = 1; i < size; i++) {
1224 if (no_nodes) {
1225 if (cpudata[i].node >= 0 || cpudata[i].processor_node >= 0)
1226 return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES;
1227 }
1228 else {
1229 if (cpudata[i].processor == processor && cpudata[i].node != node)
1230 return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES;
1231 node = cpudata[i].node;
1232 processor = cpudata[i].processor;
1233 if (node >= 0 && cpudata[i].processor_node >= 0)
1234 return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES;
1235 if (node < 0 && cpudata[i].processor_node < 0)
1236 return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES;
1237 }
1238 }
1239 }
1240
1241 return ERTS_INIT_CPU_TOPOLOGY_OK;
1242 }
1243
1244 int
1245 erts_init_cpu_topology_string(char *topology_str)
1246 {
1247 ErtsCpuTopEntry cput;
1248 int need_size;
1249 char *c;
1250 int ix;
1251 int error = ERTS_INIT_CPU_TOPOLOGY_OK;
1252
1253 if (user_cpudata)
1254 erts_free(ERTS_ALC_T_CPUDATA, user_cpudata);
1255 user_cpudata_size = 10;
1256
1257 user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA,
1258 (sizeof(erts_cpu_topology_t)
1259 * user_cpudata_size));
1260
1261 init_cpu_top_entry(&cput);
1262
1263 ix = 0;
1264 need_size = 0;
1265
1266 c = topology_str;
1267 if (*c == '\0') {
1268 error = ERTS_INIT_CPU_TOPOLOGY_MISSING;
1269 goto fail;
1270 }
1271 do {
1272 int r;
1273 error = get_cput_entry(&cput, &c);
1274 if (error != ERTS_INIT_CPU_TOPOLOGY_OK)
1275 goto fail;
1276 need_size += cput.logical.used;
1277 if (user_cpudata_size < need_size) {
1278 user_cpudata_size = need_size + 10;
1279 user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA,
1280 user_cpudata,
1281 (sizeof(erts_cpu_topology_t)
1282 * user_cpudata_size));
1283 }
1284
1285 ASSERT(cput.thread.used == 1
1286 || cput.thread.used == cput.logical.used);
1287 ASSERT(cput.core.used == 1
1288 || cput.core.used == cput.logical.used);
1289 ASSERT(cput.processor_node.used == 1
1290 || cput.processor_node.used == cput.logical.used);
1291 ASSERT(cput.processor.used == 1
1292 || cput.processor.used == cput.logical.used);
1293 ASSERT(cput.node.used == 1
1294 || cput.node.used == cput.logical.used);
1295
1296 for (r = 0; r < cput.logical.used; r++) {
1297 user_cpudata[ix].logical = cput.logical.id[r];
1298 user_cpudata[ix].thread =
1299 cput.thread.id[cput.thread.used == 1 ? 0 : r];
1300 user_cpudata[ix].core =
1301 cput.core.id[cput.core.used == 1 ? 0 : r];
1302 user_cpudata[ix].processor_node =
1303 cput.processor_node.id[cput.processor_node.used == 1 ? 0 : r];
1304 user_cpudata[ix].processor =
1305 cput.processor.id[cput.processor.used == 1 ? 0 : r];
1306 user_cpudata[ix].node =
1307 cput.node.id[cput.node.used == 1 ? 0 : r];
1308 ix++;
1309 }
1310 } while (*c != '\0');
1311
1312 if (user_cpudata_size != ix) {
1313 user_cpudata_size = ix;
1314 user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA,
1315 user_cpudata,
1316 (sizeof(erts_cpu_topology_t)
1317 * user_cpudata_size));
1318 }
1319
1320 error = verify_topology(user_cpudata, user_cpudata_size);
1321 if (error == ERTS_INIT_CPU_TOPOLOGY_OK) {
1322 destroy_cpu_top_entry(&cput);
1323 return ERTS_INIT_CPU_TOPOLOGY_OK;
1324 }
1325
1326 fail:
1327 if (user_cpudata)
1328 erts_free(ERTS_ALC_T_CPUDATA, user_cpudata);
1329 user_cpudata_size = 0;
1330 destroy_cpu_top_entry(&cput);
1331 return error;
1332 }
1333
1334 #define ERTS_GET_CPU_TOPOLOGY_ERROR -1
1335 #define ERTS_GET_USED_CPU_TOPOLOGY 0
1336 #define ERTS_GET_DETECTED_CPU_TOPOLOGY 1
1337 #define ERTS_GET_DEFINED_CPU_TOPOLOGY 2
1338
1339 static Eterm get_cpu_topology_term(Process *c_p, int type);
1340
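/*
 * Set the user defined cpu topology from an Erlang term: a list of
 * {cpu, Node, Processor, ProcessorNode, Core, Thread, Logical} tuples,
 * or 'undefined' to fall back to the automatically detected topology.
 * On success the cpu groups maps and scheduler bindings are updated
 * and the previously used topology is returned; on error the non-value
 * is returned and nothing is changed.
 */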
1341 Eterm
1342 erts_set_cpu_topology(Process *c_p, Eterm term)
1343 {
1344 erts_cpu_topology_t *cpudata = NULL;
1345 int cpudata_size = 0;
1346 Eterm res;
1347
1348 erts_rwmtx_rwlock(&cpuinfo_rwmtx);
1349 res = get_cpu_topology_term(c_p, ERTS_GET_USED_CPU_TOPOLOGY);
1350 if (term == am_undefined) {
1351 if (user_cpudata)
1352 erts_free(ERTS_ALC_T_CPUDATA, user_cpudata);
1353 user_cpudata = NULL;
1354 user_cpudata_size = 0;
1355
1356 if (cpu_bind_order != ERTS_CPU_BIND_NONE && system_cpudata) {
1357 cpudata_size = system_cpudata_size;
1358 cpudata = erts_alloc(ERTS_ALC_T_TMP,
1359 (sizeof(erts_cpu_topology_t)
1360 * cpudata_size));
1361
1362 sys_memcpy((void *) cpudata,
1363 (void *) system_cpudata,
1364 sizeof(erts_cpu_topology_t)*cpudata_size);
1365 }
1366 }
1367 else if (is_not_list(term)) {
1368 error:
1369 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
1370 res = THE_NON_VALUE;
1371 goto done;
1372 }
1373 else {
1374 Eterm list = term;
1375 int ix = 0;
1376
1377 cpudata_size = 100;
1378 cpudata = erts_alloc(ERTS_ALC_T_TMP,
1379 (sizeof(erts_cpu_topology_t)
1380 * cpudata_size));
1381
1382 while (is_list(list)) {
1383 Eterm *lp = list_val(list);
1384 Eterm cpu = CAR(lp);
1385 Eterm* tp;
1386 Sint id;
1387
1388 if (is_not_tuple(cpu))
1389 goto error;
1390
1391 tp = tuple_val(cpu);
1392
1393 if (arityval(tp[0]) != 7 || tp[1] != am_cpu)
1394 goto error;
1395
1396 if (ix >= cpudata_size) {
1397 cpudata_size += 100;
1398 cpudata = erts_realloc(ERTS_ALC_T_TMP,
1399 cpudata,
1400 (sizeof(erts_cpu_topology_t)
1401 * cpudata_size));
1402 }
1403
1404 id = signed_val(tp[2]);
1405 if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id)
1406 goto error;
1407 cpudata[ix].node = (int) id;
1408
1409 id = signed_val(tp[3]);
1410 if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id)
1411 goto error;
1412 cpudata[ix].processor = (int) id;
1413
1414 id = signed_val(tp[4]);
1415 if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id)
1416 goto error;
1417 cpudata[ix].processor_node = (int) id;
1418
1419 id = signed_val(tp[5]);
1420 if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id)
1421 goto error;
1422 cpudata[ix].core = (int) id;
1423
1424 id = signed_val(tp[6]);
1425 if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id)
1426 goto error;
1427 cpudata[ix].thread = (int) id;
1428
1429 id = signed_val(tp[7]);
1430 if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id)
1431 goto error;
1432 cpudata[ix].logical = (int) id;
1433
1434 list = CDR(lp);
1435 ix++;
1436 }
1437
1438 if (is_not_nil(list))
1439 goto error;
1440
1441 cpudata_size = ix;
1442
1443 if (ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(cpudata, cpudata_size))
1444 goto error;
1445
1446 if (user_cpudata_size != cpudata_size) {
1447 if (user_cpudata)
1448 erts_free(ERTS_ALC_T_CPUDATA, user_cpudata);
1449 user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA,
1450 sizeof(erts_cpu_topology_t)*cpudata_size);
1451 user_cpudata_size = cpudata_size;
1452 }
1453
1454 sys_memcpy((void *) user_cpudata,
1455 (void *) cpudata,
1456 sizeof(erts_cpu_topology_t)*cpudata_size);
1457 }
1458
1459 update_cpu_groups_maps();
1460
1461 write_schedulers_bind_change(cpudata, cpudata_size);
1462
1463 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
1464 erts_sched_notify_check_cpu_bind();
1465
1466 done:
1467
1468 if (cpudata)
1469 erts_free(ERTS_ALC_T_TMP, cpudata);
1470
1471 return res;
1472 }
1473
1474 static void
1475 create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, int *cpudata_size)
1476 {
1477 if (user_cpudata) {
1478 *cpudata_size = user_cpudata_size;
1479 *cpudata = erts_alloc(ERTS_ALC_T_TMP,
1480 (sizeof(erts_cpu_topology_t)
1481 * (*cpudata_size)));
1482 sys_memcpy((void *) *cpudata,
1483 (void *) user_cpudata,
1484 sizeof(erts_cpu_topology_t)*(*cpudata_size));
1485 }
1486 else if (system_cpudata) {
1487 *cpudata_size = system_cpudata_size;
1488 *cpudata = erts_alloc(ERTS_ALC_T_TMP,
1489 (sizeof(erts_cpu_topology_t)
1490 * (*cpudata_size)));
1491 sys_memcpy((void *) *cpudata,
1492 (void *) system_cpudata,
1493 sizeof(erts_cpu_topology_t)*(*cpudata_size));
1494 }
1495 else {
1496 *cpudata = NULL;
1497 *cpudata_size = 0;
1498 }
1499 }
1500
1501 static void
1502 destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata)
1503 {
1504 if (cpudata)
1505 erts_free(ERTS_ALC_T_TMP, cpudata);
1506 }
1507
1508
1509 static Eterm
1510 bld_topology_term(Eterm **hpp,
1511 Uint *hszp,
1512 erts_cpu_topology_t *cpudata,
1513 int size)
1514 {
1515 Eterm res = NIL;
1516 int i;
1517
1518 if (size == 0)
1519 return am_undefined;
1520
1521 for (i = size-1; i >= 0; i--) {
1522 res = erts_bld_cons(hpp,
1523 hszp,
1524 erts_bld_tuple(hpp,
1525 hszp,
1526 7,
1527 am_cpu,
1528 make_small(cpudata[i].node),
1529 make_small(cpudata[i].processor),
1530 make_small(cpudata[i].processor_node),
1531 make_small(cpudata[i].core),
1532 make_small(cpudata[i].thread),
1533 make_small(cpudata[i].logical)),
1534 res);
1535 }
1536 return res;
1537 }
1538
1539 static Eterm
1540 get_cpu_topology_term(Process *c_p, int type)
1541 {
1542 #ifdef DEBUG
1543 Eterm *hp_end;
1544 #endif
1545 Eterm *hp;
1546 Uint hsz;
1547 Eterm res = THE_NON_VALUE;
1548 erts_cpu_topology_t *cpudata = NULL;
1549 int size = 0;
1550
1551 switch (type) {
1552 case ERTS_GET_USED_CPU_TOPOLOGY:
1553 if (user_cpudata)
1554 goto defined;
1555 else
1556 goto detected;
1557 case ERTS_GET_DETECTED_CPU_TOPOLOGY:
1558 detected:
1559 if (!system_cpudata)
1560 res = am_undefined;
1561 else {
1562 size = system_cpudata_size;
1563 cpudata = erts_alloc(ERTS_ALC_T_TMP,
1564 (sizeof(erts_cpu_topology_t)
1565 * size));
1566 sys_memcpy((void *) cpudata,
1567 (void *) system_cpudata,
1568 sizeof(erts_cpu_topology_t)*size);
1569 }
1570 break;
1571 case ERTS_GET_DEFINED_CPU_TOPOLOGY:
1572 defined:
1573 if (!user_cpudata)
1574 res = am_undefined;
1575 else {
1576 size = user_cpudata_size;
1577 cpudata = user_cpudata;
1578 }
1579 break;
1580 default:
1581 erts_exit(ERTS_ABORT_EXIT, "Bad cpu topology type: %d\n", type);
1582 break;
1583 }
1584
1585 if (res == am_undefined) {
1586 ASSERT(!cpudata);
1587 return res;
1588 }
1589
1590 hsz = 0;
1591
1592 bld_topology_term(NULL, &hsz,
1593 cpudata, size);
1594
1595 hp = HAlloc(c_p, hsz);
1596
1597 #ifdef DEBUG
1598 hp_end = hp + hsz;
1599 #endif
1600
1601 res = bld_topology_term(&hp, NULL,
1602 cpudata, size);
1603
1604 ASSERT(hp_end == hp);
1605
1606 if (cpudata && cpudata != system_cpudata && cpudata != user_cpudata)
1607 erts_free(ERTS_ALC_T_TMP, cpudata);
1608
1609 return res;
1610 }
1611
1612 Eterm
1613 erts_get_cpu_topology_term(Process *c_p, Eterm which)
1614 {
1615 Eterm res;
1616 int type;
1617 erts_rwmtx_rlock(&cpuinfo_rwmtx);
1618 if (ERTS_IS_ATOM_STR("used", which))
1619 type = ERTS_GET_USED_CPU_TOPOLOGY;
1620 else if (ERTS_IS_ATOM_STR("detected", which))
1621 type = ERTS_GET_DETECTED_CPU_TOPOLOGY;
1622 else if (ERTS_IS_ATOM_STR("defined", which))
1623 type = ERTS_GET_DEFINED_CPU_TOPOLOGY;
1624 else
1625 type = ERTS_GET_CPU_TOPOLOGY_ERROR;
1626 if (type == ERTS_GET_CPU_TOPOLOGY_ERROR)
1627 res = THE_NON_VALUE;
1628 else
1629 res = get_cpu_topology_term(c_p, type);
1630 erts_rwmtx_runlock(&cpuinfo_rwmtx);
1631 return res;
1632 }
1633
1634 static void
1635 get_logical_processors(int *conf, int *onln, int *avail)
1636 {
1637 if (conf)
1638 *conf = erts_get_cpu_configured(cpuinfo);
1639 if (onln)
1640 *onln = erts_get_cpu_online(cpuinfo);
1641 if (avail)
1642 *avail = erts_get_cpu_available(cpuinfo);
1643 }
1644
1645 void
1646 erts_get_logical_processors(int *conf, int *onln, int *avail)
1647 {
1648 erts_rwmtx_rlock(&cpuinfo_rwmtx);
1649 get_logical_processors(conf, onln, avail);
1650 erts_rwmtx_runlock(&cpuinfo_rwmtx);
1651 }
1652
1653 void
1654 erts_pre_early_init_cpu_topology(int *max_dcg_p,
1655 int *max_rg_p,
1656 int *conf_p,
1657 int *onln_p,
1658 int *avail_p)
1659 {
1660 cpu_groups_maps = NULL;
1661 no_cpu_groups_callbacks = 0;
1662 *max_rg_p = ERTS_MAX_READER_GROUPS;
1663 *max_dcg_p = ERTS_MAX_FLXCTR_GROUPS;
1664 cpuinfo = erts_cpu_info_create();
1665 get_logical_processors(conf_p, onln_p, avail_p);
1666 }
1667
1668 void
1669 erts_early_init_cpu_topology(int no_schedulers,
1670 int *max_main_threads_p,
1671 int max_reader_groups,
1672 int *reader_groups_p,
1673 int max_decentralized_counter_groups,
1674 int *decentralized_counter_groups_p)
1675 {
1676 user_cpudata = NULL;
1677 user_cpudata_size = 0;
1678
1679 system_cpudata_size = erts_get_cpu_topology_size(cpuinfo);
1680 system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA,
1681 (sizeof(erts_cpu_topology_t)
1682 * system_cpudata_size));
1683
1684 cpu_bind_order = ERTS_CPU_BIND_UNDEFINED;
1685
1686 if (!erts_get_cpu_topology(cpuinfo, system_cpudata)
1687 || ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(system_cpudata,
1688 system_cpudata_size)) {
1689 erts_free(ERTS_ALC_T_CPUDATA, system_cpudata);
1690 system_cpudata = NULL;
1691 system_cpudata_size = 0;
1692 }
1693
1694 max_main_threads = erts_get_cpu_configured(cpuinfo);
1695 if (max_main_threads > no_schedulers || max_main_threads < 0)
1696 max_main_threads = no_schedulers;
1697 *max_main_threads_p = max_main_threads;
1698
1699 decentralized_counter_groups = max_main_threads;
1700 if (decentralized_counter_groups <= 1 || max_decentralized_counter_groups <= 1)
1701 decentralized_counter_groups = 1;
1702 if (decentralized_counter_groups > max_decentralized_counter_groups)
1703 decentralized_counter_groups = max_decentralized_counter_groups;
1704 *decentralized_counter_groups_p = decentralized_counter_groups;
1705 reader_groups = max_main_threads;
1706 if (reader_groups <= 1 || max_reader_groups <= 1)
1707 reader_groups = 0;
1708 if (reader_groups > max_reader_groups)
1709 reader_groups = max_reader_groups;
1710 *reader_groups_p = reader_groups;
1711 }
1712
1713 void
1714 erts_init_cpu_topology(void)
1715 {
1716 int ix;
1717
1718 erts_rwmtx_init(&cpuinfo_rwmtx, "cpu_info", NIL,
1719 ERTS_LOCK_FLAGS_PROPERTY_STATIC | ERTS_LOCK_FLAGS_CATEGORY_GENERIC);
1720 erts_rwmtx_rwlock(&cpuinfo_rwmtx);
1721
1722 scheduler2cpu_map = erts_alloc(ERTS_ALC_T_CPUDATA,
1723 (sizeof(ErtsCpuBindData)
1724 * (erts_no_schedulers+1)));
1725 for (ix = 1; ix <= erts_no_schedulers; ix++) {
1726 scheduler2cpu_map[ix].bind_id = -1;
1727 scheduler2cpu_map[ix].bound_id = -1;
1728 }
1729
1730 if (cpu_bind_order == ERTS_CPU_BIND_UNDEFINED)
1731 cpu_bind_order = ERTS_CPU_BIND_NONE;
1732
1733 reader_groups_map = add_cpu_groups(reader_groups,
1734 reader_groups_callback,
1735 NULL);
1736 decentralized_counter_groups_map = add_cpu_groups(decentralized_counter_groups,
1737 flxctr_groups_callback,
1738 NULL);
1739
1740 if (cpu_bind_order == ERTS_CPU_BIND_NONE)
1741 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
1742 else {
1743 erts_cpu_topology_t *cpudata;
1744 int cpudata_size;
1745 create_tmp_cpu_topology_copy(&cpudata, &cpudata_size);
1746 write_schedulers_bind_change(cpudata, cpudata_size);
1747 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
1748 erts_sched_notify_check_cpu_bind();
1749 destroy_tmp_cpu_topology_copy(cpudata);
1750 }
1751 }
1752
1753 int
1754 erts_update_cpu_info(void)
1755 {
1756 int changed;
1757 erts_rwmtx_rwlock(&cpuinfo_rwmtx);
1758 changed = erts_cpu_info_update(cpuinfo);
1759 if (changed) {
1760 erts_cpu_topology_t *cpudata;
1761 int cpudata_size;
1762
1763 if (system_cpudata)
1764 erts_free(ERTS_ALC_T_CPUDATA, system_cpudata);
1765
1766 system_cpudata_size = erts_get_cpu_topology_size(cpuinfo);
1767 if (!system_cpudata_size)
1768 system_cpudata = NULL;
1769 else {
1770 system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA,
1771 (sizeof(erts_cpu_topology_t)
1772 * system_cpudata_size));
1773
1774 if (!erts_get_cpu_topology(cpuinfo, system_cpudata)
1775 || (ERTS_INIT_CPU_TOPOLOGY_OK
1776 != verify_topology(system_cpudata,
1777 system_cpudata_size))) {
1778 erts_free(ERTS_ALC_T_CPUDATA, system_cpudata);
1779 system_cpudata = NULL;
1780 system_cpudata_size = 0;
1781 }
1782 }
1783
1784 update_cpu_groups_maps();
1785
1786 create_tmp_cpu_topology_copy(&cpudata, &cpudata_size);
1787 write_schedulers_bind_change(cpudata, cpudata_size);
1788 destroy_tmp_cpu_topology_copy(cpudata);
1789 }
1790 erts_rwmtx_rwunlock(&cpuinfo_rwmtx);
1791 if (changed)
1792 erts_sched_notify_check_cpu_bind();
1793 return changed;
1794 }
1795
1796 /*
1797 * reader groups map
1798 */
1799
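/*
 * Callbacks invoked when a scheduler's cpu group assignment changes.
 * Both pass group 0 while the scheduler is suspending and group+1
 * otherwise, reserving group 0 for suspended schedulers.
 */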
1800 void
1801 reader_groups_callback(int suspending,
1802 ErtsSchedulerData *esdp,
1803 int group,
1804 void *unused)
1805 {
1806 if (reader_groups && esdp->no <= max_main_threads)
1807 erts_rwmtx_set_reader_group(suspending ? 0 : group+1);
1808 }
1809
1810 void
1811 flxctr_groups_callback(int suspending,
1812 ErtsSchedulerData *esdp,
1813 int group,
1814 void *unused)
1815 {
1816 erts_flxctr_set_slot(suspending ? 0 : group+1);
1817 }
1818
1819 static Eterm get_cpu_groups_map(Process *c_p,
1820 erts_cpu_groups_map_t *map,
1821 int offset);
1822 Eterm
1823 erts_debug_reader_groups_map(Process *c_p, int groups)
1824 {
1825 Eterm res;
1826 erts_cpu_groups_map_t test;
1827
1828 test.array = NULL;
1829 test.groups = groups;
1830 make_cpu_groups_map(&test, 1);
1831 if (!test.array)
1832 res = NIL;
1833 else {
1834 res = get_cpu_groups_map(c_p, &test, 1);
1835 erts_free(ERTS_ALC_T_TMP, test.array);
1836 }
1837 return res;
1838 }
1839
1840
1841 Eterm
1842 erts_get_reader_groups_map(Process *c_p)
1843 {
1844 Eterm res;
1845 erts_rwmtx_rlock(&cpuinfo_rwmtx);
1846 res = get_cpu_groups_map(c_p, reader_groups_map, 1);
1847 erts_rwmtx_runlock(&cpuinfo_rwmtx);
1848 return res;
1849 }
1850
1851 Eterm
1852 erts_get_decentralized_counter_groups_map(Process *c_p)
1853 {
1854 Eterm res;
1855 erts_rwmtx_rlock(&cpuinfo_rwmtx);
1856 res = get_cpu_groups_map(c_p, decentralized_counter_groups_map, 1);
1857 erts_rwmtx_runlock(&cpuinfo_rwmtx);
1858 return res;
1859 }
1860
1861 /*
1862 * CPU groups
1863 */
1864
1865 static Eterm
1866 get_cpu_groups_map(Process *c_p,
1867 erts_cpu_groups_map_t *map,
1868 int offset)
1869 {
1870 #ifdef DEBUG
1871 Eterm *endp;
1872 #endif
1873 Eterm res = NIL, tuple;
1874 Eterm *hp;
1875 int i;
1876
1877 hp = HAlloc(c_p, map->logical_processors*(2+3));
1878 #ifdef DEBUG
1879 endp = hp + map->logical_processors*(2+3);
1880 #endif
1881 for (i = map->size - 1; i >= 0; i--) {
1882 if (map->array[i].logical >= 0) {
1883 tuple = TUPLE2(hp,
1884 make_small(map->array[i].logical),
1885 make_small(map->array[i].cpu_group + offset));
1886 hp += 3;
1887 res = CONS(hp, tuple, res);
1888 hp += 2;
1889 }
1890 }
1891 ASSERT(hp == endp);
1892 return res;
1893 }
1894
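/*
 * Build the array of currently available logical cpus ('avail') from
 * the cpu topology, skipping cpus that are not available for binding
 * (unless 'test' is set), and count the number of distinct entities at
 * each topology level in 'no'. '*size' is updated to the number of
 * available cpus.
 */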
1895 static void
1896 make_available_cpu_topology(erts_avail_cput *no,
1897 erts_avail_cput *avail,
1898 erts_cpu_topology_t *cpudata,
1899 int *size,
1900 int test)
1901 {
1902 int len = *size;
1903 erts_cpu_topology_t last;
1904 int a, i, j;
1905
1906 no->level[ERTS_TOPOLOGY_NODE] = -1;
1907 no->level[ERTS_TOPOLOGY_PROCESSOR] = -1;
1908 no->level[ERTS_TOPOLOGY_PROCESSOR_NODE] = -1;
1909 no->level[ERTS_TOPOLOGY_CORE] = -1;
1910 no->level[ERTS_TOPOLOGY_THREAD] = -1;
1911 no->level[ERTS_TOPOLOGY_LOGICAL] = -1;
1912
1913 last.node = INT_MIN;
1914 last.processor = INT_MIN;
1915 last.processor_node = INT_MIN;
1916 last.core = INT_MIN;
1917 last.thread = INT_MIN;
1918 last.logical = INT_MIN;
1919
1920 a = 0;
1921
1922 for (i = 0; i < len; i++) {
1923
1924 if (!test && !erts_is_cpu_available(cpuinfo, cpudata[i].logical))
1925 continue;
1926
1927 if (last.node != cpudata[i].node)
1928 goto node;
1929 if (last.processor != cpudata[i].processor)
1930 goto processor;
1931 if (last.processor_node != cpudata[i].processor_node)
1932 goto processor_node;
1933 if (last.core != cpudata[i].core)
1934 goto core;
1935 ASSERT(last.thread != cpudata[i].thread);
1936 goto thread;
1937
1938 node:
1939 no->level[ERTS_TOPOLOGY_NODE]++;
1940 processor:
1941 no->level[ERTS_TOPOLOGY_PROCESSOR]++;
1942 processor_node:
1943 no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++;
1944 core:
1945 no->level[ERTS_TOPOLOGY_CORE]++;
1946 thread:
1947 no->level[ERTS_TOPOLOGY_THREAD]++;
1948
1949 no->level[ERTS_TOPOLOGY_LOGICAL]++;
1950
1951 for (j = 0; j < ERTS_TOPOLOGY_LOGICAL; j++)
1952 avail[a].level[j] = no->level[j];
1953
1954 avail[a].level[ERTS_TOPOLOGY_LOGICAL] = cpudata[i].logical;
1955 avail[a].level[ERTS_TOPOLOGY_CG] = 0;
1956
1957 ASSERT(last.logical != cpudata[i].logical);
1958
1959 last = cpudata[i];
1960 a++;
1961 }
1962
1963 no->level[ERTS_TOPOLOGY_NODE]++;
1964 no->level[ERTS_TOPOLOGY_PROCESSOR]++;
1965 no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++;
1966 no->level[ERTS_TOPOLOGY_CORE]++;
1967 no->level[ERTS_TOPOLOGY_THREAD]++;
1968 no->level[ERTS_TOPOLOGY_LOGICAL]++;
1969
1970 *size = a;
1971 }
1972
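/*
 * The map array is used as an open-addressing hash table keyed on the
 * logical CPU id, with linear probing. Since the array is sized
 * 2*logical_processors+1 (see make_cpu_groups_map()) a free slot should
 * always be found; running out of slots is treated as a fatal error.
 */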
static void
cpu_group_insert(erts_cpu_groups_map_t *map,
                 int logical, int cpu_group)
{
    int start = logical % map->size;
    int ix = start;

    do {
        if (map->array[ix].logical < 0) {
            map->array[ix].logical = logical;
            map->array[ix].cpu_group = cpu_group;
            return;
        }
        ix++;
        if (ix == map->size)
            ix = 0;
    } while (ix != start);

    erts_exit(ERTS_ABORT_EXIT, "CPU groups map full\n");
}

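/*
 * Counts the number of distinct entries on the level below 'level'
 * (e.g. cores under a node) for the top-level entry identified by
 * cgc->id, starting at avail[aix]. Returns the index of the first
 * entry that belongs to the next top-level id.
 */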
static int
sub_levels(erts_cpu_groups_count_t *cgc, int level, int aix,
           int avail_sz, erts_avail_cput *avail)
{
    int sub_level = level+1;
    int last = -1;
    cgc->sub_levels = 0;

    do {
        if (last != avail[aix].level[sub_level]) {
            cgc->sub_levels++;
            last = avail[aix].level[sub_level];
        }
        aix++;
    } while (aix < avail_sz && cgc->id == avail[aix].level[level]);
    cgc->cpu_groups = 0;
    return aix;
}

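/*
 * Assigns cgcp->cpu_groups consecutive group numbers to the
 * cgcp->sub_levels sub-level units below the top-level entry cgcp->id,
 * spreading the units as evenly as possible: each group gets
 * sub_levels/cpu_groups units, and the last (sub_levels % cpu_groups)
 * groups get one extra. Returns the index of the first avail entry
 * belonging to the next top-level id.
 */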
static int
write_cpu_groups(int *cgp, erts_cpu_groups_count_t *cgcp,
                 int level, int a,
                 int avail_sz, erts_avail_cput *avail)
{
    int cg = *cgp;
    int sub_level = level+1;
    int sl_per_gr = cgcp->sub_levels / cgcp->cpu_groups;
    int xsl = cgcp->sub_levels % cgcp->cpu_groups;
    int sls = 0;
    int last = -1;
    int xsl_cg_lim = (cgcp->cpu_groups - xsl) + cg + 1;

    ASSERT(level < 0 || avail[a].level[level] == cgcp->id);

    do {
        if (last != avail[a].level[sub_level]) {
            if (!sls) {
                sls = sl_per_gr;
                cg++;
                if (cg >= xsl_cg_lim)
                    sls++;
            }
            last = avail[a].level[sub_level];
            sls--;
        }
        avail[a].level[ERTS_TOPOLOGY_CG] = cg;
        a++;
    } while (a < avail_sz && (level < 0
                              || avail[a].level[level] == cgcp->id));

    ASSERT(cgcp->cpu_groups == cg - *cgp);

    *cgp = cg;

    return a;
}

static int
cg_count_sub_levels_compare(const void *vx, const void *vy)
{
    erts_cpu_groups_count_t *x = (erts_cpu_groups_count_t *) vx;
    erts_cpu_groups_count_t *y = (erts_cpu_groups_count_t *) vy;
    if (x->sub_levels != y->sub_levels)
        return y->sub_levels - x->sub_levels;
    return x->id - y->id;
}

static int
cg_count_id_compare(const void *vx, const void *vy)
{
    erts_cpu_groups_count_t *x = (erts_cpu_groups_count_t *) vx;
    erts_cpu_groups_count_t *y = (erts_cpu_groups_count_t *) vy;
    return x->id - y->id;
}

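/*
 * (Re)builds a cpu-groups map. The spread level is the first topology
 * level (node, processor, processor node, core) that has more distinct
 * entries than the requested number of groups. If even the core level
 * has no more entries than groups, every core simply becomes its own
 * group. Otherwise the groups are divided as evenly as possible among
 * the entries on the level above the spread level (capped by the number
 * of spread-level units each of them contains, largest first), written
 * out per logical CPU via write_cpu_groups(), and hashed into the map
 * with cpu_group_insert(). For example (illustrative only): one node
 * with eight cores mapped onto three groups spreads at the core level
 * and yields groups of 2, 3 and 3 cores.
 */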
static void
make_cpu_groups_map(erts_cpu_groups_map_t *map, int test)
{
    int i, spread_level, avail_sz;
    erts_avail_cput no, *avail;
    erts_cpu_topology_t *cpudata;
    ErtsAlcType_t alc_type = (test
                              ? ERTS_ALC_T_TMP
                              : ERTS_ALC_T_CPU_GRPS_MAP);

    if (map->array)
        erts_free(alc_type, map->array);

    map->array = NULL;
    map->logical_processors = 0;
    map->size = 0;

    if (!map->groups)
        return;

    create_tmp_cpu_topology_copy(&cpudata, &avail_sz);

    if (!cpudata)
        return;

    cpu_bind_order_sort(cpudata,
                        avail_sz,
                        ERTS_CPU_BIND_NO_SPREAD,
                        1);

    avail = erts_alloc(ERTS_ALC_T_TMP,
                       sizeof(erts_avail_cput)*avail_sz);

    make_available_cpu_topology(&no, avail, cpudata,
                                &avail_sz, test);

    destroy_tmp_cpu_topology_copy(cpudata);

    map->size = avail_sz*2+1;

    map->array = erts_alloc(alc_type,
                            (sizeof(erts_cpu_groups_map_array_t)
                             * map->size));
    map->logical_processors = avail_sz;

    for (i = 0; i < map->size; i++) {
        map->array[i].logical = -1;
        map->array[i].cpu_group = -1;
    }

    spread_level = ERTS_TOPOLOGY_CORE;
    for (i = ERTS_TOPOLOGY_NODE; i < ERTS_TOPOLOGY_THREAD; i++) {
        if (no.level[i] > map->groups) {
            spread_level = i;
            break;
        }
    }

    if (no.level[spread_level] <= map->groups) {
        int a, cg, last = -1;
        cg = -1;
        ASSERT(spread_level == ERTS_TOPOLOGY_CORE);
        for (a = 0; a < avail_sz; a++) {
            if (last != avail[a].level[spread_level]) {
                cg++;
                last = avail[a].level[spread_level];
            }
            cpu_group_insert(map,
                             avail[a].level[ERTS_TOPOLOGY_LOGICAL],
                             cg);
        }
    }
    else { /* map->groups < no.level[spread_level] */
        erts_cpu_groups_count_t *cg_count;
        int a, cg, tl, toplevels;

        tl = spread_level-1;

        if (spread_level == ERTS_TOPOLOGY_NODE)
            toplevels = 1;
        else
            toplevels = no.level[tl];

        cg_count = erts_alloc(ERTS_ALC_T_TMP,
                              toplevels*sizeof(erts_cpu_groups_count_t));

        if (toplevels == 1) {
            cg_count[0].id = 0;
            cg_count[0].sub_levels = no.level[spread_level];
            cg_count[0].cpu_groups = map->groups;
        }
        else {
            int cgs_per_tl, cgs;
            cgs = map->groups;
            cgs_per_tl = cgs / toplevels;

            a = 0;
            for (i = 0; i < toplevels; i++) {
                cg_count[i].id = avail[a].level[tl];
                a = sub_levels(&cg_count[i], tl, a, avail_sz, avail);
            }

            qsort(cg_count,
                  toplevels,
                  sizeof(erts_cpu_groups_count_t),
                  cg_count_sub_levels_compare);

            for (i = 0; i < toplevels; i++) {
                if (cg_count[i].sub_levels < cgs_per_tl) {
                    cg_count[i].cpu_groups = cg_count[i].sub_levels;
                    cgs -= cg_count[i].sub_levels;
                }
                else {
                    cg_count[i].cpu_groups = cgs_per_tl;
                    cgs -= cgs_per_tl;
                }
            }

            while (cgs > 0) {
                for (i = 0; i < toplevels; i++) {
                    if (cg_count[i].sub_levels == cg_count[i].cpu_groups)
                        break;
                    else {
                        cg_count[i].cpu_groups++;
                        if (--cgs == 0)
                            break;
                    }
                }
            }

            qsort(cg_count,
                  toplevels,
                  sizeof(erts_cpu_groups_count_t),
                  cg_count_id_compare);
        }

        a = i = 0;
        cg = -1;
        while (a < avail_sz) {
            a = write_cpu_groups(&cg, &cg_count[i], tl,
                                 a, avail_sz, avail);
            i++;
        }

        ASSERT(map->groups == cg + 1);

        for (a = 0; a < avail_sz; a++)
            cpu_group_insert(map,
                             avail[a].level[ERTS_TOPOLOGY_LOGICAL],
                             avail[a].level[ERTS_TOPOLOGY_CG]);

        erts_free(ERTS_ALC_T_TMP, cg_count);
    }

    erts_free(ERTS_ALC_T_TMP, avail);
}

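/*
 * Registers a callback for a cpu-groups map with the given number of
 * groups (capped at max_main_threads). If a map with the same group
 * count already exists, the callback is just chained onto it;
 * otherwise a new map is built. The caller must hold cpuinfo_rwmtx in
 * write mode. The reader-groups and flxctr-groups callbacks above are
 * presumably registered this way during initialization.
 */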
static erts_cpu_groups_map_t *
add_cpu_groups(int groups,
               erts_cpu_groups_callback_t callback,
               void *arg)
{
    int use_groups = groups;
    erts_cpu_groups_callback_list_t *cgcl;
    erts_cpu_groups_map_t *cgm;

    ERTS_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx));

    if (use_groups > max_main_threads)
        use_groups = max_main_threads;

    if (!use_groups)
        return NULL;

    no_cpu_groups_callbacks++;
    cgcl = erts_alloc(ERTS_ALC_T_CPU_GRPS_MAP,
                      sizeof(erts_cpu_groups_callback_list_t));
    cgcl->callback = callback;
    cgcl->arg = arg;

    for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) {
        if (cgm->groups == use_groups) {
            cgcl->next = cgm->callback_list;
            cgm->callback_list = cgcl;
            return cgm;
        }
    }

    cgm = erts_alloc(ERTS_ALC_T_CPU_GRPS_MAP,
                     sizeof(erts_cpu_groups_map_t));
    cgm->next = cpu_groups_maps;
    cgm->groups = use_groups;
    cgm->array = NULL;
    cgm->size = 0;
    cgm->logical_processors = 0;
    cgm->callback_list = cgcl;

    cgcl->next = NULL;

    make_cpu_groups_map(cgm, 0);

    cpu_groups_maps = cgm;

    return cgm;
}

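/*
 * Maps a scheduler to its group in the given map. Schedulers that are
 * not bound to a CPU (cpu_id < 0) are spread over the groups by
 * scheduler number modulo the group count; bound schedulers are looked
 * up by their logical CPU id using the same linear probing as
 * cpu_group_insert().
 */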
static int
cpu_groups_lookup(erts_cpu_groups_map_t *map,
                  ErtsSchedulerData *esdp)
{
    int start, logical, ix;

    ERTS_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx)
                   || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx));

    if (esdp->cpu_id < 0)
        return (((int) esdp->no) - 1) % map->groups;

    logical = esdp->cpu_id;
    start = logical % map->size;
    ix = start;

    do {
        if (map->array[ix].logical == logical) {
            int group = map->array[ix].cpu_group;
            ASSERT(0 <= group && group < map->groups);
            return group;
        }
        ix++;
        if (ix == map->size)
            ix = 0;
    } while (ix != start);

    erts_exit(ERTS_ABORT_EXIT, "Logical cpu id %d not found\n", logical);
}

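/*
 * Rebuilds every registered cpu-groups map; called with cpuinfo_rwmtx
 * write-locked after the CPU topology or the set of available CPUs has
 * changed (see the cpu-info update code above).
 */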
static void
update_cpu_groups_maps(void)
{
    erts_cpu_groups_map_t *cgm;
    ERTS_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx));

    for (cgm = cpu_groups_maps; cgm; cgm = cgm->next)
        make_cpu_groups_map(cgm, 0);
}