1 /*
2 * Copyright © 2009 CNRS
3 * Copyright © 2009-2017 Inria. All rights reserved.
4 * Copyright © 2009-2013, 2015 Université Bordeaux
5 * Copyright © 2009-2014 Cisco Systems, Inc. All rights reserved.
6 * Copyright © 2015 Intel, Inc. All rights reserved.
7 * Copyright © 2010 IBM
8 * See COPYING in top-level directory.
9 */
10
11 #include <private/autogen/config.h>
12 #include <hwloc.h>
13 #include <hwloc/linux.h>
14 #include <private/misc.h>
15 #include <private/private.h>
16 #include <private/misc.h>
17 #include <private/debug.h>
18
19 #include <limits.h>
20 #include <stdio.h>
21 #include <fcntl.h>
22 #include <errno.h>
23 #include <assert.h>
24 #ifdef HAVE_DIRENT_H
25 #include <dirent.h>
26 #endif
27 #ifdef HAVE_UNISTD_H
28 #include <unistd.h>
29 #endif
30 #ifdef HWLOC_HAVE_LIBUDEV
31 #include <libudev.h>
32 #endif
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <sched.h>
36 #include <pthread.h>
37 #include <sys/mman.h>
38 #include <sys/syscall.h>
39 #include <mntent.h>
40 #if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND || defined HWLOC_HAVE_MOVE_PAGES
41 #define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
42 #include <numaif.h>
43 #endif
44
45 struct hwloc_linux_backend_data_s {
46 char *root_path; /* NULL if unused */
47 int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
48 int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
49 #ifdef HWLOC_HAVE_LIBUDEV
50 struct udev *udev; /* Global udev context */
51 #endif
52 char *dumped_hwdata_dirname;
53 enum {
54 HWLOC_LINUX_ARCH_X86, /* x86 32 or 64bits, including k1om (KNC) */
55 HWLOC_LINUX_ARCH_IA64,
56 HWLOC_LINUX_ARCH_ARM,
57 HWLOC_LINUX_ARCH_POWER,
58 HWLOC_LINUX_ARCH_UNKNOWN
59 } arch;
60 int is_knl;
61 int is_amd_with_CU;
62 struct utsname utsname; /* fields contain \0 when unknown */
63 unsigned fallback_nbprocessors;
64 unsigned pagesize;
65
66 int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
67 int mic_need_directlookup; /* -1 if not tried yet, 0 if not needed, 1 if needed */
68 unsigned mic_directlookup_id_max; /* -1 if not tried yet, 0 if none to lookup, maxid+1 otherwise */
69 };
70
71
72
73 /***************************
74 * Misc Abstraction layers *
75 ***************************/
76
77 #if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE_SYSCALL)
78 /* libc doesn't have support for sched_setaffinity, make system call
79 * ourselves: */
80 # include <linux/unistd.h>
81 # ifndef __NR_sched_setaffinity
82 # ifdef __i386__
83 # define __NR_sched_setaffinity 241
84 # elif defined(__x86_64__)
85 # define __NR_sched_setaffinity 203
86 # elif defined(__ia64__)
87 # define __NR_sched_setaffinity 1231
88 # elif defined(__hppa__)
89 # define __NR_sched_setaffinity 211
90 # elif defined(__alpha__)
91 # define __NR_sched_setaffinity 395
92 # elif defined(__s390__)
93 # define __NR_sched_setaffinity 239
94 # elif defined(__sparc__)
95 # define __NR_sched_setaffinity 261
96 # elif defined(__m68k__)
97 # define __NR_sched_setaffinity 311
98 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
99 # define __NR_sched_setaffinity 222
100 # elif defined(__arm__)
101 # define __NR_sched_setaffinity 241
102 # elif defined(__cris__)
103 # define __NR_sched_setaffinity 241
104 /*# elif defined(__mips__)
105 # define __NR_sched_setaffinity TODO (32/64/nabi) */
106 # else
107 # warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
108 # define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
109 # endif
110 # endif
111 # ifndef sched_setaffinity
112 # define sched_setaffinity(pid, lg, mask) syscall(__NR_sched_setaffinity, pid, lg, mask)
113 # endif
114 # ifndef __NR_sched_getaffinity
115 # ifdef __i386__
116 # define __NR_sched_getaffinity 242
117 # elif defined(__x86_64__)
118 # define __NR_sched_getaffinity 204
119 # elif defined(__ia64__)
120 # define __NR_sched_getaffinity 1232
121 # elif defined(__hppa__)
122 # define __NR_sched_getaffinity 212
123 # elif defined(__alpha__)
124 # define __NR_sched_getaffinity 396
125 # elif defined(__s390__)
126 # define __NR_sched_getaffinity 240
127 # elif defined(__sparc__)
128 # define __NR_sched_getaffinity 260
129 # elif defined(__m68k__)
130 # define __NR_sched_getaffinity 312
131 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
132 # define __NR_sched_getaffinity 223
133 # elif defined(__arm__)
134 # define __NR_sched_getaffinity 242
135 # elif defined(__cris__)
136 # define __NR_sched_getaffinity 242
137 /*# elif defined(__mips__)
138 # define __NR_sched_getaffinity TODO (32/64/nabi) */
139 # else
140 # warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
141 # define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
142 # endif
143 # endif
144 # ifndef sched_getaffinity
145 # define sched_getaffinity(pid, lg, mask) (syscall(__NR_sched_getaffinity, pid, lg, mask) < 0 ? -1 : 0)
146 # endif
147 #endif
148
149 /* Added for ntohl() */
150 #include <arpa/inet.h>
151
152 #ifdef HAVE_OPENAT
153 /* Use our own filesystem functions if we have openat */
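/* Note: these wrappers implement hwloc's "virtual fsroot": callers pass
 * absolute paths such as "/proc/cpuinfo", hwloc_checkat() strips the leading
 * slashes, and the *at() call resolves the remainder relative to root_fd.
 * This is what lets HWLOC_FSROOT point at a saved copy of /proc and /sys
 * instead of the live system. */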
154
155 static const char *
156 hwloc_checkat(const char *path, int fsroot_fd)
157 {
158 const char *relative_path;
159 if (fsroot_fd < 0) {
160 errno = EBADF;
161 return NULL;
162 }
163
164 /* Skip leading slashes. */
165 for (relative_path = path; *relative_path == '/'; relative_path++);
166
167 return relative_path;
168 }
169
170 static int
171 hwloc_openat(const char *path, int fsroot_fd)
172 {
173 const char *relative_path;
174
175 relative_path = hwloc_checkat(path, fsroot_fd);
176 if (!relative_path)
177 return -1;
178
179 return openat (fsroot_fd, relative_path, O_RDONLY);
180 }
181
182 static FILE *
183 hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
184 {
185 int fd;
186
187 if (strcmp(mode, "r")) {
188 errno = ENOTSUP;
189 return NULL;
190 }
191
192 fd = hwloc_openat (path, fsroot_fd);
193 if (fd == -1)
194 return NULL;
195
196 return fdopen(fd, mode);
197 }
198
199 static int
200 hwloc_accessat(const char *path, int mode, int fsroot_fd)
201 {
202 const char *relative_path;
203
204 relative_path = hwloc_checkat(path, fsroot_fd);
205 if (!relative_path)
206 return -1;
207
208 return faccessat(fsroot_fd, relative_path, mode, 0);
209 }
210
211 static int
212 hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
213 {
214 const char *relative_path;
215
216 relative_path = hwloc_checkat(path, fsroot_fd);
217 if (!relative_path)
218 return -1;
219
220 return fstatat(fsroot_fd, relative_path, st, flags);
221 }
222
223 static DIR*
224 hwloc_opendirat(const char *path, int fsroot_fd)
225 {
226 int dir_fd;
227 const char *relative_path;
228
229 relative_path = hwloc_checkat(path, fsroot_fd);
230 if (!relative_path)
231 return NULL;
232
233 dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
234 if (dir_fd < 0)
235 return NULL;
236
237 return fdopendir(dir_fd);
238 }
239
240 #endif /* HAVE_OPENAT */
241
242 /* Static inline versions of open and fopen so that we can use openat if we
243 have it, but still preserve compiler parameter checking */
244 static __hwloc_inline int
245 hwloc_open(const char *p, int d __hwloc_attribute_unused)
246 {
247 #ifdef HAVE_OPENAT
248 return hwloc_openat(p, d);
249 #else
250 return open(p, O_RDONLY);
251 #endif
252 }
253
254 static __hwloc_inline FILE *
255 hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
256 {
257 #ifdef HAVE_OPENAT
258 return hwloc_fopenat(p, m, d);
259 #else
260 return fopen(p, m);
261 #endif
262 }
263
264 /* Static inline version of access so that we can use openat if we have
265 it, but still preserve compiler parameter checking */
266 static __hwloc_inline int
267 hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
268 {
269 #ifdef HAVE_OPENAT
270 return hwloc_accessat(p, m, d);
271 #else
272 return access(p, m);
273 #endif
274 }
275
276 static __hwloc_inline int
277 hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
278 {
279 #ifdef HAVE_OPENAT
280 return hwloc_fstatat(p, st, 0, d);
281 #else
282 return stat(p, st);
283 #endif
284 }
285
286 static __hwloc_inline int
287 hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
288 {
289 #ifdef HAVE_OPENAT
290 return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
291 #else
292 return lstat(p, st);
293 #endif
294 }
295
296 /* Static inline version of opendir so that we can use openat if we have
297 it, but still preserve compiler parameter checking */
298 static __hwloc_inline DIR *
299 hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
300 {
301 #ifdef HAVE_OPENAT
302 return hwloc_opendirat(p, d);
303 #else
304 return opendir(p);
305 #endif
306 }
307
308
309 /*****************************************
310 ******* Helpers for reading files *******
311 *****************************************/
312
313 static __hwloc_inline int
314 hwloc_read_path_by_length(const char *path, char *string, size_t length, int fsroot_fd)
315 {
316 int fd, ret;
317
318 fd = hwloc_open(path, fsroot_fd);
319 if (fd < 0)
320 return -1;
321
322 ret = read(fd, string, length-1); /* read length-1 bytes so there is room for the ending \0 */
323 close(fd);
324
325 if (ret <= 0)
326 return -1;
327
328 string[ret] = 0;
329
330 return 0;
331 }
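/* Illustrative usage (hypothetical caller): read a small sysfs attribute into
 * a fixed-size buffer, e.g.
 *   char buf[16];
 *   if (!hwloc_read_path_by_length("/sys/devices/system/cpu/cpu0/topology/physical_package_id",
 *                                  buf, sizeof(buf), data->root_fd))
 *     pkg = atoi(buf);
 * The helpers below wrap this pattern for int/unsigned values. */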
332
333 static __hwloc_inline int
334 hwloc_read_path_as_int(const char *path, int *value, int fsroot_fd)
335 {
336 char string[11];
337 if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
338 return -1;
339 *value = atoi(string);
340 return 0;
341 }
342
343 static __hwloc_inline int
344 hwloc_read_path_as_uint(const char *path, unsigned *value, int fsroot_fd)
345 {
346 char string[11];
347 if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
348 return -1;
349 *value = (unsigned) strtoul(string, NULL, 10);
350 return 0;
351 }
352
353 /* Read everything from fd and save it into a newly allocated buffer
354 * returned in bufferp. Use sizep as a default buffer size, and return
355 * the actually needed size in sizep.
356 */
357 static __hwloc_inline int
358 hwloc__read_fd(int fd, char **bufferp, size_t *sizep)
359 {
360 char *buffer;
361 size_t toread, filesize, totalread;
362 ssize_t ret;
363
364 toread = filesize = *sizep;
365
366 /* Alloc and read +1 so that we get EOF on 2^n without reading once more */
367 buffer = malloc(filesize+1);
368 if (!buffer)
369 return -1;
370
371 ret = read(fd, buffer, toread+1);
372 if (ret < 0) {
373 free(buffer);
374 return -1;
375 }
376
377 totalread = (size_t) ret;
378
379 if (totalread < toread + 1)
380 /* Normal case, a single read got EOF */
381 goto done;
382
383 /* Unexpected case, must extend the buffer and read again.
384 * Only occurs on first invocation and if the kernel ever uses multiple pages for a single mask.
385 */
386 do {
387 char *tmp;
388
389 toread = filesize;
390 filesize *= 2;
391
392 tmp = realloc(buffer, filesize+1);
393 if (!tmp) {
394 free(buffer);
395 return -1;
396 }
397 buffer = tmp;
398
399 ret = read(fd, buffer+toread+1, toread);
400 if (ret < 0) {
401 free(buffer);
402 return -1;
403 }
404
405 totalread += ret;
406 } while ((size_t) ret == toread);
407
408 done:
409 buffer[totalread] = '\0';
410 *bufferp = buffer;
411 *sizep = filesize;
412 return 0;
413 }
414
415 /* kernel cpumaps are composed of an array of 32bits cpumasks */
416 #define KERNEL_CPU_MASK_BITS 32
417 #define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
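/* For illustration: a sysfs cpumap such as "00000000,00000003\n" is a
 * comma-separated list of 32-bit hexadecimal words, most-significant word
 * first, so this example means CPUs #0 and #1 are set. The parsing loop in
 * hwloc__read_fd_as_cpumask() below reads the words in file order and then
 * reverses them when filling the hwloc bitmap. */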
418
419 static __hwloc_inline int
420 hwloc__read_fd_as_cpumask(int fd, hwloc_bitmap_t set)
421 {
422 static size_t _filesize = 0; /* will be dynamically initialized to hwloc_get_pagesize(), and increased later if needed */
423 size_t filesize;
424 unsigned long *maps;
425 unsigned long map;
426 int nr_maps = 0;
427 static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
428 * Actually, it may increase multiple times if first read cpumaps start with zeroes.
429 */
430 int nr_maps_allocated = _nr_maps_allocated;
431 char *buffer, *tmpbuf;
432 int i;
433
434 /* Kernel sysfs files are usually at most one page. 4kB may contain 455 32-bit
435 * masks (followed by comma), enough for 14k PUs. So allocate a page by default for now.
436 *
437 * If we ever need a larger buffer, we'll realloc() the buffer during the first
438 * invocation of this function so that others directly allocate the right size
439 * (all cpumask files have the exact same size).
440 */
441 filesize = _filesize;
442 if (!filesize)
443 filesize = hwloc_getpagesize();
444 if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
445 return -1;
446 /* Only update the static value with the final one,
447 * to avoid sharing intermediate values that we modify,
448 * in case there's ever multiple concurrent calls.
449 */
450 _filesize = filesize;
451
452 maps = malloc(nr_maps_allocated * sizeof(*maps));
453 if (!maps) {
454 free(buffer);
455 return -1;
456 }
457
458 /* reset to zero first */
459 hwloc_bitmap_zero(set);
460
461 /* parse the whole mask */
462 tmpbuf = buffer;
463 while (sscanf(tmpbuf, "%lx", &map) == 1) {
464 /* read one kernel cpu mask and the ending comma */
465 if (nr_maps == nr_maps_allocated) {
466 unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
467 if (!tmp) {
468 free(buffer);
469 free(maps);
470 return -1;
471 }
472 maps = tmp;
473 nr_maps_allocated *= 2;
474 }
475
476 tmpbuf = strchr(tmpbuf, ',');
477 if (!tmpbuf) {
478 maps[nr_maps++] = map;
479 break;
480 } else
481 tmpbuf++;
482
483 if (!map && !nr_maps)
484 /* ignore the first map if it's empty */
485 continue;
486
487 maps[nr_maps++] = map;
488 }
489
490 free(buffer);
491
492 /* convert into a set */
493 #if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
494 for(i=0; i<nr_maps; i++)
495 hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
496 #else
497 for(i=0; i<(nr_maps+1)/2; i++) {
498 unsigned long mask;
499 mask = maps[nr_maps-2*i-1];
500 if (2*i+1<nr_maps)
501 mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
502 hwloc_bitmap_set_ith_ulong(set, i, mask);
503 }
504 #endif
505
506 free(maps);
507
508 /* Only update the static value with the final one,
509 * to avoid sharing intermediate values that we modify,
510 * in case there's ever multiple concurrent calls.
511 */
512 if (nr_maps_allocated > _nr_maps_allocated)
513 _nr_maps_allocated = nr_maps_allocated;
514 return 0;
515 }
516
517 static __hwloc_inline int
518 hwloc__read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
519 {
520 int fd, err;
521 fd = hwloc_open(maskpath, fsroot_fd);
522 if (fd < 0)
523 return -1;
524 err = hwloc__read_fd_as_cpumask(fd, set);
525 close(fd);
526 return err;
527 }
528
529 static __hwloc_inline hwloc_bitmap_t
530 hwloc__alloc_read_path_as_cpumask(const char *maskpath, int fsroot_fd)
531 {
532 hwloc_bitmap_t set;
533 int err;
534 set = hwloc_bitmap_alloc();
535 if (!set)
536 return NULL;
537 err = hwloc__read_path_as_cpumask(maskpath, set, fsroot_fd);
538 if (err < 0) {
539 hwloc_bitmap_free(set);
540 return NULL;
541 } else
542 return set;
543 }
544
545 /* set must be full on input */
546 static __hwloc_inline int
547 hwloc__read_fd_as_cpulist(int fd, hwloc_bitmap_t set)
548 {
549 /* Kernel sysfs files are usually at most one page.
550 * But cpulists can be of very different sizes depending on the fragmentation,
551 * so don't bother remembering the actual read size between invocations.
552 * We don't have many invocations anyway.
553 */
554 size_t filesize = hwloc_getpagesize();
555 char *buffer, *current, *comma, *tmp;
556 int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
557
558 if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
559 return -1;
560
561 current = buffer;
562 prevlast = -1;
563
564 while (1) {
565 /* save a pointer to the next comma and erase it to simplify things */
566 comma = strchr(current, ',');
567 if (comma)
568 *comma = '\0';
569
570 /* find current enabled-segment bounds */
571 nextfirst = strtoul(current, &tmp, 0);
572 if (*tmp == '-')
573 nextlast = strtoul(tmp+1, NULL, 0);
574 else
575 nextlast = nextfirst;
576 if (prevlast+1 <= nextfirst-1)
577 hwloc_bitmap_clr_range(set, prevlast+1, nextfirst-1);
578
579 /* switch to next enabled-segment */
580 prevlast = nextlast;
581 if (!comma)
582 break;
583 current = comma+1;
584 }
585
586 hwloc_bitmap_clr_range(set, prevlast+1, -1);
587 free(buffer);
588 return 0;
589 }
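/* For illustration: a sysfs cpulist such as "0-3,8-11\n" lists enabled ranges.
 * Since the input set is expected to be full, the function above only clears
 * the gaps (here 4-7 and everything above 11), which is why the "set must be
 * full on input" requirement matters. */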
590
591
592 /*****************************
593 ******* CpuBind Hooks *******
594 *****************************/
595
596 int
597 hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
598 {
599 /* TODO Kerrighed: Use
600 * int migrate (pid_t pid, int destination_node);
601 * int migrate_self (int destination_node);
602 * int thread_migrate (int thread_id, int destination_node);
603 */
604
605 /* The resulting binding is always strict */
606
607 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
608 cpu_set_t *plinux_set;
609 unsigned cpu;
610 int last;
611 size_t setsize;
612 int err;
613
614 last = hwloc_bitmap_last(hwloc_set);
615 if (last == -1) {
616 errno = EINVAL;
617 return -1;
618 }
619
620 setsize = CPU_ALLOC_SIZE(last+1);
621 plinux_set = CPU_ALLOC(last+1);
622
623 CPU_ZERO_S(setsize, plinux_set);
624 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
625 CPU_SET_S(cpu, setsize, plinux_set);
626 hwloc_bitmap_foreach_end();
627
628 err = sched_setaffinity(tid, setsize, plinux_set);
629
630 CPU_FREE(plinux_set);
631 return err;
632 #elif defined(HWLOC_HAVE_CPU_SET)
633 cpu_set_t linux_set;
634 unsigned cpu;
635
636 CPU_ZERO(&linux_set);
637 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
638 CPU_SET(cpu, &linux_set);
639 hwloc_bitmap_foreach_end();
640
641 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
642 return sched_setaffinity(tid, &linux_set);
643 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
644 return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
645 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
646 #elif defined(HWLOC_HAVE_SYSCALL)
647 unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
648
649 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
650 return sched_setaffinity(tid, (void*) &mask);
651 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
652 return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
653 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
654 #else /* !SYSCALL */
655 errno = ENOSYS;
656 return -1;
657 #endif /* !SYSCALL */
658 }
659
660 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
661 /*
662 * On some kernels, sched_getaffinity requires the output size to be larger
663 * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
664 * Try sched_getaffinity on ourselves until we find a nr_cpus value that makes
665 * the kernel happy.
666 */
667 static int
668 hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
669 {
670 static int _nr_cpus = -1;
671 int nr_cpus = _nr_cpus;
672 int fd;
673
674 if (nr_cpus != -1)
675 /* already computed */
676 return nr_cpus;
677
678 if (topology->levels[0][0]->complete_cpuset)
679 /* start with a nr_cpus that may contain the whole topology */
680 nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
681 if (nr_cpus <= 0)
682 /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
683 nr_cpus = 1;
684
685 fd = open("/sys/devices/system/cpu/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
686 if (fd >= 0) {
687 hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc_full();
688 if (hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) {
689 int max_possible = hwloc_bitmap_last(possible_bitmap);
690 hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
691
692 if (nr_cpus < max_possible + 1)
693 nr_cpus = max_possible + 1;
694 }
695 close(fd);
696 hwloc_bitmap_free(possible_bitmap);
697 }
698
699 while (1) {
700 cpu_set_t *set = CPU_ALLOC(nr_cpus);
701 size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
702 int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
703 CPU_FREE(set);
704 nr_cpus = setsize * 8; /* that's the value that was actually tested */
705 if (!err)
706 /* Found it. Only update the static value with the final one,
707 * to avoid sharing intermediate values that we modify,
708 * in case there's ever multiple concurrent calls.
709 */
710 return _nr_cpus = nr_cpus;
711 nr_cpus *= 2;
712 }
713 }
714 #endif
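/* Example of the probing above (illustrative numbers): on a kernel built with
 * CONFIG_NR_CPUS=2048, sched_getaffinity() rejects a 1024-bit user buffer with
 * EINVAL; CPU_ALLOC(2048) (256 bytes) is the first size that succeeds, so the
 * function returns 2048 and caches it for later calls. */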
715
716 int
717 hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
718 {
719 int err __hwloc_attribute_unused;
720 /* TODO Kerrighed */
721
722 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
723 cpu_set_t *plinux_set;
724 unsigned cpu;
725 int last;
726 size_t setsize;
727 int kernel_nr_cpus;
728
729 /* find the kernel nr_cpus so as to use a large enough cpu_set size */
730 kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
731 setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
732 plinux_set = CPU_ALLOC(kernel_nr_cpus);
733
734 err = sched_getaffinity(tid, setsize, plinux_set);
735
736 if (err < 0) {
737 CPU_FREE(plinux_set);
738 return -1;
739 }
740
741 last = -1;
742 if (topology->levels[0][0]->complete_cpuset)
743 last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
744 if (last == -1)
745 /* fall back to the maximal supported number; the topology isn't ready yet (complete_cpuset is missing or empty) */
746 last = kernel_nr_cpus-1;
747
748 hwloc_bitmap_zero(hwloc_set);
749 for(cpu=0; cpu<=(unsigned) last; cpu++)
750 if (CPU_ISSET_S(cpu, setsize, plinux_set))
751 hwloc_bitmap_set(hwloc_set, cpu);
752
753 CPU_FREE(plinux_set);
754 #elif defined(HWLOC_HAVE_CPU_SET)
755 cpu_set_t linux_set;
756 unsigned cpu;
757
758 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
759 err = sched_getaffinity(tid, &linux_set);
760 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
761 err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
762 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
763 if (err < 0)
764 return -1;
765
766 hwloc_bitmap_zero(hwloc_set);
767 for(cpu=0; cpu<CPU_SETSIZE; cpu++)
768 if (CPU_ISSET(cpu, &linux_set))
769 hwloc_bitmap_set(hwloc_set, cpu);
770 #elif defined(HWLOC_HAVE_SYSCALL)
771 unsigned long mask;
772
773 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
774 err = sched_getaffinity(tid, (void*) &mask);
775 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
776 err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
777 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
778 if (err < 0)
779 return -1;
780
781 hwloc_bitmap_from_ulong(hwloc_set, mask);
782 #else /* !SYSCALL */
783 errno = ENOSYS;
784 return -1;
785 #endif /* !SYSCALL */
786
787 return 0;
788 }
789
790 /* Get the array of tids of a process from the task directory in /proc */
791 static int
792 hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
793 {
794 struct dirent *dirent;
795 unsigned nr_tids = 0;
796 unsigned max_tids = 32;
797 pid_t *tids;
798 struct stat sb;
799
800 /* take the number of links as a good estimate for the number of tids */
801 if (fstat(dirfd(taskdir), &sb) == 0)
802 max_tids = sb.st_nlink;
803
804 tids = malloc(max_tids*sizeof(pid_t));
805 if (!tids) {
806 errno = ENOMEM;
807 return -1;
808 }
809
810 rewinddir(taskdir);
811
812 while ((dirent = readdir(taskdir)) != NULL) {
813 if (nr_tids == max_tids) {
814 pid_t *newtids;
815 max_tids += 8;
816 newtids = realloc(tids, max_tids*sizeof(pid_t));
817 if (!newtids) {
818 free(tids);
819 errno = ENOMEM;
820 return -1;
821 }
822 tids = newtids;
823 }
824 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
825 continue;
826 tids[nr_tids++] = atoi(dirent->d_name);
827 }
828
829 *nr_tidsp = nr_tids;
830 *tidsp = tids;
831 return 0;
832 }
833
834 /* Per-tid callbacks */
835 typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
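/* The callback receives the iteration index so that get-style callbacks can
 * reset their accumulator on the first thread (idx == 0); see the cpubind and
 * last_cpu_location callbacks below. */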
836
837 static int
838 hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
839 pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
840 void *data)
841 {
842 char taskdir_path[128];
843 DIR *taskdir;
844 pid_t *tids, *newtids;
845 unsigned i, nr, newnr, failed = 0, failed_errno = 0;
846 unsigned retrynr = 0;
847 int err;
848
849 if (pid)
850 snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
851 else
852 snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
853
854 taskdir = opendir(taskdir_path);
855 if (!taskdir) {
856 if (errno == ENOENT)
857 errno = EINVAL;
858 err = -1;
859 goto out;
860 }
861
862 /* read the current list of threads */
863 err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
864 if (err < 0)
865 goto out_with_dir;
866
867 retry:
868 /* apply the callback to all threads */
869 failed=0;
870 for(i=0; i<nr; i++) {
871 err = cb(topology, tids[i], data, i);
872 if (err < 0) {
873 failed++;
874 failed_errno = errno;
875 }
876 }
877
878 /* re-read the list of threads */
879 err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
880 if (err < 0)
881 goto out_with_tids;
882 /* retry if the list changed in the meantime, or we failed for *some* threads only.
883 * if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
884 */
885 if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
886 free(tids);
887 tids = newtids;
888 nr = newnr;
889 if (++retrynr > 10) {
890 /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
891 errno = EAGAIN;
892 err = -1;
893 goto out_with_tids;
894 }
895 goto retry;
896 } else {
897 free(newtids);
898 }
899
900 /* if all threads failed, return the last errno. */
901 if (failed) {
902 err = -1;
903 errno = failed_errno;
904 goto out_with_tids;
905 }
906
907 err = 0;
908 out_with_tids:
909 free(tids);
910 out_with_dir:
911 closedir(taskdir);
912 out:
913 return err;
914 }
915
916 /* Per-tid proc_set_cpubind callback and caller.
917 * Callback data is a hwloc_bitmap_t. */
918 static int
919 hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
920 {
921 return hwloc_linux_set_tid_cpubind(topology, tid, (hwloc_bitmap_t) data);
922 }
923
924 static int
925 hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
926 {
927 return hwloc_linux_foreach_proc_tid(topology, pid,
928 hwloc_linux_foreach_proc_tid_set_cpubind_cb,
929 (void*) hwloc_set);
930 }
931
932 /* Per-tid proc_get_cpubind callback data, callback function and caller */
933 struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
934 hwloc_bitmap_t cpuset;
935 hwloc_bitmap_t tidset;
936 int flags;
937 };
938
939 static int
940 hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
941 {
942 struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *data = _data;
943 hwloc_bitmap_t cpuset = data->cpuset;
944 hwloc_bitmap_t tidset = data->tidset;
945 int flags = data->flags;
946
947 if (hwloc_linux_get_tid_cpubind(topology, tid, tidset))
948 return -1;
949
950 /* reset the cpuset on first iteration */
951 if (!idx)
952 hwloc_bitmap_zero(cpuset);
953
954 if (flags & HWLOC_CPUBIND_STRICT) {
955 /* if STRICT, we want all threads to have the same binding */
956 if (!idx) {
957 /* this is the first thread, copy its binding */
958 hwloc_bitmap_copy(cpuset, tidset);
959 } else if (!hwloc_bitmap_isequal(cpuset, tidset)) {
960 /* this is not the first thread, and its binding is different */
961 errno = EXDEV;
962 return -1;
963 }
964 } else {
965 /* if not STRICT, just OR all thread bindings */
966 hwloc_bitmap_or(cpuset, cpuset, tidset);
967 }
968 return 0;
969 }
970
971 static int
972 hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
973 {
974 struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
975 hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
976 int ret;
977
978 data.cpuset = hwloc_set;
979 data.tidset = tidset;
980 data.flags = flags;
981 ret = hwloc_linux_foreach_proc_tid(topology, pid,
982 hwloc_linux_foreach_proc_tid_get_cpubind_cb,
983 (void*) &data);
984 hwloc_bitmap_free(tidset);
985 return ret;
986 }
987
988 static int
989 hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
990 {
991 if (pid == 0)
992 pid = topology->pid;
993 if (flags & HWLOC_CPUBIND_THREAD)
994 return hwloc_linux_set_tid_cpubind(topology, pid, hwloc_set);
995 else
996 return hwloc_linux_set_pid_cpubind(topology, pid, hwloc_set, flags);
997 }
998
999 static int
1000 hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1001 {
1002 if (pid == 0)
1003 pid = topology->pid;
1004 if (flags & HWLOC_CPUBIND_THREAD)
1005 return hwloc_linux_get_tid_cpubind(topology, pid, hwloc_set);
1006 else
1007 return hwloc_linux_get_pid_cpubind(topology, pid, hwloc_set, flags);
1008 }
1009
1010 static int
1011 hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
1012 {
1013 return hwloc_linux_set_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1014 }
1015
1016 static int
1017 hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1018 {
1019 return hwloc_linux_get_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1020 }
1021
1022 static int
1023 hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1024 {
1025 if (topology->pid) {
1026 errno = ENOSYS;
1027 return -1;
1028 }
1029 return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1030 }
1031
1032 static int
1033 hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1034 {
1035 if (topology->pid) {
1036 errno = ENOSYS;
1037 return -1;
1038 }
1039 return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1040 }
1041
1042 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
1043 #pragma weak pthread_setaffinity_np
1044 #pragma weak pthread_self
1045
1046 static int
1047 hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1048 {
1049 int err;
1050
1051 if (topology->pid) {
1052 errno = ENOSYS;
1053 return -1;
1054 }
1055
1056 if (!pthread_self) {
1057 /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
1058 errno = ENOSYS;
1059 return -1;
1060 }
1061 if (tid == pthread_self())
1062 return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1063
1064 if (!pthread_setaffinity_np) {
1065 errno = ENOSYS;
1066 return -1;
1067 }
1068 /* TODO Kerrighed: Use
1069 * int migrate (pid_t pid, int destination_node);
1070 * int migrate_self (int destination_node);
1071 * int thread_migrate (int thread_id, int destination_node);
1072 */
1073
1074 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1075 /* Use a separate block so that we can define specific variable
1076 types here */
1077 {
1078 cpu_set_t *plinux_set;
1079 unsigned cpu;
1080 int last;
1081 size_t setsize;
1082
1083 last = hwloc_bitmap_last(hwloc_set);
1084 if (last == -1) {
1085 errno = EINVAL;
1086 return -1;
1087 }
1088
1089 setsize = CPU_ALLOC_SIZE(last+1);
1090 plinux_set = CPU_ALLOC(last+1);
1091
1092 CPU_ZERO_S(setsize, plinux_set);
1093 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1094 CPU_SET_S(cpu, setsize, plinux_set);
1095 hwloc_bitmap_foreach_end();
1096
1097 err = pthread_setaffinity_np(tid, setsize, plinux_set);
1098
1099 CPU_FREE(plinux_set);
1100 }
1101 #elif defined(HWLOC_HAVE_CPU_SET)
1102 /* Use a separate block so that we can define specific variable
1103 types here */
1104 {
1105 cpu_set_t linux_set;
1106 unsigned cpu;
1107
1108 CPU_ZERO(&linux_set);
1109 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1110 CPU_SET(cpu, &linux_set);
1111 hwloc_bitmap_foreach_end();
1112
1113 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1114 err = pthread_setaffinity_np(tid, &linux_set);
1115 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1116 err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
1117 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1118 }
1119 #else /* CPU_SET */
1120 /* Use a separate block so that we can define specific variable
1121 types here */
1122 {
1123 unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
1124
1125 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1126 err = pthread_setaffinity_np(tid, (void*) &mask);
1127 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1128 err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
1129 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1130 }
1131 #endif /* CPU_SET */
1132
1133 if (err) {
1134 errno = err;
1135 return -1;
1136 }
1137 return 0;
1138 }
1139 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
1140
1141 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
1142 #pragma weak pthread_getaffinity_np
1143 #pragma weak pthread_self
1144
1145 static int
1146 hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1147 {
1148 int err;
1149
1150 if (topology->pid) {
1151 errno = ENOSYS;
1152 return -1;
1153 }
1154
1155 if (!pthread_self) {
1156 /* ?! Application uses get_thread_cpubind, but doesn't link against libpthread ?! */
1157 errno = ENOSYS;
1158 return -1;
1159 }
1160 if (tid == pthread_self())
1161 return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1162
1163 if (!pthread_getaffinity_np) {
1164 errno = ENOSYS;
1165 return -1;
1166 }
1167 /* TODO Kerrighed */
1168
1169 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1170 /* Use a separate block so that we can define specific variable
1171 types here */
1172 {
1173 cpu_set_t *plinux_set;
1174 unsigned cpu;
1175 int last;
1176 size_t setsize;
1177
1178 last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
1179 assert (last != -1);
1180
1181 setsize = CPU_ALLOC_SIZE(last+1);
1182 plinux_set = CPU_ALLOC(last+1);
1183
1184 err = pthread_getaffinity_np(tid, setsize, plinux_set);
1185 if (err) {
1186 CPU_FREE(plinux_set);
1187 errno = err;
1188 return -1;
1189 }
1190
1191 hwloc_bitmap_zero(hwloc_set);
1192 for(cpu=0; cpu<=(unsigned) last; cpu++)
1193 if (CPU_ISSET_S(cpu, setsize, plinux_set))
1194 hwloc_bitmap_set(hwloc_set, cpu);
1195
1196 CPU_FREE(plinux_set);
1197 }
1198 #elif defined(HWLOC_HAVE_CPU_SET)
1199 /* Use a separate block so that we can define specific variable
1200 types here */
1201 {
1202 cpu_set_t linux_set;
1203 unsigned cpu;
1204
1205 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1206 err = pthread_getaffinity_np(tid, &linux_set);
1207 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1208 err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
1209 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1210 if (err) {
1211 errno = err;
1212 return -1;
1213 }
1214
1215 hwloc_bitmap_zero(hwloc_set);
1216 for(cpu=0; cpu<CPU_SETSIZE; cpu++)
1217 if (CPU_ISSET(cpu, &linux_set))
1218 hwloc_bitmap_set(hwloc_set, cpu);
1219 }
1220 #else /* CPU_SET */
1221 /* Use a separate block so that we can define specific variable
1222 types here */
1223 {
1224 unsigned long mask;
1225
1226 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1227 err = pthread_getaffinity_np(tid, (void*) &mask);
1228 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1229 err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
1230 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1231 if (err) {
1232 errno = err;
1233 return -1;
1234 }
1235
1236 hwloc_bitmap_from_ulong(hwloc_set, mask);
1237 }
1238 #endif /* CPU_SET */
1239
1240 return 0;
1241 }
1242 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
1243
1244 int
1245 hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
1246 {
1247 /* read /proc/pid/stat.
1248 * its second field contains the command name between parentheses,
1249 * and the command itself may contain parentheses,
1250 * so read the whole line and find the last closing parenthesis to find the third field.
1251 */
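/* For example (hypothetical content), a stat line such as
 *   "1234 (watch (dog)) S 1 1234 ..."
 * has a command name that itself contains parentheses; searching for the
 * last ')' is what keeps the field counting below correct. */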
1252 char buf[1024] = "";
1253 char name[64];
1254 char *tmp;
1255 int fd, i, err;
1256
1257 if (!tid) {
1258 #ifdef SYS_gettid
1259 tid = syscall(SYS_gettid);
1260 #else
1261 errno = ENOSYS;
1262 return -1;
1263 #endif
1264 }
1265
1266 snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
1267 fd = open(name, O_RDONLY); /* no fsroot for real /proc */
1268 if (fd < 0) {
1269 errno = ENOSYS;
1270 return -1;
1271 }
1272 err = read(fd, buf, sizeof(buf)-1); /* read sizeof(buf)-1 bytes so there is room for the ending \0 */
1273 close(fd);
1274 if (err <= 0) {
1275 errno = ENOSYS;
1276 return -1;
1277 }
1278 buf[err-1] = '\0';
1279
1280 tmp = strrchr(buf, ')');
1281 if (!tmp) {
1282 errno = ENOSYS;
1283 return -1;
1284 }
1285 /* skip ') ' to find the actual third argument */
1286 tmp += 2;
1287
1288 /* skip 36 fields to reach the 39th (processor) field */
1289 for(i=0; i<36; i++) {
1290 tmp = strchr(tmp, ' ');
1291 if (!tmp) {
1292 errno = ENOSYS;
1293 return -1;
1294 }
1295 /* skip the ' ' itself */
1296 tmp++;
1297 }
1298
1299 /* now read the last cpu from the 39th (processor) field */
1300 if (sscanf(tmp, "%d ", &i) != 1) {
1301 errno = ENOSYS;
1302 return -1;
1303 }
1304
1305 hwloc_bitmap_only(set, i);
1306 return 0;
1307 }
1308
1309 /* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
1310 struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
1311 hwloc_bitmap_t cpuset;
1312 hwloc_bitmap_t tidset;
1313 };
1314
1315 static int
1316 hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
1317 {
1318 struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *data = _data;
1319 hwloc_bitmap_t cpuset = data->cpuset;
1320 hwloc_bitmap_t tidset = data->tidset;
1321
1322 if (hwloc_linux_get_tid_last_cpu_location(topology, tid, tidset))
1323 return -1;
1324
1325 /* reset the cpuset on first iteration */
1326 if (!idx)
1327 hwloc_bitmap_zero(cpuset);
1328
1329 hwloc_bitmap_or(cpuset, cpuset, tidset);
1330 return 0;
1331 }
1332
1333 static int
1334 hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1335 {
1336 struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
1337 hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
1338 int ret;
1339
1340 data.cpuset = hwloc_set;
1341 data.tidset = tidset;
1342 ret = hwloc_linux_foreach_proc_tid(topology, pid,
1343 hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
1344 &data);
1345 hwloc_bitmap_free(tidset);
1346 return ret;
1347 }
1348
1349 static int
1350 hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1351 {
1352 if (pid == 0)
1353 pid = topology->pid;
1354 if (flags & HWLOC_CPUBIND_THREAD)
1355 return hwloc_linux_get_tid_last_cpu_location(topology, pid, hwloc_set);
1356 else
1357 return hwloc_linux_get_pid_last_cpu_location(topology, pid, hwloc_set, flags);
1358 }
1359
1360 static int
1361 hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1362 {
1363 return hwloc_linux_get_pid_last_cpu_location(topology, topology->pid, hwloc_set, flags);
1364 }
1365
1366 static int
1367 hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1368 {
1369 if (topology->pid) {
1370 errno = ENOSYS;
1371 return -1;
1372 }
1373 return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
1374 }
1375
1376
1377
1378 /***************************
1379 ****** Membind hooks ******
1380 ***************************/
1381
1382 #if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
1383 static int
1384 hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
1385 {
1386 switch (policy) {
1387 case HWLOC_MEMBIND_DEFAULT:
1388 case HWLOC_MEMBIND_FIRSTTOUCH:
1389 *linuxpolicy = MPOL_DEFAULT;
1390 break;
1391 case HWLOC_MEMBIND_BIND:
1392 if (flags & HWLOC_MEMBIND_STRICT)
1393 *linuxpolicy = MPOL_BIND;
1394 else
1395 *linuxpolicy = MPOL_PREFERRED;
1396 break;
1397 case HWLOC_MEMBIND_INTERLEAVE:
1398 *linuxpolicy = MPOL_INTERLEAVE;
1399 break;
1400 /* TODO: next-touch when (if?) patch applied upstream */
1401 default:
1402 errno = ENOSYS;
1403 return -1;
1404 }
1405 return 0;
1406 }
1407
1408 static int
1409 hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1410 hwloc_const_nodeset_t nodeset,
1411 unsigned *max_os_index_p, unsigned long **linuxmaskp)
1412 {
1413 unsigned max_os_index = 0; /* highest os_index + 1 */
1414 unsigned long *linuxmask;
1415 unsigned i;
1416 hwloc_nodeset_t linux_nodeset = NULL;
1417
1418 if (hwloc_bitmap_isfull(nodeset)) {
1419 linux_nodeset = hwloc_bitmap_alloc();
1420 hwloc_bitmap_only(linux_nodeset, 0);
1421 nodeset = linux_nodeset;
1422 }
1423
1424 max_os_index = hwloc_bitmap_last(nodeset);
1425 if (max_os_index == (unsigned) -1)
1426 max_os_index = 0;
1427 /* add 1 to convert the last os_index into a max_os_index,
1428 * and round up to the nearest multiple of BITS_PER_LONG */
1429 max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
1430
1431 linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
1432 if (!linuxmask) {
1433 hwloc_bitmap_free(linux_nodeset);
1434 errno = ENOMEM;
1435 return -1;
1436 }
1437
1438 for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1439 linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);
1440
1441 if (linux_nodeset)
1442 hwloc_bitmap_free(linux_nodeset);
1443
1444 *max_os_index_p = max_os_index;
1445 *linuxmaskp = linuxmask;
1446 return 0;
1447 }
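/* Worked example (assuming 64-bit longs): a nodeset containing NUMA nodes #0
 * and #2 has last bit 2, so max_os_index is rounded up to 64 and the resulting
 * linuxmask is a single ulong with value 0x5. */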
1448
1449 static void
1450 hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1451 hwloc_nodeset_t nodeset,
1452 unsigned max_os_index, const unsigned long *linuxmask)
1453 {
1454 unsigned i;
1455
1456 #ifdef HWLOC_DEBUG
1457 /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
1458 assert(!(max_os_index%HWLOC_BITS_PER_LONG));
1459 #endif
1460
1461 hwloc_bitmap_zero(nodeset);
1462 for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1463 hwloc_bitmap_set_ith_ulong(nodeset, i, linuxmask[i]);
1464 }
1465 #endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
1466
1467 #ifdef HWLOC_HAVE_MBIND
1468 static int
1469 hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1470 {
1471 unsigned max_os_index; /* highest os_index + 1 */
1472 unsigned long *linuxmask;
1473 size_t remainder;
1474 int linuxpolicy;
1475 unsigned linuxflags = 0;
1476 int err;
1477
1478 remainder = (uintptr_t) addr & (hwloc_getpagesize()-1);
1479 addr = (char*) addr - remainder;
1480 len += remainder;
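/* e.g. (illustrative) with 4kB pages, addr=0x1003 and len=0x20 become
 * addr=0x1000 and len=0x23, so the whole first page is covered. */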
1481
1482 err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1483 if (err < 0)
1484 return err;
1485
1486 if (linuxpolicy == MPOL_DEFAULT)
1487 /* Some Linux kernels don't like being passed a set */
1488 return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
1489
1490 err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1491 if (err < 0)
1492 goto out;
1493
1494 if (flags & HWLOC_MEMBIND_MIGRATE) {
1495 #ifdef MPOL_MF_MOVE
1496 linuxflags = MPOL_MF_MOVE;
1497 if (flags & HWLOC_MEMBIND_STRICT)
1498 linuxflags |= MPOL_MF_STRICT;
1499 #else
1500 if (flags & HWLOC_MEMBIND_STRICT) {
1501 errno = ENOSYS;
1502 goto out_with_mask;
1503 }
1504 #endif
1505 }
1506
1507 err = mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
1508 if (err < 0)
1509 goto out_with_mask;
1510
1511 free(linuxmask);
1512 return 0;
1513
1514 out_with_mask:
1515 free(linuxmask);
1516 out:
1517 return -1;
1518 }
1519
1520 static void *
1521 hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1522 {
1523 void *buffer;
1524 int err;
1525
1526 buffer = hwloc_alloc_mmap(topology, len);
1527 if (!buffer)
1528 return NULL;
1529
1530 err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
1531 if (err < 0 && policy & HWLOC_MEMBIND_STRICT) {
1532 munmap(buffer, len);
1533 return NULL;
1534 }
1535
1536 return buffer;
1537 }
1538 #endif /* HWLOC_HAVE_MBIND */
1539
1540 #ifdef HWLOC_HAVE_SET_MEMPOLICY
1541 static int
1542 hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1543 {
1544 unsigned max_os_index; /* highest os_index + 1 */
1545 unsigned long *linuxmask;
1546 int linuxpolicy;
1547 int err;
1548
1549 err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1550 if (err < 0)
1551 return err;
1552
1553 if (linuxpolicy == MPOL_DEFAULT)
1554 /* Some Linux kernels don't like being passed a set */
1555 return set_mempolicy(linuxpolicy, NULL, 0);
1556
1557 err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1558 if (err < 0)
1559 goto out;
1560
1561 if (flags & HWLOC_MEMBIND_MIGRATE) {
1562 #ifdef HWLOC_HAVE_MIGRATE_PAGES
1563 unsigned long *fullmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1564 if (fullmask) {
1565 memset(fullmask, 0xf, max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1566 err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
1567 free(fullmask);
1568 } else
1569 err = -1;
1570 if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
1571 goto out_with_mask;
1572 #else
1573 errno = ENOSYS;
1574 goto out_with_mask;
1575 #endif
1576 }
1577
1578 err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
1579 if (err < 0)
1580 goto out_with_mask;
1581
1582 free(linuxmask);
1583 return 0;
1584
1585 out_with_mask:
1586 free(linuxmask);
1587 out:
1588 return -1;
1589 }
1590
1591 /*
1592 * On some kernels, get_mempolicy requires the output size to be larger
1593 * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
1594 * Try get_mempolicy on ourself until we find a max_os_index value that
1595 * makes the kernel happy.
1596 */
1597 static int
1598 hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
1599 {
1600 static int _max_numnodes = -1, max_numnodes;
1601 int linuxpolicy;
1602
1603 if (_max_numnodes != -1)
1604 /* already computed */
1605 return _max_numnodes;
1606
1607 /* start with a single ulong, it's the minimal and it's enough for most machines */
1608 max_numnodes = HWLOC_BITS_PER_LONG;
1609 while (1) {
1610 unsigned long *mask = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(long));
1611 int err = get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
1612 free(mask);
1613 if (!err || errno != EINVAL)
1614 /* Found it. Only update the static value with the final one,
1615 * to avoid sharing intermediate values that we modify,
1616 * in case there's ever multiple concurrent calls.
1617 */
1618 return _max_numnodes = max_numnodes;
1619 max_numnodes *= 2;
1620 }
1621 }
1622
1623 static int
1624 hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
1625 {
1626 switch (linuxpolicy) {
1627 case MPOL_DEFAULT:
1628 *policy = HWLOC_MEMBIND_FIRSTTOUCH;
1629 return 0;
1630 case MPOL_PREFERRED:
1631 case MPOL_BIND:
1632 *policy = HWLOC_MEMBIND_BIND;
1633 return 0;
1634 case MPOL_INTERLEAVE:
1635 *policy = HWLOC_MEMBIND_INTERLEAVE;
1636 return 0;
1637 default:
1638 errno = EINVAL;
1639 return -1;
1640 }
1641 }
1642
1643 static int
1644 hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1645 {
1646 unsigned max_os_index;
1647 unsigned long *linuxmask;
1648 int linuxpolicy;
1649 int err;
1650
1651 max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1652
1653 linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1654 if (!linuxmask) {
1655 errno = ENOMEM;
1656 goto out;
1657 }
1658
1659 err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
1660 if (err < 0)
1661 goto out_with_mask;
1662
1663 if (linuxpolicy == MPOL_DEFAULT) {
1664 hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1665 } else {
1666 hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
1667 }
1668
1669 err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1670 if (err < 0)
1671 goto out_with_mask;
1672
1673 free(linuxmask);
1674 return 0;
1675
1676 out_with_mask:
1677 free(linuxmask);
1678 out:
1679 return -1;
1680 }
1681
1682 static int
1683 hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1684 {
1685 unsigned max_os_index;
1686 unsigned long *linuxmask, *globallinuxmask;
1687 int linuxpolicy, globallinuxpolicy = 0;
1688 int mixed = 0;
1689 int full = 0;
1690 int first = 1;
1691 int pagesize = hwloc_getpagesize();
1692 char *tmpaddr;
1693 int err;
1694 unsigned i;
1695
1696 max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1697
1698 linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1699 if (!linuxmask) {
1700 errno = ENOMEM;
1701 goto out;
1702 }
1703 globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
1704 if (!globallinuxmask) {
1705 errno = ENOMEM;
1706 goto out_with_masks;
1707 }
1708
1709 for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
1710 tmpaddr < (char *)addr + len;
1711 tmpaddr += pagesize) {
1712 err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
1713 if (err < 0)
1714 goto out_with_masks;
1715
1716 /* use the first found policy. if we find a different one later, set mixed to 1 */
1717 if (first)
1718 globallinuxpolicy = linuxpolicy;
1719 else if (globallinuxpolicy != linuxpolicy)
1720 mixed = 1;
1721
1722 /* aggregate masks, and set full to 1 if we ever find DEFAULT */
1723 if (full || linuxpolicy == MPOL_DEFAULT) {
1724 full = 1;
1725 } else {
1726 for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1727 globallinuxmask[i] |= linuxmask[i];
1728 }
1729
1730 first = 0;
1731 }
1732
1733 if (mixed) {
1734 *policy = HWLOC_MEMBIND_MIXED;
1735 } else {
1736 err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1737 if (err < 0)
1738 goto out_with_masks;
1739 }
1740
1741 if (full) {
1742 hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1743 } else {
1744 hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
1745 }
1746
1747 free(globallinuxmask);
1748 free(linuxmask);
1749 return 0;
1750
1751 out_with_masks:
1752 free(globallinuxmask);
1753 free(linuxmask);
1754 out:
1755 return -1;
1756 }
1757
1758 #endif /* HWLOC_HAVE_SET_MEMPOLICY */
1759
1760 #ifdef HWLOC_HAVE_MOVE_PAGES
1761 static int
1762 hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
1763 {
1764 unsigned offset;
1765 unsigned long count;
1766 void **pages;
1767 int *status;
1768 int pagesize = hwloc_getpagesize();
1769 int ret;
1770 unsigned i;
1771
1772 offset = ((unsigned long) addr) & (pagesize-1);
1773 addr = ((char*) addr) - offset;
1774 len += offset;
1775 count = (len + pagesize-1)/pagesize;
1776 pages = malloc(count*sizeof(*pages));
1777 status = malloc(count*sizeof(*status));
1778 if (!pages || !status) {
1779 ret = -1;
1780 goto out_with_pages;
1781 }
1782
1783 for(i=0; i<count; i++)
1784 pages[i] = ((char*)addr) + i*pagesize;
1785
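/* move_pages() with a NULL target-node array migrates nothing: it only fills
 * status[] with the node currently hosting each page, or a negative errno for
 * pages that are not allocated yet. */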
1786 ret = move_pages(0, count, pages, NULL, status, 0);
1787 if (ret < 0)
1788 goto out_with_pages;
1789
1790 hwloc_bitmap_zero(nodeset);
1791 for(i=0; i<count; i++)
1792 if (status[i] >= 0)
1793 hwloc_bitmap_set(nodeset, status[i]);
1794 ret = 0;
1795
1796 out_with_pages:
1797 free(pages);
1798 free(status);
1799 return ret;
1800 }
1801 #endif /* HWLOC_HAVE_MOVE_PAGES */
1802
1803 static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep);
1804
1805 static int hwloc_linux_get_allowed_resources_hook(hwloc_topology_t topology)
1806 {
1807 const char *fsroot_path;
1808 char *cpuset_name;
1809 int root_fd = -1;
1810
1811 fsroot_path = getenv("HWLOC_FSROOT");
1812 if (!fsroot_path)
1813 fsroot_path = "/";
1814
1815 #ifdef HAVE_OPENAT
1816 root_fd = open(fsroot_path, O_RDONLY | O_DIRECTORY);
1817 if (root_fd < 0)
1818 goto out;
1819 #else
1820 if (strcmp(fsroot_path, "/")) {
1821 errno = ENOSYS;
1822 goto out;
1823 }
1824 #endif
1825
1826 /* we could also error-out if the current topology doesn't actually match the system,
1827 * at least for PUs and NUMA nodes. But it would increase the overhead of loading XMLs.
1828 *
1829 * Just trust the user when he sets THISSYSTEM=1. It enables hacky
1830 * tests such as restricting random XML or synthetic to the current
1831 * machine (uses the default cgroup).
1832 */
1833
1834 hwloc_linux__get_allowed_resources(topology, fsroot_path, root_fd, &cpuset_name);
1835 if (cpuset_name) {
1836 hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
1837 free(cpuset_name);
1838 }
1839 if (root_fd != -1)
1840 close(root_fd);
1841 return 0;

1842 out:
1843 return -1;
1844 }
1845
1846 void
1847 hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
1848 struct hwloc_topology_support *support __hwloc_attribute_unused)
1849 {
1850 hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
1851 hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
1852 hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
1853 hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
1854 hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
1855 hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
1856 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
1857 hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
1858 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
1859 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
1860 hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
1861 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
1862 hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
1863 hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
1864 hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
1865 #ifdef HWLOC_HAVE_SET_MEMPOLICY
1866 hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
1867 hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
1868 hooks->get_area_membind = hwloc_linux_get_area_membind;
1869 #endif /* HWLOC_HAVE_SET_MEMPOLICY */
1870 #ifdef HWLOC_HAVE_MBIND
1871 hooks->set_area_membind = hwloc_linux_set_area_membind;
1872 #ifdef HWLOC_HAVE_MOVE_PAGES
1873 hooks->get_area_memlocation = hwloc_linux_get_area_memlocation;
1874 #endif /* HWLOC_HAVE_MOVE_PAGES */
1875 hooks->alloc_membind = hwloc_linux_alloc_membind;
1876 hooks->alloc = hwloc_alloc_mmap;
1877 hooks->free_membind = hwloc_free_mmap;
1878 support->membind->firsttouch_membind = 1;
1879 support->membind->bind_membind = 1;
1880 support->membind->interleave_membind = 1;
1881 #endif /* HWLOC_HAVE_MBIND */
1882 #if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
1883 support->membind->migrate_membind = 1;
1884 #endif
1885 hooks->get_allowed_resources = hwloc_linux_get_allowed_resources_hook;
1886 }
1887
1888
1889 /*******************************************
1890 *** Misc Helpers for Topology Discovery ***
1891 *******************************************/
1892
1893 /* cpuinfo array */
1894 struct hwloc_linux_cpuinfo_proc {
1895 /* set during hwloc_linux_parse_cpuinfo */
1896 unsigned long Pproc;
1897 /* set during hwloc_linux_parse_cpuinfo or -1 if unknown*/
1898 long Pcore, Ppkg;
1899 /* set later, or -1 if unknown */
1900 long Lcore, Lpkg;
1901
1902 /* custom info, set during hwloc_linux_parse_cpuinfo */
1903 struct hwloc_obj_info_s *infos;
1904 unsigned infos_count;
1905 };
1906
1907 /* deprecated but still needed in hwloc/linux.h for backward compat */
1908 int
1909 hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
1910 {
1911 unsigned long *maps;
1912 unsigned long map;
1913 int nr_maps = 0;
1914 static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
1915 * Actually, it may increase multiple times if first read cpumaps start with zeroes.
1916 */
1917 int nr_maps_allocated = _nr_maps_allocated;
1918 int i;
1919
1920 maps = malloc(nr_maps_allocated * sizeof(*maps));
1921 if (!maps)
1922 return -1;
1923
1924 /* reset to zero first */
1925 hwloc_bitmap_zero(set);
1926
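/* Kernel cpumap files are comma-separated fixed-width hexadecimal words with the
 * most-significant word first. For instance, assuming 32-bit kernel mask words,
 * a hypothetical "00000001,00000300" describes PUs 8, 9 and 32. Words are stored
 * here in file order and reversed when converted to the bitmap below. */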
1927 /* parse the whole mask */
1928 while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
1929 {
1930 if (nr_maps == nr_maps_allocated) {
1931 unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
1932 if (!tmp) {
1933 free(maps);
1934 return -1;
1935 }
1936 maps = tmp;
1937 nr_maps_allocated *= 2;
1938 }
1939
1940 if (!map && !nr_maps)
1941 /* ignore the first map if it's empty */
1942 continue;
1943
1944 maps[nr_maps++] = map;
1945 }
1946
1947 /* convert into a set */
1948 #if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
1949 for(i=0; i<nr_maps; i++)
1950 hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
1951 #else
1952 for(i=0; i<(nr_maps+1)/2; i++) {
1953 unsigned long mask;
1954 mask = maps[nr_maps-2*i-1];
1955 if (2*i+1<nr_maps)
1956 mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
1957 hwloc_bitmap_set_ith_ulong(set, i, mask);
1958 }
1959 #endif
1960
1961 free(maps);
1962
1963 /* Only update the static value with the final one,
1964 * to avoid sharing intermediate values that we modify,
1965 * in case there's ever multiple concurrent calls.
1966 */
1967 if (nr_maps_allocated > _nr_maps_allocated)
1968 _nr_maps_allocated = nr_maps_allocated;
1969 return 0;
1970 }
1971
1972 static void
1973 hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, const char *root_path)
1974 {
1975 char *mount_path;
1976 struct mntent mntent;
1977 FILE *fd;
1978 int err;
1979 size_t bufsize;
1980 char *buf;
1981
1982 *cgroup_mntpnt = NULL;
1983 *cpuset_mntpnt = NULL;
1984
1985 if (root_path) {
1986 /* setmntent() doesn't support openat(), so use the root_path directly */
1987 err = asprintf(&mount_path, "%s/proc/mounts", root_path);
1988 if (err < 0)
1989 return;
1990 fd = setmntent(mount_path, "r");
1991 free(mount_path);
1992 } else {
1993 fd = setmntent("/proc/mounts", "r");
1994 }
1995 if (!fd)
1996 return;
1997
1998 /* getmntent_r() doesn't actually report an error when the buffer
1999 * is too small. It just silently truncates things. So we can't
2000 * dynamically resize things.
2001 *
2002 * Linux limits mount type, string, and options to one page each.
2003 * getmntent() limits the line size to 4kB.
2004 * so use 4*pagesize to be far above both.
2005 */
2006 bufsize = hwloc_getpagesize()*4;
2007 buf = malloc(bufsize);
2008
2009 while (getmntent_r(fd, &mntent, buf, bufsize)) {
2010 if (!strcmp(mntent.mnt_type, "cpuset")) {
2011 hwloc_debug("Found cpuset mount point on %s\n", mntent.mnt_dir);
2012 *cpuset_mntpnt = strdup(mntent.mnt_dir);
2013 break;
2014 } else if (!strcmp(mntent.mnt_type, "cgroup")) {
2015 /* found a cgroup mntpnt */
2016 char *opt, *opts = mntent.mnt_opts;
2017 int cpuset_opt = 0;
2018 int noprefix_opt = 0;
2019 /* look at options */
2020 while ((opt = strsep(&opts, ",")) != NULL) {
2021 if (!strcmp(opt, "cpuset"))
2022 cpuset_opt = 1;
2023 else if (!strcmp(opt, "noprefix"))
2024 noprefix_opt = 1;
2025 }
2026 if (!cpuset_opt)
2027 continue;
2028 if (noprefix_opt) {
2029 hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", mntent.mnt_dir);
2030 *cpuset_mntpnt = strdup(mntent.mnt_dir);
2031 } else {
2032 hwloc_debug("Found cgroup/cpuset mount point on %s\n", mntent.mnt_dir);
2033 *cgroup_mntpnt = strdup(mntent.mnt_dir);
2034 }
2035 break;
2036 }
2037 }
2038
2039 free(buf);
2040 endmntent(fd);
2041 }
2042
2043 /*
2044 * Linux cpusets may be managed directly or through cgroup.
2045 * If cgroup is used, tasks get a /proc/pid/cgroup which may contain a
2046 * single line %d:cpuset:<name>. If plain cpusets are used, tasks get /proc/pid/cpuset
2047 * containing <name>.
2048 */
2049 static char *
2050 hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
2051 {
2052 #define CPUSET_NAME_LEN 128
2053 char cpuset_name[CPUSET_NAME_LEN];
2054 FILE *file;
2055 int err;
2056 char *tmp;
2057
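/* A cgroup v1 cpuset line in /proc/<pid>/cgroup typically looks like
 * "3:cpuset:/mygroup" (the hierarchy id and group name here are examples);
 * everything after the ":cpuset:" token is returned as the name. */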
2058 /* check whether a cgroup-cpuset is enabled */
2059 if (!pid)
2060 file = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
2061 else {
2062 char path[] = "/proc/XXXXXXXXXX/cgroup";
2063 snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
2064 file = hwloc_fopen(path, "r", fsroot_fd);
2065 }
2066 if (file) {
2067 /* find a cpuset line */
2068 #define CGROUP_LINE_LEN 256
2069 char line[CGROUP_LINE_LEN];
2070 while (fgets(line, sizeof(line), file)) {
2071 char *end, *colon = strchr(line, ':');
2072 if (!colon)
2073 continue;
2074 if (strncmp(colon, ":cpuset:", 8))
2075 continue;
2076
2077 /* found a cgroup-cpuset line, return the name */
2078 fclose(file);
2079 end = strchr(colon, '\n');
2080 if (end)
2081 *end = '\0';
2082 hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
2083 return strdup(colon+8);
2084 }
2085 fclose(file);
2086 }
2087
2088 /* check whether a cpuset is enabled */
2089 if (!pid)
2090 err = hwloc_read_path_by_length("/proc/self/cpuset", cpuset_name, sizeof(cpuset_name), fsroot_fd);
2091 else {
2092 char path[] = "/proc/XXXXXXXXXX/cpuset";
2093 snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
2094 err = hwloc_read_path_by_length(path, cpuset_name, sizeof(cpuset_name), fsroot_fd);
2095 }
2096 if (err < 0) {
2097 /* found nothing */
2098 hwloc_debug("%s", "No cgroup or cpuset found\n");
2099 return NULL;
2100 }
2101
2102 /* found a cpuset, return the name */
2103 tmp = strchr(cpuset_name, '\n');
2104 if (tmp)
2105 *tmp = '\0';
2106 hwloc_debug("Found cpuset %s\n", cpuset_name);
2107 return strdup(cpuset_name);
2108 }
2109
2110 /*
2111 * Then, the cpuset description is available from either the cgroup or
2112 * the cpuset filesystem (usually mounted in / or /dev) where there
2113 * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
2114 */
2115 static void
2116 hwloc_admin_disable_set_from_cpuset(int root_fd,
2117 const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
2118 const char *attr_name,
2119 hwloc_bitmap_t admin_enabled_cpus_set)
2120 {
2121 #define CPUSET_FILENAME_LEN 256
2122 char cpuset_filename[CPUSET_FILENAME_LEN];
2123 int fd;
2124 int err;
2125
2126 if (cgroup_mntpnt) {
2127 /* try to read the cpuset from cgroup */
2128 snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
2129 hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
2130 } else if (cpuset_mntpnt) {
2131 /* try to read the cpuset directly */
2132 snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
2133 hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
2134 }
2135
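/* Callers are expected to pass at least one non-NULL mount point;
 * otherwise cpuset_filename would be used uninitialized below. */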
2136 fd = hwloc_open(cpuset_filename, root_fd);
2137 if (fd < 0) {
2138 /* found no cpuset description, ignore it */
2139 hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
2140 return;
2141 }
2142
2143 err = hwloc__read_fd_as_cpulist(fd, admin_enabled_cpus_set);
2144 close(fd);
2145
2146 if (err < 0)
2147 hwloc_bitmap_fill(admin_enabled_cpus_set);
2148 else
2149 hwloc_debug_bitmap("cpuset includes %s\n", admin_enabled_cpus_set);
2150 }
2151
2152 static void
2153 hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
2154 const char *path,
2155 uint64_t *local_memory,
2156 uint64_t *meminfo_hugepages_count,
2157 uint64_t *meminfo_hugepages_size,
2158 int onlytotal)
2159 {
2160 char *tmp;
2161 char buffer[4096];
2162 unsigned long long number;
2163
2164 if (hwloc_read_path_by_length(path, buffer, sizeof(buffer), data->root_fd) < 0)
2165 return;
2166
2167 tmp = strstr(buffer, "MemTotal: "); /* MemTotal: %llu kB */
2168 if (tmp) {
2169 number = strtoull(tmp+10, NULL, 10);
2170 *local_memory = number << 10;
2171
2172 if (onlytotal)
2173 return;
2174
2175 tmp = strstr(tmp, "Hugepagesize: "); /* Hugepagesize: %llu */
2176 if (tmp) {
2177 number = strtoull(tmp+14, NULL, 10);
2178 *meminfo_hugepages_size = number << 10;
2179
2180 tmp = strstr(tmp, "HugePages_Free: "); /* HugePages_Free: %llu */
2181 if (tmp) {
2182 number = strtoull(tmp+16, NULL, 10);
2183 *meminfo_hugepages_count = number;
2184 }
2185 }
2186 }
2187 }
2188
2189 #define SYSFS_NUMA_NODE_PATH_LEN 128
2190
2191 static void
2192 hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
2193 const char *dirpath,
2194 struct hwloc_obj_memory_s *memory,
2195 uint64_t *remaining_local_memory)
2196 {
2197 DIR *dir;
2198 struct dirent *dirent;
2199 unsigned long index_ = 1;
2200 char line[64];
2201 char path[SYSFS_NUMA_NODE_PATH_LEN];
2202
2203 dir = hwloc_opendir(dirpath, data->root_fd);
2204 if (dir) {
2205 while ((dirent = readdir(dir)) != NULL) {
2206 if (strncmp(dirent->d_name, "hugepages-", 10))
2207 continue;
2208 memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
2209 sprintf(path, "%s/%s/nr_hugepages", dirpath, dirent->d_name);
2210 if (!hwloc_read_path_by_length(path, line, sizeof(line), data->root_fd)) {
2211 /* these are the actual total amount of huge pages */
2212 memory->page_types[index_].count = strtoull(line, NULL, 0);
2213 *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
2214 index_++;
2215 }
2216 }
2217 closedir(dir);
2218 memory->page_types_len = index_;
2219 }
2220 }
2221
2222 static void
2223 hwloc_get_kerrighed_node_meminfo_info(struct hwloc_topology *topology,
2224 struct hwloc_linux_backend_data_s *data,
2225 unsigned long node, struct hwloc_obj_memory_s *memory)
2226 {
2227 char path[128];
2228 uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
2229
2230 if (topology->is_thissystem) {
2231 memory->page_types_len = 2;
2232 memory->page_types = malloc(2*sizeof(*memory->page_types));
2233 memset(memory->page_types, 0, 2*sizeof(*memory->page_types));
2234 /* Try to get the hugepage size from sysconf in case we fail to get it from /proc/meminfo later */
2235 #ifdef HAVE__SC_LARGE_PAGESIZE
2236 memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
2237 #endif
2238 memory->page_types[0].size = data->pagesize;
2239 }
2240
2241 snprintf(path, sizeof(path), "/proc/nodes/node%lu/meminfo", node);
2242 hwloc_parse_meminfo_info(data, path,
2243 &memory->local_memory,
2244 &meminfo_hugepages_count, &meminfo_hugepages_size,
2245 memory->page_types == NULL);
2246
2247 if (memory->page_types) {
2248 uint64_t remaining_local_memory = memory->local_memory;
2249 if (meminfo_hugepages_size) {
2250 memory->page_types[1].size = meminfo_hugepages_size;
2251 memory->page_types[1].count = meminfo_hugepages_count;
2252 remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2253 } else {
2254 memory->page_types_len = 1;
2255 }
2256 memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2257 }
2258 }
2259
2260 static void
2261 hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
2262 struct hwloc_linux_backend_data_s *data,
2263 struct hwloc_obj_memory_s *memory)
2264 {
2265 uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
2266 struct stat st;
2267 int has_sysfs_hugepages = 0;
2268 const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
2269 int types = 2;
2270 int err;
2271
2272 err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
2273 if (!err) {
2274 types = 1 + st.st_nlink-2;
2275 has_sysfs_hugepages = 1;
2276 }
2277
2278 if (topology->is_thissystem || pagesize_env) {
2279 /* we cannot report any page_type info unless we have the page size.
2280 * we'll take it either from the system if local, or from the debug env variable
2281 */
2282 memory->page_types_len = types;
2283 memory->page_types = calloc(types, sizeof(*memory->page_types));
2284 }
2285
2286 if (topology->is_thissystem) {
2287 /* Get the page and hugepage sizes from sysconf */
2288 #if HAVE_DECL__SC_LARGE_PAGESIZE
2289 memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
2290 #endif
2291 memory->page_types[0].size = data->pagesize; /* might be overwritten later by /proc/meminfo or sysfs */
2292 }
2293
2294 hwloc_parse_meminfo_info(data, "/proc/meminfo",
2295 &memory->local_memory,
2296 &meminfo_hugepages_count, &meminfo_hugepages_size,
2297 memory->page_types == NULL);
2298
2299 if (memory->page_types) {
2300 uint64_t remaining_local_memory = memory->local_memory;
2301 if (has_sysfs_hugepages) {
2302 /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2303 hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
2304 } else {
2305 /* use what we found in meminfo */
2306 if (meminfo_hugepages_size) {
2307 memory->page_types[1].size = meminfo_hugepages_size;
2308 memory->page_types[1].count = meminfo_hugepages_count;
2309 remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2310 } else {
2311 memory->page_types_len = 1;
2312 }
2313 }
2314
2315 if (pagesize_env) {
2316 /* We cannot get the pagesize if not thissystem, use the env-given one to exercise the code during make check */
2317 memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
2318 /* If failed, use 4kB */
2319 if (!memory->page_types[0].size)
2320 memory->page_types[0].size = 4096;
2321 }
2322 assert(memory->page_types[0].size); /* from sysconf if local or from the env */
2323 /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
2324 * may be 0 if no hugepage support in the kernel */
2325
2326 memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2327 }
2328 }
2329
2330 static void
2331 hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
2332 struct hwloc_linux_backend_data_s *data,
2333 const char *syspath, int node,
2334 struct hwloc_obj_memory_s *memory)
2335 {
2336 char path[SYSFS_NUMA_NODE_PATH_LEN];
2337 char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
2338 uint64_t meminfo_hugepages_count = 0;
2339 uint64_t meminfo_hugepages_size = 0;
2340 struct stat st;
2341 int has_sysfs_hugepages = 0;
2342 int types = 2;
2343 int err;
2344
2345 sprintf(path, "%s/node%d/hugepages", syspath, node);
2346 err = hwloc_stat(path, &st, data->root_fd);
2347 if (!err) {
2348 types = 1 + st.st_nlink-2;
2349 has_sysfs_hugepages = 1;
2350 }
2351
2352 if (topology->is_thissystem) {
2353 memory->page_types_len = types;
2354 memory->page_types = malloc(types*sizeof(*memory->page_types));
2355 memset(memory->page_types, 0, types*sizeof(*memory->page_types));
2356 }
2357
2358 sprintf(meminfopath, "%s/node%d/meminfo", syspath, node);
2359 hwloc_parse_meminfo_info(data, meminfopath,
2360 &memory->local_memory,
2361 &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
2362 memory->page_types == NULL);
2363
2364 if (memory->page_types) {
2365 uint64_t remaining_local_memory = memory->local_memory;
2366 if (has_sysfs_hugepages) {
2367 /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2368 hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
2369 } else {
2370 /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
2371 * hwloc_get_procfs_meminfo_info must have been called earlier */
2372 meminfo_hugepages_size = topology->levels[0][0]->memory.page_types[1].size;
2373 /* use what we found in meminfo */
2374 if (meminfo_hugepages_size) {
2375 memory->page_types[1].count = meminfo_hugepages_count;
2376 memory->page_types[1].size = meminfo_hugepages_size;
2377 remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2378 } else {
2379 memory->page_types_len = 1;
2380 }
2381 }
2382 /* update what's remaining as normal pages */
2383 memory->page_types[0].size = data->pagesize;
2384 memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2385 }
2386 }
2387
2388 static int
2389 hwloc_parse_nodes_distances(const char *path, unsigned nbnodes, unsigned *indexes, float *distances, int fsroot_fd)
2390 {
2391 size_t len = (10+1)*nbnodes;
2392 float *curdist = distances;
2393 char *string;
2394 unsigned i;
2395
2396 string = malloc(len); /* space-separated %d */
2397 if (!string)
2398 goto out;
2399
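/* Each nodeX/distance file holds one space-separated integer per node, in node order.
 * For example, on a hypothetical 2-node machine, node0/distance may contain "10 21"
 * and node1/distance "21 10" (ACPI SLIT-style relative latencies). */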
2400 for(i=0; i<nbnodes; i++) {
2401 unsigned osnode = indexes[i];
2402 char distancepath[SYSFS_NUMA_NODE_PATH_LEN];
2403 char *tmp, *next;
2404 unsigned found;
2405
2406 /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
2407 * store them in slots X*N...X*N+N-1 */
2408 sprintf(distancepath, "%s/node%u/distance", path, osnode);
2409 if (hwloc_read_path_by_length(distancepath, string, len, fsroot_fd) < 0)
2410 goto out_with_string;
2411
2412 tmp = string;
2413 found = 0;
2414 while (tmp) {
2415 unsigned distance = strtoul(tmp, &next, 0); /* stored as a %d */
2416 if (next == tmp)
2417 break;
2418 *curdist = (float) distance;
2419 curdist++;
2420 found++;
2421 if (found == nbnodes)
2422 break;
2423 tmp = next+1;
2424 }
2425 if (found != nbnodes)
2426 goto out_with_string;
2427 }
2428
2429 free(string);
2430 return 0;
2431
2432 out_with_string:
2433 free(string);
2434 out:
2435 return -1;
2436 }
2437
2438 static void
2439 hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
2440 hwloc_obj_t obj,
2441 char *path, unsigned pathlen,
2442 const char *dmi_name, const char *hwloc_name)
2443 {
2444 char dmi_line[64];
2445
2446 strcpy(path+pathlen, dmi_name);
2447 if (hwloc_read_path_by_length(path, dmi_line, sizeof(dmi_line), data->root_fd) < 0)
2448 return;
2449
2450 if (dmi_line[0] != '\0') {
2451 char *tmp = strchr(dmi_line, '\n');
2452 if (tmp)
2453 *tmp = '\0';
2454 hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
2455 hwloc_obj_add_info(obj, hwloc_name, dmi_line);
2456 }
2457 }
2458
2459 static void
2460 hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
2461 {
2462 char path[128];
2463 unsigned pathlen;
2464 DIR *dir;
2465
2466 strcpy(path, "/sys/devices/virtual/dmi/id");
2467 dir = hwloc_opendir(path, data->root_fd);
2468 if (dir) {
2469 pathlen = 27;
2470 } else {
2471 strcpy(path, "/sys/class/dmi/id");
2472 dir = hwloc_opendir(path, data->root_fd);
2473 if (dir)
2474 pathlen = 17;
2475 else
2476 return;
2477 }
2478 closedir(dir);
2479
2480 path[pathlen++] = '/';
2481
2482 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_name", "DMIProductName");
2483 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_version", "DMIProductVersion");
2484 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_serial", "DMIProductSerial");
2485 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_uuid", "DMIProductUUID");
2486 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_vendor", "DMIBoardVendor");
2487 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_name", "DMIBoardName");
2488 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_version", "DMIBoardVersion");
2489 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_serial", "DMIBoardSerial");
2490 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_asset_tag", "DMIBoardAssetTag");
2491 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_vendor", "DMIChassisVendor");
2492 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_type", "DMIChassisType");
2493 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_version", "DMIChassisVersion");
2494 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_serial", "DMIChassisSerial");
2495 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_asset_tag", "DMIChassisAssetTag");
2496 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_vendor", "DMIBIOSVendor");
2497 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_version", "DMIBIOSVersion");
2498 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_date", "DMIBIOSDate");
2499 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "sys_vendor", "DMISysVendor");
2500 }
2501
2502 struct hwloc_firmware_dmi_mem_device_header {
2503 unsigned char type;
2504 unsigned char length;
2505 unsigned char handle[2];
2506 unsigned char phy_mem_handle[2];
2507 unsigned char mem_err_handle[2];
2508 unsigned char tot_width[2];
2509 unsigned char dat_width[2];
2510 unsigned char size[2];
2511 unsigned char ff;
2512 unsigned char dev_set;
2513 unsigned char dev_loc_str_num;
2514 unsigned char bank_loc_str_num;
2515 unsigned char mem_type;
2516 unsigned char type_detail[2];
2517 unsigned char speed[2];
2518 unsigned char manuf_str_num;
2519 unsigned char serial_str_num;
2520 unsigned char asset_tag_str_num;
2521 unsigned char part_num_str_num;
2522 /* don't include the following fields since we don't need them,
2523 * some old implementations may miss them.
2524 */
2525 };
2526
2527 static int check_dmi_entry(const char *buffer)
2528 {
2529 /* reject empty strings */
2530 if (!*buffer)
2531 return 0;
2532 /* reject strings of spaces (at least Dell use this for empty memory slots) */
2533 if (strspn(buffer, " ") == strlen(buffer))
2534 return 0;
2535 return 1;
2536 }
2537
2538 static void
2539 hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
2540 unsigned idx, const char *path, FILE *fd,
2541 struct hwloc_firmware_dmi_mem_device_header *header)
2542 {
2543 unsigned slen;
2544 char buffer[256]; /* enough for memory device strings, or at least for each of them */
2545 unsigned foff; /* offset in raw file */
2546 unsigned boff; /* offset in buffer read from raw file */
2547 unsigned i;
2548 struct hwloc_obj_info_s *infos = NULL;
2549 unsigned infos_count = 0;
2550 hwloc_obj_t misc;
2551 int foundinfo = 0;
2552
2553 hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");
2554
2555 /* start after the header */
2556 foff = header->length;
2557 i = 1;
2558 while (1) {
2559 /* read one buffer */
2560 if (fseek(fd, foff, SEEK_SET) < 0)
2561 break;
2562 if (!fgets(buffer, sizeof(buffer), fd))
2563 break;
2564 /* read string at the beginning of the buffer */
2565 boff = 0;
2566 while (1) {
2567 /* stop on empty string */
2568 if (!buffer[boff])
2569 goto done;
2570 /* stop if this string goes to the end of the buffer */
2571 slen = strlen(buffer+boff);
2572 if (boff + slen+1 == sizeof(buffer))
2573 break;
2574 /* string didn't get truncated, should be OK */
2575 if (i == header->manuf_str_num) {
2576 if (check_dmi_entry(buffer+boff)) {
2577 hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
2578 foundinfo = 1;
2579 }
2580 } else if (i == header->serial_str_num) {
2581 if (check_dmi_entry(buffer+boff)) {
2582 hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
2583 foundinfo = 1;
2584 }
2585 } else if (i == header->asset_tag_str_num) {
2586 if (check_dmi_entry(buffer+boff)) {
2587 hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
2588 foundinfo = 1;
2589 }
2590 } else if (i == header->part_num_str_num) {
2591 if (check_dmi_entry(buffer+boff)) {
2592 hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
2593 foundinfo = 1;
2594 }
2595 } else if (i == header->dev_loc_str_num) {
2596 if (check_dmi_entry(buffer+boff)) {
2597 hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
2598 /* only a location, not an actual info about the device */
2599 }
2600 } else if (i == header->bank_loc_str_num) {
2601 if (check_dmi_entry(buffer+boff)) {
2602 hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
2603 /* only a location, not an actual info about the device */
2604 }
2605 } else {
2606 goto done;
2607 }
2608 /* next string in buffer */
2609 boff += slen+1;
2610 i++;
2611 }
2612 /* couldn't read a single full string from that buffer, we're screwed */
2613 if (!boff) {
2614 fprintf(stderr, "hwloc could not read a DMI firmware entry #%u in %s\n",
2615 i, path);
2616 break;
2617 }
2618 /* reread buffer after previous string */
2619 foff += boff;
2620 }
2621
2622 done:
2623 if (!foundinfo) {
2624 /* found no actual info about the device. if there's only location info, the slot may be empty */
2625 goto out_with_infos;
2626 }
2627
2628 misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
2629 if (!misc)
2630 goto out_with_infos;
2631
2632 hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
2633 /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
2634 * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
2635 * with the vendor, and it's hard to be 100% sure 'B' is second socket.
2636 * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
2637 * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
2638 */
2639 hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
2640 return;
2641
2642 out_with_infos:
2643 hwloc__free_infos(infos, infos_count);
2644 }
2645
2646 static void
2647 hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
2648 struct hwloc_linux_backend_data_s *data)
2649 {
2650 char path[128];
2651 unsigned i;
2652
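/* The kernel exposes raw SMBIOS entries under /sys/firmware/dmi/entries/<type>-<instance>/raw;
 * type 17 is "Memory Device". Each raw entry starts with the formatted fields described
 * by struct hwloc_firmware_dmi_mem_device_header, followed by the NUL-terminated strings
 * indexed by the *_str_num fields, which hwloc__get_firmware_dmi_memory_info_one() walks. */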
2653 for(i=0; ; i++) {
2654 FILE *fd;
2655 struct hwloc_firmware_dmi_mem_device_header header;
2656 int err;
2657
2658 snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
2659 fd = hwloc_fopen(path, "r", data->root_fd);
2660 if (!fd)
2661 break;
2662
2663 err = fread(&header, sizeof(header), 1, fd);
2664 if (err != 1) {
2665 fclose(fd);
2666 break;
2667 }
2668 if (header.length < sizeof(header)) {
2669 /* invalid, or too old entry/spec that doesn't contain what we need */
2670 fclose(fd);
2671 break;
2672 }
2673
2674 hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
2675
2676 fclose(fd);
2677 }
2678 }
2679
2680
2681 /***********************************
2682 ****** Device tree Discovery ******
2683 ***********************************/
2684
2685 /* Reads the entire file and returns bytes read if bytes_read != NULL
2686 * Returned pointer can be freed by using free(). */
2687 static void *
2688 hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
2689 {
2690 char fname[256];
2691 char *ret = NULL;
2692 struct stat fs;
2693 int file = -1;
2694
2695 snprintf(fname, sizeof(fname), "%s/%s", p, p1);
2696
2697 file = hwloc_open(fname, root_fd);
2698 if (-1 == file) {
2699 goto out_no_close;
2700 }
2701 if (fstat(file, &fs)) {
2702 goto out;
2703 }
2704
2705 ret = (char *) malloc(fs.st_size);
2706 if (NULL != ret) {
2707 ssize_t cb = read(file, ret, fs.st_size);
2708 if (cb == -1) {
2709 free(ret);
2710 ret = NULL;
2711 } else {
2712 if (NULL != bytes_read)
2713 *bytes_read = cb;
2714 }
2715 }
2716
2717 out:
2718 close(file);
2719 out_no_close:
2720 return ret;
2721 }
2722
2723 /* Reads the entire file and returns it as a 0-terminated string
2724 * Returned pointer can be freed by using free(). */
2725 static char *
2726 hwloc_read_str(const char *p, const char *p1, int root_fd)
2727 {
2728 size_t cb = 0;
2729 char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
2730 if ((NULL != ret) && (0 < cb) && (0 != ret[cb-1])) {
2731 char *tmp = realloc(ret, cb + 1);
2732 if (!tmp) {
2733 free(ret);
2734 return NULL;
2735 }
2736 ret = tmp;
2737 ret[cb] = 0;
2738 }
2739 return ret;
2740 }
2741
2742 /* Reads first 32bit bigendian value */
2743 static ssize_t
2744 hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
2745 {
2746 size_t cb = 0;
2747 uint32_t *tmp = hwloc_read_raw(p, p1, &cb, root_fd);
2748 if (sizeof(*buf) != cb) {
2749 errno = EINVAL;
2750 free(tmp); /* tmp is either NULL or contains useless things */
2751 return -1;
2752 }
2753 *buf = htonl(*tmp);
2754 free(tmp);
2755 return sizeof(*buf);
2756 }
2757
2758 typedef struct {
2759 unsigned int n, allocated;
2760 struct {
2761 hwloc_bitmap_t cpuset;
2762 uint32_t phandle;
2763 uint32_t l2_cache;
2764 char *name;
2765 } *p;
2766 } device_tree_cpus_t;
2767
2768 static void
2769 add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
2770 uint32_t l2_cache, uint32_t phandle, const char *name)
2771 {
2772 if (cpus->n == cpus->allocated) {
2773 void *tmp;
2774 unsigned allocated;
2775 if (!cpus->allocated)
2776 allocated = 64;
2777 else
2778 allocated = 2 * cpus->allocated;
2779 tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
2780 if (!tmp)
2781 return; /* failed to realloc, ignore this entry */
2782 cpus->p = tmp;
2783 cpus->allocated = allocated;
2784 }
2785 cpus->p[cpus->n].phandle = phandle;
2786 cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
2787 cpus->p[cpus->n].l2_cache = l2_cache;
2788 cpus->p[cpus->n].name = strdup(name);
2789 ++cpus->n;
2790 }
2791
2792 /* Walks over the cache list in order to detect nested caches and CPU mask for each */
2793 static int
2794 look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
2795 uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
2796 {
2797 unsigned int i;
2798 int ret = -1;
2799 if ((NULL == level) || (NULL == cpuset) || phandle == (uint32_t) -1)
2800 return ret;
2801 for (i = 0; i < cpus->n; ++i) {
2802 if (phandle != cpus->p[i].l2_cache)
2803 continue;
2804 if (NULL != cpus->p[i].cpuset) {
2805 hwloc_bitmap_or(cpuset, cpuset, cpus->p[i].cpuset);
2806 ret = 0;
2807 } else {
2808 ++(*level);
2809 if (0 == look_powerpc_device_tree_discover_cache(cpus,
2810 cpus->p[i].phandle, level, cpuset))
2811 ret = 0;
2812 }
2813 }
2814 return ret;
2815 }
2816
2817 static void
2818 try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2819 unsigned int level, hwloc_obj_cache_type_t type,
2820 uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
2821 hwloc_bitmap_t cpuset)
2822 {
2823 struct hwloc_obj *c = NULL;
2824
2825 if (0 == cache_size)
2826 return;
2827
2828 c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
2829 c->attr->cache.depth = level;
2830 c->attr->cache.linesize = cache_line_size;
2831 c->attr->cache.size = cache_size;
2832 c->attr->cache.type = type;
2833 if (cache_sets == 1)
2834 /* likely wrong, make it unknown */
2835 cache_sets = 0;
2836 if (cache_sets && cache_line_size)
2837 c->attr->cache.associativity = cache_size / (cache_sets * cache_line_size);
2838 else
2839 c->attr->cache.associativity = 0;
2840 c->cpuset = hwloc_bitmap_dup(cpuset);
2841 hwloc_debug_2args_bitmap("cache (%s) depth %d has cpuset %s\n",
2842 type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
2843 level, c->cpuset);
2844 hwloc_insert_object_by_cpuset(topology, c);
2845 }
2846
2847 static void
2848 try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2849 struct hwloc_linux_backend_data_s *data,
2850 const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
2851 {
2852 /* d-cache-block-size - ignore */
2853 /* d-cache-line-size - to read, in bytes */
2854 /* d-cache-sets - ignore */
2855 /* d-cache-size - to read, in bytes */
2856 /* i-cache, same for instruction */
2857 /* cache-unified only exist if data and instruction caches are unified */
2858 /* d-tlb-sets - ignore */
2859 /* d-tlb-size - ignore, always 0 on power6 */
2860 /* i-tlb-*, same */
2861 uint32_t d_cache_line_size = 0, d_cache_size = 0, d_cache_sets = 0;
2862 uint32_t i_cache_line_size = 0, i_cache_size = 0, i_cache_sets = 0;
2863 char unified_path[1024];
2864 struct stat statbuf;
2865 int unified;
2866
2867 snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
2868 unified = (hwloc_stat(unified_path, &statbuf, data->root_fd) == 0);
2869
2870 hwloc_read_unit32be(cpu, "d-cache-line-size", &d_cache_line_size,
2871 data->root_fd);
2872 hwloc_read_unit32be(cpu, "d-cache-size", &d_cache_size,
2873 data->root_fd);
2874 hwloc_read_unit32be(cpu, "d-cache-sets", &d_cache_sets,
2875 data->root_fd);
2876 hwloc_read_unit32be(cpu, "i-cache-line-size", &i_cache_line_size,
2877 data->root_fd);
2878 hwloc_read_unit32be(cpu, "i-cache-size", &i_cache_size,
2879 data->root_fd);
2880 hwloc_read_unit32be(cpu, "i-cache-sets", &i_cache_sets,
2881 data->root_fd);
2882
2883 if (!unified)
2884 try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
2885 i_cache_line_size, i_cache_size, i_cache_sets, cpuset);
2886 try__add_cache_from_device_tree_cpu(topology, level, unified ? HWLOC_OBJ_CACHE_UNIFIED : HWLOC_OBJ_CACHE_DATA,
2887 d_cache_line_size, d_cache_size, d_cache_sets, cpuset);
2888 }
2889
2890 /*
2891 * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
2892 * which provide NUMA nodes information without any details
2893 */
2894 static void
2895 look_powerpc_device_tree(struct hwloc_topology *topology,
2896 struct hwloc_linux_backend_data_s *data)
2897 {
2898 device_tree_cpus_t cpus;
2899 const char ofroot[] = "/proc/device-tree/cpus";
2900 unsigned int i;
2901 int root_fd = data->root_fd;
2902 DIR *dt = hwloc_opendir(ofroot, root_fd);
2903 struct dirent *dirent;
2904
2905 if (NULL == dt)
2906 return;
2907
2908 /* only works for Power so far, and not useful on ARM */
2909 if (data->arch != HWLOC_LINUX_ARCH_POWER) {
 closedir(dt); /* don't leak the DIR handle on non-Power architectures */
2910 return;
 }
2911
2912 cpus.n = 0;
2913 cpus.p = NULL;
2914 cpus.allocated = 0;
2915
2916 while (NULL != (dirent = readdir(dt))) {
2917 char cpu[256];
2918 char *device_type;
2919 uint32_t reg = -1, l2_cache = -1, phandle = -1;
2920
2921 if ('.' == dirent->d_name[0])
2922 continue;
2923
2924 snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
2925
2926 device_type = hwloc_read_str(cpu, "device_type", root_fd);
2927 if (NULL == device_type)
2928 continue;
2929
2930 hwloc_read_unit32be(cpu, "reg", &reg, root_fd);
2931 if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
2932 hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
2933 if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
2934 if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
2935 hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);
2936
2937 if (0 == strcmp(device_type, "cache")) {
2938 add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
2939 }
2940 else if (0 == strcmp(device_type, "cpu")) {
2941 /* Found CPU */
2942 hwloc_bitmap_t cpuset = NULL;
2943 size_t cb = 0;
2944 uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
2945 uint32_t nthreads = cb / sizeof(threads[0]);
2946
2947 if (NULL != threads) {
2948 cpuset = hwloc_bitmap_alloc();
2949 for (i = 0; i < nthreads; ++i) {
2950 if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
2951 hwloc_bitmap_set(cpuset, ntohl(threads[i]));
2952 }
2953 free(threads);
2954 } else if ((unsigned int)-1 != reg) {
2955 /* Doesn't work on ARM because cpu "reg" do not start at 0.
2956 * We know the first cpu "reg" is the lowest. The others are likely
2957 * in order assuming the device-tree shows objects in order.
2958 */
2959 cpuset = hwloc_bitmap_alloc();
2960 hwloc_bitmap_set(cpuset, reg);
2961 }
2962
2963 if (NULL == cpuset) {
2964 hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
2965 } else {
2966 struct hwloc_obj *core = NULL;
2967 add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
2968
2969 /* Add core */
2970 core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
2971 core->cpuset = hwloc_bitmap_dup(cpuset);
2972 hwloc_insert_object_by_cpuset(topology, core);
2973
2974 /* Add L1 cache */
2975 try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
2976
2977 hwloc_bitmap_free(cpuset);
2978 }
2979 }
2980 free(device_type);
2981 }
2982 closedir(dt);
2983
2984 /* No cores and L2 cache were found, exiting */
2985 if (0 == cpus.n) {
2986 hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
2987 return;
2988 }
2989
2990 #ifdef HWLOC_DEBUG
2991 for (i = 0; i < cpus.n; ++i) {
2992 hwloc_debug("%i: %s ibm,phandle=%08X l2_cache=%08X ",
2993 i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
2994 if (NULL == cpus.p[i].cpuset) {
2995 hwloc_debug("%s\n", "no cpuset");
2996 } else {
2997 hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
2998 }
2999 }
3000 #endif
3001
3002 /* Scan L2/L3/... caches */
3003 for (i = 0; i < cpus.n; ++i) {
3004 unsigned int level = 2;
3005 hwloc_bitmap_t cpuset;
3006 /* Skip real CPUs */
3007 if (NULL != cpus.p[i].cpuset)
3008 continue;
3009
3010 /* Calculate cache level and CPU mask */
3011 cpuset = hwloc_bitmap_alloc();
3012 if (0 == look_powerpc_device_tree_discover_cache(&cpus,
3013 cpus.p[i].phandle, &level, cpuset)) {
3014 char cpu[256];
3015 snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
3016 try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
3017 }
3018 hwloc_bitmap_free(cpuset);
3019 }
3020
3021 /* Do cleanup */
3022 for (i = 0; i < cpus.n; ++i) {
3023 hwloc_bitmap_free(cpus.p[i].cpuset);
3024 free(cpus.p[i].name);
3025 }
3026 free(cpus.p);
3027 }
3028
3029 /* Try to handle knl hwdata properties
3030 * Returns 0 on success and -1 otherwise */
3031 static int hwloc_linux_try_handle_knl_hwdata_properties(hwloc_topology_t topology, struct hwloc_linux_backend_data_s *data, hwloc_obj_t *nodes, unsigned nbnodes)
3032 {
3033 char *knl_cache_file;
3034 long long int cache_size = -1;
3035 int associativity = -1;
3036 int inclusiveness = -1;
3037 int line_size = -1;
3038 int version = 0;
3039 unsigned i;
3040 char buffer[512] = {0};
3041 char *data_beg = NULL;
3042 char memory_mode_str[32] = {0};
3043 char cluster_mode_str[32] = {0};
3044
3045 if (asprintf(&knl_cache_file, "%s/knl_memoryside_cache", data->dumped_hwdata_dirname) < 0)
3046 return -1;
3047
3048 hwloc_debug("Reading knl cache data from: %s\n", knl_cache_file);
3049 if (hwloc_read_path_by_length(knl_cache_file, buffer, sizeof(buffer), data->root_fd) < 0) {
3050 hwloc_debug("Unable to open KNL data file `%s' (%s)\n", knl_cache_file, strerror(errno));
3051 free(knl_cache_file);
3052 return -1;
3053 }
3054 free(knl_cache_file);
3055
3056 data_beg = &buffer[0];
3057
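/* An illustrative version-2 knl_memoryside_cache dump (values are examples only):
 *   version: 2
 *   cache_size: 17179869184
 *   line_size: 64
 *   inclusiveness: 1
 *   associativity: 1
 *   cluster_mode: Quadrant
 *   memory_mode: Cache
 */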
3058 /* file must start with version information */
3059 if (sscanf(data_beg, "version: %d", &version) != 1) {
3060 fprintf(stderr, "Invalid knl_memoryside_cache header, expected \"version: <int>\".\n");
3061 return -1;
3062 }
3063
3064 while (1) {
3065 char *line_end = strstr(data_beg, "\n");
3066 if (!line_end)
3067 break;
3068 if (version >= 1) {
3069 if (!strncmp("cache_size:", data_beg, strlen("cache_size"))) {
3070 sscanf(data_beg, "cache_size: %lld", &cache_size);
3071 hwloc_debug("read cache_size=%lld\n", cache_size);
3072 } else if (!strncmp("line_size:", data_beg, strlen("line_size:"))) {
3073 sscanf(data_beg, "line_size: %d", &line_size);
3074 hwloc_debug("read line_size=%d\n", line_size);
3075 } else if (!strncmp("inclusiveness:", data_beg, strlen("inclusiveness:"))) {
3076 sscanf(data_beg, "inclusiveness: %d", &inclusiveness);
3077 hwloc_debug("read inclusiveness=%d\n", inclusiveness);
3078 } else if (!strncmp("associativity:", data_beg, strlen("associativity:"))) {
3079 sscanf(data_beg, "associativity: %d\n", &associativity);
3080 hwloc_debug("read associativity=%d\n", associativity);
3081 }
3082 }
3083 if (version >= 2) {
3084 if (!strncmp("cluster_mode:", data_beg, strlen("cluster_mode:"))) {
3085 sscanf(data_beg, "cluster_mode: %s\n", cluster_mode_str);
3086 hwloc_debug("read cluster_mode=%s\n", cluster_mode_str);
3087 } else if (!strncmp("memory_mode:", data_beg, strlen("memory_mode:"))) {
3088 sscanf(data_beg, "memory_mode: %s\n", memory_mode_str);
3089 hwloc_debug("read memory_mode=%s\n", memory_mode_str);
3090 }
3091 }
3092
3093 data_beg = line_end + 1;
3094 }
3095
3096 if (line_size == -1 || cache_size == -1 || associativity == -1 || inclusiveness == -1) {
3097 hwloc_debug("Incorrect file format line_size=%d cache_size=%lld associativity=%d inclusiveness=%d\n",
3098 line_size, cache_size, associativity, inclusiveness);
3099 return -1;
3100 }
3101
3102 /* In file version 1 mcdram_cache is always non-zero.
3103 * In file version 2 the mcdram cache can be zero in flat mode. We must check for that and not expose the cache in flat mode. */
3104 if (cache_size > 0) {
3105 for(i=0; i<nbnodes; i++) {
3106 hwloc_obj_t cache;
3107
3108 if (hwloc_bitmap_iszero(nodes[i]->cpuset))
3109 /* one L3 per DDR, none for MCDRAM nodes */
3110 continue;
3111
3112 cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
3113 if (!cache)
3114 return -1;
3115
3116 cache->attr->cache.depth = 3;
3117 cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
3118 cache->attr->cache.associativity = associativity;
3119 hwloc_obj_add_info(cache, "Inclusive", inclusiveness ? "1" : "0");
3120 cache->attr->cache.size = cache_size;
3121 cache->attr->cache.linesize = line_size;
3122 cache->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
3123 hwloc_obj_add_info(cache, "Type", "MemorySideCache");
3124 hwloc_insert_object_by_cpuset(topology, cache);
3125 }
3126 }
3127 /* adding cluster and memory mode as properties of the machine */
3128 if (version >= 2) {
3129 hwloc_obj_add_info(topology->levels[0][0], "ClusterMode", cluster_mode_str);
3130 hwloc_obj_add_info(topology->levels[0][0], "MemoryMode", memory_mode_str);
3131 }
3132
3133 return 0;
3134 }
3135
3136
3137
3138 /**************************************
3139 ****** Sysfs Topology Discovery ******
3140 **************************************/
3141
3142 static int
3143 look_sysfsnode(struct hwloc_topology *topology,
3144 struct hwloc_linux_backend_data_s *data,
3145 const char *path, unsigned *found)
3146 {
3147 unsigned osnode;
3148 unsigned nbnodes = 0;
3149 DIR *dir;
3150 struct dirent *dirent;
3151 hwloc_bitmap_t nodeset;
3152
3153 *found = 0;
3154
3155 /* Get the list of nodes first */
3156 dir = hwloc_opendir(path, data->root_fd);
3157 if (dir)
3158 {
3159 nodeset = hwloc_bitmap_alloc();
3160 while ((dirent = readdir(dir)) != NULL)
3161 {
3162 if (strncmp(dirent->d_name, "node", 4))
3163 continue;
3164 osnode = strtoul(dirent->d_name+4, NULL, 0);
3165 hwloc_bitmap_set(nodeset, osnode);
3166 nbnodes++;
3167 }
3168 closedir(dir);
3169 }
3170 else
3171 return -1;
3172
3173 if (!nbnodes || (nbnodes == 1 && !data->is_knl)) { /* always keep NUMA for KNL, or configs might look too different */
3174 hwloc_bitmap_free(nodeset);
3175 return 0;
3176 }
3177
3178 /* For convenience, put these declarations inside a block. */
3179
3180 {
3181 hwloc_obj_t * nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
3182 unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
3183 float * distances = NULL;
3184 int failednodes = 0;
3185 unsigned index_;
3186
3187 if (NULL == nodes || NULL == indexes) {
3188 free(nodes);
3189 free(indexes);
3190 hwloc_bitmap_free(nodeset);
3191 nbnodes = 0;
3192 goto out;
3193 }
3194
3195 /* Unsparsify node indexes.
3196 * We'll need them later because Linux groups sparse distances
3197 * and keeps them in order in the sysfs distance files.
3198 * It'll simplify things in the meantime.
3199 */
3200 index_ = 0;
3201 hwloc_bitmap_foreach_begin (osnode, nodeset) {
3202 indexes[index_] = osnode;
3203 index_++;
3204 } hwloc_bitmap_foreach_end();
3205 hwloc_bitmap_free(nodeset);
3206
3207 #ifdef HWLOC_DEBUG
3208 hwloc_debug("%s", "NUMA indexes: ");
3209 for (index_ = 0; index_ < nbnodes; index_++) {
3210 hwloc_debug(" %u", indexes[index_]);
3211 }
3212 hwloc_debug("%s", "\n");
3213 #endif
3214
3215 /* Create NUMA objects */
3216 for (index_ = 0; index_ < nbnodes; index_++) {
3217 hwloc_obj_t node, res_obj;
3218 int annotate;
3219
3220 osnode = indexes[index_];
3221
3222 node = hwloc_get_numanode_obj_by_os_index(topology, osnode);
3223 annotate = (node != NULL);
3224 if (!annotate) {
3225 /* create a new node */
3226 char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
3227 hwloc_bitmap_t cpuset;
3228 sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
3229 cpuset = hwloc__alloc_read_path_as_cpumask(nodepath, data->root_fd);
3230 if (!cpuset) {
3231 /* This NUMA object won't be inserted, we'll ignore distances */
3232 failednodes++;
3233 continue;
3234 }
3235
3236 node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, osnode);
3237 node->cpuset = cpuset;
3238 node->nodeset = hwloc_bitmap_alloc();
3239 hwloc_bitmap_set(node->nodeset, osnode);
3240 }
3241 hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->memory);
3242
3243 hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
3244 osnode, node->cpuset);
3245
3246 if (annotate) {
3247 nodes[index_] = node;
3248 } else {
3249 res_obj = hwloc_insert_object_by_cpuset(topology, node);
3250 if (node == res_obj) {
3251 nodes[index_] = node;
3252 } else {
3253 /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
3254 * This object disappeared, we'll ignore distances */
3255 failednodes++;
3256 }
3257 }
3258 }
3259
3260 if (!failednodes && data->is_knl)
3261 hwloc_linux_try_handle_knl_hwdata_properties(topology, data, nodes, nbnodes);
3262
3263 if (failednodes) {
3264 /* failed to read/create some nodes, don't bother reading/fixing
3265 * a distance matrix that would likely be wrong anyway.
3266 */
3267 nbnodes -= failednodes;
3268 } else if (nbnodes > 1) {
3269 distances = malloc(nbnodes*nbnodes*sizeof(*distances));
3270 }
3271
3272 if (NULL == distances) {
3273 free(nodes);
3274 free(indexes);
3275 goto out;
3276 }
3277
3278 if (hwloc_parse_nodes_distances(path, nbnodes, indexes, distances, data->root_fd) < 0) {
3279 free(nodes);
3280 free(distances);
3281 free(indexes);
3282 goto out;
3283 }
3284
3285 if (data->is_knl && distances) {
3286 char *env = getenv("HWLOC_KNL_NUMA_QUIRK");
3287 if (!(env && !atoi(env)) && nbnodes>=2) { /* SNC2 or SNC4, with 0 or 2/4 MCDRAM, and 0-4 DDR nodes */
3288 unsigned i, j, closest;
3289 for(i=0; i<nbnodes; i++) {
3290 if (!hwloc_bitmap_iszero(nodes[i]->cpuset))
3291 /* nodes with CPU, that's DDR, skip it */
3292 continue;
3293 hwloc_obj_add_info(nodes[i], "Type", "MCDRAM");
3294
3295 /* DDR is the closest node with CPUs */
3296 closest = (unsigned)-1;
3297 for(j=0; j<nbnodes; j++) {
3298 if (j==i)
3299 continue;
3300 if (hwloc_bitmap_iszero(nodes[j]->cpuset))
3301 /* nodes without CPU, that's another MCDRAM, skip it */
3302 continue;
3303 if (closest == (unsigned)-1 || distances[i*nbnodes+j]<distances[i*nbnodes+closest])
3304 closest = j;
3305 }
3306 if (closest != (unsigned) -1) {
3307 /* Add a Group for Cluster containing this MCDRAM + DDR */
3308 hwloc_obj_t cluster = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
3309 cluster->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
3310 cluster->nodeset = hwloc_bitmap_dup(nodes[i]->nodeset);
3311 hwloc_bitmap_or(cluster->cpuset, cluster->cpuset, nodes[closest]->cpuset);
3312 hwloc_bitmap_or(cluster->nodeset, cluster->nodeset, nodes[closest]->nodeset);
3313 hwloc_obj_add_info(cluster, "Type", "Cluster");
3314 hwloc_insert_object_by_cpuset(topology, cluster);
3315 }
3316 }
3317 /* drop the distance matrix, it contradicts the above NUMA layout groups */
3318 free(distances);
3319 free(nodes);
3320 free(indexes);
3321 goto out;
3322 }
3323 }
3324
3325 hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
3326 }
3327
3328 out:
3329 *found = nbnodes;
3330 return 0;
3331 }
3332
3333 /* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
3334 static int
3335 look_sysfscpu(struct hwloc_topology *topology,
3336 struct hwloc_linux_backend_data_s *data,
3337 const char *path,
3338 struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
3339 {
3340 hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
3341 #define CPU_TOPOLOGY_STR_LEN 128
3342 char str[CPU_TOPOLOGY_STR_LEN];
3343 DIR *dir;
3344 int i,j;
3345 unsigned caches_added, merge_buggy_core_siblings;
3346 hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
3347 int threadwithcoreid = data->is_amd_with_CU ? -1 : 0; /* -1 means we don't know yet if threads have their own coreids within thread_siblings */
3348
3349 /* fill the cpuset of interesting cpus */
3350 dir = hwloc_opendir(path, data->root_fd);
3351 if (!dir)
3352 return -1;
3353 else {
3354 struct dirent *dirent;
3355 cpuset = hwloc_bitmap_alloc();
3356
3357 while ((dirent = readdir(dir)) != NULL) {
3358 unsigned long cpu;
3359 char online[2];
3360
3361 if (strncmp(dirent->d_name, "cpu", 3))
3362 continue;
3363 cpu = strtoul(dirent->d_name+3, NULL, 0);
3364
3365 /* Maybe we don't have topology information but at least it exists */
3366 hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
3367
3368 /* check whether this processor is online */
3369 sprintf(str, "%s/cpu%lu/online", path, cpu);
3370 if (hwloc_read_path_by_length(str, online, sizeof(online), data->root_fd) == 0) {
3371 if (atoi(online)) {
3372 hwloc_debug("os proc %lu is online\n", cpu);
3373 } else {
3374 hwloc_debug("os proc %lu is offline\n", cpu);
3375 hwloc_bitmap_clr(topology->levels[0][0]->online_cpuset, cpu);
3376 }
3377 }
3378
3379 /* check whether the kernel exports topology information for this cpu */
3380 sprintf(str, "%s/cpu%lu/topology", path, cpu);
3381 if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
3382 hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
3383 cpu, path, cpu);
3384 continue;
3385 }
3386
3387 hwloc_bitmap_set(cpuset, cpu);
3388 }
3389 closedir(dir);
3390 }
3391
3392 topology->support.discovery->pu = 1;
3393 hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
3394 hwloc_bitmap_weight(cpuset), cpuset);
3395
3396 merge_buggy_core_siblings = (data->arch == HWLOC_LINUX_ARCH_X86);
3397 caches_added = 0;
3398 hwloc_bitmap_foreach_begin(i, cpuset) {
3399 hwloc_bitmap_t packageset, coreset, bookset, threadset;
3400 unsigned mypackageid, mycoreid, mybookid;
3401 int tmpint;
3402
3403 /* look at the package */
3404 sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
3405 packageset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3406 if (packageset && hwloc_bitmap_first(packageset) == i) {
3407 /* first cpu in this package, add the package */
3408 struct hwloc_obj *package;
3409
3410 mypackageid = (unsigned) -1;
3411 sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i); /* contains %d at least up to 4.9 */
3412 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3413 mypackageid = (unsigned) tmpint;
3414
3415 if (merge_buggy_core_siblings) {
3416 /* check for another package with same physical_package_id */
3417 hwloc_obj_t curpackage = packages;
3418 while (curpackage) {
3419 if (curpackage->os_index == mypackageid) {
3420 /* found another package with same physical_package_id but different core_siblings.
3421 * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
3422 * merge these core_siblings to extend the existing first package object.
3423 */
3424 static int reported = 0;
3425 if (!reported && !hwloc_hide_errors()) {
3426 char *a, *b;
3427 hwloc_bitmap_asprintf(&a, curpackage->cpuset);
3428 hwloc_bitmap_asprintf(&b, packageset);
3429 fprintf(stderr, "****************************************************************************\n");
3430 fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
3431 fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
3432 mypackageid, a, b);
3433 fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
3434 fprintf(stderr, "* does not support this processor correctly.\n");
3435 fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
3436 fprintf(stderr, "*\n");
3437 fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
3438 fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
3439 fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
3440 fprintf(stderr, "****************************************************************************\n");
3441 reported = 1;
3442 free(a);
3443 free(b);
3444 }
3445 hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
3446 goto package_done;
3447 }
3448 curpackage = curpackage->next_cousin;
3449 }
3450 }
3451
3452 /* no package with same physical_package_id, create a new one */
3453 package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, mypackageid);
3454 package->cpuset = packageset;
3455 hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
3456 mypackageid, packageset);
3457 /* add cpuinfo */
3458 if (cpuinfo_Lprocs) {
3459 for(j=0; j<(int) cpuinfo_numprocs; j++)
3460 if ((int) cpuinfo_Lprocs[j].Pproc == i) {
3461 hwloc__move_infos(&package->infos, &package->infos_count,
3462 &cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
3463 }
3464 }
3465 /* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
3466 * we'll actually insert the tree at the end of the entire sysfs cpu loop.
3467 */
3468 package->next_cousin = packages;
3469 packages = package;
3470
3471 packageset = NULL; /* don't free it */
3472 }
3473 package_done:
3474 hwloc_bitmap_free(packageset);
3475
3476 /* look at the core */
3477 sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3478 coreset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3479
3480 if (coreset) {
3481 int gotcoreid = 0; /* to avoid reading the coreid twice */
3482 if (hwloc_bitmap_weight(coreset) > 1 && threadwithcoreid == -1) {
3483 /* check if this is hyper-threading or different coreids */
3484 unsigned siblingid, siblingcoreid;
3485
3486 mycoreid = (unsigned) -1;
3487 sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3488 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3489 mycoreid = (unsigned) tmpint;
3490 gotcoreid = 1;
3491
3492 siblingid = hwloc_bitmap_first(coreset);
3493 if (siblingid == (unsigned) i)
3494 siblingid = hwloc_bitmap_next(coreset, i);
3495 siblingcoreid = (unsigned) -1;
3496 sprintf(str, "%s/cpu%u/topology/core_id", path, siblingid); /* contains %d at least up to 4.9 */
3497 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3498 siblingcoreid = (unsigned) tmpint;
3499 threadwithcoreid = (siblingcoreid != mycoreid);
3500 }
3501 if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
3502 /* regular core */
3503 struct hwloc_obj *core;
3504
3505 if (!gotcoreid) {
3506 mycoreid = (unsigned) -1;
3507 sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3508 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3509 mycoreid = (unsigned) tmpint;
3510 }
3511
3512 core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, mycoreid);
3513 if (threadwithcoreid)
3514 /* amd multicore compute-unit, create one core per thread */
3515 hwloc_bitmap_only(coreset, i);
3516 core->cpuset = coreset;
3517 hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
3518 mycoreid, core->cpuset);
3519 hwloc_insert_object_by_cpuset(topology, core);
3520 coreset = NULL; /* don't free it */
3521 }
3522 hwloc_bitmap_free(coreset);
3523 }
3524
3525 /* look at the books */
3526 sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
3527 bookset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3528 if (bookset) {
3529 if (hwloc_bitmap_first(bookset) == i) {
3530 struct hwloc_obj *book;
3531
3532 mybookid = (unsigned) -1;
3533 sprintf(str, "%s/cpu%d/topology/book_id", path, i); /* contains %d at least up to 4.9 */
3534 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0) {
3535 mybookid = (unsigned) tmpint;
3536
3537 book = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, mybookid);
3538 book->cpuset = bookset;
3539 hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
3540 mybookid, bookset);
3541 hwloc_obj_add_info(book, "Type", "Book");
3542 hwloc_insert_object_by_cpuset(topology, book);
3543 bookset = NULL; /* don't free it */
3544 }
3545 }
3546 hwloc_bitmap_free(bookset);
3547 }
3548
3549 {
3550 /* look at the thread */
3551 struct hwloc_obj *thread = hwloc_alloc_setup_object(HWLOC_OBJ_PU, i);
3552 threadset = hwloc_bitmap_alloc();
3553 hwloc_bitmap_only(threadset, i);
3554 thread->cpuset = threadset;
3555 hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
3556 i, threadset);
3557 hwloc_insert_object_by_cpuset(topology, thread);
3558 }
3559
3560 /* look at the caches */
3561 for(j=0; j<10; j++) {
3562 char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
3563 hwloc_bitmap_t cacheset;
3564
3565 sprintf(str, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
3566 cacheset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3567 if (cacheset) {
3568 if (hwloc_bitmap_iszero(cacheset)) {
3569 hwloc_bitmap_t tmpset;
3570 /* ia64 returning empty L3 and L2i? use the core set instead */
3571 sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3572 tmpset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3573 /* only use it if we actually got something */
3574 if (tmpset) {
3575 hwloc_bitmap_free(cacheset);
3576 cacheset = tmpset;
3577 }
3578 }
3579
3580 if (hwloc_bitmap_first(cacheset) == i) {
3581 unsigned kB;
3582 unsigned linesize;
3583 unsigned sets, lines_per_tag;
3584 unsigned depth; /* 1 for L1, .... */
3585 hwloc_obj_cache_type_t type = HWLOC_OBJ_CACHE_UNIFIED; /* default */
3586 struct hwloc_obj *cache;
3587
3588 /* get the cache level depth */
3589 sprintf(str, "%s/cpu%d/cache/index%d/level", path, i, j); /* contains %u at least up to 4.9 */
3590 if (hwloc_read_path_as_uint(str, &depth, data->root_fd) < 0) {
3591 hwloc_bitmap_free(cacheset);
3592 continue;
3593 }
3594
3595 /* cache type */
3596 sprintf(str, "%s/cpu%d/cache/index%d/type", path, i, j);
3597 if (hwloc_read_path_by_length(str, str2, sizeof(str2), data->root_fd) == 0) {
3598 if (!strncmp(str2, "Data", 4))
3599 type = HWLOC_OBJ_CACHE_DATA;
3600 else if (!strncmp(str2, "Unified", 7))
3601 type = HWLOC_OBJ_CACHE_UNIFIED;
3602 else if (!strncmp(str2, "Instruction", 11))
3603 type = HWLOC_OBJ_CACHE_INSTRUCTION;
3604 else {
3605 hwloc_bitmap_free(cacheset);
3606 continue;
3607 }
3608 } else {
3609 hwloc_bitmap_free(cacheset);
3610 continue;
3611 }
3612
3613 /* get the cache size */
3614 kB = 0;
3615 sprintf(str, "%s/cpu%d/cache/index%d/size", path, i, j); /* contains %uK at least up to 4.9 */
3616 hwloc_read_path_as_uint(str, &kB, data->root_fd);
3617 /* KNL reports L3 with size=0 and full cpuset in cpuid.
3618 * Let hwloc_linux_try_add_knl_mcdram_cache() detect it better.
3619 */
3620 if (!kB && depth == 3 && data->is_knl) {
3621 hwloc_bitmap_free(cacheset);
3622 continue;
3623 }
3624
3625 /* get the line size */
3626 linesize = 0;
3627 sprintf(str, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j); /* contains %u at least up to 4.9 */
3628 hwloc_read_path_as_uint(str, &linesize, data->root_fd);
3629
3630 /* get the number of sets and lines per tag.
3631 * don't take the associativity directly in "ways_of_associativity" because
3632 * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
3633 */
3634 sets = 0;
3635 sprintf(str, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j); /* contains %u at least up to 4.9 */
3636 hwloc_read_path_as_uint(str, &sets, data->root_fd);
3637
3638 lines_per_tag = 1;
3639 sprintf(str, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j); /* contains %u at least up to 4.9 */
3640 hwloc_read_path_as_uint(str, &lines_per_tag, data->root_fd);
3641
3642 /* first cpu in this cache, add the cache */
3643 cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
3644 cache->attr->cache.size = ((uint64_t)kB) << 10;
3645 cache->attr->cache.depth = depth;
3646 cache->attr->cache.linesize = linesize;
3647 cache->attr->cache.type = type;
3648 if (!linesize || !lines_per_tag || !sets)
3649 cache->attr->cache.associativity = 0; /* unknown */
3650 else if (sets == 1)
3651 cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
3652 else
3653 cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
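/* Illustrative example with hypothetical values: a 256kB cache with 64-byte
 * lines, 512 sets and physical_line_partition=1 gives
 * associativity = (256*1024) / 64 / 1 / 512 = 8 ways.
 */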
3654 cache->cpuset = cacheset;
3655 hwloc_debug_1arg_bitmap("cache depth %u has cpuset %s\n",
3656 depth, cacheset);
3657 hwloc_insert_object_by_cpuset(topology, cache);
3658 cacheset = NULL; /* don't free it */
3659 ++caches_added;
3660 }
3661 }
3662 hwloc_bitmap_free(cacheset);
3663 }
3664 } hwloc_bitmap_foreach_end();
3665
3666 /* actually insert in the tree now that package cpusets have been fixed-up */
3667 while (packages) {
3668 hwloc_obj_t next = packages->next_cousin;
3669 packages->next_cousin = NULL;
3670 hwloc_insert_object_by_cpuset(topology, packages);
3671 packages = next;
3672 }
3673
3674 if (0 == caches_added)
3675 look_powerpc_device_tree(topology, data);
3676
3677 hwloc_bitmap_free(cpuset);
3678
3679 return 0;
3680 }
3681
3682
3683
3684 /****************************************
3685 ****** cpuinfo Topology Discovery ******
3686 ****************************************/
3687
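/* Example mapping for the x86 parser below, using an illustrative /proc/cpuinfo
 * excerpt (values are examples only):
 *   "vendor_id  : GenuineIntel"     -> CPUVendor
 *   "model name : <marketing name>" -> CPUModel
 *   "cpu family : 6"                -> CPUFamilyNumber
 *   "model      : 87"               -> CPUModelNumber
 *   "stepping   : 1"                -> CPUStepping
 */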
3688 static int
3689 hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
3690 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3691 int is_global __hwloc_attribute_unused)
3692 {
3693 if (!strcmp("vendor_id", prefix)) {
3694 hwloc__add_info(infos, infos_count, "CPUVendor", value);
3695 } else if (!strcmp("model name", prefix)) {
3696 hwloc__add_info(infos, infos_count, "CPUModel", value);
3697 } else if (!strcmp("model", prefix)) {
3698 hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
3699 } else if (!strcmp("cpu family", prefix)) {
3700 hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
3701 } else if (!strcmp("stepping", prefix)) {
3702 hwloc__add_info(infos, infos_count, "CPUStepping", value);
3703 }
3704 return 0;
3705 }
3706
3707 static int
3708 hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
3709 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3710 int is_global __hwloc_attribute_unused)
3711 {
3712 if (!strcmp("vendor", prefix)) {
3713 hwloc__add_info(infos, infos_count, "CPUVendor", value);
3714 } else if (!strcmp("model name", prefix)) {
3715 hwloc__add_info(infos, infos_count, "CPUModel", value);
3716 } else if (!strcmp("model", prefix)) {
3717 hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
3718 } else if (!strcmp("family", prefix)) {
3719 hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
3720 }
3721 return 0;
3722 }
3723
3724 static int
3725 hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
3726 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3727 int is_global __hwloc_attribute_unused)
3728 {
3729 if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
3730 || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
3731 hwloc__add_info(infos, infos_count, "CPUModel", value);
3732 } else if (!strcmp("CPU implementer", prefix)) {
3733 hwloc__add_info(infos, infos_count, "CPUImplementer", value);
3734 } else if (!strcmp("CPU architecture", prefix)) {
3735 hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
3736 } else if (!strcmp("CPU variant", prefix)) {
3737 hwloc__add_info(infos, infos_count, "CPUVariant", value);
3738 } else if (!strcmp("CPU part", prefix)) {
3739 hwloc__add_info(infos, infos_count, "CPUPart", value);
3740 } else if (!strcmp("CPU revision", prefix)) {
3741 hwloc__add_info(infos, infos_count, "CPURevision", value);
3742 } else if (!strcmp("Hardware", prefix)) {
3743 hwloc__add_info(infos, infos_count, "HardwareName", value);
3744 } else if (!strcmp("Revision", prefix)) {
3745 hwloc__add_info(infos, infos_count, "HardwareRevision", value);
3746 } else if (!strcmp("Serial", prefix)) {
3747 hwloc__add_info(infos, infos_count, "HardwareSerial", value);
3748 }
3749 return 0;
3750 }
3751
3752 static int
3753 hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
3754 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3755 int is_global)
3756 {
3757 /* common fields */
3758 if (!strcmp("cpu", prefix)) {
3759 hwloc__add_info(infos, infos_count, "CPUModel", value);
3760 } else if (!strcmp("platform", prefix)) {
3761 hwloc__add_info(infos, infos_count, "PlatformName", value);
3762 } else if (!strcmp("model", prefix)) {
3763 hwloc__add_info(infos, infos_count, "PlatformModel", value);
3764 }
3765 /* platform-specific fields */
3766 else if (!strcasecmp("vendor", prefix)) {
3767 hwloc__add_info(infos, infos_count, "PlatformVendor", value);
3768 } else if (!strcmp("Board ID", prefix)) {
3769 hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
3770 } else if (!strcmp("Board", prefix)
3771 || !strcasecmp("Machine", prefix)) {
3772 /* Machine and Board are similar to (and often more precise than) the model above */
3773 char **valuep = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
3774 if (*valuep)
3775 free(*valuep);
3776 *valuep = strdup(value);
3777 } else if (!strcasecmp("Revision", prefix)
3778 || !strcmp("Hardware rev", prefix)) {
3779 hwloc__add_info(infos, infos_count, is_global ? "PlatformRevision" : "CPURevision", value);
3780 } else if (!strcmp("SVR", prefix)) {
3781 hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
3782 } else if (!strcmp("PVR", prefix)) {
3783 hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
3784 }
3785 /* don't match 'board*' because there's also "board l2" on some platforms */
3786 return 0;
3787 }
3788
3789 /*
3790 * avr32: "chip type\t:" => OK
3791 * blackfin: "model name\t:" => OK
3792 * h8300: "CPU:" => OK
3793 * m68k: "CPU:" => OK
3794 * mips: "cpu model\t\t:" => OK
3795 * openrisc: "CPU:" => OK
3796 * sparc: "cpu\t\t:" => OK
3797 * tile: "model name\t:" => OK
3798 * unicore32: "Processor\t:" => OK
3799 * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:" => "cpu" overwritten by "cpu model", no processor indexes
3800 * cris: "cpu\t\t:" + "cpu model\t:" => only "cpu"
3801 * frv: "CPU-Core:" + "CPU:" => only "CPU"
3802 * mn10300: "cpu core :" + "model name :" => only "model name"
3803 * parisc: "cpu family\t:" + "cpu\t\t:" => only "cpu"
3804 *
3805 * not supported because of conflicts with other arch minor lines:
3806 * m32r: "cpu family\t:" => KO (adding "cpu family" would break "blackfin")
3807 * microblaze: "CPU-Family:" => KO
3808 * sh: "cpu family\t:" + "cpu type\t:" => KO
3809 * xtensa: "model\t\t:" => KO
3810 */
3811 static int
3812 hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
3813 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3814 int is_global __hwloc_attribute_unused)
3815 {
3816 if (!strcmp("model name", prefix)
3817 || !strcmp("Processor", prefix)
3818 || !strcmp("chip type", prefix)
3819 || !strcmp("cpu model", prefix)
3820 || !strcasecmp("cpu", prefix)) {
3821 /* keep the last one, assume it's more precise than the first one.
3822 * we should have the Architecture keypair for basic information anyway.
3823 */
3824 char **valuep = hwloc__find_info_slot(infos, infos_count, "CPUModel");
3825 if (*valuep)
3826 free(*valuep);
3827 *valuep = strdup(value);
3828 }
3829 return 0;
3830 }
3831
3832 /* Lprocs_p set to NULL unless returns > 0 */
3833 static int
3834 hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
3835 const char *path,
3836 struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
3837 struct hwloc_obj_info_s **global_infos, unsigned *global_infos_count)
3838 {
3839 FILE *fd;
3840 char *str = NULL;
3841 char *endptr;
3842 unsigned len;
3843 unsigned allocated_Lprocs = 0;
3844 struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
3845 unsigned numprocs = 0;
3846 int curproc = -1;
3847 int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_obj_info_s **, unsigned *, int) = NULL;
3848
3849 if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
3850 {
3851 hwloc_debug("could not open %s\n", path);
3852 return -1;
3853 }
3854
3855 # define PROCESSOR "processor"
3856 # define PACKAGEID "physical id" /* the longest one */
3857 # define COREID "core id"
3858 len = 128; /* vendor/model can be very long */
3859 str = malloc(len);
3860 hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
3861 while (fgets(str,len,fd)!=NULL) {
3862 unsigned long Ppkg, Pcore, Pproc;
3863 char *end, *dot, *prefix, *value;
3864 int noend = 0;
3865
3866 /* remove the ending \n */
3867 end = strchr(str, '\n');
3868 if (end)
3869 *end = 0;
3870 else
3871 noend = 1;
3872 /* if empty line, skip and reset curproc */
3873 if (!*str) {
3874 curproc = -1;
3875 continue;
3876 }
3877 /* skip lines with no ':' separator */
3878 dot = strchr(str, ':');
3879 if (!dot)
3880 continue;
3881 /* skip lines not starting with a letter */
3882 if ((*str > 'z' || *str < 'a')
3883 && (*str > 'Z' || *str < 'A'))
3884 continue;
3885
3886 /* mark the end of the prefix */
3887 prefix = str;
3888 end = dot;
3889 while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
3890 *end = 0;
3891 /* find beginning of value, its end is already marked */
3892 value = dot+1 + strspn(dot+1, " ");
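/* For instance, an input line "processor\t: 12" ends up here with
 * prefix="processor" (trailing blanks stripped) and value="12".
 */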
3893
3894 /* defines for parsing numbers */
3895 # define getprocnb_begin(field, var) \
3896 if (!strcmp(field,prefix)) { \
3897 var = strtoul(value,&endptr,0); \
3898 if (endptr==value) { \
3899 hwloc_debug("no number in "field" field of %s\n", path); \
3900 goto err; \
3901 } else if (var==ULONG_MAX) { \
3902 hwloc_debug("too big "field" number in %s\n", path); \
3903 goto err; \
3904 } \
3905 hwloc_debug(field " %lu\n", var)
3906 # define getprocnb_end() \
3907 }
3908 /* actually parse numbers */
3909 getprocnb_begin(PROCESSOR, Pproc);
3910 curproc = numprocs++;
3911 if (numprocs > allocated_Lprocs) {
3912 struct hwloc_linux_cpuinfo_proc * tmp;
3913 if (!allocated_Lprocs)
3914 allocated_Lprocs = 8;
3915 else
3916 allocated_Lprocs *= 2;
3917 tmp = realloc(Lprocs, allocated_Lprocs * sizeof(*Lprocs));
3918 if (!tmp)
3919 goto err;
3920 Lprocs = tmp;
3921 }
3922 Lprocs[curproc].Pproc = Pproc;
3923 Lprocs[curproc].Pcore = -1;
3924 Lprocs[curproc].Ppkg = -1;
3925 Lprocs[curproc].Lcore = -1;
3926 Lprocs[curproc].Lpkg = -1;
3927 Lprocs[curproc].infos = NULL;
3928 Lprocs[curproc].infos_count = 0;
3929 getprocnb_end() else
3930 getprocnb_begin(PACKAGEID, Ppkg);
3931 Lprocs[curproc].Ppkg = Ppkg;
3932 getprocnb_end() else
3933 getprocnb_begin(COREID, Pcore);
3934 Lprocs[curproc].Pcore = Pcore;
3935 getprocnb_end() else {
3936
3937 /* architecture specific or default routine for parsing cpumodel */
3938 switch (data->arch) {
3939 case HWLOC_LINUX_ARCH_X86:
3940 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
3941 break;
3942 case HWLOC_LINUX_ARCH_ARM:
3943 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
3944 break;
3945 case HWLOC_LINUX_ARCH_POWER:
3946 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
3947 break;
3948 case HWLOC_LINUX_ARCH_IA64:
3949 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
3950 break;
3951 default:
3952 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
3953 }
3954
3955 /* we can't assume that we already got a processor index line:
3956 * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
3957 * tile has a global section with model name before the list of processor lines.
3958 */
3959 parse_cpuinfo_func(prefix, value,
3960 curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
3961 curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
3962 curproc < 0);
3963 }
3964
3965 if (noend) {
3966 /* ignore end of line */
3967 if (fscanf(fd,"%*[^\n]") == EOF)
3968 break;
3969 getc(fd);
3970 }
3971 }
3972 fclose(fd);
3973 free(str);
3974
3975 *Lprocs_p = Lprocs;
3976 return numprocs;
3977
3978 err:
3979 fclose(fd);
3980 free(str);
3981 free(Lprocs);
3982 *Lprocs_p = NULL;
3983 return -1;
3984 }
3985
3986 static void
3987 hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
3988 struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
3989 {
3990 if (Lprocs) {
3991 unsigned i;
3992 for(i=0; i<numprocs; i++) {
3993 hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
3994 }
3995 free(Lprocs);
3996 }
3997 hwloc__free_infos(global_infos, global_infos_count);
3998 }
3999
4000 static int
4001 look_cpuinfo(struct hwloc_topology *topology,
4002 struct hwloc_linux_cpuinfo_proc * Lprocs,
4003 unsigned numprocs, hwloc_bitmap_t online_cpuset)
4004 {
4005 /* P for physical/OS index, L for logical (e.g. in the order we get them, not in the final hwloc logical order) */
4006 unsigned *Lcore_to_Pcore;
4007 unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
4008 unsigned *Lpkg_to_Ppkg;
4009 unsigned numpkgs=0;
4010 unsigned numcores=0;
4011 unsigned long Lproc;
4012 unsigned missingpkg;
4013 unsigned missingcore;
4014 unsigned i,j;
4015 hwloc_bitmap_t cpuset;
4016
4017 /* initialize misc arrays, there can be at most numprocs entries */
4018 Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
4019 Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
4020 Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
4021 for (i = 0; i < numprocs; i++) {
4022 Lcore_to_Pcore[i] = -1;
4023 Lcore_to_Ppkg[i] = -1;
4024 Lpkg_to_Ppkg[i] = -1;
4025 }
4026
4027 cpuset = hwloc_bitmap_alloc();
4028
4029 /* create PU objects */
4030 for(Lproc=0; Lproc<numprocs; Lproc++) {
4031 unsigned long Pproc = Lprocs[Lproc].Pproc;
4032 hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
4033 hwloc_bitmap_set(cpuset, Pproc);
4034 obj->cpuset = hwloc_bitmap_alloc();
4035 hwloc_bitmap_only(obj->cpuset, Pproc);
4036 hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
4037 Lproc, Pproc, obj->cpuset);
4038 hwloc_insert_object_by_cpuset(topology, obj);
4039 }
4040
4041 topology->support.discovery->pu = 1;
4042 hwloc_bitmap_copy(online_cpuset, cpuset);
4043 hwloc_bitmap_free(cpuset);
4044
4045 hwloc_debug("%u online processors found\n", numprocs);
4046 hwloc_debug_bitmap("online processor cpuset: %s\n", online_cpuset);
4047
4048 hwloc_debug("%s", "\n * Topology summary *\n");
4049 hwloc_debug("%u processors)\n", numprocs);
4050
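/* Example with hypothetical cpuinfo values: if Lprocs 0-3 all report
 * "physical id" 1, they get Lpkg 0 with Lpkg_to_Ppkg[0]=1; the package and
 * core objects below are built from these L indexes while keeping the
 * P indexes as os_index.
 */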
4051 /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
4052 for(Lproc=0; Lproc<numprocs; Lproc++) {
4053 long Ppkg = Lprocs[Lproc].Ppkg;
4054 if (Ppkg != -1) {
4055 unsigned long Pproc = Lprocs[Lproc].Pproc;
4056 for (i=0; i<numpkgs; i++)
4057 if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
4058 break;
4059 Lprocs[Lproc].Lpkg = i;
4060 hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, Ppkg);
4061 if (i==numpkgs) {
4062 Lpkg_to_Ppkg[numpkgs] = Ppkg;
4063 numpkgs++;
4064 }
4065 }
4066 }
4067 /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
4068 * provide bogus information. We should rather drop it. */
4069 missingpkg=0;
4070 for(j=0; j<numprocs; j++)
4071 if (Lprocs[j].Ppkg == -1) {
4072 missingpkg=1;
4073 break;
4074 }
4075 /* create package objects */
4076 hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
4077 if (!missingpkg && numpkgs>0) {
4078 for (i = 0; i < numpkgs; i++) {
4079 struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
4080 int doneinfos = 0;
4081 obj->cpuset = hwloc_bitmap_alloc();
4082 for(j=0; j<numprocs; j++)
4083 if ((unsigned) Lprocs[j].Lpkg == i) {
4084 hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
4085 if (!doneinfos) {
4086 hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
4087 doneinfos = 1;
4088 }
4089 }
4090 hwloc_debug_1arg_bitmap("package %d has cpuset %s\n", i, obj->cpuset);
4091 hwloc_insert_object_by_cpuset(topology, obj);
4092 }
4093 hwloc_debug("%s", "\n");
4094 }
4095
4096 /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
4097 for(Lproc=0; Lproc<numprocs; Lproc++) {
4098 long Pcore = Lprocs[Lproc].Pcore;
4099 if (Pcore != -1) {
4100 for (i=0; i<numcores; i++)
4101 if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
4102 break;
4103 Lprocs[Lproc].Lcore = i;
4104 if (i==numcores) {
4105 Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
4106 Lcore_to_Pcore[numcores] = Pcore;
4107 numcores++;
4108 }
4109 }
4110 }
4111 /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
4112 * provide bogus information. We should rather drop it. */
4113 missingcore=0;
4114 for(j=0; j<numprocs; j++)
4115 if (Lprocs[j].Pcore == -1) {
4116 missingcore=1;
4117 break;
4118 }
4119 /* create Core objects */
4120 hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
4121 if (!missingcore && numcores>0) {
4122 for (i = 0; i < numcores; i++) {
4123 struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
4124 obj->cpuset = hwloc_bitmap_alloc();
4125 for(j=0; j<numprocs; j++)
4126 if ((unsigned) Lprocs[j].Lcore == i)
4127 hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
4128 hwloc_debug_1arg_bitmap("Core %d has cpuset %s\n", i, obj->cpuset);
4129 hwloc_insert_object_by_cpuset(topology, obj);
4130 }
4131 hwloc_debug("%s", "\n");
4132 }
4133
4134 free(Lcore_to_Pcore);
4135 free(Lcore_to_Ppkg);
4136 free(Lpkg_to_Ppkg);
4137 return 0;
4138 }
4139
4140
4141
4142 /*************************************
4143 ****** Main Topology Discovery ******
4144 *************************************/
4145
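/* MIC serial sketch: /proc/elog is assumed to start with a line such as
 * "Card <serial>: ..." (format illustrative); the text between "Card " and
 * the first ':' is exposed as MICSerialNumber on the root object.
 */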
4146 static void
4147 hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
4148 {
4149 char line[64], *tmp, *end;
4150 if (hwloc_read_path_by_length("/proc/elog", line, sizeof(line), data->root_fd) < 0)
4151 return;
4152 if (strncmp(line, "Card ", 5))
4153 return;
4154 tmp = line + 5;
4155 end = strchr(tmp, ':');
4156 if (!end)
4157 return;
4158 *end = '\0';
4159 hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", tmp);
4160 }
4161
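/* Illustrative /proc/hwloc-nofile-info contents matched by the parser below
 * (optional file, typically produced via HWLOC_DUMP_NOFILE_INFO; values are
 * examples only):
 *   OSName: Linux
 *   OSRelease: 4.9.0
 *   Architecture: x86_64
 *   FallbackNbProcessors: 4
 *   PageSize: 4096
 */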
4162 static void
4163 hwloc_gather_system_info(struct hwloc_topology *topology,
4164 struct hwloc_linux_backend_data_s *data)
4165 {
4166 FILE *file;
4167 char line[128]; /* enough for utsname fields */
4168 const char *env;
4169
4170 /* initialize to something sane, in case !is_thissystem and we can't find things in /proc/hwloc-nofile-info */
4171 memset(&data->utsname, 0, sizeof(data->utsname));
4172 data->fallback_nbprocessors = 1;
4173 data->pagesize = 4096;
4174
4175 /* read thissystem info */
4176 if (topology->is_thissystem) {
4177 uname(&data->utsname);
4178 data->fallback_nbprocessors = hwloc_fallback_nbprocessors(topology);
4179 data->pagesize = hwloc_getpagesize();
4180 }
4181
4182 /* overwrite with optional /proc/hwloc-nofile-info */
4183 file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
4184 if (file) {
4185 while (fgets(line, sizeof(line), file)) {
4186 char *tmp = strchr(line, '\n');
4187 if (!strncmp("OSName: ", line, 8)) {
4188 if (tmp)
4189 *tmp = '\0';
4190 strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
4191 data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
4192 } else if (!strncmp("OSRelease: ", line, 11)) {
4193 if (tmp)
4194 *tmp = '\0';
4195 strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
4196 data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
4197 } else if (!strncmp("OSVersion: ", line, 11)) {
4198 if (tmp)
4199 *tmp = '\0';
4200 strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
4201 data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
4202 } else if (!strncmp("HostName: ", line, 10)) {
4203 if (tmp)
4204 *tmp = '\0';
4205 strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
4206 data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
4207 } else if (!strncmp("Architecture: ", line, 14)) {
4208 if (tmp)
4209 *tmp = '\0';
4210 strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
4211 data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
4212 } else if (!strncmp("FallbackNbProcessors: ", line, 22)) {
4213 if (tmp)
4214 *tmp = '\0';
4215 data->fallback_nbprocessors = atoi(line+22);
4216 } else if (!strncmp("PageSize: ", line, 10)) {
4217 if (tmp)
4218 *tmp = '\0';
4219 data->pagesize = strtoull(line+10, NULL, 10);
4220 } else {
4221 hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
4222 /* ignored */
4223 }
4224 }
4225 fclose(file);
4226 }
4227
4228 env = getenv("HWLOC_DUMP_NOFILE_INFO");
4229 if (env && *env) {
4230 file = fopen(env, "w");
4231 if (file) {
4232 if (*data->utsname.sysname)
4233 fprintf(file, "OSName: %s\n", data->utsname.sysname);
4234 if (*data->utsname.release)
4235 fprintf(file, "OSRelease: %s\n", data->utsname.release);
4236 if (*data->utsname.version)
4237 fprintf(file, "OSVersion: %s\n", data->utsname.version);
4238 if (*data->utsname.nodename)
4239 fprintf(file, "HostName: %s\n", data->utsname.nodename);
4240 if (*data->utsname.machine)
4241 fprintf(file, "Architecture: %s\n", data->utsname.machine);
4242 fprintf(file, "FallbackNbProcessors: %u\n", data->fallback_nbprocessors);
4243 fprintf(file, "PageSize: %llu\n", (unsigned long long) data->pagesize);
4244 fclose(file);
4245 }
4246 }
4247
4248 /* detect arch for quirks, using configure #defines if possible, or uname */
4249 #if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
4250 if (topology->is_thissystem)
4251 data->arch = HWLOC_LINUX_ARCH_X86;
4252 #endif
4253 if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
4254 if (!strcmp(data->utsname.machine, "x86_64")
4255 || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
4256 || !strcmp(data->utsname.machine, "k1om"))
4257 data->arch = HWLOC_LINUX_ARCH_X86;
4258 else if (!strncmp(data->utsname.machine, "arm", 3))
4259 data->arch = HWLOC_LINUX_ARCH_ARM;
4260 else if (!strncmp(data->utsname.machine, "ppc", 3)
4261 || !strncmp(data->utsname.machine, "power", 5))
4262 data->arch = HWLOC_LINUX_ARCH_POWER;
4263 else if (!strcmp(data->utsname.machine, "ia64"))
4264 data->arch = HWLOC_LINUX_ARCH_IA64;
4265 }
4266 }
4267
4268 /* returns 0 on success, -1 on non-match or error during hardwired load */
4269 static int
4270 hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend *backend)
4271 {
4272 struct hwloc_topology *topology = backend->topology;
4273 struct hwloc_linux_backend_data_s *data = backend->private_data;
4274
4275 if (getenv("HWLOC_NO_HARDWIRED_TOPOLOGY"))
4276 return -1;
4277
4278 if (!strcmp(data->utsname.machine, "s64fx")) {
4279 char line[128];
4280 /* Fujitsu K-computer, FX10, and FX100 use specific processors
4281 * whose Linux topology support is broken until 4.1 (acc455cffa75070d55e74fc7802b49edbc080e92)
4282 * and existing machines will likely never be fixed by kernel upgrade.
4283 */
4284
4285 /* /proc/cpuinfo starts with one of these lines:
4286 * "cpu : Fujitsu SPARC64 VIIIfx"
4287 * "cpu : Fujitsu SPARC64 XIfx"
4288 * "cpu : Fujitsu SPARC64 IXfx"
4289 */
4290 if (hwloc_read_path_by_length("/proc/cpuinfo", line, sizeof(line), data->root_fd) < 0)
4291 return -1;
4292
4293 if (strncmp(line, "cpu ", 4))
4294 return -1;
4295
4296 if (strstr(line, "Fujitsu SPARC64 VIIIfx"))
4297 return hwloc_look_hardwired_fujitsu_k(topology);
4298 else if (strstr(line, "Fujitsu SPARC64 IXfx"))
4299 return hwloc_look_hardwired_fujitsu_fx10(topology);
4300 else if (strstr(line, "FUJITSU SPARC64 XIfx"))
4301 return hwloc_look_hardwired_fujitsu_fx100(topology);
4302 }
4303 return -1;
4304 }
4305
4306 static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep)
4307 {
4308 char *cpuset_mntpnt, *cgroup_mntpnt, *cpuset_name = NULL;
4309 hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, root_path);
4310 if (cgroup_mntpnt || cpuset_mntpnt) {
4311 cpuset_name = hwloc_read_linux_cpuset_name(root_fd, topology->pid);
4312 if (cpuset_name) {
4313 hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->levels[0][0]->allowed_cpuset);
4314 hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->levels[0][0]->allowed_nodeset);
4315 }
4316 free(cgroup_mntpnt);
4317 free(cpuset_mntpnt);
4318 }
4319 *cpuset_namep = cpuset_name;
4320 }
4321
4322 static int
4323 hwloc_look_linuxfs(struct hwloc_backend *backend)
4324 {
4325 struct hwloc_topology *topology = backend->topology;
4326 struct hwloc_linux_backend_data_s *data = backend->private_data;
4327 DIR *nodes_dir;
4328 unsigned nbnodes;
4329 char *cpuset_name;
4330 struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
4331 struct hwloc_obj_info_s *global_infos = NULL;
4332 unsigned global_infos_count = 0;
4333 int numprocs;
4334 int already_pus;
4335 int err;
4336
4337 already_pus = (topology->levels[0][0]->complete_cpuset != NULL
4338 && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_cpuset));
4339 /* if there are PUs, still look at memory information
4340 * since the x86 backend misses NUMA node information (unless the processor supports topoext)
4341 * as well as memory sizes.
4342 */
4343
4344 /* allocate root sets in case not done yet */
4345 hwloc_alloc_obj_cpusets(topology->levels[0][0]);
4346
4347 /*********************************
4348 * Platform information for later
4349 */
4350 hwloc_gather_system_info(topology, data);
4351
4352 /**********************
4353 * /proc/cpuinfo
4354 */
4355 numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
4356 if (numprocs < 0)
4357 numprocs = 0;
4358
4359 /**************************
4360 * detect model for quirks
4361 */
4362 if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
4363 unsigned i;
4364 const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
4365 for(i=0; i<Lprocs[0].infos_count; i++) {
4366 if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
4367 cpuvendor = Lprocs[0].infos[i].value;
4368 } else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
4369 cpufamilynumber = Lprocs[0].infos[i].value;
4370 } else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
4371 cpumodelnumber = Lprocs[0].infos[i].value;
4372 }
4373 }
4374 if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
4375 && cpufamilynumber && !strcmp(cpufamilynumber, "6")
4376 && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
4377 || !strcmp(cpumodelnumber, "133")))
4378 data->is_knl = 1;
4379 if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
4380 && cpufamilynumber
4381 && (!strcmp(cpufamilynumber, "21")
4382 || !strcmp(cpufamilynumber, "22")))
4383 data->is_amd_with_CU = 1;
4384 }
4385
4386 /**********************
4387 * Gather the list of admin-disabled cpus and mems
4388 */
4389 hwloc_linux__get_allowed_resources(topology, data->root_path, data->root_fd, &cpuset_name);
4390
4391 nodes_dir = hwloc_opendir("/proc/nodes", data->root_fd);
4392 if (nodes_dir) {
4393 /* Kerrighed */
4394 struct dirent *dirent;
4395 char path[128];
4396 hwloc_obj_t machine;
4397 hwloc_bitmap_t machine_online_set;
4398
4399 if (already_pus) {
4400 /* we don't support extending kerrighed topologies */
4401 free(cpuset_name);
4402 hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
4403 return 0;
4404 }
4405
4406 /* replace top-level object type with SYSTEM and add some MACHINE underneath */
4407
4408 topology->levels[0][0]->type = HWLOC_OBJ_SYSTEM;
4409 topology->levels[0][0]->name = strdup("Kerrighed");
4410
4411 /* No cpuset support for now. */
4412 /* No sys support for now. */
4413 while ((dirent = readdir(nodes_dir)) != NULL) {
4414 struct hwloc_linux_cpuinfo_proc * machine_Lprocs = NULL;
4415 struct hwloc_obj_info_s *machine_global_infos = NULL;
4416 unsigned machine_global_infos_count = 0;
4417 int machine_numprocs = 0;
4418 unsigned long node;
4419 if (strncmp(dirent->d_name, "node", 4))
4420 continue;
4421 machine_online_set = hwloc_bitmap_alloc();
4422 node = strtoul(dirent->d_name+4, NULL, 0);
4423 snprintf(path, sizeof(path), "/proc/nodes/node%lu/cpuinfo", node);
4424 machine_numprocs = hwloc_linux_parse_cpuinfo(data, path, &machine_Lprocs, &machine_global_infos, &machine_global_infos_count);
4425 if (machine_numprocs < 0) {
4426 err = -1;
4427 machine_numprocs = 0;
4428 } else {
4429 err = look_cpuinfo(topology, machine_Lprocs, machine_numprocs, machine_online_set);
4430 }
4431
4432 hwloc_linux_free_cpuinfo(machine_Lprocs, machine_numprocs, machine_global_infos, machine_global_infos_count);
4433 if (err < 0) {
4434 hwloc_bitmap_free(machine_online_set);
4435 continue;
4436 }
4437 hwloc_bitmap_or(topology->levels[0][0]->online_cpuset, topology->levels[0][0]->online_cpuset, machine_online_set);
4438 machine = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, node);
4439 machine->cpuset = machine_online_set;
4440 hwloc_debug_1arg_bitmap("machine number %lu has cpuset %s\n",
4441 node, machine_online_set);
4442
4443 /* Get the machine memory attributes */
4444 hwloc_get_kerrighed_node_meminfo_info(topology, data, node, &machine->memory);
4445
4446 /* Gather DMI info */
4447 /* FIXME: get the right DMI info of each machine */
4448 hwloc__get_dmi_id_info(data, machine);
4449
4450 hwloc_insert_object_by_cpuset(topology, machine);
4451 }
4452 closedir(nodes_dir);
4453 } else {
4454 /*********************
4455 * Memory information
4456 */
4457
4458 /* Get the machine memory attributes */
4459 hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);
4460
4461 /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
4462 if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &nbnodes) < 0)
4463 look_sysfsnode(topology, data, "/sys/devices/system/node", &nbnodes);
4464
4465 /* if we found some numa nodes, the machine object has no local memory */
4466 if (nbnodes) {
4467 unsigned i;
4468 topology->levels[0][0]->memory.local_memory = 0;
4469 if (topology->levels[0][0]->memory.page_types)
4470 for(i=0; i<topology->levels[0][0]->memory.page_types_len; i++)
4471 topology->levels[0][0]->memory.page_types[i].count = 0;
4472 }
4473
4474 /**********************
4475 * CPU information
4476 */
4477
4478 /* Don't rediscover CPU resources if already done */
4479 if (already_pus)
4480 goto done;
4481
4482 /* Gather the list of cpus now */
4483 err = hwloc_linux_try_hardwired_cpuinfo(backend);
4484 if (!err)
4485 goto done;
4486
4487 /* setup root info */
4488 hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
4489 &global_infos, &global_infos_count);
4490
4491 if (getenv("HWLOC_LINUX_USE_CPUINFO")
4492 || (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
4493 && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
4494 && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
4495 && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0)) {
4496 /* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
4497 * or not containing anything interesting */
4498 if (numprocs > 0)
4499 err = look_cpuinfo(topology, Lprocs, numprocs, topology->levels[0][0]->online_cpuset);
4500 else
4501 err = -1;
4502 if (err < 0)
4503 hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
4504 look_powerpc_device_tree(topology, data);
4505
4506 } else {
4507 /* sysfs */
4508 if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0)
4509 if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0)
4510 /* sysfs but we failed to read cpu topology, fallback */
4511 hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
4512 }
4513
4514 done:
4515
4516 /**********************
4517 * Misc
4518 */
4519
4520 /* Gather DMI info */
4521 hwloc__get_dmi_id_info(data, topology->levels[0][0]);
4522 if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
4523 hwloc__get_firmware_dmi_memory_info(topology, data);
4524 }
4525
4526 hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
4527 if (cpuset_name) {
4528 hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
4529 free(cpuset_name);
4530 }
4531
4532 hwloc__linux_get_mic_sn(topology, data);
4533
4534 /* data->utsname was filled with real uname or \0, we can safely pass it */
4535 hwloc_add_uname_info(topology, &data->utsname);
4536
4537 hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
4538 return 1;
4539 }
4540
4541
4542
4543 /****************************************
4544 ***** Linux PCI backend callbacks ******
4545 ****************************************
4546 * Do not support changing the fsroot (use sysfs)
4547 */
4548
4549 static hwloc_obj_t
4550 hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
4551 {
4552 struct hwloc_topology *topology = backend->topology;
4553 struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
4554 obj->name = strdup(name);
4555 obj->logical_index = -1;
4556 obj->attr->osdev.type = type;
4557
4558 hwloc_insert_object_by_parent(topology, pcidev, obj);
4559 /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
4560
4561 return obj;
4562 }
4563
4564 typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend, struct hwloc_obj *osdev, const char *osdevpath);
4565
4566 /* cannot be used in fsroot-aware code, would have to move to a per-topology variable */
4567
4568 static void
4569 hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
4570 {
4571 int root_fd = data->root_fd;
4572 DIR *dir;
4573 struct dirent *dirent;
4574 char path[128];
4575 struct stat st;
4576
4577 data->deprecated_classlinks_model = -1;
4578
4579 dir = hwloc_opendir("/sys/class/net", root_fd);
4580 if (!dir)
4581 return;
4582 while ((dirent = readdir(dir)) != NULL) {
4583 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, "..") || !strcmp(dirent->d_name, "lo"))
4584 continue;
4585 snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", dirent->d_name, dirent->d_name);
4586 if (hwloc_stat(path, &st, root_fd) == 0) {
4587 data->deprecated_classlinks_model = 0;
4588 goto out;
4589 }
4590 snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", dirent->d_name, dirent->d_name);
4591 if (hwloc_stat(path, &st, root_fd) == 0) {
4592 data->deprecated_classlinks_model = 1;
4593 goto out;
4594 }
4595 }
4596 out:
4597 closedir(dir);
4598 }
4599
4600 /* class objects that are immediately below pci devices:
4601 * look for objects of the given classname below a sysfs (pcidev) directory
4602 */
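/* Example (illustrative names): with the modern sysfs model a NIC appears as
 * <pcidev>/net/eth0, while the deprecated model exposes <pcidev>/net:eth0.
 * Both layouts are handled below and the detected model is cached in
 * data->deprecated_classlinks_model.
 */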
4603 static int
4604 hwloc_linux_class_readdir(struct hwloc_backend *backend,
4605 struct hwloc_obj *pcidev, const char *devicepath,
4606 hwloc_obj_osdev_type_t type, const char *classname,
4607 hwloc_linux_class_fillinfos_t fillinfo)
4608 {
4609 struct hwloc_linux_backend_data_s *data = backend->private_data;
4610 int root_fd = data->root_fd;
4611 size_t classnamelen = strlen(classname);
4612 char path[256];
4613 DIR *dir;
4614 struct dirent *dirent;
4615 hwloc_obj_t obj;
4616 int res = 0, err;
4617
4618 if (data->deprecated_classlinks_model == -2)
4619 hwloc_linux_check_deprecated_classlinks_model(data);
4620
4621 if (data->deprecated_classlinks_model != 1) {
4622 /* modern sysfs: <device>/<class>/<name> */
4623 struct stat st;
4624 snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
4625
4626 * some very old kernels (2.6.9/RHEL4) have a <device>/<class> symlink without any way to find <name>.
4627 * make sure <device>/<class> is a directory to avoid this case.
4628 */
4629 err = hwloc_lstat(path, &st, root_fd);
4630 if (err < 0 || !S_ISDIR(st.st_mode))
4631 goto trydeprecated;
4632
4633 dir = hwloc_opendir(path, root_fd);
4634 if (dir) {
4635 data->deprecated_classlinks_model = 0;
4636 while ((dirent = readdir(dir)) != NULL) {
4637 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
4638 continue;
4639 obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name);
4640 if (fillinfo) {
4641 snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, dirent->d_name);
4642 fillinfo(backend, obj, path);
4643 }
4644 res++;
4645 }
4646 closedir(dir);
4647 return res;
4648 }
4649 }
4650
4651 trydeprecated:
4652 if (data->deprecated_classlinks_model != 0) {
4653 /* deprecated sysfs: <device>/<class>:<name> */
4654 dir = hwloc_opendir(devicepath, root_fd);
4655 if (dir) {
4656 while ((dirent = readdir(dir)) != NULL) {
4657 if (strncmp(dirent->d_name, classname, classnamelen) || dirent->d_name[classnamelen] != ':')
4658 continue;
4659 data->deprecated_classlinks_model = 1;
4660 obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name + classnamelen+1);
4661 if (fillinfo) {
4662 snprintf(path, sizeof(path), "%s/%s", devicepath, dirent->d_name);
4663 fillinfo(backend, obj, path);
4664 }
4665 res++;
4666 }
4667 closedir(dir);
4668 return res;
4669 }
4670 }
4671
4672 return 0;
4673 }
4674
4675 /*
4676 * look for net objects below a pcidev in sysfs
4677 */
4678 static void
4679 hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
4680 struct hwloc_obj *obj, const char *osdevpath)
4681 {
4682 struct hwloc_linux_backend_data_s *data = backend->private_data;
4683 int root_fd = data->root_fd;
4684 struct stat st;
4685 char path[256];
4686 char address[128];
4687 snprintf(path, sizeof(path), "%s/address", osdevpath);
4688 if (!hwloc_read_path_by_length(path, address, sizeof(address), root_fd)) {
4689 char *eol = strchr(address, '\n');
4690 if (eol)
4691 *eol = 0;
4692 hwloc_obj_add_info(obj, "Address", address);
4693 }
4694 snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
4695 if (!hwloc_stat(path, &st, root_fd)) {
4696 char hexid[16];
4697 snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
4698 if (!hwloc_read_path_by_length(path, hexid, sizeof(hexid), root_fd)) {
4699 char *eoid;
4700 unsigned long port;
4701 port = strtoul(hexid, &eoid, 0);
4702 if (eoid != hexid) {
4703 char portstr[16];
4704 snprintf(portstr, sizeof(portstr), "%ld", port+1);
4705 hwloc_obj_add_info(obj, "Port", portstr);
4706 }
4707 }
4708 }
4709 }
4710
4711 static int
4712 hwloc_linux_lookup_net_class(struct hwloc_backend *backend,
4713 struct hwloc_obj *pcidev, const char *pcidevpath)
4714 {
4715 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_NETWORK, "net", hwloc_linux_net_class_fillinfos);
4716 }
4717
4718 /*
4719 * look for infiniband objects below a pcidev in sysfs
4720 */
4721 static void
4722 hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
4723 struct hwloc_obj *obj, const char *osdevpath)
4724 {
4725 struct hwloc_linux_backend_data_s *data = backend->private_data;
4726 int root_fd = data->root_fd;
4727 char path[256];
4728 char guidvalue[20];
4729 unsigned i,j;
4730
4731 snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
4732 if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
4733 size_t len;
4734 len = strspn(guidvalue, "0123456789abcdefx:");
4735 guidvalue[len] = '\0';
4736 hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
4737 }
4738
4739 snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
4740 if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
4741 size_t len;
4742 len = strspn(guidvalue, "0123456789abcdefx:");
4743 guidvalue[len] = '\0';
4744 hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
4745 }
4746
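/* Per-port loop below: for each existing port i (illustratively port 1), it may
 * add Port1State, Port1LID, Port1LMC and Port1GID<j> info pairs, stopping at the
 * first missing ports/<i>/state attribute.
 */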
4747 for(i=1; ; i++) {
4748 char statevalue[2];
4749 char lidvalue[11];
4750 char gidvalue[40];
4751
4752 snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
4753 if (!hwloc_read_path_by_length(path, statevalue, sizeof(statevalue), root_fd)) {
4754 char statename[32];
4755 statevalue[1] = '\0'; /* only keep the first byte/digit */
4756 snprintf(statename, sizeof(statename), "Port%uState", i);
4757 hwloc_obj_add_info(obj, statename, statevalue);
4758 } else {
4759 /* no such port */
4760 break;
4761 }
4762
4763 snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
4764 if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
4765 char lidname[32];
4766 size_t len;
4767 len = strspn(lidvalue, "0123456789abcdefx");
4768 lidvalue[len] = '\0';
4769 snprintf(lidname, sizeof(lidname), "Port%uLID", i);
4770 hwloc_obj_add_info(obj, lidname, lidvalue);
4771 }
4772
4773 snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
4774 if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
4775 char lidname[32];
4776 size_t len;
4777 len = strspn(lidvalue, "0123456789");
4778 lidvalue[len] = '\0';
4779 snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
4780 hwloc_obj_add_info(obj, lidname, lidvalue);
4781 }
4782
4783 for(j=0; ; j++) {
4784 snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
4785 if (!hwloc_read_path_by_length(path, gidvalue, sizeof(gidvalue), root_fd)) {
4786 char gidname[32];
4787 size_t len;
4788 len = strspn(gidvalue, "0123456789abcdefx:");
4789 gidvalue[len] = '\0';
4790 if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
4791 /* only keep initialized GIDs */
4792 snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
4793 hwloc_obj_add_info(obj, gidname, gidvalue);
4794 }
4795 } else {
4796 /* no such port */
4797 break;
4798 }
4799 }
4800 }
4801 }
4802
4803 static int
4804 hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend,
4805 struct hwloc_obj *pcidev, const char *pcidevpath)
4806 {
4807 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband", hwloc_linux_infiniband_class_fillinfos);
4808 }
4809
4810 /* look for dma objects below a pcidev in sysfs */
4811 static int
4812 hwloc_linux_lookup_dma_class(struct hwloc_backend *backend,
4813 struct hwloc_obj *pcidev, const char *pcidevpath)
4814 {
4815 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
4816 }
4817
4818 /* look for drm objects below a pcidev in sysfs */
4819 static int
4820 hwloc_linux_lookup_drm_class(struct hwloc_backend *backend,
4821 struct hwloc_obj *pcidev, const char *pcidevpath)
4822 {
4823 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
4824
4825 /* we could look at the "graphics" class too, but it doesn't help for proprietary drivers either */
4826
4827 /* GPU devices (even with a proprietary driver) seem to have a boot_vga field in their PCI device directory (since 2.6.30),
4828 * so we could create an OS device for each PCI device with such a field.
4829 * boot_vga is actually created when class >> 8 == VGA (it contains 1 for boot vga device), so it's trivial anyway.
4830 */
4831 }
4832
4833 /*
4834 * look for block objects below a pcidev in sysfs
4835 */
4836
4837 static void
4838 hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
4839 struct hwloc_obj *obj, const char *osdevpath)
4840 {
4841 struct hwloc_linux_backend_data_s *data = backend->private_data;
4842 int root_fd = data->root_fd;
4843 FILE *file;
4844 char path[256];
4845 char line[128];
4846 char vendor[64] = "";
4847 char model[64] = "";
4848 char serial[64] = "";
4849 char revision[64] = "";
4850 char blocktype[64] = "";
4851 unsigned major_id, minor_id;
4852 char *tmp;
4853
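  /* The "dev" attribute contains the major:minor device number; it is parsed below and also exported verbatim as LinuxDeviceID. */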
4854 snprintf(path, sizeof(path), "%s/dev", osdevpath);
4855 if (hwloc_read_path_by_length(path, line, sizeof(line), root_fd) < 0)
4856 return;
4857
4858 if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
4859 return;
4860 tmp = strchr(line, '\n');
4861 if (tmp)
4862 *tmp = '\0';
4863 hwloc_obj_add_info(obj, "LinuxDeviceID", line);
4864
4865 #ifdef HWLOC_HAVE_LIBUDEV
4866 if (data->udev) {
4867 struct udev_device *dev;
4868 const char *prop;
4869 dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
4870 if (!dev)
4871 return;
4872 prop = udev_device_get_property_value(dev, "ID_VENDOR");
4873 if (prop) {
4874 strncpy(vendor, prop, sizeof(vendor));
4875 vendor[sizeof(vendor)-1] = '\0';
4876 }
4877 prop = udev_device_get_property_value(dev, "ID_MODEL");
4878 if (prop) {
4879 strncpy(model, prop, sizeof(model));
4880 model[sizeof(model)-1] = '\0';
4881 }
4882 prop = udev_device_get_property_value(dev, "ID_REVISION");
4883 if (prop) {
4884 strncpy(revision, prop, sizeof(revision));
4885 revision[sizeof(revision)-1] = '\0';
4886 }
4887 prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
4888 if (prop) {
4889 strncpy(serial, prop, sizeof(serial));
4890 serial[sizeof(serial)-1] = '\0';
4891 }
4892 prop = udev_device_get_property_value(dev, "ID_TYPE");
4893 if (prop) {
4894 strncpy(blocktype, prop, sizeof(blocktype));
4895 blocktype[sizeof(blocktype)-1] = '\0';
4896 }
4897
4898 udev_device_unref(dev);
4899 } else
4900 /* fallback to reading files, works with any fsroot */
4901 #endif
4902 {
4903 snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
4904 file = hwloc_fopen(path, "r", root_fd);
4905 if (!file)
4906 return;
4907
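    /* The udev database file contains one property per line (e.g. "E:ID_MODEL=..."); extract the properties we care about. */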
4908 while (NULL != fgets(line, sizeof(line), file)) {
4909 tmp = strchr(line, '\n');
4910 if (tmp)
4911 *tmp = '\0';
4912 if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
4913 strncpy(vendor, line+strlen("E:ID_VENDOR="), sizeof(vendor));
4914 vendor[sizeof(vendor)-1] = '\0';
4915 } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
4916 strncpy(model, line+strlen("E:ID_MODEL="), sizeof(model));
4917 model[sizeof(model)-1] = '\0';
4918 } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
4919 strncpy(revision, line+strlen("E:ID_REVISION="), sizeof(revision));
4920 revision[sizeof(revision)-1] = '\0';
4921 } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
4922 strncpy(serial, line+strlen("E:ID_SERIAL_SHORT="), sizeof(serial));
4923 serial[sizeof(serial)-1] = '\0';
4924 } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
4925 strncpy(blocktype, line+strlen("E:ID_TYPE="), sizeof(blocktype));
4926 blocktype[sizeof(blocktype)-1] = '\0';
4927 }
4928 }
4929 fclose(file);
4930 }
4931
4932 /* clear fake "ATA" vendor name */
4933 if (!strcasecmp(vendor, "ATA"))
4934 *vendor = '\0';
4935 /* overwrite vendor name from model when possible */
4936 if (!*vendor) {
4937 if (!strncasecmp(model, "wd", 2))
4938 strcpy(vendor, "Western Digital");
4939 else if (!strncasecmp(model, "st", 2))
4940 strcpy(vendor, "Seagate");
4941 else if (!strncasecmp(model, "samsung", 7))
4942 strcpy(vendor, "Samsung");
4943 else if (!strncasecmp(model, "sandisk", 7))
4944 strcpy(vendor, "SanDisk");
4945 else if (!strncasecmp(model, "toshiba", 7))
4946 strcpy(vendor, "Toshiba");
4947 }
4948
4949 if (*vendor)
4950 hwloc_obj_add_info(obj, "Vendor", vendor);
4951 if (*model)
4952 hwloc_obj_add_info(obj, "Model", model);
4953 if (*revision)
4954 hwloc_obj_add_info(obj, "Revision", revision);
4955 if (*serial)
4956 hwloc_obj_add_info(obj, "SerialNumber", serial);
4957
4958 if (!strcmp(blocktype, "disk"))
4959 hwloc_obj_add_info(obj, "Type", "Disk");
4960 else if (!strcmp(blocktype, "tape"))
4961 hwloc_obj_add_info(obj, "Type", "Tape");
4962 else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
4963 hwloc_obj_add_info(obj, "Type", "Removable Media Device");
4964 else /* generic, usb mass storage/rbc, usb mass storage/scsi */
4965 hwloc_obj_add_info(obj, "Type", "Other");
4966 }
4967
4968 /* block class objects are in
4969 * host%d/target%d:%d:%d/%d:%d:%d:%d/
4970 * or
4971 * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
4972 * or
4973 * ide%d/%d.%d/
4974 * below pci devices */
4975 static int
4976 hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
4977 struct hwloc_obj *pcidev, char *path, size_t pathlen)
4978 {
4979 struct hwloc_linux_backend_data_s *data = backend->private_data;
4980 int root_fd = data->root_fd;
4981 DIR *hostdir, *portdir, *targetdir;
4982 struct dirent *hostdirent, *portdirent, *targetdirent;
4983 size_t hostdlen, portdlen, targetdlen;
4984 int dummy;
4985 int res = 0;
4986
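  /* Walk the host%d directory: either recurse through SAS port-%d:%d/end_device-%d:%d levels, or descend into target%d:%d:%d/%d:%d:%d:%d where the actual block class lives. */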
4987 hostdir = hwloc_opendir(path, root_fd);
4988 if (!hostdir)
4989 return 0;
4990
4991 while ((hostdirent = readdir(hostdir)) != NULL) {
4992 if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2)
4993 {
4994 /* found host%d/port-%d:%d */
4995 path[pathlen] = '/';
4996 strcpy(&path[pathlen+1], hostdirent->d_name);
4997 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
4998 portdir = hwloc_opendir(path, root_fd);
4999 if (!portdir)
5000 continue;
5001 while ((portdirent = readdir(portdir)) != NULL) {
5002 if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
5003 /* found host%d/port-%d:%d/end_device-%d:%d */
5004 path[pathlen] = '/';
5005 strcpy(&path[pathlen+1], portdirent->d_name);
5006 pathlen += portdlen = 1+strlen(portdirent->d_name);
5007 res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5008 /* restore parent path */
5009 pathlen -= portdlen;
5010 path[pathlen] = '\0';
5011 }
5012 }
5013 closedir(portdir);
5014 /* restore parent path */
5015 pathlen -= hostdlen;
5016 path[pathlen] = '\0';
5017 continue;
5018 } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
5019 /* found host%d/target%d:%d:%d */
5020 path[pathlen] = '/';
5021 strcpy(&path[pathlen+1], hostdirent->d_name);
5022 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5023 targetdir = hwloc_opendir(path, root_fd);
5024 if (!targetdir)
5025 continue;
5026 while ((targetdirent = readdir(targetdir)) != NULL) {
5027 if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
5028 continue;
5029 /* found host%d/target%d:%d:%d/%d:%d:%d:%d */
5030 path[pathlen] = '/';
5031 strcpy(&path[pathlen+1], targetdirent->d_name);
5032 pathlen += targetdlen = 1+strlen(targetdirent->d_name);
5033 /* lookup block class for real */
5034 res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
5035 /* restore parent path */
5036 pathlen -= targetdlen;
5037 path[pathlen] = '\0';
5038 }
5039 closedir(targetdir);
5040 /* restore parent path */
5041 pathlen -= hostdlen;
5042 path[pathlen] = '\0';
5043 }
5044 }
5045 closedir(hostdir);
5046
5047 return res;
5048 }
5049
5050 static int
5051 hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
5052 struct hwloc_obj *pcidev, const char *pcidevpath)
5053 {
5054 struct hwloc_linux_backend_data_s *data = backend->private_data;
5055 int root_fd = data->root_fd;
5056 size_t pathlen;
5057 DIR *devicedir, *hostdir;
5058 struct dirent *devicedirent, *hostdirent;
5059 size_t devicedlen, hostdlen;
5060 char path[256];
5061 int dummy;
5062 int res = 0;
5063
5064 strcpy(path, pcidevpath);
5065 pathlen = strlen(path);
5066
5067 /* look for a direct block device here (such as NVMe, something without controller subdirs in the middle) */
5068 res += hwloc_linux_class_readdir(backend, pcidev, path,
5069 HWLOC_OBJ_OSDEV_BLOCK, "block",
5070 hwloc_linux_block_class_fillinfos);
5071 if (res)
5072 return res;
5073 /* otherwise try to find controller subdirectories */
5074
5075 devicedir = hwloc_opendir(pcidevpath, root_fd);
5076 if (!devicedir)
5077 return 0;
5078
5079 while ((devicedirent = readdir(devicedir)) != NULL) {
5080 if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
5081 /* found ide%d */
5082 path[pathlen] = '/';
5083 strcpy(&path[pathlen+1], devicedirent->d_name);
5084 pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5085 hostdir = hwloc_opendir(path, root_fd);
5086 if (!hostdir)
5087 continue;
5088 while ((hostdirent = readdir(hostdir)) != NULL) {
5089 if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
5090 /* found ide%d/%d.%d */
5091 path[pathlen] = '/';
5092 strcpy(&path[pathlen+1], hostdirent->d_name);
5093 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5094 /* lookup block class for real */
5095 res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
5096 /* restore parent path */
5097 pathlen -= hostdlen;
5098 path[pathlen] = '\0';
5099 }
5100 }
5101 closedir(hostdir);
5102 /* restore parent path */
5103 pathlen -= devicedlen;
5104 path[pathlen] = '\0';
5105 } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
5106 /* found host%d */
5107 path[pathlen] = '/';
5108 strcpy(&path[pathlen+1], devicedirent->d_name);
5109 pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5110 res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5111 /* restore parent path */
5112 pathlen -= devicedlen;
5113 path[pathlen] = '\0';
5114 } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
5115 /* found ata%d */
5116 path[pathlen] = '/';
5117 strcpy(&path[pathlen+1], devicedirent->d_name);
5118 pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5119 hostdir = hwloc_opendir(path, root_fd);
5120 if (!hostdir)
5121 continue;
5122 while ((hostdirent = readdir(hostdir)) != NULL) {
5123 if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
5124 /* found ata%d/host%d */
5125 path[pathlen] = '/';
5126 strcpy(&path[pathlen+1], hostdirent->d_name);
5127 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5128 /* lookup block class for real */
5129 res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5130 /* restore parent path */
5131 pathlen -= hostdlen;
5132 path[pathlen] = '\0';
5133 }
5134 }
5135 closedir(hostdir);
5136 /* restore parent path */
5137 pathlen -= devicedlen;
5138 path[pathlen] = '\0';
5139 }
5140 }
5141 closedir(devicedir);
5142
5143 return res;
5144 }
5145
5146 static void
5147 hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
5148 struct hwloc_obj *obj, const char *osdevpath)
5149 {
5150 struct hwloc_linux_backend_data_s *data = backend->private_data;
5151 int root_fd = data->root_fd;
5152 char path[256];
5153 char family[64];
5154 char sku[64];
5155 char sn[64];
5156 char string[20];
5157
5158 hwloc_obj_add_info(obj, "CoProcType", "MIC");
5159
5160 snprintf(path, sizeof(path), "%s/family", osdevpath);
5161 if (!hwloc_read_path_by_length(path, family, sizeof(family), root_fd)) {
5162 char *eol = strchr(family, '\n');
5163 if (eol)
5164 *eol = 0;
5165 hwloc_obj_add_info(obj, "MICFamily", family);
5166 }
5167
5168 snprintf(path, sizeof(path), "%s/sku", osdevpath);
5169 if (!hwloc_read_path_by_length(path, sku, sizeof(sku), root_fd)) {
5170 char *eol = strchr(sku, '\n');
5171 if (eol)
5172 *eol = 0;
5173 hwloc_obj_add_info(obj, "MICSKU", sku);
5174 }
5175
5176 snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
5177 if (!hwloc_read_path_by_length(path, sn, sizeof(sn), root_fd)) {
5178 char *eol;
5179 eol = strchr(sn, '\n');
5180 if (eol)
5181 *eol = 0;
5182 hwloc_obj_add_info(obj, "MICSerialNumber", sn);
5183 }
5184
5185 snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
5186 if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5187 unsigned long count = strtoul(string, NULL, 16);
5188 snprintf(string, sizeof(string), "%lu", count);
5189 hwloc_obj_add_info(obj, "MICActiveCores", string);
5190 }
5191
5192 snprintf(path, sizeof(path), "%s/memsize", osdevpath);
5193 if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5194 unsigned long count = strtoul(string, NULL, 16);
5195 snprintf(string, sizeof(string), "%lu", count);
5196 hwloc_obj_add_info(obj, "MICMemorySize", string);
5197 }
5198 }
5199
5200 static int
5201 hwloc_linux_lookup_mic_class(struct hwloc_backend *backend,
5202 struct hwloc_obj *pcidev, const char *pcidevpath)
5203 {
5204 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_COPROC, "mic", hwloc_linux_mic_class_fillinfos);
5205 }
5206
5207 static int
5208 hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
5209 struct hwloc_obj *pcidev)
5210 {
5211 struct hwloc_linux_backend_data_s *data = backend->private_data;
5212 int root_fd = data->root_fd;
5213 char path[256];
5214 struct stat st;
5215 hwloc_obj_t obj;
5216 unsigned idx;
5217 int res = 0;
5218
5219 if (!data->mic_directlookup_id_max)
5220 /* already tried, nothing to do */
5221 return 0;
5222
5223 if (data->mic_directlookup_id_max == (unsigned) -1) {
5224 /* never tried, find out the max id */
5225 DIR *dir;
5226 struct dirent *dirent;
5227
5228 /* make sure we never do this lookup again */
5229 data->mic_directlookup_id_max = 0;
5230
5231 /* read the entire class and find the max id of mic%u dirents */
5232 dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
5233 if (!dir) {
5234 dir = hwloc_opendir("/sys/class/mic", root_fd);
5235 if (!dir)
5236 return 0;
5237 }
5238 while ((dirent = readdir(dir)) != NULL) {
5239 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5240 continue;
5241 if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
5242 continue;
5243 if (idx >= data->mic_directlookup_id_max)
5244 data->mic_directlookup_id_max = idx+1;
5245 }
5246 closedir(dir);
5247 }
5248
5249 /* now iterate over the mic ids and see if one matches our pcidev */
5250 for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
5251 snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
5252 idx, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
5253 if (hwloc_stat(path, &st, root_fd) < 0)
5254 continue;
5255 snprintf(path, sizeof(path), "mic%u", idx);
5256 obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
5257 snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
5258 hwloc_linux_mic_class_fillinfos(backend, obj, path);
5259 res++;
5260 }
5261
5262 return res;
5263 }
5264
5265 /*
5266 * backend callback for inserting objects inside a pci device
5267 */
5268 static int
5269 hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
5270 struct hwloc_obj *obj)
5271 {
5272 struct hwloc_linux_backend_data_s *data = backend->private_data;
5273 char pcidevpath[256];
5274 int res = 0;
5275
5276 /* this callback is only used in the libpci backend for now */
5277 assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
5278
5279 snprintf(pcidevpath, sizeof(pcidevpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
5280 obj->attr->pcidev.domain, obj->attr->pcidev.bus,
5281 obj->attr->pcidev.dev, obj->attr->pcidev.func);
5282
5283 res += hwloc_linux_lookup_net_class(backend, obj, pcidevpath);
5284 res += hwloc_linux_lookup_openfabrics_class(backend, obj, pcidevpath);
5285 res += hwloc_linux_lookup_dma_class(backend, obj, pcidevpath);
5286 res += hwloc_linux_lookup_drm_class(backend, obj, pcidevpath);
5287 res += hwloc_linux_lookup_block_class(backend, obj, pcidevpath);
5288
5289 if (data->mic_need_directlookup == -1) {
5290 struct stat st;
5291 if (hwloc_stat("/sys/class/mic/mic0", &st, data->root_fd) == 0
5292 && hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, data->root_fd) == -1)
5293 /* hwloc_linux_lookup_mic_class will fail because pcidev sysfs directories
5294 * do not have mic/mic%u symlinks to mic devices (old mic driver).
5295 * if so, try from the mic class.
5296 */
5297 data->mic_need_directlookup = 1;
5298 else
5299 data->mic_need_directlookup = 0;
5300 }
5301 if (data->mic_need_directlookup)
5302 res += hwloc_linux_directlookup_mic_class(backend, obj);
5303 else
5304 res += hwloc_linux_lookup_mic_class(backend, obj, pcidevpath);
5305
5306 return res;
5307 }
5308
5309 /*
5310 * backend callback for retrieving the location of a pci device
5311 */
5312 static int
5313 hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
5314 struct hwloc_backend *caller __hwloc_attribute_unused,
5315 struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
5316 {
5317 struct hwloc_linux_backend_data_s *data = backend->private_data;
5318 char path[256];
5319
5320 /* this callback is only used in the libpci backend for now */
5321 assert(obj->type == HWLOC_OBJ_PCI_DEVICE
5322 || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
5323
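  /* local_cpus may exist but contain an all-zero mask on some platforms; treat that the same as a read failure. */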
5324 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
5325 obj->attr->pcidev.domain, obj->attr->pcidev.bus,
5326 obj->attr->pcidev.dev, obj->attr->pcidev.func);
5327 if (!hwloc__read_path_as_cpumask(path, cpuset, data->root_fd)
5328 && !hwloc_bitmap_iszero(cpuset))
5329 return 0;
5330 return -1;
5331 }
5332
5333
5334
5335 /*******************************
5336 ******* Linux component *******
5337 *******************************/
5338
5339 static void
5340 hwloc_linux_backend_disable(struct hwloc_backend *backend)
5341 {
5342 struct hwloc_linux_backend_data_s *data = backend->private_data;
5343 #ifdef HAVE_OPENAT
5344 if (data->root_path)
5345 free(data->root_path);
5346 close(data->root_fd);
5347 #endif
5348 #ifdef HWLOC_HAVE_LIBUDEV
5349 if (data->udev)
5350 udev_unref(data->udev);
5351 #endif
5352 free(data);
5353 }
5354
5355 static struct hwloc_backend *
5356 hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
5357 const void *_data1,
5358 const void *_data2 __hwloc_attribute_unused,
5359 const void *_data3 __hwloc_attribute_unused)
5360 {
5361 struct hwloc_backend *backend;
5362 struct hwloc_linux_backend_data_s *data;
5363 const char * fsroot_path = _data1;
5364 int flags, root = -1;
5365
5366 backend = hwloc_backend_alloc(component);
5367 if (!backend)
5368 goto out;
5369
5370 data = malloc(sizeof(*data));
5371 if (!data) {
5372 errno = ENOMEM;
5373 goto out_with_backend;
5374 }
5375
5376 backend->private_data = data;
5377 backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
5378 backend->discover = hwloc_look_linuxfs;
5379 backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
5380 backend->notify_new_object = hwloc_linux_backend_notify_new_object;
5381 backend->disable = hwloc_linux_backend_disable;
5382
5383 /* default values */
5384 data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
5385 data->is_knl = 0;
5386 data->is_amd_with_CU = 0;
5387 data->is_real_fsroot = 1;
5388 data->root_path = NULL;
5389 if (!fsroot_path)
5390 fsroot_path = "/";
5391
5392 #ifdef HAVE_OPENAT
5393 root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
5394 if (root < 0)
5395 goto out_with_data;
5396
5397 if (strcmp(fsroot_path, "/")) {
5398 backend->is_thissystem = 0;
5399 data->is_real_fsroot = 0;
5400 data->root_path = strdup(fsroot_path);
5401 }
5402
5403 /* Since this fd stays open after hwloc returns, mark it as
5404 close-on-exec so that children don't inherit it. Stevens says
5405 that we should GETFD before we SETFD, so we do. */
5406 flags = fcntl(root, F_GETFD, 0);
5407 if (-1 == flags ||
5408 -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
5409 close(root);
5410 root = -1;
5411 goto out_with_data;
5412 }
5413 #else
5414 if (strcmp(fsroot_path, "/")) {
5415 errno = ENOSYS;
5416 goto out_with_data;
5417 }
5418 #endif
5419 data->root_fd = root;
5420
5421 #ifdef HWLOC_HAVE_LIBUDEV
5422 data->udev = NULL;
5423 if (data->is_real_fsroot) {
5424 data->udev = udev_new();
5425 }
5426 #endif
5427
5428 data->dumped_hwdata_dirname = getenv("HWLOC_DUMPED_HWDATA_DIR");
5429 if (!data->dumped_hwdata_dirname) {
5430 if (_data1)
5431 data->dumped_hwdata_dirname = "/var/run/hwloc";
5432 else
5433 data->dumped_hwdata_dirname = RUNSTATEDIR "/hwloc";
5434 }
5435
5436 data->deprecated_classlinks_model = -2; /* never tried */
5437 data->mic_need_directlookup = -1; /* not initialized */
5438 data->mic_directlookup_id_max = -1; /* not initialized */
5439
5440 return backend;
5441
5442 out_with_data:
5443 #ifdef HAVE_OPENAT
5444 if (data->root_path)
5445 free(data->root_path);
5446 #endif
5447 free(data);
5448 out_with_backend:
5449 free(backend);
5450 out:
5451 return NULL;
5452 }
5453
5454 static struct hwloc_disc_component hwloc_linux_disc_component = {
5455 HWLOC_DISC_COMPONENT_TYPE_CPU,
5456 "linux",
5457 HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
5458 hwloc_linux_component_instantiate,
5459 50,
5460 NULL
5461 };
5462
5463 const struct hwloc_component hwloc_linux_component = {
5464 HWLOC_COMPONENT_ABI,
5465 NULL, NULL,
5466 HWLOC_COMPONENT_TYPE_DISC,
5467 0,
5468 &hwloc_linux_disc_component
5469 };
5470
5471
5472
5473
5474 #ifdef HWLOC_HAVE_LINUXPCI
5475
5476 /***********************************
5477 ******* Linux PCI component *******
5478 ***********************************/
5479
5480 #define HWLOC_PCI_REVISION_ID 0x08
5481 #define HWLOC_PCI_CAP_ID_EXP 0x10
5482 #define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
5483
5484 static int
5485 hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
5486 {
5487 struct hwloc_topology *topology = backend->topology;
5488 struct hwloc_backend *tmpbackend;
5489 hwloc_obj_t first_obj = NULL, last_obj = NULL;
5490 int root_fd = -1;
5491 DIR *dir;
5492 struct dirent *dirent;
5493 int res = 0;
5494
5495 if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
5496 return 0;
5497
5498 if (hwloc_get_next_pcidev(topology, NULL)) {
5499 hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
5500 return 0;
5501 }
5502
5503 /* hackily find the linux backend to steal its fsroot */
5504 tmpbackend = topology->backends;
5505 while (tmpbackend) {
5506 if (tmpbackend->component == &hwloc_linux_disc_component) {
5507 root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
5508 hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
5509 break; }
5510 tmpbackend = tmpbackend->next;
5511 }
5512 /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
5513 if (root_fd >= 0)
5514 root_fd = dup(root_fd);
5515 else
5516 root_fd = open("/", O_RDONLY | O_DIRECTORY);
5517
5518 dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
5519 if (!dir)
5520 goto out_with_rootfd;
5521
5522 while ((dirent = readdir(dir)) != NULL) {
5523 unsigned domain, bus, dev, func;
5524 hwloc_obj_t obj;
5525 struct hwloc_pcidev_attr_s *attr;
5526 unsigned os_index;
5527 char path[64];
5528 char value[16];
5529 size_t ret;
5530 int fd;
5531
5532 if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
5533 continue;
5534
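    /* Encode the PCI location into a single OS index: (domain << 20) + (bus << 12) + (dev << 4) + func. */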
5535 os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
5536 obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
5537 if (!obj)
5538 break;
5539 attr = &obj->attr->pcidev;
5540
5541 attr->domain = domain;
5542 attr->bus = bus;
5543 attr->dev = dev;
5544 attr->func = func;
5545
5546 /* default (unknown) values */
5547 attr->vendor_id = 0;
5548 attr->device_id = 0;
5549 attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
5550 attr->revision = 0;
5551 attr->subvendor_id = 0;
5552 attr->subdevice_id = 0;
5553 attr->linkspeed = 0;
5554
5555 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
5556 if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5557 attr->vendor_id = strtoul(value, NULL, 16);
5558
5559 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
5560 if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5561 attr->device_id = strtoul(value, NULL, 16);
5562
5563 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
5564 if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5565 attr->class_id = strtoul(value, NULL, 16) >> 8;
5566
5567 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
5568 if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5569 attr->subvendor_id = strtoul(value, NULL, 16);
5570
5571 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
5572 if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5573 attr->subdevice_id = strtoul(value, NULL, 16);
5574
5575 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
5576 /* don't use hwloc_read_path_by_length() because we don't want the ending \0 */
5577 fd = hwloc_open(path, root_fd);
5578 if (fd >= 0) {
5579 #define CONFIG_SPACE_CACHESIZE 256
5580 unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
5581 unsigned offset;
5582
5583 /* initialize the config space in case we fail to read it (missing permissions, etc). */
5584 memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
5585 ret = read(fd, config_space_cache, CONFIG_SPACE_CACHESIZE);
5586 (void) ret; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
5587 close(fd);
5588
5589 /* is this a bridge? */
5590 if (hwloc_pci_prepare_bridge(obj, config_space_cache) < 0)
5591 continue;
5592
5593 /* get the revision */
5594 attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
5595
5596 /* try to get the link speed */
5597 offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
5598 if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE)
5599 hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
5600 }
5601
5602 if (first_obj)
5603 last_obj->next_sibling = obj;
5604 else
5605 first_obj = obj;
5606 last_obj = obj;
5607 }
5608
5609 closedir(dir);
5610
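  /* Annotate devices with their physical slot name: each /sys/bus/pci/slots/<slot>/address contains "domain:bus:dev". */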
5611 dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
5612 if (dir) {
5613 while ((dirent = readdir(dir)) != NULL) {
5614 char path[64];
5615 char buf[64];
5616 unsigned domain, bus, dev;
5617 if (dirent->d_name[0] == '.')
5618 continue;
5619 snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
5620 if (!hwloc_read_path_by_length(path, buf, sizeof(buf), root_fd)
5621 && sscanf(buf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
5622 hwloc_obj_t obj = first_obj;
5623 while (obj) {
5624 if (obj->attr->pcidev.domain == domain
5625 && obj->attr->pcidev.bus == bus
5626 && obj->attr->pcidev.dev == dev) {
5627 hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
5628 }
5629 obj = obj->next_sibling;
5630 }
5631 }
5632 }
5633 closedir(dir);
5634 }
5635
5636 res = hwloc_insert_pci_device_list(backend, first_obj);
5637
5638 out_with_rootfd:
5639 close(root_fd);
5640 return res;
5641 }
5642
5643 static struct hwloc_backend *
5644 hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
5645 const void *_data1 __hwloc_attribute_unused,
5646 const void *_data2 __hwloc_attribute_unused,
5647 const void *_data3 __hwloc_attribute_unused)
5648 {
5649 struct hwloc_backend *backend;
5650
5651 /* thissystem may not be fully initialized yet, we'll check flags in discover() */
5652
5653 backend = hwloc_backend_alloc(component);
5654 if (!backend)
5655 return NULL;
5656 backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
5657 backend->discover = hwloc_look_linuxfs_pci;
5658 return backend;
5659 }
5660
5661 static struct hwloc_disc_component hwloc_linuxpci_disc_component = {
5662 HWLOC_DISC_COMPONENT_TYPE_MISC,
5663 "linuxpci",
5664 HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
5665 hwloc_linuxpci_component_instantiate,
5666 19, /* after pci */
5667 NULL
5668 };
5669
5670 const struct hwloc_component hwloc_linuxpci_component = {
5671 HWLOC_COMPONENT_ABI,
5672 NULL, NULL,
5673 HWLOC_COMPONENT_TYPE_DISC,
5674 0,
5675 &hwloc_linuxpci_disc_component
5676 };
5677
5678 #endif /* HWLOC_HAVE_LINUXPCI */
5679