1 /*
2  * Copyright © 2009 CNRS
3  * Copyright © 2009-2017 Inria.  All rights reserved.
4  * Copyright © 2009-2013, 2015 Université Bordeaux
5  * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
6  * Copyright © 2015 Intel, Inc.  All rights reserved.
7  * Copyright © 2010 IBM
8  * See COPYING in top-level directory.
9  */
10 
11 #include <private/autogen/config.h>
12 #include <hwloc.h>
13 #include <hwloc/linux.h>
14 #include <private/misc.h>
15 #include <private/private.h>
16 #include <private/misc.h>
17 #include <private/debug.h>
18 
19 #include <limits.h>
20 #include <stdio.h>
21 #include <fcntl.h>
22 #include <errno.h>
23 #include <assert.h>
24 #ifdef HAVE_DIRENT_H
25 #include <dirent.h>
26 #endif
27 #ifdef HAVE_UNISTD_H
28 #include <unistd.h>
29 #endif
30 #ifdef HWLOC_HAVE_LIBUDEV
31 #include <libudev.h>
32 #endif
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <sched.h>
36 #include <pthread.h>
37 #include <sys/mman.h>
38 #include <sys/syscall.h>
39 #include <mntent.h>
40 #if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND || defined HWLOC_HAVE_MOVE_PAGES
41 #define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
42 #include <numaif.h>
43 #endif
44 
/* Private state of the Linux backend: where the (possibly alternate)
 * filesystem root lives, what was detected about the platform, and cached
 * results of one-time probes.
 */
struct hwloc_linux_backend_data_s {
  char *root_path; /* NULL if unused */
  int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
  int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
#ifdef HWLOC_HAVE_LIBUDEV
  struct udev *udev; /* Global udev context */
#endif
  char *dumped_hwdata_dirname; /* presumably the directory holding dumped hardware data -- TODO confirm against users */
  enum {
    HWLOC_LINUX_ARCH_X86, /* x86 32 or 64bits, including k1om (KNC) */
    HWLOC_LINUX_ARCH_IA64,
    HWLOC_LINUX_ARCH_ARM,
    HWLOC_LINUX_ARCH_POWER,
    HWLOC_LINUX_ARCH_UNKNOWN
  } arch;
  int is_knl; /* NOTE(review): looks like a boolean platform flag -- confirm where it is set */
  int is_amd_with_CU; /* NOTE(review): looks like a boolean platform flag -- confirm where it is set */
  struct utsname utsname; /* fields contain \0 when unknown */
  unsigned fallback_nbprocessors;
  unsigned pagesize;

  int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
  int mic_need_directlookup; /* presumably -1 if not tried yet (the value was missing from this note -- TODO confirm), 0 if not needed, 1 if needed */
  unsigned mic_directlookup_id_max; /* -1 if not tried yet (stored as UINT_MAX since the field is unsigned), 0 if none to lookup, maxid+1 otherwise */
};
70 
71 
72 
73 /***************************
74  * Misc Abstraction layers *
75  ***************************/
76 
#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE_SYSCALL)
/* libc doesn't have support for sched_setaffinity, make system call
 * ourselves.
 *
 * Each __NR_sched_{set,get}affinity number below is only defined when the
 * headers did not already provide it. On architectures without a known
 * number, the corresponding function is replaced by a macro that always
 * fails with ENOSYS (and a compile-time warning is emitted).
 */
#    include <linux/unistd.h>
#    ifndef __NR_sched_setaffinity
#       ifdef __i386__
#         define __NR_sched_setaffinity 241
#       elif defined(__x86_64__)
#         define __NR_sched_setaffinity 203
#       elif defined(__ia64__)
#         define __NR_sched_setaffinity 1231
#       elif defined(__hppa__)
#         define __NR_sched_setaffinity 211
#       elif defined(__alpha__)
#         define __NR_sched_setaffinity 395
#       elif defined(__s390__)
#         define __NR_sched_setaffinity 239
#       elif defined(__sparc__)
#         define __NR_sched_setaffinity 261
#       elif defined(__m68k__)
#         define __NR_sched_setaffinity 311
#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#         define __NR_sched_setaffinity 222
#       elif defined(__arm__)
#         define __NR_sched_setaffinity 241
#       elif defined(__cris__)
#         define __NR_sched_setaffinity 241
/*#       elif defined(__mips__)
  #         define __NR_sched_setaffinity TODO (32/64/nabi) */
#       else
#         warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
#         define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
#       endif
#    endif
/* Only wrap the raw syscall if the ENOSYS stub above was not installed. */
#    ifndef sched_setaffinity
#      define sched_setaffinity(pid, lg, mask) syscall(__NR_sched_setaffinity, pid, lg, mask)
#    endif
#    ifndef __NR_sched_getaffinity
#       ifdef __i386__
#         define __NR_sched_getaffinity 242
#       elif defined(__x86_64__)
#         define __NR_sched_getaffinity 204
#       elif defined(__ia64__)
#         define __NR_sched_getaffinity 1232
#       elif defined(__hppa__)
#         define __NR_sched_getaffinity 212
#       elif defined(__alpha__)
#         define __NR_sched_getaffinity 396
#       elif defined(__s390__)
#         define __NR_sched_getaffinity 240
#       elif defined(__sparc__)
#         define __NR_sched_getaffinity 260
#       elif defined(__m68k__)
#         define __NR_sched_getaffinity 312
#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#         define __NR_sched_getaffinity 223
#       elif defined(__arm__)
#         define __NR_sched_getaffinity 242
#       elif defined(__cris__)
#         define __NR_sched_getaffinity 242
/*#       elif defined(__mips__)
  #         define __NR_sched_getaffinity TODO (32/64/nabi) */
#       else
#         warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
#         define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
#       endif
#    endif
/* Unlike sched_setaffinity above, the raw syscall result is normalized to
 * 0 or -1 here; presumably because the raw syscall may return a positive
 * value on success -- TODO confirm against sched_getaffinity(2).
 */
#    ifndef sched_getaffinity
#      define sched_getaffinity(pid, lg, mask) (syscall(__NR_sched_getaffinity, pid, lg, mask) < 0 ? -1 : 0)
#    endif
#endif
148 
149 /* Added for ntohl() */
150 #include <arpa/inet.h>
151 
152 #ifdef HAVE_OPENAT
153 /* Use our own filesystem functions if we have openat */
154 
/* Validate the root directory descriptor and turn path into a path that
 * is relative to it by dropping every leading '/'.
 *
 * Returns the relative path (a pointer inside path), or NULL with errno
 * set to EBADF when fsroot_fd is negative.
 */
static const char *
hwloc_checkat(const char *path, int fsroot_fd)
{
  const char *cursor = path;

  if (fsroot_fd < 0) {
    errno = EBADF;
    return NULL;
  }

  /* Skip leading slashes.  */
  while (*cursor == '/')
    cursor++;

  return cursor;
}
169 
/* Open path read-only, relative to the root directory descriptor fsroot_fd.
 * Returns the new file descriptor, or -1 on error.
 */
static int
hwloc_openat(const char *path, int fsroot_fd)
{
  const char *rel = hwloc_checkat(path, fsroot_fd);

  return rel ? openat (fsroot_fd, rel, O_RDONLY) : -1;
}
181 
/* fopen()-like helper that opens path relative to the root directory
 * descriptor fsroot_fd.
 *
 * Only mode "r" is supported; any other mode fails with errno set to
 * ENOTSUP. Returns the stream, or NULL on error.
 */
static FILE *
hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
{
  int fd;
  FILE *file;

  if (strcmp(mode, "r")) {
    errno = ENOTSUP;
    return NULL;
  }

  fd = hwloc_openat (path, fsroot_fd);
  if (fd == -1)
    return NULL;

  file = fdopen(fd, mode);
  if (!file) {
    /* fdopen() leaves the descriptor open on failure, don't leak it.
     * Preserve the errno reported by fdopen() across close(). */
    int saved_errno = errno;
    close(fd);
    errno = saved_errno;
  }

  return file;
}
198 
/* access()-like check of path, relative to the root directory descriptor
 * fsroot_fd. Returns the faccessat() result, or -1 if fsroot_fd is invalid.
 */
static int
hwloc_accessat(const char *path, int mode, int fsroot_fd)
{
  const char *rel = hwloc_checkat(path, fsroot_fd);

  if (rel == NULL)
    return -1;
  return faccessat(fsroot_fd, rel, mode, 0);
}
210 
211 static int
hwloc_fstatat(const char * path,struct stat * st,int flags,int fsroot_fd)212 hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
213 {
214   const char *relative_path;
215 
216   relative_path = hwloc_checkat(path, fsroot_fd);
217   if (!relative_path)
218     return -1;
219 
220   return fstatat(fsroot_fd, relative_path, st, flags);
221 }
222 
223 static DIR*
hwloc_opendirat(const char * path,int fsroot_fd)224 hwloc_opendirat(const char *path, int fsroot_fd)
225 {
226   int dir_fd;
227   const char *relative_path;
228 
229   relative_path = hwloc_checkat(path, fsroot_fd);
230   if (!relative_path)
231     return NULL;
232 
233   dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
234   if (dir_fd < 0)
235     return NULL;
236 
237   return fdopendir(dir_fd);
238 }
239 
240 #endif /* HAVE_OPENAT */
241 
242 /* Static inline version of fopen so that we can use openat if we have
243    it, but still preserve compiler parameter checking */
244 static __hwloc_inline int
hwloc_open(const char * p,int d __hwloc_attribute_unused)245 hwloc_open(const char *p, int d __hwloc_attribute_unused)
246 {
247 #ifdef HAVE_OPENAT
248     return hwloc_openat(p, d);
249 #else
250     return open(p, O_RDONLY);
251 #endif
252 }
253 
254 static __hwloc_inline FILE *
hwloc_fopen(const char * p,const char * m,int d __hwloc_attribute_unused)255 hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
256 {
257 #ifdef HAVE_OPENAT
258     return hwloc_fopenat(p, m, d);
259 #else
260     return fopen(p, m);
261 #endif
262 }
263 
264 /* Static inline version of access so that we can use openat if we have
265    it, but still preserve compiler parameter checking */
266 static __hwloc_inline int
hwloc_access(const char * p,int m,int d __hwloc_attribute_unused)267 hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
268 {
269 #ifdef HAVE_OPENAT
270     return hwloc_accessat(p, m, d);
271 #else
272     return access(p, m);
273 #endif
274 }
275 
276 static __hwloc_inline int
hwloc_stat(const char * p,struct stat * st,int d __hwloc_attribute_unused)277 hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
278 {
279 #ifdef HAVE_OPENAT
280     return hwloc_fstatat(p, st, 0, d);
281 #else
282     return stat(p, st);
283 #endif
284 }
285 
286 static __hwloc_inline int
hwloc_lstat(const char * p,struct stat * st,int d __hwloc_attribute_unused)287 hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
288 {
289 #ifdef HAVE_OPENAT
290     return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
291 #else
292     return lstat(p, st);
293 #endif
294 }
295 
296 /* Static inline version of opendir so that we can use openat if we have
297    it, but still preserve compiler parameter checking */
298 static __hwloc_inline DIR *
hwloc_opendir(const char * p,int d __hwloc_attribute_unused)299 hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
300 {
301 #ifdef HAVE_OPENAT
302     return hwloc_opendirat(p, d);
303 #else
304     return opendir(p);
305 #endif
306 }
307 
308 
309 /*****************************************
310  ******* Helpers for reading files *******
311  *****************************************/
312 
313 static __hwloc_inline int
hwloc_read_path_by_length(const char * path,char * string,size_t length,int fsroot_fd)314 hwloc_read_path_by_length(const char *path, char *string, size_t length, int fsroot_fd)
315 {
316   int fd, ret;
317 
318   fd = hwloc_open(path, fsroot_fd);
319   if (fd < 0)
320     return -1;
321 
322   ret = read(fd, string, length-1); /* read -1 to put the ending \0 */
323   close(fd);
324 
325   if (ret <= 0)
326     return -1;
327 
328   string[ret] = 0;
329 
330   return 0;
331 }
332 
333 static __hwloc_inline int
hwloc_read_path_as_int(const char * path,int * value,int fsroot_fd)334 hwloc_read_path_as_int(const char *path, int *value, int fsroot_fd)
335 {
336   char string[11];
337   if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
338     return -1;
339   *value = atoi(string);
340   return 0;
341 }
342 
343 static __hwloc_inline int
hwloc_read_path_as_uint(const char * path,unsigned * value,int fsroot_fd)344 hwloc_read_path_as_uint(const char *path, unsigned *value, int fsroot_fd)
345 {
346   char string[11];
347   if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
348     return -1;
349   *value = (unsigned) strtoul(string, NULL, 10);
350   return 0;
351 }
352 
353 /* Read everything from fd and save it into a newly allocated buffer
354  * returned in bufferp. Use sizep as a default buffer size, and returned
355  * the actually needed size in sizep.
356  */
357 static __hwloc_inline int
hwloc__read_fd(int fd,char ** bufferp,size_t * sizep)358 hwloc__read_fd(int fd, char **bufferp, size_t *sizep)
359 {
360   char *buffer;
361   size_t toread, filesize, totalread;
362   ssize_t ret;
363 
364   toread = filesize = *sizep;
365 
366   /* Alloc and read +1 so that we get EOF on 2^n without reading once more */
367   buffer = malloc(filesize+1);
368   if (!buffer)
369     return -1;
370 
371   ret = read(fd, buffer, toread+1);
372   if (ret < 0) {
373     free(buffer);
374     return -1;
375   }
376 
377   totalread = (size_t) ret;
378 
379   if (totalread < toread + 1)
380     /* Normal case, a single read got EOF */
381     goto done;
382 
383   /* Unexpected case, must extend the buffer and read again.
384    * Only occurs on first invocation and if the kernel ever uses multiple page for a single mask.
385    */
386   do {
387     char *tmp;
388 
389     toread = filesize;
390     filesize *= 2;
391 
392     tmp = realloc(buffer, filesize+1);
393     if (!tmp) {
394       free(buffer);
395       return -1;
396     }
397     buffer = tmp;
398 
399     ret = read(fd, buffer+toread+1, toread);
400     if (ret < 0) {
401       free(buffer);
402       return -1;
403     }
404 
405     totalread += ret;
406   } while ((size_t) ret == toread);
407 
408  done:
409   buffer[totalread] = '\0';
410   *bufferp = buffer;
411   *sizep = filesize;
412   return 0;
413 }
414 
415 /* kernel cpumaps are composed of an array of 32bits cpumasks */
416 #define KERNEL_CPU_MASK_BITS 32
417 #define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
418 
419 static __hwloc_inline int
hwloc__read_fd_as_cpumask(int fd,hwloc_bitmap_t set)420 hwloc__read_fd_as_cpumask(int fd, hwloc_bitmap_t set)
421 {
422   static size_t _filesize = 0; /* will be dynamically initialized to hwloc_get_pagesize(), and increased later if needed */
423   size_t filesize;
424   unsigned long *maps;
425   unsigned long map;
426   int nr_maps = 0;
427   static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
428                                       * Actually, it may increase multiple times if first read cpumaps start with zeroes.
429                                       */
430   int nr_maps_allocated = _nr_maps_allocated;
431   char *buffer, *tmpbuf;
432   int i;
433 
434   /* Kernel sysfs files are usually at most one page. 4kB may contain 455 32-bit
435    * masks (followed by comma), enough for 14k PUs. So allocate a page by default for now.
436    *
437    * If we ever need a larger buffer, we'll realloc() the buffer during the first
438    * invocation of this function so that others directly allocate the right size
439    * (all cpumask files have the exact same size).
440    */
441   filesize = _filesize;
442   if (!filesize)
443     filesize = hwloc_getpagesize();
444   if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
445     return -1;
446   /* Only update the static value with the final one,
447    * to avoid sharing intermediate values that we modify,
448    * in case there's ever multiple concurrent calls.
449    */
450   _filesize = filesize;
451 
452   maps = malloc(nr_maps_allocated * sizeof(*maps));
453   if (!maps) {
454     free(buffer);
455     return -1;
456   }
457 
458   /* reset to zero first */
459   hwloc_bitmap_zero(set);
460 
461   /* parse the whole mask */
462   tmpbuf = buffer;
463   while (sscanf(tmpbuf, "%lx", &map) == 1) {
464     /* read one kernel cpu mask and the ending comma */
465     if (nr_maps == nr_maps_allocated) {
466       unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
467       if (!tmp) {
468         free(buffer);
469         free(maps);
470         return -1;
471       }
472       maps = tmp;
473       nr_maps_allocated *= 2;
474     }
475 
476     tmpbuf = strchr(tmpbuf, ',');
477     if (!tmpbuf) {
478       maps[nr_maps++] = map;
479       break;
480     } else
481       tmpbuf++;
482 
483     if (!map && !nr_maps)
484       /* ignore the first map if it's empty */
485       continue;
486 
487     maps[nr_maps++] = map;
488   }
489 
490   free(buffer);
491 
492   /* convert into a set */
493 #if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
494   for(i=0; i<nr_maps; i++)
495     hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
496 #else
497   for(i=0; i<(nr_maps+1)/2; i++) {
498     unsigned long mask;
499     mask = maps[nr_maps-2*i-1];
500     if (2*i+1<nr_maps)
501       mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
502     hwloc_bitmap_set_ith_ulong(set, i, mask);
503   }
504 #endif
505 
506   free(maps);
507 
508   /* Only update the static value with the final one,
509    * to avoid sharing intermediate values that we modify,
510    * in case there's ever multiple concurrent calls.
511    */
512   if (nr_maps_allocated > _nr_maps_allocated)
513     _nr_maps_allocated = nr_maps_allocated;
514   return 0;
515 }
516 
517 static __hwloc_inline int
hwloc__read_path_as_cpumask(const char * maskpath,hwloc_bitmap_t set,int fsroot_fd)518 hwloc__read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
519 {
520   int fd, err;
521   fd = hwloc_open(maskpath, fsroot_fd);
522   if (fd < 0)
523     return -1;
524   err = hwloc__read_fd_as_cpumask(fd, set);
525   close(fd);
526   return err;
527 }
528 
529 static __hwloc_inline hwloc_bitmap_t
hwloc__alloc_read_path_as_cpumask(const char * maskpath,int fsroot_fd)530 hwloc__alloc_read_path_as_cpumask(const char *maskpath, int fsroot_fd)
531 {
532   hwloc_bitmap_t set;
533   int err;
534   set = hwloc_bitmap_alloc();
535   if (!set)
536     return NULL;
537   err = hwloc__read_path_as_cpumask(maskpath, set, fsroot_fd);
538   if (err < 0) {
539     hwloc_bitmap_free(set);
540     return NULL;
541   } else
542     return set;
543 }
544 
545 /* set must be full on input */
546 static __hwloc_inline int
hwloc__read_fd_as_cpulist(int fd,hwloc_bitmap_t set)547 hwloc__read_fd_as_cpulist(int fd, hwloc_bitmap_t set)
548 {
549   /* Kernel sysfs files are usually at most one page.
550    * But cpulists can be of very different sizes depending on the fragmentation,
551    * so don't bother remember the actual read size between invocations.
552    * We don't have many invocations anyway.
553    */
554   size_t filesize = hwloc_getpagesize();
555   char *buffer, *current, *comma, *tmp;
556   int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
557 
558   if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
559     return -1;
560 
561   current = buffer;
562   prevlast = -1;
563 
564   while (1) {
565     /* save a pointer to the next comma and erase it to simplify things */
566     comma = strchr(current, ',');
567     if (comma)
568       *comma = '\0';
569 
570     /* find current enabled-segment bounds */
571     nextfirst = strtoul(current, &tmp, 0);
572     if (*tmp == '-')
573       nextlast = strtoul(tmp+1, NULL, 0);
574     else
575       nextlast = nextfirst;
576     if (prevlast+1 <= nextfirst-1)
577       hwloc_bitmap_clr_range(set, prevlast+1, nextfirst-1);
578 
579     /* switch to next enabled-segment */
580     prevlast = nextlast;
581     if (!comma)
582       break;
583     current = comma+1;
584   }
585 
586   hwloc_bitmap_clr_range(set, prevlast+1, -1);
587   free(buffer);
588   return 0;
589 }
590 
591 
592 /*****************************
593  ******* CpuBind Hooks *******
594  *****************************/
595 
596 int
hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,pid_t tid __hwloc_attribute_unused,hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)597 hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
598 {
599   /* TODO Kerrighed: Use
600    * int migrate (pid_t pid, int destination_node);
601    * int migrate_self (int destination_node);
602    * int thread_migrate (int thread_id, int destination_node);
603    */
604 
605   /* The resulting binding is always strict */
606 
607 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
608   cpu_set_t *plinux_set;
609   unsigned cpu;
610   int last;
611   size_t setsize;
612   int err;
613 
614   last = hwloc_bitmap_last(hwloc_set);
615   if (last == -1) {
616     errno = EINVAL;
617     return -1;
618   }
619 
620   setsize = CPU_ALLOC_SIZE(last+1);
621   plinux_set = CPU_ALLOC(last+1);
622 
623   CPU_ZERO_S(setsize, plinux_set);
624   hwloc_bitmap_foreach_begin(cpu, hwloc_set)
625     CPU_SET_S(cpu, setsize, plinux_set);
626   hwloc_bitmap_foreach_end();
627 
628   err = sched_setaffinity(tid, setsize, plinux_set);
629 
630   CPU_FREE(plinux_set);
631   return err;
632 #elif defined(HWLOC_HAVE_CPU_SET)
633   cpu_set_t linux_set;
634   unsigned cpu;
635 
636   CPU_ZERO(&linux_set);
637   hwloc_bitmap_foreach_begin(cpu, hwloc_set)
638     CPU_SET(cpu, &linux_set);
639   hwloc_bitmap_foreach_end();
640 
641 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
642   return sched_setaffinity(tid, &linux_set);
643 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
644   return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
645 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
646 #elif defined(HWLOC_HAVE_SYSCALL)
647   unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
648 
649 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
650   return sched_setaffinity(tid, (void*) &mask);
651 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
652   return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
653 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
654 #else /* !SYSCALL */
655   errno = ENOSYS;
656   return -1;
657 #endif /* !SYSCALL */
658 }
659 
#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
/*
 * On some kernels, sched_getaffinity requires the output size to be larger
 * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
 * Try sched_getaffinity on ourself until we find a nr_cpus value that makes
 * the kernel happy.
 *
 * The value is cached in a static variable once sched_getaffinity accepted
 * it. If probing has to be abandoned (out of memory, or nr_cpus cannot be
 * doubled anymore), the current estimate is returned without being cached.
 * The returned value is always positive.
 */
static int
hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
{
  static int _nr_cpus = -1;
  int nr_cpus = _nr_cpus;
  int fd;

  if (nr_cpus != -1)
    /* already computed */
    return nr_cpus;

  if (topology->levels[0][0]->complete_cpuset)
    /* start with a nr_cpus that may contain the whole topology */
    nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
  if (nr_cpus <= 0)
    /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
    nr_cpus = 1;

  fd = open("/sys/devices/system/cpu/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
  if (fd >= 0) {
    hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc_full();
    /* the possible file is only a hint, silently skip it if out of memory */
    if (possible_bitmap) {
      if (hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) {
        int max_possible = hwloc_bitmap_last(possible_bitmap);
        hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);

        if (nr_cpus < max_possible + 1)
          nr_cpus = max_possible + 1;
      }
      hwloc_bitmap_free(possible_bitmap);
    }
    close(fd);
  }

  while (1) {
    cpu_set_t *set = CPU_ALLOC(nr_cpus);
    size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
    int err;

    if (!set)
      /* Out of memory. Passing NULL to the kernel would fail forever and
       * keep doubling nr_cpus, so give up with the current estimate. */
      return nr_cpus;

    err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
    CPU_FREE(set);
    nr_cpus = setsize * 8; /* that's the value that was actually tested */
    if (!err)
      /* Found it. Only update the static value with the final one,
       * to avoid sharing intermediate values that we modify,
       * in case there's ever multiple concurrent calls.
       */
      return _nr_cpus = nr_cpus;

    if (nr_cpus > INT_MAX / 2)
      /* doubling would overflow, stop probing instead of looping forever */
      return nr_cpus;
    nr_cpus *= 2;
  }
}
#endif
715 
716 int
hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,pid_t tid __hwloc_attribute_unused,hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)717 hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
718 {
719   int err __hwloc_attribute_unused;
720   /* TODO Kerrighed */
721 
722 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
723   cpu_set_t *plinux_set;
724   unsigned cpu;
725   int last;
726   size_t setsize;
727   int kernel_nr_cpus;
728 
729   /* find the kernel nr_cpus so as to use a large enough cpu_set size */
730   kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
731   setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
732   plinux_set = CPU_ALLOC(kernel_nr_cpus);
733 
734   err = sched_getaffinity(tid, setsize, plinux_set);
735 
736   if (err < 0) {
737     CPU_FREE(plinux_set);
738     return -1;
739   }
740 
741   last = -1;
742   if (topology->levels[0][0]->complete_cpuset)
743     last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
744   if (last == -1)
745     /* round the maximal support number, the topology isn't ready yet (complete_cpuset is missing or empty)*/
746     last = kernel_nr_cpus-1;
747 
748   hwloc_bitmap_zero(hwloc_set);
749   for(cpu=0; cpu<=(unsigned) last; cpu++)
750     if (CPU_ISSET_S(cpu, setsize, plinux_set))
751       hwloc_bitmap_set(hwloc_set, cpu);
752 
753   CPU_FREE(plinux_set);
754 #elif defined(HWLOC_HAVE_CPU_SET)
755   cpu_set_t linux_set;
756   unsigned cpu;
757 
758 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
759   err = sched_getaffinity(tid, &linux_set);
760 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
761   err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
762 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
763   if (err < 0)
764     return -1;
765 
766   hwloc_bitmap_zero(hwloc_set);
767   for(cpu=0; cpu<CPU_SETSIZE; cpu++)
768     if (CPU_ISSET(cpu, &linux_set))
769       hwloc_bitmap_set(hwloc_set, cpu);
770 #elif defined(HWLOC_HAVE_SYSCALL)
771   unsigned long mask;
772 
773 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
774   err = sched_getaffinity(tid, (void*) &mask);
775 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
776   err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
777 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
778   if (err < 0)
779     return -1;
780 
781   hwloc_bitmap_from_ulong(hwloc_set, mask);
782 #else /* !SYSCALL */
783   errno = ENOSYS;
784   return -1;
785 #endif /* !SYSCALL */
786 
787   return 0;
788 }
789 
/* Get the array of tids of a process from the task directory in /proc.
 *
 * taskdir is rewound and fully read; every entry except "." and ".." is
 * converted with atoi() and appended to a newly allocated array returned
 * in *tidsp (to be freed by the caller), with its length in *nr_tidsp.
 * Returns 0 on success, -1 with errno set to ENOMEM on allocation failure.
 */
static int
hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
{
  struct stat st;
  struct dirent *entry;
  pid_t *list;
  unsigned capacity = 32;
  unsigned count = 0;

  /* take the number of links as a good estimate for the number of tids */
  if (fstat(dirfd(taskdir), &st) == 0)
    capacity = st.st_nlink;

  list = malloc(capacity*sizeof(pid_t));
  if (!list) {
    errno = ENOMEM;
    return -1;
  }

  rewinddir(taskdir);

  for (entry = readdir(taskdir); entry != NULL; entry = readdir(taskdir)) {
    /* make room for one more tid before looking at the entry */
    if (count == capacity) {
      pid_t *grown;
      capacity += 8;
      grown = realloc(list, capacity*sizeof(pid_t));
      if (!grown) {
        free(list);
        errno = ENOMEM;
        return -1;
      }
      list = grown;
    }
    if (strcmp(entry->d_name, ".") != 0 && strcmp(entry->d_name, "..") != 0)
      list[count++] = atoi(entry->d_name);
  }

  *nr_tidsp = count;
  *tidsp = list;
  return 0;
}
833 
/* Per-tid callbacks.
 * Invoked by hwloc_linux_foreach_proc_tid() once per thread, with the
 * thread's tid, the caller-provided data pointer, and idx set to the
 * index of the thread in the current list of tids (0 for the first one).
 * A negative return value is counted as a failure for that thread, and
 * errno is then recorded by the caller.
 */
typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
836 
837 static int
hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,pid_t pid,hwloc_linux_foreach_proc_tid_cb_t cb,void * data)838 hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
839                              pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
840                              void *data)
841 {
842   char taskdir_path[128];
843   DIR *taskdir;
844   pid_t *tids, *newtids;
845   unsigned i, nr, newnr, failed = 0, failed_errno = 0;
846   unsigned retrynr = 0;
847   int err;
848 
849   if (pid)
850     snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
851   else
852     snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
853 
854   taskdir = opendir(taskdir_path);
855   if (!taskdir) {
856     if (errno == ENOENT)
857       errno = EINVAL;
858     err = -1;
859     goto out;
860   }
861 
862   /* read the current list of threads */
863   err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
864   if (err < 0)
865     goto out_with_dir;
866 
867  retry:
868   /* apply the callback to all threads */
869   failed=0;
870   for(i=0; i<nr; i++) {
871     err = cb(topology, tids[i], data, i);
872     if (err < 0) {
873       failed++;
874       failed_errno = errno;
875     }
876   }
877 
878   /* re-read the list of thread */
879   err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
880   if (err < 0)
881     goto out_with_tids;
882   /* retry if the list changed in the meantime, or we failed for *some* threads only.
883    * if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
884    */
885   if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
886     free(tids);
887     tids = newtids;
888     nr = newnr;
889     if (++retrynr > 10) {
890       /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
891       errno = EAGAIN;
892       err = -1;
893       goto out_with_tids;
894     }
895     goto retry;
896   } else {
897     free(newtids);
898   }
899 
900   /* if all threads failed, return the last errno. */
901   if (failed) {
902     err = -1;
903     errno = failed_errno;
904     goto out_with_tids;
905   }
906 
907   err = 0;
908  out_with_tids:
909   free(tids);
910  out_with_dir:
911   closedir(taskdir);
912  out:
913   return err;
914 }
915 
916 /* Per-tid proc_set_cpubind callback and caller.
917  * Callback data is a hwloc_bitmap_t. */
918 static int
hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology,pid_t tid,void * data,int idx __hwloc_attribute_unused)919 hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
920 {
921   return hwloc_linux_set_tid_cpubind(topology, tid, (hwloc_bitmap_t) data);
922 }
923 
924 static int
hwloc_linux_set_pid_cpubind(hwloc_topology_t topology,pid_t pid,hwloc_const_bitmap_t hwloc_set,int flags __hwloc_attribute_unused)925 hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
926 {
927   return hwloc_linux_foreach_proc_tid(topology, pid,
928                                       hwloc_linux_foreach_proc_tid_set_cpubind_cb,
929                                       (void*) hwloc_set);
930 }
931 
/* Per-tid proc_get_cpubind callback data, callback function and caller */
struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
  hwloc_bitmap_t cpuset; /* binding accumulated over all threads, this is the final result */
  hwloc_bitmap_t tidset; /* temporary bitmap receiving the binding of the thread being processed */
  int flags; /* binding flags, the callback only looks at HWLOC_CPUBIND_STRICT */
};
938 
939 static int
hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology,pid_t tid,void * _data,int idx)940 hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
941 {
942   struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *data = _data;
943   hwloc_bitmap_t cpuset = data->cpuset;
944   hwloc_bitmap_t tidset = data->tidset;
945   int flags = data->flags;
946 
947   if (hwloc_linux_get_tid_cpubind(topology, tid, tidset))
948     return -1;
949 
950   /* reset the cpuset on first iteration */
951   if (!idx)
952     hwloc_bitmap_zero(cpuset);
953 
954   if (flags & HWLOC_CPUBIND_STRICT) {
955     /* if STRICT, we want all threads to have the same binding */
956     if (!idx) {
957       /* this is the first thread, copy its binding */
958       hwloc_bitmap_copy(cpuset, tidset);
959     } else if (!hwloc_bitmap_isequal(cpuset, tidset)) {
960       /* this is not the first thread, and it's binding is different */
961       errno = EXDEV;
962       return -1;
963     }
964   } else {
965     /* if not STRICT, just OR all thread bindings */
966     hwloc_bitmap_or(cpuset, cpuset, tidset);
967   }
968   return 0;
969 }
970 
971 static int
hwloc_linux_get_pid_cpubind(hwloc_topology_t topology,pid_t pid,hwloc_bitmap_t hwloc_set,int flags)972 hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
973 {
974   struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
975   hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
976   int ret;
977 
978   data.cpuset = hwloc_set;
979   data.tidset = tidset;
980   data.flags = flags;
981   ret = hwloc_linux_foreach_proc_tid(topology, pid,
982                                      hwloc_linux_foreach_proc_tid_get_cpubind_cb,
983                                      (void*) &data);
984   hwloc_bitmap_free(tidset);
985   return ret;
986 }
987 
988 static int
hwloc_linux_set_proc_cpubind(hwloc_topology_t topology,pid_t pid,hwloc_const_bitmap_t hwloc_set,int flags)989 hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
990 {
991   if (pid == 0)
992     pid = topology->pid;
993   if (flags & HWLOC_CPUBIND_THREAD)
994     return hwloc_linux_set_tid_cpubind(topology, pid, hwloc_set);
995   else
996     return hwloc_linux_set_pid_cpubind(topology, pid, hwloc_set, flags);
997 }
998 
999 static int
hwloc_linux_get_proc_cpubind(hwloc_topology_t topology,pid_t pid,hwloc_bitmap_t hwloc_set,int flags)1000 hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1001 {
1002   if (pid == 0)
1003     pid = topology->pid;
1004   if (flags & HWLOC_CPUBIND_THREAD)
1005     return hwloc_linux_get_tid_cpubind(topology, pid, hwloc_set);
1006   else
1007     return hwloc_linux_get_pid_cpubind(topology, pid, hwloc_set, flags);
1008 }
1009 
1010 static int
hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology,hwloc_const_bitmap_t hwloc_set,int flags)1011 hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
1012 {
1013   return hwloc_linux_set_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1014 }
1015 
1016 static int
hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology,hwloc_bitmap_t hwloc_set,int flags)1017 hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1018 {
1019   return hwloc_linux_get_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1020 }
1021 
1022 static int
hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology,hwloc_const_bitmap_t hwloc_set,int flags __hwloc_attribute_unused)1023 hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1024 {
1025   if (topology->pid) {
1026     errno = ENOSYS;
1027     return -1;
1028   }
1029   return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1030 }
1031 
1032 static int
hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology,hwloc_bitmap_t hwloc_set,int flags __hwloc_attribute_unused)1033 hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1034 {
1035   if (topology->pid) {
1036     errno = ENOSYS;
1037     return -1;
1038   }
1039   return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1040 }
1041 
1042 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
1043 #pragma weak pthread_setaffinity_np
1044 #pragma weak pthread_self
1045 
1046 static int
hwloc_linux_set_thread_cpubind(hwloc_topology_t topology,pthread_t tid,hwloc_const_bitmap_t hwloc_set,int flags __hwloc_attribute_unused)1047 hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1048 {
1049   int err;
1050 
1051   if (topology->pid) {
1052     errno = ENOSYS;
1053     return -1;
1054   }
1055 
1056   if (!pthread_self) {
1057     /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
1058     errno = ENOSYS;
1059     return -1;
1060   }
1061   if (tid == pthread_self())
1062     return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1063 
1064   if (!pthread_setaffinity_np) {
1065     errno = ENOSYS;
1066     return -1;
1067   }
1068   /* TODO Kerrighed: Use
1069    * int migrate (pid_t pid, int destination_node);
1070    * int migrate_self (int destination_node);
1071    * int thread_migrate (int thread_id, int destination_node);
1072    */
1073 
1074 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1075   /* Use a separate block so that we can define specific variable
1076      types here */
1077   {
1078      cpu_set_t *plinux_set;
1079      unsigned cpu;
1080      int last;
1081      size_t setsize;
1082 
1083      last = hwloc_bitmap_last(hwloc_set);
1084      if (last == -1) {
1085        errno = EINVAL;
1086        return -1;
1087      }
1088 
1089      setsize = CPU_ALLOC_SIZE(last+1);
1090      plinux_set = CPU_ALLOC(last+1);
1091 
1092      CPU_ZERO_S(setsize, plinux_set);
1093      hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1094          CPU_SET_S(cpu, setsize, plinux_set);
1095      hwloc_bitmap_foreach_end();
1096 
1097      err = pthread_setaffinity_np(tid, setsize, plinux_set);
1098 
1099      CPU_FREE(plinux_set);
1100   }
1101 #elif defined(HWLOC_HAVE_CPU_SET)
1102   /* Use a separate block so that we can define specific variable
1103      types here */
1104   {
1105      cpu_set_t linux_set;
1106      unsigned cpu;
1107 
1108      CPU_ZERO(&linux_set);
1109      hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1110          CPU_SET(cpu, &linux_set);
1111      hwloc_bitmap_foreach_end();
1112 
1113 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1114      err = pthread_setaffinity_np(tid, &linux_set);
1115 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1116      err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
1117 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1118   }
1119 #else /* CPU_SET */
1120   /* Use a separate block so that we can define specific variable
1121      types here */
1122   {
1123       unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
1124 
1125 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1126       err = pthread_setaffinity_np(tid, (void*) &mask);
1127 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1128       err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
1129 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1130   }
1131 #endif /* CPU_SET */
1132 
1133   if (err) {
1134     errno = err;
1135     return -1;
1136   }
1137   return 0;
1138 }
1139 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
1140 
1141 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
1142 #pragma weak pthread_getaffinity_np
1143 #pragma weak pthread_self
1144 
1145 static int
hwloc_linux_get_thread_cpubind(hwloc_topology_t topology,pthread_t tid,hwloc_bitmap_t hwloc_set,int flags __hwloc_attribute_unused)1146 hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1147 {
1148   int err;
1149 
1150   if (topology->pid) {
1151     errno = ENOSYS;
1152     return -1;
1153   }
1154 
1155   if (!pthread_self) {
1156     /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
1157     errno = ENOSYS;
1158     return -1;
1159   }
1160   if (tid == pthread_self())
1161     return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1162 
1163   if (!pthread_getaffinity_np) {
1164     errno = ENOSYS;
1165     return -1;
1166   }
1167   /* TODO Kerrighed */
1168 
1169 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1170   /* Use a separate block so that we can define specific variable
1171      types here */
1172   {
1173      cpu_set_t *plinux_set;
1174      unsigned cpu;
1175      int last;
1176      size_t setsize;
1177 
1178      last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
1179      assert (last != -1);
1180 
1181      setsize = CPU_ALLOC_SIZE(last+1);
1182      plinux_set = CPU_ALLOC(last+1);
1183 
1184      err = pthread_getaffinity_np(tid, setsize, plinux_set);
1185      if (err) {
1186         CPU_FREE(plinux_set);
1187         errno = err;
1188         return -1;
1189      }
1190 
1191      hwloc_bitmap_zero(hwloc_set);
1192      for(cpu=0; cpu<=(unsigned) last; cpu++)
1193        if (CPU_ISSET_S(cpu, setsize, plinux_set))
1194          hwloc_bitmap_set(hwloc_set, cpu);
1195 
1196      CPU_FREE(plinux_set);
1197   }
1198 #elif defined(HWLOC_HAVE_CPU_SET)
1199   /* Use a separate block so that we can define specific variable
1200      types here */
1201   {
1202      cpu_set_t linux_set;
1203      unsigned cpu;
1204 
1205 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1206      err = pthread_getaffinity_np(tid, &linux_set);
1207 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1208      err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
1209 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1210      if (err) {
1211         errno = err;
1212         return -1;
1213      }
1214 
1215      hwloc_bitmap_zero(hwloc_set);
1216      for(cpu=0; cpu<CPU_SETSIZE; cpu++)
1217        if (CPU_ISSET(cpu, &linux_set))
1218          hwloc_bitmap_set(hwloc_set, cpu);
1219   }
1220 #else /* CPU_SET */
1221   /* Use a separate block so that we can define specific variable
1222      types here */
1223   {
1224       unsigned long mask;
1225 
1226 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1227       err = pthread_getaffinity_np(tid, (void*) &mask);
1228 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1229       err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
1230 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1231       if (err) {
1232         errno = err;
1233         return -1;
1234       }
1235 
1236      hwloc_bitmap_from_ulong(hwloc_set, mask);
1237   }
1238 #endif /* CPU_SET */
1239 
1240   return 0;
1241 }
1242 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
1243 
1244 int
hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused,pid_t tid,hwloc_bitmap_t set)1245 hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
1246 {
1247   /* read /proc/pid/stat.
1248    * its second field contains the command name between parentheses,
1249    * and the command itself may contain parentheses,
1250    * so read the whole line and find the last closing parenthesis to find the third field.
1251    */
1252   char buf[1024] = "";
1253   char name[64];
1254   char *tmp;
1255   int fd, i, err;
1256 
1257   if (!tid) {
1258 #ifdef SYS_gettid
1259     tid = syscall(SYS_gettid);
1260 #else
1261     errno = ENOSYS;
1262     return -1;
1263 #endif
1264   }
1265 
1266   snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
1267   fd = open(name, O_RDONLY); /* no fsroot for real /proc */
1268   if (fd < 0) {
1269     errno = ENOSYS;
1270     return -1;
1271   }
1272   err = read(fd, buf, sizeof(buf)-1); /* read -1 to put the ending \0 */
1273   close(fd);
1274   if (err <= 0) {
1275     errno = ENOSYS;
1276     return -1;
1277   }
1278   buf[err-1] = '\0';
1279 
1280   tmp = strrchr(buf, ')');
1281   if (!tmp) {
1282     errno = ENOSYS;
1283     return -1;
1284   }
1285   /* skip ') ' to find the actual third argument */
1286   tmp += 2;
1287 
1288   /* skip 35 fields */
1289   for(i=0; i<36; i++) {
1290     tmp = strchr(tmp, ' ');
1291     if (!tmp) {
1292       errno = ENOSYS;
1293       return -1;
1294     }
1295     /* skip the ' ' itself */
1296     tmp++;
1297   }
1298 
1299   /* read the last cpu in the 38th field now */
1300   if (sscanf(tmp, "%d ", &i) != 1) {
1301     errno = ENOSYS;
1302     return -1;
1303   }
1304 
1305   hwloc_bitmap_only(set, i);
1306   return 0;
1307 }
1308 
/* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
  hwloc_bitmap_t cpuset; /* union of the last locations of all threads, built by the callback */
  hwloc_bitmap_t tidset; /* scratch bitmap receiving the last location of the current thread */
};
1314 
1315 static int
hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology,pid_t tid,void * _data,int idx)1316 hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
1317 {
1318   struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *data = _data;
1319   hwloc_bitmap_t cpuset = data->cpuset;
1320   hwloc_bitmap_t tidset = data->tidset;
1321 
1322   if (hwloc_linux_get_tid_last_cpu_location(topology, tid, tidset))
1323     return -1;
1324 
1325   /* reset the cpuset on first iteration */
1326   if (!idx)
1327     hwloc_bitmap_zero(cpuset);
1328 
1329   hwloc_bitmap_or(cpuset, cpuset, tidset);
1330   return 0;
1331 }
1332 
1333 static int
hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology,pid_t pid,hwloc_bitmap_t hwloc_set,int flags __hwloc_attribute_unused)1334 hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1335 {
1336   struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
1337   hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
1338   int ret;
1339 
1340   data.cpuset = hwloc_set;
1341   data.tidset = tidset;
1342   ret = hwloc_linux_foreach_proc_tid(topology, pid,
1343                                      hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
1344                                      &data);
1345   hwloc_bitmap_free(tidset);
1346   return ret;
1347 }
1348 
1349 static int
hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology,pid_t pid,hwloc_bitmap_t hwloc_set,int flags)1350 hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1351 {
1352   if (pid == 0)
1353     pid = topology->pid;
1354   if (flags & HWLOC_CPUBIND_THREAD)
1355     return hwloc_linux_get_tid_last_cpu_location(topology, pid, hwloc_set);
1356   else
1357     return hwloc_linux_get_pid_last_cpu_location(topology, pid, hwloc_set, flags);
1358 }
1359 
1360 static int
hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology,hwloc_bitmap_t hwloc_set,int flags)1361 hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1362 {
1363   return hwloc_linux_get_pid_last_cpu_location(topology, topology->pid, hwloc_set, flags);
1364 }
1365 
1366 static int
hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology,hwloc_bitmap_t hwloc_set,int flags __hwloc_attribute_unused)1367 hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1368 {
1369   if (topology->pid) {
1370     errno = ENOSYS;
1371     return -1;
1372   }
1373   return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
1374 }
1375 
1376 
1377 
1378 /***************************
1379  ****** Membind hooks ******
1380  ***************************/
1381 
1382 #if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
1383 static int
hwloc_linux_membind_policy_from_hwloc(int * linuxpolicy,hwloc_membind_policy_t policy,int flags)1384 hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
1385 {
1386   switch (policy) {
1387   case HWLOC_MEMBIND_DEFAULT:
1388   case HWLOC_MEMBIND_FIRSTTOUCH:
1389     *linuxpolicy = MPOL_DEFAULT;
1390     break;
1391   case HWLOC_MEMBIND_BIND:
1392     if (flags & HWLOC_MEMBIND_STRICT)
1393       *linuxpolicy = MPOL_BIND;
1394     else
1395       *linuxpolicy = MPOL_PREFERRED;
1396     break;
1397   case HWLOC_MEMBIND_INTERLEAVE:
1398     *linuxpolicy = MPOL_INTERLEAVE;
1399     break;
1400   /* TODO: next-touch when (if?) patch applied upstream */
1401   default:
1402     errno = ENOSYS;
1403     return -1;
1404   }
1405   return 0;
1406 }
1407 
1408 static int
hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,hwloc_const_nodeset_t nodeset,unsigned * max_os_index_p,unsigned long ** linuxmaskp)1409 hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1410                                       hwloc_const_nodeset_t nodeset,
1411                                       unsigned *max_os_index_p, unsigned long **linuxmaskp)
1412 {
1413   unsigned max_os_index = 0; /* highest os_index + 1 */
1414   unsigned long *linuxmask;
1415   unsigned i;
1416   hwloc_nodeset_t linux_nodeset = NULL;
1417 
1418   if (hwloc_bitmap_isfull(nodeset)) {
1419     linux_nodeset = hwloc_bitmap_alloc();
1420     hwloc_bitmap_only(linux_nodeset, 0);
1421     nodeset = linux_nodeset;
1422   }
1423 
1424   max_os_index = hwloc_bitmap_last(nodeset);
1425   if (max_os_index == (unsigned) -1)
1426     max_os_index = 0;
1427   /* add 1 to convert the last os_index into a max_os_index,
1428    * and round up to the nearest multiple of BITS_PER_LONG */
1429   max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
1430 
1431   linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
1432   if (!linuxmask) {
1433     hwloc_bitmap_free(linux_nodeset);
1434     errno = ENOMEM;
1435     return -1;
1436   }
1437 
1438   for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1439     linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);
1440 
1441   if (linux_nodeset)
1442     hwloc_bitmap_free(linux_nodeset);
1443 
1444   *max_os_index_p = max_os_index;
1445   *linuxmaskp = linuxmask;
1446   return 0;
1447 }
1448 
1449 static void
hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,hwloc_nodeset_t nodeset,unsigned max_os_index,const unsigned long * linuxmask)1450 hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1451                                     hwloc_nodeset_t nodeset,
1452                                     unsigned max_os_index, const unsigned long *linuxmask)
1453 {
1454   unsigned i;
1455 
1456 #ifdef HWLOC_DEBUG
1457   /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
1458   assert(!(max_os_index%HWLOC_BITS_PER_LONG));
1459 #endif
1460 
1461   hwloc_bitmap_zero(nodeset);
1462   for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1463     hwloc_bitmap_set_ith_ulong(nodeset, i, linuxmask[i]);
1464 }
1465 #endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
1466 
1467 #ifdef HWLOC_HAVE_MBIND
1468 static int
hwloc_linux_set_area_membind(hwloc_topology_t topology,const void * addr,size_t len,hwloc_const_nodeset_t nodeset,hwloc_membind_policy_t policy,int flags)1469 hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1470 {
1471   unsigned max_os_index; /* highest os_index + 1 */
1472   unsigned long *linuxmask;
1473   size_t remainder;
1474   int linuxpolicy;
1475   unsigned linuxflags = 0;
1476   int err;
1477 
1478   remainder = (uintptr_t) addr & (hwloc_getpagesize()-1);
1479   addr = (char*) addr - remainder;
1480   len += remainder;
1481 
1482   err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1483   if (err < 0)
1484     return err;
1485 
1486   if (linuxpolicy == MPOL_DEFAULT)
1487     /* Some Linux kernels don't like being passed a set */
1488     return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
1489 
1490   err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1491   if (err < 0)
1492     goto out;
1493 
1494   if (flags & HWLOC_MEMBIND_MIGRATE) {
1495 #ifdef MPOL_MF_MOVE
1496     linuxflags = MPOL_MF_MOVE;
1497     if (flags & HWLOC_MEMBIND_STRICT)
1498       linuxflags |= MPOL_MF_STRICT;
1499 #else
1500     if (flags & HWLOC_MEMBIND_STRICT) {
1501       errno = ENOSYS;
1502       goto out_with_mask;
1503     }
1504 #endif
1505   }
1506 
1507   err = mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
1508   if (err < 0)
1509     goto out_with_mask;
1510 
1511   free(linuxmask);
1512   return 0;
1513 
1514  out_with_mask:
1515   free(linuxmask);
1516  out:
1517   return -1;
1518 }
1519 
1520 static void *
hwloc_linux_alloc_membind(hwloc_topology_t topology,size_t len,hwloc_const_nodeset_t nodeset,hwloc_membind_policy_t policy,int flags)1521 hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1522 {
1523   void *buffer;
1524   int err;
1525 
1526   buffer = hwloc_alloc_mmap(topology, len);
1527   if (!buffer)
1528     return NULL;
1529 
1530   err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
1531   if (err < 0 && policy & HWLOC_MEMBIND_STRICT) {
1532     munmap(buffer, len);
1533     return NULL;
1534   }
1535 
1536   return buffer;
1537 }
1538 #endif /* HWLOC_HAVE_MBIND */
1539 
1540 #ifdef HWLOC_HAVE_SET_MEMPOLICY
1541 static int
hwloc_linux_set_thisthread_membind(hwloc_topology_t topology,hwloc_const_nodeset_t nodeset,hwloc_membind_policy_t policy,int flags)1542 hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1543 {
1544   unsigned max_os_index; /* highest os_index + 1 */
1545   unsigned long *linuxmask;
1546   int linuxpolicy;
1547   int err;
1548 
1549   err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1550   if (err < 0)
1551     return err;
1552 
1553   if (linuxpolicy == MPOL_DEFAULT)
1554     /* Some Linux kernels don't like being passed a set */
1555     return set_mempolicy(linuxpolicy, NULL, 0);
1556 
1557   err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1558   if (err < 0)
1559     goto out;
1560 
1561   if (flags & HWLOC_MEMBIND_MIGRATE) {
1562 #ifdef HWLOC_HAVE_MIGRATE_PAGES
1563     unsigned long *fullmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1564     if (fullmask) {
1565       memset(fullmask, 0xf, max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1566       err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
1567       free(fullmask);
1568     } else
1569       err = -1;
1570     if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
1571       goto out_with_mask;
1572 #else
1573     errno = ENOSYS;
1574     goto out_with_mask;
1575 #endif
1576   }
1577 
1578   err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
1579   if (err < 0)
1580     goto out_with_mask;
1581 
1582   free(linuxmask);
1583   return 0;
1584 
1585  out_with_mask:
1586   free(linuxmask);
1587  out:
1588   return -1;
1589 }
1590 
1591 /*
1592  * On some kernels, get_mempolicy requires the output size to be larger
1593  * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
1594  * Try get_mempolicy on ourself until we find a max_os_index value that
1595  * makes the kernel happy.
1596  */
1597 static int
hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)1598 hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
1599 {
1600   static int _max_numnodes = -1, max_numnodes;
1601   int linuxpolicy;
1602 
1603   if (_max_numnodes != -1)
1604     /* already computed */
1605     return _max_numnodes;
1606 
1607   /* start with a single ulong, it's the minimal and it's enough for most machines */
1608   max_numnodes = HWLOC_BITS_PER_LONG;
1609   while (1) {
1610     unsigned long *mask = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(long));
1611     int err = get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
1612     free(mask);
1613     if (!err || errno != EINVAL)
1614       /* Found it. Only update the static value with the final one,
1615        * to avoid sharing intermediate values that we modify,
1616        * in case there's ever multiple concurrent calls.
1617        */
1618       return _max_numnodes = max_numnodes;
1619     max_numnodes *= 2;
1620   }
1621 }
1622 
1623 static int
hwloc_linux_membind_policy_to_hwloc(int linuxpolicy,hwloc_membind_policy_t * policy)1624 hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
1625 {
1626   switch (linuxpolicy) {
1627   case MPOL_DEFAULT:
1628     *policy = HWLOC_MEMBIND_FIRSTTOUCH;
1629     return 0;
1630   case MPOL_PREFERRED:
1631   case MPOL_BIND:
1632     *policy = HWLOC_MEMBIND_BIND;
1633     return 0;
1634   case MPOL_INTERLEAVE:
1635     *policy = HWLOC_MEMBIND_INTERLEAVE;
1636     return 0;
1637   default:
1638     errno = EINVAL;
1639     return -1;
1640   }
1641 }
1642 
1643 static int
hwloc_linux_get_thisthread_membind(hwloc_topology_t topology,hwloc_nodeset_t nodeset,hwloc_membind_policy_t * policy,int flags __hwloc_attribute_unused)1644 hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1645 {
1646   unsigned max_os_index;
1647   unsigned long *linuxmask;
1648   int linuxpolicy;
1649   int err;
1650 
1651   max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1652 
1653   linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1654   if (!linuxmask) {
1655     errno = ENOMEM;
1656     goto out;
1657   }
1658 
1659   err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
1660   if (err < 0)
1661     goto out_with_mask;
1662 
1663   if (linuxpolicy == MPOL_DEFAULT) {
1664     hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1665   } else {
1666     hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
1667   }
1668 
1669   err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1670   if (err < 0)
1671     goto out_with_mask;
1672 
1673   free(linuxmask);
1674   return 0;
1675 
1676  out_with_mask:
1677   free(linuxmask);
1678  out:
1679   return -1;
1680 }
1681 
1682 static int
hwloc_linux_get_area_membind(hwloc_topology_t topology,const void * addr,size_t len,hwloc_nodeset_t nodeset,hwloc_membind_policy_t * policy,int flags __hwloc_attribute_unused)1683 hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1684 {
1685   unsigned max_os_index;
1686   unsigned long *linuxmask, *globallinuxmask;
1687   int linuxpolicy, globallinuxpolicy = 0;
1688   int mixed = 0;
1689   int full = 0;
1690   int first = 1;
1691   int pagesize = hwloc_getpagesize();
1692   char *tmpaddr;
1693   int err;
1694   unsigned i;
1695 
1696   max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1697 
1698   linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1699   if (!linuxmask) {
1700     errno = ENOMEM;
1701     goto out;
1702   }
1703   globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
1704   if (!globallinuxmask) {
1705     errno = ENOMEM;
1706     goto out_with_masks;
1707   }
1708 
1709   for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
1710       tmpaddr < (char *)addr + len;
1711       tmpaddr += pagesize) {
1712     err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
1713     if (err < 0)
1714       goto out_with_masks;
1715 
1716     /* use the first found policy. if we find a different one later, set mixed to 1 */
1717     if (first)
1718       globallinuxpolicy = linuxpolicy;
1719     else if (globallinuxpolicy != linuxpolicy)
1720       mixed = 1;
1721 
1722     /* agregate masks, and set full to 1 if we ever find DEFAULT */
1723     if (full || linuxpolicy == MPOL_DEFAULT) {
1724       full = 1;
1725     } else {
1726       for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1727         globallinuxmask[i] |= linuxmask[i];
1728     }
1729 
1730     first = 0;
1731   }
1732 
1733   if (mixed) {
1734     *policy = HWLOC_MEMBIND_MIXED;
1735   } else {
1736     err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1737     if (err < 0)
1738       goto out_with_masks;
1739   }
1740 
1741   if (full) {
1742     hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1743   } else {
1744     hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
1745   }
1746 
1747   free(globallinuxmask);
1748   free(linuxmask);
1749   return 0;
1750 
1751  out_with_masks:
1752   free(globallinuxmask);
1753   free(linuxmask);
1754  out:
1755   return -1;
1756 }
1757 
1758 #endif /* HWLOC_HAVE_SET_MEMPOLICY */
1759 
1760 #ifdef HWLOC_HAVE_MOVE_PAGES
1761 static int
hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused,const void * addr,size_t len,hwloc_nodeset_t nodeset,int flags __hwloc_attribute_unused)1762 hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
1763 {
1764   unsigned offset;
1765   unsigned long count;
1766   void **pages;
1767   int *status;
1768   int pagesize = hwloc_getpagesize();
1769   int ret;
1770   unsigned i;
1771 
1772   offset = ((unsigned long) addr) & (pagesize-1);
1773   addr = ((char*) addr) - offset;
1774   len += offset;
1775   count = (len + pagesize-1)/pagesize;
1776   pages = malloc(count*sizeof(*pages));
1777   status = malloc(count*sizeof(*status));
1778   if (!pages || !status) {
1779     ret = -1;
1780     goto out_with_pages;
1781   }
1782 
1783   for(i=0; i<count; i++)
1784     pages[i] = ((char*)addr) + i*pagesize;
1785 
1786   ret = move_pages(0, count, pages, NULL, status, 0);
1787   if (ret  < 0)
1788     goto out_with_pages;
1789 
1790   hwloc_bitmap_zero(nodeset);
1791   for(i=0; i<count; i++)
1792     if (status[i] >= 0)
1793       hwloc_bitmap_set(nodeset, status[i]);
1794   ret = 0;
1795 
1796  out_with_pages:
1797   free(pages);
1798   free(status);
1799   return ret;
1800 }
1801 #endif /* HWLOC_HAVE_MOVE_PAGES */
1802 
1803 static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep);
1804 
/*
 * Binding hook that refreshes the allowed resources of a topology from the
 * Linux cgroup/cpuset configuration.
 *
 * The filesystem root is taken from the HWLOC_FSROOT environment variable and
 * defaults to "/". When openat() is available the root is opened and its
 * descriptor is passed to hwloc_linux__get_allowed_resources(); otherwise only
 * the real root "/" is supported.
 *
 * If a cgroup/cpuset name is reported, it is attached to the root object as
 * the "LinuxCgroup" info.
 *
 * Returns 0 on success. Returns -1 if the filesystem root cannot be opened,
 * or if a non-"/" root is requested without openat() support (errno is then
 * set to ENOSYS).
 */
static int hwloc_linux_get_allowed_resources_hook(hwloc_topology_t topology)
{
  const char *fsroot_path;
  char *cpuset_name = NULL; /* stays NULL if the helper does not report a name */
  int root_fd = -1;
  int ret = -1;

  fsroot_path = getenv("HWLOC_FSROOT");
  if (!fsroot_path)
    fsroot_path = "/";

#ifdef HAVE_OPENAT
  root_fd = open(fsroot_path, O_RDONLY | O_DIRECTORY);
  if (root_fd < 0)
    goto out;
#else
  if (strcmp(fsroot_path, "/")) {
    errno = ENOSYS;
    goto out;
  }
#endif

  /* we could also error-out if the current topology doesn't actually match the system,
   * at least for PUs and NUMA nodes. But it would increase the overhead of loading XMLs.
   *
   * Just trust the user when they set THISSYSTEM=1. It enables hacky
   * tests such as restricting random XML or synthetic to the current
   * machine (uses the default cgroup).
   */

  hwloc_linux__get_allowed_resources(topology, fsroot_path, root_fd, &cpuset_name);
  if (cpuset_name) {
    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
    free(cpuset_name);
  }
  if (root_fd != -1)
    close(root_fd);

  /* the original code fell through to "return -1" even on this success path */
  ret = 0;

 out:
  return ret;
}
1845 
1846 void
hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks * hooks,struct hwloc_topology_support * support __hwloc_attribute_unused)1847 hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
1848                         struct hwloc_topology_support *support __hwloc_attribute_unused)
1849 {
1850   hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
1851   hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
1852   hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
1853   hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
1854   hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
1855   hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
1856 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
1857   hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
1858 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
1859 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
1860   hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
1861 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
1862   hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
1863   hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
1864   hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
1865 #ifdef HWLOC_HAVE_SET_MEMPOLICY
1866   hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
1867   hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
1868   hooks->get_area_membind = hwloc_linux_get_area_membind;
1869 #endif /* HWLOC_HAVE_SET_MEMPOLICY */
1870 #ifdef HWLOC_HAVE_MBIND
1871   hooks->set_area_membind = hwloc_linux_set_area_membind;
1872 #ifdef HWLOC_HAVE_MOVE_PAGES
1873   hooks->get_area_memlocation = hwloc_linux_get_area_memlocation;
1874 #endif /* HWLOC_HAVE_MOVE_PAGES */
1875   hooks->alloc_membind = hwloc_linux_alloc_membind;
1876   hooks->alloc = hwloc_alloc_mmap;
1877   hooks->free_membind = hwloc_free_mmap;
1878   support->membind->firsttouch_membind = 1;
1879   support->membind->bind_membind = 1;
1880   support->membind->interleave_membind = 1;
1881 #endif /* HWLOC_HAVE_MBIND */
1882 #if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
1883   support->membind->migrate_membind = 1;
1884 #endif
1885   hooks->get_allowed_resources = hwloc_linux_get_allowed_resources_hook;
1886 }
1887 
1888 
1889 /*******************************************
1890  *** Misc Helpers for Topology Discovery ***
1891  *******************************************/
1892 
/* element type of the cpuinfo array */
struct hwloc_linux_cpuinfo_proc {
  /* filled by hwloc_linux_parse_cpuinfo */
  unsigned long Pproc;

  /* filled by hwloc_linux_parse_cpuinfo, -1 when unknown */
  long Pcore;
  long Ppkg;

  /* filled later, -1 when unknown */
  long Lcore;
  long Lpkg;

  /* custom info pairs, filled by hwloc_linux_parse_cpuinfo */
  struct hwloc_obj_info_s *infos;
  unsigned infos_count;
};
1906 
1907 /* deprecated but still needed in hwloc/linux.h for backward compat */
1908 int
hwloc_linux_parse_cpumap_file(FILE * file,hwloc_bitmap_t set)1909 hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
1910 {
1911   unsigned long *maps;
1912   unsigned long map;
1913   int nr_maps = 0;
1914   static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
1915                                       * Actually, it may increase multiple times if first read cpumaps start with zeroes.
1916                                       */
1917   int nr_maps_allocated = _nr_maps_allocated;
1918   int i;
1919 
1920   maps = malloc(nr_maps_allocated * sizeof(*maps));
1921   if (!maps)
1922     return -1;
1923 
1924   /* reset to zero first */
1925   hwloc_bitmap_zero(set);
1926 
1927   /* parse the whole mask */
1928   while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
1929     {
1930       if (nr_maps == nr_maps_allocated) {
1931         unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
1932         if (!tmp) {
1933           free(maps);
1934           return -1;
1935         }
1936         maps = tmp;
1937         nr_maps_allocated *= 2;
1938       }
1939 
1940       if (!map && !nr_maps)
1941         /* ignore the first map if it's empty */
1942         continue;
1943 
1944       maps[nr_maps++] = map;
1945     }
1946 
1947   /* convert into a set */
1948 #if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
1949   for(i=0; i<nr_maps; i++)
1950     hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
1951 #else
1952   for(i=0; i<(nr_maps+1)/2; i++) {
1953     unsigned long mask;
1954     mask = maps[nr_maps-2*i-1];
1955     if (2*i+1<nr_maps)
1956       mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
1957     hwloc_bitmap_set_ith_ulong(set, i, mask);
1958   }
1959 #endif
1960 
1961   free(maps);
1962 
1963   /* Only update the static value with the final one,
1964    * to avoid sharing intermediate values that we modify,
1965    * in case there's ever multiple concurrent calls.
1966    */
1967   if (nr_maps_allocated > _nr_maps_allocated)
1968     _nr_maps_allocated = nr_maps_allocated;
1969   return 0;
1970 }
1971 
/*
 * Look for the mount point of the cpuset filesystem, or of a cgroup
 * filesystem with the "cpuset" option, in <root_path>/proc/mounts
 * (or /proc/mounts when root_path is NULL).
 *
 * On return, at most one of *cgroup_mntpnt and *cpuset_mntpnt is set to a
 * strdup()'ed directory that the caller must free; both are NULL if nothing
 * was found or on error. A cgroup mounted with the "noprefix" option is
 * reported through *cpuset_mntpnt. The scan stops at the first match.
 */
static void
hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, const char *root_path)
{
  char *mount_path;
  struct mntent mntent;
  FILE *fd;
  int err;
  size_t bufsize;
  char *buf;

  *cgroup_mntpnt = NULL;
  *cpuset_mntpnt = NULL;

  if (root_path) {
    /* setmntent() doesn't support openat(), so use the root_path directly */
    err = asprintf(&mount_path, "%s/proc/mounts", root_path);
    if (err < 0)
      return;
    fd = setmntent(mount_path, "r");
    free(mount_path);
  } else {
    fd = setmntent("/proc/mounts", "r");
  }
  if (!fd)
    return;

  /* getmntent_r() doesn't actually report an error when the buffer
   * is too small. It just silently truncates things. So we can't
   * dynamically resize things.
   *
   * Linux limits mount type, string, and options to one page each.
   * getmntent() limits the line size to 4kB.
   * so use 4*pagesize to be far above both.
   */
  bufsize = hwloc_getpagesize()*4;
  buf = malloc(bufsize);
  if (!buf) {
    /* cannot parse without a buffer, report that nothing was found */
    endmntent(fd);
    return;
  }

  while (getmntent_r(fd, &mntent, buf, bufsize)) {
    if (!strcmp(mntent.mnt_type, "cpuset")) {
      hwloc_debug("Found cpuset mount point on %s\n", mntent.mnt_dir);
      *cpuset_mntpnt = strdup(mntent.mnt_dir);
      break;
    } else if (!strcmp(mntent.mnt_type, "cgroup")) {
      /* found a cgroup mntpnt */
      char *opt, *opts = mntent.mnt_opts;
      int cpuset_opt = 0;
      int noprefix_opt = 0;
      /* look at options (strsep() splits the option string in place) */
      while ((opt = strsep(&opts, ",")) != NULL) {
        if (!strcmp(opt, "cpuset"))
          cpuset_opt = 1;
        else if (!strcmp(opt, "noprefix"))
          noprefix_opt = 1;
      }
      if (!cpuset_opt)
        continue;
      if (noprefix_opt) {
        hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", mntent.mnt_dir);
        *cpuset_mntpnt = strdup(mntent.mnt_dir);
      } else {
        hwloc_debug("Found cgroup/cpuset mount point on %s\n", mntent.mnt_dir);
        *cgroup_mntpnt = strdup(mntent.mnt_dir);
      }
      break;
    }
  }

  free(buf);
  endmntent(fd);
}
2042 
2043 /*
2044  * Linux cpusets may be managed directly or through cgroup.
2045  * If cgroup is used, tasks get a /proc/pid/cgroup which may contain a
2046  * single line %d:cpuset:<name>. If cpuset are used they get /proc/pid/cpuset
2047  * containing <name>.
2048  */
2049 static char *
hwloc_read_linux_cpuset_name(int fsroot_fd,hwloc_pid_t pid)2050 hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
2051 {
2052 #define CPUSET_NAME_LEN 128
2053   char cpuset_name[CPUSET_NAME_LEN];
2054   FILE *file;
2055   int err;
2056   char *tmp;
2057 
2058   /* check whether a cgroup-cpuset is enabled */
2059   if (!pid)
2060     file = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
2061   else {
2062     char path[] = "/proc/XXXXXXXXXX/cgroup";
2063     snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
2064     file = hwloc_fopen(path, "r", fsroot_fd);
2065   }
2066   if (file) {
2067     /* find a cpuset line */
2068 #define CGROUP_LINE_LEN 256
2069     char line[CGROUP_LINE_LEN];
2070     while (fgets(line, sizeof(line), file)) {
2071       char *end, *colon = strchr(line, ':');
2072       if (!colon)
2073         continue;
2074       if (strncmp(colon, ":cpuset:", 8))
2075         continue;
2076 
2077       /* found a cgroup-cpuset line, return the name */
2078       fclose(file);
2079       end = strchr(colon, '\n');
2080       if (end)
2081         *end = '\0';
2082       hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
2083       return strdup(colon+8);
2084     }
2085     fclose(file);
2086   }
2087 
2088   /* check whether a cpuset is enabled */
2089   if (!pid)
2090     err = hwloc_read_path_by_length("/proc/self/cpuset", cpuset_name, sizeof(cpuset_name), fsroot_fd);
2091   else {
2092     char path[] = "/proc/XXXXXXXXXX/cpuset";
2093     snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
2094     err = hwloc_read_path_by_length(path, cpuset_name, sizeof(cpuset_name), fsroot_fd);
2095   }
2096   if (err < 0) {
2097     /* found nothing */
2098     hwloc_debug("%s", "No cgroup or cpuset found\n");
2099     return NULL;
2100   }
2101 
2102   /* found a cpuset, return the name */
2103   tmp = strchr(cpuset_name, '\n');
2104   if (tmp)
2105     *tmp = '\0';
2106   hwloc_debug("Found cpuset %s\n", cpuset_name);
2107   return strdup(cpuset_name);
2108 }
2109 
2110 /*
2111  * Then, the cpuset description is available from either the cgroup or
2112  * the cpuset filesystem (usually mounted in / or /dev) where there
2113  * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
2114  */
2115 static void
hwloc_admin_disable_set_from_cpuset(int root_fd,const char * cgroup_mntpnt,const char * cpuset_mntpnt,const char * cpuset_name,const char * attr_name,hwloc_bitmap_t admin_enabled_cpus_set)2116 hwloc_admin_disable_set_from_cpuset(int root_fd,
2117                                     const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
2118                                     const char *attr_name,
2119                                     hwloc_bitmap_t admin_enabled_cpus_set)
2120 {
2121 #define CPUSET_FILENAME_LEN 256
2122   char cpuset_filename[CPUSET_FILENAME_LEN];
2123   int fd;
2124   int err;
2125 
2126   if (cgroup_mntpnt) {
2127     /* try to read the cpuset from cgroup */
2128     snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
2129     hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
2130   } else if (cpuset_mntpnt) {
2131     /* try to read the cpuset directly */
2132     snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
2133     hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
2134   }
2135 
2136   fd = hwloc_open(cpuset_filename, root_fd);
2137   if (fd < 0) {
2138     /* found no cpuset description, ignore it */
2139     hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
2140     return;
2141   }
2142 
2143   err = hwloc__read_fd_as_cpulist(fd, admin_enabled_cpus_set);
2144   close(fd);
2145 
2146   if (err < 0)
2147     hwloc_bitmap_fill(admin_enabled_cpus_set);
2148   else
2149     hwloc_debug_bitmap("cpuset includes %s\n", admin_enabled_cpus_set);
2150 }
2151 
2152 static void
hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s * data,const char * path,uint64_t * local_memory,uint64_t * meminfo_hugepages_count,uint64_t * meminfo_hugepages_size,int onlytotal)2153 hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
2154                          const char *path,
2155                          uint64_t *local_memory,
2156                          uint64_t *meminfo_hugepages_count,
2157                          uint64_t *meminfo_hugepages_size,
2158                          int onlytotal)
2159 {
2160   char *tmp;
2161   char buffer[4096];
2162   unsigned long long number;
2163 
2164   if (hwloc_read_path_by_length(path, buffer, sizeof(buffer), data->root_fd) < 0)
2165     return;
2166 
2167   tmp = strstr(buffer, "MemTotal: "); /* MemTotal: %llu kB */
2168   if (tmp) {
2169     number = strtoull(tmp+10, NULL, 10);
2170     *local_memory = number << 10;
2171 
2172     if (onlytotal)
2173       return;
2174 
2175     tmp = strstr(tmp, "Hugepagesize: "); /* Hugepagesize: %llu */
2176     if (tmp) {
2177       number = strtoull(tmp+14, NULL, 10);
2178       *meminfo_hugepages_size = number << 10;
2179 
2180       tmp = strstr(tmp, "HugePages_Free: "); /* HugePages_Free: %llu */
2181       if (tmp) {
2182         number = strtoull(tmp+16, NULL, 10);
2183         *meminfo_hugepages_count = number;
2184       }
2185     }
2186   }
2187 }
2188 
2189 #define SYSFS_NUMA_NODE_PATH_LEN 128
2190 
2191 static void
hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s * data,const char * dirpath,struct hwloc_obj_memory_s * memory,uint64_t * remaining_local_memory)2192 hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
2193                            const char *dirpath,
2194                            struct hwloc_obj_memory_s *memory,
2195                            uint64_t *remaining_local_memory)
2196 {
2197   DIR *dir;
2198   struct dirent *dirent;
2199   unsigned long index_ = 1;
2200   char line[64];
2201   char path[SYSFS_NUMA_NODE_PATH_LEN];
2202 
2203   dir = hwloc_opendir(dirpath, data->root_fd);
2204   if (dir) {
2205     while ((dirent = readdir(dir)) != NULL) {
2206       if (strncmp(dirent->d_name, "hugepages-", 10))
2207         continue;
2208       memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
2209       sprintf(path, "%s/%s/nr_hugepages", dirpath, dirent->d_name);
2210       if (!hwloc_read_path_by_length(path, line, sizeof(line), data->root_fd)) {
2211         /* these are the actual total amount of huge pages */
2212         memory->page_types[index_].count = strtoull(line, NULL, 0);
2213         *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
2214         index_++;
2215       }
2216     }
2217     closedir(dir);
2218     memory->page_types_len = index_;
2219   }
2220 }
2221 
2222 static void
hwloc_get_kerrighed_node_meminfo_info(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data,unsigned long node,struct hwloc_obj_memory_s * memory)2223 hwloc_get_kerrighed_node_meminfo_info(struct hwloc_topology *topology,
2224                                       struct hwloc_linux_backend_data_s *data,
2225                                       unsigned long node, struct hwloc_obj_memory_s *memory)
2226 {
2227   char path[128];
2228   uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
2229 
2230   if (topology->is_thissystem) {
2231     memory->page_types_len = 2;
2232     memory->page_types = malloc(2*sizeof(*memory->page_types));
2233     memset(memory->page_types, 0, 2*sizeof(*memory->page_types));
2234     /* Try to get the hugepage size from sysconf in case we fail to get it from /proc/meminfo later */
2235 #ifdef HAVE__SC_LARGE_PAGESIZE
2236     memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
2237 #endif
2238     memory->page_types[0].size = data->pagesize;
2239   }
2240 
2241   snprintf(path, sizeof(path), "/proc/nodes/node%lu/meminfo", node);
2242   hwloc_parse_meminfo_info(data, path,
2243                            &memory->local_memory,
2244                            &meminfo_hugepages_count, &meminfo_hugepages_size,
2245                            memory->page_types == NULL);
2246 
2247   if (memory->page_types) {
2248     uint64_t remaining_local_memory = memory->local_memory;
2249     if (meminfo_hugepages_size) {
2250       memory->page_types[1].size = meminfo_hugepages_size;
2251       memory->page_types[1].count = meminfo_hugepages_count;
2252       remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2253     } else {
2254       memory->page_types_len = 1;
2255     }
2256     memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2257   }
2258 }
2259 
2260 static void
hwloc_get_procfs_meminfo_info(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data,struct hwloc_obj_memory_s * memory)2261 hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
2262                               struct hwloc_linux_backend_data_s *data,
2263                               struct hwloc_obj_memory_s *memory)
2264 {
2265   uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
2266   struct stat st;
2267   int has_sysfs_hugepages = 0;
2268   const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
2269   int types = 2;
2270   int err;
2271 
2272   err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
2273   if (!err) {
2274     types = 1 + st.st_nlink-2;
2275     has_sysfs_hugepages = 1;
2276   }
2277 
2278   if (topology->is_thissystem || pagesize_env) {
2279     /* we cannot report any page_type info unless we have the page size.
2280      * we'll take it either from the system if local, or from the debug env variable
2281      */
2282     memory->page_types_len = types;
2283     memory->page_types = calloc(types, sizeof(*memory->page_types));
2284   }
2285 
2286   if (topology->is_thissystem) {
2287     /* Get the page and hugepage sizes from sysconf */
2288 #if HAVE_DECL__SC_LARGE_PAGESIZE
2289     memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
2290 #endif
2291     memory->page_types[0].size = data->pagesize; /* might be overwritten later by /proc/meminfo or sysfs */
2292   }
2293 
2294   hwloc_parse_meminfo_info(data, "/proc/meminfo",
2295                            &memory->local_memory,
2296                            &meminfo_hugepages_count, &meminfo_hugepages_size,
2297                            memory->page_types == NULL);
2298 
2299   if (memory->page_types) {
2300     uint64_t remaining_local_memory = memory->local_memory;
2301     if (has_sysfs_hugepages) {
2302       /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2303       hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
2304     } else {
2305       /* use what we found in meminfo */
2306       if (meminfo_hugepages_size) {
2307         memory->page_types[1].size = meminfo_hugepages_size;
2308         memory->page_types[1].count = meminfo_hugepages_count;
2309         remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2310       } else {
2311         memory->page_types_len = 1;
2312       }
2313     }
2314 
2315     if (pagesize_env) {
2316       /* We cannot get the pagesize if not thissystem, use the env-given one to experience the code during make check */
2317       memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
2318       /* If failed, use 4kB */
2319       if (!memory->page_types[0].size)
2320         memory->page_types[0].size = 4096;
2321     }
2322     assert(memory->page_types[0].size); /* from sysconf if local or from the env */
2323     /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
2324      * may be 0 if no hugepage support in the kernel */
2325 
2326     memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2327   }
2328 }
2329 
2330 static void
hwloc_sysfs_node_meminfo_info(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data,const char * syspath,int node,struct hwloc_obj_memory_s * memory)2331 hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
2332                               struct hwloc_linux_backend_data_s *data,
2333                               const char *syspath, int node,
2334                               struct hwloc_obj_memory_s *memory)
2335 {
2336   char path[SYSFS_NUMA_NODE_PATH_LEN];
2337   char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
2338   uint64_t meminfo_hugepages_count = 0;
2339   uint64_t meminfo_hugepages_size = 0;
2340   struct stat st;
2341   int has_sysfs_hugepages = 0;
2342   int types = 2;
2343   int err;
2344 
2345   sprintf(path, "%s/node%d/hugepages", syspath, node);
2346   err = hwloc_stat(path, &st, data->root_fd);
2347   if (!err) {
2348     types = 1 + st.st_nlink-2;
2349     has_sysfs_hugepages = 1;
2350   }
2351 
2352   if (topology->is_thissystem) {
2353     memory->page_types_len = types;
2354     memory->page_types = malloc(types*sizeof(*memory->page_types));
2355     memset(memory->page_types, 0, types*sizeof(*memory->page_types));
2356   }
2357 
2358   sprintf(meminfopath, "%s/node%d/meminfo", syspath, node);
2359   hwloc_parse_meminfo_info(data, meminfopath,
2360                            &memory->local_memory,
2361                            &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
2362                            memory->page_types == NULL);
2363 
2364   if (memory->page_types) {
2365     uint64_t remaining_local_memory = memory->local_memory;
2366     if (has_sysfs_hugepages) {
2367       /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2368       hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
2369     } else {
2370       /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
2371        * hwloc_get_procfs_meminfo_info must have been called earlier */
2372       meminfo_hugepages_size = topology->levels[0][0]->memory.page_types[1].size;
2373       /* use what we found in meminfo */
2374       if (meminfo_hugepages_size) {
2375         memory->page_types[1].count = meminfo_hugepages_count;
2376         memory->page_types[1].size = meminfo_hugepages_size;
2377         remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2378       } else {
2379         memory->page_types_len = 1;
2380       }
2381     }
2382     /* update what's remaining as normal pages */
2383     memory->page_types[0].size = data->pagesize;
2384     memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2385   }
2386 }
2387 
2388 static int
hwloc_parse_nodes_distances(const char * path,unsigned nbnodes,unsigned * indexes,float * distances,int fsroot_fd)2389 hwloc_parse_nodes_distances(const char *path, unsigned nbnodes, unsigned *indexes, float *distances, int fsroot_fd)
2390 {
2391   size_t len = (10+1)*nbnodes;
2392   float *curdist = distances;
2393   char *string;
2394   unsigned i;
2395 
2396   string = malloc(len); /* space-separated %d */
2397   if (!string)
2398     goto out;
2399 
2400   for(i=0; i<nbnodes; i++) {
2401     unsigned osnode = indexes[i];
2402     char distancepath[SYSFS_NUMA_NODE_PATH_LEN];
2403     char *tmp, *next;
2404     unsigned found;
2405 
2406     /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
2407      * store them in slots X*N...X*N+N-1 */
2408     sprintf(distancepath, "%s/node%u/distance", path, osnode);
2409     if (hwloc_read_path_by_length(distancepath, string, len, fsroot_fd) < 0)
2410       goto out_with_string;
2411 
2412     tmp = string;
2413     found = 0;
2414     while (tmp) {
2415       unsigned distance = strtoul(tmp, &next, 0); /* stored as a %d */
2416       if (next == tmp)
2417         break;
2418       *curdist = (float) distance;
2419       curdist++;
2420       found++;
2421       if (found == nbnodes)
2422         break;
2423       tmp = next+1;
2424     }
2425     if (found != nbnodes)
2426       goto out_with_string;
2427   }
2428 
2429   free(string);
2430   return 0;
2431 
2432  out_with_string:
2433   free(string);
2434  out:
2435   return -1;
2436 }
2437 
2438 static void
hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s * data,hwloc_obj_t obj,char * path,unsigned pathlen,const char * dmi_name,const char * hwloc_name)2439 hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
2440                            hwloc_obj_t obj,
2441                            char *path, unsigned pathlen,
2442                            const char *dmi_name, const char *hwloc_name)
2443 {
2444   char dmi_line[64];
2445 
2446   strcpy(path+pathlen, dmi_name);
2447   if (hwloc_read_path_by_length(path, dmi_line, sizeof(dmi_line), data->root_fd) < 0)
2448     return;
2449 
2450   if (dmi_line[0] != '\0') {
2451     char *tmp = strchr(dmi_line, '\n');
2452     if (tmp)
2453       *tmp = '\0';
2454     hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
2455     hwloc_obj_add_info(obj, hwloc_name, dmi_line);
2456   }
2457 }
2458 
2459 static void
hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s * data,hwloc_obj_t obj)2460 hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
2461 {
2462   char path[128];
2463   unsigned pathlen;
2464   DIR *dir;
2465 
2466   strcpy(path, "/sys/devices/virtual/dmi/id");
2467   dir = hwloc_opendir(path, data->root_fd);
2468   if (dir) {
2469     pathlen = 27;
2470   } else {
2471     strcpy(path, "/sys/class/dmi/id");
2472     dir = hwloc_opendir(path, data->root_fd);
2473     if (dir)
2474       pathlen = 17;
2475     else
2476       return;
2477   }
2478   closedir(dir);
2479 
2480   path[pathlen++] = '/';
2481 
2482   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_name", "DMIProductName");
2483   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_version", "DMIProductVersion");
2484   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_serial", "DMIProductSerial");
2485   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_uuid", "DMIProductUUID");
2486   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_vendor", "DMIBoardVendor");
2487   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_name", "DMIBoardName");
2488   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_version", "DMIBoardVersion");
2489   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_serial", "DMIBoardSerial");
2490   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_asset_tag", "DMIBoardAssetTag");
2491   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_vendor", "DMIChassisVendor");
2492   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_type", "DMIChassisType");
2493   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_version", "DMIChassisVersion");
2494   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_serial", "DMIChassisSerial");
2495   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_asset_tag", "DMIChassisAssetTag");
2496   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_vendor", "DMIBIOSVendor");
2497   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_version", "DMIBIOSVersion");
2498   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_date", "DMIBIOSDate");
2499   hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "sys_vendor", "DMISysVendor");
2500 }
2501 
/*
 * Header found at the beginning of a firmware DMI memory device entry.
 *
 * NOTE(review): the layout presumably follows the SMBIOS "Memory Device"
 * structure, and using only unsigned char fields presumably avoids any
 * padding and byte-order issue - TODO confirm against the SMBIOS
 * specification.
 *
 * hwloc__get_firmware_dmi_memory_info_one() uses "length" as the offset in
 * the raw file where the strings start, and compares the *_str_num fields
 * with the index (starting at 1) of each string that follows the header.
 */
struct hwloc_firmware_dmi_mem_device_header {
  unsigned char type;
  unsigned char length; /* used as the file offset of the first string */
  unsigned char handle[2];
  unsigned char phy_mem_handle[2];
  unsigned char mem_err_handle[2];
  unsigned char tot_width[2];
  unsigned char dat_width[2];
  unsigned char size[2];
  unsigned char ff;
  unsigned char dev_set;
  unsigned char dev_loc_str_num; /* index of the string exported as "DeviceLocation" */
  unsigned char bank_loc_str_num; /* index of the string exported as "BankLocation" */
  unsigned char mem_type;
  unsigned char type_detail[2];
  unsigned char speed[2];
  unsigned char manuf_str_num; /* index of the string exported as "Vendor" */
  unsigned char serial_str_num; /* index of the string exported as "SerialNumber" */
  unsigned char asset_tag_str_num; /* index of the string exported as "AssetTag" */
  unsigned char part_num_str_num; /* index of the string exported as "PartNumber" */
  /* don't include the following fields since we don't need them,
   * some old implementations may miss them.
   */
};
2526 
/*
 * Tell whether a DMI string is worth reporting.
 *
 * Returns 0 for an empty string and for a string only made of spaces
 * (at least Dell uses this for empty memory slots), 1 otherwise.
 */
static int check_dmi_entry(const char *buffer)
{
  const char *cur = buffer;

  /* skip the leading spaces, if any */
  while (*cur == ' ')
    cur++;

  /* reaching the end means the string was empty or only made of spaces */
  return *cur != '\0';
}
2537 
2538 static void
hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology * topology,unsigned idx,const char * path,FILE * fd,struct hwloc_firmware_dmi_mem_device_header * header)2539 hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
2540                                         unsigned idx, const char *path, FILE *fd,
2541                                         struct hwloc_firmware_dmi_mem_device_header *header)
2542 {
2543   unsigned slen;
2544   char buffer[256]; /* enough for memory device strings, or at least for each of them */
2545   unsigned foff; /* offset in raw file */
2546   unsigned boff; /* offset in buffer read from raw file */
2547   unsigned i;
2548   struct hwloc_obj_info_s *infos = NULL;
2549   unsigned infos_count = 0;
2550   hwloc_obj_t misc;
2551   int foundinfo = 0;
2552 
2553   hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");
2554 
2555   /* start after the header */
2556   foff = header->length;
2557   i = 1;
2558   while (1) {
2559     /* read one buffer */
2560     if (fseek(fd, foff, SEEK_SET) < 0)
2561       break;
2562     if (!fgets(buffer, sizeof(buffer), fd))
2563       break;
2564     /* read string at the beginning of the buffer */
2565     boff = 0;
2566     while (1) {
2567       /* stop on empty string */
2568       if (!buffer[boff])
2569         goto done;
2570       /* stop if this string goes to the end of the buffer */
2571       slen = strlen(buffer+boff);
2572       if (boff + slen+1 == sizeof(buffer))
2573         break;
2574       /* string didn't get truncated, should be OK */
2575       if (i == header->manuf_str_num) {
2576         if (check_dmi_entry(buffer+boff)) {
2577           hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
2578           foundinfo = 1;
2579         }
2580       } else if (i == header->serial_str_num) {
2581         if (check_dmi_entry(buffer+boff)) {
2582           hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
2583           foundinfo = 1;
2584         }
2585       } else if (i == header->asset_tag_str_num) {
2586         if (check_dmi_entry(buffer+boff)) {
2587           hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
2588           foundinfo = 1;
2589         }
2590       } else if (i == header->part_num_str_num) {
2591         if (check_dmi_entry(buffer+boff)) {
2592           hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
2593           foundinfo = 1;
2594         }
2595       } else if (i == header->dev_loc_str_num) {
2596         if (check_dmi_entry(buffer+boff)) {
2597           hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
2598           /* only a location, not an actual info about the device */
2599         }
2600       } else if (i == header->bank_loc_str_num) {
2601         if (check_dmi_entry(buffer+boff)) {
2602           hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
2603           /* only a location, not an actual info about the device */
2604         }
2605       } else {
2606         goto done;
2607       }
2608       /* next string in buffer */
2609       boff += slen+1;
2610       i++;
2611     }
2612     /* couldn't read a single full string from that buffer, we're screwed */
2613     if (!boff) {
2614       fprintf(stderr, "hwloc could read a DMI firmware entry #%u in %s\n",
2615               i, path);
2616       break;
2617     }
2618     /* reread buffer after previous string */
2619     foff += boff;
2620   }
2621 
2622 done:
2623   if (!foundinfo) {
2624     /* found no actual info about the device. if there's only location info, the slot may be empty */
2625     goto out_with_infos;
2626   }
2627 
2628   misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
2629   if (!misc)
2630     goto out_with_infos;
2631 
2632   hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
2633   /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
2634    * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
2635    * with the vendor, and it's hard to be 100% sure 'B' is second socket.
2636    * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
2637    * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
2638    */
2639   hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
2640   return;
2641 
2642  out_with_infos:
2643   hwloc__free_infos(infos, infos_count);
2644 }
2645 
2646 static void
hwloc__get_firmware_dmi_memory_info(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data)2647 hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
2648                                     struct hwloc_linux_backend_data_s *data)
2649 {
2650   char path[128];
2651   unsigned i;
2652 
2653   for(i=0; ; i++) {
2654     FILE *fd;
2655     struct hwloc_firmware_dmi_mem_device_header header;
2656     int err;
2657 
2658     snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
2659     fd = hwloc_fopen(path, "r", data->root_fd);
2660     if (!fd)
2661       break;
2662 
2663     err = fread(&header, sizeof(header), 1, fd);
2664     if (err != 1) {
2665       fclose(fd);
2666       break;
2667     }
2668     if (header.length < sizeof(header)) {
2669       /* invalid, or too old entry/spec that doesn't contain what we need */
2670       fclose(fd);
2671       break;
2672     }
2673 
2674     hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
2675 
2676     fclose(fd);
2677   }
2678 }
2679 
2680 
2681 /***********************************
2682  ****** Device tree Discovery ******
2683  ***********************************/
2684 
/* Reads the file "<p>/<p1>" (opened relative to root_fd) into a freshly
 * allocated buffer whose size is the st_size reported by fstat().
 *
 * Returns the buffer, to be released with free(), or NULL if the file
 * cannot be opened or stat'ed, if the allocation fails, or if read() fails.
 * When a buffer is returned and bytes_read is not NULL, *bytes_read receives
 * the number of bytes that read() returned. *bytes_read is left untouched on
 * failure, so callers should initialize it beforehand.
 */
static void *
hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
{
  char fname[256];
  struct stat fs;
  char *content = NULL;
  int fd;

  snprintf(fname, sizeof(fname), "%s/%s", p, p1);

  fd = hwloc_open(fname, root_fd);
  if (fd == -1)
    return NULL;

  if (fstat(fd, &fs) == 0) {
    content = malloc(fs.st_size);
    if (content) {
      ssize_t nread = read(fd, content, fs.st_size);
      if (nread == -1) {
        /* nothing useful was read, drop the buffer */
        free(content);
        content = NULL;
      } else if (bytes_read) {
        *bytes_read = nread;
      }
    }
  }

  close(fd);
  return content;
}
2722 
/* Reads the entire file "<p>/<p1>" and returns it as a 0-terminated string.
 * Returned pointer can be freed by using free().
 *
 * Returns NULL if the file cannot be read or if the buffer cannot be grown
 * to hold the terminator.
 * An empty file yields an empty string: hwloc_read_raw() may return a
 * non-NULL zero-sized buffer in that case, which must not be handed to
 * string functions without a terminator.
 */
static char *
hwloc_read_str(const char *p, const char *p1, int root_fd)
{
  size_t cb = 0;
  char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
  char *tmp;

  if (NULL == ret)
    return NULL;

  /* the file contents already end with a 0, nothing to add */
  if (0 < cb && 0 == ret[cb-1])
    return ret;

  /* Either the last byte is not 0, or nothing was read at all (cb == 0).
   * Make room for one more byte and terminate the string there. */
  tmp = realloc(ret, cb + 1);
  if (!tmp) {
    free(ret);
    return NULL;
  }
  tmp[cb] = 0;
  return tmp;
}
2741 
/* Reads a 32bit bigendian value from the file "<p>/<p1>" into *buf,
 * converted to host byte order.
 *
 * The file must contain exactly sizeof(uint32_t) bytes. Otherwise (or if it
 * cannot be read at all) *buf is left untouched, errno is set to EINVAL and
 * -1 is returned. On success the number of bytes stored, sizeof(*buf), is
 * returned.
 */
static ssize_t
hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
{
  size_t len = 0;
  uint32_t *raw = hwloc_read_raw(p, p1, &len, root_fd);
  ssize_t status;

  if (len == sizeof(*buf)) {
    /* htonl() and ntohl() perform the same byte swap */
    *buf = htonl(*raw);
    status = sizeof(*buf);
  } else {
    errno = EINVAL;
    status = -1;
  }

  /* raw is either NULL, already consumed, or contains useless things */
  free(raw);
  return status;
}
2757 
2758 typedef struct {
2759   unsigned int n, allocated;
2760   struct {
2761     hwloc_bitmap_t cpuset;
2762     uint32_t phandle;
2763     uint32_t l2_cache;
2764     char *name;
2765   } *p;
2766 } device_tree_cpus_t;
2767 
2768 static void
add_device_tree_cpus_node(device_tree_cpus_t * cpus,hwloc_bitmap_t cpuset,uint32_t l2_cache,uint32_t phandle,const char * name)2769 add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
2770     uint32_t l2_cache, uint32_t phandle, const char *name)
2771 {
2772   if (cpus->n == cpus->allocated) {
2773     void *tmp;
2774     unsigned allocated;
2775     if (!cpus->allocated)
2776       allocated = 64;
2777     else
2778       allocated = 2 * cpus->allocated;
2779     tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
2780     if (!tmp)
2781       return; /* failed to realloc, ignore this entry */
2782     cpus->p = tmp;
2783     cpus->allocated = allocated;
2784   }
2785   cpus->p[cpus->n].phandle = phandle;
2786   cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
2787   cpus->p[cpus->n].l2_cache = l2_cache;
2788   cpus->p[cpus->n].name = strdup(name);
2789   ++cpus->n;
2790 }
2791 
2792 /* Walks over the cache list in order to detect nested caches and CPU mask for each */
2793 static int
look_powerpc_device_tree_discover_cache(device_tree_cpus_t * cpus,uint32_t phandle,unsigned int * level,hwloc_bitmap_t cpuset)2794 look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
2795     uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
2796 {
2797   unsigned int i;
2798   int ret = -1;
2799   if ((NULL == level) || (NULL == cpuset) || phandle == (uint32_t) -1)
2800     return ret;
2801   for (i = 0; i < cpus->n; ++i) {
2802     if (phandle != cpus->p[i].l2_cache)
2803       continue;
2804     if (NULL != cpus->p[i].cpuset) {
2805       hwloc_bitmap_or(cpuset, cpuset, cpus->p[i].cpuset);
2806       ret = 0;
2807     } else {
2808       ++(*level);
2809       if (0 == look_powerpc_device_tree_discover_cache(cpus,
2810             cpus->p[i].phandle, level, cpuset))
2811         ret = 0;
2812     }
2813   }
2814   return ret;
2815 }
2816 
2817 static void
try__add_cache_from_device_tree_cpu(struct hwloc_topology * topology,unsigned int level,hwloc_obj_cache_type_t type,uint32_t cache_line_size,uint32_t cache_size,uint32_t cache_sets,hwloc_bitmap_t cpuset)2818 try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2819                                     unsigned int level, hwloc_obj_cache_type_t type,
2820                                     uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
2821                                     hwloc_bitmap_t cpuset)
2822 {
2823   struct hwloc_obj *c = NULL;
2824 
2825   if (0 == cache_size)
2826     return;
2827 
2828   c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
2829   c->attr->cache.depth = level;
2830   c->attr->cache.linesize = cache_line_size;
2831   c->attr->cache.size = cache_size;
2832   c->attr->cache.type = type;
2833   if (cache_sets == 1)
2834     /* likely wrong, make it unknown */
2835     cache_sets = 0;
2836   if (cache_sets && cache_line_size)
2837     c->attr->cache.associativity = cache_size / (cache_sets * cache_line_size);
2838   else
2839     c->attr->cache.associativity = 0;
2840   c->cpuset = hwloc_bitmap_dup(cpuset);
2841   hwloc_debug_2args_bitmap("cache (%s) depth %d has cpuset %s\n",
2842                            type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
2843                            level, c->cpuset);
2844   hwloc_insert_object_by_cpuset(topology, c);
2845 }
2846 
2847 static void
try_add_cache_from_device_tree_cpu(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data,const char * cpu,unsigned int level,hwloc_bitmap_t cpuset)2848 try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2849                                    struct hwloc_linux_backend_data_s *data,
2850                                    const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
2851 {
2852   /* d-cache-block-size - ignore */
2853   /* d-cache-line-size - to read, in bytes */
2854   /* d-cache-sets - ignore */
2855   /* d-cache-size - to read, in bytes */
2856   /* i-cache, same for instruction */
2857   /* cache-unified only exist if data and instruction caches are unified */
2858   /* d-tlb-sets - ignore */
2859   /* d-tlb-size - ignore, always 0 on power6 */
2860   /* i-tlb-*, same */
2861   uint32_t d_cache_line_size = 0, d_cache_size = 0, d_cache_sets = 0;
2862   uint32_t i_cache_line_size = 0, i_cache_size = 0, i_cache_sets = 0;
2863   char unified_path[1024];
2864   struct stat statbuf;
2865   int unified;
2866 
2867   snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
2868   unified = (hwloc_stat(unified_path, &statbuf, data->root_fd) == 0);
2869 
2870   hwloc_read_unit32be(cpu, "d-cache-line-size", &d_cache_line_size,
2871       data->root_fd);
2872   hwloc_read_unit32be(cpu, "d-cache-size", &d_cache_size,
2873       data->root_fd);
2874   hwloc_read_unit32be(cpu, "d-cache-sets", &d_cache_sets,
2875       data->root_fd);
2876   hwloc_read_unit32be(cpu, "i-cache-line-size", &i_cache_line_size,
2877       data->root_fd);
2878   hwloc_read_unit32be(cpu, "i-cache-size", &i_cache_size,
2879       data->root_fd);
2880   hwloc_read_unit32be(cpu, "i-cache-sets", &i_cache_sets,
2881       data->root_fd);
2882 
2883   if (!unified)
2884     try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
2885                                         i_cache_line_size, i_cache_size, i_cache_sets, cpuset);
2886   try__add_cache_from_device_tree_cpu(topology, level, unified ? HWLOC_OBJ_CACHE_UNIFIED : HWLOC_OBJ_CACHE_DATA,
2887                                       d_cache_line_size, d_cache_size, d_cache_sets, cpuset);
2888 }
2889 
2890 /*
2891  * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
2892  * which provide NUMA nodes information without any details
2893  */
2894 static void
look_powerpc_device_tree(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data)2895 look_powerpc_device_tree(struct hwloc_topology *topology,
2896                          struct hwloc_linux_backend_data_s *data)
2897 {
2898   device_tree_cpus_t cpus;
2899   const char ofroot[] = "/proc/device-tree/cpus";
2900   unsigned int i;
2901   int root_fd = data->root_fd;
2902   DIR *dt = hwloc_opendir(ofroot, root_fd);
2903   struct dirent *dirent;
2904 
2905   if (NULL == dt)
2906     return;
2907 
2908   /* only works for Power so far, and not useful on ARM */
2909   if (data->arch != HWLOC_LINUX_ARCH_POWER)
2910     return;
2911 
2912   cpus.n = 0;
2913   cpus.p = NULL;
2914   cpus.allocated = 0;
2915 
2916   while (NULL != (dirent = readdir(dt))) {
2917     char cpu[256];
2918     char *device_type;
2919     uint32_t reg = -1, l2_cache = -1, phandle = -1;
2920 
2921     if ('.' == dirent->d_name[0])
2922       continue;
2923 
2924     snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
2925 
2926     device_type = hwloc_read_str(cpu, "device_type", root_fd);
2927     if (NULL == device_type)
2928       continue;
2929 
2930     hwloc_read_unit32be(cpu, "reg", &reg, root_fd);
2931     if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
2932       hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
2933     if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
2934       if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
2935         hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);
2936 
2937     if (0 == strcmp(device_type, "cache")) {
2938       add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
2939     }
2940     else if (0 == strcmp(device_type, "cpu")) {
2941       /* Found CPU */
2942       hwloc_bitmap_t cpuset = NULL;
2943       size_t cb = 0;
2944       uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
2945       uint32_t nthreads = cb / sizeof(threads[0]);
2946 
2947       if (NULL != threads) {
2948         cpuset = hwloc_bitmap_alloc();
2949         for (i = 0; i < nthreads; ++i) {
2950           if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
2951             hwloc_bitmap_set(cpuset, ntohl(threads[i]));
2952         }
2953         free(threads);
2954       } else if ((unsigned int)-1 != reg) {
2955         /* Doesn't work on ARM because cpu "reg" do not start at 0.
2956          * We know the first cpu "reg" is the lowest. The others are likely
2957          * in order assuming the device-tree shows objects in order.
2958          */
2959         cpuset = hwloc_bitmap_alloc();
2960         hwloc_bitmap_set(cpuset, reg);
2961       }
2962 
2963       if (NULL == cpuset) {
2964         hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
2965       } else {
2966         struct hwloc_obj *core = NULL;
2967         add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
2968 
2969         /* Add core */
2970         core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
2971         core->cpuset = hwloc_bitmap_dup(cpuset);
2972         hwloc_insert_object_by_cpuset(topology, core);
2973 
2974         /* Add L1 cache */
2975         try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
2976 
2977         hwloc_bitmap_free(cpuset);
2978       }
2979     }
2980     free(device_type);
2981   }
2982   closedir(dt);
2983 
2984   /* No cores and L2 cache were found, exiting */
2985   if (0 == cpus.n) {
2986     hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
2987     return;
2988   }
2989 
2990 #ifdef HWLOC_DEBUG
2991   for (i = 0; i < cpus.n; ++i) {
2992     hwloc_debug("%i: %s  ibm,phandle=%08X l2_cache=%08X ",
2993       i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
2994     if (NULL == cpus.p[i].cpuset) {
2995       hwloc_debug("%s\n", "no cpuset");
2996     } else {
2997       hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
2998     }
2999   }
3000 #endif
3001 
3002   /* Scan L2/L3/... caches */
3003   for (i = 0; i < cpus.n; ++i) {
3004     unsigned int level = 2;
3005     hwloc_bitmap_t cpuset;
3006     /* Skip real CPUs */
3007     if (NULL != cpus.p[i].cpuset)
3008       continue;
3009 
3010     /* Calculate cache level and CPU mask */
3011     cpuset = hwloc_bitmap_alloc();
3012     if (0 == look_powerpc_device_tree_discover_cache(&cpus,
3013           cpus.p[i].phandle, &level, cpuset)) {
3014       char cpu[256];
3015       snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
3016       try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
3017     }
3018     hwloc_bitmap_free(cpuset);
3019   }
3020 
3021   /* Do cleanup */
3022   for (i = 0; i < cpus.n; ++i) {
3023     hwloc_bitmap_free(cpus.p[i].cpuset);
3024     free(cpus.p[i].name);
3025   }
3026   free(cpus.p);
3027 }
3028 
3029 /* Try to handle knl hwdata properties
3030  * Returns 0 on success and -1 otherwise */
hwloc_linux_try_handle_knl_hwdata_properties(hwloc_topology_t topology,struct hwloc_linux_backend_data_s * data,hwloc_obj_t * nodes,unsigned nbnodes)3031 static int hwloc_linux_try_handle_knl_hwdata_properties(hwloc_topology_t topology, struct hwloc_linux_backend_data_s *data, hwloc_obj_t *nodes, unsigned nbnodes)
3032 {
3033   char *knl_cache_file;
3034   long long int cache_size = -1;
3035   int associativity = -1;
3036   int inclusiveness = -1;
3037   int line_size = -1;
3038   int version = 0;
3039   unsigned i;
3040   char buffer[512] = {0};
3041   char *data_beg = NULL;
3042   char memory_mode_str[32] = {0};
3043   char cluster_mode_str[32] = {0};
3044 
3045   if (asprintf(&knl_cache_file, "%s/knl_memoryside_cache", data->dumped_hwdata_dirname) < 0)
3046     return -1;
3047 
3048   hwloc_debug("Reading knl cache data from: %s\n", knl_cache_file);
3049   if (hwloc_read_path_by_length(knl_cache_file, buffer, sizeof(buffer), data->root_fd) < 0) {
3050     hwloc_debug("Unable to open KNL data file `%s' (%s)\n", knl_cache_file, strerror(errno));
3051     free(knl_cache_file);
3052     return -1;
3053   }
3054   free(knl_cache_file);
3055 
3056   data_beg = &buffer[0];
3057 
3058   /* file must start with version information */
3059   if (sscanf(data_beg, "version: %d", &version) != 1) {
3060     fprintf(stderr, "Invalid knl_memoryside_cache header, expected \"version: <int>\".\n");
3061     return -1;
3062   }
3063 
3064   while (1) {
3065     char *line_end = strstr(data_beg, "\n");
3066     if (!line_end)
3067         break;
3068     if (version >= 1) {
3069       if (!strncmp("cache_size:", data_beg, strlen("cache_size"))) {
3070           sscanf(data_beg, "cache_size: %lld", &cache_size);
3071           hwloc_debug("read cache_size=%lld\n", cache_size);
3072       } else if (!strncmp("line_size:", data_beg, strlen("line_size:"))) {
3073           sscanf(data_beg, "line_size: %d", &line_size);
3074           hwloc_debug("read line_size=%d\n", line_size);
3075       } else if (!strncmp("inclusiveness:", data_beg, strlen("inclusiveness:"))) {
3076           sscanf(data_beg, "inclusiveness: %d", &inclusiveness);
3077           hwloc_debug("read inclusiveness=%d\n", inclusiveness);
3078       } else if (!strncmp("associativity:", data_beg, strlen("associativity:"))) {
3079           sscanf(data_beg, "associativity: %d\n", &associativity);
3080           hwloc_debug("read associativity=%d\n", associativity);
3081       }
3082     }
3083     if (version >= 2) {
3084       if (!strncmp("cluster_mode:", data_beg, strlen("cluster_mode:"))) {
3085         sscanf(data_beg, "cluster_mode: %s\n", cluster_mode_str);
3086         hwloc_debug("read cluster_mode=%s\n", cluster_mode_str);
3087       } else if (!strncmp("memory_mode:", data_beg, strlen("memory_mode:"))) {
3088         sscanf(data_beg, "memory_mode: %s\n", memory_mode_str);
3089         hwloc_debug("read memory_mode=%s\n", memory_mode_str);
3090       }
3091     }
3092 
3093     data_beg = line_end + 1;
3094   }
3095 
3096   if (line_size == -1 || cache_size == -1 || associativity == -1 || inclusiveness == -1) {
3097     hwloc_debug("Incorrect file format line_size=%d cache_size=%lld associativity=%d inclusiveness=%d\n",
3098             line_size, cache_size, associativity, inclusiveness);
3099     return -1;
3100   }
3101 
3102   /* In file version 1 mcdram_cache is always non-zero.
3103    * In file version 2 mcdram cache can be zero in flat mode. We need to check and do not expose cache in flat mode. */
3104   if (cache_size > 0) {
3105     for(i=0; i<nbnodes; i++) {
3106       hwloc_obj_t cache;
3107 
3108       if (hwloc_bitmap_iszero(nodes[i]->cpuset))
3109         /* one L3 per DDR, none for MCDRAM nodes */
3110         continue;
3111 
3112       cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
3113       if (!cache)
3114         return -1;
3115 
3116       cache->attr->cache.depth = 3;
3117       cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
3118       cache->attr->cache.associativity = associativity;
3119       hwloc_obj_add_info(cache, "Inclusive", inclusiveness ? "1" : "0");
3120       cache->attr->cache.size = cache_size;
3121       cache->attr->cache.linesize = line_size;
3122       cache->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
3123       hwloc_obj_add_info(cache, "Type", "MemorySideCache");
3124       hwloc_insert_object_by_cpuset(topology, cache);
3125     }
3126   }
3127   /* adding cluster and memory mode as properties of the machine */
3128   if (version >= 2) {
3129     hwloc_obj_add_info(topology->levels[0][0], "ClusterMode", cluster_mode_str);
3130     hwloc_obj_add_info(topology->levels[0][0], "MemoryMode", memory_mode_str);
3131   }
3132 
3133   return 0;
3134 }
3135 
3136 
3137 
3138 /**************************************
3139  ****** Sysfs Topology Discovery ******
3140  **************************************/
3141 
3142 static int
look_sysfsnode(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data,const char * path,unsigned * found)3143 look_sysfsnode(struct hwloc_topology *topology,
3144                struct hwloc_linux_backend_data_s *data,
3145                const char *path, unsigned *found)
3146 {
3147   unsigned osnode;
3148   unsigned nbnodes = 0;
3149   DIR *dir;
3150   struct dirent *dirent;
3151   hwloc_bitmap_t nodeset;
3152 
3153   *found = 0;
3154 
3155   /* Get the list of nodes first */
3156   dir = hwloc_opendir(path, data->root_fd);
3157   if (dir)
3158     {
3159       nodeset = hwloc_bitmap_alloc();
3160       while ((dirent = readdir(dir)) != NULL)
3161         {
3162           if (strncmp(dirent->d_name, "node", 4))
3163             continue;
3164           osnode = strtoul(dirent->d_name+4, NULL, 0);
3165           hwloc_bitmap_set(nodeset, osnode);
3166           nbnodes++;
3167         }
3168       closedir(dir);
3169     }
3170   else
3171     return -1;
3172 
3173   if (!nbnodes || (nbnodes == 1 && !data->is_knl)) { /* always keep NUMA for KNL, or configs might look too different */
3174     hwloc_bitmap_free(nodeset);
3175     return 0;
3176   }
3177 
3178   /* For convenience, put these declarations inside a block. */
3179 
3180   {
3181       hwloc_obj_t * nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
3182       unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
3183       float * distances = NULL;
3184       int failednodes = 0;
3185       unsigned index_;
3186 
3187       if (NULL == nodes || NULL == indexes) {
3188           free(nodes);
3189           free(indexes);
3190           hwloc_bitmap_free(nodeset);
3191           nbnodes = 0;
3192           goto out;
3193       }
3194 
3195       /* Unsparsify node indexes.
3196        * We'll need them later because Linux groups sparse distances
3197        * and keeps them in order in the sysfs distance files.
3198        * It'll simplify things in the meantime.
3199        */
3200       index_ = 0;
3201       hwloc_bitmap_foreach_begin (osnode, nodeset) {
3202         indexes[index_] = osnode;
3203         index_++;
3204       } hwloc_bitmap_foreach_end();
3205       hwloc_bitmap_free(nodeset);
3206 
3207 #ifdef HWLOC_DEBUG
3208       hwloc_debug("%s", "NUMA indexes: ");
3209       for (index_ = 0; index_ < nbnodes; index_++) {
3210         hwloc_debug(" %u", indexes[index_]);
3211       }
3212       hwloc_debug("%s", "\n");
3213 #endif
3214 
3215       /* Create NUMA objects */
3216       for (index_ = 0; index_ < nbnodes; index_++) {
3217           hwloc_obj_t node, res_obj;
3218           int annotate;
3219 
3220           osnode = indexes[index_];
3221 
3222           node = hwloc_get_numanode_obj_by_os_index(topology, osnode);
3223           annotate = (node != NULL);
3224           if (!annotate) {
3225             /* create a new node */
3226             char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
3227             hwloc_bitmap_t cpuset;
3228             sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
3229             cpuset = hwloc__alloc_read_path_as_cpumask(nodepath, data->root_fd);
3230             if (!cpuset) {
3231               /* This NUMA object won't be inserted, we'll ignore distances */
3232               failednodes++;
3233               continue;
3234             }
3235 
3236             node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, osnode);
3237             node->cpuset = cpuset;
3238             node->nodeset = hwloc_bitmap_alloc();
3239             hwloc_bitmap_set(node->nodeset, osnode);
3240           }
3241           hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->memory);
3242 
3243           hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
3244                                   osnode, node->cpuset);
3245 
3246           if (annotate) {
3247             nodes[index_] = node;
3248           } else {
3249             res_obj = hwloc_insert_object_by_cpuset(topology, node);
3250             if (node == res_obj) {
3251               nodes[index_] = node;
3252             } else {
3253               /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
3254                * This object disappeared, we'll ignore distances */
3255               failednodes++;
3256             }
3257           }
3258       }
3259 
3260       if (!failednodes && data->is_knl)
3261         hwloc_linux_try_handle_knl_hwdata_properties(topology, data, nodes, nbnodes);
3262 
3263       if (failednodes) {
3264         /* failed to read/create some nodes, don't bother reading/fixing
3265          * a distance matrix that would likely be wrong anyway.
3266          */
3267         nbnodes -= failednodes;
3268       } else if (nbnodes > 1) {
3269         distances = malloc(nbnodes*nbnodes*sizeof(*distances));
3270       }
3271 
3272       if (NULL == distances) {
3273           free(nodes);
3274           free(indexes);
3275           goto out;
3276       }
3277 
3278       if (hwloc_parse_nodes_distances(path, nbnodes, indexes, distances, data->root_fd) < 0) {
3279         free(nodes);
3280         free(distances);
3281         free(indexes);
3282         goto out;
3283       }
3284 
3285       if (data->is_knl && distances) {
3286         char *env = getenv("HWLOC_KNL_NUMA_QUIRK");
3287         if (!(env && !atoi(env)) && nbnodes>=2) { /* SNC2 or SNC4, with 0 or 2/4 MCDRAM, and 0-4 DDR nodes */
3288           unsigned i, j, closest;
3289           for(i=0; i<nbnodes; i++) {
3290             if (!hwloc_bitmap_iszero(nodes[i]->cpuset))
3291               /* nodes with CPU, that's DDR, skip it */
3292               continue;
3293             hwloc_obj_add_info(nodes[i], "Type", "MCDRAM");
3294 
3295             /* DDR is the closest node with CPUs */
3296             closest = (unsigned)-1;
3297             for(j=0; j<nbnodes; j++) {
3298               if (j==i)
3299                 continue;
3300               if (hwloc_bitmap_iszero(nodes[j]->cpuset))
3301                 /* nodes without CPU, that's another MCDRAM, skip it */
3302                 continue;
3303               if (closest == (unsigned)-1 || distances[i*nbnodes+j]<distances[i*nbnodes+closest])
3304                 closest = j;
3305             }
3306             if (closest != (unsigned) -1) {
3307               /* Add a Group for Cluster containing this MCDRAM + DDR */
3308               hwloc_obj_t cluster = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
3309               cluster->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
3310               cluster->nodeset = hwloc_bitmap_dup(nodes[i]->nodeset);
3311               hwloc_bitmap_or(cluster->cpuset, cluster->cpuset, nodes[closest]->cpuset);
3312               hwloc_bitmap_or(cluster->nodeset, cluster->nodeset, nodes[closest]->nodeset);
3313               hwloc_obj_add_info(cluster, "Type", "Cluster");
3314               hwloc_insert_object_by_cpuset(topology, cluster);
3315             }
3316           }
3317           /* drop the distance matrix, it contradicts the above NUMA layout groups */
3318           free(distances);
3319           free(nodes);
3320           free(indexes);
3321           goto out;
3322         }
3323       }
3324 
3325       hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
3326   }
3327 
3328  out:
3329   *found = nbnodes;
3330   return 0;
3331 }
3332 
3333 /* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
3334 static int
look_sysfscpu(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data,const char * path,struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs,unsigned cpuinfo_numprocs)3335 look_sysfscpu(struct hwloc_topology *topology,
3336               struct hwloc_linux_backend_data_s *data,
3337               const char *path,
3338               struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
3339 {
3340   hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
3341 #define CPU_TOPOLOGY_STR_LEN 128
3342   char str[CPU_TOPOLOGY_STR_LEN];
3343   DIR *dir;
3344   int i,j;
3345   unsigned caches_added, merge_buggy_core_siblings;
3346   hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
3347   int threadwithcoreid = data->is_amd_with_CU ? -1 : 0; /* -1 means we don't know yet if threads have their own coreids within thread_siblings */
3348 
3349   /* fill the cpuset of interesting cpus */
3350   dir = hwloc_opendir(path, data->root_fd);
3351   if (!dir)
3352     return -1;
3353   else {
3354     struct dirent *dirent;
3355     cpuset = hwloc_bitmap_alloc();
3356 
3357     while ((dirent = readdir(dir)) != NULL) {
3358       unsigned long cpu;
3359       char online[2];
3360 
3361       if (strncmp(dirent->d_name, "cpu", 3))
3362         continue;
3363       cpu = strtoul(dirent->d_name+3, NULL, 0);
3364 
3365       /* Maybe we don't have topology information but at least it exists */
3366       hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
3367 
3368       /* check whether this processor is online */
3369       sprintf(str, "%s/cpu%lu/online", path, cpu);
3370       if (hwloc_read_path_by_length(str, online, sizeof(online), data->root_fd) == 0) {
3371         if (atoi(online)) {
3372           hwloc_debug("os proc %lu is online\n", cpu);
3373         } else {
3374           hwloc_debug("os proc %lu is offline\n", cpu);
3375           hwloc_bitmap_clr(topology->levels[0][0]->online_cpuset, cpu);
3376         }
3377       }
3378 
3379       /* check whether the kernel exports topology information for this cpu */
3380       sprintf(str, "%s/cpu%lu/topology", path, cpu);
3381       if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
3382         hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
3383                    cpu, path, cpu);
3384         continue;
3385       }
3386 
3387       hwloc_bitmap_set(cpuset, cpu);
3388     }
3389     closedir(dir);
3390   }
3391 
3392   topology->support.discovery->pu = 1;
3393   hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
3394              hwloc_bitmap_weight(cpuset), cpuset);
3395 
3396   merge_buggy_core_siblings = (data->arch == HWLOC_LINUX_ARCH_X86);
3397   caches_added = 0;
3398   hwloc_bitmap_foreach_begin(i, cpuset) {
3399     hwloc_bitmap_t packageset, coreset, bookset, threadset;
3400     unsigned mypackageid, mycoreid, mybookid;
3401     int tmpint;
3402 
3403     /* look at the package */
3404     sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
3405     packageset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3406     if (packageset && hwloc_bitmap_first(packageset) == i) {
3407       /* first cpu in this package, add the package */
3408       struct hwloc_obj *package;
3409 
3410       mypackageid = (unsigned) -1;
3411       sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i); /* contains %d at least up to 4.9 */
3412       if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3413         mypackageid = (unsigned) tmpint;
3414 
3415       if (merge_buggy_core_siblings) {
3416         /* check for another package with same physical_package_id */
3417         hwloc_obj_t curpackage = packages;
3418         while (curpackage) {
3419           if (curpackage->os_index == mypackageid) {
3420             /* found another package with same physical_package_id but different core_siblings.
3421              * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
3422              * merge these core_siblings to extend the existing first package object.
3423              */
3424             static int reported = 0;
3425             if (!reported && !hwloc_hide_errors()) {
3426               char *a, *b;
3427               hwloc_bitmap_asprintf(&a, curpackage->cpuset);
3428               hwloc_bitmap_asprintf(&b, packageset);
3429               fprintf(stderr, "****************************************************************************\n");
3430               fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
3431               fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
3432                       mypackageid, a, b);
3433               fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
3434               fprintf(stderr, "* does not support this processor correctly.\n");
3435               fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
3436               fprintf(stderr, "*\n");
3437               fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
3438               fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
3439               fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
3440               fprintf(stderr, "****************************************************************************\n");
3441               reported = 1;
3442               free(a);
3443               free(b);
3444             }
3445             hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
3446             goto package_done;
3447           }
3448           curpackage = curpackage->next_cousin;
3449         }
3450       }
3451 
3452       /* no package with same physical_package_id, create a new one */
3453       package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, mypackageid);
3454       package->cpuset = packageset;
3455       hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
3456                               mypackageid, packageset);
3457       /* add cpuinfo */
3458       if (cpuinfo_Lprocs) {
3459         for(j=0; j<(int) cpuinfo_numprocs; j++)
3460           if ((int) cpuinfo_Lprocs[j].Pproc == i) {
3461             hwloc__move_infos(&package->infos, &package->infos_count,
3462                               &cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
3463           }
3464       }
3465       /* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
3466        * we'll actually insert the tree at the end of the entire sysfs cpu loop.
3467        */
3468       package->next_cousin = packages;
3469       packages = package;
3470 
3471       packageset = NULL; /* don't free it */
3472     }
3473 package_done:
3474     hwloc_bitmap_free(packageset);
3475 
3476     /* look at the core */
3477     sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3478     coreset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3479 
3480     if (coreset) {
3481       int gotcoreid = 0; /* to avoid reading the coreid twice */
3482       if (hwloc_bitmap_weight(coreset) > 1 && threadwithcoreid == -1) {
3483         /* check if this is hyper-threading or different coreids */
3484         unsigned siblingid, siblingcoreid;
3485 
3486         mycoreid = (unsigned) -1;
3487         sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3488         if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3489           mycoreid = (unsigned) tmpint;
3490         gotcoreid = 1;
3491 
3492         siblingid = hwloc_bitmap_first(coreset);
3493         if (siblingid == (unsigned) i)
3494           siblingid = hwloc_bitmap_next(coreset, i);
3495         siblingcoreid = (unsigned) -1;
3496         sprintf(str, "%s/cpu%u/topology/core_id", path, siblingid); /* contains %d at least up to 4.9 */
3497         if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3498           siblingcoreid = (unsigned) tmpint;
3499         threadwithcoreid = (siblingcoreid != mycoreid);
3500       }
3501       if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
3502         /* regular core */
3503         struct hwloc_obj *core;
3504 
3505         if (!gotcoreid) {
3506           mycoreid = (unsigned) -1;
3507           sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3508           if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3509             mycoreid = (unsigned) tmpint;
3510         }
3511 
3512         core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, mycoreid);
3513         if (threadwithcoreid)
3514           /* amd multicore compute-unit, create one core per thread */
3515           hwloc_bitmap_only(coreset, i);
3516         core->cpuset = coreset;
3517         hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
3518                                 mycoreid, core->cpuset);
3519         hwloc_insert_object_by_cpuset(topology, core);
3520         coreset = NULL; /* don't free it */
3521       }
3522       hwloc_bitmap_free(coreset);
3523     }
3524 
3525     /* look at the books */
3526     sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
3527     bookset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3528     if (bookset) {
3529       if (hwloc_bitmap_first(bookset) == i) {
3530         struct hwloc_obj *book;
3531 
3532         mybookid = (unsigned) -1;
3533         sprintf(str, "%s/cpu%d/topology/book_id", path, i); /* contains %d at least up to 4.9 */
3534         if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0) {
3535           mybookid = (unsigned) tmpint;
3536 
3537           book = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, mybookid);
3538           book->cpuset = bookset;
3539           hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
3540                                   mybookid, bookset);
3541           hwloc_obj_add_info(book, "Type", "Book");
3542           hwloc_insert_object_by_cpuset(topology, book);
3543           bookset = NULL; /* don't free it */
3544         }
3545       }
3546       hwloc_bitmap_free(bookset);
3547     }
3548 
3549     {
3550       /* look at the thread */
3551       struct hwloc_obj *thread = hwloc_alloc_setup_object(HWLOC_OBJ_PU, i);
3552       threadset = hwloc_bitmap_alloc();
3553       hwloc_bitmap_only(threadset, i);
3554       thread->cpuset = threadset;
3555       hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
3556                               i, threadset);
3557       hwloc_insert_object_by_cpuset(topology, thread);
3558     }
3559 
3560     /* look at the caches */
3561     for(j=0; j<10; j++) {
3562       char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
3563       hwloc_bitmap_t cacheset;
3564 
3565       sprintf(str, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
3566       cacheset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3567       if (cacheset) {
3568         if (hwloc_bitmap_iszero(cacheset)) {
3569           hwloc_bitmap_t tmpset;
3570           /* ia64 returning empty L3 and L2i? use the core set instead */
3571           sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3572           tmpset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3573           /* only use it if we actually got something */
3574           if (tmpset) {
3575             hwloc_bitmap_free(cacheset);
3576             cacheset = tmpset;
3577           }
3578         }
3579 
3580         if (hwloc_bitmap_first(cacheset) == i) {
3581           unsigned kB;
3582           unsigned linesize;
3583           unsigned sets, lines_per_tag;
3584           unsigned depth; /* 1 for L1, .... */
3585           hwloc_obj_cache_type_t type = HWLOC_OBJ_CACHE_UNIFIED; /* default */
3586           struct hwloc_obj *cache;
3587 
3588           /* get the cache level depth */
3589           sprintf(str, "%s/cpu%d/cache/index%d/level", path, i, j); /* contains %u at least up to 4.9 */
3590           if (hwloc_read_path_as_uint(str, &depth, data->root_fd) < 0) {
3591             hwloc_bitmap_free(cacheset);
3592             continue;
3593           }
3594 
3595           /* cache type */
3596           sprintf(str, "%s/cpu%d/cache/index%d/type", path, i, j);
3597           if (hwloc_read_path_by_length(str, str2, sizeof(str2), data->root_fd) == 0) {
3598             if (!strncmp(str2, "Data", 4))
3599               type = HWLOC_OBJ_CACHE_DATA;
3600             else if (!strncmp(str2, "Unified", 7))
3601               type = HWLOC_OBJ_CACHE_UNIFIED;
3602             else if (!strncmp(str2, "Instruction", 11))
3603               type = HWLOC_OBJ_CACHE_INSTRUCTION;
3604             else {
3605               hwloc_bitmap_free(cacheset);
3606               continue;
3607             }
3608           } else {
3609             hwloc_bitmap_free(cacheset);
3610             continue;
3611           }
3612 
3613           /* get the cache size */
3614           kB = 0;
3615           sprintf(str, "%s/cpu%d/cache/index%d/size", path, i, j); /* contains %uK at least up to 4.9 */
3616           hwloc_read_path_as_uint(str, &kB, data->root_fd);
3617           /* KNL reports L3 with size=0 and full cpuset in cpuid.
3618            * Let hwloc_linux_try_add_knl_mcdram_cache() detect it better.
3619            */
3620           if (!kB && depth == 3 && data->is_knl) {
3621             hwloc_bitmap_free(cacheset);
3622             continue;
3623           }
3624 
3625           /* get the line size */
3626           linesize = 0;
3627           sprintf(str, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j); /* contains %u at least up to 4.9 */
3628           hwloc_read_path_as_uint(str, &linesize, data->root_fd);
3629 
3630           /* get the number of sets and lines per tag.
3631            * don't take the associativity directly in "ways_of_associativity" because
3632            * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
3633            */
3634           sets = 0;
3635           sprintf(str, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j); /* contains %u at least up to 4.9 */
3636           hwloc_read_path_as_uint(str, &sets, data->root_fd);
3637 
3638           lines_per_tag = 1;
3639           sprintf(str, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j); /* contains %u at least up to 4.9 */
3640           hwloc_read_path_as_uint(str, &lines_per_tag, data->root_fd);
3641 
3642           /* first cpu in this cache, add the cache */
3643           cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
3644           cache->attr->cache.size = ((uint64_t)kB) << 10;
3645           cache->attr->cache.depth = depth;
3646           cache->attr->cache.linesize = linesize;
3647           cache->attr->cache.type = type;
3648           if (!linesize || !lines_per_tag || !sets)
3649             cache->attr->cache.associativity = 0; /* unknown */
3650           else if (sets == 1)
3651             cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
3652           else
3653             cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
3654           cache->cpuset = cacheset;
3655           hwloc_debug_1arg_bitmap("cache depth %u has cpuset %s\n",
3656                                   depth, cacheset);
3657           hwloc_insert_object_by_cpuset(topology, cache);
3658           cacheset = NULL; /* don't free it */
3659           ++caches_added;
3660         }
3661       }
3662       hwloc_bitmap_free(cacheset);
3663     }
3664   } hwloc_bitmap_foreach_end();
3665 
3666   /* actually insert in the tree now that package cpusets have been fixed-up */
3667   while (packages) {
3668     hwloc_obj_t next = packages->next_cousin;
3669     packages->next_cousin = NULL;
3670     hwloc_insert_object_by_cpuset(topology, packages);
3671     packages = next;
3672   }
3673 
3674   if (0 == caches_added)
3675     look_powerpc_device_tree(topology, data);
3676 
3677   hwloc_bitmap_free(cpuset);
3678 
3679   return 0;
3680 }
3681 
3682 
3683 
3684 /****************************************
3685  ****** cpuinfo Topology Discovery ******
3686  ****************************************/
3687 
/* Handle one "prefix : value" pair from an x86 cpuinfo file.
 * When the prefix is one of the known fields below, the value is added to
 * the given info array under the corresponding hwloc info name.
 * Unknown prefixes are ignored. Always returns 0.
 */
static int
hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
                              int is_global __hwloc_attribute_unused)
{
  static const struct {
    const char *field; /* field name as found in cpuinfo */
    const char *name;  /* info name stored in the object */
  } known[] = {
    { "vendor_id",  "CPUVendor" },
    { "model name", "CPUModel" },
    { "model",      "CPUModelNumber" },
    { "cpu family", "CPUFamilyNumber" },
    { "stepping",   "CPUStepping" }
  };
  unsigned k;

  /* field names are all distinct, so at most one entry can match */
  for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
    if (strcmp(known[k].field, prefix) == 0) {
      hwloc__add_info(infos, infos_count, known[k].name, value);
      break;
    }
  }
  return 0;
}
3706 
3707 static int
hwloc_linux_parse_cpuinfo_ia64(const char * prefix,const char * value,struct hwloc_obj_info_s ** infos,unsigned * infos_count,int is_global __hwloc_attribute_unused)3708 hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
3709                                struct hwloc_obj_info_s **infos, unsigned *infos_count,
3710                                int is_global __hwloc_attribute_unused)
3711 {
3712   if (!strcmp("vendor", prefix)) {
3713     hwloc__add_info(infos, infos_count, "CPUVendor", value);
3714   } else if (!strcmp("model name", prefix)) {
3715     hwloc__add_info(infos, infos_count, "CPUModel", value);
3716   } else if (!strcmp("model", prefix)) {
3717     hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
3718   } else if (!strcmp("family", prefix)) {
3719     hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
3720   }
3721   return 0;
3722 }
3723 
3724 static int
hwloc_linux_parse_cpuinfo_arm(const char * prefix,const char * value,struct hwloc_obj_info_s ** infos,unsigned * infos_count,int is_global __hwloc_attribute_unused)3725 hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
3726                               struct hwloc_obj_info_s **infos, unsigned *infos_count,
3727                               int is_global __hwloc_attribute_unused)
3728 {
3729   if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
3730       || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
3731     hwloc__add_info(infos, infos_count, "CPUModel", value);
3732   } else if (!strcmp("CPU implementer", prefix)) {
3733     hwloc__add_info(infos, infos_count, "CPUImplementer", value);
3734   } else if (!strcmp("CPU architecture", prefix)) {
3735     hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
3736   } else if (!strcmp("CPU variant", prefix)) {
3737     hwloc__add_info(infos, infos_count, "CPUVariant", value);
3738   } else if (!strcmp("CPU part", prefix)) {
3739     hwloc__add_info(infos, infos_count, "CPUPart", value);
3740   } else if (!strcmp("CPU revision", prefix)) {
3741     hwloc__add_info(infos, infos_count, "CPURevision", value);
3742   } else if (!strcmp("Hardware", prefix)) {
3743     hwloc__add_info(infos, infos_count, "HardwareName", value);
3744   } else if (!strcmp("Revision", prefix)) {
3745     hwloc__add_info(infos, infos_count, "HardwareRevision", value);
3746   } else if (!strcmp("Serial", prefix)) {
3747     hwloc__add_info(infos, infos_count, "HardwareSerial", value);
3748   }
3749   return 0;
3750 }
3751 
/* Handle one "prefix : value" pair from a PowerPC cpuinfo file.
 * Most known fields are added to the given info array under their hwloc
 * info name. "Board" and "Machine" instead overwrite the value held in the
 * slot returned by hwloc__find_info_slot() for "PlatformModel".
 * is_global selects whether a revision line is stored as "PlatformRevision"
 * (non-zero) or "CPURevision" (zero).
 * Unknown prefixes are ignored. Always returns 0.
 */
static int
hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
                              int is_global)
{
  const char *name = NULL;

  /* machine and board are similar (and often more precise) than model,
   * so they take over the PlatformModel slot.
   * don't match 'board*' because there's also "board l2" on some platforms.
   */
  if (strcmp("Board", prefix) == 0 || strcasecmp("Machine", prefix) == 0) {
    char **slot = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
    free(*slot); /* free(NULL) is a no-op */
    *slot = strdup(value);
    return 0;
  }

  /* common fields */
  if (strcmp("cpu", prefix) == 0)
    name = "CPUModel";
  else if (strcmp("platform", prefix) == 0)
    name = "PlatformName";
  else if (strcmp("model", prefix) == 0)
    name = "PlatformModel";
  /* platform-specific fields */
  else if (strcasecmp("vendor", prefix) == 0)
    name = "PlatformVendor";
  else if (strcmp("Board ID", prefix) == 0)
    name = "PlatformBoardID";
  else if (strcasecmp("Revision", prefix) == 0 || strcmp("Hardware rev", prefix) == 0)
    name = is_global ? "PlatformRevision" : "CPURevision";
  else if (strcmp("SVR", prefix) == 0)
    name = "SystemVersionRegister";
  else if (strcmp("PVR", prefix) == 0)
    name = "ProcessorVersionRegister";

  if (name != NULL)
    hwloc__add_info(infos, infos_count, name, value);
  return 0;
}
3788 
/*
 * Fallback cpuinfo parser for architectures without a dedicated routine.
 * It only looks for a CPU model line; when several candidate lines appear,
 * the last one wins. Always returns 0.
 *
 * Status of the model line per architecture:
 * avr32: "chip type\t:"                        => OK
 * blackfin: "model name\t:"                    => OK
 * h8300: "CPU:"                                => OK
 * m68k: "CPU:"                                 => OK
 * mips: "cpu model\t\t:"                       => OK
 * openrisc: "CPU:"                             => OK
 * sparc: "cpu\t\t:"                            => OK
 * tile: "model name\t:"                        => OK
 * unicore32: "Processor\t:"                    => OK
 * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:" => "cpu" overwritten by "cpu model", no processor indexes
 * cris: "cpu\t\t:" + "cpu model\t:"            => only "cpu"
 * frv: "CPU-Core:" + "CPU:"                    => only "CPU"
 * mn10300: "cpu core   :" + "model name :"     => only "model name"
 * parisc: "cpu family\t:" + "cpu\t\t:"         => only "cpu"
 *
 * not supported because of conflicts with other arch minor lines:
 * m32r: "cpu family\t:"                        => KO (adding "cpu family" would break "blackfin")
 * microblaze: "CPU-Family:"                    => KO
 * sh: "cpu family\t:" + "cpu type\t:"          => KO
 * xtensa: "model\t\t:"                         => KO
 */
static int
hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
                                  struct hwloc_obj_info_s **infos, unsigned *infos_count,
                                  int is_global __hwloc_attribute_unused)
{
  char **slot;

  /* nothing to do unless this is one of the model-like fields */
  if (strcmp("model name", prefix) != 0
      && strcmp("Processor", prefix) != 0
      && strcmp("chip type", prefix) != 0
      && strcmp("cpu model", prefix) != 0
      && strcasecmp("cpu", prefix) != 0)
    return 0;

  /* keep the last one, assume it's more precise than the first one.
   * we should have the Architecture keypair for basic information anyway.
   */
  slot = hwloc__find_info_slot(infos, infos_count, "CPUModel");
  free(*slot); /* free(NULL) is a no-op */
  *slot = strdup(value);
  return 0;
}
3831 
/* Parse the cpuinfo-formatted file at path (opened relative to data->root_fd).
 *
 * Each "processor" line starts a new entry in a growing array of
 * struct hwloc_linux_cpuinfo_proc. "physical id" and "core id" lines fill
 * the Ppkg and Pcore fields of the current entry. Every other
 * "prefix : value" line is passed to the parser selected by data->arch,
 * which stores into the current entry's infos, or into global_infos when
 * there is no current entry (before the first processor line or after an
 * empty line).
 *
 * On success, returns the number of entries and stores the array in
 * *Lprocs_p (NULL when there is no entry); hwloc_linux_free_cpuinfo() can
 * release it.
 * On failure (file cannot be opened, a number cannot be parsed, or memory
 * allocation fails), returns -1 with *Lprocs_p set to NULL; infos already
 * attached to processor entries are released, global_infos is left as is.
 */
static int
hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
                          const char *path,
                          struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
                          struct hwloc_obj_info_s **global_infos, unsigned *global_infos_count)
{
  FILE *fd;
  char *str = NULL;
  char *endptr;
  unsigned len;
  unsigned allocated_Lprocs = 0;
  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
  unsigned numprocs = 0; /* only counts fully initialized entries */
  unsigned k;
  int curproc = -1;
  int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_obj_info_s **, unsigned *, int) = NULL;

  /* honor the output contract on every failure path, including early ones */
  *Lprocs_p = NULL;

  if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
    {
      hwloc_debug("could not open %s\n", path);
      return -1;
    }

  /* architecture specific or default routine for parsing cpumodel.
   * the architecture does not change while parsing, so select it once.
   */
  switch (data->arch) {
  case HWLOC_LINUX_ARCH_X86:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
    break;
  case HWLOC_LINUX_ARCH_ARM:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
    break;
  case HWLOC_LINUX_ARCH_POWER:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
    break;
  case HWLOC_LINUX_ARCH_IA64:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
    break;
  default:
    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
  }

#      define PROCESSOR "processor"
#      define PACKAGEID "physical id" /* the longest one */
#      define COREID "core id"
  len = 128; /* vendor/model can be very long */
  str = malloc(len);
  if (!str)
    goto err;
  hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
  while (fgets(str,len,fd)!=NULL) {
    unsigned long Ppkg, Pcore, Pproc;
    char *end, *dot, *prefix, *value;
    int noend = 0;

    /* remove the ending \n */
    end = strchr(str, '\n');
    if (end)
      *end = 0;
    else
      noend = 1; /* line longer than the buffer, the remainder is dropped below */
    /* if empty line, skip and reset curproc */
    if (!*str) {
      curproc = -1;
      continue;
    }
    /* skip lines with no dot */
    dot = strchr(str, ':');
    if (!dot)
      continue;
    /* skip lines not starting with a letter */
    if ((*str > 'z' || *str < 'a')
        && (*str > 'Z' || *str < 'A'))
      continue;

    /* mark the end of the prefix.
     * the backward scan cannot run before str since str[0] is a letter.
     */
    prefix = str;
    end = dot;
    while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
    *end = 0;
    /* find beginning of value, its end is already marked.
     * skip tabs as well as spaces, like the prefix trimming above does.
     */
    value = dot+1 + strspn(dot+1, " \t");

    /* defines for parsing numbers */
#   define getprocnb_begin(field, var)                                  \
    if (!strcmp(field,prefix)) {                                        \
      var = strtoul(value,&endptr,0);                                   \
      if (endptr==value) {                                              \
        hwloc_debug("no number in "field" field of %s\n", path);        \
        goto err;                                                       \
      } else if (var==ULONG_MAX) {                                      \
        hwloc_debug("too big "field" number in %s\n", path);            \
        goto err;                                                       \
      }                                                                 \
      hwloc_debug(field " %lu\n", var)
#   define getprocnb_end()                                              \
    }
    /* actually parse numbers */
    getprocnb_begin(PROCESSOR, Pproc);
    /* grow the array before counting the new entry, so that numprocs never
     * covers an uninitialized or unallocated slot if realloc fails.
     */
    if (numprocs >= allocated_Lprocs) {
      struct hwloc_linux_cpuinfo_proc * tmp;
      unsigned newalloc = allocated_Lprocs ? 2 * allocated_Lprocs : 8;
      tmp = realloc(Lprocs, newalloc * sizeof(*Lprocs));
      if (!tmp)
        goto err;
      Lprocs = tmp;
      allocated_Lprocs = newalloc;
    }
    curproc = numprocs++;
    Lprocs[curproc].Pproc = Pproc;
    Lprocs[curproc].Pcore = -1;
    Lprocs[curproc].Ppkg = -1;
    Lprocs[curproc].Lcore = -1;
    Lprocs[curproc].Lpkg = -1;
    Lprocs[curproc].infos = NULL;
    Lprocs[curproc].infos_count = 0;
    getprocnb_end() else
    getprocnb_begin(PACKAGEID, Ppkg);
    /* a package id outside of a processor section has no entry to go to */
    if (curproc >= 0) {
      Lprocs[curproc].Ppkg = Ppkg;
    }
    getprocnb_end() else
    getprocnb_begin(COREID, Pcore);
    /* a core id outside of a processor section has no entry to go to */
    if (curproc >= 0) {
      Lprocs[curproc].Pcore = Pcore;
    }
    getprocnb_end() else {

      /* we can't assume that we already got a processor index line:
       * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
       * tile has a global section with model name before the list of processor lines.
       */
      parse_cpuinfo_func(prefix, value,
                         curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
                         curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
                         curproc < 0);
    }

    if (noend) {
      /* ignore end of line */
      if (fscanf(fd,"%*[^\n]") == EOF)
        break;
      getc(fd);
    }
  }
  fclose(fd);
  free(str);

  *Lprocs_p = Lprocs;
  return numprocs;

 err:
  fclose(fd);
  free(str);
  /* release the infos attached to the entries parsed so far */
  for (k = 0; k < numprocs; k++)
    hwloc__free_infos(Lprocs[k].infos, Lprocs[k].infos_count);
  free(Lprocs);
  *Lprocs_p = NULL;
  return -1;
}
3985 
/* Release what cpuinfo parsing produced: the infos of each of the numprocs
 * entries of Lprocs, the Lprocs array itself (when non-NULL), and the
 * global infos.
 */
static void
hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
                         struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
{
  if (Lprocs != NULL) {
    struct hwloc_linux_cpuinfo_proc *cur = Lprocs;
    struct hwloc_linux_cpuinfo_proc *stop = Lprocs + numprocs;

    while (cur != stop) {
      hwloc__free_infos(cur->infos, cur->infos_count);
      cur++;
    }
    free(Lprocs);
  }

  hwloc__free_infos(global_infos, global_infos_count);
}
3999 
/* Build PU, package and core objects from parsed cpuinfo entries.
 *
 * One PU object is inserted per entry of Lprocs, and online_cpuset is
 * overwritten with the set of their OS indexes.
 * Package objects are inserted only if every entry has a package id (Ppkg),
 * and core objects only if every entry has a core id (Pcore); cores are
 * identified by the (Ppkg, Pcore) pair. The infos of the first entry of each
 * package are moved into the package object.
 * Lprocs[].Lpkg and Lprocs[].Lcore are filled as a side effect.
 *
 * Returns 0 on success, or -1 if the temporary buffers cannot be allocated,
 * in which case nothing has been inserted in the topology yet.
 */
static int
look_cpuinfo(struct hwloc_topology *topology,
             struct hwloc_linux_cpuinfo_proc * Lprocs,
             unsigned numprocs, hwloc_bitmap_t online_cpuset)
{
  /* P for physical/OS index, L for logical (e.g. in the order we get them, not in the final hwloc logical order) */
  unsigned *Lcore_to_Pcore;
  unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
  unsigned *Lpkg_to_Ppkg;
  unsigned numpkgs=0;
  unsigned numcores=0;
  unsigned long Lproc;
  unsigned missingpkg;
  unsigned missingcore;
  unsigned i,j;
  hwloc_bitmap_t cpuset;

  /* initialize misc arrays, there can be at most numprocs entries */
  Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
  Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
  Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
  cpuset = hwloc_bitmap_alloc();
  /* bail out before modifying the topology if anything is missing.
   * malloc(0) may legitimately return NULL, so only check the arrays
   * when they are actually going to be accessed.
   */
  if (!cpuset
      || (numprocs && (!Lcore_to_Pcore || !Lcore_to_Ppkg || !Lpkg_to_Ppkg))) {
    if (cpuset)
      hwloc_bitmap_free(cpuset);
    free(Lcore_to_Pcore);
    free(Lcore_to_Ppkg);
    free(Lpkg_to_Ppkg);
    return -1;
  }
  for (i = 0; i < numprocs; i++) {
    Lcore_to_Pcore[i] = -1;
    Lcore_to_Ppkg[i] = -1;
    Lpkg_to_Ppkg[i] = -1;
  }

  /* create PU objects */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    unsigned long Pproc = Lprocs[Lproc].Pproc;
    hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
    hwloc_bitmap_set(cpuset, Pproc);
    obj->cpuset = hwloc_bitmap_alloc();
    hwloc_bitmap_only(obj->cpuset, Pproc);
    hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
                             Lproc, Pproc, obj->cpuset);
    hwloc_insert_object_by_cpuset(topology, obj);
  }

  topology->support.discovery->pu = 1;
  hwloc_bitmap_copy(online_cpuset, cpuset);
  hwloc_bitmap_free(cpuset);

  hwloc_debug("%u online processors found\n", numprocs);
  hwloc_debug_bitmap("online processor cpuset: %s\n", online_cpuset);

  hwloc_debug("%s", "\n * Topology summary *\n");
  hwloc_debug("%u processors)\n", numprocs);

  /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg:
   * each distinct Ppkg gets the next logical package index, in order of appearance
   */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    long Ppkg = Lprocs[Lproc].Ppkg;
    if (Ppkg != -1) {
      unsigned long Pproc = Lprocs[Lproc].Pproc;
      for (i=0; i<numpkgs; i++)
        if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
          break;
      Lprocs[Lproc].Lpkg = i;
      hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, Ppkg);
      if (i==numpkgs) {
        /* first time we see this package */
        Lpkg_to_Ppkg[numpkgs] = Ppkg;
        numpkgs++;
      }
    }
  }
  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
   * provide bogus information. We should rather drop it. */
  missingpkg=0;
  for(j=0; j<numprocs; j++)
    if (Lprocs[j].Ppkg == -1) {
      missingpkg=1;
      break;
    }
  /* create package objects */
  hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
  if (!missingpkg && numpkgs>0) {
    for (i = 0; i < numpkgs; i++) {
      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
      int doneinfos = 0;
      obj->cpuset = hwloc_bitmap_alloc();
      for(j=0; j<numprocs; j++)
        if ((unsigned) Lprocs[j].Lpkg == i) {
          hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
          if (!doneinfos) {
            /* the package takes over the infos of its first processor only */
            hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
            doneinfos = 1;
          }
        }
      hwloc_debug_1arg_bitmap("package %d has cpuset %s\n", i, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    hwloc_debug("%s", "\n");
  }

  /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore:
   * each distinct (Ppkg, Pcore) pair gets the next logical core index
   */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    long Pcore = Lprocs[Lproc].Pcore;
    if (Pcore != -1) {
      for (i=0; i<numcores; i++)
        if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
          break;
      Lprocs[Lproc].Lcore = i;
      if (i==numcores) {
        /* first time we see this core */
        Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
        Lcore_to_Pcore[numcores] = Pcore;
        numcores++;
      }
    }
  }
  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
   * provide bogus information. We should rather drop it. */
  missingcore=0;
  for(j=0; j<numprocs; j++)
    if (Lprocs[j].Pcore == -1) {
      missingcore=1;
      break;
    }
  /* create Core objects */
  hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
  if (!missingcore && numcores>0) {
    for (i = 0; i < numcores; i++) {
      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
      obj->cpuset = hwloc_bitmap_alloc();
      for(j=0; j<numprocs; j++)
        if ((unsigned) Lprocs[j].Lcore == i)
          hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
      hwloc_debug_1arg_bitmap("Core %d has cpuset %s\n", i, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    hwloc_debug("%s", "\n");
  }

  free(Lcore_to_Pcore);
  free(Lcore_to_Ppkg);
  free(Lpkg_to_Ppkg);
  return 0;
}
4139 
4140 
4141 
4142 /*************************************
4143  ****** Main Topology Discovery ******
4144  *************************************/
4145 
4146 static void
hwloc__linux_get_mic_sn(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data)4147 hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
4148 {
4149   char line[64], *tmp, *end;
4150   if (hwloc_read_path_by_length("/proc/elog", line, sizeof(line), data->root_fd) < 0)
4151     return;
4152   if (strncmp(line, "Card ", 5))
4153     return;
4154   tmp = line + 5;
4155   end = strchr(tmp, ':');
4156   if (!end)
4157     return;
4158   *end = '\0';
4159   hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", tmp);
4160 }
4161 
4162 static void
hwloc_gather_system_info(struct hwloc_topology * topology,struct hwloc_linux_backend_data_s * data)4163 hwloc_gather_system_info(struct hwloc_topology *topology,
4164                          struct hwloc_linux_backend_data_s *data)
4165 {
4166   FILE *file;
4167   char line[128]; /* enough for utsname fields */
4168   const char *env;
4169 
4170   /* initialize to something sane, in case !is_thissystem and we can't find things in /proc/hwloc-nofile-info */
4171   memset(&data->utsname, 0, sizeof(data->utsname));
4172   data->fallback_nbprocessors = 1;
4173   data->pagesize = 4096;
4174 
4175   /* read thissystem info */
4176   if (topology->is_thissystem) {
4177     uname(&data->utsname);
4178     data->fallback_nbprocessors = hwloc_fallback_nbprocessors(topology);
4179     data->pagesize = hwloc_getpagesize();
4180   }
4181 
4182   /* overwrite with optional /proc/hwloc-nofile-info */
4183   file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
4184   if (file) {
4185     while (fgets(line, sizeof(line), file)) {
4186       char *tmp = strchr(line, '\n');
4187       if (!strncmp("OSName: ", line, 8)) {
4188         if (tmp)
4189           *tmp = '\0';
4190         strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
4191         data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
4192       } else if (!strncmp("OSRelease: ", line, 11)) {
4193         if (tmp)
4194           *tmp = '\0';
4195         strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
4196         data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
4197       } else if (!strncmp("OSVersion: ", line, 11)) {
4198         if (tmp)
4199           *tmp = '\0';
4200         strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
4201         data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
4202       } else if (!strncmp("HostName: ", line, 10)) {
4203         if (tmp)
4204           *tmp = '\0';
4205         strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
4206         data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
4207       } else if (!strncmp("Architecture: ", line, 14)) {
4208         if (tmp)
4209           *tmp = '\0';
4210         strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
4211         data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
4212       } else if (!strncmp("FallbackNbProcessors: ", line, 22)) {
4213         if (tmp)
4214           *tmp = '\0';
4215         data->fallback_nbprocessors = atoi(line+22);
4216       } else if (!strncmp("PageSize: ", line, 10)) {
4217         if (tmp)
4218          *tmp = '\0';
4219         data->pagesize = strtoull(line+10, NULL, 10);
4220       } else {
4221         hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
4222         /* ignored */
4223       }
4224     }
4225     fclose(file);
4226   }
4227 
4228   env = getenv("HWLOC_DUMP_NOFILE_INFO");
4229   if (env && *env) {
4230     file = fopen(env, "w");
4231     if (file) {
4232       if (*data->utsname.sysname)
4233         fprintf(file, "OSName: %s\n", data->utsname.sysname);
4234       if (*data->utsname.release)
4235         fprintf(file, "OSRelease: %s\n", data->utsname.release);
4236       if (*data->utsname.version)
4237         fprintf(file, "OSVersion: %s\n", data->utsname.version);
4238       if (*data->utsname.nodename)
4239         fprintf(file, "HostName: %s\n", data->utsname.nodename);
4240       if (*data->utsname.machine)
4241         fprintf(file, "Architecture: %s\n", data->utsname.machine);
4242       fprintf(file, "FallbackNbProcessors: %u\n", data->fallback_nbprocessors);
4243       fprintf(file, "PageSize: %llu\n", (unsigned long long) data->pagesize);
4244       fclose(file);
4245     }
4246   }
4247 
4248   /* detect arch for quirks, using configure #defines if possible, or uname */
4249 #if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
4250   if (topology->is_thissystem)
4251     data->arch = HWLOC_LINUX_ARCH_X86;
4252 #endif
4253   if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
4254     if (!strcmp(data->utsname.machine, "x86_64")
4255         || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
4256         || !strcmp(data->utsname.machine, "k1om"))
4257       data->arch = HWLOC_LINUX_ARCH_X86;
4258     else if (!strncmp(data->utsname.machine, "arm", 3))
4259       data->arch = HWLOC_LINUX_ARCH_ARM;
4260     else if (!strncmp(data->utsname.machine, "ppc", 3)
4261              || !strncmp(data->utsname.machine, "power", 5))
4262       data->arch = HWLOC_LINUX_ARCH_POWER;
4263     else if (!strcmp(data->utsname.machine, "ia64"))
4264       data->arch = HWLOC_LINUX_ARCH_IA64;
4265   }
4266 }
4267 
4268 /* returns 0 on success, -1 on non-match or error during hardwired load */
4269 static int
hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend * backend)4270 hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend *backend)
4271 {
4272   struct hwloc_topology *topology = backend->topology;
4273   struct hwloc_linux_backend_data_s *data = backend->private_data;
4274 
4275   if (getenv("HWLOC_NO_HARDWIRED_TOPOLOGY"))
4276     return -1;
4277 
4278   if (!strcmp(data->utsname.machine, "s64fx")) {
4279     char line[128];
4280     /* Fujistu K-computer, FX10, and FX100 use specific processors
4281      * whose Linux topology support is broken until 4.1 (acc455cffa75070d55e74fc7802b49edbc080e92and)
4282      * and existing machines will likely never be fixed by kernel upgrade.
4283      */
4284 
4285     /* /proc/cpuinfo starts with one of these lines:
4286      * "cpu             : Fujitsu SPARC64 VIIIfx"
4287      * "cpu             : Fujitsu SPARC64 XIfx"
4288      * "cpu             : Fujitsu SPARC64 IXfx"
4289      */
4290     if (hwloc_read_path_by_length("/proc/cpuinfo", line, sizeof(line), data->root_fd) < 0)
4291       return -1;
4292 
4293     if (strncmp(line, "cpu      ", 4))
4294       return -1;
4295 
4296     if (strstr(line, "Fujitsu SPARC64 VIIIfx"))
4297       return hwloc_look_hardwired_fujitsu_k(topology);
4298     else if (strstr(line, "Fujitsu SPARC64 IXfx"))
4299       return hwloc_look_hardwired_fujitsu_fx10(topology);
4300     else if (strstr(line, "FUJITSU SPARC64 XIfx"))
4301       return hwloc_look_hardwired_fujitsu_fx100(topology);
4302   }
4303   return -1;
4304 }
4305 
/* Look up the cgroup/cpuset mount points below root_path and, if the process
 * topology->pid belongs to a named cpuset, pass that cpuset's "cpus" and
 * "mems" to hwloc_admin_disable_set_from_cpuset() together with the root
 * object's allowed_cpuset and allowed_nodeset.
 *
 * *cpuset_namep receives the cpuset name (to be freed by the caller), or
 * NULL when no mount point or no name was found.
 */
static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep)
{
  char *cgroup_mntpnt, *cpuset_mntpnt;
  char *name;
  hwloc_obj_t root;

  hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, root_path);
  if (!cgroup_mntpnt && !cpuset_mntpnt) {
    /* neither cgroup nor cpuset is mounted, nothing to restrict */
    *cpuset_namep = NULL;
    return;
  }

  name = hwloc_read_linux_cpuset_name(root_fd, topology->pid);
  if (name) {
    root = topology->levels[0][0];
    hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, name, "cpus", root->allowed_cpuset);
    hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, name, "mems", root->allowed_nodeset);
  }

  free(cgroup_mntpnt);
  free(cpuset_mntpnt);
  *cpuset_namep = name;
}
4321 
4322 static int
hwloc_look_linuxfs(struct hwloc_backend * backend)4323 hwloc_look_linuxfs(struct hwloc_backend *backend)
4324 {
4325   struct hwloc_topology *topology = backend->topology;
4326   struct hwloc_linux_backend_data_s *data = backend->private_data;
4327   DIR *nodes_dir;
4328   unsigned nbnodes;
4329   char *cpuset_name;
4330   struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
4331   struct hwloc_obj_info_s *global_infos = NULL;
4332   unsigned global_infos_count = 0;
4333   int numprocs;
4334   int already_pus;
4335   int err;
4336 
4337   already_pus = (topology->levels[0][0]->complete_cpuset != NULL
4338                  && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_cpuset));
4339   /* if there are PUs, still look at memory information
4340    * since x86 misses NUMA node information (unless the processor supports topoext)
4341    * memory size.
4342    */
4343 
4344   /* allocate root sets in case not done yet */
4345   hwloc_alloc_obj_cpusets(topology->levels[0][0]);
4346 
4347   /*********************************
4348    * Platform information for later
4349    */
4350   hwloc_gather_system_info(topology, data);
4351 
4352   /**********************
4353    * /proc/cpuinfo
4354    */
4355   numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
4356   if (numprocs < 0)
4357     numprocs = 0;
4358 
4359   /**************************
4360    * detect model for quirks
4361    */
4362   if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
4363       unsigned i;
4364       const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
4365       for(i=0; i<Lprocs[0].infos_count; i++) {
4366         if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
4367           cpuvendor = Lprocs[0].infos[i].value;
4368         } else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
4369           cpufamilynumber = Lprocs[0].infos[i].value;
4370         } else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
4371           cpumodelnumber = Lprocs[0].infos[i].value;
4372         }
4373       }
4374       if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
4375           && cpufamilynumber && !strcmp(cpufamilynumber, "6")
4376           && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
4377           || !strcmp(cpumodelnumber, "133")))
4378         data->is_knl = 1;
4379       if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
4380           && cpufamilynumber
4381           && (!strcmp(cpufamilynumber, "21")
4382               || !strcmp(cpufamilynumber, "22")))
4383         data->is_amd_with_CU = 1;
4384   }
4385 
4386   /**********************
4387    * Gather the list of admin-disabled cpus and mems
4388    */
4389   hwloc_linux__get_allowed_resources(topology, data->root_path, data->root_fd, &cpuset_name);
4390 
4391   nodes_dir = hwloc_opendir("/proc/nodes", data->root_fd);
4392   if (nodes_dir) {
4393     /* Kerrighed */
4394     struct dirent *dirent;
4395     char path[128];
4396     hwloc_obj_t machine;
4397     hwloc_bitmap_t machine_online_set;
4398 
4399     if (already_pus) {
4400       /* we don't support extending kerrighed topologies */
4401       free(cpuset_name);
4402       hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
4403       return 0;
4404     }
4405 
4406     /* replace top-level object type with SYSTEM and add some MACHINE underneath */
4407 
4408     topology->levels[0][0]->type = HWLOC_OBJ_SYSTEM;
4409     topology->levels[0][0]->name = strdup("Kerrighed");
4410 
4411     /* No cpuset support for now.  */
4412     /* No sys support for now.  */
4413     while ((dirent = readdir(nodes_dir)) != NULL) {
4414       struct hwloc_linux_cpuinfo_proc * machine_Lprocs = NULL;
4415       struct hwloc_obj_info_s *machine_global_infos = NULL;
4416       unsigned machine_global_infos_count = 0;
4417       int machine_numprocs = 0;
4418       unsigned long node;
4419       if (strncmp(dirent->d_name, "node", 4))
4420         continue;
4421       machine_online_set = hwloc_bitmap_alloc();
4422       node = strtoul(dirent->d_name+4, NULL, 0);
4423       snprintf(path, sizeof(path), "/proc/nodes/node%lu/cpuinfo", node);
4424       machine_numprocs = hwloc_linux_parse_cpuinfo(data, path, &machine_Lprocs, &machine_global_infos, &machine_global_infos_count);
4425       if (machine_numprocs < 0) {
4426         err = -1;
4427         machine_numprocs = 0;
4428       } else {
4429         err = look_cpuinfo(topology, machine_Lprocs, machine_numprocs, machine_online_set);
4430       }
4431 
4432       hwloc_linux_free_cpuinfo(machine_Lprocs, machine_numprocs, machine_global_infos, machine_global_infos_count);
4433       if (err < 0) {
4434         hwloc_bitmap_free(machine_online_set);
4435         continue;
4436       }
4437       hwloc_bitmap_or(topology->levels[0][0]->online_cpuset, topology->levels[0][0]->online_cpuset, machine_online_set);
4438       machine = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, node);
4439       machine->cpuset = machine_online_set;
4440       hwloc_debug_1arg_bitmap("machine number %lu has cpuset %s\n",
4441                  node, machine_online_set);
4442 
4443       /* Get the machine memory attributes */
4444       hwloc_get_kerrighed_node_meminfo_info(topology, data, node, &machine->memory);
4445 
4446       /* Gather DMI info */
4447       /* FIXME: get the right DMI info of each machine */
4448       hwloc__get_dmi_id_info(data, machine);
4449 
4450       hwloc_insert_object_by_cpuset(topology, machine);
4451     }
4452     closedir(nodes_dir);
4453   } else {
4454     /*********************
4455      * Memory information
4456      */
4457 
4458     /* Get the machine memory attributes */
4459     hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);
4460 
4461     /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
4462     if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &nbnodes) < 0)
4463       look_sysfsnode(topology, data, "/sys/devices/system/node", &nbnodes);
4464 
4465     /* if we found some numa nodes, the machine object has no local memory */
4466     if (nbnodes) {
4467       unsigned i;
4468       topology->levels[0][0]->memory.local_memory = 0;
4469       if (topology->levels[0][0]->memory.page_types)
4470         for(i=0; i<topology->levels[0][0]->memory.page_types_len; i++)
4471           topology->levels[0][0]->memory.page_types[i].count = 0;
4472     }
4473 
4474     /**********************
4475      * CPU information
4476      */
4477 
4478     /* Don't rediscover CPU resources if already done */
4479     if (already_pus)
4480       goto done;
4481 
4482     /* Gather the list of cpus now */
4483     err = hwloc_linux_try_hardwired_cpuinfo(backend);
4484     if (!err)
4485       goto done;
4486 
4487     /* setup root info */
4488     hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
4489                       &global_infos, &global_infos_count);
4490 
4491     if (getenv("HWLOC_LINUX_USE_CPUINFO")
4492         || (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
4493             && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
4494             && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
4495             && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0)) {
4496         /* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
4497          * or not containing anything interesting */
4498       if (numprocs > 0)
4499         err = look_cpuinfo(topology, Lprocs, numprocs, topology->levels[0][0]->online_cpuset);
4500       else
4501         err = -1;
4502       if (err < 0)
4503         hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
4504       look_powerpc_device_tree(topology, data);
4505 
4506     } else {
4507       /* sysfs */
4508       if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0)
4509         if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0)
4510           /* sysfs but we failed to read cpu topology, fallback */
4511           hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
4512     }
4513 
4514  done:
4515 
4516     /**********************
4517      * Misc
4518      */
4519 
4520     /* Gather DMI info */
4521     hwloc__get_dmi_id_info(data, topology->levels[0][0]);
4522     if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
4523       hwloc__get_firmware_dmi_memory_info(topology, data);
4524   }
4525 
4526   hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
4527   if (cpuset_name) {
4528     hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
4529     free(cpuset_name);
4530   }
4531 
4532   hwloc__linux_get_mic_sn(topology, data);
4533 
4534   /* data->utsname was filled with real uname or \0, we can safely pass it */
4535   hwloc_add_uname_info(topology, &data->utsname);
4536 
4537   hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
4538   return 1;
4539 }
4540 
4541 
4542 
4543 /****************************************
4544  ***** Linux PCI backend callbacks ******
4545  ****************************************
4546  * Do not support changing the fsroot (use sysfs)
4547  */
4548 
4549 static hwloc_obj_t
hwloc_linux_add_os_device(struct hwloc_backend * backend,struct hwloc_obj * pcidev,hwloc_obj_osdev_type_t type,const char * name)4550 hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
4551 {
4552   struct hwloc_topology *topology = backend->topology;
4553   struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
4554   obj->name = strdup(name);
4555   obj->logical_index = -1;
4556   obj->attr->osdev.type = type;
4557 
4558   hwloc_insert_object_by_parent(topology, pcidev, obj);
4559   /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
4560 
4561   return obj;
4562 }
4563 
/* Callback type used to annotate an OS device: it receives the backend,
 * the OS device object and the sysfs path of that OS device.
 */
typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend,
                                              struct hwloc_obj *osdev,
                                              const char *osdevpath);
4565 
4566 /* cannot be used in fsroot-aware code, would have to move to a per-topology variable */
4567 
4568 static void
hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s * data)4569 hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
4570 {
4571   int root_fd = data->root_fd;
4572   DIR *dir;
4573   struct dirent *dirent;
4574   char path[128];
4575   struct stat st;
4576 
4577   data->deprecated_classlinks_model = -1;
4578 
4579   dir = hwloc_opendir("/sys/class/net", root_fd);
4580   if (!dir)
4581     return;
4582   while ((dirent = readdir(dir)) != NULL) {
4583     if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, "..") || !strcmp(dirent->d_name, "lo"))
4584       continue;
4585     snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", dirent->d_name, dirent->d_name);
4586     if (hwloc_stat(path, &st, root_fd) == 0) {
4587       data->deprecated_classlinks_model = 0;
4588       goto out;
4589     }
4590     snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", dirent->d_name, dirent->d_name);
4591     if (hwloc_stat(path, &st, root_fd) == 0) {
4592       data->deprecated_classlinks_model = 1;
4593       goto out;
4594     }
4595   }
4596 out:
4597   closedir(dir);
4598 }
4599 
4600 /* class objects that are immediately below pci devices:
4601  * look for objects of the given classname below a sysfs (pcidev) directory
4602  */
4603 static int
hwloc_linux_class_readdir(struct hwloc_backend * backend,struct hwloc_obj * pcidev,const char * devicepath,hwloc_obj_osdev_type_t type,const char * classname,hwloc_linux_class_fillinfos_t fillinfo)4604 hwloc_linux_class_readdir(struct hwloc_backend *backend,
4605                           struct hwloc_obj *pcidev, const char *devicepath,
4606                           hwloc_obj_osdev_type_t type, const char *classname,
4607                           hwloc_linux_class_fillinfos_t fillinfo)
4608 {
4609   struct hwloc_linux_backend_data_s *data = backend->private_data;
4610   int root_fd = data->root_fd;
4611   size_t classnamelen = strlen(classname);
4612   char path[256];
4613   DIR *dir;
4614   struct dirent *dirent;
4615   hwloc_obj_t obj;
4616   int res = 0, err;
4617 
4618   if (data->deprecated_classlinks_model == -2)
4619     hwloc_linux_check_deprecated_classlinks_model(data);
4620 
4621   if (data->deprecated_classlinks_model != 1) {
4622     /* modern sysfs: <device>/<class>/<name> */
4623     struct stat st;
4624     snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
4625 
4626     /* some very host kernel (2.6.9/RHEL4) have <device>/<class> symlink without any way to find <name>.
4627      * make sure <device>/<class> is a directory to avoid this case.
4628      */
4629     err = hwloc_lstat(path, &st, root_fd);
4630     if (err < 0 || !S_ISDIR(st.st_mode))
4631       goto trydeprecated;
4632 
4633     dir = hwloc_opendir(path, root_fd);
4634     if (dir) {
4635       data->deprecated_classlinks_model = 0;
4636       while ((dirent = readdir(dir)) != NULL) {
4637         if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
4638           continue;
4639         obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name);
4640         if (fillinfo) {
4641           snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, dirent->d_name);
4642           fillinfo(backend, obj, path);
4643         }
4644         res++;
4645       }
4646       closedir(dir);
4647       return res;
4648     }
4649   }
4650 
4651 trydeprecated:
4652   if (data->deprecated_classlinks_model != 0) {
4653     /* deprecated sysfs: <device>/<class>:<name> */
4654     dir = hwloc_opendir(devicepath, root_fd);
4655     if (dir) {
4656       while ((dirent = readdir(dir)) != NULL) {
4657         if (strncmp(dirent->d_name, classname, classnamelen) || dirent->d_name[classnamelen] != ':')
4658           continue;
4659         data->deprecated_classlinks_model = 1;
4660         obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name + classnamelen+1);
4661         if (fillinfo) {
4662           snprintf(path, sizeof(path), "%s/%s", devicepath, dirent->d_name);
4663           fillinfo(backend, obj, path);
4664         }
4665         res++;
4666       }
4667       closedir(dir);
4668       return res;
4669     }
4670   }
4671 
4672   return 0;
4673 }
4674 
4675 /*
4676  * look for net objects below a pcidev in sysfs
4677  */
4678 static void
hwloc_linux_net_class_fillinfos(struct hwloc_backend * backend,struct hwloc_obj * obj,const char * osdevpath)4679 hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
4680                                 struct hwloc_obj *obj, const char *osdevpath)
4681 {
4682   struct hwloc_linux_backend_data_s *data = backend->private_data;
4683   int root_fd = data->root_fd;
4684   struct stat st;
4685   char path[256];
4686   char address[128];
4687   snprintf(path, sizeof(path), "%s/address", osdevpath);
4688   if (!hwloc_read_path_by_length(path, address, sizeof(address), root_fd)) {
4689     char *eol = strchr(address, '\n');
4690     if (eol)
4691       *eol = 0;
4692     hwloc_obj_add_info(obj, "Address", address);
4693   }
4694   snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
4695   if (!hwloc_stat(path, &st, root_fd)) {
4696     char hexid[16];
4697     snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
4698     if (!hwloc_read_path_by_length(path, hexid, sizeof(hexid), root_fd)) {
4699       char *eoid;
4700       unsigned long port;
4701       port = strtoul(hexid, &eoid, 0);
4702       if (eoid != hexid) {
4703         char portstr[16];
4704         snprintf(portstr, sizeof(portstr), "%ld", port+1);
4705         hwloc_obj_add_info(obj, "Port", portstr);
4706       }
4707     }
4708   }
4709 }
4710 
4711 static int
hwloc_linux_lookup_net_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev,const char * pcidevpath)4712 hwloc_linux_lookup_net_class(struct hwloc_backend *backend,
4713                              struct hwloc_obj *pcidev, const char *pcidevpath)
4714 {
4715   return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_NETWORK, "net", hwloc_linux_net_class_fillinfos);
4716 }
4717 
4718 /*
4719  * look for infiniband objects below a pcidev in sysfs
4720  */
4721 static void
hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend * backend,struct hwloc_obj * obj,const char * osdevpath)4722 hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
4723                                        struct hwloc_obj *obj, const char *osdevpath)
4724 {
4725   struct hwloc_linux_backend_data_s *data = backend->private_data;
4726   int root_fd = data->root_fd;
4727   char path[256];
4728   char guidvalue[20];
4729   unsigned i,j;
4730 
4731   snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
4732   if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
4733     size_t len;
4734     len = strspn(guidvalue, "0123456789abcdefx:");
4735     guidvalue[len] = '\0';
4736     hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
4737   }
4738 
4739   snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
4740   if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
4741     size_t len;
4742     len = strspn(guidvalue, "0123456789abcdefx:");
4743     guidvalue[len] = '\0';
4744     hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
4745   }
4746 
4747   for(i=1; ; i++) {
4748     char statevalue[2];
4749     char lidvalue[11];
4750     char gidvalue[40];
4751 
4752     snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
4753     if (!hwloc_read_path_by_length(path, statevalue, sizeof(statevalue), root_fd)) {
4754       char statename[32];
4755       statevalue[1] = '\0'; /* only keep the first byte/digit */
4756       snprintf(statename, sizeof(statename), "Port%uState", i);
4757       hwloc_obj_add_info(obj, statename, statevalue);
4758     } else {
4759       /* no such port */
4760       break;
4761     }
4762 
4763     snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
4764     if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
4765       char lidname[32];
4766       size_t len;
4767       len = strspn(lidvalue, "0123456789abcdefx");
4768       lidvalue[len] = '\0';
4769       snprintf(lidname, sizeof(lidname), "Port%uLID", i);
4770       hwloc_obj_add_info(obj, lidname, lidvalue);
4771     }
4772 
4773     snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
4774     if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
4775       char lidname[32];
4776       size_t len;
4777       len = strspn(lidvalue, "0123456789");
4778       lidvalue[len] = '\0';
4779       snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
4780       hwloc_obj_add_info(obj, lidname, lidvalue);
4781     }
4782 
4783     for(j=0; ; j++) {
4784       snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
4785       if (!hwloc_read_path_by_length(path, gidvalue, sizeof(gidvalue), root_fd)) {
4786         char gidname[32];
4787         size_t len;
4788         len = strspn(gidvalue, "0123456789abcdefx:");
4789         gidvalue[len] = '\0';
4790         if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
4791           /* only keep initialized GIDs */
4792           snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
4793           hwloc_obj_add_info(obj, gidname, gidvalue);
4794         }
4795       } else {
4796         /* no such port */
4797         break;
4798       }
4799     }
4800   }
4801 }
4802 
4803 static int
hwloc_linux_lookup_openfabrics_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev,const char * pcidevpath)4804 hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend,
4805                                      struct hwloc_obj *pcidev, const char *pcidevpath)
4806 {
4807   return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband", hwloc_linux_infiniband_class_fillinfos);
4808 }
4809 
4810 /* look for dma objects below a pcidev in sysfs */
4811 static int
hwloc_linux_lookup_dma_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev,const char * pcidevpath)4812 hwloc_linux_lookup_dma_class(struct hwloc_backend *backend,
4813                              struct hwloc_obj *pcidev, const char *pcidevpath)
4814 {
4815   return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
4816 }
4817 
4818 /* look for drm objects below a pcidev in sysfs */
4819 static int
hwloc_linux_lookup_drm_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev,const char * pcidevpath)4820 hwloc_linux_lookup_drm_class(struct hwloc_backend *backend,
4821                              struct hwloc_obj *pcidev, const char *pcidevpath)
4822 {
4823   return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
4824 
4825   /* we could look at the "graphics" class too, but it doesn't help for proprietary drivers either */
4826 
4827   /* GPU devices (even with a proprietary driver) seem to have a boot_vga field in their PCI device directory (since 2.6.30),
4828    * so we could create a OS device for each PCI devices with such a field.
4829    * boot_vga is actually created when class >> 8 == VGA (it contains 1 for boot vga device), so it's trivial anyway.
4830    */
4831 }
4832 
4833 /*
4834  * look for block objects below a pcidev in sysfs
4835  */
4836 
4837 static void
hwloc_linux_block_class_fillinfos(struct hwloc_backend * backend,struct hwloc_obj * obj,const char * osdevpath)4838 hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
4839                                   struct hwloc_obj *obj, const char *osdevpath)
4840 {
4841   struct hwloc_linux_backend_data_s *data = backend->private_data;
4842   int root_fd = data->root_fd;
4843   FILE *file;
4844   char path[256];
4845   char line[128];
4846   char vendor[64] = "";
4847   char model[64] = "";
4848   char serial[64] = "";
4849   char revision[64] = "";
4850   char blocktype[64] = "";
4851   unsigned major_id, minor_id;
4852   char *tmp;
4853 
4854   snprintf(path, sizeof(path), "%s/dev", osdevpath);
4855   if (hwloc_read_path_by_length(path, line, sizeof(line), root_fd) < 0)
4856     return;
4857 
4858   if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
4859     return;
4860   tmp = strchr(line, '\n');
4861   if (tmp)
4862     *tmp = '\0';
4863   hwloc_obj_add_info(obj, "LinuxDeviceID", line);
4864 
4865 #ifdef HWLOC_HAVE_LIBUDEV
4866   if (data->udev) {
4867     struct udev_device *dev;
4868     const char *prop;
4869     dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
4870     if (!dev)
4871       return;
4872     prop = udev_device_get_property_value(dev, "ID_VENDOR");
4873     if (prop) {
4874       strncpy(vendor, prop, sizeof(vendor));
4875       vendor[sizeof(vendor)-1] = '\0';
4876     }
4877     prop = udev_device_get_property_value(dev, "ID_MODEL");
4878     if (prop) {
4879       strncpy(model, prop, sizeof(model));
4880       model[sizeof(model)-1] = '\0';
4881     }
4882     prop = udev_device_get_property_value(dev, "ID_REVISION");
4883     if (prop) {
4884       strncpy(revision, prop, sizeof(revision));
4885       revision[sizeof(revision)-1] = '\0';
4886     }
4887     prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
4888     if (prop) {
4889       strncpy(serial, prop, sizeof(serial));
4890       serial[sizeof(serial)-1] = '\0';
4891     }
4892     prop = udev_device_get_property_value(dev, "ID_TYPE");
4893     if (prop) {
4894       strncpy(blocktype, prop, sizeof(blocktype));
4895       blocktype[sizeof(blocktype)-1] = '\0';
4896     }
4897 
4898     udev_device_unref(dev);
4899   } else
4900     /* fallback to reading files, works with any fsroot */
4901 #endif
4902  {
4903   snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
4904   file = hwloc_fopen(path, "r", root_fd);
4905   if (!file)
4906     return;
4907 
4908   while (NULL != fgets(line, sizeof(line), file)) {
4909     tmp = strchr(line, '\n');
4910     if (tmp)
4911       *tmp = '\0';
4912     if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
4913       strncpy(vendor, line+strlen("E:ID_VENDOR="), sizeof(vendor));
4914       vendor[sizeof(vendor)-1] = '\0';
4915     } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
4916       strncpy(model, line+strlen("E:ID_MODEL="), sizeof(model));
4917       model[sizeof(model)-1] = '\0';
4918     } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
4919       strncpy(revision, line+strlen("E:ID_REVISION="), sizeof(revision));
4920       revision[sizeof(revision)-1] = '\0';
4921     } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
4922       strncpy(serial, line+strlen("E:ID_SERIAL_SHORT="), sizeof(serial));
4923       serial[sizeof(serial)-1] = '\0';
4924     } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
4925       strncpy(blocktype, line+strlen("E:ID_TYPE="), sizeof(blocktype));
4926       blocktype[sizeof(blocktype)-1] = '\0';
4927     }
4928   }
4929   fclose(file);
4930  }
4931 
4932   /* clear fake "ATA" vendor name */
4933   if (!strcasecmp(vendor, "ATA"))
4934     *vendor = '\0';
4935   /* overwrite vendor name from model when possible */
4936   if (!*vendor) {
4937     if (!strncasecmp(model, "wd", 2))
4938       strcpy(vendor, "Western Digital");
4939     else if (!strncasecmp(model, "st", 2))
4940       strcpy(vendor, "Seagate");
4941     else if (!strncasecmp(model, "samsung", 7))
4942       strcpy(vendor, "Samsung");
4943     else if (!strncasecmp(model, "sandisk", 7))
4944       strcpy(vendor, "SanDisk");
4945     else if (!strncasecmp(model, "toshiba", 7))
4946       strcpy(vendor, "Toshiba");
4947   }
4948 
4949   if (*vendor)
4950     hwloc_obj_add_info(obj, "Vendor", vendor);
4951   if (*model)
4952     hwloc_obj_add_info(obj, "Model", model);
4953   if (*revision)
4954     hwloc_obj_add_info(obj, "Revision", revision);
4955   if (*serial)
4956     hwloc_obj_add_info(obj, "SerialNumber", serial);
4957 
4958   if (!strcmp(blocktype, "disk"))
4959     hwloc_obj_add_info(obj, "Type", "Disk");
4960   else if (!strcmp(blocktype, "tape"))
4961     hwloc_obj_add_info(obj, "Type", "Tape");
4962   else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
4963     hwloc_obj_add_info(obj, "Type", "Removable Media Device");
4964   else /* generic, usb mass storage/rbc, usb mass storage/scsi */
4965     hwloc_obj_add_info(obj, "Type", "Other");
4966 }
4967 
4968 /* block class objects are in
4969  * host%d/target%d:%d:%d/%d:%d:%d:%d/
4970  * or
4971  * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
4972  * or
4973  * ide%d/%d.%d/
4974  * below pci devices */
4975 static int
hwloc_linux_lookup_host_block_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev,char * path,size_t pathlen)4976 hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
4977                                     struct hwloc_obj *pcidev, char *path, size_t pathlen)
4978 {
4979   struct hwloc_linux_backend_data_s *data = backend->private_data;
4980   int root_fd = data->root_fd;
4981   DIR *hostdir, *portdir, *targetdir;
4982   struct dirent *hostdirent, *portdirent, *targetdirent;
4983   size_t hostdlen, portdlen, targetdlen;
4984   int dummy;
4985   int res = 0;
4986 
4987   hostdir = hwloc_opendir(path, root_fd);
4988   if (!hostdir)
4989     return 0;
4990 
4991   while ((hostdirent = readdir(hostdir)) != NULL) {
4992     if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2)
4993     {
4994       /* found host%d/port-%d:%d */
4995       path[pathlen] = '/';
4996       strcpy(&path[pathlen+1], hostdirent->d_name);
4997       pathlen += hostdlen = 1+strlen(hostdirent->d_name);
4998       portdir = hwloc_opendir(path, root_fd);
4999       if (!portdir)
5000         continue;
5001       while ((portdirent = readdir(portdir)) != NULL) {
5002         if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
5003           /* found host%d/port-%d:%d/end_device-%d:%d */
5004           path[pathlen] = '/';
5005           strcpy(&path[pathlen+1], portdirent->d_name);
5006           pathlen += portdlen = 1+strlen(portdirent->d_name);
5007           res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5008           /* restore parent path */
5009           pathlen -= portdlen;
5010           path[pathlen] = '\0';
5011         }
5012       }
5013       closedir(portdir);
5014       /* restore parent path */
5015       pathlen -= hostdlen;
5016       path[pathlen] = '\0';
5017       continue;
5018     } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
5019       /* found host%d/target%d:%d:%d */
5020       path[pathlen] = '/';
5021       strcpy(&path[pathlen+1], hostdirent->d_name);
5022       pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5023       targetdir = hwloc_opendir(path, root_fd);
5024       if (!targetdir)
5025         continue;
5026       while ((targetdirent = readdir(targetdir)) != NULL) {
5027         if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
5028           continue;
5029         /* found host%d/target%d:%d:%d/%d:%d:%d:%d */
5030         path[pathlen] = '/';
5031         strcpy(&path[pathlen+1], targetdirent->d_name);
5032         pathlen += targetdlen = 1+strlen(targetdirent->d_name);
5033         /* lookup block class for real */
5034         res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
5035         /* restore parent path */
5036         pathlen -= targetdlen;
5037         path[pathlen] = '\0';
5038       }
5039       closedir(targetdir);
5040       /* restore parent path */
5041       pathlen -= hostdlen;
5042       path[pathlen] = '\0';
5043     }
5044   }
5045   closedir(hostdir);
5046 
5047   return res;
5048 }
5049 
5050 static int
hwloc_linux_lookup_block_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev,const char * pcidevpath)5051 hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
5052                                struct hwloc_obj *pcidev, const char *pcidevpath)
5053 {
5054   struct hwloc_linux_backend_data_s *data = backend->private_data;
5055   int root_fd = data->root_fd;
5056   size_t pathlen;
5057   DIR *devicedir, *hostdir;
5058   struct dirent *devicedirent, *hostdirent;
5059   size_t devicedlen, hostdlen;
5060   char path[256];
5061   int dummy;
5062   int res = 0;
5063 
5064   strcpy(path, pcidevpath);
5065   pathlen = strlen(path);
5066 
5067   /* look for a direct block device here (such as NVMe, something without controller subdirs in the middle) */
5068   res += hwloc_linux_class_readdir(backend, pcidev, path,
5069                                    HWLOC_OBJ_OSDEV_BLOCK, "block",
5070                                    hwloc_linux_block_class_fillinfos);
5071   if (res)
5072     return res;
5073   /* otherwise try to find controller subdirectories */
5074 
5075   devicedir = hwloc_opendir(pcidevpath, root_fd);
5076   if (!devicedir)
5077     return 0;
5078 
5079   while ((devicedirent = readdir(devicedir)) != NULL) {
5080     if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
5081       /* found ide%d */
5082       path[pathlen] = '/';
5083       strcpy(&path[pathlen+1], devicedirent->d_name);
5084       pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5085       hostdir = hwloc_opendir(path, root_fd);
5086       if (!hostdir)
5087         continue;
5088       while ((hostdirent = readdir(hostdir)) != NULL) {
5089         if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
5090           /* found ide%d/%d.%d */
5091           path[pathlen] = '/';
5092           strcpy(&path[pathlen+1], hostdirent->d_name);
5093           pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5094           /* lookup block class for real */
5095           res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
5096           /* restore parent path */
5097           pathlen -= hostdlen;
5098           path[pathlen] = '\0';
5099         }
5100       }
5101       closedir(hostdir);
5102       /* restore parent path */
5103       pathlen -= devicedlen;
5104       path[pathlen] = '\0';
5105     } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
5106       /* found host%d */
5107       path[pathlen] = '/';
5108       strcpy(&path[pathlen+1], devicedirent->d_name);
5109       pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5110       res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5111       /* restore parent path */
5112       pathlen -= devicedlen;
5113       path[pathlen] = '\0';
5114     } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
5115       /* found ata%d */
5116       path[pathlen] = '/';
5117       strcpy(&path[pathlen+1], devicedirent->d_name);
5118       pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5119       hostdir = hwloc_opendir(path, root_fd);
5120       if (!hostdir)
5121         continue;
5122       while ((hostdirent = readdir(hostdir)) != NULL) {
5123         if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
5124           /* found ata%d/host%d */
5125           path[pathlen] = '/';
5126           strcpy(&path[pathlen+1], hostdirent->d_name);
5127           pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5128           /* lookup block class for real */
5129           res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5130           /* restore parent path */
5131           pathlen -= hostdlen;
5132           path[pathlen] = '\0';
5133         }
5134       }
5135       closedir(hostdir);
5136       /* restore parent path */
5137       pathlen -= devicedlen;
5138       path[pathlen] = '\0';
5139     }
5140   }
5141   closedir(devicedir);
5142 
5143   return res;
5144 }
5145 
5146 static void
hwloc_linux_mic_class_fillinfos(struct hwloc_backend * backend,struct hwloc_obj * obj,const char * osdevpath)5147 hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
5148                                 struct hwloc_obj *obj, const char *osdevpath)
5149 {
5150   struct hwloc_linux_backend_data_s *data = backend->private_data;
5151   int root_fd = data->root_fd;
5152   char path[256];
5153   char family[64];
5154   char sku[64];
5155   char sn[64];
5156   char string[20];
5157 
5158   hwloc_obj_add_info(obj, "CoProcType", "MIC");
5159 
5160   snprintf(path, sizeof(path), "%s/family", osdevpath);
5161   if (!hwloc_read_path_by_length(path, family, sizeof(family), root_fd)) {
5162     char *eol = strchr(family, '\n');
5163     if (eol)
5164       *eol = 0;
5165     hwloc_obj_add_info(obj, "MICFamily", family);
5166   }
5167 
5168   snprintf(path, sizeof(path), "%s/sku", osdevpath);
5169   if (!hwloc_read_path_by_length(path, sku, sizeof(sku), root_fd)) {
5170     char *eol = strchr(sku, '\n');
5171     if (eol)
5172       *eol = 0;
5173     hwloc_obj_add_info(obj, "MICSKU", sku);
5174   }
5175 
5176   snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
5177   if (!hwloc_read_path_by_length(path, sn, sizeof(sn), root_fd)) {
5178     char *eol;
5179     eol = strchr(sn, '\n');
5180     if (eol)
5181       *eol = 0;
5182     hwloc_obj_add_info(obj, "MICSerialNumber", sn);
5183   }
5184 
5185   snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
5186   if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5187     unsigned long count = strtoul(string, NULL, 16);
5188     snprintf(string, sizeof(string), "%lu", count);
5189     hwloc_obj_add_info(obj, "MICActiveCores", string);
5190   }
5191 
5192   snprintf(path, sizeof(path), "%s/memsize", osdevpath);
5193   if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5194     unsigned long count = strtoul(string, NULL, 16);
5195     snprintf(string, sizeof(string), "%lu", count);
5196     hwloc_obj_add_info(obj, "MICMemorySize", string);
5197   }
5198 }
5199 
5200 static int
hwloc_linux_lookup_mic_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev,const char * pcidevpath)5201 hwloc_linux_lookup_mic_class(struct hwloc_backend *backend,
5202                              struct hwloc_obj *pcidev, const char *pcidevpath)
5203 {
5204   return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_COPROC, "mic", hwloc_linux_mic_class_fillinfos);
5205 }
5206 
5207 static int
hwloc_linux_directlookup_mic_class(struct hwloc_backend * backend,struct hwloc_obj * pcidev)5208 hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
5209                                    struct hwloc_obj *pcidev)
5210 {
5211   struct hwloc_linux_backend_data_s *data = backend->private_data;
5212   int root_fd = data->root_fd;
5213   char path[256];
5214   struct stat st;
5215   hwloc_obj_t obj;
5216   unsigned idx;
5217   int res = 0;
5218 
5219   if (!data->mic_directlookup_id_max)
5220     /* already tried, nothing to do */
5221     return 0;
5222 
5223   if (data->mic_directlookup_id_max == (unsigned) -1) {
5224     /* never tried, find out the max id */
5225     DIR *dir;
5226     struct dirent *dirent;
5227 
5228     /* make sure we never do this lookup again */
5229     data->mic_directlookup_id_max = 0;
5230 
5231     /* read the entire class and find the max id of mic%u dirents */
5232     dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
5233     if (!dir) {
5234       dir = hwloc_opendir("/sys/class/mic", root_fd);
5235       if (!dir)
5236         return 0;
5237     }
5238     while ((dirent = readdir(dir)) != NULL) {
5239       if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5240         continue;
5241       if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
5242         continue;
5243       if (idx >= data->mic_directlookup_id_max)
5244         data->mic_directlookup_id_max = idx+1;
5245     }
5246     closedir(dir);
5247   }
5248 
5249   /* now iterate over the mic ids and see if one matches our pcidev */
5250   for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
5251     snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
5252              idx, pcidev->attr->pcidev.bus,  pcidev->attr->pcidev.dev,  pcidev->attr->pcidev.func);
5253     if (hwloc_stat(path, &st, root_fd) < 0)
5254       continue;
5255     snprintf(path, sizeof(path), "mic%u", idx);
5256     obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
5257     snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
5258     hwloc_linux_mic_class_fillinfos(backend, obj, path);
5259     res++;
5260   }
5261 
5262   return res;
5263 }
5264 
5265 /*
5266  * backend callback for inserting objects inside a pci device
5267  */
5268 static int
hwloc_linux_backend_notify_new_object(struct hwloc_backend * backend,struct hwloc_backend * caller __hwloc_attribute_unused,struct hwloc_obj * obj)5269 hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
5270                                       struct hwloc_obj *obj)
5271 {
5272   struct hwloc_linux_backend_data_s *data = backend->private_data;
5273   char pcidevpath[256];
5274   int res = 0;
5275 
5276   /* this callback is only used in the libpci backend for now */
5277   assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
5278 
5279   snprintf(pcidevpath, sizeof(pcidevpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
5280            obj->attr->pcidev.domain, obj->attr->pcidev.bus,
5281            obj->attr->pcidev.dev, obj->attr->pcidev.func);
5282 
5283   res += hwloc_linux_lookup_net_class(backend, obj, pcidevpath);
5284   res += hwloc_linux_lookup_openfabrics_class(backend, obj, pcidevpath);
5285   res += hwloc_linux_lookup_dma_class(backend, obj, pcidevpath);
5286   res += hwloc_linux_lookup_drm_class(backend, obj, pcidevpath);
5287   res += hwloc_linux_lookup_block_class(backend, obj, pcidevpath);
5288 
5289   if (data->mic_need_directlookup == -1) {
5290     struct stat st;
5291     if (hwloc_stat("/sys/class/mic/mic0", &st, data->root_fd) == 0
5292         && hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, data->root_fd) == -1)
5293       /* hwloc_linux_lookup_mic_class will fail because pcidev sysfs directories
5294        * do not have mic/mic%u symlinks to mic devices (old mic driver).
5295        * if so, try from the mic class.
5296        */
5297       data->mic_need_directlookup = 1;
5298     else
5299       data->mic_need_directlookup = 0;
5300   }
5301   if (data->mic_need_directlookup)
5302     res += hwloc_linux_directlookup_mic_class(backend, obj);
5303   else
5304     res += hwloc_linux_lookup_mic_class(backend, obj, pcidevpath);
5305 
5306   return res;
5307 }
5308 
5309 /*
5310  * backend callback for retrieving the location of a pci device
5311  */
5312 static int
hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend * backend,struct hwloc_backend * caller __hwloc_attribute_unused,struct hwloc_obj * obj,hwloc_bitmap_t cpuset)5313 hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
5314                                    struct hwloc_backend *caller __hwloc_attribute_unused,
5315                                    struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
5316 {
5317   struct hwloc_linux_backend_data_s *data = backend->private_data;
5318   char path[256];
5319 
5320   /* this callback is only used in the libpci backend for now */
5321   assert(obj->type == HWLOC_OBJ_PCI_DEVICE
5322          || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
5323 
5324   snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
5325            obj->attr->pcidev.domain, obj->attr->pcidev.bus,
5326            obj->attr->pcidev.dev, obj->attr->pcidev.func);
5327   if (!hwloc__read_path_as_cpumask(path, cpuset, data->root_fd)
5328       && !hwloc_bitmap_iszero(cpuset))
5329     return 0;
5330   return -1;
5331 }
5332 
5333 
5334 
5335 /*******************************
5336  ******* Linux component *******
5337  *******************************/
5338 
5339 static void
hwloc_linux_backend_disable(struct hwloc_backend * backend)5340 hwloc_linux_backend_disable(struct hwloc_backend *backend)
5341 {
5342   struct hwloc_linux_backend_data_s *data = backend->private_data;
5343 #ifdef HAVE_OPENAT
5344   if (data->root_path)
5345     free(data->root_path);
5346   close(data->root_fd);
5347 #endif
5348 #ifdef HWLOC_HAVE_LIBUDEV
5349   if (data->udev)
5350     udev_unref(data->udev);
5351 #endif
5352   free(data);
5353 }
5354 
5355 static struct hwloc_backend *
hwloc_linux_component_instantiate(struct hwloc_disc_component * component,const void * _data1,const void * _data2 __hwloc_attribute_unused,const void * _data3 __hwloc_attribute_unused)5356 hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
5357                                   const void *_data1,
5358                                   const void *_data2 __hwloc_attribute_unused,
5359                                   const void *_data3 __hwloc_attribute_unused)
5360 {
5361   struct hwloc_backend *backend;
5362   struct hwloc_linux_backend_data_s *data;
5363   const char * fsroot_path = _data1;
5364   int flags, root = -1;
5365 
5366   backend = hwloc_backend_alloc(component);
5367   if (!backend)
5368     goto out;
5369 
5370   data = malloc(sizeof(*data));
5371   if (!data) {
5372     errno = ENOMEM;
5373     goto out_with_backend;
5374   }
5375 
5376   backend->private_data = data;
5377   backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
5378   backend->discover = hwloc_look_linuxfs;
5379   backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
5380   backend->notify_new_object = hwloc_linux_backend_notify_new_object;
5381   backend->disable = hwloc_linux_backend_disable;
5382 
5383   /* default values */
5384   data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
5385   data->is_knl = 0;
5386   data->is_amd_with_CU = 0;
5387   data->is_real_fsroot = 1;
5388   data->root_path = NULL;
5389   if (!fsroot_path)
5390     fsroot_path = "/";
5391 
5392 #ifdef HAVE_OPENAT
5393   root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
5394   if (root < 0)
5395     goto out_with_data;
5396 
5397   if (strcmp(fsroot_path, "/")) {
5398     backend->is_thissystem = 0;
5399     data->is_real_fsroot = 0;
5400     data->root_path = strdup(fsroot_path);
5401   }
5402 
5403   /* Since this fd stays open after hwloc returns, mark it as
5404      close-on-exec so that children don't inherit it.  Stevens says
5405      that we should GETFD before we SETFD, so we do. */
5406   flags = fcntl(root, F_GETFD, 0);
5407   if (-1 == flags ||
5408       -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
5409       close(root);
5410       root = -1;
5411       goto out_with_data;
5412   }
5413 #else
5414   if (strcmp(fsroot_path, "/")) {
5415     errno = ENOSYS;
5416     goto out_with_data;
5417   }
5418 #endif
5419   data->root_fd = root;
5420 
5421 #ifdef HWLOC_HAVE_LIBUDEV
5422   data->udev = NULL;
5423   if (data->is_real_fsroot) {
5424     data->udev = udev_new();
5425   }
5426 #endif
5427 
5428   data->dumped_hwdata_dirname = getenv("HWLOC_DUMPED_HWDATA_DIR");
5429   if (!data->dumped_hwdata_dirname) {
5430     if (_data1)
5431       data->dumped_hwdata_dirname = "/var/run/hwloc";
5432     else
5433       data->dumped_hwdata_dirname = RUNSTATEDIR "/hwloc";
5434   }
5435 
5436   data->deprecated_classlinks_model = -2; /* never tried */
5437   data->mic_need_directlookup = -1; /* not initialized */
5438   data->mic_directlookup_id_max = -1; /* not initialized */
5439 
5440   return backend;
5441 
5442  out_with_data:
5443 #ifdef HAVE_OPENAT
5444   if (data->root_path)
5445     free(data->root_path);
5446 #endif
5447   free(data);
5448  out_with_backend:
5449   free(backend);
5450  out:
5451   return NULL;
5452 }
5453 
/* Discovery component descriptor of the Linux backend.
 * NOTE(review): this initializer is positional and struct hwloc_disc_component
 * is declared elsewhere; the field meanings below are assumptions to confirm
 * against that declaration.
 */
static struct hwloc_disc_component hwloc_linux_disc_component = {
  HWLOC_DISC_COMPONENT_TYPE_CPU, /* presumably the type of this component */
  "linux", /* presumably the name of this component */
  HWLOC_DISC_COMPONENT_TYPE_GLOBAL, /* presumably the component types excluded by this one */
  hwloc_linux_component_instantiate, /* function creating the backend */
  50, /* presumably the priority */
  NULL /* presumably a link to another component, unset here */
};
5462 
/* Generic component wrapping the Linux discovery component above.
 * NOTE(review): positional initializer of a structure declared elsewhere;
 * the field meanings below are assumptions to confirm against that declaration.
 */
const struct hwloc_component hwloc_linux_component = {
  HWLOC_COMPONENT_ABI, /* presumably the component ABI version */
  NULL, NULL, /* presumably optional callbacks, unused here */
  HWLOC_COMPONENT_TYPE_DISC, /* this is a discovery component */
  0, /* presumably flags */
  &hwloc_linux_disc_component /* the discovery component itself */
};
5470 
5471 
5472 
5473 
5474 #ifdef HWLOC_HAVE_LINUXPCI
5475 
5476 /***********************************
5477  ******* Linux PCI component *******
5478  ***********************************/
5479 
/* offset of the revision byte in the PCI config space cache */
#define HWLOC_PCI_REVISION_ID 0x08
/* capability ID given to hwloc_pci_find_cap() to locate the PCI express block (used for the link speed) */
#define HWLOC_PCI_CAP_ID_EXP 0x10
/* default class_id when the class of a device is unknown */
#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
5483 
5484 static int
hwloc_look_linuxfs_pci(struct hwloc_backend * backend)5485 hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
5486 {
5487   struct hwloc_topology *topology = backend->topology;
5488   struct hwloc_backend *tmpbackend;
5489   hwloc_obj_t first_obj = NULL, last_obj = NULL;
5490   int root_fd = -1;
5491   DIR *dir;
5492   struct dirent *dirent;
5493   int res = 0;
5494 
5495   if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
5496     return 0;
5497 
5498   if (hwloc_get_next_pcidev(topology, NULL)) {
5499     hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
5500     return 0;
5501   }
5502 
5503   /* hackily find the linux backend to steal its fsroot */
5504   tmpbackend = topology->backends;
5505   while (tmpbackend) {
5506     if (tmpbackend->component == &hwloc_linux_disc_component) {
5507       root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
5508       hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
5509       break;    }
5510     tmpbackend = tmpbackend->next;
5511   }
5512   /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
5513   if (root_fd >= 0)
5514     root_fd = dup(root_fd);
5515   else
5516     root_fd = open("/", O_RDONLY | O_DIRECTORY);
5517 
5518   dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
5519   if (!dir)
5520     goto out_with_rootfd;
5521 
5522   while ((dirent = readdir(dir)) != NULL) {
5523     unsigned domain, bus, dev, func;
5524     hwloc_obj_t obj;
5525     struct hwloc_pcidev_attr_s *attr;
5526     unsigned os_index;
5527     char path[64];
5528     char value[16];
5529     size_t ret;
5530     int fd;
5531 
5532     if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
5533       continue;
5534 
5535     os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
5536     obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
5537     if (!obj)
5538       break;
5539     attr = &obj->attr->pcidev;
5540 
5541     attr->domain = domain;
5542     attr->bus = bus;
5543     attr->dev = dev;
5544     attr->func = func;
5545 
5546     /* default (unknown) values */
5547     attr->vendor_id = 0;
5548     attr->device_id = 0;
5549     attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
5550     attr->revision = 0;
5551     attr->subvendor_id = 0;
5552     attr->subdevice_id = 0;
5553     attr->linkspeed = 0;
5554 
5555     snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
5556     if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5557       attr->vendor_id = strtoul(value, NULL, 16);
5558 
5559     snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
5560     if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5561       attr->device_id = strtoul(value, NULL, 16);
5562 
5563     snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
5564     if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5565       attr->class_id = strtoul(value, NULL, 16) >> 8;
5566 
5567     snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
5568     if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5569       attr->subvendor_id = strtoul(value, NULL, 16);
5570 
5571     snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
5572     if (!hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5573       attr->subdevice_id = strtoul(value, NULL, 16);
5574 
5575     snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
5576     /* don't use hwloc_read_path_by_length() because we don't want the ending \0 */
5577     fd = hwloc_open(path, root_fd);
5578     if (fd >= 0) {
5579 #define CONFIG_SPACE_CACHESIZE 256
5580       unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
5581       unsigned offset;
5582 
5583       /* initialize the config space in case we fail to read it (missing permissions, etc). */
5584       memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
5585       ret = read(fd, config_space_cache, CONFIG_SPACE_CACHESIZE);
5586       (void) ret; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
5587       close(fd);
5588 
5589       /* is this a bridge? */
5590       if (hwloc_pci_prepare_bridge(obj, config_space_cache) < 0)
5591         continue;
5592 
5593       /* get the revision */
5594       attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
5595 
5596       /* try to get the link speed */
5597       offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
5598       if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE)
5599         hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
5600     }
5601 
5602     if (first_obj)
5603       last_obj->next_sibling = obj;
5604     else
5605       first_obj = obj;
5606     last_obj = obj;
5607   }
5608 
5609   closedir(dir);
5610 
5611   dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
5612   if (dir) {
5613     while ((dirent = readdir(dir)) != NULL) {
5614       char path[64];
5615       char buf[64];
5616       unsigned domain, bus, dev;
5617       if (dirent->d_name[0] == '.')
5618         continue;
5619       snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
5620       if (!hwloc_read_path_by_length(path, buf, sizeof(buf), root_fd)
5621           && sscanf(buf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
5622         hwloc_obj_t obj = first_obj;
5623         while (obj) {
5624           if (obj->attr->pcidev.domain == domain
5625               && obj->attr->pcidev.bus == bus
5626               && obj->attr->pcidev.dev == dev) {
5627             hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
5628           }
5629           obj = obj->next_sibling;
5630         }
5631       }
5632     }
5633     closedir(dir);
5634   }
5635 
5636   res = hwloc_insert_pci_device_list(backend, first_obj);
5637 
5638  out_with_rootfd:
5639   close(root_fd);
5640   return res;
5641 }
5642 
5643 static struct hwloc_backend *
hwloc_linuxpci_component_instantiate(struct hwloc_disc_component * component,const void * _data1 __hwloc_attribute_unused,const void * _data2 __hwloc_attribute_unused,const void * _data3 __hwloc_attribute_unused)5644 hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
5645                                      const void *_data1 __hwloc_attribute_unused,
5646                                      const void *_data2 __hwloc_attribute_unused,
5647                                      const void *_data3 __hwloc_attribute_unused)
5648 {
5649   struct hwloc_backend *backend;
5650 
5651   /* thissystem may not be fully initialized yet, we'll check flags in discover() */
5652 
5653   backend = hwloc_backend_alloc(component);
5654   if (!backend)
5655     return NULL;
5656   backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
5657   backend->discover = hwloc_look_linuxfs_pci;
5658   return backend;
5659 }
5660 
/* Discovery component descriptor of the linuxpci backend.
 * NOTE(review): this initializer is positional and struct hwloc_disc_component
 * is declared elsewhere; the field meanings below are assumptions to confirm
 * against that declaration.
 */
static struct hwloc_disc_component hwloc_linuxpci_disc_component = {
  HWLOC_DISC_COMPONENT_TYPE_MISC, /* presumably the type of this component */
  "linuxpci", /* presumably the name of this component */
  HWLOC_DISC_COMPONENT_TYPE_GLOBAL, /* presumably the component types excluded by this one */
  hwloc_linuxpci_component_instantiate, /* function creating the backend */
  19, /* after pci */
  NULL /* presumably a link to another component, unset here */
};
5669 
/* Generic component wrapping the linuxpci discovery component above.
 * NOTE(review): positional initializer of a structure declared elsewhere;
 * the field meanings below are assumptions to confirm against that declaration.
 */
const struct hwloc_component hwloc_linuxpci_component = {
  HWLOC_COMPONENT_ABI, /* presumably the component ABI version */
  NULL, NULL, /* presumably optional callbacks, unused here */
  HWLOC_COMPONENT_TYPE_DISC, /* this is a discovery component */
  0, /* presumably flags */
  &hwloc_linuxpci_disc_component /* the discovery component itself */
};
5677 
5678 #endif /* HWLOC_HAVE_LINUXPCI */
5679