1 /*
2  * Copyright (c) 2011-2017 Cisco Systems, Inc.  All rights reserved
3  * Copyright (c) 2012      Los Alamos National Security, LLC. All rights reserved
4  * Copyright (c) 2015-2017 Research Organization for Information Science
5  *                         and Technology (RIST). All rights reserved.
6  * Copyright (c) 2015-2017 Intel, Inc.  All rights reserved.
7  *
8  * $COPYRIGHT$
9  *
10  * Additional copyrights may follow
11  *
12  * $HEADER$
13  */
14 #include "orte_config.h"
15 #include "orte/constants.h"
16 #include "orte/types.h"
17 
18 #include <unistd.h>
19 #include <string.h>
20 #include <ctype.h>
21 
22 #include "opal/class/opal_list.h"
23 #include "opal/mca/hwloc/hwloc-internal.h"
24 #include "opal/util/argv.h"
25 
26 #include "orte/util/show_help.h"
27 #include "orte/runtime/orte_globals.h"
28 
29 #include "ras_sim.h"
30 
31 
32 /*
33  * Local functions
34  */
35 static int allocate(orte_job_t *jdata, opal_list_t *nodes);
36 static int finalize(void);
37 
38 
39 /*
40  * Global variable
41  */
42 orte_ras_base_module_t orte_ras_sim_module = {
43     NULL,
44     allocate,
45     NULL,
46     finalize
47 };
48 
allocate(orte_job_t * jdata,opal_list_t * nodes)49 static int allocate(orte_job_t *jdata, opal_list_t *nodes)
50 {
51     int i, n, val, dig, num_nodes;
52     orte_node_t *node;
53     orte_topology_t *t;
54     hwloc_topology_t topo;
55     hwloc_obj_t obj;
56     unsigned j, k;
57     struct hwloc_topology_support *support;
58     char **files=NULL;
59     char **topos = NULL;
60     bool use_local_topology = false;
61     char **node_cnt=NULL;
62     char **slot_cnt=NULL;
63     char **max_slot_cnt=NULL;
64     char *tmp;
65     char prefix[6];
66 
67     node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
68     if (NULL != mca_ras_simulator_component.slots) {
69         slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
70         /* backfile the slot_cnt so every topology has a cnt */
71         tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
72         for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
73             opal_argv_append_nosize(&slot_cnt, tmp);
74         }
75     }
76     if (NULL != mca_ras_simulator_component.slots_max) {
77         max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
78         /* backfill the max_slot_cnt as reqd */
79         tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
80         for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
81             opal_argv_append_nosize(&max_slot_cnt, tmp);
82         }
83     }
84 
85     if (NULL != mca_ras_simulator_component.topofiles) {
86         files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
87         if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
88             orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
89             goto error_silent;
90         }
91     } else if (NULL != mca_ras_simulator_component.topologies) {
92         topos = opal_argv_split(mca_ras_simulator_component.topologies, ',');
93         if (opal_argv_count(topos) != opal_argv_count(node_cnt)) {
94             orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
95             goto error_silent;
96         }
97     } else {
98         /* use our topology */
99         use_local_topology = true;
100     }
101 
102     /* setup the prefix to the node names */
103     snprintf(prefix, 6, "nodeA");
104 
105     /* process the request */
106     for (n=0; NULL != node_cnt[n]; n++) {
107         num_nodes = strtol(node_cnt[n], NULL, 10);
108 
109         /* get number of digits */
110         val = num_nodes;
111         for (dig=0; 0 != val; dig++) {
112             val /= 10;
113         }
114 
115         /* set the prefix for this group of nodes */
116         prefix[4] += n;
117 
118         /* check for topology */
119         if (use_local_topology) {
120             /* use our topology */
121             t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
122         } else if (NULL != files) {
123             if (0 != hwloc_topology_init(&topo)) {
124                 orte_show_help("help-ras-simulator.txt",
125                                "hwloc API fail", true,
126                                __FILE__, __LINE__, "hwloc_topology_init");
127                 goto error_silent;
128             }
129             if (0 != hwloc_topology_set_xml(topo, files[n])) {
130                 orte_show_help("help-ras-simulator.txt",
131                                "hwloc failed to load xml", true, files[n]);
132                 hwloc_topology_destroy(topo);
133                 goto error_silent;
134             }
135             /* since we are loading this from an external source, we have to
136              * explicitly set a flag so hwloc sets things up correctly
137              */
138             if (0 != opal_hwloc_base_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM, false)) {
139                 orte_show_help("help-ras-simulator.txt",
140                                "hwloc API fail", true,
141                                __FILE__, __LINE__, "hwloc_topology_set_flags");
142                 hwloc_topology_destroy(topo);
143                 goto error_silent;
144             }
145             if (0 != hwloc_topology_load(topo)) {
146                 orte_show_help("help-ras-simulator.txt",
147                                "hwloc API fail", true,
148                                __FILE__, __LINE__, "hwloc_topology_load");
149                 hwloc_topology_destroy(topo);
150                 goto error_silent;
151             }
152             /* remove the hostname from the topology. Unfortunately, hwloc
153              * decided to add the source hostname to the "topology", thus
154              * rendering it unusable as a pure topological description. So
155              * we remove that information here.
156              */
157             obj = hwloc_get_root_obj(topo);
158             for (k=0; k < obj->infos_count; k++) {
159                 if (NULL == obj->infos[k].name ||
160                     NULL == obj->infos[k].value) {
161                     continue;
162                 }
163                 if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
164                     free(obj->infos[k].name);
165                     free(obj->infos[k].value);
166                     /* left justify the array */
167                     for (j=k; j < obj->infos_count-1; j++) {
168                         obj->infos[j] = obj->infos[j+1];
169                     }
170                     obj->infos[obj->infos_count-1].name = NULL;
171                     obj->infos[obj->infos_count-1].value = NULL;
172                     obj->infos_count--;
173                     break;
174                 }
175             }
176             /* unfortunately, hwloc does not include support info in its
177              * xml output :-(( To aid in debugging, we set it here
178              */
179             support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
180             support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
181             support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
182             /* add it to our array */
183             t = OBJ_NEW(orte_topology_t);
184             t->topo = topo;
185             t->sig = opal_hwloc_base_get_topo_signature(topo);
186             opal_pointer_array_add(orte_node_topologies, t);
187         } else {
188             if (0 != hwloc_topology_init(&topo)) {
189                 orte_show_help("help-ras-simulator.txt",
190                                "hwloc API fail", true,
191                                __FILE__, __LINE__, "hwloc_topology_init");
192                 goto error_silent;
193             }
194             if (0 != hwloc_topology_set_synthetic(topo, topos[n])) {
195                 orte_show_help("help-ras-simulator.txt",
196                                "hwloc API fail", true,
197                                __FILE__, __LINE__, "hwloc_topology_set_synthetic");
198                 hwloc_topology_destroy(topo);
199                 goto error_silent;
200             }
201             if (0 != hwloc_topology_load(topo)) {
202                 orte_show_help("help-ras-simulator.txt",
203                                "hwloc API fail", true,
204                                __FILE__, __LINE__, "hwloc_topology_load");
205                 hwloc_topology_destroy(topo);
206                 goto error_silent;
207             }
208             /* remove the hostname from the topology. Unfortunately, hwloc
209              * decided to add the source hostname to the "topology", thus
210              * rendering it unusable as a pure topological description. So
211              * we remove that information here.
212              */
213             obj = hwloc_get_root_obj(topo);
214             for (k=0; k < obj->infos_count; k++) {
215                 if (NULL == obj->infos[k].name ||
216                     NULL == obj->infos[k].value) {
217                     continue;
218                 }
219                 if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
220                     free(obj->infos[k].name);
221                     free(obj->infos[k].value);
222                     /* left justify the array */
223                     for (j=k; j < obj->infos_count-1; j++) {
224                         obj->infos[j] = obj->infos[j+1];
225                     }
226                     obj->infos[obj->infos_count-1].name = NULL;
227                     obj->infos[obj->infos_count-1].value = NULL;
228                     obj->infos_count--;
229                     break;
230                 }
231             }
232             /* unfortunately, hwloc does not include support info in its
233              * xml output :-(( To aid in debugging, we set it here
234              */
235             support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
236             support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
237             support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
238             /* add it to our array */
239             t = OBJ_NEW(orte_topology_t);
240             t->topo = topo;
241             t->sig = opal_hwloc_base_get_topo_signature(topo);
242             opal_pointer_array_add(orte_node_topologies, t);
243         }
244 
245         for (i=0; i < num_nodes; i++) {
246             node = OBJ_NEW(orte_node_t);
247             asprintf(&node->name, "%s%0*d", prefix, dig, i);
248             node->state = ORTE_NODE_STATE_UP;
249             node->slots_inuse = 0;
250             if (NULL == max_slot_cnt || NULL == max_slot_cnt[n]) {
251                 node->slots_max = 0;
252             } else {
253                 obj = hwloc_get_root_obj(t->topo);
254                 node->slots_max = opal_hwloc_base_get_npus(t->topo, obj);
255             }
256             if (NULL == slot_cnt || NULL == slot_cnt[n]) {
257                 node->slots = 0;
258             } else {
259                 obj = hwloc_get_root_obj(t->topo);
260                 node->slots = opal_hwloc_base_get_npus(t->topo, obj);
261             }
262             OBJ_RETAIN(t);
263             node->topology = t;
264             opal_output_verbose(1, orte_ras_base_framework.framework_output,
265                                 "Created Node <%10s> [%3d : %3d]",
266                                 node->name, node->slots, node->slots_max);
267             opal_list_append(nodes, &node->super);
268         }
269     }
270 
271     /* record the number of allocated nodes */
272     orte_num_allocated_nodes = opal_list_get_size(nodes);
273 
274     if (NULL != max_slot_cnt) {
275         opal_argv_free(max_slot_cnt);
276     }
277     if (NULL != slot_cnt) {
278         opal_argv_free(slot_cnt);
279     }
280     if (NULL != node_cnt) {
281         opal_argv_free(node_cnt);
282     }
283     if (NULL != topos) {
284         opal_argv_free(topos);
285     }
286     return ORTE_SUCCESS;
287 
288 error_silent:
289     if (NULL != max_slot_cnt) {
290         opal_argv_free(max_slot_cnt);
291     }
292     if (NULL != slot_cnt) {
293         opal_argv_free(slot_cnt);
294     }
295     if (NULL != node_cnt) {
296         opal_argv_free(node_cnt);
297     }
298     return ORTE_ERR_SILENT;
299 
300 }
301 
302 /*
303  * There's really nothing to do here
304  */
finalize(void)305 static int finalize(void)
306 {
307     return ORTE_SUCCESS;
308 }
309