1 /*
2 * Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved
3 * Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved
4 * Copyright (c) 2015-2017 Research Organization for Information Science
5 * and Technology (RIST). All rights reserved.
6 * Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
7 *
8 * $COPYRIGHT$
9 *
10 * Additional copyrights may follow
11 *
12 * $HEADER$
13 */
14 #include "orte_config.h"
15 #include "orte/constants.h"
16 #include "orte/types.h"
17
18 #include <unistd.h>
19 #include <string.h>
20 #include <ctype.h>
21
22 #include "opal/class/opal_list.h"
23 #include "opal/mca/hwloc/hwloc-internal.h"
24 #include "opal/util/argv.h"
25
26 #include "orte/util/show_help.h"
27 #include "orte/runtime/orte_globals.h"
28
29 #include "ras_sim.h"
30
31
32 /*
33 * Local functions
34 */
35 static int allocate(orte_job_t *jdata, opal_list_t *nodes);
36 static int finalize(void);
37
38
39 /*
40 * Global variable
41 */
42 orte_ras_base_module_t orte_ras_sim_module = {
43 NULL,
44 allocate,
45 NULL,
46 finalize
47 };
48
allocate(orte_job_t * jdata,opal_list_t * nodes)49 static int allocate(orte_job_t *jdata, opal_list_t *nodes)
50 {
51 int i, n, val, dig, num_nodes;
52 orte_node_t *node;
53 orte_topology_t *t;
54 hwloc_topology_t topo;
55 hwloc_obj_t obj;
56 unsigned j, k;
57 struct hwloc_topology_support *support;
58 char **files=NULL;
59 char **topos = NULL;
60 bool use_local_topology = false;
61 char **node_cnt=NULL;
62 char **slot_cnt=NULL;
63 char **max_slot_cnt=NULL;
64 char *tmp;
65 char prefix[6];
66
67 node_cnt = opal_argv_split(mca_ras_simulator_component.num_nodes, ',');
68 if (NULL != mca_ras_simulator_component.slots) {
69 slot_cnt = opal_argv_split(mca_ras_simulator_component.slots, ',');
70 /* backfile the slot_cnt so every topology has a cnt */
71 tmp = slot_cnt[opal_argv_count(slot_cnt)-1];
72 for (n=opal_argv_count(slot_cnt); n < opal_argv_count(node_cnt); n++) {
73 opal_argv_append_nosize(&slot_cnt, tmp);
74 }
75 }
76 if (NULL != mca_ras_simulator_component.slots_max) {
77 max_slot_cnt = opal_argv_split(mca_ras_simulator_component.slots_max, ',');
78 /* backfill the max_slot_cnt as reqd */
79 tmp = max_slot_cnt[opal_argv_count(slot_cnt)-1];
80 for (n=opal_argv_count(max_slot_cnt); n < opal_argv_count(max_slot_cnt); n++) {
81 opal_argv_append_nosize(&max_slot_cnt, tmp);
82 }
83 }
84
85 if (NULL != mca_ras_simulator_component.topofiles) {
86 files = opal_argv_split(mca_ras_simulator_component.topofiles, ',');
87 if (opal_argv_count(files) != opal_argv_count(node_cnt)) {
88 orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
89 goto error_silent;
90 }
91 } else if (NULL != mca_ras_simulator_component.topologies) {
92 topos = opal_argv_split(mca_ras_simulator_component.topologies, ',');
93 if (opal_argv_count(topos) != opal_argv_count(node_cnt)) {
94 orte_show_help("help-ras-base.txt", "ras-sim:mismatch", true);
95 goto error_silent;
96 }
97 } else {
98 /* use our topology */
99 use_local_topology = true;
100 }
101
102 /* setup the prefix to the node names */
103 snprintf(prefix, 6, "nodeA");
104
105 /* process the request */
106 for (n=0; NULL != node_cnt[n]; n++) {
107 num_nodes = strtol(node_cnt[n], NULL, 10);
108
109 /* get number of digits */
110 val = num_nodes;
111 for (dig=0; 0 != val; dig++) {
112 val /= 10;
113 }
114
115 /* set the prefix for this group of nodes */
116 prefix[4] += n;
117
118 /* check for topology */
119 if (use_local_topology) {
120 /* use our topology */
121 t = (orte_topology_t*)opal_pointer_array_get_item(orte_node_topologies, 0);
122 } else if (NULL != files) {
123 if (0 != hwloc_topology_init(&topo)) {
124 orte_show_help("help-ras-simulator.txt",
125 "hwloc API fail", true,
126 __FILE__, __LINE__, "hwloc_topology_init");
127 goto error_silent;
128 }
129 if (0 != hwloc_topology_set_xml(topo, files[n])) {
130 orte_show_help("help-ras-simulator.txt",
131 "hwloc failed to load xml", true, files[n]);
132 hwloc_topology_destroy(topo);
133 goto error_silent;
134 }
135 /* since we are loading this from an external source, we have to
136 * explicitly set a flag so hwloc sets things up correctly
137 */
138 if (0 != opal_hwloc_base_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM, false)) {
139 orte_show_help("help-ras-simulator.txt",
140 "hwloc API fail", true,
141 __FILE__, __LINE__, "hwloc_topology_set_flags");
142 hwloc_topology_destroy(topo);
143 goto error_silent;
144 }
145 if (0 != hwloc_topology_load(topo)) {
146 orte_show_help("help-ras-simulator.txt",
147 "hwloc API fail", true,
148 __FILE__, __LINE__, "hwloc_topology_load");
149 hwloc_topology_destroy(topo);
150 goto error_silent;
151 }
152 /* remove the hostname from the topology. Unfortunately, hwloc
153 * decided to add the source hostname to the "topology", thus
154 * rendering it unusable as a pure topological description. So
155 * we remove that information here.
156 */
157 obj = hwloc_get_root_obj(topo);
158 for (k=0; k < obj->infos_count; k++) {
159 if (NULL == obj->infos[k].name ||
160 NULL == obj->infos[k].value) {
161 continue;
162 }
163 if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
164 free(obj->infos[k].name);
165 free(obj->infos[k].value);
166 /* left justify the array */
167 for (j=k; j < obj->infos_count-1; j++) {
168 obj->infos[j] = obj->infos[j+1];
169 }
170 obj->infos[obj->infos_count-1].name = NULL;
171 obj->infos[obj->infos_count-1].value = NULL;
172 obj->infos_count--;
173 break;
174 }
175 }
176 /* unfortunately, hwloc does not include support info in its
177 * xml output :-(( To aid in debugging, we set it here
178 */
179 support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
180 support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
181 support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
182 /* add it to our array */
183 t = OBJ_NEW(orte_topology_t);
184 t->topo = topo;
185 t->sig = opal_hwloc_base_get_topo_signature(topo);
186 opal_pointer_array_add(orte_node_topologies, t);
187 } else {
188 if (0 != hwloc_topology_init(&topo)) {
189 orte_show_help("help-ras-simulator.txt",
190 "hwloc API fail", true,
191 __FILE__, __LINE__, "hwloc_topology_init");
192 goto error_silent;
193 }
194 if (0 != hwloc_topology_set_synthetic(topo, topos[n])) {
195 orte_show_help("help-ras-simulator.txt",
196 "hwloc API fail", true,
197 __FILE__, __LINE__, "hwloc_topology_set_synthetic");
198 hwloc_topology_destroy(topo);
199 goto error_silent;
200 }
201 if (0 != hwloc_topology_load(topo)) {
202 orte_show_help("help-ras-simulator.txt",
203 "hwloc API fail", true,
204 __FILE__, __LINE__, "hwloc_topology_load");
205 hwloc_topology_destroy(topo);
206 goto error_silent;
207 }
208 /* remove the hostname from the topology. Unfortunately, hwloc
209 * decided to add the source hostname to the "topology", thus
210 * rendering it unusable as a pure topological description. So
211 * we remove that information here.
212 */
213 obj = hwloc_get_root_obj(topo);
214 for (k=0; k < obj->infos_count; k++) {
215 if (NULL == obj->infos[k].name ||
216 NULL == obj->infos[k].value) {
217 continue;
218 }
219 if (0 == strncmp(obj->infos[k].name, "HostName", strlen("HostName"))) {
220 free(obj->infos[k].name);
221 free(obj->infos[k].value);
222 /* left justify the array */
223 for (j=k; j < obj->infos_count-1; j++) {
224 obj->infos[j] = obj->infos[j+1];
225 }
226 obj->infos[obj->infos_count-1].name = NULL;
227 obj->infos[obj->infos_count-1].value = NULL;
228 obj->infos_count--;
229 break;
230 }
231 }
232 /* unfortunately, hwloc does not include support info in its
233 * xml output :-(( To aid in debugging, we set it here
234 */
235 support = (struct hwloc_topology_support*)hwloc_topology_get_support(topo);
236 support->cpubind->set_thisproc_cpubind = mca_ras_simulator_component.have_cpubind;
237 support->membind->set_thisproc_membind = mca_ras_simulator_component.have_membind;
238 /* add it to our array */
239 t = OBJ_NEW(orte_topology_t);
240 t->topo = topo;
241 t->sig = opal_hwloc_base_get_topo_signature(topo);
242 opal_pointer_array_add(orte_node_topologies, t);
243 }
244
245 for (i=0; i < num_nodes; i++) {
246 node = OBJ_NEW(orte_node_t);
247 asprintf(&node->name, "%s%0*d", prefix, dig, i);
248 node->state = ORTE_NODE_STATE_UP;
249 node->slots_inuse = 0;
250 if (NULL == max_slot_cnt || NULL == max_slot_cnt[n]) {
251 node->slots_max = 0;
252 } else {
253 obj = hwloc_get_root_obj(t->topo);
254 node->slots_max = opal_hwloc_base_get_npus(t->topo, obj);
255 }
256 if (NULL == slot_cnt || NULL == slot_cnt[n]) {
257 node->slots = 0;
258 } else {
259 obj = hwloc_get_root_obj(t->topo);
260 node->slots = opal_hwloc_base_get_npus(t->topo, obj);
261 }
262 OBJ_RETAIN(t);
263 node->topology = t;
264 opal_output_verbose(1, orte_ras_base_framework.framework_output,
265 "Created Node <%10s> [%3d : %3d]",
266 node->name, node->slots, node->slots_max);
267 opal_list_append(nodes, &node->super);
268 }
269 }
270
271 /* record the number of allocated nodes */
272 orte_num_allocated_nodes = opal_list_get_size(nodes);
273
274 if (NULL != max_slot_cnt) {
275 opal_argv_free(max_slot_cnt);
276 }
277 if (NULL != slot_cnt) {
278 opal_argv_free(slot_cnt);
279 }
280 if (NULL != node_cnt) {
281 opal_argv_free(node_cnt);
282 }
283 if (NULL != topos) {
284 opal_argv_free(topos);
285 }
286 return ORTE_SUCCESS;
287
288 error_silent:
289 if (NULL != max_slot_cnt) {
290 opal_argv_free(max_slot_cnt);
291 }
292 if (NULL != slot_cnt) {
293 opal_argv_free(slot_cnt);
294 }
295 if (NULL != node_cnt) {
296 opal_argv_free(node_cnt);
297 }
298 return ORTE_ERR_SILENT;
299
300 }
301
302 /*
303 * There's really nothing to do here
304 */
finalize(void)305 static int finalize(void)
306 {
307 return ORTE_SUCCESS;
308 }
309