1 /*****************************************************************************\
2 * acct_gather_energy_cray_aries.c - slurm energy accounting plugin for
3 * Cray/Aries.
4 *****************************************************************************
5 * Copyright (C) 2015 SchedMD LLC
6 * Written by Danny Auble <da@schedmd.com> who borrowed from the rapl
7 * plugin of the same type
8 *
9 * This file is part of Slurm, a resource management program.
10 * For details, see <https://slurm.schedmd.com/>.
11 * Please also read the included file: DISCLAIMER.
12 *
13 * Slurm is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option)
16 * any later version.
17 *
18 * In addition, as a special exception, the copyright holders give permission
19 * to link the code of portions of this program with the OpenSSL library under
20 * certain conditions as described in each individual source file, and
21 * distribute linked combinations including the two. You must obey the GNU
22 * General Public License in all respects for all of the code used other than
23 * OpenSSL. If you modify file(s) with this exception, you may extend this
24 * exception to your version of the file(s), but you are not obligated to do
25 * so. If you do not wish to do so, delete this exception statement from your
26 * version. If you delete this exception statement from all source files in
27 * the program, then also delete it here.
28 *
29 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
30 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
31 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
32 * details.
33 *
34 * You should have received a copy of the GNU General Public License along
35 * with Slurm; if not, write to the Free Software Foundation, Inc.,
36 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
37 *
38 \*****************************************************************************/
39
40 /* acct_gather_energy_cray_aries
41 * This plugin does not initiate a node-level thread.
42 * It will be used to get energy values from the cray bmc when available
43 */
44
45
46 #include "src/common/slurm_xlator.h"
47 #include "src/common/slurm_acct_gather_energy.h"
48
49
50 /*
51 * These variables are required by the generic plugin interface. If they
52 * are not found in the plugin, the plugin loader will ignore it.
53 *
54 * plugin_name - a string giving a human-readable description of the
55 * plugin. There is no maximum length, but the symbol must refer to
56 * a valid string.
57 *
58 * plugin_type - a string suggesting the type of the plugin or its
59 * applicability to a particular form of data or method of data handling.
60 * If the low-level plugin API is used, the contents of this string are
61 * unimportant and may be anything. Slurm uses the higher-level plugin
62 * interface which requires this string to be of the form
63 *
64 * <application>/<method>
65 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "acct_gather_energy" for Slurm energy accounting) and
 * <method> is a description of how this plugin satisfies that application.
 * Slurm will only load energy accounting plugins if the plugin_type string
 * has a prefix of "acct_gather_energy/".
71 *
72 * plugin_version - an unsigned 32-bit integer containing the Slurm version
73 * (major.minor.micro combined into a single number).
74 */
75 const char plugin_name[] = "AcctGatherEnergy Cray/Aries plugin";
76 const char plugin_type[] = "acct_gather_energy/cray_aries";
77 const uint32_t plugin_version = SLURM_VERSION_NUMBER;
78
79 static acct_gather_energy_t *local_energy = NULL;
80 static uint64_t debug_flags = 0;
81 static stepd_step_rec_t *job = NULL;
82
83 enum {
84 GET_ENERGY,
85 GET_POWER
86 };
87
88 extern void acct_gather_energy_p_conf_set(int context_id_in,
89 s_p_hashtbl_t *tbl);
90
_get_latest_stats(int type)91 static uint64_t _get_latest_stats(int type)
92 {
93 uint64_t data = 0;
94 int fd;
95 FILE *fp = NULL;
96 char *file_name;
97 char sbuf[72];
98 int num_read;
99
100 switch (type) {
101 case GET_ENERGY:
102 file_name = "/sys/cray/pm_counters/energy";
103 break;
104 case GET_POWER:
105 file_name = "/sys/cray/pm_counters/power";
106 break;
107 default:
108 error("unknown type %d", type);
109 return 0;
110 break;
111 }
112
113 if (!(fp = fopen(file_name, "r"))) {
114 error("%s: unable to open %s", __func__, file_name);
115 return data;
116 }
117
118 fd = fileno(fp);
119 if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1)
120 error("%s: fcntl(%s): %m", __func__, file_name);
121 num_read = read(fd, sbuf, (sizeof(sbuf) - 1));
122 if (num_read > 0) {
123 sbuf[num_read] = '\0';
124 sscanf(sbuf, "%"PRIu64, &data);
125 }
126 fclose(fp);
127
128 return data;
129 }
130
/*
 * _get_joules_task - sample the energy and power counters and fold the
 * reading into an energy record.
 *
 * IN/OUT energy - record to update. Nothing is done when its current_watts
 *	is NO_VAL. Otherwise current_watts, previous_consumed_energy and
 *	poll_time are refreshed on every call. On the first sample
 *	(previous_consumed_energy == 0) base_consumed_energy is set and
 *	ave_watts is zeroed; on later samples consumed_energy and ave_watts
 *	are accumulated.
 *
 * NOTE: the sample count behind the running average is a single static
 * counter shared by every record passed to this function.
 */
static void _get_joules_task(acct_gather_energy_t *energy)
{
	static uint32_t readings = 0;
	uint64_t curr_energy, diff_energy = 0;
	uint32_t curr_power;
	time_t now;

	if (energy->current_watts == NO_VAL)
		return;

	now = time(NULL);
	curr_energy = _get_latest_stats(GET_ENERGY);
	curr_power = (uint32_t) _get_latest_stats(GET_POWER);

	if (energy->previous_consumed_energy) {
		/*
		 * A reading below the previous one (for instance the 0 that
		 * _get_latest_stats() returns on failure) would make the
		 * unsigned subtraction wrap around; count no energy for this
		 * interval instead.
		 */
		if (curr_energy >= energy->previous_consumed_energy)
			diff_energy = curr_energy -
				      energy->previous_consumed_energy;

		energy->consumed_energy += diff_energy;

		/* 64-bit intermediates: the product and readings + 1 cannot wrap */
		energy->ave_watts = (uint32_t)
			((((uint64_t) energy->ave_watts * readings) +
			  energy->current_watts) /
			 ((uint64_t) readings + 1));
	} else {
		energy->base_consumed_energy = curr_energy;
		energy->ave_watts = 0;
	}
	readings++;
	energy->current_watts = curr_power;

	if (debug_flags & DEBUG_FLAG_ENERGY)
		info("_get_joules_task: %"PRIu64" Joules consumed over last"
		     " %ld secs. Currently at %u watts, ave watts %u",
		     diff_energy,
		     (long) (energy->poll_time ? now - energy->poll_time : 0),
		     curr_power, energy->ave_watts);

	energy->previous_consumed_energy = curr_energy;
	energy->poll_time = now;
}
168
_running_profile(void)169 static int _running_profile(void)
170 {
171 static bool run = false;
172 static uint32_t profile_opt = ACCT_GATHER_PROFILE_NOT_SET;
173
174 if (profile_opt == ACCT_GATHER_PROFILE_NOT_SET) {
175 acct_gather_profile_g_get(ACCT_GATHER_PROFILE_RUNNING,
176 &profile_opt);
177 if (profile_opt & ACCT_GATHER_PROFILE_ENERGY)
178 run = true;
179 }
180
181 return run;
182 }
183
_send_profile(void)184 static int _send_profile(void)
185 {
186 uint64_t curr_watts;
187 acct_gather_profile_dataset_t dataset[] = {
188 { "Power", PROFILE_FIELD_UINT64 },
189 { NULL, PROFILE_FIELD_NOT_SET }
190 };
191
192 static int dataset_id = -1; /* id of the dataset for profile data */
193
194 if (!_running_profile())
195 return SLURM_SUCCESS;
196
197 if (debug_flags & DEBUG_FLAG_ENERGY)
198 info("_send_profile: consumed %d watts",
199 local_energy->current_watts);
200
201 if (dataset_id < 0) {
202 dataset_id = acct_gather_profile_g_create_dataset(
203 "Energy", NO_PARENT, dataset);
204 if (debug_flags & DEBUG_FLAG_ENERGY)
205 debug("Energy: dataset created (id = %d)", dataset_id);
206 if (dataset_id == SLURM_ERROR) {
207 error("Energy: Failed to create the dataset for RAPL");
208 return SLURM_ERROR;
209 }
210 }
211
212 curr_watts = (uint64_t)local_energy->current_watts;
213
214 if (debug_flags & DEBUG_FLAG_PROFILE) {
215 info("PROFILE-Energy: power=%u", local_energy->current_watts);
216 }
217
218 return acct_gather_profile_g_add_sample_data(dataset_id,
219 (void *)&curr_watts,
220 local_energy->poll_time);
221 }
222
acct_gather_energy_p_update_node_energy(void)223 extern int acct_gather_energy_p_update_node_energy(void)
224 {
225 int rc = SLURM_SUCCESS;
226
227 xassert(running_in_slurmdstepd());
228
229 if (!local_energy || local_energy->current_watts == NO_VAL)
230 return rc;
231
232 _get_joules_task(local_energy);
233
234 return rc;
235 }
236
237 /*
238 * init() is called when the plugin is loaded, before any other functions
239 * are called. Put global initialization here.
240 */
init(void)241 extern int init(void)
242 {
243 debug_flags = slurm_get_debug_flags();
244
245 /* put anything that requires the .conf being read in
246 acct_gather_energy_p_conf_parse
247 */
248
249 return SLURM_SUCCESS;
250 }
251
fini(void)252 extern int fini(void)
253 {
254 if (!running_in_slurmdstepd())
255 return SLURM_SUCCESS;
256
257 acct_gather_energy_destroy(local_energy);
258 local_energy = NULL;
259 return SLURM_SUCCESS;
260 }
261
acct_gather_energy_p_get_data(enum acct_energy_type data_type,void * data)262 extern int acct_gather_energy_p_get_data(enum acct_energy_type data_type,
263 void *data)
264 {
265 int rc = SLURM_SUCCESS;
266 acct_gather_energy_t *energy = (acct_gather_energy_t *)data;
267 time_t *last_poll = (time_t *)data;
268 uint16_t *sensor_cnt = (uint16_t *)data;
269
270 xassert(running_in_slurmdstepd());
271
272 if (!local_energy) {
273 debug("%s: trying to get data %d, but no local_energy yet.",
274 __func__, data_type);
275 acct_gather_energy_p_conf_set(0, NULL);
276 }
277
278 switch (data_type) {
279 case ENERGY_DATA_JOULES_TASK:
280 case ENERGY_DATA_NODE_ENERGY_UP:
281 if (local_energy->current_watts == NO_VAL)
282 energy->consumed_energy = NO_VAL64;
283 else
284 _get_joules_task(energy);
285 break;
286 case ENERGY_DATA_STRUCT:
287 case ENERGY_DATA_NODE_ENERGY:
288 memcpy(energy, local_energy, sizeof(acct_gather_energy_t));
289 break;
290 case ENERGY_DATA_LAST_POLL:
291 *last_poll = local_energy->poll_time;
292 break;
293 case ENERGY_DATA_SENSOR_CNT:
294 *sensor_cnt = 1;
295 break;
296 default:
297 error("acct_gather_energy_p_get_data: unknown enum %d",
298 data_type);
299 rc = SLURM_ERROR;
300 break;
301 }
302 return rc;
303 }
304
acct_gather_energy_p_set_data(enum acct_energy_type data_type,void * data)305 extern int acct_gather_energy_p_set_data(enum acct_energy_type data_type,
306 void *data)
307 {
308 int rc = SLURM_SUCCESS;
309
310 xassert(running_in_slurmdstepd());
311
312 switch (data_type) {
313 case ENERGY_DATA_RECONFIG:
314 debug_flags = slurm_get_debug_flags();
315 break;
316 case ENERGY_DATA_PROFILE:
317 _get_joules_task(local_energy);
318 _send_profile();
319 break;
320 case ENERGY_DATA_STEP_PTR:
321 /* set global job if needed later */
322 job = (stepd_step_rec_t *)data;
323 break;
324 default:
325 error("acct_gather_energy_p_set_data: unknown enum %d",
326 data_type);
327 rc = SLURM_ERROR;
328 break;
329 }
330 return rc;
331 }
332
/*
 * acct_gather_energy_p_conf_options - configuration option hook.
 *
 * Both arguments are left untouched; this implementation adds nothing to
 * the option list.
 */
extern void acct_gather_energy_p_conf_options(s_p_options_t **full_options,
					      int *full_options_cnt)
{
	/* Intentionally empty. */
}
338
/*
 * acct_gather_energy_p_conf_set - set up the plugin's energy record.
 *
 * IN context_id_in - not used by this implementation.
 * IN tbl - not used by this implementation.
 *
 * Returns at once when running_in_slurmdstepd() is false or when
 * local_energy already exists. The first time through, local_energy is
 * allocated; if the energy counter reads 0 its current_watts is set to
 * NO_VAL, otherwise a first sample is taken with _get_joules_task().
 * Allocation happens at most once: a static flag records that it was done.
 */
extern void acct_gather_energy_p_conf_set(int context_id_in,
					  s_p_hashtbl_t *tbl)
{
	static bool flag_init = false;

	/* Wrong daemon, or already been here: nothing more to do. */
	if (!running_in_slurmdstepd() || local_energy)
		return;

	if (!flag_init) {
		flag_init = true;
		local_energy = acct_gather_energy_alloc(1);
		if (_get_latest_stats(GET_ENERGY))
			_get_joules_task(local_energy);
		else
			local_energy->current_watts = NO_VAL;
	}

	debug("%s loaded", plugin_name);
}
364
/*
 * acct_gather_energy_p_conf_values - configuration value reporting hook.
 *
 * The list is left untouched; this implementation adds no entries to it.
 */
extern void acct_gather_energy_p_conf_values(List *data)
{
	/* Intentionally empty. */
}
369