1 /*
2  * collectd - src/dcpmm.c
3  * MIT License
4  *
5  * Copyright (C) 2019 Intel Corporation. All rights reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23  * DEALINGS IN THE SOFTWARE.
24  *
25  * Authors:
26  *   Hari TG <hari.tg at intel.com>
27  */
28 
29 #include "collectd.h"
30 #include "utils/common/common.h"
31 
32 #include "pmw_api.h"
33 
/* Name used when registering callbacks and as a prefix in log messages. */
#define PLUGIN_NAME "dcpmm"
/* Render a truth value as "true"/"false" for log output. The argument is
 * parenthesized so that low-precedence arguments (a conditional expression,
 * an assignment) are evaluated as a whole before being tested. */
#define PRINT_BOOL(s) ((s) ? "true" : "false")
36 
37 int num_nvdimms;
38 int skip_stop = 0;
39 bool enable_dispatch_all = false;
40 cdtime_t interval = 0;
41 PMWATCH_OP_BUF pmw_output_buf;
42 PMWATCH_CONFIG_NODE pmwatch_config;
43 
/*
 * Dispatch a single gauge value on behalf of this plugin.
 *
 *   plugin_inst  plugin instance to report under; left empty when NULL.
 *   type         type name copied into the value list.
 *   type_inst    type instance to report under; left empty when NULL.
 *   value        gauge reading to dispatch.
 *
 * In COLLECT_DEBUG builds each dispatched value is additionally sent out as
 * an OKAY notification whose message carries the formatted value.
 */
static void add_metric(const char *plugin_inst, const char *type,
                       const char *type_inst, gauge_t value) {
  value_list_t vl = VALUE_LIST_INIT;

  vl.values = &(value_t){.gauge = value};
  vl.values_len = 1;

  sstrncpy(vl.plugin, PLUGIN_NAME, sizeof(vl.plugin));
  sstrncpy(vl.host, hostname_g, sizeof(vl.host));
  sstrncpy(vl.type, type, sizeof(vl.type));

  if (plugin_inst != NULL) {
    sstrncpy(vl.plugin_instance, plugin_inst, sizeof(vl.plugin_instance));
  }

  if (type_inst != NULL) {
    sstrncpy(vl.type_instance, type_inst, sizeof(vl.type_instance));
  }

  plugin_dispatch_values(&vl);

#if COLLECT_DEBUG

  notification_t n = {
      .severity = NOTIF_OKAY, .time = cdtime(), .plugin = PLUGIN_NAME};

  /* Hit ratios and the current temperatures are printed with two decimals,
   * everything else as a whole number. The instance arguments are allowed to
   * be NULL above, so they are guarded here as well instead of being handed
   * to strncmp()/sstrncpy() unchecked. */
  if (type_inst != NULL &&
      (strncmp(type_inst, "read_hit_ratio", strlen("read_hit_ratio")) == 0 ||
       strncmp(type_inst, "write_hit_ratio", strlen("write_hit_ratio")) == 0 ||
       strncmp(type_inst, "media_temperature", strlen("media_temperature")) ==
           0 ||
       strncmp(type_inst, "controller_temperature",
               strlen("controller_temperature")) == 0)) {
    snprintf(n.message, sizeof(n.message), "Value: %0.2f", value);
  } else {
    snprintf(n.message, sizeof(n.message), "Value: %0.0f", value);
  }
  sstrncpy(n.host, hostname_g, sizeof(n.host));
  sstrncpy(n.type, type, sizeof(n.type));

  /* Members not copied here stay zeroed by the initializer above. */
  if (type_inst != NULL) {
    sstrncpy(n.type_instance, type_inst, sizeof(n.type_instance));
  }
  if (plugin_inst != NULL) {
    sstrncpy(n.plugin_instance, plugin_inst, sizeof(n.plugin_instance));
  }

  plugin_dispatch_notification(&n);

#endif /* COLLECT_DEBUG */
} /* void add_metric  */
91 
/*
 * Read callback: refresh the output buffer via PMWAPIRead() and dispatch the
 * enabled metric groups for every DIMM, using the DIMM index as the plugin
 * instance.
 *
 * Returns 0 on success, otherwise the non-zero status of PMWAPIRead().
 */
static int dcpmm_read(__attribute__((unused)) user_data_t *ud) {
  DEBUG(PLUGIN_NAME ": %s:%d", __FUNCTION__, __LINE__);

  char dimm_id[16];

  int status = PMWAPIRead(&pmw_output_buf);
  if (status != 0) {
    ERROR(PLUGIN_NAME ": Failed to read data from the collection.");

    return status;
  }

  for (int idx = 0; idx < num_nvdimms; idx++) {
    snprintf(dimm_id, sizeof(dimm_id), "%d", idx);

    /* Performance counters. */
    if (pmwatch_config.collect_perf_metrics) {
      add_metric(dimm_id, "timestamp", "epoch",
                 PMWATCH_OP_BUF_EPOCH(&pmw_output_buf[idx]));
      add_metric(dimm_id, "timestamp", "tsc_cycles",
                 PMWATCH_OP_BUF_TIMESTAMP(&pmw_output_buf[idx]));
      add_metric(dimm_id, "media", "total_bytes_read",
                 PMWATCH_OP_BUF_TOTAL_BYTES_READ(&pmw_output_buf[idx]));
      add_metric(dimm_id, "media", "total_bytes_written",
                 PMWATCH_OP_BUF_TOTAL_BYTES_WRITTEN(&pmw_output_buf[idx]));
      add_metric(dimm_id, "media", "read_64B_ops_rcvd",
                 PMWATCH_OP_BUF_BYTES_READ(&pmw_output_buf[idx]));
      add_metric(dimm_id, "media", "write_64B_ops_rcvd",
                 PMWATCH_OP_BUF_BYTES_WRITTEN(&pmw_output_buf[idx]));
      add_metric(dimm_id, "media", "media_read_ops",
                 PMWATCH_OP_BUF_MEDIA_READ(&pmw_output_buf[idx]));
      add_metric(dimm_id, "media", "media_write_ops",
                 PMWATCH_OP_BUF_MEDIA_WRITE(&pmw_output_buf[idx]));
      add_metric(dimm_id, "controller", "host_reads",
                 PMWATCH_OP_BUF_HOST_READS(&pmw_output_buf[idx]));
      add_metric(dimm_id, "controller", "host_writes",
                 PMWATCH_OP_BUF_HOST_WRITES(&pmw_output_buf[idx]));
      add_metric(dimm_id, "buffer", "read_hit_ratio",
                 PMWATCH_OP_BUF_READ_HIT_RATIO(&pmw_output_buf[idx]));
      add_metric(dimm_id, "buffer", "write_hit_ratio",
                 PMWATCH_OP_BUF_WRITE_HIT_RATIO(&pmw_output_buf[idx]));
    }

    /* Health data is dispatched only when enabled, and, if performance
     * metrics are enabled too, only when EnableDispatchAll is set. */
    if (!pmwatch_config.collect_health) {
      continue;
    }
    if (pmwatch_config.collect_perf_metrics && !enable_dispatch_all) {
      continue;
    }

    add_metric(dimm_id, "timestamp", "epoch",
               PMWATCH_OP_BUF_EPOCH(&pmw_output_buf[idx]));
    add_metric(dimm_id, "timestamp", "tsc_cycles",
               PMWATCH_OP_BUF_TIMESTAMP(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "health_status",
               PMWATCH_OP_BUF_HEALTH_STATUS(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "lifespan_remaining",
               PMWATCH_OP_BUF_PERCENTAGE_REMAINING(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "lifespan_used",
               PMWATCH_OP_BUF_PERCENTAGE_USED(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "power_on_time",
               PMWATCH_OP_POWER_ON_TIME(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "uptime",
               PMWATCH_OP_BUF_UPTIME(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "last_shutdown_time",
               PMWATCH_OP_BUF_LAST_SHUTDOWN_TIME(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "media_temperature",
               PMWATCH_OP_BUF_MEDIA_TEMP(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "controller_temperature",
               PMWATCH_OP_BUF_CONTROLLER_TEMP(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "max_media_temperature",
               PMWATCH_OP_BUF_MAX_MEDIA_TEMP(&pmw_output_buf[idx]));
    add_metric(dimm_id, "health", "max_controller_temperature",
               PMWATCH_OP_BUF_MAX_CONTROLLER_TEMP(&pmw_output_buf[idx]));
  }

  return 0;
} /* int dcpmm_read */
168 
dcpmm_stop(void)169 static int dcpmm_stop(void) {
170   DEBUG(PLUGIN_NAME ": %s:%d", __FUNCTION__, __LINE__);
171 
172   int ret = 0;
173 
174   if (skip_stop) {
175     DEBUG(PLUGIN_NAME ": %s:%d skipping stop function", __FUNCTION__, __LINE__);
176 
177     return ret;
178   }
179 
180   ret = PMWAPIStop();
181   if (ret != 0) {
182     ERROR(PLUGIN_NAME ": Failed to stop the collection.");
183   }
184 
185   return ret;
186 } /* int dcpmm_stop */
187 
dcpmm_shutdown(void)188 static int dcpmm_shutdown(void) {
189   DEBUG(PLUGIN_NAME ": %s:%d", __FUNCTION__, __LINE__);
190 
191   int ret = 0;
192 
193   free(pmw_output_buf);
194 
195   ret = dcpmm_stop();
196 
197   return ret;
198 } /* int dcpmm_shutdown */
199 
dcpmm_init(void)200 static int dcpmm_init(void) {
201   DEBUG(PLUGIN_NAME ": %s:%d", __FUNCTION__, __LINE__);
202 
203   int ret = 0;
204 
205   ret = PMWAPIGetDIMMCount(&num_nvdimms);
206   if (ret != 0) {
207     ERROR(PLUGIN_NAME
208           ": Failed to obtain count of Intel(R) Optane DCPMM. "
209           "A common cause for this is collectd running without "
210           "root privileges. Ensure that collectd is running with "
211           "root privileges. Also, make sure that Intel(R) Optane DC "
212           "Persistent Memory is available in the system.");
213     skip_stop = 1;
214 
215     return ret;
216   }
217 
218   ret = PMWAPIStart(pmwatch_config);
219   if (ret != 0) {
220     ERROR(PLUGIN_NAME ": Failed to start the collection. "
221                       "A common cause for this is collectd running without "
222                       "root privileges. Ensure that collectd is running with "
223                       "root privileges.");
224     skip_stop = 1;
225 
226     return ret;
227   }
228 
229   pmw_output_buf =
230       (PMWATCH_OP_BUF)calloc(num_nvdimms, sizeof(PMWATCH_OP_BUF_NODE));
231   if (pmw_output_buf == NULL) {
232     ERROR(PLUGIN_NAME ": Memory allocation for output buffer failed.");
233     dcpmm_stop();
234     skip_stop = 1;
235     ret = 1;
236   }
237 
238   return ret;
239 } /* int dcpmm_init */
240 
/*
 * Config callback: parse the plugin's configuration block and register the
 * read callback.
 *
 * Recognized keys (matched case-insensitively, whole key only):
 *   Interval            read interval; stored both as cdtime_t for the read
 *                       registration and as double in pmwatch_config.
 *   CollectHealth       boolean, stored in pmwatch_config.collect_health.
 *   CollectPerfMetrics  boolean, stored in pmwatch_config.collect_perf_metrics.
 *   EnableDispatchAll   boolean, stored in enable_dispatch_all.
 *
 * Returns 0 on success and non-zero when a key is unknown, a value fails to
 * parse, both metric groups are disabled, or the read callback cannot be
 * registered.
 */
static int dcpmm_config(oconfig_item_t *ci) {
  DEBUG(PLUGIN_NAME ": %s:%d", __FUNCTION__, __LINE__);

  int ret = 0;

  for (int i = 0; i < ci->children_num; i++) {
    oconfig_item_t *child = ci->children + i;

    /* Compare the whole key: a length-limited comparison would also accept
     * keys that merely start with a known option name. */
    if (strcasecmp("Interval", child->key) == 0) {
      ret = cf_util_get_cdtime(child, &interval);
      if (!ret) {
        ret = cf_util_get_double(child, &pmwatch_config.interval);
      }
    } else if (strcasecmp("CollectHealth", child->key) == 0) {
      ret = cf_util_get_boolean(child, &pmwatch_config.collect_health);
    } else if (strcasecmp("CollectPerfMetrics", child->key) == 0) {
      ret = cf_util_get_boolean(child, &pmwatch_config.collect_perf_metrics);
    } else if (strcasecmp("EnableDispatchAll", child->key) == 0) {
      ret = cf_util_get_boolean(child, &enable_dispatch_all);
    } else {
      ERROR(PLUGIN_NAME ": Unkown configuration parameter %s.", child->key);
      ret = 1;
    }

    if (ret != 0) {
      ERROR(PLUGIN_NAME ": Failed to parse configuration parameters");
      return ret;
    }
  }

  DEBUG("%s Config: Interval %.2f ; CollectHealth %s ; CollectPerfMetrics %s "
        "; EnableDispatchAll %s",
        PLUGIN_NAME, pmwatch_config.interval,
        PRINT_BOOL(pmwatch_config.collect_health),
        PRINT_BOOL(pmwatch_config.collect_perf_metrics),
        PRINT_BOOL(enable_dispatch_all));

  if (!pmwatch_config.collect_health && !pmwatch_config.collect_perf_metrics) {
    ERROR(PLUGIN_NAME ": CollectHealth and CollectPerfMetrics are disabled. "
                      "Enable atleast one.");
    return 1;
  }

  /* Without a registered read callback nothing is ever collected, so a
   * registration failure is reported to the caller instead of ignored. */
  ret = plugin_register_complex_read(NULL, PLUGIN_NAME, dcpmm_read, interval,
                                     NULL);
  if (ret != 0) {
    ERROR(PLUGIN_NAME ": Failed to register the read callback.");
    return ret;
  }

  return 0;
} /* int dcpmm_config */
292 
module_register(void)293 void module_register(void) {
294   plugin_register_init(PLUGIN_NAME, dcpmm_init);
295   plugin_register_complex_config(PLUGIN_NAME, dcpmm_config);
296   plugin_register_shutdown(PLUGIN_NAME, dcpmm_shutdown);
297 } /* void module_register */
298