1 /*
2  * Copyright (c) 2007 The Regents of the University of California.
3  * Copyright (c) 2007-2009 Voltaire, Inc. All rights reserved.
4  * Copyright (c) 2009,2010 HNR Consulting. All rights reserved.
5  * Copyright (c) 2013 Lawrence Livermore National Security. All rights reserved.
6  * Copyright (c) 2011-2014 Mellanox Technologies LTD. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  *
36  */
37 
38 /*
39  * Abstract:
40  *    Implementation of osm_perfmgr_t.
41  * This object implements an IBA performance manager.
42  *
43  * Author:
44  *    Ira Weiny, LLNL
45  */
46 
47 #if HAVE_CONFIG_H
48 #  include <config.h>
49 #endif				/* HAVE_CONFIG_H */
50 
51 #ifdef ENABLE_OSM_PERF_MGR
52 #include <stdlib.h>
53 #include <stdint.h>
54 #include <string.h>
55 #include <poll.h>
56 #include <errno.h>
57 #include <sys/time.h>
58 #include <netinet/in.h>
59 #include <float.h>
60 #include <arpa/inet.h>
61 #include <sys/socket.h>
62 #include <iba/ib_types.h>
63 #include <complib/cl_debug.h>
64 #include <complib/cl_thread.h>
65 #include <opensm/osm_file_ids.h>
66 #define FILE_ID OSM_FILE_PERFMGR_C
67 #include <vendor/osm_vendor_api.h>
68 #include <opensm/osm_perfmgr.h>
69 #include <opensm/osm_log.h>
70 #include <opensm/osm_node.h>
71 #include <opensm/osm_opensm.h>
72 #include <opensm/osm_helper.h>
73 
74 #define PERFMGR_INITIAL_TID_VALUE 0xcafe
75 
76 #ifdef ENABLE_OSM_PERF_MGR_PROFILE
/*
 * Running timing statistics, in microseconds, for the samples passed to
 * update_mad_stats().
 *
 * fastest_us and slowest_us start at sentinel values so that the first
 * sample replaces them.  DBL_MIN is the smallest positive normalized
 * double, which is a sufficient lower sentinel because elapsed times are
 * never negative.  num is the number of samples folded into avg_us.
 */
struct {
	double fastest_us;
	double slowest_us;
	double avg_us;
	uint64_t num;
} perfmgr_mad_stats = {
	.fastest_us = DBL_MAX,
	.slowest_us = DBL_MIN,
	.avg_us = 0,
	.num = 0
};
84 
85 /* diff must be something which can fit in a susecond_t */
86 static inline void update_mad_stats(struct timeval *diff)
87 {
88 	double new = (diff->tv_sec * 1000000) + diff->tv_usec;
89 	if (new < perfmgr_mad_stats.fastest_us)
90 		perfmgr_mad_stats.fastest_us = new;
91 	if (new > perfmgr_mad_stats.slowest_us)
92 		perfmgr_mad_stats.slowest_us = new;
93 
94 	perfmgr_mad_stats.avg_us =
95 	    ((perfmgr_mad_stats.avg_us * perfmgr_mad_stats.num) + new)
96 	    / (perfmgr_mad_stats.num + 1);
97 	perfmgr_mad_stats.num++;
98 }
99 
100 static inline void clear_mad_stats(void)
101 {
102 	perfmgr_mad_stats.fastest_us = DBL_MAX;
103 	perfmgr_mad_stats.slowest_us = DBL_MIN;
104 	perfmgr_mad_stats.avg_us = 0;
105 	perfmgr_mad_stats.num = 0;
106 }
107 
/*
 * Compute *diff = *after - *before, borrowing one second when the
 * microsecond part would otherwise go negative.
 *
 * Both inputs are read in full before *diff is written, so diff may
 * point at the same struct as after.
 */
static inline void diff_time(struct timeval *before, struct timeval *after,
			     struct timeval *diff)
{
	struct timeval delta;

	delta.tv_sec = after->tv_sec - before->tv_sec;
	delta.tv_usec = after->tv_usec - before->tv_usec;

	if (delta.tv_usec < 0) {
		/* borrow a second to keep tv_usec non-negative */
		delta.tv_sec--;
		delta.tv_usec += 1000000;
	}

	diff->tv_sec = delta.tv_sec;
	diff->tv_usec = delta.tv_usec;
}
120 #endif
121 
122 /**********************************************************************
123  * Internal helper functions
124  **********************************************************************/
125 static void init_monitored_nodes(osm_perfmgr_t * pm)
126 {
127 	cl_qmap_init(&pm->monitored_map);
128 	pm->remove_list = NULL;
129 	cl_event_construct(&pm->sig_query);
130 	cl_event_init(&pm->sig_query, FALSE);
131 }
132 
133 static void mark_for_removal(osm_perfmgr_t * pm, monitored_node_t * node)
134 {
135 	if (pm->remove_list) {
136 		node->next = pm->remove_list;
137 		pm->remove_list = node;
138 	} else {
139 		node->next = NULL;
140 		pm->remove_list = node;
141 	}
142 }
143 
144 static void remove_marked_nodes(osm_perfmgr_t * pm)
145 {
146 	while (pm->remove_list) {
147 		monitored_node_t *next = pm->remove_list->next;
148 		int port;
149 
150 		cl_qmap_remove_item(&pm->monitored_map,
151 				    (cl_map_item_t *) (pm->remove_list));
152 
153 		if (pm->rm_nodes)
154 			perfmgr_db_delete_entry(pm->db, pm->remove_list->guid);
155 		else
156 			perfmgr_db_mark_active(pm->db, pm->remove_list->guid, FALSE);
157 
158 		if (pm->remove_list->name)
159 			free(pm->remove_list->name);
160 
161 		for (port = pm->remove_list->esp0 ? 0 : 1;
162 		     port < pm->remove_list->num_ports;
163 		     port++) {
164 			if (pm->remove_list->port[port].remote_name)
165 				free(pm->remove_list->port[port].remote_name);
166 		}
167 
168 		free(pm->remove_list);
169 		pm->remove_list = next;
170 	}
171 }
172 
173 static inline void decrement_outstanding_queries(osm_perfmgr_t * pm)
174 {
175 	cl_atomic_dec(&pm->outstanding_queries);
176 
177 	if (!pm->outstanding_queries) {
178 		cl_spinlock_acquire(&pm->lock);
179 		if (pm->sweep_state == PERFMGR_SWEEP_POST_PROCESSING) {
180 			pm->sweep_state = PERFMGR_SWEEP_SLEEP;
181 			OSM_LOG(pm->log, OSM_LOG_INFO,
182 				"PM sweep state exiting Post Processing\n");
183 		}
184 		cl_spinlock_release(&pm->lock);
185 	}
186 
187 	cl_event_signal(&pm->sig_query);
188 }
189 
190 /**********************************************************************
191  * Receive the MAD from the vendor layer and post it for processing by
192  * the dispatcher
193  **********************************************************************/
194 static void perfmgr_mad_recv_callback(osm_madw_t * p_madw, void *bind_context,
195 				      osm_madw_t * p_req_madw)
196 {
197 	osm_perfmgr_t *pm = (osm_perfmgr_t *) bind_context;
198 
199 	OSM_LOG_ENTER(pm->log);
200 
201 	CL_ASSERT(p_madw);
202 	CL_ASSERT(p_req_madw != NULL);
203 
204 	osm_madw_copy_context(p_madw, p_req_madw);
205 	osm_mad_pool_put(pm->mad_pool, p_req_madw);
206 
207 	decrement_outstanding_queries(pm);
208 
209 	/* post this message for later processing. */
210 	if (cl_disp_post(pm->pc_disp_h, OSM_MSG_MAD_PORT_COUNTERS,
211 			 p_madw, NULL, NULL) != CL_SUCCESS) {
212 		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5401: "
213 			"PerfMgr Dispatcher post failed\n");
214 		osm_mad_pool_put(pm->mad_pool, p_madw);
215 	}
216 	OSM_LOG_EXIT(pm->log);
217 }
218 
219 /**********************************************************************
220  * Process MAD send errors
221  **********************************************************************/
222 static void perfmgr_mad_send_err_callback(void *bind_context,
223 					  osm_madw_t * p_madw)
224 {
225 	osm_perfmgr_t *pm = (osm_perfmgr_t *) bind_context;
226 	osm_madw_context_t *context = &p_madw->context;
227 	uint64_t node_guid = context->perfmgr_context.node_guid;
228 	uint8_t port = context->perfmgr_context.port;
229 	cl_map_item_t *p_node;
230 	monitored_node_t *p_mon_node;
231 	ib_net16_t orig_lid;
232 
233 	OSM_LOG_ENTER(pm->log);
234 
235 	/*
236 	 * get the monitored node struct to have the printable name
237 	 * for log messages
238 	 */
239 	if ((p_node = cl_qmap_get(&pm->monitored_map, node_guid)) ==
240 	    cl_qmap_end(&pm->monitored_map)) {
241 		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5415: GUID 0x%016"
242 			PRIx64 " not found in monitored map\n", node_guid);
243 		goto Exit;
244 	}
245 	p_mon_node = (monitored_node_t *) p_node;
246 
247 	OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5402: %s (0x%" PRIx64
248 		") port %u LID %u TID 0x%" PRIx64 "\n",
249 		p_mon_node->name, p_mon_node->guid, port,
250 		cl_ntoh16(p_madw->mad_addr.dest_lid),
251 		cl_ntoh64(p_madw->p_mad->trans_id));
252 
253 	if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
254 		/* First, find the node in the monitored map */
255 		cl_plock_acquire(&pm->osm->lock);
256 		/* Now, validate port number */
257 		if (port >= p_mon_node->num_ports) {
258 			cl_plock_release(&pm->osm->lock);
259 			OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5416: "
260 				"Invalid port num %u for %s (GUID 0x%016"
261 				PRIx64 ") num ports %u\n", port,
262 				p_mon_node->name, p_mon_node->guid,
263 				p_mon_node->num_ports);
264 			goto Exit;
265 		}
266 		/* Clear redirection info for this port except orig_lid */
267 		orig_lid = p_mon_node->port[port].orig_lid;
268 		memset(&p_mon_node->port[port], 0, sizeof(monitored_port_t));
269 		p_mon_node->port[port].orig_lid = orig_lid;
270 		p_mon_node->port[port].valid = TRUE;
271 		cl_plock_release(&pm->osm->lock);
272 	}
273 
274 Exit:
275 	osm_mad_pool_put(pm->mad_pool, p_madw);
276 
277 	decrement_outstanding_queries(pm);
278 
279 	OSM_LOG_EXIT(pm->log);
280 }
281 
282 /**********************************************************************
283  * Bind the PerfMgr to the vendor layer for MAD sends/receives
284  **********************************************************************/
285 ib_api_status_t osm_perfmgr_bind(osm_perfmgr_t * pm, ib_net64_t port_guid)
286 {
287 	osm_bind_info_t bind_info;
288 	ib_api_status_t status = IB_SUCCESS;
289 
290 	OSM_LOG_ENTER(pm->log);
291 
292 	if (pm->bind_handle != OSM_BIND_INVALID_HANDLE) {
293 		OSM_LOG(pm->log, OSM_LOG_ERROR,
294 			"ERR 5403: Multiple binds not allowed\n");
295 		status = IB_ERROR;
296 		goto Exit;
297 	}
298 
299 	bind_info.port_guid = pm->port_guid = port_guid;
300 	bind_info.mad_class = IB_MCLASS_PERF;
301 	bind_info.class_version = 1;
302 	bind_info.is_responder = FALSE;
303 	bind_info.is_report_processor = FALSE;
304 	bind_info.is_trap_processor = FALSE;
305 	bind_info.recv_q_size = OSM_PM_DEFAULT_QP1_RCV_SIZE;
306 	bind_info.send_q_size = OSM_PM_DEFAULT_QP1_SEND_SIZE;
307 	bind_info.timeout = pm->subn->opt.transaction_timeout;
308 	bind_info.retries = pm->subn->opt.transaction_retries;
309 
310 	OSM_LOG(pm->log, OSM_LOG_VERBOSE,
311 		"Binding to port GUID 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
312 
313 	pm->bind_handle = osm_vendor_bind(pm->vendor, &bind_info, pm->mad_pool,
314 					  perfmgr_mad_recv_callback,
315 					  perfmgr_mad_send_err_callback, pm);
316 
317 	if (pm->bind_handle == OSM_BIND_INVALID_HANDLE) {
318 		status = IB_ERROR;
319 		OSM_LOG(pm->log, OSM_LOG_ERROR,
320 			"ERR 5404: Vendor specific bind failed (%s)\n",
321 			ib_get_err_str(status));
322 	}
323 
324 Exit:
325 	OSM_LOG_EXIT(pm->log);
326 	return status;
327 }
328 
329 /**********************************************************************
330  * Unbind the PerfMgr from the vendor layer for MAD sends/receives
331  **********************************************************************/
332 static void perfmgr_mad_unbind(osm_perfmgr_t * pm)
333 {
334 	OSM_LOG_ENTER(pm->log);
335 	if (pm->bind_handle == OSM_BIND_INVALID_HANDLE) {
336 		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5405: No previous bind\n");
337 		goto Exit;
338 	}
339 	osm_vendor_unbind(pm->bind_handle);
340 Exit:
341 	OSM_LOG_EXIT(pm->log);
342 }
343 
344 /**********************************************************************
345  * Given a monitored node and a port, return the qp
346  **********************************************************************/
347 static ib_net32_t get_qp(monitored_node_t * mon_node, uint8_t port)
348 {
349 	ib_net32_t qp = IB_QP1;
350 
351 	if (mon_node && mon_node->num_ports && port < mon_node->num_ports &&
352 	    mon_node->port[port].redirection && mon_node->port[port].qp)
353 		qp = mon_node->port[port].qp;
354 
355 	return qp;
356 }
357 
358 static ib_net16_t get_base_lid(osm_node_t * p_node, uint8_t port)
359 {
360 	switch (p_node->node_info.node_type) {
361 	case IB_NODE_TYPE_CA:
362 	case IB_NODE_TYPE_ROUTER:
363 		return osm_node_get_base_lid(p_node, port);
364 	case IB_NODE_TYPE_SWITCH:
365 		return osm_node_get_base_lid(p_node, 0);
366 	default:
367 		return 0;
368 	}
369 }
370 
371 /**********************************************************************
372  * Given a node, a port, and an optional monitored node,
373  * return the lid appropriate to query that port
374  **********************************************************************/
375 static ib_net16_t get_lid(osm_node_t * p_node, uint8_t port,
376 			  monitored_node_t * mon_node)
377 {
378 	if (mon_node && mon_node->num_ports && port < mon_node->num_ports &&
379 	    mon_node->port[port].lid)
380 		return mon_node->port[port].lid;
381 
382 	return get_base_lid(p_node, port);
383 }
384 
385 /**********************************************************************
386  * Build a Performance Management class MAD
387  **********************************************************************/
388 static osm_madw_t *perfmgr_build_mad(osm_perfmgr_t * perfmgr,
389 				     ib_net16_t dest_lid,
390 				     uint8_t sl,
391 				     ib_net32_t dest_qp,
392 				     uint16_t pkey_ix,
393 				     uint8_t mad_method,
394 				     ib_net16_t attr_id,
395 				     osm_madw_context_t * p_context,
396 				     ib_perfmgt_mad_t ** p_pm_mad)
397 {
398 	ib_perfmgt_mad_t *pm_mad = NULL;
399 	osm_madw_t *p_madw = NULL;
400 
401 	OSM_LOG_ENTER(perfmgr->log);
402 
403 	p_madw = osm_mad_pool_get(perfmgr->mad_pool, perfmgr->bind_handle,
404 				  MAD_BLOCK_SIZE, NULL);
405 	if (p_madw == NULL)
406 		return NULL;
407 
408 	pm_mad = osm_madw_get_perfmgt_mad_ptr(p_madw);
409 
410 	/* build the mad */
411 	pm_mad->header.base_ver = 1;
412 	pm_mad->header.mgmt_class = IB_MCLASS_PERF;
413 	pm_mad->header.class_ver = 1;
414 	pm_mad->header.method = mad_method;
415 	pm_mad->header.status = 0;
416 	pm_mad->header.class_spec = 0;
417 	pm_mad->header.trans_id =
418 	    cl_hton64((uint64_t) cl_atomic_inc(&perfmgr->trans_id) &
419 		      (uint64_t) (0xFFFFFFFF));
420 	if (perfmgr->trans_id == 0)
421 		pm_mad->header.trans_id =
422 		    cl_hton64((uint64_t) cl_atomic_inc(&perfmgr->trans_id) &
423 			      (uint64_t) (0xFFFFFFFF));
424 	pm_mad->header.attr_id = attr_id;
425 	pm_mad->header.resv = 0;
426 	pm_mad->header.attr_mod = 0;
427 
428 	p_madw->mad_addr.dest_lid = dest_lid;
429 	p_madw->mad_addr.addr_type.gsi.remote_qp = dest_qp;
430 	p_madw->mad_addr.addr_type.gsi.remote_qkey =
431 	    cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY);
432 	p_madw->mad_addr.addr_type.gsi.pkey_ix = pkey_ix;
433 	p_madw->mad_addr.addr_type.gsi.service_level = sl;
434 	p_madw->mad_addr.addr_type.gsi.global_route = FALSE;
435 	p_madw->resp_expected = TRUE;
436 
437 	if (p_context)
438 		p_madw->context = *p_context;
439 
440 	if (p_pm_mad)
441 		*p_pm_mad = pm_mad;
442 
443 	OSM_LOG_EXIT(perfmgr->log);
444 
445 	return (p_madw);
446 }
447 
448 /**********************************************************************
449  * Send a Performance Management class MAD
450  **********************************************************************/
451 static ib_api_status_t perfmgr_send_mad(osm_perfmgr_t *perfmgr,
452 					osm_madw_t * const p_madw)
453 {
454 	cl_status_t sts;
455 	ib_api_status_t status = osm_vendor_send(perfmgr->bind_handle, p_madw,
456 						 TRUE);
457 	if (status == IB_SUCCESS) {
458 		/* pause thread if there are too many outstanding requests */
459 		cl_atomic_inc(&(perfmgr->outstanding_queries));
460 		while (perfmgr->outstanding_queries >
461 		       (int32_t)perfmgr->max_outstanding_queries) {
462 			cl_spinlock_acquire(&perfmgr->lock);
463 			if (perfmgr->sweep_state == PERFMGR_SWEEP_SLEEP) {
464 				perfmgr->sweep_state = PERFMGR_SWEEP_POST_PROCESSING;
465 				OSM_LOG(perfmgr->log, OSM_LOG_INFO,
466 					"PM sweep state going into Post Processing\n");
467 			} else if (perfmgr->sweep_state == PERFMGR_SWEEP_ACTIVE)
468 				perfmgr->sweep_state = PERFMGR_SWEEP_SUSPENDED;
469 			cl_spinlock_release(&perfmgr->lock);
470 wait:
471 			sts = cl_event_wait_on(&perfmgr->sig_query,
472 					       EVENT_NO_TIMEOUT, TRUE);
473 			if (sts != CL_SUCCESS)
474 				goto wait;
475 
476 			cl_spinlock_acquire(&perfmgr->lock);
477 			if (perfmgr->sweep_state == PERFMGR_SWEEP_SUSPENDED)
478 				perfmgr->sweep_state = PERFMGR_SWEEP_ACTIVE;
479 			cl_spinlock_release(&perfmgr->lock);
480 		}
481 	}
482 	return (status);
483 }
484 
485 
486 /**********************************************************************
487  * Form and send the PortCounters MAD for a single port
488  **********************************************************************/
489 static ib_api_status_t perfmgr_send_pc_mad(osm_perfmgr_t * perfmgr,
490 					   ib_net16_t dest_lid,
491 					   ib_net32_t dest_qp, uint16_t pkey_ix,
492 					   uint8_t port, uint8_t mad_method,
493 					   uint16_t counter_select,
494 					   uint8_t counter_select2,
495 					   osm_madw_context_t * p_context,
496 					   uint8_t sl)
497 {
498 	ib_api_status_t status = IB_SUCCESS;
499 	ib_port_counters_t *port_counter = NULL;
500 	ib_perfmgt_mad_t *pm_mad = NULL;
501 	osm_madw_t *p_madw = NULL;
502 
503 	OSM_LOG_ENTER(perfmgr->log);
504 
505 	p_context->perfmgr_context.mad_attr_id = IB_MAD_ATTR_PORT_CNTRS;
506 	p_madw = perfmgr_build_mad(perfmgr, dest_lid, sl, dest_qp, pkey_ix,
507 				mad_method, IB_MAD_ATTR_PORT_CNTRS, p_context,
508 				&pm_mad);
509 	if (p_madw == NULL)
510 		return IB_INSUFFICIENT_MEMORY;
511 
512 	port_counter = (ib_port_counters_t *) & pm_mad->data;
513 	memset(port_counter, 0, sizeof(*port_counter));
514 	port_counter->port_select = port;
515 	port_counter->counter_select = cl_hton16(counter_select);
516 	port_counter->counter_select2 = counter_select2;
517 
518 	status = perfmgr_send_mad(perfmgr, p_madw);
519 
520 	OSM_LOG_EXIT(perfmgr->log);
521 	return status;
522 }
523 
524 /**********************************************************************
525  * sweep the node_guid_tbl and collect the node guids to be tracked
526  **********************************************************************/
527 static void collect_guids(cl_map_item_t * p_map_item, void *context)
528 {
529 	osm_node_t *node = (osm_node_t *) p_map_item;
530 	uint64_t node_guid = cl_ntoh64(node->node_info.node_guid);
531 	osm_perfmgr_t *pm = (osm_perfmgr_t *) context;
532 	monitored_node_t *mon_node = NULL;
533 	uint32_t num_ports;
534 	int port;
535 
536 	OSM_LOG_ENTER(pm->log);
537 
538 	if (cl_qmap_get(&pm->monitored_map, node_guid) ==
539 	    cl_qmap_end(&pm->monitored_map)) {
540 
541 		if (pm->ignore_cas &&
542 		    (node->node_info.node_type == IB_NODE_TYPE_CA))
543 			goto Exit;
544 
545 		/* if not already in map add it */
546 		num_ports = osm_node_get_num_physp(node);
547 		mon_node = malloc(sizeof(*mon_node) +
548 				  sizeof(monitored_port_t) * num_ports);
549 		if (!mon_node) {
550 			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 5406: "
551 				"malloc failed: not handling node %s"
552 				"(GUID 0x%" PRIx64 ")\n", node->print_desc,
553 				node_guid);
554 			goto Exit;
555 		}
556 		memset(mon_node, 0,
557 		       sizeof(*mon_node) + sizeof(monitored_port_t) * num_ports);
558 		mon_node->guid = node_guid;
559 		mon_node->name = strdup(node->print_desc);
560 		mon_node->num_ports = num_ports;
561 		mon_node->node_type = node->node_info.node_type;
562 		/* check for enhanced switch port 0 */
563 		mon_node->esp0 = (node->sw &&
564 				  ib_switch_info_is_enhanced_port0(&node->sw->
565 								   switch_info));
566 		for (port = mon_node->esp0 ? 0 : 1; port < num_ports; port++) {
567 			monitored_port_t *mon_port = &mon_node->port[port];
568 			osm_physp_t *p_physp = &node->physp_table[port];
569 			osm_physp_t *p_remote_physp = p_physp->p_remote_physp;
570 
571 			mon_port->orig_lid = 0;
572 			mon_port->valid = FALSE;
573 			if (osm_physp_is_valid(p_physp)) {
574 				mon_port->orig_lid = get_base_lid(node, port);
575 				mon_port->valid = TRUE;
576 			}
577 			mon_port->remote_valid = FALSE;
578 			mon_port->remote_name = NULL;
579 			if (p_remote_physp && osm_physp_is_valid(p_remote_physp)) {
580 				osm_node_t *p_remote_node = p_remote_physp->p_node;
581 				mon_port->remote_valid = TRUE;
582 				mon_port->remote_guid = p_remote_node->node_info.node_guid;
583 				mon_port->remote_name = strdup(p_remote_node->print_desc);
584 				mon_port->remote_port = p_remote_physp->port_num;
585 			}
586 		}
587 
588 		cl_qmap_insert(&pm->monitored_map, node_guid,
589 			       (cl_map_item_t *) mon_node);
590 	}
591 
592 Exit:
593 	OSM_LOG_EXIT(pm->log);
594 }
595 
596 /**********************************************************************
597  * Form and send the ClassPortInfo MAD for a single port
598  **********************************************************************/
599 static ib_api_status_t perfmgr_send_cpi_mad(osm_perfmgr_t * pm,
600 					    ib_net16_t dest_lid,
601 					    ib_net32_t dest_qp,
602 					    uint16_t pkey_ix,
603 					    uint8_t port,
604 					    osm_madw_context_t * p_context,
605 					    uint8_t sl)
606 {
607 	ib_api_status_t status = IB_SUCCESS;
608 	osm_madw_t *p_madw = NULL;
609 
610 	OSM_LOG_ENTER(pm->log);
611 
612 	p_context->perfmgr_context.mad_attr_id = IB_MAD_ATTR_CLASS_PORT_INFO;
613 	p_madw = perfmgr_build_mad(pm, dest_lid, sl, dest_qp,
614 				   pkey_ix, IB_MAD_METHOD_GET,
615 				   IB_MAD_ATTR_CLASS_PORT_INFO, p_context,
616 				   NULL);
617 	if (p_madw == NULL)
618 		return IB_INSUFFICIENT_MEMORY;
619 
620 	status = perfmgr_send_mad(pm, p_madw);
621 
622 	OSM_LOG_EXIT(pm->log);
623 	return status;
624 }
625 
626 /**********************************************************************
627  * return if some form of PortCountersExtended (PCE || PCE NoIETF) are supported
628  **********************************************************************/
629 static inline boolean_t pce_supported(monitored_node_t *mon_node, uint8_t port)
630 {
631 	monitored_port_t *mon_port = &(mon_node->port[port]);
632 	return (mon_port->cpi_valid
633 		&& (mon_port->cap_mask & IB_PM_EXT_WIDTH_SUPPORTED
634 		|| mon_port->cap_mask & IB_PM_EXT_WIDTH_NOIETF_SUP));
635 }
636 
637 /**********************************************************************
638  * return if CapMask.PortCountersXmitWaitSupported is set
639  **********************************************************************/
640 static inline boolean_t xmit_wait_supported(monitored_node_t *mon_node, uint8_t port)
641 {
642 	monitored_port_t *mon_port = &(mon_node->port[port]);
643 	return (mon_port->cpi_valid
644 		&& (mon_port->cap_mask & IB_PM_PC_XMIT_WAIT_SUP));
645 }
646 
647 /**********************************************************************
648  * return if "full" PortCountersExtended (IETF) is indicated
649  **********************************************************************/
650 static inline boolean_t ietf_supported(monitored_node_t *mon_node, uint8_t port)
651 {
652 	monitored_port_t *mon_port = &(mon_node->port[port]);
653 	return (mon_port->cpi_valid
654 		&& (mon_port->cap_mask & IB_PM_EXT_WIDTH_SUPPORTED));
655 }
656 
657 /**********************************************************************
658  * Form and send the PortCountersExtended MAD for a single port
659  **********************************************************************/
660 static ib_api_status_t perfmgr_send_pce_mad(osm_perfmgr_t * perfmgr,
661 					    ib_net16_t dest_lid,
662 					    ib_net32_t dest_qp,
663 					    uint16_t pkey_ix,
664 					    uint8_t port, uint8_t mad_method,
665 					    osm_madw_context_t * p_context,
666 					    uint8_t sl)
667 {
668 	ib_api_status_t status = IB_SUCCESS;
669 	ib_port_counters_ext_t *port_counter_ext = NULL;
670 	ib_perfmgt_mad_t *pm_mad = NULL;
671 	osm_madw_t *p_madw = NULL;
672 
673 	OSM_LOG_ENTER(perfmgr->log);
674 
675 	p_context->perfmgr_context.mad_attr_id = IB_MAD_ATTR_PORT_CNTRS_EXT;
676 	p_madw = perfmgr_build_mad(perfmgr, dest_lid, sl, dest_qp, pkey_ix,
677 				mad_method, IB_MAD_ATTR_PORT_CNTRS_EXT, p_context,
678 				&pm_mad);
679 	if (p_madw == NULL)
680 		return IB_INSUFFICIENT_MEMORY;
681 
682 	port_counter_ext = (ib_port_counters_ext_t *) & pm_mad->data;
683 	memset(port_counter_ext, 0, sizeof(*port_counter_ext));
684 	port_counter_ext->port_select = port;
685 	port_counter_ext->counter_select = cl_hton16(0x00FF);
686 
687 	status = perfmgr_send_mad(perfmgr, p_madw);
688 
689 	OSM_LOG_EXIT(perfmgr->log);
690 	return status;
691 }
692 
693 /**********************************************************************
694  * query the Port Counters of all the nodes in the subnet
695  **********************************************************************/
696 static void perfmgr_query_counters(cl_map_item_t * p_map_item, void *context)
697 {
698 	ib_api_status_t status = IB_SUCCESS;
699 	osm_perfmgr_t *pm = context;
700 	osm_node_t *node = NULL;
701 	monitored_node_t *mon_node = (monitored_node_t *) p_map_item;
702 	osm_madw_context_t mad_context;
703 	uint64_t node_guid = 0;
704 	ib_net32_t remote_qp;
705 	uint8_t port, num_ports = 0;
706 
707 	OSM_LOG_ENTER(pm->log);
708 
709 	cl_plock_acquire(&pm->osm->lock);
710 	node = osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
711 	if (!node) {
712 		OSM_LOG(pm->log, OSM_LOG_ERROR,
713 			"ERR 5407: Node \"%s\" (guid 0x%" PRIx64
714 			") no longer exists so removing from PerfMgr monitoring\n",
715 			mon_node->name, mon_node->guid);
716 		mark_for_removal(pm, mon_node);
717 		goto Exit;
718 	}
719 
720 	num_ports = osm_node_get_num_physp(node);
721 	node_guid = cl_ntoh64(node->node_info.node_guid);
722 
723 	/* make sure there is a database object ready to store this info */
724 	if (perfmgr_db_create_entry(pm->db, node_guid, mon_node->esp0,
725 				    num_ports, node->print_desc) !=
726 	    PERFMGR_EVENT_DB_SUCCESS) {
727 		OSM_LOG(pm->log, OSM_LOG_ERROR,
728 			"ERR 5408: DB create entry failed for 0x%"
729 			PRIx64 " (%s) : %s\n", node_guid, node->print_desc,
730 			strerror(errno));
731 		goto Exit;
732 	}
733 
734 	perfmgr_db_mark_active(pm->db, node_guid, TRUE);
735 
736 	/* issue the query for each port */
737 	for (port = mon_node->esp0 ? 0 : 1; port < num_ports; port++) {
738 		ib_net16_t lid;
739 
740 		if (!osm_node_get_physp_ptr(node, port))
741 			continue;
742 
743 		if (!mon_node->port[port].valid)
744 			continue;
745 
746 		lid = get_lid(node, port, mon_node);
747 		if (lid == 0) {
748 			OSM_LOG(pm->log, OSM_LOG_DEBUG, "WARN: node 0x%" PRIx64
749 				" port %d (%s): port out of range, skipping\n",
750 				cl_ntoh64(node->node_info.node_guid), port,
751 				node->print_desc);
752 			continue;
753 		}
754 
755 		remote_qp = get_qp(mon_node, port);
756 
757 		mad_context.perfmgr_context.node_guid = node_guid;
758 		mad_context.perfmgr_context.port = port;
759 		mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_GET;
760 
761 		if (pm->query_cpi && !mon_node->port[port].cpi_valid) {
762 			status = perfmgr_send_cpi_mad(pm, lid, remote_qp,
763 						mon_node->port[port].pkey_ix,
764 						port, &mad_context,
765 						0); /* FIXME SL != 0 */
766 			if (status != IB_SUCCESS)
767 				OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5410: "
768 					"Failed to issue ClassPortInfo query "
769 					"for node 0x%" PRIx64
770 					" port %d (%s)\n",
771 					node->node_info.node_guid, port,
772 					node->print_desc);
773 			if (mon_node->node_type == IB_NODE_TYPE_SWITCH)
774 				goto Exit; /* only need to issue 1 CPI query
775 						for switches */
776 		} else {
777 
778 #ifdef ENABLE_OSM_PERF_MGR_PROFILE
779 			gettimeofday(&mad_context.perfmgr_context.query_start, NULL);
780 #endif
781 			OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Getting stats for node 0x%"
782 				PRIx64 " port %d (lid %u) (%s)\n",
783 				node_guid, port, cl_ntoh16(lid),
784 				node->print_desc);
785 			status = perfmgr_send_pc_mad(pm, lid, remote_qp,
786 						     mon_node->port[port].pkey_ix,
787 						     port, IB_MAD_METHOD_GET,
788 						     0xffff,
789 						     1,
790 						     &mad_context,
791 						     0); /* FIXME SL != 0 */
792 			if (status != IB_SUCCESS)
793 				OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5409: "
794 					"Failed to issue port counter query for node 0x%"
795 					PRIx64 " port %d (%s)\n",
796 					node->node_info.node_guid, port,
797 					node->print_desc);
798 
799 			if (pce_supported(mon_node, port)) {
800 
801 #if ENABLE_OSM_PERF_MGR_PROFILE
802 				gettimeofday(&mad_context.perfmgr_context.query_start, NULL);
803 #endif
804 				status = perfmgr_send_pce_mad(pm, lid, remote_qp,
805 							      mon_node->port[port].pkey_ix,
806 							      port,
807 							      IB_MAD_METHOD_GET,
808 							      &mad_context,
809 							      0); /* FIXME SL != 0 */
810 				if (status != IB_SUCCESS)
811 					OSM_LOG(pm->log, OSM_LOG_ERROR,
812 						"ERR 5417: Failed to issue "
813 						"port counter query for "
814 						"node 0x%" PRIx64 " port "
815 						"%d (%s)\n",
816 						node->node_info.node_guid,
817 						port,
818 						node->print_desc);
819 			}
820 		}
821 	}
822 Exit:
823 	cl_plock_release(&pm->osm->lock);
824 	OSM_LOG_EXIT(pm->log);
825 }
826 
827 /**********************************************************************
828  * Discovery stuff
829  * This code should not be here, but merged with main OpenSM
830  **********************************************************************/
831 extern int wait_for_pending_transactions(osm_stats_t * stats);
832 extern void osm_drop_mgr_process(IN osm_sm_t * sm);
833 
834 static int sweep_hop_1(osm_sm_t * sm)
835 {
836 	ib_api_status_t status = IB_SUCCESS;
837 	osm_madw_context_t context;
838 	osm_node_t *p_node;
839 	osm_port_t *p_port;
840 	osm_dr_path_t hop_1_path;
841 	ib_net64_t port_guid;
842 	uint8_t port_num;
843 	uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX];
844 	uint8_t num_ports;
845 	osm_physp_t *p_ext_physp;
846 
847 	port_guid = sm->p_subn->sm_port_guid;
848 
849 	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
850 	if (!p_port) {
851 		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
852 			"ERR 5481: No SM port object\n");
853 		return -1;
854 	}
855 
856 	p_node = p_port->p_node;
857 	port_num = ib_node_info_get_local_port_num(&p_node->node_info);
858 
859 	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
860 		"Probing hop 1 on local port %u\n", port_num);
861 
862 	memset(path_array, 0, sizeof(path_array));
863 	/* the hop_1 operations depend on the type of our node.
864 	 * Currently - legal nodes that can host SM are SW and CA */
865 	switch (osm_node_get_type(p_node)) {
866 	case IB_NODE_TYPE_CA:
867 	case IB_NODE_TYPE_ROUTER:
868 		memset(&context, 0, sizeof(context));
869 		context.ni_context.node_guid = osm_node_get_node_guid(p_node);
870 		context.ni_context.port_num = port_num;
871 
872 		path_array[1] = port_num;
873 
874 		osm_dr_path_init(&hop_1_path, 1, path_array);
875 		CL_PLOCK_ACQUIRE(sm->p_lock);
876 		status = osm_req_get(sm, &hop_1_path, IB_MAD_ATTR_NODE_INFO, 0,
877 				     TRUE, 0, CL_DISP_MSGID_NONE, &context);
878 		CL_PLOCK_RELEASE(sm->p_lock);
879 
880 		if (status != IB_SUCCESS)
881 			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 5482: "
882 				"Request for NodeInfo failed\n");
883 		break;
884 
885 	case IB_NODE_TYPE_SWITCH:
886 		/* Need to go over all the ports of the switch, and send a node_info
887 		 * from them. This doesn't include the port 0 of the switch, which
888 		 * hosts the SM.
889 		 * Note: We'll send another switchInfo on port 0, since if no ports
890 		 * are connected, we still want to get some response, and have the
891 		 * subnet come up.
892 		 */
893 		num_ports = osm_node_get_num_physp(p_node);
894 		for (port_num = 0; port_num < num_ports; port_num++) {
895 			/* go through the port only if the port is not DOWN */
896 			p_ext_physp = osm_node_get_physp_ptr(p_node, port_num);
897 			if (!p_ext_physp || ib_port_info_get_port_state
898 			    (&p_ext_physp->port_info) <= IB_LINK_DOWN)
899 				continue;
900 
901 			memset(&context, 0, sizeof(context));
902 			context.ni_context.node_guid =
903 			    osm_node_get_node_guid(p_node);
904 			context.ni_context.port_num = port_num;
905 
906 			path_array[1] = port_num;
907 
908 			osm_dr_path_init(&hop_1_path, 1, path_array);
909 			CL_PLOCK_ACQUIRE(sm->p_lock);
910 			status = osm_req_get(sm, &hop_1_path,
911 					     IB_MAD_ATTR_NODE_INFO, 0, TRUE, 0,
912 					     CL_DISP_MSGID_NONE, &context);
913 			CL_PLOCK_RELEASE(sm->p_lock);
914 
915 			if (status != IB_SUCCESS)
916 				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 5484: "
917 					"Request for NodeInfo failed\n");
918 		}
919 		break;
920 
921 	default:
922 		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
923 			"ERR 5483: Unknown node type %d\n",
924 			osm_node_get_type(p_node));
925 	}
926 
927 	return status;
928 }
929 
930 static unsigned is_sm_port_down(osm_sm_t * sm)
931 {
932 	ib_net64_t port_guid;
933 	osm_port_t *p_port;
934 
935 	port_guid = sm->p_subn->sm_port_guid;
936 	if (port_guid == 0)
937 		return 1;
938 
939 	CL_PLOCK_ACQUIRE(sm->p_lock);
940 	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
941 	if (!p_port) {
942 		CL_PLOCK_RELEASE(sm->p_lock);
943 		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 5485: "
944 			"SM port with GUID:%016" PRIx64 " is unknown\n",
945 			cl_ntoh64(port_guid));
946 		return 1;
947 	}
948 	CL_PLOCK_RELEASE(sm->p_lock);
949 
950 	if (p_port->p_node->sw &&
951 	    !ib_switch_info_is_enhanced_port0(&p_port->p_node->sw->switch_info))
952 		return 0;	/* base SP0 */
953 
954 	return osm_physp_get_port_state(p_port->p_physp) == IB_LINK_DOWN;
955 }
956 
957 static int sweep_hop_0(osm_sm_t * sm)
958 {
959 	ib_api_status_t status;
960 	osm_dr_path_t dr_path;
961 	osm_bind_handle_t h_bind;
962 	uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX];
963 
964 	memset(path_array, 0, sizeof(path_array));
965 
966 	h_bind = osm_sm_mad_ctrl_get_bind_handle(&sm->mad_ctrl);
967 	if (h_bind == OSM_BIND_INVALID_HANDLE) {
968 		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "No bound ports\n");
969 		return -1;
970 	}
971 
972 	osm_dr_path_init(&dr_path, 0, path_array);
973 	CL_PLOCK_ACQUIRE(sm->p_lock);
974 	status = osm_req_get(sm, &dr_path, IB_MAD_ATTR_NODE_INFO, 0,
975 			     TRUE, 0, CL_DISP_MSGID_NONE, NULL);
976 	CL_PLOCK_RELEASE(sm->p_lock);
977 
978 	if (status != IB_SUCCESS)
979 		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
980 			"ERR 5486: Request for NodeInfo failed\n");
981 
982 	return status;
983 }
984 
985 static void reset_node_count(cl_map_item_t * p_map_item, void *cxt)
986 {
987 	osm_node_t *p_node = (osm_node_t *) p_map_item;
988 	p_node->discovery_count = 0;
989 
990 	memset(p_node->physp_discovered, 0,
991 	       sizeof(uint8_t) * p_node->physp_tbl_size);
992 }
993 
994 static void reset_port_count(cl_map_item_t * p_map_item, void *cxt)
995 {
996 	osm_port_t *p_port = (osm_port_t *) p_map_item;
997 	p_port->discovery_count = 0;
998 }
999 
1000 static void reset_switch_count(cl_map_item_t * p_map_item, void *cxt)
1001 {
1002 	osm_switch_t *p_sw = (osm_switch_t *) p_map_item;
1003 	p_sw->need_update = 0;
1004 }
1005 
1006 static int perfmgr_discovery(osm_opensm_t * osm)
1007 {
1008 	int ret;
1009 
1010 	CL_PLOCK_ACQUIRE(&osm->lock);
1011 	cl_qmap_apply_func(&osm->subn.node_guid_tbl, reset_node_count, NULL);
1012 	cl_qmap_apply_func(&osm->subn.port_guid_tbl, reset_port_count, NULL);
1013 	cl_qmap_apply_func(&osm->subn.sw_guid_tbl, reset_switch_count, NULL);
1014 	CL_PLOCK_RELEASE(&osm->lock);
1015 
1016 	osm->subn.in_sweep_hop_0 = TRUE;
1017 
1018 	ret = sweep_hop_0(&osm->sm);
1019 	if (ret)
1020 		goto _exit;
1021 
1022 	if (wait_for_pending_transactions(&osm->stats))
1023 		goto _exit;
1024 
1025 	if (is_sm_port_down(&osm->sm)) {
1026 		OSM_LOG(&osm->log, OSM_LOG_VERBOSE, "SM port is down\n");
1027 		goto _drop;
1028 	}
1029 
1030 	osm->subn.in_sweep_hop_0 = FALSE;
1031 
1032 	ret = sweep_hop_1(&osm->sm);
1033 	if (ret)
1034 		goto _exit;
1035 
1036 	if (wait_for_pending_transactions(&osm->stats))
1037 		goto _exit;
1038 
1039 _drop:
1040 	osm_drop_mgr_process(&osm->sm);
1041 
1042 _exit:
1043 	return ret;
1044 }
1045 
1046 /**********************************************************************
1047  * Main PerfMgr processor - query the performance counters
1048  **********************************************************************/
1049 void osm_perfmgr_process(osm_perfmgr_t * pm)
1050 {
1051 #ifdef ENABLE_OSM_PERF_MGR_PROFILE
1052 	struct timeval before, after;
1053 #endif
1054 
1055 	if (pm->state != PERFMGR_STATE_ENABLED)
1056 		return;
1057 
1058 	cl_spinlock_acquire(&pm->lock);
1059 	if (pm->sweep_state == PERFMGR_SWEEP_ACTIVE ||
1060 	    pm->sweep_state == PERFMGR_SWEEP_SUSPENDED ||
1061 	    pm->sweep_state == PERFMGR_SWEEP_POST_PROCESSING) {
1062 		cl_spinlock_release(&pm->lock);
1063 		OSM_LOG(pm->log, OSM_LOG_INFO,
1064 			"PM sweep state %d, skipping sweep\n",
1065 			pm->sweep_state);
1066 		return;
1067 	}
1068 
1069 	pm->sweep_state = PERFMGR_SWEEP_ACTIVE;
1070 	cl_spinlock_release(&pm->lock);
1071 
1072 	if (pm->subn->sm_state == IB_SMINFO_STATE_STANDBY ||
1073 	    pm->subn->sm_state == IB_SMINFO_STATE_NOTACTIVE)
1074 		perfmgr_discovery(pm->subn->p_osm);
1075 
1076 	/* if redirection enabled, determine local port */
1077 	if (pm->subn->opt.perfmgr_redir && pm->local_port == -1) {
1078 		osm_node_t *p_node;
1079 		osm_port_t *p_port;
1080 
1081 		CL_PLOCK_ACQUIRE(pm->sm->p_lock);
1082 		p_port = osm_get_port_by_guid(pm->subn, pm->port_guid);
1083 		if (p_port) {
1084 			p_node = p_port->p_node;
1085 			CL_ASSERT(p_node);
1086 			pm->local_port =
1087 			    ib_node_info_get_local_port_num(&p_node->node_info);
1088 		} else
1089 			OSM_LOG(pm->log, OSM_LOG_ERROR,
1090 				"ERR 5487: No PerfMgr port object for "
1091 				"port GUID 0x%" PRIx64 "\n",
1092 				cl_ntoh64(pm->port_guid));
1093 		CL_PLOCK_RELEASE(pm->sm->p_lock);
1094 	}
1095 
1096 #ifdef ENABLE_OSM_PERF_MGR_PROFILE
1097 	gettimeofday(&before, NULL);
1098 #endif
1099 	/* With the global lock held, collect the node guids */
1100 	/* FIXME we should be able to track SA notices
1101 	 * and not have to sweep the node_guid_tbl each pass
1102 	 */
1103 	OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Gathering PerfMgr stats\n");
1104 	cl_plock_acquire(&pm->osm->lock);
1105 	cl_qmap_apply_func(&pm->subn->node_guid_tbl, collect_guids, pm);
1106 	cl_plock_release(&pm->osm->lock);
1107 
1108 	/* then for each node query their counters */
1109 	cl_qmap_apply_func(&pm->monitored_map, perfmgr_query_counters, pm);
1110 
1111 	/* clean out any nodes found to be removed during the sweep */
1112 	remove_marked_nodes(pm);
1113 
1114 #ifdef ENABLE_OSM_PERF_MGR_PROFILE
1115 	gettimeofday(&after, NULL);
1116 	diff_time(&before, &after, &after);
1117 	osm_log_v2(pm->log, OSM_LOG_INFO, FILE_ID,
1118 		   "PerfMgr total sweep time : %ld.%06ld s\n"
1119 		   "        fastest mad      : %g us\n"
1120 		   "        slowest mad      : %g us\n"
1121 		   "        average mad      : %g us\n",
1122 		   after.tv_sec, after.tv_usec, perfmgr_mad_stats.fastest_us,
1123 		   perfmgr_mad_stats.slowest_us, perfmgr_mad_stats.avg_us);
1124 	clear_mad_stats();
1125 #endif
1126 
1127 	cl_spinlock_acquire(&pm->lock);
1128 	pm->sweep_state = PERFMGR_SWEEP_SLEEP;
1129 	cl_spinlock_release(&pm->lock);
1130 }
1131 
1132 /**********************************************************************
1133  * PerfMgr timer - loop continuously and signal SM to run PerfMgr
1134  * processor if enabled
1135  **********************************************************************/
1136 static void perfmgr_sweep(void *arg)
1137 {
1138 	osm_perfmgr_t *pm = arg;
1139 
1140 	osm_sm_signal(pm->sm, OSM_SIGNAL_PERFMGR_SWEEP);
1141 	cl_timer_start(&pm->sweep_timer, pm->sweep_time_s * 1000);
1142 }
1143 
1144 void osm_perfmgr_shutdown(osm_perfmgr_t * pm)
1145 {
1146 	OSM_LOG_ENTER(pm->log);
1147 	cl_timer_stop(&pm->sweep_timer);
1148 	cl_disp_unregister(pm->pc_disp_h);
1149 	perfmgr_mad_unbind(pm);
1150 	OSM_LOG_EXIT(pm->log);
1151 }
1152 
1153 void osm_perfmgr_destroy(osm_perfmgr_t * pm)
1154 {
1155 	OSM_LOG_ENTER(pm->log);
1156 	perfmgr_db_destroy(pm->db);
1157 	cl_timer_destroy(&pm->sweep_timer);
1158 	OSM_LOG_EXIT(pm->log);
1159 }
1160 
1161 /**********************************************************************
1162  * Detect if someone else on the network could have cleared the counters
1163  * without us knowing.  This is easy to detect because the counters never
1164  * wrap but are "sticky".
1165  *
1166  * The one time this will not work is if the port is getting errors fast
1167  * enough to have the reading overtake the previous reading.  In this case,
1168  * counters will be missed.
1169  **********************************************************************/
1170 static void perfmgr_check_oob_clear(osm_perfmgr_t * pm,
1171 				    monitored_node_t * mon_node, uint8_t port,
1172 				    perfmgr_db_err_reading_t * cr)
1173 {
1174 	perfmgr_db_err_reading_t prev_err;
1175 
1176 	if (perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_err)
1177 	    != PERFMGR_EVENT_DB_SUCCESS) {
1178 		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Failed to find previous "
1179 			"error reading for %s (guid 0x%" PRIx64 ") port %u\n",
1180 			mon_node->name, mon_node->guid, port);
1181 		return;
1182 	}
1183 
1184 	OSM_LOG(pm->log, OSM_LOG_DEBUG,
1185 		"Errors vs previous node %s (0x%" PRIx64 ") port %u\n"
1186 		"SE:   %"PRIu64" ?< %"PRIu64"\n"
1187 		"LE:   %"PRIu64" ?< %"PRIu64"\n"
1188 		"LD:   %"PRIu64" ?< %"PRIu64"\n"
1189 		"RE:   %"PRIu64" ?< %"PRIu64"\n"
1190 		"RPE:  %"PRIu64" ?< %"PRIu64"\n"
1191 		"SRE:  %"PRIu64" ?< %"PRIu64"\n"
1192 		"XD:   %"PRIu64" ?< %"PRIu64"\n"
1193 		"XCE:  %"PRIu64" ?< %"PRIu64"\n"
1194 		"RCE:  %"PRIu64" ?< %"PRIu64"\n"
1195 		"LI:   %"PRIu64" ?< %"PRIu64"\n"
1196 		"BO:   %"PRIu64" ?< %"PRIu64"\n"
1197 		"VL15: %"PRIu64" ?< %"PRIu64"\n"
1198 		"XW:   %"PRIu64" ?< %"PRIu64"\n"
1199 		,
1200 		mon_node->name, mon_node->guid, port,
1201 		cr->symbol_err_cnt, prev_err.symbol_err_cnt,
1202 		cr->link_err_recover, prev_err.link_err_recover,
1203 		cr->link_downed, prev_err.link_downed,
1204 		cr->rcv_err, prev_err.rcv_err,
1205 		cr->rcv_rem_phys_err, prev_err.rcv_rem_phys_err,
1206 		cr->rcv_switch_relay_err, prev_err.rcv_switch_relay_err,
1207 		cr->xmit_discards, prev_err.xmit_discards,
1208 		cr->xmit_constraint_err, prev_err.xmit_constraint_err,
1209 		cr->rcv_constraint_err, prev_err.rcv_constraint_err,
1210 		cr->link_integrity, prev_err.link_integrity,
1211 		cr->buffer_overrun, prev_err.buffer_overrun,
1212 		cr->vl15_dropped, prev_err.vl15_dropped,
1213 		cr->xmit_wait, prev_err.xmit_wait);
1214 
1215 	if (cr->symbol_err_cnt < prev_err.symbol_err_cnt ||
1216 	    cr->link_err_recover < prev_err.link_err_recover ||
1217 	    cr->link_downed < prev_err.link_downed ||
1218 	    cr->rcv_err < prev_err.rcv_err ||
1219 	    cr->rcv_rem_phys_err < prev_err.rcv_rem_phys_err ||
1220 	    cr->rcv_switch_relay_err < prev_err.rcv_switch_relay_err ||
1221 	    cr->xmit_discards < prev_err.xmit_discards ||
1222 	    cr->xmit_constraint_err < prev_err.xmit_constraint_err ||
1223 	    cr->rcv_constraint_err < prev_err.rcv_constraint_err ||
1224 	    cr->link_integrity < prev_err.link_integrity ||
1225 	    cr->buffer_overrun < prev_err.buffer_overrun ||
1226 	    cr->vl15_dropped < prev_err.vl15_dropped ||
1227 	    cr->xmit_wait < prev_err.xmit_wait) {
1228 		OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 540A: "
1229 			"Detected an out of band error clear "
1230 			"on %s (0x%" PRIx64 ") port %u\n",
1231 			mon_node->name, mon_node->guid, port);
1232 		perfmgr_db_clear_prev_err(pm->db, mon_node->guid, port);
1233 	}
1234 }
1235 
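/**********************************************************************
 * Return 1 if the counter value is "close" to overflowing, 0 otherwise.
 * For the 8, 16, 32 and 64 bit counters "close" means the value has
 * reached the top quarter of the range, i.e. val >= MAX - MAX / 4.
 * The 4 bit variant uses a fixed threshold of 10 (the maximum is 15).
 **********************************************************************/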
/* 4 bit counter: flag values of 10 and above as close to overflowing. */
static int counter_overflow_4(uint8_t val)
{
	if (val > 9)
		return 1;

	return 0;
}
1244 
/* 8 bit counter: flag values at or above UINT8_MAX - UINT8_MAX / 4 (192). */
static int counter_overflow_8(uint8_t val)
{
	const unsigned threshold = UINT8_MAX - (UINT8_MAX / 4);

	return val >= threshold ? 1 : 0;
}
1249 
1250 static int counter_overflow_16(ib_net16_t val)
1251 {
1252 	return (cl_ntoh16(val) >= (UINT16_MAX - (UINT16_MAX / 4)));
1253 }
1254 
1255 static int counter_overflow_32(ib_net32_t val)
1256 {
1257 	return (cl_ntoh32(val) >= (UINT32_MAX - (UINT32_MAX / 4)));
1258 }
1259 
1260 static int counter_overflow_64(ib_net64_t val)
1261 {
1262 	return (cl_ntoh64(val) >= (UINT64_MAX - (UINT64_MAX / 4)));
1263 }
1264 
1265 /**********************************************************************
1266  * Check if the port counters have overflowed and if so issue a clear
1267  * MAD to the port
1268  **********************************************************************/
1269 static void perfmgr_check_overflow(osm_perfmgr_t * pm,
1270 				   monitored_node_t * mon_node, int16_t pkey_ix,
1271 				   uint8_t port, ib_port_counters_t * pc,
1272 				   boolean_t xmit_wait_sup)
1273 {
1274 	osm_madw_context_t mad_context;
1275 	ib_api_status_t status;
1276 	ib_net32_t remote_qp;
1277 	uint16_t counter_select;
1278 	uint8_t counter_select2;
1279 
1280 	OSM_LOG_ENTER(pm->log);
1281 
1282 	if (counter_overflow_16(pc->symbol_err_cnt) ||
1283 	    counter_overflow_8(pc->link_err_recover) ||
1284 	    counter_overflow_8(pc->link_downed) ||
1285 	    counter_overflow_16(pc->rcv_err) ||
1286 	    counter_overflow_16(pc->rcv_rem_phys_err) ||
1287 	    counter_overflow_16(pc->rcv_switch_relay_err) ||
1288 	    counter_overflow_16(pc->xmit_discards) ||
1289 	    counter_overflow_8(pc->xmit_constraint_err) ||
1290 	    counter_overflow_8(pc->rcv_constraint_err) ||
1291 	    counter_overflow_4(PC_LINK_INT(pc->link_int_buffer_overrun)) ||
1292 	    counter_overflow_4(PC_BUF_OVERRUN(pc->link_int_buffer_overrun)) ||
1293 	    counter_overflow_16(pc->vl15_dropped) ||
1294 	    (xmit_wait_sup && counter_overflow_32(pc->xmit_wait)) ||
1295 	    (!pce_supported(mon_node, port) &&
1296 	    (counter_overflow_32(pc->xmit_data) ||
1297 	     counter_overflow_32(pc->rcv_data) ||
1298 	     counter_overflow_32(pc->xmit_pkts) ||
1299 	     counter_overflow_32(pc->rcv_pkts)))) {
1300 		osm_node_t *p_node = NULL;
1301 		ib_net16_t lid = 0;
1302 
1303 		if (!mon_node->port[port].valid)
1304 			goto Exit;
1305 
1306 		osm_log_v2(pm->log, OSM_LOG_VERBOSE, FILE_ID,
1307 			   "PerfMgr: Counter overflow: %s (0x%" PRIx64
1308 			   ") port %d; clearing counters\n",
1309 			   mon_node->name, mon_node->guid, port);
1310 
1311 		cl_plock_acquire(&pm->osm->lock);
1312 		p_node =
1313 		    osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
1314 		if (!p_node) {
1315 			OSM_LOG(pm->log, OSM_LOG_ERROR,
1316 				"ERR 5407: Node \"%s\" (guid 0x%" PRIx64
1317 				") no longer exists so removing from PerfMgr"
1318                                 " monitoring\n",
1319 				mon_node->name, mon_node->guid);
1320 			goto Exit;
1321 		}
1322 		lid = get_lid(p_node, port, mon_node);
1323 		cl_plock_release(&pm->osm->lock);
1324 		if (lid == 0) {
1325 			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 540C: "
1326 				"Failed to clear counters for %s (0x%"
1327 				PRIx64 ") port %d; failed to get lid\n",
1328 				mon_node->name, mon_node->guid, port);
1329 			goto Exit;
1330 		}
1331 
1332 		remote_qp = get_qp(NULL, port);
1333 
1334 		mad_context.perfmgr_context.node_guid = mon_node->guid;
1335 		mad_context.perfmgr_context.port = port;
1336 		mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_SET;
1337 
1338 		/* apparently some HW uses the same counters for the 32 and 64
1339 		 * bit versions and a clear of them in the PortCounters
1340 		 * attribute also clears the ExtendedPortCounters equivalant
1341 		 * counters
1342 		 */
1343 		if (pce_supported(mon_node, port))
1344 			counter_select = 0x0fff;
1345 		else
1346 			counter_select = 0xffff;
1347 
1348 		if (xmit_wait_sup)
1349 			counter_select2 = 1;
1350 		else
1351 			counter_select2 = 0;
1352 
1353 		status = perfmgr_send_pc_mad(pm, lid, remote_qp, pkey_ix,
1354 					     port, IB_MAD_METHOD_SET,
1355 					     counter_select,
1356 					     counter_select2,
1357 					     &mad_context,
1358 					     0); /* FIXME SL != 0 */
1359 		if (status != IB_SUCCESS)
1360 			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 5411: "
1361 				"Failed to send clear counters MAD for %s (0x%"
1362 				PRIx64 ") port %d\n",
1363 				mon_node->name, mon_node->guid, port);
1364 
1365 		perfmgr_db_clear_prev_err(pm->db, mon_node->guid, port);
1366 		if (!pce_supported(mon_node, port))
1367 			perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
1368 	}
1369 
1370 Exit:
1371 	OSM_LOG_EXIT(pm->log);
1372 }
1373 
1374 /**********************************************************************
1375  * Check if the port counters have overflowed and if so issue a clear
1376  * MAD to the port
1377  **********************************************************************/
1378 static void perfmgr_check_pce_overflow(osm_perfmgr_t * pm,
1379 				       monitored_node_t * mon_node,
1380 				       int16_t pkey_ix,
1381 				       uint8_t port,
1382 				       ib_port_counters_ext_t * pc)
1383 {
1384 	osm_madw_context_t mad_context;
1385 	ib_api_status_t status;
1386 	ib_net32_t remote_qp;
1387 
1388 	OSM_LOG_ENTER(pm->log);
1389 
1390 	if (counter_overflow_64(pc->xmit_data) ||
1391 	    counter_overflow_64(pc->rcv_data) ||
1392 	    counter_overflow_64(pc->xmit_pkts) ||
1393 	    counter_overflow_64(pc->rcv_pkts) ||
1394 	    (ietf_supported(mon_node, port) &&
1395 	    (counter_overflow_64(pc->unicast_xmit_pkts) ||
1396 	    counter_overflow_64(pc->unicast_rcv_pkts) ||
1397 	    counter_overflow_64(pc->multicast_xmit_pkts) ||
1398 	    counter_overflow_64(pc->multicast_rcv_pkts)))) {
1399 		osm_node_t *p_node = NULL;
1400 		ib_net16_t lid = 0;
1401 
1402 		if (!mon_node->port[port].valid)
1403 			goto Exit;
1404 
1405 		osm_log(pm->log, OSM_LOG_VERBOSE,
1406 			"PerfMgr: PortCountersExtended overflow: %s (0x%"
1407 			PRIx64 ") port %d; clearing counters\n",
1408 			mon_node->name, mon_node->guid, port);
1409 
1410 		cl_plock_acquire(&pm->osm->lock);
1411 		p_node =
1412 		    osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
1413 		if (!p_node) {
1414 			OSM_LOG(pm->log, OSM_LOG_ERROR,
1415 				"ERR 5407: Node \"%s\" (guid 0x%" PRIx64
1416 				") no longer exists so removing from PerfMgr"
1417                                 " monitoring\n",
1418 				mon_node->name, mon_node->guid);
1419 			goto Exit;
1420 		}
1421 		lid = get_lid(p_node, port, mon_node);
1422 		cl_plock_release(&pm->osm->lock);
1423 		if (lid == 0) {
1424 			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 5418: "
1425 				"Failed to clear counters for %s (0x%"
1426 				PRIx64 ") port %d; failed to get lid\n",
1427 				mon_node->name, mon_node->guid, port);
1428 			goto Exit;
1429 		}
1430 
1431 		remote_qp = get_qp(NULL, port);
1432 
1433 		mad_context.perfmgr_context.node_guid = mon_node->guid;
1434 		mad_context.perfmgr_context.port = port;
1435 		mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_SET;
1436 		/* clear port counters */
1437 		status = perfmgr_send_pce_mad(pm, lid, remote_qp, pkey_ix,
1438 					      port, IB_MAD_METHOD_SET,
1439 					      &mad_context,
1440 					      0); /* FIXME SL != 0 */
1441 		if (status != IB_SUCCESS)
1442 			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 5419: "
1443 				"Failed to send clear counters MAD for %s (0x%"
1444 				PRIx64 ") port %d\n",
1445 				mon_node->name, mon_node->guid, port);
1446 
1447 		perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
1448 	}
1449 
1450 Exit:
1451 	OSM_LOG_EXIT(pm->log);
1452 }
1453 
1454 /**********************************************************************
1455  * Check values for logging of errors
1456  **********************************************************************/
1457 static void perfmgr_log_errors(osm_perfmgr_t * pm,
1458 			       monitored_node_t * mon_node, uint8_t port,
1459 			       perfmgr_db_err_reading_t * reading)
1460 {
1461 	perfmgr_db_err_reading_t prev_read;
1462 	perfmgr_db_err_t err =
1463 	    perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_read);
1464 	uint64_t cur, prev;
1465 
1466 	if (err != PERFMGR_EVENT_DB_SUCCESS) {
1467 		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Failed to find previous "
1468 			"reading for %s (0x%" PRIx64 ") port %u\n",
1469 			mon_node->name, mon_node->guid, port);
1470 		return;
1471 	}
1472 
1473 #define LOG_ERR_CNT(errname, errnum, counter_name) \
1474 	if (reading->counter_name > prev_read.counter_name) { \
1475 		if (mon_node->port[port].remote_valid == TRUE) \
1476 			OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR %s: " \
1477 				"%s : %" PRIu64 " : node " \
1478 				"\"%s\" (NodeGUID: 0x%" PRIx64 ") : port %u " \
1479 				"connected to \"%s\" (NodeGUID: 0x%" PRIx64 ") : port %u\n", \
1480 				errnum, errname, \
1481 				reading->counter_name - prev_read.counter_name, \
1482 				mon_node->name, mon_node->guid, port, \
1483 				mon_node->port[port].remote_name, \
1484 				mon_node->port[port].remote_guid, \
1485 				mon_node->port[port].remote_port); \
1486 		else \
1487 			OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR %s: " \
1488 				"%s : %" PRIu64 " : node " \
1489 				"\"%s\" (NodeGUID: 0x%" PRIx64 ") : port %u\n", \
1490 				errnum, errname, \
1491 				reading->counter_name - prev_read.counter_name, \
1492 				mon_node->name, mon_node->guid, port); \
1493 	}
1494 
1495 	LOG_ERR_CNT("SymbolErrorCounter",           "5431", symbol_err_cnt);
1496 	LOG_ERR_CNT("LinkErrorRecoveryCounter",     "5432", link_err_recover);
1497 	LOG_ERR_CNT("LinkDownedCounter",            "5433", link_downed);
1498 	LOG_ERR_CNT("PortRcvErrors",                "5434", rcv_err);
1499 	LOG_ERR_CNT("PortRcvRemotePhysicalErrors",  "5435", rcv_rem_phys_err);
1500 	LOG_ERR_CNT("PortRcvSwitchRelayErrors",     "5436", rcv_switch_relay_err);
1501 	LOG_ERR_CNT("PortXmitDiscards",             "5437", xmit_discards);
1502 	LOG_ERR_CNT("PortXmitConstraintErrors",     "5438", xmit_constraint_err);
1503 	LOG_ERR_CNT("PortRcvConstraintErrors",      "5439", rcv_constraint_err);
1504 	LOG_ERR_CNT("LocalLinkIntegrityErrors",     "543A", link_integrity);
1505 	LOG_ERR_CNT("ExcessiveBufferOverrunErrors", "543B", buffer_overrun);
1506 	LOG_ERR_CNT("VL15Dropped",                  "543C", vl15_dropped);
1507 
1508 	cur = reading->xmit_wait;
1509 	prev = prev_read.xmit_wait;
1510 	if (pm->xmit_wait_log && cur > prev &&
1511 	    (cur - prev) >= pm->xmit_wait_threshold) {
1512 		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 543D: XmitWait : %" PRIu64
1513 			" : node \"%s\" (NodeGUID: 0x%" PRIx64 ") : port %u\n",
1514 			cur - prev, mon_node->name, mon_node->guid, port);
1515 	}
1516 }
1517 
1518 static int16_t validate_redir_pkey(osm_perfmgr_t *pm, ib_net16_t pkey)
1519 {
1520 	int16_t pkey_ix = -1;
1521 	osm_port_t *p_port;
1522 	osm_pkey_tbl_t *p_pkey_tbl;
1523 	ib_net16_t *p_orig_pkey;
1524 	uint16_t block;
1525 	uint8_t index;
1526 
1527 	OSM_LOG_ENTER(pm->log);
1528 
1529 	CL_PLOCK_ACQUIRE(pm->sm->p_lock);
1530 	p_port = osm_get_port_by_guid(pm->subn, pm->port_guid);
1531 	if (!p_port) {
1532 		CL_PLOCK_RELEASE(pm->sm->p_lock);
1533 		OSM_LOG(pm->log, OSM_LOG_ERROR,
1534 			"ERR 541E: No PerfMgr port object\n");
1535 		goto Exit;
1536 	}
1537 	if (p_port->p_physp && osm_physp_is_valid(p_port->p_physp)) {
1538 		p_pkey_tbl = &p_port->p_physp->pkeys;
1539 		if (!p_pkey_tbl) {
1540 			CL_PLOCK_RELEASE(pm->sm->p_lock);
1541 			OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1542 				"No PKey table found for PerfMgr port\n");
1543 			goto Exit;
1544 		}
1545 		p_orig_pkey = cl_map_get(&p_pkey_tbl->keys,
1546 					 ib_pkey_get_base(pkey));
1547 		if (!p_orig_pkey) {
1548 			CL_PLOCK_RELEASE(pm->sm->p_lock);
1549 			OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1550 				"PKey 0x%x not found for PerfMgr port\n",
1551 				cl_ntoh16(pkey));
1552 			goto Exit;
1553 		}
1554 		if (osm_pkey_tbl_get_block_and_idx(p_pkey_tbl, p_orig_pkey,
1555 						   &block, &index) == IB_SUCCESS) {
1556 			CL_PLOCK_RELEASE(pm->sm->p_lock);
1557 			pkey_ix = block * IB_NUM_PKEY_ELEMENTS_IN_BLOCK + index;
1558 		} else {
1559 			CL_PLOCK_RELEASE(pm->sm->p_lock);
1560 			OSM_LOG(pm->log, OSM_LOG_ERROR,
1561 				"ERR 541F: Failed to obtain P_Key 0x%04x "
1562 				"block and index for PerfMgr port\n",
1563 				cl_ntoh16(pkey));
1564 		}
1565 	} else {
1566 		CL_PLOCK_RELEASE(pm->sm->p_lock);
1567 		OSM_LOG(pm->log, OSM_LOG_ERROR,
1568 			"ERR 5420: Local PerfMgt port physp invalid\n");
1569 	}
1570 
1571 Exit:
1572 	OSM_LOG_EXIT(pm->log);
1573 	return pkey_ix;
1574 }
1575 
1576 static boolean_t handle_redirect(osm_perfmgr_t *pm,
1577 			    ib_class_port_info_t *cpi,
1578 			    monitored_node_t *p_mon_node,
1579 			    uint8_t port,
1580 			    osm_madw_context_t *mad_context)
1581 {
1582 	char gid_str[INET6_ADDRSTRLEN];
1583 	ib_api_status_t status;
1584 	boolean_t valid = TRUE;
1585 	int16_t pkey_ix = 0;
1586 	uint8_t mad_method;
1587 
1588 	OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1589 		"Redirection to LID %u GID %s QP 0x%x received\n",
1590 		cl_ntoh16(cpi->redir_lid),
1591 		inet_ntop(AF_INET6, cpi->redir_gid.raw, gid_str,
1592 			  sizeof gid_str), cl_ntoh32(cpi->redir_qp));
1593 
1594 	if (!pm->subn->opt.perfmgr_redir) {
1595 		OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1596 			"Redirection requested but disabled\n");
1597 		valid = FALSE;
1598 	}
1599 
1600 	/* valid redirection ? */
1601 	if (cpi->redir_lid == 0) {
1602 		if (!ib_gid_is_notzero(&cpi->redir_gid)) {
1603 			OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1604 				"Invalid redirection "
1605 				"(both redirect LID and GID are zero)\n");
1606 			valid = FALSE;
1607 		}
1608 	}
1609 	if (cpi->redir_qp == 0) {
1610 		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Invalid RedirectQP\n");
1611 		valid = FALSE;
1612 	}
1613 	if (cpi->redir_pkey == 0) {
1614 		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Invalid RedirectP_Key\n");
1615 		valid = FALSE;
1616 	}
1617 	if (cpi->redir_qkey != IB_QP1_WELL_KNOWN_Q_KEY) {
1618 		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Invalid RedirectQ_Key\n");
1619 		valid = FALSE;
1620 	}
1621 
1622 	pkey_ix = validate_redir_pkey(pm, cpi->redir_pkey);
1623 	if (pkey_ix == -1) {
1624 		OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1625 			"Index for Pkey 0x%x not found\n",
1626 			cl_ntoh16(cpi->redir_pkey));
1627 		valid = FALSE;
1628 	}
1629 
1630 	if (cpi->redir_lid == 0) {
1631 		/* GID redirection: get PathRecord information */
1632 		OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1633 			"GID redirection not currently supported\n");
1634 		goto Exit;
1635 	}
1636 
1637 	if (!valid)
1638 		goto Exit;
1639 
1640 	/* LID redirection support (easier than GID redirection) */
1641 	cl_plock_acquire(&pm->osm->lock);
1642 	p_mon_node->port[port].redirection = TRUE;
1643 	p_mon_node->port[port].valid = valid;
1644 	memcpy(&p_mon_node->port[port].gid, &cpi->redir_gid,
1645 	       sizeof(ib_gid_t));
1646 	p_mon_node->port[port].lid = cpi->redir_lid;
1647 	p_mon_node->port[port].qp = cpi->redir_qp;
1648 	p_mon_node->port[port].pkey = cpi->redir_pkey;
1649 	if (pkey_ix != -1)
1650 		p_mon_node->port[port].pkey_ix = pkey_ix;
1651 	cl_plock_release(&pm->osm->lock);
1652 
1653 	/* either */
1654 	if (pm->query_cpi)
1655 	{
1656 		/* issue a CPI query to the redirected location */
1657 		mad_method = IB_MAD_METHOD_GET;
1658 		p_mon_node->port[port].cpi_valid = FALSE;
1659 		status = perfmgr_send_cpi_mad(pm, cpi->redir_lid,
1660 						cpi->redir_qp, pkey_ix,
1661 						port, mad_context,
1662 						0); /* FIXME SL != 0 */
1663 	} else {
1664 		/* reissue the original query to the redirected location */
1665 		uint8_t counter_select2;
1666 
1667 		if (xmit_wait_supported(p_mon_node, port))
1668 			counter_select2 = 1;
1669 		else
1670 			counter_select2 = 0;
1671 
1672 		mad_method = mad_context->perfmgr_context.mad_method;
1673 		if (mad_context->perfmgr_context.mad_attr_id
1674 		    == IB_MAD_ATTR_PORT_CNTRS) {
1675 			status = perfmgr_send_pc_mad(pm, cpi->redir_lid,
1676 						     cpi->redir_qp,
1677 						     pkey_ix, port,
1678 						     mad_method,
1679 						     0xffff,
1680 						     counter_select2,
1681 						     mad_context,
1682 						     0); /* FIXME SL != 0 */
1683 		} else {
1684 			status = perfmgr_send_pce_mad(pm, cpi->redir_lid,
1685 						      cpi->redir_qp,
1686 						      pkey_ix, port,
1687 						      mad_method,
1688 						      mad_context,
1689 						      0); /* FIXME SL != 0 */
1690 		}
1691 	}
1692 	if (status != IB_SUCCESS)
1693 		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5414: "
1694 			"Failed to send redirected MAD "
1695 			"with method 0x%x for node %s "
1696 			"(NodeGuid 0x%" PRIx64 ") port %d\n",
1697 			mad_method, p_mon_node->name, p_mon_node->guid, port);
1698 Exit:
1699 	return (valid);
1700 }
1701 
1702 /**********************************************************************
1703  * Detect if someone else on the network could have cleared the counters
1704  * without us knowing.  This is easy to detect because the counters never
1705  * wrap but are "sticky" PortCountersExtended version.
1706  *
1707  * The one time this will not work is if the port is getting errors fast
1708  * enough to have the reading overtake the previous reading.  In this case,
1709  * counters will be missed.
1710  **********************************************************************/
1711 static void perfmgr_check_data_cnt_oob_clear(osm_perfmgr_t * pm,
1712 					monitored_node_t * mon_node,
1713 					uint8_t port,
1714 					perfmgr_db_data_cnt_reading_t * dc)
1715 {
1716 	perfmgr_db_data_cnt_reading_t prev_dc;
1717 
1718 	if (perfmgr_db_get_prev_dc(pm->db, mon_node->guid, port, &prev_dc)
1719 	    != PERFMGR_EVENT_DB_SUCCESS) {
1720 		OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1721 			"Failed to find previous data count "
1722 			"reading for %s (0x%" PRIx64 ") port %u\n",
1723 			mon_node->name, mon_node->guid, port);
1724 		return;
1725 	}
1726 
1727 	OSM_LOG(pm->log, OSM_LOG_DEBUG,
1728 		"Data vs previous node %s (0x%" PRIx64 ") port %u\n"
1729 		"TX:    %"PRIu64" ?< %"PRIu64"\n"
1730 		"RX:    %"PRIu64" ?< %"PRIu64"\n"
1731 		"TXP:   %"PRIu64" ?< %"PRIu64"\n"
1732 		"RXP:   %"PRIu64" ?< %"PRIu64"\n"
1733 		"UTXP:  %"PRIu64" ?< %"PRIu64"\n"
1734 		"URXP:  %"PRIu64" ?< %"PRIu64"\n"
1735 		"MTXP:  %"PRIu64" ?< %"PRIu64"\n"
1736 		"MRXP:  %"PRIu64" ?< %"PRIu64"\n"
1737 		,
1738 		mon_node->name, mon_node->guid, port,
1739 		dc->xmit_data, prev_dc.xmit_data,
1740 		dc->rcv_data, prev_dc.rcv_data,
1741 		dc->xmit_pkts, prev_dc.xmit_pkts,
1742 		dc->rcv_pkts, prev_dc.rcv_pkts,
1743 		dc->unicast_xmit_pkts, prev_dc.unicast_xmit_pkts,
1744 		dc->unicast_rcv_pkts, prev_dc.unicast_rcv_pkts,
1745 		dc->multicast_xmit_pkts, prev_dc.multicast_xmit_pkts,
1746 		dc->multicast_rcv_pkts, prev_dc.multicast_rcv_pkts);
1747 
1748 	if (dc->xmit_data < prev_dc.xmit_data ||
1749 	    dc->rcv_data < prev_dc.rcv_data ||
1750 	    dc->xmit_pkts < prev_dc.xmit_pkts ||
1751 	    dc->rcv_pkts < prev_dc.rcv_pkts ||
1752 	    (ietf_supported(mon_node, port) &&
1753 	    (dc->unicast_xmit_pkts < prev_dc.unicast_xmit_pkts ||
1754 	    dc->unicast_rcv_pkts < prev_dc.unicast_rcv_pkts ||
1755 	    dc->multicast_xmit_pkts < prev_dc.multicast_xmit_pkts ||
1756 	    dc->multicast_rcv_pkts < prev_dc.multicast_rcv_pkts))) {
1757 		OSM_LOG(pm->log, OSM_LOG_ERROR,
1758 			"PerfMgr: ERR 540B: Detected an out of band data counter "
1759 			"clear on node %s (0x%" PRIx64 ") port %u\n",
1760 			mon_node->name, mon_node->guid, port);
1761 
1762 		perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
1763 	}
1764 }
1765 
1766 /**********************************************************************
1767  * The dispatcher uses a thread pool which will call this function when
1768  * there is a thread available to process the mad received on the wire
1769  **********************************************************************/
1770 static void pc_recv_process(void *context, void *data)
1771 {
1772 	osm_perfmgr_t *pm = context;
1773 	osm_madw_t *p_madw = data;
1774 	osm_madw_context_t *mad_context = &p_madw->context;
1775 	ib_mad_t *p_mad = osm_madw_get_mad_ptr(p_madw);
1776 	uint64_t node_guid = mad_context->perfmgr_context.node_guid;
1777 	uint8_t port = mad_context->perfmgr_context.port;
1778 	perfmgr_db_err_reading_t err_reading;
1779 	perfmgr_db_data_cnt_reading_t data_reading;
1780 	cl_map_item_t *p_node;
1781 	monitored_node_t *p_mon_node;
1782 	ib_class_port_info_t *cpi = NULL;
1783 
1784 	OSM_LOG_ENTER(pm->log);
1785 
1786 	/*
1787 	 * get the monitored node struct to have the printable name
1788 	 * for log messages
1789 	 */
1790 	if ((p_node = cl_qmap_get(&pm->monitored_map, node_guid)) ==
1791 	    cl_qmap_end(&pm->monitored_map)) {
1792 		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5412: GUID 0x%016"
1793 			PRIx64 " not found in monitored map\n", node_guid);
1794 		goto Exit;
1795 	}
1796 	p_mon_node = (monitored_node_t *) p_node;
1797 
1798 	OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1799 		"Processing received MAD status 0x%x context 0x%"
1800 		PRIx64 " port %u\n", cl_ntoh16(p_mad->status), node_guid, port);
1801 
1802 	CL_ASSERT(p_mad->attr_id == IB_MAD_ATTR_PORT_CNTRS ||
1803 		  p_mad->attr_id == IB_MAD_ATTR_PORT_CNTRS_EXT ||
1804 		  p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO);
1805 
1806 	cl_plock_acquire(&pm->osm->lock);
1807 	/* validate port number */
1808 	if (port >= p_mon_node->num_ports) {
1809 		cl_plock_release(&pm->osm->lock);
1810 		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 5413: "
1811 			"Invalid port num %d for GUID 0x%016"
1812 			PRIx64 " num ports %d\n", port, node_guid,
1813 			p_mon_node->num_ports);
1814 		goto Exit;
1815 	}
1816 	cl_plock_release(&pm->osm->lock);
1817 
1818 	/* capture CLASS_PORT_INFO data */
1819 	if (p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO) {
1820 		boolean_t cpi_valid = TRUE;
1821 
1822 		cpi = (ib_class_port_info_t *) &
1823 		    (osm_madw_get_perfmgt_mad_ptr(p_madw)->data);
1824 
1825 		/* Response could be redirection (IBM eHCA PMA does this) */
1826 		if (p_mad->status & IB_MAD_STATUS_REDIRECT)
1827 			cpi_valid = handle_redirect(pm, cpi, p_mon_node, port,
1828 							mad_context);
1829 
1830 		if (pm->query_cpi && cpi_valid) {
1831 			cl_plock_acquire(&pm->osm->lock);
1832 			if (p_mon_node->node_type == IB_NODE_TYPE_SWITCH) {
1833 				int i;
1834 				for (i = p_mon_node->esp0 ? 0 : 1;
1835 				     i < p_mon_node->num_ports;
1836 				     i++) {
1837 					p_mon_node->port[i].cap_mask = cpi->cap_mask;
1838 					p_mon_node->port[i].cpi_valid = cpi_valid;
1839 				}
1840 			} else {
1841 				p_mon_node->port[port].cap_mask = cpi->cap_mask;
1842 				p_mon_node->port[port].cpi_valid = cpi_valid;
1843 			}
1844 			cl_plock_release(&pm->osm->lock);
1845 		}
1846 		goto Exit;
1847 	}
1848 
1849 	if (p_mad->attr_id == IB_MAD_ATTR_PORT_CNTRS_EXT) {
1850 		ib_port_counters_ext_t *ext_wire_read =
1851 				(ib_port_counters_ext_t *)
1852 				&osm_madw_get_perfmgt_mad_ptr(p_madw)->data;
1853 
1854 		/* convert wire data to perfmgr data counter reading */
1855 		perfmgr_db_fill_data_cnt_read_pce(ext_wire_read, &data_reading,
1856 						  ietf_supported(p_mon_node,
1857 								 port));
1858 
1859 		/* add counter */
1860 		if (mad_context->perfmgr_context.mad_method
1861 		    == IB_MAD_METHOD_GET) {
1862 			/* detect an out of band clear on the port */
1863 			perfmgr_check_data_cnt_oob_clear(pm, p_mon_node, port,
1864 						    &data_reading);
1865 
1866 			perfmgr_db_add_dc_reading(pm->db, node_guid, port,
1867 						  &data_reading,
1868 						  ietf_supported(p_mon_node,
1869 								 port));
1870 		} else {
1871 			perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
1872 		}
1873 
1874 		perfmgr_check_pce_overflow(pm, p_mon_node,
1875 					   p_mon_node->port[port].pkey_ix,
1876 					   port, ext_wire_read);
1877 	} else {
1878 		boolean_t pce_sup = pce_supported(p_mon_node, port);
1879 		boolean_t xmit_wait_sup = xmit_wait_supported(p_mon_node, port);
1880 		ib_port_counters_t *wire_read =
1881 				(ib_port_counters_t *)
1882 				&osm_madw_get_perfmgt_mad_ptr(p_madw)->data;
1883 
1884 		perfmgr_db_fill_err_read(wire_read, &err_reading, xmit_wait_sup);
1885 		if (!pce_sup)
1886 			perfmgr_db_fill_data_cnt_read_pc(wire_read, &data_reading);
1887 
1888 		if (mad_context->perfmgr_context.mad_method == IB_MAD_METHOD_GET) {
1889 			/* detect an out of band clear on the port */
1890 			perfmgr_check_oob_clear(pm, p_mon_node, port, &err_reading);
1891 			if (!pce_sup)
1892 				perfmgr_check_data_cnt_oob_clear(pm, p_mon_node, port,
1893 							    &data_reading);
1894 
1895 			/* log errors from this reading */
1896 			if (pm->subn->opt.perfmgr_log_errors)
1897 				perfmgr_log_errors(pm, p_mon_node, port, &err_reading);
1898 
1899 			perfmgr_db_add_err_reading(pm->db, node_guid, port,
1900 						   &err_reading);
1901 			if (!pce_sup)
1902 				perfmgr_db_add_dc_reading(pm->db, node_guid, port,
1903 							  &data_reading, 0);
1904 		} else {
1905 			perfmgr_db_clear_prev_err(pm->db, node_guid, port);
1906 			if (!pce_sup)
1907 				perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
1908 		}
1909 
1910 		perfmgr_check_overflow(pm, p_mon_node, p_mon_node->port[port].pkey_ix,
1911 				       port, wire_read, xmit_wait_sup);
1912 
1913 	}
1914 
1915 #ifdef ENABLE_OSM_PERF_MGR_PROFILE
1916 	do {
1917 		struct timeval proc_time;
1918 		gettimeofday(&proc_time, NULL);
1919 		diff_time(&p_madw->context.perfmgr_context.query_start,
1920 			  &proc_time, &proc_time);
1921 		update_mad_stats(&proc_time);
1922 	} while (0);
1923 #endif
1924 
1925 Exit:
1926 	osm_mad_pool_put(pm->mad_pool, p_madw);
1927 
1928 	OSM_LOG_EXIT(pm->log);
1929 }
1930 
1931 /**********************************************************************
1932  * Initialize the PerfMgr object
1933  **********************************************************************/
1934 ib_api_status_t osm_perfmgr_init(osm_perfmgr_t * pm, osm_opensm_t * osm,
1935 				 const osm_subn_opt_t * p_opt)
1936 {
1937 	ib_api_status_t status;
1938 
1939 	OSM_LOG_ENTER(&osm->log);
1940 
1941 	OSM_LOG(&osm->log, OSM_LOG_VERBOSE, "Initializing PerfMgr\n");
1942 
1943 	memset(pm, 0, sizeof(*pm));
1944 
1945 	pm->subn = &osm->subn;
1946 	pm->sm = &osm->sm;
1947 	pm->log = &osm->log;
1948 	pm->mad_pool = &osm->mad_pool;
1949 	pm->vendor = osm->p_vendor;
1950 	pm->trans_id = PERFMGR_INITIAL_TID_VALUE;
1951 	pm->state =
1952 	    p_opt->perfmgr ? PERFMGR_STATE_ENABLED : PERFMGR_STATE_DISABLE;
1953 	pm->sweep_state = PERFMGR_SWEEP_SLEEP;
1954 	status = cl_spinlock_init(&pm->lock);
1955 	if (status != IB_SUCCESS)
1956 		goto Exit;
1957 	pm->sweep_time_s = p_opt->perfmgr_sweep_time_s;
1958 	pm->max_outstanding_queries = p_opt->perfmgr_max_outstanding_queries;
1959 	pm->ignore_cas = p_opt->perfmgr_ignore_cas;
1960 	pm->osm = osm;
1961 	pm->local_port = -1;
1962 
1963 	status = cl_timer_init(&pm->sweep_timer, perfmgr_sweep, pm);
1964 	if (status != IB_SUCCESS)
1965 		goto Exit;
1966 
1967 	status = IB_INSUFFICIENT_RESOURCES;
1968 	pm->db = perfmgr_db_construct(pm);
1969 	if (!pm->db) {
1970 		pm->state = PERFMGR_STATE_NO_DB;
1971 		goto Exit;
1972 	}
1973 
1974 	pm->pc_disp_h = cl_disp_register(&osm->disp, OSM_MSG_MAD_PORT_COUNTERS,
1975 					 pc_recv_process, pm);
1976 	if (pm->pc_disp_h == CL_DISP_INVALID_HANDLE) {
1977 		perfmgr_db_destroy(pm->db);
1978 		goto Exit;
1979 	}
1980 
1981 	init_monitored_nodes(pm);
1982 
1983 	if (pm->state == PERFMGR_STATE_ENABLED)
1984 		cl_timer_start(&pm->sweep_timer, pm->sweep_time_s * 1000);
1985 
1986 	pm->rm_nodes = p_opt->perfmgr_rm_nodes;
1987 	pm->query_cpi = p_opt->perfmgr_query_cpi;
1988 	pm->xmit_wait_log = p_opt->perfmgr_xmit_wait_log;
1989 	pm->xmit_wait_threshold = p_opt->perfmgr_xmit_wait_threshold;
1990 	status = IB_SUCCESS;
1991 Exit:
1992 	OSM_LOG_EXIT(pm->log);
1993 	return status;
1994 }
1995 
1996 /**********************************************************************
1997  * Clear the counters from the db
1998  **********************************************************************/
1999 void osm_perfmgr_clear_counters(osm_perfmgr_t * pm)
2000 {
2001 	/**
2002 	 * FIXME todo issue clear on the fabric?
2003 	 */
2004 	perfmgr_db_clear_counters(pm->db);
2005 	osm_log_v2(pm->log, OSM_LOG_INFO, FILE_ID, "PerfMgr counters cleared\n");
2006 }
2007 
2008 /*******************************************************************
2009  * Dump the DB information to the file specified
2010  *******************************************************************/
2011 void osm_perfmgr_dump_counters(osm_perfmgr_t * pm, perfmgr_db_dump_t dump_type)
2012 {
2013 	char path[256];
2014 	char *file_name;
2015 	if (pm->subn->opt.event_db_dump_file)
2016 		file_name = pm->subn->opt.event_db_dump_file;
2017 	else {
2018 		snprintf(path, sizeof(path), "%s/%s",
2019 			 pm->subn->opt.dump_files_dir,
2020 			 OSM_PERFMGR_DEFAULT_DUMP_FILE);
2021 		file_name = path;
2022 	}
2023 	if (perfmgr_db_dump(pm->db, file_name, dump_type) != 0)
2024 		OSM_LOG(pm->log, OSM_LOG_ERROR, "Failed to dump file %s : %s",
2025 			file_name, strerror(errno));
2026 }
2027 
2028 /*******************************************************************
2029  * Print the DB information to the fp specified
2030  *******************************************************************/
2031 void osm_perfmgr_print_counters(osm_perfmgr_t * pm, char *nodename, FILE * fp,
2032 				char *port, int err_only)
2033 {
2034 	if (nodename) {
2035 		char *end = NULL;
2036 		uint64_t guid = strtoull(nodename, &end, 0);
2037 		if (nodename + strlen(nodename) != end)
2038 			perfmgr_db_print_by_name(pm->db, nodename, fp, port,
2039 						 err_only);
2040 		else
2041 			perfmgr_db_print_by_guid(pm->db, guid, fp, port,
2042 						 err_only);
2043 	} else
2044 		perfmgr_db_print_all(pm->db, fp, err_only);
2045 }
2046 
2047 void osm_perfmgr_update_nodename(osm_perfmgr_t *pm, uint64_t node_guid,
2048 				char *nodename)
2049 {
2050 	if (pm->db)
2051 		perfmgr_db_update_name(pm->db, node_guid, nodename);
2052 }
2053 #endif				/* ENABLE_OSM_PERF_MGR */
2054