1 /*
2  * Copyright (c) 2006-2009 Voltaire, Inc. All rights reserved.
3  * Copyright (c) 2009 HNR Consulting. All rights reserved.
4  * Copyright (c) 2012 Lawrence Livermore National Lab.  All rights reserved.
5  * Copyright (c) 2014 Mellanox Technologies LTD. All rights reserved.
6  *
7  * This software is available to you under a choice of one of two
8  * licenses.  You may choose to be licensed under the terms of the GNU
9  * General Public License (GPL) Version 2, available from the file
10  * COPYING in the main directory of this source tree, or the
11  * OpenIB.org BSD license below:
12  *
13  *     Redistribution and use in source and binary forms, with or
14  *     without modification, are permitted provided that the following
15  *     conditions are met:
16  *
17  *      - Redistributions of source code must retain the above
18  *        copyright notice, this list of conditions and the following
19  *        disclaimer.
20  *
21  *      - Redistributions in binary form must reproduce the above
22  *        copyright notice, this list of conditions and the following
23  *        disclaimer in the documentation and/or other materials
24  *        provided with the distribution.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
30  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
31  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
32  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33  * SOFTWARE.
34  *
35  */
36 
37 /*
38  * Abstract:
39  *    OSM Congestion Control configuration implementation
40  *
41  * Author:
42  *    Albert Chu, LLNL
43  */
44 
45 #if HAVE_CONFIG_H
46 #  include <config.h>
47 #endif				/* HAVE_CONFIG_H */
48 
49 #include <stdlib.h>
50 #include <string.h>
51 
52 #include <iba/ib_types.h>
53 #include <complib/cl_debug.h>
54 #include <opensm/osm_file_ids.h>
55 #define FILE_ID OSM_FILE_CONGESTION_CONTROL_C
56 #include <opensm/osm_subnet.h>
57 #include <opensm/osm_opensm.h>
58 #include <opensm/osm_log.h>
59 #include <opensm/osm_subnet.h>
60 #include <opensm/osm_congestion_control.h>
61 
62 #define CONGESTION_CONTROL_INITIAL_TID_VALUE 0x7A93
63 
/*
 * Fill in the common CC MAD header and GSI addressing for a Set()
 * request and hand the MAD to the cc_poller thread via mad_queue.
 *
 * The MAD is addressed to QP1 at the base LID of p_node/port (port
 * taken from p_physp).  The request context records node GUID, port
 * GUID, port number, method and attr_mod so that the receive and
 * send-error callbacks can locate the port again.  This function does
 * not transmit anything itself; it enqueues the MAD and wakes the
 * poller.
 */
static void cc_mad_post(osm_congestion_control_t *p_cc,
			osm_madw_t *p_madw,
			osm_node_t *p_node,
			osm_physp_t *p_physp,
			ib_net16_t attr_id,
			ib_net32_t attr_mod)
{
	osm_subn_opt_t *p_opt = &p_cc->subn->opt;
	ib_cc_mad_t *p_cc_mad;
	uint8_t port;

	OSM_LOG_ENTER(p_cc->log);

	port = osm_physp_get_port_num(p_physp);

	p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);

	p_cc_mad->header.base_ver = 1;
	p_cc_mad->header.mgmt_class = IB_MCLASS_CC;
	p_cc_mad->header.class_ver = 2;
	p_cc_mad->header.method = IB_MAD_METHOD_SET;
	p_cc_mad->header.status = 0;
	p_cc_mad->header.class_spec = 0;
	/* TID comes from an atomically incremented counter, masked to the
	 * low 32 bits */
	p_cc_mad->header.trans_id =
		cl_hton64((uint64_t) cl_atomic_inc(&p_cc->trans_id) &
			  (uint64_t) (0xFFFFFFFF));
	/* skip a TID of 0 when the 32-bit counter wraps */
	if (p_cc_mad->header.trans_id == 0)
		p_cc_mad->header.trans_id =
			cl_hton64((uint64_t) cl_atomic_inc(&p_cc->trans_id) &
				  (uint64_t) (0xFFFFFFFF));
	p_cc_mad->header.attr_id = attr_id;
	p_cc_mad->header.resv = 0;
	p_cc_mad->header.attr_mod = attr_mod;

	/* CC key from configuration authenticates the request */
	p_cc_mad->cc_key = p_opt->cc_key;

	memset(p_cc_mad->log_data, '\0', IB_CC_LOG_DATA_SIZE);

	p_madw->mad_addr.dest_lid = osm_node_get_base_lid(p_node, port);
	p_madw->mad_addr.addr_type.gsi.remote_qp = IB_QP1;
	p_madw->mad_addr.addr_type.gsi.remote_qkey =
		cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY);
	p_madw->resp_expected = TRUE;
	p_madw->fail_msg = CL_DISP_MSGID_NONE;

	/* context consumed later by cc_rcv_mad/cc_mad_send_err_callback */
	p_madw->context.cc_context.node_guid = osm_node_get_node_guid(p_node);
	p_madw->context.cc_context.port_guid = osm_physp_get_port_guid(p_physp);
	p_madw->context.cc_context.port = port;
	p_madw->context.cc_context.mad_method = IB_MAD_METHOD_SET;
	p_madw->context.cc_context.attr_mod = attr_mod;

	/* count the MAD as outstanding while the queue lock is held, then
	 * wake the poller thread to transmit it */
	cl_spinlock_acquire(&p_cc->mad_queue_lock);
	cl_atomic_inc(&p_cc->outstanding_mads);
	cl_qlist_insert_tail(&p_cc->mad_queue, &p_madw->list_item);
	cl_spinlock_release(&p_cc->mad_queue_lock);

	cl_event_signal(&p_cc->cc_poller_wakeup);

	OSM_LOG_EXIT(p_cc->log);
}
124 
/*
 * Translate the congestion-control options from the subnet
 * configuration into wire-format attribute images cached in p_cc:
 * the SwitchCongestionSetting, the CACongestionSetting and the
 * CongestionControlTable blocks.  Also computes cc_tbl_mads, the
 * number of table MADs needed to carry all configured CCT entries.
 */
static void cc_setup_mad_data(osm_sm_t * p_sm)
{
	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
	osm_subn_opt_t *p_opt = &p_sm->p_subn->opt;
	uint16_t ccti_limit;
	int i;

	/* Switch Congestion Setting */
	p_cc->sw_cong_setting.control_map = p_opt->cc_sw_cong_setting_control_map;

	memcpy(p_cc->sw_cong_setting.victim_mask,
	       p_opt->cc_sw_cong_setting_victim_mask,
	       IB_CC_PORT_MASK_DATA_SIZE);

	memcpy(p_cc->sw_cong_setting.credit_mask,
	       p_opt->cc_sw_cong_setting_credit_mask,
	       IB_CC_PORT_MASK_DATA_SIZE);

	/* threshold is 4 bits, takes up upper nibble of byte */
	p_cc->sw_cong_setting.threshold_resv = (p_opt->cc_sw_cong_setting_threshold << 4);

	p_cc->sw_cong_setting.packet_size = p_opt->cc_sw_cong_setting_packet_size;

	/* cs threshold is 4 bits, takes up upper nibble of short */
	p_cc->sw_cong_setting.cs_threshold_resv =
		cl_hton16(p_opt->cc_sw_cong_setting_credit_starvation_threshold << 12);

	/* return delay is packed as 2-bit shift (bits 15:14) plus
	 * 14-bit multiplier */
	p_cc->sw_cong_setting.cs_return_delay =
		cl_hton16(p_opt->cc_sw_cong_setting_credit_starvation_return_delay.shift << 14
			  | p_opt->cc_sw_cong_setting_credit_starvation_return_delay.multiplier);

	p_cc->sw_cong_setting.marking_rate = p_opt->cc_sw_cong_setting_marking_rate;

	/* CA Congestion Setting */
	p_cc->ca_cong_setting.port_control = p_opt->cc_ca_cong_setting_port_control;
	p_cc->ca_cong_setting.control_map = p_opt->cc_ca_cong_setting_control_map;

	/* copy per-SL CA congestion entries, zeroing reserved fields */
	for (i = 0; i < IB_CA_CONG_ENTRY_DATA_SIZE; i++) {
		ib_ca_cong_entry_t *p_entry;

		p_entry = &p_cc->ca_cong_setting.entry_list[i];

		p_entry->ccti_timer = p_opt->cc_ca_cong_entries[i].ccti_timer;
		p_entry->ccti_increase = p_opt->cc_ca_cong_entries[i].ccti_increase;
		p_entry->trigger_threshold = p_opt->cc_ca_cong_entries[i].trigger_threshold;
		p_entry->ccti_min = p_opt->cc_ca_cong_entries[i].ccti_min;
		p_entry->resv0 = 0;
		p_entry->resv1 = 0;
	}

	/* Congestion Control Table */

	/* if no entries, we will always send at least 1 mad to set ccti_limit = 0 */
	if (!p_opt->cc_cct.entries_len)
		p_cc->cc_tbl_mads = 1;
	else {
		/* ceil(entries_len / IB_CC_TBL_ENTRY_LIST_MAX) */
		p_cc->cc_tbl_mads = p_opt->cc_cct.entries_len - 1;
		p_cc->cc_tbl_mads /= IB_CC_TBL_ENTRY_LIST_MAX;
		p_cc->cc_tbl_mads += 1;
	}

	CL_ASSERT(p_cc->cc_tbl_mads <= OSM_CCT_ENTRY_MAD_BLOCKS);

	/* ccti_limit is the index of the last valid entry, or 0 when the
	 * table is empty */
	if (!p_opt->cc_cct.entries_len)
		ccti_limit = 0;
	else
		ccti_limit = p_opt->cc_cct.entries_len - 1;

	for (i = 0; i < p_cc->cc_tbl_mads; i++) {
		int j;

		p_cc->cc_tbl[i].ccti_limit = cl_hton16(ccti_limit);
		p_cc->cc_tbl[i].resv = 0;

		memset(p_cc->cc_tbl[i].entry_list,
		       '\0',
		       sizeof(p_cc->cc_tbl[i].entry_list));

		/* empty table: a single zeroed block is all we send */
		if (!ccti_limit)
			break;

		/* each entry packs 2-bit shift (bits 15:14) with 14-bit
		 * multiplier; k indexes the flat configured entry array */
		for (j = 0; j < IB_CC_TBL_ENTRY_LIST_MAX; j++) {
			int k;

			k = (i * IB_CC_TBL_ENTRY_LIST_MAX) + j;
			p_cc->cc_tbl[i].entry_list[j].shift_multiplier =
				cl_hton16(p_opt->cc_cct.entries[k].shift << 14
					  | p_opt->cc_cct.entries[k].multiplier);
		}
	}
}
216 
217 static ib_api_status_t cc_send_sw_cong_setting(osm_sm_t * p_sm,
218 					       osm_node_t *p_node)
219 {
220 	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
221 	unsigned force_update;
222 	osm_physp_t *p_physp;
223 	osm_madw_t *p_madw = NULL;
224 	ib_cc_mad_t *p_cc_mad = NULL;
225 	ib_sw_cong_setting_t *p_sw_cong_setting = NULL;
226 
227 	OSM_LOG_ENTER(p_sm->p_log);
228 
229 	p_physp = osm_node_get_physp_ptr(p_node, 0);
230 
231 	force_update = p_physp->need_update || p_sm->p_subn->need_update;
232 
233 	if (!force_update
234 	    && !memcmp(&p_cc->sw_cong_setting,
235 		       &p_physp->cc.sw.sw_cong_setting,
236 		       sizeof(p_cc->sw_cong_setting)))
237 		return IB_SUCCESS;
238 
239 	p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle,
240 				  MAD_BLOCK_SIZE, NULL);
241 	if (p_madw == NULL) {
242 		OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C101: "
243 			"failed to allocate mad\n");
244 		return IB_INSUFFICIENT_MEMORY;
245 	}
246 
247 	p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);
248 
249 	p_sw_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
250 
251 	memcpy(p_sw_cong_setting,
252 	       &p_cc->sw_cong_setting,
253 	       sizeof(p_cc->sw_cong_setting));
254 
255 	cc_mad_post(p_cc, p_madw, p_node, p_physp,
256 		    IB_MAD_ATTR_SW_CONG_SETTING, 0);
257 
258 	OSM_LOG_EXIT(p_sm->p_log);
259 
260 	return IB_SUCCESS;
261 }
262 
263 static ib_api_status_t cc_send_ca_cong_setting(osm_sm_t * p_sm,
264 					       osm_node_t *p_node,
265 					       osm_physp_t *p_physp)
266 {
267 	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
268 	unsigned force_update;
269 	osm_madw_t *p_madw = NULL;
270 	ib_cc_mad_t *p_cc_mad = NULL;
271 	ib_ca_cong_setting_t *p_ca_cong_setting = NULL;
272 
273 	OSM_LOG_ENTER(p_sm->p_log);
274 
275 	force_update = p_physp->need_update || p_sm->p_subn->need_update;
276 
277 	if (!force_update
278 	    && !memcmp(&p_cc->ca_cong_setting,
279 		       &p_physp->cc.ca.ca_cong_setting,
280 		       sizeof(p_cc->ca_cong_setting)))
281 		return IB_SUCCESS;
282 
283 	p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle,
284 				  MAD_BLOCK_SIZE, NULL);
285 	if (p_madw == NULL) {
286 		OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C102: "
287 			"failed to allocate mad\n");
288 		return IB_INSUFFICIENT_MEMORY;
289 	}
290 
291 	p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);
292 
293 	p_ca_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
294 
295 	memcpy(p_ca_cong_setting,
296 	       &p_cc->ca_cong_setting,
297 	       sizeof(p_cc->ca_cong_setting));
298 
299 	cc_mad_post(p_cc, p_madw, p_node, p_physp,
300 		    IB_MAD_ATTR_CA_CONG_SETTING, 0);
301 
302 	OSM_LOG_EXIT(p_sm->p_log);
303 
304 	return IB_SUCCESS;
305 }
306 
307 static ib_api_status_t cc_send_cct(osm_sm_t * p_sm,
308 				   osm_node_t *p_node,
309 				   osm_physp_t *p_physp)
310 {
311 	osm_congestion_control_t *p_cc = &p_sm->p_subn->p_osm->cc;
312 	unsigned force_update;
313 	osm_madw_t *p_madw = NULL;
314 	ib_cc_mad_t *p_cc_mad = NULL;
315 	ib_cc_tbl_t *p_cc_tbl = NULL;
316 	unsigned int index = 0;
317 
318 	OSM_LOG_ENTER(p_sm->p_log);
319 
320 	force_update = p_physp->need_update || p_sm->p_subn->need_update;
321 
322 	for (index = 0; index < p_cc->cc_tbl_mads; index++) {
323 		if (!force_update
324 		    && !memcmp(&p_cc->cc_tbl[index],
325 			       &p_physp->cc.ca.cc_tbl[index],
326 			       sizeof(p_cc->cc_tbl[index])))
327 			continue;
328 
329 		p_madw = osm_mad_pool_get(p_cc->mad_pool, p_cc->bind_handle,
330 					  MAD_BLOCK_SIZE, NULL);
331 		if (p_madw == NULL) {
332 			OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR C103: "
333 				"failed to allocate mad\n");
334 			return IB_INSUFFICIENT_MEMORY;
335 		}
336 
337 		p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);
338 
339 		p_cc_tbl = (ib_cc_tbl_t *)ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
340 
341 		memcpy(p_cc_tbl,
342 		       &p_cc->cc_tbl[index],
343 		       sizeof(p_cc->cc_tbl[index]));
344 
345 		cc_mad_post(p_cc, p_madw, p_node, p_physp,
346 			    IB_MAD_ATTR_CC_TBL, cl_hton32(index));
347 	}
348 
349 	OSM_LOG_EXIT(p_sm->p_log);
350 
351 	return IB_SUCCESS;
352 }
353 
354 int osm_congestion_control_setup(struct osm_opensm *p_osm)
355 {
356 	cl_qmap_t *p_tbl;
357 	cl_map_item_t *p_next;
358 	int ret = 0;
359 
360 	if (!p_osm->subn.opt.congestion_control)
361 		return 0;
362 
363 	OSM_LOG_ENTER(&p_osm->log);
364 
365 	/*
366 	 * Do nothing unless the most recent routing attempt was successful.
367 	 */
368 	if (!p_osm->routing_engine_used)
369 		return 0;
370 
371 	cc_setup_mad_data(&p_osm->sm);
372 
373 	cl_plock_acquire(&p_osm->lock);
374 
375 	p_tbl = &p_osm->subn.port_guid_tbl;
376 	p_next = cl_qmap_head(p_tbl);
377 	while (p_next != cl_qmap_end(p_tbl)) {
378 		osm_port_t *p_port = (osm_port_t *) p_next;
379 		osm_node_t *p_node = p_port->p_node;
380 		ib_api_status_t status;
381 
382 		p_next = cl_qmap_next(p_next);
383 
384 		if (p_port->cc_unavailable_flag)
385 			continue;
386 
387 		if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH) {
388 			status = cc_send_sw_cong_setting(&p_osm->sm, p_node);
389 			if (status != IB_SUCCESS)
390 				ret = -1;
391 		} else if (osm_node_get_type(p_node) == IB_NODE_TYPE_CA) {
392 			status = cc_send_ca_cong_setting(&p_osm->sm,
393 							 p_node,
394 							 p_port->p_physp);
395 			if (status != IB_SUCCESS)
396 				ret = -1;
397 
398 			status = cc_send_cct(&p_osm->sm,
399 					     p_node,
400 					     p_port->p_physp);
401 			if (status != IB_SUCCESS)
402 				ret = -1;
403 		}
404 	}
405 
406 	cl_plock_release(&p_osm->lock);
407 
408 	OSM_LOG_EXIT(&p_osm->log);
409 
410 	return ret;
411 }
412 
413 int osm_congestion_control_wait_pending_transactions(struct osm_opensm *p_osm)
414 {
415 	osm_congestion_control_t *cc = &p_osm->cc;
416 
417 	if (!p_osm->subn.opt.congestion_control)
418 		return 0;
419 
420 	while (1) {
421 		unsigned count = cc->outstanding_mads;
422 		if (!count || osm_exit_flag)
423 			break;
424 		cl_event_wait_on(&cc->outstanding_mads_done_event,
425 				 EVENT_NO_TIMEOUT,
426 				 TRUE);
427 	}
428 
429 	return osm_exit_flag;
430 }
431 
432 static inline void decrement_outstanding_mads(osm_congestion_control_t *p_cc)
433 {
434 	uint32_t outstanding;
435 
436 	outstanding = cl_atomic_dec(&p_cc->outstanding_mads);
437 	if (!outstanding)
438 		cl_event_signal(&p_cc->outstanding_mads_done_event);
439 
440 	cl_atomic_dec(&p_cc->outstanding_mads_on_wire);
441 	cl_event_signal(&p_cc->sig_mads_on_wire_continue);
442 }
443 
/*
 * Dispatcher handler for received CC MAD responses (registered for
 * OSM_MSG_MAD_CC in osm_congestion_control_init).
 *
 * Under the subnet lock, locates the target port from the request
 * context, updates its CC availability flags based on the MAD status,
 * and caches the returned attribute payload in the port's physp so
 * later setup passes can skip unchanged ports.  Always decrements the
 * outstanding-MAD count and returns the MAD to the pool.
 */
static void cc_rcv_mad(void *context, void *data)
{
	osm_congestion_control_t *p_cc = context;
	osm_opensm_t *p_osm = p_cc->osm;
	osm_madw_t *p_madw = data;
	ib_cc_mad_t *p_cc_mad;
	osm_madw_context_t *p_mad_context = &p_madw->context;
	ib_mad_t *p_mad = osm_madw_get_mad_ptr(p_madw);
	ib_net64_t node_guid = p_mad_context->cc_context.node_guid;
	ib_net64_t port_guid = p_mad_context->cc_context.port_guid;
	uint8_t port = p_mad_context->cc_context.port;
	osm_port_t *p_port;

	OSM_LOG_ENTER(p_cc->log);

	OSM_LOG(p_cc->log, OSM_LOG_VERBOSE,
		"Processing received MAD status 0x%x for "
		"attr ID %u mod 0x%x node 0x%" PRIx64 " port %u\n",
		cl_ntoh16(p_mad->status), cl_ntoh16(p_mad->attr_id),
		cl_ntoh32(p_mad_context->cc_context.attr_mod),
		cl_ntoh64(node_guid), port);

	p_cc_mad = osm_madw_get_cc_mad_ptr(p_madw);

	cl_plock_acquire(&p_osm->lock);

	/* the port may have vanished since the request was posted */
	p_port = osm_get_port_by_guid(p_cc->subn, port_guid);
	if (!p_port) {
		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C109: "
			"Port GUID 0x%" PRIx64 " not in table\n",
			cl_ntoh64(port_guid));
		cl_plock_release(&p_osm->lock);
		goto Exit;
	}

	/* a response arrived, so reset the timeout-based CC probe count */
	p_port->cc_timeout_count = 0;

	if (p_cc_mad->header.status) {
		/* "unsupported" status bits mean the port has no CC
		 * support; remember that so future setups skip it */
		if (p_cc_mad->header.status & IB_MAD_STATUS_UNSUP_CLASS_VER
		    || p_cc_mad->header.status & IB_MAD_STATUS_UNSUP_METHOD
		    || p_cc_mad->header.status & IB_MAD_STATUS_UNSUP_METHOD_ATTR)
			p_port->cc_unavailable_flag = TRUE;
		cl_plock_release(&p_osm->lock);
		goto Exit;
	}
	else
		p_port->cc_unavailable_flag = FALSE;

	/* cache the confirmed attribute so unchanged settings are not
	 * re-sent on the next sweep */
	if (p_cc_mad->header.attr_id == IB_MAD_ATTR_SW_CONG_SETTING) {
		ib_sw_cong_setting_t *p_sw_cong_setting;

		p_sw_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
		p_port->p_physp->cc.sw.sw_cong_setting = *p_sw_cong_setting;
	}
	else if (p_cc_mad->header.attr_id == IB_MAD_ATTR_CA_CONG_SETTING) {
		ib_ca_cong_setting_t *p_ca_cong_setting;

		p_ca_cong_setting = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
		p_port->p_physp->cc.ca.ca_cong_setting = *p_ca_cong_setting;
	}
	else if (p_cc_mad->header.attr_id == IB_MAD_ATTR_CC_TBL) {
		/* attr_mod carries the table block index */
		ib_net32_t attr_mod = p_mad_context->cc_context.attr_mod;
		uint32_t index = cl_ntoh32(attr_mod);
		ib_cc_tbl_t *p_cc_tbl;

		p_cc_tbl = ib_cc_mad_get_mgt_data_ptr(p_cc_mad);
		p_port->p_physp->cc.ca.cc_tbl[index] = *p_cc_tbl;
	}
	else
		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C10A: "
			"Unexpected MAD attribute ID %u received\n",
			cl_ntoh16(p_cc_mad->header.attr_id));

	cl_plock_release(&p_osm->lock);

Exit:
	decrement_outstanding_mads(p_cc);
	osm_mad_pool_put(p_cc->mad_pool, p_madw);
	OSM_LOG_EXIT(p_cc->log);
}
524 
525 static void cc_poller_send(osm_congestion_control_t *p_cc,
526 			   osm_madw_t *p_madw)
527 {
528 	osm_subn_opt_t *p_opt = &p_cc->subn->opt;
529 	ib_api_status_t status;
530 	cl_status_t sts;
531 	osm_madw_context_t mad_context = p_madw->context;
532 
533 	status = osm_vendor_send(p_cc->bind_handle, p_madw, TRUE);
534 	if (status == IB_SUCCESS) {
535 		cl_atomic_inc(&p_cc->outstanding_mads_on_wire);
536 		while (p_cc->outstanding_mads_on_wire >
537 		       (int32_t)p_opt->cc_max_outstanding_mads) {
538 wait:
539 			sts = cl_event_wait_on(&p_cc->sig_mads_on_wire_continue,
540 					       EVENT_NO_TIMEOUT, TRUE);
541 			if (sts != CL_SUCCESS)
542 				goto wait;
543 		}
544 	} else
545 		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C104: "
546 			"send failed to node 0x%" PRIx64 "port %u\n",
547 			cl_ntoh64(mad_context.cc_context.node_guid),
548 			mad_context.cc_context.port);
549 }
550 
551 static void cc_poller(void *p_ptr)
552 {
553 	osm_congestion_control_t *p_cc = p_ptr;
554 	osm_madw_t *p_madw;
555 
556 	OSM_LOG_ENTER(p_cc->log);
557 
558 	if (p_cc->thread_state == OSM_THREAD_STATE_NONE)
559 		p_cc->thread_state = OSM_THREAD_STATE_RUN;
560 
561 	while (p_cc->thread_state == OSM_THREAD_STATE_RUN) {
562 		cl_spinlock_acquire(&p_cc->mad_queue_lock);
563 
564 		p_madw = (osm_madw_t *) cl_qlist_remove_head(&p_cc->mad_queue);
565 
566 		cl_spinlock_release(&p_cc->mad_queue_lock);
567 
568 		if (p_madw != (osm_madw_t *) cl_qlist_end(&p_cc->mad_queue))
569 			cc_poller_send(p_cc, p_madw);
570 		else
571 			cl_event_wait_on(&p_cc->cc_poller_wakeup,
572 					 EVENT_NO_TIMEOUT, TRUE);
573 	}
574 
575 	OSM_LOG_EXIT(p_cc->log);
576 }
577 
578 ib_api_status_t osm_congestion_control_init(osm_congestion_control_t * p_cc,
579 					    struct osm_opensm *p_osm,
580 					    const osm_subn_opt_t * p_opt)
581 {
582 	ib_api_status_t status = IB_SUCCESS;
583 
584 	OSM_LOG_ENTER(&p_osm->log);
585 
586 	memset(p_cc, 0, sizeof(*p_cc));
587 
588 	p_cc->osm = p_osm;
589 	p_cc->subn = &p_osm->subn;
590 	p_cc->sm = &p_osm->sm;
591 	p_cc->log = &p_osm->log;
592 	p_cc->mad_pool = &p_osm->mad_pool;
593 	p_cc->trans_id = CONGESTION_CONTROL_INITIAL_TID_VALUE;
594 	p_cc->vendor = p_osm->p_vendor;
595 
596 	p_cc->cc_disp_h = cl_disp_register(&p_osm->disp, OSM_MSG_MAD_CC,
597 					   cc_rcv_mad, p_cc);
598 	if (p_cc->cc_disp_h == CL_DISP_INVALID_HANDLE)
599 		goto Exit;
600 
601 	cl_qlist_init(&p_cc->mad_queue);
602 
603 	status = cl_spinlock_init(&p_cc->mad_queue_lock);
604 	if (status != IB_SUCCESS)
605 		goto Exit;
606 
607 	cl_event_construct(&p_cc->cc_poller_wakeup);
608 	status = cl_event_init(&p_cc->cc_poller_wakeup, FALSE);
609 	if (status != IB_SUCCESS)
610 		goto Exit;
611 
612 	cl_event_construct(&p_cc->outstanding_mads_done_event);
613 	status = cl_event_init(&p_cc->outstanding_mads_done_event, FALSE);
614 	if (status != IB_SUCCESS)
615 		goto Exit;
616 
617 	cl_event_construct(&p_cc->sig_mads_on_wire_continue);
618 	status = cl_event_init(&p_cc->sig_mads_on_wire_continue, FALSE);
619 	if (status != IB_SUCCESS)
620 		goto Exit;
621 
622 	p_cc->thread_state = OSM_THREAD_STATE_NONE;
623 
624 	status = cl_thread_init(&p_cc->cc_poller, cc_poller, p_cc,
625 				"cc poller");
626 	if (status != IB_SUCCESS)
627 		goto Exit;
628 
629 	status = IB_SUCCESS;
630 Exit:
631 	OSM_LOG_EXIT(p_cc->log);
632 	return status;
633 }
634 
635 static void cc_mad_recv_callback(osm_madw_t * p_madw, void *bind_context,
636 				 osm_madw_t * p_req_madw)
637 {
638 	osm_congestion_control_t *p_cc = bind_context;
639 
640 	OSM_LOG_ENTER(p_cc->log);
641 
642 	CL_ASSERT(p_madw);
643 
644 	/* HACK - should be extended when supporting CC traps */
645 	CL_ASSERT(p_req_madw != NULL);
646 
647 	osm_madw_copy_context(p_madw, p_req_madw);
648 	osm_mad_pool_put(p_cc->mad_pool, p_req_madw);
649 
650 	/* Do not decrement outstanding mads here, do it in the dispatcher */
651 
652 	if (cl_disp_post(p_cc->cc_disp_h, OSM_MSG_MAD_CC,
653 			 p_madw, NULL, NULL) != CL_SUCCESS) {
654 		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C105: "
655 			"Congestion Control Dispatcher post failed\n");
656 		osm_mad_pool_put(p_cc->mad_pool, p_madw);
657 	}
658 
659 	OSM_LOG_EXIT(p_cc->log);
660 }
661 
/*
 * Vendor-layer send-error callback: invoked when a CC MAD could not be
 * delivered (timeout or other failure).
 *
 * On timeout, counts consecutive timeouts per port and, past
 * OSM_CC_TIMEOUT_COUNT_THRESHOLD, marks the port as lacking CC support
 * so future sweeps skip it.  Non-timeout errors flag the subnet for
 * re-initialization.  Always returns the MAD to the pool and
 * decrements the outstanding count.
 */
static void cc_mad_send_err_callback(void *bind_context,
				     osm_madw_t * p_madw)
{
	osm_congestion_control_t *p_cc = bind_context;
	osm_madw_context_t *p_madw_context = &p_madw->context;
	osm_opensm_t *p_osm = p_cc->osm;
	uint64_t node_guid = p_madw_context->cc_context.node_guid;
	uint64_t port_guid = p_madw_context->cc_context.port_guid;
	uint8_t port = p_madw_context->cc_context.port;
	osm_port_t *p_port;
	int log_flag = 1;

	OSM_LOG_ENTER(p_cc->log);

	cl_plock_acquire(&p_osm->lock);

	/* the port may have vanished since the request was posted */
	p_port = osm_get_port_by_guid(p_cc->subn, port_guid);
	if (!p_port) {
		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C10B: "
			"Port GUID 0x%" PRIx64 " not in table\n",
			cl_ntoh64(port_guid));
		cl_plock_release(&p_osm->lock);
		goto Exit;
	}

	/* If timed out before, don't bothering logging again
	 * we assume no CC support
	 */
	if (p_madw->status == IB_TIMEOUT
	    && p_port->cc_timeout_count)
		log_flag = 0;

	if (log_flag)
		OSM_LOG(p_cc->log, OSM_LOG_ERROR, "ERR C106: MAD Error (%s): "
			"attr id = %u LID %u GUID 0x%016" PRIx64 " port %u "
			"TID 0x%" PRIx64 "\n",
			ib_get_err_str(p_madw->status),
			p_madw->p_mad->attr_id,
			cl_ntoh16(p_madw->mad_addr.dest_lid),
			cl_ntoh64(node_guid),
			port,
			cl_ntoh64(p_madw->p_mad->trans_id));

	if (p_madw->status == IB_TIMEOUT) {
		/* too many consecutive timeouts: treat the port as having
		 * no CC support and stop probing it */
		p_port->cc_timeout_count++;
		if (p_port->cc_timeout_count > OSM_CC_TIMEOUT_COUNT_THRESHOLD
		    && !p_port->cc_unavailable_flag) {
			p_port->cc_unavailable_flag = TRUE;
			p_port->cc_timeout_count = 0;
		}
	} else
		/* non-timeout failure: request subnet re-initialization */
		p_cc->subn->subnet_initialization_error = TRUE;

	cl_plock_release(&p_osm->lock);

Exit:
	osm_mad_pool_put(p_cc->mad_pool, p_madw);

	decrement_outstanding_mads(p_cc);

	OSM_LOG_EXIT(p_cc->log);
}
724 
725 ib_api_status_t osm_congestion_control_bind(osm_congestion_control_t * p_cc,
726 					    ib_net64_t port_guid)
727 {
728 	osm_bind_info_t bind_info;
729 	ib_api_status_t status = IB_SUCCESS;
730 
731 	OSM_LOG_ENTER(p_cc->log);
732 
733 	bind_info.port_guid = p_cc->port_guid = port_guid;
734 	bind_info.mad_class = IB_MCLASS_CC;
735 	bind_info.class_version = 2;
736 	bind_info.is_responder = FALSE;
737 	bind_info.is_report_processor = FALSE;
738 	bind_info.is_trap_processor = FALSE;
739 	bind_info.recv_q_size = OSM_SM_DEFAULT_QP1_RCV_SIZE;
740 	bind_info.send_q_size = OSM_SM_DEFAULT_QP1_SEND_SIZE;
741 	bind_info.timeout = p_cc->subn->opt.transaction_timeout;
742 	bind_info.retries = p_cc->subn->opt.transaction_retries;
743 
744 	OSM_LOG(p_cc->log, OSM_LOG_VERBOSE,
745 		"Binding to port GUID 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
746 
747 	p_cc->bind_handle = osm_vendor_bind(p_cc->vendor, &bind_info,
748 					    p_cc->mad_pool,
749 					    cc_mad_recv_callback,
750 					    cc_mad_send_err_callback, p_cc);
751 
752 	if (p_cc->bind_handle == OSM_BIND_INVALID_HANDLE) {
753 		status = IB_ERROR;
754 		OSM_LOG(p_cc->log, OSM_LOG_ERROR,
755 			"ERR C107: Vendor specific bind failed (%s)\n",
756 			ib_get_err_str(status));
757 		goto Exit;
758 	}
759 
760 Exit:
761 	OSM_LOG_EXIT(p_cc->log);
762 	return status;
763 }
764 
765 void osm_congestion_control_shutdown(osm_congestion_control_t * p_cc)
766 {
767 	OSM_LOG_ENTER(p_cc->log);
768 	if (p_cc->bind_handle == OSM_BIND_INVALID_HANDLE) {
769 		OSM_LOG(p_cc->log, OSM_LOG_ERROR,
770 			"ERR C108: No previous bind\n");
771 		goto Exit;
772 	}
773 	cl_disp_unregister(p_cc->cc_disp_h);
774 Exit:
775 	OSM_LOG_EXIT(p_cc->log);
776 }
777 
/*
 * Tear down the congestion-control object: stop the poller thread,
 * drain any MADs still queued, and destroy the lock and events.
 *
 * Order matters: thread_state is flipped to EXIT and both events the
 * poller can block on are signalled before cl_thread_destroy, so the
 * thread observes the new state and terminates.
 */
void osm_congestion_control_destroy(osm_congestion_control_t * p_cc)
{
	osm_madw_t *p_madw;

	OSM_LOG_ENTER(p_cc->log);

	p_cc->thread_state = OSM_THREAD_STATE_EXIT;

	/* wake the poller out of either wait so it sees EXIT */
	cl_event_signal(&p_cc->sig_mads_on_wire_continue);
	cl_event_signal(&p_cc->cc_poller_wakeup);

	cl_thread_destroy(&p_cc->cc_poller);

	cl_spinlock_acquire(&p_cc->mad_queue_lock);

	/* return any never-sent MADs to the pool */
	while (!cl_is_qlist_empty(&p_cc->mad_queue)) {
		p_madw = (osm_madw_t *) cl_qlist_remove_head(&p_cc->mad_queue);
		osm_mad_pool_put(p_cc->mad_pool, p_madw);
	}

	cl_spinlock_release(&p_cc->mad_queue_lock);

	cl_spinlock_destroy(&p_cc->mad_queue_lock);

	cl_event_destroy(&p_cc->cc_poller_wakeup);
	cl_event_destroy(&p_cc->outstanding_mads_done_event);
	cl_event_destroy(&p_cc->sig_mads_on_wire_continue);

	OSM_LOG_EXIT(p_cc->log);
}
808