/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static void mac_srs_soft_rings_signal(mac_soft_ring_set_t *, uint_t);
static void mac_srs_update_fanout_list(mac_soft_ring_set_t *);
static void mac_srs_poll_unbind(mac_soft_ring_set_t *);
static void mac_srs_worker_unbind(mac_soft_ring_set_t *);
static void mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *, uint_t);

static int mac_srs_cpu_setup(cpu_setup_t, int, void *);
static void mac_srs_worker_bind(mac_soft_ring_set_t *, processorid_t);
static void mac_srs_poll_bind(mac_soft_ring_set_t *, processorid_t);
static void mac_srs_threads_unbind(mac_soft_ring_set_t *);
static void mac_srs_add_glist(mac_soft_ring_set_t *);
static void mac_srs_remove_glist(mac_soft_ring_set_t *);
static void mac_srs_fanout_list_free(mac_soft_ring_set_t *);
static void mac_soft_ring_remove(mac_soft_ring_set_t *, mac_soft_ring_t *);

static int mac_compute_soft_ring_count(flow_entry_t *, int);
static void mac_walk_srs_and_bind(int);
static void mac_walk_srs_and_unbind(int);

extern mac_group_t *mac_reserve_rx_group(mac_client_impl_t *, uint8_t *,
    mac_rx_group_reserve_type_t);
extern void mac_release_rx_group(mac_client_impl_t *, mac_group_t *);

extern boolean_t mac_latency_optimize;

static kmem_cache_t *mac_srs_cache;
kmem_cache_t *mac_soft_ring_cache;

/*
 * The duration in msec we wait before signalling the soft ring
 * worker thread in case packets get queued.
 */
static uint32_t mac_soft_ring_worker_wait = 0;

/*
 * mac_soft_ring_max_q_cnt needs to be set based on bandwidth and perhaps
 * latency. Large values could end up consuming a lot of system memory and
 * cause a system hang.
 */
static int mac_soft_ring_max_q_cnt = 1024;
static int mac_soft_ring_min_q_cnt = 256;
static int mac_soft_ring_poll_thres = 16;

/*
 * Default number of Tx rings to be assigned to a MAC client.
 * If fewer than 'mac_tx_ring_count' Tx rings are available, then
 * as many as are available will be assigned to the newly created MAC
 * client. If no Tx rings are available, then MAC clients will be assigned
 * the default Tx ring. The default Tx ring can be shared among multiple
 * MAC clients.
 */
static uint32_t mac_tx_ring_count = 8;
static boolean_t mac_tx_serialize = B_FALSE;

/*
 * mac_tx_srs_hiwat is the queue depth threshold at which callers of
 * mac_tx() will be notified of flow control condition.
 *
 * TCP does not honour flow control condition sent up by mac_tx().
 * Thus provision is made for TCP to allow more packets to be queued
 * in SRS up to a maximum of mac_tx_srs_max_q_cnt.
 *
 * Note that mac_tx_srs_hiwat is always less than
 * mac_tx_srs_max_q_cnt.
 */
static uint32_t mac_tx_srs_max_q_cnt = 100000;
static uint32_t mac_tx_srs_hiwat = 1000;
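
/*
 * To illustrate the two thresholds above (an explanatory note, not
 * original commentary): with the defaults, a caller of mac_tx() is told
 * to back off once roughly 1000 packets (mac_tx_srs_hiwat) are queued on
 * the Tx SRS, while TCP traffic, which ignores that notification, may
 * keep queuing until the hard limit of 100000 packets
 * (mac_tx_srs_max_q_cnt) is reached, beyond which no further packets can
 * be queued.
 */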

/*
 * mac_rx_soft_ring_count, mac_rx_soft_ring_10gig_count:
 *
 * Global tunables that determine the number of soft rings to be used for
 * fanning out incoming traffic on a link. These counts are used only
 * when no explicit set of CPUs was assigned to the data-links.
 *
 * The mac_rx_soft_ring_count tunable comes into effect only if
 * mac_soft_ring_enable is set. mac_soft_ring_enable is turned on by
 * default only for sun4v platforms.
 *
 * mac_rx_soft_ring_10gig_count comes into effect if you are running on a
 * 10Gbps link and is not dependent upon mac_soft_ring_enable.
 *
 * The number of soft rings for fanout for a link or a flow is determined
 * by the mac_compute_soft_ring_count() routine. This routine takes into
 * account mac_soft_ring_enable, mac_rx_soft_ring_count and
 * mac_rx_soft_ring_10gig_count to determine the soft ring count for a link.
 *
 * If a bandwidth is specified, the determination of the number of soft
 * rings is based on the specified bandwidth, the CPU speed and the number
 * of CPUs in the system.
 */
static uint_t mac_rx_soft_ring_count = 8;
static uint_t mac_rx_soft_ring_10gig_count = 8;

/*
 * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
 * to mac_srs_g_list and mac_srs_g_lock protects mac_srs_g_list. The
 * list is used to walk the list of all MAC threads when a CPU is
 * coming online or going offline.
 */
static mac_soft_ring_set_t *mac_srs_g_list = NULL;
static krwlock_t mac_srs_g_lock;

/*
 * Whether the SRS threads should be bound, or not.
 */
static boolean_t mac_srs_thread_bind = B_TRUE;

/*
 * CPU to fall back to, used by mac_next_bind_cpu().
 */
static processorid_t srs_bind_cpu = 0;

/*
 * The possible settings for soft_ring_process_flag are
 * 0 or ST_RING_WORKER_ONLY.
 */
static int soft_ring_process_flag = ST_RING_WORKER_ONLY;

/*
 * If CPU bindings are specified by the user, then the Tx SRS and its soft
 * rings should also be bound to the CPUs specified by the user. The
 * CPUs for Tx bindings are at the end of the cpu list provided by
 * the user. If not enough CPUs are available (for Tx and Rx
 * SRSes), then the CPUs are shared by both Tx and Rx SRSes.
 */
#define	BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp) {			\
	processorid_t cpuid;						\
	int i, j;							\
	mac_soft_ring_t *softring;					\
									\
	cpuid = mrp->mrp_cpu[mrp->mrp_ncpus - 1];			\
	mac_srs_worker_bind(mac_tx_srs, cpuid);				\
	if (TX_MULTI_RING_MODE(mac_tx_srs)) {				\
		j = mrp->mrp_ncpus - 1;					\
		for (i = 0;						\
		    i < mac_tx_srs->srs_oth_ring_count; i++, j--) {	\
			if (j < 0)					\
				j = mrp->mrp_ncpus - 1;			\
			cpuid = mrp->mrp_cpu[j];			\
			softring = mac_tx_srs->srs_oth_soft_rings[i];	\
			(void) mac_soft_ring_bind(softring, cpuid);	\
		}							\
	}								\
}
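
/*
 * For example (hypothetical numbers), with mrp_ncpus = 4 and three Tx
 * soft rings, the macro above binds the Tx SRS worker to mrp_cpu[3]
 * (the last CPU in the user's list) and then walks the list backwards,
 * binding soft ring 0 to mrp_cpu[3], soft ring 1 to mrp_cpu[2] and soft
 * ring 2 to mrp_cpu[1], wrapping back to the end of the list if it runs
 * out of CPUs.
 */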

/* INIT and FINI ROUTINES */

void
mac_soft_ring_init(void)
{
	mac_soft_ring_cache = kmem_cache_create("mac_soft_ring_cache",
	    sizeof (mac_soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mac_srs_cache = kmem_cache_create("mac_srs_cache",
	    sizeof (mac_soft_ring_set_t),
	    64, NULL, NULL, NULL, NULL, NULL, 0);

	rw_init(&mac_srs_g_lock, NULL, RW_DEFAULT, NULL);
	mutex_enter(&cpu_lock);
	register_cpu_setup_func(mac_srs_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}

void
mac_soft_ring_finish(void)
{
	mutex_enter(&cpu_lock);
	unregister_cpu_setup_func(mac_srs_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
	rw_destroy(&mac_srs_g_lock);
	kmem_cache_destroy(mac_soft_ring_cache);
	kmem_cache_destroy(mac_srs_cache);
}

static void
mac_srs_soft_rings_free(mac_soft_ring_set_t *mac_srs, boolean_t release_tx_ring)
{
	mac_soft_ring_t *softring, *next, *head;

	/*
	 * Synchronize with mac_walk_srs_and_bind/unbind which are callbacks
	 * from DR. The callbacks from DR are called with cpu_lock held, and
	 * hence can't wait to grab the mac perimeter. The soft ring list is
	 * hence protected for read access by srs_lock. Changing the soft
	 * ring list needs the mac perimeter and the srs_lock.
	 */
	mutex_enter(&mac_srs->srs_lock);

	head = mac_srs->srs_soft_ring_head;
	mac_srs->srs_soft_ring_head = NULL;
	mac_srs->srs_soft_ring_tail = NULL;
	mac_srs->srs_soft_ring_count = 0;

	mutex_exit(&mac_srs->srs_lock);

	for (softring = head; softring != NULL; softring = next) {
		next = softring->s_ring_next;
		mac_soft_ring_free(softring, release_tx_ring);
	}
}

static void
mac_srs_add_glist(mac_soft_ring_set_t *mac_srs)
{
	ASSERT(mac_srs->srs_next == NULL && mac_srs->srs_prev == NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));

	rw_enter(&mac_srs_g_lock, RW_WRITER);
	mutex_enter(&mac_srs->srs_lock);

	ASSERT((mac_srs->srs_state & SRS_IN_GLIST) == 0);

	if (mac_srs_g_list == NULL) {
		mac_srs_g_list = mac_srs;
	} else {
		mac_srs->srs_next = mac_srs_g_list;
		mac_srs_g_list->srs_prev = mac_srs;
		mac_srs->srs_prev = NULL;
		mac_srs_g_list = mac_srs;
	}
	mac_srs->srs_state |= SRS_IN_GLIST;

	mutex_exit(&mac_srs->srs_lock);
	rw_exit(&mac_srs_g_lock);
}

static void
mac_srs_remove_glist(mac_soft_ring_set_t *mac_srs)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip));

	rw_enter(&mac_srs_g_lock, RW_WRITER);
	mutex_enter(&mac_srs->srs_lock);

	ASSERT((mac_srs->srs_state & SRS_IN_GLIST) != 0);

	if (mac_srs == mac_srs_g_list) {
		mac_srs_g_list = mac_srs->srs_next;
		if (mac_srs_g_list != NULL)
			mac_srs_g_list->srs_prev = NULL;
	} else {
		mac_srs->srs_prev->srs_next = mac_srs->srs_next;
		if (mac_srs->srs_next != NULL)
			mac_srs->srs_next->srs_prev = mac_srs->srs_prev;
	}
	mac_srs->srs_state &= ~SRS_IN_GLIST;

	mutex_exit(&mac_srs->srs_lock);
	rw_exit(&mac_srs_g_lock);
}

/* POLLING SETUP AND TEAR DOWN ROUTINES */

/*
 * mac_srs_client_poll_quiesce and mac_srs_client_poll_restart
 *
 * These routines are used to call back into the upper layer
 * (primarily TCP squeue) to stop polling the soft rings or
 * restart polling.
 */
void
mac_srs_client_poll_quiesce(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t *softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		return;
	}

	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if ((softring->s_ring_type & ST_RING_TCP) &&
		    (softring->s_ring_rx_arg2 != NULL)) {
			mcip->mci_resource_quiesce(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}
	}
}

void
mac_srs_client_poll_restart(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t *softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		return;
	}

	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if ((softring->s_ring_type & ST_RING_TCP) &&
		    (softring->s_ring_rx_arg2 != NULL)) {
			mcip->mci_resource_restart(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}
	}
}

/*
 * Register the given SRS and associated soft rings with the consumer and
 * enable the polling interface used by the consumer (i.e. IP) over this
 * SRS and associated soft rings.
 */
void
mac_srs_client_poll_enable(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_rx_fifo_t mrf;
	mac_soft_ring_t *softring;

	ASSERT(mac_srs->srs_mcip == mcip);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	if (!(mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE))
		return;

	bzero(&mrf, sizeof (mac_rx_fifo_t));
	mrf.mrf_type = MAC_RX_FIFO;

	/*
	 * An SRS is capable of acting as a soft ring for cases
	 * where no fanout is needed. This is the case for userland
	 * flows.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
		return;

	mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
	mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable;
	mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable;
	mac_srs->srs_type |= SRST_CLIENT_POLL_ENABLED;

	softring = mac_srs->srs_soft_ring_head;
	while (softring != NULL) {
		if (softring->s_ring_type & (ST_RING_TCP | ST_RING_UDP)) {
			/*
			 * TCP and UDP support DLS bypass. Squeue polling
			 * support implies DLS bypass since the squeue poll
			 * path does not have DLS processing.
			 */
			mac_soft_ring_dls_bypass(softring,
			    mcip->mci_direct_rx_fn, mcip->mci_direct_rx_arg);
		}
		/*
		 * Non-TCP protocols don't support squeues. Hence we don't
		 * make any ring addition callbacks for non-TCP rings.
		 */
		if (!(softring->s_ring_type & ST_RING_TCP)) {
			softring->s_ring_rx_arg2 = NULL;
			softring = softring->s_ring_next;
			continue;
		}
		mrf.mrf_rx_arg = softring;
		mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
		mrf.mrf_cpu_id = softring->s_ring_cpuid;
		mrf.mrf_flow_priority = mac_srs->srs_pri;

		softring->s_ring_rx_arg2 = mcip->mci_resource_add(
		    mcip->mci_resource_arg, (mac_resource_t *)&mrf);

		softring = softring->s_ring_next;
	}
}
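
/*
 * To illustrate the loop above (an explanatory sketch, not original
 * commentary): with two TCP and two UDP soft rings, all four get DLS
 * bypass enabled, but only the two TCP rings are registered with the
 * consumer via mci_resource_add() and receive a handle back in
 * s_ring_rx_arg2; the UDP rings have s_ring_rx_arg2 explicitly set to
 * NULL since non-TCP protocols don't support squeues.
 */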

/*
 * Unregister the given SRS and associated soft rings from the consumer and
 * disable the polling interface used by the consumer (i.e. IP) over this
 * SRS and associated soft rings.
 */
void
mac_srs_client_poll_disable(mac_client_impl_t *mcip,
    mac_soft_ring_set_t *mac_srs)
{
	mac_soft_ring_t *softring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	/*
	 * An SRS is capable of acting as a soft ring for cases
	 * where no protocol fanout is needed. This is the case
	 * for userland flows. Nothing to do here.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
		return;

	mutex_enter(&mac_srs->srs_lock);
	if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) {
		ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS));
		mutex_exit(&mac_srs->srs_lock);
		return;
	}
	mac_srs->srs_type &= ~(SRST_CLIENT_POLL_ENABLED | SRST_DLS_BYPASS);
	mutex_exit(&mac_srs->srs_lock);

	/*
	 * DLS bypass is now disabled in the case of both TCP and UDP.
	 * Reset the soft ring callbacks to the standard 'mac_rx_deliver'
	 * callback. In addition, in the case of TCP, invoke IP's callback
	 * for ring removal.
	 */
	for (softring = mac_srs->srs_soft_ring_head;
	    softring != NULL; softring = softring->s_ring_next) {
		if (!(softring->s_ring_type & (ST_RING_UDP | ST_RING_TCP)))
			continue;

		if ((softring->s_ring_type & ST_RING_TCP) &&
		    softring->s_ring_rx_arg2 != NULL) {
			mcip->mci_resource_remove(mcip->mci_resource_arg,
			    softring->s_ring_rx_arg2);
		}

		mutex_enter(&softring->s_ring_lock);
		while (softring->s_ring_state & S_RING_PROC) {
			softring->s_ring_state |= S_RING_CLIENT_WAIT;
			cv_wait(&softring->s_ring_client_cv,
			    &softring->s_ring_lock);
		}
		softring->s_ring_state &= ~S_RING_CLIENT_WAIT;
		softring->s_ring_rx_arg2 = NULL;
		softring->s_ring_rx_func = mac_rx_deliver;
		softring->s_ring_rx_arg1 = mcip;
		mutex_exit(&softring->s_ring_lock);
	}
}
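
/*
 * Note on the cv_wait loop above (an explanatory sketch, not original
 * commentary): before the Rx callbacks are reset, the caller waits for
 * any soft ring thread currently processing packets (S_RING_PROC set) to
 * finish. Setting S_RING_CLIENT_WAIT asks the processing thread to
 * cv_signal s_ring_client_cv once it drops S_RING_PROC, so the callbacks
 * are never changed underneath an active processing pass.
 */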

/*
 * Enable or disable poll capability of the SRS on the underlying Rx ring.
 *
 * There is a need to enable or disable the poll capability of an SRS over an
 * Rx ring depending on the number of MAC clients sharing the ring and also
 * whether user flows are configured on it. However the poll state is actively
 * manipulated by the SRS worker and poll threads and uncoordinated changes by
 * yet another thread to the underlying capability can surprise them, leading
 * to assert failures. Instead we quiesce the SRS, make the changes and then
 * restart the SRS.
 */
static void
mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs,
    boolean_t turn_off_poll_capab, mac_rx_func_t rx_func)
{
	boolean_t need_restart = B_FALSE;
	mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
	mac_ring_t *ring;

	if (!SRS_QUIESCED(mac_srs)) {
		mac_rx_srs_quiesce(mac_srs, SRS_QUIESCE);
		need_restart = B_TRUE;
	}

	ring = mac_srs->srs_ring;
	if ((ring != NULL) &&
	    (ring->mr_classify_type == MAC_HW_CLASSIFIER)) {
		if (turn_off_poll_capab)
			mac_srs->srs_state &= ~SRS_POLLING_CAPAB;
		else
			mac_srs->srs_state |= SRS_POLLING_CAPAB;
	}
	srs_rx->sr_lower_proc = rx_func;

	if (need_restart)
		mac_rx_srs_restart(mac_srs);
}

/* CPU RECONFIGURATION AND FANOUT COMPUTATION ROUTINES */

/*
 * Return the next CPU to be used to bind a MAC kernel thread.
 */
static processorid_t
mac_next_bind_cpu(void)
{
	static processorid_t srs_curr_cpu = -1;
	cpu_t *cp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	srs_curr_cpu++;
	cp = cpu_get(srs_curr_cpu);
	if (cp == NULL || !cpu_is_online(cp))
		srs_curr_cpu = srs_bind_cpu;

	return (srs_curr_cpu);
}
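
/*
 * For example (a sketch of the rotation above): on a system with four
 * online CPUs 0-3 and srs_bind_cpu = 0, successive calls return 0, 1, 2,
 * 3 and then, when cpu_get(4) returns NULL, fall back to srs_bind_cpu
 * and return 0 again, cycling through the online CPUs.
 */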

/* ARGSUSED */
static int
mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_CPUPART_IN:
		mac_walk_srs_and_bind(id);
		break;

	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		mac_walk_srs_and_unbind(id);
		break;

	default:
		break;
	}
	return (0);
}

/*
 * mac_compute_soft_ring_count():
 *
 * This routine computes the number of soft rings needed to handle incoming
 * load given a flow_entry.
 *
 * The routine does the following:
 * 1) Soft rings will be created if mac_soft_ring_enable is set.
 * 2) If the underlying link is a 10Gbps link, then soft rings will be
 * created even if mac_soft_ring_enable is not set. The number of soft
 * rings, so created, will equal mac_rx_soft_ring_10gig_count.
 * 3) On a sun4v platform (i.e., when mac_soft_ring_enable is set), 2 times
 * the mac_rx_soft_ring_10gig_count number of soft rings will be created
 * for a 10Gbps link.
 *
 * If a bandwidth limit is specified, the number that gets computed is
 * dependent upon CPU speed, the number of Rx rings configured, and
 * the bandwidth limit. If more Rx rings are available, fewer soft rings
 * are needed.
 *
 * mac_use_bw_heuristic is another "hidden" variable that can be used to
 * override the default use of the soft ring count computation. Depending
 * upon its usefulness, mac_use_bw_heuristic can later be made into a
 * data-link property or removed altogether.
 *
 * TODO: Cleanup and tighten some of the assumptions.
 */
boolean_t mac_use_bw_heuristic = B_TRUE;
static int
mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt)
{
	uint64_t cpu_speed, bw = 0;
	int srings = 0;
	boolean_t bw_enabled = B_FALSE;

	ASSERT(!(flent->fe_type & FLOW_USER));
	if (flent->fe_resource_props.mrp_mask & MRP_MAXBW &&
	    mac_use_bw_heuristic) {
		/* bandwidth enabled */
		bw_enabled = B_TRUE;
		bw = flent->fe_resource_props.mrp_maxbw;
	}
	if (!bw_enabled) {
		/* No bandwidth limit enabled */
		if (mac_soft_ring_enable)
			srings = mac_rx_soft_ring_count;

		/* Is this a 10Gig link? */
		flent->fe_nic_speed = mac_client_stat_get(flent->fe_mcip,
		    MAC_STAT_IFSPEED);
		/* convert to Mbps */
		if (((flent->fe_nic_speed)/1000000) > 1000 &&
		    mac_rx_soft_ring_10gig_count > 0) {
			/* This is a 10Gig link */
			srings = mac_rx_soft_ring_10gig_count;
			/*
			 * Use 2 times mac_rx_soft_ring_10gig_count for
			 * sun4v systems.
			 */
			if (mac_soft_ring_enable)
				srings = srings * 2;
		}
	} else {
		/*
		 * Soft ring computation using CPU speed and specified
		 * bandwidth limit.
		 */
		/* Assumption: all CPUs have the same frequency */
		cpu_speed = (uint64_t)CPU->cpu_type_info.pi_clock;

		/* cpu_speed is in MHz; make bw in units of Mbps. */
		bw = bw/1000000;

		if (bw >= 1000) {
			/*
			 * bw is greater than or equal to 1Gbps.
			 * The number of soft rings required is a function
			 * of bandwidth and CPU speed. To keep this simple,
			 * let's use this rule: a 1GHz CPU can handle 1Gbps.
			 * If bw is less than 1 Gbps, then there is no need
			 * for soft rings. The assumption is that CPU speeds
			 * (on modern systems) are at least 1GHz.
			 */
			srings = bw/cpu_speed;
			if (srings <= 1 && mac_soft_ring_enable) {
				/*
				 * Give at least 2 soft rings
				 * for sun4v systems.
				 */
				srings = 2;
			}
		}
	}
	/*
	 * If the flent has multiple Rx SRSes, then each SRS need not
	 * have that many soft rings on top of it. The number of
	 * soft rings for each Rx SRS is found by dividing srings by
	 * rx_srs_cnt.
	 */
	if (rx_srs_cnt > 1) {
		int remainder;

		remainder = srings%rx_srs_cnt;
		srings = srings/rx_srs_cnt;
		if (remainder != 0)
			srings++;
		/*
		 * Fanning out to 1 soft ring is not very useful.
		 * Set it to 0 as well, and mac_srs_fanout_init()
		 * will take care of creating a single soft ring
		 * for proto fanout.
		 */
		if (srings == 1)
			srings = 0;
	}
	/* Do some more massaging */
	srings = min(srings, ncpus);
	srings = min(srings, MAX_SR_FANOUT);
	return (srings);
}
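
/*
 * A worked example of the computation above (hypothetical numbers): with
 * a 10000 Mbps (10Gbps) bandwidth limit on 1000 MHz CPUs, srings starts
 * out as 10000/1000 = 10. Called with rx_srs_cnt = 5, each Rx SRS then
 * gets 10/5 = 2 soft rings; a non-zero remainder would round that up,
 * and a result of exactly 1 is turned into 0 so that
 * mac_srs_fanout_init() creates the single proto-fanout soft ring
 * instead.
 */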

/*
 * Assignment of user specified CPUs to a link.
 *
 * Minimum CPUs required to get an optimal assignment:
 * For each Rx SRS, at least two CPUs are needed if the
 * mac_latency_optimize flag is set -- one for polling, one for the fanout
 * soft ring. If mac_latency_optimize is not set, then 3 CPUs are needed
 * -- one for polling, one for the SRS worker thread and one for the
 * fanout soft ring.
 *
 * The number of CPUs needed for the Tx side equals the number of Tx rings
 * the link is using.
 *
 * mac_flow_user_cpu_init() categorizes the CPU assignment depending
 * upon the number of CPUs into 3 different buckets.
 *
 * In the first bucket, the most optimal case is handled. The user has
 * passed enough CPUs and every thread gets its own CPU.
 *
 * The second and third are the sub-optimal cases, where not enough CPUs
 * are available.
 *
 * The second bucket handles the case where at least one distinct CPU is
 * available for each of the Rx rings (Rx SRSes) and Tx rings (Tx
 * SRS or soft rings).
 *
 * In the third case (worst case scenario), the specified CPU count is
 * less than the number of Rx rings configured for the link. In this case,
 * we round robin the CPUs among the Rx SRSes and Tx SRS/soft rings.
 */
static void
mac_flow_user_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	mac_soft_ring_set_t *rx_srs, *tx_srs;
	int i, srs_cnt;
	mac_cpus_t *srs_cpu;
	int no_of_cpus, cpu_cnt;
	int rx_srs_cnt, reqd_rx_cpu_cnt;
	int fanout_cpu_cnt, reqd_tx_cpu_cnt;
	int reqd_poll_worker_cnt, fanout_cnt_per_srs;

	ASSERT(mrp->mrp_fanout_mode == MCM_CPUS);
	/*
	 * The check for nbc_ncpus to be within limits for
	 * the user specified case was done earlier and if
	 * not within limits, an error would have been
	 * returned to the user.
	 */
	ASSERT(mrp->mrp_ncpus > 0 && mrp->mrp_ncpus <= MAX_SR_FANOUT);

	no_of_cpus = mrp->mrp_ncpus;

	if (mrp->mrp_intr_cpu != -1) {
		/*
		 * The interrupt has been re-targeted. The poll
		 * thread needs to be bound to the interrupt
		 * CPU. Presently only fixed interrupts
		 * are re-targeted; MSI-X ones aren't.
		 *
		 * Find where in the list the intr
		 * CPU is and swap it with the first one.
		 * We will be using the first CPU in the
		 * list for poll.
		 */
		for (i = 0; i < no_of_cpus; i++) {
			if (mrp->mrp_cpu[i] == mrp->mrp_intr_cpu)
				break;
		}
		mrp->mrp_cpu[i] = mrp->mrp_cpu[0];
		mrp->mrp_cpu[0] = mrp->mrp_intr_cpu;
	}

	/*
	 * Requirements:
	 * The number of CPUs that each Rx ring needs is dependent
	 * upon the mac_latency_optimize flag.
	 * 1) If set, at least 2 CPUs are needed -- one for
	 * polling, one for the fanout soft ring.
	 * 2) If not set, then at least 3 CPUs are needed -- one
	 * for polling, one for the srs worker thread, and one for
	 * the fanout soft ring.
	 */
	rx_srs_cnt = (flent->fe_rx_srs_cnt > 1) ?
	    (flent->fe_rx_srs_cnt - 1) : flent->fe_rx_srs_cnt;
	reqd_rx_cpu_cnt = mac_latency_optimize ?
	    (rx_srs_cnt * 2) : (rx_srs_cnt * 3);

	/* How many CPUs are needed for the Tx side? */
	tx_srs = flent->fe_tx_srs;
	reqd_tx_cpu_cnt = TX_MULTI_RING_MODE(tx_srs) ?
	    tx_srs->srs_oth_ring_count : 1;

	/* CPUs needed for Rx SRSes poll and worker threads */
	reqd_poll_worker_cnt = mac_latency_optimize ?
	    rx_srs_cnt : rx_srs_cnt * 2;

	/* Has the user provided enough CPUs? */
	if (no_of_cpus >= (reqd_rx_cpu_cnt + reqd_tx_cpu_cnt)) {
		/*
		 * Best case scenario. There are enough CPUs. All
		 * Rx rings will get their own set of CPUs plus
		 * Tx soft rings will get their own.
		 */
		/*
		 * fanout_cpu_cnt is the number of CPUs available
		 * for Rx side fanout soft rings.
		 */
		fanout_cpu_cnt = no_of_cpus -
		    reqd_poll_worker_cnt - reqd_tx_cpu_cnt;

		/*
		 * Divide fanout_cpu_cnt by rx_srs_cnt to find
		 * out how many fanout soft rings each Rx SRS
		 * can have.
		 */
		fanout_cnt_per_srs = fanout_cpu_cnt/rx_srs_cnt;

		/* Do the assignment for the default Rx ring */
		cpu_cnt = 0;
		rx_srs = flent->fe_rx_srs[0];
		ASSERT(rx_srs->srs_ring == NULL);
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
		srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
		srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
		srs_cpu->mc_workerid = srs_cpu->mc_pollid;
		if (!mac_latency_optimize)
			srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++];
		for (i = 0; i < fanout_cnt_per_srs; i++)
			srs_cpu->mc_fanout_cpus[i] = mrp->mrp_cpu[cpu_cnt++];

		/* Do the assignment for h/w Rx SRSes */
		if (flent->fe_rx_srs_cnt > 1) {
			cpu_cnt = 0;
			for (srs_cnt = 1;
			    srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
				rx_srs = flent->fe_rx_srs[srs_cnt];
				ASSERT(rx_srs->srs_ring != NULL);
				if (rx_srs->srs_fanout_state ==
				    SRS_FANOUT_INIT) {
					rx_srs->srs_fanout_state =
					    SRS_FANOUT_REINIT;
				}
				srs_cpu = &rx_srs->srs_cpu;
				srs_cpu->mc_ncpus = no_of_cpus;
				bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
				    sizeof (srs_cpu->mc_cpus));
				srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
				/* The first CPU in the list is the intr CPU */
				srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
				srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
				srs_cpu->mc_workerid = srs_cpu->mc_pollid;
				if (!mac_latency_optimize) {
					srs_cpu->mc_workerid =
					    mrp->mrp_cpu[cpu_cnt++];
				}
				for (i = 0; i < fanout_cnt_per_srs; i++) {
					srs_cpu->mc_fanout_cpus[i] =
					    mrp->mrp_cpu[cpu_cnt++];
				}
				ASSERT(cpu_cnt <= no_of_cpus);
			}
		}
		return;
	}

	/*
	 * Sub-optimal case.
	 * We have the following information:
	 * no_of_cpus - no. of cpus that the user passed.
	 * rx_srs_cnt - no. of rx rings.
	 * reqd_rx_cpu_cnt = mac_latency_optimize?rx_srs_cnt*2:rx_srs_cnt*3
	 * reqd_tx_cpu_cnt - no. of cpus reqd. for the Tx side.
	 * reqd_poll_worker_cnt = mac_latency_optimize?rx_srs_cnt:rx_srs_cnt*2
	 */
	/*
	 * If we bind the Rx fanout soft rings to the same CPUs
	 * as poll/worker, would that be enough?
	 */
	if (no_of_cpus >= (rx_srs_cnt + reqd_tx_cpu_cnt)) {
		boolean_t worker_assign = B_FALSE;

		/*
		 * If mac_latency_optimize is not set, are there
		 * enough CPUs to assign a CPU for the worker also?
		 */
		if (no_of_cpus >= (reqd_poll_worker_cnt + reqd_tx_cpu_cnt))
			worker_assign = B_TRUE;
		/*
		 * The zero'th Rx SRS is the default Rx ring. It is not
		 * associated with a h/w Rx ring.
		 */
		rx_srs = flent->fe_rx_srs[0];
		ASSERT(rx_srs->srs_ring == NULL);
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		cpu_cnt = 0;
		srs_cpu = &rx_srs->srs_cpu;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_fanout_cnt = 1;
		srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
		srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
		srs_cpu->mc_workerid = srs_cpu->mc_pollid;
		if (!mac_latency_optimize && worker_assign)
			srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++];
		srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];

		/* Do CPU bindings for SRSes having h/w Rx rings */
		if (flent->fe_rx_srs_cnt > 1) {
			cpu_cnt = 0;
			for (srs_cnt = 1;
			    srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
				rx_srs = flent->fe_rx_srs[srs_cnt];
				ASSERT(rx_srs->srs_ring != NULL);
				if (rx_srs->srs_fanout_state ==
				    SRS_FANOUT_INIT) {
					rx_srs->srs_fanout_state =
					    SRS_FANOUT_REINIT;
				}
				srs_cpu = &rx_srs->srs_cpu;
				srs_cpu->mc_ncpus = no_of_cpus;
				bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
				    sizeof (srs_cpu->mc_cpus));
				srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt];
				srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
				srs_cpu->mc_workerid = srs_cpu->mc_pollid;
				if (!mac_latency_optimize && worker_assign) {
					srs_cpu->mc_workerid =
					    mrp->mrp_cpu[++cpu_cnt];
				}
				srs_cpu->mc_fanout_cnt = 1;
				srs_cpu->mc_fanout_cpus[0] =
				    mrp->mrp_cpu[cpu_cnt];
				cpu_cnt++;
				ASSERT(cpu_cnt <= no_of_cpus);
			}
		}
		return;
	}

	/*
	 * Real sub-optimal case. Not enough CPUs for poll and
	 * Tx soft rings. Do a round robin assignment where
	 * each Rx SRS will get the same CPU for poll, worker
	 * and fanout soft ring.
	 */
	cpu_cnt = 0;
	for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
		rx_srs = flent->fe_rx_srs[srs_cnt];
		srs_cpu = &rx_srs->srs_cpu;
		if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
			rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
		srs_cpu->mc_ncpus = no_of_cpus;
		bcopy(mrp->mrp_cpu,
		    srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
		srs_cpu->mc_fanout_cnt = 1;
		srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt];
		srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
		srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt];
		srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt];
		if (++cpu_cnt >= no_of_cpus)
			cpu_cnt = 0;
	}
}
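
/*
 * A worked example of the three buckets above (hypothetical numbers):
 * consider a link with fe_rx_srs_cnt = 3 (two h/w Rx SRSes plus the
 * default), two Tx rings, and mac_latency_optimize set. Then rx_srs_cnt
 * = 2, reqd_rx_cpu_cnt = 4 and reqd_tx_cpu_cnt = 2. With 8 user CPUs the
 * first bucket applies and each h/w SRS gets (8 - 2 - 2)/2 = 2 fanout
 * CPUs on top of its poll CPU. With 4 or 5 CPUs the second bucket
 * applies (poll and fanout share a CPU), and with fewer the CPUs are
 * simply round robined across the SRSes.
 */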

/*
 * mac_flow_cpu_init():
 *
 * Each SRS has a mac_cpu_t structure, srs_cpu. This routine fills in
 * the CPU binding information in srs_cpu for all Rx SRSes associated
 * with a flent.
 */
static void
mac_flow_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	mac_soft_ring_set_t *rx_srs;
	processorid_t cpuid;
	int j, srs_cnt, soft_ring_cnt = 0;
	mac_cpus_t *srs_cpu;

	if (mrp->mrp_mask & MRP_CPUS_USERSPEC) {
		mac_flow_user_cpu_init(flent, mrp);
	} else {
		/*
		 * Compute the number of soft rings needed on top of each Rx
		 * SRS. "rx_srs_cnt - 1" indicates the number of Rx SRSes
		 * associated with h/w Rx rings. The soft ring count needed
		 * for each h/w Rx SRS is computed and the same is applied to
		 * the software classified Rx SRS. The first Rx SRS in
		 * fe_rx_srs[] is the software classified Rx SRS.
		 */
		soft_ring_cnt = mac_compute_soft_ring_count(flent,
		    flent->fe_rx_srs_cnt - 1);
		if (soft_ring_cnt == 0) {
			/*
			 * Even when soft_ring_cnt is 0, we still need
			 * to create a soft ring for TCP, UDP and
			 * OTHER. So set it to 1.
			 */
			soft_ring_cnt = 1;
		}
		for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
			rx_srs = flent->fe_rx_srs[srs_cnt];
			srs_cpu = &rx_srs->srs_cpu;
			if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) {
				if (soft_ring_cnt == srs_cpu->mc_fanout_cnt)
					continue;
				rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
			}
			srs_cpu->mc_ncpus = soft_ring_cnt;
			srs_cpu->mc_fanout_cnt = soft_ring_cnt;
			mutex_enter(&cpu_lock);
			for (j = 0; j < soft_ring_cnt; j++) {
				cpuid = mac_next_bind_cpu();
				srs_cpu->mc_cpus[j] = cpuid;
				srs_cpu->mc_fanout_cpus[j] = cpuid;
			}
			cpuid = mac_next_bind_cpu();
			srs_cpu->mc_pollid = cpuid;
			/* increment ncpus to account for polling cpu */
			srs_cpu->mc_ncpus++;
			srs_cpu->mc_cpus[j++] = cpuid;
			if (!mac_latency_optimize) {
				cpuid = mac_next_bind_cpu();
				srs_cpu->mc_ncpus++;
				srs_cpu->mc_cpus[j++] = cpuid;
			}
			srs_cpu->mc_workerid = cpuid;
			mutex_exit(&cpu_lock);
		}
	}
}

/*
 * DATAPATH SETUP ROUTINES
 * (setup SRS and set/update FANOUT, B/W and PRIORITY)
 */

static void
mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs)
{
	mac_srs->srs_tcp_soft_rings = (mac_soft_ring_t **)
	    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP);
	mac_srs->srs_udp_soft_rings = (mac_soft_ring_t **)
	    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP);
	mac_srs->srs_oth_soft_rings = (mac_soft_ring_t **)
	    kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP);
}

static void
mac_srs_worker_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	boolean_t clear = B_FALSE;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!mac_srs_thread_bind)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state |= SRS_WORKER_BOUND;
	if (mac_srs->srs_worker_cpuid != -1)
		clear = B_TRUE;
	mac_srs->srs_worker_cpuid = cpuid;
	mutex_exit(&mac_srs->srs_lock);

	if (clear)
		thread_affinity_clear(mac_srs->srs_worker);

	thread_affinity_set(mac_srs->srs_worker, cpuid);
	DTRACE_PROBE1(worker__CPU, processorid_t, cpuid);
}

static void
mac_srs_poll_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid)
{
	cpu_t *cp;
	boolean_t clear = B_FALSE;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (!mac_srs_thread_bind || mac_srs->srs_poll_thr == NULL)
		return;

	cp = cpu_get(cpuid);
	if (cp == NULL || !cpu_is_online(cp))
		return;

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state |= SRS_POLL_BOUND;
	if (mac_srs->srs_poll_cpuid != -1)
		clear = B_TRUE;
	mac_srs->srs_poll_cpuid = cpuid;
	mutex_exit(&mac_srs->srs_lock);

	if (clear)
		thread_affinity_clear(mac_srs->srs_poll_thr);

	thread_affinity_set(mac_srs->srs_poll_thr, cpuid);
	DTRACE_PROBE1(poll__CPU, processorid_t, cpuid);
}

/*
 * When a CPU comes back online, bind the MAC kernel threads which
 * were previously bound to that CPU, and had to be unbound because
 * the CPU was going away.
 *
 * These functions are called with cpu_lock held and hence we can't
 * cv_wait to grab the mac perimeter. Since these functions walk the soft
 * ring list of an SRS without being in the perimeter, the list itself
 * is protected by the SRS lock.
1104*da14cebeSEric Cheng */ 1105*da14cebeSEric Cheng static void 1106*da14cebeSEric Cheng mac_walk_srs_and_bind(int cpuid) 1107*da14cebeSEric Cheng { 1108*da14cebeSEric Cheng mac_soft_ring_set_t *mac_srs; 1109*da14cebeSEric Cheng mac_soft_ring_t *soft_ring; 1110*da14cebeSEric Cheng 1111*da14cebeSEric Cheng rw_enter(&mac_srs_g_lock, RW_READER); 1112*da14cebeSEric Cheng 1113*da14cebeSEric Cheng if ((mac_srs = mac_srs_g_list) == NULL) 1114*da14cebeSEric Cheng goto done; 1115*da14cebeSEric Cheng 1116*da14cebeSEric Cheng for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) { 1117*da14cebeSEric Cheng if (mac_srs->srs_worker_cpuid == -1 && 1118*da14cebeSEric Cheng mac_srs->srs_worker_cpuid_save == cpuid) { 1119*da14cebeSEric Cheng mac_srs->srs_worker_cpuid_save = -1; 1120*da14cebeSEric Cheng mac_srs_worker_bind(mac_srs, cpuid); 1121*da14cebeSEric Cheng } 1122*da14cebeSEric Cheng 1123*da14cebeSEric Cheng if (!(mac_srs->srs_type & SRST_TX)) { 1124*da14cebeSEric Cheng if (mac_srs->srs_poll_cpuid == -1 && 1125*da14cebeSEric Cheng mac_srs->srs_poll_cpuid_save == cpuid) { 1126*da14cebeSEric Cheng mac_srs->srs_poll_cpuid_save = -1; 1127*da14cebeSEric Cheng mac_srs_poll_bind(mac_srs, cpuid); 1128*da14cebeSEric Cheng } 1129*da14cebeSEric Cheng } 1130*da14cebeSEric Cheng 1131*da14cebeSEric Cheng /* Next tackle the soft rings associated with the srs */ 1132*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 1133*da14cebeSEric Cheng for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; 1134*da14cebeSEric Cheng soft_ring = soft_ring->s_ring_next) { 1135*da14cebeSEric Cheng if (soft_ring->s_ring_cpuid == -1 && 1136*da14cebeSEric Cheng soft_ring->s_ring_cpuid_save == cpuid) { 1137*da14cebeSEric Cheng soft_ring->s_ring_cpuid_save = -1; 1138*da14cebeSEric Cheng (void) mac_soft_ring_bind(soft_ring, cpuid); 1139*da14cebeSEric Cheng } 1140*da14cebeSEric Cheng } 1141*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 1142*da14cebeSEric Cheng } 1143*da14cebeSEric Cheng done: 1144*da14cebeSEric Cheng rw_exit(&mac_srs_g_lock); 1145*da14cebeSEric Cheng } 1146*da14cebeSEric Cheng 1147*da14cebeSEric Cheng /* 1148*da14cebeSEric Cheng * Change the priority of the SRS's poll and worker thread. Additionally, 1149*da14cebeSEric Cheng * update the priority of the worker threads for the SRS's soft rings. 1150*da14cebeSEric Cheng * Need to modify any associated squeue threads. 
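 *
 * A sketch of the expected call pattern (an assumption for
 * illustration; the real callers live in the property update paths):
 * when a link or flow priority changes, every SRS on the flow entry
 * is updated:
 *
 *	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
 *		mac_update_srs_priority(flent->fe_rx_srs[i], newpri);
 *	mac_update_srs_priority(flent->fe_tx_srs, newpri);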
1151*da14cebeSEric Cheng */ 1152*da14cebeSEric Cheng void 1153*da14cebeSEric Cheng mac_update_srs_priority(mac_soft_ring_set_t *mac_srs, pri_t prival) 1154*da14cebeSEric Cheng { 1155*da14cebeSEric Cheng mac_soft_ring_t *ringp; 1156*da14cebeSEric Cheng 1157*da14cebeSEric Cheng mac_srs->srs_pri = prival; 1158*da14cebeSEric Cheng thread_lock(mac_srs->srs_worker); 1159*da14cebeSEric Cheng (void) thread_change_pri(mac_srs->srs_worker, mac_srs->srs_pri, 0); 1160*da14cebeSEric Cheng thread_unlock(mac_srs->srs_worker); 1161*da14cebeSEric Cheng if (mac_srs->srs_poll_thr != NULL) { 1162*da14cebeSEric Cheng thread_lock(mac_srs->srs_poll_thr); 1163*da14cebeSEric Cheng (void) thread_change_pri(mac_srs->srs_poll_thr, 1164*da14cebeSEric Cheng mac_srs->srs_pri, 0); 1165*da14cebeSEric Cheng thread_unlock(mac_srs->srs_poll_thr); 1166*da14cebeSEric Cheng } 1167*da14cebeSEric Cheng if ((ringp = mac_srs->srs_soft_ring_head) == NULL) 1168*da14cebeSEric Cheng return; 1169*da14cebeSEric Cheng while (ringp != mac_srs->srs_soft_ring_tail) { 1170*da14cebeSEric Cheng thread_lock(ringp->s_ring_worker); 1171*da14cebeSEric Cheng (void) thread_change_pri(ringp->s_ring_worker, 1172*da14cebeSEric Cheng mac_srs->srs_pri, 0); 1173*da14cebeSEric Cheng thread_unlock(ringp->s_ring_worker); 1174*da14cebeSEric Cheng ringp = ringp->s_ring_next; 1175*da14cebeSEric Cheng } 1176*da14cebeSEric Cheng ASSERT(ringp == mac_srs->srs_soft_ring_tail); 1177*da14cebeSEric Cheng thread_lock(ringp->s_ring_worker); 1178*da14cebeSEric Cheng (void) thread_change_pri(ringp->s_ring_worker, mac_srs->srs_pri, 0); 1179*da14cebeSEric Cheng thread_unlock(ringp->s_ring_worker); 1180*da14cebeSEric Cheng } 1181*da14cebeSEric Cheng 1182*da14cebeSEric Cheng /* 1183*da14cebeSEric Cheng * Change the receive bandwidth limit. 1184*da14cebeSEric Cheng */ 1185*da14cebeSEric Cheng static void 1186*da14cebeSEric Cheng mac_rx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp) 1187*da14cebeSEric Cheng { 1188*da14cebeSEric Cheng mac_soft_ring_t *softring; 1189*da14cebeSEric Cheng 1190*da14cebeSEric Cheng mutex_enter(&srs->srs_lock); 1191*da14cebeSEric Cheng mutex_enter(&srs->srs_bw->mac_bw_lock); 1192*da14cebeSEric Cheng 1193*da14cebeSEric Cheng if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { 1194*da14cebeSEric Cheng /* Reset bandwidth limit */ 1195*da14cebeSEric Cheng if (srs->srs_type & SRST_BW_CONTROL) { 1196*da14cebeSEric Cheng softring = srs->srs_soft_ring_head; 1197*da14cebeSEric Cheng while (softring != NULL) { 1198*da14cebeSEric Cheng softring->s_ring_type &= ~ST_RING_BW_CTL; 1199*da14cebeSEric Cheng softring = softring->s_ring_next; 1200*da14cebeSEric Cheng } 1201*da14cebeSEric Cheng srs->srs_type &= ~SRST_BW_CONTROL; 1202*da14cebeSEric Cheng srs->srs_drain_func = mac_rx_srs_drain; 1203*da14cebeSEric Cheng } 1204*da14cebeSEric Cheng } else { 1205*da14cebeSEric Cheng /* Set/Modify bandwidth limit */ 1206*da14cebeSEric Cheng srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw); 1207*da14cebeSEric Cheng /* 1208*da14cebeSEric Cheng * Give twice the queuing capability before 1209*da14cebeSEric Cheng * dropping packets. The unit is bytes/tick. 
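 *
 * Worked example (illustrative numbers only): if the configured
 * limit works out to 1,000,000 bytes/sec and hz == 100, then
 * mac_bw_limit is 10,000 bytes/tick and mac_bw_drop_threshold is
 * 20,000 bytes/tick; the exact conversion from mrp_maxbw is
 * whatever FLOW_BYTES_PER_TICK() computes.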
1210*da14cebeSEric Cheng */ 1211*da14cebeSEric Cheng srs->srs_bw->mac_bw_drop_threshold = 1212*da14cebeSEric Cheng srs->srs_bw->mac_bw_limit << 1; 1213*da14cebeSEric Cheng if (!(srs->srs_type & SRST_BW_CONTROL)) { 1214*da14cebeSEric Cheng softring = srs->srs_soft_ring_head; 1215*da14cebeSEric Cheng while (softring != NULL) { 1216*da14cebeSEric Cheng softring->s_ring_type |= ST_RING_BW_CTL; 1217*da14cebeSEric Cheng softring = softring->s_ring_next; 1218*da14cebeSEric Cheng } 1219*da14cebeSEric Cheng srs->srs_type |= SRST_BW_CONTROL; 1220*da14cebeSEric Cheng srs->srs_drain_func = mac_rx_srs_drain_bw; 1221*da14cebeSEric Cheng } 1222*da14cebeSEric Cheng } 1223*da14cebeSEric Cheng done: 1224*da14cebeSEric Cheng mutex_exit(&srs->srs_bw->mac_bw_lock); 1225*da14cebeSEric Cheng mutex_exit(&srs->srs_lock); 1226*da14cebeSEric Cheng } 1227*da14cebeSEric Cheng 1228*da14cebeSEric Cheng /* Change the transmit bandwidth limit */ 1229*da14cebeSEric Cheng static void 1230*da14cebeSEric Cheng mac_tx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp) 1231*da14cebeSEric Cheng { 1232*da14cebeSEric Cheng mac_srs_tx_t *srs_tx = &srs->srs_tx; 1233*da14cebeSEric Cheng uint32_t tx_mode; 1234*da14cebeSEric Cheng mac_impl_t *mip = srs->srs_mcip->mci_mip; 1235*da14cebeSEric Cheng 1236*da14cebeSEric Cheng mutex_enter(&srs->srs_lock); 1237*da14cebeSEric Cheng mutex_enter(&srs->srs_bw->mac_bw_lock); 1238*da14cebeSEric Cheng 1239*da14cebeSEric Cheng tx_mode = srs_tx->st_mode; 1240*da14cebeSEric Cheng 1241*da14cebeSEric Cheng if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { 1242*da14cebeSEric Cheng /* Reset bandwidth limit */ 1243*da14cebeSEric Cheng if (tx_mode == SRS_TX_BW) { 1244*da14cebeSEric Cheng if (mac_tx_serialize || 1245*da14cebeSEric Cheng (mip->mi_v12n_level & MAC_VIRT_SERIALIZE)) { 1246*da14cebeSEric Cheng srs_tx->st_mode = SRS_TX_SERIALIZE; 1247*da14cebeSEric Cheng } else { 1248*da14cebeSEric Cheng srs_tx->st_mode = SRS_TX_DEFAULT; 1249*da14cebeSEric Cheng } 1250*da14cebeSEric Cheng } else if (tx_mode == SRS_TX_BW_FANOUT) { 1251*da14cebeSEric Cheng srs_tx->st_mode = SRS_TX_FANOUT; 1252*da14cebeSEric Cheng } 1253*da14cebeSEric Cheng srs->srs_type &= ~SRST_BW_CONTROL; 1254*da14cebeSEric Cheng } else { 1255*da14cebeSEric Cheng /* Set/Modify bandwidth limit */ 1256*da14cebeSEric Cheng srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw); 1257*da14cebeSEric Cheng /* 1258*da14cebeSEric Cheng * Give twice the queuing capability before 1259*da14cebeSEric Cheng * dropping packets. The unit is bytes/tick. 
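 *
 * For reference, the st_mode transitions this function performs
 * (a restatement of the surrounding code, not new behavior):
 *
 *	reset:	SRS_TX_BW	  -> SRS_TX_DEFAULT or SRS_TX_SERIALIZE
 *		SRS_TX_BW_FANOUT  -> SRS_TX_FANOUT
 *	set:	SRS_TX_DEFAULT or
 *		SRS_TX_SERIALIZE  -> SRS_TX_BW
 *		SRS_TX_FANOUT	  -> SRS_TX_BW_FANOUT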
1260*da14cebeSEric Cheng */ 1261*da14cebeSEric Cheng srs->srs_bw->mac_bw_drop_threshold = 1262*da14cebeSEric Cheng srs->srs_bw->mac_bw_limit << 1; 1263*da14cebeSEric Cheng srs->srs_type |= SRST_BW_CONTROL; 1264*da14cebeSEric Cheng if (tx_mode != SRS_TX_BW && 1265*da14cebeSEric Cheng tx_mode != SRS_TX_BW_FANOUT) { 1266*da14cebeSEric Cheng if (tx_mode == SRS_TX_SERIALIZE || 1267*da14cebeSEric Cheng tx_mode == SRS_TX_DEFAULT) { 1268*da14cebeSEric Cheng srs_tx->st_mode = SRS_TX_BW; 1269*da14cebeSEric Cheng } else if (tx_mode == SRS_TX_FANOUT) { 1270*da14cebeSEric Cheng srs_tx->st_mode = SRS_TX_BW_FANOUT; 1271*da14cebeSEric Cheng } else { 1272*da14cebeSEric Cheng ASSERT(0); 1273*da14cebeSEric Cheng } 1274*da14cebeSEric Cheng } 1275*da14cebeSEric Cheng } 1276*da14cebeSEric Cheng done: 1277*da14cebeSEric Cheng srs_tx->st_func = mac_tx_get_func(srs_tx->st_mode); 1278*da14cebeSEric Cheng mutex_exit(&srs->srs_bw->mac_bw_lock); 1279*da14cebeSEric Cheng mutex_exit(&srs->srs_lock); 1280*da14cebeSEric Cheng } 1281*da14cebeSEric Cheng 1282*da14cebeSEric Cheng /* 1283*da14cebeSEric Cheng * The uber function that deals with any update to bandwidth limits. 1284*da14cebeSEric Cheng */ 1285*da14cebeSEric Cheng void 1286*da14cebeSEric Cheng mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp) 1287*da14cebeSEric Cheng { 1288*da14cebeSEric Cheng int count; 1289*da14cebeSEric Cheng 1290*da14cebeSEric Cheng for (count = 0; count < flent->fe_rx_srs_cnt; count++) 1291*da14cebeSEric Cheng mac_rx_srs_update_bwlimit(flent->fe_rx_srs[count], mrp); 1292*da14cebeSEric Cheng mac_tx_srs_update_bwlimit(flent->fe_tx_srs, mrp); 1293*da14cebeSEric Cheng } 1294*da14cebeSEric Cheng 1295*da14cebeSEric Cheng void 1296*da14cebeSEric Cheng mac_srs_change_upcall(void *arg, mac_direct_rx_t rx_func, void *rx_arg1) 1297*da14cebeSEric Cheng { 1298*da14cebeSEric Cheng mac_soft_ring_set_t *mac_srs = arg; 1299*da14cebeSEric Cheng mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1300*da14cebeSEric Cheng mac_soft_ring_t *softring; 1301*da14cebeSEric Cheng 1302*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 1303*da14cebeSEric Cheng ASSERT((mac_srs->srs_type & SRST_TX) == 0); 1304*da14cebeSEric Cheng srs_rx->sr_func = rx_func; 1305*da14cebeSEric Cheng srs_rx->sr_arg1 = rx_arg1; 1306*da14cebeSEric Cheng 1307*da14cebeSEric Cheng softring = mac_srs->srs_soft_ring_head; 1308*da14cebeSEric Cheng while (softring != NULL) { 1309*da14cebeSEric Cheng mutex_enter(&softring->s_ring_lock); 1310*da14cebeSEric Cheng softring->s_ring_rx_func = rx_func; 1311*da14cebeSEric Cheng softring->s_ring_rx_arg1 = rx_arg1; 1312*da14cebeSEric Cheng mutex_exit(&softring->s_ring_lock); 1313*da14cebeSEric Cheng softring = softring->s_ring_next; 1314*da14cebeSEric Cheng } 1315*da14cebeSEric Cheng 1316*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 1317*da14cebeSEric Cheng } 1318*da14cebeSEric Cheng 1319*da14cebeSEric Cheng /* 1320*da14cebeSEric Cheng * When the first sub-flow is added to a link, we disable polling on the 1321*da14cebeSEric Cheng * link and also modify the entry point to mac_rx_srs_subflow_process. 1322*da14cebeSEric Cheng * (Polling is disabled because, with the subflow added, accounting 1323*da14cebeSEric Cheng * for polling needs additional logic; it is assumed that when a subflow is 1324*da14cebeSEric Cheng * added, we can take some hit as a result of disabling polling rather than 1325*da14cebeSEric Cheng * adding more complexity - if this becomes a perf. issue we need to 1326*da14cebeSEric Cheng * re-evaluate this logic.)
When the last subflow is removed, we turn 1327*da14cebeSEric Cheng * polling back on and also reset the entry point to mac_rx_srs_process. 1328*da14cebeSEric Cheng * 1329*da14cebeSEric Cheng * In the future if there are multiple SRSes, we can simply 1330*da14cebeSEric Cheng * take one and give it to the flow rather than disabling polling and 1331*da14cebeSEric Cheng * resetting the entry point. 1332*da14cebeSEric Cheng */ 1333*da14cebeSEric Cheng void 1334*da14cebeSEric Cheng mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable) 1335*da14cebeSEric Cheng { 1336*da14cebeSEric Cheng flow_entry_t *flent = mcip->mci_flent; 1337*da14cebeSEric Cheng int i; 1338*da14cebeSEric Cheng mac_impl_t *mip = mcip->mci_mip; 1339*da14cebeSEric Cheng mac_rx_func_t rx_func; 1340*da14cebeSEric Cheng uint_t rx_srs_cnt; 1341*da14cebeSEric Cheng boolean_t enable_classifier; 1342*da14cebeSEric Cheng 1343*da14cebeSEric Cheng ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 1344*da14cebeSEric Cheng 1345*da14cebeSEric Cheng enable_classifier = !FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && enable; 1346*da14cebeSEric Cheng 1347*da14cebeSEric Cheng rx_func = enable_classifier ? mac_rx_srs_subflow_process : 1348*da14cebeSEric Cheng mac_rx_srs_process; 1349*da14cebeSEric Cheng 1350*da14cebeSEric Cheng /* 1351*da14cebeSEric Cheng * If the receive function has already been configured correctly for 1352*da14cebeSEric Cheng * the current subflow configuration, do nothing. 1353*da14cebeSEric Cheng */ 1354*da14cebeSEric Cheng if (flent->fe_cb_fn == (flow_fn_t)rx_func) 1355*da14cebeSEric Cheng return; 1356*da14cebeSEric Cheng 1357*da14cebeSEric Cheng rx_srs_cnt = flent->fe_rx_srs_cnt; 1358*da14cebeSEric Cheng for (i = 0; i < rx_srs_cnt; i++) { 1359*da14cebeSEric Cheng ASSERT(flent->fe_rx_srs[i] != NULL); 1360*da14cebeSEric Cheng mac_srs_poll_state_change(flent->fe_rx_srs[i], 1361*da14cebeSEric Cheng enable_classifier, rx_func); 1362*da14cebeSEric Cheng } 1363*da14cebeSEric Cheng 1364*da14cebeSEric Cheng /* 1365*da14cebeSEric Cheng * Change the S/W classifier so that we can land in the 1366*da14cebeSEric Cheng * correct processing function with the correct argument. 1367*da14cebeSEric Cheng * If all subflows have been removed we can revert to 1368*da14cebeSEric Cheng * mac_rx_srs_process, else we need mac_rx_srs_subflow_process.
1369*da14cebeSEric Cheng */ 1370*da14cebeSEric Cheng mutex_enter(&flent->fe_lock); 1371*da14cebeSEric Cheng flent->fe_cb_fn = (flow_fn_t)rx_func; 1372*da14cebeSEric Cheng flent->fe_cb_arg1 = (void *)mip; 1373*da14cebeSEric Cheng flent->fe_cb_arg2 = flent->fe_rx_srs[0]; 1374*da14cebeSEric Cheng mutex_exit(&flent->fe_lock); 1375*da14cebeSEric Cheng } 1376*da14cebeSEric Cheng 1377*da14cebeSEric Cheng static void 1378*da14cebeSEric Cheng mac_srs_update_fanout_list(mac_soft_ring_set_t *mac_srs) 1379*da14cebeSEric Cheng { 1380*da14cebeSEric Cheng int tcp_count = 0; 1381*da14cebeSEric Cheng int udp_count = 0; 1382*da14cebeSEric Cheng int oth_count = 0; 1383*da14cebeSEric Cheng mac_soft_ring_t *softring; 1384*da14cebeSEric Cheng 1385*da14cebeSEric Cheng softring = mac_srs->srs_soft_ring_head; 1386*da14cebeSEric Cheng if (softring == NULL) { 1387*da14cebeSEric Cheng ASSERT(mac_srs->srs_soft_ring_count == 0); 1388*da14cebeSEric Cheng mac_srs->srs_tcp_ring_count = 0; 1389*da14cebeSEric Cheng mac_srs->srs_udp_ring_count = 0; 1390*da14cebeSEric Cheng mac_srs->srs_oth_ring_count = 0; 1391*da14cebeSEric Cheng return; 1392*da14cebeSEric Cheng } 1393*da14cebeSEric Cheng 1394*da14cebeSEric Cheng softring = mac_srs->srs_soft_ring_head; 1395*da14cebeSEric Cheng tcp_count = udp_count = oth_count = 0; 1396*da14cebeSEric Cheng 1397*da14cebeSEric Cheng while (softring != NULL) { 1398*da14cebeSEric Cheng if (softring->s_ring_type & ST_RING_TCP) 1399*da14cebeSEric Cheng mac_srs->srs_tcp_soft_rings[tcp_count++] = softring; 1400*da14cebeSEric Cheng else if (softring->s_ring_type & ST_RING_UDP) 1401*da14cebeSEric Cheng mac_srs->srs_udp_soft_rings[udp_count++] = softring; 1402*da14cebeSEric Cheng else 1403*da14cebeSEric Cheng mac_srs->srs_oth_soft_rings[oth_count++] = softring; 1404*da14cebeSEric Cheng softring = softring->s_ring_next; 1405*da14cebeSEric Cheng } 1406*da14cebeSEric Cheng 1407*da14cebeSEric Cheng ASSERT(mac_srs->srs_soft_ring_count == 1408*da14cebeSEric Cheng (tcp_count + udp_count + oth_count)); 1409*da14cebeSEric Cheng 1410*da14cebeSEric Cheng mac_srs->srs_tcp_ring_count = tcp_count; 1411*da14cebeSEric Cheng mac_srs->srs_udp_ring_count = udp_count; 1412*da14cebeSEric Cheng mac_srs->srs_oth_ring_count = oth_count; 1413*da14cebeSEric Cheng } 1414*da14cebeSEric Cheng 1415*da14cebeSEric Cheng void 1416*da14cebeSEric Cheng mac_srs_create_proto_softrings(int id, void *flent, uint16_t type, 1417*da14cebeSEric Cheng pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs, 1418*da14cebeSEric Cheng processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1, 1419*da14cebeSEric Cheng mac_resource_handle_t x_arg2, boolean_t set_bypass) 1420*da14cebeSEric Cheng { 1421*da14cebeSEric Cheng mac_soft_ring_t *softring; 1422*da14cebeSEric Cheng mac_rx_fifo_t mrf; 1423*da14cebeSEric Cheng 1424*da14cebeSEric Cheng bzero(&mrf, sizeof (mac_rx_fifo_t)); 1425*da14cebeSEric Cheng mrf.mrf_type = MAC_RX_FIFO; 1426*da14cebeSEric Cheng mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll; 1427*da14cebeSEric Cheng mrf.mrf_intr_enable = 1428*da14cebeSEric Cheng (mac_intr_enable_t)mac_soft_ring_intr_enable; 1429*da14cebeSEric Cheng mrf.mrf_intr_disable = 1430*da14cebeSEric Cheng (mac_intr_disable_t)mac_soft_ring_intr_disable; 1431*da14cebeSEric Cheng mrf.mrf_flow_priority = pri; 1432*da14cebeSEric Cheng 1433*da14cebeSEric Cheng softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait, 1434*da14cebeSEric Cheng (void *)flent, (type|ST_RING_TCP), pri, mcip, mac_srs, 1435*da14cebeSEric Cheng cpuid, rx_func, x_arg1, x_arg2); 
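	/*
	 * s_ring_rx_arg2 is reset here and only set for the TCP soft
	 * ring, if and when IP hands back an squeue through the
	 * mci_resource_add() upcall below.
	 */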
1436*da14cebeSEric Cheng softring->s_ring_rx_arg2 = NULL; 1437*da14cebeSEric Cheng 1438*da14cebeSEric Cheng /* 1439*da14cebeSEric Cheng * TCP and UDP support DLS bypass. In addition, TCP 1440*da14cebeSEric Cheng * squeues can also poll their corresponding soft rings. 1441*da14cebeSEric Cheng */ 1442*da14cebeSEric Cheng if (set_bypass && (mcip->mci_resource_arg != NULL)) { 1443*da14cebeSEric Cheng mac_soft_ring_dls_bypass(softring, 1444*da14cebeSEric Cheng mcip->mci_direct_rx_fn, 1445*da14cebeSEric Cheng mcip->mci_direct_rx_arg); 1446*da14cebeSEric Cheng 1447*da14cebeSEric Cheng mrf.mrf_rx_arg = softring; 1448*da14cebeSEric Cheng mrf.mrf_intr_handle = (mac_intr_handle_t)softring; 1449*da14cebeSEric Cheng 1450*da14cebeSEric Cheng /* 1451*da14cebeSEric Cheng * Make a call in IP to get a TCP squeue assigned to 1452*da14cebeSEric Cheng * this softring to maintain full CPU locality through 1453*da14cebeSEric Cheng * the stack and allow the squeue to be able to poll 1454*da14cebeSEric Cheng * the softring so the flow control can be pushed 1455*da14cebeSEric Cheng * all the way to H/W. 1456*da14cebeSEric Cheng */ 1457*da14cebeSEric Cheng softring->s_ring_rx_arg2 = 1458*da14cebeSEric Cheng mcip->mci_resource_add((void *)mcip->mci_resource_arg, 1459*da14cebeSEric Cheng (mac_resource_t *)&mrf); 1460*da14cebeSEric Cheng } 1461*da14cebeSEric Cheng 1462*da14cebeSEric Cheng /* 1463*da14cebeSEric Cheng * Non-TCP protocols don't support squeues. Hence we 1464*da14cebeSEric Cheng * don't make any ring addition callbacks for non-TCP 1465*da14cebeSEric Cheng * rings. Now create the UDP softring and allow it to 1466*da14cebeSEric Cheng * bypass the DLS layer. 1467*da14cebeSEric Cheng */ 1468*da14cebeSEric Cheng softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait, 1469*da14cebeSEric Cheng (void *)flent, (type|ST_RING_UDP), pri, mcip, mac_srs, 1470*da14cebeSEric Cheng cpuid, rx_func, x_arg1, x_arg2); 1471*da14cebeSEric Cheng softring->s_ring_rx_arg2 = NULL; 1472*da14cebeSEric Cheng 1473*da14cebeSEric Cheng if (set_bypass && (mcip->mci_resource_arg != NULL)) { 1474*da14cebeSEric Cheng mac_soft_ring_dls_bypass(softring, 1475*da14cebeSEric Cheng mcip->mci_direct_rx_fn, 1476*da14cebeSEric Cheng mcip->mci_direct_rx_arg); 1477*da14cebeSEric Cheng } 1478*da14cebeSEric Cheng 1479*da14cebeSEric Cheng /* Create the Oth softrings, which have to go through the DLS */ 1480*da14cebeSEric Cheng softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait, 1481*da14cebeSEric Cheng (void *)flent, (type|ST_RING_OTH), pri, mcip, mac_srs, 1482*da14cebeSEric Cheng cpuid, rx_func, x_arg1, x_arg2); 1483*da14cebeSEric Cheng softring->s_ring_rx_arg2 = NULL; 1484*da14cebeSEric Cheng } 1485*da14cebeSEric Cheng 1486*da14cebeSEric Cheng /* 1487*da14cebeSEric Cheng * This routine associates a CPU or a set of CPUs to process incoming 1488*da14cebeSEric Cheng * traffic from a mac client. If multiple CPUs are specified, then 1489*da14cebeSEric Cheng * that many soft rings are created, with each soft ring worker thread 1490*da14cebeSEric Cheng * bound to a CPU in the set. Each soft ring in turn will be 1491*da14cebeSEric Cheng * associated with an squeue and the squeue will be moved to the 1492*da14cebeSEric Cheng * same CPU as that of the soft ring.
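 *
 * Illustrative scenario (the numbers are an example only): re-pinning
 * a link from two CPUs to four, e.g. via the dladm "cpus" link
 * property, re-enters this path with new_fanout_cnt == 4 while only
 * two TCP/UDP/OTH soft ring triples exist; two more triples are
 * created below and all threads are rebound to the new CPU list.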
1493*da14cebeSEric Cheng */ 1494*da14cebeSEric Cheng static void 1495*da14cebeSEric Cheng mac_srs_fanout_modify(mac_client_impl_t *mcip, flow_entry_t *flent, 1496*da14cebeSEric Cheng mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1, 1497*da14cebeSEric Cheng mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs, 1498*da14cebeSEric Cheng mac_soft_ring_set_t *mac_tx_srs) 1499*da14cebeSEric Cheng { 1500*da14cebeSEric Cheng mac_soft_ring_t *softring; 1501*da14cebeSEric Cheng uint32_t soft_ring_flag = soft_ring_process_flag; 1502*da14cebeSEric Cheng processorid_t cpuid = -1; 1503*da14cebeSEric Cheng boolean_t user_specified; 1504*da14cebeSEric Cheng int i, srings_present, new_fanout_cnt; 1505*da14cebeSEric Cheng mac_cpus_t *srs_cpu; 1506*da14cebeSEric Cheng 1507*da14cebeSEric Cheng user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC; 1508*da14cebeSEric Cheng /* fanout state is REINIT. Set it back to INIT */ 1509*da14cebeSEric Cheng ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_REINIT); 1510*da14cebeSEric Cheng mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT; 1511*da14cebeSEric Cheng 1512*da14cebeSEric Cheng /* how many are present right now */ 1513*da14cebeSEric Cheng srings_present = mac_rx_srs->srs_tcp_ring_count; 1514*da14cebeSEric Cheng /* new request */ 1515*da14cebeSEric Cheng srs_cpu = &mac_rx_srs->srs_cpu; 1516*da14cebeSEric Cheng new_fanout_cnt = srs_cpu->mc_fanout_cnt; 1517*da14cebeSEric Cheng 1518*da14cebeSEric Cheng mutex_enter(&mac_rx_srs->srs_lock); 1519*da14cebeSEric Cheng if (mac_rx_srs->srs_type & SRST_BW_CONTROL) 1520*da14cebeSEric Cheng soft_ring_flag |= ST_RING_BW_CTL; 1521*da14cebeSEric Cheng mutex_exit(&mac_rx_srs->srs_lock); 1522*da14cebeSEric Cheng 1523*da14cebeSEric Cheng if (new_fanout_cnt > srings_present) { 1524*da14cebeSEric Cheng /* soft rings increased */ 1525*da14cebeSEric Cheng mutex_enter(&mac_rx_srs->srs_lock); 1526*da14cebeSEric Cheng mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP; 1527*da14cebeSEric Cheng mutex_exit(&mac_rx_srs->srs_lock); 1528*da14cebeSEric Cheng 1529*da14cebeSEric Cheng for (i = mac_rx_srs->srs_tcp_ring_count; 1530*da14cebeSEric Cheng i < new_fanout_cnt; i++) { 1531*da14cebeSEric Cheng /* 1532*da14cebeSEric Cheng * Create the protocol softrings and set the 1533*da14cebeSEric Cheng * DLS bypass where possible. 
1534*da14cebeSEric Cheng */ 1535*da14cebeSEric Cheng mac_srs_create_proto_softrings(i, 1536*da14cebeSEric Cheng (void *)flent, soft_ring_flag, 1537*da14cebeSEric Cheng mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid, 1538*da14cebeSEric Cheng rx_func, x_arg1, x_arg2, B_TRUE); 1539*da14cebeSEric Cheng } 1540*da14cebeSEric Cheng mac_srs_update_fanout_list(mac_rx_srs); 1541*da14cebeSEric Cheng } else if (new_fanout_cnt < srings_present) { 1542*da14cebeSEric Cheng /* soft rings decreased */ 1543*da14cebeSEric Cheng if (new_fanout_cnt == 1) { 1544*da14cebeSEric Cheng mutex_enter(&mac_rx_srs->srs_lock); 1545*da14cebeSEric Cheng mac_rx_srs->srs_type &= ~SRST_FANOUT_SRC_IP; 1546*da14cebeSEric Cheng ASSERT(mac_rx_srs->srs_type & SRST_FANOUT_PROTO); 1547*da14cebeSEric Cheng mutex_exit(&mac_rx_srs->srs_lock); 1548*da14cebeSEric Cheng } 1549*da14cebeSEric Cheng /* Get rid of extra soft rings */ 1550*da14cebeSEric Cheng for (i = new_fanout_cnt; 1551*da14cebeSEric Cheng i < mac_rx_srs->srs_tcp_ring_count; i++) { 1552*da14cebeSEric Cheng softring = mac_rx_srs->srs_tcp_soft_rings[i]; 1553*da14cebeSEric Cheng if (softring->s_ring_rx_arg2 != NULL) { 1554*da14cebeSEric Cheng mcip->mci_resource_remove( 1555*da14cebeSEric Cheng (void *)mcip->mci_resource_arg, 1556*da14cebeSEric Cheng softring->s_ring_rx_arg2); 1557*da14cebeSEric Cheng } 1558*da14cebeSEric Cheng mac_soft_ring_remove(mac_rx_srs, 1559*da14cebeSEric Cheng mac_rx_srs->srs_tcp_soft_rings[i]); 1560*da14cebeSEric Cheng mac_soft_ring_remove(mac_rx_srs, 1561*da14cebeSEric Cheng mac_rx_srs->srs_udp_soft_rings[i]); 1562*da14cebeSEric Cheng mac_soft_ring_remove(mac_rx_srs, 1563*da14cebeSEric Cheng mac_rx_srs->srs_oth_soft_rings[i]); 1564*da14cebeSEric Cheng } 1565*da14cebeSEric Cheng mac_srs_update_fanout_list(mac_rx_srs); 1566*da14cebeSEric Cheng } 1567*da14cebeSEric Cheng 1568*da14cebeSEric Cheng ASSERT(new_fanout_cnt == mac_rx_srs->srs_tcp_ring_count); 1569*da14cebeSEric Cheng mutex_enter(&cpu_lock); 1570*da14cebeSEric Cheng for (i = 0; i < mac_rx_srs->srs_tcp_ring_count; i++) { 1571*da14cebeSEric Cheng cpuid = srs_cpu->mc_fanout_cpus[i]; 1572*da14cebeSEric Cheng (void) mac_soft_ring_bind(mac_rx_srs->srs_udp_soft_rings[i], 1573*da14cebeSEric Cheng cpuid); 1574*da14cebeSEric Cheng (void) mac_soft_ring_bind(mac_rx_srs->srs_oth_soft_rings[i], 1575*da14cebeSEric Cheng cpuid); 1576*da14cebeSEric Cheng (void) mac_soft_ring_bind(mac_rx_srs->srs_tcp_soft_rings[i], 1577*da14cebeSEric Cheng cpuid); 1578*da14cebeSEric Cheng softring = mac_rx_srs->srs_tcp_soft_rings[i]; 1579*da14cebeSEric Cheng if (softring->s_ring_rx_arg2 != NULL) { 1580*da14cebeSEric Cheng mcip->mci_resource_bind((void *)mcip->mci_resource_arg, 1581*da14cebeSEric Cheng softring->s_ring_rx_arg2, cpuid); 1582*da14cebeSEric Cheng } 1583*da14cebeSEric Cheng } 1584*da14cebeSEric Cheng 1585*da14cebeSEric Cheng mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_workerid); 1586*da14cebeSEric Cheng mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_pollid); 1587*da14cebeSEric Cheng 1588*da14cebeSEric Cheng /* 1589*da14cebeSEric Cheng * Bind Tx srs and soft ring threads too. Let's bind tx 1590*da14cebeSEric Cheng * srs to the last cpu in the mrp list.
1591*da14cebeSEric Cheng */ 1592*da14cebeSEric Cheng if (mac_tx_srs != NULL && user_specified) { 1593*da14cebeSEric Cheng BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp); 1594*da14cebeSEric Cheng } 1595*da14cebeSEric Cheng mutex_exit(&cpu_lock); 1596*da14cebeSEric Cheng } 1597*da14cebeSEric Cheng 1598*da14cebeSEric Cheng /* 1599*da14cebeSEric Cheng * Bind SRS threads and soft rings to CPUs/create fanout list. 1600*da14cebeSEric Cheng */ 1601*da14cebeSEric Cheng void 1602*da14cebeSEric Cheng mac_srs_fanout_init(mac_client_impl_t *mcip, flow_entry_t *flent, 1603*da14cebeSEric Cheng mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1, 1604*da14cebeSEric Cheng mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs, 1605*da14cebeSEric Cheng mac_soft_ring_set_t *mac_tx_srs) 1606*da14cebeSEric Cheng { 1607*da14cebeSEric Cheng int i; 1608*da14cebeSEric Cheng processorid_t cpuid, worker_cpuid, poll_cpuid; 1609*da14cebeSEric Cheng uint32_t soft_ring_flag = soft_ring_process_flag; 1610*da14cebeSEric Cheng int soft_ring_cnt; 1611*da14cebeSEric Cheng boolean_t user_specified = B_FALSE; 1612*da14cebeSEric Cheng mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu; 1613*da14cebeSEric Cheng 1614*da14cebeSEric Cheng /* 1615*da14cebeSEric Cheng * Remove the no soft ring flag; we will adjust it 1616*da14cebeSEric Cheng * appropriately further down. 1617*da14cebeSEric Cheng */ 1618*da14cebeSEric Cheng mutex_enter(&mac_rx_srs->srs_lock); 1619*da14cebeSEric Cheng mac_rx_srs->srs_type &= ~SRST_NO_SOFT_RINGS; 1620*da14cebeSEric Cheng mutex_exit(&mac_rx_srs->srs_lock); 1621*da14cebeSEric Cheng 1622*da14cebeSEric Cheng ASSERT(mac_rx_srs->srs_soft_ring_head == NULL); 1623*da14cebeSEric Cheng 1624*da14cebeSEric Cheng if (mac_rx_srs->srs_type & SRST_BW_CONTROL) 1625*da14cebeSEric Cheng soft_ring_flag |= ST_RING_BW_CTL; 1626*da14cebeSEric Cheng 1627*da14cebeSEric Cheng ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_UNINIT); 1628*da14cebeSEric Cheng mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT; 1629*da14cebeSEric Cheng user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC; 1630*da14cebeSEric Cheng /* 1631*da14cebeSEric Cheng * Ring count can be 0 if no fanout is required and no CPUs 1632*da14cebeSEric Cheng * were specified. Leave the SRS worker and poll threads 1633*da14cebeSEric Cheng * unbound. 1634*da14cebeSEric Cheng */ 1635*da14cebeSEric Cheng ASSERT(mrp != NULL); 1636*da14cebeSEric Cheng soft_ring_cnt = srs_cpu->mc_fanout_cnt; 1637*da14cebeSEric Cheng 1638*da14cebeSEric Cheng /* Step 1: srs_cpu contains the list of CPUs the threads must bind to */ 1639*da14cebeSEric Cheng if (soft_ring_cnt > 0) { 1640*da14cebeSEric Cheng mutex_enter(&cpu_lock); 1641*da14cebeSEric Cheng for (i = 0; i < soft_ring_cnt; i++) { 1642*da14cebeSEric Cheng cpuid = srs_cpu->mc_fanout_cpus[i]; 1643*da14cebeSEric Cheng /* Create the protocol softrings */ 1644*da14cebeSEric Cheng mac_srs_create_proto_softrings(i, (void *)flent, 1645*da14cebeSEric Cheng soft_ring_flag, mac_rx_srs->srs_pri, 1646*da14cebeSEric Cheng mcip, mac_rx_srs, cpuid, rx_func, 1647*da14cebeSEric Cheng x_arg1, x_arg2, B_FALSE); 1648*da14cebeSEric Cheng } 1649*da14cebeSEric Cheng worker_cpuid = srs_cpu->mc_workerid; 1650*da14cebeSEric Cheng poll_cpuid = srs_cpu->mc_pollid; 1651*da14cebeSEric Cheng mac_srs_worker_bind(mac_rx_srs, worker_cpuid); 1652*da14cebeSEric Cheng mac_srs_poll_bind(mac_rx_srs, poll_cpuid); 1653*da14cebeSEric Cheng 1654*da14cebeSEric Cheng /* 1655*da14cebeSEric Cheng * Bind Tx srs and soft ring threads too.
1656*da14cebeSEric Cheng * Let's bind tx srs to the last cpu in 1657*da14cebeSEric Cheng * the mrp list. 1658*da14cebeSEric Cheng */ 1659*da14cebeSEric Cheng if (mac_tx_srs == NULL) { 1660*da14cebeSEric Cheng mutex_exit(&cpu_lock); 1661*da14cebeSEric Cheng goto alldone; 1662*da14cebeSEric Cheng } 1663*da14cebeSEric Cheng 1664*da14cebeSEric Cheng if (user_specified) { 1665*da14cebeSEric Cheng BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp); 1666*da14cebeSEric Cheng } 1667*da14cebeSEric Cheng mutex_exit(&cpu_lock); 1668*da14cebeSEric Cheng } else { 1669*da14cebeSEric Cheng mutex_enter(&cpu_lock); 1670*da14cebeSEric Cheng /* 1671*da14cebeSEric Cheng * For a subflow, mrp_workerid and mrp_pollid 1672*da14cebeSEric Cheng * are not set. 1673*da14cebeSEric Cheng */ 1674*da14cebeSEric Cheng mac_srs_worker_bind(mac_rx_srs, mrp->mrp_workerid); 1675*da14cebeSEric Cheng mac_srs_poll_bind(mac_rx_srs, mrp->mrp_pollid); 1676*da14cebeSEric Cheng mutex_exit(&cpu_lock); 1677*da14cebeSEric Cheng goto no_softrings; 1678*da14cebeSEric Cheng } 1679*da14cebeSEric Cheng 1680*da14cebeSEric Cheng alldone: 1681*da14cebeSEric Cheng if (soft_ring_cnt > 1) 1682*da14cebeSEric Cheng mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP; 1683*da14cebeSEric Cheng mac_srs_update_fanout_list(mac_rx_srs); 1684*da14cebeSEric Cheng mac_srs_client_poll_enable(mcip, mac_rx_srs); 1685*da14cebeSEric Cheng return; 1686*da14cebeSEric Cheng 1687*da14cebeSEric Cheng no_softrings: 1688*da14cebeSEric Cheng if (mac_rx_srs->srs_type & SRST_FANOUT_PROTO) { 1689*da14cebeSEric Cheng mutex_enter(&cpu_lock); 1690*da14cebeSEric Cheng cpuid = mac_next_bind_cpu(); 1691*da14cebeSEric Cheng /* Create the protocol softrings */ 1692*da14cebeSEric Cheng mac_srs_create_proto_softrings(0, (void *)flent, 1693*da14cebeSEric Cheng soft_ring_flag, mac_rx_srs->srs_pri, 1694*da14cebeSEric Cheng mcip, mac_rx_srs, cpuid, rx_func, 1695*da14cebeSEric Cheng x_arg1, x_arg2, B_FALSE); 1696*da14cebeSEric Cheng mutex_exit(&cpu_lock); 1697*da14cebeSEric Cheng } else { 1698*da14cebeSEric Cheng /* 1699*da14cebeSEric Cheng * This is the case when there is no fanout, which is 1700*da14cebeSEric Cheng * true for subflows. 1701*da14cebeSEric Cheng */ 1702*da14cebeSEric Cheng mac_rx_srs->srs_type |= SRST_NO_SOFT_RINGS; 1703*da14cebeSEric Cheng } 1704*da14cebeSEric Cheng mac_srs_update_fanout_list(mac_rx_srs); 1705*da14cebeSEric Cheng mac_srs_client_poll_enable(mcip, mac_rx_srs); 1706*da14cebeSEric Cheng } 1707*da14cebeSEric Cheng 1708*da14cebeSEric Cheng /* 1709*da14cebeSEric Cheng * mac_fanout_setup: 1710*da14cebeSEric Cheng * 1711*da14cebeSEric Cheng * Calls mac_srs_fanout_init() or mac_srs_fanout_modify() depending upon 1712*da14cebeSEric Cheng * whether the SRS is getting initialized or re-initialized. 1713*da14cebeSEric Cheng */ 1714*da14cebeSEric Cheng void 1715*da14cebeSEric Cheng mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, 1716*da14cebeSEric Cheng mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1, 1717*da14cebeSEric Cheng mac_resource_handle_t x_arg2) 1718*da14cebeSEric Cheng { 1719*da14cebeSEric Cheng mac_soft_ring_set_t *mac_rx_srs, *mac_tx_srs; 1720*da14cebeSEric Cheng int i, rx_srs_cnt; 1721*da14cebeSEric Cheng 1722*da14cebeSEric Cheng ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); 1723*da14cebeSEric Cheng /* 1724*da14cebeSEric Cheng * This is an aggregation port. Fanout will be set up 1725*da14cebeSEric Cheng * over the aggregation itself.
1726*da14cebeSEric Cheng */ 1727*da14cebeSEric Cheng if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) 1728*da14cebeSEric Cheng return; 1729*da14cebeSEric Cheng 1730*da14cebeSEric Cheng mac_rx_srs = flent->fe_rx_srs[0]; 1731*da14cebeSEric Cheng /* 1732*da14cebeSEric Cheng * Set up the fanout on the tx side only once, with the 1733*da14cebeSEric Cheng * first rx SRS. The CPU binding, fanout, and bandwidth 1734*da14cebeSEric Cheng * criteria are common to both RX and TX, so 1735*da14cebeSEric Cheng * initializing them alongside avoids redundant code. 1736*da14cebeSEric Cheng */ 1737*da14cebeSEric Cheng mac_tx_srs = flent->fe_tx_srs; 1738*da14cebeSEric Cheng rx_srs_cnt = flent->fe_rx_srs_cnt; 1739*da14cebeSEric Cheng 1740*da14cebeSEric Cheng /* No fanout for subflows */ 1741*da14cebeSEric Cheng if (flent->fe_type & FLOW_USER) { 1742*da14cebeSEric Cheng mac_srs_fanout_init(mcip, flent, mrp, rx_func, 1743*da14cebeSEric Cheng x_arg1, x_arg2, mac_rx_srs, mac_tx_srs); 1744*da14cebeSEric Cheng return; 1745*da14cebeSEric Cheng } 1746*da14cebeSEric Cheng 1747*da14cebeSEric Cheng mac_flow_cpu_init(flent, mrp); 1748*da14cebeSEric Cheng 1749*da14cebeSEric Cheng /* 1750*da14cebeSEric Cheng * Set up fanout for both SW (0th SRS) and HW classified 1751*da14cebeSEric Cheng * SRS (the rest of the Rx SRSs in flent). 1752*da14cebeSEric Cheng */ 1753*da14cebeSEric Cheng for (i = 0; i < rx_srs_cnt; i++) { 1754*da14cebeSEric Cheng mac_rx_srs = flent->fe_rx_srs[i]; 1755*da14cebeSEric Cheng if (i != 0) 1756*da14cebeSEric Cheng mac_tx_srs = NULL; 1757*da14cebeSEric Cheng switch (mac_rx_srs->srs_fanout_state) { 1758*da14cebeSEric Cheng case SRS_FANOUT_UNINIT: 1759*da14cebeSEric Cheng mac_srs_fanout_init(mcip, flent, mrp, rx_func, 1760*da14cebeSEric Cheng x_arg1, x_arg2, mac_rx_srs, mac_tx_srs); 1761*da14cebeSEric Cheng break; 1762*da14cebeSEric Cheng case SRS_FANOUT_INIT: 1763*da14cebeSEric Cheng break; 1764*da14cebeSEric Cheng case SRS_FANOUT_REINIT: 1765*da14cebeSEric Cheng mac_rx_srs_quiesce(mac_rx_srs, SRS_QUIESCE); 1766*da14cebeSEric Cheng mac_srs_fanout_modify(mcip, flent, mrp, rx_func, 1767*da14cebeSEric Cheng x_arg1, x_arg2, mac_rx_srs, mac_tx_srs); 1768*da14cebeSEric Cheng mac_rx_srs_restart(mac_rx_srs); 1769*da14cebeSEric Cheng break; 1770*da14cebeSEric Cheng default: 1771*da14cebeSEric Cheng VERIFY(mac_rx_srs->srs_fanout_state <= 1772*da14cebeSEric Cheng SRS_FANOUT_REINIT); 1773*da14cebeSEric Cheng break; 1774*da14cebeSEric Cheng } 1775*da14cebeSEric Cheng } 1776*da14cebeSEric Cheng } 1777*da14cebeSEric Cheng 1778*da14cebeSEric Cheng /* 1779*da14cebeSEric Cheng * mac_srs_create: 1780*da14cebeSEric Cheng * 1781*da14cebeSEric Cheng * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is 1782*da14cebeSEric Cheng * SRST_TX, an SRS for the Tx side is created. Otherwise an SRS for Rx side 1783*da14cebeSEric Cheng * processing is created. 1784*da14cebeSEric Cheng * 1785*da14cebeSEric Cheng * Details on Rx SRS: 1786*da14cebeSEric Cheng * Create an SRS and also add the necessary soft rings for TCP and 1787*da14cebeSEric Cheng * non-TCP based on the fanout type and count specified. 1788*da14cebeSEric Cheng * 1789*da14cebeSEric Cheng * mac_soft_ring_fanout, mac_srs_fanout_modify (?), 1790*da14cebeSEric Cheng * mac_soft_ring_stop_workers, mac_soft_ring_set_destroy, etc. need 1791*da14cebeSEric Cheng * to be heavily modified. 1792*da14cebeSEric Cheng * 1793*da14cebeSEric Cheng * mi_soft_ring_list_size, mi_soft_ring_size, etc. need to disappear.
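 *
 * Illustrative Rx-side invocation (this mirrors the calls made from
 * mac_srs_group_setup() further down; it is not an additional API):
 *
 *	mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type,
 *	    mac_rx_deliver, mcip, NULL, ring);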
1794*da14cebeSEric Cheng */ 1795*da14cebeSEric Cheng mac_soft_ring_set_t * 1796*da14cebeSEric Cheng mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, 1797*da14cebeSEric Cheng mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2, 1798*da14cebeSEric Cheng mac_ring_t *ring) 1799*da14cebeSEric Cheng { 1800*da14cebeSEric Cheng mac_soft_ring_set_t *mac_srs; 1801*da14cebeSEric Cheng mac_srs_rx_t *srs_rx; 1802*da14cebeSEric Cheng mac_srs_tx_t *srs_tx; 1803*da14cebeSEric Cheng mac_bw_ctl_t *mac_bw; 1804*da14cebeSEric Cheng mac_resource_props_t *mrp; 1805*da14cebeSEric Cheng boolean_t is_tx_srs = ((srs_type & SRST_TX) != 0); 1806*da14cebeSEric Cheng 1807*da14cebeSEric Cheng mac_srs = kmem_cache_alloc(mac_srs_cache, KM_SLEEP); 1808*da14cebeSEric Cheng bzero(mac_srs, sizeof (mac_soft_ring_set_t)); 1809*da14cebeSEric Cheng srs_rx = &mac_srs->srs_rx; 1810*da14cebeSEric Cheng srs_tx = &mac_srs->srs_tx; 1811*da14cebeSEric Cheng 1812*da14cebeSEric Cheng mutex_enter(&flent->fe_lock); 1813*da14cebeSEric Cheng 1814*da14cebeSEric Cheng /* 1815*da14cebeSEric Cheng * Get the bandwidth control structure from the flent. Get 1816*da14cebeSEric Cheng * rid of any residual values in the control structure for 1817*da14cebeSEric Cheng * the tx bw struct and also for the rx, if the rx srs is 1818*da14cebeSEric Cheng * the 1st one being brought up (the rx bw ctl struct may 1819*da14cebeSEric Cheng * be shared by multiple SRSs) 1820*da14cebeSEric Cheng */ 1821*da14cebeSEric Cheng if (is_tx_srs) { 1822*da14cebeSEric Cheng mac_srs->srs_bw = &flent->fe_tx_bw; 1823*da14cebeSEric Cheng bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t)); 1824*da14cebeSEric Cheng flent->fe_tx_srs = mac_srs; 1825*da14cebeSEric Cheng } else { 1826*da14cebeSEric Cheng /* 1827*da14cebeSEric Cheng * The bw counter (stored in the flent) is shared 1828*da14cebeSEric Cheng * by SRS's within an rx group. 1829*da14cebeSEric Cheng */ 1830*da14cebeSEric Cheng mac_srs->srs_bw = &flent->fe_rx_bw; 1831*da14cebeSEric Cheng /* First rx SRS, clear the bw structure */ 1832*da14cebeSEric Cheng if (flent->fe_rx_srs_cnt == 0) 1833*da14cebeSEric Cheng bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t)); 1834*da14cebeSEric Cheng ASSERT(flent->fe_rx_srs_cnt < MAX_RINGS_PER_GROUP); 1835*da14cebeSEric Cheng flent->fe_rx_srs[flent->fe_rx_srs_cnt] = mac_srs; 1836*da14cebeSEric Cheng flent->fe_rx_srs_cnt++; 1837*da14cebeSEric Cheng } 1838*da14cebeSEric Cheng mac_srs->srs_flent = flent; 1839*da14cebeSEric Cheng mutex_exit(&flent->fe_lock); 1840*da14cebeSEric Cheng 1841*da14cebeSEric Cheng mac_srs->srs_state = 0; 1842*da14cebeSEric Cheng mac_srs->srs_type = (srs_type | SRST_NO_SOFT_RINGS); 1843*da14cebeSEric Cheng mac_srs->srs_worker_cpuid = mac_srs->srs_worker_cpuid_save = -1; 1844*da14cebeSEric Cheng mac_srs->srs_poll_cpuid = mac_srs->srs_poll_cpuid_save = -1; 1845*da14cebeSEric Cheng mac_srs_fanout_list_alloc(mac_srs); 1846*da14cebeSEric Cheng 1847*da14cebeSEric Cheng /* 1848*da14cebeSEric Cheng * For a flow we use the underlying MAC client's priority range with 1849*da14cebeSEric Cheng * the priority value to find an absolute priority value. For a MAC 1850*da14cebeSEric Cheng * client we use the MAC client's maximum priority as the value. 
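 *
 * The srs_pri computed below is handed to thread_create() for the
 * SRS worker and poll threads, and the same value is passed down
 * when the soft rings are created; later priority changes flow
 * through mac_update_srs_priority() above.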
1851*da14cebeSEric Cheng */ 1852*da14cebeSEric Cheng mrp = &flent->fe_effective_props; 1853*da14cebeSEric Cheng if ((mac_srs->srs_type & SRST_FLOW) != 0) { 1854*da14cebeSEric Cheng mac_srs->srs_pri = FLOW_PRIORITY(mcip->mci_min_pri, 1855*da14cebeSEric Cheng mcip->mci_max_pri, mrp->mrp_priority); 1856*da14cebeSEric Cheng } else { 1857*da14cebeSEric Cheng mac_srs->srs_pri = mcip->mci_max_pri; 1858*da14cebeSEric Cheng } 1859*da14cebeSEric Cheng mac_srs->srs_mcip = mcip; 1860*da14cebeSEric Cheng /* 1861*da14cebeSEric Cheng * We need to insert the SRS in the global list before 1862*da14cebeSEric Cheng * binding the SRS and SR threads. Otherwise there is a 1863*da14cebeSEric Cheng * small window where the cpu reconfig callbacks 1864*da14cebeSEric Cheng * may miss the SRS in the list walk and DR could fail 1865*da14cebeSEric Cheng * as there are bound threads. 1866*da14cebeSEric Cheng */ 1867*da14cebeSEric Cheng mac_srs_add_glist(mac_srs); 1868*da14cebeSEric Cheng 1869*da14cebeSEric Cheng /* Initialize bw limit */ 1870*da14cebeSEric Cheng if ((mrp->mrp_mask & MRP_MAXBW) != 0) { 1871*da14cebeSEric Cheng mac_srs->srs_drain_func = mac_rx_srs_drain_bw; 1872*da14cebeSEric Cheng 1873*da14cebeSEric Cheng mac_bw = mac_srs->srs_bw; 1874*da14cebeSEric Cheng mutex_enter(&mac_bw->mac_bw_lock); 1875*da14cebeSEric Cheng mac_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw); 1876*da14cebeSEric Cheng 1877*da14cebeSEric Cheng /* 1878*da14cebeSEric Cheng * Give twice the queuing capability before 1879*da14cebeSEric Cheng * dropping packets. The unit is bytes/tick. 1880*da14cebeSEric Cheng */ 1881*da14cebeSEric Cheng mac_bw->mac_bw_drop_threshold = mac_bw->mac_bw_limit << 1; 1882*da14cebeSEric Cheng mutex_exit(&mac_bw->mac_bw_lock); 1883*da14cebeSEric Cheng mac_srs->srs_type |= SRST_BW_CONTROL; 1884*da14cebeSEric Cheng } else { 1885*da14cebeSEric Cheng mac_srs->srs_drain_func = mac_rx_srs_drain; 1886*da14cebeSEric Cheng } 1887*da14cebeSEric Cheng 1888*da14cebeSEric Cheng /* 1889*da14cebeSEric Cheng * We use the following policy to control Receive 1890*da14cebeSEric Cheng * Side Dynamic Polling: 1891*da14cebeSEric Cheng * 1) We switch to poll mode anytime the processing thread causes 1892*da14cebeSEric Cheng * a backlog to build up in SRS and its associated Soft Rings 1893*da14cebeSEric Cheng * (sr_poll_pkt_cnt > 0). 1894*da14cebeSEric Cheng * 2) As long as the backlog stays under the low water mark 1895*da14cebeSEric Cheng * (sr_lowat), we poll the H/W for more packets. 1896*da14cebeSEric Cheng * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low water mark, we 1897*da14cebeSEric Cheng * stay in poll mode but don't poll the H/W for more packets. 1898*da14cebeSEric Cheng * 4) Anytime in polling mode, if we poll the H/W for packets and 1899*da14cebeSEric Cheng * find nothing plus we have an existing backlog 1900*da14cebeSEric Cheng * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll 1901*da14cebeSEric Cheng * the H/W for packets anymore (let the polling thread go to sleep). 1902*da14cebeSEric Cheng * 5) Once the backlog is relieved (packets are processed) we reenable 1903*da14cebeSEric Cheng * polling (by signalling the poll thread) only when the backlog 1904*da14cebeSEric Cheng * dips below sr_poll_thres. 1905*da14cebeSEric Cheng * 6) sr_hiwat is used exclusively when we are not polling capable 1906*da14cebeSEric Cheng * and is used to decide when to drop packets so the SRS queue 1907*da14cebeSEric Cheng * length doesn't grow infinitely.
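 *
 * Example walk-through (illustrative numbers): with sr_lowat == 256
 * and sr_poll_thres == 16, a burst that builds a 300 packet backlog
 * keeps the SRS in poll mode but stops asking the H/W for more
 * (rules 2 and 3 above); once draining brings the backlog below 16,
 * the poll thread is signalled and H/W polling resumes (rule 5).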
1908*da14cebeSEric Cheng */ 1909*da14cebeSEric Cheng if (!is_tx_srs) { 1910*da14cebeSEric Cheng srs_rx->sr_hiwat = mac_soft_ring_max_q_cnt; 1911*da14cebeSEric Cheng /* Low water mark needs to be less than high water mark */ 1912*da14cebeSEric Cheng srs_rx->sr_lowat = mac_soft_ring_min_q_cnt <= 1913*da14cebeSEric Cheng mac_soft_ring_max_q_cnt ? mac_soft_ring_min_q_cnt : 1914*da14cebeSEric Cheng (mac_soft_ring_max_q_cnt >> 2); 1915*da14cebeSEric Cheng /* Poll threshold needs to be half of low water mark or less */ 1916*da14cebeSEric Cheng srs_rx->sr_poll_thres = mac_soft_ring_poll_thres <= 1917*da14cebeSEric Cheng (srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres : 1918*da14cebeSEric Cheng (srs_rx->sr_lowat >> 1); 1919*da14cebeSEric Cheng if (mac_latency_optimize) 1920*da14cebeSEric Cheng mac_srs->srs_state |= SRS_LATENCY_OPT; 1921*da14cebeSEric Cheng } 1922*da14cebeSEric Cheng 1923*da14cebeSEric Cheng mac_srs->srs_worker = thread_create(NULL, 0, 1924*da14cebeSEric Cheng mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri); 1925*da14cebeSEric Cheng 1926*da14cebeSEric Cheng if (is_tx_srs) { 1927*da14cebeSEric Cheng /* Handle everything about Tx SRS and return */ 1928*da14cebeSEric Cheng mac_srs->srs_drain_func = mac_tx_srs_drain; 1929*da14cebeSEric Cheng srs_tx->st_max_q_cnt = mac_tx_srs_max_q_cnt; 1930*da14cebeSEric Cheng srs_tx->st_hiwat = 1931*da14cebeSEric Cheng (mac_tx_srs_hiwat > mac_tx_srs_max_q_cnt) ? 1932*da14cebeSEric Cheng mac_tx_srs_max_q_cnt : mac_tx_srs_hiwat; 1933*da14cebeSEric Cheng srs_tx->st_arg1 = x_arg1; 1934*da14cebeSEric Cheng srs_tx->st_arg2 = x_arg2; 1935*da14cebeSEric Cheng return (mac_srs); 1936*da14cebeSEric Cheng } 1937*da14cebeSEric Cheng 1938*da14cebeSEric Cheng if ((srs_type & SRST_FLOW) != 0 || 1939*da14cebeSEric Cheng FLOW_TAB_EMPTY(mcip->mci_subflow_tab)) 1940*da14cebeSEric Cheng srs_rx->sr_lower_proc = mac_rx_srs_process; 1941*da14cebeSEric Cheng else 1942*da14cebeSEric Cheng srs_rx->sr_lower_proc = mac_rx_srs_subflow_process; 1943*da14cebeSEric Cheng 1944*da14cebeSEric Cheng srs_rx->sr_func = rx_func; 1945*da14cebeSEric Cheng srs_rx->sr_arg1 = x_arg1; 1946*da14cebeSEric Cheng srs_rx->sr_arg2 = x_arg2; 1947*da14cebeSEric Cheng 1948*da14cebeSEric Cheng if (ring != NULL) { 1949*da14cebeSEric Cheng /* Is the mac_srs created over the RX default group? */ 1950*da14cebeSEric Cheng if (ring->mr_gh == (mac_group_handle_t) 1951*da14cebeSEric Cheng (&mcip->mci_mip->mi_rx_groups[0])) 1952*da14cebeSEric Cheng mac_srs->srs_type |= SRST_DEFAULT_GRP; 1953*da14cebeSEric Cheng 1954*da14cebeSEric Cheng mac_srs->srs_ring = ring; 1955*da14cebeSEric Cheng ring->mr_srs = mac_srs; 1956*da14cebeSEric Cheng ring->mr_classify_type = MAC_HW_CLASSIFIER; 1957*da14cebeSEric Cheng ring->mr_flag |= MR_INCIPIENT; 1958*da14cebeSEric Cheng 1959*da14cebeSEric Cheng if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab)) 1960*da14cebeSEric Cheng mac_srs->srs_state |= SRS_POLLING_CAPAB; 1961*da14cebeSEric Cheng 1962*da14cebeSEric Cheng mac_srs->srs_poll_thr = thread_create(NULL, 0, 1963*da14cebeSEric Cheng mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN, 1964*da14cebeSEric Cheng mac_srs->srs_pri); 1965*da14cebeSEric Cheng } 1966*da14cebeSEric Cheng return (mac_srs); 1967*da14cebeSEric Cheng } 1968*da14cebeSEric Cheng 1969*da14cebeSEric Cheng /* 1970*da14cebeSEric Cheng * Figure out the number of soft rings required. It depends on
1971*da14cebeSEric Cheng * whether protocol fanout is required (for LINKs), whether global settings 1972*da14cebeSEric Cheng * require us to do fanout for performance (based on mac_soft_ring_enable), 1973*da14cebeSEric Cheng * or whether the user has specifically requested fanout. 1974*da14cebeSEric Cheng */ 1975*da14cebeSEric Cheng static uint32_t 1976*da14cebeSEric Cheng mac_find_fanout(flow_entry_t *flent, uint32_t link_type) 1977*da14cebeSEric Cheng { 1978*da14cebeSEric Cheng uint32_t fanout_type; 1979*da14cebeSEric Cheng mac_resource_props_t *mrp = &flent->fe_effective_props; 1980*da14cebeSEric Cheng 1981*da14cebeSEric Cheng /* no fanout for subflows */ 1982*da14cebeSEric Cheng switch (link_type) { 1983*da14cebeSEric Cheng case SRST_FLOW: 1984*da14cebeSEric Cheng fanout_type = SRST_NO_SOFT_RINGS; 1985*da14cebeSEric Cheng break; 1986*da14cebeSEric Cheng case SRST_LINK: 1987*da14cebeSEric Cheng fanout_type = SRST_FANOUT_PROTO; 1988*da14cebeSEric Cheng break; 1989*da14cebeSEric Cheng } 1990*da14cebeSEric Cheng 1991*da14cebeSEric Cheng /* A primary NIC/link is being plumbed */ 1992*da14cebeSEric Cheng if (flent->fe_type & FLOW_PRIMARY_MAC) { 1993*da14cebeSEric Cheng if (mac_soft_ring_enable && mac_rx_soft_ring_count > 1) { 1994*da14cebeSEric Cheng fanout_type |= SRST_FANOUT_SRC_IP; 1995*da14cebeSEric Cheng } 1996*da14cebeSEric Cheng } else if (flent->fe_type & FLOW_VNIC) { 1997*da14cebeSEric Cheng /* A VNIC is being created */ 1998*da14cebeSEric Cheng if (mrp != NULL && mrp->mrp_ncpus > 0) { 1999*da14cebeSEric Cheng fanout_type |= SRST_FANOUT_SRC_IP; 2000*da14cebeSEric Cheng } 2001*da14cebeSEric Cheng } 2002*da14cebeSEric Cheng 2003*da14cebeSEric Cheng return (fanout_type); 2004*da14cebeSEric Cheng } 2005*da14cebeSEric Cheng 2006*da14cebeSEric Cheng /* 2007*da14cebeSEric Cheng * Change a group from h/w to s/w classification. 2008*da14cebeSEric Cheng */ 2009*da14cebeSEric Cheng static void 2010*da14cebeSEric Cheng mac_rx_switch_grp_to_sw(mac_group_t *group) 2011*da14cebeSEric Cheng { 2012*da14cebeSEric Cheng mac_ring_t *ring; 2013*da14cebeSEric Cheng mac_soft_ring_set_t *mac_srs; 2014*da14cebeSEric Cheng 2015*da14cebeSEric Cheng for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { 2016*da14cebeSEric Cheng if (ring->mr_classify_type == MAC_HW_CLASSIFIER) { 2017*da14cebeSEric Cheng /* 2018*da14cebeSEric Cheng * Remove the SRS associated with the HW ring. 2019*da14cebeSEric Cheng * As a result, polling will be disabled. 2020*da14cebeSEric Cheng */ 2021*da14cebeSEric Cheng mac_srs = ring->mr_srs; 2022*da14cebeSEric Cheng ASSERT(mac_srs != NULL); 2023*da14cebeSEric Cheng mac_rx_srs_remove(mac_srs); 2024*da14cebeSEric Cheng ring->mr_srs = NULL; 2025*da14cebeSEric Cheng } 2026*da14cebeSEric Cheng 2027*da14cebeSEric Cheng if (ring->mr_state != MR_INUSE) 2028*da14cebeSEric Cheng (void) mac_start_ring(ring); 2029*da14cebeSEric Cheng /* 2030*da14cebeSEric Cheng * We need to perform SW classification 2031*da14cebeSEric Cheng * for packets landing in these rings. 2032*da14cebeSEric Cheng */ 2033*da14cebeSEric Cheng ring->mr_state = MR_INUSE; 2034*da14cebeSEric Cheng ring->mr_flag = 0; 2035*da14cebeSEric Cheng ring->mr_classify_type = MAC_SW_CLASSIFIER; 2036*da14cebeSEric Cheng } 2037*da14cebeSEric Cheng } 2038*da14cebeSEric Cheng 2039*da14cebeSEric Cheng /* 2040*da14cebeSEric Cheng * Create the Rx SRS for the S/W classifier and for each ring in the 2041*da14cebeSEric Cheng * group (if exclusive group). Also create the Tx SRS.
2042*da14cebeSEric Cheng */ 2043*da14cebeSEric Cheng void 2044*da14cebeSEric Cheng mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, 2045*da14cebeSEric Cheng mac_group_t *group, uint32_t link_type) 2046*da14cebeSEric Cheng { 2047*da14cebeSEric Cheng mac_impl_t *mip = mcip->mci_mip; 2048*da14cebeSEric Cheng mac_soft_ring_set_t *mac_srs; 2049*da14cebeSEric Cheng mac_soft_ring_set_t *tx_srs = NULL; 2050*da14cebeSEric Cheng mac_ring_t *ring; 2051*da14cebeSEric Cheng uint32_t fanout_type; 2052*da14cebeSEric Cheng boolean_t created_srs = B_FALSE; 2053*da14cebeSEric Cheng 2054*da14cebeSEric Cheng fanout_type = mac_find_fanout(flent, link_type); 2055*da14cebeSEric Cheng 2056*da14cebeSEric Cheng /* Create the SRS for S/W classification if none exists */ 2057*da14cebeSEric Cheng if (flent->fe_rx_srs[0] == NULL) { 2058*da14cebeSEric Cheng ASSERT(flent->fe_rx_srs_cnt == 0); 2059*da14cebeSEric Cheng /* Setup the Rx SRS */ 2060*da14cebeSEric Cheng mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, 2061*da14cebeSEric Cheng mac_rx_deliver, mcip, NULL, NULL); 2062*da14cebeSEric Cheng 2063*da14cebeSEric Cheng mutex_enter(&flent->fe_lock); 2064*da14cebeSEric Cheng flent->fe_cb_fn = (flow_fn_t)mac_srs->srs_rx.sr_lower_proc; 2065*da14cebeSEric Cheng flent->fe_cb_arg1 = (void *)mip; 2066*da14cebeSEric Cheng flent->fe_cb_arg2 = (void *)mac_srs; 2067*da14cebeSEric Cheng mutex_exit(&flent->fe_lock); 2068*da14cebeSEric Cheng 2069*da14cebeSEric Cheng /* Setup the Tx SRS as well */ 2070*da14cebeSEric Cheng ASSERT(flent->fe_tx_srs == NULL); 2071*da14cebeSEric Cheng tx_srs = mac_srs_create(mcip, flent, SRST_TX | link_type, 2072*da14cebeSEric Cheng NULL, mcip, NULL, NULL); 2073*da14cebeSEric Cheng 2074*da14cebeSEric Cheng if (mcip->mci_share != NULL) { 2075*da14cebeSEric Cheng mac_srs_tx_t *tx = &tx_srs->srs_tx; 2076*da14cebeSEric Cheng ASSERT(!mcip->mci_no_hwrings); 2077*da14cebeSEric Cheng /* 2078*da14cebeSEric Cheng * A share requires a dedicated TX group. 2079*da14cebeSEric Cheng * mac_reserve_tx_group() does the work needed to 2080*da14cebeSEric Cheng * allocate a new group and populate that group 2081*da14cebeSEric Cheng * with rings according to the driver requirements 2082*da14cebeSEric Cheng * and limitations. 2083*da14cebeSEric Cheng */ 2084*da14cebeSEric Cheng tx->st_group = 2085*da14cebeSEric Cheng mac_reserve_tx_group(mip, mcip->mci_share); 2086*da14cebeSEric Cheng ASSERT(tx->st_group != NULL); 2087*da14cebeSEric Cheng tx->st_group->mrg_tx_client = mcip; 2088*da14cebeSEric Cheng } 2089*da14cebeSEric Cheng mac_tx_srs_setup(mcip, flent, link_type); 2090*da14cebeSEric Cheng created_srs = B_TRUE; 2091*da14cebeSEric Cheng } 2092*da14cebeSEric Cheng 2093*da14cebeSEric Cheng if (group == NULL) { 2094*da14cebeSEric Cheng if (created_srs) { 2095*da14cebeSEric Cheng mac_fanout_setup(mcip, flent, 2096*da14cebeSEric Cheng MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, 2097*da14cebeSEric Cheng mcip, NULL); 2098*da14cebeSEric Cheng } 2099*da14cebeSEric Cheng return; 2100*da14cebeSEric Cheng } 2101*da14cebeSEric Cheng 2102*da14cebeSEric Cheng /* 2103*da14cebeSEric Cheng * fanout for default SRS is done when default SRS are created 2104*da14cebeSEric Cheng * above. As each ring is added to the group, we setup the 2105*da14cebeSEric Cheng * SRS and fanout to it. 2106*da14cebeSEric Cheng */ 2107*da14cebeSEric Cheng switch (group->mrg_state) { 2108*da14cebeSEric Cheng case MAC_GROUP_STATE_RESERVED: 2109*da14cebeSEric Cheng /* 2110*da14cebeSEric Cheng * The group is exclusively ours. 
Create a SRS 2111*da14cebeSEric Cheng * for each ring in the group and allow each 2112*da14cebeSEric Cheng * individual SRS to dynamically poll its 2113*da14cebeSEric Cheng * Rx ring. Do this only if the client is not 2114*da14cebeSEric Cheng * a VLAN MAC client, since for VLANs we do 2115*da14cebeSEric Cheng * s/w classification for the VID check. 2116*da14cebeSEric Cheng */ 2117*da14cebeSEric Cheng if (i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE) 2118*da14cebeSEric Cheng break; 2119*da14cebeSEric Cheng for (ring = group->mrg_rings; ring != NULL; 2120*da14cebeSEric Cheng ring = ring->mr_next) { 2121*da14cebeSEric Cheng switch (ring->mr_state) { 2122*da14cebeSEric Cheng case MR_INUSE: 2123*da14cebeSEric Cheng case MR_FREE: 2124*da14cebeSEric Cheng if (ring->mr_srs != NULL) 2125*da14cebeSEric Cheng break; 2126*da14cebeSEric Cheng if (ring->mr_state != MR_INUSE) 2127*da14cebeSEric Cheng (void) mac_start_ring(ring); 2128*da14cebeSEric Cheng 2129*da14cebeSEric Cheng ring->mr_state = MR_INUSE; 2130*da14cebeSEric Cheng 2131*da14cebeSEric Cheng mac_srs = mac_srs_create(mcip, flent, 2132*da14cebeSEric Cheng fanout_type | link_type, 2133*da14cebeSEric Cheng mac_rx_deliver, mcip, NULL, ring); 2134*da14cebeSEric Cheng if (mip->mi_v12n_level & MAC_VIRT_SERIALIZE) { 2135*da14cebeSEric Cheng mac_srs->srs_rx.sr_enqueue_always = 2136*da14cebeSEric Cheng B_TRUE; 2137*da14cebeSEric Cheng } 2138*da14cebeSEric Cheng break; 2139*da14cebeSEric Cheng default: 2140*da14cebeSEric Cheng cmn_err(CE_PANIC, "srs_setup: mcip = %p " 2141*da14cebeSEric Cheng "trying to add UNKNOWN ring = %p\n", 2142*da14cebeSEric Cheng (void *)mcip, (void *)ring); 2143*da14cebeSEric Cheng break; 2144*da14cebeSEric Cheng } 2145*da14cebeSEric Cheng } 2146*da14cebeSEric Cheng break; 2147*da14cebeSEric Cheng case MAC_GROUP_STATE_SHARED: 2148*da14cebeSEric Cheng /* 2149*da14cebeSEric Cheng * Set all rings of this group to software classified. 2150*da14cebeSEric Cheng * 2151*da14cebeSEric Cheng * If the group is currently RESERVED, the existing mac client 2152*da14cebeSEric Cheng * (the only client on this group) is using this group 2153*da14cebeSEric Cheng * exclusively. In that case we need to disable polling on 2154*da14cebeSEric Cheng * the rings of the group (if it was enabled), and free the 2155*da14cebeSEric Cheng * SRS associated with the rings. 2156*da14cebeSEric Cheng */ 2157*da14cebeSEric Cheng mac_rx_switch_grp_to_sw(group); 2158*da14cebeSEric Cheng break; 2159*da14cebeSEric Cheng default: 2160*da14cebeSEric Cheng ASSERT(B_FALSE); 2161*da14cebeSEric Cheng break; 2162*da14cebeSEric Cheng } 2163*da14cebeSEric Cheng mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), 2164*da14cebeSEric Cheng mac_rx_deliver, mcip, NULL); 2165*da14cebeSEric Cheng } 2166*da14cebeSEric Cheng 2167*da14cebeSEric Cheng void 2168*da14cebeSEric Cheng mac_srs_group_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, 2169*da14cebeSEric Cheng uint32_t link_type) 2170*da14cebeSEric Cheng { 2171*da14cebeSEric Cheng mac_soft_ring_set_t *mac_srs; 2172*da14cebeSEric Cheng mac_soft_ring_set_t *tx_srs; 2173*da14cebeSEric Cheng mac_srs_tx_t *tx; 2174*da14cebeSEric Cheng int i; 2175*da14cebeSEric Cheng 2176*da14cebeSEric Cheng for (i = 0; i < flent->fe_rx_srs_cnt; i++) { 2177*da14cebeSEric Cheng mac_srs = flent->fe_rx_srs[i]; 2178*da14cebeSEric Cheng mac_rx_srs_quiesce(mac_srs, SRS_CONDEMNED); 2179*da14cebeSEric Cheng /* 2180*da14cebeSEric Cheng * Deal with all fanout tear down etc.
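 * The SRS_CONDEMNED quiesce above causes the SRS worker and poll
 * threads to exit, so the mac_srs_free() below can tear down the
 * fanout soft rings without racing against them.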
2181*da14cebeSEric Cheng */ 2182*da14cebeSEric Cheng mac_srs_free(mac_srs); 2183*da14cebeSEric Cheng flent->fe_rx_srs[i] = NULL; 2184*da14cebeSEric Cheng } 2185*da14cebeSEric Cheng flent->fe_rx_srs_cnt = 0; 2186*da14cebeSEric Cheng 2187*da14cebeSEric Cheng tx_srs = flent->fe_tx_srs; 2188*da14cebeSEric Cheng tx = &tx_srs->srs_tx; 2189*da14cebeSEric Cheng switch (link_type) { 2190*da14cebeSEric Cheng case SRST_FLOW: 2191*da14cebeSEric Cheng /* 2192*da14cebeSEric Cheng * For flows, we need to work with the passed 2193*da14cebeSEric Cheng * flent to find the Rx/Tx SRS. 2194*da14cebeSEric Cheng */ 2195*da14cebeSEric Cheng mac_tx_srs_quiesce(tx_srs, SRS_CONDEMNED); 2196*da14cebeSEric Cheng break; 2197*da14cebeSEric Cheng case SRST_LINK: 2198*da14cebeSEric Cheng mac_tx_client_quiesce(mcip, SRS_CONDEMNED); 2199*da14cebeSEric Cheng /* 2200*da14cebeSEric Cheng * Release the TX resources. First the TX group, if any 2201*da14cebeSEric Cheng * was assigned to the MAC client, which will cause the 2202*da14cebeSEric Cheng * TX rings to be moved back to the pool. Then free the 2203*da14cebeSEric Cheng * rings themselves. 2204*da14cebeSEric Cheng */ 2205*da14cebeSEric Cheng if (tx->st_group != NULL) { 2206*da14cebeSEric Cheng mac_release_tx_group(tx_srs->srs_mcip->mci_mip, 2207*da14cebeSEric Cheng tx->st_group); 2208*da14cebeSEric Cheng tx->st_group = NULL; 2209*da14cebeSEric Cheng } 2210*da14cebeSEric Cheng if (tx->st_arg2 != NULL) { 2211*da14cebeSEric Cheng ASSERT(tx_srs->srs_type & SRST_TX); 2212*da14cebeSEric Cheng mac_release_tx_ring(tx->st_arg2); 2213*da14cebeSEric Cheng } 2214*da14cebeSEric Cheng break; 2215*da14cebeSEric Cheng default: 2216*da14cebeSEric Cheng ASSERT(B_FALSE); 2217*da14cebeSEric Cheng break; 2218*da14cebeSEric Cheng } 2219*da14cebeSEric Cheng mac_srs_free(tx_srs); 2220*da14cebeSEric Cheng flent->fe_tx_srs = NULL; 2221*da14cebeSEric Cheng } 2222*da14cebeSEric Cheng 2223*da14cebeSEric Cheng /* 2224*da14cebeSEric Cheng * This is the group state machine. The state of an Rx group is given by 2225*da14cebeSEric Cheng * the following table. The default group and its rings are started in 2226*da14cebeSEric Cheng * mac_start itself and the default group stays in SHARED state until 2227*da14cebeSEric Cheng * mac_stop at which time the group and rings are stopped and it 2228*da14cebeSEric Cheng * reverts to the Registered state. 2229*da14cebeSEric Cheng * 2230*da14cebeSEric Cheng * Typically this function is called on a group after adding or removing a 2231*da14cebeSEric Cheng * client from it, to find out what should be the new state of the group. 2232*da14cebeSEric Cheng * If the new state is RESERVED, then the client that owns this group 2233*da14cebeSEric Cheng * exclusively is also returned. Note that adding or removing a client from 2234*da14cebeSEric Cheng * a group could also impact the default group and the caller needs to 2235*da14cebeSEric Cheng * evaluate the effect on the default group. 2236*da14cebeSEric Cheng * 2237*da14cebeSEric Cheng * Group type # of clients mi_nactiveclients Group State 2238*da14cebeSEric Cheng * in the group 2239*da14cebeSEric Cheng * 2240*da14cebeSEric Cheng * Non-default 0 N.A. REGISTERED 2241*da14cebeSEric Cheng * Non-default 1 N.A. RESERVED 2242*da14cebeSEric Cheng * Non-default > 1 N.A. SHARED 2243*da14cebeSEric Cheng * 2244*da14cebeSEric Cheng * Default 0 N.A. SHARED 2245*da14cebeSEric Cheng * Default 1 1 RESERVED 2246*da14cebeSEric Cheng * Default 1 > 1 SHARED 2247*da14cebeSEric Cheng * Default > 1 N.A.
SHARED 2248*da14cebeSEric Cheng */ 2249*da14cebeSEric Cheng mac_group_state_t 2250*da14cebeSEric Cheng mac_rx_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip) 2251*da14cebeSEric Cheng { 2252*da14cebeSEric Cheng mac_impl_t *mip = (mac_impl_t *)grp->mrg_mh; 2253*da14cebeSEric Cheng 2254*da14cebeSEric Cheng *group_only_mcip = NULL; 2255*da14cebeSEric Cheng 2256*da14cebeSEric Cheng /* Non-default group */ 2257*da14cebeSEric Cheng 2258*da14cebeSEric Cheng if (grp != mip->mi_rx_groups) { 2259*da14cebeSEric Cheng if (MAC_RX_GROUP_NO_CLIENT(grp)) 2260*da14cebeSEric Cheng return (MAC_GROUP_STATE_REGISTERED); 2261*da14cebeSEric Cheng 2262*da14cebeSEric Cheng *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp); 2263*da14cebeSEric Cheng if (*group_only_mcip != NULL) 2264*da14cebeSEric Cheng return (MAC_GROUP_STATE_RESERVED); 2265*da14cebeSEric Cheng 2266*da14cebeSEric Cheng return (MAC_GROUP_STATE_SHARED); 2267*da14cebeSEric Cheng } 2268*da14cebeSEric Cheng 2269*da14cebeSEric Cheng /* Default group */ 2270*da14cebeSEric Cheng 2271*da14cebeSEric Cheng if (MAC_RX_GROUP_NO_CLIENT(grp) || mip->mi_nactiveclients != 1) 2272*da14cebeSEric Cheng return (MAC_GROUP_STATE_SHARED); 2273*da14cebeSEric Cheng 2274*da14cebeSEric Cheng *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp); 2275*da14cebeSEric Cheng ASSERT(*group_only_mcip != NULL); 2276*da14cebeSEric Cheng return (MAC_GROUP_STATE_RESERVED); 2277*da14cebeSEric Cheng } 2278*da14cebeSEric Cheng 2279*da14cebeSEric Cheng /* 2280*da14cebeSEric Cheng * OVERVIEW NOTES FOR DATAPATH 2281*da14cebeSEric Cheng * =========================== 2282*da14cebeSEric Cheng * 2283*da14cebeSEric Cheng * Create an SRS and setup the corresponding flow function and args. 2284*da14cebeSEric Cheng * Add a classification rule for the flow specified by 'flent' and program 2285*da14cebeSEric Cheng * the hardware classifier when applicable. 2286*da14cebeSEric Cheng * 2287*da14cebeSEric Cheng * Rx ring assignment, SRS, polling and B/W enforcement 2288*da14cebeSEric Cheng * ---------------------------------------------------- 2289*da14cebeSEric Cheng * 2290*da14cebeSEric Cheng * We try to use H/W classification on the NIC and assign traffic for a 2291*da14cebeSEric Cheng * MAC address to a particular Rx ring. There is a 1-1 mapping 2292*da14cebeSEric Cheng * between a SRS and a Rx ring. The SRS (short for soft ring set) 2293*da14cebeSEric Cheng * dynamically switches the underlying Rx ring between interrupt 2294*da14cebeSEric Cheng * and polling mode and enforces any specified B/W control. 2295*da14cebeSEric Cheng * 2296*da14cebeSEric Cheng * There is always a SRS created and tied to each H/W and S/W rule. 2297*da14cebeSEric Cheng * Whenever we create a H/W rule, we always add the same rule to the 2298*da14cebeSEric Cheng * S/W classifier and tie a SRS to it. 2299*da14cebeSEric Cheng * 2300*da14cebeSEric Cheng * In case a B/W control is specified, it is broken into bytes 2301*da14cebeSEric Cheng * per tick and as soon as the quota for a tick is exhausted, 2302*da14cebeSEric Cheng * the underlying Rx ring is forced into poll mode for the remaining 2303*da14cebeSEric Cheng * tick. The SRS poll thread only polls for bytes that are 2304*da14cebeSEric Cheng * allowed to come into the SRS. We typically let 4x the configured 2305*da14cebeSEric Cheng * B/W worth of packets come into the SRS (to prevent unnecessary 2306*da14cebeSEric Cheng * drops due to bursts) but only process the specified amount.
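 *
 * As a hypothetical worked example of the above (all numbers are
 * invented for illustration): with a configured limit of 100 Mbps
 * and hz = 100 ticks/sec, the per-tick quota comes to
 * (100000000 bits / 8) / 100 = 125000 bytes. The SRS would then let
 * up to 4 * 125000 = 500000 bytes into its queue during a tick but
 * process only 125000 bytes of it, keeping the Rx ring in poll mode
 * once the quota for that tick is exhausted.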
2307*da14cebeSEric Cheng * 2308*da14cebeSEric Cheng * A Link (primary NIC, VNIC, VLAN or aggr) can have 1 or more 2309*da14cebeSEric Cheng * Rx rings (and corresponding SRSs) assigned to it. The SRS 2310*da14cebeSEric Cheng * in turn can have softrings to do protocol level fanout or 2311*da14cebeSEric Cheng * softrings to do S/W based fanout or both. In case the NIC 2312*da14cebeSEric Cheng * has no Rx rings, we do S/W classification to the respective SRS. 2313*da14cebeSEric Cheng * The S/W classification rule is always set up and ready. This 2314*da14cebeSEric Cheng * allows the MAC layer to reassign Rx rings whenever needed 2315*da14cebeSEric Cheng * but packets still continue to flow via the default path and 2316*da14cebeSEric Cheng * get S/W classified to the correct SRS. 2317*da14cebeSEric Cheng * 2318*da14cebeSEric Cheng * In other cases where a NIC or VNIC is plumbed, our goal is to use 2319*da14cebeSEric Cheng * the H/W classifier and get two Rx rings assigned for the Link. One 2320*da14cebeSEric Cheng * for TCP and one for UDP|SCTP. The respective SRSs still do the 2321*da14cebeSEric Cheng * polling on the Rx ring. For a Link that is plumbed for IP, there 2322*da14cebeSEric Cheng * is a TCP squeue which also does polling and can control the 2323*da14cebeSEric Cheng * Rx ring directly (where the SRS is just a pass through). For 2324*da14cebeSEric Cheng * the following cases, the SRS does the polling underneath. 2325*da14cebeSEric Cheng * 1) non IP based Links (Links which are not plumbed via ifconfig) 2326*da14cebeSEric Cheng * and paths which have no IP squeues (UDP & SCTP) 2327*da14cebeSEric Cheng * 2) If B/W control is specified on the Link 2328*da14cebeSEric Cheng * 3) If S/W fanout is specified 2329*da14cebeSEric Cheng * 2330*da14cebeSEric Cheng * Note1: As of the current implementation, we try to assign only 1 Rx 2331*da14cebeSEric Cheng * ring per Link and more than 1 Rx ring for the primary Link for 2332*da14cebeSEric Cheng * H/W based fanout. We always create the following softrings per SRS: 2333*da14cebeSEric Cheng * 1) TCP softring which is polled by the TCP squeue where possible 2334*da14cebeSEric Cheng * (and also bypasses DLS) 2335*da14cebeSEric Cheng * 2) UDP/SCTP based which bypasses DLS 2336*da14cebeSEric Cheng * 3) OTH softring which goes via DLS (currently deals with IPv6 2337*da14cebeSEric Cheng * and non TCP/UDP/SCTP for IPv4 packets). 2338*da14cebeSEric Cheng * 2339*da14cebeSEric Cheng * It is necessary to create 3 softrings since the SRS has to poll 2340*da14cebeSEric Cheng * the single Rx ring underneath and enforce any link level B/W 2341*da14cebeSEric Cheng * control (we can't switch the Rx ring into poll mode just based 2342*da14cebeSEric Cheng * on the TCP squeue if the same Rx ring is sharing UDP and other 2343*da14cebeSEric Cheng * traffic as well). Once polling is done and any Link level B/W 2344*da14cebeSEric Cheng * control is applied, the packets are assigned to the respective 2345*da14cebeSEric Cheng * softring based on protocol. Since TCP has an IP based squeue 2346*da14cebeSEric Cheng * which benefits from polling, we separate TCP packets into 2347*da14cebeSEric Cheng * their own softring which can be polled by the IP squeue. We need 2348*da14cebeSEric Cheng * to separate out UDP/SCTP into the UDP softring since it can bypass 2349*da14cebeSEric Cheng * the DLS layer, which has heavy performance advantages, and we 2350*da14cebeSEric Cheng * need a softring (OTH) for the rest.
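 *
 * Below is a minimal sketch of the three-way protocol split just
 * described. The helper is purely illustrative (the real demux
 * lives in the SRS fanout routines); only the TCP / UDP+SCTP / OTH
 * split itself is taken from this comment.
 */

/*
 * Hypothetical example: map an IP protocol number to the softring
 * class described above.
 */
static int
example_softring_class(uint8_t ipproto)
{
	switch (ipproto) {
	case IPPROTO_TCP:
		return (0);	/* TCP softring, pollable by the IP squeue */
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		return (1);	/* UDP/SCTP softring, bypasses DLS */
	default:
		return (2);	/* OTH softring, goes via DLS */
	}
}

/*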
2351*da14cebeSEric Cheng * 2352*da14cebeSEric Cheng * ToDo: The 3 softrings for protocol are needed only till we can 2353*da14cebeSEric Cheng * get rid of DLS from the datapath, make IPv4 and IPv6 paths 2354*da14cebeSEric Cheng * symmetric (deal with mac_header_info for v6 and polling for 2355*da14cebeSEric Cheng * IPv4 TCP - ip_accept_tcp is IPv4 specific although squeues 2356*da14cebeSEric Cheng * are generic), and bring SAP based classification to the MAC layer. 2357*da14cebeSEric Cheng * 2358*da14cebeSEric Cheng * H/W and S/W based fanout and multiple Rx rings per Link 2359*da14cebeSEric Cheng * ------------------------------------------------------- 2360*da14cebeSEric Cheng * 2361*da14cebeSEric Cheng * In case fanout is requested (or determined automatically based 2362*da14cebeSEric Cheng * on Link speed and processor speed), we try to assign multiple 2363*da14cebeSEric Cheng * Rx rings per Link with their respective SRS. In this case 2364*da14cebeSEric Cheng * the NIC should be capable of fanning out incoming packets between 2365*da14cebeSEric Cheng * the assigned Rx rings (H/W based fanout). All the SRSs 2366*da14cebeSEric Cheng * individually switch their Rx ring between interrupt and polling 2367*da14cebeSEric Cheng * mode but share a common B/W control counter in case Link 2368*da14cebeSEric Cheng * level B/W is specified. 2369*da14cebeSEric Cheng * 2370*da14cebeSEric Cheng * If S/W based fanout is specified in lieu of H/W based fanout, 2371*da14cebeSEric Cheng * the Link SRS creates the specified number of softrings for 2372*da14cebeSEric Cheng * each protocol (TCP, UDP, OTH). Incoming packets are fanned 2373*da14cebeSEric Cheng * out to the correct softring based on their protocol and a 2374*da14cebeSEric Cheng * protocol specific hash function. 2375*da14cebeSEric Cheng * 2376*da14cebeSEric Cheng * Primary and non primary MAC clients 2377*da14cebeSEric Cheng * ----------------------------------- 2378*da14cebeSEric Cheng * 2379*da14cebeSEric Cheng * The NICs, VNICs, Vlans, and Aggrs are typically termed Links 2380*da14cebeSEric Cheng * and are a Layer 2 construct. 2381*da14cebeSEric Cheng * 2382*da14cebeSEric Cheng * Primary NIC: 2383*da14cebeSEric Cheng * The Link that owns the primary MAC address and typically 2384*da14cebeSEric Cheng * is used as the data NIC in non virtualized cases. As such 2385*da14cebeSEric Cheng * H/W resources are preferentially given to the primary NIC. As 2386*da14cebeSEric Cheng * far as code is concerned, there is no difference between the 2387*da14cebeSEric Cheng * primary NIC and VNICs. They are all treated as Links. 2388*da14cebeSEric Cheng * At the very first call to mac_unicast_add() we program the S/W 2389*da14cebeSEric Cheng * classifier for the primary MAC address, get a soft ring set 2390*da14cebeSEric Cheng * (and soft rings based on 'ip_soft_ring_cnt') 2391*da14cebeSEric Cheng * and get a Rx ring assigned and enabled for polling. 2392*da14cebeSEric Cheng * When IP gets plumbed and negotiates polling, we can 2393*da14cebeSEric Cheng * let the squeue do the polling on the TCP softring. 2394*da14cebeSEric Cheng * 2395*da14cebeSEric Cheng * VNICs: 2396*da14cebeSEric Cheng * Same as any other Link. As long as the H/W resource assignments 2397*da14cebeSEric Cheng * are equal, the data path and setup for all Links is the same. 2398*da14cebeSEric Cheng * 2399*da14cebeSEric Cheng * Flows: 2400*da14cebeSEric Cheng * Can be configured on Links. They have their own SRS and the 2401*da14cebeSEric Cheng * S/W classifier is programmed appropriately based on the flow.
2402*da14cebeSEric Cheng * The flows typically deal with layer 3 and above and 2403*da14cebeSEric Cheng * create a soft ring set specific to the flow. The receive 2404*da14cebeSEric Cheng * side function is switched from mac_rx_srs_process to 2405*da14cebeSEric Cheng * mac_rx_srs_subflow_process which first tries to assign the 2406*da14cebeSEric Cheng * packet to the appropriate flow SRS, failing which it assigns the 2407*da14cebeSEric Cheng * packet to the link SRS. This allows us to avoid the layered approach 2408*da14cebeSEric Cheng * which gets complex. 2409*da14cebeSEric Cheng * 2410*da14cebeSEric Cheng * By the time mac_datapath_setup() completes, we already have the 2411*da14cebeSEric Cheng * soft ring sets, Rx rings, soft rings, etc. figured out and both H/W 2412*da14cebeSEric Cheng * and S/W classifiers programmed. IP is not plumbed yet (and might 2413*da14cebeSEric Cheng * never be for the Virtual Machine guest OS path). When IP is plumbed 2414*da14cebeSEric Cheng * (for both NIC and VNIC), we do a capability negotiation for polling 2415*da14cebeSEric Cheng * and upcall functions etc. 2416*da14cebeSEric Cheng * 2417*da14cebeSEric Cheng * Rx ring Assignment NOTES 2418*da14cebeSEric Cheng * ------------------------ 2419*da14cebeSEric Cheng * 2420*da14cebeSEric Cheng * For NICs which have only 1 Rx ring (we treat NICs with no Rx rings 2421*da14cebeSEric Cheng * as NICs with a single default ring), we assign the only ring to 2422*da14cebeSEric Cheng * the primary Link as MAC_RX_HW_DEFAULT_RING. The primary Link SRS can do 2423*da14cebeSEric Cheng * polling on it as long as it is the only link in use and we compare 2424*da14cebeSEric Cheng * the MAC address for unicast packets before accepting an incoming 2425*da14cebeSEric Cheng * packet (there is no need for S/W classification in this case). We 2426*da14cebeSEric Cheng * disable polling on the only ring the moment a 2nd link gets created 2427*da14cebeSEric Cheng * (the polling remains enabled even though there are broadcast and 2428*da14cebeSEric Cheng * multicast flows created). 2429*da14cebeSEric Cheng * 2430*da14cebeSEric Cheng * If the NIC has more than 1 Rx ring, we assign the default ring (the 2431*da14cebeSEric Cheng * 1st ring) to deal with broadcast, multicast and traffic for other 2432*da14cebeSEric Cheng * NICs which needs S/W classification. We assign the primary mac 2433*da14cebeSEric Cheng * address to another ring by specifying a classification rule for 2434*da14cebeSEric Cheng * the primary unicast MAC address to the selected ring. The primary Link 2435*da14cebeSEric Cheng * (and its SRS) can continue to poll the assigned Rx ring at all times 2436*da14cebeSEric Cheng * independently. 2437*da14cebeSEric Cheng * 2438*da14cebeSEric Cheng * Right now we just assign MAC_RX_HW_DEFAULT_RING to note that it is 2439*da14cebeSEric Cheng * the primary NIC; later we will check to see how many Rx rings we 2440*da14cebeSEric Cheng * have and whether we can get a non-default Rx ring for the primary MAC. 2441*da14cebeSEric Cheng * 2442*da14cebeSEric Cheng * Note: In the future, if no fanout is specified, we try to assign 2 Rx 2443*da14cebeSEric Cheng * rings for the primary Link with the primary MAC address + TCP going 2444*da14cebeSEric Cheng * to one ring and primary MAC address + UDP|SCTP going to the other ring. 2445*da14cebeSEric Cheng * Any remaining traffic for the primary MAC address can go to the default 2446*da14cebeSEric Cheng * Rx ring and get S/W classified.
This way the respective SRSs don't 2447*da14cebeSEric Cheng * need to do proto fanout and don't need to have softrings at all and 2448*da14cebeSEric Cheng * can poll their respective Rx rings. 2449*da14cebeSEric Cheng * 2450*da14cebeSEric Cheng * As an optimization, when a new NIC or VNIC is created, we can get 2451*da14cebeSEric Cheng * only one Rx ring and make it a TCP specific Rx ring and use the 2452*da14cebeSEric Cheng * H/W default Rx ring for the rest (this Rx ring is never polled). 2453*da14cebeSEric Cheng */ 2454*da14cebeSEric Cheng int 2455*da14cebeSEric Cheng mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, 2456*da14cebeSEric Cheng uint32_t link_type) 2457*da14cebeSEric Cheng { 2458*da14cebeSEric Cheng mac_impl_t *mip = mcip->mci_mip; 2459*da14cebeSEric Cheng mac_group_t *group = NULL; 2460*da14cebeSEric Cheng mac_group_t *default_group; 2461*da14cebeSEric Cheng int err; 2462*da14cebeSEric Cheng uint8_t *mac_addr; 2463*da14cebeSEric Cheng mac_rx_group_reserve_type_t rtype = MAC_RX_RESERVE_NONDEFAULT; 2464*da14cebeSEric Cheng mac_group_state_t next_state; 2465*da14cebeSEric Cheng mac_client_impl_t *group_only_mcip; 2466*da14cebeSEric Cheng 2467*da14cebeSEric Cheng ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 2468*da14cebeSEric Cheng 2469*da14cebeSEric Cheng switch (link_type) { 2470*da14cebeSEric Cheng case SRST_FLOW: 2471*da14cebeSEric Cheng mac_srs_group_setup(mcip, flent, NULL, link_type); 2472*da14cebeSEric Cheng return (0); 2473*da14cebeSEric Cheng 2474*da14cebeSEric Cheng case SRST_LINK: 2475*da14cebeSEric Cheng mac_addr = flent->fe_flow_desc.fd_dst_mac; 2476*da14cebeSEric Cheng 2477*da14cebeSEric Cheng /* Check if we need to reserve the default group */ 2478*da14cebeSEric Cheng if (flent->fe_type & FLOW_PRIMARY_MAC) 2479*da14cebeSEric Cheng rtype = MAC_RX_RESERVE_DEFAULT; 2480*da14cebeSEric Cheng 2481*da14cebeSEric Cheng if (!mcip->mci_no_hwrings) { 2482*da14cebeSEric Cheng /* 2483*da14cebeSEric Cheng * Check to see if we can get an exclusive group for 2484*da14cebeSEric Cheng * this mac address or if there already exists a 2485*da14cebeSEric Cheng * group that has this mac address (case of VLANs). 2486*da14cebeSEric Cheng * If no groups are available, use the default group. 2487*da14cebeSEric Cheng */ 2488*da14cebeSEric Cheng group = mac_reserve_rx_group(mcip, mac_addr, rtype); 2489*da14cebeSEric Cheng } 2490*da14cebeSEric Cheng 2491*da14cebeSEric Cheng if (group == NULL) { 2492*da14cebeSEric Cheng if (mcip->mci_req_hwrings) 2493*da14cebeSEric Cheng return (ENOSPC); 2494*da14cebeSEric Cheng group = &mip->mi_rx_groups[0]; 2495*da14cebeSEric Cheng } 2496*da14cebeSEric Cheng 2497*da14cebeSEric Cheng /* 2498*da14cebeSEric Cheng * Some NICs don't support any Rx rings, so there may not 2499*da14cebeSEric Cheng * even be a default group. 2500*da14cebeSEric Cheng */ 2501*da14cebeSEric Cheng if (group != NULL) { 2502*da14cebeSEric Cheng flent->fe_rx_ring_group = group; 2503*da14cebeSEric Cheng /* 2504*da14cebeSEric Cheng * Add the client to the group. This could cause 2505*da14cebeSEric Cheng * either this group to move to the shared state or 2506*da14cebeSEric Cheng * cause the default group to move to the shared state. 2507*da14cebeSEric Cheng * The actions on this group are done here, while the 2508*da14cebeSEric Cheng * actions on the default group are postponed to 2509*da14cebeSEric Cheng * the end of this function. 
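 *
 * As a concrete example: if this client lands on a non-default
 * group that was RESERVED by an existing client, that group
 * transitions to SHARED right here. Conversely, if this client's
 * arrival means the client holding the default group RESERVED is
 * no longer the only active client, it is the default group that
 * must transition to SHARED, and that is handled at the end of
 * this function.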
2510*da14cebeSEric Cheng */ 2511*da14cebeSEric Cheng mac_rx_group_add_client(group, mcip); 2512*da14cebeSEric Cheng next_state = mac_rx_group_next_state(group, 2513*da14cebeSEric Cheng &group_only_mcip); 2514*da14cebeSEric Cheng 2515*da14cebeSEric Cheng ASSERT((next_state == MAC_GROUP_STATE_RESERVED && 2516*da14cebeSEric Cheng mcip == group_only_mcip) || 2517*da14cebeSEric Cheng (next_state == MAC_GROUP_STATE_SHARED && 2518*da14cebeSEric Cheng group_only_mcip == NULL)); 2519*da14cebeSEric Cheng 2520*da14cebeSEric Cheng mac_set_rx_group_state(group, next_state); 2521*da14cebeSEric Cheng } 2522*da14cebeSEric Cheng 2523*da14cebeSEric Cheng /* 2524*da14cebeSEric Cheng * Setup the Rx and Tx SRSes. If we got a pristine group 2525*da14cebeSEric Cheng * exclusively above, mac_srs_group_setup would simply create 2526*da14cebeSEric Cheng * the required SRSes. If we ended up sharing a previously 2527*da14cebeSEric Cheng * reserved group, mac_srs_group_setup would also dismantle the 2528*da14cebeSEric Cheng * SRSes of the previously exclusive group. 2529*da14cebeSEric Cheng */ 2530*da14cebeSEric Cheng mac_srs_group_setup(mcip, flent, group, link_type); 2531*da14cebeSEric Cheng 2532*da14cebeSEric Cheng /* Program the S/W Classifier */ 2533*da14cebeSEric Cheng if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0) 2534*da14cebeSEric Cheng goto setup_failed; 2535*da14cebeSEric Cheng 2536*da14cebeSEric Cheng /* Program the H/W Classifier */ 2537*da14cebeSEric Cheng if ((err = mac_add_macaddr(mip, group, mac_addr)) != 0) 2538*da14cebeSEric Cheng goto setup_failed; 2539*da14cebeSEric Cheng mcip->mci_unicast = mac_find_macaddr(mip, mac_addr); 2540*da14cebeSEric Cheng ASSERT(mcip->mci_unicast != NULL); 2541*da14cebeSEric Cheng break; 2542*da14cebeSEric Cheng 2543*da14cebeSEric Cheng default: 2544*da14cebeSEric Cheng ASSERT(B_FALSE); 2545*da14cebeSEric Cheng break; 2546*da14cebeSEric Cheng } 2547*da14cebeSEric Cheng 2548*da14cebeSEric Cheng /* 2549*da14cebeSEric Cheng * All broadcast and multicast traffic is received only on the default 2550*da14cebeSEric Cheng * group. If we have set up the datapath for a non-default group above, 2551*da14cebeSEric Cheng * then move the default group to shared state to allow distribution of 2552*da14cebeSEric Cheng * incoming broadcast traffic to the other groups and dismantle the 2553*da14cebeSEric Cheng * SRSes over the default group.
2554*da14cebeSEric Cheng */ 2555*da14cebeSEric Cheng if (group != NULL) { 2556*da14cebeSEric Cheng if (group != mip->mi_rx_groups) { 2557*da14cebeSEric Cheng default_group = mip->mi_rx_groups; 2558*da14cebeSEric Cheng if (default_group->mrg_state == 2559*da14cebeSEric Cheng MAC_GROUP_STATE_RESERVED) { 2560*da14cebeSEric Cheng group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT( 2561*da14cebeSEric Cheng default_group); 2562*da14cebeSEric Cheng ASSERT(group_only_mcip != NULL && 2563*da14cebeSEric Cheng mip->mi_nactiveclients > 1); 2564*da14cebeSEric Cheng 2565*da14cebeSEric Cheng mac_set_rx_group_state(default_group, 2566*da14cebeSEric Cheng MAC_GROUP_STATE_SHARED); 2567*da14cebeSEric Cheng mac_srs_group_setup(group_only_mcip, 2568*da14cebeSEric Cheng group_only_mcip->mci_flent, 2569*da14cebeSEric Cheng default_group, SRST_LINK); 2570*da14cebeSEric Cheng } 2571*da14cebeSEric Cheng ASSERT(default_group->mrg_state == 2572*da14cebeSEric Cheng MAC_GROUP_STATE_SHARED); 2573*da14cebeSEric Cheng } 2574*da14cebeSEric Cheng /* 2575*da14cebeSEric Cheng * If we get an exclusive group for a VLAN MAC client we 2576*da14cebeSEric Cheng * need to take the s/w path to make the additional check for 2577*da14cebeSEric Cheng * the vid. Disable polling and set it to s/w classification. 2578*da14cebeSEric Cheng */ 2579*da14cebeSEric Cheng if (group->mrg_state == MAC_GROUP_STATE_RESERVED && 2580*da14cebeSEric Cheng i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE) { 2581*da14cebeSEric Cheng mac_rx_switch_grp_to_sw(group); 2582*da14cebeSEric Cheng } 2583*da14cebeSEric Cheng } 2584*da14cebeSEric Cheng return (0); 2585*da14cebeSEric Cheng 2586*da14cebeSEric Cheng setup_failed: 2587*da14cebeSEric Cheng mac_datapath_teardown(mcip, flent, link_type); 2588*da14cebeSEric Cheng return (err); 2589*da14cebeSEric Cheng } 2590*da14cebeSEric Cheng 2591*da14cebeSEric Cheng void 2592*da14cebeSEric Cheng mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, 2593*da14cebeSEric Cheng uint32_t link_type) 2594*da14cebeSEric Cheng { 2595*da14cebeSEric Cheng mac_impl_t *mip = mcip->mci_mip; 2596*da14cebeSEric Cheng mac_group_t *group = NULL; 2597*da14cebeSEric Cheng mac_client_impl_t *grp_only_mcip; 2598*da14cebeSEric Cheng flow_entry_t *group_only_flent; 2599*da14cebeSEric Cheng mac_group_t *default_group; 2600*da14cebeSEric Cheng boolean_t check_default_group = B_FALSE; 2601*da14cebeSEric Cheng mac_group_state_t next_state; 2602*da14cebeSEric Cheng 2603*da14cebeSEric Cheng ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); 2604*da14cebeSEric Cheng 2605*da14cebeSEric Cheng switch (link_type) { 2606*da14cebeSEric Cheng case SRST_FLOW: 2607*da14cebeSEric Cheng mac_srs_group_teardown(mcip, flent, SRST_FLOW); 2608*da14cebeSEric Cheng return; 2609*da14cebeSEric Cheng 2610*da14cebeSEric Cheng case SRST_LINK: 2611*da14cebeSEric Cheng /* Stop sending packets */ 2612*da14cebeSEric Cheng mac_tx_client_block(mcip); 2613*da14cebeSEric Cheng 2614*da14cebeSEric Cheng /* Stop the packets coming from the H/W */ 2615*da14cebeSEric Cheng if (mcip->mci_unicast != NULL) { 2616*da14cebeSEric Cheng int err; 2617*da14cebeSEric Cheng err = mac_remove_macaddr(mcip->mci_unicast); 2618*da14cebeSEric Cheng if (err != 0) { 2619*da14cebeSEric Cheng cmn_err(CE_WARN, "%s: failed to remove a MAC" 2620*da14cebeSEric Cheng " address because of error 0x%x", 2621*da14cebeSEric Cheng mip->mi_name, err); 2622*da14cebeSEric Cheng } 2623*da14cebeSEric Cheng mcip->mci_unicast = NULL; 2624*da14cebeSEric Cheng } 2625*da14cebeSEric Cheng 2626*da14cebeSEric Cheng /* Stop the packets 
coming from the S/W classifier */ 2627*da14cebeSEric Cheng mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE); 2628*da14cebeSEric Cheng mac_flow_wait(flent, FLOW_DRIVER_UPCALL); 2629*da14cebeSEric Cheng 2630*da14cebeSEric Cheng /* Now quiesce and destroy all SRS and soft rings */ 2631*da14cebeSEric Cheng mac_srs_group_teardown(mcip, flent, SRST_LINK); 2632*da14cebeSEric Cheng ASSERT((mcip->mci_flent == flent) && 2633*da14cebeSEric Cheng (flent->fe_next == NULL)); 2634*da14cebeSEric Cheng 2635*da14cebeSEric Cheng /* 2636*da14cebeSEric Cheng * Release our hold on the group as well. We need 2637*da14cebeSEric Cheng * to check if the shared group has only one client 2638*da14cebeSEric Cheng * left who can use it exclusively. Also, if we 2639*da14cebeSEric Cheng * were the last client, release the group. 2640*da14cebeSEric Cheng */ 2641*da14cebeSEric Cheng group = flent->fe_rx_ring_group; 2642*da14cebeSEric Cheng if (group != NULL) { 2643*da14cebeSEric Cheng mac_rx_group_remove_client(group, mcip); 2644*da14cebeSEric Cheng next_state = mac_rx_group_next_state(group, 2645*da14cebeSEric Cheng &grp_only_mcip); 2646*da14cebeSEric Cheng if (next_state == MAC_GROUP_STATE_RESERVED) { 2647*da14cebeSEric Cheng /* 2648*da14cebeSEric Cheng * Only one client left on this RX group. 2649*da14cebeSEric Cheng */ 2650*da14cebeSEric Cheng ASSERT(grp_only_mcip != NULL); 2651*da14cebeSEric Cheng mac_set_rx_group_state(group, 2652*da14cebeSEric Cheng MAC_GROUP_STATE_RESERVED); 2653*da14cebeSEric Cheng group_only_flent = grp_only_mcip->mci_flent; 2654*da14cebeSEric Cheng 2655*da14cebeSEric Cheng /* 2656*da14cebeSEric Cheng * The only remaining client has exclusive 2657*da14cebeSEric Cheng * access on the group. Allow it to 2658*da14cebeSEric Cheng * dynamically poll the H/W rings etc. 2659*da14cebeSEric Cheng */ 2660*da14cebeSEric Cheng mac_srs_group_setup(grp_only_mcip, 2661*da14cebeSEric Cheng group_only_flent, group, SRST_LINK); 2662*da14cebeSEric Cheng mac_rx_group_unmark(group, MR_INCIPIENT); 2663*da14cebeSEric Cheng } else if (next_state == MAC_GROUP_STATE_REGISTERED) { 2664*da14cebeSEric Cheng /* 2665*da14cebeSEric Cheng * This is a non-default group being freed up. 2666*da14cebeSEric Cheng * We need to reevaluate the default group 2667*da14cebeSEric Cheng * to see if the primary client can get 2668*da14cebeSEric Cheng * exclusive access to the default group. 2669*da14cebeSEric Cheng */ 2670*da14cebeSEric Cheng ASSERT(group != mip->mi_rx_groups); 2671*da14cebeSEric Cheng mac_release_rx_group(mcip, group); 2672*da14cebeSEric Cheng mac_set_rx_group_state(group, 2673*da14cebeSEric Cheng MAC_GROUP_STATE_REGISTERED); 2674*da14cebeSEric Cheng check_default_group = B_TRUE; 2675*da14cebeSEric Cheng } else { 2676*da14cebeSEric Cheng ASSERT(next_state == MAC_GROUP_STATE_SHARED); 2677*da14cebeSEric Cheng mac_set_rx_group_state(group, 2678*da14cebeSEric Cheng MAC_GROUP_STATE_SHARED); 2679*da14cebeSEric Cheng mac_rx_group_unmark(group, MR_CONDEMNED); 2680*da14cebeSEric Cheng } 2681*da14cebeSEric Cheng flent->fe_rx_ring_group = NULL; 2682*da14cebeSEric Cheng } 2683*da14cebeSEric Cheng break; 2684*da14cebeSEric Cheng default: 2685*da14cebeSEric Cheng ASSERT(B_FALSE); 2686*da14cebeSEric Cheng break; 2687*da14cebeSEric Cheng } 2688*da14cebeSEric Cheng 2689*da14cebeSEric Cheng /* 2690*da14cebeSEric Cheng * The mac client using the default group gets exclusive access to the 2691*da14cebeSEric Cheng * default group if and only if it is the sole client on the entire 2692*da14cebeSEric Cheng * mip. 
If so set the group state to reserved, and set up the SRSes 2693*da14cebeSEric Cheng * over the default group. 2694*da14cebeSEric Cheng */ 2695*da14cebeSEric Cheng if (check_default_group) { 2696*da14cebeSEric Cheng default_group = mip->mi_rx_groups; 2697*da14cebeSEric Cheng ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED); 2698*da14cebeSEric Cheng next_state = mac_rx_group_next_state(default_group, 2699*da14cebeSEric Cheng &grp_only_mcip); 2700*da14cebeSEric Cheng if (next_state == MAC_GROUP_STATE_RESERVED) { 2701*da14cebeSEric Cheng ASSERT(grp_only_mcip != NULL && 2702*da14cebeSEric Cheng mip->mi_nactiveclients == 1); 2703*da14cebeSEric Cheng mac_set_rx_group_state(default_group, 2704*da14cebeSEric Cheng MAC_GROUP_STATE_RESERVED); 2705*da14cebeSEric Cheng mac_srs_group_setup(grp_only_mcip, 2706*da14cebeSEric Cheng grp_only_mcip->mci_flent, 2707*da14cebeSEric Cheng default_group, SRST_LINK); 2708*da14cebeSEric Cheng } 2709*da14cebeSEric Cheng } 2710*da14cebeSEric Cheng } 2711*da14cebeSEric Cheng 2712*da14cebeSEric Cheng /* DATAPATH TEAR DOWN ROUTINES (SRS and FANOUT teardown) */ 2713*da14cebeSEric Cheng 2714*da14cebeSEric Cheng static void 2715*da14cebeSEric Cheng mac_srs_fanout_list_free(mac_soft_ring_set_t *mac_srs) 2716*da14cebeSEric Cheng { 2717*da14cebeSEric Cheng ASSERT(mac_srs->srs_tcp_soft_rings != NULL); 2718*da14cebeSEric Cheng kmem_free(mac_srs->srs_tcp_soft_rings, 2719*da14cebeSEric Cheng sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT); 2720*da14cebeSEric Cheng mac_srs->srs_tcp_soft_rings = NULL; 2721*da14cebeSEric Cheng ASSERT(mac_srs->srs_udp_soft_rings != NULL); 2722*da14cebeSEric Cheng kmem_free(mac_srs->srs_udp_soft_rings, 2723*da14cebeSEric Cheng sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT); 2724*da14cebeSEric Cheng mac_srs->srs_udp_soft_rings = NULL; 2725*da14cebeSEric Cheng ASSERT(mac_srs->srs_oth_soft_rings != NULL); 2726*da14cebeSEric Cheng kmem_free(mac_srs->srs_oth_soft_rings, 2727*da14cebeSEric Cheng sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT); 2728*da14cebeSEric Cheng mac_srs->srs_oth_soft_rings = NULL; 2729*da14cebeSEric Cheng } 2730*da14cebeSEric Cheng 2731*da14cebeSEric Cheng /* 2732*da14cebeSEric Cheng * An RX SRS is attached to at most one mac_ring. 2733*da14cebeSEric Cheng * A TX SRS has no rings. 2734*da14cebeSEric Cheng */ 2735*da14cebeSEric Cheng static void 2736*da14cebeSEric Cheng mac_srs_ring_free(mac_soft_ring_set_t *mac_srs) 2737*da14cebeSEric Cheng { 2738*da14cebeSEric Cheng mac_client_impl_t *mcip; 2739*da14cebeSEric Cheng mac_ring_t *ring; 2740*da14cebeSEric Cheng flow_entry_t *flent; 2741*da14cebeSEric Cheng 2742*da14cebeSEric Cheng ring = mac_srs->srs_ring; 2743*da14cebeSEric Cheng if (mac_srs->srs_type & SRST_TX) { 2744*da14cebeSEric Cheng ASSERT(ring == NULL); 2745*da14cebeSEric Cheng return; 2746*da14cebeSEric Cheng } 2747*da14cebeSEric Cheng 2748*da14cebeSEric Cheng if (ring == NULL) 2749*da14cebeSEric Cheng return; 2750*da14cebeSEric Cheng 2751*da14cebeSEric Cheng /* 2752*da14cebeSEric Cheng * Broadcast flows don't have a client impl association, but they 2753*da14cebeSEric Cheng * use only soft rings. 
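 *
 * Such flows have srs_ring == NULL and thus returned early above,
 * which is why it is safe to insist on a client here.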
2754*da14cebeSEric Cheng */ 2755*da14cebeSEric Cheng flent = mac_srs->srs_flent; 2756*da14cebeSEric Cheng mcip = flent->fe_mcip; 2757*da14cebeSEric Cheng ASSERT(mcip != NULL); 2758*da14cebeSEric Cheng 2759*da14cebeSEric Cheng ring->mr_classify_type = MAC_NO_CLASSIFIER; 2760*da14cebeSEric Cheng ring->mr_srs = NULL; 2761*da14cebeSEric Cheng } 2762*da14cebeSEric Cheng 2763*da14cebeSEric Cheng /* 2764*da14cebeSEric Cheng * Physical unlink and free of the data structures happen below. This is 2765*da14cebeSEric Cheng * driven from mac_flow_destroy(), on the last refrele of a flow. 2766*da14cebeSEric Cheng * 2767*da14cebeSEric Cheng * Assumes the Rx srs is 1-1 mapped with a ring. 2768*da14cebeSEric Cheng */ 2769*da14cebeSEric Cheng void 2770*da14cebeSEric Cheng mac_srs_free(mac_soft_ring_set_t *mac_srs) 2771*da14cebeSEric Cheng { 2772*da14cebeSEric Cheng ASSERT(mac_srs->srs_mcip == NULL || 2773*da14cebeSEric Cheng MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); 2774*da14cebeSEric Cheng ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | 2775*da14cebeSEric Cheng SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); 2776*da14cebeSEric Cheng 2777*da14cebeSEric Cheng mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); 2778*da14cebeSEric Cheng mac_srs_ring_free(mac_srs); 2779*da14cebeSEric Cheng mac_srs_soft_rings_free(mac_srs, B_TRUE); 2780*da14cebeSEric Cheng mac_srs_fanout_list_free(mac_srs); 2781*da14cebeSEric Cheng 2782*da14cebeSEric Cheng mac_srs->srs_bw = NULL; 2783*da14cebeSEric Cheng kmem_cache_free(mac_srs_cache, mac_srs); 2784*da14cebeSEric Cheng } 2785*da14cebeSEric Cheng 2786*da14cebeSEric Cheng static void 2787*da14cebeSEric Cheng mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *mac_srs, uint_t s_ring_flag) 2788*da14cebeSEric Cheng { 2789*da14cebeSEric Cheng mac_soft_ring_t *softring; 2790*da14cebeSEric Cheng 2791*da14cebeSEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 2792*da14cebeSEric Cheng 2793*da14cebeSEric Cheng mac_srs_soft_rings_signal(mac_srs, s_ring_flag); 2794*da14cebeSEric Cheng if (s_ring_flag == S_RING_CONDEMNED) { 2795*da14cebeSEric Cheng while (mac_srs->srs_soft_ring_condemned_count != 2796*da14cebeSEric Cheng mac_srs->srs_soft_ring_count) 2797*da14cebeSEric Cheng cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2798*da14cebeSEric Cheng } else { 2799*da14cebeSEric Cheng while (mac_srs->srs_soft_ring_quiesced_count != 2800*da14cebeSEric Cheng mac_srs->srs_soft_ring_count) 2801*da14cebeSEric Cheng cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2802*da14cebeSEric Cheng } 2803*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 2804*da14cebeSEric Cheng 2805*da14cebeSEric Cheng for (softring = mac_srs->srs_soft_ring_head; softring != NULL; 2806*da14cebeSEric Cheng softring = softring->s_ring_next) 2807*da14cebeSEric Cheng (void) untimeout(softring->s_ring_tid); 2808*da14cebeSEric Cheng 2809*da14cebeSEric Cheng (void) untimeout(mac_srs->srs_tid); 2810*da14cebeSEric Cheng 2811*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 2812*da14cebeSEric Cheng } 2813*da14cebeSEric Cheng 2814*da14cebeSEric Cheng /* 2815*da14cebeSEric Cheng * The block comment above mac_rx_classify_flow_state_change explains the 2816*da14cebeSEric Cheng * background. At this point upcalls from the driver (both hardware classified 2817*da14cebeSEric Cheng * and software classified) have been cut off. We now need to quiesce the 2818*da14cebeSEric Cheng * SRS worker, poll, and softring threads.
The SRS worker thread serves as 2819*da14cebeSEric Cheng * the master controller. The steps involved are described in the function below. 2820*da14cebeSEric Cheng */ 2821*da14cebeSEric Cheng void 2822*da14cebeSEric Cheng mac_srs_worker_quiesce(mac_soft_ring_set_t *mac_srs) 2823*da14cebeSEric Cheng { 2824*da14cebeSEric Cheng uint_t s_ring_flag; 2825*da14cebeSEric Cheng uint_t srs_poll_wait_flag; 2826*da14cebeSEric Cheng 2827*da14cebeSEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 2828*da14cebeSEric Cheng ASSERT(mac_srs->srs_state & (SRS_CONDEMNED | SRS_QUIESCE)); 2829*da14cebeSEric Cheng 2830*da14cebeSEric Cheng if (mac_srs->srs_state & SRS_CONDEMNED) { 2831*da14cebeSEric Cheng s_ring_flag = S_RING_CONDEMNED; 2832*da14cebeSEric Cheng srs_poll_wait_flag = SRS_POLL_THR_EXITED; 2833*da14cebeSEric Cheng } else { 2834*da14cebeSEric Cheng s_ring_flag = S_RING_QUIESCE; 2835*da14cebeSEric Cheng srs_poll_wait_flag = SRS_POLL_THR_QUIESCED; 2836*da14cebeSEric Cheng } 2837*da14cebeSEric Cheng 2838*da14cebeSEric Cheng /* 2839*da14cebeSEric Cheng * In the case of an Rx SRS, wait till the poll thread is done. 2840*da14cebeSEric Cheng */ 2841*da14cebeSEric Cheng if ((mac_srs->srs_type & SRST_TX) == 0 && 2842*da14cebeSEric Cheng mac_srs->srs_poll_thr != NULL) { 2843*da14cebeSEric Cheng while (!(mac_srs->srs_state & srs_poll_wait_flag)) 2844*da14cebeSEric Cheng cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2845*da14cebeSEric Cheng 2846*da14cebeSEric Cheng /* 2847*da14cebeSEric Cheng * Turn off polling as part of the quiesce operation. 2848*da14cebeSEric Cheng */ 2849*da14cebeSEric Cheng MAC_SRS_POLLING_OFF(mac_srs); 2850*da14cebeSEric Cheng mac_srs->srs_state &= ~(SRS_POLLING | SRS_GET_PKTS); 2851*da14cebeSEric Cheng } 2852*da14cebeSEric Cheng 2853*da14cebeSEric Cheng /* 2854*da14cebeSEric Cheng * Then signal the soft ring worker threads to quiesce or quit 2855*da14cebeSEric Cheng * as needed and then wait till that happens. 2856*da14cebeSEric Cheng */ 2857*da14cebeSEric Cheng mac_srs_soft_rings_quiesce(mac_srs, s_ring_flag); 2858*da14cebeSEric Cheng 2859*da14cebeSEric Cheng if (mac_srs->srs_state & SRS_CONDEMNED) 2860*da14cebeSEric Cheng mac_srs->srs_state |= (SRS_QUIESCE_DONE | SRS_CONDEMNED_DONE); 2861*da14cebeSEric Cheng else 2862*da14cebeSEric Cheng mac_srs->srs_state |= SRS_QUIESCE_DONE; 2863*da14cebeSEric Cheng cv_signal(&mac_srs->srs_quiesce_done_cv); 2864*da14cebeSEric Cheng } 2865*da14cebeSEric Cheng 2866*da14cebeSEric Cheng /* 2867*da14cebeSEric Cheng * Signal an SRS to start a temporary quiesce, or permanent removal, or restart 2868*da14cebeSEric Cheng * a quiesced SRS by setting the appropriate flags and signaling the SRS worker 2869*da14cebeSEric Cheng * or poll thread. This function is internal to the quiescing logic and is 2870*da14cebeSEric Cheng * called internally from the SRS quiesce or flow quiesce or client quiesce 2871*da14cebeSEric Cheng * higher level functions. 2872*da14cebeSEric Cheng */ 2873*da14cebeSEric Cheng void 2874*da14cebeSEric Cheng mac_srs_signal(mac_soft_ring_set_t *mac_srs, uint_t srs_flag) 2875*da14cebeSEric Cheng { 2876*da14cebeSEric Cheng mac_ring_t *ring; 2877*da14cebeSEric Cheng 2878*da14cebeSEric Cheng ring = mac_srs->srs_ring; 2879*da14cebeSEric Cheng ASSERT(ring == NULL || ring->mr_refcnt == 0); 2880*da14cebeSEric Cheng 2881*da14cebeSEric Cheng if (srs_flag == SRS_CONDEMNED) { 2882*da14cebeSEric Cheng /* 2883*da14cebeSEric Cheng * The SRS is going away.
We need to unbind the SRS and SR 2884*da14cebeSEric Cheng * threads before removing from the global SRS list. Otherwise 2885*da14cebeSEric Cheng * there is a small window where the cpu reconfig callbacks 2886*da14cebeSEric Cheng * may miss the SRS in the list walk and DR could fail since 2887*da14cebeSEric Cheng * there are still bound threads. 2888*da14cebeSEric Cheng */ 2889*da14cebeSEric Cheng mac_srs_threads_unbind(mac_srs); 2890*da14cebeSEric Cheng mac_srs_remove_glist(mac_srs); 2891*da14cebeSEric Cheng } 2892*da14cebeSEric Cheng /* 2893*da14cebeSEric Cheng * Wakeup the SRS worker and poll threads. 2894*da14cebeSEric Cheng */ 2895*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 2896*da14cebeSEric Cheng mac_srs->srs_state |= srs_flag; 2897*da14cebeSEric Cheng cv_signal(&mac_srs->srs_async); 2898*da14cebeSEric Cheng cv_signal(&mac_srs->srs_cv); 2899*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 2900*da14cebeSEric Cheng } 2901*da14cebeSEric Cheng 2902*da14cebeSEric Cheng /* 2903*da14cebeSEric Cheng * On the Rx side, the quiescing is done bottom up. After the Rx upcalls 2904*da14cebeSEric Cheng * from the driver are done, the Rx SRS is quiesced and only then can 2905*da14cebeSEric Cheng * we signal the soft rings. Thus this function can't be called arbitrarily 2906*da14cebeSEric Cheng * without satisfying the prerequisites. On the Tx side, the threads from 2907*da14cebeSEric Cheng * the top need to be quiesced, then the Tx SRS, and only then can we signal 2908*da14cebeSEric Cheng * the Tx soft rings. 2909*da14cebeSEric Cheng */ 2910*da14cebeSEric Cheng static void 2911*da14cebeSEric Cheng mac_srs_soft_rings_signal(mac_soft_ring_set_t *mac_srs, uint_t sr_flag) 2912*da14cebeSEric Cheng { 2913*da14cebeSEric Cheng mac_soft_ring_t *softring; 2914*da14cebeSEric Cheng 2915*da14cebeSEric Cheng for (softring = mac_srs->srs_soft_ring_head; softring != NULL; 2916*da14cebeSEric Cheng softring = softring->s_ring_next) 2917*da14cebeSEric Cheng mac_soft_ring_signal(softring, sr_flag); 2918*da14cebeSEric Cheng } 2919*da14cebeSEric Cheng 2920*da14cebeSEric Cheng /* 2921*da14cebeSEric Cheng * The block comment above mac_rx_classify_flow_state_change explains the 2922*da14cebeSEric Cheng * background. At this point the SRS is quiesced and we need to restart the 2923*da14cebeSEric Cheng * SRS worker, poll, and softring threads. The SRS worker thread serves as 2924*da14cebeSEric Cheng * the master controller.
The steps involved are described in the function below. 2925*da14cebeSEric Cheng */ 2926*da14cebeSEric Cheng void 2927*da14cebeSEric Cheng mac_srs_worker_restart(mac_soft_ring_set_t *mac_srs) 2928*da14cebeSEric Cheng { 2929*da14cebeSEric Cheng boolean_t iam_rx_srs; 2930*da14cebeSEric Cheng mac_soft_ring_t *softring; 2931*da14cebeSEric Cheng 2932*da14cebeSEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 2933*da14cebeSEric Cheng if ((mac_srs->srs_type & SRST_TX) != 0) { 2934*da14cebeSEric Cheng iam_rx_srs = B_FALSE; 2935*da14cebeSEric Cheng ASSERT((mac_srs->srs_state & 2936*da14cebeSEric Cheng (SRS_POLL_THR_QUIESCED | SRS_QUIESCE_DONE | SRS_QUIESCE)) == 2937*da14cebeSEric Cheng (SRS_QUIESCE_DONE | SRS_QUIESCE)); 2938*da14cebeSEric Cheng } else { 2939*da14cebeSEric Cheng iam_rx_srs = B_TRUE; 2940*da14cebeSEric Cheng ASSERT((mac_srs->srs_state & 2941*da14cebeSEric Cheng (SRS_QUIESCE_DONE | SRS_QUIESCE)) == 2942*da14cebeSEric Cheng (SRS_QUIESCE_DONE | SRS_QUIESCE)); 2943*da14cebeSEric Cheng if (mac_srs->srs_poll_thr != NULL) { 2944*da14cebeSEric Cheng ASSERT((mac_srs->srs_state & SRS_POLL_THR_QUIESCED) == 2945*da14cebeSEric Cheng SRS_POLL_THR_QUIESCED); 2946*da14cebeSEric Cheng } 2947*da14cebeSEric Cheng } 2948*da14cebeSEric Cheng 2949*da14cebeSEric Cheng /* 2950*da14cebeSEric Cheng * Signal any quiesced soft ring workers to restart and wait for the 2951*da14cebeSEric Cheng * soft ring quiesced count to come down to zero. 2952*da14cebeSEric Cheng */ 2953*da14cebeSEric Cheng if (mac_srs->srs_soft_ring_quiesced_count != 0) { 2954*da14cebeSEric Cheng for (softring = mac_srs->srs_soft_ring_head; softring != NULL; 2955*da14cebeSEric Cheng softring = softring->s_ring_next) { 2956*da14cebeSEric Cheng if (!(softring->s_ring_state & S_RING_QUIESCE)) 2957*da14cebeSEric Cheng continue; 2958*da14cebeSEric Cheng mac_soft_ring_signal(softring, S_RING_RESTART); 2959*da14cebeSEric Cheng } 2960*da14cebeSEric Cheng while (mac_srs->srs_soft_ring_quiesced_count != 0) 2961*da14cebeSEric Cheng cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2962*da14cebeSEric Cheng } 2963*da14cebeSEric Cheng 2964*da14cebeSEric Cheng mac_srs->srs_state &= ~(SRS_QUIESCE_DONE | SRS_QUIESCE | SRS_RESTART); 2965*da14cebeSEric Cheng if (iam_rx_srs && mac_srs->srs_poll_thr != NULL) { 2966*da14cebeSEric Cheng /* 2967*da14cebeSEric Cheng * Signal the poll thread and ask it to restart. Wait till it 2968*da14cebeSEric Cheng * actually restarts and the SRS_POLL_THR_QUIESCED flag gets 2969*da14cebeSEric Cheng * cleared.
2970*da14cebeSEric Cheng */ 2971*da14cebeSEric Cheng mac_srs->srs_state |= SRS_POLL_THR_RESTART; 2972*da14cebeSEric Cheng cv_signal(&mac_srs->srs_cv); 2973*da14cebeSEric Cheng while (mac_srs->srs_state & SRS_POLL_THR_QUIESCED) 2974*da14cebeSEric Cheng cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2975*da14cebeSEric Cheng ASSERT(!(mac_srs->srs_state & SRS_POLL_THR_RESTART)); 2976*da14cebeSEric Cheng } 2977*da14cebeSEric Cheng /* Wake up any waiter waiting for the restart to complete */ 2978*da14cebeSEric Cheng mac_srs->srs_state |= SRS_RESTART_DONE; 2979*da14cebeSEric Cheng cv_signal(&mac_srs->srs_quiesce_done_cv); 2980*da14cebeSEric Cheng } 2981*da14cebeSEric Cheng 2982*da14cebeSEric Cheng static void 2983*da14cebeSEric Cheng mac_srs_worker_unbind(mac_soft_ring_set_t *mac_srs) 2984*da14cebeSEric Cheng { 2985*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 2986*da14cebeSEric Cheng if (!(mac_srs->srs_state & SRS_WORKER_BOUND)) { 2987*da14cebeSEric Cheng ASSERT(mac_srs->srs_worker_cpuid == -1); 2988*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 2989*da14cebeSEric Cheng return; 2990*da14cebeSEric Cheng } 2991*da14cebeSEric Cheng 2992*da14cebeSEric Cheng mac_srs->srs_worker_cpuid = -1; 2993*da14cebeSEric Cheng mac_srs->srs_state &= ~SRS_WORKER_BOUND; 2994*da14cebeSEric Cheng thread_affinity_clear(mac_srs->srs_worker); 2995*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 2996*da14cebeSEric Cheng } 2997*da14cebeSEric Cheng 2998*da14cebeSEric Cheng static void 2999*da14cebeSEric Cheng mac_srs_poll_unbind(mac_soft_ring_set_t *mac_srs) 3000*da14cebeSEric Cheng { 3001*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 3002*da14cebeSEric Cheng if (mac_srs->srs_poll_thr == NULL || 3003*da14cebeSEric Cheng (mac_srs->srs_state & SRS_POLL_BOUND) == 0) { 3004*da14cebeSEric Cheng ASSERT(mac_srs->srs_poll_cpuid == -1); 3005*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 3006*da14cebeSEric Cheng return; 3007*da14cebeSEric Cheng } 3008*da14cebeSEric Cheng 3009*da14cebeSEric Cheng mac_srs->srs_poll_cpuid = -1; 3010*da14cebeSEric Cheng mac_srs->srs_state &= ~SRS_POLL_BOUND; 3011*da14cebeSEric Cheng thread_affinity_clear(mac_srs->srs_poll_thr); 3012*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 3013*da14cebeSEric Cheng } 3014*da14cebeSEric Cheng 3015*da14cebeSEric Cheng static void 3016*da14cebeSEric Cheng mac_srs_threads_unbind(mac_soft_ring_set_t *mac_srs) 3017*da14cebeSEric Cheng { 3018*da14cebeSEric Cheng mac_soft_ring_t *soft_ring; 3019*da14cebeSEric Cheng 3020*da14cebeSEric Cheng ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); 3021*da14cebeSEric Cheng 3022*da14cebeSEric Cheng mutex_enter(&cpu_lock); 3023*da14cebeSEric Cheng mac_srs_worker_unbind(mac_srs); 3024*da14cebeSEric Cheng if (!(mac_srs->srs_type & SRST_TX)) 3025*da14cebeSEric Cheng mac_srs_poll_unbind(mac_srs); 3026*da14cebeSEric Cheng 3027*da14cebeSEric Cheng for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; 3028*da14cebeSEric Cheng soft_ring = soft_ring->s_ring_next) { 3029*da14cebeSEric Cheng mac_soft_ring_unbind(soft_ring); 3030*da14cebeSEric Cheng } 3031*da14cebeSEric Cheng mutex_exit(&cpu_lock); 3032*da14cebeSEric Cheng } 3033*da14cebeSEric Cheng 3034*da14cebeSEric Cheng /* 3035*da14cebeSEric Cheng * When a CPU is going away, unbind all MAC threads which are bound 3036*da14cebeSEric Cheng * to that CPU. The affinity of the thread to the CPU is saved to allow 3037*da14cebeSEric Cheng * the thread to be rebound to the CPU if it comes back online. 
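 *
 * A sketch (illustrative only, not the actual body) of how this
 * module's cpu_setup callback, mac_srs_cpu_setup(), would typically
 * dispatch these events; cpu_lock is held by the framework when
 * such callbacks run:
 *
 *	switch (what) {
 *	case CPU_OFF:
 *		mac_walk_srs_and_unbind(cpuid);
 *		break;
 *	case CPU_ON:
 *		mac_walk_srs_and_bind(cpuid);
 *		break;
 *	default:
 *		break;
 *	}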
3038*da14cebeSEric Cheng */ 3039*da14cebeSEric Cheng static void 3040*da14cebeSEric Cheng mac_walk_srs_and_unbind(int cpuid) 3041*da14cebeSEric Cheng { 3042*da14cebeSEric Cheng mac_soft_ring_set_t *mac_srs; 3043*da14cebeSEric Cheng mac_soft_ring_t *soft_ring; 3044*da14cebeSEric Cheng 3045*da14cebeSEric Cheng rw_enter(&mac_srs_g_lock, RW_READER); 3046*da14cebeSEric Cheng 3047*da14cebeSEric Cheng if ((mac_srs = mac_srs_g_list) == NULL) 3048*da14cebeSEric Cheng goto done; 3049*da14cebeSEric Cheng 3050*da14cebeSEric Cheng for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) { 3051*da14cebeSEric Cheng if (mac_srs->srs_worker_cpuid == cpuid) { 3052*da14cebeSEric Cheng mac_srs->srs_worker_cpuid_save = cpuid; 3053*da14cebeSEric Cheng mac_srs_worker_unbind(mac_srs); 3054*da14cebeSEric Cheng } 3055*da14cebeSEric Cheng 3056*da14cebeSEric Cheng if (!(mac_srs->srs_type & SRST_TX)) { 3057*da14cebeSEric Cheng if (mac_srs->srs_poll_cpuid == cpuid) { 3058*da14cebeSEric Cheng mac_srs->srs_poll_cpuid_save = cpuid; 3059*da14cebeSEric Cheng mac_srs_poll_unbind(mac_srs); 3060*da14cebeSEric Cheng } 3061*da14cebeSEric Cheng } 3062*da14cebeSEric Cheng 3063*da14cebeSEric Cheng /* Next tackle the soft rings associated with the srs */ 3064*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 3065*da14cebeSEric Cheng for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; 3066*da14cebeSEric Cheng soft_ring = soft_ring->s_ring_next) { 3067*da14cebeSEric Cheng if (soft_ring->s_ring_cpuid == cpuid) { 3068*da14cebeSEric Cheng soft_ring->s_ring_cpuid_save = cpuid; 3069*da14cebeSEric Cheng mac_soft_ring_unbind(soft_ring); 3070*da14cebeSEric Cheng } 3071*da14cebeSEric Cheng } 3072*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 3073*da14cebeSEric Cheng } 3074*da14cebeSEric Cheng done: 3075*da14cebeSEric Cheng rw_exit(&mac_srs_g_lock); 3076*da14cebeSEric Cheng } 3077*da14cebeSEric Cheng 3078*da14cebeSEric Cheng /* TX SETUP and TEARDOWN ROUTINES */ 3079*da14cebeSEric Cheng 3080*da14cebeSEric Cheng /* 3081*da14cebeSEric Cheng * XXXHIO need to make sure the two mac_tx_srs_{add,del}_ring() 3082*da14cebeSEric Cheng * handle the case where the number of rings is one. I.e. there is 3083*da14cebeSEric Cheng * a ring pointed to by mac_srs->srs_tx_arg2. 3084*da14cebeSEric Cheng */ 3085*da14cebeSEric Cheng void 3086*da14cebeSEric Cheng mac_tx_srs_add_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) 3087*da14cebeSEric Cheng { 3088*da14cebeSEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 3089*da14cebeSEric Cheng mac_soft_ring_t *soft_ring; 3090*da14cebeSEric Cheng int count = mac_srs->srs_oth_ring_count; 3091*da14cebeSEric Cheng 3092*da14cebeSEric Cheng ASSERT(mac_srs->srs_state & SRS_QUIESCE); 3093*da14cebeSEric Cheng soft_ring = mac_soft_ring_create(count, 0, NULL, 3094*da14cebeSEric Cheng (ST_RING_OTH | ST_RING_TX), maxclsyspri, mcip, mac_srs, -1, 3095*da14cebeSEric Cheng NULL, mcip, (mac_resource_handle_t)tx_ring); 3096*da14cebeSEric Cheng mac_srs->srs_oth_ring_count++; 3097*da14cebeSEric Cheng /* 3098*da14cebeSEric Cheng * put this soft ring in quiesce mode too so when we restart 3099*da14cebeSEric Cheng * all soft rings in the srs are in the same state. 
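 *
 * Hypothetical usage of the add/del pair (the surrounding calls
 * are assumptions for illustration, not lifted from this file):
 *
 *	mac_tx_srs_quiesce(tx_srs, SRS_QUIESCE);
 *	mac_tx_srs_add_ring(tx_srs, new_ring);
 *	<restart the Tx SRS and its soft rings>
 *
 * which is why the ASSERT above insists on SRS_QUIESCE and why the
 * new soft ring is immediately signalled into quiesce mode below.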
3100*da14cebeSEric Cheng */ 3101*da14cebeSEric Cheng mac_soft_ring_signal(soft_ring, S_RING_QUIESCE); 3102*da14cebeSEric Cheng } 3103*da14cebeSEric Cheng 3104*da14cebeSEric Cheng static void 3105*da14cebeSEric Cheng mac_soft_ring_remove(mac_soft_ring_set_t *mac_srs, mac_soft_ring_t *softring) 3106*da14cebeSEric Cheng { 3107*da14cebeSEric Cheng int sringcnt; 3108*da14cebeSEric Cheng 3109*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 3110*da14cebeSEric Cheng sringcnt = mac_srs->srs_soft_ring_count; 3111*da14cebeSEric Cheng ASSERT(sringcnt > 0); 3112*da14cebeSEric Cheng mac_soft_ring_signal(softring, S_RING_CONDEMNED); 3113*da14cebeSEric Cheng 3114*da14cebeSEric Cheng ASSERT(mac_srs->srs_soft_ring_condemned_count == 0); 3115*da14cebeSEric Cheng while (mac_srs->srs_soft_ring_condemned_count != 1) 3116*da14cebeSEric Cheng cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 3117*da14cebeSEric Cheng 3118*da14cebeSEric Cheng if (softring == mac_srs->srs_soft_ring_head) { 3119*da14cebeSEric Cheng mac_srs->srs_soft_ring_head = softring->s_ring_next; 3120*da14cebeSEric Cheng if (mac_srs->srs_soft_ring_head != NULL) { 3121*da14cebeSEric Cheng mac_srs->srs_soft_ring_head->s_ring_prev = NULL; 3122*da14cebeSEric Cheng } else { 3123*da14cebeSEric Cheng mac_srs->srs_soft_ring_tail = NULL; 3124*da14cebeSEric Cheng } 3125*da14cebeSEric Cheng } else { 3126*da14cebeSEric Cheng softring->s_ring_prev->s_ring_next = 3127*da14cebeSEric Cheng softring->s_ring_next; 3128*da14cebeSEric Cheng if (softring->s_ring_next != NULL) { 3129*da14cebeSEric Cheng softring->s_ring_next->s_ring_prev = 3130*da14cebeSEric Cheng softring->s_ring_prev; 3131*da14cebeSEric Cheng } else { 3132*da14cebeSEric Cheng mac_srs->srs_soft_ring_tail = 3133*da14cebeSEric Cheng softring->s_ring_prev; 3134*da14cebeSEric Cheng } 3135*da14cebeSEric Cheng } 3136*da14cebeSEric Cheng mac_srs->srs_soft_ring_count--; 3137*da14cebeSEric Cheng 3138*da14cebeSEric Cheng mac_srs->srs_soft_ring_condemned_count--; 3139*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 3140*da14cebeSEric Cheng 3141*da14cebeSEric Cheng mac_soft_ring_free(softring, B_FALSE); 3142*da14cebeSEric Cheng } 3143*da14cebeSEric Cheng 3144*da14cebeSEric Cheng void 3145*da14cebeSEric Cheng mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) 3146*da14cebeSEric Cheng { 3147*da14cebeSEric Cheng int i; 3148*da14cebeSEric Cheng mac_soft_ring_t *soft_ring, *remove_sring; 3149*da14cebeSEric Cheng 3150*da14cebeSEric Cheng mutex_enter(&mac_srs->srs_lock); 3151*da14cebeSEric Cheng for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 3152*da14cebeSEric Cheng soft_ring = mac_srs->srs_oth_soft_rings[i]; 3153*da14cebeSEric Cheng if (soft_ring->s_ring_tx_arg2 == tx_ring) 3154*da14cebeSEric Cheng break; 3155*da14cebeSEric Cheng } 3156*da14cebeSEric Cheng mutex_exit(&mac_srs->srs_lock); 3157*da14cebeSEric Cheng ASSERT(i < mac_srs->srs_oth_ring_count); 3158*da14cebeSEric Cheng remove_sring = soft_ring; 3159*da14cebeSEric Cheng mac_soft_ring_remove(mac_srs, remove_sring); 3160*da14cebeSEric Cheng mac_srs_update_fanout_list(mac_srs); 3161*da14cebeSEric Cheng } 3162*da14cebeSEric Cheng 3163*da14cebeSEric Cheng /* 3164*da14cebeSEric Cheng * mac_tx_srs_setup(): 3165*da14cebeSEric Cheng * 3166*da14cebeSEric Cheng * Used to setup Tx rings. If no free Tx ring is available, then default 3167*da14cebeSEric Cheng * Tx ring is used. 
3168*da14cebeSEric Cheng */ 3169*da14cebeSEric Cheng void 3170*da14cebeSEric Cheng mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, 3171*da14cebeSEric Cheng uint32_t srs_type) 3172*da14cebeSEric Cheng { 3173*da14cebeSEric Cheng mac_impl_t *mip = mcip->mci_mip; 3174*da14cebeSEric Cheng mac_soft_ring_set_t *tx_srs; 3175*da14cebeSEric Cheng int i, tx_ring_count = 0, tx_rings_reserved; 3176*da14cebeSEric Cheng mac_ring_handle_t *tx_ring = NULL; 3177*da14cebeSEric Cheng uint32_t soft_ring_type; 3178*da14cebeSEric Cheng mac_group_t *grp = NULL; 3179*da14cebeSEric Cheng mac_ring_t *ring; 3180*da14cebeSEric Cheng mac_srs_tx_t *tx; 3181*da14cebeSEric Cheng boolean_t serialize = B_FALSE; 3182*da14cebeSEric Cheng 3183*da14cebeSEric Cheng tx_srs = flent->fe_tx_srs; 3184*da14cebeSEric Cheng tx = &tx_srs->srs_tx; 3185*da14cebeSEric Cheng 3186*da14cebeSEric Cheng if (tx->st_group != NULL) { 3187*da14cebeSEric Cheng grp = tx->st_group; 3188*da14cebeSEric Cheng tx_ring_count = grp->mrg_cur_count; 3189*da14cebeSEric Cheng } else { 3190*da14cebeSEric Cheng tx_ring_count = mac_tx_ring_count; 3191*da14cebeSEric Cheng } 3192*da14cebeSEric Cheng 3193*da14cebeSEric Cheng if (tx_ring_count != 0) { 3194*da14cebeSEric Cheng tx_ring = kmem_zalloc(sizeof (mac_ring_handle_t) * 3195*da14cebeSEric Cheng tx_ring_count, KM_SLEEP); 3196*da14cebeSEric Cheng } 3197*da14cebeSEric Cheng 3198*da14cebeSEric Cheng /* 3199*da14cebeSEric Cheng * Just use the default ring for now. We need to use 3200*da14cebeSEric Cheng * the underlying link's ring set instead of the underlying 3201*da14cebeSEric Cheng * NIC's. 3202*da14cebeSEric Cheng */ 3203*da14cebeSEric Cheng if (srs_type == SRST_FLOW || mcip->mci_no_hwrings) 3204*da14cebeSEric Cheng goto use_default_ring; 3205*da14cebeSEric Cheng 3206*da14cebeSEric Cheng if (mcip->mci_share != NULL) 3207*da14cebeSEric Cheng ring = grp->mrg_rings; 3208*da14cebeSEric Cheng /* 3209*da14cebeSEric Cheng * An attempt is made to reserve 'tx_ring_count' number 3210*da14cebeSEric Cheng * of Tx rings. If tx_ring_count is 0, the default Tx ring 3211*da14cebeSEric Cheng * is used. If it is 1, an attempt is made to reserve one 3212*da14cebeSEric Cheng * Tx ring. In both cases, the ring information is 3213*da14cebeSEric Cheng * stored in the Tx SRS. If multiple Tx rings are specified, 3214*da14cebeSEric Cheng * then each Tx ring will have a Tx-side soft ring. All 3215*da14cebeSEric Cheng * these soft rings will hang off the Tx SRS. 3216*da14cebeSEric Cheng */ 3217*da14cebeSEric Cheng for (i = 0, tx_rings_reserved = 0; 3218*da14cebeSEric Cheng i < tx_ring_count; i++, tx_rings_reserved++) { 3219*da14cebeSEric Cheng if (mcip->mci_share != NULL) { 3220*da14cebeSEric Cheng /* 3221*da14cebeSEric Cheng * The ring was already chosen and associated 3222*da14cebeSEric Cheng * with the TX group. Save it in the new 3223*da14cebeSEric Cheng * array to keep as much of the code below common 3224*da14cebeSEric Cheng * between the share and non-share cases.
			 */
			ASSERT(ring != NULL);
			tx_ring[i] = (mac_ring_handle_t)ring;
			ring = ring->mr_next;
		} else {
			tx_ring[i] =
			    (mac_ring_handle_t)mac_reserve_tx_ring(mip, NULL);
			if (tx_ring[i] == NULL)
				break;
		}
	}

	if (mac_tx_serialize || (mip->mi_v12n_level & MAC_VIRT_SERIALIZE))
		serialize = B_TRUE;

	/*
	 * Did we get the requested number of Tx rings? There are
	 * three actions we can take depending on the number of Tx
	 * rings we got:
	 * 1) If we got none, hook up the Tx SRS with the default ring.
	 * 2) If we got one, save that ring in the Tx SRS; no Tx-side
	 *    soft rings are needed.
	 * 3) If we got more than one, fan out Tx traffic among the
	 *    rings we obtained.
	 */
	switch (tx_rings_reserved) {
	case 1:
		/*
		 * No need to allocate Tx soft rings. Tx-side soft
		 * rings are only for the Tx fanout case. Just use
		 * the Tx SRS.
		 */
		/* FALLTHRU */

	case 0:
use_default_ring:
		if (tx_rings_reserved == 0)
			tx->st_arg2 = (void *)mip->mi_default_tx_ring;
		else
			tx->st_arg2 = (void *)tx_ring[0];
		/* For a ring count of 0 or 1, set the tx_mode and return */
		if (tx_srs->srs_type & SRST_BW_CONTROL)
			tx->st_mode = SRS_TX_BW;
		else if (serialize)
			tx->st_mode = SRS_TX_SERIALIZE;
		else
			tx->st_mode = SRS_TX_DEFAULT;
		break;

	default:
		/*
		 * We got multiple Tx rings for Tx fanout.
		 *
		 * A cpuid of -1 is passed, which creates an unbound
		 * worker thread. Instead, the code should obtain CPU
		 * binding information and pass that to
		 * mac_soft_ring_create(). This needs to be done in
		 * conjunction with the Rx-side soft ring bindings.
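		 *
		 * A minimal sketch of what such a binding-aware call
		 * might look like (srs_cpu and mc_tx_fanout_cpus are
		 * hypothetical names used only for illustration; the
		 * real CPU list would come from the Rx-side fanout
		 * state):
		 *
		 *	(void) mac_soft_ring_create(i, 0, NULL,
		 *	    soft_ring_type, maxclsyspri, mcip, tx_srs,
		 *	    srs_cpu->mc_tx_fanout_cpus[i], NULL, mcip,
		 *	    (mac_resource_handle_t)tx_ring[i]);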
		 */
		soft_ring_type = ST_RING_OTH | ST_RING_TX;
		if (tx_srs->srs_type & SRST_BW_CONTROL) {
			tx->st_mode = SRS_TX_BW_FANOUT;
		} else {
			tx->st_mode = SRS_TX_FANOUT;
			if (serialize)
				soft_ring_type |= ST_RING_WORKER_ONLY;
		}
		for (i = 0; i < tx_rings_reserved; i++) {
			(void) mac_soft_ring_create(i, 0, NULL, soft_ring_type,
			    maxclsyspri, mcip, tx_srs, -1, NULL, mcip,
			    (mac_resource_handle_t)tx_ring[i]);
		}
		mac_srs_update_fanout_list(tx_srs);
	}
	tx->st_func = mac_tx_get_func(tx->st_mode);

	DTRACE_PROBE3(tx__srs___setup__return, mac_soft_ring_set_t *, tx_srs,
	    int, tx->st_mode, int, tx_srs->srs_oth_ring_count);

	if (tx_ring_count != 0) {
		kmem_free(tx_ring,
		    sizeof (mac_ring_handle_t) * tx_ring_count);
	}
}

/*
 * Walk through the list of MAC clients for the MAC. For each active
 * MAC client, recompute the number of soft rings, but only if the
 * current speed differs from the speed used for the previous soft
 * ring computation. If the cable is disconnected while the NIC is
 * started, we get a notification with the speed set to 0; we do not
 * recompute in that case.
 */
void
mac_fanout_recompute(mac_impl_t *mip)
{
	mac_client_impl_t *mcip;
	uint64_t ifspeed;
	mac_resource_props_t *mcip_mrp;

	i_mac_perim_enter(mip);
	ASSERT(!(mip->mi_state_flags & MIS_IS_VNIC));

	if (mip->mi_linkstate != LINK_STATE_UP) {
		i_mac_perim_exit(mip);
		return;
	}

	for (mcip = mip->mi_clients_list; mcip != NULL;
	    mcip = mcip->mci_client_next) {
		if (!MCIP_DATAPATH_SETUP(mcip))
			continue;

		ifspeed = mac_client_stat_get(mcip->mci_flent->fe_mcip,
		    MAC_STAT_IFSPEED);
		if ((ifspeed != 0) &&
		    (ifspeed != mcip->mci_flent->fe_nic_speed)) {
			mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
			mac_fanout_setup(mcip, mcip->mci_flent,
			    mcip_mrp, mac_rx_deliver, mcip, NULL);
		}
	}
	i_mac_perim_exit(mip);
}
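
/*
 * Example (hypothetical, for illustration only): mac_fanout_recompute()
 * is intended to run from the link notification path once the link
 * comes back up, along the lines of:
 *
 *	static void
 *	i_mac_link_notify(mac_impl_t *mip)
 *	{
 *		if (mip->mi_linkstate == LINK_STATE_UP)
 *			mac_fanout_recompute(mip);
 *	}
 */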