xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_send.c (revision 1c9de0c9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * IEEE 802.3ad Link Aggregation - Send code.
30  *
31  * Implements the Distributor function.
32  */
33 
34 #include <sys/conf.h>
35 #include <sys/modctl.h>
36 #include <sys/sunddi.h>
37 #include <sys/vlan.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 
41 #include <inet/common.h>
42 #include <inet/led.h>
43 #include <inet/ip.h>
44 #include <inet/ip6.h>
45 #include <inet/tcp.h>
46 #include <netinet/udp.h>
47 #include <inet/ipsec_impl.h>
48 #include <inet/sadb.h>
49 #include <inet/ipsecesp.h>
50 #include <inet/ipsecah.h>
51 
52 #include <sys/aggr.h>
53 #include <sys/aggr_impl.h>
54 
/*
 * XOR-fold helpers used to build the port selection hash.  The argument
 * is indexed as a byte array and is evaluated once per byte, so it must
 * not have side effects.
 */

/* Fold the four bytes starting at p. */
#define	HASH_4BYTES(p) ((p)[0] ^ (p)[1] ^ (p)[2] ^ (p)[3])
/* Fold the six bytes of the MAC address starting at mac. */
#define	HASH_MAC(mac) (HASH_4BYTES(mac) ^ (mac)[4] ^ (mac)[5])
57 
58 static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
59 
60 static uint_t
61 aggr_send_port(aggr_grp_t *grp, mblk_t *mp)
62 {
63 	struct ether_header *ehp;
64 	uint16_t sap;
65 	uint_t skip_len;
66 	uint8_t proto;
67 	uint32_t policy = grp->lg_tx_policy;
68 	uint32_t hash = 0;
69 
70 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
71 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
72 
73 	/* compute MAC hash */
74 
75 	ehp = (struct ether_header *)mp->b_rptr;
76 
77 	if (policy & AGGR_POLICY_L2) {
78 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
79 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
80 		hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst);
81 		policy &= ~AGGR_POLICY_L2;
82 	}
83 
84 	if (policy == 0)
85 		goto done;
86 
87 	/* skip ethernet header */
88 
89 	if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) {
90 		struct ether_vlan_header *evhp;
91 
92 		ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
93 		evhp = (struct ether_vlan_header *)mp->b_rptr;
94 		sap = ntohs(evhp->ether_type);
95 		skip_len = sizeof (struct ether_vlan_header);
96 	} else {
97 		sap = ntohs(ehp->ether_type);
98 		skip_len = sizeof (struct ether_header);
99 	}
100 
101 	/* if ethernet header is in its own mblk, skip it */
102 	if (MBLKL(mp) <= skip_len) {
103 		skip_len -= MBLKL(mp);
104 		mp = mp->b_cont;
105 	}
106 
107 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
108 
109 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
110 
111 	switch (sap) {
112 	case ETHERTYPE_IP: {
113 		ipha_t *iphp;
114 
115 		ASSERT(MBLKL(mp) >= skip_len + sizeof (ipha_t));
116 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
117 		proto = iphp->ipha_protocol;
118 		skip_len += IPH_HDR_LENGTH(iphp);
119 
120 		if (policy & AGGR_POLICY_L3) {
121 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
122 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
123 
124 			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
125 			policy &= ~AGGR_POLICY_L3;
126 		}
127 		break;
128 	}
129 	case ETHERTYPE_IPV6: {
130 		ip6_t *ip6hp;
131 
132 		/*
133 		 * if ipv6 packet has options, the proto will not be one of the
134 		 * ones handled by the ULP processor below, and will return 0
135 		 * as the index
136 		 */
137 		ASSERT(MBLKL(mp) >= skip_len + sizeof (ip6_t));
138 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
139 		proto = ip6hp->ip6_nxt;
140 		skip_len += aggr_send_ip6_hdr_len(mp, ip6hp);
141 
142 		if (policy & AGGR_POLICY_L3) {
143 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
144 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
145 
146 			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
147 			policy &= ~AGGR_POLICY_L3;
148 		}
149 		break;
150 	}
151 	default:
152 		goto done;
153 	}
154 
155 	if (!(policy & AGGR_POLICY_L4))
156 		goto done;
157 
158 	/* if ip header is in its own mblk, skip it */
159 	if (MBLKL(mp) <= skip_len) {
160 		skip_len -= MBLKL(mp);
161 		mp = mp->b_cont;
162 	}
163 
164 	/* parse ULP header */
165 again:
166 	switch (proto) {
167 	case IPPROTO_TCP:
168 	case IPPROTO_UDP:
169 	case IPPROTO_ESP:
170 	case IPPROTO_SCTP:
171 		/*
172 		 * These Internet Protocols are intentionally designed
173 		 * for hashing from the git-go.  Port numbers are in the first
174 		 * word for transports, SPI is first for ESP.
175 		 */
176 		hash ^= HASH_4BYTES((mp->b_rptr + skip_len));
177 		break;
178 
179 	case IPPROTO_AH: {
180 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
181 
182 		uint_t ah_length = AH_TOTAL_LEN(ah);
183 		proto = ah->ah_nexthdr;
184 		skip_len += ah_length;
185 
186 		/* if ip header is in its own mblk, skip it */
187 		if (MBLKL(mp) <= skip_len) {
188 			skip_len -= MBLKL(mp);
189 			mp = mp->b_cont;
190 		}
191 
192 		goto again;
193 	}
194 	}
195 
196 done:
197 	return (hash % grp->lg_ntx_ports);
198 }
199 
200 /*
201  * Update the TX load balancing policy of the specified group.
202  */
203 void
204 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
205 {
206 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
207 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
208 
209 	grp->lg_tx_policy = policy;
210 }
211 
212 /*
213  * Send function invoked by the MAC service module.
214  */
215 mblk_t *
216 aggr_m_tx(void *arg, mblk_t *mp)
217 {
218 	aggr_grp_t *grp = arg;
219 	aggr_port_t *port;
220 	mblk_t *nextp;
221 	const mac_txinfo_t *mtp;
222 
223 	for (;;) {
224 		rw_enter(&grp->lg_lock, RW_READER);
225 		if (grp->lg_ntx_ports == 0) {
226 			/*
227 			 * We could have returned from aggr_m_start() before
228 			 * the ports were actually attached. Drop the chain.
229 			 */
230 			rw_exit(&grp->lg_lock);
231 			freemsgchain(mp);
232 			return (NULL);
233 		}
234 		nextp = mp->b_next;
235 		mp->b_next = NULL;
236 
237 		port = grp->lg_tx_ports[aggr_send_port(grp, mp)];
238 		ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED);
239 
240 		rw_exit(&grp->lg_lock);
241 
242 		/*
243 		 * We store the transmit info pointer locally in case it
244 		 * changes between loading mt_fn and mt_arg.
245 		 */
246 		mtp = port->lp_txinfo;
247 		if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) {
248 			mp->b_next = nextp;
249 			break;
250 		}
251 
252 		if ((mp = nextp) == NULL)
253 			break;
254 	}
255 	return (mp);
256 }
257 
258 /*
259  * Enable sending on the specified port.
260  */
261 void
262 aggr_send_port_enable(aggr_port_t *port)
263 {
264 	aggr_grp_t *grp = port->lp_grp;
265 
266 	if (port->lp_tx_enabled || (port->lp_state !=
267 	    AGGR_PORT_STATE_ATTACHED)) {
268 		/* already enabled or port not yet attached */
269 		return;
270 	}
271 
272 	/*
273 	 * Add to group's array of tx ports.
274 	 */
275 	if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
276 		/* current array too small */
277 		aggr_port_t **new_ports;
278 		uint_t new_size;
279 
280 		new_size = grp->lg_ntx_ports+1;
281 		new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
282 		    KM_SLEEP);
283 
284 		if (grp->lg_tx_ports_size > 0) {
285 			ASSERT(grp->lg_tx_ports != NULL);
286 			bcopy(grp->lg_tx_ports, new_ports,
287 			    grp->lg_ntx_ports * sizeof (aggr_port_t *));
288 			kmem_free(grp->lg_tx_ports,
289 			    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
290 		}
291 
292 		grp->lg_tx_ports = new_ports;
293 		grp->lg_tx_ports_size = new_size;
294 	}
295 
296 	grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
297 	port->lp_tx_idx = grp->lg_ntx_ports-1;
298 
299 	port->lp_tx_enabled = B_TRUE;
300 }
301 
302 /*
303  * Disable sending from the specified port.
304  */
305 void
306 aggr_send_port_disable(aggr_port_t *port)
307 {
308 	uint_t idx, ntx;
309 	aggr_grp_t *grp = port->lp_grp;
310 
311 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
312 
313 	if (!port->lp_tx_enabled) {
314 		/* not yet enabled */
315 		return;
316 	}
317 
318 	idx = port->lp_tx_idx;
319 	ntx = grp->lg_ntx_ports;
320 	ASSERT(idx < ntx);
321 
322 	/* remove from array of attached ports */
323 	if (idx == (ntx - 1)) {
324 		grp->lg_tx_ports[idx] = NULL;
325 	} else {
326 		/* not the last entry, replace with last one */
327 		aggr_port_t *victim;
328 
329 		victim = grp->lg_tx_ports[ntx - 1];
330 		grp->lg_tx_ports[ntx - 1] = NULL;
331 		victim->lp_tx_idx = idx;
332 		grp->lg_tx_ports[idx] = victim;
333 	}
334 
335 	port->lp_tx_idx = 0;
336 	grp->lg_ntx_ports--;
337 
338 	port->lp_tx_enabled = B_FALSE;
339 }
340 
341 static uint16_t
342 aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h)
343 {
344 	uint16_t length;
345 	uint_t	ehdrlen;
346 	uint8_t	*nexthdrp;
347 	uint8_t *whereptr;
348 	uint8_t *endptr;
349 	ip6_dest_t *desthdr;
350 	ip6_rthdr_t *rthdr;
351 	ip6_frag_t *fraghdr;
352 
353 	length = IPV6_HDR_LEN;
354 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
355 	endptr = mp->b_wptr;
356 
357 	nexthdrp = &ip6h->ip6_nxt;
358 	while (whereptr < endptr) {
359 		switch (*nexthdrp) {
360 		case IPPROTO_HOPOPTS:
361 		case IPPROTO_DSTOPTS:
362 			/* Assumes the headers are identical for hbh and dst */
363 			desthdr = (ip6_dest_t *)whereptr;
364 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
365 			nexthdrp = &desthdr->ip6d_nxt;
366 			break;
367 		case IPPROTO_ROUTING:
368 			rthdr = (ip6_rthdr_t *)whereptr;
369 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
370 			nexthdrp = &rthdr->ip6r_nxt;
371 			break;
372 		case IPPROTO_FRAGMENT:
373 			fraghdr = (ip6_frag_t *)whereptr;
374 			ehdrlen = sizeof (ip6_frag_t);
375 			nexthdrp = &fraghdr->ip6f_nxt;
376 			break;
377 		case IPPROTO_NONE:
378 			/* No next header means we're finished */
379 		default:
380 			return (length);
381 		}
382 		length += ehdrlen;
383 		whereptr += ehdrlen;
384 	}
385 
386 	return (length);
387 }
388