1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define	_SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
34 
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
38 
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
44 
45 /*
46  * Table of all known options handled on a TCP protocol stack.
47  *
48  * Note: This table contains options processed by both TCP and IP levels
49  *       and is the superset of options that can be performed on a TCP over IP
50  *       stack.
51  */
52 opdes_t	tcp_opt_arr[] = {
53 
54 { SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
55 	sizeof (struct linger), 0 },
56 
57 { SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
58 { SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
59 { SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
60 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
61 	},
62 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
63 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
65 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
66 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
68 { SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
69 	sizeof (struct timeval), 0 },
70 { SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
71 	sizeof (struct timeval), 0 },
72 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
73 	},
74 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
75 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
76 	0 },
77 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
78 	0 },
79 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
80 	0 },
81 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
82 	0 },
83 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
84 
85 { SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
86 
87 { SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
88 
89 { TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
90 	},
91 { TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
92 	536 },
93 
94 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
95 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
96 
97 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
98 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
99 
100 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
101 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
102 
103 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
104 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
105 
106 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
107 	0 },
108 
109 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
110 	sizeof (int), 0 },
111 
112 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
113 	},
114 
115 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
116 	sizeof (int), 0 },
117 
118 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
119 	sizeof (int), 0	},
120 
121 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
122 	sizeof (int), 0	},
123 
124 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
125 
126 { IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
127 	(OP_VARLEN|OP_NODEFAULT),
128 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
129 { T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
130 	(OP_VARLEN|OP_NODEFAULT),
131 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
132 
133 { IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
134 { T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
135 { IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
136 	sizeof (int), -1 /* not initialized */ },
137 
138 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
139 	sizeof (ipsec_req_t), -1 /* not initialized */ },
140 
141 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
142 	sizeof (int),	0 /* no ifindex */ },
143 
144 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
145 	sizeof (int), 0 },
146 
147 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
148 	sizeof (int), -1 /* not initialized */ },
149 
150 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
151 	sizeof (int),	0 /* no ifindex */ },
152 
153 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
154 
155 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
156 	sizeof (in_addr_t),	-1 /* not initialized  */ },
157 
158 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
159 	sizeof (int), 0 },
160 
161 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
162 	(OP_NODEFAULT|OP_VARLEN),
163 	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
164 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
165 	OP_NODEFAULT,
166 	sizeof (sin6_t), -1 /* not initialized */ },
167 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
168 	(OP_VARLEN|OP_NODEFAULT), 255*8,
169 	-1 /* not initialized */ },
170 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
171 	(OP_VARLEN|OP_NODEFAULT), 255*8,
172 	-1 /* not initialized */ },
173 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
174 	(OP_VARLEN|OP_NODEFAULT), 255*8,
175 	-1 /* not initialized */ },
176 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
177 	(OP_VARLEN|OP_NODEFAULT), 255*8,
178 	-1 /* not initialized */ },
179 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
180 	OP_NODEFAULT,
181 	sizeof (int), -1 /* not initialized */ },
182 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
183 	OP_NODEFAULT,
184 	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
185 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
186 	sizeof (int), 0 },
187 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
188 	sizeof (int), 0 },
189 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
190 	sizeof (int), 0 },
191 
192 /* Enable receipt of ancillary data */
193 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
194 	sizeof (int), 0 },
195 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
196 	sizeof (int), 0 },
197 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
198 	sizeof (int), 0 },
199 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
200 	sizeof (int), 0 },
201 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
202 	sizeof (int), 0 },
203 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
204 	sizeof (int), 0 },
205 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
206 	sizeof (int), 0 },
207 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
208 	sizeof (int), 0 },
209 
210 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
211 	sizeof (ipsec_req_t), -1 /* not initialized */ },
212 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
213 	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
214 };
215 
216 /*
217  * Table of all supported levels
218  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
219  * any supported options so we need this info separately.
220  *
221  * This is needed only for topmost tpi providers and is used only by
222  * XTI interfaces.
223  */
224 optlevel_t	tcp_valid_levels_arr[] = {
225 	XTI_GENERIC,
226 	SOL_SOCKET,
227 	IPPROTO_TCP,
228 	IPPROTO_IP,
229 	IPPROTO_IPV6
230 };
231 
232 
233 #define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
234 #define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
235 
236 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
237 
238 /*
239  * Initialize option database object for TCP
240  *
241  * This object represents database of options to search passed to
242  * {sock,tpi}optcom_req() interface routine to take care of option
243  * management and associated methods.
244  */
245 
246 optdb_obj_t tcp_opt_obj = {
247 	tcp_opt_default,	/* TCP default value function pointer */
248 	tcp_tpi_opt_get,	/* TCP get function pointer */
249 	tcp_tpi_opt_set,	/* TCP set function pointer */
250 	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
251 	tcp_opt_arr,		/* TCP option database */
252 	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
253 	tcp_valid_levels_arr	/* TCP valid level array */
254 };
255 
/*
 * Largest initial congestion window (used at start and restart) that
 * may be requested through the TCP_INIT_CWND option; tcp_opt_set()
 * rejects anything above tcp_max_init_cwnd with EINVAL.
 */
#define	TCP_MAX_INIT_CWND	16

static int	tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
260 
261 /*
262  * Some TCP options can be "set" by requesting them in the option
263  * buffer. This is needed for XTI feature test though we do not
264  * allow it in general. We interpret that this mechanism is more
265  * applicable to OSI protocols and need not be allowed in general.
266  * This routine filters out options for which it is not allowed (most)
267  * and lets through those (few) for which it is. [ The XTI interface
268  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
269  * ever implemented will have to be allowed here ].
270  */
271 static boolean_t
272 tcp_allow_connopt_set(int level, int name)
273 {
274 
275 	switch (level) {
276 	case IPPROTO_TCP:
277 		switch (name) {
278 		case TCP_NODELAY:
279 			return (B_TRUE);
280 		default:
281 			return (B_FALSE);
282 		}
283 		/*NOTREACHED*/
284 	default:
285 		return (B_FALSE);
286 	}
287 	/*NOTREACHED*/
288 }
289 
290 /*
291  * This routine gets default values of certain options whose default
292  * values are maintained by protocol specific code
293  */
294 /* ARGSUSED */
295 int
296 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
297 {
298 	int32_t	*i1 = (int32_t *)ptr;
299 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
300 
301 	switch (level) {
302 	case IPPROTO_TCP:
303 		switch (name) {
304 		case TCP_NOTIFY_THRESHOLD:
305 			*i1 = tcps->tcps_ip_notify_interval;
306 			break;
307 		case TCP_ABORT_THRESHOLD:
308 			*i1 = tcps->tcps_ip_abort_interval;
309 			break;
310 		case TCP_CONN_NOTIFY_THRESHOLD:
311 			*i1 = tcps->tcps_ip_notify_cinterval;
312 			break;
313 		case TCP_CONN_ABORT_THRESHOLD:
314 			*i1 = tcps->tcps_ip_abort_cinterval;
315 			break;
316 		default:
317 			return (-1);
318 		}
319 		break;
320 	case IPPROTO_IP:
321 		switch (name) {
322 		case IP_TTL:
323 			*i1 = tcps->tcps_ipv4_ttl;
324 			break;
325 		default:
326 			return (-1);
327 		}
328 		break;
329 	case IPPROTO_IPV6:
330 		switch (name) {
331 		case IPV6_UNICAST_HOPS:
332 			*i1 = tcps->tcps_ipv6_hoplimit;
333 			break;
334 		default:
335 			return (-1);
336 		}
337 		break;
338 	default:
339 		return (-1);
340 	}
341 	return (sizeof (int));
342 }
343 
344 /*
345  * TCP routine to get the values of options.
346  */
347 int
348 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
349 {
350 	int		*i1 = (int *)ptr;
351 	tcp_t		*tcp = connp->conn_tcp;
352 	conn_opt_arg_t	coas;
353 	int		retval;
354 
355 	coas.coa_connp = connp;
356 	coas.coa_ixa = connp->conn_ixa;
357 	coas.coa_ipp = &connp->conn_xmit_ipp;
358 	coas.coa_ancillary = B_FALSE;
359 	coas.coa_changed = 0;
360 
361 	switch (level) {
362 	case SOL_SOCKET:
363 		switch (name) {
364 		case SO_SND_COPYAVOID:
365 			*i1 = tcp->tcp_snd_zcopy_on ?
366 			    SO_SND_COPYAVOID : 0;
367 			return (sizeof (int));
368 		case SO_ACCEPTCONN:
369 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
370 			return (sizeof (int));
371 		}
372 		break;
373 	case IPPROTO_TCP:
374 		switch (name) {
375 		case TCP_NODELAY:
376 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
377 			return (sizeof (int));
378 		case TCP_MAXSEG:
379 			*i1 = tcp->tcp_mss;
380 			return (sizeof (int));
381 		case TCP_NOTIFY_THRESHOLD:
382 			*i1 = (int)tcp->tcp_first_timer_threshold;
383 			return (sizeof (int));
384 		case TCP_ABORT_THRESHOLD:
385 			*i1 = tcp->tcp_second_timer_threshold;
386 			return (sizeof (int));
387 		case TCP_CONN_NOTIFY_THRESHOLD:
388 			*i1 = tcp->tcp_first_ctimer_threshold;
389 			return (sizeof (int));
390 		case TCP_CONN_ABORT_THRESHOLD:
391 			*i1 = tcp->tcp_second_ctimer_threshold;
392 			return (sizeof (int));
393 		case TCP_INIT_CWND:
394 			*i1 = tcp->tcp_init_cwnd;
395 			return (sizeof (int));
396 		case TCP_KEEPALIVE_THRESHOLD:
397 			*i1 = tcp->tcp_ka_interval;
398 			return (sizeof (int));
399 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
400 			*i1 = tcp->tcp_ka_abort_thres;
401 			return (sizeof (int));
402 		case TCP_CORK:
403 			*i1 = tcp->tcp_cork;
404 			return (sizeof (int));
405 		}
406 		break;
407 	case IPPROTO_IP:
408 		if (connp->conn_family != AF_INET)
409 			return (-1);
410 		switch (name) {
411 		case IP_OPTIONS:
412 		case T_IP_OPTIONS:
413 			/* Caller ensures enough space */
414 			return (ip_opt_get_user(connp, ptr));
415 		default:
416 			break;
417 		}
418 		break;
419 
420 	case IPPROTO_IPV6:
421 		/*
422 		 * IPPROTO_IPV6 options are only supported for sockets
423 		 * that are using IPv6 on the wire.
424 		 */
425 		if (connp->conn_ipversion != IPV6_VERSION) {
426 			return (-1);
427 		}
428 		switch (name) {
429 		case IPV6_PATHMTU:
430 			if (tcp->tcp_state < TCPS_ESTABLISHED)
431 				return (-1);
432 			break;
433 		}
434 		break;
435 	}
436 	mutex_enter(&connp->conn_lock);
437 	retval = conn_opt_get(&coas, level, name, ptr);
438 	mutex_exit(&connp->conn_lock);
439 	return (retval);
440 }
441 
442 /*
443  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
444  * Parameters are assumed to be verified by the caller.
445  */
446 /* ARGSUSED */
447 int
448 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
449     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
450     void *thisdg_attrs, cred_t *cr)
451 {
452 	tcp_t	*tcp = connp->conn_tcp;
453 	int	*i1 = (int *)invalp;
454 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
455 	boolean_t checkonly;
456 	int	reterr;
457 	tcp_stack_t	*tcps = tcp->tcp_tcps;
458 	conn_opt_arg_t	coas;
459 
460 	coas.coa_connp = connp;
461 	coas.coa_ixa = connp->conn_ixa;
462 	coas.coa_ipp = &connp->conn_xmit_ipp;
463 	coas.coa_ancillary = B_FALSE;
464 	coas.coa_changed = 0;
465 
466 	switch (optset_context) {
467 	case SETFN_OPTCOM_CHECKONLY:
468 		checkonly = B_TRUE;
469 		/*
470 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
471 		 * inlen != 0 implies value supplied and
472 		 * 	we have to "pretend" to set it.
473 		 * inlen == 0 implies that there is no
474 		 * 	value part in T_CHECK request and just validation
475 		 * done elsewhere should be enough, we just return here.
476 		 */
477 		if (inlen == 0) {
478 			*outlenp = 0;
479 			return (0);
480 		}
481 		break;
482 	case SETFN_OPTCOM_NEGOTIATE:
483 		checkonly = B_FALSE;
484 		break;
485 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
486 	case SETFN_CONN_NEGOTIATE:
487 		checkonly = B_FALSE;
488 		/*
489 		 * Negotiating local and "association-related" options
490 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
491 		 * primitives is allowed by XTI, but we choose
492 		 * to not implement this style negotiation for Internet
493 		 * protocols (We interpret it is a must for OSI world but
494 		 * optional for Internet protocols) for all options.
495 		 * [ Will do only for the few options that enable test
496 		 * suites that our XTI implementation of this feature
497 		 * works for transports that do allow it ]
498 		 */
499 		if (!tcp_allow_connopt_set(level, name)) {
500 			*outlenp = 0;
501 			return (EINVAL);
502 		}
503 		break;
504 	default:
505 		/*
506 		 * We should never get here
507 		 */
508 		*outlenp = 0;
509 		return (EINVAL);
510 	}
511 
512 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
513 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
514 
515 	/*
516 	 * For TCP, we should have no ancillary data sent down
517 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
518 	 * has to be zero.
519 	 */
520 	ASSERT(thisdg_attrs == NULL);
521 
522 	/*
523 	 * For fixed length options, no sanity check
524 	 * of passed in length is done. It is assumed *_optcom_req()
525 	 * routines do the right thing.
526 	 */
527 	switch (level) {
528 	case SOL_SOCKET:
529 		switch (name) {
530 		case SO_KEEPALIVE:
531 			if (checkonly) {
532 				/* check only case */
533 				break;
534 			}
535 
536 			if (!onoff) {
537 				if (connp->conn_keepalive) {
538 					if (tcp->tcp_ka_tid != 0) {
539 						(void) TCP_TIMER_CANCEL(tcp,
540 						    tcp->tcp_ka_tid);
541 						tcp->tcp_ka_tid = 0;
542 					}
543 					connp->conn_keepalive = 0;
544 				}
545 				break;
546 			}
547 			if (!connp->conn_keepalive) {
548 				/* Crank up the keepalive timer */
549 				tcp->tcp_ka_last_intrvl = 0;
550 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
551 				    tcp_keepalive_timer,
552 				    MSEC_TO_TICK(tcp->tcp_ka_interval));
553 				connp->conn_keepalive = 1;
554 			}
555 			break;
556 		case SO_SNDBUF: {
557 			if (*i1 > tcps->tcps_max_buf) {
558 				*outlenp = 0;
559 				return (ENOBUFS);
560 			}
561 			if (checkonly)
562 				break;
563 
564 			connp->conn_sndbuf = *i1;
565 			if (tcps->tcps_snd_lowat_fraction != 0) {
566 				connp->conn_sndlowat = connp->conn_sndbuf /
567 				    tcps->tcps_snd_lowat_fraction;
568 			}
569 			(void) tcp_maxpsz_set(tcp, B_TRUE);
570 			/*
571 			 * If we are flow-controlled, recheck the condition.
572 			 * There are apps that increase SO_SNDBUF size when
573 			 * flow-controlled (EWOULDBLOCK), and expect the flow
574 			 * control condition to be lifted right away.
575 			 */
576 			mutex_enter(&tcp->tcp_non_sq_lock);
577 			if (tcp->tcp_flow_stopped &&
578 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
579 				tcp_clrqfull(tcp);
580 			}
581 			mutex_exit(&tcp->tcp_non_sq_lock);
582 			*outlenp = inlen;
583 			return (0);
584 		}
585 		case SO_RCVBUF:
586 			if (*i1 > tcps->tcps_max_buf) {
587 				*outlenp = 0;
588 				return (ENOBUFS);
589 			}
590 			/* Silently ignore zero */
591 			if (!checkonly && *i1 != 0) {
592 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
593 				(void) tcp_rwnd_set(tcp, *i1);
594 			}
595 			/*
596 			 * XXX should we return the rwnd here
597 			 * and tcp_opt_get ?
598 			 */
599 			*outlenp = inlen;
600 			return (0);
601 		case SO_SND_COPYAVOID:
602 			if (!checkonly) {
603 				if (tcp->tcp_loopback ||
604 				    (tcp->tcp_kssl_ctx != NULL) ||
605 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
606 					*outlenp = 0;
607 					return (EOPNOTSUPP);
608 				}
609 				tcp->tcp_snd_zcopy_aware = 1;
610 			}
611 			*outlenp = inlen;
612 			return (0);
613 		}
614 		break;
615 	case IPPROTO_TCP:
616 		switch (name) {
617 		case TCP_NODELAY:
618 			if (!checkonly)
619 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
620 			break;
621 		case TCP_NOTIFY_THRESHOLD:
622 			if (!checkonly)
623 				tcp->tcp_first_timer_threshold = *i1;
624 			break;
625 		case TCP_ABORT_THRESHOLD:
626 			if (!checkonly)
627 				tcp->tcp_second_timer_threshold = *i1;
628 			break;
629 		case TCP_CONN_NOTIFY_THRESHOLD:
630 			if (!checkonly)
631 				tcp->tcp_first_ctimer_threshold = *i1;
632 			break;
633 		case TCP_CONN_ABORT_THRESHOLD:
634 			if (!checkonly)
635 				tcp->tcp_second_ctimer_threshold = *i1;
636 			break;
637 		case TCP_RECVDSTADDR:
638 			if (tcp->tcp_state > TCPS_LISTEN) {
639 				*outlenp = 0;
640 				return (EOPNOTSUPP);
641 			}
642 			/* Setting done in conn_opt_set */
643 			break;
644 		case TCP_INIT_CWND: {
645 			uint32_t init_cwnd = *((uint32_t *)invalp);
646 
647 			if (checkonly)
648 				break;
649 
650 			/*
651 			 * Only allow socket with network configuration
652 			 * privilege to set the initial cwnd to be larger
653 			 * than allowed by RFC 3390.
654 			 */
655 			if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
656 				tcp->tcp_init_cwnd = init_cwnd;
657 				break;
658 			}
659 			if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
660 				*outlenp = 0;
661 				return (reterr);
662 			}
663 			if (init_cwnd > tcp_max_init_cwnd) {
664 				*outlenp = 0;
665 				return (EINVAL);
666 			}
667 			tcp->tcp_init_cwnd = init_cwnd;
668 			break;
669 		}
670 		case TCP_KEEPALIVE_THRESHOLD:
671 			if (checkonly)
672 				break;
673 
674 			if (*i1 < tcps->tcps_keepalive_interval_low ||
675 			    *i1 > tcps->tcps_keepalive_interval_high) {
676 				*outlenp = 0;
677 				return (EINVAL);
678 			}
679 			if (*i1 != tcp->tcp_ka_interval) {
680 				tcp->tcp_ka_interval = *i1;
681 				/*
682 				 * Check if we need to restart the
683 				 * keepalive timer.
684 				 */
685 				if (tcp->tcp_ka_tid != 0) {
686 					ASSERT(connp->conn_keepalive);
687 					(void) TCP_TIMER_CANCEL(tcp,
688 					    tcp->tcp_ka_tid);
689 					tcp->tcp_ka_last_intrvl = 0;
690 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
691 					    tcp_keepalive_timer,
692 					    MSEC_TO_TICK(tcp->tcp_ka_interval));
693 				}
694 			}
695 			break;
696 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
697 			if (!checkonly) {
698 				if (*i1 <
699 				    tcps->tcps_keepalive_abort_interval_low ||
700 				    *i1 >
701 				    tcps->tcps_keepalive_abort_interval_high) {
702 					*outlenp = 0;
703 					return (EINVAL);
704 				}
705 				tcp->tcp_ka_abort_thres = *i1;
706 			}
707 			break;
708 		case TCP_CORK:
709 			if (!checkonly) {
710 				/*
711 				 * if tcp->tcp_cork was set and is now
712 				 * being unset, we have to make sure that
713 				 * the remaining data gets sent out. Also
714 				 * unset tcp->tcp_cork so that tcp_wput_data()
715 				 * can send data even if it is less than mss
716 				 */
717 				if (tcp->tcp_cork && onoff == 0 &&
718 				    tcp->tcp_unsent > 0) {
719 					tcp->tcp_cork = B_FALSE;
720 					tcp_wput_data(tcp, NULL, B_FALSE);
721 				}
722 				tcp->tcp_cork = onoff;
723 			}
724 			break;
725 		default:
726 			break;
727 		}
728 		break;
729 	case IPPROTO_IP:
730 		if (connp->conn_family != AF_INET) {
731 			*outlenp = 0;
732 			return (EINVAL);
733 		}
734 		switch (name) {
735 		case IP_SEC_OPT:
736 			/*
737 			 * We should not allow policy setting after
738 			 * we start listening for connections.
739 			 */
740 			if (tcp->tcp_state == TCPS_LISTEN) {
741 				return (EINVAL);
742 			}
743 			break;
744 		}
745 		break;
746 	case IPPROTO_IPV6:
747 		/*
748 		 * IPPROTO_IPV6 options are only supported for sockets
749 		 * that are using IPv6 on the wire.
750 		 */
751 		if (connp->conn_ipversion != IPV6_VERSION) {
752 			*outlenp = 0;
753 			return (EINVAL);
754 		}
755 
756 		switch (name) {
757 		case IPV6_RECVPKTINFO:
758 			if (!checkonly) {
759 				/* Force it to be sent up with the next msg */
760 				tcp->tcp_recvifindex = 0;
761 			}
762 			break;
763 		case IPV6_RECVTCLASS:
764 			if (!checkonly) {
765 				/* Force it to be sent up with the next msg */
766 				tcp->tcp_recvtclass = 0xffffffffU;
767 			}
768 			break;
769 		case IPV6_RECVHOPLIMIT:
770 			if (!checkonly) {
771 				/* Force it to be sent up with the next msg */
772 				tcp->tcp_recvhops = 0xffffffffU;
773 			}
774 			break;
775 		case IPV6_PKTINFO:
776 			/* This is an extra check for TCP */
777 			if (inlen == sizeof (struct in6_pktinfo)) {
778 				struct in6_pktinfo *pkti;
779 
780 				pkti = (struct in6_pktinfo *)invalp;
781 				/*
782 				 * RFC 3542 states that ipi6_addr must be
783 				 * the unspecified address when setting the
784 				 * IPV6_PKTINFO sticky socket option on a
785 				 * TCP socket.
786 				 */
787 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
788 					return (EINVAL);
789 			}
790 			break;
791 		case IPV6_SEC_OPT:
792 			/*
793 			 * We should not allow policy setting after
794 			 * we start listening for connections.
795 			 */
796 			if (tcp->tcp_state == TCPS_LISTEN) {
797 				return (EINVAL);
798 			}
799 			break;
800 		}
801 		break;
802 	}
803 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
804 	    checkonly, cr);
805 	if (reterr != 0) {
806 		*outlenp = 0;
807 		return (reterr);
808 	}
809 
810 	/*
811 	 * Common case of OK return with outval same as inval
812 	 */
813 	if (invalp != outvalp) {
814 		/* don't trust bcopy for identical src/dst */
815 		(void) bcopy(invalp, outvalp, inlen);
816 	}
817 	*outlenp = inlen;
818 
819 	if (coas.coa_changed & COA_HEADER_CHANGED) {
820 		/* If we are connected we rebuilt the headers */
821 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
822 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
823 			reterr = tcp_build_hdrs(tcp);
824 			if (reterr != 0)
825 				return (reterr);
826 		}
827 	}
828 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
829 		in6_addr_t nexthop;
830 
831 		/*
832 		 * If we are connected we re-cache the information.
833 		 * We ignore errors to preserve BSD behavior.
834 		 * Note that we don't redo IPsec policy lookup here
835 		 * since the final destination (or source) didn't change.
836 		 */
837 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
838 		    &connp->conn_faddr_v6, &nexthop);
839 
840 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
841 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
842 			(void) ip_attr_connect(connp, connp->conn_ixa,
843 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
844 			    &nexthop, connp->conn_fport, NULL, NULL,
845 			    IPDF_VERIFY_DST);
846 		}
847 	}
848 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
849 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
850 	}
851 	if (coas.coa_changed & COA_WROFF_CHANGED) {
852 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
853 		    tcps->tcps_wroff_xtra;
854 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
855 		    connp->conn_wroff);
856 	}
857 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
858 		if (IPCL_IS_NONSTR(connp))
859 			proto_set_rx_oob_opt(connp, onoff);
860 	}
861 	return (0);
862 }
863