1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #define	_SUN_TPI_VERSION 2
28 #include <sys/tihdr.h>
29 #include <sys/socket.h>
30 #include <sys/xti_xtiopt.h>
31 #include <sys/xti_inet.h>
32 #include <sys/policy.h>
33 
34 #include <inet/common.h>
35 #include <netinet/ip6.h>
36 #include <inet/ip.h>
37 
38 #include <netinet/in.h>
39 #include <netinet/tcp.h>
40 #include <inet/optcom.h>
41 #include <inet/proto_set.h>
42 #include <inet/tcp_impl.h>
43 
44 /*
45  * Table of all known options handled on a TCP protocol stack.
46  *
47  * Note: This table contains options processed by both TCP and IP levels
48  *       and is the superset of options that can be performed on a TCP over IP
49  *       stack.
50  */
51 opdes_t	tcp_opt_arr[] = {
52 
53 { SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
54 	sizeof (struct linger), 0 },
55 
56 { SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
57 { SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
58 { SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
59 { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
60 	},
61 { SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
62 { SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
63 { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
64 { SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
65 { SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
66 { SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
67 { SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
68 	sizeof (struct timeval), 0 },
69 { SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
70 	sizeof (struct timeval), 0 },
71 { SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
72 	},
73 { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
74 { SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
75 	0 },
76 { SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
77 	0 },
78 { SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
79 	0 },
80 { SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
81 	0 },
82 { SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
83 
84 { SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
85 
86 { SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
87 
88 { TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
89 	},
90 { TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
91 	536 },
92 
93 { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
94 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
95 
96 { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
97 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
98 
99 { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
100 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
101 
102 { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
103 	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
104 
105 { TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
106 	0 },
107 
108 { TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
109 	sizeof (int), 0 },
110 
111 { TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
112 	},
113 
114 { TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
115 	sizeof (int), 0 },
116 
117 { TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
118 	sizeof (int), 0	},
119 
120 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
121 	sizeof (int), 0	},
122 
123 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
124 
125 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
126 
127 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
128 
129 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
130 
131 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
132 
133 { IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
134 	(OP_VARLEN|OP_NODEFAULT),
135 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
136 { T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
137 	(OP_VARLEN|OP_NODEFAULT),
138 	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
139 
140 { IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
141 { T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
142 { IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
143 	sizeof (int), -1 /* not initialized */ },
144 
145 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
146 	sizeof (ipsec_req_t), -1 /* not initialized */ },
147 
148 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
149 	sizeof (int),	0 /* no ifindex */ },
150 
151 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
152 	sizeof (int), 0 },
153 
154 { IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
155 	sizeof (int), -1 /* not initialized */ },
156 
157 { IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
158 	sizeof (int),	0 /* no ifindex */ },
159 
160 { IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
161 
162 { IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
163 	sizeof (in_addr_t),	-1 /* not initialized  */ },
164 
165 { IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
166 	sizeof (int), 0 },
167 
168 { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
169 	(OP_NODEFAULT|OP_VARLEN),
170 	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
171 { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
172 	OP_NODEFAULT,
173 	sizeof (sin6_t), -1 /* not initialized */ },
174 { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
175 	(OP_VARLEN|OP_NODEFAULT), 255*8,
176 	-1 /* not initialized */ },
177 { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
178 	(OP_VARLEN|OP_NODEFAULT), 255*8,
179 	-1 /* not initialized */ },
180 { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
181 	(OP_VARLEN|OP_NODEFAULT), 255*8,
182 	-1 /* not initialized */ },
183 { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
184 	(OP_VARLEN|OP_NODEFAULT), 255*8,
185 	-1 /* not initialized */ },
186 { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
187 	OP_NODEFAULT,
188 	sizeof (int), -1 /* not initialized */ },
189 { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
190 	OP_NODEFAULT,
191 	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
192 { IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
193 	sizeof (int), 0 },
194 { IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
195 	sizeof (int), 0 },
196 { IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
197 	sizeof (int), 0 },
198 
199 /* Enable receipt of ancillary data */
200 { IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
201 	sizeof (int), 0 },
202 { IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
203 	sizeof (int), 0 },
204 { IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
205 	sizeof (int), 0 },
206 { _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
207 	sizeof (int), 0 },
208 { IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
209 	sizeof (int), 0 },
210 { IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
211 	sizeof (int), 0 },
212 { IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
213 	sizeof (int), 0 },
214 { IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
215 	sizeof (int), 0 },
216 
217 { IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
218 	sizeof (ipsec_req_t), -1 /* not initialized */ },
219 { IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
220 	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
221 };
222 
223 /*
224  * Table of all supported levels
225  * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
226  * any supported options so we need this info separately.
227  *
228  * This is needed only for topmost tpi providers and is used only by
229  * XTI interfaces.
230  */
231 optlevel_t	tcp_valid_levels_arr[] = {
232 	XTI_GENERIC,
233 	SOL_SOCKET,
234 	IPPROTO_TCP,
235 	IPPROTO_IP,
236 	IPPROTO_IPV6
237 };
238 
239 
240 #define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
241 #define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)
242 
243 uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
244 
245 /*
246  * Initialize option database object for TCP
247  *
248  * This object represents database of options to search passed to
249  * {sock,tpi}optcom_req() interface routine to take care of option
250  * management and associated methods.
251  */
252 
253 optdb_obj_t tcp_opt_obj = {
254 	tcp_opt_default,	/* TCP default value function pointer */
255 	tcp_tpi_opt_get,	/* TCP get function pointer */
256 	tcp_tpi_opt_set,	/* TCP set function pointer */
257 	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
258 	tcp_opt_arr,		/* TCP option database */
259 	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
260 	tcp_valid_levels_arr	/* TCP valid level array */
261 };
262 
/*
 * Maximum TCP initial congestion window (start/restart).
 *
 * tcp_opt_set() rejects a TCP_INIT_CWND request above tcp_max_init_cwnd
 * with EINVAL, even when the caller has the network configuration
 * privilege.
 */
#define	TCP_MAX_INIT_CWND	16

static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
267 
268 /*
269  * Some TCP options can be "set" by requesting them in the option
270  * buffer. This is needed for XTI feature test though we do not
271  * allow it in general. We interpret that this mechanism is more
272  * applicable to OSI protocols and need not be allowed in general.
273  * This routine filters out options for which it is not allowed (most)
274  * and lets through those (few) for which it is. [ The XTI interface
275  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
276  * ever implemented will have to be allowed here ].
277  */
278 static boolean_t
279 tcp_allow_connopt_set(int level, int name)
280 {
281 
282 	switch (level) {
283 	case IPPROTO_TCP:
284 		switch (name) {
285 		case TCP_NODELAY:
286 			return (B_TRUE);
287 		default:
288 			return (B_FALSE);
289 		}
290 		/*NOTREACHED*/
291 	default:
292 		return (B_FALSE);
293 	}
294 	/*NOTREACHED*/
295 }
296 
297 /*
298  * This routine gets default values of certain options whose default
299  * values are maintained by protocol specific code
300  */
301 /* ARGSUSED */
302 int
303 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
304 {
305 	int32_t	*i1 = (int32_t *)ptr;
306 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
307 
308 	switch (level) {
309 	case IPPROTO_TCP:
310 		switch (name) {
311 		case TCP_NOTIFY_THRESHOLD:
312 			*i1 = tcps->tcps_ip_notify_interval;
313 			break;
314 		case TCP_ABORT_THRESHOLD:
315 			*i1 = tcps->tcps_ip_abort_interval;
316 			break;
317 		case TCP_CONN_NOTIFY_THRESHOLD:
318 			*i1 = tcps->tcps_ip_notify_cinterval;
319 			break;
320 		case TCP_CONN_ABORT_THRESHOLD:
321 			*i1 = tcps->tcps_ip_abort_cinterval;
322 			break;
323 		default:
324 			return (-1);
325 		}
326 		break;
327 	case IPPROTO_IP:
328 		switch (name) {
329 		case IP_TTL:
330 			*i1 = tcps->tcps_ipv4_ttl;
331 			break;
332 		default:
333 			return (-1);
334 		}
335 		break;
336 	case IPPROTO_IPV6:
337 		switch (name) {
338 		case IPV6_UNICAST_HOPS:
339 			*i1 = tcps->tcps_ipv6_hoplimit;
340 			break;
341 		default:
342 			return (-1);
343 		}
344 		break;
345 	default:
346 		return (-1);
347 	}
348 	return (sizeof (int));
349 }
350 
351 /*
352  * TCP routine to get the values of options.
353  */
354 int
355 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
356 {
357 	int		*i1 = (int *)ptr;
358 	tcp_t		*tcp = connp->conn_tcp;
359 	conn_opt_arg_t	coas;
360 	int		retval;
361 
362 	coas.coa_connp = connp;
363 	coas.coa_ixa = connp->conn_ixa;
364 	coas.coa_ipp = &connp->conn_xmit_ipp;
365 	coas.coa_ancillary = B_FALSE;
366 	coas.coa_changed = 0;
367 
368 	switch (level) {
369 	case SOL_SOCKET:
370 		switch (name) {
371 		case SO_SND_COPYAVOID:
372 			*i1 = tcp->tcp_snd_zcopy_on ?
373 			    SO_SND_COPYAVOID : 0;
374 			return (sizeof (int));
375 		case SO_ACCEPTCONN:
376 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
377 			return (sizeof (int));
378 		}
379 		break;
380 	case IPPROTO_TCP:
381 		switch (name) {
382 		case TCP_NODELAY:
383 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
384 			return (sizeof (int));
385 		case TCP_MAXSEG:
386 			*i1 = tcp->tcp_mss;
387 			return (sizeof (int));
388 		case TCP_NOTIFY_THRESHOLD:
389 			*i1 = (int)tcp->tcp_first_timer_threshold;
390 			return (sizeof (int));
391 		case TCP_ABORT_THRESHOLD:
392 			*i1 = tcp->tcp_second_timer_threshold;
393 			return (sizeof (int));
394 		case TCP_CONN_NOTIFY_THRESHOLD:
395 			*i1 = tcp->tcp_first_ctimer_threshold;
396 			return (sizeof (int));
397 		case TCP_CONN_ABORT_THRESHOLD:
398 			*i1 = tcp->tcp_second_ctimer_threshold;
399 			return (sizeof (int));
400 		case TCP_INIT_CWND:
401 			*i1 = tcp->tcp_init_cwnd;
402 			return (sizeof (int));
403 		case TCP_KEEPALIVE_THRESHOLD:
404 			*i1 = tcp->tcp_ka_interval;
405 			return (sizeof (int));
406 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
407 			*i1 = tcp->tcp_ka_abort_thres;
408 			return (sizeof (int));
409 		case TCP_CORK:
410 			*i1 = tcp->tcp_cork;
411 			return (sizeof (int));
412 		case TCP_RTO_INITIAL:
413 			*i1 = tcp->tcp_rto_initial;
414 			return (sizeof (uint32_t));
415 		case TCP_RTO_MIN:
416 			*i1 = tcp->tcp_rto_min;
417 			return (sizeof (uint32_t));
418 		case TCP_RTO_MAX:
419 			*i1 = tcp->tcp_rto_max;
420 			return (sizeof (uint32_t));
421 		case TCP_LINGER2:
422 			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
423 			return (sizeof (int));
424 		}
425 		break;
426 	case IPPROTO_IP:
427 		if (connp->conn_family != AF_INET)
428 			return (-1);
429 		switch (name) {
430 		case IP_OPTIONS:
431 		case T_IP_OPTIONS:
432 			/* Caller ensures enough space */
433 			return (ip_opt_get_user(connp, ptr));
434 		default:
435 			break;
436 		}
437 		break;
438 
439 	case IPPROTO_IPV6:
440 		/*
441 		 * IPPROTO_IPV6 options are only supported for sockets
442 		 * that are using IPv6 on the wire.
443 		 */
444 		if (connp->conn_ipversion != IPV6_VERSION) {
445 			return (-1);
446 		}
447 		switch (name) {
448 		case IPV6_PATHMTU:
449 			if (tcp->tcp_state < TCPS_ESTABLISHED)
450 				return (-1);
451 			break;
452 		}
453 		break;
454 	}
455 	mutex_enter(&connp->conn_lock);
456 	retval = conn_opt_get(&coas, level, name, ptr);
457 	mutex_exit(&connp->conn_lock);
458 	return (retval);
459 }
460 
461 /*
462  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
463  * Parameters are assumed to be verified by the caller.
464  */
465 /* ARGSUSED */
466 int
467 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
468     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
469     void *thisdg_attrs, cred_t *cr)
470 {
471 	tcp_t	*tcp = connp->conn_tcp;
472 	int	*i1 = (int *)invalp;
473 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
474 	boolean_t checkonly;
475 	int	reterr;
476 	tcp_stack_t	*tcps = tcp->tcp_tcps;
477 	conn_opt_arg_t	coas;
478 	uint32_t	val = *((uint32_t *)invalp);
479 
480 	coas.coa_connp = connp;
481 	coas.coa_ixa = connp->conn_ixa;
482 	coas.coa_ipp = &connp->conn_xmit_ipp;
483 	coas.coa_ancillary = B_FALSE;
484 	coas.coa_changed = 0;
485 
486 	switch (optset_context) {
487 	case SETFN_OPTCOM_CHECKONLY:
488 		checkonly = B_TRUE;
489 		/*
490 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
491 		 * inlen != 0 implies value supplied and
492 		 * 	we have to "pretend" to set it.
493 		 * inlen == 0 implies that there is no
494 		 * 	value part in T_CHECK request and just validation
495 		 * done elsewhere should be enough, we just return here.
496 		 */
497 		if (inlen == 0) {
498 			*outlenp = 0;
499 			return (0);
500 		}
501 		break;
502 	case SETFN_OPTCOM_NEGOTIATE:
503 		checkonly = B_FALSE;
504 		break;
505 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
506 	case SETFN_CONN_NEGOTIATE:
507 		checkonly = B_FALSE;
508 		/*
509 		 * Negotiating local and "association-related" options
510 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
511 		 * primitives is allowed by XTI, but we choose
512 		 * to not implement this style negotiation for Internet
513 		 * protocols (We interpret it is a must for OSI world but
514 		 * optional for Internet protocols) for all options.
515 		 * [ Will do only for the few options that enable test
516 		 * suites that our XTI implementation of this feature
517 		 * works for transports that do allow it ]
518 		 */
519 		if (!tcp_allow_connopt_set(level, name)) {
520 			*outlenp = 0;
521 			return (EINVAL);
522 		}
523 		break;
524 	default:
525 		/*
526 		 * We should never get here
527 		 */
528 		*outlenp = 0;
529 		return (EINVAL);
530 	}
531 
532 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
533 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
534 
535 	/*
536 	 * For TCP, we should have no ancillary data sent down
537 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
538 	 * has to be zero.
539 	 */
540 	ASSERT(thisdg_attrs == NULL);
541 
542 	/*
543 	 * For fixed length options, no sanity check
544 	 * of passed in length is done. It is assumed *_optcom_req()
545 	 * routines do the right thing.
546 	 */
547 	switch (level) {
548 	case SOL_SOCKET:
549 		switch (name) {
550 		case SO_KEEPALIVE:
551 			if (checkonly) {
552 				/* check only case */
553 				break;
554 			}
555 
556 			if (!onoff) {
557 				if (connp->conn_keepalive) {
558 					if (tcp->tcp_ka_tid != 0) {
559 						(void) TCP_TIMER_CANCEL(tcp,
560 						    tcp->tcp_ka_tid);
561 						tcp->tcp_ka_tid = 0;
562 					}
563 					connp->conn_keepalive = 0;
564 				}
565 				break;
566 			}
567 			if (!connp->conn_keepalive) {
568 				/* Crank up the keepalive timer */
569 				tcp->tcp_ka_last_intrvl = 0;
570 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
571 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
572 				connp->conn_keepalive = 1;
573 			}
574 			break;
575 		case SO_SNDBUF: {
576 			if (*i1 > tcps->tcps_max_buf) {
577 				*outlenp = 0;
578 				return (ENOBUFS);
579 			}
580 			if (checkonly)
581 				break;
582 
583 			connp->conn_sndbuf = *i1;
584 			if (tcps->tcps_snd_lowat_fraction != 0) {
585 				connp->conn_sndlowat = connp->conn_sndbuf /
586 				    tcps->tcps_snd_lowat_fraction;
587 			}
588 			(void) tcp_maxpsz_set(tcp, B_TRUE);
589 			/*
590 			 * If we are flow-controlled, recheck the condition.
591 			 * There are apps that increase SO_SNDBUF size when
592 			 * flow-controlled (EWOULDBLOCK), and expect the flow
593 			 * control condition to be lifted right away.
594 			 */
595 			mutex_enter(&tcp->tcp_non_sq_lock);
596 			if (tcp->tcp_flow_stopped &&
597 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
598 				tcp_clrqfull(tcp);
599 			}
600 			mutex_exit(&tcp->tcp_non_sq_lock);
601 			*outlenp = inlen;
602 			return (0);
603 		}
604 		case SO_RCVBUF:
605 			if (*i1 > tcps->tcps_max_buf) {
606 				*outlenp = 0;
607 				return (ENOBUFS);
608 			}
609 			/* Silently ignore zero */
610 			if (!checkonly && *i1 != 0) {
611 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
612 				(void) tcp_rwnd_set(tcp, *i1);
613 			}
614 			/*
615 			 * XXX should we return the rwnd here
616 			 * and tcp_opt_get ?
617 			 */
618 			*outlenp = inlen;
619 			return (0);
620 		case SO_SND_COPYAVOID:
621 			if (!checkonly) {
622 				if (tcp->tcp_loopback ||
623 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
624 					*outlenp = 0;
625 					return (EOPNOTSUPP);
626 				}
627 				tcp->tcp_snd_zcopy_aware = 1;
628 			}
629 			*outlenp = inlen;
630 			return (0);
631 		}
632 		break;
633 	case IPPROTO_TCP:
634 		switch (name) {
635 		case TCP_NODELAY:
636 			if (!checkonly)
637 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
638 			break;
639 		case TCP_NOTIFY_THRESHOLD:
640 			if (!checkonly)
641 				tcp->tcp_first_timer_threshold = *i1;
642 			break;
643 		case TCP_ABORT_THRESHOLD:
644 			if (!checkonly)
645 				tcp->tcp_second_timer_threshold = *i1;
646 			break;
647 		case TCP_CONN_NOTIFY_THRESHOLD:
648 			if (!checkonly)
649 				tcp->tcp_first_ctimer_threshold = *i1;
650 			break;
651 		case TCP_CONN_ABORT_THRESHOLD:
652 			if (!checkonly)
653 				tcp->tcp_second_ctimer_threshold = *i1;
654 			break;
655 		case TCP_RECVDSTADDR:
656 			if (tcp->tcp_state > TCPS_LISTEN) {
657 				*outlenp = 0;
658 				return (EOPNOTSUPP);
659 			}
660 			/* Setting done in conn_opt_set */
661 			break;
662 		case TCP_INIT_CWND:
663 			if (checkonly)
664 				break;
665 
666 			/*
667 			 * Only allow socket with network configuration
668 			 * privilege to set the initial cwnd to be larger
669 			 * than allowed by RFC 3390.
670 			 */
671 			if (val <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
672 				tcp->tcp_init_cwnd = val;
673 				break;
674 			}
675 			if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
676 				*outlenp = 0;
677 				return (reterr);
678 			}
679 			if (val > tcp_max_init_cwnd) {
680 				*outlenp = 0;
681 				return (EINVAL);
682 			}
683 			tcp->tcp_init_cwnd = val;
684 			break;
685 		case TCP_KEEPALIVE_THRESHOLD:
686 			if (checkonly)
687 				break;
688 
689 			if (*i1 < tcps->tcps_keepalive_interval_low ||
690 			    *i1 > tcps->tcps_keepalive_interval_high) {
691 				*outlenp = 0;
692 				return (EINVAL);
693 			}
694 			if (*i1 != tcp->tcp_ka_interval) {
695 				tcp->tcp_ka_interval = *i1;
696 				/*
697 				 * Check if we need to restart the
698 				 * keepalive timer.
699 				 */
700 				if (tcp->tcp_ka_tid != 0) {
701 					ASSERT(connp->conn_keepalive);
702 					(void) TCP_TIMER_CANCEL(tcp,
703 					    tcp->tcp_ka_tid);
704 					tcp->tcp_ka_last_intrvl = 0;
705 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
706 					    tcp_keepalive_timer,
707 					    tcp->tcp_ka_interval);
708 				}
709 			}
710 			break;
711 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
712 			if (!checkonly) {
713 				if (*i1 <
714 				    tcps->tcps_keepalive_abort_interval_low ||
715 				    *i1 >
716 				    tcps->tcps_keepalive_abort_interval_high) {
717 					*outlenp = 0;
718 					return (EINVAL);
719 				}
720 				tcp->tcp_ka_abort_thres = *i1;
721 			}
722 			break;
723 		case TCP_CORK:
724 			if (!checkonly) {
725 				/*
726 				 * if tcp->tcp_cork was set and is now
727 				 * being unset, we have to make sure that
728 				 * the remaining data gets sent out. Also
729 				 * unset tcp->tcp_cork so that tcp_wput_data()
730 				 * can send data even if it is less than mss
731 				 */
732 				if (tcp->tcp_cork && onoff == 0 &&
733 				    tcp->tcp_unsent > 0) {
734 					tcp->tcp_cork = B_FALSE;
735 					tcp_wput_data(tcp, NULL, B_FALSE);
736 				}
737 				tcp->tcp_cork = onoff;
738 			}
739 			break;
740 		case TCP_RTO_INITIAL: {
741 			clock_t rto;
742 
743 			if (checkonly || val == 0)
744 				break;
745 
746 			/*
747 			 * Sanity checks
748 			 *
749 			 * The initial RTO should be bounded by the minimum
750 			 * and maximum RTO.  And it should also be smaller
751 			 * than the connect attempt abort timeout.  Otherwise,
752 			 * the connection won't be aborted in a period
753 			 * reasonably close to that timeout.
754 			 */
755 			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
756 			    val > tcp->tcp_second_ctimer_threshold ||
757 			    val < tcps->tcps_rexmit_interval_initial_low ||
758 			    val > tcps->tcps_rexmit_interval_initial_high) {
759 				*outlenp = 0;
760 				return (EINVAL);
761 			}
762 			tcp->tcp_rto_initial = val;
763 
764 			/*
765 			 * If TCP has not sent anything, need to re-calculate
766 			 * tcp_rto.  Otherwise, this option change does not
767 			 * really affect anything.
768 			 */
769 			if (tcp->tcp_state >= TCPS_SYN_SENT)
770 				break;
771 
772 			tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
773 			tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
774 			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
775 			    tcps->tcps_rexmit_interval_extra +
776 			    (tcp->tcp_rtt_sa >> 5) +
777 			    tcps->tcps_conn_grace_period;
778 			TCP_SET_RTO(tcp, rto);
779 			break;
780 		}
781 		case TCP_RTO_MIN:
782 			if (checkonly || val == 0)
783 				break;
784 
785 			if (val < tcps->tcps_rexmit_interval_min_low ||
786 			    val > tcps->tcps_rexmit_interval_min_high ||
787 			    val > tcp->tcp_rto_max) {
788 				*outlenp = 0;
789 				return (EINVAL);
790 			}
791 			tcp->tcp_rto_min = val;
792 			if (tcp->tcp_rto < val)
793 				tcp->tcp_rto = val;
794 			break;
795 		case TCP_RTO_MAX:
796 			if (checkonly || val == 0)
797 				break;
798 
799 			/*
800 			 * Sanity checks
801 			 *
802 			 * The maximum RTO should not be larger than the
803 			 * connection abort timeout.  Otherwise, the
804 			 * connection won't be aborted in a period reasonably
805 			 * close to that timeout.
806 			 */
807 			if (val < tcps->tcps_rexmit_interval_max_low ||
808 			    val > tcps->tcps_rexmit_interval_max_high ||
809 			    val < tcp->tcp_rto_min ||
810 			    val > tcp->tcp_second_timer_threshold) {
811 				*outlenp = 0;
812 				return (EINVAL);
813 			}
814 			tcp->tcp_rto_max = val;
815 			if (tcp->tcp_rto > val)
816 				tcp->tcp_rto = val;
817 			break;
818 		case TCP_LINGER2:
819 			if (checkonly || *i1 == 0)
820 				break;
821 
822 			/*
823 			 * Note that the option value's unit is second.  And
824 			 * the value should be bigger than the private
825 			 * parameter tcp_fin_wait_2_flush_interval's lower
826 			 * bound and smaller than the current value of that
827 			 * parameter.  It should be smaller than the current
828 			 * value to avoid an app setting TCP_LINGER2 to a big
829 			 * value, causing resource to be held up too long in
830 			 * FIN-WAIT-2 state.
831 			 */
832 			if (*i1 < 0 ||
833 			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
834 			    *i1 ||
835 			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
836 			    *i1) {
837 				*outlenp = 0;
838 				return (EINVAL);
839 			}
840 			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
841 			break;
842 		default:
843 			break;
844 		}
845 		break;
846 	case IPPROTO_IP:
847 		if (connp->conn_family != AF_INET) {
848 			*outlenp = 0;
849 			return (EINVAL);
850 		}
851 		switch (name) {
852 		case IP_SEC_OPT:
853 			/*
854 			 * We should not allow policy setting after
855 			 * we start listening for connections.
856 			 */
857 			if (tcp->tcp_state == TCPS_LISTEN) {
858 				return (EINVAL);
859 			}
860 			break;
861 		}
862 		break;
863 	case IPPROTO_IPV6:
864 		/*
865 		 * IPPROTO_IPV6 options are only supported for sockets
866 		 * that are using IPv6 on the wire.
867 		 */
868 		if (connp->conn_ipversion != IPV6_VERSION) {
869 			*outlenp = 0;
870 			return (EINVAL);
871 		}
872 
873 		switch (name) {
874 		case IPV6_RECVPKTINFO:
875 			if (!checkonly) {
876 				/* Force it to be sent up with the next msg */
877 				tcp->tcp_recvifindex = 0;
878 			}
879 			break;
880 		case IPV6_RECVTCLASS:
881 			if (!checkonly) {
882 				/* Force it to be sent up with the next msg */
883 				tcp->tcp_recvtclass = 0xffffffffU;
884 			}
885 			break;
886 		case IPV6_RECVHOPLIMIT:
887 			if (!checkonly) {
888 				/* Force it to be sent up with the next msg */
889 				tcp->tcp_recvhops = 0xffffffffU;
890 			}
891 			break;
892 		case IPV6_PKTINFO:
893 			/* This is an extra check for TCP */
894 			if (inlen == sizeof (struct in6_pktinfo)) {
895 				struct in6_pktinfo *pkti;
896 
897 				pkti = (struct in6_pktinfo *)invalp;
898 				/*
899 				 * RFC 3542 states that ipi6_addr must be
900 				 * the unspecified address when setting the
901 				 * IPV6_PKTINFO sticky socket option on a
902 				 * TCP socket.
903 				 */
904 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
905 					return (EINVAL);
906 			}
907 			break;
908 		case IPV6_SEC_OPT:
909 			/*
910 			 * We should not allow policy setting after
911 			 * we start listening for connections.
912 			 */
913 			if (tcp->tcp_state == TCPS_LISTEN) {
914 				return (EINVAL);
915 			}
916 			break;
917 		}
918 		break;
919 	}
920 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
921 	    checkonly, cr);
922 	if (reterr != 0) {
923 		*outlenp = 0;
924 		return (reterr);
925 	}
926 
927 	/*
928 	 * Common case of OK return with outval same as inval
929 	 */
930 	if (invalp != outvalp) {
931 		/* don't trust bcopy for identical src/dst */
932 		(void) bcopy(invalp, outvalp, inlen);
933 	}
934 	*outlenp = inlen;
935 
936 	if (coas.coa_changed & COA_HEADER_CHANGED) {
937 		/* If we are connected we rebuilt the headers */
938 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
939 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
940 			reterr = tcp_build_hdrs(tcp);
941 			if (reterr != 0)
942 				return (reterr);
943 		}
944 	}
945 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
946 		in6_addr_t nexthop;
947 
948 		/*
949 		 * If we are connected we re-cache the information.
950 		 * We ignore errors to preserve BSD behavior.
951 		 * Note that we don't redo IPsec policy lookup here
952 		 * since the final destination (or source) didn't change.
953 		 */
954 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
955 		    &connp->conn_faddr_v6, &nexthop);
956 
957 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
958 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
959 			(void) ip_attr_connect(connp, connp->conn_ixa,
960 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
961 			    &nexthop, connp->conn_fport, NULL, NULL,
962 			    IPDF_VERIFY_DST);
963 		}
964 	}
965 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
966 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
967 	}
968 	if (coas.coa_changed & COA_WROFF_CHANGED) {
969 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
970 		    tcps->tcps_wroff_xtra;
971 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
972 		    connp->conn_wroff);
973 	}
974 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
975 		if (IPCL_IS_NONSTR(connp))
976 			proto_set_rx_oob_opt(connp, onoff);
977 	}
978 	return (0);
979 }
980