1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #define	_SUN_TPI_VERSION 2
29 #include <sys/tihdr.h>
30 #include <sys/socket.h>
31 #include <sys/xti_xtiopt.h>
32 #include <sys/xti_inet.h>
33 #include <sys/policy.h>
34 
35 #include <inet/common.h>
36 #include <netinet/ip6.h>
37 #include <inet/ip.h>
38 
39 #include <netinet/in.h>
40 #include <netinet/tcp.h>
41 #include <inet/optcom.h>
42 #include <inet/proto_set.h>
43 #include <inet/tcp_impl.h>
44 
/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 *       and is the superset of options that can be performed on a TCP over IP
 *       stack.
 *
 * Every initializer supplies eight values.  They appear to be, in order:
 * option name, option level, two access masks (OA_R or OA_RW), a
 * privilege tag (OP_NP, OP_CONFIG, OP_PRIVPORT or OP_RAW), property flags
 * (0, OP_DEF_FN, OP_NODEFAULT, OP_VARLEN), the size of the option value in
 * bytes, and a default value.
 * NOTE(review): the authoritative field order and meaning is the opdes_t
 * definition in <inet/optcom.h>; confirm there before relying on this
 * summary.
 *
 * The entries flagged OP_DEF_FN (the four TCP_*_THRESHOLD timer options,
 * IP_TTL and IPV6_UNICAST_HOPS) are exactly the options for which
 * tcp_opt_default() in this file supplies a value.
 *
 * The number of entries is taken with A_CNT() (see TCP_OPT_ARR_CNT), so
 * rows can be added or removed without updating a separate count.
 */
opdes_t	tcp_opt_arr[] = {

/* SOL_SOCKET level options */
{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* IPPROTO_TCP level options */
{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

/* The four timer thresholds take their defaults from tcp_opt_default(). */
{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0	},

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0	},

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

/*
 * IPPROTO_IP level options.  A few IPPROTO_IPV6 entries (IPV6_UNICAST_HOPS,
 * IPV6_BOUND_IF, IPV6_UNSPEC_SRC) are interleaved with their IPv4
 * counterparts below.
 */
{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t),	-1 /* not initialized  */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

/* IPPROTO_IPV6 level options */
{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};
229 
/*
 * Table of all supported levels
 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 * any supported options so we need this info separately.
 *
 * This is needed only for topmost tpi providers and is used only by
 * XTI interfaces.
 *
 * XTI_GENERIC is the one level listed here that has no entry in
 * tcp_opt_arr; the other four levels each have option entries there.
 */
optlevel_t	tcp_valid_levels_arr[] = {
	XTI_GENERIC,
	SOL_SOCKET,
	IPPROTO_TCP,
	IPPROTO_IP,
	IPPROTO_IPV6
};
245 
246 
/*
 * Entry counts for tcp_opt_arr and tcp_valid_levels_arr.  Both are derived
 * from the arrays themselves with A_CNT() (presumably the usual
 * element-count macro; it is defined outside this file), and both are
 * stored in tcp_opt_obj.
 */
#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)

/* Storage only: nothing in this file assigns tcp_max_optsize. */
uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
251 
/*
 * Initialize option database object for TCP
 *
 * This object represents database of options to search passed to
 * {sock,tpi}optcom_req() interface routine to take care of option
 * management and associated methods.
 *
 * It bundles the default-value routine (tcp_opt_default, defined in this
 * file), the TPI get and set routines (tcp_tpi_opt_get and tcp_tpi_opt_set,
 * which are not defined in this file), and the option and level tables
 * together with their entry counts.  The initializer is positional, so
 * the order of the members below must match the optdb_obj_t declaration.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default,	/* TCP default value function pointer */
	tcp_tpi_opt_get,	/* TCP get function pointer */
	tcp_tpi_opt_set,	/* TCP set function pointer */
	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
	tcp_opt_arr,		/* TCP option database */
	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
	tcp_valid_levels_arr	/* TCP valid level array */
};
269 
/*
 * Maximum TCP initial congestion window (start/restart), counted in
 * segments: tcp_opt_set() multiplies the accepted value by tcp_mss when it
 * applies it to tcp_cwnd.
 */
#define	TCP_MAX_INIT_CWND	16

/*
 * Ceiling that tcp_opt_set() enforces on TCP_INIT_CWND requests exceeding
 * the unprivileged limit; a larger request fails with EINVAL.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;
274 
275 /*
276  * Some TCP options can be "set" by requesting them in the option
277  * buffer. This is needed for XTI feature test though we do not
278  * allow it in general. We interpret that this mechanism is more
279  * applicable to OSI protocols and need not be allowed in general.
280  * This routine filters out options for which it is not allowed (most)
281  * and lets through those (few) for which it is. [ The XTI interface
282  * test suite specifics will imply that any XTI_GENERIC level XTI_* if
283  * ever implemented will have to be allowed here ].
284  */
285 static boolean_t
286 tcp_allow_connopt_set(int level, int name)
287 {
288 
289 	switch (level) {
290 	case IPPROTO_TCP:
291 		switch (name) {
292 		case TCP_NODELAY:
293 			return (B_TRUE);
294 		default:
295 			return (B_FALSE);
296 		}
297 		/*NOTREACHED*/
298 	default:
299 		return (B_FALSE);
300 	}
301 	/*NOTREACHED*/
302 }
303 
304 /*
305  * This routine gets default values of certain options whose default
306  * values are maintained by protocol specific code
307  */
308 /* ARGSUSED */
309 int
310 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
311 {
312 	int32_t	*i1 = (int32_t *)ptr;
313 	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;
314 
315 	switch (level) {
316 	case IPPROTO_TCP:
317 		switch (name) {
318 		case TCP_NOTIFY_THRESHOLD:
319 			*i1 = tcps->tcps_ip_notify_interval;
320 			break;
321 		case TCP_ABORT_THRESHOLD:
322 			*i1 = tcps->tcps_ip_abort_interval;
323 			break;
324 		case TCP_CONN_NOTIFY_THRESHOLD:
325 			*i1 = tcps->tcps_ip_notify_cinterval;
326 			break;
327 		case TCP_CONN_ABORT_THRESHOLD:
328 			*i1 = tcps->tcps_ip_abort_cinterval;
329 			break;
330 		default:
331 			return (-1);
332 		}
333 		break;
334 	case IPPROTO_IP:
335 		switch (name) {
336 		case IP_TTL:
337 			*i1 = tcps->tcps_ipv4_ttl;
338 			break;
339 		default:
340 			return (-1);
341 		}
342 		break;
343 	case IPPROTO_IPV6:
344 		switch (name) {
345 		case IPV6_UNICAST_HOPS:
346 			*i1 = tcps->tcps_ipv6_hoplimit;
347 			break;
348 		default:
349 			return (-1);
350 		}
351 		break;
352 	default:
353 		return (-1);
354 	}
355 	return (sizeof (int));
356 }
357 
358 /*
359  * TCP routine to get the values of options.
360  */
361 int
362 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
363 {
364 	int		*i1 = (int *)ptr;
365 	tcp_t		*tcp = connp->conn_tcp;
366 	conn_opt_arg_t	coas;
367 	int		retval;
368 
369 	coas.coa_connp = connp;
370 	coas.coa_ixa = connp->conn_ixa;
371 	coas.coa_ipp = &connp->conn_xmit_ipp;
372 	coas.coa_ancillary = B_FALSE;
373 	coas.coa_changed = 0;
374 
375 	switch (level) {
376 	case SOL_SOCKET:
377 		switch (name) {
378 		case SO_SND_COPYAVOID:
379 			*i1 = tcp->tcp_snd_zcopy_on ?
380 			    SO_SND_COPYAVOID : 0;
381 			return (sizeof (int));
382 		case SO_ACCEPTCONN:
383 			*i1 = (tcp->tcp_state == TCPS_LISTEN);
384 			return (sizeof (int));
385 		}
386 		break;
387 	case IPPROTO_TCP:
388 		switch (name) {
389 		case TCP_NODELAY:
390 			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
391 			return (sizeof (int));
392 		case TCP_MAXSEG:
393 			*i1 = tcp->tcp_mss;
394 			return (sizeof (int));
395 		case TCP_NOTIFY_THRESHOLD:
396 			*i1 = (int)tcp->tcp_first_timer_threshold;
397 			return (sizeof (int));
398 		case TCP_ABORT_THRESHOLD:
399 			*i1 = tcp->tcp_second_timer_threshold;
400 			return (sizeof (int));
401 		case TCP_CONN_NOTIFY_THRESHOLD:
402 			*i1 = tcp->tcp_first_ctimer_threshold;
403 			return (sizeof (int));
404 		case TCP_CONN_ABORT_THRESHOLD:
405 			*i1 = tcp->tcp_second_ctimer_threshold;
406 			return (sizeof (int));
407 		case TCP_INIT_CWND:
408 			*i1 = tcp->tcp_init_cwnd;
409 			return (sizeof (int));
410 		case TCP_KEEPALIVE_THRESHOLD:
411 			*i1 = tcp->tcp_ka_interval;
412 			return (sizeof (int));
413 
414 		/*
415 		 * TCP_KEEPIDLE expects value in seconds, but
416 		 * tcp_ka_interval is in milliseconds.
417 		 */
418 		case TCP_KEEPIDLE:
419 			*i1 = tcp->tcp_ka_interval / 1000;
420 			return (sizeof (int));
421 		case TCP_KEEPCNT:
422 			*i1 = tcp->tcp_ka_cnt;
423 			return (sizeof (int));
424 
425 		/*
426 		 * TCP_KEEPINTVL expects value in seconds, but
427 		 * tcp_ka_rinterval is in milliseconds.
428 		 */
429 		case TCP_KEEPINTVL:
430 			*i1 = tcp->tcp_ka_rinterval / 1000;
431 			return (sizeof (int));
432 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
433 			*i1 = tcp->tcp_ka_abort_thres;
434 			return (sizeof (int));
435 		case TCP_CORK:
436 			*i1 = tcp->tcp_cork;
437 			return (sizeof (int));
438 		case TCP_RTO_INITIAL:
439 			*i1 = tcp->tcp_rto_initial;
440 			return (sizeof (uint32_t));
441 		case TCP_RTO_MIN:
442 			*i1 = tcp->tcp_rto_min;
443 			return (sizeof (uint32_t));
444 		case TCP_RTO_MAX:
445 			*i1 = tcp->tcp_rto_max;
446 			return (sizeof (uint32_t));
447 		case TCP_LINGER2:
448 			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
449 			return (sizeof (int));
450 		}
451 		break;
452 	case IPPROTO_IP:
453 		if (connp->conn_family != AF_INET)
454 			return (-1);
455 		switch (name) {
456 		case IP_OPTIONS:
457 		case T_IP_OPTIONS:
458 			/* Caller ensures enough space */
459 			return (ip_opt_get_user(connp, ptr));
460 		default:
461 			break;
462 		}
463 		break;
464 
465 	case IPPROTO_IPV6:
466 		/*
467 		 * IPPROTO_IPV6 options are only supported for sockets
468 		 * that are using IPv6 on the wire.
469 		 */
470 		if (connp->conn_ipversion != IPV6_VERSION) {
471 			return (-1);
472 		}
473 		switch (name) {
474 		case IPV6_PATHMTU:
475 			if (tcp->tcp_state < TCPS_ESTABLISHED)
476 				return (-1);
477 			break;
478 		}
479 		break;
480 	}
481 	mutex_enter(&connp->conn_lock);
482 	retval = conn_opt_get(&coas, level, name, ptr);
483 	mutex_exit(&connp->conn_lock);
484 	return (retval);
485 }
486 
487 /*
488  * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
489  * Parameters are assumed to be verified by the caller.
490  */
491 /* ARGSUSED */
492 int
493 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
494     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
495     void *thisdg_attrs, cred_t *cr)
496 {
497 	tcp_t	*tcp = connp->conn_tcp;
498 	int	*i1 = (int *)invalp;
499 	boolean_t onoff = (*i1 == 0) ? 0 : 1;
500 	boolean_t checkonly;
501 	int	reterr;
502 	tcp_stack_t	*tcps = tcp->tcp_tcps;
503 	conn_opt_arg_t	coas;
504 	uint32_t	val = *((uint32_t *)invalp);
505 
506 	coas.coa_connp = connp;
507 	coas.coa_ixa = connp->conn_ixa;
508 	coas.coa_ipp = &connp->conn_xmit_ipp;
509 	coas.coa_ancillary = B_FALSE;
510 	coas.coa_changed = 0;
511 
512 	switch (optset_context) {
513 	case SETFN_OPTCOM_CHECKONLY:
514 		checkonly = B_TRUE;
515 		/*
516 		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
517 		 * inlen != 0 implies value supplied and
518 		 * 	we have to "pretend" to set it.
519 		 * inlen == 0 implies that there is no
520 		 * 	value part in T_CHECK request and just validation
521 		 * done elsewhere should be enough, we just return here.
522 		 */
523 		if (inlen == 0) {
524 			*outlenp = 0;
525 			return (0);
526 		}
527 		break;
528 	case SETFN_OPTCOM_NEGOTIATE:
529 		checkonly = B_FALSE;
530 		break;
531 	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
532 	case SETFN_CONN_NEGOTIATE:
533 		checkonly = B_FALSE;
534 		/*
535 		 * Negotiating local and "association-related" options
536 		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
537 		 * primitives is allowed by XTI, but we choose
538 		 * to not implement this style negotiation for Internet
539 		 * protocols (We interpret it is a must for OSI world but
540 		 * optional for Internet protocols) for all options.
541 		 * [ Will do only for the few options that enable test
542 		 * suites that our XTI implementation of this feature
543 		 * works for transports that do allow it ]
544 		 */
545 		if (!tcp_allow_connopt_set(level, name)) {
546 			*outlenp = 0;
547 			return (EINVAL);
548 		}
549 		break;
550 	default:
551 		/*
552 		 * We should never get here
553 		 */
554 		*outlenp = 0;
555 		return (EINVAL);
556 	}
557 
558 	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
559 	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
560 
561 	/*
562 	 * For TCP, we should have no ancillary data sent down
563 	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
564 	 * has to be zero.
565 	 */
566 	ASSERT(thisdg_attrs == NULL);
567 
568 	/*
569 	 * For fixed length options, no sanity check
570 	 * of passed in length is done. It is assumed *_optcom_req()
571 	 * routines do the right thing.
572 	 */
573 	switch (level) {
574 	case SOL_SOCKET:
575 		switch (name) {
576 		case SO_KEEPALIVE:
577 			if (checkonly) {
578 				/* check only case */
579 				break;
580 			}
581 
582 			if (!onoff) {
583 				if (connp->conn_keepalive) {
584 					if (tcp->tcp_ka_tid != 0) {
585 						(void) TCP_TIMER_CANCEL(tcp,
586 						    tcp->tcp_ka_tid);
587 						tcp->tcp_ka_tid = 0;
588 					}
589 					connp->conn_keepalive = 0;
590 				}
591 				break;
592 			}
593 			if (!connp->conn_keepalive) {
594 				/* Crank up the keepalive timer */
595 				tcp->tcp_ka_last_intrvl = 0;
596 				tcp->tcp_ka_tid = TCP_TIMER(tcp,
597 				    tcp_keepalive_timer, tcp->tcp_ka_interval);
598 				connp->conn_keepalive = 1;
599 			}
600 			break;
601 		case SO_SNDBUF: {
602 			if (*i1 > tcps->tcps_max_buf) {
603 				*outlenp = 0;
604 				return (ENOBUFS);
605 			}
606 			if (checkonly)
607 				break;
608 
609 			connp->conn_sndbuf = *i1;
610 			if (tcps->tcps_snd_lowat_fraction != 0) {
611 				connp->conn_sndlowat = connp->conn_sndbuf /
612 				    tcps->tcps_snd_lowat_fraction;
613 			}
614 			(void) tcp_maxpsz_set(tcp, B_TRUE);
615 			/*
616 			 * If we are flow-controlled, recheck the condition.
617 			 * There are apps that increase SO_SNDBUF size when
618 			 * flow-controlled (EWOULDBLOCK), and expect the flow
619 			 * control condition to be lifted right away.
620 			 */
621 			mutex_enter(&tcp->tcp_non_sq_lock);
622 			if (tcp->tcp_flow_stopped &&
623 			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
624 				tcp_clrqfull(tcp);
625 			}
626 			mutex_exit(&tcp->tcp_non_sq_lock);
627 			*outlenp = inlen;
628 			return (0);
629 		}
630 		case SO_RCVBUF:
631 			if (*i1 > tcps->tcps_max_buf) {
632 				*outlenp = 0;
633 				return (ENOBUFS);
634 			}
635 			/* Silently ignore zero */
636 			if (!checkonly && *i1 != 0) {
637 				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
638 				(void) tcp_rwnd_set(tcp, *i1);
639 			}
640 			/*
641 			 * XXX should we return the rwnd here
642 			 * and tcp_opt_get ?
643 			 */
644 			*outlenp = inlen;
645 			return (0);
646 		case SO_SND_COPYAVOID:
647 			if (!checkonly) {
648 				if (tcp->tcp_loopback ||
649 				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
650 					*outlenp = 0;
651 					return (EOPNOTSUPP);
652 				}
653 				tcp->tcp_snd_zcopy_aware = 1;
654 			}
655 			*outlenp = inlen;
656 			return (0);
657 		}
658 		break;
659 	case IPPROTO_TCP:
660 		switch (name) {
661 		case TCP_NODELAY:
662 			if (!checkonly)
663 				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
664 			break;
665 		case TCP_NOTIFY_THRESHOLD:
666 			if (!checkonly)
667 				tcp->tcp_first_timer_threshold = *i1;
668 			break;
669 		case TCP_ABORT_THRESHOLD:
670 			if (!checkonly)
671 				tcp->tcp_second_timer_threshold = *i1;
672 			break;
673 		case TCP_CONN_NOTIFY_THRESHOLD:
674 			if (!checkonly)
675 				tcp->tcp_first_ctimer_threshold = *i1;
676 			break;
677 		case TCP_CONN_ABORT_THRESHOLD:
678 			if (!checkonly)
679 				tcp->tcp_second_ctimer_threshold = *i1;
680 			break;
681 		case TCP_RECVDSTADDR:
682 			if (tcp->tcp_state > TCPS_LISTEN) {
683 				*outlenp = 0;
684 				return (EOPNOTSUPP);
685 			}
686 			/* Setting done in conn_opt_set */
687 			break;
688 		case TCP_INIT_CWND:
689 			if (checkonly)
690 				break;
691 
692 			/*
693 			 * Only allow socket with network configuration
694 			 * privilege to set the initial cwnd to be larger
695 			 * than allowed by RFC 3390.
696 			 */
697 			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
698 				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
699 				    != 0) {
700 					*outlenp = 0;
701 					return (reterr);
702 				}
703 				if (val > tcp_max_init_cwnd) {
704 					*outlenp = 0;
705 					return (EINVAL);
706 				}
707 			}
708 
709 			tcp->tcp_init_cwnd = val;
710 
711 			/*
712 			 * If the socket is connected, AND no outbound data
713 			 * has been sent, reset the actual cwnd values.
714 			 */
715 			if (tcp->tcp_state == TCPS_ESTABLISHED &&
716 			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
717 				tcp->tcp_cwnd =
718 				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
719 			}
720 			break;
721 
722 		/*
723 		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
724 		 * is in milliseconds. TCP_KEEPIDLE is introduced for
725 		 * compatibility with other Unix flavors.
726 		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
727 		 * converting the input to milliseconds.
728 		 */
729 		case TCP_KEEPIDLE:
730 			*i1 *= 1000;
731 			/* FALLTHRU */
732 
733 		case TCP_KEEPALIVE_THRESHOLD:
734 			if (checkonly)
735 				break;
736 
737 			if (*i1 < tcps->tcps_keepalive_interval_low ||
738 			    *i1 > tcps->tcps_keepalive_interval_high) {
739 				*outlenp = 0;
740 				return (EINVAL);
741 			}
742 			if (*i1 != tcp->tcp_ka_interval) {
743 				tcp->tcp_ka_interval = *i1;
744 				/*
745 				 * Check if we need to restart the
746 				 * keepalive timer.
747 				 */
748 				if (tcp->tcp_ka_tid != 0) {
749 					ASSERT(connp->conn_keepalive);
750 					(void) TCP_TIMER_CANCEL(tcp,
751 					    tcp->tcp_ka_tid);
752 					tcp->tcp_ka_last_intrvl = 0;
753 					tcp->tcp_ka_tid = TCP_TIMER(tcp,
754 					    tcp_keepalive_timer,
755 					    tcp->tcp_ka_interval);
756 				}
757 			}
758 			break;
759 
760 		/*
761 		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
762 		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
763 		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
764 		 * tcp_ka_cnt.
765 		 */
766 		case TCP_KEEPCNT:
767 			if (checkonly)
768 				break;
769 
770 			if (*i1 == 0) {
771 				return (EINVAL);
772 			} else if (tcp->tcp_ka_rinterval == 0) {
773 				if ((tcp->tcp_ka_abort_thres / *i1) <
774 				    tcp->tcp_rto_min ||
775 				    (tcp->tcp_ka_abort_thres / *i1) >
776 				    tcp->tcp_rto_max)
777 					return (EINVAL);
778 
779 				tcp->tcp_ka_rinterval =
780 				    tcp->tcp_ka_abort_thres / *i1;
781 			} else {
782 				if ((*i1 * tcp->tcp_ka_rinterval) <
783 				    tcps->tcps_keepalive_abort_interval_low ||
784 				    (*i1 * tcp->tcp_ka_rinterval) >
785 				    tcps->tcps_keepalive_abort_interval_high)
786 					return (EINVAL);
787 				tcp->tcp_ka_abort_thres =
788 				    (*i1 * tcp->tcp_ka_rinterval);
789 			}
790 			tcp->tcp_ka_cnt = *i1;
791 			break;
792 		case TCP_KEEPINTVL:
793 			/*
794 			 * TCP_KEEPINTVL is specified in seconds, but
795 			 * tcp_ka_rinterval is in milliseconds.
796 			 */
797 
798 			if (checkonly)
799 				break;
800 
801 			if ((*i1 * 1000) < tcp->tcp_rto_min ||
802 			    (*i1 * 1000) > tcp->tcp_rto_max)
803 				return (EINVAL);
804 
805 			if (tcp->tcp_ka_cnt == 0) {
806 				tcp->tcp_ka_cnt =
807 				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
808 			} else {
809 				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
810 				    tcps->tcps_keepalive_abort_interval_low ||
811 				    (*i1 * tcp->tcp_ka_cnt * 1000) >
812 				    tcps->tcps_keepalive_abort_interval_high)
813 					return (EINVAL);
814 				tcp->tcp_ka_abort_thres =
815 				    (*i1 * tcp->tcp_ka_cnt * 1000);
816 			}
817 			tcp->tcp_ka_rinterval = *i1 * 1000;
818 			break;
819 		case TCP_KEEPALIVE_ABORT_THRESHOLD:
820 			if (!checkonly) {
821 				if (*i1 <
822 				    tcps->tcps_keepalive_abort_interval_low ||
823 				    *i1 >
824 				    tcps->tcps_keepalive_abort_interval_high) {
825 					*outlenp = 0;
826 					return (EINVAL);
827 				}
828 				tcp->tcp_ka_abort_thres = *i1;
829 				tcp->tcp_ka_cnt = 0;
830 				tcp->tcp_ka_rinterval = 0;
831 			}
832 			break;
833 		case TCP_CORK:
834 			if (!checkonly) {
835 				/*
836 				 * if tcp->tcp_cork was set and is now
837 				 * being unset, we have to make sure that
838 				 * the remaining data gets sent out. Also
839 				 * unset tcp->tcp_cork so that tcp_wput_data()
840 				 * can send data even if it is less than mss
841 				 */
842 				if (tcp->tcp_cork && onoff == 0 &&
843 				    tcp->tcp_unsent > 0) {
844 					tcp->tcp_cork = B_FALSE;
845 					tcp_wput_data(tcp, NULL, B_FALSE);
846 				}
847 				tcp->tcp_cork = onoff;
848 			}
849 			break;
850 		case TCP_RTO_INITIAL: {
851 			clock_t rto;
852 
853 			if (checkonly || val == 0)
854 				break;
855 
856 			/*
857 			 * Sanity checks
858 			 *
859 			 * The initial RTO should be bounded by the minimum
860 			 * and maximum RTO.  And it should also be smaller
861 			 * than the connect attempt abort timeout.  Otherwise,
862 			 * the connection won't be aborted in a period
863 			 * reasonably close to that timeout.
864 			 */
865 			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
866 			    val > tcp->tcp_second_ctimer_threshold ||
867 			    val < tcps->tcps_rexmit_interval_initial_low ||
868 			    val > tcps->tcps_rexmit_interval_initial_high) {
869 				*outlenp = 0;
870 				return (EINVAL);
871 			}
872 			tcp->tcp_rto_initial = val;
873 
874 			/*
875 			 * If TCP has not sent anything, need to re-calculate
876 			 * tcp_rto.  Otherwise, this option change does not
877 			 * really affect anything.
878 			 */
879 			if (tcp->tcp_state >= TCPS_SYN_SENT)
880 				break;
881 
882 			tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
883 			tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
884 			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
885 			    tcps->tcps_rexmit_interval_extra +
886 			    (tcp->tcp_rtt_sa >> 5) +
887 			    tcps->tcps_conn_grace_period;
888 			TCP_SET_RTO(tcp, rto);
889 			break;
890 		}
891 		case TCP_RTO_MIN:
892 			if (checkonly || val == 0)
893 				break;
894 
895 			if (val < tcps->tcps_rexmit_interval_min_low ||
896 			    val > tcps->tcps_rexmit_interval_min_high ||
897 			    val > tcp->tcp_rto_max) {
898 				*outlenp = 0;
899 				return (EINVAL);
900 			}
901 			tcp->tcp_rto_min = val;
902 			if (tcp->tcp_rto < val)
903 				tcp->tcp_rto = val;
904 			break;
905 		case TCP_RTO_MAX:
906 			if (checkonly || val == 0)
907 				break;
908 
909 			/*
910 			 * Sanity checks
911 			 *
912 			 * The maximum RTO should not be larger than the
913 			 * connection abort timeout.  Otherwise, the
914 			 * connection won't be aborted in a period reasonably
915 			 * close to that timeout.
916 			 */
917 			if (val < tcps->tcps_rexmit_interval_max_low ||
918 			    val > tcps->tcps_rexmit_interval_max_high ||
919 			    val < tcp->tcp_rto_min ||
920 			    val > tcp->tcp_second_timer_threshold) {
921 				*outlenp = 0;
922 				return (EINVAL);
923 			}
924 			tcp->tcp_rto_max = val;
925 			if (tcp->tcp_rto > val)
926 				tcp->tcp_rto = val;
927 			break;
928 		case TCP_LINGER2:
929 			if (checkonly || *i1 == 0)
930 				break;
931 
932 			/*
933 			 * Note that the option value's unit is second.  And
934 			 * the value should be bigger than the private
935 			 * parameter tcp_fin_wait_2_flush_interval's lower
936 			 * bound and smaller than the current value of that
937 			 * parameter.  It should be smaller than the current
938 			 * value to avoid an app setting TCP_LINGER2 to a big
939 			 * value, causing resource to be held up too long in
940 			 * FIN-WAIT-2 state.
941 			 */
942 			if (*i1 < 0 ||
943 			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
944 			    *i1 ||
945 			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
946 			    *i1) {
947 				*outlenp = 0;
948 				return (EINVAL);
949 			}
950 			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
951 			break;
952 		default:
953 			break;
954 		}
955 		break;
956 	case IPPROTO_IP:
957 		if (connp->conn_family != AF_INET) {
958 			*outlenp = 0;
959 			return (EINVAL);
960 		}
961 		switch (name) {
962 		case IP_SEC_OPT:
963 			/*
964 			 * We should not allow policy setting after
965 			 * we start listening for connections.
966 			 */
967 			if (tcp->tcp_state == TCPS_LISTEN) {
968 				return (EINVAL);
969 			}
970 			break;
971 		}
972 		break;
973 	case IPPROTO_IPV6:
974 		/*
975 		 * IPPROTO_IPV6 options are only supported for sockets
976 		 * that are using IPv6 on the wire.
977 		 */
978 		if (connp->conn_ipversion != IPV6_VERSION) {
979 			*outlenp = 0;
980 			return (EINVAL);
981 		}
982 
983 		switch (name) {
984 		case IPV6_RECVPKTINFO:
985 			if (!checkonly) {
986 				/* Force it to be sent up with the next msg */
987 				tcp->tcp_recvifindex = 0;
988 			}
989 			break;
990 		case IPV6_RECVTCLASS:
991 			if (!checkonly) {
992 				/* Force it to be sent up with the next msg */
993 				tcp->tcp_recvtclass = 0xffffffffU;
994 			}
995 			break;
996 		case IPV6_RECVHOPLIMIT:
997 			if (!checkonly) {
998 				/* Force it to be sent up with the next msg */
999 				tcp->tcp_recvhops = 0xffffffffU;
1000 			}
1001 			break;
1002 		case IPV6_PKTINFO:
1003 			/* This is an extra check for TCP */
1004 			if (inlen == sizeof (struct in6_pktinfo)) {
1005 				struct in6_pktinfo *pkti;
1006 
1007 				pkti = (struct in6_pktinfo *)invalp;
1008 				/*
1009 				 * RFC 3542 states that ipi6_addr must be
1010 				 * the unspecified address when setting the
1011 				 * IPV6_PKTINFO sticky socket option on a
1012 				 * TCP socket.
1013 				 */
1014 				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
1015 					return (EINVAL);
1016 			}
1017 			break;
1018 		case IPV6_SEC_OPT:
1019 			/*
1020 			 * We should not allow policy setting after
1021 			 * we start listening for connections.
1022 			 */
1023 			if (tcp->tcp_state == TCPS_LISTEN) {
1024 				return (EINVAL);
1025 			}
1026 			break;
1027 		}
1028 		break;
1029 	}
1030 	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
1031 	    checkonly, cr);
1032 	if (reterr != 0) {
1033 		*outlenp = 0;
1034 		return (reterr);
1035 	}
1036 
1037 	/*
1038 	 * Common case of OK return with outval same as inval
1039 	 */
1040 	if (invalp != outvalp) {
1041 		/* don't trust bcopy for identical src/dst */
1042 		(void) bcopy(invalp, outvalp, inlen);
1043 	}
1044 	*outlenp = inlen;
1045 
1046 	if (coas.coa_changed & COA_HEADER_CHANGED) {
1047 		/* If we are connected we rebuilt the headers */
1048 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1049 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1050 			reterr = tcp_build_hdrs(tcp);
1051 			if (reterr != 0)
1052 				return (reterr);
1053 		}
1054 	}
1055 	if (coas.coa_changed & COA_ROUTE_CHANGED) {
1056 		in6_addr_t nexthop;
1057 
1058 		/*
1059 		 * If we are connected we re-cache the information.
1060 		 * We ignore errors to preserve BSD behavior.
1061 		 * Note that we don't redo IPsec policy lookup here
1062 		 * since the final destination (or source) didn't change.
1063 		 */
1064 		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
1065 		    &connp->conn_faddr_v6, &nexthop);
1066 
1067 		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1068 		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1069 			(void) ip_attr_connect(connp, connp->conn_ixa,
1070 			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
1071 			    &nexthop, connp->conn_fport, NULL, NULL,
1072 			    IPDF_VERIFY_DST);
1073 		}
1074 	}
1075 	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
1076 		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
1077 	}
1078 	if (coas.coa_changed & COA_WROFF_CHANGED) {
1079 		connp->conn_wroff = connp->conn_ht_iphc_allocated +
1080 		    tcps->tcps_wroff_xtra;
1081 		(void) proto_set_tx_wroff(connp->conn_rq, connp,
1082 		    connp->conn_wroff);
1083 	}
1084 	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
1085 		if (IPCL_IS_NONSTR(connp))
1086 			proto_set_rx_oob_opt(connp, onoff);
1087 	}
1088 	return (0);
1089 }
1090