1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /* This file contains all TCP kernel socket related functions. */
27 
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41 
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47 
/*
 * Forward declarations of the file-local socket downcall handlers. They are
 * referenced by the sock_tcp_downcalls table below and defined later in
 * this file.
 */
static void	tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
		    sock_upcalls_t *, int, cred_t *);
static int	tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
		    sock_upper_handle_t, cred_t *);
static int	tcp_bind(sock_lower_handle_t, struct sockaddr *,
		    socklen_t, cred_t *);
static int	tcp_listen(sock_lower_handle_t, int, cred_t *);
static int	tcp_connect(sock_lower_handle_t, const struct sockaddr *,
		    socklen_t, sock_connid_t *, cred_t *);
static int	tcp_getsockopt(sock_lower_handle_t, int, int, void *,
		    socklen_t *, cred_t *);
static int	tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
		    socklen_t, cred_t *);
static int	tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
		    cred_t *cr);
static int	tcp_shutdown(sock_lower_handle_t, int, cred_t *);
static void	tcp_clr_flowctrl(sock_lower_handle_t);
static int	tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
		    cred_t *);
static int	tcp_close(sock_lower_handle_t, int, cred_t *);
68 
/*
 * Downcall vector that tcp_create() hands back to its caller through the
 * `sock_downcalls' out-parameter. The initializer is positional, so the
 * order of the entries must follow the member order of sock_downcalls_t.
 *
 * NOTE(review): the three NULL slots are operations TCP does not supply
 * here; presumably the socket layer treats NULL as "not implemented" --
 * confirm against the sock_downcalls_t definition.
 */
sock_downcalls_t sock_tcp_downcalls = {
	tcp_activate,
	tcp_accept,
	tcp_bind,
	tcp_listen,
	tcp_connect,
	tcp_getpeername,
	tcp_getsockname,
	tcp_getsockopt,
	tcp_setsockopt,
	tcp_sendmsg,
	NULL,
	NULL,
	NULL,
	tcp_shutdown,
	tcp_clr_flowctrl,
	tcp_ioctl,
	tcp_close,
};
88 
89 /* ARGSUSED */
90 static void
91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
92     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
93 {
94 	conn_t *connp = (conn_t *)proto_handle;
95 	struct sock_proto_props sopp;
96 	extern struct module_info tcp_rinfo;
97 
98 	ASSERT(connp->conn_upper_handle == NULL);
99 
100 	/* All Solaris components should pass a cred for this operation. */
101 	ASSERT(cr != NULL);
102 
103 	sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
104 	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
105 	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
106 
107 	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
108 	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
109 	sopp.sopp_maxpsz = INFPSZ;
110 	sopp.sopp_maxblk = INFPSZ;
111 	sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
112 	sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
113 	sopp.sopp_maxaddrlen = sizeof (sin6_t);
114 	sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
115 	    tcp_rinfo.mi_minpsz;
116 
117 	connp->conn_upcalls = sock_upcalls;
118 	connp->conn_upper_handle = sock_handle;
119 
120 	ASSERT(connp->conn_rcvbuf != 0 &&
121 	    connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
122 	(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
123 }
124 
125 /*ARGSUSED*/
126 static int
127 tcp_accept(sock_lower_handle_t lproto_handle,
128     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
129     cred_t *cr)
130 {
131 	conn_t *lconnp, *econnp;
132 	tcp_t *listener, *eager;
133 
134 	/*
135 	 * KSSL can move a socket from one listener to another, in which
136 	 * case `lproto_handle' points to the new listener. To ensure that
137 	 * the original listener is used the information is obtained from
138 	 * the eager.
139 	 */
140 	econnp = (conn_t *)eproto_handle;
141 	eager = econnp->conn_tcp;
142 	ASSERT(IPCL_IS_NONSTR(econnp));
143 	ASSERT(eager->tcp_listener != NULL);
144 	listener = eager->tcp_listener;
145 	lconnp = (conn_t *)listener->tcp_connp;
146 	ASSERT(listener->tcp_state == TCPS_LISTEN);
147 	ASSERT(lconnp->conn_upper_handle != NULL);
148 
149 	/*
150 	 * It is possible for the accept thread to race with the thread that
151 	 * made the su_newconn upcall in tcp_newconn_notify. Both
152 	 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
153 	 * and conn_upcalls be set before returning, so they both write to
154 	 * them. However, we're guaranteed that the value written is the same
155 	 * for both threads.
156 	 */
157 	ASSERT(econnp->conn_upper_handle == NULL ||
158 	    econnp->conn_upper_handle == sock_handle);
159 	ASSERT(econnp->conn_upcalls == NULL ||
160 	    econnp->conn_upcalls == lconnp->conn_upcalls);
161 	econnp->conn_upper_handle = sock_handle;
162 	econnp->conn_upcalls = lconnp->conn_upcalls;
163 
164 	ASSERT(econnp->conn_netstack ==
165 	    listener->tcp_connp->conn_netstack);
166 	ASSERT(eager->tcp_tcps == listener->tcp_tcps);
167 
168 	/*
169 	 * We should have a minimum of 2 references on the conn at this
170 	 * point. One for TCP and one for the newconn notification
171 	 * (which is now taken over by IP). In the normal case we would
172 	 * also have another reference (making a total of 3) for the conn
173 	 * being in the classifier hash list. However the eager could have
174 	 * received an RST subsequently and tcp_closei_local could have
175 	 * removed the eager from the classifier hash list, hence we can't
176 	 * assert that reference.
177 	 */
178 	ASSERT(econnp->conn_ref >= 2);
179 
180 	mutex_enter(&listener->tcp_eager_lock);
181 	/*
182 	 * Non-STREAMS listeners never defer the notification of new
183 	 * connections.
184 	 */
185 	ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
186 	tcp_eager_unlink(eager);
187 	mutex_exit(&listener->tcp_eager_lock);
188 	CONN_DEC_REF(listener->tcp_connp);
189 
190 	return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
191 }
192 
193 static int
194 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
195     socklen_t len, cred_t *cr)
196 {
197 	int 		error;
198 	conn_t		*connp = (conn_t *)proto_handle;
199 
200 	/* All Solaris components should pass a cred for this operation. */
201 	ASSERT(cr != NULL);
202 	ASSERT(connp->conn_upper_handle != NULL);
203 
204 	error = squeue_synch_enter(connp, NULL);
205 	if (error != 0) {
206 		/* failed to enter */
207 		return (ENOSR);
208 	}
209 
210 	/* binding to a NULL address really means unbind */
211 	if (sa == NULL) {
212 		if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
213 			error = tcp_do_unbind(connp);
214 		else
215 			error = EINVAL;
216 	} else {
217 		error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
218 	}
219 
220 	squeue_synch_exit(connp);
221 
222 	if (error < 0) {
223 		if (error == -TOUTSTATE)
224 			error = EINVAL;
225 		else
226 			error = proto_tlitosyserr(-error);
227 	}
228 
229 	return (error);
230 }
231 
232 /* ARGSUSED */
233 static int
234 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
235 {
236 	conn_t	*connp = (conn_t *)proto_handle;
237 	tcp_t	*tcp = connp->conn_tcp;
238 	int 	error;
239 
240 	ASSERT(connp->conn_upper_handle != NULL);
241 
242 	/* All Solaris components should pass a cred for this operation. */
243 	ASSERT(cr != NULL);
244 
245 	error = squeue_synch_enter(connp, NULL);
246 	if (error != 0) {
247 		/* failed to enter */
248 		return (ENOBUFS);
249 	}
250 
251 	error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
252 	if (error == 0) {
253 		/*
254 		 * sockfs needs to know what's the maximum number of socket
255 		 * that can be queued on the listener.
256 		 */
257 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
258 		    SOCK_OPCTL_ENAB_ACCEPT,
259 		    (uintptr_t)(tcp->tcp_conn_req_max +
260 		    tcp->tcp_tcps->tcps_conn_req_max_q0));
261 	} else if (error < 0) {
262 		if (error == -TOUTSTATE)
263 			error = EINVAL;
264 		else
265 			error = proto_tlitosyserr(-error);
266 	}
267 	squeue_synch_exit(connp);
268 	return (error);
269 }
270 
271 static int
272 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
273     socklen_t len, sock_connid_t *id, cred_t *cr)
274 {
275 	conn_t		*connp = (conn_t *)proto_handle;
276 	int		error;
277 
278 	ASSERT(connp->conn_upper_handle != NULL);
279 
280 	/* All Solaris components should pass a cred for this operation. */
281 	ASSERT(cr != NULL);
282 
283 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
284 	if (error != 0) {
285 		return (error);
286 	}
287 
288 	error = squeue_synch_enter(connp, NULL);
289 	if (error != 0) {
290 		/* failed to enter */
291 		return (ENOSR);
292 	}
293 
294 	/*
295 	 * TCP supports quick connect, so no need to do an implicit bind
296 	 */
297 	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
298 	if (error == 0) {
299 		*id = connp->conn_tcp->tcp_connid;
300 	} else if (error < 0) {
301 		if (error == -TOUTSTATE) {
302 			switch (connp->conn_tcp->tcp_state) {
303 			case TCPS_SYN_SENT:
304 				error = EALREADY;
305 				break;
306 			case TCPS_ESTABLISHED:
307 				error = EISCONN;
308 				break;
309 			case TCPS_LISTEN:
310 				error = EOPNOTSUPP;
311 				break;
312 			default:
313 				error = EINVAL;
314 				break;
315 			}
316 		} else {
317 			error = proto_tlitosyserr(-error);
318 		}
319 	}
320 
321 	if (connp->conn_tcp->tcp_loopback) {
322 		struct sock_proto_props sopp;
323 
324 		sopp.sopp_flags = SOCKOPT_LOOPBACK;
325 		sopp.sopp_loopback = B_TRUE;
326 
327 		(*connp->conn_upcalls->su_set_proto_props)(
328 		    connp->conn_upper_handle, &sopp);
329 	}
330 done:
331 	squeue_synch_exit(connp);
332 
333 	return ((error == 0) ? EINPROGRESS : error);
334 }
335 
336 /* ARGSUSED3 */
337 int
338 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
339     socklen_t *addrlenp, cred_t *cr)
340 {
341 	conn_t	*connp = (conn_t *)proto_handle;
342 	tcp_t	*tcp = connp->conn_tcp;
343 
344 	/* All Solaris components should pass a cred for this operation. */
345 	ASSERT(cr != NULL);
346 
347 	ASSERT(tcp != NULL);
348 	if (tcp->tcp_state < TCPS_SYN_RCVD)
349 		return (ENOTCONN);
350 
351 	return (conn_getpeername(connp, addr, addrlenp));
352 }
353 
354 /* ARGSUSED3 */
355 int
356 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
357     socklen_t *addrlenp, cred_t *cr)
358 {
359 	conn_t	*connp = (conn_t *)proto_handle;
360 
361 	/* All Solaris components should pass a cred for this operation. */
362 	ASSERT(cr != NULL);
363 
364 	return (conn_getsockname(connp, addr, addrlenp));
365 }
366 
367 /* returns UNIX error, the optlen is a value-result arg */
368 static int
369 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
370     void *optvalp, socklen_t *optlen, cred_t *cr)
371 {
372 	conn_t		*connp = (conn_t *)proto_handle;
373 	int		error;
374 	t_uscalar_t	max_optbuf_len;
375 	void		*optvalp_buf;
376 	int		len;
377 
378 	ASSERT(connp->conn_upper_handle != NULL);
379 
380 	error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
381 	    tcp_opt_obj.odb_opt_des_arr,
382 	    tcp_opt_obj.odb_opt_arr_cnt,
383 	    B_FALSE, B_TRUE, cr);
384 	if (error != 0) {
385 		if (error < 0) {
386 			error = proto_tlitosyserr(-error);
387 		}
388 		return (error);
389 	}
390 
391 	optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
392 
393 	error = squeue_synch_enter(connp, NULL);
394 	if (error == ENOMEM) {
395 		kmem_free(optvalp_buf, max_optbuf_len);
396 		return (ENOMEM);
397 	}
398 
399 	len = tcp_opt_get(connp, level, option_name, optvalp_buf);
400 	squeue_synch_exit(connp);
401 
402 	if (len == -1) {
403 		kmem_free(optvalp_buf, max_optbuf_len);
404 		return (EINVAL);
405 	}
406 
407 	/*
408 	 * update optlen and copy option value
409 	 */
410 	t_uscalar_t size = MIN(len, *optlen);
411 
412 	bcopy(optvalp_buf, optvalp, size);
413 	bcopy(&size, optlen, sizeof (size));
414 
415 	kmem_free(optvalp_buf, max_optbuf_len);
416 	return (0);
417 }
418 
419 static int
420 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
421     const void *optvalp, socklen_t optlen, cred_t *cr)
422 {
423 	conn_t		*connp = (conn_t *)proto_handle;
424 	int		error;
425 
426 	ASSERT(connp->conn_upper_handle != NULL);
427 	/*
428 	 * Entering the squeue synchronously can result in a context switch,
429 	 * which can cause a rather sever performance degradation. So we try to
430 	 * handle whatever options we can without entering the squeue.
431 	 */
432 	if (level == IPPROTO_TCP) {
433 		switch (option_name) {
434 		case TCP_NODELAY:
435 			if (optlen != sizeof (int32_t))
436 				return (EINVAL);
437 			mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
438 			connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
439 			    connp->conn_tcp->tcp_mss;
440 			mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
441 			return (0);
442 		default:
443 			break;
444 		}
445 	}
446 
447 	error = squeue_synch_enter(connp, NULL);
448 	if (error == ENOMEM) {
449 		return (ENOMEM);
450 	}
451 
452 	error = proto_opt_check(level, option_name, optlen, NULL,
453 	    tcp_opt_obj.odb_opt_des_arr,
454 	    tcp_opt_obj.odb_opt_arr_cnt,
455 	    B_TRUE, B_FALSE, cr);
456 
457 	if (error != 0) {
458 		if (error < 0) {
459 			error = proto_tlitosyserr(-error);
460 		}
461 		squeue_synch_exit(connp);
462 		return (error);
463 	}
464 
465 	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
466 	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
467 	    NULL, cr);
468 	squeue_synch_exit(connp);
469 
470 	ASSERT(error >= 0);
471 
472 	return (error);
473 }
474 
475 /* ARGSUSED */
476 static int
477 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
478     cred_t *cr)
479 {
480 	tcp_t		*tcp;
481 	uint32_t	msize;
482 	conn_t *connp = (conn_t *)proto_handle;
483 	int32_t		tcpstate;
484 
485 	/* All Solaris components should pass a cred for this operation. */
486 	ASSERT(cr != NULL);
487 
488 	ASSERT(connp->conn_ref >= 2);
489 	ASSERT(connp->conn_upper_handle != NULL);
490 
491 	if (msg->msg_controllen != 0) {
492 		freemsg(mp);
493 		return (EOPNOTSUPP);
494 	}
495 
496 	switch (DB_TYPE(mp)) {
497 	case M_DATA:
498 		tcp = connp->conn_tcp;
499 		ASSERT(tcp != NULL);
500 
501 		tcpstate = tcp->tcp_state;
502 		if (tcpstate < TCPS_ESTABLISHED) {
503 			freemsg(mp);
504 			/*
505 			 * We return ENOTCONN if the endpoint is trying to
506 			 * connect or has never been connected, and EPIPE if it
507 			 * has been disconnected. The connection id helps us
508 			 * distinguish between the last two cases.
509 			 */
510 			return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
511 			    ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
512 		} else if (tcpstate > TCPS_CLOSE_WAIT) {
513 			freemsg(mp);
514 			return (EPIPE);
515 		}
516 
517 		msize = msgdsize(mp);
518 
519 		mutex_enter(&tcp->tcp_non_sq_lock);
520 		tcp->tcp_squeue_bytes += msize;
521 		/*
522 		 * Squeue Flow Control
523 		 */
524 		if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
525 			tcp_setqfull(tcp);
526 		}
527 		mutex_exit(&tcp->tcp_non_sq_lock);
528 
529 		/*
530 		 * The application may pass in an address in the msghdr, but
531 		 * we ignore the address on connection-oriented sockets.
532 		 * Just like BSD this code does not generate an error for
533 		 * TCP (a CONNREQUIRED socket) when sending to an address
534 		 * passed in with sendto/sendmsg. Instead the data is
535 		 * delivered on the connection as if no address had been
536 		 * supplied.
537 		 */
538 		CONN_INC_REF(connp);
539 
540 		if (msg->msg_flags & MSG_OOB) {
541 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
542 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
543 		} else {
544 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
545 			    connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
546 		}
547 
548 		return (0);
549 
550 	default:
551 		ASSERT(0);
552 	}
553 
554 	freemsg(mp);
555 	return (0);
556 }
557 
558 /* ARGSUSED */
559 static int
560 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
561 {
562 	conn_t  *connp = (conn_t *)proto_handle;
563 	tcp_t   *tcp = connp->conn_tcp;
564 
565 	ASSERT(connp->conn_upper_handle != NULL);
566 
567 	/* All Solaris components should pass a cred for this operation. */
568 	ASSERT(cr != NULL);
569 
570 	/*
571 	 * X/Open requires that we check the connected state.
572 	 */
573 	if (tcp->tcp_state < TCPS_SYN_SENT)
574 		return (ENOTCONN);
575 
576 	/* shutdown the send side */
577 	if (how != SHUT_RD) {
578 		mblk_t *bp;
579 
580 		bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
581 		CONN_INC_REF(connp);
582 		SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
583 		    connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
584 
585 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
586 		    SOCK_OPCTL_SHUT_SEND, 0);
587 	}
588 
589 	/* shutdown the recv side */
590 	if (how != SHUT_WR)
591 		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
592 		    SOCK_OPCTL_SHUT_RECV, 0);
593 
594 	return (0);
595 }
596 
597 static void
598 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
599 {
600 	conn_t  *connp = (conn_t *)proto_handle;
601 	tcp_t	*tcp = connp->conn_tcp;
602 	mblk_t *mp;
603 	int error;
604 
605 	ASSERT(connp->conn_upper_handle != NULL);
606 
607 	/*
608 	 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
609 	 * is currently running.
610 	 */
611 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
612 	if ((mp = tcp->tcp_rsrv_mp) == NULL) {
613 		mutex_exit(&tcp->tcp_rsrv_mp_lock);
614 		return;
615 	}
616 	tcp->tcp_rsrv_mp = NULL;
617 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
618 
619 	error = squeue_synch_enter(connp, mp);
620 	ASSERT(error == 0);
621 
622 	mutex_enter(&tcp->tcp_rsrv_mp_lock);
623 	tcp->tcp_rsrv_mp = mp;
624 	mutex_exit(&tcp->tcp_rsrv_mp_lock);
625 
626 	if (tcp->tcp_fused) {
627 		tcp_fuse_backenable(tcp);
628 	} else {
629 		tcp->tcp_rwnd = connp->conn_rcvbuf;
630 		/*
631 		 * Send back a window update immediately if TCP is above
632 		 * ESTABLISHED state and the increase of the rcv window
633 		 * that the other side knows is at least 1 MSS after flow
634 		 * control is lifted.
635 		 */
636 		if (tcp->tcp_state >= TCPS_ESTABLISHED &&
637 		    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
638 			tcp_xmit_ctl(NULL, tcp,
639 			    (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
640 			    tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
641 		}
642 	}
643 
644 	squeue_synch_exit(connp);
645 }
646 
647 /* ARGSUSED */
648 static int
649 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
650     int mode, int32_t *rvalp, cred_t *cr)
651 {
652 	conn_t  	*connp = (conn_t *)proto_handle;
653 	int		error;
654 
655 	ASSERT(connp->conn_upper_handle != NULL);
656 
657 	/* All Solaris components should pass a cred for this operation. */
658 	ASSERT(cr != NULL);
659 
660 	/*
661 	 * If we don't have a helper stream then create one.
662 	 * ip_create_helper_stream takes care of locking the conn_t,
663 	 * so this check for NULL is just a performance optimization.
664 	 */
665 	if (connp->conn_helper_info == NULL) {
666 		tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
667 
668 		/*
669 		 * Create a helper stream for non-STREAMS socket.
670 		 */
671 		error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
672 		if (error != 0) {
673 			ip0dbg(("tcp_ioctl: create of IP helper stream "
674 			    "failed %d\n", error));
675 			return (error);
676 		}
677 	}
678 
679 	switch (cmd) {
680 		case ND_SET:
681 		case ND_GET:
682 		case _SIOCSOCKFALLBACK:
683 		case TCP_IOC_ABORT_CONN:
684 		case TI_GETPEERNAME:
685 		case TI_GETMYNAME:
686 			ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
687 			    cmd));
688 			error = EINVAL;
689 			break;
690 		default:
691 			/*
692 			 * If the conn is not closing, pass on to IP using
693 			 * helper stream. Bump the ioctlref to prevent tcp_close
694 			 * from closing the rq/wq out from underneath the ioctl
695 			 * if it ends up queued or aborted/interrupted.
696 			 */
697 			mutex_enter(&connp->conn_lock);
698 			if (connp->conn_state_flags & (CONN_CLOSING)) {
699 				mutex_exit(&connp->conn_lock);
700 				error = EINVAL;
701 				break;
702 			}
703 			CONN_INC_IOCTLREF_LOCKED(connp);
704 			error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
705 			    cmd, arg, mode, cr, rvalp);
706 			CONN_DEC_IOCTLREF(connp);
707 			break;
708 	}
709 	return (error);
710 }
711 
712 /* ARGSUSED */
713 static int
714 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
715 {
716 	conn_t *connp = (conn_t *)proto_handle;
717 
718 	ASSERT(connp->conn_upper_handle != NULL);
719 
720 	/* All Solaris components should pass a cred for this operation. */
721 	ASSERT(cr != NULL);
722 
723 	tcp_close_common(connp, flags);
724 
725 	ip_free_helper_stream(connp);
726 
727 	/*
728 	 * Drop IP's reference on the conn. This is the last reference
729 	 * on the connp if the state was less than established. If the
730 	 * connection has gone into timewait state, then we will have
731 	 * one ref for the TCP and one more ref (total of two) for the
732 	 * classifier connected hash list (a timewait connections stays
733 	 * in connected hash till closed).
734 	 *
735 	 * We can't assert the references because there might be other
736 	 * transient reference places because of some walkers or queued
737 	 * packets in squeue for the timewait state.
738 	 */
739 	CONN_DEC_REF(connp);
740 
741 	/*
742 	 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
743 	 * freeing the socket.
744 	 */
745 	return (EINPROGRESS);
746 }
747 
748 /* ARGSUSED */
749 sock_lower_handle_t
750 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
751     uint_t *smodep, int *errorp, int flags, cred_t *credp)
752 {
753 	conn_t		*connp;
754 	boolean_t	isv6 = family == AF_INET6;
755 	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
756 	    (proto != 0 && proto != IPPROTO_TCP)) {
757 		*errorp = EPROTONOSUPPORT;
758 		return (NULL);
759 	}
760 
761 	connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
762 	if (connp == NULL) {
763 		return (NULL);
764 	}
765 
766 	/*
767 	 * Put the ref for TCP. Ref for IP was already put
768 	 * by ipcl_conn_create. Also Make the conn_t globally
769 	 * visible to walkers
770 	 */
771 	mutex_enter(&connp->conn_lock);
772 	CONN_INC_REF_LOCKED(connp);
773 	ASSERT(connp->conn_ref == 2);
774 	connp->conn_state_flags &= ~CONN_INCIPIENT;
775 
776 	connp->conn_flags |= IPCL_NONSTR;
777 	mutex_exit(&connp->conn_lock);
778 
779 	ASSERT(errorp != NULL);
780 	*errorp = 0;
781 	*sock_downcalls = &sock_tcp_downcalls;
782 	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
783 	    SM_SENDFILESUPP;
784 
785 	return ((sock_lower_handle_t)connp);
786 }
787 
788 /*
789  * tcp_fallback
790  *
791  * A direct socket is falling back to using STREAMS. The queue
792  * that is being passed down was created using tcp_open() with
793  * the SO_FALLBACK flag set. As a result, the queue is not
794  * associated with a conn, and the q_ptrs instead contain the
795  * dev and minor area that should be used.
796  *
797  * The 'issocket' flag indicates whether the FireEngine
798  * optimizations should be used. The common case would be that
799  * optimizations are enabled, and they might be subsequently
800  * disabled using the _SIOCSOCKFALLBACK ioctl.
801  */
802 
803 /*
804  * An active connection is falling back to TPI. Gather all the information
805  * required by the STREAM head and TPI sonode and send it up.
806  */
807 static void
808 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
809     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
810     sock_quiesce_arg_t *arg)
811 {
812 	conn_t			*connp = tcp->tcp_connp;
813 	struct stroptions	*stropt;
814 	struct T_capability_ack tca;
815 	struct sockaddr_in6	laddr, faddr;
816 	socklen_t 		laddrlen, faddrlen;
817 	short			opts;
818 	int			error;
819 	mblk_t			*mp, *mpnext;
820 
821 	connp->conn_dev = (dev_t)RD(q)->q_ptr;
822 	connp->conn_minor_arena = WR(q)->q_ptr;
823 
824 	RD(q)->q_ptr = WR(q)->q_ptr = connp;
825 
826 	connp->conn_rq = RD(q);
827 	connp->conn_wq = WR(q);
828 
829 	WR(q)->q_qinfo = &tcp_sock_winit;
830 
831 	if (!issocket)
832 		tcp_use_pure_tpi(tcp);
833 
834 	/*
835 	 * free the helper stream
836 	 */
837 	ip_free_helper_stream(connp);
838 
839 	/*
840 	 * Notify the STREAM head about options
841 	 */
842 	DB_TYPE(stropt_mp) = M_SETOPTS;
843 	stropt = (struct stroptions *)stropt_mp->b_rptr;
844 	stropt_mp->b_wptr += sizeof (struct stroptions);
845 	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
846 
847 	stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
848 	    tcp->tcp_tcps->tcps_wroff_xtra);
849 	if (tcp->tcp_snd_sack_ok)
850 		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
851 	stropt->so_hiwat = connp->conn_rcvbuf;
852 	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
853 
854 	putnext(RD(q), stropt_mp);
855 
856 	/*
857 	 * Collect the information needed to sync with the sonode
858 	 */
859 	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
860 
861 	laddrlen = faddrlen = sizeof (sin6_t);
862 	(void) tcp_getsockname((sock_lower_handle_t)connp,
863 	    (struct sockaddr *)&laddr, &laddrlen, CRED());
864 	error = tcp_getpeername((sock_lower_handle_t)connp,
865 	    (struct sockaddr *)&faddr, &faddrlen, CRED());
866 	if (error != 0)
867 		faddrlen = 0;
868 
869 	opts = 0;
870 	if (connp->conn_oobinline)
871 		opts |= SO_OOBINLINE;
872 	if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
873 		opts |= SO_DONTROUTE;
874 
875 	/*
876 	 * Notify the socket that the protocol is now quiescent,
877 	 * and it's therefore safe move data from the socket
878 	 * to the stream head.
879 	 */
880 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
881 	    (struct sockaddr *)&laddr, laddrlen,
882 	    (struct sockaddr *)&faddr, faddrlen, opts);
883 
884 	while (mp != NULL) {
885 		mpnext = mp->b_next;
886 		tcp->tcp_rcv_list = mp->b_next;
887 		mp->b_next = NULL;
888 		putnext(q, mp);
889 		mp = mpnext;
890 	}
891 	ASSERT(tcp->tcp_rcv_last_head == NULL);
892 	ASSERT(tcp->tcp_rcv_last_tail == NULL);
893 	ASSERT(tcp->tcp_rcv_cnt == 0);
894 
895 	/*
896 	 * All eagers in q0 are marked as being non-STREAM, so they will
897 	 * make su_newconn upcalls when the handshake completes, which
898 	 * will fail (resulting in the conn being closed). So we just blow
899 	 * off everything in q0 instead of waiting for the inevitable.
900 	 */
901 	if (tcp->tcp_conn_req_cnt_q0 != 0)
902 		tcp_eager_cleanup(tcp, B_TRUE);
903 }
904 
905 /*
906  * An eager is falling back to TPI. All we have to do is send
907  * up a T_CONN_IND.
908  */
909 static void
910 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
911     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
912 {
913 	conn_t *connp = eager->tcp_connp;
914 	tcp_t *listener = eager->tcp_listener;
915 	mblk_t *mp;
916 
917 	ASSERT(listener != NULL);
918 
919 	/*
920 	 * Notify the socket that the protocol is now quiescent,
921 	 * and it's therefore safe move data from the socket
922 	 * to tcp's rcv queue.
923 	 */
924 	mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
925 	    NULL, 0, 0);
926 
927 	if (mp != NULL) {
928 		ASSERT(eager->tcp_rcv_cnt == 0);
929 
930 		eager->tcp_rcv_list = mp;
931 		eager->tcp_rcv_cnt = msgdsize(mp);
932 		while (mp->b_next != NULL) {
933 			mp = mp->b_next;
934 			eager->tcp_rcv_cnt += msgdsize(mp);
935 		}
936 		eager->tcp_rcv_last_head = mp;
937 		while (mp->b_cont)
938 			mp = mp->b_cont;
939 		eager->tcp_rcv_last_tail = mp;
940 		if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
941 			eager->tcp_rwnd = 0;
942 		else
943 			eager->tcp_rwnd -= eager->tcp_rcv_cnt;
944 	}
945 
946 	if (!issocket)
947 		eager->tcp_issocket = B_FALSE;
948 	/*
949 	 * The stream for this eager does not yet exist, so mark it as
950 	 * being detached.
951 	 */
952 	eager->tcp_detached = B_TRUE;
953 	eager->tcp_hard_binding = B_TRUE;
954 	connp->conn_rq = listener->tcp_connp->conn_rq;
955 	connp->conn_wq = listener->tcp_connp->conn_wq;
956 
957 	/* Send up the connection indication */
958 	mp = eager->tcp_conn.tcp_eager_conn_ind;
959 	ASSERT(mp != NULL);
960 	eager->tcp_conn.tcp_eager_conn_ind = NULL;
961 
962 	/*
963 	 * TLI/XTI applications will get confused by
964 	 * sending eager as an option since it violates
965 	 * the option semantics. So remove the eager as
966 	 * option since TLI/XTI app doesn't need it anyway.
967 	 */
968 	if (!issocket) {
969 		struct T_conn_ind *conn_ind;
970 
971 		conn_ind = (struct T_conn_ind *)mp->b_rptr;
972 		conn_ind->OPT_length = 0;
973 		conn_ind->OPT_offset = 0;
974 	}
975 
976 	/*
977 	 * Sockfs guarantees that the listener will not be closed
978 	 * during fallback. So we can safely use the listener's queue.
979 	 */
980 	putnext(listener->tcp_connp->conn_rq, mp);
981 }
982 
983 
984 int
985 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
986     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
987     sock_quiesce_arg_t *arg)
988 {
989 	tcp_t			*tcp;
990 	conn_t 			*connp = (conn_t *)proto_handle;
991 	int			error;
992 	mblk_t			*stropt_mp;
993 	mblk_t			*ordrel_mp;
994 
995 	tcp = connp->conn_tcp;
996 
997 	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
998 	    NULL);
999 
1000 	/* Pre-allocate the T_ordrel_ind mblk. */
1001 	ASSERT(tcp->tcp_ordrel_mp == NULL);
1002 	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1003 	    STR_NOSIG, NULL);
1004 	ordrel_mp->b_datap->db_type = M_PROTO;
1005 	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1006 	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1007 
1008 	/*
1009 	 * Enter the squeue so that no new packets can come in
1010 	 */
1011 	error = squeue_synch_enter(connp, NULL);
1012 	if (error != 0) {
1013 		/* failed to enter, free all the pre-allocated messages. */
1014 		freeb(stropt_mp);
1015 		freeb(ordrel_mp);
1016 		return (ENOMEM);
1017 	}
1018 
1019 	/*
1020 	 * Both endpoints must be of the same type (either STREAMS or
1021 	 * non-STREAMS) for fusion to be enabled. So if we are fused,
1022 	 * we have to unfuse.
1023 	 */
1024 	if (tcp->tcp_fused)
1025 		tcp_unfuse(tcp);
1026 
1027 	if (tcp->tcp_listener != NULL) {
1028 		/* The eager will deal with opts when accept() is called */
1029 		freeb(stropt_mp);
1030 		tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1031 	} else {
1032 		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1033 		    quiesced_cb, arg);
1034 	}
1035 
1036 	/*
1037 	 * No longer a direct socket
1038 	 *
1039 	 * Note that we intentionally leave the upper_handle and upcalls
1040 	 * intact, since eagers may still be using them.
1041 	 */
1042 	connp->conn_flags &= ~IPCL_NONSTR;
1043 	tcp->tcp_ordrel_mp = ordrel_mp;
1044 
1045 	/*
1046 	 * There should be atleast two ref's (IP + TCP)
1047 	 */
1048 	ASSERT(connp->conn_ref >= 2);
1049 	squeue_synch_exit(connp);
1050 
1051 	return (0);
1052 }
1053 
1054 /*
1055  * Notifies a non-STREAMS based listener about a new connection. This
1056  * function is executed on the *eager*'s squeue once the 3 way handshake
1057  * has completed. Note that the behavior differs from STREAMS, where the
1058  * T_CONN_IND is sent up by tcp_send_conn_ind while on the *listener*'s
1059  * squeue.
1060  *
1061  * Returns B_TRUE if the notification succeeded, in which case `tcp' will
1062  * be moved over to the ESTABLISHED list (q) of the listener. Othwerise,
1063  * B_FALSE is returned and `tcp' is killed.
1064  */
1065 boolean_t
1066 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1067 {
1068 	tcp_t *listener = tcp->tcp_listener;
1069 	conn_t *lconnp = listener->tcp_connp;
1070 	conn_t *econnp = tcp->tcp_connp;
1071 	tcp_t *tail;
1072 	ipaddr_t *addr_cache;
1073 	sock_upper_handle_t upper;
1074 	struct sock_proto_props sopp;
1075 	mblk_t *mp;
1076 
1077 	mutex_enter(&listener->tcp_eager_lock);
1078 	/*
1079 	 * Take the eager out, if it is in the list of droppable eagers
1080 	 * as we are here because the 3W handshake is over.
1081 	 */
1082 	MAKE_UNDROPPABLE(tcp);
1083 	/*
1084 	 * The eager already has an extra ref put in tcp_input_data
1085 	 * so that it stays till accept comes back even though it
1086 	 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1087 	 */
1088 	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1089 	listener->tcp_conn_req_cnt_q0--;
1090 	listener->tcp_conn_req_cnt_q++;
1091 
1092 	/* Move from SYN_RCVD to ESTABLISHED list  */
1093 	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1094 	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1095 	tcp->tcp_eager_prev_q0 = NULL;
1096 	tcp->tcp_eager_next_q0 = NULL;
1097 
1098 	/*
1099 	 * Insert at end of the queue because connections are accepted
1100 	 * in chronological order. Leaving the older connections at front
1101 	 * of the queue helps reducing search time.
1102 	 */
1103 	tail = listener->tcp_eager_last_q;
1104 	if (tail != NULL)
1105 		tail->tcp_eager_next_q = tcp;
1106 	else
1107 		listener->tcp_eager_next_q = tcp;
1108 	listener->tcp_eager_last_q = tcp;
1109 	tcp->tcp_eager_next_q = NULL;
1110 
1111 	/* we have timed out before */
1112 	if (tcp->tcp_syn_rcvd_timeout != 0) {
1113 		tcp->tcp_syn_rcvd_timeout = 0;
1114 		listener->tcp_syn_rcvd_timeout--;
1115 		if (listener->tcp_syn_defense &&
1116 		    listener->tcp_syn_rcvd_timeout <=
1117 		    (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1118 		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1119 		    listener->tcp_last_rcv_lbolt)) {
1120 			/*
1121 			 * Turn off the defense mode if we
1122 			 * believe the SYN attack is over.
1123 			 */
1124 			listener->tcp_syn_defense = B_FALSE;
1125 			if (listener->tcp_ip_addr_cache) {
1126 				kmem_free((void *)listener->tcp_ip_addr_cache,
1127 				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1128 				listener->tcp_ip_addr_cache = NULL;
1129 			}
1130 		}
1131 	}
1132 	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1133 	if (addr_cache != NULL) {
1134 		/*
1135 		 * We have finished a 3-way handshake with this
1136 		 * remote host. This proves the IP addr is good.
1137 		 * Cache it!
1138 		 */
1139 		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1140 		    tcp->tcp_connp->conn_faddr_v4;
1141 	}
1142 	mutex_exit(&listener->tcp_eager_lock);
1143 
1144 	/*
1145 	 * Notify the ULP about the newconn. It is guaranteed that no
1146 	 * tcp_accept() call will be made for the eager if the
1147 	 * notification fails.
1148 	 */
1149 	if ((upper = (*lconnp->conn_upcalls->su_newconn)
1150 	    (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1151 	    &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1152 	    &econnp->conn_upcalls)) == NULL) {
1153 		/*
1154 		 * Normally this should not happen, but the listener might
1155 		 * have done a fallback to TPI followed by a close(), in
1156 		 * which case tcp_closemp for this conn might have been
1157 		 * used by tcp_eager_cleanup().
1158 		 */
1159 		mutex_enter(&listener->tcp_eager_lock);
1160 		if (tcp->tcp_closemp_used) {
1161 			mutex_exit(&listener->tcp_eager_lock);
1162 			return (B_FALSE);
1163 		}
1164 		tcp->tcp_closemp_used = B_TRUE;
1165 		TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1166 		mp = &tcp->tcp_closemp;
1167 		mutex_exit(&listener->tcp_eager_lock);
1168 		tcp_eager_kill(econnp, mp, NULL, NULL);
1169 		return (B_FALSE);
1170 	}
1171 	econnp->conn_upper_handle = upper;
1172 
1173 	tcp->tcp_detached = B_FALSE;
1174 	tcp->tcp_hard_binding = B_FALSE;
1175 	tcp->tcp_tconnind_started = B_TRUE;
1176 
1177 	if (econnp->conn_keepalive) {
1178 		tcp->tcp_ka_last_intrvl = 0;
1179 		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1180 		    tcp->tcp_ka_interval);
1181 	}
1182 
1183 	/* Update the necessary parameters */
1184 	tcp_get_proto_props(tcp, &sopp);
1185 
1186 	(*econnp->conn_upcalls->su_set_proto_props)
1187 	    (econnp->conn_upper_handle, &sopp);
1188 
1189 	return (B_TRUE);
1190 }
1191