1 /*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * %sccs.include.redist.c%
6 *
7 * @(#)uipc_socket2.c 8.2 (Berkeley) 02/14/95
8 */
9
10 #include <sys/param.h>
11 #include <sys/systm.h>
12 #include <sys/proc.h>
13 #include <sys/file.h>
14 #include <sys/buf.h>
15 #include <sys/malloc.h>
16 #include <sys/mbuf.h>
17 #include <sys/protosw.h>
18 #include <sys/socket.h>
19 #include <sys/socketvar.h>
20
21 /*
22 * Primitive routines for operating on sockets and socket buffers
23 */
24
25 /* strings for sleep message: */
/*
 * netio is the wait message handed to tsleep() by sbwait() and sb_lock()
 * in this file; netcon and netcls are not referenced in this part of the
 * file (presumably used by other socket code -- confirm).
 */
26 char netio[] = "netio";
27 char netcon[] = "netcon";
28 char netcls[] = "netcls";
29
/*
 * Ceiling consulted by sbreserve(): it caps a sockbuf's sb_mbmax and,
 * scaled by MCLBYTES / (MSIZE + MCLBYTES), the largest byte reservation
 * that sbreserve() will accept.
 */
30 u_long sb_max = SB_MAX; /* patchable */
31
32 /*
33 * Procedures to manipulate state flags of socket
34 * and do appropriate wakeups. Normal sequence from the
35 * active (originating) side is that soisconnecting() is
36 * called during processing of connect() call,
37 * resulting in an eventual call to soisconnected() if/when the
38 * connection is established. When the connection is torn down
39 * soisdisconnecting() is called during processing of disconnect() call,
40 * and soisdisconnected() is called when the connection to the peer
41 * is totally severed. The semantics of these routines are such that
42 * connectionless protocols can call soisconnected() and soisdisconnected()
43 * only, bypassing the in-progress calls when setting up a ``connection''
44 * takes no time.
45 *
46 * From the passive side, a socket is created with
47 * two queues of sockets: so_q0 for connections in progress
48 * and so_q for connections already made and awaiting user acceptance.
49 * As a protocol is preparing incoming connections, it creates a socket
50 * structure queued on so_q0 by calling sonewconn(). When the connection
51 * is established, soisconnected() is called, and transfers the
52 * socket structure to so_q, making it available to accept().
53 *
54 * If a socket is closed with sockets on either
55 * so_q0 or so_q, these sockets are dropped.
56 *
57 * If higher level protocols are implemented in
58 * the kernel, the wakeups done here will sometimes
59 * cause software-interrupt process scheduling.
60 */
61
62 void
soisconnecting(so)63 soisconnecting(so)
64 register struct socket *so;
65 {
66
67 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
68 so->so_state |= SS_ISCONNECTING;
69 }
70
71 void
soisconnected(so)72 soisconnected(so)
73 register struct socket *so;
74 {
75 register struct socket *head = so->so_head;
76
77 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
78 so->so_state |= SS_ISCONNECTED;
79 if (head && soqremque(so, 0)) {
80 soqinsque(head, so, 1);
81 sorwakeup(head);
82 wakeup((caddr_t)&head->so_timeo);
83 } else {
84 wakeup((caddr_t)&so->so_timeo);
85 sorwakeup(so);
86 sowwakeup(so);
87 }
88 }
89
90 void
soisdisconnecting(so)91 soisdisconnecting(so)
92 register struct socket *so;
93 {
94
95 so->so_state &= ~SS_ISCONNECTING;
96 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
97 wakeup((caddr_t)&so->so_timeo);
98 sowwakeup(so);
99 sorwakeup(so);
100 }
101
102 void
soisdisconnected(so)103 soisdisconnected(so)
104 register struct socket *so;
105 {
106
107 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
108 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
109 wakeup((caddr_t)&so->so_timeo);
110 sowwakeup(so);
111 sorwakeup(so);
112 }
113
114 /*
115 * When an attempt at a new connection is noted on a socket
116 * which accepts connections, sonewconn is called. If the
117 * connection is possible (subject to space constraints, etc.)
118 * then we allocate a new structure, propoerly linked into the
119 * data structure of the original socket, and return this.
120 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
121 *
122 * Currently, sonewconn() is defined as sonewconn1() in socketvar.h
123 * to catch calls that are missing the (new) second parameter.
124 */
125 struct socket *
sonewconn1(head,connstatus)126 sonewconn1(head, connstatus)
127 register struct socket *head;
128 int connstatus;
129 {
130 register struct socket *so;
131 int soqueue = connstatus ? 1 : 0;
132
133 if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
134 return ((struct socket *)0);
135 MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT);
136 if (so == NULL)
137 return ((struct socket *)0);
138 bzero((caddr_t)so, sizeof(*so));
139 so->so_type = head->so_type;
140 so->so_options = head->so_options &~ SO_ACCEPTCONN;
141 so->so_linger = head->so_linger;
142 so->so_state = head->so_state | SS_NOFDREF;
143 so->so_proto = head->so_proto;
144 so->so_timeo = head->so_timeo;
145 so->so_pgid = head->so_pgid;
146 (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
147 soqinsque(head, so, soqueue);
148 if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
149 (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)) {
150 (void) soqremque(so, soqueue);
151 (void) free((caddr_t)so, M_SOCKET);
152 return ((struct socket *)0);
153 }
154 if (connstatus) {
155 sorwakeup(head);
156 wakeup((caddr_t)&head->so_timeo);
157 so->so_state |= connstatus;
158 }
159 return (so);
160 }
161
162 void
soqinsque(head,so,q)163 soqinsque(head, so, q)
164 register struct socket *head, *so;
165 int q;
166 {
167
168 register struct socket **prev;
169 so->so_head = head;
170 if (q == 0) {
171 head->so_q0len++;
172 so->so_q0 = 0;
173 for (prev = &(head->so_q0); *prev; )
174 prev = &((*prev)->so_q0);
175 } else {
176 head->so_qlen++;
177 so->so_q = 0;
178 for (prev = &(head->so_q); *prev; )
179 prev = &((*prev)->so_q);
180 }
181 *prev = so;
182 }
183
184 int
soqremque(so,q)185 soqremque(so, q)
186 register struct socket *so;
187 int q;
188 {
189 register struct socket *head, *prev, *next;
190
191 head = so->so_head;
192 prev = head;
193 for (;;) {
194 next = q ? prev->so_q : prev->so_q0;
195 if (next == so)
196 break;
197 if (next == 0)
198 return (0);
199 prev = next;
200 }
201 if (q == 0) {
202 prev->so_q0 = next->so_q0;
203 head->so_q0len--;
204 } else {
205 prev->so_q = next->so_q;
206 head->so_qlen--;
207 }
208 next->so_q0 = next->so_q = 0;
209 next->so_head = 0;
210 return (1);
211 }
212
213 /*
214 * Socantsendmore indicates that no more data will be sent on the
215 * socket; it would normally be applied to a socket when the user
216 * informs the system that no more data is to be sent, by the protocol
217 * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
218 * will be received, and will normally be applied to the socket by a
219 * protocol when it detects that the peer will send no more data.
220 * Data queued for reading in the socket may yet be read.
221 */
222
223 void
socantsendmore(so)224 socantsendmore(so)
225 struct socket *so;
226 {
227
228 so->so_state |= SS_CANTSENDMORE;
229 sowwakeup(so);
230 }
231
232 void
socantrcvmore(so)233 socantrcvmore(so)
234 struct socket *so;
235 {
236
237 so->so_state |= SS_CANTRCVMORE;
238 sorwakeup(so);
239 }
240
241 /*
242 * Wait for data to arrive at/drain from a socket buffer.
243 */
244 int
sbwait(sb)245 sbwait(sb)
246 struct sockbuf *sb;
247 {
248
249 sb->sb_flags |= SB_WAIT;
250 return (tsleep((caddr_t)&sb->sb_cc,
251 (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
252 sb->sb_timeo));
253 }
254
255 /*
256 * Lock a sockbuf already known to be locked;
257 * return any error returned from sleep (EINTR).
258 */
259 int
sb_lock(sb)260 sb_lock(sb)
261 register struct sockbuf *sb;
262 {
263 int error;
264
265 while (sb->sb_flags & SB_LOCK) {
266 sb->sb_flags |= SB_WANT;
267 if (error = tsleep((caddr_t)&sb->sb_flags,
268 (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
269 netio, 0))
270 return (error);
271 }
272 sb->sb_flags |= SB_LOCK;
273 return (0);
274 }
275
276 /*
277 * Wakeup processes waiting on a socket buffer.
278 * Do asynchronous notification via SIGIO
279 * if the socket has the SS_ASYNC flag set.
280 */
281 void
sowakeup(so,sb)282 sowakeup(so, sb)
283 register struct socket *so;
284 register struct sockbuf *sb;
285 {
286 struct proc *p;
287
288 selwakeup(&sb->sb_sel);
289 sb->sb_flags &= ~SB_SEL;
290 if (sb->sb_flags & SB_WAIT) {
291 sb->sb_flags &= ~SB_WAIT;
292 wakeup((caddr_t)&sb->sb_cc);
293 }
294 if (so->so_state & SS_ASYNC) {
295 if (so->so_pgid < 0)
296 gsignal(-so->so_pgid, SIGIO);
297 else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
298 psignal(p, SIGIO);
299 }
300 }
301
302 /*
303 * Socket buffer (struct sockbuf) utility routines.
304 *
305 * Each socket contains two socket buffers: one for sending data and
306 * one for receiving data. Each buffer contains a queue of mbufs,
307 * information about the number of mbufs and amount of data in the
308 * queue, and other fields allowing select() statements and notification
309 * on data availability to be implemented.
310 *
311 * Data stored in a socket buffer is maintained as a list of records.
312 * Each record is a list of mbufs chained together with the m_next
313 * field. Records are chained together with the m_nextpkt field. The upper
314 * level routine soreceive() expects the following conventions to be
315 * observed when placing information in the receive buffer:
316 *
317 * 1. If the protocol requires each message be preceded by the sender's
318 * name, then a record containing that name must be present before
319 * any associated data (mbuf's must be of type MT_SONAME).
320 * 2. If the protocol supports the exchange of ``access rights'' (really
321 * just additional data associated with the message), and there are
322 * ``rights'' to be received, then a record containing this data
323 * should be present (mbuf's must be of type MT_RIGHTS).
324 * 3. If a name or rights record exists, then it must be followed by
325 * a data record, perhaps of zero length.
326 *
327 * Before using a new socket structure it is first necessary to reserve
328 * buffer space to the socket, by calling sbreserve(). This should commit
329 * some of the available buffer space in the system buffer pool for the
330 * socket (currently, it does nothing but enforce limits). The space
331 * should be released by calling sbrelease() when the socket is destroyed.
332 */
333
334 int
soreserve(so,sndcc,rcvcc)335 soreserve(so, sndcc, rcvcc)
336 register struct socket *so;
337 u_long sndcc, rcvcc;
338 {
339
340 if (sbreserve(&so->so_snd, sndcc) == 0)
341 goto bad;
342 if (sbreserve(&so->so_rcv, rcvcc) == 0)
343 goto bad2;
344 if (so->so_rcv.sb_lowat == 0)
345 so->so_rcv.sb_lowat = 1;
346 if (so->so_snd.sb_lowat == 0)
347 so->so_snd.sb_lowat = MCLBYTES;
348 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
349 so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
350 return (0);
351 bad2:
352 sbrelease(&so->so_snd);
353 bad:
354 return (ENOBUFS);
355 }
356
357 /*
358 * Allot mbufs to a sockbuf.
359 * Attempt to scale mbmax so that mbcnt doesn't become limiting
360 * if buffering efficiency is near the normal case.
361 */
362 int
sbreserve(sb,cc)363 sbreserve(sb, cc)
364 struct sockbuf *sb;
365 u_long cc;
366 {
367
368 if (cc > sb_max * MCLBYTES / (MSIZE + MCLBYTES))
369 return (0);
370 sb->sb_hiwat = cc;
371 sb->sb_mbmax = min(cc * 2, sb_max);
372 if (sb->sb_lowat > sb->sb_hiwat)
373 sb->sb_lowat = sb->sb_hiwat;
374 return (1);
375 }
376
377 /*
378 * Free mbufs held by a socket, and reserved mbuf space.
379 */
380 void
sbrelease(sb)381 sbrelease(sb)
382 struct sockbuf *sb;
383 {
384
385 sbflush(sb);
386 sb->sb_hiwat = sb->sb_mbmax = 0;
387 }
388
389 /*
390 * Routines to add and remove
391 * data from an mbuf queue.
392 *
393 * The routines sbappend() or sbappendrecord() are normally called to
394 * append new mbufs to a socket buffer, after checking that adequate
395 * space is available, comparing the function sbspace() with the amount
396 * of data to be added. sbappendrecord() differs from sbappend() in
397 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendcontrol()
 * should be used.  In either case, the new data begins a new record.
402 * Note that unlike sbappend() and sbappendrecord(), these routines check
403 * for the caller that there will be enough space to store the data.
404 * Each fails if there is not enough space, or if it cannot find mbufs
405 * to store additional information in.
406 *
407 * Reliable protocols may use the socket send buffer to hold data
408 * awaiting acknowledgement. Data is normally copied from a socket
409 * send buffer in a protocol with m_copy for output to a peer,
410 * and then removing the data from the socket buffer with sbdrop()
411 * or sbdroprecord() when the data is acknowledged by the peer.
412 */
413
414 /*
415 * Append mbuf chain m to the last record in the
416 * socket buffer sb. The additional space associated
417 * the mbuf chain is recorded in sb. Empty mbufs are
418 * discarded and mbufs are compacted where possible.
419 */
420 void
sbappend(sb,m)421 sbappend(sb, m)
422 struct sockbuf *sb;
423 struct mbuf *m;
424 {
425 register struct mbuf *n;
426
427 if (m == 0)
428 return;
429 if (n = sb->sb_mb) {
430 while (n->m_nextpkt)
431 n = n->m_nextpkt;
432 do {
433 if (n->m_flags & M_EOR) {
434 sbappendrecord(sb, m); /* XXXXXX!!!! */
435 return;
436 }
437 } while (n->m_next && (n = n->m_next));
438 }
439 sbcompress(sb, m, n);
440 }
441
#ifdef SOCKBUF_DEBUG
/*
 * Debugging aid: recompute the byte count and mbuf space of the chain
 * reached from sb_mb through m_next and panic if they disagree with
 * sb_cc and sb_mbcnt.  Also panics if any mbuf on that chain has an
 * m_nextpkt link.
 */
void
sbcheck(sb)
	struct sockbuf *sb;
{
	struct mbuf *m;
	int len, mbcnt;

	len = 0;
	mbcnt = 0;
	m = sb->sb_mb;
	while (m != 0) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT)
			mbcnt += m->m_ext.ext_size;
		if (m->m_nextpkt)
			panic("sbcheck nextpkt");
		m = m->m_next;
	}
	if (len == sb->sb_cc && mbcnt == sb->sb_mbcnt)
		return;
	printf("cc %d != %d || mbcnt %d != %d\n", len, sb->sb_cc,
	    mbcnt, sb->sb_mbcnt);
	panic("sbcheck");
}
#endif
465
466 /*
467 * As above, except the mbuf chain
468 * begins a new record.
469 */
470 void
sbappendrecord(sb,m0)471 sbappendrecord(sb, m0)
472 register struct sockbuf *sb;
473 register struct mbuf *m0;
474 {
475 register struct mbuf *m;
476
477 if (m0 == 0)
478 return;
479 if (m = sb->sb_mb)
480 while (m->m_nextpkt)
481 m = m->m_nextpkt;
482 /*
483 * Put the first mbuf on the queue.
484 * Note this permits zero length records.
485 */
486 sballoc(sb, m0);
487 if (m)
488 m->m_nextpkt = m0;
489 else
490 sb->sb_mb = m0;
491 m = m0->m_next;
492 m0->m_next = 0;
493 if (m && (m0->m_flags & M_EOR)) {
494 m0->m_flags &= ~M_EOR;
495 m->m_flags |= M_EOR;
496 }
497 sbcompress(sb, m, m0);
498 }
499
500 /*
501 * As above except that OOB data
502 * is inserted at the beginning of the sockbuf,
503 * but after any other OOB data.
504 */
505 void
sbinsertoob(sb,m0)506 sbinsertoob(sb, m0)
507 register struct sockbuf *sb;
508 register struct mbuf *m0;
509 {
510 register struct mbuf *m;
511 register struct mbuf **mp;
512
513 if (m0 == 0)
514 return;
515 for (mp = &sb->sb_mb; m = *mp; mp = &((*mp)->m_nextpkt)) {
516 again:
517 switch (m->m_type) {
518
519 case MT_OOBDATA:
520 continue; /* WANT next train */
521
522 case MT_CONTROL:
523 if (m = m->m_next)
524 goto again; /* inspect THIS train further */
525 }
526 break;
527 }
528 /*
529 * Put the first mbuf on the queue.
530 * Note this permits zero length records.
531 */
532 sballoc(sb, m0);
533 m0->m_nextpkt = *mp;
534 *mp = m0;
535 m = m0->m_next;
536 m0->m_next = 0;
537 if (m && (m0->m_flags & M_EOR)) {
538 m0->m_flags &= ~M_EOR;
539 m->m_flags |= M_EOR;
540 }
541 sbcompress(sb, m, m0);
542 }
543
544 /*
545 * Append address and data, and optionally, control (ancillary) data
546 * to the receive queue of a socket. If present,
547 * m0 must include a packet header with total length.
548 * Returns 0 if no space in sockbuf or insufficient mbufs.
549 */
550 int
sbappendaddr(sb,asa,m0,control)551 sbappendaddr(sb, asa, m0, control)
552 register struct sockbuf *sb;
553 struct sockaddr *asa;
554 struct mbuf *m0, *control;
555 {
556 register struct mbuf *m, *n;
557 int space = asa->sa_len;
558
559 if (m0 && (m0->m_flags & M_PKTHDR) == 0)
560 panic("sbappendaddr");
561 if (m0)
562 space += m0->m_pkthdr.len;
563 for (n = control; n; n = n->m_next) {
564 space += n->m_len;
565 if (n->m_next == 0) /* keep pointer to last control buf */
566 break;
567 }
568 if (space > sbspace(sb))
569 return (0);
570 if (asa->sa_len > MLEN)
571 return (0);
572 MGET(m, M_DONTWAIT, MT_SONAME);
573 if (m == 0)
574 return (0);
575 m->m_len = asa->sa_len;
576 bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
577 if (n)
578 n->m_next = m0; /* concatenate data to control */
579 else
580 control = m0;
581 m->m_next = control;
582 for (n = m; n; n = n->m_next)
583 sballoc(sb, n);
584 if (n = sb->sb_mb) {
585 while (n->m_nextpkt)
586 n = n->m_nextpkt;
587 n->m_nextpkt = m;
588 } else
589 sb->sb_mb = m;
590 return (1);
591 }
592
593 int
sbappendcontrol(sb,m0,control)594 sbappendcontrol(sb, m0, control)
595 struct sockbuf *sb;
596 struct mbuf *m0, *control;
597 {
598 register struct mbuf *m, *n;
599 int space = 0;
600
601 if (control == 0)
602 panic("sbappendcontrol");
603 for (m = control; ; m = m->m_next) {
604 space += m->m_len;
605 if (m->m_next == 0)
606 break;
607 }
608 n = m; /* save pointer to last control buffer */
609 for (m = m0; m; m = m->m_next)
610 space += m->m_len;
611 if (space > sbspace(sb))
612 return (0);
613 n->m_next = m0; /* concatenate data to control */
614 for (m = control; m; m = m->m_next)
615 sballoc(sb, m);
616 if (n = sb->sb_mb) {
617 while (n->m_nextpkt)
618 n = n->m_nextpkt;
619 n->m_nextpkt = control;
620 } else
621 sb->sb_mb = control;
622 return (1);
623 }
624
625 /*
626 * Compress mbuf chain m into the socket
627 * buffer sb following mbuf n. If n
628 * is null, the buffer is presumed empty.
629 */
630 void
sbcompress(sb,m,n)631 sbcompress(sb, m, n)
632 register struct sockbuf *sb;
633 register struct mbuf *m, *n;
634 {
635 register int eor = 0;
636 register struct mbuf *o;
637
638 while (m) {
639 eor |= m->m_flags & M_EOR;
640 if (m->m_len == 0 &&
641 (eor == 0 ||
642 (((o = m->m_next) || (o = n)) &&
643 o->m_type == m->m_type))) {
644 m = m_free(m);
645 continue;
646 }
647 if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 &&
648 (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] &&
649 n->m_type == m->m_type) {
650 bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
651 (unsigned)m->m_len);
652 n->m_len += m->m_len;
653 sb->sb_cc += m->m_len;
654 m = m_free(m);
655 continue;
656 }
657 if (n)
658 n->m_next = m;
659 else
660 sb->sb_mb = m;
661 sballoc(sb, m);
662 n = m;
663 m->m_flags &= ~M_EOR;
664 m = m->m_next;
665 n->m_next = 0;
666 }
667 if (eor) {
668 if (n)
669 n->m_flags |= eor;
670 else
671 printf("semi-panic: sbcompress\n");
672 }
673 }
674
675 /*
676 * Free all mbufs in a sockbuf.
677 * Check that all resources are reclaimed.
678 */
679 void
sbflush(sb)680 sbflush(sb)
681 register struct sockbuf *sb;
682 {
683
684 if (sb->sb_flags & SB_LOCK)
685 panic("sbflush");
686 while (sb->sb_mbcnt)
687 sbdrop(sb, (int)sb->sb_cc);
688 if (sb->sb_cc || sb->sb_mb)
689 panic("sbflush 2");
690 }
691
692 /*
693 * Drop data from (the front of) a sockbuf.
694 */
695 void
sbdrop(sb,len)696 sbdrop(sb, len)
697 register struct sockbuf *sb;
698 register int len;
699 {
700 register struct mbuf *m, *mn;
701 struct mbuf *next;
702
703 next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
704 while (len > 0) {
705 if (m == 0) {
706 if (next == 0)
707 panic("sbdrop");
708 m = next;
709 next = m->m_nextpkt;
710 continue;
711 }
712 if (m->m_len > len) {
713 m->m_len -= len;
714 m->m_data += len;
715 sb->sb_cc -= len;
716 break;
717 }
718 len -= m->m_len;
719 sbfree(sb, m);
720 MFREE(m, mn);
721 m = mn;
722 }
723 while (m && m->m_len == 0) {
724 sbfree(sb, m);
725 MFREE(m, mn);
726 m = mn;
727 }
728 if (m) {
729 sb->sb_mb = m;
730 m->m_nextpkt = next;
731 } else
732 sb->sb_mb = next;
733 }
734
735 /*
736 * Drop a record off the front of a sockbuf
737 * and move the next record to the front.
738 */
739 void
sbdroprecord(sb)740 sbdroprecord(sb)
741 register struct sockbuf *sb;
742 {
743 register struct mbuf *m, *mn;
744
745 m = sb->sb_mb;
746 if (m) {
747 sb->sb_mb = m->m_nextpkt;
748 do {
749 sbfree(sb, m);
750 MFREE(m, mn);
751 } while (m = mn);
752 }
753 }
754