xref: /dragonfly/sys/kern/uipc_sockbuf.c (revision 8e1c6f81)
1 /*
2  * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 1982, 1986, 1988, 1990, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. All advertising materials mentioning features or use of this software
15  *    must display the following acknowledgement:
16  *	This product includes software developed by the University of
17  *	California, Berkeley and its contributors.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * @(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
35  * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
36  * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.3 2007/08/09 01:10:04 dillon Exp $
37  */
38 
39 #include "opt_param.h"
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/domain.h>
43 #include <sys/file.h>	/* for maxfiles */
44 #include <sys/kernel.h>
45 #include <sys/proc.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/protosw.h>
49 #include <sys/resourcevar.h>
50 #include <sys/stat.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 
54 #include <sys/thread2.h>
55 #include <sys/msgport2.h>
56 
57 /*
58  * Routines to add and remove data from an mbuf queue.
59  *
60  * The routines sbappend() or sbappendrecord() are normally called to
61  * append new mbufs to a socket buffer.  sbappendrecord() differs from
62  * sbappend() in that data supplied is treated as the beginning of a new
63  * record.  sbappend() only begins a new record if the last mbuf in the
64  * sockbuf is marked M_EOR.
65  *
66  * To place a sender's address, optional access rights, and data in a
67  * socket receive buffer, sbappendaddr() or sbappendcontrol() should be
68  * used.   These functions also begin a new record.
69  *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer, and
 * is then removed from the socket buffer with sbdrop() or
 * sbdroprecord() when the data is acknowledged by the peer.
75  */
76 
77 /*
78  * Append mbuf chain m to the last record in the socket buffer sb.
79  * The additional space associated the mbuf chain is recorded in sb.
80  * Empty mbufs are discarded and mbufs are compacted where possible.
81  *
82  * If M_EOR is set in the first or last mbuf of the last record, the
83  * mbuf chain is appended as a new record.  M_EOR is usually just set
84  * in the last mbuf of the last record's mbuf chain (see sbcompress()),
85  * but this may be changed in the future since there is no real need
86  * to propogate the flag any more.
87  */
88 void
89 sbappend(struct sockbuf *sb, struct mbuf *m)
90 {
91 	struct mbuf *n;
92 
93 	mbuftrackid(m, 16);
94 
95 	if (m) {
96 		n = sb->sb_lastrecord;
97 		if (n) {
98 			if (n->m_flags & M_EOR) {
99 				sbappendrecord(sb, m);
100 				return;
101 			}
102 		}
103 		n = sb->sb_lastmbuf;
104 		if (n) {
105 			if (n->m_flags & M_EOR) {
106 				sbappendrecord(sb, m);
107 				return;
108 			}
109 		}
110 		sbcompress(sb, m, n);
111 	}
112 }
113 
114 /*
115  * sbappendstream() is an optimized form of sbappend() for protocols
116  * such as TCP that only have one record in the socket buffer, are
117  * not PR_ATOMIC, nor allow MT_CONTROL data.  A protocol that uses
118  * sbappendstream() must use sbappendstream() exclusively.
119  */
120 void
121 sbappendstream(struct sockbuf *sb, struct mbuf *m)
122 {
123 	mbuftrackid(m, 17);
124 	KKASSERT(m->m_nextpkt == NULL);
125 	sbcompress(sb, m, sb->sb_lastmbuf);
126 }
127 
128 #ifdef SOCKBUF_DEBUG
129 
130 void
131 _sbcheck(struct sockbuf *sb)
132 {
133 	struct mbuf *m;
134 	struct mbuf *n = NULL;
135 	u_long len = 0, mbcnt = 0;
136 
137 	for (m = sb->sb_mb; m; m = n) {
138 	    n = m->m_nextpkt;
139 	    if (n == NULL && sb->sb_lastrecord != m) {
140 		    kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m);
141 		    panic("sbcheck1");
142 
143 	    }
144 	    for (; m; m = m->m_next) {
145 		len += m->m_len;
146 		mbcnt += MSIZE;
147 		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
148 			mbcnt += m->m_ext.ext_size;
149 		if (n == NULL && m->m_next == NULL) {
150 			if (sb->sb_lastmbuf != m) {
151 				kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m);
152 				panic("sbcheck2");
153 			}
154 		}
155 	    }
156 	}
157 	if (sb->sb_mb == NULL) {
158 	    if (sb->sb_lastrecord != NULL) {
159 		kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
160 			sb, sb->sb_lastrecord);
161 		panic("sbcheck3");
162 	    }
163 	    if (sb->sb_lastmbuf != NULL) {
164 		kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
165 			sb, sb->sb_lastmbuf);
166 		panic("sbcheck4");
167 	    }
168 	}
169 	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
170 		kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
171 		    sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt);
172 		panic("sbcheck5");
173 	}
174 }
175 
176 #endif
177 
178 /*
179  * Same as sbappend(), except the mbuf chain begins a new record.
180  */
181 void
182 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
183 {
184 	struct mbuf *firstmbuf;
185 	struct mbuf *secondmbuf;
186 
187 	if (m0 == NULL)
188 		return;
189 	mbuftrackid(m0, 18);
190 
191 	sbcheck(sb);
192 
193 	/*
194 	 * Break the first mbuf off from the rest of the mbuf chain.
195 	 */
196 	firstmbuf = m0;
197 	secondmbuf = m0->m_next;
198 	m0->m_next = NULL;
199 
200 	/*
201 	 * Insert the first mbuf of the m0 mbuf chain as the last record of
202 	 * the sockbuf.  Note this permits zero length records!  Keep the
203 	 * sockbuf state consistent.
204 	 */
205 	if (sb->sb_mb == NULL)
206 		sb->sb_mb = firstmbuf;
207 	else
208 		sb->sb_lastrecord->m_nextpkt = firstmbuf;
209 	sb->sb_lastrecord = firstmbuf;	/* update hint for new last record */
210 	sb->sb_lastmbuf = firstmbuf;	/* update hint for new last mbuf */
211 
212 	/*
213 	 * propagate the EOR flag so sbcompress() can pick it up
214 	 */
215 	if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) {
216 		firstmbuf->m_flags &= ~M_EOR;
217 		secondmbuf->m_flags |= M_EOR;
218 	}
219 
220 	/*
221 	 * The succeeding call to sbcompress() omits accounting for
222 	 * the first mbuf, so do it here.
223 	 */
224 	sballoc(sb, firstmbuf);
225 
226 	/* Compact the rest of the mbuf chain in after the first mbuf. */
227 	sbcompress(sb, secondmbuf, firstmbuf);
228 }
229 
230 /*
231  * Append address and data, and optionally, control (ancillary) data
232  * to the receive queue of a socket.  If present,
233  * m0 must include a packet header with total length.
234  * Returns 0 if insufficient mbufs.
235  */
236 int
237 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
238 	     struct mbuf *control)
239 {
240 	struct mbuf *m, *n;
241 	int eor;
242 
243 	mbuftrackid(m0, 19);
244 	mbuftrackid(control, 20);
245 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
246 		panic("sbappendaddr");
247 	sbcheck(sb);
248 
249 	for (n = control; n; n = n->m_next) {
250 		if (n->m_next == NULL)	/* keep pointer to last control buf */
251 			break;
252 	}
253 	if (asa->sa_len > MLEN)
254 		return (0);
255 	MGET(m, MB_DONTWAIT, MT_SONAME);
256 	if (m == NULL)
257 		return (0);
258 	KKASSERT(m->m_nextpkt == NULL);
259 	m->m_len = asa->sa_len;
260 	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
261 	if (n)
262 		n->m_next = m0;		/* concatenate data to control */
263 	else
264 		control = m0;
265 	m->m_next = control;
266 	for (n = m; n; n = n->m_next)
267 		sballoc(sb, n);
268 
269 	if (sb->sb_mb == NULL)
270 		sb->sb_mb = m;
271 	else
272 		sb->sb_lastrecord->m_nextpkt = m;
273 	sb->sb_lastrecord = m;
274 
275 	/*
276 	 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
277 	 * so sbappend() can find it.
278 	 */
279 	eor = m->m_flags;
280 	while (m->m_next) {
281 		m->m_flags &= ~M_EOR;
282 		m = m->m_next;
283 		eor |= m->m_flags;
284 	}
285 	m->m_flags |= eor & M_EOR;
286 	sb->sb_lastmbuf = m;
287 
288 	return (1);
289 }
290 
291 /*
292  * Append control information followed by data. Both the control and data
293  * must be non-null.
294  */
295 int
296 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
297 {
298 	struct mbuf *n;
299 	u_int length, cmbcnt, m0mbcnt;
300 	int eor;
301 
302 	KASSERT(control != NULL, ("sbappendcontrol"));
303 	KKASSERT(control->m_nextpkt == NULL);
304 	sbcheck(sb);
305 
306 	mbuftrackid(m0, 21);
307 	mbuftrackid(control, 22);
308 
309 	length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt);
310 
311 	KKASSERT(m0 != NULL);
312 
313 	n->m_next = m0;			/* concatenate data to control */
314 
315 	if (sb->sb_mb == NULL)
316 		sb->sb_mb = control;
317 	else
318 		sb->sb_lastrecord->m_nextpkt = control;
319 	sb->sb_lastrecord = control;
320 
321 	/*
322 	 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
323 	 * so sbappend() can find it.
324 	 */
325 	eor = m0->m_flags;
326 	while (m0->m_next) {
327 		m0->m_flags &= ~M_EOR;
328 		m0 = m0->m_next;
329 		eor |= m0->m_flags;
330 	}
331 	m0->m_flags |= eor & M_EOR;
332 	sb->sb_lastmbuf = m0;
333 
334 	sb->sb_cc += length;
335 	sb->sb_mbcnt += cmbcnt + m0mbcnt;
336 
337 	return (1);
338 }
339 
340 /*
341  * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
342  * If tailm is null, the buffer is presumed empty.  Also, as a side-effect,
343  * increment the sockbuf counts for each mbuf in the chain.
344  */
345 void
346 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm)
347 {
348 	int eor = 0;
349 	struct mbuf *free_chain = NULL;
350 
351 	mbuftrackid(m, 23);
352 
353 	sbcheck(sb);
354 	while (m) {
355 		struct mbuf *o;
356 
357 		eor |= m->m_flags & M_EOR;
358 		/*
359 		 * Disregard empty mbufs as long as we don't encounter
360 		 * an end-of-record or there is a trailing mbuf of
361 		 * the same type to propagate the EOR flag to.
362 		 *
363 		 * Defer the m_free() call because it can block and break
364 		 * the atomicy of the sockbuf.
365 		 */
366 		if (m->m_len == 0 &&
367 		    (eor == 0 ||
368 		     (((o = m->m_next) || (o = tailm)) &&
369 		      o->m_type == m->m_type))) {
370 			o = m->m_next;
371 			m->m_next = free_chain;
372 			free_chain = m;
373 			m = o;
374 			continue;
375 		}
376 
377 		/* See if we can coalesce with preceding mbuf. */
378 		if (tailm && !(tailm->m_flags & M_EOR) && M_WRITABLE(tailm) &&
379 		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
380 		    m->m_len <= M_TRAILINGSPACE(tailm) &&
381 		    tailm->m_type == m->m_type) {
382 			bcopy(mtod(m, caddr_t),
383 			      mtod(tailm, caddr_t) + tailm->m_len,
384 			      (unsigned)m->m_len);
385 			tailm->m_len += m->m_len;
386 			sb->sb_cc += m->m_len;		/* update sb counter */
387 			o = m->m_next;
388 			m->m_next = free_chain;
389 			free_chain = m;
390 			m = o;
391 			continue;
392 		}
393 
394 		/* Insert whole mbuf. */
395 		if (tailm == NULL) {
396 			KASSERT(sb->sb_mb == NULL,
397 				("sbcompress: sb_mb not NULL"));
398 			sb->sb_mb = m;		/* only mbuf in sockbuf */
399 			sb->sb_lastrecord = m;	/* new last record */
400 		} else {
401 			tailm->m_next = m;	/* tack m on following tailm */
402 		}
403 		sb->sb_lastmbuf = m;	/* update last mbuf hint */
404 
405 		tailm = m;	/* just inserted mbuf becomes the new tail */
406 		m = m->m_next;		/* advance to next mbuf */
407 		tailm->m_next = NULL;	/* split inserted mbuf off from chain */
408 
409 		/* update sb counters for just added mbuf */
410 		sballoc(sb, tailm);
411 
412 		/* clear EOR on intermediate mbufs */
413 		tailm->m_flags &= ~M_EOR;
414 	}
415 
416 	/*
417 	 * Propogate EOR to the last mbuf
418 	 */
419 	if (eor) {
420 		if (tailm)
421 			tailm->m_flags |= eor;
422 		else
423 			kprintf("semi-panic: sbcompress");
424 	}
425 
426 	/*
427 	 * Clean up any defered frees.
428 	 */
429 	while (free_chain)
430 		free_chain = m_free(free_chain);
431 
432 	sbcheck(sb);
433 }
434 
435 /*
436  * Free all mbufs in a sockbuf.
437  * Check that all resources are reclaimed.
438  */
439 void
440 sbflush(struct sockbuf *sb)
441 {
442 	while (sb->sb_mbcnt) {
443 		/*
444 		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
445 		 * we would loop forever. Panic instead.
446 		 */
447 		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
448 			break;
449 		sbdrop(sb, (int)sb->sb_cc);
450 	}
451 	KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf),
452 	    ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
453 	    sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf));
454 }
455 
456 /*
457  * Drop data from (the front of) a sockbuf.  If the current record is
458  * exhausted this routine will move onto the next one and continue dropping
459  * data.
460  */
461 void
462 sbdrop(struct sockbuf *sb, int len)
463 {
464 	struct mbuf *m;
465 	struct mbuf *free_chain = NULL;
466 
467 	sbcheck(sb);
468 	crit_enter();
469 
470 	m = sb->sb_mb;
471 	while (m && len > 0) {
472 		if (m->m_len > len) {
473 			m->m_len -= len;
474 			m->m_data += len;
475 			sb->sb_cc -= len;
476 			break;
477 		}
478 		len -= m->m_len;
479 		m = sbunlinkmbuf(sb, m, &free_chain);
480 		if (m == NULL && len)
481 			m = sb->sb_mb;
482 	}
483 
484 	/*
485 	 * Remove any trailing 0-length mbufs in the current record.  If
486 	 * the last record for which data was removed is now empty, m will be
487 	 * NULL.
488 	 */
489 	while (m && m->m_len == 0) {
490 		m = sbunlinkmbuf(sb, m, &free_chain);
491 	}
492 	crit_exit();
493 	if (free_chain)
494 		m_freem(free_chain);
495 	sbcheck(sb);
496 }
497 
498 /*
499  * Drop a record off the front of a sockbuf and move the next record
500  * to the front.
501  *
502  * Must be called while holding a critical section.
503  */
504 void
505 sbdroprecord(struct sockbuf *sb)
506 {
507 	struct mbuf *m;
508 	struct mbuf *n;
509 
510 	sbcheck(sb);
511 	m = sb->sb_mb;
512 	if (m) {
513 		if ((sb->sb_mb = m->m_nextpkt) == NULL) {
514 			sb->sb_lastrecord = NULL;
515 			sb->sb_lastmbuf = NULL;
516 		}
517 		m->m_nextpkt = NULL;
518 		for (n = m; n; n = n->m_next)
519 			sbfree(sb, n);
520 		m_freem(m);
521 		sbcheck(sb);
522 	}
523 }
524 
525 /*
526  * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
527  * Currently only the head mbuf of the sockbuf may be dropped this way.
528  *
529  * The next mbuf in the same record as the mbuf being removed is returned
530  * or NULL if the record is exhausted.  Note that other records may remain
531  * in the sockbuf when NULL is returned.
532  *
533  * Must be called while holding a critical section.
534  */
535 struct mbuf *
536 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain)
537 {
538 	struct mbuf *n;
539 
540 	KKASSERT(sb->sb_mb == m);
541 	sbfree(sb, m);
542 	n = m->m_next;
543 	if (n) {
544 		sb->sb_mb = n;
545 		if (sb->sb_lastrecord == m)
546 			sb->sb_lastrecord = n;
547 		KKASSERT(sb->sb_lastmbuf != m);
548 		n->m_nextpkt = m->m_nextpkt;
549 	} else {
550 		sb->sb_mb = m->m_nextpkt;
551 		if (sb->sb_lastrecord == m) {
552 			KKASSERT(sb->sb_mb == NULL);
553 			sb->sb_lastrecord = NULL;
554 		}
555 		if (sb->sb_mb == NULL)
556 			sb->sb_lastmbuf = NULL;
557 	}
558 	m->m_nextpkt = NULL;
559 	if (free_chain) {
560 		m->m_next = *free_chain;
561 		*free_chain = m;
562 	} else {
563 		m->m_next = NULL;
564 	}
565 	return(n);
566 }
567 
568 /*
569  * Create a "control" mbuf containing the specified data
570  * with the specified type for presentation on a socket buffer.
571  */
572 struct mbuf *
573 sbcreatecontrol(caddr_t p, int size, int type, int level)
574 {
575 	struct cmsghdr *cp;
576 	struct mbuf *m;
577 
578 	if (CMSG_SPACE((u_int)size) > MCLBYTES)
579 		return (NULL);
580 	m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL);
581 	if (m == NULL)
582 		return (NULL);
583 	m->m_len = CMSG_SPACE(size);
584 	cp = mtod(m, struct cmsghdr *);
585 	if (p != NULL)
586 		memcpy(CMSG_DATA(cp), p, size);
587 	cp->cmsg_len = CMSG_LEN(size);
588 	cp->cmsg_level = level;
589 	cp->cmsg_type = type;
590 	mbuftrackid(m, 24);
591 	return (m);
592 }
593 
594