/*
 * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
 * $DragonFly: src/sys/kern/uipc_sockbuf.c,v 1.3 2007/08/09 01:10:04 dillon Exp $
 */

#include "opt_param.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/file.h>	/* for maxfiles */
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <sys/thread2.h>
#include <sys/msgport2.h>

/*
 * Routines to add and remove data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer.  sbappendrecord() differs from
 * sbappend() in that data supplied is treated as the beginning of a new
 * record.  sbappend() only begins a new record if the last mbuf in the
 * sockbuf is marked M_EOR.
 *
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be
 * used.  These functions also begin a new record.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer by a protocol with m_copy for output to a peer, and then
 * removed from the socket buffer with sbdrop() or sbdroprecord() when
 * the data is acknowledged by the peer.
 */

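#if 0
/*
 * Illustrative sketch only, not part of the original file: how a
 * record-oriented protocol might use the append routines.  The helper
 * name example_queue_messages() and its arguments are hypothetical;
 * locking and error handling are elided.
 */
static void
example_queue_messages(struct sockbuf *sb, struct mbuf *part1,
		       struct mbuf *part2, struct mbuf *next_msg)
{
	sbappendrecord(sb, part1);	/* part1 starts a new record */
	sbappend(sb, part2);		/* merged into the same record, since
					 * part1 is not marked M_EOR */
	sbappendrecord(sb, next_msg);	/* next_msg becomes a separate record */
}
#endif
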
/*
 * Append mbuf chain m to the last record in the socket buffer sb.
 * The additional space associated with the mbuf chain is recorded in sb.
 * Empty mbufs are discarded and mbufs are compacted where possible.
 *
 * If M_EOR is set in the first or last mbuf of the last record, the
 * mbuf chain is appended as a new record.  M_EOR is usually just set
 * in the last mbuf of the last record's mbuf chain (see sbcompress()),
 * but this may be changed in the future since there is no real need
 * to propagate the flag any more.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf *n;

	mbuftrackid(m, 16);

	if (m) {
		n = sb->sb_lastrecord;
		if (n) {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m);
				return;
			}
		}
		n = sb->sb_lastmbuf;
		if (n) {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m);
				return;
			}
		}
		sbcompress(sb, m, n);
	}
}

/*
 * sbappendstream() is an optimized form of sbappend() for protocols
 * such as TCP that only have one record in the socket buffer, are
 * not PR_ATOMIC, and do not allow MT_CONTROL data.  A protocol that
 * uses sbappendstream() must use sbappendstream() exclusively.
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{
	mbuftrackid(m, 17);
	KKASSERT(m->m_nextpkt == NULL);
	sbcompress(sb, m, sb->sb_lastmbuf);
}

#ifdef SOCKBUF_DEBUG

void
_sbcheck(struct sockbuf *sb)
{
	struct mbuf *m;
	struct mbuf *n = NULL;
	u_long len = 0, mbcnt = 0;

	for (m = sb->sb_mb; m; m = n) {
	    n = m->m_nextpkt;
	    if (n == NULL && sb->sb_lastrecord != m) {
		    kprintf("sockbuf %p mismatched lastrecord %p vs %p\n",
			    sb, sb->sb_lastrecord, m);
		    panic("sbcheck1");
	    }
	    for (; m; m = m->m_next) {
		len += m->m_len;
		mbcnt += MSIZE;
		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
			mbcnt += m->m_ext.ext_size;
		if (n == NULL && m->m_next == NULL) {
			if (sb->sb_lastmbuf != m) {
				kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n",
					sb, sb->sb_lastmbuf, m);
				panic("sbcheck2");
			}
		}
	    }
	}
	if (sb->sb_mb == NULL) {
	    if (sb->sb_lastrecord != NULL) {
		kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
			sb, sb->sb_lastrecord);
		panic("sbcheck3");
	    }
	    if (sb->sb_lastmbuf != NULL) {
		kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
			sb, sb->sb_lastmbuf);
		panic("sbcheck4");
	    }
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
		    sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt);
		panic("sbcheck5");
	}
}

#endif

/*
 * Same as sbappend(), except the mbuf chain begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf *firstmbuf;
	struct mbuf *secondmbuf;

	if (m0 == NULL)
		return;
	mbuftrackid(m0, 18);

	sbcheck(sb);

	/*
	 * Break the first mbuf off from the rest of the mbuf chain.
	 */
	firstmbuf = m0;
	secondmbuf = m0->m_next;
	m0->m_next = NULL;

	/*
	 * Insert the first mbuf of the m0 mbuf chain as the last record of
	 * the sockbuf.  Note this permits zero length records!  Keep the
	 * sockbuf state consistent.
	 */
	if (sb->sb_mb == NULL)
		sb->sb_mb = firstmbuf;
	else
		sb->sb_lastrecord->m_nextpkt = firstmbuf;
	sb->sb_lastrecord = firstmbuf;	/* update hint for new last record */
	sb->sb_lastmbuf = firstmbuf;	/* update hint for new last mbuf */

	/*
	 * Propagate the EOR flag so sbcompress() can pick it up.
	 */
	if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) {
		firstmbuf->m_flags &= ~M_EOR;
		secondmbuf->m_flags |= M_EOR;
	}

	/*
	 * The succeeding call to sbcompress() omits accounting for
	 * the first mbuf, so do it here.
	 */
	sballoc(sb, firstmbuf);

	/* Compact the rest of the mbuf chain in after the first mbuf. */
	sbcompress(sb, secondmbuf, firstmbuf);
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present, m0 must include a
 * packet header with total length.  Returns 0 if no mbuf could be
 * allocated or if the address does not fit in a single mbuf, nonzero
 * on success.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	     struct mbuf *control)
{
	struct mbuf *m, *n;
	int eor;

	mbuftrackid(m0, 19);
	mbuftrackid(control, 20);
	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");
	sbcheck(sb);

	for (n = control; n; n = n->m_next) {
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, MB_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	KKASSERT(m->m_nextpkt == NULL);
	m->m_len = asa->sa_len;
	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);

	if (sb->sb_mb == NULL)
		sb->sb_mb = m;
	else
		sb->sb_lastrecord->m_nextpkt = m;
	sb->sb_lastrecord = m;

	/*
	 * Propagate M_EOR to the last mbuf and calculate sb_lastmbuf
	 * so sbappend() can find it.
	 */
	eor = m->m_flags;
	while (m->m_next) {
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		eor |= m->m_flags;
	}
	m->m_flags |= eor & M_EOR;
	sb->sb_lastmbuf = m;

	return (1);
}

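#if 0
/*
 * Illustrative sketch only, not part of the original file: how a
 * datagram-style protocol might queue an incoming packet together with
 * its sender's address, dropping the packet if the append fails.  The
 * helper name example_deliver_dgram() and its arguments are
 * hypothetical; m is expected to carry a packet header (M_PKTHDR), and
 * locking and socket wakeup are deliberately elided.
 */
static void
example_deliver_dgram(struct sockbuf *rcv_sb, const struct sockaddr *from,
		      struct mbuf *m)
{
	/* sbappendaddr() returns 0 when it cannot allocate the name mbuf */
	if (sbappendaddr(rcv_sb, from, m, NULL) == 0)
		m_freem(m);
}
#endif
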
/*
 * Append control information followed by data.  Both the control and data
 * must be non-NULL.
 */
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf *n;
	u_int length, cmbcnt, m0mbcnt;
	int eor;

	KASSERT(control != NULL, ("sbappendcontrol"));
	KKASSERT(control->m_nextpkt == NULL);
	sbcheck(sb);

	mbuftrackid(m0, 21);
	mbuftrackid(control, 22);

	length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt);

	KKASSERT(m0 != NULL);

	n->m_next = m0;			/* concatenate data to control */

	if (sb->sb_mb == NULL)
		sb->sb_mb = control;
	else
		sb->sb_lastrecord->m_nextpkt = control;
	sb->sb_lastrecord = control;

	/*
	 * Propagate M_EOR to the last mbuf and calculate sb_lastmbuf
	 * so sbappend() can find it.
	 */
	eor = m0->m_flags;
	while (m0->m_next) {
		m0->m_flags &= ~M_EOR;
		m0 = m0->m_next;
		eor |= m0->m_flags;
	}
	m0->m_flags |= eor & M_EOR;
	sb->sb_lastmbuf = m0;

	sb->sb_cc += length;
	sb->sb_mbcnt += cmbcnt + m0mbcnt;

	return (1);
}

/*
 * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
 * If tailm is null, the buffer is presumed empty.  Also, as a side-effect,
 * increment the sockbuf counts for each mbuf in the chain.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm)
{
	int eor = 0;
	struct mbuf *free_chain = NULL;

	mbuftrackid(m, 23);

	sbcheck(sb);
	while (m) {
		struct mbuf *o;

		eor |= m->m_flags & M_EOR;
		/*
		 * Disregard empty mbufs as long as we don't encounter
		 * an end-of-record, or there is a trailing mbuf of
		 * the same type to which the EOR flag can be propagated.
		 *
		 * Defer the m_free() call because it can block and break
		 * the atomicity of the sockbuf.
		 */
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = tailm)) &&
		      o->m_type == m->m_type))) {
			o = m->m_next;
			m->m_next = free_chain;
			free_chain = m;
			m = o;
			continue;
		}

		/*
		 * See if we can coalesce with preceding mbuf.  Never try
		 * to coalesce a mbuf representing an end-of-record or
		 * a mbuf locked by userland for reading.
		 */
		if (tailm && !(tailm->m_flags & (M_EOR | M_SOLOCKED)) &&
		    M_WRITABLE(tailm) &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(tailm) &&
		    tailm->m_type == m->m_type) {
			u_long mbcnt_sz;

			bcopy(mtod(m, caddr_t),
			      mtod(tailm, caddr_t) + tailm->m_len,
			      (unsigned)m->m_len);
			tailm->m_len += m->m_len;

			sb->sb_cc += m->m_len;		/* update sb counter */

			/*
			 * Fix the wrongly updated mbcnt_prealloc
			 */
			mbcnt_sz = MSIZE;
			if (m->m_flags & M_EXT)
				mbcnt_sz += m->m_ext.ext_size;
			atomic_subtract_long(&sb->sb_mbcnt_prealloc, mbcnt_sz);

			o = m->m_next;
			m->m_next = free_chain;
			free_chain = m;
			m = o;
			continue;
		}

		/* Insert whole mbuf. */
		if (tailm == NULL) {
			KASSERT(sb->sb_mb == NULL,
				("sbcompress: sb_mb not NULL"));
			sb->sb_mb = m;		/* only mbuf in sockbuf */
			sb->sb_lastrecord = m;	/* new last record */
		} else {
			tailm->m_next = m;	/* tack m on following tailm */
		}
		sb->sb_lastmbuf = m;	/* update last mbuf hint */

		tailm = m;	/* just inserted mbuf becomes the new tail */
		m = m->m_next;		/* advance to next mbuf */
		tailm->m_next = NULL;	/* split inserted mbuf off from chain */

		/* update sb counters for just added mbuf */
		sballoc(sb, tailm);

		/* clear EOR on intermediate mbufs */
		tailm->m_flags &= ~M_EOR;
	}

	/*
	 * Propagate EOR to the last mbuf.
	 */
	if (eor) {
		if (tailm)
			tailm->m_flags |= eor;
		else
			kprintf("semi-panic: sbcompress\n");
	}

	/*
	 * Clean up any deferred frees.
	 */
	while (free_chain)
		free_chain = m_free(free_chain);

	sbcheck(sb);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Break out instead and let the
		 * KASSERT below catch the inconsistency.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf),
	    ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
	    sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf));
}

/*
 * Drop data from (the front of) a sockbuf.  If the current record is
 * exhausted this routine will move onto the next one and continue dropping
 * data.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf *m;
	struct mbuf *free_chain = NULL;

	sbcheck(sb);
	crit_enter();

	m = sb->sb_mb;
	while (m && len > 0) {
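		/*
		 * If only part of this mbuf is being dropped, trim it in
		 * place by advancing m_data; otherwise unlink the whole
		 * mbuf and continue with the remainder of the record.
		 */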
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			atomic_subtract_long(&sb->sb_cc_prealloc, len);
			break;
		}
		len -= m->m_len;
		m = sbunlinkmbuf(sb, m, &free_chain);
		if (m == NULL && len)
			m = sb->sb_mb;
	}

	/*
	 * Remove any trailing 0-length mbufs in the current record.  If
	 * the last record for which data was removed is now empty, m will be
	 * NULL.
	 */
	while (m && m->m_len == 0) {
		m = sbunlinkmbuf(sb, m, &free_chain);
	}
	crit_exit();
	if (free_chain)
		m_freem(free_chain);
	sbcheck(sb);
}

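#if 0
/*
 * Illustrative sketch only, not part of the original file: a reliable
 * protocol typically calls sbdrop() on its send buffer once the peer has
 * acknowledged data.  The helper name example_ack_data() and the 'acked'
 * parameter are hypothetical; locking is elided.
 */
static void
example_ack_data(struct sockbuf *snd_sb, int acked)
{
	if (acked > 0)
		sbdrop(snd_sb, acked);	/* release acknowledged bytes */
}
#endif
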
/*
 * Drop a record off the front of a sockbuf and move the next record
 * to the front.
 *
 * Must be called while holding a critical section.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf *m;
	struct mbuf *n;

	sbcheck(sb);
	m = sb->sb_mb;
	if (m) {
		if ((sb->sb_mb = m->m_nextpkt) == NULL) {
			sb->sb_lastrecord = NULL;
			sb->sb_lastmbuf = NULL;
		}
		m->m_nextpkt = NULL;
		for (n = m; n; n = n->m_next)
			sbfree(sb, n);
		m_freem(m);
		sbcheck(sb);
	}
}

/*
 * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
 * Currently only the head mbuf of the sockbuf may be dropped this way.
 *
 * The next mbuf in the same record as the mbuf being removed is returned
 * or NULL if the record is exhausted.  Note that other records may remain
 * in the sockbuf when NULL is returned.
 *
 * Must be called while holding a critical section.
 */
struct mbuf *
sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain)
{
	struct mbuf *n;

	KKASSERT(sb->sb_mb == m);
	sbfree(sb, m);
	n = m->m_next;
	if (n) {
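		/*
		 * More mbufs remain in this record: the following mbuf
		 * becomes the new head of the sockbuf and inherits the
		 * link to the next record.
		 */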
		sb->sb_mb = n;
		if (sb->sb_lastrecord == m)
			sb->sb_lastrecord = n;
		KKASSERT(sb->sb_lastmbuf != m);
		n->m_nextpkt = m->m_nextpkt;
	} else {
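		/*
		 * The record is exhausted: advance the sockbuf to the next
		 * record and clear the hints if the sockbuf is now empty.
		 */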
		sb->sb_mb = m->m_nextpkt;
		if (sb->sb_lastrecord == m) {
			KKASSERT(sb->sb_mb == NULL);
			sb->sb_lastrecord = NULL;
		}
		if (sb->sb_mb == NULL)
			sb->sb_lastmbuf = NULL;
	}
	m->m_nextpkt = NULL;
	if (free_chain) {
		m->m_next = *free_chain;
		*free_chain = m;
	} else {
		m->m_next = NULL;
	}
	return (n);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MCLBYTES)
		return (NULL);
	m = m_getl(CMSG_SPACE((u_int)size), MB_DONTWAIT, MT_CONTROL, 0, NULL);
	if (m == NULL)
		return (NULL);
	m->m_len = CMSG_SPACE(size);
	cp = mtod(m, struct cmsghdr *);
	if (p != NULL)
		memcpy(CMSG_DATA(cp), p, size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	mbuftrackid(m, 24);
	return (m);
}
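
#if 0
/*
 * Illustrative sketch only, not part of the original file: building a
 * control mbuf carrying a receive timestamp, roughly the way a protocol
 * honoring SO_TIMESTAMP would, and appending it together with the data.
 * The helper name example_deliver_with_timestamp() and its arguments are
 * hypothetical; locking and socket wakeup are elided.
 */
static void
example_deliver_with_timestamp(struct sockbuf *rcv_sb,
			       const struct sockaddr *from, struct mbuf *m)
{
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
				  SCM_TIMESTAMP, SOL_SOCKET);
	if (sbappendaddr(rcv_sb, from, m, control) == 0) {
		m_freem(m);
		if (control != NULL)
			m_freem(control);
	}
}
#endif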