xref: /dragonfly/sys/kern/uipc_sockbuf.c (revision 03517d4e)
1 /*
2  * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
3  * Copyright (c) 1982, 1986, 1988, 1990, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Neither the name of the University nor the names of its contributors
15  *    may be used to endorse or promote products derived from this software
16  *    without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * @(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
31  * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
32  */
33 
34 #include "opt_param.h"
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/domain.h>
38 #include <sys/file.h>	/* for maxfiles */
39 #include <sys/kernel.h>
40 #include <sys/proc.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/stat.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 
49 #include <sys/thread2.h>
50 #include <sys/msgport2.h>
51 
/*
 * Routines to add and remove data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer.  sbappendrecord() differs from
 * sbappend() in that the data supplied is treated as the beginning of a
 * new record.  sbappend() only begins a new record if the last mbuf in
 * the sockbuf is marked M_EOR.
 *
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be
 * used.  These functions also begin a new record.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer, and is
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * once the peer has acknowledged it.
 */
71 
72 /*
73  * Append mbuf chain m to the last record in the socket buffer sb.
74  * The additional space associated the mbuf chain is recorded in sb.
75  * Empty mbufs are discarded and mbufs are compacted where possible.
76  *
77  * If M_EOR is set in the first or last mbuf of the last record, the
78  * mbuf chain is appended as a new record.  M_EOR is usually just set
79  * in the last mbuf of the last record's mbuf chain (see sbcompress()),
80  * but this may be changed in the future since there is no real need
81  * to propogate the flag any more.
82  */
83 void
84 sbappend(struct sockbuf *sb, struct mbuf *m)
85 {
86 	struct mbuf *n;
87 
88 	mbuftrackid(m, 16);
89 
90 	if (m) {
91 		n = sb->sb_lastrecord;
92 		if (n) {
93 			if (n->m_flags & M_EOR) {
94 				sbappendrecord(sb, m);
95 				return;
96 			}
97 		}
98 		n = sb->sb_lastmbuf;
99 		if (n) {
100 			if (n->m_flags & M_EOR) {
101 				sbappendrecord(sb, m);
102 				return;
103 			}
104 		}
105 		sbcompress(sb, m, n);
106 	}
107 }
108 
109 /*
110  * sbappendstream() is an optimized form of sbappend() for protocols
111  * such as TCP that only have one record in the socket buffer, are
112  * not PR_ATOMIC, nor allow MT_CONTROL data.  A protocol that uses
113  * sbappendstream() must use sbappendstream() exclusively.
114  */
115 void
116 sbappendstream(struct sockbuf *sb, struct mbuf *m)
117 {
118 	mbuftrackid(m, 17);
119 	KKASSERT(m->m_nextpkt == NULL);
120 	sbcompress(sb, m, sb->sb_lastmbuf);
121 }
122 
123 #ifdef SOCKBUF_DEBUG
124 
125 void
126 _sbcheck(struct sockbuf *sb)
127 {
128 	struct mbuf *m;
129 	struct mbuf *n = NULL;
130 	u_long len = 0, mbcnt = 0;
131 
132 	for (m = sb->sb_mb; m; m = n) {
133 	    n = m->m_nextpkt;
134 	    if (n == NULL && sb->sb_lastrecord != m) {
135 		    kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m);
136 		    panic("sbcheck1");
137 
138 	    }
139 	    for (; m; m = m->m_next) {
140 		len += m->m_len;
141 		mbcnt += MSIZE;
142 		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
143 			mbcnt += m->m_ext.ext_size;
144 		if (n == NULL && m->m_next == NULL) {
145 			if (sb->sb_lastmbuf != m) {
146 				kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m);
147 				panic("sbcheck2");
148 			}
149 		}
150 	    }
151 	}
152 	if (sb->sb_mb == NULL) {
153 	    if (sb->sb_lastrecord != NULL) {
154 		kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
155 			sb, sb->sb_lastrecord);
156 		panic("sbcheck3");
157 	    }
158 	    if (sb->sb_lastmbuf != NULL) {
159 		kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
160 			sb, sb->sb_lastmbuf);
161 		panic("sbcheck4");
162 	    }
163 	}
164 	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
165 		kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
166 		    sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt);
167 		panic("sbcheck5");
168 	}
169 }
170 
171 #endif
172 
173 /*
174  * Same as sbappend(), except the mbuf chain begins a new record.
175  */
176 void
177 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
178 {
179 	struct mbuf *firstmbuf;
180 	struct mbuf *secondmbuf;
181 
182 	if (m0 == NULL)
183 		return;
184 	mbuftrackid(m0, 18);
185 
186 	sbcheck(sb);
187 
188 	/*
189 	 * Break the first mbuf off from the rest of the mbuf chain.
190 	 */
191 	firstmbuf = m0;
192 	secondmbuf = m0->m_next;
193 	m0->m_next = NULL;
194 
195 	/*
196 	 * Insert the first mbuf of the m0 mbuf chain as the last record of
197 	 * the sockbuf.  Note this permits zero length records!  Keep the
198 	 * sockbuf state consistent.
199 	 */
200 	if (sb->sb_mb == NULL)
201 		sb->sb_mb = firstmbuf;
202 	else
203 		sb->sb_lastrecord->m_nextpkt = firstmbuf;
204 	sb->sb_lastrecord = firstmbuf;	/* update hint for new last record */
205 	sb->sb_lastmbuf = firstmbuf;	/* update hint for new last mbuf */
206 
207 	/*
208 	 * propagate the EOR flag so sbcompress() can pick it up
209 	 */
210 	if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) {
211 		firstmbuf->m_flags &= ~M_EOR;
212 		secondmbuf->m_flags |= M_EOR;
213 	}
214 
215 	/*
216 	 * The succeeding call to sbcompress() omits accounting for
217 	 * the first mbuf, so do it here.
218 	 */
219 	sballoc(sb, firstmbuf);
220 
221 	/* Compact the rest of the mbuf chain in after the first mbuf. */
222 	sbcompress(sb, secondmbuf, firstmbuf);
223 }
224 
225 /*
226  * Append address and data, and optionally, control (ancillary) data
227  * to the receive queue of a socket.  If present,
228  * m0 must include a packet header with total length.
229  * Returns 0 if insufficient mbufs.
230  */
231 int
232 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
233 	     struct mbuf *control)
234 {
235 	struct mbuf *m, *n;
236 	int eor;
237 
238 	mbuftrackid(m0, 19);
239 	mbuftrackid(control, 20);
240 	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
241 		panic("sbappendaddr");
242 	sbcheck(sb);
243 
244 	for (n = control; n; n = n->m_next) {
245 		if (n->m_next == NULL)	/* keep pointer to last control buf */
246 			break;
247 	}
248 	if (asa->sa_len > MLEN)
249 		return (0);
250 	MGET(m, M_NOWAIT, MT_SONAME);
251 	if (m == NULL)
252 		return (0);
253 	KKASSERT(m->m_nextpkt == NULL);
254 	m->m_len = asa->sa_len;
255 	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
256 	if (n)
257 		n->m_next = m0;		/* concatenate data to control */
258 	else
259 		control = m0;
260 	m->m_next = control;
261 	for (n = m; n; n = n->m_next)
262 		sballoc(sb, n);
263 
264 	if (sb->sb_mb == NULL)
265 		sb->sb_mb = m;
266 	else
267 		sb->sb_lastrecord->m_nextpkt = m;
268 	sb->sb_lastrecord = m;
269 
270 	/*
271 	 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
272 	 * so sbappend() can find it.
273 	 */
274 	eor = m->m_flags;
275 	while (m->m_next) {
276 		m->m_flags &= ~M_EOR;
277 		m = m->m_next;
278 		eor |= m->m_flags;
279 	}
280 	m->m_flags |= eor & M_EOR;
281 	sb->sb_lastmbuf = m;
282 
283 	return (1);
284 }
285 
286 /*
287  * Append control information followed by data. Both the control and data
288  * must be non-null.
289  */
290 int
291 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
292 {
293 	struct mbuf *n;
294 	u_int length, cmbcnt, m0mbcnt;
295 	int eor;
296 
297 	KASSERT(control != NULL, ("sbappendcontrol"));
298 	KKASSERT(control->m_nextpkt == NULL);
299 	sbcheck(sb);
300 
301 	mbuftrackid(m0, 21);
302 	mbuftrackid(control, 22);
303 
304 	length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt);
305 
306 	KKASSERT(m0 != NULL);
307 
308 	n->m_next = m0;			/* concatenate data to control */
309 
310 	if (sb->sb_mb == NULL)
311 		sb->sb_mb = control;
312 	else
313 		sb->sb_lastrecord->m_nextpkt = control;
314 	sb->sb_lastrecord = control;
315 
316 	/*
317 	 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
318 	 * so sbappend() can find it.
319 	 */
320 	eor = m0->m_flags;
321 	while (m0->m_next) {
322 		m0->m_flags &= ~M_EOR;
323 		m0 = m0->m_next;
324 		eor |= m0->m_flags;
325 	}
326 	m0->m_flags |= eor & M_EOR;
327 	sb->sb_lastmbuf = m0;
328 
329 	sb->sb_cc += length;
330 	sb->sb_mbcnt += cmbcnt + m0mbcnt;
331 
332 	return (1);
333 }
334 
335 /*
336  * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
337  * If tailm is null, the buffer is presumed empty.  Also, as a side-effect,
338  * increment the sockbuf counts for each mbuf in the chain.
339  */
340 void
341 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm)
342 {
343 	int eor = 0;
344 	struct mbuf *free_chain = NULL;
345 
346 	mbuftrackid(m, 23);
347 
348 	sbcheck(sb);
349 	while (m) {
350 		struct mbuf *o;
351 
352 		eor |= m->m_flags & M_EOR;
353 		/*
354 		 * Disregard empty mbufs as long as we don't encounter
355 		 * an end-of-record or there is a trailing mbuf of
356 		 * the same type to propagate the EOR flag to.
357 		 *
358 		 * Defer the m_free() call because it can block and break
359 		 * the atomicy of the sockbuf.
360 		 */
361 		if (m->m_len == 0 &&
362 		    (eor == 0 ||
363 		     (((o = m->m_next) || (o = tailm)) &&
364 		      o->m_type == m->m_type))) {
365 			o = m->m_next;
366 			m->m_next = free_chain;
367 			free_chain = m;
368 			m = o;
369 			continue;
370 		}
371 
372 		/*
373 		 * See if we can coalesce with preceding mbuf.  Never try
374 		 * to coalesce a mbuf representing an end-of-record or
375 		 * a mbuf locked by userland for reading.
376 		 */
377 		if (tailm && !(tailm->m_flags & (M_EOR | M_SOLOCKED)) &&
378 		    M_WRITABLE(tailm) &&
379 		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
380 		    m->m_len <= M_TRAILINGSPACE(tailm) &&
381 		    tailm->m_type == m->m_type) {
382 			u_long mbcnt_sz;
383 
384 			bcopy(mtod(m, caddr_t),
385 			      mtod(tailm, caddr_t) + tailm->m_len,
386 			      (unsigned)m->m_len);
387 			tailm->m_len += m->m_len;
388 
389 			sb->sb_cc += m->m_len;		/* update sb counter */
390 
391 			/*
392 			 * Fix the wrongly updated mbcnt_prealloc
393 			 */
394 			mbcnt_sz = MSIZE;
395 			if (m->m_flags & M_EXT)
396 				mbcnt_sz += m->m_ext.ext_size;
397 			atomic_subtract_long(&sb->sb_mbcnt_prealloc, mbcnt_sz);
398 
399 			o = m->m_next;
400 			m->m_next = free_chain;
401 			free_chain = m;
402 			m = o;
403 			continue;
404 		}
405 
406 		/* Insert whole mbuf. */
407 		if (tailm == NULL) {
408 			KASSERT(sb->sb_mb == NULL,
409 				("sbcompress: sb_mb not NULL"));
410 			sb->sb_mb = m;		/* only mbuf in sockbuf */
411 			sb->sb_lastrecord = m;	/* new last record */
412 		} else {
413 			tailm->m_next = m;	/* tack m on following tailm */
414 		}
415 		sb->sb_lastmbuf = m;	/* update last mbuf hint */
416 
417 		tailm = m;	/* just inserted mbuf becomes the new tail */
418 		m = m->m_next;		/* advance to next mbuf */
419 		tailm->m_next = NULL;	/* split inserted mbuf off from chain */
420 
421 		/* update sb counters for just added mbuf */
422 		sballoc(sb, tailm);
423 
424 		/* clear EOR on intermediate mbufs */
425 		tailm->m_flags &= ~M_EOR;
426 	}
427 
428 	/*
429 	 * Propogate EOR to the last mbuf
430 	 */
431 	if (eor) {
432 		if (tailm)
433 			tailm->m_flags |= eor;
434 		else
435 			kprintf("semi-panic: sbcompress");
436 	}
437 
438 	/*
439 	 * Clean up any defered frees.
440 	 */
441 	while (free_chain)
442 		free_chain = m_free(free_chain);
443 
444 	sbcheck(sb);
445 }
446 
447 /*
448  * Free all mbufs in a sockbuf.
449  * Check that all resources are reclaimed.
450  */
451 void
452 sbflush(struct sockbuf *sb)
453 {
454 	while (sb->sb_mbcnt) {
455 		/*
456 		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
457 		 * we would loop forever. Panic instead.
458 		 */
459 		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
460 			break;
461 		sbdrop(sb, (int)sb->sb_cc);
462 	}
463 	KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf),
464 	    ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
465 	    sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf));
466 }
467 
468 /*
469  * Drop data from (the front of) a sockbuf.  If the current record is
470  * exhausted this routine will move onto the next one and continue dropping
471  * data.
472  */
473 void
474 sbdrop(struct sockbuf *sb, int len)
475 {
476 	struct mbuf *m;
477 	struct mbuf *free_chain = NULL;
478 
479 	sbcheck(sb);
480 	crit_enter();
481 
482 	m = sb->sb_mb;
483 	while (m && len > 0) {
484 		if (m->m_len > len) {
485 			m->m_len -= len;
486 			m->m_data += len;
487 			sb->sb_cc -= len;
488 			atomic_subtract_long(&sb->sb_cc_prealloc, len);
489 			break;
490 		}
491 		len -= m->m_len;
492 		m = sbunlinkmbuf(sb, m, &free_chain);
493 		if (m == NULL && len)
494 			m = sb->sb_mb;
495 	}
496 
497 	/*
498 	 * Remove any trailing 0-length mbufs in the current record.  If
499 	 * the last record for which data was removed is now empty, m will be
500 	 * NULL.
501 	 */
502 	while (m && m->m_len == 0) {
503 		m = sbunlinkmbuf(sb, m, &free_chain);
504 	}
505 	crit_exit();
506 	if (free_chain)
507 		m_freem(free_chain);
508 	sbcheck(sb);
509 }
510 
511 /*
512  * Drop a record off the front of a sockbuf and move the next record
513  * to the front.
514  *
515  * Must be called while holding a critical section.
516  */
517 void
518 sbdroprecord(struct sockbuf *sb)
519 {
520 	struct mbuf *m;
521 	struct mbuf *n;
522 
523 	sbcheck(sb);
524 	m = sb->sb_mb;
525 	if (m) {
526 		if ((sb->sb_mb = m->m_nextpkt) == NULL) {
527 			sb->sb_lastrecord = NULL;
528 			sb->sb_lastmbuf = NULL;
529 		}
530 		m->m_nextpkt = NULL;
531 		for (n = m; n; n = n->m_next)
532 			sbfree(sb, n);
533 		m_freem(m);
534 		sbcheck(sb);
535 	}
536 }
537 
538 /*
539  * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
540  * Currently only the head mbuf of the sockbuf may be dropped this way.
541  *
542  * The next mbuf in the same record as the mbuf being removed is returned
543  * or NULL if the record is exhausted.  Note that other records may remain
544  * in the sockbuf when NULL is returned.
545  *
546  * Must be called while holding a critical section.
547  */
548 struct mbuf *
549 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain)
550 {
551 	struct mbuf *n;
552 
553 	KKASSERT(sb->sb_mb == m);
554 	sbfree(sb, m);
555 	n = m->m_next;
556 	if (n) {
557 		sb->sb_mb = n;
558 		if (sb->sb_lastrecord == m)
559 			sb->sb_lastrecord = n;
560 		KKASSERT(sb->sb_lastmbuf != m);
561 		n->m_nextpkt = m->m_nextpkt;
562 	} else {
563 		sb->sb_mb = m->m_nextpkt;
564 		if (sb->sb_lastrecord == m) {
565 			KKASSERT(sb->sb_mb == NULL);
566 			sb->sb_lastrecord = NULL;
567 		}
568 		if (sb->sb_mb == NULL)
569 			sb->sb_lastmbuf = NULL;
570 	}
571 	m->m_nextpkt = NULL;
572 	if (free_chain) {
573 		m->m_next = *free_chain;
574 		*free_chain = m;
575 	} else {
576 		m->m_next = NULL;
577 	}
578 	return(n);
579 }
580 
581 /*
582  * Create a "control" mbuf containing the specified data
583  * with the specified type for presentation on a socket buffer.
584  */
585 struct mbuf *
586 sbcreatecontrol(const void *p, size_t size, int type, int level)
587 {
588 	struct cmsghdr *cp;
589 	struct mbuf *m;
590 
591 	if (CMSG_SPACE(size) > MCLBYTES)
592 		return (NULL);
593 	m = m_getl(CMSG_SPACE(size), M_NOWAIT, MT_CONTROL, 0, NULL);
594 	if (m == NULL)
595 		return (NULL);
596 	m->m_len = CMSG_SPACE(size);
597 	cp = mtod(m, struct cmsghdr *);
598 	if (p != NULL)
599 		memcpy(CMSG_DATA(cp), p, size);
600 	cp->cmsg_len = CMSG_LEN(size);
601 	cp->cmsg_level = level;
602 	cp->cmsg_type = type;
603 	mbuftrackid(m, 24);
604 	return (m);
605 }
606