1 /*
2 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved.
3 * Copyright (c) 1982, 1986, 1988, 1990, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the University nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 *
30 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
31 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
32 */
33
34 #include "opt_param.h"
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/domain.h>
38 #include <sys/file.h> /* for maxfiles */
39 #include <sys/kernel.h>
40 #include <sys/proc.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/protosw.h>
44 #include <sys/resourcevar.h>
45 #include <sys/stat.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48
49 #include <sys/thread2.h>
50 #include <sys/msgport2.h>
51
52 /*
53 * Routines to add and remove data from an mbuf queue.
54 *
55 * The routines sbappend() or sbappendrecord() are normally called to
56 * append new mbufs to a socket buffer. sbappendrecord() differs from
57 * sbappend() in that data supplied is treated as the beginning of a new
58 * record. sbappend() only begins a new record if the last mbuf in the
59 * sockbuf is marked M_EOR.
60 *
61 * To place a sender's address, optional access rights, and data in a
62 * socket receive buffer, sbappendaddr() or sbappendcontrol() should be
63 * used. These functions also begin a new record.
64 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  A protocol normally copies data out of the
 * socket send buffer with m_copym for output to a peer, and the data is
 * then removed from the socket buffer with sbdrop() or sbdroprecord()
 * once it has been acknowledged by the peer.
70 */
71
72 /*
73 * Append mbuf chain m to the last record in the socket buffer sb.
74 * The additional space associated the mbuf chain is recorded in sb.
75 * Empty mbufs are discarded and mbufs are compacted where possible.
76 *
77 * If M_EOR is set in the first or last mbuf of the last record, the
78 * mbuf chain is appended as a new record. M_EOR is usually just set
79 * in the last mbuf of the last record's mbuf chain (see sbcompress()),
80 * but this may be changed in the future since there is no real need
81 * to propogate the flag any more.
82 */
83 void
sbappend(struct sockbuf * sb,struct mbuf * m)84 sbappend(struct sockbuf *sb, struct mbuf *m)
85 {
86 struct mbuf *n;
87
88 mbuftrackid(m, 16);
89
90 if (m) {
91 n = sb->sb_lastrecord;
92 if (n) {
93 if (n->m_flags & M_EOR) {
94 sbappendrecord(sb, m);
95 return;
96 }
97 }
98 n = sb->sb_lastmbuf;
99 if (n) {
100 if (n->m_flags & M_EOR) {
101 sbappendrecord(sb, m);
102 return;
103 }
104 }
105 sbcompress(sb, m, n);
106 }
107 }
108
109 /*
110 * sbappendstream() is an optimized form of sbappend() for protocols
111 * such as TCP that only have one record in the socket buffer, are
112 * not PR_ATOMIC, nor allow MT_CONTROL data. A protocol that uses
113 * sbappendstream() must use sbappendstream() exclusively.
114 */
115 void
sbappendstream(struct sockbuf * sb,struct mbuf * m)116 sbappendstream(struct sockbuf *sb, struct mbuf *m)
117 {
118 mbuftrackid(m, 17);
119 KKASSERT(m->m_nextpkt == NULL);
120 sbcompress(sb, m, sb->sb_lastmbuf);
121 }
122
123 #ifdef SOCKBUF_DEBUG
124
125 void
_sbcheck(struct sockbuf * sb)126 _sbcheck(struct sockbuf *sb)
127 {
128 struct mbuf *m;
129 struct mbuf *n = NULL;
130 u_long len = 0, mbcnt = 0;
131
132 for (m = sb->sb_mb; m; m = n) {
133 n = m->m_nextpkt;
134 if (n == NULL && sb->sb_lastrecord != m) {
135 kprintf("sockbuf %p mismatched lastrecord %p vs %p\n", sb, sb->sb_lastrecord, m);
136 panic("sbcheck1");
137 }
138 for (; m; m = m->m_next) {
139 len += m->m_len;
140 mbcnt += MSIZE;
141 if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
142 mbcnt += m->m_ext.ext_size;
143 if (n == NULL && m->m_next == NULL) {
144 if (sb->sb_lastmbuf != m) {
145 kprintf("sockbuf %p mismatched lastmbuf %p vs %p\n", sb, sb->sb_lastmbuf, m);
146 panic("sbcheck2");
147 }
148 }
149 }
150 }
151 if (sb->sb_mb == NULL) {
152 if (sb->sb_lastrecord != NULL) {
153 kprintf("sockbuf %p is empty, lastrecord not NULL: %p\n",
154 sb, sb->sb_lastrecord);
155 panic("sbcheck3");
156 }
157 if (sb->sb_lastmbuf != NULL) {
158 kprintf("sockbuf %p is empty, lastmbuf not NULL: %p\n",
159 sb, sb->sb_lastmbuf);
160 panic("sbcheck4");
161 }
162 }
163 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
164 kprintf("sockbuf %p cc %ld != %ld || mbcnt %ld != %ld\n",
165 sb, len, sb->sb_cc, mbcnt, sb->sb_mbcnt);
166 panic("sbcheck5");
167 }
168 }
169
170 #endif
171
172 /*
173 * Same as sbappend(), except the mbuf chain begins a new record.
174 */
175 void
sbappendrecord(struct sockbuf * sb,struct mbuf * m0)176 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
177 {
178 struct mbuf *firstmbuf;
179 struct mbuf *secondmbuf;
180
181 if (m0 == NULL)
182 return;
183 mbuftrackid(m0, 18);
184
185 sbcheck(sb);
186
187 /*
188 * Break the first mbuf off from the rest of the mbuf chain.
189 */
190 firstmbuf = m0;
191 secondmbuf = m0->m_next;
192 m0->m_next = NULL;
193
194 /*
195 * Insert the first mbuf of the m0 mbuf chain as the last record of
196 * the sockbuf. Note this permits zero length records! Keep the
197 * sockbuf state consistent.
198 */
199 if (sb->sb_mb == NULL)
200 sb->sb_mb = firstmbuf;
201 else
202 sb->sb_lastrecord->m_nextpkt = firstmbuf;
203 sb->sb_lastrecord = firstmbuf; /* update hint for new last record */
204 sb->sb_lastmbuf = firstmbuf; /* update hint for new last mbuf */
205
206 /*
207 * propagate the EOR flag so sbcompress() can pick it up
208 */
209 if ((firstmbuf->m_flags & M_EOR) && (secondmbuf != NULL)) {
210 firstmbuf->m_flags &= ~M_EOR;
211 secondmbuf->m_flags |= M_EOR;
212 }
213
214 /*
215 * The succeeding call to sbcompress() omits accounting for
216 * the first mbuf, so do it here.
217 */
218 sballoc(sb, firstmbuf);
219
220 /* Compact the rest of the mbuf chain in after the first mbuf. */
221 sbcompress(sb, secondmbuf, firstmbuf);
222 }
223
224 /*
225 * Append address and data, and optionally, control (ancillary) data
226 * to the receive queue of a socket. If present,
227 * m0 must include a packet header with total length.
228 * Returns 0 if insufficient mbufs.
229 */
230 int
sbappendaddr(struct sockbuf * sb,const struct sockaddr * asa,struct mbuf * m0,struct mbuf * control)231 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
232 struct mbuf *control)
233 {
234 struct mbuf *m, *n;
235 int eor;
236
237 mbuftrackid(m0, 19);
238 mbuftrackid(control, 20);
239 if (m0 && (m0->m_flags & M_PKTHDR) == 0)
240 panic("sbappendaddr");
241 sbcheck(sb);
242
243 for (n = control; n; n = n->m_next) {
244 if (n->m_next == NULL) /* keep pointer to last control buf */
245 break;
246 }
247 if (asa->sa_len > MLEN)
248 return (0);
249 MGET(m, M_NOWAIT, MT_SONAME);
250 if (m == NULL)
251 return (0);
252 KKASSERT(m->m_nextpkt == NULL);
253 m->m_len = asa->sa_len;
254 bcopy(asa, mtod(m, caddr_t), asa->sa_len);
255 if (n)
256 n->m_next = m0; /* concatenate data to control */
257 else
258 control = m0;
259 m->m_next = control;
260 for (n = m; n; n = n->m_next)
261 sballoc(sb, n);
262
263 if (sb->sb_mb == NULL)
264 sb->sb_mb = m;
265 else
266 sb->sb_lastrecord->m_nextpkt = m;
267 sb->sb_lastrecord = m;
268
269 /*
270 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
271 * so sbappend() can find it.
272 */
273 eor = m->m_flags;
274 while (m->m_next) {
275 m->m_flags &= ~M_EOR;
276 m = m->m_next;
277 eor |= m->m_flags;
278 }
279 m->m_flags |= eor & M_EOR;
280 sb->sb_lastmbuf = m;
281
282 return (1);
283 }
284
285 /*
286 * Append control information followed by data. Both the control and data
287 * must be non-null.
288 */
289 int
sbappendcontrol(struct sockbuf * sb,struct mbuf * m0,struct mbuf * control)290 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
291 {
292 struct mbuf *n;
293 u_int length, cmbcnt, m0mbcnt;
294 int eor;
295
296 KASSERT(control != NULL, ("sbappendcontrol"));
297 KKASSERT(control->m_nextpkt == NULL);
298 sbcheck(sb);
299
300 mbuftrackid(m0, 21);
301 mbuftrackid(control, 22);
302
303 length = m_countm(control, &n, &cmbcnt) + m_countm(m0, NULL, &m0mbcnt);
304
305 KKASSERT(m0 != NULL);
306
307 n->m_next = m0; /* concatenate data to control */
308
309 if (sb->sb_mb == NULL)
310 sb->sb_mb = control;
311 else
312 sb->sb_lastrecord->m_nextpkt = control;
313 sb->sb_lastrecord = control;
314
315 /*
316 * Propogate M_EOR to the last mbuf and calculate sb_lastmbuf
317 * so sbappend() can find it.
318 */
319 eor = m0->m_flags;
320 while (m0->m_next) {
321 m0->m_flags &= ~M_EOR;
322 m0 = m0->m_next;
323 eor |= m0->m_flags;
324 }
325 m0->m_flags |= eor & M_EOR;
326 sb->sb_lastmbuf = m0;
327
328 sb->sb_cc += length;
329 sb->sb_mbcnt += cmbcnt + m0mbcnt;
330
331 return (1);
332 }
333
334 /*
335 * Compress mbuf chain m into the socket buffer sb following mbuf tailm.
336 * If tailm is null, the buffer is presumed empty. Also, as a side-effect,
337 * increment the sockbuf counts for each mbuf in the chain.
338 */
339 void
sbcompress(struct sockbuf * sb,struct mbuf * m,struct mbuf * tailm)340 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *tailm)
341 {
342 int eor = 0;
343 struct mbuf *free_chain = NULL;
344
345 mbuftrackid(m, 23);
346
347 sbcheck(sb);
348 while (m) {
349 struct mbuf *o;
350
351 eor |= m->m_flags & M_EOR;
352 /*
353 * Disregard empty mbufs as long as we don't encounter
354 * an end-of-record or there is a trailing mbuf of
355 * the same type to propagate the EOR flag to.
356 *
357 * Defer the m_free() call because it can block and break
358 * the atomicy of the sockbuf.
359 */
360 if (m->m_len == 0 &&
361 (eor == 0 ||
362 (((o = m->m_next) || (o = tailm)) &&
363 o->m_type == m->m_type))) {
364 o = m->m_next;
365 m->m_next = free_chain;
366 free_chain = m;
367 m = o;
368 continue;
369 }
370
371 /*
372 * See if we can coalesce with preceding mbuf. Never try
373 * to coalesce a mbuf representing an end-of-record or
374 * a mbuf locked by userland for reading.
375 */
376 if (tailm && !(tailm->m_flags & (M_EOR | M_SOLOCKED)) &&
377 M_WRITABLE(tailm) &&
378 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
379 m->m_len <= M_TRAILINGSPACE(tailm) &&
380 tailm->m_type == m->m_type) {
381 u_long mbcnt_sz;
382
383 bcopy(mtod(m, caddr_t),
384 mtod(tailm, caddr_t) + tailm->m_len,
385 (unsigned)m->m_len);
386 tailm->m_len += m->m_len;
387
388 sb->sb_cc += m->m_len; /* update sb counter */
389
390 /*
391 * Fix the wrongly updated mbcnt_prealloc
392 */
393 mbcnt_sz = MSIZE;
394 if (m->m_flags & M_EXT)
395 mbcnt_sz += m->m_ext.ext_size;
396 atomic_subtract_long(&sb->sb_mbcnt_prealloc, mbcnt_sz);
397
398 o = m->m_next;
399 m->m_next = free_chain;
400 free_chain = m;
401 m = o;
402 continue;
403 }
404
405 /* Insert whole mbuf. */
406 if (tailm == NULL) {
407 KASSERT(sb->sb_mb == NULL,
408 ("sbcompress: sb_mb not NULL"));
409 sb->sb_mb = m; /* only mbuf in sockbuf */
410 sb->sb_lastrecord = m; /* new last record */
411 } else {
412 tailm->m_next = m; /* tack m on following tailm */
413 }
414 sb->sb_lastmbuf = m; /* update last mbuf hint */
415
416 tailm = m; /* just inserted mbuf becomes the new tail */
417 m = m->m_next; /* advance to next mbuf */
418 tailm->m_next = NULL; /* split inserted mbuf off from chain */
419
420 /* update sb counters for just added mbuf */
421 sballoc(sb, tailm);
422
423 /* clear EOR on intermediate mbufs */
424 tailm->m_flags &= ~M_EOR;
425 }
426
427 /*
428 * Propogate EOR to the last mbuf
429 */
430 if (eor) {
431 if (tailm)
432 tailm->m_flags |= eor;
433 else
434 kprintf("semi-panic: sbcompress");
435 }
436
437 /*
438 * Clean up any defered frees.
439 */
440 while (free_chain)
441 free_chain = m_free(free_chain);
442
443 sbcheck(sb);
444 }
445
446 /*
447 * Free all mbufs in a sockbuf.
448 * Check that all resources are reclaimed.
449 */
450 void
sbflush(struct sockbuf * sb)451 sbflush(struct sockbuf *sb)
452 {
453 while (sb->sb_mbcnt) {
454 /*
455 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
456 * we would loop forever. Panic instead.
457 */
458 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
459 break;
460 sbdrop(sb, (int)sb->sb_cc);
461 }
462 KASSERT(!(sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_lastmbuf),
463 ("sbflush: cc %ld || mb %p || mbcnt %ld || lastmbuf %p",
464 sb->sb_cc, sb->sb_mb, sb->sb_mbcnt, sb->sb_lastmbuf));
465 }
466
467 /*
468 * Drop data from (the front of) a sockbuf. If the current record is
469 * exhausted this routine will move onto the next one and continue dropping
470 * data.
471 */
472 void
sbdrop(struct sockbuf * sb,int len)473 sbdrop(struct sockbuf *sb, int len)
474 {
475 struct mbuf *m;
476 struct mbuf *free_chain = NULL;
477
478 sbcheck(sb);
479 crit_enter();
480
481 m = sb->sb_mb;
482 while (m && len > 0) {
483 if (m->m_len > len) {
484 m->m_len -= len;
485 m->m_data += len;
486 sb->sb_cc -= len;
487 atomic_subtract_long(&sb->sb_cc_prealloc, len);
488 break;
489 }
490 len -= m->m_len;
491 m = sbunlinkmbuf(sb, m, &free_chain);
492 if (m == NULL && len)
493 m = sb->sb_mb;
494 }
495
496 /*
497 * Remove any trailing 0-length mbufs in the current record. If
498 * the last record for which data was removed is now empty, m will be
499 * NULL.
500 */
501 while (m && m->m_len == 0) {
502 m = sbunlinkmbuf(sb, m, &free_chain);
503 }
504 crit_exit();
505 if (free_chain)
506 m_freem(free_chain);
507 sbcheck(sb);
508 }
509
510 /*
511 * Drop a record off the front of a sockbuf and move the next record
512 * to the front.
513 *
514 * Must be called while holding a critical section.
515 */
516 void
sbdroprecord(struct sockbuf * sb)517 sbdroprecord(struct sockbuf *sb)
518 {
519 struct mbuf *m;
520 struct mbuf *n;
521
522 sbcheck(sb);
523 m = sb->sb_mb;
524 if (m) {
525 if ((sb->sb_mb = m->m_nextpkt) == NULL) {
526 sb->sb_lastrecord = NULL;
527 sb->sb_lastmbuf = NULL;
528 }
529 m->m_nextpkt = NULL;
530 for (n = m; n; n = n->m_next)
531 sbfree(sb, n);
532 m_freem(m);
533 sbcheck(sb);
534 }
535 }
536
537 /*
538 * Drop the first mbuf off the sockbuf and move the next mbuf to the front.
539 * Currently only the head mbuf of the sockbuf may be dropped this way.
540 *
541 * The next mbuf in the same record as the mbuf being removed is returned
542 * or NULL if the record is exhausted. Note that other records may remain
543 * in the sockbuf when NULL is returned.
544 *
545 * Must be called while holding a critical section.
546 */
547 struct mbuf *
sbunlinkmbuf(struct sockbuf * sb,struct mbuf * m,struct mbuf ** free_chain)548 sbunlinkmbuf(struct sockbuf *sb, struct mbuf *m, struct mbuf **free_chain)
549 {
550 struct mbuf *n;
551
552 KKASSERT(sb->sb_mb == m);
553 sbfree(sb, m);
554 n = m->m_next;
555 if (n) {
556 sb->sb_mb = n;
557 if (sb->sb_lastrecord == m)
558 sb->sb_lastrecord = n;
559 KKASSERT(sb->sb_lastmbuf != m);
560 n->m_nextpkt = m->m_nextpkt;
561 } else {
562 sb->sb_mb = m->m_nextpkt;
563 if (sb->sb_lastrecord == m) {
564 KKASSERT(sb->sb_mb == NULL);
565 sb->sb_lastrecord = NULL;
566 }
567 if (sb->sb_mb == NULL)
568 sb->sb_lastmbuf = NULL;
569 }
570 m->m_nextpkt = NULL;
571 if (free_chain) {
572 m->m_next = *free_chain;
573 *free_chain = m;
574 } else {
575 m->m_next = NULL;
576 }
577 return(n);
578 }
579
580 /*
581 * Create a "control" mbuf containing the specified data
582 * with the specified type for presentation on a socket buffer.
583 */
584 struct mbuf *
sbcreatecontrol(const void * p,size_t size,int type,int level)585 sbcreatecontrol(const void *p, size_t size, int type, int level)
586 {
587 struct cmsghdr *cp;
588 struct mbuf *m;
589
590 if (CMSG_SPACE(size) > MCLBYTES)
591 return (NULL);
592 m = m_getl(CMSG_SPACE(size), M_NOWAIT, MT_CONTROL, 0, NULL);
593 if (m == NULL)
594 return (NULL);
595 m->m_len = CMSG_SPACE(size);
596 cp = mtod(m, struct cmsghdr *);
597 if (p != NULL)
598 memcpy(CMSG_DATA(cp), p, size);
599 cp->cmsg_len = CMSG_LEN(size);
600 cp->cmsg_level = level;
601 cp->cmsg_type = type;
602 mbuftrackid(m, 24);
603 return (m);
604 }
605