1 /*-
2  * Copyright (c) 2001-2002 Luigi Rizzo
3  *
4  * Supported by: the Xorp Project (www.xorp.org)
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  *
27  * $FreeBSD: src/sys/kern/kern_poll.c,v 1.2.2.4 2002/06/27 23:26:33 luigi Exp $
28  */
29 
30 #include "opt_ifpoll.h"
31 
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/ktr.h>
35 #include <sys/malloc.h>
36 #include <sys/serialize.h>
37 #include <sys/socket.h>
38 #include <sys/sysctl.h>
39 
40 #include <sys/thread2.h>
41 #include <sys/msgport2.h>
42 
43 #include <machine/atomic.h>
44 #include <machine/clock.h>
45 #include <machine/smp.h>
46 
47 #include <net/if.h>
48 #include <net/if_poll.h>
49 #include <net/netmsg2.h>
50 
51 /*
52  * Polling support for network device drivers.
53  *
54  * Drivers which support this feature try to register one status polling
55  * handler and several TX/RX polling handlers with the polling code.
56  * If the interface's if_npoll is called with a non-NULL second argument,
57  * then a register operation is requested, else a deregister operation is
58  * requested.  If the requested operation is "register", the driver should
59  * set up the ifpoll_info passed in according to its own needs:
60  *   ifpoll_info.ifpi_status.status_func == NULL
61  *     No status polling handler will be installed on CPU(0)
62  *   ifpoll_info.ifpi_rx[n].poll_func == NULL
63  *     No RX polling handler will be installed on CPU(n)
64  *   ifpoll_info.ifpi_tx[n].poll_func == NULL
65  *     No TX polling handler will be installed on CPU(n)
66  *
67  * RX is polled at the specified polling frequency (net.ifpoll.X.pollhz).
68  * TX and status polling may be done at a lower frequency than RX
69  * (net.ifpoll.0.status_frac and net.ifpoll.X.tx_frac).  To avoid systimer
70  * staggering at high frequency, the RX systimer gives TX and status polling
71  * a piggyback (XXX).
72  *
73  * All of the registered polling handlers are called only if the interface
74  * is marked as 'IFF_RUNNING and IFF_NPOLLING'.  However, the interface's
75  * register and deregister function (ifnet.if_npoll) will be called even
76  * if the interface is not marked with 'IFF_RUNNING'.
77  *
78  * If registration is successful, the driver must disable interrupts,
79  * and further I/O is performed through the TX/RX polling handlers, which
80  * are invoked (at least once per clock tick) with 3 arguments: the "arg"
81  * passed at register time, a struct ifnet pointer, and a "count" limit.
82  * The registered serializer will be held before calling the related
83  * polling handler.
84  *
85  * The count limit specifies how much work the handler can do during the
86  * call -- typically this is the number of packets to be received, or
87  * transmitted, etc. (drivers are free to interpret this number, as long
88  * as the max time spent in the function grows roughly linearly with the
89  * count).
90  *
91  * A second variable controls the sharing of CPU between polling/kernel
92  * network processing, and other activities (typically userlevel tasks):
93  * net.ifpoll.X.{rx,tx}.user_frac (between 0 and 100, default 50) sets the
94  * share of CPU allocated to user tasks.  CPU is allocated proportionally
95  * to the shares, by dynamically adjusting the "count" (poll_burst).
96  *
97  * Other parameters should be left at their default values.
98  * The following constraints hold:
99  *
100  *	1 <= poll_burst <= poll_burst_max
101  *	1 <= poll_each_burst <= poll_burst_max
102  *	MIN_POLL_BURST_MAX <= poll_burst_max <= MAX_POLL_BURST_MAX
103  */
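/*
 * The sketch below (wrapped in "#if 0", not compiled) illustrates the
 * registration contract described above for a hypothetical single-queue
 * driver; every exdrv_* name and softc field is made up for illustration
 * and the helper functions are omitted.
 */
#if 0
static void
exdrv_npoll(struct ifnet *ifp, struct ifpoll_info *info)
{
	struct exdrv_softc *sc = ifp->if_softc;

	if (info != NULL) {
		int cpu = 0;	/* poll everything on CPU(0) in this sketch */

		/* Status polling; runs on CPU(0) every (status_frac + 1) ticks. */
		info->ifpi_status.status_func = exdrv_npoll_status;
		info->ifpi_status.serializer = &sc->sc_serializer;

		/* RX polled at pollhz, TX at pollhz / (tx_frac + 1), on CPU(cpu). */
		info->ifpi_rx[cpu].poll_func = exdrv_npoll_rx;
		info->ifpi_rx[cpu].arg = sc;
		info->ifpi_rx[cpu].serializer = &sc->sc_serializer;

		info->ifpi_tx[cpu].poll_func = exdrv_npoll_tx;
		info->ifpi_tx[cpu].arg = sc;
		info->ifpi_tx[cpu].serializer = &sc->sc_serializer;

		/* Interrupts must be disabled while polling is active. */
		exdrv_disable_intr(sc);
		ifp->if_npoll_cpuid = cpu;
	} else {
		/* Deregister: re-enable interrupts, clear the polling cpuid. */
		exdrv_enable_intr(sc);
		ifp->if_npoll_cpuid = -1;
	}
}

/*
 * RX poll handler: "count" is the burst budget handed out by the polling
 * code; the work done should stay roughly linear in "count".
 */
static void
exdrv_npoll_rx(struct ifnet *ifp, void *arg, int count)
{
	struct exdrv_softc *sc = arg;

	exdrv_rxeof(sc, count);		/* receive at most "count" packets */
}
#endif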
104 
105 #define IFPOLL_LIST_LEN		128
106 #define IFPOLL_FREQ_MAX		30000
107 
108 #define MIN_IOPOLL_BURST_MAX	10
109 #define MAX_IOPOLL_BURST_MAX	5000
110 #define IOPOLL_BURST_MAX	375	/* good for 1000Mbit net and HZ=4000 */
111 
112 #define IOPOLL_EACH_BURST	15
113 
114 #define IFPOLL_FREQ_DEFAULT	4000
115 
116 #define IFPOLL_TXFRAC_DEFAULT	0	/* 1/1 of the pollhz */
117 #define IFPOLL_STFRAC_DEFAULT	39	/* 1/40 of the pollhz */
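/*
 * With the defaults above, pollhz = 4000, tx_frac = 0 and status_frac = 39,
 * RX is polled at 4000Hz, TX at 4000 / (0 + 1) = 4000Hz and status at
 * 4000 / (39 + 1) = 100Hz.
 */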
118 
119 #define IFPOLL_RX		0x1
120 #define IFPOLL_TX		0x2
121 
122 union ifpoll_time {
123 	struct timeval		tv;
124 	uint64_t		tsc;
125 };
126 
127 struct iopoll_rec {
128 	struct lwkt_serialize	*serializer;
129 	struct ifnet		*ifp;
130 	void			*arg;
131 	ifpoll_iofn_t		poll_func;
132 };
133 
134 struct iopoll_ctx {
135 	union ifpoll_time	prev_t;
136 	u_long			short_ticks;		/* statistics */
137 	u_long			lost_polls;		/* statistics */
138 	u_long			suspect;		/* statistics */
139 	u_long			stalled;		/* statistics */
140 	uint32_t		pending_polls;		/* state */
141 
142 	struct netmsg_base	poll_netmsg;
143 	struct netmsg_base	poll_more_netmsg;
144 
145 	int			poll_cpuid;
146 	int			pollhz;
147 	uint32_t		phase;			/* state */
148 	int			residual_burst;		/* state */
149 	uint32_t		poll_each_burst;	/* tunable */
150 	union ifpoll_time	poll_start_t;		/* state */
151 
152 	uint32_t		poll_burst;		/* state */
153 	uint32_t		poll_burst_max;		/* tunable */
154 	uint32_t		user_frac;		/* tunable */
155 	uint32_t		kern_frac;		/* state */
156 
157 	uint32_t		poll_handlers; /* next free entry in pr[]. */
158 	struct iopoll_rec	pr[IFPOLL_LIST_LEN];
159 
160 	struct sysctl_ctx_list	poll_sysctl_ctx;
161 	struct sysctl_oid	*poll_sysctl_tree;
162 } __cachealign;
163 
164 struct poll_comm {
165 	struct systimer		pollclock;
166 	int			poll_cpuid;
167 
168 	int			stfrac_count;		/* state */
169 	int			poll_stfrac;		/* tunable */
170 
171 	int			txfrac_count;		/* state */
172 	int			poll_txfrac;		/* tunable */
173 
174 	int			pollhz;			/* tunable */
175 
176 	struct sysctl_ctx_list	sysctl_ctx;
177 	struct sysctl_oid	*sysctl_tree;
178 } __cachealign;
179 
180 struct stpoll_rec {
181 	struct lwkt_serialize	*serializer;
182 	struct ifnet		*ifp;
183 	ifpoll_stfn_t		status_func;
184 };
185 
186 struct stpoll_ctx {
187 	struct netmsg_base	poll_netmsg;
188 
189 	uint32_t		poll_handlers; /* next free entry in pr[]. */
190 	struct stpoll_rec	pr[IFPOLL_LIST_LEN];
191 
192 	struct sysctl_ctx_list	poll_sysctl_ctx;
193 	struct sysctl_oid	*poll_sysctl_tree;
194 } __cachealign;
195 
196 struct iopoll_sysctl_netmsg {
197 	struct netmsg_base	base;
198 	struct iopoll_ctx	*ctx;
199 };
200 
201 void		ifpoll_init_pcpu(int);
202 static void	ifpoll_register_handler(netmsg_t);
203 static void	ifpoll_deregister_handler(netmsg_t);
204 
205 /*
206  * Status polling
207  */
208 static void	stpoll_init(void);
209 static void	stpoll_handler(netmsg_t);
210 static void	stpoll_clock(struct stpoll_ctx *);
211 static int	stpoll_register(struct ifnet *, const struct ifpoll_status *);
212 static int	stpoll_deregister(struct ifnet *);
213 
214 /*
215  * RX/TX polling
216  */
217 static struct iopoll_ctx *iopoll_ctx_create(int, int);
218 static void	iopoll_init(int);
219 static void	rxpoll_handler(netmsg_t);
220 static void	txpoll_handler(netmsg_t);
221 static void	rxpollmore_handler(netmsg_t);
222 static void	txpollmore_handler(netmsg_t);
223 static void	iopoll_clock(struct iopoll_ctx *);
224 static int	iopoll_register(struct ifnet *, struct iopoll_ctx *,
225 		    const struct ifpoll_io *);
226 static int	iopoll_deregister(struct ifnet *, struct iopoll_ctx *);
227 
228 static void	iopoll_add_sysctl(struct sysctl_ctx_list *,
229 		    struct sysctl_oid_list *, struct iopoll_ctx *, int);
230 static void	sysctl_burstmax_handler(netmsg_t);
231 static int	sysctl_burstmax(SYSCTL_HANDLER_ARGS);
232 static void	sysctl_eachburst_handler(netmsg_t);
233 static int	sysctl_eachburst(SYSCTL_HANDLER_ARGS);
234 
235 /*
236  * Common functions
237  */
238 static void	poll_comm_init(int);
239 static void	poll_comm_start(int);
240 static void	poll_comm_adjust_pollhz(struct poll_comm *);
241 static void	poll_comm_systimer0(systimer_t, int, struct intrframe *);
242 static void	poll_comm_systimer(systimer_t, int, struct intrframe *);
243 static void	sysctl_pollhz_handler(netmsg_t);
244 static void	sysctl_stfrac_handler(netmsg_t);
245 static void	sysctl_txfrac_handler(netmsg_t);
246 static int	sysctl_pollhz(SYSCTL_HANDLER_ARGS);
247 static int	sysctl_stfrac(SYSCTL_HANDLER_ARGS);
248 static int	sysctl_txfrac(SYSCTL_HANDLER_ARGS);
249 
250 static struct stpoll_ctx	stpoll_context;
251 static struct poll_comm		*poll_common[MAXCPU];
252 static struct iopoll_ctx	*rxpoll_context[MAXCPU];
253 static struct iopoll_ctx	*txpoll_context[MAXCPU];
254 
255 SYSCTL_NODE(_net, OID_AUTO, ifpoll, CTLFLAG_RW, 0,
256 	    "Network device polling parameters");
257 
258 static int	iopoll_burst_max = IOPOLL_BURST_MAX;
259 static int	iopoll_each_burst = IOPOLL_EACH_BURST;
260 
261 static int	ifpoll_pollhz = IFPOLL_FREQ_DEFAULT;
262 static int	ifpoll_stfrac = IFPOLL_STFRAC_DEFAULT;
263 static int	ifpoll_txfrac = IFPOLL_TXFRAC_DEFAULT;
264 
265 TUNABLE_INT("net.ifpoll.burst_max", &iopoll_burst_max);
266 TUNABLE_INT("net.ifpoll.each_burst", &iopoll_each_burst);
267 TUNABLE_INT("net.ifpoll.pollhz", &ifpoll_pollhz);
268 TUNABLE_INT("net.ifpoll.status_frac", &ifpoll_stfrac);
269 TUNABLE_INT("net.ifpoll.tx_frac", &ifpoll_txfrac);
270 
271 static __inline void
272 ifpoll_sendmsg_oncpu(netmsg_t msg)
273 {
274 	if (msg->lmsg.ms_flags & MSGF_DONE)
275 		lwkt_sendmsg(netisr_portfn(mycpuid), &msg->lmsg);
276 }
277 
278 static __inline void
279 sched_stpoll(struct stpoll_ctx *st_ctx)
280 {
281 	ifpoll_sendmsg_oncpu((netmsg_t)&st_ctx->poll_netmsg);
282 }
283 
284 static __inline void
285 sched_iopoll(struct iopoll_ctx *io_ctx)
286 {
287 	ifpoll_sendmsg_oncpu((netmsg_t)&io_ctx->poll_netmsg);
288 }
289 
290 static __inline void
291 sched_iopollmore(struct iopoll_ctx *io_ctx)
292 {
293 	ifpoll_sendmsg_oncpu((netmsg_t)&io_ctx->poll_more_netmsg);
294 }
295 
296 static __inline void
297 ifpoll_time_get(union ifpoll_time *t)
298 {
299 	if (__predict_true(tsc_present))
300 		t->tsc = rdtsc();
301 	else
302 		microuptime(&t->tv);
303 }
304 
305 /* Return time diff in us */
306 static __inline int
307 ifpoll_time_diff(const union ifpoll_time *s, const union ifpoll_time *e)
308 {
309 	if (__predict_true(tsc_present)) {
310 		return (((e->tsc - s->tsc) * 1000000) / tsc_frequency);
311 	} else {
312 		return ((e->tv.tv_usec - s->tv.tv_usec) +
313 			(e->tv.tv_sec - s->tv.tv_sec) * 1000000);
314 	}
315 }
316 
317 /*
318  * Initialize the per-cpu polling context.  Called from kern_clock.c.
319  */
320 void
321 ifpoll_init_pcpu(int cpuid)
322 {
323 	if (cpuid >= ncpus2)
324 		return;
325 
326 	poll_comm_init(cpuid);
327 
328 	if (cpuid == 0)
329 		stpoll_init();
330 	iopoll_init(cpuid);
331 
332 	poll_comm_start(cpuid);
333 }
334 
335 int
336 ifpoll_register(struct ifnet *ifp)
337 {
338 	struct ifpoll_info *info;
339 	struct netmsg_base nmsg;
340 	int error;
341 
342 	if (ifp->if_npoll == NULL) {
343 		/* Device does not support polling */
344 		return EOPNOTSUPP;
345 	}
346 
347 	info = kmalloc(sizeof(*info), M_TEMP, M_WAITOK | M_ZERO);
348 
349 	/*
350 	 * Attempt to register.  Interlock with IFF_NPOLLING.
351 	 */
352 
353 	ifnet_serialize_all(ifp);
354 
355 	if (ifp->if_flags & IFF_NPOLLING) {
356 		/* Already polling */
357 		ifnet_deserialize_all(ifp);
358 		kfree(info, M_TEMP);
359 		return EBUSY;
360 	}
361 
362 	info->ifpi_ifp = ifp;
363 
364 	ifp->if_flags |= IFF_NPOLLING;
365 	ifp->if_npoll(ifp, info);
366 	KASSERT(ifp->if_npoll_cpuid >= 0, ("invalid npoll cpuid"));
367 
368 	ifnet_deserialize_all(ifp);
369 
370 	netmsg_init(&nmsg, NULL, &curthread->td_msgport,
371 		    0, ifpoll_register_handler);
372 	nmsg.lmsg.u.ms_resultp = info;
373 
374 	error = lwkt_domsg(netisr_portfn(0), &nmsg.lmsg, 0);
375 	if (error) {
376 		if (ifpoll_deregister(ifp)) {
377 			if_printf(ifp, "ifpoll_register: "
378 				  "ifpoll_deregister failed!\n");
379 		}
380 	}
381 
382 	kfree(info, M_TEMP);
383 	return error;
384 }
385 
386 int
387 ifpoll_deregister(struct ifnet *ifp)
388 {
389 	struct netmsg_base nmsg;
390 	int error;
391 
392 	if (ifp->if_npoll == NULL)
393 		return EOPNOTSUPP;
394 
395 	ifnet_serialize_all(ifp);
396 
397 	if ((ifp->if_flags & IFF_NPOLLING) == 0) {
398 		ifnet_deserialize_all(ifp);
399 		return EINVAL;
400 	}
401 	ifp->if_flags &= ~IFF_NPOLLING;
402 
403 	ifnet_deserialize_all(ifp);
404 
405 	netmsg_init(&nmsg, NULL, &curthread->td_msgport,
406 		    0, ifpoll_deregister_handler);
407 	nmsg.lmsg.u.ms_resultp = ifp;
408 
409 	error = lwkt_domsg(netisr_portfn(0), &nmsg.lmsg, 0);
410 	if (!error) {
411 		ifnet_serialize_all(ifp);
412 		ifp->if_npoll(ifp, NULL);
413 		KASSERT(ifp->if_npoll_cpuid < 0, ("invalid npoll cpuid"));
414 		ifnet_deserialize_all(ifp);
415 	}
416 	return error;
417 }
418 
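/*
 * The register/deregister netmsg is first dispatched to netisr0; each CPU's
 * handler below registers or deregisters its own RX/TX handlers and then
 * forwards the same message to the next CPU with lwkt_forwardmsg(), so the
 * message walks every netisr in order and the last CPU replies.  On a
 * registration error the message is replied immediately with the error and
 * ifpoll_register() rolls back by calling ifpoll_deregister().
 */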
419 static void
420 ifpoll_register_handler(netmsg_t nmsg)
421 {
422 	const struct ifpoll_info *info = nmsg->lmsg.u.ms_resultp;
423 	int cpuid = mycpuid, nextcpu;
424 	int error;
425 
426 	KKASSERT(cpuid < ncpus2);
427 	KKASSERT(&curthread->td_msgport == netisr_portfn(cpuid));
428 
429 	if (cpuid == 0) {
430 		error = stpoll_register(info->ifpi_ifp, &info->ifpi_status);
431 		if (error)
432 			goto failed;
433 	}
434 
435 	error = iopoll_register(info->ifpi_ifp, rxpoll_context[cpuid],
436 				&info->ifpi_rx[cpuid]);
437 	if (error)
438 		goto failed;
439 
440 	error = iopoll_register(info->ifpi_ifp, txpoll_context[cpuid],
441 				&info->ifpi_tx[cpuid]);
442 	if (error)
443 		goto failed;
444 
445 	/* Adjust polling frequency, after all registration is done */
446 	poll_comm_adjust_pollhz(poll_common[cpuid]);
447 
448 	nextcpu = cpuid + 1;
449 	if (nextcpu < ncpus2)
450 		lwkt_forwardmsg(netisr_portfn(nextcpu), &nmsg->lmsg);
451 	else
452 		lwkt_replymsg(&nmsg->lmsg, 0);
453 	return;
454 failed:
455 	lwkt_replymsg(&nmsg->lmsg, error);
456 }
457 
458 static void
459 ifpoll_deregister_handler(netmsg_t nmsg)
460 {
461 	struct ifnet *ifp = nmsg->lmsg.u.ms_resultp;
462 	int cpuid = mycpuid, nextcpu;
463 
464 	KKASSERT(cpuid < ncpus2);
465 	KKASSERT(&curthread->td_msgport == netisr_portfn(cpuid));
466 
467 	/* Ignore errors */
468 	if (cpuid == 0)
469 		stpoll_deregister(ifp);
470 	iopoll_deregister(ifp, rxpoll_context[cpuid]);
471 	iopoll_deregister(ifp, txpoll_context[cpuid]);
472 
473 	/* Adjust polling frequency, after all deregistration is done */
474 	poll_comm_adjust_pollhz(poll_common[cpuid]);
475 
476 	nextcpu = cpuid + 1;
477 	if (nextcpu < ncpus2)
478 		lwkt_forwardmsg(netisr_portfn(nextcpu), &nmsg->lmsg);
479 	else
480 		lwkt_replymsg(&nmsg->lmsg, 0);
481 }
482 
483 static void
484 stpoll_init(void)
485 {
486 	struct stpoll_ctx *st_ctx = &stpoll_context;
487 	const struct poll_comm *comm = poll_common[0];
488 
489 	sysctl_ctx_init(&st_ctx->poll_sysctl_ctx);
490 	st_ctx->poll_sysctl_tree = SYSCTL_ADD_NODE(&st_ctx->poll_sysctl_ctx,
491 				   SYSCTL_CHILDREN(comm->sysctl_tree),
492 				   OID_AUTO, "status", CTLFLAG_RD, 0, "");
493 
494 	SYSCTL_ADD_UINT(&st_ctx->poll_sysctl_ctx,
495 			SYSCTL_CHILDREN(st_ctx->poll_sysctl_tree),
496 			OID_AUTO, "handlers", CTLFLAG_RD,
497 			&st_ctx->poll_handlers, 0,
498 			"Number of registered status poll handlers");
499 
500 	netmsg_init(&st_ctx->poll_netmsg, NULL, &netisr_adone_rport,
501 		    0, stpoll_handler);
502 }
503 
504 /*
505  * stpoll_handler is scheduled by sched_stpoll when appropriate, typically
506  * once per polling systimer tick.
507  */
508 static void
509 stpoll_handler(netmsg_t msg)
510 {
511 	struct stpoll_ctx *st_ctx = &stpoll_context;
512 	struct thread *td = curthread;
513 	int i;
514 
515 	KKASSERT(&td->td_msgport == netisr_portfn(0));
516 
517 	crit_enter_quick(td);
518 
519 	/* Reply ASAP */
520 	lwkt_replymsg(&msg->lmsg, 0);
521 
522 	if (st_ctx->poll_handlers == 0) {
523 		crit_exit_quick(td);
524 		return;
525 	}
526 
527 	for (i = 0; i < st_ctx->poll_handlers; ++i) {
528 		const struct stpoll_rec *rec = &st_ctx->pr[i];
529 		struct ifnet *ifp = rec->ifp;
530 
531 		if (!lwkt_serialize_try(rec->serializer))
532 			continue;
533 
534 		if ((ifp->if_flags & (IFF_RUNNING | IFF_NPOLLING)) ==
535 		    (IFF_RUNNING | IFF_NPOLLING))
536 			rec->status_func(ifp);
537 
538 		lwkt_serialize_exit(rec->serializer);
539 	}
540 
541 	crit_exit_quick(td);
542 }
543 
544 /*
545  * Hook from the status poll systimer.  Tries to schedule a status poll.
546  * NOTE: Caller should hold critical section.
547  */
548 static void
549 stpoll_clock(struct stpoll_ctx *st_ctx)
550 {
551 	KKASSERT(mycpuid == 0);
552 
553 	if (st_ctx->poll_handlers == 0)
554 		return;
555 	sched_stpoll(st_ctx);
556 }
557 
558 static int
559 stpoll_register(struct ifnet *ifp, const struct ifpoll_status *st_rec)
560 {
561 	struct stpoll_ctx *st_ctx = &stpoll_context;
562 	int error;
563 
564 	KKASSERT(&curthread->td_msgport == netisr_portfn(0));
565 
566 	if (st_rec->status_func == NULL)
567 		return 0;
568 
569 	/*
570 	 * Check if there is room.
571 	 */
572 	if (st_ctx->poll_handlers >= IFPOLL_LIST_LEN) {
573 		/*
574 		 * List full, cannot register more entries.
575 		 * This should never happen; if it does, it is probably a
576 		 * broken driver trying to register multiple times. Checking
577 		 * this at runtime is expensive, and won't solve the problem
578  * anyway, so just report a few times and then give up.
579 		 */
580 		static int verbose = 10; /* XXX */
581 
582 		if (verbose > 0) {
583 			kprintf("status poll handlers list full, "
584 				"maybe a broken driver ?\n");
585 			verbose--;
586 		}
587 		error = ENOENT;
588 	} else {
589 		struct stpoll_rec *rec = &st_ctx->pr[st_ctx->poll_handlers];
590 
591 		rec->ifp = ifp;
592 		rec->serializer = st_rec->serializer;
593 		rec->status_func = st_rec->status_func;
594 
595 		st_ctx->poll_handlers++;
596 		error = 0;
597 	}
598 	return error;
599 }
600 
601 static int
602 stpoll_deregister(struct ifnet *ifp)
603 {
604 	struct stpoll_ctx *st_ctx = &stpoll_context;
605 	int i, error;
606 
607 	KKASSERT(&curthread->td_msgport == netisr_portfn(0));
608 
609 	for (i = 0; i < st_ctx->poll_handlers; ++i) {
610 		if (st_ctx->pr[i].ifp == ifp) /* Found it */
611 			break;
612 	}
613 	if (i == st_ctx->poll_handlers) {
614 		error = ENOENT;
615 	} else {
616 		st_ctx->poll_handlers--;
617 		if (i < st_ctx->poll_handlers) {
618 			/* Last entry replaces this one. */
619 			st_ctx->pr[i] = st_ctx->pr[st_ctx->poll_handlers];
620 		}
621 		error = 0;
622 	}
623 	return error;
624 }
625 
626 static __inline void
627 iopoll_reset_state(struct iopoll_ctx *io_ctx)
628 {
629 	crit_enter();
630 	io_ctx->poll_burst = io_ctx->poll_each_burst;
631 	io_ctx->pending_polls = 0;
632 	io_ctx->residual_burst = 0;
633 	io_ctx->phase = 0;
634 	io_ctx->kern_frac = 0;
635 	bzero(&io_ctx->poll_start_t, sizeof(io_ctx->poll_start_t));
636 	bzero(&io_ctx->prev_t, sizeof(io_ctx->prev_t));
637 	crit_exit();
638 }
639 
640 static void
641 iopoll_init(int cpuid)
642 {
643 	KKASSERT(cpuid < ncpus2);
644 
645 	rxpoll_context[cpuid] = iopoll_ctx_create(cpuid, IFPOLL_RX);
646 	txpoll_context[cpuid] = iopoll_ctx_create(cpuid, IFPOLL_TX);
647 }
648 
649 static struct iopoll_ctx *
650 iopoll_ctx_create(int cpuid, int poll_type)
651 {
652 	struct poll_comm *comm;
653 	struct iopoll_ctx *io_ctx;
654 	const char *poll_type_str;
655 	netisr_fn_t handler, more_handler;
656 
657 	KKASSERT(poll_type == IFPOLL_RX || poll_type == IFPOLL_TX);
658 
659 	/*
660 	 * Make sure that tunables are in sane state
661 	 */
662 	if (iopoll_burst_max < MIN_IOPOLL_BURST_MAX)
663 		iopoll_burst_max = MIN_IOPOLL_BURST_MAX;
664 	else if (iopoll_burst_max > MAX_IOPOLL_BURST_MAX)
665 		iopoll_burst_max = MAX_IOPOLL_BURST_MAX;
666 
667 	if (iopoll_each_burst > iopoll_burst_max)
668 		iopoll_each_burst = iopoll_burst_max;
669 
670 	comm = poll_common[cpuid];
671 
672 	/*
673 	 * Create the per-cpu polling context
674 	 */
675 	io_ctx = kmalloc_cachealign(sizeof(*io_ctx), M_DEVBUF,
676 	    M_WAITOK | M_ZERO);
677 
678 	io_ctx->poll_each_burst = iopoll_each_burst;
679 	io_ctx->poll_burst_max = iopoll_burst_max;
680 	io_ctx->user_frac = 50;
681 	if (poll_type == IFPOLL_RX)
682 		io_ctx->pollhz = comm->pollhz;
683 	else
684 		io_ctx->pollhz = comm->pollhz / (comm->poll_txfrac + 1);
685 	io_ctx->poll_cpuid = cpuid;
686 	iopoll_reset_state(io_ctx);
687 
688 	if (poll_type == IFPOLL_RX) {
689 		handler = rxpoll_handler;
690 		more_handler = rxpollmore_handler;
691 	} else {
692 		handler = txpoll_handler;
693 		more_handler = txpollmore_handler;
694 	}
695 
696 	netmsg_init(&io_ctx->poll_netmsg, NULL, &netisr_adone_rport,
697 	    0, handler);
698 	io_ctx->poll_netmsg.lmsg.u.ms_resultp = io_ctx;
699 
700 	netmsg_init(&io_ctx->poll_more_netmsg, NULL, &netisr_adone_rport,
701 	    0, more_handler);
702 	io_ctx->poll_more_netmsg.lmsg.u.ms_resultp = io_ctx;
703 
704 	/*
705 	 * Initialize per-cpu sysctl nodes
706 	 */
707 	if (poll_type == IFPOLL_RX)
708 		poll_type_str = "rx";
709 	else
710 		poll_type_str = "tx";
711 
712 	sysctl_ctx_init(&io_ctx->poll_sysctl_ctx);
713 	io_ctx->poll_sysctl_tree = SYSCTL_ADD_NODE(&io_ctx->poll_sysctl_ctx,
714 				   SYSCTL_CHILDREN(comm->sysctl_tree),
715 				   OID_AUTO, poll_type_str, CTLFLAG_RD, 0, "");
716 	iopoll_add_sysctl(&io_ctx->poll_sysctl_ctx,
717 	    SYSCTL_CHILDREN(io_ctx->poll_sysctl_tree), io_ctx, poll_type);
718 
719 	return io_ctx;
720 }
721 
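/*
 * Rough meaning of the iopoll_ctx "phase" state:
 *   0 - idle, nothing pending
 *   1 - iopoll_clock() is about to schedule a poll
 *   2 - poll scheduled, waiting for the netisr to run it
 *   3 - rxpoll_handler()/txpoll_handler() running
 *   4 - *pollmore handler scheduled from the poll handler
 *   5 - rxpollmore_handler()/txpollmore_handler() running (RX stays here
 *       while finishing a residual burst)
 *   6 - *pollmore rescheduled the poll handler because hardclock ticks
 *       were missed (pending_polls != 0)
 * Seeing phase != 0 while it is still <= 2 in iopoll_clock() means the
 * previously scheduled poll never ran; such events are counted in "suspect".
 */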
722 /*
723  * Hook from iopoll systimer.  Tries to schedule an iopoll, but keeps
724  * track of lost ticks due to the previous handler taking too long.
725  * Normally, this should not happen, because the polling handler should
726  * run for a short time.  However, in some cases (e.g. when there are
727  * changes in link status etc.) the drivers take a very long time
728  * (even in the order of milliseconds) to reset and reconfigure the
729  * device, causing apparent lost polls.
730  *
731  * The first part of the code is just for debugging purposes, and tries to
732  * count how often hardclock ticks are shorter than they should be (i.e.
733  * delta * pollhz < 500000), meaning either stray interrupts or delayed events.
734  *
735  * WARNING! called from fastint or IPI, the MP lock might not be held.
736  * NOTE: Caller should hold critical section.
737  */
738 static void
739 iopoll_clock(struct iopoll_ctx *io_ctx)
740 {
741 	union ifpoll_time t;
742 	int delta;
743 
744 	KKASSERT(mycpuid == io_ctx->poll_cpuid);
745 
746 	if (io_ctx->poll_handlers == 0)
747 		return;
748 
749 	ifpoll_time_get(&t);
750 	delta = ifpoll_time_diff(&io_ctx->prev_t, &t);
751 	if (delta * io_ctx->pollhz < 500000)
752 		io_ctx->short_ticks++;
753 	else
754 		io_ctx->prev_t = t;
755 
756 	if (io_ctx->pending_polls > 100) {
757 		/*
758 		 * Too many, assume it has stalled (not always true,
759 		 * see comment above).
760 		 */
761 		io_ctx->stalled++;
762 		io_ctx->pending_polls = 0;
763 		io_ctx->phase = 0;
764 	}
765 
766 	if (io_ctx->phase <= 2) {
767 		if (io_ctx->phase != 0)
768 			io_ctx->suspect++;
769 		io_ctx->phase = 1;
770 		sched_iopoll(io_ctx);
771 		io_ctx->phase = 2;
772 	}
773 	if (io_ctx->pending_polls++ > 0)
774 		io_ctx->lost_polls++;
775 }
776 
777 /*
778  * rxpoll_handler and txpoll_handler are scheduled by sched_iopoll when
779  * appropriate, typically once per polling systimer tick.
780  *
781  * Note that the message is replied immediately in order to allow a new
782  * ISR to be scheduled in the handler.
783  */
784 static void
785 rxpoll_handler(netmsg_t msg)
786 {
787 	struct iopoll_ctx *io_ctx;
788 	struct thread *td = curthread;
789 	int i, cycles;
790 
791 	io_ctx = msg->lmsg.u.ms_resultp;
792 	KKASSERT(&td->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
793 
794 	crit_enter_quick(td);
795 
796 	/* Reply ASAP */
797 	lwkt_replymsg(&msg->lmsg, 0);
798 
799 	if (io_ctx->poll_handlers == 0) {
800 		crit_exit_quick(td);
801 		return;
802 	}
803 
804 	io_ctx->phase = 3;
805 	if (io_ctx->residual_burst == 0) {
806 		/* First call in this tick */
807 		ifpoll_time_get(&io_ctx->poll_start_t);
808 		io_ctx->residual_burst = io_ctx->poll_burst;
809 	}
810 	cycles = (io_ctx->residual_burst < io_ctx->poll_each_burst) ?
811 		 io_ctx->residual_burst : io_ctx->poll_each_burst;
812 	io_ctx->residual_burst -= cycles;
813 
814 	for (i = 0; i < io_ctx->poll_handlers; i++) {
815 		const struct iopoll_rec *rec = &io_ctx->pr[i];
816 		struct ifnet *ifp = rec->ifp;
817 
818 		if (!lwkt_serialize_try(rec->serializer))
819 			continue;
820 
821 		if ((ifp->if_flags & (IFF_RUNNING | IFF_NPOLLING)) ==
822 		    (IFF_RUNNING | IFF_NPOLLING))
823 			rec->poll_func(ifp, rec->arg, cycles);
824 
825 		lwkt_serialize_exit(rec->serializer);
826 	}
827 
828 	/*
829 	 * Do a quick exit/enter to catch any higher-priority
830 	 * interrupt sources.
831 	 */
832 	crit_exit_quick(td);
833 	crit_enter_quick(td);
834 
835 	sched_iopollmore(io_ctx);
836 	io_ctx->phase = 4;
837 
838 	crit_exit_quick(td);
839 }
840 
841 static void
842 txpoll_handler(netmsg_t msg)
843 {
844 	struct iopoll_ctx *io_ctx;
845 	struct thread *td = curthread;
846 	int i;
847 
848 	io_ctx = msg->lmsg.u.ms_resultp;
849 	KKASSERT(&td->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
850 
851 	crit_enter_quick(td);
852 
853 	/* Reply ASAP */
854 	lwkt_replymsg(&msg->lmsg, 0);
855 
856 	if (io_ctx->poll_handlers == 0) {
857 		crit_exit_quick(td);
858 		return;
859 	}
860 
861 	io_ctx->phase = 3;
862 
863 	for (i = 0; i < io_ctx->poll_handlers; i++) {
864 		const struct iopoll_rec *rec = &io_ctx->pr[i];
865 		struct ifnet *ifp = rec->ifp;
866 
867 		if (!lwkt_serialize_try(rec->serializer))
868 			continue;
869 
870 		if ((ifp->if_flags & (IFF_RUNNING | IFF_NPOLLING)) ==
871 		    (IFF_RUNNING | IFF_NPOLLING))
872 			rec->poll_func(ifp, rec->arg, -1);
873 
874 		lwkt_serialize_exit(rec->serializer);
875 	}
876 
877 	/*
878 	 * Do a quick exit/enter to catch any higher-priority
879 	 * interrupt sources.
880 	 */
881 	crit_exit_quick(td);
882 	crit_enter_quick(td);
883 
884 	sched_iopollmore(io_ctx);
885 	io_ctx->phase = 4;
886 
887 	crit_exit_quick(td);
888 }
889 
890 /*
891  * rxpollmore_handler and txpollmore_handler are called after other netisrs,
892  * possibly scheduling another rxpoll_handler or txpoll_handler call, or
893  * adapting the burst size for the next cycle.
894  *
895  * It is very bad to fetch large bursts of packets from a single card at once,
896  * because the burst could take a long time to be completely processed,
897  * leading to unfairness.  To reduce the problem, and also to account better
898  * for the time spent in network-related processing, we split the burst into
899  * smaller chunks of fixed size, giving control to the other netisrs between
900  * chunks.  This helps improve fairness, reduce livelock, and account for the
901  * work performed in low-level handling.
902  */
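/*
 * kern_frac below is the share of one polling tick spent in the poll
 * handlers: the measured time is in microseconds and a tick lasts
 * 1000000 / pollhz microseconds, so (us * pollhz) / 10000 is a percentage.
 * For example, at pollhz = 4000 a burst that took 150us gives
 * kern_frac = 150 * 4000 / 10000 = 60; with the default user_frac of 50
 * that exceeds 100 - user_frac, so the burst size is decreased.
 */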
903 static void
904 rxpollmore_handler(netmsg_t msg)
905 {
906 	struct thread *td = curthread;
907 	struct iopoll_ctx *io_ctx;
908 	union ifpoll_time t;
909 	int kern_load;
910 	uint32_t pending_polls;
911 
912 	io_ctx = msg->lmsg.u.ms_resultp;
913 	KKASSERT(&td->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
914 
915 	crit_enter_quick(td);
916 
917 	/* Reply ASAP */
918 	lwkt_replymsg(&msg->lmsg, 0);
919 
920 	if (io_ctx->poll_handlers == 0) {
921 		crit_exit_quick(td);
922 		return;
923 	}
924 
925 	io_ctx->phase = 5;
926 	if (io_ctx->residual_burst > 0) {
927 		sched_iopoll(io_ctx);
928 		crit_exit_quick(td);
929 		/* Will run immediately on return, followed by netisrs */
930 		return;
931 	}
932 
933 	/* Here we can account for the time spent in iopoll handlers in this tick */
934 	ifpoll_time_get(&t);
935 	kern_load = ifpoll_time_diff(&io_ctx->poll_start_t, &t);
936 	kern_load = (kern_load * io_ctx->pollhz) / 10000; /* 0..100 */
937 	io_ctx->kern_frac = kern_load;
938 
939 	if (kern_load > (100 - io_ctx->user_frac)) {
940 		/* Try to decrease the burst size */
941 		if (io_ctx->poll_burst > 1)
942 			io_ctx->poll_burst--;
943 	} else {
944 		if (io_ctx->poll_burst < io_ctx->poll_burst_max)
945 			io_ctx->poll_burst++;
946 	}
947 
948 	io_ctx->pending_polls--;
949 	pending_polls = io_ctx->pending_polls;
950 
951 	if (pending_polls == 0) {
952 		/* We are done */
953 		io_ctx->phase = 0;
954 	} else {
955 		/*
956 		 * Last cycle was long and caused us to miss one or more
957 		 * hardclock ticks.  Restart processing again, but slightly
958 		 * reduce the burst size to prevent this from happening again.
959 		 */
960 		io_ctx->poll_burst -= (io_ctx->poll_burst / 8);
961 		if (io_ctx->poll_burst < 1)
962 			io_ctx->poll_burst = 1;
963 		sched_iopoll(io_ctx);
964 		io_ctx->phase = 6;
965 	}
966 
967 	crit_exit_quick(td);
968 }
969 
970 static void
971 txpollmore_handler(netmsg_t msg)
972 {
973 	struct thread *td = curthread;
974 	struct iopoll_ctx *io_ctx;
975 	uint32_t pending_polls;
976 
977 	io_ctx = msg->lmsg.u.ms_resultp;
978 	KKASSERT(&td->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
979 
980 	crit_enter_quick(td);
981 
982 	/* Reply ASAP */
983 	lwkt_replymsg(&msg->lmsg, 0);
984 
985 	if (io_ctx->poll_handlers == 0) {
986 		crit_exit_quick(td);
987 		return;
988 	}
989 
990 	io_ctx->phase = 5;
991 
992 	io_ctx->pending_polls--;
993 	pending_polls = io_ctx->pending_polls;
994 
995 	if (pending_polls == 0) {
996 		/* We are done */
997 		io_ctx->phase = 0;
998 	} else {
999 		/*
1000 		 * Last cycle was long and caused us to miss one or more
1001 		 * hardclock ticks.  Restart processing again.
1002 		 */
1003 		sched_iopoll(io_ctx);
1004 		io_ctx->phase = 6;
1005 	}
1006 
1007 	crit_exit_quick(td);
1008 }
1009 
1010 static void
1011 iopoll_add_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *parent,
1012     struct iopoll_ctx *io_ctx, int poll_type)
1013 {
1014 	if (poll_type == IFPOLL_RX) {
1015 		SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "burst_max",
1016 		    CTLTYPE_UINT | CTLFLAG_RW, io_ctx, 0, sysctl_burstmax,
1017 		    "IU", "Max Polling burst size");
1018 
1019 		SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "each_burst",
1020 		    CTLTYPE_UINT | CTLFLAG_RW, io_ctx, 0, sysctl_eachburst,
1021 		    "IU", "Max size of each burst");
1022 
1023 		SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "burst", CTLFLAG_RD,
1024 		    &io_ctx->poll_burst, 0, "Current polling burst size");
1025 
1026 		SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "user_frac", CTLFLAG_RW,
1027 		    &io_ctx->user_frac, 0, "Desired user fraction of cpu time");
1028 
1029 		SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "kern_frac", CTLFLAG_RD,
1030 		    &io_ctx->kern_frac, 0, "Kernel fraction of cpu time");
1031 
1032 		SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "residual_burst", CTLFLAG_RD,
1033 		    &io_ctx->residual_burst, 0,
1034 		    "# of residual cycles in burst");
1035 	}
1036 
1037 	SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "phase", CTLFLAG_RD,
1038 	    &io_ctx->phase, 0, "Polling phase");
1039 
1040 	SYSCTL_ADD_ULONG(ctx, parent, OID_AUTO, "suspect", CTLFLAG_RW,
1041 	    &io_ctx->suspect, "Suspected events");
1042 
1043 	SYSCTL_ADD_ULONG(ctx, parent, OID_AUTO, "stalled", CTLFLAG_RW,
1044 	    &io_ctx->stalled, "Potential stalls");
1045 
1046 	SYSCTL_ADD_ULONG(ctx, parent, OID_AUTO, "short_ticks", CTLFLAG_RW,
1047 	    &io_ctx->short_ticks,
1048 	    "Hardclock ticks shorter than they should be");
1049 
1050 	SYSCTL_ADD_ULONG(ctx, parent, OID_AUTO, "lost_polls", CTLFLAG_RW,
1051 	    &io_ctx->lost_polls,
1052 	    "How many times we would have lost a poll tick");
1053 
1054 	SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "pending_polls", CTLFLAG_RD,
1055 	    &io_ctx->pending_polls, 0, "Do we need to poll again");
1056 
1057 	SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "handlers", CTLFLAG_RD,
1058 	    &io_ctx->poll_handlers, 0, "Number of registered poll handlers");
1059 }
1060 
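/*
 * The burst_max and each_burst sysctl handlers run in the thread that
 * issued the sysctl, so they never touch the iopoll_ctx directly: the
 * validated value is carried in an iopoll_sysctl_netmsg and lwkt_domsg()ed
 * to the netisr owning the context, where the *_handler functions below
 * apply it.  This keeps all iopoll_ctx fields effectively CPU-local.
 */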
1061 static void
1062 sysctl_burstmax_handler(netmsg_t nmsg)
1063 {
1064 	struct iopoll_sysctl_netmsg *msg = (struct iopoll_sysctl_netmsg *)nmsg;
1065 	struct iopoll_ctx *io_ctx;
1066 
1067 	io_ctx = msg->ctx;
1068 	KKASSERT(&curthread->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
1069 
1070 	io_ctx->poll_burst_max = nmsg->lmsg.u.ms_result;
1071 	if (io_ctx->poll_each_burst > io_ctx->poll_burst_max)
1072 		io_ctx->poll_each_burst = io_ctx->poll_burst_max;
1073 	if (io_ctx->poll_burst > io_ctx->poll_burst_max)
1074 		io_ctx->poll_burst = io_ctx->poll_burst_max;
1075 	if (io_ctx->residual_burst > io_ctx->poll_burst_max)
1076 		io_ctx->residual_burst = io_ctx->poll_burst_max;
1077 
1078 	lwkt_replymsg(&nmsg->lmsg, 0);
1079 }
1080 
1081 static int
1082 sysctl_burstmax(SYSCTL_HANDLER_ARGS)
1083 {
1084 	struct iopoll_ctx *io_ctx = arg1;
1085 	struct iopoll_sysctl_netmsg msg;
1086 	uint32_t burst_max;
1087 	int error;
1088 
1089 	burst_max = io_ctx->poll_burst_max;
1090 	error = sysctl_handle_int(oidp, &burst_max, 0, req);
1091 	if (error || req->newptr == NULL)
1092 		return error;
1093 	if (burst_max < MIN_IOPOLL_BURST_MAX)
1094 		burst_max = MIN_IOPOLL_BURST_MAX;
1095 	else if (burst_max > MAX_IOPOLL_BURST_MAX)
1096 		burst_max = MAX_IOPOLL_BURST_MAX;
1097 
1098 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
1099 		    0, sysctl_burstmax_handler);
1100 	msg.base.lmsg.u.ms_result = burst_max;
1101 	msg.ctx = io_ctx;
1102 
1103 	return lwkt_domsg(netisr_portfn(io_ctx->poll_cpuid), &msg.base.lmsg, 0);
1104 }
1105 
1106 static void
1107 sysctl_eachburst_handler(netmsg_t nmsg)
1108 {
1109 	struct iopoll_sysctl_netmsg *msg = (struct iopoll_sysctl_netmsg *)nmsg;
1110 	struct iopoll_ctx *io_ctx;
1111 	uint32_t each_burst;
1112 
1113 	io_ctx = msg->ctx;
1114 	KKASSERT(&curthread->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
1115 
1116 	each_burst = nmsg->lmsg.u.ms_result;
1117 	if (each_burst > io_ctx->poll_burst_max)
1118 		each_burst = io_ctx->poll_burst_max;
1119 	else if (each_burst < 1)
1120 		each_burst = 1;
1121 	io_ctx->poll_each_burst = each_burst;
1122 
1123 	lwkt_replymsg(&nmsg->lmsg, 0);
1124 }
1125 
1126 static int
1127 sysctl_eachburst(SYSCTL_HANDLER_ARGS)
1128 {
1129 	struct iopoll_ctx *io_ctx = arg1;
1130 	struct iopoll_sysctl_netmsg msg;
1131 	uint32_t each_burst;
1132 	int error;
1133 
1134 	each_burst = io_ctx->poll_each_burst;
1135 	error = sysctl_handle_int(oidp, &each_burst, 0, req);
1136 	if (error || req->newptr == NULL)
1137 		return error;
1138 
1139 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
1140 		    0, sysctl_eachburst_handler);
1141 	msg.base.lmsg.u.ms_result = each_burst;
1142 	msg.ctx = io_ctx;
1143 
1144 	return lwkt_domsg(netisr_portfn(io_ctx->poll_cpuid), &msg.base.lmsg, 0);
1145 }
1146 
1147 static int
1148 iopoll_register(struct ifnet *ifp, struct iopoll_ctx *io_ctx,
1149 		const struct ifpoll_io *io_rec)
1150 {
1151 	int error;
1152 
1153 	KKASSERT(&curthread->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
1154 
1155 	if (io_rec->poll_func == NULL)
1156 		return 0;
1157 
1158 	/*
1159 	 * Check if there is room.
1160 	 */
1161 	if (io_ctx->poll_handlers >= IFPOLL_LIST_LEN) {
1162 		/*
1163 		 * List full, cannot register more entries.
1164 		 * This should never happen; if it does, it is probably a
1165 		 * broken driver trying to register multiple times. Checking
1166 		 * this at runtime is expensive, and won't solve the problem
1167  * anyway, so just report a few times and then give up.
1168 		 */
1169 		static int verbose = 10; /* XXX */
1170 		if (verbose > 0) {
1171 			kprintf("io poll handlers list full, "
1172 				"maybe a broken driver ?\n");
1173 			verbose--;
1174 		}
1175 		error = ENOENT;
1176 	} else {
1177 		struct iopoll_rec *rec = &io_ctx->pr[io_ctx->poll_handlers];
1178 
1179 		rec->ifp = ifp;
1180 		rec->serializer = io_rec->serializer;
1181 		rec->arg = io_rec->arg;
1182 		rec->poll_func = io_rec->poll_func;
1183 
1184 		io_ctx->poll_handlers++;
1185 		error = 0;
1186 	}
1187 	return error;
1188 }
1189 
1190 static int
1191 iopoll_deregister(struct ifnet *ifp, struct iopoll_ctx *io_ctx)
1192 {
1193 	int i, error;
1194 
1195 	KKASSERT(&curthread->td_msgport == netisr_portfn(io_ctx->poll_cpuid));
1196 
1197 	for (i = 0; i < io_ctx->poll_handlers; ++i) {
1198 		if (io_ctx->pr[i].ifp == ifp) /* Found it */
1199 			break;
1200 	}
1201 	if (i == io_ctx->poll_handlers) {
1202 		error = ENOENT;
1203 	} else {
1204 		io_ctx->poll_handlers--;
1205 		if (i < io_ctx->poll_handlers) {
1206 			/* Last entry replaces this one. */
1207 			io_ctx->pr[i] = io_ctx->pr[io_ctx->poll_handlers];
1208 		}
1209 
1210 		if (io_ctx->poll_handlers == 0)
1211 			iopoll_reset_state(io_ctx);
1212 		error = 0;
1213 	}
1214 	return error;
1215 }
1216 
1217 static void
1218 poll_comm_init(int cpuid)
1219 {
1220 	struct poll_comm *comm;
1221 	char cpuid_str[16];
1222 
1223 	comm = kmalloc_cachealign(sizeof(*comm), M_DEVBUF, M_WAITOK | M_ZERO);
1224 
1225 	if (ifpoll_stfrac < 0)
1226 		ifpoll_stfrac = IFPOLL_STFRAC_DEFAULT;
1227 	if (ifpoll_txfrac < 0)
1228 		ifpoll_txfrac = IFPOLL_TXFRAC_DEFAULT;
1229 
1230 	comm->pollhz = ifpoll_pollhz;
1231 	comm->poll_cpuid = cpuid;
1232 	comm->poll_stfrac = ifpoll_stfrac;
1233 	comm->poll_txfrac = ifpoll_txfrac;
1234 
1235 	ksnprintf(cpuid_str, sizeof(cpuid_str), "%d", cpuid);
1236 
1237 	sysctl_ctx_init(&comm->sysctl_ctx);
1238 	comm->sysctl_tree = SYSCTL_ADD_NODE(&comm->sysctl_ctx,
1239 			    SYSCTL_STATIC_CHILDREN(_net_ifpoll),
1240 			    OID_AUTO, cpuid_str, CTLFLAG_RD, 0, "");
1241 
1242 	SYSCTL_ADD_PROC(&comm->sysctl_ctx, SYSCTL_CHILDREN(comm->sysctl_tree),
1243 			OID_AUTO, "pollhz", CTLTYPE_INT | CTLFLAG_RW,
1244 			comm, 0, sysctl_pollhz,
1245 			"I", "Device polling frequency");
1246 
1247 	if (cpuid == 0) {
1248 		SYSCTL_ADD_PROC(&comm->sysctl_ctx,
1249 				SYSCTL_CHILDREN(comm->sysctl_tree),
1250 				OID_AUTO, "status_frac",
1251 				CTLTYPE_INT | CTLFLAG_RW,
1252 				comm, 0, sysctl_stfrac,
1253 				"I", "# of cycles before status is polled");
1254 	}
1255 	SYSCTL_ADD_PROC(&comm->sysctl_ctx, SYSCTL_CHILDREN(comm->sysctl_tree),
1256 			OID_AUTO, "tx_frac", CTLTYPE_INT | CTLFLAG_RW,
1257 			comm, 0, sysctl_txfrac,
1258 			"I", "# of cycles before TX is polled");
1259 
1260 	poll_common[cpuid] = comm;
1261 }
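/*
 * The sysctl tree built above and in iopoll_ctx_create()/stpoll_init(),
 * per CPU X:
 *   net.ifpoll.X.pollhz
 *   net.ifpoll.0.status_frac          (CPU 0 only)
 *   net.ifpoll.X.tx_frac
 *   net.ifpoll.X.rx.{burst_max, each_burst, burst, user_frac, kern_frac,
 *                    residual_burst, phase, suspect, stalled, short_ticks,
 *                    lost_polls, pending_polls, handlers}
 *   net.ifpoll.X.tx.{phase, suspect, stalled, short_ticks, lost_polls,
 *                    pending_polls, handlers}
 *   net.ifpoll.0.status.handlers      (CPU 0 only)
 */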
1262 
1263 static void
1264 poll_comm_start(int cpuid)
1265 {
1266 	struct poll_comm *comm = poll_common[cpuid];
1267 	systimer_func_t func;
1268 
1269 	/*
1270 	 * Initialize systimer
1271 	 */
1272 	if (cpuid == 0)
1273 		func = poll_comm_systimer0;
1274 	else
1275 		func = poll_comm_systimer;
1276 	systimer_init_periodic_nq(&comm->pollclock, func, comm, 1);
1277 }
1278 
1279 static void
1280 _poll_comm_systimer(struct poll_comm *comm)
1281 {
1282 	if (comm->txfrac_count-- == 0) {
1283 		comm->txfrac_count = comm->poll_txfrac;
1284 		iopoll_clock(txpoll_context[comm->poll_cpuid]);
1285 	}
1286 	iopoll_clock(rxpoll_context[comm->poll_cpuid]);
1287 }
1288 
1289 static void
1290 poll_comm_systimer0(systimer_t info, int in_ipi __unused,
1291     struct intrframe *frame __unused)
1292 {
1293 	struct poll_comm *comm = info->data;
1294 	globaldata_t gd = mycpu;
1295 
1296 	KKASSERT(comm->poll_cpuid == gd->gd_cpuid && gd->gd_cpuid == 0);
1297 
1298 	crit_enter_gd(gd);
1299 
1300 	if (comm->stfrac_count-- == 0) {
1301 		comm->stfrac_count = comm->poll_stfrac;
1302 		stpoll_clock(&stpoll_context);
1303 	}
1304 	_poll_comm_systimer(comm);
1305 
1306 	crit_exit_gd(gd);
1307 }
1308 
1309 static void
1310 poll_comm_systimer(systimer_t info, int in_ipi __unused,
1311     struct intrframe *frame __unused)
1312 {
1313 	struct poll_comm *comm = info->data;
1314 	globaldata_t gd = mycpu;
1315 
1316 	KKASSERT(comm->poll_cpuid == gd->gd_cpuid && gd->gd_cpuid != 0);
1317 
1318 	crit_enter_gd(gd);
1319 	_poll_comm_systimer(comm);
1320 	crit_exit_gd(gd);
1321 }
1322 
1323 static void
1324 poll_comm_adjust_pollhz(struct poll_comm *comm)
1325 {
1326 	uint32_t handlers;
1327 	int pollhz = 1;
1328 
1329 	KKASSERT(&curthread->td_msgport == netisr_portfn(comm->poll_cpuid));
1330 
1331 	/*
1332 	 * If there is no polling handler registered, set systimer
1333 	 * frequency to the lowest value.  Polling systimer frequency
1334 	 * will be adjusted to the requested value, once there are
1335 	 * registered handlers.
1336 	 */
1337 	handlers = rxpoll_context[mycpuid]->poll_handlers +
1338 		   txpoll_context[mycpuid]->poll_handlers;
1339 	if (comm->poll_cpuid == 0)
1340 		handlers += stpoll_context.poll_handlers;
1341 	if (handlers)
1342 		pollhz = comm->pollhz;
1343 	systimer_adjust_periodic(&comm->pollclock, pollhz);
1344 }
1345 
1346 static int
1347 sysctl_pollhz(SYSCTL_HANDLER_ARGS)
1348 {
1349 	struct poll_comm *comm = arg1;
1350 	struct netmsg_base nmsg;
1351 	int error, phz;
1352 
1353 	phz = comm->pollhz;
1354 	error = sysctl_handle_int(oidp, &phz, 0, req);
1355 	if (error || req->newptr == NULL)
1356 		return error;
1357 	if (phz <= 0)
1358 		return EINVAL;
1359 	else if (phz > IFPOLL_FREQ_MAX)
1360 		phz = IFPOLL_FREQ_MAX;
1361 
1362 	netmsg_init(&nmsg, NULL, &curthread->td_msgport,
1363 		    0, sysctl_pollhz_handler);
1364 	nmsg.lmsg.u.ms_result = phz;
1365 
1366 	return lwkt_domsg(netisr_portfn(comm->poll_cpuid), &nmsg.lmsg, 0);
1367 }
1368 
1369 static void
1370 sysctl_pollhz_handler(netmsg_t nmsg)
1371 {
1372 	struct poll_comm *comm = poll_common[mycpuid];
1373 
1374 	KKASSERT(&curthread->td_msgport == netisr_portfn(comm->poll_cpuid));
1375 
1376 	/* Save polling frequency */
1377 	comm->pollhz = nmsg->lmsg.u.ms_result;
1378 
1379 	/*
1380 	 * Adjust cached pollhz
1381 	 */
1382 	rxpoll_context[mycpuid]->pollhz = comm->pollhz;
1383 	txpoll_context[mycpuid]->pollhz =
1384 	    comm->pollhz / (comm->poll_txfrac + 1);
1385 
1386 	/*
1387 	 * Adjust polling frequency
1388 	 */
1389 	poll_comm_adjust_pollhz(comm);
1390 
1391 	lwkt_replymsg(&nmsg->lmsg, 0);
1392 }
1393 
1394 static int
1395 sysctl_stfrac(SYSCTL_HANDLER_ARGS)
1396 {
1397 	struct poll_comm *comm = arg1;
1398 	struct netmsg_base nmsg;
1399 	int error, stfrac;
1400 
1401 	KKASSERT(comm->poll_cpuid == 0);
1402 
1403 	stfrac = comm->poll_stfrac;
1404 	error = sysctl_handle_int(oidp, &stfrac, 0, req);
1405 	if (error || req->newptr == NULL)
1406 		return error;
1407 	if (stfrac < 0)
1408 		return EINVAL;
1409 
1410 	netmsg_init(&nmsg, NULL, &curthread->td_msgport,
1411 		    0, sysctl_stfrac_handler);
1412 	nmsg.lmsg.u.ms_result = stfrac;
1413 
1414 	return lwkt_domsg(netisr_portfn(comm->poll_cpuid), &nmsg.lmsg, 0);
1415 }
1416 
1417 static void
1418 sysctl_stfrac_handler(netmsg_t nmsg)
1419 {
1420 	struct poll_comm *comm = poll_common[mycpuid];
1421 	int stfrac = nmsg->lmsg.u.ms_result;
1422 
1423 	KKASSERT(&curthread->td_msgport == netisr_portfn(comm->poll_cpuid));
1424 
1425 	crit_enter();
1426 	comm->poll_stfrac = stfrac;
1427 	if (comm->stfrac_count > comm->poll_stfrac)
1428 		comm->stfrac_count = comm->poll_stfrac;
1429 	crit_exit();
1430 
1431 	lwkt_replymsg(&nmsg->lmsg, 0);
1432 }
1433 
1434 static int
1435 sysctl_txfrac(SYSCTL_HANDLER_ARGS)
1436 {
1437 	struct poll_comm *comm = arg1;
1438 	struct netmsg_base nmsg;
1439 	int error, txfrac;
1440 
1441 	txfrac = comm->poll_txfrac;
1442 	error = sysctl_handle_int(oidp, &txfrac, 0, req);
1443 	if (error || req->newptr == NULL)
1444 		return error;
1445 	if (txfrac < 0)
1446 		return EINVAL;
1447 
1448 	netmsg_init(&nmsg, NULL, &curthread->td_msgport,
1449 		    0, sysctl_txfrac_handler);
1450 	nmsg.lmsg.u.ms_result = txfrac;
1451 
1452 	return lwkt_domsg(netisr_portfn(comm->poll_cpuid), &nmsg.lmsg, 0);
1453 }
1454 
1455 static void
1456 sysctl_txfrac_handler(netmsg_t nmsg)
1457 {
1458 	struct poll_comm *comm = poll_common[mycpuid];
1459 	int txfrac = nmsg->lmsg.u.ms_result;
1460 
1461 	KKASSERT(&curthread->td_msgport == netisr_portfn(comm->poll_cpuid));
1462 
1463 	crit_enter();
1464 	comm->poll_txfrac = txfrac;
1465 	if (comm->txfrac_count > comm->poll_txfrac)
1466 		comm->txfrac_count = comm->poll_txfrac;
1467 	crit_exit();
1468 
1469 	lwkt_replymsg(&nmsg->lmsg, 0);
1470 }
1471