// SPDX-License-Identifier: GPL-2.0

#include "io_uring.h"
#include "napi.h"

#ifdef CONFIG_NET_RX_BUSY_POLL

/* Timeout for cleanup of stale entries. */
#define NAPI_TIMEOUT		(60 * SEC_CONVERSION)

struct io_napi_entry {
	unsigned int		napi_id;
	struct list_head	list;

	unsigned long		timeout;
	struct hlist_node	node;

	struct rcu_head		rcu;
};

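/*
 * io_napi_hash_find() - Look up an entry by NAPI ID
 * @hash_list: hash bucket to search
 * @napi_id: NAPI ID to look for
 *
 * Walk the RCU-protected bucket and, on a match, refresh the entry's
 * stale timeout. Returns the entry or NULL if the ID is not hashed.
 */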
static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
					       unsigned int napi_id)
{
	struct io_napi_entry *e;

	hlist_for_each_entry_rcu(e, hash_list, node) {
		if (e->napi_id != napi_id)
			continue;
		e->timeout = jiffies + NAPI_TIMEOUT;
		return e;
	}

	return NULL;
}

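/*
 * __io_napi_add() - Track a socket's NAPI ID
 * @ctx: pointer to io-uring context structure
 * @sock: socket to read the NAPI ID from
 *
 * Add the socket's NAPI ID to the napi list and hash table so it is
 * included in subsequent busy poll loops. For an already-tracked ID only
 * the stale timeout is refreshed.
 */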
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
{
	struct hlist_head *hash_list;
	unsigned int napi_id;
	struct sock *sk;
	struct io_napi_entry *e;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);

	/* Non-NAPI IDs can be rejected. */
	if (napi_id < MIN_NAPI_ID)
		return;

	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];

	rcu_read_lock();
	e = io_napi_hash_find(hash_list, napi_id);
	if (e) {
		e->timeout = jiffies + NAPI_TIMEOUT;
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

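	/*
	 * Not found: allocate a new entry outside the lock. GFP_NOWAIT keeps
	 * this path from sleeping; a concurrent add of the same ID is handled
	 * by the re-check under the lock below.
	 */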
	e = kmalloc(sizeof(*e), GFP_NOWAIT);
	if (!e)
		return;

	e->napi_id = napi_id;
	e->timeout = jiffies + NAPI_TIMEOUT;

	spin_lock(&ctx->napi_lock);
	if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
		spin_unlock(&ctx->napi_lock);
		kfree(e);
		return;
	}

	hlist_add_tail_rcu(&e->node, hash_list);
	list_add_tail(&e->list, &ctx->napi_list);
	spin_unlock(&ctx->napi_lock);
}

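/*
 * __io_napi_remove_stale() - Remove expired entries
 * @ctx: pointer to io-uring context structure
 *
 * Walk the hash table under the napi lock and unlink every entry whose
 * stale timeout has passed, freeing it after an RCU grace period.
 */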
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		if (time_after(jiffies, e->timeout)) {
			list_del(&e->list);
			hash_del_rcu(&e->node);
			kfree_rcu(e, rcu);
		}
	}
	spin_unlock(&ctx->napi_lock);
}

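/*
 * io_napi_remove_stale() - Conditionally prune expired entries
 * @ctx: pointer to io-uring context structure
 * @is_stale: true if a preceding busy loop saw an expired entry
 */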
static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
	if (is_stale)
		__io_napi_remove_stale(ctx);
}

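/*
 * io_napi_busy_loop_timeout() - Check if the busy poll budget is spent
 * @start_time: time the busy loop started
 * @bp_usec: busy poll timeout in usec, 0 means busy polling is disabled
 *
 * Returns true once @bp_usec has elapsed since @start_time, or
 * immediately if no busy poll timeout is set.
 */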
static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
					     unsigned long bp_usec)
{
	if (bp_usec) {
		unsigned long end_time = start_time + bp_usec;
		unsigned long now = busy_loop_current_time();

		return time_after(now, end_time);
	}

	return true;
}

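/*
 * io_napi_busy_loop_should_end() - Loop end check for napi_busy_loop_rcu()
 * @data: pointer to the io wait queue
 * @start_time: time the busy loop started
 *
 * End the loop on a pending signal, once the ring has completions or work
 * to process, or once the busy poll timeout has elapsed.
 */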
static bool io_napi_busy_loop_should_end(void *data,
					 unsigned long start_time)
{
	struct io_wait_queue *iowq = data;

	if (signal_pending(current))
		return true;
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return true;
	if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to))
		return true;

	return false;
}

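/*
 * __io_napi_do_busy_loop() - Busy poll each tracked NAPI ID once
 * @ctx: pointer to io-uring context structure
 * @loop_end_arg: if non-NULL, busy poll with the io-uring loop end check
 *
 * Returns true if any entry's stale timeout expired, telling the caller
 * to prune the list afterwards.
 */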
static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
				   void *loop_end_arg)
{
	struct io_napi_entry *e;
	bool (*loop_end)(void *, unsigned long) = NULL;
	bool is_stale = false;

	if (loop_end_arg)
		loop_end = io_napi_busy_loop_should_end;

	list_for_each_entry_rcu(e, &ctx->napi_list, list) {
		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);

		if (time_after(jiffies, e->timeout))
			is_stale = true;
	}

	return is_stale;
}

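/*
 * io_napi_blocking_busy_loop() - Busy poll while waiting for completions
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Repeat the busy loop over the napi list until the wait is satisfied or
 * the busy poll timeout expires, then prune stale entries.
 */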
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
				       struct io_wait_queue *iowq)
{
	unsigned long start_time = busy_loop_current_time();
	void *loop_end_arg = NULL;
	bool is_stale = false;

	/* Singular lists use a different napi loop end check function and are
	 * only executed once.
	 */
	if (list_is_singular(&ctx->napi_list))
		loop_end_arg = iowq;

	rcu_read_lock();
	do {
		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
}

/*
 * io_napi_init() - Init napi settings
 * @ctx: pointer to io-uring context structure
 *
 * Init napi settings in the io-uring context.
 */
void io_napi_init(struct io_ring_ctx *ctx)
{
	INIT_LIST_HEAD(&ctx->napi_list);
	spin_lock_init(&ctx->napi_lock);
	ctx->napi_prefer_busy_poll = false;
	ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
}

/*
 * io_napi_free() - Deallocate napi
 * @ctx: pointer to io-uring context structure
 *
 * Free the napi list and the hash table in the io-uring context.
 */
void io_napi_free(struct io_ring_ctx *ctx)
{
	struct io_napi_entry *e;
	unsigned int i;

	spin_lock(&ctx->napi_lock);
	hash_for_each(ctx->napi_ht, i, e, node) {
		hash_del_rcu(&e->node);
		kfree_rcu(e, rcu);
	}
	spin_unlock(&ctx->napi_lock);
}

/*
 * io_register_napi() - Register napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Register napi in the io-uring context.
 */
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to	  = ctx->napi_busy_poll_to,
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};
	struct io_uring_napi napi;

	if (copy_from_user(&napi, arg, sizeof(napi)))
		return -EFAULT;
	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
		return -EINVAL;

	if (copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
	WRITE_ONCE(ctx->napi_enabled, true);
	return 0;
}
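
/*
 * Example (userspace sketch): enabling NAPI busy polling through this
 * opcode, here via liburing's io_uring_register_napi() wrapper (assumed
 * available, liburing 2.6+); a raw io_uring_register() call with
 * IORING_REGISTER_NAPI passes the same structure.
 *
 *	struct io_uring_napi napi = {
 *		.busy_poll_to	  = 100,	// busy poll for up to 100 usec
 *		.prefer_busy_poll = 1,
 *	};
 *	io_uring_register_napi(&ring, &napi);
 */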

/*
 * io_unregister_napi() - Unregister napi with io-uring
 * @ctx: pointer to io-uring context structure
 * @arg: pointer to io_uring_napi structure
 *
 * Unregister napi. If arg is non-NULL, copy the current busy poll timeout
 * and prefer busy poll setting to the passed in structure.
 */
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
	const struct io_uring_napi curr = {
		.busy_poll_to	  = ctx->napi_busy_poll_to,
		.prefer_busy_poll = ctx->napi_prefer_busy_poll
	};

	if (arg && copy_to_user(arg, &curr, sizeof(curr)))
		return -EFAULT;

	WRITE_ONCE(ctx->napi_busy_poll_to, 0);
	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
	WRITE_ONCE(ctx->napi_enabled, false);
	return 0;
}

/*
 * __io_napi_adjust_timeout() - adjust busy loop timeout
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 * @ts: pointer to timespec or NULL
 *
 * Adjust the busy loop timeout according to timespec and busy poll timeout.
 * If the wait timeout is shorter than the NAPI busy poll timeout, the busy
 * poll timeout is capped at the wait timeout.
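 *
 * Example: with napi_busy_poll_to = 100 usec and a wait timeout of 50 usec,
 * the busy poll timeout used for the wait becomes 50 usec, so busy polling
 * cannot outlast the wait itself.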
 */
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
			      struct timespec64 *ts)
{
	unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);

	if (ts) {
		struct timespec64 poll_to_ts;

		poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
		if (timespec64_compare(ts, &poll_to_ts) < 0) {
			s64 poll_to_ns = timespec64_to_ns(ts);
			if (poll_to_ns > 0) {
				u64 val = poll_to_ns + 999;
				do_div(val, (s64) 1000);
				poll_to = val;
			}
		}
	}

	iowq->napi_busy_poll_to = poll_to;
}

/*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
 * @iowq: pointer to io wait queue
 *
 * Execute the busy poll loop if napi is enabled and the ring is not in
 * SQPOLL mode.
 */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);

	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
		io_napi_blocking_busy_loop(ctx, iowq);
}

/*
 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
 * @ctx: pointer to io-uring context structure
 *
 * Execute the napi busy poll loop for the sqpoll thread. Returns 1 if the
 * loop was run, 0 otherwise.
 */
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
	bool is_stale = false;

	if (!READ_ONCE(ctx->napi_busy_poll_to))
		return 0;
	if (list_empty_careful(&ctx->napi_list))
		return 0;

	rcu_read_lock();
	is_stale = __io_napi_do_busy_loop(ctx, NULL);
	rcu_read_unlock();

	io_napi_remove_stale(ctx, is_stale);
	return 1;
}

#endif