/*
 * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "nv-kthread-q.h"
#include "nv-list-helpers.h"

#include <linux/kthread.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/module.h>
#include <linux/mm.h>

#if defined(NV_LINUX_BUG_H_PRESENT)
    #include <linux/bug.h>
#else
    #include <asm/bug.h>
#endif

// Today's implementation is a little simpler and more limited than the API
// description in nv-kthread-q.h allows for. Details include:
//
// 1. Each nv_kthread_q instance is a first-in, first-out queue.
//
// 2. Each nv_kthread_q instance is serviced by exactly one kthread.
//
// You can create any number of queues, each of which gets its own
// named kernel thread (kthread). You can then insert arbitrary functions
// into the queue, and those functions will be run in the context of the
// queue's kthread.
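//
// A minimal usage sketch (illustrative only, not compiled as part of this
// file). The queue name, callback, and variables below are hypothetical, and
// error handling is abbreviated:
//
//     static nv_kthread_q_t my_q;
//     static nv_kthread_q_item_t my_q_item;
//
//     static void my_callback(void *args)
//     {
//         // Runs in the context of my_q's kthread.
//     }
//
//     static int my_driver_init(void)
//     {
//         // Create the queue and its kthread (no NUMA preference):
//         int ret = nv_kthread_q_init_on_node(&my_q, "my_q", NV_KTHREAD_NO_NODE);
//         if (ret)
//             return ret;
//
//         // Queue a callback for execution on the queue's kthread:
//         nv_kthread_q_item_init(&my_q_item, my_callback, NULL);
//         nv_kthread_q_schedule_q_item(&my_q, &my_q_item);
//         return 0;
//     }
//
//     static void my_driver_exit(void)
//     {
//         // Wait for pending items, then stop the queue's kthread.
//         // (nv_kthread_q_stop() also flushes internally before stopping.)
//         nv_kthread_q_flush(&my_q);
//         nv_kthread_q_stop(&my_q);
//     }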

#ifndef WARN
    // Only *really* old kernels (2.6.9) end up here. Just use a simple printk
    // to implement this, because such kernels won't be supported much longer.
    #define WARN(condition, format...) ({                    \
        int __ret_warn_on = !!(condition);                   \
        if (unlikely(__ret_warn_on))                         \
            printk(KERN_ERR format);                         \
        unlikely(__ret_warn_on);                             \
    })
#endif

#define NVQ_WARN(fmt, ...)                                   \
    do {                                                     \
        if (in_interrupt()) {                                \
            WARN(1, "nv_kthread_q: [in interrupt]: " fmt,    \
            ##__VA_ARGS__);                                  \
        }                                                    \
        else {                                               \
            WARN(1, "nv_kthread_q: task: %s: " fmt,          \
                 current->comm,                              \
                 ##__VA_ARGS__);                             \
        }                                                    \
    } while (0)

static int _main_loop(void *args)
{
    nv_kthread_q_t *q = (nv_kthread_q_t *)args;
    nv_kthread_q_item_t *q_item = NULL;
    unsigned long flags;

    while (1) {
        // Normally this thread is never interrupted. However,
        // down_interruptible (instead of down) is called here in order to
        // avoid being classified as a potentially hung task by the kernel's
        // hung task watchdog.
        while (down_interruptible(&q->q_sem))
            NVQ_WARN("Interrupted during semaphore wait\n");

        if (atomic_read(&q->main_loop_should_exit))
            break;

        spin_lock_irqsave(&q->q_lock, flags);

        // The q_sem semaphore prevents us from getting here unless there is
        // at least one item in the list, so an empty list indicates a bug.
        if (unlikely(list_empty(&q->q_list_head))) {
            spin_unlock_irqrestore(&q->q_lock, flags);
            NVQ_WARN("_main_loop: Empty queue: q: 0x%p\n", q);
            continue;
        }

        // Consume one item from the queue
        q_item = list_first_entry(&q->q_list_head,
                                  nv_kthread_q_item_t,
                                  q_list_node);

        list_del_init(&q_item->q_list_node);

        spin_unlock_irqrestore(&q->q_lock, flags);

        // Run the item
        q_item->function_to_run(q_item->function_args);

        // Make debugging a little simpler by clearing this between runs:
        q_item = NULL;
    }

    while (!kthread_should_stop())
        schedule();

    return 0;
}

void nv_kthread_q_stop(nv_kthread_q_t *q)
{
    // check if queue has been properly initialized
    if (unlikely(!q->q_kthread))
        return;

    nv_kthread_q_flush(q);

    // If this warning fires, then a caller likely either broke the API rules
    // by adding items after calling nv_kthread_q_stop, or failed to
    // adequately flush self-rescheduling q_items.
    if (unlikely(!list_empty(&q->q_list_head)))
        NVQ_WARN("list not empty after flushing\n");

    if (likely(!atomic_read(&q->main_loop_should_exit))) {

        atomic_set(&q->main_loop_should_exit, 1);

        // Wake up the kthread so that it can see that it needs to stop:
        up(&q->q_sem);

        kthread_stop(q->q_kthread);
        q->q_kthread = NULL;
    }
}

// When CONFIG_VMAP_STACK is defined, the kernel thread stack allocator used by
// kthread_create_on_node relies on a 2-entry, per-core cache to minimize
// vmalloc invocations. The cache is NUMA-unaware, so when there is a hit, the
// stack location ends up being a function of the core assigned to the current
// thread, instead of being a function of the specified NUMA node. The cache was
// added to the kernel in commit ac496bf48d97f2503eaa353996a4dd5e4383eaf0
// ("fork: Optimize task creation by caching two thread stacks per CPU if
// CONFIG_VMAP_STACK=y").
//
// To work around the problematic cache, we create up to three kernel threads:
//   - If the first thread's stack is resident on the preferred node, return
//     this thread.
//   - Otherwise, create a second thread. If its stack is resident on the
//     preferred node, stop the first thread and return this one.
//   - Otherwise, create a third thread. The stack allocator does not find a
//     cached stack, and so falls back to vmalloc, which takes the NUMA hint
//     into consideration. The first two threads are then stopped.
//
// When CONFIG_VMAP_STACK is not defined, the first kernel thread is returned.
//
// This function is never invoked when there is no NUMA preference (preferred
// node is NUMA_NO_NODE).
static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
                                                 nv_kthread_q_t *q,
                                                 int preferred_node,
                                                 const char *q_name)
{
    unsigned i, j;
    static const unsigned attempts = 3;
    struct task_struct *thread[3];

    for (i = 0;; i++) {
        struct page *stack;

        thread[i] = kthread_create_on_node(threadfn, q, preferred_node, q_name);

        if (unlikely(IS_ERR(thread[i]))) {
            // Instead of failing, pick the previous thread, even if its
            // stack is not allocated on the preferred node.
            if (i > 0)
                i--;

            break;
        }

        // vmalloc is not used to allocate the stack, so simply return the
        // thread, even if its stack may not be allocated on the preferred node.
        if (!is_vmalloc_addr(thread[i]->stack))
            break;

        // Ran out of attempts: return the thread even if its stack may not be
        // allocated on the preferred node.
        if (i == (attempts - 1))
            break;

        // Get the NUMA node where the first page of the stack is resident. If
        // it is the preferred node, select this thread.
        stack = vmalloc_to_page(thread[i]->stack);
        if (page_to_nid(stack) == preferred_node)
            break;
    }

    // Stop any threads that were created before the one being returned.
    for (j = i; j > 0; j--)
        kthread_stop(thread[j - 1]);

    return thread[i];
}

int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferred_node)
{
    memset(q, 0, sizeof(*q));

    INIT_LIST_HEAD(&q->q_list_head);
    spin_lock_init(&q->q_lock);
    sema_init(&q->q_sem, 0);

    if (preferred_node == NV_KTHREAD_NO_NODE) {
        q->q_kthread = kthread_create(_main_loop, q, q_name);
    }
    else {
        q->q_kthread = thread_create_on_node(_main_loop, q, preferred_node, q_name);
    }

    if (IS_ERR(q->q_kthread)) {
        int err = PTR_ERR(q->q_kthread);

        // Clear q_kthread before returning, so that nv_kthread_q_stop() can
        // be safely called on the queue. This makes error handling easier.
        q->q_kthread = NULL;

        return err;
    }

    wake_up_process(q->q_kthread);

    return 0;
}
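
// Illustrative error-handling sketch (hypothetical caller code, not part of
// this file). Because q_kthread is cleared on failure, and nv_kthread_q_stop()
// returns early when q_kthread is NULL, a caller's cleanup path may call
// nv_kthread_q_stop() even when initialization failed:
//
//     ret = nv_kthread_q_init_on_node(&my_q, "my_q", my_node);
//     if (ret)
//         goto cleanup;
//     ...
//  cleanup:
//     // Safe whether or not the queue's kthread was ever created:
//     nv_kthread_q_stop(&my_q);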

// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)
{
    unsigned long flags;
    int ret = 1;

    spin_lock_irqsave(&q->q_lock, flags);

    if (likely(list_empty(&q_item->q_list_node)))
        list_add_tail(&q_item->q_list_node, &q->q_list_head);
    else
        ret = 0;

    spin_unlock_irqrestore(&q->q_lock, flags);

    if (likely(ret))
        up(&q->q_sem);

    return ret;
}

void nv_kthread_q_item_init(nv_kthread_q_item_t *q_item,
                            nv_q_func_t function_to_run,
                            void *function_args)
{
    INIT_LIST_HEAD(&q_item->q_list_node);
    q_item->function_to_run = function_to_run;
    q_item->function_args   = function_args;
}

// Returns true (non-zero) if the q_item got scheduled, false otherwise.
int nv_kthread_q_schedule_q_item(nv_kthread_q_t *q,
                                 nv_kthread_q_item_t *q_item)
{
    if (unlikely(atomic_read(&q->main_loop_should_exit))) {
        NVQ_WARN("Not allowed: nv_kthread_q_schedule_q_item was "
                 "called with a non-alive q: 0x%p\n", q);
        return 0;
    }

    return _raw_q_schedule(q, q_item);
}
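
// Illustrative sketch of how the return value can be used (hypothetical caller
// code). A q_item that is already pending is not queued a second time, so a
// caller that re-uses a single pre-initialized item can treat a 0 return as
// "already scheduled":
//
//     if (!nv_kthread_q_schedule_q_item(&my_q, &my_q_item)) {
//         // Already pending: the callback for that pending instance has not
//         // run yet, and it will still run exactly once.
//     }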

static void _q_flush_function(void *args)
{
    struct completion *completion = (struct completion *)args;
    complete(completion);
}

static void _raw_q_flush(nv_kthread_q_t *q)
{
    nv_kthread_q_item_t q_item;
    DECLARE_COMPLETION_ONSTACK(completion);

    nv_kthread_q_item_init(&q_item, _q_flush_function, &completion);

    _raw_q_schedule(q, &q_item);

    // Wait for the flush item to run. Once it has run, all of the items that
    // were queued ahead of it have also run, so the flush is complete.
    wait_for_completion(&completion);
}

void nv_kthread_q_flush(nv_kthread_q_t *q)
{
    if (unlikely(atomic_read(&q->main_loop_should_exit))) {
        NVQ_WARN("Not allowed: nv_kthread_q_flush was called after "
                 "nv_kthread_q_stop. q: 0x%p\n", q);
        return;
    }

    // This double flush is intentional, not a typo. The queue really does
    // have to be flushed twice in order to handle a q_item that reschedules
    // itself: the second flush waits for any item that was re-queued while
    // the first flush was in progress.
    _raw_q_flush(q);
    _raw_q_flush(q);
}
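
// Illustrative sketch of the self-rescheduling case that the double flush in
// nv_kthread_q_flush() is designed to handle. The callback, item, and helpers
// below are hypothetical:
//
//     static nv_kthread_q_item_t my_q_item;    // pre-initialized elsewhere
//
//     static void my_resubmitting_callback(void *args)
//     {
//         nv_kthread_q_t *q = (nv_kthread_q_t *)args;
//
//         do_some_work();                      // hypothetical helper
//
//         // Conditionally re-queue this item. The first _raw_q_flush can
//         // complete while this re-queued instance is still pending, which
//         // is why nv_kthread_q_flush() calls _raw_q_flush twice.
//         if (more_work_is_pending())          // hypothetical condition
//             nv_kthread_q_schedule_q_item(q, &my_q_item);
//     }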