/*
 * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "nv-kthread-q.h"
#include "nv-list-helpers.h"

#include <linux/kthread.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/module.h>
#include <linux/mm.h>

#if defined(NV_LINUX_BUG_H_PRESENT)
#include <linux/bug.h>
#else
#include <asm/bug.h>
#endif

// Today's implementation is a little simpler and more limited than the
// API description in nv-kthread-q.h allows for. Details include:
//
// 1. Each nv_kthread_q instance is a first-in, first-out queue.
//
// 2. Each nv_kthread_q instance is serviced by exactly one kthread.
//
// You can create any number of queues, each of which gets its own
// named kernel thread (kthread). You can then insert arbitrary functions
// into the queue, and those functions will be run in the context of the
// queue's kthread.
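
// As a usage sketch (illustrative only: my_work_func, my_args, and the
// error handling details below are hypothetical, not part of this file),
// a caller might do:
//
//     static void my_work_func(void *args)
//     {
//         // Runs in the queue's kthread context.
//     }
//
//     nv_kthread_q_t q;
//     nv_kthread_q_item_t q_item;
//
//     if (nv_kthread_q_init_on_node(&q, "my_q", NV_KTHREAD_NO_NODE) != 0)
//         return; // could not create the kthread
//
//     nv_kthread_q_item_init(&q_item, my_work_func, my_args);
//     nv_kthread_q_schedule_q_item(&q, &q_item);
//
//     // Wait for all pending items to run, then shut down the kthread:
//     nv_kthread_q_stop(&q);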

#ifndef WARN
// Only *really* old kernels (2.6.9) end up here. Just use a simple printk
// to implement this, because such kernels won't be supported much longer.
#define WARN(condition, format...) ({          \
    int __ret_warn_on = !!(condition);         \
    if (unlikely(__ret_warn_on))               \
        printk(KERN_ERR format);               \
    unlikely(__ret_warn_on);                   \
})
#endif

#define NVQ_WARN(fmt, ...)                                  \
    do {                                                    \
        if (in_interrupt()) {                               \
            WARN(1, "nv_kthread_q: [in interrupt]: " fmt,   \
                 ##__VA_ARGS__);                            \
        }                                                   \
        else {                                              \
            WARN(1, "nv_kthread_q: task: %s: " fmt,         \
                 current->comm,                             \
                 ##__VA_ARGS__);                            \
        }                                                   \
    } while (0)

static int _main_loop(void *args)
{
    nv_kthread_q_t *q = (nv_kthread_q_t *)args;
    nv_kthread_q_item_t *q_item = NULL;
    unsigned long flags;

    while (1) {
        // Normally this thread is never interrupted. However,
        // down_interruptible (instead of down) is called here, in order to
        // avoid being classified as a potentially hung task by the kernel
        // watchdog.
        while (down_interruptible(&q->q_sem))
            NVQ_WARN("Interrupted during semaphore wait\n");

        if (atomic_read(&q->main_loop_should_exit))
            break;

        spin_lock_irqsave(&q->q_lock, flags);

        // The q_sem semaphore prevents us from getting here unless there is
        // at least one item in the list, so an empty list indicates a bug.
        if (unlikely(list_empty(&q->q_list_head))) {
            spin_unlock_irqrestore(&q->q_lock, flags);
            NVQ_WARN("_main_loop: Empty queue: q: 0x%p\n", q);
            continue;
        }

        // Consume one item from the queue
        q_item = list_first_entry(&q->q_list_head,
                                  nv_kthread_q_item_t,
                                  q_list_node);

        list_del_init(&q_item->q_list_node);

        spin_unlock_irqrestore(&q->q_lock, flags);

        // Run the item
        q_item->function_to_run(q_item->function_args);

        // Make debugging a little simpler by clearing this between runs:
        q_item = NULL;
    }

    while (!kthread_should_stop())
        schedule();

    return 0;
}

void nv_kthread_q_stop(nv_kthread_q_t *q)
{
    // Check whether the queue was ever properly initialized.
    if (unlikely(!q->q_kthread))
        return;

    nv_kthread_q_flush(q);

    // If this check fires, then a caller likely either broke the API rules,
    // by adding items after calling nv_kthread_q_stop, or failed to
    // adequately flush self-rescheduling q_items.
    if (unlikely(!list_empty(&q->q_list_head)))
        NVQ_WARN("list not empty after flushing\n");

    if (likely(!atomic_read(&q->main_loop_should_exit))) {

        atomic_set(&q->main_loop_should_exit, 1);

        // Wake up the kthread so that it can see that it needs to stop:
        up(&q->q_sem);

        kthread_stop(q->q_kthread);
        q->q_kthread = NULL;
    }
}

// When CONFIG_VMAP_STACK is defined, the kernel thread stack allocator used by
// kthread_create_on_node relies on a 2-entry, per-core cache to minimize
// vmalloc invocations. The cache is NUMA-unaware, so when there is a hit, the
// stack location ends up being a function of the core assigned to the current
// thread, instead of being a function of the specified NUMA node. The cache
// was added to the kernel in commit ac496bf48d97f2503eaa353996a4dd5e4383eaf0
// ("fork: Optimize task creation by caching two thread stacks per CPU if
// CONFIG_VMAP_STACK=y").
//
// To work around the problematic cache, we create up to three kernel threads:
// - If the first thread's stack is resident on the preferred node, return this
//   thread.
// - Otherwise, create a second thread. If its stack is resident on the
//   preferred node, stop the first thread and return this one.
// - Otherwise, create a third thread. The stack allocator does not find a
//   cached stack, and so falls back to vmalloc, which takes the NUMA hint into
//   consideration. The first two threads are then stopped.
//
// When CONFIG_VMAP_STACK is not defined, the first kernel thread is returned.
//
// This function is never invoked when there is no NUMA preference (preferred
// node is NUMA_NO_NODE).
static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
                                                 nv_kthread_q_t *q,
                                                 int preferred_node,
                                                 const char *q_name)
{
    unsigned i, j;
    static const unsigned attempts = 3;
    struct task_struct *thread[3];

    for (i = 0;; i++) {
        struct page *stack;

        thread[i] = kthread_create_on_node(threadfn, q, preferred_node, q_name);

        if (unlikely(IS_ERR(thread[i]))) {

            // Instead of failing, pick the previous thread, even if its
            // stack is not allocated on the preferred node.
            if (i > 0)
                i--;

            break;
        }

        // vmalloc was not used to allocate the stack, so simply return the
        // thread, even though its stack may not be allocated on the preferred
        // node.
        if (!is_vmalloc_addr(thread[i]->stack))
            break;

        // Ran out of attempts - return the thread, even though its stack may
        // not be allocated on the preferred node.
        if (i == (attempts - 1))
            break;

        // Get the NUMA node where the first page of the stack is resident. If
        // it is the preferred node, select this thread.
        stack = vmalloc_to_page(thread[i]->stack);
        if (page_to_nid(stack) == preferred_node)
            break;
    }

    for (j = i; j > 0; j--)
        kthread_stop(thread[j - 1]);

    return thread[i];
}

int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferred_node)
{
    memset(q, 0, sizeof(*q));

    INIT_LIST_HEAD(&q->q_list_head);
    spin_lock_init(&q->q_lock);
    sema_init(&q->q_sem, 0);

    if (preferred_node == NV_KTHREAD_NO_NODE) {
        q->q_kthread = kthread_create(_main_loop, q, q_name);
    }
    else {
        q->q_kthread = thread_create_on_node(_main_loop, q, preferred_node, q_name);
    }

    if (IS_ERR(q->q_kthread)) {
        int err = PTR_ERR(q->q_kthread);

        // Clear q_kthread before returning, so that nv_kthread_q_stop() can be
        // safely called on it, making error handling easier.
        q->q_kthread = NULL;

        return err;
    }

    wake_up_process(q->q_kthread);

    return 0;
}
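
// Because a failed initialization clears q->q_kthread, a caller can route all
// error paths through nv_kthread_q_stop(). A minimal sketch (the label name
// and surrounding code are hypothetical):
//
//     int err = nv_kthread_q_init_on_node(&q, "my_q", NV_KTHREAD_NO_NODE);
//     if (err)
//         goto fail;
//     ...
// fail:
//     nv_kthread_q_stop(&q); // safe: returns early when q->q_kthread is NULL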

// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)
{
    unsigned long flags;
    int ret = 1;

    spin_lock_irqsave(&q->q_lock, flags);

    if (likely(list_empty(&q_item->q_list_node)))
        list_add_tail(&q_item->q_list_node, &q->q_list_head);
    else
        ret = 0;

    spin_unlock_irqrestore(&q->q_lock, flags);

    if (likely(ret))
        up(&q->q_sem);

    return ret;
}

void nv_kthread_q_item_init(nv_kthread_q_item_t *q_item,
                            nv_q_func_t function_to_run,
                            void *function_args)
{
    INIT_LIST_HEAD(&q_item->q_list_node);
    q_item->function_to_run = function_to_run;
    q_item->function_args = function_args;
}

// Returns true (non-zero) if the q_item got scheduled, false otherwise.
int nv_kthread_q_schedule_q_item(nv_kthread_q_t *q,
                                 nv_kthread_q_item_t *q_item)
{
    if (unlikely(atomic_read(&q->main_loop_should_exit))) {
        NVQ_WARN("Not allowed: nv_kthread_q_schedule_q_item was "
                 "called with a non-alive q: 0x%p\n", q);
        return 0;
    }

    return _raw_q_schedule(q, q_item);
}
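
// Note that a q_item counts as pending from the time it is scheduled until
// _main_loop removes it from the list, just before running it. Scheduling an
// already-pending item is therefore a no-op. A sketch of the effect, assuming
// the kthread has not yet dequeued the item between the two calls:
//
//     nv_kthread_q_schedule_q_item(&q, &q_item); // returns 1: item queued
//     nv_kthread_q_schedule_q_item(&q, &q_item); // returns 0: already pending
//
// In that case, q_item's function runs only once for the two calls above.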

static void _q_flush_function(void *args)
{
    struct completion *completion = (struct completion *)args;
    complete(completion);
}

static void _raw_q_flush(nv_kthread_q_t *q)
{
    nv_kthread_q_item_t q_item;
    DECLARE_COMPLETION_ONSTACK(completion);

    nv_kthread_q_item_init(&q_item, _q_flush_function, &completion);

    _raw_q_schedule(q, &q_item);

    // Wait for the flush item to run. Once it has run, all of the previously
    // queued items in front of it will have run, so the flush is complete.
    wait_for_completion(&completion);
}

void nv_kthread_q_flush(nv_kthread_q_t *q)
{
    if (unlikely(atomic_read(&q->main_loop_should_exit))) {
        NVQ_WARN("Not allowed: nv_kthread_q_flush was called after "
                 "nv_kthread_q_stop. q: 0x%p\n", q);
        return;
    }

    // This 2x flush is not a typo. The queue really does have to be flushed
    // twice, in order to take care of the case of a q_item that reschedules
    // itself.
    _raw_q_flush(q);
    _raw_q_flush(q);
}
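
// As an example of why a single flush does not suffice (sketch only;
// my_selfresched_func, my_q, and my_q_item are hypothetical):
//
//     static void my_selfresched_func(void *args)
//     {
//         // Requeue this same item one more time:
//         nv_kthread_q_schedule_q_item(&my_q, &my_q_item);
//     }
//
// An item like this that is pending when nv_kthread_q_flush is called may run
// and requeue itself behind the first flush item, so the requeued instance has
// not necessarily run when the first _raw_q_flush returns. The second
// _raw_q_flush is queued behind the requeued instance, and therefore waits for
// it. (An item that reschedules itself indefinitely can never be flushed;
// callers are responsible for stopping the rescheduling first.)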