/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

/**
 * DOC: Overview
 *
 * The GPU scheduler provides entities which allow userspace to push jobs
 * into software queues which are then scheduled on a hardware run queue.
 * The software queues have a priority among them. The scheduler selects the
 * entities from the run queue using a FIFO. The scheduler provides dependency
 * handling features among jobs. The driver is supposed to provide callback
 * functions for backend operations to the scheduler, like submitting a job to
 * the hardware run queue, returning the dependencies of a job, etc.
 *
 * The organisation of the scheduler is the following:
 *
 * 1. Each hw run queue has one scheduler
 * 2. Each scheduler has multiple run queues with different priorities
 *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
 * 3. Each scheduler run queue has a queue of entities to schedule
 * 4. Entities themselves maintain a queue of jobs that will be scheduled on
 *    the hardware.
 *
 * The jobs in an entity are always scheduled in the order in which they were
 * pushed.
 */
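
/*
 * A rough sketch of how a driver is expected to wire this up (illustrative
 * only; the foo_*, ring and ctx names are hypothetical and not part of this
 * file):
 *
 *	// once per hardware ring
 *	drm_sched_init(&ring->sched, &foo_sched_ops, hw_submission,
 *		       hang_limit, timeout, ring->name);
 *
 *	// once per userspace context, per ring
 *	struct drm_sched_rq *rq = &ring->sched.sched_rq[DRM_SCHED_PRIORITY_NORMAL];
 *	drm_sched_entity_init(&ctx->entity, &rq, 1, &ctx->guilty);
 *
 *	// per submission
 *	drm_sched_job_init(&job->base, &ctx->entity, owner);
 *	drm_sched_entity_push_job(&job->base, &ctx->entity);
 */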

#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <uapi/linux/sched/types.h>
#include <drm/drmP.h>
#include <drm/gpu_scheduler.h>
#include <drm/spsc_queue.h>

#define CREATE_TRACE_POINTS
#include "gpu_scheduler_trace.h"

#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity);
static void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);

/**
 * drm_sched_rq_init - initialize a given run queue struct
 *
 * @sched: scheduler instance the run queue belongs to
 * @rq: scheduler run queue
 *
 * Initializes a scheduler runqueue.
 */
static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
			      struct drm_sched_rq *rq)
{
	spin_init(&rq->lock, "dsrql");
	INIT_LIST_HEAD(&rq->entities);
	rq->current_entity = NULL;
	rq->sched = sched;
}

/**
 * drm_sched_rq_add_entity - add an entity
 *
 * @rq: scheduler run queue
 * @entity: scheduler entity
 *
 * Adds a scheduler entity to the run queue.
 */
static void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
				    struct drm_sched_entity *entity)
{
	if (!list_empty(&entity->list))
		return;
	spin_lock(&rq->lock);
	list_add_tail(&entity->list, &rq->entities);
	spin_unlock(&rq->lock);
}

/**
 * drm_sched_rq_remove_entity - remove an entity
 *
 * @rq: scheduler run queue
 * @entity: scheduler entity
 *
 * Removes a scheduler entity from the run queue.
 */
static void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
				       struct drm_sched_entity *entity)
{
	if (list_empty(&entity->list))
		return;
	spin_lock(&rq->lock);
	list_del_init(&entity->list);
	if (rq->current_entity == entity)
		rq->current_entity = NULL;
	spin_unlock(&rq->lock);
}

/**
 * drm_sched_rq_select_entity - Select an entity which could provide a job to run
 *
 * @rq: scheduler run queue to check.
 *
 * Try to find a ready entity, returns NULL if none found.
 */
static struct drm_sched_entity *
drm_sched_rq_select_entity(struct drm_sched_rq *rq)
{
	struct drm_sched_entity *entity;

	spin_lock(&rq->lock);

	entity = rq->current_entity;
	if (entity) {
		list_for_each_entry_continue(entity, &rq->entities, list) {
			if (drm_sched_entity_is_ready(entity)) {
				rq->current_entity = entity;
				spin_unlock(&rq->lock);
				return entity;
			}
		}
	}

	list_for_each_entry(entity, &rq->entities, list) {
		if (drm_sched_entity_is_ready(entity)) {
			rq->current_entity = entity;
			spin_unlock(&rq->lock);
			return entity;
		}

		if (entity == rq->current_entity)
			break;
	}

	spin_unlock(&rq->lock);

	return NULL;
}

/**
 * drm_sched_entity_init - Init a context entity used by the scheduler when
 * submitting jobs to a HW ring.
 *
 * @entity: scheduler entity to init
 * @rq_list: the list of run queues on which jobs from this
 *           entity can be submitted
 * @num_rq_list: number of run queues in @rq_list
 * @guilty: atomic_t set to 1 when a job on this queue
 *          is found to be guilty of causing a timeout
 *
 * Note: @rq_list must contain at least one run queue to schedule the entity.
 *
 * Returns 0 on success or a negative error code on failure.
 */
int drm_sched_entity_init(struct drm_sched_entity *entity,
			  struct drm_sched_rq **rq_list,
			  unsigned int num_rq_list,
			  atomic_t *guilty)
{
	if (!(entity && rq_list && num_rq_list > 0 && rq_list[0]))
		return -EINVAL;

	memset(entity, 0, sizeof(struct drm_sched_entity));
	INIT_LIST_HEAD(&entity->list);
	entity->rq = rq_list[0];
	entity->guilty = guilty;
	entity->last_scheduled = NULL;

	spin_init(&entity->rq_lock, "dserql");
	spsc_queue_init(&entity->job_queue);

	atomic_set(&entity->fence_seq, 0);
	entity->fence_context = dma_fence_context_alloc(2);

	return 0;
}
EXPORT_SYMBOL(drm_sched_entity_init);
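
/*
 * Note (descriptive): dma_fence_context_alloc(2) above reserves two fence
 * contexts per entity, one for the "scheduled" and one for the "finished"
 * fences of its jobs. The dependency handling below
 * (drm_sched_entity_add_dependency_cb()) relies on this to recognise and
 * short-circuit fences that come from the same entity.
 */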

/**
 * drm_sched_entity_is_idle - Check if entity is idle
 *
 * @entity: scheduler entity
 *
 * Returns true if the entity does not have any unscheduled jobs.
 */
static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity)
{
	rmb();

	if (list_empty(&entity->list) ||
	    spsc_queue_peek(&entity->job_queue) == NULL)
		return true;

	return false;
}

/**
 * drm_sched_entity_is_ready - Check if entity is ready
 *
 * @entity: scheduler entity
 *
 * Return true if entity could provide a job.
 */
static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
{
	if (spsc_queue_peek(&entity->job_queue) == NULL)
		return false;

	if (READ_ONCE(entity->dependency))
		return false;

	return true;
}

static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
					  struct dma_fence_cb *cb)
{
	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
						 finish_cb);
	drm_sched_fence_finished(job->s_fence);
	WARN_ON(job->s_fence->parent);
	dma_fence_put(&job->s_fence->finished);
	job->sched->ops->free_job(job);
}

/**
 * drm_sched_entity_flush - Flush a context entity
 *
 * @entity: scheduler entity
 * @timeout: time to wait, in jiffies, for the entity's queue to become empty
 *
 * First half of the drm_sched_entity_fini() split: it does the waiting,
 * removes the entity from the run queue and returns an error when the
 * process was killed.
 *
 * Returns the remaining time in jiffies left from the input timeout.
 */
long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
{
	struct drm_gpu_scheduler *sched;
#if 0
	struct task_struct *last_user;
#endif
	long ret = timeout;

	sched = entity->rq->sched;
	/*
	 * The client will not queue more IBs during this fini; consume
	 * existing queued IBs or discard them on SIGKILL.
	 */
	if (current->dfly_td->td_flags & TDF_EXITING) {
		if (timeout) {
			ret = wait_event_timeout(
					sched->job_scheduled,
					drm_sched_entity_is_idle(entity),
					timeout);
		}
	} else {
		wait_event_interruptible(sched->job_scheduled,
					 drm_sched_entity_is_idle(entity));
	}

	/* For a killed process, disable any further IB enqueue right now */
#if 0
	last_user = cmpxchg(&entity->last_user, current->group_leader, NULL);
#endif
	if (/*(!last_user || last_user == current->group_leader) && */
	    (current->dfly_td->td_flags & TDF_EXITING) &&
	    fatal_signal_pending(current))
		drm_sched_rq_remove_entity(entity->rq, entity);

	return ret;
}
EXPORT_SYMBOL(drm_sched_entity_flush);
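
/*
 * Teardown note (descriptive): drm_sched_entity_flush() is meant to run
 * first, giving already queued jobs a chance to complete, and
 * drm_sched_entity_fini() below then force-signals whatever is left.
 * drm_sched_entity_destroy() simply chains the two, using
 * MAX_WAIT_SCHED_ENTITY_Q_EMPTY as the flush timeout.
 */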

/**
 * drm_sched_entity_fini - Destroy a context entity
 *
 * @entity: scheduler entity
 *
 * This should be called after drm_sched_entity_flush(). It goes over the
 * entity and signals all jobs with an error code if the process was killed.
 */
void drm_sched_entity_fini(struct drm_sched_entity *entity)
{
	struct drm_gpu_scheduler *sched;

	sched = entity->rq->sched;
	drm_sched_rq_remove_entity(entity->rq, entity);

	/* Consumption of existing IBs wasn't completed. Forcefully
	 * remove them here.
	 */
	if (spsc_queue_peek(&entity->job_queue)) {
		struct drm_sched_job *job;
		int r;

		/* Park the kernel thread for a moment to make sure it isn't
		 * processing our entity.
		 */
		kthread_park(sched->thread);
		kthread_unpark(sched->thread);
		if (entity->dependency) {
			dma_fence_remove_callback(entity->dependency,
						  &entity->cb);
			dma_fence_put(entity->dependency);
			entity->dependency = NULL;
		}

		while ((job = to_drm_sched_job(spsc_queue_pop(&entity->job_queue)))) {
			struct drm_sched_fence *s_fence = job->s_fence;
			drm_sched_fence_scheduled(s_fence);
			dma_fence_set_error(&s_fence->finished, -ESRCH);

			/*
			 * When the pipe is hung by an older entity, a new
			 * entity may not even have had a chance to submit
			 * its first job to the HW, so entity->last_scheduled
			 * will remain NULL.
			 */
			if (!entity->last_scheduled) {
				drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
			} else {
				r = dma_fence_add_callback(entity->last_scheduled,
							   &job->finish_cb,
							   drm_sched_entity_kill_jobs_cb);
				if (r == -ENOENT)
					drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb);
				else if (r)
					DRM_ERROR("fence add callback failed (%d)\n", r);
			}
		}
	}

	dma_fence_put(entity->last_scheduled);
	entity->last_scheduled = NULL;
}
EXPORT_SYMBOL(drm_sched_entity_fini);

/**
 * drm_sched_entity_destroy - Destroy a context entity
 *
 * @entity: scheduler entity
 *
 * Calls drm_sched_entity_flush() and drm_sched_entity_fini()
 */
void drm_sched_entity_destroy(struct drm_sched_entity *entity)
{
	drm_sched_entity_flush(entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY);
	drm_sched_entity_fini(entity);
}
EXPORT_SYMBOL(drm_sched_entity_destroy);

static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
{
	struct drm_sched_entity *entity =
		container_of(cb, struct drm_sched_entity, cb);
	entity->dependency = NULL;
	dma_fence_put(f);
	drm_sched_wakeup(entity->rq->sched);
}

static void drm_sched_entity_clear_dep(struct dma_fence *f, struct dma_fence_cb *cb)
{
	struct drm_sched_entity *entity =
		container_of(cb, struct drm_sched_entity, cb);
	entity->dependency = NULL;
	dma_fence_put(f);
}

/**
 * drm_sched_entity_set_rq - Sets the run queue for an entity
 *
 * @entity: scheduler entity
 * @rq: scheduler run queue
 *
 * Sets the run queue for an entity and removes the entity from the run queue
 * it was previously on.
 */
void drm_sched_entity_set_rq(struct drm_sched_entity *entity,
			     struct drm_sched_rq *rq)
{
	if (entity->rq == rq)
		return;

	BUG_ON(!rq);

	spin_lock(&entity->rq_lock);
	drm_sched_rq_remove_entity(entity->rq, entity);
	entity->rq = rq;
	drm_sched_rq_add_entity(rq, entity);
	spin_unlock(&entity->rq_lock);
}
EXPORT_SYMBOL(drm_sched_entity_set_rq);
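
/*
 * Illustrative sketch (hypothetical driver-side code): a priority change for
 * a context usually comes down to moving the entity to another run queue of
 * the same scheduler, e.g.:
 *
 *	struct drm_gpu_scheduler *sched = entity->rq->sched;
 *
 *	drm_sched_entity_set_rq(entity,
 *				&sched->sched_rq[DRM_SCHED_PRIORITY_HIGH_SW]);
 */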

/**
 * drm_sched_dependency_optimized
 *
 * @fence: the dependency fence
 * @entity: the entity which depends on the above fence
 *
 * Returns true if the dependency can be optimized and false otherwise
 */
bool drm_sched_dependency_optimized(struct dma_fence *fence,
				    struct drm_sched_entity *entity)
{
	struct drm_gpu_scheduler *sched = entity->rq->sched;
	struct drm_sched_fence *s_fence;

	if (!fence || dma_fence_is_signaled(fence))
		return false;
	if (fence->context == entity->fence_context)
		return true;
	s_fence = to_drm_sched_fence(fence);
	if (s_fence && s_fence->sched == sched)
		return true;

	return false;
}
EXPORT_SYMBOL(drm_sched_dependency_optimized);

static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity)
{
	struct drm_gpu_scheduler *sched = entity->rq->sched;
	struct dma_fence *fence = entity->dependency;
	struct drm_sched_fence *s_fence;

	if (fence->context == entity->fence_context ||
	    fence->context == entity->fence_context + 1) {
		/*
		 * Fence is a scheduled/finished fence from a job
		 * which belongs to the same entity; we can ignore
		 * fences from ourselves.
		 */
		dma_fence_put(entity->dependency);
		return false;
	}

	s_fence = to_drm_sched_fence(fence);
	if (s_fence && s_fence->sched == sched) {

		/*
		 * Fence is from the same scheduler; we only need to wait
		 * for it to be scheduled.
		 */
		fence = dma_fence_get(&s_fence->scheduled);
		dma_fence_put(entity->dependency);
		entity->dependency = fence;
		if (!dma_fence_add_callback(fence, &entity->cb,
					    drm_sched_entity_clear_dep))
			return true;

		/* Ignore it when it is already scheduled */
		dma_fence_put(fence);
		return false;
	}

	if (!dma_fence_add_callback(entity->dependency, &entity->cb,
				    drm_sched_entity_wakeup))
		return true;

	dma_fence_put(entity->dependency);
	return false;
}

static struct drm_sched_job *
drm_sched_entity_pop_job(struct drm_sched_entity *entity)
{
	struct drm_gpu_scheduler *sched = entity->rq->sched;
	struct drm_sched_job *sched_job = to_drm_sched_job(
					spsc_queue_peek(&entity->job_queue));

	if (!sched_job)
		return NULL;

	while ((entity->dependency = sched->ops->dependency(sched_job, entity)))
		if (drm_sched_entity_add_dependency_cb(entity))
			return NULL;

	/* skip jobs from an entity that was marked guilty */
	if (entity->guilty && atomic_read(entity->guilty))
		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);

	dma_fence_put(entity->last_scheduled);
	entity->last_scheduled = dma_fence_get(&sched_job->s_fence->finished);

	spsc_queue_pop(&entity->job_queue);
	return sched_job;
}

/**
 * drm_sched_entity_push_job - Submit a job to the entity's job queue
 *
 * @sched_job: job to submit
 * @entity: scheduler entity
 *
 * Note: To guarantee that the order of insertion into the queue matches the
 * jobs' fence sequence numbers, this function should be called under a
 * common lock together with drm_sched_job_init().
 */
void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
			       struct drm_sched_entity *entity)
{
	struct drm_gpu_scheduler *sched = sched_job->sched;
	bool first = false;

#if 0
	trace_drm_sched_job(sched_job, entity);
#endif

#if 0
	WRITE_ONCE(entity->last_user, current->group_leader);
#endif
	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);

	/* first job wakes up scheduler */
	if (first) {
		/* Add the entity to the run queue */
		spin_lock(&entity->rq_lock);
		if (!entity->rq) {
			DRM_ERROR("Trying to push to a killed entity\n");
			spin_unlock(&entity->rq_lock);
			return;
		}
		drm_sched_rq_add_entity(entity->rq, entity);
		spin_unlock(&entity->rq_lock);
		drm_sched_wakeup(sched);
	}
}
EXPORT_SYMBOL(drm_sched_entity_push_job);
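
/*
 * Illustrative sketch of the locking note above (ctx and submit_lock are
 * hypothetical driver-side names): keeping drm_sched_job_init() and
 * drm_sched_entity_push_job() under one lock keeps fence sequence numbers
 * in sync with queue order:
 *
 *	mutex_lock(&ctx->submit_lock);
 *	r = drm_sched_job_init(&job->base, &ctx->entity, owner);
 *	if (!r)
 *		drm_sched_entity_push_job(&job->base, &ctx->entity);
 *	mutex_unlock(&ctx->submit_lock);
 */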

/* drm_sched_job_finish() is called after the hw fence has signaled
 */
static void drm_sched_job_finish(struct work_struct *work)
{
	struct drm_sched_job *s_job = container_of(work, struct drm_sched_job,
						   finish_work);
	struct drm_gpu_scheduler *sched = s_job->sched;

	/*
	 * Canceling the timeout without removing our job from the ring mirror
	 * list is safe, as we will only end up in this worker if our job's
	 * finished fence has been signaled. So even if another worker
	 * manages to find this job as the next job in the list, the fence
	 * signaled check below will prevent the timeout from being restarted.
	 */
	cancel_delayed_work_sync(&s_job->work_tdr);

	spin_lock(&sched->job_list_lock);
	/* queue TDR for next job */
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
		struct drm_sched_job *next = list_next_entry(s_job, node);

		if (!dma_fence_is_signaled(&next->s_fence->finished))
			schedule_delayed_work(&next->work_tdr, sched->timeout);
	}
	/* remove job from ring_mirror_list */
	list_del(&s_job->node);
	spin_unlock(&sched->job_list_lock);

	dma_fence_put(&s_job->s_fence->finished);
	sched->ops->free_job(s_job);
}

static void drm_sched_job_finish_cb(struct dma_fence *f,
				    struct dma_fence_cb *cb)
{
	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
						 finish_cb);
	schedule_work(&job->finish_work);
}

static void drm_sched_job_begin(struct drm_sched_job *s_job)
{
	struct drm_gpu_scheduler *sched = s_job->sched;

	dma_fence_add_callback(&s_job->s_fence->finished, &s_job->finish_cb,
			       drm_sched_job_finish_cb);

	spin_lock(&sched->job_list_lock);
	list_add_tail(&s_job->node, &sched->ring_mirror_list);
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    list_first_entry_or_null(&sched->ring_mirror_list,
				     struct drm_sched_job, node) == s_job)
		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
	spin_unlock(&sched->job_list_lock);
}

static void drm_sched_job_timedout(struct work_struct *work)
{
	struct drm_sched_job *job = container_of(work, struct drm_sched_job,
						 work_tdr.work);

	job->sched->ops->timedout_job(job);
}

/**
 * drm_sched_hw_job_reset - stop the scheduler if it contains the bad job
 *
 * @sched: scheduler instance
 * @bad: bad scheduler job
 *
 */
void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *entity, *tmp;
	int i;

	spin_lock(&sched->job_list_lock);
	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
		if (s_job->s_fence->parent &&
		    dma_fence_remove_callback(s_job->s_fence->parent,
					      &s_job->s_fence->cb)) {
			dma_fence_put(s_job->s_fence->parent);
			s_job->s_fence->parent = NULL;
			atomic_dec(&sched->hw_rq_count);
		}
	}
	spin_unlock(&sched->job_list_lock);

	if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
		atomic_inc(&bad->karma);
		/* Don't increase @bad's karma if it's from the KERNEL RQ,
		 * because a GPU hang can corrupt kernel jobs (like VM updating
		 * jobs) as well, but kernel jobs are always considered good.
		 */
		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; i++) {
			struct drm_sched_rq *rq = &sched->sched_rq[i];

			spin_lock(&rq->lock);
			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
				if (bad->s_fence->scheduled.context == entity->fence_context) {
					if (atomic_read(&bad->karma) > bad->sched->hang_limit)
						if (entity->guilty)
							atomic_set(entity->guilty, 1);
					break;
				}
			}
			spin_unlock(&rq->lock);
			if (&entity->list != &rq->entities)
				break;
		}
	}
}
EXPORT_SYMBOL(drm_sched_hw_job_reset);
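
/*
 * Illustrative recovery sketch (foo_hw_reset() and dev are hypothetical):
 * a driver's timeout/reset path typically stops the scheduler, resets the
 * hardware and then resubmits the remaining mirrored jobs:
 *
 *	drm_sched_hw_job_reset(&ring->sched, bad_job);
 *	foo_hw_reset(dev);
 *	drm_sched_job_recovery(&ring->sched);
 */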

/**
 * drm_sched_job_recovery - recover jobs after a reset
 *
 * @sched: scheduler instance
 *
 */
void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job, *tmp;
	bool found_guilty = false;
	int r;

	spin_lock(&sched->job_list_lock);
	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
					 struct drm_sched_job, node);
	if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
		schedule_delayed_work(&s_job->work_tdr, sched->timeout);

	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
		struct drm_sched_fence *s_fence = s_job->s_fence;
		struct dma_fence *fence;
		uint64_t guilty_context;

		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
			found_guilty = true;
			guilty_context = s_job->s_fence->scheduled.context;
		}

		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
			dma_fence_set_error(&s_fence->finished, -ECANCELED);

		spin_unlock(&sched->job_list_lock);
		fence = sched->ops->run_job(s_job);
		atomic_inc(&sched->hw_rq_count);

		if (fence) {
			s_fence->parent = dma_fence_get(fence);
			r = dma_fence_add_callback(fence, &s_fence->cb,
						   drm_sched_process_job);
			if (r == -ENOENT)
				drm_sched_process_job(fence, &s_fence->cb);
			else if (r)
				DRM_ERROR("fence add callback failed (%d)\n",
					  r);
			dma_fence_put(fence);
		} else {
			drm_sched_process_job(NULL, &s_fence->cb);
		}
		spin_lock(&sched->job_list_lock);
	}
	spin_unlock(&sched->job_list_lock);
}
EXPORT_SYMBOL(drm_sched_job_recovery);

/**
 * drm_sched_job_init - init a scheduler job
 *
 * @job: scheduler job to init
 * @entity: scheduler entity to use
 * @owner: job owner for debugging
 *
 * Refer to drm_sched_entity_push_job() documentation
 * for locking considerations.
 *
 * Returns 0 for success, negative error code otherwise.
 */
int drm_sched_job_init(struct drm_sched_job *job,
		       struct drm_sched_entity *entity,
		       void *owner)
{
	struct drm_gpu_scheduler *sched = entity->rq->sched;

	job->sched = sched;
	job->entity = entity;
	job->s_priority = entity->rq - sched->sched_rq;
	job->s_fence = drm_sched_fence_create(entity, owner);
	if (!job->s_fence)
		return -ENOMEM;
	job->id = atomic64_inc_return(&sched->job_id_count);

	INIT_WORK(&job->finish_work, drm_sched_job_finish);
	INIT_LIST_HEAD(&job->node);
	INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);

	return 0;
}
EXPORT_SYMBOL(drm_sched_job_init);
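
/*
 * Note (descriptive): drm_sched_job_init() only sets up bookkeeping and
 * allocates the scheduler fences; nothing is queued towards the hardware
 * until drm_sched_entity_push_job() hands the job to its entity and the
 * scheduler thread picks it up again via drm_sched_entity_pop_job().
 */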

/**
 * drm_sched_ready - is the scheduler ready
 *
 * @sched: scheduler instance
 *
 * Return true if we can push more jobs to the hw, otherwise false.
 */
static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
{
	return atomic_read(&sched->hw_rq_count) <
		sched->hw_submission_limit;
}

/**
 * drm_sched_wakeup - Wake up the scheduler when it is ready
 *
 * @sched: scheduler instance
 *
 */
static void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
{
	if (drm_sched_ready(sched))
		wake_up_interruptible(&sched->wake_up_worker);
}

/**
 * drm_sched_select_entity - Select next entity to process
 *
 * @sched: scheduler instance
 *
 * Returns the entity to process or NULL if none are found.
 */
static struct drm_sched_entity *
drm_sched_select_entity(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_entity *entity;
	int i;

	if (!drm_sched_ready(sched))
		return NULL;

	/* Kernel run queue has higher priority than normal run queue */
	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
		if (entity)
			break;
	}

	return entity;
}

/**
 * drm_sched_process_job - process a job
 *
 * @f: fence
 * @cb: fence callbacks
 *
 * Called after job has finished execution.
 */
static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
{
	struct drm_sched_fence *s_fence =
		container_of(cb, struct drm_sched_fence, cb);
	struct drm_gpu_scheduler *sched = s_fence->sched;

	dma_fence_get(&s_fence->finished);
	atomic_dec(&sched->hw_rq_count);
	drm_sched_fence_finished(s_fence);

#if 0
	trace_drm_sched_process_job(s_fence);
#endif
	dma_fence_put(&s_fence->finished);
	wake_up_interruptible(&sched->wake_up_worker);
}

/**
 * drm_sched_blocked - check if the scheduler is blocked
 *
 * @sched: scheduler instance
 *
 * Returns true if blocked, otherwise false.
 */
static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
{
	if (kthread_should_park()) {
		kthread_parkme();
		return true;
	}

	return false;
}

/**
 * drm_sched_main - main scheduler thread
 *
 * @param: scheduler instance
 *
 * Returns 0.
 */
static int drm_sched_main(void *param)
{
#if 0
	struct sched_param sparam = {.sched_priority = 1};
#endif
	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
	int r;

#if 0
	sched_setscheduler(current, SCHED_FIFO, &sparam);
#endif

	while (!kthread_should_stop()) {
		struct drm_sched_entity *entity = NULL;
		struct drm_sched_fence *s_fence;
		struct drm_sched_job *sched_job;
		struct dma_fence *fence;

		wait_event_interruptible(sched->wake_up_worker,
					 (!drm_sched_blocked(sched) &&
					  (entity = drm_sched_select_entity(sched))) ||
					 kthread_should_stop());

		if (!entity)
			continue;

		sched_job = drm_sched_entity_pop_job(entity);
		if (!sched_job)
			continue;

		s_fence = sched_job->s_fence;

		atomic_inc(&sched->hw_rq_count);
		drm_sched_job_begin(sched_job);

		fence = sched->ops->run_job(sched_job);
		drm_sched_fence_scheduled(s_fence);

		if (fence) {
			s_fence->parent = dma_fence_get(fence);
			r = dma_fence_add_callback(fence, &s_fence->cb,
						   drm_sched_process_job);
			if (r == -ENOENT)
				drm_sched_process_job(fence, &s_fence->cb);
			else if (r)
				DRM_ERROR("fence add callback failed (%d)\n",
					  r);
			dma_fence_put(fence);
		} else {
			drm_sched_process_job(NULL, &s_fence->cb);
		}

		wake_up(&sched->job_scheduled);
	}
	return 0;
}

/**
 * drm_sched_init - Init a gpu scheduler instance
 *
 * @sched: scheduler instance
 * @ops: backend operations for this scheduler
 * @hw_submission: number of hw submissions that can be in flight
 * @hang_limit: number of times to allow a job to hang before dropping it
 * @timeout: timeout value in jiffies for the scheduler
 * @name: name used for debugging
 *
 * Return 0 on success, otherwise error code.
 */
int drm_sched_init(struct drm_gpu_scheduler *sched,
		   const struct drm_sched_backend_ops *ops,
		   unsigned hw_submission,
		   unsigned hang_limit,
		   long timeout,
		   const char *name)
{
	int i;

	sched->ops = ops;
	sched->hw_submission_limit = hw_submission;
	sched->name = name;
	sched->timeout = timeout;
	sched->hang_limit = hang_limit;
	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_MAX; i++)
		drm_sched_rq_init(sched, &sched->sched_rq[i]);

	init_waitqueue_head(&sched->wake_up_worker);
	init_waitqueue_head(&sched->job_scheduled);
	INIT_LIST_HEAD(&sched->ring_mirror_list);
	spin_init(&sched->job_list_lock, "dgsjll");
	atomic_set(&sched->hw_rq_count, 0);
	atomic64_set(&sched->job_id_count, 0);

	/* Each scheduler will run on a separate kernel thread */
	sched->thread = kthread_run(drm_sched_main, sched, sched->name);
	if (IS_ERR(sched->thread)) {
		DRM_ERROR("Failed to create scheduler for %s.\n", name);
		return PTR_ERR(sched->thread);
	}

	return 0;
}
EXPORT_SYMBOL(drm_sched_init);
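
/*
 * Illustrative sketch of a minimal backend (the foo_* callbacks are
 * hypothetical; only the ops actually used by this file are shown):
 *
 *	static const struct drm_sched_backend_ops foo_sched_ops = {
 *		.dependency	= foo_job_dependency,
 *		.run_job	= foo_job_run,
 *		.timedout_job	= foo_job_timedout,
 *		.free_job	= foo_job_free,
 *	};
 *
 *	r = drm_sched_init(&ring->sched, &foo_sched_ops, num_hw_submission,
 *			   hang_limit, msecs_to_jiffies(timeout_ms), ring->name);
 */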

/**
 * drm_sched_fini - Destroy a gpu scheduler
 *
 * @sched: scheduler instance
 *
 * Tears down and cleans up the scheduler.
 */
void drm_sched_fini(struct drm_gpu_scheduler *sched)
{
	if (sched->thread)
		kthread_stop(sched->thread);
}
EXPORT_SYMBOL(drm_sched_fini);