1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Copyright 2016-2019 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 *
7 */
8
9 #define pr_fmt(fmt) "habanalabs: " fmt
10
11 #include "habanalabs.h"
12
13 #include <linux/pci.h>
14 #include <linux/aer.h>
15 #include <linux/module.h>
16
17 #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
18
19 #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
20
21 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
22 MODULE_DESCRIPTION(HL_DRIVER_DESC);
23 MODULE_LICENSE("GPL v2");
24
25 static int hl_major;
26 static struct class *hl_class;
27 static DEFINE_IDR(hl_devs_idr);
28 static DEFINE_MUTEX(hl_devs_idr_lock);
29
30 static int timeout_locked = 30;
31 static int reset_on_lockup = 1;
32 static int memory_scrub = 1;
33
34 module_param(timeout_locked, int, 0444);
35 MODULE_PARM_DESC(timeout_locked,
36 "Device lockup timeout in seconds (0 = disabled, default 30s)");
37
38 module_param(reset_on_lockup, int, 0444);
39 MODULE_PARM_DESC(reset_on_lockup,
40 "Do device reset on lockup (0 = no, 1 = yes, default yes)");
41
42 module_param(memory_scrub, int, 0444);
43 MODULE_PARM_DESC(memory_scrub,
44 "Scrub device memory in various states (0 = no, 1 = yes, default yes)");
45
46 #define PCI_VENDOR_ID_HABANALABS 0x1da3
47
48 #define PCI_IDS_GOYA 0x0001
49 #define PCI_IDS_GAUDI 0x1000
50 #define PCI_IDS_GAUDI_SEC 0x1010
51
52 static const struct pci_device_id ids[] = {
53 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
54 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
55 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
56 { 0, }
57 };
58 MODULE_DEVICE_TABLE(pci, ids);
59
60 /*
61 * get_asic_type - translate device id to asic type
62 *
63 * @device: id of the PCI device
64 *
65 * Translate device id to asic type.
 * In case of unidentified device, return ASIC_INVALID
67 */
static enum hl_asic_type get_asic_type(u16 device)
{
	/* Each known PCI device ID maps to exactly one ASIC type */
	switch (device) {
	case PCI_IDS_GOYA:
		return ASIC_GOYA;
	case PCI_IDS_GAUDI:
		return ASIC_GAUDI;
	case PCI_IDS_GAUDI_SEC:
		return ASIC_GAUDI_SEC;
	default:
		/* Unknown device ID */
		return ASIC_INVALID;
	}
}
89
is_asic_secured(enum hl_asic_type asic_type)90 static bool is_asic_secured(enum hl_asic_type asic_type)
91 {
92 switch (asic_type) {
93 case ASIC_GAUDI_SEC:
94 return true;
95 default:
96 return false;
97 }
98 }
99
100 /*
101 * hl_device_open - open function for habanalabs device
102 *
103 * @inode: pointer to inode structure
104 * @filp: pointer to file structure
105 *
 * Called when a process opens a habanalabs device.
107 */
hl_device_open(struct inode * inode,struct file * filp)108 int hl_device_open(struct inode *inode, struct file *filp)
109 {
110 enum hl_device_status status;
111 struct hl_device *hdev;
112 struct hl_fpriv *hpriv;
113 int rc;
114
115 mutex_lock(&hl_devs_idr_lock);
116 hdev = idr_find(&hl_devs_idr, iminor(inode));
117 mutex_unlock(&hl_devs_idr_lock);
118
119 if (!hdev) {
120 pr_err("Couldn't find device %d:%d\n",
121 imajor(inode), iminor(inode));
122 return -ENXIO;
123 }
124
125 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
126 if (!hpriv)
127 return -ENOMEM;
128
129 hpriv->hdev = hdev;
130 filp->private_data = hpriv;
131 hpriv->filp = filp;
132 mutex_init(&hpriv->restore_phase_mutex);
133 kref_init(&hpriv->refcount);
134 nonseekable_open(inode, filp);
135
136 hl_cb_mgr_init(&hpriv->cb_mgr);
137 hl_ctx_mgr_init(&hpriv->ctx_mgr);
138
139 hpriv->taskpid = find_get_pid(current->pid);
140
141 mutex_lock(&hdev->fpriv_list_lock);
142
143 if (!hl_device_operational(hdev, &status)) {
144 dev_err_ratelimited(hdev->dev,
145 "Can't open %s because it is %s\n",
146 dev_name(hdev->dev), hdev->status[status]);
147 rc = -EPERM;
148 goto out_err;
149 }
150
151 if (hdev->in_debug) {
152 dev_err_ratelimited(hdev->dev,
153 "Can't open %s because it is being debugged by another user\n",
154 dev_name(hdev->dev));
155 rc = -EPERM;
156 goto out_err;
157 }
158
159 if (hdev->compute_ctx) {
160 dev_dbg_ratelimited(hdev->dev,
161 "Can't open %s because another user is working on it\n",
162 dev_name(hdev->dev));
163 rc = -EBUSY;
164 goto out_err;
165 }
166
167 rc = hl_ctx_create(hdev, hpriv);
168 if (rc) {
169 dev_err(hdev->dev, "Failed to create context %d\n", rc);
170 goto out_err;
171 }
172
173 /* Device is IDLE at this point so it is legal to change PLLs.
174 * There is no need to check anything because if the PLL is
175 * already HIGH, the set function will return without doing
176 * anything
177 */
178 hl_device_set_frequency(hdev, PLL_HIGH);
179
180 list_add(&hpriv->dev_node, &hdev->fpriv_list);
181 mutex_unlock(&hdev->fpriv_list_lock);
182
183 hl_debugfs_add_file(hpriv);
184
185 return 0;
186
187 out_err:
188 mutex_unlock(&hdev->fpriv_list_lock);
189
190 hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
191 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
192 filp->private_data = NULL;
193 mutex_destroy(&hpriv->restore_phase_mutex);
194 put_pid(hpriv->taskpid);
195
196 kfree(hpriv);
197
198 return rc;
199 }
200
hl_device_open_ctrl(struct inode * inode,struct file * filp)201 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
202 {
203 struct hl_device *hdev;
204 struct hl_fpriv *hpriv;
205 int rc;
206
207 mutex_lock(&hl_devs_idr_lock);
208 hdev = idr_find(&hl_devs_idr, iminor(inode));
209 mutex_unlock(&hl_devs_idr_lock);
210
211 if (!hdev) {
212 pr_err("Couldn't find device %d:%d\n",
213 imajor(inode), iminor(inode));
214 return -ENXIO;
215 }
216
217 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
218 if (!hpriv)
219 return -ENOMEM;
220
221 mutex_lock(&hdev->fpriv_list_lock);
222
223 if (!hl_device_operational(hdev, NULL)) {
224 dev_err_ratelimited(hdev->dev_ctrl,
225 "Can't open %s because it is disabled or in reset\n",
226 dev_name(hdev->dev_ctrl));
227 rc = -EPERM;
228 goto out_err;
229 }
230
231 list_add(&hpriv->dev_node, &hdev->fpriv_list);
232 mutex_unlock(&hdev->fpriv_list_lock);
233
234 hpriv->hdev = hdev;
235 filp->private_data = hpriv;
236 hpriv->filp = filp;
237 hpriv->is_control = true;
238 nonseekable_open(inode, filp);
239
240 hpriv->taskpid = find_get_pid(current->pid);
241
242 return 0;
243
244 out_err:
245 mutex_unlock(&hdev->fpriv_list_lock);
246 kfree(hpriv);
247 return rc;
248 }
249
set_driver_behavior_per_device(struct hl_device * hdev)250 static void set_driver_behavior_per_device(struct hl_device *hdev)
251 {
252 hdev->fw_components = FW_TYPE_ALL_TYPES;
253 hdev->cpu_queues_enable = 1;
254 hdev->heartbeat = 1;
255 hdev->mmu_enable = 1;
256 hdev->clock_gating_mask = ULONG_MAX;
257 hdev->sram_scrambler_enable = 1;
258 hdev->dram_scrambler_enable = 1;
259 hdev->bmc_enable = 1;
260 hdev->hard_reset_on_fw_events = 1;
261 hdev->reset_on_preboot_fail = 1;
262
263 hdev->reset_pcilink = 0;
264 hdev->axi_drain = 0;
265 }
266
267 /*
268 * create_hdev - create habanalabs device instance
269 *
270 * @dev: will hold the pointer to the new habanalabs device structure
271 * @pdev: pointer to the pci device
272 * @asic_type: in case of simulator device, which device is it
273 * @minor: in case of simulator device, the minor of the device
274 *
275 * Allocate memory for habanalabs device and initialize basic fields
276 * Identify the ASIC type
277 * Allocate ID (minor) for the device (only for real devices)
278 */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)279 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
280 enum hl_asic_type asic_type, int minor)
281 {
282 struct hl_device *hdev;
283 int rc, main_id, ctrl_id = 0;
284
285 *dev = NULL;
286
287 hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
288 if (!hdev)
289 return -ENOMEM;
290
291 /* First, we must find out which ASIC are we handling. This is needed
292 * to configure the behavior of the driver (kernel parameters)
293 */
294 if (pdev) {
295 hdev->asic_type = get_asic_type(pdev->device);
296 if (hdev->asic_type == ASIC_INVALID) {
297 dev_err(&pdev->dev, "Unsupported ASIC\n");
298 rc = -ENODEV;
299 goto free_hdev;
300 }
301 } else {
302 hdev->asic_type = asic_type;
303 }
304
305 if (pdev)
306 hdev->asic_prop.fw_security_disabled =
307 !is_asic_secured(pdev->device);
308 else
309 hdev->asic_prop.fw_security_disabled = true;
310
311 /* Assign status description string */
312 strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
313 "disabled", HL_STR_MAX);
314 strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
315 "in reset", HL_STR_MAX);
316 strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
317 "needs reset", HL_STR_MAX);
318
319 hdev->major = hl_major;
320 hdev->reset_on_lockup = reset_on_lockup;
321 hdev->memory_scrub = memory_scrub;
322 hdev->pldm = 0;
323
324 set_driver_behavior_per_device(hdev);
325
326 if (timeout_locked)
327 hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
328 else
329 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
330
331 hdev->disabled = true;
332 hdev->pdev = pdev; /* can be NULL in case of simulator device */
333
334 /* Set default DMA mask to 32 bits */
335 hdev->dma_mask = 32;
336
337 mutex_lock(&hl_devs_idr_lock);
338
339 /* Always save 2 numbers, 1 for main device and 1 for control.
340 * They must be consecutive
341 */
342 main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
343 GFP_KERNEL);
344
345 if (main_id >= 0)
346 ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
347 main_id + 2, GFP_KERNEL);
348
349 mutex_unlock(&hl_devs_idr_lock);
350
351 if ((main_id < 0) || (ctrl_id < 0)) {
352 if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
353 pr_err("too many devices in the system\n");
354
355 if (main_id >= 0) {
356 mutex_lock(&hl_devs_idr_lock);
357 idr_remove(&hl_devs_idr, main_id);
358 mutex_unlock(&hl_devs_idr_lock);
359 }
360
361 rc = -EBUSY;
362 goto free_hdev;
363 }
364
365 hdev->id = main_id;
366 hdev->id_control = ctrl_id;
367
368 *dev = hdev;
369
370 return 0;
371
372 free_hdev:
373 kfree(hdev);
374 return rc;
375 }
376
377 /*
378 * destroy_hdev - destroy habanalabs device instance
379 *
 * @hdev: pointer to the habanalabs device structure
381 *
382 */
destroy_hdev(struct hl_device * hdev)383 void destroy_hdev(struct hl_device *hdev)
384 {
385 /* Remove device from the device list */
386 mutex_lock(&hl_devs_idr_lock);
387 idr_remove(&hl_devs_idr, hdev->id);
388 idr_remove(&hl_devs_idr, hdev->id_control);
389 mutex_unlock(&hl_devs_idr_lock);
390
391 kfree(hdev);
392 }
393
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: pointer to the generic device structure
 *
 * Returns the result of hl_device_suspend(), or 0 if no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	/* Nothing to suspend; report it but do not fail the transition */
	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
407
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: pointer to the generic device structure
 *
 * Returns the result of hl_device_resume(), or 0 if no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	/* Nothing to resume; report it but do not fail the transition */
	pr_err("device pointer is NULL in resume\n");
	return 0;
}
421
422 /*
423 * hl_pci_probe - probe PCI habanalabs devices
424 *
425 * @pdev: pointer to pci device
426 * @id: pointer to pci device id structure
427 *
428 * Standard PCI probe function for habanalabs device.
429 * Create a new habanalabs device and initialize it according to the
430 * device's type
431 */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)432 static int hl_pci_probe(struct pci_dev *pdev,
433 const struct pci_device_id *id)
434 {
435 struct hl_device *hdev;
436 int rc;
437
438 dev_info(&pdev->dev, HL_NAME
439 " device found [%04x:%04x] (rev %x)\n",
440 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
441
442 rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
443 if (rc)
444 return rc;
445
446 pci_set_drvdata(pdev, hdev);
447
448 pci_enable_pcie_error_reporting(pdev);
449
450 rc = hl_device_init(hdev, hl_class);
451 if (rc) {
452 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
453 rc = -ENODEV;
454 goto disable_device;
455 }
456
457 return 0;
458
459 disable_device:
460 pci_set_drvdata(pdev, NULL);
461 destroy_hdev(hdev);
462
463 return rc;
464 }
465
466 /*
467 * hl_pci_remove - remove PCI habanalabs devices
468 *
469 * @pdev: pointer to pci device
470 *
471 * Standard PCI remove function for habanalabs device
472 */
hl_pci_remove(struct pci_dev * pdev)473 static void hl_pci_remove(struct pci_dev *pdev)
474 {
475 struct hl_device *hdev;
476
477 hdev = pci_get_drvdata(pdev);
478 if (!hdev)
479 return;
480
481 hl_device_fini(hdev);
482 pci_disable_pcie_error_reporting(pdev);
483 pci_set_drvdata(pdev, NULL);
484 destroy_hdev(hdev);
485 }
486
487 /**
488 * hl_pci_err_detected - a PCI bus error detected on this device
489 *
490 * @pdev: pointer to pci device
491 * @state: PCI error type
492 *
493 * Called by the PCI subsystem whenever a non-correctable
494 * PCI bus error is detected
495 */
496 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)497 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
498 {
499 struct hl_device *hdev = pci_get_drvdata(pdev);
500 enum pci_ers_result result;
501
502 switch (state) {
503 case pci_channel_io_normal:
504 return PCI_ERS_RESULT_CAN_RECOVER;
505
506 case pci_channel_io_frozen:
507 dev_warn(hdev->dev, "frozen state error detected\n");
508 result = PCI_ERS_RESULT_NEED_RESET;
509 break;
510
511 case pci_channel_io_perm_failure:
512 dev_warn(hdev->dev, "failure state error detected\n");
513 result = PCI_ERS_RESULT_DISCONNECT;
514 break;
515
516 default:
517 result = PCI_ERS_RESULT_NONE;
518 }
519
520 hdev->asic_funcs->halt_engines(hdev, true);
521
522 return result;
523 }
524
525 /**
526 * hl_pci_err_resume - resume after a PCI slot reset
527 *
528 * @pdev: pointer to pci device
529 *
530 */
hl_pci_err_resume(struct pci_dev * pdev)531 static void hl_pci_err_resume(struct pci_dev *pdev)
532 {
533 struct hl_device *hdev = pci_get_drvdata(pdev);
534
535 dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
536 hl_device_resume(hdev);
537 }
538
539 /**
540 * hl_pci_err_slot_reset - a PCI slot reset has just happened
541 *
542 * @pdev: pointer to pci device
543 *
544 * Determine if the driver can recover from the PCI slot reset
545 */
hl_pci_err_slot_reset(struct pci_dev * pdev)546 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
547 {
548 return PCI_ERS_RESULT_RECOVERED;
549 }
550
551 static const struct dev_pm_ops hl_pm_ops = {
552 .suspend = hl_pmops_suspend,
553 .resume = hl_pmops_resume,
554 };
555
556 static const struct pci_error_handlers hl_pci_err_handler = {
557 .error_detected = hl_pci_err_detected,
558 .slot_reset = hl_pci_err_slot_reset,
559 .resume = hl_pci_err_resume,
560 };
561
562 static struct pci_driver hl_pci_driver = {
563 .name = HL_NAME,
564 .id_table = ids,
565 .probe = hl_pci_probe,
566 .remove = hl_pci_remove,
567 .shutdown = hl_pci_remove,
568 .driver.pm = &hl_pm_ops,
569 .err_handler = &hl_pci_err_handler,
570 };
571
572 /*
573 * hl_init - Initialize the habanalabs kernel driver
574 */
hl_init(void)575 static int __init hl_init(void)
576 {
577 int rc;
578 dev_t dev;
579
580 pr_info("loading driver\n");
581
582 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
583 if (rc < 0) {
584 pr_err("unable to get major\n");
585 return rc;
586 }
587
588 hl_major = MAJOR(dev);
589
590 hl_class = class_create(THIS_MODULE, HL_NAME);
591 if (IS_ERR(hl_class)) {
592 pr_err("failed to allocate class\n");
593 rc = PTR_ERR(hl_class);
594 goto remove_major;
595 }
596
597 hl_debugfs_init();
598
599 rc = pci_register_driver(&hl_pci_driver);
600 if (rc) {
601 pr_err("failed to register pci device\n");
602 goto remove_debugfs;
603 }
604
605 pr_debug("driver loaded\n");
606
607 return 0;
608
609 remove_debugfs:
610 hl_debugfs_fini();
611 class_destroy(hl_class);
612 remove_major:
613 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
614 return rc;
615 }
616
617 /*
618 * hl_exit - Release all resources of the habanalabs kernel driver
619 */
hl_exit(void)620 static void __exit hl_exit(void)
621 {
622 pci_unregister_driver(&hl_pci_driver);
623
624 /*
625 * Removing debugfs must be after all devices or simulator devices
626 * have been removed because otherwise we get a bug in the
627 * debugfs module for referencing NULL objects
628 */
629 hl_debugfs_fini();
630
631 class_destroy(hl_class);
632 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
633
634 idr_destroy(&hl_devs_idr);
635
636 pr_debug("driver removed\n");
637 }
638
639 module_init(hl_init);
640 module_exit(hl_exit);
641