1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2019 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 
13 #include <linux/pci.h>
14 #include <linux/aer.h>
15 #include <linux/module.h>
16 
/* Author string reported through MODULE_AUTHOR() below */
#define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"

/* One-line description reported through MODULE_DESCRIPTION() below */
#define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"

MODULE_AUTHOR(HL_DRIVER_AUTHOR);
MODULE_DESCRIPTION(HL_DRIVER_DESC);
MODULE_LICENSE("GPL v2");
24 
/* Char device major number; set in hl_init() from alloc_chrdev_region() */
static int hl_major;
/* Device class created in hl_init() and handed to hl_device_init() */
static struct class *hl_class;
/*
 * Maps an ID (used as the char device minor) to its struct hl_device.
 * Each device occupies two IDs, see create_hdev().
 */
static DEFINE_IDR(hl_devs_idr);
/* Held around every lookup/allocation/removal done on hl_devs_idr */
static DEFINE_MUTEX(hl_devs_idr_lock);

/*
 * Defaults of the module parameters declared below. They are copied into
 * each device in create_hdev().
 */
static int timeout_locked = 30;
static int reset_on_lockup = 1;
static int memory_scrub = 1;

module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
	"Device lockup timeout in seconds (0 = disabled, default 30s)");

module_param(reset_on_lockup, int, 0444);
MODULE_PARM_DESC(reset_on_lockup,
	"Do device reset on lockup (0 = no, 1 = yes, default yes)");

module_param(memory_scrub, int, 0444);
MODULE_PARM_DESC(memory_scrub,
	"Scrub device memory in various states (0 = no, 1 = yes, default yes)");
45 
/* PCI vendor id matched by this driver */
#define PCI_VENDOR_ID_HABANALABS	0x1da3

/* PCI device ids; get_asic_type() translates them to enum hl_asic_type */
#define PCI_IDS_GOYA			0x0001
#define PCI_IDS_GAUDI			0x1000
#define PCI_IDS_GAUDI_SEC		0x1010

/* Devices this driver binds to; the zeroed entry terminates the table */
static const struct pci_device_id ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, ids);
59 
60 /*
61  * get_asic_type - translate device id to asic type
62  *
63  * @device: id of the PCI device
64  *
65  * Translate device id to asic type.
66  * In case of unidentified device, return -1
67  */
get_asic_type(u16 device)68 static enum hl_asic_type get_asic_type(u16 device)
69 {
70 	enum hl_asic_type asic_type;
71 
72 	switch (device) {
73 	case PCI_IDS_GOYA:
74 		asic_type = ASIC_GOYA;
75 		break;
76 	case PCI_IDS_GAUDI:
77 		asic_type = ASIC_GAUDI;
78 		break;
79 	case PCI_IDS_GAUDI_SEC:
80 		asic_type = ASIC_GAUDI_SEC;
81 		break;
82 	default:
83 		asic_type = ASIC_INVALID;
84 		break;
85 	}
86 
87 	return asic_type;
88 }
89 
is_asic_secured(enum hl_asic_type asic_type)90 static bool is_asic_secured(enum hl_asic_type asic_type)
91 {
92 	switch (asic_type) {
93 	case ASIC_GAUDI_SEC:
94 		return true;
95 	default:
96 		return false;
97 	}
98 }
99 
100 /*
101  * hl_device_open - open function for habanalabs device
102  *
103  * @inode: pointer to inode structure
104  * @filp: pointer to file structure
105  *
106  * Called when process opens an habanalabs device.
107  */
hl_device_open(struct inode * inode,struct file * filp)108 int hl_device_open(struct inode *inode, struct file *filp)
109 {
110 	enum hl_device_status status;
111 	struct hl_device *hdev;
112 	struct hl_fpriv *hpriv;
113 	int rc;
114 
115 	mutex_lock(&hl_devs_idr_lock);
116 	hdev = idr_find(&hl_devs_idr, iminor(inode));
117 	mutex_unlock(&hl_devs_idr_lock);
118 
119 	if (!hdev) {
120 		pr_err("Couldn't find device %d:%d\n",
121 			imajor(inode), iminor(inode));
122 		return -ENXIO;
123 	}
124 
125 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
126 	if (!hpriv)
127 		return -ENOMEM;
128 
129 	hpriv->hdev = hdev;
130 	filp->private_data = hpriv;
131 	hpriv->filp = filp;
132 	mutex_init(&hpriv->restore_phase_mutex);
133 	kref_init(&hpriv->refcount);
134 	nonseekable_open(inode, filp);
135 
136 	hl_cb_mgr_init(&hpriv->cb_mgr);
137 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
138 
139 	hpriv->taskpid = find_get_pid(current->pid);
140 
141 	mutex_lock(&hdev->fpriv_list_lock);
142 
143 	if (!hl_device_operational(hdev, &status)) {
144 		dev_err_ratelimited(hdev->dev,
145 			"Can't open %s because it is %s\n",
146 			dev_name(hdev->dev), hdev->status[status]);
147 		rc = -EPERM;
148 		goto out_err;
149 	}
150 
151 	if (hdev->in_debug) {
152 		dev_err_ratelimited(hdev->dev,
153 			"Can't open %s because it is being debugged by another user\n",
154 			dev_name(hdev->dev));
155 		rc = -EPERM;
156 		goto out_err;
157 	}
158 
159 	if (hdev->compute_ctx) {
160 		dev_dbg_ratelimited(hdev->dev,
161 			"Can't open %s because another user is working on it\n",
162 			dev_name(hdev->dev));
163 		rc = -EBUSY;
164 		goto out_err;
165 	}
166 
167 	rc = hl_ctx_create(hdev, hpriv);
168 	if (rc) {
169 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
170 		goto out_err;
171 	}
172 
173 	/* Device is IDLE at this point so it is legal to change PLLs.
174 	 * There is no need to check anything because if the PLL is
175 	 * already HIGH, the set function will return without doing
176 	 * anything
177 	 */
178 	hl_device_set_frequency(hdev, PLL_HIGH);
179 
180 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
181 	mutex_unlock(&hdev->fpriv_list_lock);
182 
183 	hl_debugfs_add_file(hpriv);
184 
185 	return 0;
186 
187 out_err:
188 	mutex_unlock(&hdev->fpriv_list_lock);
189 
190 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
191 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
192 	filp->private_data = NULL;
193 	mutex_destroy(&hpriv->restore_phase_mutex);
194 	put_pid(hpriv->taskpid);
195 
196 	kfree(hpriv);
197 
198 	return rc;
199 }
200 
hl_device_open_ctrl(struct inode * inode,struct file * filp)201 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
202 {
203 	struct hl_device *hdev;
204 	struct hl_fpriv *hpriv;
205 	int rc;
206 
207 	mutex_lock(&hl_devs_idr_lock);
208 	hdev = idr_find(&hl_devs_idr, iminor(inode));
209 	mutex_unlock(&hl_devs_idr_lock);
210 
211 	if (!hdev) {
212 		pr_err("Couldn't find device %d:%d\n",
213 			imajor(inode), iminor(inode));
214 		return -ENXIO;
215 	}
216 
217 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
218 	if (!hpriv)
219 		return -ENOMEM;
220 
221 	mutex_lock(&hdev->fpriv_list_lock);
222 
223 	if (!hl_device_operational(hdev, NULL)) {
224 		dev_err_ratelimited(hdev->dev_ctrl,
225 			"Can't open %s because it is disabled or in reset\n",
226 			dev_name(hdev->dev_ctrl));
227 		rc = -EPERM;
228 		goto out_err;
229 	}
230 
231 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
232 	mutex_unlock(&hdev->fpriv_list_lock);
233 
234 	hpriv->hdev = hdev;
235 	filp->private_data = hpriv;
236 	hpriv->filp = filp;
237 	hpriv->is_control = true;
238 	nonseekable_open(inode, filp);
239 
240 	hpriv->taskpid = find_get_pid(current->pid);
241 
242 	return 0;
243 
244 out_err:
245 	mutex_unlock(&hdev->fpriv_list_lock);
246 	kfree(hpriv);
247 	return rc;
248 }
249 
set_driver_behavior_per_device(struct hl_device * hdev)250 static void set_driver_behavior_per_device(struct hl_device *hdev)
251 {
252 	hdev->fw_components = FW_TYPE_ALL_TYPES;
253 	hdev->cpu_queues_enable = 1;
254 	hdev->heartbeat = 1;
255 	hdev->mmu_enable = 1;
256 	hdev->clock_gating_mask = ULONG_MAX;
257 	hdev->sram_scrambler_enable = 1;
258 	hdev->dram_scrambler_enable = 1;
259 	hdev->bmc_enable = 1;
260 	hdev->hard_reset_on_fw_events = 1;
261 	hdev->reset_on_preboot_fail = 1;
262 
263 	hdev->reset_pcilink = 0;
264 	hdev->axi_drain = 0;
265 }
266 
267 /*
268  * create_hdev - create habanalabs device instance
269  *
270  * @dev: will hold the pointer to the new habanalabs device structure
271  * @pdev: pointer to the pci device
272  * @asic_type: in case of simulator device, which device is it
273  * @minor: in case of simulator device, the minor of the device
274  *
275  * Allocate memory for habanalabs device and initialize basic fields
276  * Identify the ASIC type
277  * Allocate ID (minor) for the device (only for real devices)
278  */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)279 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
280 		enum hl_asic_type asic_type, int minor)
281 {
282 	struct hl_device *hdev;
283 	int rc, main_id, ctrl_id = 0;
284 
285 	*dev = NULL;
286 
287 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
288 	if (!hdev)
289 		return -ENOMEM;
290 
291 	/* First, we must find out which ASIC are we handling. This is needed
292 	 * to configure the behavior of the driver (kernel parameters)
293 	 */
294 	if (pdev) {
295 		hdev->asic_type = get_asic_type(pdev->device);
296 		if (hdev->asic_type == ASIC_INVALID) {
297 			dev_err(&pdev->dev, "Unsupported ASIC\n");
298 			rc = -ENODEV;
299 			goto free_hdev;
300 		}
301 	} else {
302 		hdev->asic_type = asic_type;
303 	}
304 
305 	if (pdev)
306 		hdev->asic_prop.fw_security_disabled =
307 				!is_asic_secured(pdev->device);
308 	else
309 		hdev->asic_prop.fw_security_disabled = true;
310 
311 	/* Assign status description string */
312 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
313 					"disabled", HL_STR_MAX);
314 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
315 					"in reset", HL_STR_MAX);
316 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
317 					"needs reset", HL_STR_MAX);
318 
319 	hdev->major = hl_major;
320 	hdev->reset_on_lockup = reset_on_lockup;
321 	hdev->memory_scrub = memory_scrub;
322 	hdev->pldm = 0;
323 
324 	set_driver_behavior_per_device(hdev);
325 
326 	if (timeout_locked)
327 		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
328 	else
329 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
330 
331 	hdev->disabled = true;
332 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
333 
334 	/* Set default DMA mask to 32 bits */
335 	hdev->dma_mask = 32;
336 
337 	mutex_lock(&hl_devs_idr_lock);
338 
339 	/* Always save 2 numbers, 1 for main device and 1 for control.
340 	 * They must be consecutive
341 	 */
342 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
343 				GFP_KERNEL);
344 
345 	if (main_id >= 0)
346 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
347 					main_id + 2, GFP_KERNEL);
348 
349 	mutex_unlock(&hl_devs_idr_lock);
350 
351 	if ((main_id < 0) || (ctrl_id < 0)) {
352 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
353 			pr_err("too many devices in the system\n");
354 
355 		if (main_id >= 0) {
356 			mutex_lock(&hl_devs_idr_lock);
357 			idr_remove(&hl_devs_idr, main_id);
358 			mutex_unlock(&hl_devs_idr_lock);
359 		}
360 
361 		rc = -EBUSY;
362 		goto free_hdev;
363 	}
364 
365 	hdev->id = main_id;
366 	hdev->id_control = ctrl_id;
367 
368 	*dev = hdev;
369 
370 	return 0;
371 
372 free_hdev:
373 	kfree(hdev);
374 	return rc;
375 }
376 
377 /*
378  * destroy_hdev - destroy habanalabs device instance
379  *
380  * @dev: pointer to the habanalabs device structure
381  *
382  */
destroy_hdev(struct hl_device * hdev)383 void destroy_hdev(struct hl_device *hdev)
384 {
385 	/* Remove device from the device list */
386 	mutex_lock(&hl_devs_idr_lock);
387 	idr_remove(&hl_devs_idr, hdev->id);
388 	idr_remove(&hl_devs_idr, hdev->id_control);
389 	mutex_unlock(&hl_devs_idr_lock);
390 
391 	kfree(hdev);
392 }
393 
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: pointer to the generic device structure
 *
 * Return: the result of hl_device_suspend(), or 0 when no habanalabs
 * device is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	/* Nothing to suspend; report it but don't fail the transition */
	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
407 
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: pointer to the generic device structure
 *
 * Return: the result of hl_device_resume(), or 0 when no habanalabs
 * device is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	/* Nothing to resume; report it but don't fail the transition */
	pr_err("device pointer is NULL in resume\n");

	return 0;
}
421 
422 /*
423  * hl_pci_probe - probe PCI habanalabs devices
424  *
425  * @pdev: pointer to pci device
426  * @id: pointer to pci device id structure
427  *
428  * Standard PCI probe function for habanalabs device.
429  * Create a new habanalabs device and initialize it according to the
430  * device's type
431  */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)432 static int hl_pci_probe(struct pci_dev *pdev,
433 				const struct pci_device_id *id)
434 {
435 	struct hl_device *hdev;
436 	int rc;
437 
438 	dev_info(&pdev->dev, HL_NAME
439 		 " device found [%04x:%04x] (rev %x)\n",
440 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
441 
442 	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
443 	if (rc)
444 		return rc;
445 
446 	pci_set_drvdata(pdev, hdev);
447 
448 	pci_enable_pcie_error_reporting(pdev);
449 
450 	rc = hl_device_init(hdev, hl_class);
451 	if (rc) {
452 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
453 		rc = -ENODEV;
454 		goto disable_device;
455 	}
456 
457 	return 0;
458 
459 disable_device:
460 	pci_set_drvdata(pdev, NULL);
461 	destroy_hdev(hdev);
462 
463 	return rc;
464 }
465 
466 /*
467  * hl_pci_remove - remove PCI habanalabs devices
468  *
469  * @pdev: pointer to pci device
470  *
471  * Standard PCI remove function for habanalabs device
472  */
hl_pci_remove(struct pci_dev * pdev)473 static void hl_pci_remove(struct pci_dev *pdev)
474 {
475 	struct hl_device *hdev;
476 
477 	hdev = pci_get_drvdata(pdev);
478 	if (!hdev)
479 		return;
480 
481 	hl_device_fini(hdev);
482 	pci_disable_pcie_error_reporting(pdev);
483 	pci_set_drvdata(pdev, NULL);
484 	destroy_hdev(hdev);
485 }
486 
487 /**
488  * hl_pci_err_detected - a PCI bus error detected on this device
489  *
490  * @pdev: pointer to pci device
491  * @state: PCI error type
492  *
493  * Called by the PCI subsystem whenever a non-correctable
494  * PCI bus error is detected
495  */
496 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)497 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
498 {
499 	struct hl_device *hdev = pci_get_drvdata(pdev);
500 	enum pci_ers_result result;
501 
502 	switch (state) {
503 	case pci_channel_io_normal:
504 		return PCI_ERS_RESULT_CAN_RECOVER;
505 
506 	case pci_channel_io_frozen:
507 		dev_warn(hdev->dev, "frozen state error detected\n");
508 		result = PCI_ERS_RESULT_NEED_RESET;
509 		break;
510 
511 	case pci_channel_io_perm_failure:
512 		dev_warn(hdev->dev, "failure state error detected\n");
513 		result = PCI_ERS_RESULT_DISCONNECT;
514 		break;
515 
516 	default:
517 		result = PCI_ERS_RESULT_NONE;
518 	}
519 
520 	hdev->asic_funcs->halt_engines(hdev, true);
521 
522 	return result;
523 }
524 
525 /**
526  * hl_pci_err_resume - resume after a PCI slot reset
527  *
528  * @pdev: pointer to pci device
529  *
530  */
hl_pci_err_resume(struct pci_dev * pdev)531 static void hl_pci_err_resume(struct pci_dev *pdev)
532 {
533 	struct hl_device *hdev = pci_get_drvdata(pdev);
534 
535 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
536 	hl_device_resume(hdev);
537 }
538 
539 /**
540  * hl_pci_err_slot_reset - a PCI slot reset has just happened
541  *
542  * @pdev: pointer to pci device
543  *
544  * Determine if the driver can recover from the PCI slot reset
545  */
hl_pci_err_slot_reset(struct pci_dev * pdev)546 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
547 {
548 	return PCI_ERS_RESULT_RECOVERED;
549 }
550 
/* Power management callbacks; only suspend and resume are provided */
static const struct dev_pm_ops hl_pm_ops = {
	.suspend = hl_pmops_suspend,
	.resume = hl_pmops_resume,
};
555 
/* PCI error recovery callbacks, hooked up through hl_pci_driver.err_handler */
static const struct pci_error_handlers hl_pci_err_handler = {
	.error_detected = hl_pci_err_detected,
	.slot_reset = hl_pci_err_slot_reset,
	.resume = hl_pci_err_resume,
};
561 
/*
 * PCI driver descriptor, registered in hl_init() and unregistered in
 * hl_exit()
 */
static struct pci_driver hl_pci_driver = {
	.name = HL_NAME,
	.id_table = ids,
	.probe = hl_pci_probe,
	.remove = hl_pci_remove,
	/* Shutdown reuses the remove path, i.e. a full device teardown */
	.shutdown = hl_pci_remove,
	.driver.pm = &hl_pm_ops,
	.err_handler = &hl_pci_err_handler,
};
571 
572 /*
573  * hl_init - Initialize the habanalabs kernel driver
574  */
hl_init(void)575 static int __init hl_init(void)
576 {
577 	int rc;
578 	dev_t dev;
579 
580 	pr_info("loading driver\n");
581 
582 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
583 	if (rc < 0) {
584 		pr_err("unable to get major\n");
585 		return rc;
586 	}
587 
588 	hl_major = MAJOR(dev);
589 
590 	hl_class = class_create(THIS_MODULE, HL_NAME);
591 	if (IS_ERR(hl_class)) {
592 		pr_err("failed to allocate class\n");
593 		rc = PTR_ERR(hl_class);
594 		goto remove_major;
595 	}
596 
597 	hl_debugfs_init();
598 
599 	rc = pci_register_driver(&hl_pci_driver);
600 	if (rc) {
601 		pr_err("failed to register pci device\n");
602 		goto remove_debugfs;
603 	}
604 
605 	pr_debug("driver loaded\n");
606 
607 	return 0;
608 
609 remove_debugfs:
610 	hl_debugfs_fini();
611 	class_destroy(hl_class);
612 remove_major:
613 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
614 	return rc;
615 }
616 
617 /*
618  * hl_exit - Release all resources of the habanalabs kernel driver
619  */
hl_exit(void)620 static void __exit hl_exit(void)
621 {
622 	pci_unregister_driver(&hl_pci_driver);
623 
624 	/*
625 	 * Removing debugfs must be after all devices or simulator devices
626 	 * have been removed because otherwise we get a bug in the
627 	 * debugfs module for referencing NULL objects
628 	 */
629 	hl_debugfs_fini();
630 
631 	class_destroy(hl_class);
632 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
633 
634 	idr_destroy(&hl_devs_idr);
635 
636 	pr_debug("driver removed\n");
637 }
638 
639 module_init(hl_init);
640 module_exit(hl_exit);
641