1 /*- 2 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 * $FreeBSD$ 26 */ 27 28 #include <linux/kernel.h> 29 #include <linux/module.h> 30 #include <linux/random.h> 31 #include <linux/vmalloc.h> 32 #include <linux/hardirq.h> 33 #include <dev/mlx5/driver.h> 34 #include <dev/mlx5/mlx5_ifc.h> 35 #include "mlx5_core.h" 36 37 #define MLX5_HEALTH_POLL_INTERVAL (2 * HZ) 38 #define MAX_MISSES 3 39 40 enum { 41 MLX5_NIC_IFC_FULL = 0, 42 MLX5_NIC_IFC_DISABLED = 1, 43 MLX5_NIC_IFC_NO_DRAM_NIC = 2 44 }; 45 46 enum { 47 MLX5_DROP_NEW_HEALTH_WORK, 48 }; 49 50 static u8 get_nic_interface(struct mlx5_core_dev *dev) 51 { 52 return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; 53 } 54 55 static void mlx5_trigger_cmd_completions(struct mlx5_core_dev *dev) 56 { 57 unsigned long flags; 58 u64 vector; 59 60 /* wait for pending handlers to complete */ 61 synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); 62 spin_lock_irqsave(&dev->cmd.alloc_lock, flags); 63 vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); 64 if (!vector) 65 goto no_trig; 66 67 vector |= MLX5_TRIGGERED_CMD_COMP; 68 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 69 70 mlx5_core_dbg(dev, "vector 0x%lx\n", vector); 71 mlx5_cmd_comp_handler(dev, vector); 72 return; 73 74 no_trig: 75 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 76 } 77 78 static int in_fatal(struct mlx5_core_dev *dev) 79 { 80 struct mlx5_core_health *health = &dev->priv.health; 81 struct mlx5_health_buffer __iomem *h = health->health; 82 83 if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED) 84 return 1; 85 86 if (ioread32be(&h->fw_ver) == 0xffffffff) 87 return 1; 88 89 return 0; 90 } 91 92 void mlx5_enter_error_state(struct mlx5_core_dev *dev) 93 { 94 mutex_lock(&dev->intf_state_mutex); 95 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { 96 goto unlock; 97 return; 98 } 99 100 mlx5_core_err(dev, "start\n"); 101 if (pci_channel_offline(dev->pdev) || in_fatal(dev)) { 102 dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; 103 mlx5_trigger_cmd_completions(dev); 104 } 105 106 mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0); 107 mlx5_core_err(dev, "end\n"); 108 109 unlock: 110 mutex_unlock(&dev->intf_state_mutex); 111 } 112 113 static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) 114 { 115 u8 nic_interface = get_nic_interface(dev); 116 117 switch (nic_interface) { 118 case MLX5_NIC_IFC_FULL: 119 mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n"); 120 break; 121 122 case MLX5_NIC_IFC_DISABLED: 123 mlx5_core_warn(dev, "starting teardown\n"); 124 break; 125 126 case MLX5_NIC_IFC_NO_DRAM_NIC: 127 mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); 128 break; 129 default: 130 mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", 131 nic_interface); 132 } 133 134 mlx5_disable_device(dev); 135 } 136 137 static void health_care(struct work_struct *work) 138 { 139 struct mlx5_core_health *health; 140 struct mlx5_core_dev *dev; 141 struct mlx5_priv *priv; 142 143 health = container_of(work, struct mlx5_core_health, work); 144 priv = container_of(health, struct mlx5_priv, health); 145 dev = container_of(priv, struct mlx5_core_dev, priv); 146 mlx5_core_warn(dev, "handling bad device here\n"); 147 mlx5_handle_bad_state(dev); 148 } 149 150 static int get_next_poll_jiffies(void) 151 { 152 unsigned long next; 153 154 get_random_bytes(&next, sizeof(next)); 155 next %= HZ; 156 next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 157 158 return next; 159 } 160 161 static const char *hsynd_str(u8 synd) 162 { 163 switch (synd) { 164 case MLX5_HEALTH_SYNDR_FW_ERR: 165 return "firmware internal error"; 166 case MLX5_HEALTH_SYNDR_IRISC_ERR: 167 return "irisc not responding"; 168 case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: 169 return "unrecoverable hardware error"; 170 case MLX5_HEALTH_SYNDR_CRC_ERR: 171 return "firmware CRC error"; 172 case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 173 return "ICM fetch PCI error"; 174 case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 175 return "HW fatal error\n"; 176 case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 177 return "async EQ buffer overrun"; 178 case MLX5_HEALTH_SYNDR_EQ_ERR: 179 return "EQ error"; 180 case MLX5_HEALTH_SYNDR_EQ_INV: 181 return "Invalid EQ referenced"; 182 case MLX5_HEALTH_SYNDR_FFSER_ERR: 183 return "FFSER error"; 184 case MLX5_HEALTH_SYNDR_HIGH_TEMP: 185 return "High temprature"; 186 default: 187 return "unrecognized error"; 188 } 189 } 190 191 static void print_health_info(struct mlx5_core_dev *dev) 192 { 193 struct mlx5_core_health *health = &dev->priv.health; 194 struct mlx5_health_buffer __iomem *h = health->health; 195 char fw_str[18]; 196 u32 fw; 197 int i; 198 199 /* If the syndrom is 0, the device is OK and no need to print buffer */ 200 if (!ioread8(&h->synd)) 201 return; 202 203 for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) 204 printf("mlx5_core: INFO: ""assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); 205 206 printf("mlx5_core: INFO: ""assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); 207 printf("mlx5_core: INFO: ""assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); 208 snprintf(fw_str, sizeof(fw_str), "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); 209 printf("mlx5_core: INFO: ""fw_ver %s\n", fw_str); 210 printf("mlx5_core: INFO: ""hw_id 0x%08x\n", ioread32be(&h->hw_id)); 211 printf("mlx5_core: INFO: ""irisc_index %d\n", ioread8(&h->irisc_index)); 212 printf("mlx5_core: INFO: ""synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); 213 printf("mlx5_core: INFO: ""ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); 214 fw = ioread32be(&h->fw_ver); 215 printf("mlx5_core: INFO: ""raw fw_ver 0x%08x\n", fw); 216 } 217 218 static void poll_health(unsigned long data) 219 { 220 struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 221 struct mlx5_core_health *health = &dev->priv.health; 222 u32 count; 223 224 if (dev->state != MLX5_DEVICE_STATE_UP) 225 return; 226 227 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { 228 mod_timer(&health->timer, get_next_poll_jiffies()); 229 return; 230 } 231 232 count = ioread32be(health->health_counter); 233 if (count == health->prev) 234 ++health->miss_counter; 235 else 236 health->miss_counter = 0; 237 238 health->prev = count; 239 if (health->miss_counter == MAX_MISSES) { 240 mlx5_core_err(dev, "device's health compromised - reached miss count\n"); 241 print_health_info(dev); 242 } else { 243 mod_timer(&health->timer, get_next_poll_jiffies()); 244 } 245 246 if (in_fatal(dev) && !health->sick) { 247 health->sick = true; 248 print_health_info(dev); 249 spin_lock(&health->wq_lock); 250 if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) 251 queue_work(health->wq, &health->work); 252 else 253 dev_err(&dev->pdev->dev, 254 "new health works are not permitted at this stage\n"); 255 spin_unlock(&health->wq_lock); 256 } 257 } 258 259 void mlx5_start_health_poll(struct mlx5_core_dev *dev) 260 { 261 struct mlx5_core_health *health = &dev->priv.health; 262 263 init_timer(&health->timer); 264 health->sick = 0; 265 clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 266 health->health = &dev->iseg->health; 267 health->health_counter = &dev->iseg->health_counter; 268 269 setup_timer(&health->timer, poll_health, (unsigned long)dev); 270 mod_timer(&health->timer, 271 round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL)); 272 } 273 274 void mlx5_stop_health_poll(struct mlx5_core_dev *dev) 275 { 276 struct mlx5_core_health *health = &dev->priv.health; 277 278 del_timer_sync(&health->timer); 279 } 280 281 void mlx5_drain_health_wq(struct mlx5_core_dev *dev) 282 { 283 struct mlx5_core_health *health = &dev->priv.health; 284 285 spin_lock(&health->wq_lock); 286 set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 287 spin_unlock(&health->wq_lock); 288 cancel_work_sync(&health->work); 289 } 290 291 void mlx5_health_cleanup(struct mlx5_core_dev *dev) 292 { 293 struct mlx5_core_health *health = &dev->priv.health; 294 295 destroy_workqueue(health->wq); 296 } 297 298 #define HEALTH_NAME "mlx5_health" 299 int mlx5_health_init(struct mlx5_core_dev *dev) 300 { 301 struct mlx5_core_health *health; 302 char *name; 303 int len; 304 305 health = &dev->priv.health; 306 len = strlen(HEALTH_NAME) + strlen(dev_name(&dev->pdev->dev)); 307 name = kmalloc(len + 1, GFP_KERNEL); 308 if (!name) 309 return -ENOMEM; 310 311 snprintf(name, len, "%s:%s", HEALTH_NAME, dev_name(&dev->pdev->dev)); 312 health->wq = create_singlethread_workqueue(name); 313 kfree(name); 314 if (!health->wq) 315 return -ENOMEM; 316 317 spin_lock_init(&health->wq_lock); 318 INIT_WORK(&health->work, health_care); 319 320 return 0; 321 } 322