/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008, 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define	LINUXKPI_PARAM_PREFIX mlx4_

#include <linux/workqueue.h>
#include <linux/module.h>

#include <asm/byteorder.h>

#include "mlx4.h"

#define MLX4_CATAS_POLL_INTERVAL	(5 * HZ)

int mlx4_internal_err_reset = 1;
module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644);
MODULE_PARM_DESC(internal_err_reset,
		 "Reset device on internal errors if non-zero (default 1)");

static int read_vendor_id(struct mlx4_dev *dev)
{
	u16 vendor_id = 0;
	int ret;

	ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
	if (ret) {
		mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
		return ret;
	}

	if (vendor_id == 0xffff) {
		mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
		return -EINVAL;
	}

	return 0;
}

static int mlx4_reset_master(struct mlx4_dev *dev)
{
	int err = 0;

	if (mlx4_is_master(dev))
		mlx4_report_internal_err_comm_event(dev);

	if (!pci_channel_offline(dev->persist->pdev)) {
		err = read_vendor_id(dev);
		/* If PCI can't be accessed to read vendor ID we assume that its
		 * link was disabled and chip was already reset.
		 */
		if (err)
			return 0;

		err = mlx4_reset(dev);
		if (err)
			mlx4_err(dev, "Fail to reset HCA\n");
	}

	return err;
}

static int mlx4_reset_slave(struct mlx4_dev *dev)
{
#define COM_CHAN_RST_REQ_OFFSET 0x10
#define COM_CHAN_RST_ACK_OFFSET 0x08

	u32 comm_flags;
	u32 rst_req;
	u32 rst_ack;
	unsigned long end;
	struct mlx4_priv *priv = mlx4_priv(dev);

	if (pci_channel_offline(dev->persist->pdev))
		return 0;

	comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
				  MLX4_COMM_CHAN_FLAGS));
	if (comm_flags == 0xffffffff) {
		mlx4_err(dev, "VF reset is not needed\n");
		return 0;
	}

	if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) {
		mlx4_err(dev, "VF reset is not supported\n");
		return -EOPNOTSUPP;
	}

	rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
		COM_CHAN_RST_REQ_OFFSET;
	rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
		COM_CHAN_RST_ACK_OFFSET;
	if (rst_req != rst_ack) {
		mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n");
		return -EIO;
	}

	rst_req ^= 1;
	mlx4_warn(dev, "VF is sending reset request to Firmware\n");
	comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET;
	__raw_writel((__force u32)cpu_to_be32(comm_flags),
		     (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS);
	/* Make sure that our comm channel write doesn't
	 * get mixed in with writes from another CPU.
	 */
	mmiowb();

	end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies;
	while (time_before(jiffies, end)) {
		comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
					  MLX4_COMM_CHAN_FLAGS));
		rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
			COM_CHAN_RST_ACK_OFFSET;

		/* Reading rst_req again since the communication channel can
		 * be reset at any time by the PF and all its bits will be
		 * set to zero.
		 */
		rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
			COM_CHAN_RST_REQ_OFFSET;

		if (rst_ack == rst_req) {
			mlx4_warn(dev, "VF Reset succeed\n");
			return 0;
		}
		cond_resched();
	}
	mlx4_err(dev, "Fail to send reset over the communication channel\n");
	return -ETIMEDOUT;
}

static int mlx4_comm_internal_err(u32 slave_read)
{
	return (u32)COMM_CHAN_EVENT_INTERNAL_ERR ==
		(slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0;
}
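/*
 * Note on the VF reset handshake in mlx4_reset_slave() above (descriptive
 * summary only, derived from the code in this file):
 *
 *  - MLX4_COMM_CHAN_FLAGS carries a reset-request bit at bit position
 *    COM_CHAN_RST_REQ_OFFSET (16), toggled by the VF, and a reset-ack bit
 *    at COM_CHAN_RST_ACK_OFFSET (8), toggled by the PF/firmware side.
 *  - A read of all-ones (0xffffffff) means the channel itself is no longer
 *    accessible, so no reset request is sent.
 *  - The VF only toggles the request bit while both bits agree, then polls
 *    for up to MLX4_COMM_TIME milliseconds for the ack bit to be toggled to
 *    match; the request bit is re-read on every iteration because the PF
 *    may clear the whole register at any time.
 */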
void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
{
	int err;
	struct mlx4_dev *dev;

	if (!mlx4_internal_err_reset)
		return;

	mutex_lock(&persist->device_state_mutex);
	if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
		goto out;

	dev = persist->dev;
	mlx4_err(dev, "device is going to be reset\n");
	if (mlx4_is_slave(dev))
		err = mlx4_reset_slave(dev);
	else
		err = mlx4_reset_master(dev);
	BUG_ON(err != 0);

	dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
	mlx4_err(dev, "device was reset successfully\n");
	mutex_unlock(&persist->device_state_mutex);

	/* At that step HW was already reset, now notify clients */
	mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
	mlx4_cmd_wake_completions(dev);
	return;

out:
	mutex_unlock(&persist->device_state_mutex);
}

static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
{
	int err = 0;

	mlx4_enter_error_state(persist);
	mutex_lock(&persist->interface_state_mutex);
	if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
	    !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
		err = mlx4_restart_one(persist->pdev);
		mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
			  err);
	}
	mutex_unlock(&persist->interface_state_mutex);
}

static void dump_err_buf(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	int i;

	mlx4_err(dev, "Internal error detected:\n");
	for (i = 0; i < priv->fw.catas_size; ++i)
		mlx4_err(dev, " buf[%02x]: %08x\n",
			 i, swab32(readl(priv->catas_err.map + i)));
}

static void poll_catas(unsigned long dev_ptr)
{
	struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr;
	struct mlx4_priv *priv = mlx4_priv(dev);
	u32 slave_read;

	if (mlx4_is_slave(dev)) {
		slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
		if (mlx4_comm_internal_err(slave_read)) {
			mlx4_warn(dev, "Internal error detected on the communication channel\n");
			goto internal_err;
		}
	} else if (readl(priv->catas_err.map)) {
		dump_err_buf(dev);
		goto internal_err;
	}

	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
		mlx4_warn(dev, "Internal error mark was detected on device\n");
		goto internal_err;
	}

	mod_timer(&priv->catas_err.timer,
		  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
	return;

internal_err:
	if (mlx4_internal_err_reset)
		queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
}

static void catas_reset(struct work_struct *work)
{
	struct mlx4_dev_persistent *persist =
		container_of(work, struct mlx4_dev_persistent,
			     catas_work);

	mlx4_handle_error_state(persist);
}

void mlx4_start_catas_poll(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);
	phys_addr_t addr;

	INIT_LIST_HEAD(&priv->catas_err.list);
	init_timer(&priv->catas_err.timer);
	priv->catas_err.map = NULL;

	if (!mlx4_is_slave(dev)) {
		addr = pci_resource_start(dev->persist->pdev,
					  priv->fw.catas_bar) +
			priv->fw.catas_offset;

		priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
		if (!priv->catas_err.map) {
			mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
				  (unsigned long long)addr);
			return;
		}
	}

	priv->catas_err.timer.data = (unsigned long) dev;
	priv->catas_err.timer.function = poll_catas;
	priv->catas_err.timer.expires =
		round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
	add_timer(&priv->catas_err.timer);
}
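/*
 * Illustrative lifecycle of the catastrophic-error machinery in this file
 * (a sketch only; the actual call sites live elsewhere in the driver):
 *
 *	mlx4_catas_init(dev);		create the "mlx4_health" workqueue
 *	mlx4_start_catas_poll(dev);	map the error buffer (PF only) and
 *					arm the poll_catas() timer (5 * HZ)
 *	...
 *	mlx4_stop_catas_poll(dev);	del_timer_sync() and iounmap()
 *	mlx4_catas_end(dev);		destroy the workqueue
 */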
void mlx4_stop_catas_poll(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);

	del_timer_sync(&priv->catas_err.timer);

	if (priv->catas_err.map) {
		iounmap(priv->catas_err.map);
		priv->catas_err.map = NULL;
	}

	if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION)
		flush_workqueue(dev->persist->catas_wq);
}

int mlx4_catas_init(struct mlx4_dev *dev)
{
	INIT_WORK(&dev->persist->catas_work, catas_reset);
	dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health");
	if (!dev->persist->catas_wq)
		return -ENOMEM;

	return 0;
}

void mlx4_catas_end(struct mlx4_dev *dev)
{
	if (dev->persist->catas_wq) {
		destroy_workqueue(dev->persist->catas_wq);
		dev->persist->catas_wq = NULL;
	}
}
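/*
 * Summary of the error-recovery flow implemented above (for reference):
 *
 *	poll_catas() detects an error and, if mlx4_internal_err_reset is set,
 *	queues catas_work on catas_wq;
 *	catas_reset() -> mlx4_handle_error_state() ->
 *	mlx4_enter_error_state(): resets the HCA (master or VF path), marks
 *	the device MLX4_DEVICE_STATE_INTERNAL_ERROR and dispatches
 *	MLX4_DEV_EVENT_CATASTROPHIC_ERROR to clients;
 *	mlx4_restart_one() then runs only while the interface is UP and not
 *	marked for deletion.
 *
 * Setting the internal_err_reset module parameter to 0 disables both the
 * automatic reset and the queueing of the reset work.
 */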