1 /* 2 * Copyright (c) 2017, Mellanox Technologies. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33 #include <linux/module.h> 34 #include <linux/etherdevice.h> 35 #include <linux/mlx5/driver.h> 36 37 #include "mlx5_core.h" 38 #include "lib/mlx5.h" 39 #include "lib/eq.h" 40 #include "fpga/core.h" 41 #include "fpga/conn.h" 42 43 static const char *const mlx5_fpga_error_strings[] = { 44 "Null Syndrome", 45 "Corrupted DDR", 46 "Flash Timeout", 47 "Internal Link Error", 48 "Watchdog HW Failure", 49 "I2C Failure", 50 "Image Changed", 51 "Temperature Critical", 52 }; 53 54 static const char * const mlx5_fpga_qp_error_strings[] = { 55 "Null Syndrome", 56 "Retry Counter Expired", 57 "RNR Expired", 58 }; 59 static struct mlx5_fpga_device *mlx5_fpga_device_alloc(void) 60 { 61 struct mlx5_fpga_device *fdev = NULL; 62 63 fdev = kzalloc(sizeof(*fdev), GFP_KERNEL); 64 if (!fdev) 65 return NULL; 66 67 spin_lock_init(&fdev->state_lock); 68 fdev->state = MLX5_FPGA_STATUS_NONE; 69 return fdev; 70 } 71 72 static const char *mlx5_fpga_image_name(enum mlx5_fpga_image image) 73 { 74 switch (image) { 75 case MLX5_FPGA_IMAGE_USER: 76 return "user"; 77 case MLX5_FPGA_IMAGE_FACTORY: 78 return "factory"; 79 default: 80 return "unknown"; 81 } 82 } 83 84 static const char *mlx5_fpga_device_name(u32 device) 85 { 86 switch (device) { 87 case MLX5_FPGA_DEVICE_KU040: 88 return "ku040"; 89 case MLX5_FPGA_DEVICE_KU060: 90 return "ku060"; 91 case MLX5_FPGA_DEVICE_KU060_2: 92 return "ku060_2"; 93 case MLX5_FPGA_DEVICE_UNKNOWN: 94 default: 95 return "unknown"; 96 } 97 } 98 99 static int mlx5_fpga_device_load_check(struct mlx5_fpga_device *fdev) 100 { 101 struct mlx5_fpga_query query; 102 int err; 103 104 err = mlx5_fpga_query(fdev->mdev, &query); 105 if (err) { 106 mlx5_fpga_err(fdev, "Failed to query status: %d\n", err); 107 return err; 108 } 109 110 fdev->last_admin_image = query.admin_image; 111 fdev->last_oper_image = query.oper_image; 112 113 mlx5_fpga_dbg(fdev, "Status %u; Admin image %u; Oper image %u\n", 114 query.status, query.admin_image, query.oper_image); 115 116 if (query.status != MLX5_FPGA_STATUS_SUCCESS) { 117 mlx5_fpga_err(fdev, "%s image failed to load; status %u\n", 118 mlx5_fpga_image_name(fdev->last_oper_image), 119 query.status); 120 return -EIO; 121 } 122 123 return 0; 124 } 125 126 static int mlx5_fpga_device_brb(struct mlx5_fpga_device *fdev) 127 { 128 int err; 129 struct mlx5_core_dev *mdev = fdev->mdev; 130 131 err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON); 132 if (err) { 133 mlx5_fpga_err(fdev, "Failed to set bypass on: %d\n", err); 134 return err; 135 } 136 err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_RESET_SANDBOX); 137 if (err) { 138 mlx5_fpga_err(fdev, "Failed to reset SBU: %d\n", err); 139 return err; 140 } 141 err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_OFF); 142 if (err) { 143 mlx5_fpga_err(fdev, "Failed to set bypass off: %d\n", err); 144 return err; 145 } 146 return 0; 147 } 148 149 static int mlx5_fpga_event(struct mlx5_fpga_device *, unsigned long, void *); 150 151 static int fpga_err_event(struct notifier_block *nb, unsigned long event, void *eqe) 152 { 153 struct mlx5_fpga_device *fdev = mlx5_nb_cof(nb, struct mlx5_fpga_device, fpga_err_nb); 154 155 return mlx5_fpga_event(fdev, event, eqe); 156 } 157 158 static int fpga_qp_err_event(struct notifier_block *nb, unsigned long event, void *eqe) 159 { 160 struct mlx5_fpga_device *fdev = mlx5_nb_cof(nb, struct mlx5_fpga_device, fpga_qp_err_nb); 161 162 return mlx5_fpga_event(fdev, event, eqe); 163 } 164 165 int mlx5_fpga_device_start(struct mlx5_core_dev *mdev) 166 { 167 struct mlx5_fpga_device *fdev = mdev->fpga; 168 unsigned int max_num_qps; 169 unsigned long flags; 170 u32 fpga_device_id; 171 int err; 172 173 if (!fdev) 174 return 0; 175 176 err = mlx5_fpga_device_load_check(fdev); 177 if (err) 178 goto out; 179 180 err = mlx5_fpga_caps(fdev->mdev); 181 if (err) 182 goto out; 183 184 fpga_device_id = MLX5_CAP_FPGA(fdev->mdev, fpga_device); 185 mlx5_fpga_info(fdev, "%s:%u; %s image, version %u; SBU %06x:%04x version %d\n", 186 mlx5_fpga_device_name(fpga_device_id), 187 fpga_device_id, 188 mlx5_fpga_image_name(fdev->last_oper_image), 189 MLX5_CAP_FPGA(fdev->mdev, image_version), 190 MLX5_CAP_FPGA(fdev->mdev, ieee_vendor_id), 191 MLX5_CAP_FPGA(fdev->mdev, sandbox_product_id), 192 MLX5_CAP_FPGA(fdev->mdev, sandbox_product_version)); 193 194 max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps); 195 if (!max_num_qps) { 196 mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n"); 197 err = -ENOTSUPP; 198 goto out; 199 } 200 201 err = mlx5_core_reserve_gids(mdev, max_num_qps); 202 if (err) 203 goto out; 204 205 MLX5_NB_INIT(&fdev->fpga_err_nb, fpga_err_event, FPGA_ERROR); 206 MLX5_NB_INIT(&fdev->fpga_qp_err_nb, fpga_qp_err_event, FPGA_QP_ERROR); 207 mlx5_eq_notifier_register(fdev->mdev, &fdev->fpga_err_nb); 208 mlx5_eq_notifier_register(fdev->mdev, &fdev->fpga_qp_err_nb); 209 210 err = mlx5_fpga_conn_device_init(fdev); 211 if (err) 212 goto err_rsvd_gid; 213 214 if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) { 215 err = mlx5_fpga_device_brb(fdev); 216 if (err) 217 goto err_conn_init; 218 } 219 220 goto out; 221 222 err_conn_init: 223 mlx5_fpga_conn_device_cleanup(fdev); 224 225 err_rsvd_gid: 226 mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_err_nb); 227 mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_qp_err_nb); 228 mlx5_core_unreserve_gids(mdev, max_num_qps); 229 out: 230 spin_lock_irqsave(&fdev->state_lock, flags); 231 fdev->state = err ? MLX5_FPGA_STATUS_FAILURE : MLX5_FPGA_STATUS_SUCCESS; 232 spin_unlock_irqrestore(&fdev->state_lock, flags); 233 return err; 234 } 235 236 int mlx5_fpga_init(struct mlx5_core_dev *mdev) 237 { 238 struct mlx5_fpga_device *fdev = NULL; 239 240 if (!MLX5_CAP_GEN(mdev, fpga)) { 241 mlx5_core_dbg(mdev, "FPGA capability not present\n"); 242 return 0; 243 } 244 245 mlx5_core_dbg(mdev, "Initializing FPGA\n"); 246 247 fdev = mlx5_fpga_device_alloc(); 248 if (!fdev) 249 return -ENOMEM; 250 251 fdev->mdev = mdev; 252 mdev->fpga = fdev; 253 254 return 0; 255 } 256 257 void mlx5_fpga_device_stop(struct mlx5_core_dev *mdev) 258 { 259 struct mlx5_fpga_device *fdev = mdev->fpga; 260 unsigned int max_num_qps; 261 unsigned long flags; 262 int err; 263 264 if (!fdev) 265 return; 266 267 spin_lock_irqsave(&fdev->state_lock, flags); 268 if (fdev->state != MLX5_FPGA_STATUS_SUCCESS) { 269 spin_unlock_irqrestore(&fdev->state_lock, flags); 270 return; 271 } 272 fdev->state = MLX5_FPGA_STATUS_NONE; 273 spin_unlock_irqrestore(&fdev->state_lock, flags); 274 275 if (fdev->last_oper_image == MLX5_FPGA_IMAGE_USER) { 276 err = mlx5_fpga_ctrl_op(mdev, MLX5_FPGA_CTRL_OPERATION_SANDBOX_BYPASS_ON); 277 if (err) 278 mlx5_fpga_err(fdev, "Failed to re-set SBU bypass on: %d\n", 279 err); 280 } 281 282 mlx5_fpga_conn_device_cleanup(fdev); 283 mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_err_nb); 284 mlx5_eq_notifier_unregister(fdev->mdev, &fdev->fpga_qp_err_nb); 285 286 max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps); 287 mlx5_core_unreserve_gids(mdev, max_num_qps); 288 } 289 290 void mlx5_fpga_cleanup(struct mlx5_core_dev *mdev) 291 { 292 struct mlx5_fpga_device *fdev = mdev->fpga; 293 294 mlx5_fpga_device_stop(mdev); 295 kfree(fdev); 296 mdev->fpga = NULL; 297 } 298 299 static const char *mlx5_fpga_syndrome_to_string(u8 syndrome) 300 { 301 if (syndrome < ARRAY_SIZE(mlx5_fpga_error_strings)) 302 return mlx5_fpga_error_strings[syndrome]; 303 return "Unknown"; 304 } 305 306 static const char *mlx5_fpga_qp_syndrome_to_string(u8 syndrome) 307 { 308 if (syndrome < ARRAY_SIZE(mlx5_fpga_qp_error_strings)) 309 return mlx5_fpga_qp_error_strings[syndrome]; 310 return "Unknown"; 311 } 312 313 static int mlx5_fpga_event(struct mlx5_fpga_device *fdev, 314 unsigned long event, void *eqe) 315 { 316 void *data = ((struct mlx5_eqe *)eqe)->data.raw; 317 const char *event_name; 318 bool teardown = false; 319 unsigned long flags; 320 u8 syndrome; 321 322 switch (event) { 323 case MLX5_EVENT_TYPE_FPGA_ERROR: 324 syndrome = MLX5_GET(fpga_error_event, data, syndrome); 325 event_name = mlx5_fpga_syndrome_to_string(syndrome); 326 break; 327 case MLX5_EVENT_TYPE_FPGA_QP_ERROR: 328 syndrome = MLX5_GET(fpga_qp_error_event, data, syndrome); 329 event_name = mlx5_fpga_qp_syndrome_to_string(syndrome); 330 break; 331 default: 332 return NOTIFY_DONE; 333 } 334 335 spin_lock_irqsave(&fdev->state_lock, flags); 336 switch (fdev->state) { 337 case MLX5_FPGA_STATUS_SUCCESS: 338 mlx5_fpga_warn(fdev, "Error %u: %s\n", syndrome, event_name); 339 teardown = true; 340 break; 341 default: 342 mlx5_fpga_warn_ratelimited(fdev, "Unexpected error event %u: %s\n", 343 syndrome, event_name); 344 } 345 spin_unlock_irqrestore(&fdev->state_lock, flags); 346 /* We tear-down the card's interfaces and functionality because 347 * the FPGA bump-on-the-wire is misbehaving and we lose ability 348 * to communicate with the network. User may still be able to 349 * recover by re-programming or debugging the FPGA 350 */ 351 if (teardown) 352 mlx5_trigger_health_work(fdev->mdev); 353 354 return NOTIFY_OK; 355 } 356