1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2020 Joyent, Inc. 14 * Copyright 2022 Tintri by DDN, Inc. All rights reserved. 15 * Copyright 2023 Oxide Computer Company 16 */ 17 18 /* 19 * This file drives topo node enumeration of NVMe controllers. A single "nvme" 20 * node is enumerated for each NVMe controller. Child "disk" nodes are then 21 * enumerated for each active or attached NVMe namespace. 22 * 23 * nvme nodes are expected to be enumerated under either a "bay" node (for U.2 24 * devices) or a "slot" node (for M.2 devices) or a "pciexfn" node (for AIC 25 * devices). 26 * 27 * Enumeration of NVMe controllers on PCIe add-in cards is automatically driven 28 * by the pcibus topo module. 29 * 30 * In order to allow for associating a given NVMe controller with a physical 31 * location, enumeration of U.2 and M.2 devices should be driven by a 32 * platform-specific topo map which statically sets the following two 33 * properties on the parent "bay" or "slot" node: 34 * 35 * propgroup property description 36 * --------- -------- ------------ 37 * binding driver "nvme" 38 * binding parent-device devpath of parent PCIe device 39 * 40 * for example: 41 * 42 * <propgroup name="binding" version="1" name-stability="Private" 43 * data-stability="Private"> 44 * <propval name="driver" type="string" value="nvme"/> 45 * <propval name="parent-device" type="string" 46 * value="/pci@0,0/pci8086,6f09@3,1"/> 47 * </propgroup> 48 * <dependents grouping="children"> 49 * <range name="nvme" min="0" max="0"> 50 * <enum-method name="disk" version="1"/> 51 * </range> 52 * </dependents> 53 */ 54 #include <stdlib.h> 55 #include <sys/types.h> 56 #include <sys/stat.h> 57 #include <fcntl.h> 58 #include <unistd.h> 59 #include <string.h> 60 #include <strings.h> 61 #include <stdbool.h> 62 63 #include <sys/fm/protocol.h> 64 #include <fm/topo_hc.h> 65 #include <fm/topo_mod.h> 66 #include <topo_ufm.h> 67 68 #include <sys/dkio.h> 69 #include <sys/scsi/generic/inquiry.h> 70 71 #include <sys/nvme.h> 72 #include "disk.h" 73 #include "disk_drivers.h" 74 75 typedef struct nvme_enum_info { 76 topo_mod_t *nei_mod; 77 di_node_t nei_dinode; 78 nvme_identify_ctrl_t *nei_idctl; 79 nvme_version_t nei_vers; 80 tnode_t *nei_parent; 81 tnode_t *nei_nvme; 82 nvlist_t *nei_nvme_fmri; 83 const char *nei_nvme_path; 84 int nei_fd; 85 } nvme_enum_info_t; 86 87 typedef struct devlink_arg { 88 topo_mod_t *dla_mod; 89 char *dla_logical_disk; 90 uint_t dla_strsz; 91 } devlink_arg_t; 92 93 static int 94 devlink_cb(di_devlink_t dl, void *arg) 95 { 96 devlink_arg_t *dlarg = (devlink_arg_t *)arg; 97 topo_mod_t *mod = dlarg->dla_mod; 98 const char *devpath; 99 char *slice, *ctds; 100 101 if ((devpath = di_devlink_path(dl)) == NULL || 102 (dlarg->dla_logical_disk = topo_mod_strdup(mod, devpath)) == 103 NULL) { 104 return (DI_WALK_TERMINATE); 105 } 106 107 /* 108 * We need to keep track of the original string size before we 109 * truncate it with a NUL, so that we can free the right number of 110 * bytes when we're done, otherwise libumem will complain. 111 */ 112 dlarg->dla_strsz = strlen(dlarg->dla_logical_disk) + 1; 113 114 /* trim the slice off the public name */ 115 if (((ctds = strrchr(dlarg->dla_logical_disk, '/')) != NULL) && 116 ((slice = strchr(ctds, 's')) != NULL)) 117 *slice = '\0'; 118 119 return (DI_WALK_TERMINATE); 120 } 121 122 static char * 123 get_logical_disk(topo_mod_t *mod, const char *devpath, uint_t *bufsz) 124 { 125 di_devlink_handle_t devhdl; 126 devlink_arg_t dlarg = { 0 }; 127 char *minorpath = NULL; 128 129 if (asprintf(&minorpath, "%s:a", devpath) < 0) { 130 return (NULL); 131 } 132 133 if ((devhdl = di_devlink_init(NULL, 0)) == DI_NODE_NIL) { 134 topo_mod_dprintf(mod, "%s: di_devlink_init failed", __func__); 135 free(minorpath); 136 return (NULL); 137 } 138 139 dlarg.dla_mod = mod; 140 141 (void) di_devlink_walk(devhdl, "^dsk/", minorpath, DI_PRIMARY_LINK, 142 &dlarg, devlink_cb); 143 144 (void) di_devlink_fini(&devhdl); 145 free(minorpath); 146 147 *bufsz = dlarg.dla_strsz; 148 return (dlarg.dla_logical_disk); 149 } 150 151 static bool 152 disk_nvme_make_ns_serial(topo_mod_t *mod, const nvme_identify_nsid_t *id, 153 uint32_t nsid, char *buf, size_t buflen) 154 { 155 uint8_t zero_guid[16] = { 0 }; 156 int ret; 157 158 if (bcmp(zero_guid, id->id_nguid, sizeof (id->id_nguid)) != 0) { 159 ret = snprintf(buf, buflen, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X" 160 "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X", 161 id->id_nguid[0], id->id_nguid[1], id->id_nguid[2], 162 id->id_nguid[3], id->id_nguid[4], id->id_nguid[5], 163 id->id_nguid[6], id->id_nguid[7], id->id_nguid[8], 164 id->id_nguid[9], id->id_nguid[10], id->id_nguid[11], 165 id->id_nguid[12], id->id_nguid[13], id->id_nguid[14], 166 id->id_nguid[15]); 167 } else if (bcmp(zero_guid, id->id_eui64, sizeof (id->id_eui64)) != 0) { 168 ret = snprintf(buf, buflen, 169 "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X", 170 id->id_eui64[0], id->id_eui64[1], id->id_eui64[2], 171 id->id_eui64[3], id->id_eui64[4], id->id_eui64[5], 172 id->id_eui64[6], id->id_eui64[7]); 173 } else { 174 ret = snprintf(buf, buflen, "%u", nsid); 175 } 176 177 if ((size_t)ret >= buflen) { 178 topo_mod_dprintf(mod, "overflowed serial number for nsid %u: " 179 "needed %zu bytes, got %d", nsid, buflen, ret); 180 return (false); 181 } 182 183 return (true); 184 } 185 186 /* 187 * Create the common I/O property group properties that are shared between 188 * controllers and namespaces. We assume the property group was already created. 189 */ 190 static bool 191 disk_nvme_common_io(topo_mod_t *mod, tnode_t *tn, di_node_t di) 192 { 193 int err; 194 int inst = di_instance(di); 195 const char *drv = di_driver_name(di); 196 char *path; 197 const char *ppaths[1]; 198 199 if (inst != -1 && topo_prop_set_uint32(tn, TOPO_PGROUP_IO, 200 TOPO_IO_INSTANCE, TOPO_PROP_IMMUTABLE, (uint32_t)inst, &err) != 0) { 201 topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: " 202 "%s", TOPO_PGROUP_IO, TOPO_IO_INSTANCE, topo_node_name(tn), 203 topo_node_instance(tn), topo_strerror(err)); 204 return (false); 205 } 206 207 if (drv != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO, 208 TOPO_IO_DRIVER, TOPO_PROP_IMMUTABLE, drv, &err) != 0) { 209 topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: " 210 "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn), 211 topo_node_instance(tn), topo_strerror(err)); 212 return (false); 213 } 214 215 if (drv != NULL) { 216 nvlist_t *fmri = topo_mod_modfmri(mod, FM_MOD_SCHEME_VERSION, 217 drv); 218 if (mod != NULL && topo_prop_set_fmri(tn, TOPO_PGROUP_IO, 219 TOPO_IO_MODULE, TOPO_PROP_IMMUTABLE, fmri, &err) != 0) { 220 topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" 221 PRIu64 "]: %s", TOPO_PGROUP_IO, TOPO_IO_MODULE, 222 topo_node_name(tn), topo_node_instance(tn), 223 topo_strerror(err)); 224 nvlist_free(fmri); 225 return (false); 226 } 227 nvlist_free(fmri); 228 } 229 230 path = di_devfs_path(di); 231 ppaths[0] = path; 232 if (path != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO, 233 TOPO_IO_DEV_PATH, TOPO_PROP_IMMUTABLE, path, &err) != 0) { 234 topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: " 235 "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn), 236 topo_node_instance(tn), topo_strerror(err)); 237 di_devfs_path_free(path); 238 return (false); 239 } 240 241 if (path != NULL && topo_prop_set_string_array(tn, TOPO_PGROUP_IO, 242 TOPO_IO_PHYS_PATH, TOPO_PROP_IMMUTABLE, ppaths, 1, &err) != 0) { 243 topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: " 244 "%s", TOPO_PGROUP_IO, TOPO_IO_PHYS_PATH, topo_node_name(tn), 245 topo_node_instance(tn), topo_strerror(err)); 246 di_devfs_path_free(path); 247 return (false); 248 } 249 di_devfs_path_free(path); 250 251 return (true); 252 } 253 254 /* 255 * Add the various storage and I/O property group items that are appropriate 256 * given that we have a devinfo node. The storage property group has already 257 * been created, but the I/O property group has not. 258 */ 259 static void 260 disk_nvme_make_ns_di_props(topo_mod_t *mod, tnode_t *tn, di_node_t di) 261 { 262 int err; 263 char *devid, *mfg, *model, *rev, *serial, *log, *path; 264 uint_t buflen; 265 266 if (di_prop_lookup_strings(DDI_DEV_T_ANY, di, DEVID_PROP_NAME, 267 &devid) != 1 || 268 di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_VENDOR_ID, 269 &mfg) != 1 || 270 di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_PRODUCT_ID, 271 &model) != 1 || 272 di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_REVISION_ID, 273 &rev) != 1 || 274 di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_SERIAL_NO, 275 &serial) != 1) { 276 topo_mod_dprintf(mod, "failed to get devinfo props for %s[%" 277 PRIu64 "]", topo_node_name(tn), topo_node_instance(tn)); 278 return; 279 } 280 281 /* 282 * Set the basic storage manufacturer information. Yes, this is 283 * information really about the NVMe controller and not the namespace. 284 * That's how the storage property group basically works here. 285 */ 286 if (topo_prop_set_string(tn, TOPO_PGROUP_STORAGE, 287 TOPO_STORAGE_MANUFACTURER, TOPO_PROP_IMMUTABLE, mfg, &err) != 0 || 288 topo_prop_set_string(tn, TOPO_PGROUP_STORAGE, 289 TOPO_STORAGE_SERIAL_NUM, TOPO_PROP_IMMUTABLE, serial, &err) != 0 || 290 topo_prop_set_string(tn, TOPO_PGROUP_STORAGE, 291 TOPO_STORAGE_FIRMWARE_REV, TOPO_PROP_IMMUTABLE, rev, &err) != 0 || 292 topo_prop_set_string(tn, TOPO_PGROUP_STORAGE, 293 TOPO_STORAGE_MODEL, TOPO_PROP_IMMUTABLE, model, &err) != 0) { 294 topo_mod_dprintf(mod, "failed to set storage properties on " 295 "%s[%" PRIu64 "]: %s", topo_node_name(tn), 296 topo_node_instance(tn), topo_strerror(err)); 297 return; 298 } 299 300 if (topo_pgroup_create(tn, &io_pgroup, &err) != 0) { 301 topo_mod_dprintf(mod, "failed to create I/O property " 302 "group on %s[%" PRIu64 "]: %s", topo_node_name(tn), 303 topo_node_instance(tn), topo_strerror(err)); 304 } 305 306 if (!disk_nvme_common_io(mod, tn, di)) { 307 return; 308 } 309 310 /* 311 * The last property that we'd like to attempt to create for a namespace 312 * is a mapping back to its corresponding logical disk entry in /dev. 313 * The logical disk will be everything past the trailing /, i.e. a 314 * cXtXdX value. 315 */ 316 path = di_devfs_path(di); 317 if (path == NULL) { 318 return; 319 } 320 log = get_logical_disk(mod, path, &buflen); 321 di_devfs_path_free(path); 322 if (log == NULL) { 323 return; 324 } 325 path = strrchr(log, '/'); 326 if (path != NULL && path[1] != '\0' && 327 topo_prop_set_string(tn, TOPO_PGROUP_STORAGE, 328 TOPO_STORAGE_LOGICAL_DISK_NAME, TOPO_PROP_IMMUTABLE, path + 1, 329 &err) != 0) { 330 topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" 331 PRIu64 "]: %s", TOPO_PGROUP_STORAGE, 332 TOPO_STORAGE_LOGICAL_DISK_NAME, topo_node_name(tn), 333 topo_node_instance(tn), topo_strerror(err)); 334 } 335 topo_mod_free(mod, log, buflen); 336 } 337 338 static void 339 disk_nvme_make_ns(nvme_enum_info_t *nei, uint32_t nsid) 340 { 341 topo_mod_t *mod = nei->nei_mod; 342 nvlist_t *auth = NULL, *fmri = NULL; 343 const topo_instance_t inst = nsid - 1; 344 nvme_ns_info_t info; 345 nvme_ioctl_t ioc; 346 char serial[64], capstr[64]; 347 uint64_t cap, blksz; 348 tnode_t *tn; 349 uint8_t lba; 350 int err; 351 352 bzero(&ioc, sizeof (ioc)); 353 bzero(&info, sizeof (info)); 354 ioc.n_len = sizeof (nvme_ns_info_t); 355 ioc.n_buf = (uintptr_t)&info; 356 ioc.n_arg = nsid; 357 358 if (ioctl(nei->nei_fd, NVME_IOC_NS_INFO, &ioc) != 0) { 359 topo_mod_dprintf(mod, "failed to get namespace info for ns %u: " 360 "%s", nsid, strerror(errno)); 361 return; 362 } 363 364 if ((info.nni_state & NVME_NS_STATE_IGNORED) != 0) { 365 return; 366 } 367 368 if ((info.nni_state & 369 (NVME_NS_STATE_ACTIVE | NVME_NS_STATE_ATTACHED)) == 0) { 370 topo_mod_dprintf(mod, "skipping nsid %u because it is not " 371 "active or attached (state: 0x%x)", nsid, info.nni_state); 372 return; 373 } 374 375 auth = topo_mod_auth(mod, nei->nei_nvme); 376 if (auth == NULL) { 377 topo_mod_dprintf(mod, "failed to get auth for nsid %u from " 378 "parent %s[%" PRIu64 "]: %s", nsid, 379 topo_node_name(nei->nei_nvme), 380 topo_node_instance(nei->nei_nvme), topo_mod_errmsg(mod)); 381 goto done; 382 } 383 384 /* 385 * We want to construct the FMRI for the namespace. The namespace is a 386 * little awkward in terms of things like the model, revision, and 387 * serial. While blkdev sets up standard inquiry properties to map these 388 * to the parent device which makes sense in the context of trying to 389 * use this as a normal block device, it's not really appropriate here. 390 * The namespace is not the NVMe controller. We construct the namespace 391 * serial number from the preferential ordering of information that 392 * we're given of the NGUID, EUI64, and then fall back to the namespace 393 * number. 394 */ 395 if (!disk_nvme_make_ns_serial(mod, &info.nni_id, nsid, serial, 396 sizeof (serial))) { 397 goto done; 398 } 399 fmri = topo_mod_hcfmri(mod, nei->nei_nvme, FM_HC_SCHEME_VERSION, 400 DISK, inst, NULL, auth, NULL, NULL, serial); 401 if (fmri == NULL) { 402 topo_mod_dprintf(mod, "failed to make fmri for %s[%" PRIu64 403 "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod)); 404 goto done; 405 } 406 407 tn = topo_node_bind(mod, nei->nei_nvme, DISK, inst, fmri); 408 if (tn == NULL) { 409 topo_mod_dprintf(mod, "failed to bind fmri for %s[%" PRIu64 410 "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod)); 411 goto done; 412 } 413 414 /* 415 * Always inherit our parent's FRU. The namespace is just a part of the 416 * device in reality. 417 */ 418 if (topo_node_fru_set(tn, NULL, 0, &err) != 0) { 419 topo_mod_dprintf(mod, "failed to set FRU for %s[%" PRIu64 420 "] on nsid %u: %s", DISK, inst, nsid, topo_strerror(err)); 421 goto done; 422 423 } 424 425 /* 426 * Our namespace may or may not be attached. From the namespace we will 427 * always get the capacity and block information. The rest of it will 428 * end up being filled in if we find a devinfo node. 429 */ 430 if (topo_pgroup_create(tn, &storage_pgroup, &err) != 0) { 431 topo_mod_dprintf(mod, "failed to create storage property " 432 "group on %s[%" PRIu64 "]: %s", DISK, inst, 433 topo_strerror(err)); 434 } 435 436 lba = info.nni_id.id_flbas.lba_format; 437 blksz = 1ULL << info.nni_id.id_lbaf[lba].lbaf_lbads; 438 if (blksz != 0 && topo_prop_set_uint64(tn, TOPO_PGROUP_STORAGE, 439 TOPO_STORAGE_LOG_BLOCK_SIZE, TOPO_PROP_IMMUTABLE, blksz, &err) != 440 0) { 441 topo_mod_dprintf(mod, "failed to create property %s:%s on %s[%" 442 PRIu64 "]: %s", TOPO_PGROUP_STORAGE, 443 TOPO_STORAGE_LOG_BLOCK_SIZE, DISK, inst, 444 topo_strerror(err)); 445 goto done; 446 } 447 448 cap = blksz * info.nni_id.id_nsize; 449 if (snprintf(capstr, sizeof (capstr), "%" PRIu64, cap) >= 450 sizeof (capstr)) { 451 topo_mod_dprintf(mod, "overflowed capacity calculation on " 452 "nsid %u", nsid); 453 goto done; 454 } 455 456 /* 457 * Finally attempt to find a child node that has a matching name and go 458 * from there. Sorry, this does result in node creation being O(n^2), 459 * but at least n is usually small today. 460 */ 461 for (di_node_t di = di_child_node(nei->nei_dinode); di != DI_NODE_NIL; 462 di = di_sibling_node(di)) { 463 const char *addr = di_bus_addr(di); 464 if (addr != NULL && strcmp(addr, info.nni_addr) == 0) { 465 disk_nvme_make_ns_di_props(mod, tn, di); 466 } 467 } 468 469 done: 470 nvlist_free(auth); 471 nvlist_free(fmri); 472 } 473 474 /* 475 * Attempt to make a ufm node, but swallow the error so we can try to get as 476 * much of the disk information as possible. 477 */ 478 static void 479 disk_nvme_make_ufm(topo_mod_t *mod, nvme_enum_info_t *nei) 480 { 481 topo_ufm_devinfo_t tud; 482 char *path = di_devfs_path(nei->nei_dinode); 483 if (path == NULL) { 484 return; 485 } 486 487 tud.tud_method = TOPO_UFM_M_DEVINFO; 488 tud.tud_path = path; 489 if (topo_mod_load(mod, TOPO_MOD_UFM, TOPO_VERSION) == NULL) { 490 topo_mod_dprintf(mod, "disk enum could not load ufm module"); 491 di_devfs_path_free(path); 492 return; 493 } 494 495 (void) topo_mod_enumerate(mod, nei->nei_nvme, TOPO_MOD_UFM, UFM, 0, 0, 496 &tud); 497 di_devfs_path_free(path); 498 } 499 500 static const topo_pgroup_info_t nvme_pgroup = { 501 TOPO_PGROUP_NVME, 502 TOPO_STABILITY_PRIVATE, 503 TOPO_STABILITY_PRIVATE, 504 1 505 }; 506 507 static int 508 make_nvme_node(nvme_enum_info_t *nvme_info) 509 { 510 topo_mod_t *mod = nvme_info->nei_mod; 511 nvlist_t *auth = NULL, *fmri = NULL, *fru; 512 tnode_t *nvme; 513 char raw_rev[NVME_FWVER_SZ + 1], raw_model[NVME_MODEL_SZ + 1]; 514 char raw_serial[NVME_SERIAL_SZ + 1]; 515 char *rev = NULL, *model = NULL, *serial = NULL, *vers = NULL; 516 char *pname = topo_node_name(nvme_info->nei_parent); 517 char *label = NULL; 518 topo_instance_t pinst = topo_node_instance(nvme_info->nei_parent); 519 int err = 0, ret = -1; 520 521 /* 522 * The raw strings returned by the IDENTIFY CONTROLLER command are 523 * not NUL-terminated, so we fix that up. 524 */ 525 (void) strncpy(raw_rev, nvme_info->nei_idctl->id_fwrev, NVME_FWVER_SZ); 526 raw_rev[NVME_FWVER_SZ] = '\0'; 527 (void) strncpy(raw_model, nvme_info->nei_idctl->id_model, 528 NVME_MODEL_SZ); 529 raw_model[NVME_MODEL_SZ] = '\0'; 530 (void) strncpy(raw_serial, nvme_info->nei_idctl->id_serial, 531 NVME_SERIAL_SZ); 532 raw_serial[NVME_SERIAL_SZ] = '\0'; 533 534 /* 535 * Next we pass the strings through a function that sanitizes them of 536 * any characters that can't be used in an FMRI string. 537 */ 538 rev = topo_mod_clean_str(mod, raw_rev); 539 model = topo_mod_clean_str(mod, raw_model); 540 serial = topo_mod_clean_str(mod, raw_serial); 541 542 auth = topo_mod_auth(mod, nvme_info->nei_parent); 543 fmri = topo_mod_hcfmri(mod, nvme_info->nei_parent, FM_HC_SCHEME_VERSION, 544 NVME, 0, NULL, auth, model, rev, serial); 545 546 if (fmri == NULL) { 547 /* errno set */ 548 topo_mod_dprintf(mod, "%s: hcfmri failed for %s=%" PRIu64 549 "/%s=0", __func__, pname, pinst, NVME); 550 goto error; 551 } 552 553 /* 554 * If our parent is a pciexfn node, then we need to create a nvme range 555 * underneath it to hold the nvme hierarchy. For other cases, where 556 * enumeration is being driven by a topo map file, this range will have 557 * already been statically defined in the XML. 558 */ 559 if (strcmp(pname, PCIEX_FUNCTION) == 0) { 560 if (topo_node_range_create(mod, nvme_info->nei_parent, NVME, 0, 561 0) < 0) { 562 /* errno set */ 563 topo_mod_dprintf(mod, "%s: error creating %s range", 564 __func__, NVME); 565 goto error; 566 } 567 } 568 569 /* 570 * Create a new topo node to represent the NVMe controller and bind it 571 * to the parent node. 572 */ 573 if ((nvme = topo_node_bind(mod, nvme_info->nei_parent, NVME, 0, 574 fmri)) == NULL) { 575 /* errno set */ 576 topo_mod_dprintf(mod, "%s: bind failed for %s=%" PRIu64 577 "/%s=0", __func__, pname, pinst, NVME); 578 goto error; 579 } 580 nvme_info->nei_nvme = nvme; 581 nvme_info->nei_nvme_fmri = fmri; 582 583 /* 584 * If our parent node is a "pciexfn" node then this is a NVMe device on 585 * a PCIe AIC, so we inherit our parent's FRU. Otherwise, we set the 586 * FRU to ourself. 587 */ 588 if (strcmp(topo_node_name(nvme_info->nei_parent), PCIEX_FUNCTION) == 0) 589 fru = NULL; 590 else 591 fru = fmri; 592 593 if (topo_node_fru_set(nvme, fru, 0, &err) != 0) { 594 topo_mod_dprintf(mod, "%s: failed to set FRU: %s", __func__, 595 topo_strerror(err)); 596 (void) topo_mod_seterrno(mod, err); 597 goto error; 598 } 599 600 /* 601 * Clone the label from our parent node. We can't inherit the property 602 * because the label prop is mutable on bay nodes and only immutable 603 * properties can be inherited. 604 */ 605 if ((topo_node_label(nvme_info->nei_parent, &label, &err) != 0 && 606 err != ETOPO_PROP_NOENT) || 607 topo_node_label_set(nvme, label, &err) != 0) { 608 topo_mod_dprintf(mod, "%s: failed to set label: %s", 609 __func__, topo_strerror(err)); 610 (void) topo_mod_seterrno(mod, err); 611 goto error; 612 } 613 614 /* 615 * Ensure that we have a UFM property set based on our devinfo path. 616 * This is a little repetitive if our parent actually did so as well, 617 * but given that the majority of such nodes are under bays and slots 618 * right now, it's a worthwhile tradeoff. 619 */ 620 disk_nvme_make_ufm(mod, nvme_info); 621 622 if (topo_pgroup_create(nvme, &nvme_pgroup, &err) != 0) { 623 topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s", 624 __func__, TOPO_PGROUP_NVME, topo_strerror(err)); 625 (void) topo_mod_seterrno(mod, err); 626 goto error; 627 } 628 629 if (asprintf(&vers, "%u.%u", nvme_info->nei_vers.v_major, 630 nvme_info->nei_vers.v_minor) < 0) { 631 topo_mod_dprintf(mod, "%s: failed to alloc string", __func__); 632 (void) topo_mod_seterrno(mod, EMOD_NOMEM); 633 goto error; 634 } 635 if (topo_prop_set_string(nvme, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER, 636 TOPO_PROP_IMMUTABLE, vers, &err) != 0) { 637 topo_mod_dprintf(mod, "%s: failed to set %s/%s property", 638 __func__, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER); 639 (void) topo_mod_seterrno(mod, err); 640 goto error; 641 } 642 643 if (topo_pgroup_create(nvme, &io_pgroup, &err) != 0) { 644 topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s", 645 __func__, TOPO_PGROUP_IO, topo_strerror(err)); 646 (void) topo_mod_seterrno(mod, err); 647 goto error; 648 } 649 650 if (!disk_nvme_common_io(mod, nvme, nvme_info->nei_dinode)) { 651 goto error; 652 } 653 654 /* 655 * Create a child disk node for each namespace. 656 */ 657 if (topo_node_range_create(mod, nvme, DISK, 0, 658 (nvme_info->nei_idctl->id_nn - 1)) < 0) { 659 /* errno set */ 660 topo_mod_dprintf(mod, "%s: error creating %s range", __func__, 661 DISK); 662 goto error; 663 } 664 665 /* 666 * Iterate over each namespace to see if it's a candidate for inclusion. 667 * Namespaces start at index 1 and not every namespace will be included. 668 * We map things such that a disk instance is always namespace - 1 to 669 * fit into the above mapping. 670 */ 671 for (uint32_t i = 1; i <= nvme_info->nei_idctl->id_nn; i++) { 672 disk_nvme_make_ns(nvme_info, i); 673 } 674 ret = 0; 675 676 error: 677 free(vers); 678 nvlist_free(auth); 679 nvlist_free(fmri); 680 topo_mod_strfree(mod, rev); 681 topo_mod_strfree(mod, model); 682 topo_mod_strfree(mod, serial); 683 topo_mod_strfree(mod, label); 684 return (ret); 685 } 686 687 struct diwalk_arg { 688 topo_mod_t *diwk_mod; 689 tnode_t *diwk_parent; 690 }; 691 692 /* 693 * This function gathers identity information from the NVMe controller and 694 * stores it in a struct. This struct is passed to make_nvme_node(), which 695 * does the actual topo node creation. 696 */ 697 static int 698 discover_nvme_ctl(di_node_t node, di_minor_t minor, void *arg) 699 { 700 struct diwalk_arg *wkarg = arg; 701 topo_mod_t *mod = wkarg->diwk_mod; 702 char *path = NULL, *devctl = NULL; 703 nvme_ioctl_t nioc = { 0 }; 704 nvme_identify_ctrl_t *idctl = NULL; 705 nvme_enum_info_t nvme_info = { 0 }; 706 int fd = -1, ret = DI_WALK_TERMINATE; 707 708 if ((path = di_devfs_minor_path(minor)) == NULL) { 709 topo_mod_dprintf(mod, "failed to get minor path"); 710 (void) topo_mod_seterrno(mod, EMOD_UNKNOWN); 711 return (ret); 712 } 713 714 topo_mod_dprintf(mod, "%s=%" PRIu64 ": found nvme controller: %s", 715 topo_node_name(wkarg->diwk_parent), 716 topo_node_instance(wkarg->diwk_parent), path); 717 718 if (asprintf(&devctl, "/devices%s", path) < 0) { 719 topo_mod_dprintf(mod, "failed to alloc string"); 720 (void) topo_mod_seterrno(mod, EMOD_NOMEM); 721 goto error; 722 } 723 724 if ((fd = open(devctl, O_RDWR)) < 0) { 725 topo_mod_dprintf(mod, "failed to open %s", devctl); 726 (void) topo_mod_seterrno(mod, EMOD_UNKNOWN); 727 goto error; 728 } 729 if ((idctl = topo_mod_zalloc(mod, NVME_IDENTIFY_BUFSIZE)) == NULL) { 730 topo_mod_dprintf(mod, "zalloc failed"); 731 (void) topo_mod_seterrno(mod, EMOD_NOMEM); 732 goto error; 733 } 734 nioc.n_len = NVME_IDENTIFY_BUFSIZE; 735 nioc.n_buf = (uintptr_t)idctl; 736 nioc.n_arg = NVME_IDENTIFY_CTRL; 737 738 if (ioctl(fd, NVME_IOC_IDENTIFY, &nioc) != 0) { 739 topo_mod_dprintf(mod, "NVME_IOC_IDENTIFY ioctl " 740 "failed: %s", strerror(errno)); 741 (void) topo_mod_seterrno(mod, EMOD_UNKNOWN); 742 goto error; 743 } 744 745 nioc.n_len = sizeof (nvme_version_t); 746 nioc.n_buf = (uintptr_t)&nvme_info.nei_vers; 747 nioc.n_arg = 0; 748 749 if (ioctl(fd, NVME_IOC_VERSION, &nioc) != 0) { 750 topo_mod_dprintf(mod, "NVME_IOC_VERSION ioctl failed: %s", 751 strerror(errno)); 752 (void) topo_mod_seterrno(mod, EMOD_UNKNOWN); 753 goto error; 754 } 755 756 nvme_info.nei_mod = mod; 757 nvme_info.nei_nvme_path = path; 758 nvme_info.nei_dinode = node; 759 nvme_info.nei_idctl = idctl; 760 nvme_info.nei_parent = wkarg->diwk_parent; 761 nvme_info.nei_fd = fd; 762 763 if (make_nvme_node(&nvme_info) != 0) { 764 /* errno set */ 765 goto error; 766 } 767 768 ret = DI_WALK_CONTINUE; 769 770 error: 771 if (fd > 0) 772 (void) close(fd); 773 di_devfs_path_free(path); 774 free(devctl); 775 if (idctl != NULL) 776 topo_mod_free(mod, idctl, NVME_IDENTIFY_BUFSIZE); 777 return (ret); 778 } 779 780 int 781 disk_nvme_enum_disk(topo_mod_t *mod, tnode_t *pnode) 782 { 783 char *parent = NULL; 784 int err; 785 di_node_t devtree; 786 di_node_t dnode; 787 struct diwalk_arg wkarg = { 0 }; 788 int ret = -1; 789 790 /* 791 * Lookup a property containing the devfs path of the parent PCIe 792 * device of the NVMe device we're attempting to enumerate. This 793 * property is hard-coded in per-platform topo XML maps that are 794 * delivered with the OS. This hard-coded path allows topo to map a 795 * given NVMe controller to a physical location (bay or slot) on the 796 * platform, when generating the topo snapshot. 797 */ 798 if (topo_prop_get_string(pnode, TOPO_PGROUP_BINDING, 799 TOPO_BINDING_PARENT_DEV, &parent, &err) != 0) { 800 topo_mod_dprintf(mod, "parent node was missing nvme binding " 801 "properties\n"); 802 (void) topo_mod_seterrno(mod, err); 803 goto out; 804 } 805 if ((devtree = topo_mod_devinfo(mod)) == DI_NODE_NIL) { 806 topo_mod_dprintf(mod, "failed to get devinfo snapshot"); 807 (void) topo_mod_seterrno(mod, EMOD_UNKNOWN); 808 goto out; 809 } 810 811 /* 812 * Walk the devinfo tree looking NVMe devices. For each NVMe device, 813 * check if the devfs path of the parent matches the one specified in 814 * TOPO_BINDING_PARENT_DEV. 815 */ 816 wkarg.diwk_mod = mod; 817 wkarg.diwk_parent = pnode; 818 dnode = di_drv_first_node(NVME_DRV, devtree); 819 while (dnode != DI_NODE_NIL) { 820 char *path; 821 822 if ((path = di_devfs_path(di_parent_node(dnode))) == NULL) { 823 topo_mod_dprintf(mod, "failed to get dev path"); 824 (void) topo_mod_seterrno(mod, EMOD_UNKNOWN); 825 goto out; 826 } 827 if (strcmp(parent, path) == 0) { 828 if (di_walk_minor(dnode, DDI_NT_NVME_NEXUS, 0, 829 &wkarg, discover_nvme_ctl) < 0) { 830 di_devfs_path_free(path); 831 goto out; 832 } 833 } 834 di_devfs_path_free(path); 835 dnode = di_drv_next_node(dnode); 836 } 837 ret = 0; 838 839 out: 840 topo_mod_strfree(mod, parent); 841 return (ret); 842 } 843