1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020 Joyent, Inc.
14  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
15  * Copyright 2023 Oxide Computer Company
16  */
17 
18 /*
19  * This file drives topo node enumeration of NVMe controllers.  A single "nvme"
20  * node is enumerated for each NVMe controller.   Child "disk" nodes are then
21  * enumerated for each active or attached NVMe namespace.
22  *
23  * nvme nodes are expected to be enumerated under either a "bay" node (for U.2
24  * devices) or a "slot" node (for M.2 devices) or a "pciexfn" node (for AIC
25  * devices).
26  *
27  * Enumeration of NVMe controllers on PCIe add-in cards is automatically driven
28  * by the pcibus topo module.
29  *
30  * In order to allow for associating a given NVMe controller with a physical
31  * location, enumeration of U.2 and M.2 devices should be driven by a
32  * platform-specific topo map which statically sets the following two
33  * properties on the parent "bay" or "slot" node:
34  *
35  * propgroup        property        description
36  * ---------        --------        ------------
37  * binding          driver          "nvme"
38  * binding          parent-device   devpath of parent PCIe device
39  *
40  * for example:
41  *
42  * <propgroup name="binding" version="1" name-stability="Private"
43  *   data-stability="Private">
44  *     <propval name="driver" type="string" value="nvme"/>
45  *     <propval name="parent-device" type="string"
46  *       value="/pci@0,0/pci8086,6f09@3,1"/>
47  * </propgroup>
48  * <dependents grouping="children">
49  *     <range name="nvme" min="0" max="0">
50  *         <enum-method name="disk" version="1"/>
51  *     </range>
52  * </dependents>
53  */
54 #include <stdlib.h>
55 #include <sys/types.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <unistd.h>
59 #include <string.h>
60 #include <strings.h>
61 #include <stdbool.h>
62 
63 #include <sys/fm/protocol.h>
64 #include <fm/topo_hc.h>
65 #include <fm/topo_mod.h>
66 #include <topo_ufm.h>
67 
68 #include <sys/dkio.h>
69 #include <sys/scsi/generic/inquiry.h>
70 
71 #include <sys/nvme.h>
72 #include "disk.h"
73 #include "disk_drivers.h"
74 
75 typedef struct nvme_enum_info {
76 	topo_mod_t		*nei_mod;
77 	di_node_t		nei_dinode;
78 	nvme_identify_ctrl_t	*nei_idctl;
79 	nvme_version_t		nei_vers;
80 	tnode_t			*nei_parent;
81 	tnode_t			*nei_nvme;
82 	nvlist_t		*nei_nvme_fmri;
83 	const char		*nei_nvme_path;
84 	int			nei_fd;
85 } nvme_enum_info_t;
86 
87 typedef struct devlink_arg {
88 	topo_mod_t		*dla_mod;
89 	char			*dla_logical_disk;
90 	uint_t			dla_strsz;
91 } devlink_arg_t;
92 
93 static int
94 devlink_cb(di_devlink_t dl, void *arg)
95 {
96 	devlink_arg_t *dlarg = (devlink_arg_t *)arg;
97 	topo_mod_t *mod = dlarg->dla_mod;
98 	const char *devpath;
99 	char *slice, *ctds;
100 
101 	if ((devpath = di_devlink_path(dl)) == NULL ||
102 	    (dlarg->dla_logical_disk = topo_mod_strdup(mod, devpath)) ==
103 	    NULL) {
104 		return (DI_WALK_TERMINATE);
105 	}
106 
107 	/*
108 	 * We need to keep track of the original string size before we
109 	 * truncate it with a NUL, so that we can free the right number of
110 	 * bytes when we're done, otherwise libumem will complain.
111 	 */
112 	dlarg->dla_strsz = strlen(dlarg->dla_logical_disk) + 1;
113 
114 	/* trim the slice off the public name */
115 	if (((ctds = strrchr(dlarg->dla_logical_disk, '/')) != NULL) &&
116 	    ((slice = strchr(ctds, 's')) != NULL))
117 		*slice = '\0';
118 
119 	return (DI_WALK_TERMINATE);
120 }
121 
122 static char *
123 get_logical_disk(topo_mod_t *mod, const char *devpath, uint_t *bufsz)
124 {
125 	di_devlink_handle_t devhdl;
126 	devlink_arg_t dlarg = { 0 };
127 	char *minorpath = NULL;
128 
129 	if (asprintf(&minorpath, "%s:a", devpath) < 0) {
130 		return (NULL);
131 	}
132 
133 	if ((devhdl = di_devlink_init(NULL, 0)) == DI_NODE_NIL) {
134 		topo_mod_dprintf(mod, "%s: di_devlink_init failed", __func__);
135 		free(minorpath);
136 		return (NULL);
137 	}
138 
139 	dlarg.dla_mod = mod;
140 
141 	(void) di_devlink_walk(devhdl, "^dsk/", minorpath, DI_PRIMARY_LINK,
142 	    &dlarg, devlink_cb);
143 
144 	(void) di_devlink_fini(&devhdl);
145 	free(minorpath);
146 
147 	*bufsz = dlarg.dla_strsz;
148 	return (dlarg.dla_logical_disk);
149 }
150 
151 static bool
152 disk_nvme_make_ns_serial(topo_mod_t *mod, const nvme_identify_nsid_t *id,
153     uint32_t nsid, char *buf, size_t buflen)
154 {
155 	uint8_t zero_guid[16] = { 0 };
156 	int ret;
157 
158 	if (bcmp(zero_guid, id->id_nguid, sizeof (id->id_nguid)) != 0) {
159 		ret = snprintf(buf, buflen, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X"
160 		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
161 		    id->id_nguid[0], id->id_nguid[1], id->id_nguid[2],
162 		    id->id_nguid[3], id->id_nguid[4], id->id_nguid[5],
163 		    id->id_nguid[6], id->id_nguid[7], id->id_nguid[8],
164 		    id->id_nguid[9], id->id_nguid[10], id->id_nguid[11],
165 		    id->id_nguid[12], id->id_nguid[13], id->id_nguid[14],
166 		    id->id_nguid[15]);
167 	} else if (bcmp(zero_guid, id->id_eui64, sizeof (id->id_eui64)) != 0) {
168 		ret = snprintf(buf, buflen,
169 		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
170 		    id->id_eui64[0], id->id_eui64[1], id->id_eui64[2],
171 		    id->id_eui64[3], id->id_eui64[4], id->id_eui64[5],
172 		    id->id_eui64[6], id->id_eui64[7]);
173 	} else {
174 		ret = snprintf(buf, buflen, "%u", nsid);
175 	}
176 
177 	if ((size_t)ret >= buflen) {
178 		topo_mod_dprintf(mod, "overflowed serial number for nsid %u: "
179 		    "needed %zu bytes, got %d", nsid, buflen, ret);
180 		return (false);
181 	}
182 
183 	return (true);
184 }
185 
186 /*
187  * Create the common I/O property group properties that are shared between
188  * controllers and namespaces. We assume the property group was already created.
189  */
190 static bool
191 disk_nvme_common_io(topo_mod_t *mod, tnode_t *tn, di_node_t di)
192 {
193 	int err;
194 	int inst = di_instance(di);
195 	const char *drv = di_driver_name(di);
196 	char *path;
197 	const char *ppaths[1];
198 
199 	if (inst != -1 && topo_prop_set_uint32(tn, TOPO_PGROUP_IO,
200 	    TOPO_IO_INSTANCE, TOPO_PROP_IMMUTABLE, (uint32_t)inst, &err) != 0) {
201 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
202 		    "%s", TOPO_PGROUP_IO, TOPO_IO_INSTANCE, topo_node_name(tn),
203 		    topo_node_instance(tn), topo_strerror(err));
204 		return (false);
205 	}
206 
207 	if (drv != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO,
208 	    TOPO_IO_DRIVER, TOPO_PROP_IMMUTABLE, drv, &err) != 0) {
209 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
210 		    "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn),
211 		    topo_node_instance(tn), topo_strerror(err));
212 		return (false);
213 	}
214 
215 	if (drv != NULL) {
216 		nvlist_t *fmri = topo_mod_modfmri(mod, FM_MOD_SCHEME_VERSION,
217 		    drv);
218 		if (mod != NULL && topo_prop_set_fmri(tn, TOPO_PGROUP_IO,
219 		    TOPO_IO_MODULE, TOPO_PROP_IMMUTABLE, fmri, &err) != 0) {
220 			topo_mod_dprintf(mod, "failed to set %s:%s on %s[%"
221 			    PRIu64 "]: %s", TOPO_PGROUP_IO, TOPO_IO_MODULE,
222 			    topo_node_name(tn), topo_node_instance(tn),
223 			    topo_strerror(err));
224 			nvlist_free(fmri);
225 			return (false);
226 		}
227 		nvlist_free(fmri);
228 	}
229 
230 	path = di_devfs_path(di);
231 	ppaths[0] = path;
232 	if (path != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO,
233 	    TOPO_IO_DEV_PATH, TOPO_PROP_IMMUTABLE, path, &err) != 0) {
234 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
235 		    "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn),
236 		    topo_node_instance(tn), topo_strerror(err));
237 		di_devfs_path_free(path);
238 		return (false);
239 	}
240 
241 	if (path != NULL && topo_prop_set_string_array(tn, TOPO_PGROUP_IO,
242 	    TOPO_IO_PHYS_PATH, TOPO_PROP_IMMUTABLE, ppaths, 1, &err) != 0) {
243 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
244 		    "%s", TOPO_PGROUP_IO, TOPO_IO_PHYS_PATH, topo_node_name(tn),
245 		    topo_node_instance(tn), topo_strerror(err));
246 		di_devfs_path_free(path);
247 		return (false);
248 	}
249 	di_devfs_path_free(path);
250 
251 	return (true);
252 }
253 
254 /*
255  * Add the various storage and I/O property group items that are appropriate
256  * given that we have a devinfo node. The storage property group has already
257  * been created, but the I/O property group has not.
258  */
259 static void
260 disk_nvme_make_ns_di_props(topo_mod_t *mod, tnode_t *tn, di_node_t di)
261 {
262 	int err;
263 	char *devid, *mfg, *model, *rev, *serial, *log, *path;
264 	uint_t buflen;
265 
266 	if (di_prop_lookup_strings(DDI_DEV_T_ANY, di, DEVID_PROP_NAME,
267 	    &devid) != 1 ||
268 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_VENDOR_ID,
269 	    &mfg) != 1 ||
270 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_PRODUCT_ID,
271 	    &model) != 1 ||
272 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_REVISION_ID,
273 	    &rev) != 1 ||
274 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_SERIAL_NO,
275 	    &serial) != 1) {
276 		topo_mod_dprintf(mod, "failed to get devinfo props for %s[%"
277 		    PRIu64 "]", topo_node_name(tn), topo_node_instance(tn));
278 		return;
279 	}
280 
281 	/*
282 	 * Set the basic storage manufacturer information. Yes, this is
283 	 * information really about the NVMe controller and not the namespace.
284 	 * That's how the storage property group basically works here.
285 	 */
286 	if (topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
287 	    TOPO_STORAGE_MANUFACTURER, TOPO_PROP_IMMUTABLE, mfg, &err) != 0 ||
288 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
289 	    TOPO_STORAGE_SERIAL_NUM, TOPO_PROP_IMMUTABLE, serial, &err) != 0 ||
290 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
291 	    TOPO_STORAGE_FIRMWARE_REV, TOPO_PROP_IMMUTABLE, rev, &err) != 0 ||
292 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
293 	    TOPO_STORAGE_MODEL, TOPO_PROP_IMMUTABLE, model, &err) != 0) {
294 		topo_mod_dprintf(mod, "failed to set storage properties on "
295 		    "%s[%" PRIu64 "]: %s", topo_node_name(tn),
296 		    topo_node_instance(tn), topo_strerror(err));
297 		return;
298 	}
299 
300 	if (topo_pgroup_create(tn, &io_pgroup, &err) != 0) {
301 		topo_mod_dprintf(mod, "failed to create I/O property "
302 		    "group on %s[%" PRIu64 "]: %s",  topo_node_name(tn),
303 		    topo_node_instance(tn), topo_strerror(err));
304 	}
305 
306 	if (!disk_nvme_common_io(mod, tn, di)) {
307 		return;
308 	}
309 
310 	/*
311 	 * The last property that we'd like to attempt to create for a namespace
312 	 * is a mapping back to its corresponding logical disk entry in /dev.
313 	 * The logical disk will be everything past the trailing /, i.e. a
314 	 * cXtXdX value.
315 	 */
316 	path = di_devfs_path(di);
317 	if (path == NULL) {
318 		return;
319 	}
320 	log = get_logical_disk(mod, path, &buflen);
321 	di_devfs_path_free(path);
322 	if (log == NULL) {
323 		return;
324 	}
325 	path = strrchr(log, '/');
326 	if (path != NULL && path[1] != '\0' &&
327 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
328 	    TOPO_STORAGE_LOGICAL_DISK_NAME, TOPO_PROP_IMMUTABLE, path + 1,
329 	    &err) != 0) {
330 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%"
331 		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
332 		    TOPO_STORAGE_LOGICAL_DISK_NAME, topo_node_name(tn),
333 		    topo_node_instance(tn), topo_strerror(err));
334 	}
335 	topo_mod_free(mod, log, buflen);
336 }
337 
338 static void
339 disk_nvme_make_ns(nvme_enum_info_t *nei, uint32_t nsid)
340 {
341 	topo_mod_t *mod = nei->nei_mod;
342 	nvlist_t *auth = NULL, *fmri = NULL;
343 	const topo_instance_t inst = nsid - 1;
344 	nvme_ns_info_t info;
345 	nvme_ioctl_t ioc;
346 	char serial[64], capstr[64];
347 	uint64_t cap, blksz;
348 	tnode_t *tn;
349 	uint8_t lba;
350 	int err;
351 
352 	bzero(&ioc, sizeof (ioc));
353 	bzero(&info, sizeof (info));
354 	ioc.n_len = sizeof (nvme_ns_info_t);
355 	ioc.n_buf = (uintptr_t)&info;
356 	ioc.n_arg = nsid;
357 
358 	if (ioctl(nei->nei_fd, NVME_IOC_NS_INFO, &ioc) != 0) {
359 		topo_mod_dprintf(mod, "failed to get namespace info for ns %u: "
360 		    "%s", nsid, strerror(errno));
361 		return;
362 	}
363 
364 	if ((info.nni_state & NVME_NS_STATE_IGNORED) != 0) {
365 		return;
366 	}
367 
368 	if ((info.nni_state &
369 	    (NVME_NS_STATE_ACTIVE | NVME_NS_STATE_ATTACHED)) == 0) {
370 		topo_mod_dprintf(mod, "skipping nsid %u because it is not "
371 		    "active or attached (state: 0x%x)", nsid, info.nni_state);
372 		return;
373 	}
374 
375 	auth = topo_mod_auth(mod, nei->nei_nvme);
376 	if (auth == NULL) {
377 		topo_mod_dprintf(mod, "failed to get auth for nsid %u from "
378 		    "parent %s[%" PRIu64 "]: %s", nsid,
379 		    topo_node_name(nei->nei_nvme),
380 		    topo_node_instance(nei->nei_nvme), topo_mod_errmsg(mod));
381 		goto done;
382 	}
383 
384 	/*
385 	 * We want to construct the FMRI for the namespace. The namespace is a
386 	 * little awkward in terms of things like the model, revision, and
387 	 * serial. While blkdev sets up standard inquiry properties to map these
388 	 * to the parent device which makes sense in the context of trying to
389 	 * use this as a normal block device, it's not really appropriate here.
390 	 * The namespace is not the NVMe controller. We construct the namespace
391 	 * serial number from the preferential ordering of information that
392 	 * we're given of the NGUID, EUI64, and then fall back to the namespace
393 	 * number.
394 	 */
395 	if (!disk_nvme_make_ns_serial(mod, &info.nni_id, nsid, serial,
396 	    sizeof (serial))) {
397 		goto done;
398 	}
399 	fmri = topo_mod_hcfmri(mod, nei->nei_nvme, FM_HC_SCHEME_VERSION,
400 	    DISK, inst, NULL, auth, NULL, NULL, serial);
401 	if (fmri == NULL) {
402 		topo_mod_dprintf(mod, "failed to make fmri for %s[%" PRIu64
403 		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
404 		goto done;
405 	}
406 
407 	tn = topo_node_bind(mod, nei->nei_nvme, DISK, inst, fmri);
408 	if (tn == NULL) {
409 		topo_mod_dprintf(mod, "failed to bind fmri for %s[%" PRIu64
410 		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
411 		goto done;
412 	}
413 
414 	/*
415 	 * Always inherit our parent's FRU. The namespace is just a part of the
416 	 * device in reality.
417 	 */
418 	if (topo_node_fru_set(tn, NULL, 0, &err) != 0) {
419 		topo_mod_dprintf(mod, "failed to set FRU for %s[%" PRIu64
420 		    "] on nsid %u: %s", DISK, inst, nsid, topo_strerror(err));
421 		goto done;
422 
423 	}
424 
425 	/*
426 	 * Our namespace may or may not be attached. From the namespace we will
427 	 * always get the capacity and block information. The rest of it will
428 	 * end up being filled in if we find a devinfo node.
429 	 */
430 	if (topo_pgroup_create(tn, &storage_pgroup, &err) != 0) {
431 		topo_mod_dprintf(mod, "failed to create storage property "
432 		    "group on %s[%" PRIu64 "]: %s", DISK, inst,
433 		    topo_strerror(err));
434 	}
435 
436 	lba = info.nni_id.id_flbas.lba_format;
437 	blksz = 1ULL << info.nni_id.id_lbaf[lba].lbaf_lbads;
438 	if (blksz != 0 && topo_prop_set_uint64(tn, TOPO_PGROUP_STORAGE,
439 	    TOPO_STORAGE_LOG_BLOCK_SIZE, TOPO_PROP_IMMUTABLE, blksz, &err) !=
440 	    0) {
441 		topo_mod_dprintf(mod, "failed to create property %s:%s on %s[%"
442 		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
443 		    TOPO_STORAGE_LOG_BLOCK_SIZE, DISK, inst,
444 		    topo_strerror(err));
445 		goto done;
446 	}
447 
448 	cap = blksz * info.nni_id.id_nsize;
449 	if (snprintf(capstr, sizeof (capstr), "%" PRIu64, cap) >=
450 	    sizeof (capstr)) {
451 		topo_mod_dprintf(mod, "overflowed capacity calculation on "
452 		    "nsid %u", nsid);
453 		goto done;
454 	}
455 
456 	/*
457 	 * Finally attempt to find a child node that has a matching name and go
458 	 * from there. Sorry, this does result in node creation being O(n^2),
459 	 * but at least n is usually small today.
460 	 */
461 	for (di_node_t di = di_child_node(nei->nei_dinode); di != DI_NODE_NIL;
462 	    di = di_sibling_node(di)) {
463 		const char *addr = di_bus_addr(di);
464 		if (addr != NULL && strcmp(addr, info.nni_addr) == 0) {
465 			disk_nvme_make_ns_di_props(mod, tn, di);
466 		}
467 	}
468 
469 done:
470 	nvlist_free(auth);
471 	nvlist_free(fmri);
472 }
473 
474 /*
475  * Attempt to make a ufm node, but swallow the error so we can try to get as
476  * much of the disk information as possible.
477  */
478 static void
479 disk_nvme_make_ufm(topo_mod_t *mod, nvme_enum_info_t *nei)
480 {
481 	topo_ufm_devinfo_t tud;
482 	char *path = di_devfs_path(nei->nei_dinode);
483 	if (path == NULL) {
484 		return;
485 	}
486 
487 	tud.tud_method = TOPO_UFM_M_DEVINFO;
488 	tud.tud_path = path;
489 	if (topo_mod_load(mod, TOPO_MOD_UFM, TOPO_VERSION) == NULL) {
490 		topo_mod_dprintf(mod, "disk enum could not load ufm module");
491 		di_devfs_path_free(path);
492 		return;
493 	}
494 
495 	(void) topo_mod_enumerate(mod, nei->nei_nvme, TOPO_MOD_UFM, UFM, 0, 0,
496 	    &tud);
497 	di_devfs_path_free(path);
498 }
499 
/*
 * Descriptor for the NVMe-specific property group that make_nvme_node()
 * creates on every "nvme" node.
 *
 * NOTE(review): the initializer is positional; the members are presumably the
 * group name, the name stability, the data stability, and the group version,
 * in that order -- confirm against the topo_pgroup_info_t definition.
 */
static const topo_pgroup_info_t nvme_pgroup = {
	TOPO_PGROUP_NVME,
	TOPO_STABILITY_PRIVATE,
	TOPO_STABILITY_PRIVATE,
	1
};
506 
507 static int
508 make_nvme_node(nvme_enum_info_t *nvme_info)
509 {
510 	topo_mod_t *mod = nvme_info->nei_mod;
511 	nvlist_t *auth = NULL, *fmri = NULL, *fru;
512 	tnode_t *nvme;
513 	char raw_rev[NVME_FWVER_SZ + 1], raw_model[NVME_MODEL_SZ + 1];
514 	char raw_serial[NVME_SERIAL_SZ + 1];
515 	char *rev = NULL, *model = NULL, *serial = NULL, *vers = NULL;
516 	char *pname = topo_node_name(nvme_info->nei_parent);
517 	char *label = NULL;
518 	topo_instance_t pinst = topo_node_instance(nvme_info->nei_parent);
519 	int err = 0, ret = -1;
520 
521 	/*
522 	 * The raw strings returned by the IDENTIFY CONTROLLER command are
523 	 * not NUL-terminated, so we fix that up.
524 	 */
525 	(void) strncpy(raw_rev, nvme_info->nei_idctl->id_fwrev, NVME_FWVER_SZ);
526 	raw_rev[NVME_FWVER_SZ] = '\0';
527 	(void) strncpy(raw_model, nvme_info->nei_idctl->id_model,
528 	    NVME_MODEL_SZ);
529 	raw_model[NVME_MODEL_SZ] = '\0';
530 	(void) strncpy(raw_serial, nvme_info->nei_idctl->id_serial,
531 	    NVME_SERIAL_SZ);
532 	raw_serial[NVME_SERIAL_SZ] = '\0';
533 
534 	/*
535 	 * Next we pass the strings through a function that sanitizes them of
536 	 * any characters that can't be used in an FMRI string.
537 	 */
538 	rev = topo_mod_clean_str(mod, raw_rev);
539 	model = topo_mod_clean_str(mod, raw_model);
540 	serial = topo_mod_clean_str(mod, raw_serial);
541 
542 	auth = topo_mod_auth(mod, nvme_info->nei_parent);
543 	fmri = topo_mod_hcfmri(mod, nvme_info->nei_parent, FM_HC_SCHEME_VERSION,
544 	    NVME, 0, NULL, auth, model, rev, serial);
545 
546 	if (fmri == NULL) {
547 		/* errno set */
548 		topo_mod_dprintf(mod, "%s: hcfmri failed for %s=%" PRIu64
549 		    "/%s=0", __func__, pname, pinst, NVME);
550 		goto error;
551 	}
552 
553 	/*
554 	 * If our parent is a pciexfn node, then we need to create a nvme range
555 	 * underneath it to hold the nvme hierarchy.  For other cases, where
556 	 * enumeration is being driven by a topo map file, this range will have
557 	 * already been statically defined in the XML.
558 	 */
559 	if (strcmp(pname, PCIEX_FUNCTION) == 0) {
560 		if (topo_node_range_create(mod, nvme_info->nei_parent, NVME, 0,
561 		    0) < 0) {
562 			/* errno set */
563 			topo_mod_dprintf(mod, "%s: error creating %s range",
564 			    __func__, NVME);
565 			goto error;
566 		}
567 	}
568 
569 	/*
570 	 * Create a new topo node to represent the NVMe controller and bind it
571 	 * to the parent node.
572 	 */
573 	if ((nvme = topo_node_bind(mod, nvme_info->nei_parent, NVME, 0,
574 	    fmri)) == NULL) {
575 		/* errno set */
576 		topo_mod_dprintf(mod, "%s: bind failed for %s=%" PRIu64
577 		    "/%s=0", __func__, pname, pinst, NVME);
578 		goto error;
579 	}
580 	nvme_info->nei_nvme = nvme;
581 	nvme_info->nei_nvme_fmri = fmri;
582 
583 	/*
584 	 * If our parent node is a "pciexfn" node then this is a NVMe device on
585 	 * a PCIe AIC, so we inherit our parent's FRU.  Otherwise, we set the
586 	 * FRU to ourself.
587 	 */
588 	if (strcmp(topo_node_name(nvme_info->nei_parent), PCIEX_FUNCTION) == 0)
589 		fru = NULL;
590 	else
591 		fru = fmri;
592 
593 	if (topo_node_fru_set(nvme, fru, 0, &err) != 0) {
594 		topo_mod_dprintf(mod, "%s: failed to set FRU: %s", __func__,
595 		    topo_strerror(err));
596 		(void) topo_mod_seterrno(mod, err);
597 		goto error;
598 	}
599 
600 	/*
601 	 * Clone the label from our parent node.  We can't inherit the property
602 	 * because the label prop is mutable on bay nodes and only immutable
603 	 * properties can be inherited.
604 	 */
605 	if ((topo_node_label(nvme_info->nei_parent, &label, &err) != 0 &&
606 	    err != ETOPO_PROP_NOENT) ||
607 	    topo_node_label_set(nvme, label, &err) != 0) {
608 		topo_mod_dprintf(mod, "%s: failed to set label: %s",
609 		    __func__, topo_strerror(err));
610 		(void) topo_mod_seterrno(mod, err);
611 		goto error;
612 	}
613 
614 	/*
615 	 * Ensure that we have a UFM property set based on our devinfo path.
616 	 * This is a little repetitive if our parent actually did so as well,
617 	 * but given that the majority of such nodes are under bays and slots
618 	 * right now, it's a worthwhile tradeoff.
619 	 */
620 	disk_nvme_make_ufm(mod, nvme_info);
621 
622 	if (topo_pgroup_create(nvme, &nvme_pgroup, &err) != 0) {
623 		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
624 		    __func__, TOPO_PGROUP_NVME, topo_strerror(err));
625 		(void) topo_mod_seterrno(mod, err);
626 		goto error;
627 	}
628 
629 	if (asprintf(&vers, "%u.%u", nvme_info->nei_vers.v_major,
630 	    nvme_info->nei_vers.v_minor) < 0) {
631 		topo_mod_dprintf(mod, "%s: failed to alloc string", __func__);
632 		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
633 		goto error;
634 	}
635 	if (topo_prop_set_string(nvme, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER,
636 	    TOPO_PROP_IMMUTABLE, vers, &err) != 0) {
637 		topo_mod_dprintf(mod, "%s: failed to set %s/%s property",
638 		    __func__, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER);
639 		(void) topo_mod_seterrno(mod, err);
640 		goto error;
641 	}
642 
643 	if (topo_pgroup_create(nvme, &io_pgroup, &err) != 0) {
644 		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
645 		    __func__, TOPO_PGROUP_IO, topo_strerror(err));
646 		(void) topo_mod_seterrno(mod, err);
647 		goto error;
648 	}
649 
650 	if (!disk_nvme_common_io(mod, nvme, nvme_info->nei_dinode)) {
651 		goto error;
652 	}
653 
654 	/*
655 	 * Create a child disk node for each namespace.
656 	 */
657 	if (topo_node_range_create(mod, nvme, DISK, 0,
658 	    (nvme_info->nei_idctl->id_nn - 1)) < 0) {
659 		/* errno set */
660 		topo_mod_dprintf(mod, "%s: error creating %s range", __func__,
661 		    DISK);
662 		goto error;
663 	}
664 
665 	/*
666 	 * Iterate over each namespace to see if it's a candidate for inclusion.
667 	 * Namespaces start at index 1 and not every namespace will be included.
668 	 * We map things such that a disk instance is always namespace - 1 to
669 	 * fit into the above mapping.
670 	 */
671 	for (uint32_t i = 1; i <= nvme_info->nei_idctl->id_nn; i++) {
672 		disk_nvme_make_ns(nvme_info, i);
673 	}
674 	ret = 0;
675 
676 error:
677 	free(vers);
678 	nvlist_free(auth);
679 	nvlist_free(fmri);
680 	topo_mod_strfree(mod, rev);
681 	topo_mod_strfree(mod, model);
682 	topo_mod_strfree(mod, serial);
683 	topo_mod_strfree(mod, label);
684 	return (ret);
685 }
686 
/*
 * Argument that disk_nvme_enum_disk() hands to discover_nvme_ctl() through
 * di_walk_minor().
 */
struct diwalk_arg {
	topo_mod_t	*diwk_mod;	/* topo module handle */
	tnode_t		*diwk_parent;	/* node to enumerate "nvme" under */
};
691 
692 /*
693  * This function gathers identity information from the NVMe controller and
694  * stores it in a struct.  This struct is passed to make_nvme_node(), which
695  * does the actual topo node creation.
696  */
697 static int
698 discover_nvme_ctl(di_node_t node, di_minor_t minor, void *arg)
699 {
700 	struct diwalk_arg *wkarg = arg;
701 	topo_mod_t *mod = wkarg->diwk_mod;
702 	char *path = NULL, *devctl = NULL;
703 	nvme_ioctl_t nioc = { 0 };
704 	nvme_identify_ctrl_t *idctl = NULL;
705 	nvme_enum_info_t nvme_info = { 0 };
706 	int fd = -1, ret = DI_WALK_TERMINATE;
707 
708 	if ((path = di_devfs_minor_path(minor)) == NULL) {
709 		topo_mod_dprintf(mod, "failed to get minor path");
710 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
711 		return (ret);
712 	}
713 
714 	topo_mod_dprintf(mod, "%s=%" PRIu64 ": found nvme controller: %s",
715 	    topo_node_name(wkarg->diwk_parent),
716 	    topo_node_instance(wkarg->diwk_parent), path);
717 
718 	if (asprintf(&devctl, "/devices%s", path) < 0) {
719 		topo_mod_dprintf(mod, "failed to alloc string");
720 		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
721 		goto error;
722 	}
723 
724 	if ((fd = open(devctl, O_RDWR)) < 0) {
725 		topo_mod_dprintf(mod, "failed to open %s", devctl);
726 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
727 		goto error;
728 	}
729 	if ((idctl = topo_mod_zalloc(mod, NVME_IDENTIFY_BUFSIZE)) == NULL) {
730 		topo_mod_dprintf(mod, "zalloc failed");
731 		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
732 		goto error;
733 	}
734 	nioc.n_len = NVME_IDENTIFY_BUFSIZE;
735 	nioc.n_buf = (uintptr_t)idctl;
736 	nioc.n_arg = NVME_IDENTIFY_CTRL;
737 
738 	if (ioctl(fd, NVME_IOC_IDENTIFY, &nioc) != 0) {
739 		topo_mod_dprintf(mod, "NVME_IOC_IDENTIFY ioctl "
740 		    "failed: %s", strerror(errno));
741 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
742 		goto error;
743 	}
744 
745 	nioc.n_len = sizeof (nvme_version_t);
746 	nioc.n_buf = (uintptr_t)&nvme_info.nei_vers;
747 	nioc.n_arg = 0;
748 
749 	if (ioctl(fd, NVME_IOC_VERSION, &nioc) != 0) {
750 		topo_mod_dprintf(mod, "NVME_IOC_VERSION ioctl failed: %s",
751 		    strerror(errno));
752 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
753 		goto error;
754 	}
755 
756 	nvme_info.nei_mod = mod;
757 	nvme_info.nei_nvme_path = path;
758 	nvme_info.nei_dinode = node;
759 	nvme_info.nei_idctl = idctl;
760 	nvme_info.nei_parent = wkarg->diwk_parent;
761 	nvme_info.nei_fd = fd;
762 
763 	if (make_nvme_node(&nvme_info) != 0) {
764 		/* errno set */
765 		goto error;
766 	}
767 
768 	ret = DI_WALK_CONTINUE;
769 
770 error:
771 	if (fd > 0)
772 		(void) close(fd);
773 	di_devfs_path_free(path);
774 	free(devctl);
775 	if (idctl != NULL)
776 		topo_mod_free(mod, idctl, NVME_IDENTIFY_BUFSIZE);
777 	return (ret);
778 }
779 
780 int
781 disk_nvme_enum_disk(topo_mod_t *mod, tnode_t *pnode)
782 {
783 	char *parent = NULL;
784 	int err;
785 	di_node_t devtree;
786 	di_node_t dnode;
787 	struct diwalk_arg wkarg = { 0 };
788 	int ret = -1;
789 
790 	/*
791 	 * Lookup a property containing the devfs path of the parent PCIe
792 	 * device of the NVMe device we're attempting to enumerate.  This
793 	 * property is hard-coded in per-platform topo XML maps that are
794 	 * delivered with the OS.  This hard-coded path allows topo to map a
795 	 * given NVMe controller to a physical location (bay or slot) on the
796 	 * platform, when generating the topo snapshot.
797 	 */
798 	if (topo_prop_get_string(pnode, TOPO_PGROUP_BINDING,
799 	    TOPO_BINDING_PARENT_DEV, &parent, &err) != 0) {
800 		topo_mod_dprintf(mod, "parent node was missing nvme binding "
801 		    "properties\n");
802 		(void) topo_mod_seterrno(mod, err);
803 		goto out;
804 	}
805 	if ((devtree = topo_mod_devinfo(mod)) == DI_NODE_NIL) {
806 		topo_mod_dprintf(mod, "failed to get devinfo snapshot");
807 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
808 		goto out;
809 	}
810 
811 	/*
812 	 * Walk the devinfo tree looking NVMe devices. For each NVMe device,
813 	 * check if the devfs path of the parent matches the one specified in
814 	 * TOPO_BINDING_PARENT_DEV.
815 	 */
816 	wkarg.diwk_mod = mod;
817 	wkarg.diwk_parent = pnode;
818 	dnode = di_drv_first_node(NVME_DRV, devtree);
819 	while (dnode != DI_NODE_NIL) {
820 		char *path;
821 
822 		if ((path = di_devfs_path(di_parent_node(dnode))) == NULL) {
823 			topo_mod_dprintf(mod, "failed to get dev path");
824 			(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
825 			goto out;
826 		}
827 		if (strcmp(parent, path) == 0) {
828 			if (di_walk_minor(dnode, DDI_NT_NVME_NEXUS, 0,
829 			    &wkarg, discover_nvme_ctl) < 0) {
830 				di_devfs_path_free(path);
831 				goto out;
832 			}
833 		}
834 		di_devfs_path_free(path);
835 		dnode = di_drv_next_node(dnode);
836 	}
837 	ret = 0;
838 
839 out:
840 	topo_mod_strfree(mod, parent);
841 	return (ret);
842 }
843