1 /*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13 *
14 * https://nvmexpress.org/developers/nvme-specification/
15 *
16 *
17 * Notes on coding style
18 * ---------------------
19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
20  * NVMe subsystem uses the format from the NVMe specifications in comments
21  * (i.e. an 'h' suffix instead of a '0x' prefix).
22 *
23 * Usage
24 * -----
25 * See docs/system/nvme.rst for extensive documentation.
26 *
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * sriov_max_vfs=<N[optional]> \
39 * sriov_vq_flexible=<N[optional]> \
40 * sriov_vi_flexible=<N[optional]> \
41 * sriov_max_vi_per_vf=<N[optional]> \
42 * sriov_max_vq_per_vf=<N[optional]> \
43 * atomic.dn=<on|off[optional]>, \
44  *              atomic.awun=<N[optional]>, \
45  *              atomic.awupf=<N[optional]>, \
46 * subsys=<subsys_id>
47 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
48 * zoned=<true|false[optional]>, \
49 * subsys=<subsys_id>,shared=<true|false[optional]>, \
50 * detached=<true|false[optional]>, \
51 * zoned.zone_size=<N[optional]>, \
52 * zoned.zone_capacity=<N[optional]>, \
53 * zoned.descr_ext_size=<N[optional]>, \
54 * zoned.max_active=<N[optional]>, \
55 * zoned.max_open=<N[optional]>, \
56 * zoned.cross_read=<true|false[optional]>
57 *
58  * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed to
59  * be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
60 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
61 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
62 *
63  * PMR emulation can be enabled by pointing pmrdev to a memory-backend-file.
64 * For example:
65 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
66 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
67 *
68 * The PMR will use BAR 4/5 exclusively.
69 *
70  * To place controller(s) and namespace(s) in a subsystem, provide an
71  * nvme-subsys device as shown above.
72 *
73 * nvme subsystem device parameters
74 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
75 * - `nqn`
76 * This parameter provides the `<nqn_id>` part of the string
77 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
78 * of subsystem controllers. Note that `<nqn_id>` should be unique per
79 * subsystem, but this is not enforced by QEMU. If not specified, it will
80 * default to the value of the `id` parameter (`<subsys_id>`).
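 *   For example (illustrative value), `-device nvme-subsys,id=subsys0,nqn=subsys0`
 *   results in the subsystem controllers reporting a SUBNQN of
 *   `nqn.2019-08.org.qemu:subsys0`.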
81 *
82 * nvme device parameters
83 * ~~~~~~~~~~~~~~~~~~~~~~
84 * - `subsys`
85 * Specifying this parameter attaches the controller to the subsystem and
86 * the SUBNQN field in the controller will report the NQN of the subsystem
87  *   device. This also enables the multi-controller capability, reported in
88  *   the CMIC (Controller Multi-path I/O and Namespace Sharing Capabilities)
89  *   field of the Identify Controller data structure.
90 *
91 * - `aerl`
92 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
93  *   of concurrently outstanding Asynchronous Event Request commands supported
94 * by the controller. This is a 0's based value.
95 *
96 * - `aer_max_queued`
97 * This is the maximum number of events that the device will enqueue for
98 * completion when there are no outstanding AERs. When the maximum number of
99  *   enqueued events is reached, subsequent events will be dropped.
100 *
101 * - `mdts`
102 * Indicates the maximum data transfer size for a command that transfers data
103 * between host-accessible memory and the controller. The value is specified
104 * as a power of two (2^n) and is in units of the minimum memory page size
105 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
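 *   For example, assuming a minimum memory page size (CAP.MPSMIN) of 4 KiB,
 *   the default of 7 corresponds to 4 KiB * 2^7 = 512 KiB.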
106 *
107 * - `vsl`
108 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
109 * this value is specified as a power of two (2^n) and is in units of the
110 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
111 * KiB).
112 *
113 * - `zoned.zasl`
114 * Indicates the maximum data transfer size for the Zone Append command. Like
115 * `mdts`, the value is specified as a power of two (2^n) and is in units of
116 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
117 * defaulting to the value of `mdts`).
118 *
119 * - `zoned.auto_transition`
120  *   Indicates whether zones in the implicitly opened state may be
121  *   automatically transitioned to the closed state for resource management
122  *   purposes. Defaults to 'on'.
123 *
124 * - `sriov_max_vfs`
125 * Indicates the maximum number of PCIe virtual functions supported
126 * by the controller. The default value is 0. Specifying a non-zero value
127 * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
128 * Virtual function controllers will not report SR-IOV capability.
129 *
130 * NOTE: Single Root I/O Virtualization support is experimental.
131 * All the related parameters may be subject to change.
132 *
133 * - `sriov_vq_flexible`
134 * Indicates the total number of flexible queue resources assignable to all
135  *   the secondary controllers. Implicitly sets the number of the primary
136  *   controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
137 *
138 * - `sriov_vi_flexible`
139 * Indicates the total number of flexible interrupt resources assignable to
140  *   all the secondary controllers. Implicitly sets the number of the primary
141  *   controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
142 *
143 * - `sriov_max_vi_per_vf`
144 * Indicates the maximum number of virtual interrupt resources assignable
145 * to a secondary controller. The default 0 resolves to
146 * `(sriov_vi_flexible / sriov_max_vfs)`.
147 *
148 * - `sriov_max_vq_per_vf`
149 * Indicates the maximum number of virtual queue resources assignable to
150 * a secondary controller. The default 0 resolves to
151 * `(sriov_vq_flexible / sriov_max_vfs)`.
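 *
 *   As an illustrative (not prescriptive) example, with `max_ioqpairs=8`,
 *   `msix_qsize=9`, `sriov_max_vfs=2`, `sriov_vq_flexible=4` and
 *   `sriov_vi_flexible=4`, the primary controller is left with
 *   `8 - 4 = 4` private queue resources and `9 - 4 = 5` private interrupt
 *   resources, per the formulas above.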
152 *
153 * nvme namespace device parameters
154 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
155 * - `shared`
156 * When the parent nvme device (as defined explicitly by the 'bus' parameter
157 * or implicitly by the most recently defined NvmeBus) is linked to an
158 * nvme-subsys device, the namespace will be attached to all controllers in
159 * the subsystem. If set to 'off' (the default), the namespace will remain a
160 * private namespace and may only be attached to a single controller at a
161 * time.
162 *
163 * - `detached`
164 * This parameter is only valid together with the `subsys` parameter. If left
165 * at the default value (`false/off`), the namespace will be attached to all
166 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
167 * namespace will be available in the subsystem but not attached to any
168 * controllers.
169 *
170  * Setting `zoned` to true selects the Zoned Command Set for the namespace.
171 * In this case, the following namespace properties are available to configure
172 * zoned operation:
173 * zoned.zone_size=<zone size in bytes, default: 128MiB>
174 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
175 *
176 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
177 * The value 0 (default) forces zone capacity to be the same as zone
178 * size. The value of this property may not exceed zone size.
179 *
180 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
181 * This value needs to be specified in 64B units. If it is zero,
182 * namespace(s) will not support zone descriptor extensions.
183 *
184 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
185 * The default value means there is no limit to the number of
186 * concurrently active zones.
187 *
188 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
189 * The default value means there is no limit to the number of
190 * concurrently open zones.
191 *
192 * zoned.cross_read=<enable RAZB, default: false>
193 * Setting this property to true enables Read Across Zone Boundaries.
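 *
 *   As a purely illustrative example (all values arbitrary), a zoned namespace
 *   could be configured with:
 *     -device nvme-ns,drive=<drive_id>,zoned=true,zoned.zone_size=64M, \
 *        zoned.zone_capacity=62M,zoned.max_open=16,zoned.max_active=32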
194 */
195
196 #include "qemu/osdep.h"
197 #include "qemu/cutils.h"
198 #include "qemu/error-report.h"
199 #include "qemu/log.h"
200 #include "qemu/units.h"
201 #include "qemu/range.h"
202 #include "qapi/error.h"
203 #include "qapi/visitor.h"
204 #include "sysemu/sysemu.h"
205 #include "sysemu/block-backend.h"
206 #include "sysemu/hostmem.h"
207 #include "hw/pci/msix.h"
208 #include "hw/pci/pcie_sriov.h"
209 #include "sysemu/spdm-socket.h"
210 #include "migration/vmstate.h"
211
212 #include "nvme.h"
213 #include "dif.h"
214 #include "trace.h"
215
216 #define NVME_MAX_IOQPAIRS 0xffff
217 #define NVME_DB_SIZE 4
218 #define NVME_SPEC_VER 0x00010400
219 #define NVME_CMB_BIR 2
220 #define NVME_PMR_BIR 4
221 #define NVME_TEMPERATURE 0x143
222 #define NVME_TEMPERATURE_WARNING 0x157
223 #define NVME_TEMPERATURE_CRITICAL 0x175
224 #define NVME_NUM_FW_SLOTS 1
225 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
226 #define NVME_VF_RES_GRANULARITY 1
227 #define NVME_VF_OFFSET 0x1
228 #define NVME_VF_STRIDE 1
229
230 #define NVME_GUEST_ERR(trace, fmt, ...) \
231 do { \
232 (trace_##trace)(__VA_ARGS__); \
233 qemu_log_mask(LOG_GUEST_ERROR, #trace \
234 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
235 } while (0)
236
237 static const bool nvme_feature_support[NVME_FID_MAX] = {
238 [NVME_ARBITRATION] = true,
239 [NVME_POWER_MANAGEMENT] = true,
240 [NVME_TEMPERATURE_THRESHOLD] = true,
241 [NVME_ERROR_RECOVERY] = true,
242 [NVME_VOLATILE_WRITE_CACHE] = true,
243 [NVME_NUMBER_OF_QUEUES] = true,
244 [NVME_INTERRUPT_COALESCING] = true,
245 [NVME_INTERRUPT_VECTOR_CONF] = true,
246 [NVME_WRITE_ATOMICITY] = true,
247 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
248 [NVME_TIMESTAMP] = true,
249 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
250 [NVME_COMMAND_SET_PROFILE] = true,
251 [NVME_FDP_MODE] = true,
252 [NVME_FDP_EVENTS] = true,
253 };
254
255 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
256 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
257 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
258 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
259 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
260 [NVME_WRITE_ATOMICITY] = NVME_FEAT_CAP_CHANGE,
261 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
262 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
263 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
264 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
265 [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
266 [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
267 };
268
269 static const uint32_t nvme_cse_acs[256] = {
270 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
271 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
272 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
273 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
274 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
275 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
276 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
277 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
278 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
279 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
280 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
281 [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
282 [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
283 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
284 [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
285 [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
286 };
287
288 static const uint32_t nvme_cse_iocs_none[256];
289
290 static const uint32_t nvme_cse_iocs_nvm[256] = {
291 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
292 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
293 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
294 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
295 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
296 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
297 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
298 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
299 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
300 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
301 };
302
303 static const uint32_t nvme_cse_iocs_zoned[256] = {
304 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
305 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
306 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
307 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
308 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
309 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
310 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
311 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
312 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
313 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
314 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
315 };
316
317 static void nvme_process_sq(void *opaque);
318 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
319 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
320
321 static uint16_t nvme_sqid(NvmeRequest *req)
322 {
323 return le16_to_cpu(req->sq->sqid);
324 }
325
326 static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
327 uint16_t ph)
328 {
329 uint16_t rgif = ns->endgrp->fdp.rgif;
330
331 if (!rgif) {
332 return ph;
333 }
334
335 return (rg << (16 - rgif)) | ph;
336 }
337
338 static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
339 {
340 return ph < ns->fdp.nphs;
341 }
342
343 static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
344 {
345 return rg < endgrp->fdp.nrg;
346 }
347
348 static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
349 {
350 uint16_t rgif = ns->endgrp->fdp.rgif;
351
352 if (!rgif) {
353 return pid;
354 }
355
356 return pid & ((1 << (15 - rgif)) - 1);
357 }
358
359 static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
360 {
361 uint16_t rgif = ns->endgrp->fdp.rgif;
362
363 if (!rgif) {
364 return 0;
365 }
366
367 return pid >> (16 - rgif);
368 }
369
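/*
 * Note on FDP placement identifiers (PIDs), as encoded and decoded by the
 * helpers above and below: when the endurance group reports a non-zero
 * reclaim group identifier format (rgif), the reclaim group index occupies
 * the upper rgif bits of the 16-bit PID and the placement handle the lower
 * bits; with rgif == 0 the PID is simply the placement handle.
 */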
370 static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
371 uint16_t *ph, uint16_t *rg)
372 {
373 *rg = nvme_pid2rg(ns, pid);
374 *ph = nvme_pid2ph(ns, pid);
375
376 return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
377 }
378
379 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
380 NvmeZoneState state)
381 {
382 if (QTAILQ_IN_USE(zone, entry)) {
383 switch (nvme_get_zone_state(zone)) {
384 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
385 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
386 break;
387 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
388 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
389 break;
390 case NVME_ZONE_STATE_CLOSED:
391 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
392 break;
393 case NVME_ZONE_STATE_FULL:
394 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
395 default:
396 ;
397 }
398 }
399
400 nvme_set_zone_state(zone, state);
401
402 switch (state) {
403 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
404 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
405 break;
406 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
407 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
408 break;
409 case NVME_ZONE_STATE_CLOSED:
410 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
411 break;
412 case NVME_ZONE_STATE_FULL:
413 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
414 case NVME_ZONE_STATE_READ_ONLY:
415 break;
416 default:
417 zone->d.za = 0;
418 }
419 }
420
421 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
422 uint32_t opn, uint32_t zrwa)
423 {
424 if (ns->params.max_active_zones != 0 &&
425 ns->nr_active_zones + act > ns->params.max_active_zones) {
426 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
427 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
428 }
429
430 if (ns->params.max_open_zones != 0 &&
431 ns->nr_open_zones + opn > ns->params.max_open_zones) {
432 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
433 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
434 }
435
436 if (zrwa > ns->zns.numzrwa) {
437 return NVME_NOZRWA | NVME_DNR;
438 }
439
440 return NVME_SUCCESS;
441 }
442
443 /*
444 * Check if we can open a zone without exceeding open/active limits.
445 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
446 */
447 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
448 {
449 return nvme_zns_check_resources(ns, act, opn, 0);
450 }
451
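/*
 * Allocate a slot in the fixed-size FDP event ring buffer. If the buffer is
 * full, the oldest event is overwritten and the start index advances with
 * the write position.
 */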
452 static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
453 {
454 NvmeFdpEvent *ret = NULL;
455 bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
456
457 ret = &ebuf->events[ebuf->next++];
458 if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
459 ebuf->next = 0;
460 }
461 if (is_full) {
462 ebuf->start = ebuf->next;
463 } else {
464 ebuf->nelems++;
465 }
466
467 memset(ret, 0, sizeof(NvmeFdpEvent));
468 ret->timestamp = nvme_get_timestamp(n);
469
470 return ret;
471 }
472
473 static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
474 {
475 return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
476 }
477
478 static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
479 {
480 NvmeEnduranceGroup *endgrp = ns->endgrp;
481 NvmeRuHandle *ruh;
482 NvmeReclaimUnit *ru;
483 NvmeFdpEvent *e = NULL;
484 uint16_t ph, rg, ruhid;
485
486 if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
487 return false;
488 }
489
490 ruhid = ns->fdp.phs[ph];
491
492 ruh = &endgrp->fdp.ruhs[ruhid];
493 ru = &ruh->rus[rg];
494
495 if (ru->ruamw) {
496 if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
497 e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
498 e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
499 e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
500 e->pid = cpu_to_le16(pid);
501 e->nsid = cpu_to_le32(ns->params.nsid);
502 e->rgid = cpu_to_le16(rg);
503 e->ruhid = cpu_to_le16(ruhid);
504 }
505
506 /* log (eventual) GC overhead of prematurely swapping the RU */
507 nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
508 }
509
510 ru->ruamw = ruh->ruamw;
511
512 return true;
513 }
514
515 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
516 {
517 hwaddr hi, lo;
518
519 if (!n->cmb.cmse) {
520 return false;
521 }
522
523 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
524 hi = lo + int128_get64(n->cmb.mem.size);
525
526 return addr >= lo && addr < hi;
527 }
528
529 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
530 {
531 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
532 return &n->cmb.buf[addr - base];
533 }
534
535 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
536 {
537 hwaddr hi;
538
539 if (!n->pmr.cmse) {
540 return false;
541 }
542
543 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
544
545 return addr >= n->pmr.cba && addr < hi;
546 }
547
548 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
549 {
550 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
551 }
552
553 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
554 {
555 hwaddr hi, lo;
556
557 /*
558 * The purpose of this check is to guard against invalid "local" access to
559 * the iomem (i.e. controller registers). Thus, we check against the range
560 * covered by the 'bar0' MemoryRegion since that is currently composed of
561 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
562 * that if the device model is ever changed to allow the CMB to be located
563 * in BAR0 as well, then this must be changed.
564 */
565 lo = n->bar0.addr;
566 hi = lo + int128_get64(n->bar0.size);
567
568 return addr >= lo && addr < hi;
569 }
570
571 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
572 {
573 hwaddr hi = addr + size - 1;
574 if (hi < addr) {
575 return 1;
576 }
577
578 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
579 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
580 return 0;
581 }
582
583 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
584 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
585 return 0;
586 }
587
588 return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
589 }
590
591 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
592 {
593 hwaddr hi = addr + size - 1;
594 if (hi < addr) {
595 return 1;
596 }
597
598 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
599 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
600 return 0;
601 }
602
603 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
604 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
605 return 0;
606 }
607
608 return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
609 }
610
611 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
612 {
613 return nsid &&
614 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
615 }
616
617 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
618 {
619 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
620 }
621
622 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
623 {
624 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
625 }
626
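/* advance the CQ tail, inverting the phase tag each time the queue wraps */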
627 static void nvme_inc_cq_tail(NvmeCQueue *cq)
628 {
629 cq->tail++;
630 if (cq->tail >= cq->size) {
631 cq->tail = 0;
632 cq->phase = !cq->phase;
633 }
634 }
635
636 static void nvme_inc_sq_head(NvmeSQueue *sq)
637 {
638 sq->head = (sq->head + 1) % sq->size;
639 }
640
641 static uint8_t nvme_cq_full(NvmeCQueue *cq)
642 {
643 return (cq->tail + 1) % cq->size == cq->head;
644 }
645
646 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
647 {
648 return sq->head == sq->tail;
649 }
650
651 static void nvme_irq_check(NvmeCtrl *n)
652 {
653 PCIDevice *pci = PCI_DEVICE(n);
654 uint32_t intms = ldl_le_p(&n->bar.intms);
655
656 if (msix_enabled(pci)) {
657 return;
658 }
659 if (~intms & n->irq_status) {
660 pci_irq_assert(pci);
661 } else {
662 pci_irq_deassert(pci);
663 }
664 }
665
666 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
667 {
668 PCIDevice *pci = PCI_DEVICE(n);
669
670 if (cq->irq_enabled) {
671 if (msix_enabled(pci)) {
672 trace_pci_nvme_irq_msix(cq->vector);
673 msix_notify(pci, cq->vector);
674 } else {
675 trace_pci_nvme_irq_pin();
676 assert(cq->vector < 32);
677 n->irq_status |= 1 << cq->vector;
678 nvme_irq_check(n);
679 }
680 } else {
681 trace_pci_nvme_irq_masked();
682 }
683 }
684
685 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
686 {
687 if (cq->irq_enabled) {
688 if (msix_enabled(PCI_DEVICE(n))) {
689 return;
690 } else {
691 assert(cq->vector < 32);
692 if (!n->cq_pending) {
693 n->irq_status &= ~(1 << cq->vector);
694 }
695 nvme_irq_check(n);
696 }
697 }
698 }
699
700 static void nvme_req_clear(NvmeRequest *req)
701 {
702 req->ns = NULL;
703 req->opaque = NULL;
704 req->aiocb = NULL;
705 memset(&req->cqe, 0x0, sizeof(req->cqe));
706 req->status = NVME_SUCCESS;
707 }
708
709 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
710 {
711 if (dma) {
712 pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
713 sg->flags = NVME_SG_DMA;
714 } else {
715 qemu_iovec_init(&sg->iov, 0);
716 }
717
718 sg->flags |= NVME_SG_ALLOC;
719 }
720
721 static inline void nvme_sg_unmap(NvmeSg *sg)
722 {
723 if (!(sg->flags & NVME_SG_ALLOC)) {
724 return;
725 }
726
727 if (sg->flags & NVME_SG_DMA) {
728 qemu_sglist_destroy(&sg->qsg);
729 } else {
730 qemu_iovec_destroy(&sg->iov);
731 }
732
733 memset(sg, 0x0, sizeof(*sg));
734 }
735
736 /*
737 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
738 * holds both data and metadata. This function splits the data and metadata
739 * into two separate QSG/IOVs.
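 *
 * For example (illustrative sizes), with a 512 byte LBA format carrying 8
 * bytes of metadata per LBA, each 520 byte extended LBA in `sg` contributes a
 * 512 byte entry to `data` and an 8 byte entry to `mdata`.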
740 */
741 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
742 NvmeSg *mdata)
743 {
744 NvmeSg *dst = data;
745 uint32_t trans_len, count = ns->lbasz;
746 uint64_t offset = 0;
747 bool dma = sg->flags & NVME_SG_DMA;
748 size_t sge_len;
749 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
750 int sg_idx = 0;
751
752 assert(sg->flags & NVME_SG_ALLOC);
753
754 while (sg_len) {
755 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
756
757 trans_len = MIN(sg_len, count);
758 trans_len = MIN(trans_len, sge_len - offset);
759
760 if (dst) {
761 if (dma) {
762 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
763 trans_len);
764 } else {
765 qemu_iovec_add(&dst->iov,
766 sg->iov.iov[sg_idx].iov_base + offset,
767 trans_len);
768 }
769 }
770
771 sg_len -= trans_len;
772 count -= trans_len;
773 offset += trans_len;
774
775 if (count == 0) {
776 dst = (dst == data) ? mdata : data;
777 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
778 }
779
780 if (sge_len == offset) {
781 offset = 0;
782 sg_idx++;
783 }
784 }
785 }
786
787 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
788 size_t len)
789 {
790 if (!len) {
791 return NVME_SUCCESS;
792 }
793
794 trace_pci_nvme_map_addr_cmb(addr, len);
795
796 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
797 return NVME_DATA_TRAS_ERROR;
798 }
799
800 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
801
802 return NVME_SUCCESS;
803 }
804
805 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
806 size_t len)
807 {
808 if (!len) {
809 return NVME_SUCCESS;
810 }
811
812 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
813 return NVME_DATA_TRAS_ERROR;
814 }
815
816 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
817
818 return NVME_SUCCESS;
819 }
820
821 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
822 {
823 bool cmb = false, pmr = false;
824
825 if (!len) {
826 return NVME_SUCCESS;
827 }
828
829 trace_pci_nvme_map_addr(addr, len);
830
831 if (nvme_addr_is_iomem(n, addr)) {
832 return NVME_DATA_TRAS_ERROR;
833 }
834
835 if (nvme_addr_is_cmb(n, addr)) {
836 cmb = true;
837 } else if (nvme_addr_is_pmr(n, addr)) {
838 pmr = true;
839 }
840
841 if (cmb || pmr) {
842 if (sg->flags & NVME_SG_DMA) {
843 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
844 }
845
846 if (sg->iov.niov + 1 > IOV_MAX) {
847 goto max_mappings_exceeded;
848 }
849
850 if (cmb) {
851 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
852 } else {
853 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
854 }
855 }
856
857 if (!(sg->flags & NVME_SG_DMA)) {
858 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
859 }
860
861 if (sg->qsg.nsg + 1 > IOV_MAX) {
862 goto max_mappings_exceeded;
863 }
864
865 qemu_sglist_add(&sg->qsg, addr, len);
866
867 return NVME_SUCCESS;
868
869 max_mappings_exceeded:
870 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
871 "number of mappings exceed 1024");
872 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
873 }
874
875 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
876 {
877 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
878 }
879
880 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
881 uint64_t prp2, uint32_t len)
882 {
883 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
884 trans_len = MIN(len, trans_len);
885 int num_prps = (len >> n->page_bits) + 1;
886 uint16_t status;
887 int ret;
888
889 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
890
891 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
892
893 status = nvme_map_addr(n, sg, prp1, trans_len);
894 if (status) {
895 goto unmap;
896 }
897
898 len -= trans_len;
899 if (len) {
900 if (len > n->page_size) {
901 g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
902 uint32_t nents, prp_trans;
903 int i = 0;
904
905 /*
906 * The first PRP list entry, pointed to by PRP2, may contain an offset.
907 * Hence, we need to calculate the number of entries based on that
908 * offset.
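 *
 * For example (illustrative numbers), with a 4096 byte page size and PRP2
 * pointing 16 bytes into a page, the first list holds
 * (4096 - 16) / 8 = 510 entries.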
909 */
910 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
911 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
912 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
913 if (ret) {
914 trace_pci_nvme_err_addr_read(prp2);
915 status = NVME_DATA_TRAS_ERROR;
916 goto unmap;
917 }
918 while (len != 0) {
919 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
920
921 if (i == nents - 1 && len > n->page_size) {
922 if (unlikely(prp_ent & (n->page_size - 1))) {
923 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
924 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
925 goto unmap;
926 }
927
928 i = 0;
929 nents = (len + n->page_size - 1) >> n->page_bits;
930 nents = MIN(nents, n->max_prp_ents);
931 prp_trans = nents * sizeof(uint64_t);
932 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
933 prp_trans);
934 if (ret) {
935 trace_pci_nvme_err_addr_read(prp_ent);
936 status = NVME_DATA_TRAS_ERROR;
937 goto unmap;
938 }
939 prp_ent = le64_to_cpu(prp_list[i]);
940 }
941
942 if (unlikely(prp_ent & (n->page_size - 1))) {
943 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
944 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
945 goto unmap;
946 }
947
948 trans_len = MIN(len, n->page_size);
949 status = nvme_map_addr(n, sg, prp_ent, trans_len);
950 if (status) {
951 goto unmap;
952 }
953
954 len -= trans_len;
955 i++;
956 }
957 } else {
958 if (unlikely(prp2 & (n->page_size - 1))) {
959 trace_pci_nvme_err_invalid_prp2_align(prp2);
960 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
961 goto unmap;
962 }
963 status = nvme_map_addr(n, sg, prp2, len);
964 if (status) {
965 goto unmap;
966 }
967 }
968 }
969
970 return NVME_SUCCESS;
971
972 unmap:
973 nvme_sg_unmap(sg);
974 return status;
975 }
976
977 /*
978 * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
979 * number of bytes mapped from *len.
980 */
981 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
982 NvmeSglDescriptor *segment, uint64_t nsgld,
983 size_t *len, NvmeCmd *cmd)
984 {
985 dma_addr_t addr, trans_len;
986 uint32_t dlen;
987 uint16_t status;
988
989 for (int i = 0; i < nsgld; i++) {
990 uint8_t type = NVME_SGL_TYPE(segment[i].type);
991
992 switch (type) {
993 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
994 break;
995 case NVME_SGL_DESCR_TYPE_SEGMENT:
996 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
997 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
998 default:
999 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
1000 }
1001
1002 dlen = le32_to_cpu(segment[i].len);
1003
1004 if (!dlen) {
1005 continue;
1006 }
1007
1008 if (*len == 0) {
1009 /*
1010 * All data has been mapped, but the SGL contains additional
1011 * segments and/or descriptors. The controller may accept the
1012 * command and ignore the rest of the SGL.
1013 */
1014 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1015 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1016 break;
1017 }
1018
1019 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1020 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1021 }
1022
1023 trans_len = MIN(*len, dlen);
1024
1025 addr = le64_to_cpu(segment[i].addr);
1026
1027 if (UINT64_MAX - addr < dlen) {
1028 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1029 }
1030
1031 status = nvme_map_addr(n, sg, addr, trans_len);
1032 if (status) {
1033 return status;
1034 }
1035
1036 *len -= trans_len;
1037 }
1038
1039 return NVME_SUCCESS;
1040 }
1041
1042 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1043 size_t len, NvmeCmd *cmd)
1044 {
1045 /*
1046 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1047 * dynamically allocating a potentially huge SGL. The spec allows the SGL
1048 * to be larger (as in number of bytes required to describe the SGL
1049 * descriptors and segment chain) than the command transfer size, so it is
1050 * not bounded by MDTS.
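 *
 * With 16 byte SGL descriptors, a 256 descriptor chunk amounts to a single
 * 4 KiB read per iteration.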
1051 */
1052 #define SEG_CHUNK_SIZE 256
1053
1054 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
1055 uint64_t nsgld;
1056 uint32_t seg_len;
1057 uint16_t status;
1058 hwaddr addr;
1059 int ret;
1060
1061 sgld = &sgl;
1062 addr = le64_to_cpu(sgl.addr);
1063
1064 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1065
1066 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1067
1068 /*
1069 * If the entire transfer can be described with a single data block it can
1070 * be mapped directly.
1071 */
1072 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1073 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1074 if (status) {
1075 goto unmap;
1076 }
1077
1078 goto out;
1079 }
1080
1081 for (;;) {
1082 switch (NVME_SGL_TYPE(sgld->type)) {
1083 case NVME_SGL_DESCR_TYPE_SEGMENT:
1084 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1085 break;
1086 default:
1087 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1088 }
1089
1090 seg_len = le32_to_cpu(sgld->len);
1091
1092 /* check the length of the (Last) Segment descriptor */
1093 if (!seg_len || seg_len & 0xf) {
1094 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1095 }
1096
1097 if (UINT64_MAX - addr < seg_len) {
1098 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1099 }
1100
1101 nsgld = seg_len / sizeof(NvmeSglDescriptor);
1102
1103 while (nsgld > SEG_CHUNK_SIZE) {
1104 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1105 trace_pci_nvme_err_addr_read(addr);
1106 status = NVME_DATA_TRAS_ERROR;
1107 goto unmap;
1108 }
1109
1110 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1111 &len, cmd);
1112 if (status) {
1113 goto unmap;
1114 }
1115
1116 nsgld -= SEG_CHUNK_SIZE;
1117 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1118 }
1119
1120 ret = nvme_addr_read(n, addr, segment, nsgld *
1121 sizeof(NvmeSglDescriptor));
1122 if (ret) {
1123 trace_pci_nvme_err_addr_read(addr);
1124 status = NVME_DATA_TRAS_ERROR;
1125 goto unmap;
1126 }
1127
1128 last_sgld = &segment[nsgld - 1];
1129
1130 /*
1131 * If the segment ends with a Data Block, then we are done.
1132 */
1133 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1134 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1135 if (status) {
1136 goto unmap;
1137 }
1138
1139 goto out;
1140 }
1141
1142 /*
1143 * If the last descriptor was not a Data Block, then the current
1144 * segment must not be a Last Segment.
1145 */
1146 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1147 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1148 goto unmap;
1149 }
1150
1151 sgld = last_sgld;
1152 addr = le64_to_cpu(sgld->addr);
1153
1154 /*
1155 * Do not map the last descriptor; it will be a Segment or Last Segment
1156 * descriptor and is handled by the next iteration.
1157 */
1158 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1159 if (status) {
1160 goto unmap;
1161 }
1162 }
1163
1164 out:
1165 /* if there is any residual left in len, the SGL was too short */
1166 if (len) {
1167 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1168 goto unmap;
1169 }
1170
1171 return NVME_SUCCESS;
1172
1173 unmap:
1174 nvme_sg_unmap(sg);
1175 return status;
1176 }
1177
1178 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1179 NvmeCmd *cmd)
1180 {
1181 uint64_t prp1, prp2;
1182
1183 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1184 case NVME_PSDT_PRP:
1185 prp1 = le64_to_cpu(cmd->dptr.prp1);
1186 prp2 = le64_to_cpu(cmd->dptr.prp2);
1187
1188 return nvme_map_prp(n, sg, prp1, prp2, len);
1189 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1190 case NVME_PSDT_SGL_MPTR_SGL:
1191 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1192 default:
1193 return NVME_INVALID_FIELD;
1194 }
1195 }
1196
1197 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1198 NvmeCmd *cmd)
1199 {
1200 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1201 hwaddr mptr = le64_to_cpu(cmd->mptr);
1202 uint16_t status;
1203
1204 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1205 NvmeSglDescriptor sgl;
1206
1207 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1208 return NVME_DATA_TRAS_ERROR;
1209 }
1210
1211 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1212 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1213 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1214 }
1215
1216 return status;
1217 }
1218
1219 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1220 status = nvme_map_addr(n, sg, mptr, len);
1221 if (status) {
1222 nvme_sg_unmap(sg);
1223 }
1224
1225 return status;
1226 }
1227
1228 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1229 {
1230 NvmeNamespace *ns = req->ns;
1231 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1232 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1233 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1234 size_t len = nvme_l2b(ns, nlb);
1235 uint16_t status;
1236
1237 if (nvme_ns_ext(ns) &&
1238 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1239 NvmeSg sg;
1240
1241 len += nvme_m2b(ns, nlb);
1242
1243 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1244 if (status) {
1245 return status;
1246 }
1247
1248 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1249 nvme_sg_split(&sg, ns, &req->sg, NULL);
1250 nvme_sg_unmap(&sg);
1251
1252 return NVME_SUCCESS;
1253 }
1254
1255 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1256 }
1257
1258 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1259 {
1260 NvmeNamespace *ns = req->ns;
1261 size_t len = nvme_m2b(ns, nlb);
1262 uint16_t status;
1263
1264 if (nvme_ns_ext(ns)) {
1265 NvmeSg sg;
1266
1267 len += nvme_l2b(ns, nlb);
1268
1269 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1270 if (status) {
1271 return status;
1272 }
1273
1274 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1275 nvme_sg_split(&sg, ns, NULL, &req->sg);
1276 nvme_sg_unmap(&sg);
1277
1278 return NVME_SUCCESS;
1279 }
1280
1281 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1282 }
1283
1284 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1285 uint32_t len, uint32_t bytes,
1286 int32_t skip_bytes, int64_t offset,
1287 NvmeTxDirection dir)
1288 {
1289 hwaddr addr;
1290 uint32_t trans_len, count = bytes;
1291 bool dma = sg->flags & NVME_SG_DMA;
1292 int64_t sge_len;
1293 int sg_idx = 0;
1294 int ret;
1295
1296 assert(sg->flags & NVME_SG_ALLOC);
1297
1298 while (len) {
1299 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1300
1301 if (sge_len - offset < 0) {
1302 offset -= sge_len;
1303 sg_idx++;
1304 continue;
1305 }
1306
1307 if (sge_len == offset) {
1308 offset = 0;
1309 sg_idx++;
1310 continue;
1311 }
1312
1313 trans_len = MIN(len, count);
1314 trans_len = MIN(trans_len, sge_len - offset);
1315
1316 if (dma) {
1317 addr = sg->qsg.sg[sg_idx].base + offset;
1318 } else {
1319 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1320 }
1321
1322 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1323 ret = nvme_addr_read(n, addr, ptr, trans_len);
1324 } else {
1325 ret = nvme_addr_write(n, addr, ptr, trans_len);
1326 }
1327
1328 if (ret) {
1329 return NVME_DATA_TRAS_ERROR;
1330 }
1331
1332 ptr += trans_len;
1333 len -= trans_len;
1334 count -= trans_len;
1335 offset += trans_len;
1336
1337 if (count == 0) {
1338 count = bytes;
1339 offset += skip_bytes;
1340 }
1341 }
1342
1343 return NVME_SUCCESS;
1344 }
1345
1346 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1347 NvmeTxDirection dir)
1348 {
1349 assert(sg->flags & NVME_SG_ALLOC);
1350
1351 if (sg->flags & NVME_SG_DMA) {
1352 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1353 dma_addr_t residual;
1354
1355 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1356 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1357 } else {
1358 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1359 }
1360
1361 if (unlikely(residual)) {
1362 trace_pci_nvme_err_invalid_dma();
1363 return NVME_INVALID_FIELD | NVME_DNR;
1364 }
1365 } else {
1366 size_t bytes;
1367
1368 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1369 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1370 } else {
1371 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1372 }
1373
1374 if (unlikely(bytes != len)) {
1375 trace_pci_nvme_err_invalid_dma();
1376 return NVME_INVALID_FIELD | NVME_DNR;
1377 }
1378 }
1379
1380 return NVME_SUCCESS;
1381 }
1382
1383 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1384 NvmeRequest *req)
1385 {
1386 uint16_t status;
1387
1388 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1389 if (status) {
1390 return status;
1391 }
1392
1393 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1394 }
1395
1396 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1397 NvmeRequest *req)
1398 {
1399 uint16_t status;
1400
1401 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1402 if (status) {
1403 return status;
1404 }
1405
1406 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1407 }
1408
1409 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1410 NvmeTxDirection dir, NvmeRequest *req)
1411 {
1412 NvmeNamespace *ns = req->ns;
1413 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1414 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1415 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1416
1417 if (nvme_ns_ext(ns) &&
1418 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1419 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1420 ns->lbaf.ms, 0, dir);
1421 }
1422
1423 return nvme_tx(n, &req->sg, ptr, len, dir);
1424 }
1425
1426 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1427 NvmeTxDirection dir, NvmeRequest *req)
1428 {
1429 NvmeNamespace *ns = req->ns;
1430 uint16_t status;
1431
1432 if (nvme_ns_ext(ns)) {
1433 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1434 ns->lbasz, ns->lbasz, dir);
1435 }
1436
1437 nvme_sg_unmap(&req->sg);
1438
1439 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1440 if (status) {
1441 return status;
1442 }
1443
1444 return nvme_tx(n, &req->sg, ptr, len, dir);
1445 }
1446
1447 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1448 uint32_t align, BlockCompletionFunc *cb,
1449 NvmeRequest *req)
1450 {
1451 assert(req->sg.flags & NVME_SG_ALLOC);
1452
1453 if (req->sg.flags & NVME_SG_DMA) {
1454 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1455 } else {
1456 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1457 }
1458 }
1459
1460 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1461 uint32_t align, BlockCompletionFunc *cb,
1462 NvmeRequest *req)
1463 {
1464 assert(req->sg.flags & NVME_SG_ALLOC);
1465
1466 if (req->sg.flags & NVME_SG_DMA) {
1467 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1468 } else {
1469 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1470 }
1471 }
1472
1473 static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1474 {
1475 trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1476
1477 stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1478 MEMTXATTRS_UNSPECIFIED);
1479 }
1480
1481 static void nvme_update_cq_head(NvmeCQueue *cq)
1482 {
1483 ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1484 MEMTXATTRS_UNSPECIFIED);
1485
1486 trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1487 }
1488
1489 static void nvme_post_cqes(void *opaque)
1490 {
1491 NvmeCQueue *cq = opaque;
1492 NvmeCtrl *n = cq->ctrl;
1493 NvmeRequest *req, *next;
1494 bool pending = cq->head != cq->tail;
1495 int ret;
1496
1497 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1498 NvmeSQueue *sq;
1499 hwaddr addr;
1500
1501 if (n->dbbuf_enabled) {
1502 nvme_update_cq_eventidx(cq);
1503 nvme_update_cq_head(cq);
1504 }
1505
1506 if (nvme_cq_full(cq)) {
1507 break;
1508 }
1509
1510 sq = req->sq;
1511 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1512 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1513 req->cqe.sq_head = cpu_to_le16(sq->head);
1514 addr = cq->dma_addr + (cq->tail << NVME_CQES);
1515 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1516 sizeof(req->cqe));
1517 if (ret) {
1518 trace_pci_nvme_err_addr_write(addr);
1519 trace_pci_nvme_err_cfs();
1520 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1521 break;
1522 }
1523 QTAILQ_REMOVE(&cq->req_list, req, entry);
1524 nvme_inc_cq_tail(cq);
1525 nvme_sg_unmap(&req->sg);
1526 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1527 }
1528 if (cq->tail != cq->head) {
1529 if (cq->irq_enabled && !pending) {
1530 n->cq_pending++;
1531 }
1532
1533 nvme_irq_assert(n, cq);
1534 }
1535 }
1536
1537 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1538 {
1539 assert(cq->cqid == req->sq->cqid);
1540 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1541 le32_to_cpu(req->cqe.result),
1542 le32_to_cpu(req->cqe.dw1),
1543 req->status);
1544
1545 if (req->status) {
1546 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1547 req->status, req->cmd.opcode);
1548 }
1549
1550 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1551 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1552
1553 qemu_bh_schedule(cq->bh);
1554 }
1555
1556 static void nvme_process_aers(void *opaque)
1557 {
1558 NvmeCtrl *n = opaque;
1559 NvmeAsyncEvent *event, *next;
1560
1561 trace_pci_nvme_process_aers(n->aer_queued);
1562
1563 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1564 NvmeRequest *req;
1565 NvmeAerResult *result;
1566
1567 /* can't post cqe if there is nothing to complete */
1568 if (!n->outstanding_aers) {
1569 trace_pci_nvme_no_outstanding_aers();
1570 break;
1571 }
1572
1573 /* ignore if masked (cqe posted, but event not cleared) */
1574 if (n->aer_mask & (1 << event->result.event_type)) {
1575 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1576 continue;
1577 }
1578
1579 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1580 n->aer_queued--;
1581
1582 n->aer_mask |= 1 << event->result.event_type;
1583 n->outstanding_aers--;
1584
1585 req = n->aer_reqs[n->outstanding_aers];
1586
1587 result = (NvmeAerResult *) &req->cqe.result;
1588 result->event_type = event->result.event_type;
1589 result->event_info = event->result.event_info;
1590 result->log_page = event->result.log_page;
1591 g_free(event);
1592
1593 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1594 result->log_page);
1595
1596 nvme_enqueue_req_completion(&n->admin_cq, req);
1597 }
1598 }
1599
1600 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1601 uint8_t event_info, uint8_t log_page)
1602 {
1603 NvmeAsyncEvent *event;
1604
1605 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1606
1607 if (n->aer_queued == n->params.aer_max_queued) {
1608 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1609 return;
1610 }
1611
1612 event = g_new(NvmeAsyncEvent, 1);
1613 event->result = (NvmeAerResult) {
1614 .event_type = event_type,
1615 .event_info = event_info,
1616 .log_page = log_page,
1617 };
1618
1619 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1620 n->aer_queued++;
1621
1622 nvme_process_aers(n);
1623 }
1624
1625 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1626 {
1627 uint8_t aer_info;
1628
1629 /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1630 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1631 return;
1632 }
1633
1634 switch (event) {
1635 case NVME_SMART_SPARE:
1636 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1637 break;
1638 case NVME_SMART_TEMPERATURE:
1639 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1640 break;
1641 case NVME_SMART_RELIABILITY:
1642 case NVME_SMART_MEDIA_READ_ONLY:
1643 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1644 case NVME_SMART_PMR_UNRELIABLE:
1645 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1646 break;
1647 default:
1648 return;
1649 }
1650
1651 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1652 }
1653
1654 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1655 {
1656 NvmeAsyncEvent *event, *next;
1657
1658 n->aer_mask &= ~(1 << event_type);
1659
1660 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1661 if (event->result.event_type == event_type) {
1662 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1663 n->aer_queued--;
1664 g_free(event);
1665 }
1666 }
1667 }
1668
1669 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1670 {
1671 uint8_t mdts = n->params.mdts;
1672
1673 if (mdts && len > n->page_size << mdts) {
1674 trace_pci_nvme_err_mdts(len);
1675 return NVME_INVALID_FIELD | NVME_DNR;
1676 }
1677
1678 return NVME_SUCCESS;
1679 }
1680
1681 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1682 uint32_t nlb)
1683 {
1684 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1685
1686 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1687 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1688 return NVME_LBA_RANGE | NVME_DNR;
1689 }
1690
1691 return NVME_SUCCESS;
1692 }
1693
1694 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1695 uint32_t nlb, int flags)
1696 {
1697 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1698
1699 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1700 int64_t offset = nvme_l2b(ns, slba);
1701 int ret;
1702
1703 /*
1704 * `pnum` holds the number of bytes after offset that share the same
1705 * allocation status as the byte at offset. If `pnum` is different from
1706 * `bytes`, we should check the allocation status of the next range and
1707 * continue this until all bytes have been checked.
1708 */
1709 do {
1710 bytes -= pnum;
1711
1712 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1713 if (ret < 0) {
1714 return ret;
1715 }
1716
1717
1718 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1719 !!(ret & BDRV_BLOCK_ZERO));
1720
1721 if (!(ret & flags)) {
1722 return 1;
1723 }
1724
1725 offset += pnum;
1726 } while (pnum != bytes);
1727
1728 return 0;
1729 }
1730
1731 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1732 uint32_t nlb)
1733 {
1734 int ret;
1735 Error *err = NULL;
1736
1737 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1738 if (ret) {
1739 if (ret < 0) {
1740 error_setg_errno(&err, -ret, "unable to get block status");
1741 error_report_err(err);
1742
1743 return NVME_INTERNAL_DEV_ERROR;
1744 }
1745
1746 return NVME_DULB;
1747 }
1748
1749 return NVME_SUCCESS;
1750 }
1751
1752 static void nvme_aio_err(NvmeRequest *req, int ret)
1753 {
1754 uint16_t status = NVME_SUCCESS;
1755 Error *local_err = NULL;
1756
1757 switch (req->cmd.opcode) {
1758 case NVME_CMD_READ:
1759 status = NVME_UNRECOVERED_READ;
1760 break;
1761 case NVME_CMD_FLUSH:
1762 case NVME_CMD_WRITE:
1763 case NVME_CMD_WRITE_ZEROES:
1764 case NVME_CMD_ZONE_APPEND:
1765 case NVME_CMD_COPY:
1766 status = NVME_WRITE_FAULT;
1767 break;
1768 default:
1769 status = NVME_INTERNAL_DEV_ERROR;
1770 break;
1771 }
1772
1773 if (ret == -ECANCELED) {
1774 status = NVME_CMD_ABORT_REQ;
1775 }
1776
1777 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1778
1779 error_setg_errno(&local_err, -ret, "aio failed");
1780 error_report_err(local_err);
1781
1782 /*
1783 * Set the command status code to the first encountered error but allow a
1784 * subsequent Internal Device Error to trump it.
1785 */
1786 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1787 return;
1788 }
1789
1790 req->status = status;
1791 }
1792
1793 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1794 {
1795 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1796 slba / ns->zone_size;
1797 }
1798
1799 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1800 {
1801 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1802
1803 if (zone_idx >= ns->num_zones) {
1804 return NULL;
1805 }
1806
1807 return &ns->zone_array[zone_idx];
1808 }
1809
1810 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1811 {
1812 uint64_t zslba = zone->d.zslba;
1813
1814 switch (nvme_get_zone_state(zone)) {
1815 case NVME_ZONE_STATE_EMPTY:
1816 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1817 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1818 case NVME_ZONE_STATE_CLOSED:
1819 return NVME_SUCCESS;
1820 case NVME_ZONE_STATE_FULL:
1821 trace_pci_nvme_err_zone_is_full(zslba);
1822 return NVME_ZONE_FULL;
1823 case NVME_ZONE_STATE_OFFLINE:
1824 trace_pci_nvme_err_zone_is_offline(zslba);
1825 return NVME_ZONE_OFFLINE;
1826 case NVME_ZONE_STATE_READ_ONLY:
1827 trace_pci_nvme_err_zone_is_read_only(zslba);
1828 return NVME_ZONE_READ_ONLY;
1829 default:
1830 g_assert_not_reached();
1831 }
1832
1833 return NVME_INTERNAL_DEV_ERROR;
1834 }
1835
1836 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1837 uint64_t slba, uint32_t nlb)
1838 {
1839 uint64_t zcap = nvme_zone_wr_boundary(zone);
1840 uint16_t status;
1841
1842 status = nvme_check_zone_state_for_write(zone);
1843 if (status) {
1844 return status;
1845 }
1846
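    /*
     * With a valid zone random write area, writes may start at or after the
     * write pointer as long as they end within the ZRWA window; without one,
     * writes must start exactly at the write pointer.
     */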
1847 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1848 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1849
1850 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1851 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1852 return NVME_ZONE_INVALID_WRITE;
1853 }
1854 } else {
1855 if (unlikely(slba != zone->w_ptr)) {
1856 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1857 zone->w_ptr);
1858 return NVME_ZONE_INVALID_WRITE;
1859 }
1860 }
1861
1862 if (unlikely((slba + nlb) > zcap)) {
1863 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1864 return NVME_ZONE_BOUNDARY_ERROR;
1865 }
1866
1867 return NVME_SUCCESS;
1868 }
1869
1870 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1871 {
1872 switch (nvme_get_zone_state(zone)) {
1873 case NVME_ZONE_STATE_EMPTY:
1874 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1875 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1876 case NVME_ZONE_STATE_FULL:
1877 case NVME_ZONE_STATE_CLOSED:
1878 case NVME_ZONE_STATE_READ_ONLY:
1879 return NVME_SUCCESS;
1880 case NVME_ZONE_STATE_OFFLINE:
1881 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1882 return NVME_ZONE_OFFLINE;
1883 default:
1884 g_assert_not_reached();
1885 }
1886
1887 return NVME_INTERNAL_DEV_ERROR;
1888 }
1889
1890 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1891 uint32_t nlb)
1892 {
1893 NvmeZone *zone;
1894 uint64_t bndry, end;
1895 uint16_t status;
1896
1897 zone = nvme_get_zone_by_slba(ns, slba);
1898 assert(zone);
1899
1900 bndry = nvme_zone_rd_boundary(ns, zone);
1901 end = slba + nlb;
1902
1903 status = nvme_check_zone_state_for_read(zone);
1904 if (status) {
1905 ;
1906 } else if (unlikely(end > bndry)) {
1907 if (!ns->params.cross_zone_read) {
1908 status = NVME_ZONE_BOUNDARY_ERROR;
1909 } else {
1910 /*
1911 * Read across zone boundary - check that all subsequent
1912 * zones that are being read have an appropriate state.
1913 */
1914 do {
1915 zone++;
1916 status = nvme_check_zone_state_for_read(zone);
1917 if (status) {
1918 break;
1919 }
1920 } while (end > nvme_zone_rd_boundary(ns, zone));
1921 }
1922 }
1923
1924 return status;
1925 }
1926
1927 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1928 {
1929 switch (nvme_get_zone_state(zone)) {
1930 case NVME_ZONE_STATE_FULL:
1931 return NVME_SUCCESS;
1932
1933 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1934 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1935 nvme_aor_dec_open(ns);
1936 /* fallthrough */
1937 case NVME_ZONE_STATE_CLOSED:
1938 nvme_aor_dec_active(ns);
1939
1940 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1941 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1942 if (ns->params.numzrwa) {
1943 ns->zns.numzrwa++;
1944 }
1945 }
1946
1947 /* fallthrough */
1948 case NVME_ZONE_STATE_EMPTY:
1949 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1950 return NVME_SUCCESS;
1951
1952 default:
1953 return NVME_ZONE_INVAL_TRANSITION;
1954 }
1955 }
1956
1957 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1958 {
1959 switch (nvme_get_zone_state(zone)) {
1960 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1961 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1962 nvme_aor_dec_open(ns);
1963 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1964 /* fall through */
1965 case NVME_ZONE_STATE_CLOSED:
1966 return NVME_SUCCESS;
1967
1968 default:
1969 return NVME_ZONE_INVAL_TRANSITION;
1970 }
1971 }
1972
1973 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1974 {
1975 switch (nvme_get_zone_state(zone)) {
1976 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1977 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1978 nvme_aor_dec_open(ns);
1979 /* fallthrough */
1980 case NVME_ZONE_STATE_CLOSED:
1981 nvme_aor_dec_active(ns);
1982
1983 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1984 if (ns->params.numzrwa) {
1985 ns->zns.numzrwa++;
1986 }
1987 }
1988
1989 /* fallthrough */
1990 case NVME_ZONE_STATE_FULL:
1991 zone->w_ptr = zone->d.zslba;
1992 zone->d.wp = zone->w_ptr;
1993 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1994 /* fallthrough */
1995 case NVME_ZONE_STATE_EMPTY:
1996 return NVME_SUCCESS;
1997
1998 default:
1999 return NVME_ZONE_INVAL_TRANSITION;
2000 }
2001 }
2002
2003 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
2004 {
2005 NvmeZone *zone;
2006
2007 if (ns->params.max_open_zones &&
2008 ns->nr_open_zones == ns->params.max_open_zones) {
2009 zone = QTAILQ_FIRST(&ns->imp_open_zones);
2010 if (zone) {
2011 /*
2012 * Automatically close this implicitly open zone.
2013 */
2014 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
2015 nvme_zrm_close(ns, zone);
2016 }
2017 }
2018 }
2019
2020 enum {
2021 NVME_ZRM_AUTO = 1 << 0,
2022 NVME_ZRM_ZRWA = 1 << 1,
2023 };
2024
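/*
 * Transition a zone towards an opened state, accounting for active/open
 * resources. The switch intentionally falls through so that a zone in any
 * earlier state passes through all intermediate bookkeeping. With
 * NVME_ZRM_AUTO the zone ends up implicitly open; with NVME_ZRM_ZRWA a zone
 * random write area is additionally allocated to it.
 */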
2025 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
2026 NvmeZone *zone, int flags)
2027 {
2028 int act = 0;
2029 uint16_t status;
2030
2031 switch (nvme_get_zone_state(zone)) {
2032 case NVME_ZONE_STATE_EMPTY:
2033 act = 1;
2034
2035 /* fallthrough */
2036
2037 case NVME_ZONE_STATE_CLOSED:
2038 if (n->params.auto_transition_zones) {
2039 nvme_zrm_auto_transition_zone(ns);
2040 }
2041 status = nvme_zns_check_resources(ns, act, 1,
2042 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2043 if (status) {
2044 return status;
2045 }
2046
2047 if (act) {
2048 nvme_aor_inc_active(ns);
2049 }
2050
2051 nvme_aor_inc_open(ns);
2052
2053 if (flags & NVME_ZRM_AUTO) {
2054 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2055 return NVME_SUCCESS;
2056 }
2057
2058 /* fallthrough */
2059
2060 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2061 if (flags & NVME_ZRM_AUTO) {
2062 return NVME_SUCCESS;
2063 }
2064
2065 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2066
2067 /* fallthrough */
2068
2069 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2070 if (flags & NVME_ZRM_ZRWA) {
2071 ns->zns.numzrwa--;
2072
2073 zone->d.za |= NVME_ZA_ZRWA_VALID;
2074 }
2075
2076 return NVME_SUCCESS;
2077
2078 default:
2079 return NVME_ZONE_INVAL_TRANSITION;
2080 }
2081 }
2082
2083 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2084 NvmeZone *zone)
2085 {
2086 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2087 }
2088
2089 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2090 uint32_t nlb)
2091 {
2092 zone->d.wp += nlb;
2093
2094 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2095 nvme_zrm_finish(ns, zone);
2096 }
2097 }
2098
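/*
 * Implicitly commit part of a zone random write area. The number of logical
 * blocks is rounded up to a multiple of the ZRWA flush granularity (zrwafg)
 * before the write pointer and the zone write pointer are advanced.
 */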
2099 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2100 uint32_t nlbc)
2101 {
2102 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2103
2104 nlbc = nzrwafgs * ns->zns.zrwafg;
2105
2106 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2107
2108 zone->w_ptr += nlbc;
2109
2110 nvme_advance_zone_wp(ns, zone, nlbc);
2111 }
2112
2113 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2114 {
2115 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2116 NvmeZone *zone;
2117 uint64_t slba;
2118 uint32_t nlb;
2119
2120 slba = le64_to_cpu(rw->slba);
2121 nlb = le16_to_cpu(rw->nlb) + 1;
2122 zone = nvme_get_zone_by_slba(ns, slba);
2123 assert(zone);
2124
2125 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2126 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2127 uint64_t elba = slba + nlb - 1;
2128
2129 if (elba > ezrwa) {
2130 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2131 }
2132
2133 return;
2134 }
2135
2136 nvme_advance_zone_wp(ns, zone, nlb);
2137 }
2138
2139 static inline bool nvme_is_write(NvmeRequest *req)
2140 {
2141 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2142
2143 return rw->opcode == NVME_CMD_WRITE ||
2144 rw->opcode == NVME_CMD_ZONE_APPEND ||
2145 rw->opcode == NVME_CMD_WRITE_ZEROES;
2146 }
2147
2148 static void nvme_misc_cb(void *opaque, int ret)
2149 {
2150 NvmeRequest *req = opaque;
2151
2152 trace_pci_nvme_misc_cb(nvme_cid(req));
2153
2154 if (ret) {
2155 nvme_aio_err(req, ret);
2156 }
2157
2158 nvme_enqueue_req_completion(nvme_cq(req), req);
2159 }
2160
2161 void nvme_rw_complete_cb(void *opaque, int ret)
2162 {
2163 NvmeRequest *req = opaque;
2164 NvmeNamespace *ns = req->ns;
2165 BlockBackend *blk = ns->blkconf.blk;
2166 BlockAcctCookie *acct = &req->acct;
2167 BlockAcctStats *stats = blk_get_stats(blk);
2168
2169 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2170
2171 if (ret) {
2172 block_acct_failed(stats, acct);
2173 nvme_aio_err(req, ret);
2174 } else {
2175 block_acct_done(stats, acct);
2176 }
2177
2178 if (ns->params.zoned && nvme_is_write(req)) {
2179 nvme_finalize_zoned_write(ns, req);
2180 }
2181
2182 nvme_enqueue_req_completion(nvme_cq(req), req);
2183 }
2184
2185 static void nvme_rw_cb(void *opaque, int ret)
2186 {
2187 NvmeRequest *req = opaque;
2188 NvmeNamespace *ns = req->ns;
2189
2190 BlockBackend *blk = ns->blkconf.blk;
2191
2192 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2193
2194 if (ret) {
2195 goto out;
2196 }
2197
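    /*
     * The data transfer has completed at this point. If the namespace is
     * formatted with metadata, the metadata still has to be handled: zeroed
     * out for Write Zeroes, or transferred separately when an extended LBA
     * format or a metadata pointer (MPTR) is in use.
     */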
2198 if (ns->lbaf.ms) {
2199 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2200 uint64_t slba = le64_to_cpu(rw->slba);
2201 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2202 uint64_t offset = nvme_moff(ns, slba);
2203
2204 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2205 size_t mlen = nvme_m2b(ns, nlb);
2206
2207 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2208 BDRV_REQ_MAY_UNMAP,
2209 nvme_rw_complete_cb, req);
2210 return;
2211 }
2212
2213 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2214 uint16_t status;
2215
2216 nvme_sg_unmap(&req->sg);
2217 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2218 if (status) {
2219 ret = -EFAULT;
2220 goto out;
2221 }
2222
2223 if (req->cmd.opcode == NVME_CMD_READ) {
2224 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2225 }
2226
2227 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2228 }
2229 }
2230
2231 out:
2232 nvme_rw_complete_cb(req, ret);
2233 }
2234
2235 static void nvme_verify_cb(void *opaque, int ret)
2236 {
2237 NvmeBounceContext *ctx = opaque;
2238 NvmeRequest *req = ctx->req;
2239 NvmeNamespace *ns = req->ns;
2240 BlockBackend *blk = ns->blkconf.blk;
2241 BlockAcctCookie *acct = &req->acct;
2242 BlockAcctStats *stats = blk_get_stats(blk);
2243 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2244 uint64_t slba = le64_to_cpu(rw->slba);
2245 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2246 uint16_t apptag = le16_to_cpu(rw->apptag);
2247 uint16_t appmask = le16_to_cpu(rw->appmask);
2248 uint64_t reftag = le32_to_cpu(rw->reftag);
2249 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2250 uint16_t status;
2251
2252 reftag |= cdw3 << 32;
2253
2254 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2255
2256 if (ret) {
2257 block_acct_failed(stats, acct);
2258 nvme_aio_err(req, ret);
2259 goto out;
2260 }
2261
2262 block_acct_done(stats, acct);
2263
2264 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2265 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2266 ctx->mdata.iov.size, slba);
2267 if (status) {
2268 req->status = status;
2269 goto out;
2270 }
2271
2272 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2273 ctx->mdata.bounce, ctx->mdata.iov.size,
2274 prinfo, slba, apptag, appmask, &reftag);
2275 }
2276
2277 out:
2278 qemu_iovec_destroy(&ctx->data.iov);
2279 g_free(ctx->data.bounce);
2280
2281 qemu_iovec_destroy(&ctx->mdata.iov);
2282 g_free(ctx->mdata.bounce);
2283
2284 g_free(ctx);
2285
2286 nvme_enqueue_req_completion(nvme_cq(req), req);
2287 }
2288
2289
2290 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2291 {
2292 NvmeBounceContext *ctx = opaque;
2293 NvmeRequest *req = ctx->req;
2294 NvmeNamespace *ns = req->ns;
2295 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2296 uint64_t slba = le64_to_cpu(rw->slba);
2297 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2298 size_t mlen = nvme_m2b(ns, nlb);
2299 uint64_t offset = nvme_moff(ns, slba);
2300 BlockBackend *blk = ns->blkconf.blk;
2301
2302 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2303
2304 if (ret) {
2305 goto out;
2306 }
2307
2308 ctx->mdata.bounce = g_malloc(mlen);
2309
2310 qemu_iovec_reset(&ctx->mdata.iov);
2311 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2312
2313 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2314 nvme_verify_cb, ctx);
2315 return;
2316
2317 out:
2318 nvme_verify_cb(ctx, ret);
2319 }
2320
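/*
 * Bounce buffers used by the Compare command; the data and metadata read
 * from the backing device are compared against the host buffers in the
 * callbacks below.
 */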
2321 struct nvme_compare_ctx {
2322 struct {
2323 QEMUIOVector iov;
2324 uint8_t *bounce;
2325 } data;
2326
2327 struct {
2328 QEMUIOVector iov;
2329 uint8_t *bounce;
2330 } mdata;
2331 };
2332
2333 static void nvme_compare_mdata_cb(void *opaque, int ret)
2334 {
2335 NvmeRequest *req = opaque;
2336 NvmeNamespace *ns = req->ns;
2337 NvmeCtrl *n = nvme_ctrl(req);
2338 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2339 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2340 uint16_t apptag = le16_to_cpu(rw->apptag);
2341 uint16_t appmask = le16_to_cpu(rw->appmask);
2342 uint64_t reftag = le32_to_cpu(rw->reftag);
2343 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2344 struct nvme_compare_ctx *ctx = req->opaque;
2345 g_autofree uint8_t *buf = NULL;
2346 BlockBackend *blk = ns->blkconf.blk;
2347 BlockAcctCookie *acct = &req->acct;
2348 BlockAcctStats *stats = blk_get_stats(blk);
2349 uint16_t status = NVME_SUCCESS;
2350
2351 reftag |= cdw3 << 32;
2352
2353 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2354
2355 if (ret) {
2356 block_acct_failed(stats, acct);
2357 nvme_aio_err(req, ret);
2358 goto out;
2359 }
2360
2361 buf = g_malloc(ctx->mdata.iov.size);
2362
2363 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2364 NVME_TX_DIRECTION_TO_DEVICE, req);
2365 if (status) {
2366 req->status = status;
2367 goto out;
2368 }
2369
2370 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2371 uint64_t slba = le64_to_cpu(rw->slba);
2372 uint8_t *bufp;
2373 uint8_t *mbufp = ctx->mdata.bounce;
2374 uint8_t *end = mbufp + ctx->mdata.iov.size;
2375 int16_t pil = 0;
2376
2377 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2378 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2379 slba, apptag, appmask, &reftag);
2380 if (status) {
2381 req->status = status;
2382 goto out;
2383 }
2384
2385 /*
2386 * When formatted with protection information, do not compare the DIF
2387 * tuple.
2388 */
2389 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2390 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2391 }
2392
2393 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2394 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2395 req->status = NVME_CMP_FAILURE | NVME_DNR;
2396 goto out;
2397 }
2398 }
2399
2400 goto out;
2401 }
2402
2403 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2404 req->status = NVME_CMP_FAILURE | NVME_DNR;
2405 goto out;
2406 }
2407
2408 block_acct_done(stats, acct);
2409
2410 out:
2411 qemu_iovec_destroy(&ctx->data.iov);
2412 g_free(ctx->data.bounce);
2413
2414 qemu_iovec_destroy(&ctx->mdata.iov);
2415 g_free(ctx->mdata.bounce);
2416
2417 g_free(ctx);
2418
2419 nvme_enqueue_req_completion(nvme_cq(req), req);
2420 }
2421
2422 static void nvme_compare_data_cb(void *opaque, int ret)
2423 {
2424 NvmeRequest *req = opaque;
2425 NvmeCtrl *n = nvme_ctrl(req);
2426 NvmeNamespace *ns = req->ns;
2427 BlockBackend *blk = ns->blkconf.blk;
2428 BlockAcctCookie *acct = &req->acct;
2429 BlockAcctStats *stats = blk_get_stats(blk);
2430
2431 struct nvme_compare_ctx *ctx = req->opaque;
2432 g_autofree uint8_t *buf = NULL;
2433 uint16_t status;
2434
2435 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2436
2437 if (ret) {
2438 block_acct_failed(stats, acct);
2439 nvme_aio_err(req, ret);
2440 goto out;
2441 }
2442
2443 buf = g_malloc(ctx->data.iov.size);
2444
2445 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2446 NVME_TX_DIRECTION_TO_DEVICE, req);
2447 if (status) {
2448 req->status = status;
2449 goto out;
2450 }
2451
2452 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2453 req->status = NVME_CMP_FAILURE | NVME_DNR;
2454 goto out;
2455 }
2456
2457 if (ns->lbaf.ms) {
2458 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2459 uint64_t slba = le64_to_cpu(rw->slba);
2460 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2461 size_t mlen = nvme_m2b(ns, nlb);
2462 uint64_t offset = nvme_moff(ns, slba);
2463
2464 ctx->mdata.bounce = g_malloc(mlen);
2465
2466 qemu_iovec_init(&ctx->mdata.iov, 1);
2467 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2468
2469 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2470 nvme_compare_mdata_cb, req);
2471 return;
2472 }
2473
2474 block_acct_done(stats, acct);
2475
2476 out:
2477 qemu_iovec_destroy(&ctx->data.iov);
2478 g_free(ctx->data.bounce);
2479 g_free(ctx);
2480
2481 nvme_enqueue_req_completion(nvme_cq(req), req);
2482 }
2483
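/*
 * Dataset Management (deallocate) is processed one range at a time:
 * nvme_dsm_cb() issues a discard for the next range and nvme_dsm_md_cb()
 * zeroes the corresponding metadata (if any) before moving on.
 */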
2484 typedef struct NvmeDSMAIOCB {
2485 BlockAIOCB common;
2486 BlockAIOCB *aiocb;
2487 NvmeRequest *req;
2488 int ret;
2489
2490 NvmeDsmRange *range;
2491 unsigned int nr;
2492 unsigned int idx;
2493 } NvmeDSMAIOCB;
2494
2495 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2496 {
2497 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2498
2499 /* break nvme_dsm_cb loop */
2500 iocb->idx = iocb->nr;
2501 iocb->ret = -ECANCELED;
2502
2503 if (iocb->aiocb) {
2504 blk_aio_cancel_async(iocb->aiocb);
2505 iocb->aiocb = NULL;
2506 } else {
2507 /*
2508 * We only reach this if nvme_dsm_cancel() has already been called or
2509 * the command ran to completion.
2510 */
2511 assert(iocb->idx == iocb->nr);
2512 }
2513 }
2514
2515 static const AIOCBInfo nvme_dsm_aiocb_info = {
2516 .aiocb_size = sizeof(NvmeDSMAIOCB),
2517 .cancel_async = nvme_dsm_cancel,
2518 };
2519
2520 static void nvme_dsm_cb(void *opaque, int ret);
2521
2522 static void nvme_dsm_md_cb(void *opaque, int ret)
2523 {
2524 NvmeDSMAIOCB *iocb = opaque;
2525 NvmeRequest *req = iocb->req;
2526 NvmeNamespace *ns = req->ns;
2527 NvmeDsmRange *range;
2528 uint64_t slba;
2529 uint32_t nlb;
2530
2531 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2532 goto done;
2533 }
2534
2535 range = &iocb->range[iocb->idx - 1];
2536 slba = le64_to_cpu(range->slba);
2537 nlb = le32_to_cpu(range->nlb);
2538
2539 /*
2540      * Check that all blocks were discarded (zeroed); otherwise we do not zero
2541 * the metadata.
2542 */
2543
2544 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2545 if (ret) {
2546 if (ret < 0) {
2547 goto done;
2548 }
2549
2550 nvme_dsm_cb(iocb, 0);
2551 return;
2552 }
2553
2554 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2555 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2556 nvme_dsm_cb, iocb);
2557 return;
2558
2559 done:
2560 nvme_dsm_cb(iocb, ret);
2561 }
2562
2563 static void nvme_dsm_cb(void *opaque, int ret)
2564 {
2565 NvmeDSMAIOCB *iocb = opaque;
2566 NvmeRequest *req = iocb->req;
2567 NvmeCtrl *n = nvme_ctrl(req);
2568 NvmeNamespace *ns = req->ns;
2569 NvmeDsmRange *range;
2570 uint64_t slba;
2571 uint32_t nlb;
2572
2573 if (iocb->ret < 0) {
2574 goto done;
2575 } else if (ret < 0) {
2576 iocb->ret = ret;
2577 goto done;
2578 }
2579
2580 next:
2581 if (iocb->idx == iocb->nr) {
2582 goto done;
2583 }
2584
2585 range = &iocb->range[iocb->idx++];
2586 slba = le64_to_cpu(range->slba);
2587 nlb = le32_to_cpu(range->nlb);
2588
2589 trace_pci_nvme_dsm_deallocate(slba, nlb);
2590
2591 if (nlb > n->dmrsl) {
2592 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2593 goto next;
2594 }
2595
2596 if (nvme_check_bounds(ns, slba, nlb)) {
2597 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2598 ns->id_ns.nsze);
2599 goto next;
2600 }
2601
2602 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2603 nvme_l2b(ns, nlb),
2604 nvme_dsm_md_cb, iocb);
2605 return;
2606
2607 done:
2608 iocb->aiocb = NULL;
2609 iocb->common.cb(iocb->common.opaque, iocb->ret);
2610 g_free(iocb->range);
2611 qemu_aio_unref(iocb);
2612 }
2613
2614 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2615 {
2616 NvmeNamespace *ns = req->ns;
2617 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2618 uint32_t attr = le32_to_cpu(dsm->attributes);
2619 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2620 uint16_t status = NVME_SUCCESS;
2621
2622 trace_pci_nvme_dsm(nr, attr);
2623
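    /*
     * Only the Deallocate (AD) attribute is acted upon; without it there is
     * nothing to do and the command completes successfully.
     */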
2624 if (attr & NVME_DSMGMT_AD) {
2625 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2626 nvme_misc_cb, req);
2627
2628 iocb->req = req;
2629 iocb->ret = 0;
2630 iocb->range = g_new(NvmeDsmRange, nr);
2631 iocb->nr = nr;
2632 iocb->idx = 0;
2633
2634 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2635 req);
2636 if (status) {
2637 g_free(iocb->range);
2638 qemu_aio_unref(iocb);
2639
2640 return status;
2641 }
2642
2643 req->aiocb = &iocb->common;
2644 nvme_dsm_cb(iocb, 0);
2645
2646 return NVME_NO_COMPLETE;
2647 }
2648
2649 return status;
2650 }
2651
2652 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2653 {
2654 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2655 NvmeNamespace *ns = req->ns;
2656 BlockBackend *blk = ns->blkconf.blk;
2657 uint64_t slba = le64_to_cpu(rw->slba);
2658 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2659 size_t len = nvme_l2b(ns, nlb);
2660 size_t data_len = len;
2661 int64_t offset = nvme_l2b(ns, slba);
2662 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2663 uint32_t reftag = le32_to_cpu(rw->reftag);
2664 NvmeBounceContext *ctx = NULL;
2665 uint16_t status;
2666
2667 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2668
2669 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2670 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2671 if (status) {
2672 return status;
2673 }
2674
2675 if (prinfo & NVME_PRINFO_PRACT) {
2676 return NVME_INVALID_PROT_INFO | NVME_DNR;
2677 }
2678 }
2679
2680 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
2681 data_len += nvme_m2b(ns, nlb);
2682 }
2683
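    /* Verify is limited by the Verify Size Limit (vsl) rather than MDTS */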
2684 if (data_len > (n->page_size << n->params.vsl)) {
2685 return NVME_INVALID_FIELD | NVME_DNR;
2686 }
2687
2688 status = nvme_check_bounds(ns, slba, nlb);
2689 if (status) {
2690 return status;
2691 }
2692
2693 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2694 status = nvme_check_dulbe(ns, slba, nlb);
2695 if (status) {
2696 return status;
2697 }
2698 }
2699
2700 ctx = g_new0(NvmeBounceContext, 1);
2701 ctx->req = req;
2702
2703 ctx->data.bounce = g_malloc(len);
2704
2705 qemu_iovec_init(&ctx->data.iov, 1);
2706 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2707
2708 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2709 BLOCK_ACCT_READ);
2710
2711 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2712 nvme_verify_mdata_in_cb, ctx);
2713 return NVME_NO_COMPLETE;
2714 }
2715
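/*
 * Copy is processed one source range at a time: data (and metadata) for the
 * current range are read into a bounce buffer, checked, and then written
 * out at the destination LBA tracked in iocb->slba.
 */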
2716 typedef struct NvmeCopyAIOCB {
2717 BlockAIOCB common;
2718 BlockAIOCB *aiocb;
2719 NvmeRequest *req;
2720 NvmeCtrl *n;
2721 int ret;
2722
2723 void *ranges;
2724 unsigned int format;
2725 int nr;
2726 int idx;
2727
2728 uint8_t *bounce;
2729 QEMUIOVector iov;
2730 struct {
2731 BlockAcctCookie read;
2732 BlockAcctCookie write;
2733 } acct;
2734
2735 uint64_t reftag;
2736 uint64_t slba;
2737
2738 NvmeZone *zone;
2739 NvmeNamespace *sns;
2740 uint32_t tcl;
2741 } NvmeCopyAIOCB;
2742
2743 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2744 {
2745 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2746
2747 iocb->ret = -ECANCELED;
2748
2749 if (iocb->aiocb) {
2750 blk_aio_cancel_async(iocb->aiocb);
2751 iocb->aiocb = NULL;
2752 }
2753 }
2754
2755 static const AIOCBInfo nvme_copy_aiocb_info = {
2756 .aiocb_size = sizeof(NvmeCopyAIOCB),
2757 .cancel_async = nvme_copy_cancel,
2758 };
2759
2760 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2761 {
2762 NvmeRequest *req = iocb->req;
2763 NvmeNamespace *ns = req->ns;
2764 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2765
2766 if (iocb->idx != iocb->nr) {
2767 req->cqe.result = cpu_to_le32(iocb->idx);
2768 }
2769
2770 qemu_iovec_destroy(&iocb->iov);
2771 g_free(iocb->bounce);
2772
2773 if (iocb->ret < 0) {
2774 block_acct_failed(stats, &iocb->acct.read);
2775 block_acct_failed(stats, &iocb->acct.write);
2776 } else {
2777 block_acct_done(stats, &iocb->acct.read);
2778 block_acct_done(stats, &iocb->acct.write);
2779 }
2780
2781 iocb->common.cb(iocb->common.opaque, iocb->ret);
2782 qemu_aio_unref(iocb);
2783 }
2784
2785 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2786
2787 static void nvme_copy_source_range_parse_format0_2(void *ranges,
2788 int idx, uint64_t *slba,
2789 uint32_t *nlb,
2790 uint32_t *snsid,
2791 uint16_t *apptag,
2792 uint16_t *appmask,
2793 uint64_t *reftag)
2794 {
2795 NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2796
2797 if (snsid) {
2798 *snsid = le32_to_cpu(_ranges[idx].sparams);
2799 }
2800
2801 if (slba) {
2802 *slba = le64_to_cpu(_ranges[idx].slba);
2803 }
2804
2805 if (nlb) {
2806 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2807 }
2808
2809 if (apptag) {
2810 *apptag = le16_to_cpu(_ranges[idx].apptag);
2811 }
2812
2813 if (appmask) {
2814 *appmask = le16_to_cpu(_ranges[idx].appmask);
2815 }
2816
2817 if (reftag) {
2818 *reftag = le32_to_cpu(_ranges[idx].reftag);
2819 }
2820 }
2821
2822 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2823 uint64_t *slba,
2824 uint32_t *nlb,
2825 uint32_t *snsid,
2826 uint16_t *apptag,
2827 uint16_t *appmask,
2828 uint64_t *reftag)
2829 {
2830 NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2831
2832 if (snsid) {
2833 *snsid = le32_to_cpu(_ranges[idx].sparams);
2834 }
2835
2836 if (slba) {
2837 *slba = le64_to_cpu(_ranges[idx].slba);
2838 }
2839
2840 if (nlb) {
2841 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2842 }
2843
2844 if (apptag) {
2845 *apptag = le16_to_cpu(_ranges[idx].apptag);
2846 }
2847
2848 if (appmask) {
2849 *appmask = le16_to_cpu(_ranges[idx].appmask);
2850 }
2851
2852 if (reftag) {
2853 *reftag = 0;
2854
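        /* assemble the 48-bit reference tag from sr[4..9], MSB first */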
2855 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2856 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2857 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2858 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2859 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2860 *reftag |= (uint64_t)_ranges[idx].sr[9];
2861 }
2862 }
2863
2864 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2865 uint64_t *slba, uint32_t *nlb,
2866 uint32_t *snsid, uint16_t *apptag,
2867 uint16_t *appmask, uint64_t *reftag)
2868 {
2869 switch (format) {
2870 case NVME_COPY_FORMAT_0:
2871 case NVME_COPY_FORMAT_2:
2872 nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2873 apptag, appmask, reftag);
2874 break;
2875
2876 case NVME_COPY_FORMAT_1:
2877 case NVME_COPY_FORMAT_3:
2878 nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2879 apptag, appmask, reftag);
2880 break;
2881
2882 default:
2883 abort();
2884 }
2885 }
2886
2887 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2888 NvmeCopyAIOCB *iocb, uint16_t nr)
2889 {
2890 uint32_t copy_len = 0;
2891
2892 for (int idx = 0; idx < nr; idx++) {
2893 uint32_t nlb;
2894 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2895 &nlb, NULL, NULL, NULL, NULL);
2896 copy_len += nlb;
2897 }
2898 iocb->tcl = copy_len;
2899 if (copy_len > ns->id_ns.mcl) {
2900 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2901 }
2902
2903 return NVME_SUCCESS;
2904 }
2905
2906 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2907 {
2908 NvmeCopyAIOCB *iocb = opaque;
2909 NvmeRequest *req = iocb->req;
2910 NvmeNamespace *dns = req->ns;
2911 uint32_t nlb;
2912
2913 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2914 &nlb, NULL, NULL, NULL, NULL);
2915
2916 if (ret < 0) {
2917 iocb->ret = ret;
2918 goto out;
2919 } else if (iocb->ret < 0) {
2920 goto out;
2921 }
2922
2923 if (dns->params.zoned) {
2924 nvme_advance_zone_wp(dns, iocb->zone, nlb);
2925 }
2926
2927 iocb->idx++;
2928 iocb->slba += nlb;
2929 out:
2930 nvme_do_copy(iocb);
2931 }
2932
2933 static void nvme_copy_out_cb(void *opaque, int ret)
2934 {
2935 NvmeCopyAIOCB *iocb = opaque;
2936 NvmeRequest *req = iocb->req;
2937 NvmeNamespace *dns = req->ns;
2938 uint32_t nlb;
2939 size_t mlen;
2940 uint8_t *mbounce;
2941
2942 if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2943 goto out;
2944 }
2945
2946 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2947 &nlb, NULL, NULL, NULL, NULL);
2948
2949 mlen = nvme_m2b(dns, nlb);
2950 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2951
2952 qemu_iovec_reset(&iocb->iov);
2953 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2954
2955 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2956 &iocb->iov, 0, nvme_copy_out_completed_cb,
2957 iocb);
2958
2959 return;
2960
2961 out:
2962 nvme_copy_out_completed_cb(iocb, ret);
2963 }
2964
2965 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2966 {
2967 NvmeCopyAIOCB *iocb = opaque;
2968 NvmeRequest *req = iocb->req;
2969 NvmeNamespace *sns = iocb->sns;
2970 NvmeNamespace *dns = req->ns;
2971 NvmeCopyCmd *copy = NULL;
2972 uint8_t *mbounce = NULL;
2973 uint32_t nlb;
2974 uint64_t slba;
2975 uint16_t apptag, appmask;
2976 uint64_t reftag;
2977 size_t len, mlen;
2978 uint16_t status;
2979
2980 if (ret < 0) {
2981 iocb->ret = ret;
2982 goto out;
2983 } else if (iocb->ret < 0) {
2984 goto out;
2985 }
2986
2987 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2988 &nlb, NULL, &apptag, &appmask, &reftag);
2989
2990 trace_pci_nvme_copy_out(iocb->slba, nlb);
2991
2992 len = nvme_l2b(sns, nlb);
2993
2994 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
2995 copy = (NvmeCopyCmd *)&req->cmd;
2996
2997 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2998
2999 mlen = nvme_m2b(sns, nlb);
3000 mbounce = iocb->bounce + nvme_l2b(sns, nlb);
3001
3002 status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
3003 if (status) {
3004 goto invalid;
3005 }
3006 status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
3007 slba, apptag, appmask, &reftag);
3008 if (status) {
3009 goto invalid;
3010 }
3011 }
3012
3013 if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3014 copy = (NvmeCopyCmd *)&req->cmd;
3015 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3016
3017 mlen = nvme_m2b(dns, nlb);
3018 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3019
3020 apptag = le16_to_cpu(copy->apptag);
3021 appmask = le16_to_cpu(copy->appmask);
3022
3023 if (prinfow & NVME_PRINFO_PRACT) {
3024 status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3025 if (status) {
3026 goto invalid;
3027 }
3028
3029 nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3030 apptag, &iocb->reftag);
3031 } else {
3032 status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3033 prinfow, iocb->slba, apptag, appmask,
3034 &iocb->reftag);
3035 if (status) {
3036 goto invalid;
3037 }
3038 }
3039 }
3040
3041 status = nvme_check_bounds(dns, iocb->slba, nlb);
3042 if (status) {
3043 goto invalid;
3044 }
3045
3046 if (dns->params.zoned) {
3047 status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3048 if (status) {
3049 goto invalid;
3050 }
3051
3052 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3053 iocb->zone->w_ptr += nlb;
3054 }
3055 }
3056
3057 qemu_iovec_reset(&iocb->iov);
3058 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3059
3060 block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3061 BLOCK_ACCT_WRITE);
3062
3063 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3064 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3065
3066 return;
3067
3068 invalid:
3069 req->status = status;
3070 iocb->ret = -1;
3071 out:
3072 nvme_do_copy(iocb);
3073 }
3074
3075 static void nvme_copy_in_cb(void *opaque, int ret)
3076 {
3077 NvmeCopyAIOCB *iocb = opaque;
3078 NvmeNamespace *sns = iocb->sns;
3079 uint64_t slba;
3080 uint32_t nlb;
3081
3082 if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3083 goto out;
3084 }
3085
3086 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3087 &nlb, NULL, NULL, NULL, NULL);
3088
3089 qemu_iovec_reset(&iocb->iov);
3090 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3091 nvme_m2b(sns, nlb));
3092
3093 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3094 &iocb->iov, 0, nvme_copy_in_completed_cb,
3095 iocb);
3096 return;
3097
3098 out:
3099 nvme_copy_in_completed_cb(iocb, ret);
3100 }
3101
3102 static inline bool nvme_csi_supports_copy(uint8_t csi)
3103 {
3104 return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3105 }
3106
3107 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3108 NvmeNamespace *dns)
3109 {
3110 return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3111 }
3112
3113 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3114 bool pi_enable)
3115 {
3116 if (!nvme_csi_supports_copy(sns->csi) ||
3117 !nvme_csi_supports_copy(dns->csi)) {
3118 return false;
3119 }
3120
3121 if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3122 return false;
3123 }
3124
3125 if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3126 sns->id_ns.dps != dns->id_ns.dps)) {
3127 return false;
3128 }
3129
3130 return true;
3131 }
3132
3133 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3134 NvmeNamespace *dns)
3135 {
3136 return sns->lbaf.ms == 0 &&
3137 ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3138 (dns->lbaf.ms == 16 && dns->pif == 1));
3139 }
3140
3141 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3142 bool sns_pi_en)
3143 {
3144 if (!nvme_csi_supports_copy(sns->csi) ||
3145 !nvme_csi_supports_copy(dns->csi)) {
3146 return false;
3147 }
3148
3149 if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3150 return false;
3151 }
3152
3153 if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3154 return false;
3155 }
3156
3157 return true;
3158 }
3159
3160 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3161 {
3162 NvmeRequest *req = iocb->req;
3163 NvmeNamespace *sns;
3164 NvmeNamespace *dns = req->ns;
3165 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3166 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3167 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3168 uint64_t slba;
3169 uint32_t nlb;
3170 size_t len;
3171 uint16_t status;
3172 uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3173 uint32_t snsid = dnsid;
3174
3175 if (iocb->ret < 0) {
3176 goto done;
3177 }
3178
3179 if (iocb->idx == iocb->nr) {
3180 goto done;
3181 }
3182
3183 if (iocb->format == 2 || iocb->format == 3) {
3184 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3185 &slba, &nlb, &snsid, NULL, NULL, NULL);
3186 if (snsid != dnsid) {
3187 if (snsid == NVME_NSID_BROADCAST ||
3188 !nvme_nsid_valid(iocb->n, snsid)) {
3189 status = NVME_INVALID_NSID | NVME_DNR;
3190 goto invalid;
3191 }
3192 iocb->sns = nvme_ns(iocb->n, snsid);
3193 if (unlikely(!iocb->sns)) {
3194 status = NVME_INVALID_FIELD | NVME_DNR;
3195 goto invalid;
3196 }
3197 } else {
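            /*
             * Same source and destination namespace; reject source ranges
             * that overlap the destination area of the copy.
             */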
3198 if (((slba + nlb) > iocb->slba) &&
3199 ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3200 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3201 goto invalid;
3202 }
3203 }
3204 } else {
3205 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3206 &slba, &nlb, NULL, NULL, NULL, NULL);
3207 }
3208
3209 sns = iocb->sns;
3210 if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3211 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3212 status = NVME_INVALID_FIELD | NVME_DNR;
3213 goto invalid;
3214 } else if (snsid != dnsid) {
3215 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3216 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3217 if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3218 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3219 goto invalid;
3220 }
3221 }
3222 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3223 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3224 if ((prinfor & NVME_PRINFO_PRACT) !=
3225 (prinfow & NVME_PRINFO_PRACT)) {
3226 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3227 goto invalid;
3228 } else {
3229 if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3230 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3231 goto invalid;
3232 }
3233 }
3234 }
3235
3236 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3237 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3238 if (!(prinfow & NVME_PRINFO_PRACT)) {
3239 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3240 goto invalid;
3241 } else {
3242 if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3243 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3244 goto invalid;
3245 }
3246 }
3247 }
3248
3249 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3250 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3251 if (!(prinfor & NVME_PRINFO_PRACT)) {
3252 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3253 goto invalid;
3254 } else {
3255 if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3256 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3257 goto invalid;
3258 }
3259 }
3260 }
3261 }
3262 len = nvme_l2b(sns, nlb);
3263
3264 trace_pci_nvme_copy_source_range(slba, nlb);
3265
3266 if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3267 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3268 goto invalid;
3269 }
3270
3271 status = nvme_check_bounds(sns, slba, nlb);
3272 if (status) {
3273 goto invalid;
3274 }
3275
3276 if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3277 status = nvme_check_dulbe(sns, slba, nlb);
3278 if (status) {
3279 goto invalid;
3280 }
3281 }
3282
3283 if (sns->params.zoned) {
3284 status = nvme_check_zone_read(sns, slba, nlb);
3285 if (status) {
3286 goto invalid;
3287 }
3288 }
3289
3290 g_free(iocb->bounce);
3291 iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3292 sns->lbasz + sns->lbaf.ms);
3293
3294 qemu_iovec_reset(&iocb->iov);
3295 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3296
3297 block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3298 BLOCK_ACCT_READ);
3299
3300 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3301 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3302 return;
3303
3304 invalid:
3305 req->status = status;
3306 iocb->ret = -1;
3307 done:
3308 nvme_copy_done(iocb);
3309 }
3310
3311 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3312 {
3313 NvmeNamespace *ns = req->ns;
3314 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3315 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3316 nvme_misc_cb, req);
3317 uint16_t nr = copy->nr + 1;
3318 uint8_t format = copy->control[0] & 0xf;
3319 size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3320
3321 uint16_t status;
3322
3323 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3324
3325 iocb->ranges = NULL;
3326 iocb->zone = NULL;
3327
3328 if (!(n->id_ctrl.ocfs & (1 << format)) ||
3329 ((format == 2 || format == 3) &&
3330 !(n->features.hbs.cdfe & (1 << format)))) {
3331 trace_pci_nvme_err_copy_invalid_format(format);
3332 status = NVME_INVALID_FIELD | NVME_DNR;
3333 goto invalid;
3334 }
3335
3336 if (nr > ns->id_ns.msrc + 1) {
3337 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3338 goto invalid;
3339 }
3340
3341 if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3342 (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3343 status = NVME_INVALID_FORMAT | NVME_DNR;
3344 goto invalid;
3345 }
3346
3347 if (ns->pif) {
3348 len = sizeof(NvmeCopySourceRangeFormat1_3);
3349 }
3350
3351 iocb->format = format;
3352 iocb->ranges = g_malloc_n(nr, len);
3353 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3354 if (status) {
3355 goto invalid;
3356 }
3357
3358 iocb->slba = le64_to_cpu(copy->sdlba);
3359
3360 if (ns->params.zoned) {
3361 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3362 if (!iocb->zone) {
3363 status = NVME_LBA_RANGE | NVME_DNR;
3364 goto invalid;
3365 }
3366
3367 status = nvme_zrm_auto(n, ns, iocb->zone);
3368 if (status) {
3369 goto invalid;
3370 }
3371 }
3372
3373 status = nvme_check_copy_mcl(ns, iocb, nr);
3374 if (status) {
3375 goto invalid;
3376 }
3377
3378 iocb->req = req;
3379 iocb->ret = 0;
3380 iocb->nr = nr;
3381 iocb->idx = 0;
3382 iocb->reftag = le32_to_cpu(copy->reftag);
3383 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3384
3385 qemu_iovec_init(&iocb->iov, 1);
3386
3387 req->aiocb = &iocb->common;
3388 iocb->sns = req->ns;
3389 iocb->n = n;
3390 iocb->bounce = NULL;
3391 nvme_do_copy(iocb);
3392
3393 return NVME_NO_COMPLETE;
3394
3395 invalid:
3396 g_free(iocb->ranges);
3397 qemu_aio_unref(iocb);
3398 return status;
3399 }
3400
3401 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3402 {
3403 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3404 NvmeNamespace *ns = req->ns;
3405 BlockBackend *blk = ns->blkconf.blk;
3406 uint64_t slba = le64_to_cpu(rw->slba);
3407 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3408 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3409 size_t data_len = nvme_l2b(ns, nlb);
3410 size_t len = data_len;
3411 int64_t offset = nvme_l2b(ns, slba);
3412 struct nvme_compare_ctx *ctx = NULL;
3413 uint16_t status;
3414
3415 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3416
3417 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3418 return NVME_INVALID_PROT_INFO | NVME_DNR;
3419 }
3420
3421 if (nvme_ns_ext(ns)) {
3422 len += nvme_m2b(ns, nlb);
3423 }
3424
3425 if (NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt)) {
3426 status = nvme_check_mdts(n, data_len);
3427 } else {
3428 status = nvme_check_mdts(n, len);
3429 }
3430 if (status) {
3431 return status;
3432 }
3433
3434 status = nvme_check_bounds(ns, slba, nlb);
3435 if (status) {
3436 return status;
3437 }
3438
3439 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3440 status = nvme_check_dulbe(ns, slba, nlb);
3441 if (status) {
3442 return status;
3443 }
3444 }
3445
3446 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3447 if (status) {
3448 return status;
3449 }
3450
3451 ctx = g_new(struct nvme_compare_ctx, 1);
3452 ctx->data.bounce = g_malloc(data_len);
3453
3454 req->opaque = ctx;
3455
3456 qemu_iovec_init(&ctx->data.iov, 1);
3457 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3458
3459 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3460 BLOCK_ACCT_READ);
3461 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3462 nvme_compare_data_cb, req);
3463
3464 return NVME_NO_COMPLETE;
3465 }
3466
3467 typedef struct NvmeFlushAIOCB {
3468 BlockAIOCB common;
3469 BlockAIOCB *aiocb;
3470 NvmeRequest *req;
3471 int ret;
3472
3473 NvmeNamespace *ns;
3474 uint32_t nsid;
3475 bool broadcast;
3476 } NvmeFlushAIOCB;
3477
3478 static void nvme_flush_cancel(BlockAIOCB *acb)
3479 {
3480 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3481
3482 iocb->ret = -ECANCELED;
3483
3484 if (iocb->aiocb) {
3485 blk_aio_cancel_async(iocb->aiocb);
3486 iocb->aiocb = NULL;
3487 }
3488 }
3489
3490 static const AIOCBInfo nvme_flush_aiocb_info = {
3491 .aiocb_size = sizeof(NvmeFlushAIOCB),
3492 .cancel_async = nvme_flush_cancel,
3493 };
3494
3495 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3496
3497 static void nvme_flush_ns_cb(void *opaque, int ret)
3498 {
3499 NvmeFlushAIOCB *iocb = opaque;
3500 NvmeNamespace *ns = iocb->ns;
3501
3502 if (ret < 0) {
3503 iocb->ret = ret;
3504 goto out;
3505 } else if (iocb->ret < 0) {
3506 goto out;
3507 }
3508
3509 if (ns) {
3510 trace_pci_nvme_flush_ns(iocb->nsid);
3511
3512 iocb->ns = NULL;
3513 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3514 return;
3515 }
3516
3517 out:
3518 nvme_do_flush(iocb);
3519 }
3520
3521 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3522 {
3523 NvmeRequest *req = iocb->req;
3524 NvmeCtrl *n = nvme_ctrl(req);
3525 int i;
3526
3527 if (iocb->ret < 0) {
3528 goto done;
3529 }
3530
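    /*
     * For a broadcast flush (NSID FFFFFFFFh), iterate over all attached
     * namespaces and flush them one at a time.
     */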
3531 if (iocb->broadcast) {
3532 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3533 iocb->ns = nvme_ns(n, i);
3534 if (iocb->ns) {
3535 iocb->nsid = i;
3536 break;
3537 }
3538 }
3539 }
3540
3541 if (!iocb->ns) {
3542 goto done;
3543 }
3544
3545 nvme_flush_ns_cb(iocb, 0);
3546 return;
3547
3548 done:
3549 iocb->common.cb(iocb->common.opaque, iocb->ret);
3550 qemu_aio_unref(iocb);
3551 }
3552
3553 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3554 {
3555 NvmeFlushAIOCB *iocb;
3556 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3557 uint16_t status;
3558
3559 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3560
3561 iocb->req = req;
3562 iocb->ret = 0;
3563 iocb->ns = NULL;
3564 iocb->nsid = 0;
3565 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3566
3567 if (!iocb->broadcast) {
3568 if (!nvme_nsid_valid(n, nsid)) {
3569 status = NVME_INVALID_NSID | NVME_DNR;
3570 goto out;
3571 }
3572
3573 iocb->ns = nvme_ns(n, nsid);
3574 if (!iocb->ns) {
3575 status = NVME_INVALID_FIELD | NVME_DNR;
3576 goto out;
3577 }
3578
3579 iocb->nsid = nsid;
3580 }
3581
3582 req->aiocb = &iocb->common;
3583 nvme_do_flush(iocb);
3584
3585 return NVME_NO_COMPLETE;
3586
3587 out:
3588 qemu_aio_unref(iocb);
3589
3590 return status;
3591 }
3592
3593 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3594 {
3595 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3596 NvmeNamespace *ns = req->ns;
3597 uint64_t slba = le64_to_cpu(rw->slba);
3598 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3599 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3600 uint64_t data_size = nvme_l2b(ns, nlb);
3601 uint64_t mapped_size = data_size;
3602 uint64_t data_offset;
3603 BlockBackend *blk = ns->blkconf.blk;
3604 uint16_t status;
3605
3606 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3607 mapped_size += nvme_m2b(ns, nlb);
3608
3609 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3610 bool pract = prinfo & NVME_PRINFO_PRACT;
3611
3612 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3613 mapped_size = data_size;
3614 }
3615 }
3616 }
3617
3618 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3619
3620 status = nvme_check_mdts(n, mapped_size);
3621 if (status) {
3622 goto invalid;
3623 }
3624
3625 status = nvme_check_bounds(ns, slba, nlb);
3626 if (status) {
3627 goto invalid;
3628 }
3629
3630 if (ns->params.zoned) {
3631 status = nvme_check_zone_read(ns, slba, nlb);
3632 if (status) {
3633 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3634 goto invalid;
3635 }
3636 }
3637
3638 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3639 status = nvme_check_dulbe(ns, slba, nlb);
3640 if (status) {
3641 goto invalid;
3642 }
3643 }
3644
3645 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3646 return nvme_dif_rw(n, req);
3647 }
3648
3649 status = nvme_map_data(n, nlb, req);
3650 if (status) {
3651 goto invalid;
3652 }
3653
3654 data_offset = nvme_l2b(ns, slba);
3655
3656 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3657 BLOCK_ACCT_READ);
3658 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3659 return NVME_NO_COMPLETE;
3660
3661 invalid:
3662 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3663 return status | NVME_DNR;
3664 }
3665
3666 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3667 uint32_t nlb)
3668 {
3669 NvmeNamespace *ns = req->ns;
3670 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3671 uint64_t data_size = nvme_l2b(ns, nlb);
3672 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3673 uint8_t dtype = (dw12 >> 20) & 0xf;
3674 uint16_t pid = le16_to_cpu(rw->dspec);
3675 uint16_t ph, rg, ruhid;
3676 NvmeReclaimUnit *ru;
3677
3678 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3679 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3680 ph = 0;
3681 rg = 0;
3682 }
3683
3684 ruhid = ns->fdp.phs[ph];
3685 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3686
3687 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3688 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3689
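    /*
     * Consume the remaining allowed writes (ruamw) of the current reclaim
     * unit; when a unit is exhausted, move the placement handle on to a new
     * reclaim unit and continue with the remaining blocks.
     */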
3690 while (nlb) {
3691 if (nlb < ru->ruamw) {
3692 ru->ruamw -= nlb;
3693 break;
3694 }
3695
3696 nlb -= ru->ruamw;
3697 nvme_update_ruh(n, ns, pid);
3698 }
3699 }
3700
3701 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3702 bool wrz)
3703 {
3704 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3705 NvmeNamespace *ns = req->ns;
3706 uint64_t slba = le64_to_cpu(rw->slba);
3707 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3708 uint16_t ctrl = le16_to_cpu(rw->control);
3709 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3710 uint64_t data_size = nvme_l2b(ns, nlb);
3711 uint64_t mapped_size = data_size;
3712 uint64_t data_offset;
3713 NvmeZone *zone;
3714 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3715 BlockBackend *blk = ns->blkconf.blk;
3716 uint16_t status;
3717
3718 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3719 mapped_size += nvme_m2b(ns, nlb);
3720
3721 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3722 bool pract = prinfo & NVME_PRINFO_PRACT;
3723
3724 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3725 mapped_size -= nvme_m2b(ns, nlb);
3726 }
3727 }
3728 }
3729
3730 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3731 nvme_nsid(ns), nlb, mapped_size, slba);
3732
3733 if (!wrz) {
3734 status = nvme_check_mdts(n, mapped_size);
3735 if (status) {
3736 goto invalid;
3737 }
3738 }
3739
3740 status = nvme_check_bounds(ns, slba, nlb);
3741 if (status) {
3742 goto invalid;
3743 }
3744
3745 if (ns->params.zoned) {
3746 zone = nvme_get_zone_by_slba(ns, slba);
3747 assert(zone);
3748
3749 if (append) {
3750 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3751
3752 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3753 return NVME_INVALID_ZONE_OP | NVME_DNR;
3754 }
3755
3756 if (unlikely(slba != zone->d.zslba)) {
3757 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3758 status = NVME_INVALID_FIELD;
3759 goto invalid;
3760 }
3761
3762 if (n->params.zasl &&
3763 data_size > (uint64_t)n->page_size << n->params.zasl) {
3764 trace_pci_nvme_err_zasl(data_size);
3765 return NVME_INVALID_FIELD | NVME_DNR;
3766 }
3767
3768 slba = zone->w_ptr;
3769 rw->slba = cpu_to_le64(slba);
3770 res->slba = cpu_to_le64(slba);
3771
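            /*
             * For Zone Append, the host-supplied reference tag may have to be
             * remapped relative to the actual write pointer (PIREMAP),
             * depending on the protection information type.
             */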
3772 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3773 case NVME_ID_NS_DPS_TYPE_1:
3774 if (!piremap) {
3775 return NVME_INVALID_PROT_INFO | NVME_DNR;
3776 }
3777
3778 /* fallthrough */
3779
3780 case NVME_ID_NS_DPS_TYPE_2:
3781 if (piremap) {
3782 uint32_t reftag = le32_to_cpu(rw->reftag);
3783 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3784 }
3785
3786 break;
3787
3788 case NVME_ID_NS_DPS_TYPE_3:
3789 if (piremap) {
3790 return NVME_INVALID_PROT_INFO | NVME_DNR;
3791 }
3792
3793 break;
3794 }
3795 }
3796
3797 status = nvme_check_zone_write(ns, zone, slba, nlb);
3798 if (status) {
3799 goto invalid;
3800 }
3801
3802 status = nvme_zrm_auto(n, ns, zone);
3803 if (status) {
3804 goto invalid;
3805 }
3806
3807 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3808 zone->w_ptr += nlb;
3809 }
3810 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3811 nvme_do_write_fdp(n, req, slba, nlb);
3812 }
3813
3814 data_offset = nvme_l2b(ns, slba);
3815
3816 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3817 return nvme_dif_rw(n, req);
3818 }
3819
3820 if (!wrz) {
3821 status = nvme_map_data(n, nlb, req);
3822 if (status) {
3823 goto invalid;
3824 }
3825
3826 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3827 BLOCK_ACCT_WRITE);
3828 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3829 } else {
3830 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3831 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3832 req);
3833 }
3834
3835 return NVME_NO_COMPLETE;
3836
3837 invalid:
3838 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3839 return status | NVME_DNR;
3840 }
3841
3842 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3843 {
3844 return nvme_do_write(n, req, false, false);
3845 }
3846
3847 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3848 {
3849 return nvme_do_write(n, req, false, true);
3850 }
3851
3852 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3853 {
3854 return nvme_do_write(n, req, true, false);
3855 }
3856
3857 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3858 uint64_t *slba, uint32_t *zone_idx)
3859 {
3860 uint32_t dw10 = le32_to_cpu(c->cdw10);
3861 uint32_t dw11 = le32_to_cpu(c->cdw11);
3862
3863 if (!ns->params.zoned) {
3864 trace_pci_nvme_err_invalid_opc(c->opcode);
3865 return NVME_INVALID_OPCODE | NVME_DNR;
3866 }
3867
3868 *slba = ((uint64_t)dw11) << 32 | dw10;
3869 if (unlikely(*slba >= ns->id_ns.nsze)) {
3870 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3871 *slba = 0;
3872 return NVME_LBA_RANGE | NVME_DNR;
3873 }
3874
3875 *zone_idx = nvme_zone_idx(ns, *slba);
3876 assert(*zone_idx < ns->num_zones);
3877
3878 return NVME_SUCCESS;
3879 }
3880
3881 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3882 NvmeRequest *);
3883
3884 enum NvmeZoneProcessingMask {
3885 NVME_PROC_CURRENT_ZONE = 0,
3886 NVME_PROC_OPENED_ZONES = 1 << 0,
3887 NVME_PROC_CLOSED_ZONES = 1 << 1,
3888 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3889 NVME_PROC_FULL_ZONES = 1 << 3,
3890 };
3891
3892 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3893 NvmeZoneState state, NvmeRequest *req)
3894 {
3895 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3896 int flags = 0;
3897
3898 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3899 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3900
3901 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3902 return NVME_INVALID_ZONE_OP | NVME_DNR;
3903 }
3904
3905 if (zone->w_ptr % ns->zns.zrwafg) {
3906 return NVME_NOZRWA | NVME_DNR;
3907 }
3908
3909 flags = NVME_ZRM_ZRWA;
3910 }
3911
3912 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3913 }
3914
static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3916 NvmeZoneState state, NvmeRequest *req)
3917 {
3918 return nvme_zrm_close(ns, zone);
3919 }
3920
static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3922 NvmeZoneState state, NvmeRequest *req)
3923 {
3924 return nvme_zrm_finish(ns, zone);
3925 }
3926
static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3928 NvmeZoneState state, NvmeRequest *req)
3929 {
3930 switch (state) {
3931 case NVME_ZONE_STATE_READ_ONLY:
3932 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3933 /* fall through */
3934 case NVME_ZONE_STATE_OFFLINE:
3935 return NVME_SUCCESS;
3936 default:
3937 return NVME_ZONE_INVAL_TRANSITION;
3938 }
3939 }
3940
static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3942 {
3943 uint16_t status;
3944 uint8_t state = nvme_get_zone_state(zone);
3945
3946 if (state == NVME_ZONE_STATE_EMPTY) {
3947 status = nvme_aor_check(ns, 1, 0);
3948 if (status) {
3949 return status;
3950 }
3951 nvme_aor_inc_active(ns);
3952 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3953 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3954 return NVME_SUCCESS;
3955 }
3956
3957 return NVME_ZONE_INVAL_TRANSITION;
3958 }
3959
static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3961 enum NvmeZoneProcessingMask proc_mask,
3962 op_handler_t op_hndlr, NvmeRequest *req)
3963 {
3964 uint16_t status = NVME_SUCCESS;
3965 NvmeZoneState zs = nvme_get_zone_state(zone);
3966 bool proc_zone;
3967
3968 switch (zs) {
3969 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3970 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3971 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3972 break;
3973 case NVME_ZONE_STATE_CLOSED:
3974 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3975 break;
3976 case NVME_ZONE_STATE_READ_ONLY:
3977 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3978 break;
3979 case NVME_ZONE_STATE_FULL:
3980 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3981 break;
3982 default:
3983 proc_zone = false;
3984 }
3985
3986 if (proc_zone) {
3987 status = op_hndlr(ns, zone, zs, req);
3988 }
3989
3990 return status;
3991 }
3992
static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3994 enum NvmeZoneProcessingMask proc_mask,
3995 op_handler_t op_hndlr, NvmeRequest *req)
3996 {
3997 NvmeZone *next;
3998 uint16_t status = NVME_SUCCESS;
3999 int i;
4000
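    /*
     * With no processing mask, apply the handler only to the zone addressed
     * by the command. Otherwise, process every zone in the states selected
     * by the mask; open, closed and full zones are tracked on dedicated
     * lists, while read-only zones are found by walking the zone array.
     */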
4001 if (!proc_mask) {
4002 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
4003 } else {
4004 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
4005 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
4006 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4007 req);
4008 if (status && status != NVME_NO_COMPLETE) {
4009 goto out;
4010 }
4011 }
4012 }
4013 if (proc_mask & NVME_PROC_OPENED_ZONES) {
4014 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
4015 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4016 req);
4017 if (status && status != NVME_NO_COMPLETE) {
4018 goto out;
4019 }
4020 }
4021
4022 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4023 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4024 req);
4025 if (status && status != NVME_NO_COMPLETE) {
4026 goto out;
4027 }
4028 }
4029 }
4030 if (proc_mask & NVME_PROC_FULL_ZONES) {
4031 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4032 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4033 req);
4034 if (status && status != NVME_NO_COMPLETE) {
4035 goto out;
4036 }
4037 }
4038 }
4039
4040 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4041 for (i = 0; i < ns->num_zones; i++, zone++) {
4042 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4043 req);
4044 if (status && status != NVME_NO_COMPLETE) {
4045 goto out;
4046 }
4047 }
4048 }
4049 }
4050
4051 out:
4052 return status;
4053 }
4054
4055 typedef struct NvmeZoneResetAIOCB {
4056 BlockAIOCB common;
4057 BlockAIOCB *aiocb;
4058 NvmeRequest *req;
4059 int ret;
4060
4061 bool all;
4062 int idx;
4063 NvmeZone *zone;
4064 } NvmeZoneResetAIOCB;
4065
static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4067 {
4068 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4069 NvmeRequest *req = iocb->req;
4070 NvmeNamespace *ns = req->ns;
4071
4072 iocb->idx = ns->num_zones;
4073
4074 iocb->ret = -ECANCELED;
4075
4076 if (iocb->aiocb) {
4077 blk_aio_cancel_async(iocb->aiocb);
4078 iocb->aiocb = NULL;
4079 }
4080 }
4081
4082 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4083 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4084 .cancel_async = nvme_zone_reset_cancel,
4085 };
4086
4087 static void nvme_zone_reset_cb(void *opaque, int ret);
4088
static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4090 {
4091 NvmeZoneResetAIOCB *iocb = opaque;
4092 NvmeRequest *req = iocb->req;
4093 NvmeNamespace *ns = req->ns;
4094 int64_t moff;
4095 int count;
4096
4097 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4098 goto out;
4099 }
4100
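    /* the zone data has been reset; now zero the zone's metadata region */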
4101 moff = nvme_moff(ns, iocb->zone->d.zslba);
4102 count = nvme_m2b(ns, ns->zone_size);
4103
4104 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4105 BDRV_REQ_MAY_UNMAP,
4106 nvme_zone_reset_cb, iocb);
4107 return;
4108
4109 out:
4110 nvme_zone_reset_cb(iocb, ret);
4111 }
4112
static void nvme_zone_reset_cb(void *opaque, int ret)
4114 {
4115 NvmeZoneResetAIOCB *iocb = opaque;
4116 NvmeRequest *req = iocb->req;
4117 NvmeNamespace *ns = req->ns;
4118
4119 if (iocb->ret < 0) {
4120 goto done;
4121 } else if (ret < 0) {
4122 iocb->ret = ret;
4123 goto done;
4124 }
4125
4126 if (iocb->zone) {
4127 nvme_zrm_reset(ns, iocb->zone);
4128
4129 if (!iocb->all) {
4130 goto done;
4131 }
4132 }
4133
4134 while (iocb->idx < ns->num_zones) {
4135 NvmeZone *zone = &ns->zone_array[iocb->idx++];
4136
4137 switch (nvme_get_zone_state(zone)) {
4138 case NVME_ZONE_STATE_EMPTY:
4139 if (!iocb->all) {
4140 goto done;
4141 }
4142
4143 continue;
4144
4145 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4146 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4147 case NVME_ZONE_STATE_CLOSED:
4148 case NVME_ZONE_STATE_FULL:
4149 iocb->zone = zone;
4150 break;
4151
4152 default:
4153 continue;
4154 }
4155
4156 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4157
4158 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4159 nvme_l2b(ns, zone->d.zslba),
4160 nvme_l2b(ns, ns->zone_size),
4161 BDRV_REQ_MAY_UNMAP,
4162 nvme_zone_reset_epilogue_cb,
4163 iocb);
4164 return;
4165 }
4166
4167 done:
4168 iocb->aiocb = NULL;
4169
4170 iocb->common.cb(iocb->common.opaque, iocb->ret);
4171 qemu_aio_unref(iocb);
4172 }
4173
static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4175 uint64_t elba, NvmeRequest *req)
4176 {
4177 NvmeNamespace *ns = req->ns;
4178 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4179 uint64_t wp = zone->d.wp;
4180 uint32_t nlb = elba - wp + 1;
4181 uint16_t status;
4182
4184 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
4185 return NVME_INVALID_ZONE_OP | NVME_DNR;
4186 }
4187
4188 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4189 return NVME_INVALID_FIELD | NVME_DNR;
4190 }
4191
4192 if (elba < wp || elba > wp + ns->zns.zrwas) {
4193 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4194 }
4195
4196 if (nlb % ns->zns.zrwafg) {
4197 return NVME_INVALID_FIELD | NVME_DNR;
4198 }
4199
4200 status = nvme_zrm_auto(n, ns, zone);
4201 if (status) {
4202 return status;
4203 }
4204
4205 zone->w_ptr += nlb;
4206
4207 nvme_advance_zone_wp(ns, zone, nlb);
4208
4209 return NVME_SUCCESS;
4210 }
4211
static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4213 {
4214 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4215 NvmeNamespace *ns = req->ns;
4216 NvmeZone *zone;
4217 NvmeZoneResetAIOCB *iocb;
4218 uint8_t *zd_ext;
4219 uint64_t slba = 0;
4220 uint32_t zone_idx = 0;
4221 uint16_t status;
4222 uint8_t action = cmd->zsa;
4223 bool all;
4224 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4225
4226 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4227
4228 req->status = NVME_SUCCESS;
4229
4230 if (!all) {
4231 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4232 if (status) {
4233 return status;
4234 }
4235 }
4236
4237 zone = &ns->zone_array[zone_idx];
4238 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4239 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4240 return NVME_INVALID_FIELD | NVME_DNR;
4241 }
4242
4243 switch (action) {
4244
4245 case NVME_ZONE_ACTION_OPEN:
4246 if (all) {
4247 proc_mask = NVME_PROC_CLOSED_ZONES;
4248 }
4249 trace_pci_nvme_open_zone(slba, zone_idx, all);
4250 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4251 break;
4252
4253 case NVME_ZONE_ACTION_CLOSE:
4254 if (all) {
4255 proc_mask = NVME_PROC_OPENED_ZONES;
4256 }
4257 trace_pci_nvme_close_zone(slba, zone_idx, all);
4258 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4259 break;
4260
4261 case NVME_ZONE_ACTION_FINISH:
4262 if (all) {
4263 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4264 }
4265 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4266 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4267 break;
4268
4269 case NVME_ZONE_ACTION_RESET:
4270 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4271
4272 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4273 nvme_misc_cb, req);
4274
4275 iocb->req = req;
4276 iocb->ret = 0;
4277 iocb->all = all;
4278 iocb->idx = zone_idx;
4279 iocb->zone = NULL;
4280
4281 req->aiocb = &iocb->common;
4282 nvme_zone_reset_cb(iocb, 0);
4283
4284 return NVME_NO_COMPLETE;
4285
4286 case NVME_ZONE_ACTION_OFFLINE:
4287 if (all) {
4288 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4289 }
4290 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4291 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4292 break;
4293
4294 case NVME_ZONE_ACTION_SET_ZD_EXT:
4295 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4296 if (all || !ns->params.zd_extension_size) {
4297 return NVME_INVALID_FIELD | NVME_DNR;
4298 }
4299 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4300 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4301 if (status) {
4302 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4303 return status;
4304 }
4305
4306 status = nvme_set_zd_ext(ns, zone);
4307 if (status == NVME_SUCCESS) {
4308 trace_pci_nvme_zd_extension_set(zone_idx);
4309 return status;
4310 }
4311 break;
4312
4313 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4314 if (all) {
4315 return NVME_INVALID_FIELD | NVME_DNR;
4316 }
4317
4318 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4319
4320 default:
4321 trace_pci_nvme_err_invalid_mgmt_action(action);
4322 status = NVME_INVALID_FIELD;
4323 }
4324
4325 if (status == NVME_ZONE_INVAL_TRANSITION) {
4326 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4327 zone->d.za);
4328 }
4329 if (status) {
4330 status |= NVME_DNR;
4331 }
4332
4333 return status;
4334 }
4335
static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4337 {
4338 NvmeZoneState zs = nvme_get_zone_state(zl);
4339
4340 switch (zafs) {
4341 case NVME_ZONE_REPORT_ALL:
4342 return true;
4343 case NVME_ZONE_REPORT_EMPTY:
4344 return zs == NVME_ZONE_STATE_EMPTY;
4345 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4346 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4347 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4348 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4349 case NVME_ZONE_REPORT_CLOSED:
4350 return zs == NVME_ZONE_STATE_CLOSED;
4351 case NVME_ZONE_REPORT_FULL:
4352 return zs == NVME_ZONE_STATE_FULL;
4353 case NVME_ZONE_REPORT_READ_ONLY:
4354 return zs == NVME_ZONE_STATE_READ_ONLY;
4355 case NVME_ZONE_REPORT_OFFLINE:
4356 return zs == NVME_ZONE_STATE_OFFLINE;
4357 default:
4358 return false;
4359 }
4360 }
4361
static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4363 {
4364 NvmeCmd *cmd = &req->cmd;
4365 NvmeNamespace *ns = req->ns;
4366 /* cdw12 is zero-based number of dwords to return. Convert to bytes */
4367 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4368 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4369 uint32_t zone_idx, zra, zrasf, partial;
4370 uint64_t max_zones, nr_zones = 0;
4371 uint16_t status;
4372 uint64_t slba;
4373 NvmeZoneDescr *z;
4374 NvmeZone *zone;
4375 NvmeZoneReportHeader *header;
4376 void *buf, *buf_p;
4377 size_t zone_entry_sz;
4378 int i;
4379
4380 req->status = NVME_SUCCESS;
4381
4382 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4383 if (status) {
4384 return status;
4385 }
4386
4387 zra = dw13 & 0xff;
4388 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4389 return NVME_INVALID_FIELD | NVME_DNR;
4390 }
4391 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4392 return NVME_INVALID_FIELD | NVME_DNR;
4393 }
4394
4395 zrasf = (dw13 >> 8) & 0xff;
4396 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4397 return NVME_INVALID_FIELD | NVME_DNR;
4398 }
4399
4400 if (data_size < sizeof(NvmeZoneReportHeader)) {
4401 return NVME_INVALID_FIELD | NVME_DNR;
4402 }
4403
4404 status = nvme_check_mdts(n, data_size);
4405 if (status) {
4406 return status;
4407 }
4408
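    /*
     * Partial Report: when set, the number of zones reported in the header
     * is limited to the descriptors that fit in the buffer rather than all
     * zones matching the filter.
     */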
4409 partial = (dw13 >> 16) & 0x01;
4410
4411 zone_entry_sz = sizeof(NvmeZoneDescr);
4412 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4413 zone_entry_sz += ns->params.zd_extension_size;
4414 }
4415
4416 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4417 buf = g_malloc0(data_size);
4418
4419 zone = &ns->zone_array[zone_idx];
4420 for (i = zone_idx; i < ns->num_zones; i++) {
4421 if (partial && nr_zones >= max_zones) {
4422 break;
4423 }
4424 if (nvme_zone_matches_filter(zrasf, zone++)) {
4425 nr_zones++;
4426 }
4427 }
4428 header = buf;
4429 header->nr_zones = cpu_to_le64(nr_zones);
4430
4431 buf_p = buf + sizeof(NvmeZoneReportHeader);
4432 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4433 zone = &ns->zone_array[zone_idx];
4434 if (nvme_zone_matches_filter(zrasf, zone)) {
4435 z = buf_p;
4436 buf_p += sizeof(NvmeZoneDescr);
4437
4438 z->zt = zone->d.zt;
4439 z->zs = zone->d.zs;
4440 z->zcap = cpu_to_le64(zone->d.zcap);
4441 z->zslba = cpu_to_le64(zone->d.zslba);
4442 z->za = zone->d.za;
4443
4444 if (nvme_wp_is_valid(zone)) {
4445 z->wp = cpu_to_le64(zone->d.wp);
4446 } else {
4447 z->wp = cpu_to_le64(~0ULL);
4448 }
4449
4450 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4451 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4452 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4453 ns->params.zd_extension_size);
4454 }
4455 buf_p += ns->params.zd_extension_size;
4456 }
4457
4458 max_zones--;
4459 }
4460 }
4461
4462 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4463
4464 g_free(buf);
4465
4466 return status;
4467 }
4468
static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4470 size_t len)
4471 {
4472 NvmeNamespace *ns = req->ns;
4473 NvmeEnduranceGroup *endgrp;
4474 NvmeRuhStatus *hdr;
4475 NvmeRuhStatusDescr *ruhsd;
4476 unsigned int nruhsd;
4477 uint16_t rg, ph, *ruhid;
4478 size_t trans_len;
4479 g_autofree uint8_t *buf = NULL;
4480
4481 if (!n->subsys) {
4482 return NVME_INVALID_FIELD | NVME_DNR;
4483 }
4484
4485 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4486 return NVME_INVALID_NSID | NVME_DNR;
4487 }
4488
4489 if (!n->subsys->endgrp.fdp.enabled) {
4490 return NVME_FDP_DISABLED | NVME_DNR;
4491 }
4492
4493 endgrp = ns->endgrp;
4494
4495 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4496 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4497 buf = g_malloc0(trans_len);
4498
4499 trans_len = MIN(trans_len, len);
4500
4501 hdr = (NvmeRuhStatus *)buf;
4502 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4503
4504 hdr->nruhsd = cpu_to_le16(nruhsd);
4505
4506 ruhid = ns->fdp.phs;
4507
4508 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4509 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4510
4511 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4512 uint16_t pid = nvme_make_pid(ns, rg, ph);
4513
4514 ruhsd->pid = cpu_to_le16(pid);
4515 ruhsd->ruhid = *ruhid;
4516 ruhsd->earutr = 0;
4517 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4518 }
4519 }
4520
4521 return nvme_c2h(n, buf, trans_len, req);
4522 }
4523
static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4525 {
4526 NvmeCmd *cmd = &req->cmd;
4527 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4528 uint32_t numd = le32_to_cpu(cmd->cdw11);
4529 uint8_t mo = (cdw10 & 0xff);
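    /* cdw11 holds NUMD, a 0's based count of dwords; convert to bytes */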
4530 size_t len = (numd + 1) << 2;
4531
4532 switch (mo) {
4533 case NVME_IOMR_MO_NOP:
4534 return 0;
4535 case NVME_IOMR_MO_RUH_STATUS:
4536 return nvme_io_mgmt_recv_ruhs(n, req, len);
4537 default:
4538 return NVME_INVALID_FIELD | NVME_DNR;
    }
4540 }
4541
static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4543 {
4544 NvmeCmd *cmd = &req->cmd;
4545 NvmeNamespace *ns = req->ns;
4546 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4547 uint16_t ret = NVME_SUCCESS;
4548 uint32_t npid = (cdw10 >> 16) + 1;
4549 unsigned int i = 0;
4550 g_autofree uint16_t *pids = NULL;
4551 uint32_t maxnpid;
4552
4553 if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4554 return NVME_FDP_DISABLED | NVME_DNR;
4555 }
4556
4557 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4558
4559 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4560 return NVME_INVALID_FIELD | NVME_DNR;
4561 }
4562
4563 pids = g_new(uint16_t, npid);
4564
4565 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4566 if (ret) {
4567 return ret;
4568 }
4569
4570 for (; i < npid; i++) {
4571 if (!nvme_update_ruh(n, ns, pids[i])) {
4572 return NVME_INVALID_FIELD | NVME_DNR;
4573 }
4574 }
4575
4576 return ret;
4577 }
4578
static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4580 {
4581 NvmeCmd *cmd = &req->cmd;
4582 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4583 uint8_t mo = (cdw10 & 0xff);
4584
4585 switch (mo) {
4586 case NVME_IOMS_MO_NOP:
4587 return 0;
4588 case NVME_IOMS_MO_RUH_UPDATE:
4589 return nvme_io_mgmt_send_ruh_update(n, req);
4590 default:
4591 return NVME_INVALID_FIELD | NVME_DNR;
    }
4593 }
4594
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4596 {
4597 NvmeNamespace *ns;
4598 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4599
4600 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4601 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4602
    /*
     * In the base NVM command set, Flush may apply to all namespaces
     * (indicated by NSID being set to FFFFFFFFh). But if that feature is
     * used together with TP 4056 (Namespace Types), the semantics become
     * unclear.
     *
     * If NSID is indeed set to FFFFFFFFh, we cannot associate the opcode
     * with a specific command, since a unique I/O command set cannot be
     * determined. Opcode 0h might mean something entirely different from
     * Flush in another command set, so a broadcast NSID could mean either
     * "for all namespaces, apply the command set specific command that uses
     * the 0h opcode" or "for all namespaces, apply the command that uses
     * the 0h opcode if, and only if, it allows NSID to be FFFFFFFFh".
     *
     * Luckily, for now, we do not need to care: the device only supports
     * command sets that include the NVM Flush command (NVM and Zoned), so
     * we always do an NVM Flush.
     */
4622
4623 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4624 return nvme_flush(n, req);
4625 }
4626
4627 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4628 return NVME_INVALID_NSID | NVME_DNR;
4629 }
4630
4631 ns = nvme_ns(n, nsid);
4632 if (unlikely(!ns)) {
4633 return NVME_INVALID_FIELD | NVME_DNR;
4634 }
4635
4636 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4637 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4638 return NVME_INVALID_OPCODE | NVME_DNR;
4639 }
4640
4641 if (ns->status) {
4642 return ns->status;
4643 }
4644
4645 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4646 return NVME_INVALID_FIELD;
4647 }
4648
4649 req->ns = ns;
4650
4651 switch (req->cmd.opcode) {
4652 case NVME_CMD_WRITE_ZEROES:
4653 return nvme_write_zeroes(n, req);
4654 case NVME_CMD_ZONE_APPEND:
4655 return nvme_zone_append(n, req);
4656 case NVME_CMD_WRITE:
4657 return nvme_write(n, req);
4658 case NVME_CMD_READ:
4659 return nvme_read(n, req);
4660 case NVME_CMD_COMPARE:
4661 return nvme_compare(n, req);
4662 case NVME_CMD_DSM:
4663 return nvme_dsm(n, req);
4664 case NVME_CMD_VERIFY:
4665 return nvme_verify(n, req);
4666 case NVME_CMD_COPY:
4667 return nvme_copy(n, req);
4668 case NVME_CMD_ZONE_MGMT_SEND:
4669 return nvme_zone_mgmt_send(n, req);
4670 case NVME_CMD_ZONE_MGMT_RECV:
4671 return nvme_zone_mgmt_recv(n, req);
4672 case NVME_CMD_IO_MGMT_RECV:
4673 return nvme_io_mgmt_recv(n, req);
4674 case NVME_CMD_IO_MGMT_SEND:
4675 return nvme_io_mgmt_send(n, req);
4676 default:
4677 g_assert_not_reached();
4678 }
4679
4680 return NVME_INVALID_OPCODE | NVME_DNR;
4681 }
4682
static void nvme_cq_notifier(EventNotifier *e)
4684 {
4685 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4686 NvmeCtrl *n = cq->ctrl;
4687
4688 if (!event_notifier_test_and_clear(e)) {
4689 return;
4690 }
4691
4692 nvme_update_cq_head(cq);
4693
4694 if (cq->tail == cq->head) {
4695 if (cq->irq_enabled) {
4696 n->cq_pending--;
4697 }
4698
4699 nvme_irq_deassert(n, cq);
4700 }
4701
4702 qemu_bh_schedule(cq->bh);
4703 }
4704
static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4706 {
4707 NvmeCtrl *n = cq->ctrl;
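    /*
     * With CAP.DSTRD cleared, the completion queue head doorbell for queue
     * cqid is located at BAR0 offset 1000h + (2 * cqid + 1) * 4, hence the
     * (cqid << 3) + (1 << 2) offset relative to 1000h.
     */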
4708 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4709 int ret;
4710
4711 ret = event_notifier_init(&cq->notifier, 0);
4712 if (ret < 0) {
4713 return ret;
4714 }
4715
4716 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4717 memory_region_add_eventfd(&n->iomem,
4718 0x1000 + offset, 4, false, 0, &cq->notifier);
4719
4720 return 0;
4721 }
4722
static void nvme_sq_notifier(EventNotifier *e)
4724 {
4725 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4726
4727 if (!event_notifier_test_and_clear(e)) {
4728 return;
4729 }
4730
4731 nvme_process_sq(sq);
4732 }
4733
static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4735 {
4736 NvmeCtrl *n = sq->ctrl;
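    /*
     * With CAP.DSTRD cleared, the submission queue tail doorbell for queue
     * sqid is located at BAR0 offset 1000h + (2 * sqid) * 4, i.e.
     * 1000h + (sqid << 3).
     */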
4737 uint16_t offset = sq->sqid << 3;
4738 int ret;
4739
4740 ret = event_notifier_init(&sq->notifier, 0);
4741 if (ret < 0) {
4742 return ret;
4743 }
4744
4745 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4746 memory_region_add_eventfd(&n->iomem,
4747 0x1000 + offset, 4, false, 0, &sq->notifier);
4748
4749 return 0;
4750 }
4751
static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4753 {
4754 uint16_t offset = sq->sqid << 3;
4755
4756 n->sq[sq->sqid] = NULL;
4757 qemu_bh_delete(sq->bh);
4758 if (sq->ioeventfd_enabled) {
4759 memory_region_del_eventfd(&n->iomem,
4760 0x1000 + offset, 4, false, 0, &sq->notifier);
4761 event_notifier_set_handler(&sq->notifier, NULL);
4762 event_notifier_cleanup(&sq->notifier);
4763 }
4764 g_free(sq->io_req);
4765 if (sq->sqid) {
4766 g_free(sq);
4767 }
4768 }
4769
static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4771 {
4772 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4773 NvmeRequest *r, *next;
4774 NvmeSQueue *sq;
4775 NvmeCQueue *cq;
4776 uint16_t qid = le16_to_cpu(c->qid);
4777
4778 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4779 trace_pci_nvme_err_invalid_del_sq(qid);
4780 return NVME_INVALID_QID | NVME_DNR;
4781 }
4782
4783 trace_pci_nvme_del_sq(qid);
4784
4785 sq = n->sq[qid];
4786 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4787 r = QTAILQ_FIRST(&sq->out_req_list);
4788 assert(r->aiocb);
4789 blk_aio_cancel(r->aiocb);
4790 }
4791
4792 assert(QTAILQ_EMPTY(&sq->out_req_list));
4793
4794 if (!nvme_check_cqid(n, sq->cqid)) {
4795 cq = n->cq[sq->cqid];
4796 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4797
4798 nvme_post_cqes(cq);
4799 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4800 if (r->sq == sq) {
4801 QTAILQ_REMOVE(&cq->req_list, r, entry);
4802 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4803 }
4804 }
4805 }
4806
4807 nvme_free_sq(sq, n);
4808 return NVME_SUCCESS;
4809 }
4810
static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4812 uint16_t sqid, uint16_t cqid, uint16_t size)
4813 {
4814 int i;
4815 NvmeCQueue *cq;
4816
4817 sq->ctrl = n;
4818 sq->dma_addr = dma_addr;
4819 sq->sqid = sqid;
4820 sq->size = size;
4821 sq->cqid = cqid;
4822 sq->head = sq->tail = 0;
4823 sq->io_req = g_new0(NvmeRequest, sq->size);
4824
4825 QTAILQ_INIT(&sq->req_list);
4826 QTAILQ_INIT(&sq->out_req_list);
4827 for (i = 0; i < sq->size; i++) {
4828 sq->io_req[i].sq = sq;
4829 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4830 }
4831
4832 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4833 &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4834
4835 if (n->dbbuf_enabled) {
4836 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4837 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4838
4839 if (n->params.ioeventfd && sq->sqid != 0) {
4840 if (!nvme_init_sq_ioeventfd(sq)) {
4841 sq->ioeventfd_enabled = true;
4842 }
4843 }
4844 }
4845
4846 assert(n->cq[cqid]);
4847 cq = n->cq[cqid];
4848 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4849 n->sq[sqid] = sq;
4850 }
4851
static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4853 {
4854 NvmeSQueue *sq;
4855 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4856
4857 uint16_t cqid = le16_to_cpu(c->cqid);
4858 uint16_t sqid = le16_to_cpu(c->sqid);
4859 uint16_t qsize = le16_to_cpu(c->qsize);
4860 uint16_t qflags = le16_to_cpu(c->sq_flags);
4861 uint64_t prp1 = le64_to_cpu(c->prp1);
4862
4863 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4864
4865 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4866 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4867 return NVME_INVALID_CQID | NVME_DNR;
4868 }
4869 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4870 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4871 return NVME_INVALID_QID | NVME_DNR;
4872 }
4873 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4874 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4875 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4876 }
4877 if (unlikely(prp1 & (n->page_size - 1))) {
4878 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4879 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4880 }
4881 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4882 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4883 return NVME_INVALID_FIELD | NVME_DNR;
4884 }
4885 sq = g_malloc0(sizeof(*sq));
4886 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4887 return NVME_SUCCESS;
4888 }
4889
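/*
 * Aggregated block-backend accounting, used to fill in the SMART / Health
 * and Endurance Group log pages.
 */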
4890 struct nvme_stats {
4891 uint64_t units_read;
4892 uint64_t units_written;
4893 uint64_t read_commands;
4894 uint64_t write_commands;
4895 };
4896
static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4898 {
4899 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4900
4901 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4902 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4903 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4904 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4905 }
4906
static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4908 uint64_t off, NvmeRequest *req)
4909 {
4910 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4911 struct nvme_stats stats = { 0 };
4912 NvmeSmartLog smart = { 0 };
4913 uint32_t trans_len;
4914 NvmeNamespace *ns;
4915 time_t current_ms;
4916 uint64_t u_read, u_written;
4917
4918 if (off >= sizeof(smart)) {
4919 return NVME_INVALID_FIELD | NVME_DNR;
4920 }
4921
4922 if (nsid != 0xffffffff) {
4923 ns = nvme_ns(n, nsid);
4924 if (!ns) {
4925 return NVME_INVALID_NSID | NVME_DNR;
4926 }
4927 nvme_set_blk_stats(ns, &stats);
4928 } else {
4929 int i;
4930
4931 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4932 ns = nvme_ns(n, i);
4933 if (!ns) {
4934 continue;
4935 }
4936 nvme_set_blk_stats(ns, &stats);
4937 }
4938 }
4939
4940 trans_len = MIN(sizeof(smart) - off, buf_len);
4941 smart.critical_warning = n->smart_critical_warning;
4942
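    /*
     * Data Units Read/Written are reported in units of 1000 512-byte
     * blocks, rounded up.
     */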
4943 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
4944 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
4945
4946 smart.data_units_read[0] = cpu_to_le64(u_read);
4947 smart.data_units_written[0] = cpu_to_le64(u_written);
4948 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4949 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4950
4951 smart.temperature = cpu_to_le16(n->temperature);
4952
4953 if ((n->temperature >= n->features.temp_thresh_hi) ||
4954 (n->temperature <= n->features.temp_thresh_low)) {
4955 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4956 }
4957
4958 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4959 smart.power_on_hours[0] =
4960 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4961
4962 if (!rae) {
4963 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4964 }
4965
4966 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4967 }
4968
static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4970 uint64_t off, NvmeRequest *req)
4971 {
4972 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
4973 uint16_t endgrpid = (dw11 >> 16) & 0xffff;
4974 struct nvme_stats stats = {};
4975 NvmeEndGrpLog info = {};
4976 int i;
4977
4978 if (!n->subsys || endgrpid != 0x1) {
4979 return NVME_INVALID_FIELD | NVME_DNR;
4980 }
4981
4982 if (off >= sizeof(info)) {
4983 return NVME_INVALID_FIELD | NVME_DNR;
4984 }
4985
4986 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4987 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
4988 if (!ns) {
4989 continue;
4990 }
4991
4992 nvme_set_blk_stats(ns, &stats);
4993 }
4994
4995 info.data_units_read[0] =
4996 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
4997 info.data_units_written[0] =
4998 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
4999 info.media_units_written[0] =
5000 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5001
5002 info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5003 info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5004
5005 buf_len = MIN(sizeof(info) - off, buf_len);
5006
5007 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
5008 }
5009
5010
static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
5012 NvmeRequest *req)
5013 {
5014 uint32_t trans_len;
5015 NvmeFwSlotInfoLog fw_log = {
5016 .afi = 0x1,
5017 };
5018
5019 if (off >= sizeof(fw_log)) {
5020 return NVME_INVALID_FIELD | NVME_DNR;
5021 }
5022
5023 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5024 trans_len = MIN(sizeof(fw_log) - off, buf_len);
5025
5026 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5027 }
5028
static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5030 uint64_t off, NvmeRequest *req)
5031 {
5032 uint32_t trans_len;
5033 NvmeErrorLog errlog;
5034
5035 if (off >= sizeof(errlog)) {
5036 return NVME_INVALID_FIELD | NVME_DNR;
5037 }
5038
5039 if (!rae) {
5040 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5041 }
5042
5043 memset(&errlog, 0x0, sizeof(errlog));
5044 trans_len = MIN(sizeof(errlog) - off, buf_len);
5045
5046 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5047 }
5048
static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5050 uint64_t off, NvmeRequest *req)
5051 {
5052 uint32_t nslist[1024];
5053 uint32_t trans_len;
5054 int i = 0;
5055 uint32_t nsid;
5056
5057 if (off >= sizeof(nslist)) {
5058 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5059 return NVME_INVALID_FIELD | NVME_DNR;
5060 }
5061
5062 memset(nslist, 0x0, sizeof(nslist));
5063 trans_len = MIN(sizeof(nslist) - off, buf_len);
5064
5065 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5066 NVME_CHANGED_NSID_SIZE) {
        /*
         * If more than 1024 namespaces have changed, the spec requires the
         * first entry in the log page to be set to FFFFFFFFh and the
         * remaining entries to 0.
         */
5071 if (i == ARRAY_SIZE(nslist)) {
5072 memset(nslist, 0x0, sizeof(nslist));
5073 nslist[0] = 0xffffffff;
5074 break;
5075 }
5076
5077 nslist[i++] = nsid;
5078 clear_bit(nsid, n->changed_nsids);
5079 }
5080
    /*
     * If the list was truncated because more than 1024 namespaces changed,
     * clear all remaining changed-namespace bits.
     */
5085 if (nslist[0] == 0xffffffff) {
5086 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5087 }
5088
5089 if (!rae) {
5090 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5091 }
5092
5093 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5094 }
5095
static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5097 uint64_t off, NvmeRequest *req)
5098 {
5099 NvmeEffectsLog log = {};
5100 const uint32_t *src_iocs = NULL;
5101 uint32_t trans_len;
5102
5103 if (off >= sizeof(log)) {
5104 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5105 return NVME_INVALID_FIELD | NVME_DNR;
5106 }
5107
5108 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5109 case NVME_CC_CSS_NVM:
5110 src_iocs = nvme_cse_iocs_nvm;
5111 /* fall through */
5112 case NVME_CC_CSS_ADMIN_ONLY:
5113 break;
5114 case NVME_CC_CSS_CSI:
5115 switch (csi) {
5116 case NVME_CSI_NVM:
5117 src_iocs = nvme_cse_iocs_nvm;
5118 break;
5119 case NVME_CSI_ZONED:
5120 src_iocs = nvme_cse_iocs_zoned;
5121 break;
5122 }
5123 }
5124
5125 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
5126
5127 if (src_iocs) {
5128 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
5129 }
5130
5131 trans_len = MIN(sizeof(log) - off, buf_len);
5132
5133 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5134 }
5135
static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5137 {
5138 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5139 + vss;
5140 return ROUND_UP(entry_siz, 8);
5141 }
5142
static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5144 uint64_t off, NvmeRequest *req)
5145 {
5146 uint32_t log_size, trans_len;
5147 g_autofree uint8_t *buf = NULL;
5148 NvmeFdpDescrHdr *hdr;
5149 NvmeRuhDescr *ruhd;
5150 NvmeEnduranceGroup *endgrp;
5151 NvmeFdpConfsHdr *log;
5152 size_t nruh, fdp_descr_size;
5153 int i;
5154
5155 if (endgrpid != 1 || !n->subsys) {
5156 return NVME_INVALID_FIELD | NVME_DNR;
5157 }
5158
5159 endgrp = &n->subsys->endgrp;
5160
5161 if (endgrp->fdp.enabled) {
5162 nruh = endgrp->fdp.nruh;
5163 } else {
5164 nruh = 1;
5165 }
5166
5167 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5168 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5169
5170 if (off >= log_size) {
5171 return NVME_INVALID_FIELD | NVME_DNR;
5172 }
5173
5174 trans_len = MIN(log_size - off, buf_len);
5175
5176 buf = g_malloc0(log_size);
5177 log = (NvmeFdpConfsHdr *)buf;
5178 hdr = (NvmeFdpDescrHdr *)(log + 1);
5179 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5180
5181 log->num_confs = cpu_to_le16(0);
5182 log->size = cpu_to_le32(log_size);
5183
5184 hdr->descr_size = cpu_to_le16(fdp_descr_size);
5185 if (endgrp->fdp.enabled) {
5186 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5187 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5188 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5189 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5190 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5191 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5192 hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5193
5194 for (i = 0; i < nruh; i++) {
5195 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5196 ruhd++;
5197 }
5198 } else {
5199 /* 1 bit for RUH in PIF -> 2 RUHs max. */
5200 hdr->nrg = cpu_to_le16(1);
5201 hdr->nruh = cpu_to_le16(1);
5202 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5203 hdr->nnss = cpu_to_le32(1);
5204 hdr->runs = cpu_to_le64(96 * MiB);
5205
5206 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5207 }
5208
5209 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5210 }
5211
static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5213 uint32_t dw10, uint32_t dw12,
5214 uint32_t buf_len, uint64_t off,
5215 NvmeRequest *req)
5216 {
5217 NvmeRuHandle *ruh;
5218 NvmeRuhuLog *hdr;
5219 NvmeRuhuDescr *ruhud;
5220 NvmeEnduranceGroup *endgrp;
5221 g_autofree uint8_t *buf = NULL;
5222 uint32_t log_size, trans_len;
5223 uint16_t i;
5224
5225 if (endgrpid != 1 || !n->subsys) {
5226 return NVME_INVALID_FIELD | NVME_DNR;
5227 }
5228
5229 endgrp = &n->subsys->endgrp;
5230
5231 if (!endgrp->fdp.enabled) {
5232 return NVME_FDP_DISABLED | NVME_DNR;
5233 }
5234
5235 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5236
5237 if (off >= log_size) {
5238 return NVME_INVALID_FIELD | NVME_DNR;
5239 }
5240
5241 trans_len = MIN(log_size - off, buf_len);
5242
5243 buf = g_malloc0(log_size);
5244 hdr = (NvmeRuhuLog *)buf;
5245 ruhud = (NvmeRuhuDescr *)(hdr + 1);
5246
5247 ruh = endgrp->fdp.ruhs;
5248 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5249
5250 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5251 ruhud->ruha = ruh->ruha;
5252 }
5253
5254 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5255 }
5256
static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5258 uint64_t off, NvmeRequest *req)
5259 {
5260 NvmeEnduranceGroup *endgrp;
5261 NvmeFdpStatsLog log = {};
5262 uint32_t trans_len;
5263
5264 if (off >= sizeof(NvmeFdpStatsLog)) {
5265 return NVME_INVALID_FIELD | NVME_DNR;
5266 }
5267
5268 if (endgrpid != 1 || !n->subsys) {
5269 return NVME_INVALID_FIELD | NVME_DNR;
5270 }
5271
5272 if (!n->subsys->endgrp.fdp.enabled) {
5273 return NVME_FDP_DISABLED | NVME_DNR;
5274 }
5275
5276 endgrp = &n->subsys->endgrp;
5277
5278 trans_len = MIN(sizeof(log) - off, buf_len);
5279
    /* the spec value is 128 bits wide, but we only use the lower 64 bits */
5281 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5282 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5283 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5284
5285 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5286 }
5287
static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5289 uint32_t buf_len, uint64_t off,
5290 NvmeRequest *req)
5291 {
5292 NvmeEnduranceGroup *endgrp;
5293 NvmeCmd *cmd = &req->cmd;
5294 bool host_events = (cmd->cdw10 >> 8) & 0x1;
5295 uint32_t log_size, trans_len;
5296 NvmeFdpEventBuffer *ebuf;
5297 g_autofree NvmeFdpEventsLog *elog = NULL;
5298 NvmeFdpEvent *event;
5299
5300 if (endgrpid != 1 || !n->subsys) {
5301 return NVME_INVALID_FIELD | NVME_DNR;
5302 }
5303
5304 endgrp = &n->subsys->endgrp;
5305
5306 if (!endgrp->fdp.enabled) {
5307 return NVME_FDP_DISABLED | NVME_DNR;
5308 }
5309
5310 if (host_events) {
5311 ebuf = &endgrp->fdp.host_events;
5312 } else {
5313 ebuf = &endgrp->fdp.ctrl_events;
5314 }
5315
5316 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5317
5318 if (off >= log_size) {
5319 return NVME_INVALID_FIELD | NVME_DNR;
5320 }
5321
5322 trans_len = MIN(log_size - off, buf_len);
5323 elog = g_malloc0(log_size);
5324 elog->num_events = cpu_to_le32(ebuf->nelems);
5325 event = (NvmeFdpEvent *)(elog + 1);
5326
5327 if (ebuf->nelems && ebuf->start == ebuf->next) {
5328 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5329 /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */
5330 memcpy(event, &ebuf->events[ebuf->start],
5331 sizeof(NvmeFdpEvent) * nelems);
5332 memcpy(event + nelems, ebuf->events,
5333 sizeof(NvmeFdpEvent) * ebuf->next);
5334 } else if (ebuf->start < ebuf->next) {
5335 memcpy(event, &ebuf->events[ebuf->start],
5336 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5337 }
5338
5339 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5340 }
5341
static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5343 {
5344 NvmeCmd *cmd = &req->cmd;
5345
5346 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5347 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5348 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5349 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5350 uint8_t lid = dw10 & 0xff;
5351 uint8_t lsp = (dw10 >> 8) & 0xf;
5352 uint8_t rae = (dw10 >> 15) & 0x1;
5353 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
5354 uint32_t numdl, numdu, lspi;
5355 uint64_t off, lpol, lpou;
5356 size_t len;
5357 uint16_t status;
5358
5359 numdl = (dw10 >> 16);
5360 numdu = (dw11 & 0xffff);
5361 lspi = (dw11 >> 16);
5362 lpol = dw12;
5363 lpou = dw13;
5364
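    /*
     * NUMDL/NUMDU form a 0's based dword count and LPOL/LPOU the 64-bit,
     * dword-aligned log page offset.
     */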
5365 len = (((numdu << 16) | numdl) + 1) << 2;
5366 off = (lpou << 32ULL) | lpol;
5367
5368 if (off & 0x3) {
5369 return NVME_INVALID_FIELD | NVME_DNR;
5370 }
5371
5372 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5373
5374 status = nvme_check_mdts(n, len);
5375 if (status) {
5376 return status;
5377 }
5378
5379 switch (lid) {
5380 case NVME_LOG_ERROR_INFO:
5381 return nvme_error_info(n, rae, len, off, req);
5382 case NVME_LOG_SMART_INFO:
5383 return nvme_smart_info(n, rae, len, off, req);
5384 case NVME_LOG_FW_SLOT_INFO:
5385 return nvme_fw_log_info(n, len, off, req);
5386 case NVME_LOG_CHANGED_NSLIST:
5387 return nvme_changed_nslist(n, rae, len, off, req);
5388 case NVME_LOG_CMD_EFFECTS:
5389 return nvme_cmd_effects(n, csi, len, off, req);
5390 case NVME_LOG_ENDGRP:
5391 return nvme_endgrp_info(n, rae, len, off, req);
5392 case NVME_LOG_FDP_CONFS:
5393 return nvme_fdp_confs(n, lspi, len, off, req);
5394 case NVME_LOG_FDP_RUH_USAGE:
5395 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5396 case NVME_LOG_FDP_STATS:
5397 return nvme_fdp_stats(n, lspi, len, off, req);
5398 case NVME_LOG_FDP_EVENTS:
5399 return nvme_fdp_events(n, lspi, len, off, req);
5400 default:
5401 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5402 return NVME_INVALID_FIELD | NVME_DNR;
5403 }
5404 }
5405
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5407 {
5408 PCIDevice *pci = PCI_DEVICE(n);
5409 uint16_t offset = (cq->cqid << 3) + (1 << 2);
5410
5411 n->cq[cq->cqid] = NULL;
5412 qemu_bh_delete(cq->bh);
5413 if (cq->ioeventfd_enabled) {
5414 memory_region_del_eventfd(&n->iomem,
5415 0x1000 + offset, 4, false, 0, &cq->notifier);
5416 event_notifier_set_handler(&cq->notifier, NULL);
5417 event_notifier_cleanup(&cq->notifier);
5418 }
5419 if (msix_enabled(pci)) {
5420 msix_vector_unuse(pci, cq->vector);
5421 }
5422 if (cq->cqid) {
5423 g_free(cq);
5424 }
5425 }
5426
static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5428 {
5429 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5430 NvmeCQueue *cq;
5431 uint16_t qid = le16_to_cpu(c->qid);
5432
5433 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5434 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5435 return NVME_INVALID_CQID | NVME_DNR;
5436 }
5437
5438 cq = n->cq[qid];
5439 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5440 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5441 return NVME_INVALID_QUEUE_DEL;
5442 }
5443
5444 if (cq->irq_enabled && cq->tail != cq->head) {
5445 n->cq_pending--;
5446 }
5447
5448 nvme_irq_deassert(n, cq);
5449 trace_pci_nvme_del_cq(qid);
5450 nvme_free_cq(cq, n);
5451 return NVME_SUCCESS;
5452 }
5453
static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5455 uint16_t cqid, uint16_t vector, uint16_t size,
5456 uint16_t irq_enabled)
5457 {
5458 PCIDevice *pci = PCI_DEVICE(n);
5459
5460 if (msix_enabled(pci)) {
5461 msix_vector_use(pci, vector);
5462 }
5463 cq->ctrl = n;
5464 cq->cqid = cqid;
5465 cq->size = size;
5466 cq->dma_addr = dma_addr;
5467 cq->phase = 1;
5468 cq->irq_enabled = irq_enabled;
5469 cq->vector = vector;
5470 cq->head = cq->tail = 0;
5471 QTAILQ_INIT(&cq->req_list);
5472 QTAILQ_INIT(&cq->sq_list);
5473 if (n->dbbuf_enabled) {
5474 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5475 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5476
5477 if (n->params.ioeventfd && cqid != 0) {
5478 if (!nvme_init_cq_ioeventfd(cq)) {
5479 cq->ioeventfd_enabled = true;
5480 }
5481 }
5482 }
5483 n->cq[cqid] = cq;
5484 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5485 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5486 }
5487
static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5489 {
5490 NvmeCQueue *cq;
5491 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5492 uint16_t cqid = le16_to_cpu(c->cqid);
5493 uint16_t vector = le16_to_cpu(c->irq_vector);
5494 uint16_t qsize = le16_to_cpu(c->qsize);
5495 uint16_t qflags = le16_to_cpu(c->cq_flags);
5496 uint64_t prp1 = le64_to_cpu(c->prp1);
5497 uint32_t cc = ldq_le_p(&n->bar.cc);
5498 uint8_t iocqes = NVME_CC_IOCQES(cc);
5499 uint8_t iosqes = NVME_CC_IOSQES(cc);
5500
5501 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5502 NVME_CQ_FLAGS_IEN(qflags) != 0);
5503
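    /*
     * Only the required I/O queue entry sizes are supported (64-byte
     * submission and 16-byte completion queue entries).
     */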
5504 if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5505 trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5506 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5507 }
5508
5509 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5510 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5511 return NVME_INVALID_QID | NVME_DNR;
5512 }
5513 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5514 trace_pci_nvme_err_invalid_create_cq_size(qsize);
5515 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5516 }
5517 if (unlikely(prp1 & (n->page_size - 1))) {
5518 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5519 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5520 }
5521 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5522 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5523 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5524 }
5525 if (unlikely(vector >= n->conf_msix_qsize)) {
5526 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5527 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5528 }
5529 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5530 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5531 return NVME_INVALID_FIELD | NVME_DNR;
5532 }
5533
5534 cq = g_malloc0(sizeof(*cq));
5535 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5536 NVME_CQ_FLAGS_IEN(qflags));
5537
5538 /*
5539 * It is only required to set qs_created when creating a completion queue;
5540 * creating a submission queue without a matching completion queue will
5541 * fail.
5542 */
5543 n->qs_created = true;
5544 return NVME_SUCCESS;
5545 }
5546
static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5548 {
5549 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5550
5551 return nvme_c2h(n, id, sizeof(id), req);
5552 }
5553
static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5555 {
5556 trace_pci_nvme_identify_ctrl();
5557
5558 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5559 }
5560
static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5562 {
5563 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5564 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5565 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5566
5567 trace_pci_nvme_identify_ctrl_csi(c->csi);
5568
5569 switch (c->csi) {
5570 case NVME_CSI_NVM:
5571 id_nvm->vsl = n->params.vsl;
5572 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5573 break;
5574
5575 case NVME_CSI_ZONED:
5576 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5577 break;
5578
5579 default:
5580 return NVME_INVALID_FIELD | NVME_DNR;
5581 }
5582
5583 return nvme_c2h(n, id, sizeof(id), req);
5584 }
5585
static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5587 {
5588 NvmeNamespace *ns;
5589 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5590 uint32_t nsid = le32_to_cpu(c->nsid);
5591
5592 trace_pci_nvme_identify_ns(nsid);
5593
5594 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5595 return NVME_INVALID_NSID | NVME_DNR;
5596 }
5597
5598 ns = nvme_ns(n, nsid);
5599 if (unlikely(!ns)) {
5600 if (!active) {
5601 ns = nvme_subsys_ns(n->subsys, nsid);
5602 if (!ns) {
5603 return nvme_rpt_empty_id_struct(n, req);
5604 }
5605 } else {
5606 return nvme_rpt_empty_id_struct(n, req);
5607 }
5608 }
5609
5610 if (active || ns->csi == NVME_CSI_NVM) {
5611 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5612 }
5613
5614 return NVME_INVALID_CMD_SET | NVME_DNR;
5615 }
5616
static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5618 bool attached)
5619 {
5620 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5621 uint32_t nsid = le32_to_cpu(c->nsid);
5622 uint16_t min_id = le16_to_cpu(c->ctrlid);
5623 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5624 uint16_t *ids = &list[1];
5625 NvmeNamespace *ns;
5626 NvmeCtrl *ctrl;
5627 int cntlid, nr_ids = 0;
5628
5629 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5630
5631 if (!n->subsys) {
5632 return NVME_INVALID_FIELD | NVME_DNR;
5633 }
5634
5635 if (attached) {
5636 if (nsid == NVME_NSID_BROADCAST) {
5637 return NVME_INVALID_FIELD | NVME_DNR;
5638 }
5639
5640 ns = nvme_subsys_ns(n->subsys, nsid);
5641 if (!ns) {
5642 return NVME_INVALID_FIELD | NVME_DNR;
5643 }
5644 }
5645
5646 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5647 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5648 if (!ctrl) {
5649 continue;
5650 }
5651
5652 if (attached && !nvme_ns(ctrl, nsid)) {
5653 continue;
5654 }
5655
5656 ids[nr_ids++] = cntlid;
5657 }
5658
5659 list[0] = nr_ids;
5660
5661 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5662 }
5663
static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5665 {
5666 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5667
5668 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5669 sizeof(NvmePriCtrlCap), req);
5670 }
5671
static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5673 {
5674 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5675 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5676 uint16_t min_id = le16_to_cpu(c->ctrlid);
5677 uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5678 NvmeSecCtrlList list = {0};
5679 uint8_t i;
5680
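    /* the Secondary Controller List holds at most 127 entries */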
5681 for (i = 0; i < num_sec_ctrl; i++) {
5682 if (n->sec_ctrl_list[i].scid >= min_id) {
5683 list.numcntl = MIN(num_sec_ctrl - i, 127);
5684 memcpy(&list.sec, n->sec_ctrl_list + i,
5685 list.numcntl * sizeof(NvmeSecCtrlEntry));
5686 break;
5687 }
5688 }
5689
5690 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5691
5692 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5693 }
5694
static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, NvmeRequest *req, bool alloc)
5696 {
5697 NvmeNamespace *ns;
5698 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5699 uint32_t nsid = le32_to_cpu(c->nsid);
5700
5701 trace_pci_nvme_identify_ns_ind(nsid);
5702
5703 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5704 return NVME_INVALID_NSID | NVME_DNR;
5705 }
5706
5707 ns = nvme_ns(n, nsid);
5708 if (unlikely(!ns)) {
5709 if (alloc) {
5710 ns = nvme_subsys_ns(n->subsys, nsid);
5711 if (!ns) {
5712 return nvme_rpt_empty_id_struct(n, req);
5713 }
5714 } else {
5715 return nvme_rpt_empty_id_struct(n, req);
5716 }
5717 }
5718
5719 return nvme_c2h(n, (uint8_t *)&ns->id_ns_ind, sizeof(NvmeIdNsInd), req);
5720 }
5721
static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5723 bool active)
5724 {
5725 NvmeNamespace *ns;
5726 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5727 uint32_t nsid = le32_to_cpu(c->nsid);
5728
5729 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5730
5731 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5732 return NVME_INVALID_NSID | NVME_DNR;
5733 }
5734
5735 ns = nvme_ns(n, nsid);
5736 if (unlikely(!ns)) {
5737 if (!active) {
5738 ns = nvme_subsys_ns(n->subsys, nsid);
5739 if (!ns) {
5740 return nvme_rpt_empty_id_struct(n, req);
5741 }
5742 } else {
5743 return nvme_rpt_empty_id_struct(n, req);
5744 }
5745 }
5746
5747 if (c->csi == NVME_CSI_NVM) {
5748 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5749 req);
5750 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5751 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5752 req);
5753 }
5754
5755 return NVME_INVALID_FIELD | NVME_DNR;
5756 }
5757
static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5759 bool active)
5760 {
5761 NvmeNamespace *ns;
5762 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5763 uint32_t min_nsid = le32_to_cpu(c->nsid);
5764 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5765 static const int data_len = sizeof(list);
5766 uint32_t *list_ptr = (uint32_t *)list;
5767 int i, j = 0;
5768
5769 trace_pci_nvme_identify_nslist(min_nsid);
5770
    /*
     * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
     * since the Active Namespace ID List should return namespaces with ids
     * *higher* than the NSID specified in the command. This is also
     * specified in the spec (NVM Express v1.3d, Section 5.15.4).
     */
5777 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5778 return NVME_INVALID_NSID | NVME_DNR;
5779 }
5780
5781 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5782 ns = nvme_ns(n, i);
5783 if (!ns) {
5784 if (!active) {
5785 ns = nvme_subsys_ns(n->subsys, i);
5786 if (!ns) {
5787 continue;
5788 }
5789 } else {
5790 continue;
5791 }
5792 }
5793 if (ns->params.nsid <= min_nsid) {
5794 continue;
5795 }
5796 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5797 if (j == data_len / sizeof(uint32_t)) {
5798 break;
5799 }
5800 }
5801
5802 return nvme_c2h(n, list, data_len, req);
5803 }
5804
5805 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5806 bool active)
5807 {
5808 NvmeNamespace *ns;
5809 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5810 uint32_t min_nsid = le32_to_cpu(c->nsid);
5811 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5812 static const int data_len = sizeof(list);
5813 uint32_t *list_ptr = (uint32_t *)list;
5814 int i, j = 0;
5815
5816 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5817
5818 /*
5819 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5820 */
5821 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5822 return NVME_INVALID_NSID | NVME_DNR;
5823 }
5824
5825 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5826 return NVME_INVALID_FIELD | NVME_DNR;
5827 }
5828
5829 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5830 ns = nvme_ns(n, i);
5831 if (!ns) {
5832 if (!active) {
5833 ns = nvme_subsys_ns(n->subsys, i);
5834 if (!ns) {
5835 continue;
5836 }
5837 } else {
5838 continue;
5839 }
5840 }
5841 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5842 continue;
5843 }
5844 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5845 if (j == data_len / sizeof(uint32_t)) {
5846 break;
5847 }
5848 }
5849
5850 return nvme_c2h(n, list, data_len, req);
5851 }
5852
5853 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5854 {
5855 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5856 uint16_t *nr_ids = &list[0];
5857 uint16_t *ids = &list[1];
5858 uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5859
5860 /*
5861 * The current nvme-subsys only supports Endurance Group #1.
5862 */
5863 if (!endgid) {
5864 *nr_ids = 1;
5865 ids[0] = 1;
5866 } else {
5867 *nr_ids = 0;
5868 }
5869
5870 return nvme_c2h(n, list, sizeof(list), req);
5871 }
5872
5873 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5874 {
5875 NvmeNamespace *ns;
5876 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5877 uint32_t nsid = le32_to_cpu(c->nsid);
5878 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5879 uint8_t *pos = list;
5880 struct {
5881 NvmeIdNsDescr hdr;
5882 uint8_t v[NVME_NIDL_UUID];
5883 } QEMU_PACKED uuid = {};
5884 struct {
5885 NvmeIdNsDescr hdr;
5886 uint8_t v[NVME_NIDL_NGUID];
5887 } QEMU_PACKED nguid = {};
5888 struct {
5889 NvmeIdNsDescr hdr;
5890 uint64_t v;
5891 } QEMU_PACKED eui64 = {};
5892 struct {
5893 NvmeIdNsDescr hdr;
5894 uint8_t v;
5895 } QEMU_PACKED csi = {};
5896
5897 trace_pci_nvme_identify_ns_descr_list(nsid);
5898
5899 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5900 return NVME_INVALID_NSID | NVME_DNR;
5901 }
5902
5903 ns = nvme_ns(n, nsid);
5904 if (unlikely(!ns)) {
5905 return NVME_INVALID_FIELD | NVME_DNR;
5906 }
5907
5908 if (!qemu_uuid_is_null(&ns->params.uuid)) {
5909 uuid.hdr.nidt = NVME_NIDT_UUID;
5910 uuid.hdr.nidl = NVME_NIDL_UUID;
5911 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
5912 memcpy(pos, &uuid, sizeof(uuid));
5913 pos += sizeof(uuid);
5914 }
5915
5916 if (!nvme_nguid_is_null(&ns->params.nguid)) {
5917 nguid.hdr.nidt = NVME_NIDT_NGUID;
5918 nguid.hdr.nidl = NVME_NIDL_NGUID;
5919 memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
5920 memcpy(pos, &nguid, sizeof(nguid));
5921 pos += sizeof(nguid);
5922 }
5923
5924 if (ns->params.eui64) {
5925 eui64.hdr.nidt = NVME_NIDT_EUI64;
5926 eui64.hdr.nidl = NVME_NIDL_EUI64;
5927 eui64.v = cpu_to_be64(ns->params.eui64);
5928 memcpy(pos, &eui64, sizeof(eui64));
5929 pos += sizeof(eui64);
5930 }
5931
5932 csi.hdr.nidt = NVME_NIDT_CSI;
5933 csi.hdr.nidl = NVME_NIDL_CSI;
5934 csi.v = ns->csi;
5935 memcpy(pos, &csi, sizeof(csi));
5936 pos += sizeof(csi);
5937
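/*
 * The remaining bytes of the zero-initialized 4096-byte buffer terminate the
 * descriptor list with a zeroed descriptor, as the spec requires.
 */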
5938 return nvme_c2h(n, list, sizeof(list), req);
5939 }
5940
5941 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
5942 {
5943 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5944 static const int data_len = sizeof(list);
5945
5946 trace_pci_nvme_identify_cmd_set();
5947
5948 NVME_SET_CSI(*list, NVME_CSI_NVM);
5949 NVME_SET_CSI(*list, NVME_CSI_ZONED);
5950
5951 return nvme_c2h(n, list, data_len, req);
5952 }
5953
5954 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
5955 {
5956 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5957
5958 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
5959 c->csi);
5960
5961 switch (c->cns) {
5962 case NVME_ID_CNS_NS:
5963 return nvme_identify_ns(n, req, true);
5964 case NVME_ID_CNS_NS_PRESENT:
5965 return nvme_identify_ns(n, req, false);
5966 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5967 return nvme_identify_ctrl_list(n, req, true);
5968 case NVME_ID_CNS_CTRL_LIST:
5969 return nvme_identify_ctrl_list(n, req, false);
5970 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
5971 return nvme_identify_pri_ctrl_cap(n, req);
5972 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
5973 return nvme_identify_sec_ctrl_list(n, req);
5974 case NVME_ID_CNS_CS_NS:
5975 return nvme_identify_ns_csi(n, req, true);
5976 case NVME_ID_CNS_CS_IND_NS:
5977 return nvme_identify_ns_ind(n, req, false);
5978 case NVME_ID_CNS_CS_IND_NS_ALLOCATED:
5979 return nvme_identify_ns_ind(n, req, true);
5980 case NVME_ID_CNS_CS_NS_PRESENT:
5981 return nvme_identify_ns_csi(n, req, false);
5982 case NVME_ID_CNS_CTRL:
5983 return nvme_identify_ctrl(n, req);
5984 case NVME_ID_CNS_CS_CTRL:
5985 return nvme_identify_ctrl_csi(n, req);
5986 case NVME_ID_CNS_NS_ACTIVE_LIST:
5987 return nvme_identify_nslist(n, req, true);
5988 case NVME_ID_CNS_NS_PRESENT_LIST:
5989 return nvme_identify_nslist(n, req, false);
5990 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
5991 return nvme_identify_nslist_csi(n, req, true);
5992 case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
5993 return nvme_endurance_group_list(n, req);
5994 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
5995 return nvme_identify_nslist_csi(n, req, false);
5996 case NVME_ID_CNS_NS_DESCR_LIST:
5997 return nvme_identify_ns_descr_list(n, req);
5998 case NVME_ID_CNS_IO_COMMAND_SET:
5999 return nvme_identify_cmd_set(n, req);
6000 default:
6001 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
6002 return NVME_INVALID_FIELD | NVME_DNR;
6003 }
6004 }
6005
6006 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
6007 {
6008 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
6009 uint16_t cid = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
6010 NvmeSQueue *sq = n->sq[sqid];
6011 NvmeRequest *r, *next;
6012 int i;
6013
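/*
 * Completion Dword 0 bit 0 set to 1 means the command was not aborted; it is
 * cleared below only when an outstanding AER is aborted directly. For other
 * commands the abort is best-effort via blk_aio_cancel_async().
 */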
6014 req->cqe.result = 1;
6015 if (nvme_check_sqid(n, sqid)) {
6016 return NVME_INVALID_FIELD | NVME_DNR;
6017 }
6018
6019 if (sqid == 0) {
6020 for (i = 0; i < n->outstanding_aers; i++) {
6021 NvmeRequest *re = n->aer_reqs[i];
6022 if (re->cqe.cid == cid) {
6023 memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
6024 (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
6025 n->outstanding_aers--;
6026 re->status = NVME_CMD_ABORT_REQ;
6027 req->cqe.result = 0;
6028 nvme_enqueue_req_completion(&n->admin_cq, re);
6029 return NVME_SUCCESS;
6030 }
6031 }
6032 }
6033
6034 QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
6035 if (r->cqe.cid == cid) {
6036 if (r->aiocb) {
6037 blk_aio_cancel_async(r->aiocb);
6038 }
6039 break;
6040 }
6041 }
6042
6043 return NVME_SUCCESS;
6044 }
6045
6046 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
6047 {
6048 trace_pci_nvme_setfeat_timestamp(ts);
6049
6050 n->host_timestamp = le64_to_cpu(ts);
6051 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6052 }
6053
6054 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6055 {
6056 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6057 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6058
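/*
 * Mirrors the Timestamp feature data structure: a 48-bit millisecond
 * timestamp plus the synch and origin attribute bits.
 */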
6059 union nvme_timestamp {
6060 struct {
6061 uint64_t timestamp:48;
6062 uint64_t sync:1;
6063 uint64_t origin:3;
6064 uint64_t rsvd1:12;
6065 };
6066 uint64_t all;
6067 };
6068
6069 union nvme_timestamp ts;
6070 ts.all = 0;
6071 ts.timestamp = n->host_timestamp + elapsed_time;
6072
6073 /* If the host timestamp is non-zero, set the timestamp origin */
6074 ts.origin = n->host_timestamp ? 0x01 : 0x00;
6075
6076 trace_pci_nvme_getfeat_timestamp(ts.all);
6077
6078 return cpu_to_le64(ts.all);
6079 }
6080
6081 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6082 {
6083 uint64_t timestamp = nvme_get_timestamp(n);
6084
6085 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6086 }
6087
6088 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6089 uint32_t *result)
6090 {
6091 *result = 0;
6092
6093 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6094 return NVME_INVALID_FIELD | NVME_DNR;
6095 }
6096
6097 *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6098 *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6099
6100 return NVME_SUCCESS;
6101 }
6102
6103 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6104 NvmeRequest *req, uint32_t *result)
6105 {
6106 NvmeCmd *cmd = &req->cmd;
6107 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6108 uint16_t ph = cdw11 & 0xffff;
6109 uint8_t noet = (cdw11 >> 16) & 0xff;
6110 uint16_t ruhid, ret;
6111 uint32_t nentries = 0;
6112 uint8_t s_events_ndx = 0;
6113 size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6114 g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6115 NvmeRuHandle *ruh;
6116 NvmeFdpEventDescr *s_event;
6117
6118 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6119 return NVME_FDP_DISABLED | NVME_DNR;
6120 }
6121
6122 if (!nvme_ph_valid(ns, ph)) {
6123 return NVME_INVALID_FIELD | NVME_DNR;
6124 }
6125
6126 ruhid = ns->fdp.phs[ph];
6127 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6128
6129 assert(ruh);
6130
6131 if (unlikely(noet == 0)) {
6132 return NVME_INVALID_FIELD | NVME_DNR;
6133 }
6134
6135 for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6136 uint8_t shift = nvme_fdp_evf_shifts[event_type];
6137 if (!shift && event_type) {
6138 /*
6139 * Only the first entry (event_type == 0) has a shift value of 0;
6140 * other entries are simply unpopulated.
6141 */
6142 continue;
6143 }
6144
6145 nentries++;
6146
6147 s_event = &s_events[s_events_ndx];
6148 s_event->evt = event_type;
6149 s_event->evta = (ruh->event_filter >> shift) & 0x1;
6150
6151 /* break if all `noet` entries are filled */
6152 if ((++s_events_ndx) == noet) {
6153 break;
6154 }
6155 }
6156
6157 ret = nvme_c2h(n, s_events, s_events_siz, req);
6158 if (ret) {
6159 return ret;
6160 }
6161
6162 *result = nentries;
6163 return NVME_SUCCESS;
6164 }
6165
6166 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6167 {
6168 NvmeCmd *cmd = &req->cmd;
6169 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6170 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6171 uint32_t nsid = le32_to_cpu(cmd->nsid);
6172 uint32_t result = 0;
6173 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6174 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6175 uint16_t iv;
6176 NvmeNamespace *ns;
6177 int i;
6178 uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6179
6180 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6181 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6182 };
6183
6184 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6185
6186 if (!nvme_feature_support[fid]) {
6187 return NVME_INVALID_FIELD | NVME_DNR;
6188 }
6189
6190 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6191 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6192 /*
6193 * The Reservation Notification Mask and Reservation Persistence
6194 * features require a status code of Invalid Field in Command when
6195 * NSID is FFFFFFFFh. Since the device does not support those
6196 * features we can always return Invalid Namespace or Format as we
6197 * should do for all other features.
6198 */
6199 return NVME_INVALID_NSID | NVME_DNR;
6200 }
6201
6202 if (!nvme_ns(n, nsid)) {
6203 return NVME_INVALID_FIELD | NVME_DNR;
6204 }
6205 }
6206
6207 switch (sel) {
6208 case NVME_GETFEAT_SELECT_CURRENT:
6209 break;
6210 case NVME_GETFEAT_SELECT_SAVED:
6211 /* no features are saveable by the controller; fallthrough */
6212 case NVME_GETFEAT_SELECT_DEFAULT:
6213 goto defaults;
6214 case NVME_GETFEAT_SELECT_CAP:
6215 result = nvme_feature_cap[fid];
6216 goto out;
6217 }
6218
6219 switch (fid) {
6220 case NVME_TEMPERATURE_THRESHOLD:
6221 result = 0;
6222
6223 /*
6224 * The controller only implements the Composite Temperature sensor, so
6225 * return 0 for all other sensors.
6226 */
6227 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6228 goto out;
6229 }
6230
6231 switch (NVME_TEMP_THSEL(dw11)) {
6232 case NVME_TEMP_THSEL_OVER:
6233 result = n->features.temp_thresh_hi;
6234 goto out;
6235 case NVME_TEMP_THSEL_UNDER:
6236 result = n->features.temp_thresh_low;
6237 goto out;
6238 }
6239
6240 return NVME_INVALID_FIELD | NVME_DNR;
6241 case NVME_ERROR_RECOVERY:
6242 if (!nvme_nsid_valid(n, nsid)) {
6243 return NVME_INVALID_NSID | NVME_DNR;
6244 }
6245
6246 ns = nvme_ns(n, nsid);
6247 if (unlikely(!ns)) {
6248 return NVME_INVALID_FIELD | NVME_DNR;
6249 }
6250
6251 result = ns->features.err_rec;
6252 goto out;
6253 case NVME_VOLATILE_WRITE_CACHE:
6254 result = 0;
6255 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6256 ns = nvme_ns(n, i);
6257 if (!ns) {
6258 continue;
6259 }
6260
6261 result = blk_enable_write_cache(ns->blkconf.blk);
6262 if (result) {
6263 break;
6264 }
6265 }
6266 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6267 goto out;
6268 case NVME_ASYNCHRONOUS_EVENT_CONF:
6269 result = n->features.async_config;
6270 goto out;
6271 case NVME_TIMESTAMP:
6272 return nvme_get_feature_timestamp(n, req);
6273 case NVME_HOST_BEHAVIOR_SUPPORT:
6274 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6275 sizeof(n->features.hbs), req);
6276 case NVME_FDP_MODE:
6277 endgrpid = dw11 & 0xff;
6278
6279 if (endgrpid != 0x1) {
6280 return NVME_INVALID_FIELD | NVME_DNR;
6281 }
6282
6283 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6284 if (ret) {
6285 return ret;
6286 }
6287 goto out;
6288 case NVME_FDP_EVENTS:
6289 if (!nvme_nsid_valid(n, nsid)) {
6290 return NVME_INVALID_NSID | NVME_DNR;
6291 }
6292
6293 ns = nvme_ns(n, nsid);
6294 if (unlikely(!ns)) {
6295 return NVME_INVALID_FIELD | NVME_DNR;
6296 }
6297
6298 ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6299 if (ret) {
6300 return ret;
6301 }
6302 goto out;
6303 default:
6304 break;
6305 }
6306
6307 defaults:
6308 switch (fid) {
6309 case NVME_TEMPERATURE_THRESHOLD:
6310 result = 0;
6311
6312 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6313 break;
6314 }
6315
6316 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6317 result = NVME_TEMPERATURE_WARNING;
6318 }
6319
6320 break;
6321 case NVME_NUMBER_OF_QUEUES:
6322 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6323 trace_pci_nvme_getfeat_numq(result);
6324 break;
6325 case NVME_INTERRUPT_VECTOR_CONF:
6326 iv = dw11 & 0xffff;
6327 if (iv >= n->conf_ioqpairs + 1) {
6328 return NVME_INVALID_FIELD | NVME_DNR;
6329 }
6330
6331 result = iv;
6332 if (iv == n->admin_cq.vector) {
6333 result |= NVME_INTVC_NOCOALESCING;
6334 }
6335 break;
6336 case NVME_FDP_MODE:
6337 endgrpid = dw11 & 0xff;
6338
6339 if (endgrpid != 0x1) {
6340 return NVME_INVALID_FIELD | NVME_DNR;
6341 }
6342
6343 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6344 if (ret) {
6345 return ret;
6346 }
6347 break;
6348
6349 case NVME_WRITE_ATOMICITY:
6350 result = n->dn;
6351 break;
6352 default:
6353 result = nvme_feature_default[fid];
6354 break;
6355 }
6356
6357 out:
6358 req->cqe.result = cpu_to_le32(result);
6359 return ret;
6360 }
6361
6362 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6363 {
6364 uint16_t ret;
6365 uint64_t timestamp;
6366
6367 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6368 if (ret) {
6369 return ret;
6370 }
6371
6372 nvme_set_timestamp(n, timestamp);
6373
6374 return NVME_SUCCESS;
6375 }
6376
6377 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6378 NvmeRequest *req)
6379 {
6380 NvmeCmd *cmd = &req->cmd;
6381 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6382 uint16_t ph = cdw11 & 0xffff;
6383 uint8_t noet = (cdw11 >> 16) & 0xff;
6384 uint16_t ret, ruhid;
6385 uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6386 uint8_t event_mask = 0;
6387 unsigned int i;
6388 g_autofree uint8_t *events = g_malloc0(noet);
6389 NvmeRuHandle *ruh = NULL;
6390
6391 assert(ns);
6392
6393 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6394 return NVME_FDP_DISABLED | NVME_DNR;
6395 }
6396
6397 if (!nvme_ph_valid(ns, ph)) {
6398 return NVME_INVALID_FIELD | NVME_DNR;
6399 }
6400
6401 ruhid = ns->fdp.phs[ph];
6402 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6403
6404 ret = nvme_h2c(n, events, noet, req);
6405 if (ret) {
6406 return ret;
6407 }
6408
6409 for (i = 0; i < noet; i++) {
6410 event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6411 }
6412
6413 if (enable) {
6414 ruh->event_filter |= event_mask;
6415 } else {
6416 ruh->event_filter = ruh->event_filter & ~event_mask;
6417 }
6418
6419 return NVME_SUCCESS;
6420 }
6421
6422 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6423 {
6424 NvmeNamespace *ns = NULL;
6425
6426 NvmeCmd *cmd = &req->cmd;
6427 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6428 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6429 uint32_t nsid = le32_to_cpu(cmd->nsid);
6430 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6431 uint8_t save = NVME_SETFEAT_SAVE(dw10);
6432 uint16_t status;
6433 int i;
6434 NvmeIdCtrl *id = &n->id_ctrl;
6435 NvmeAtomic *atomic = &n->atomic;
6436
6437 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6438
6439 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6440 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6441 }
6442
6443 if (!nvme_feature_support[fid]) {
6444 return NVME_INVALID_FIELD | NVME_DNR;
6445 }
6446
6447 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6448 if (nsid != NVME_NSID_BROADCAST) {
6449 if (!nvme_nsid_valid(n, nsid)) {
6450 return NVME_INVALID_NSID | NVME_DNR;
6451 }
6452
6453 ns = nvme_ns(n, nsid);
6454 if (unlikely(!ns)) {
6455 return NVME_INVALID_FIELD | NVME_DNR;
6456 }
6457 }
6458 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6459 if (!nvme_nsid_valid(n, nsid)) {
6460 return NVME_INVALID_NSID | NVME_DNR;
6461 }
6462
6463 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6464 }
6465
6466 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6467 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6468 }
6469
6470 switch (fid) {
6471 case NVME_TEMPERATURE_THRESHOLD:
6472 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6473 break;
6474 }
6475
6476 switch (NVME_TEMP_THSEL(dw11)) {
6477 case NVME_TEMP_THSEL_OVER:
6478 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6479 break;
6480 case NVME_TEMP_THSEL_UNDER:
6481 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6482 break;
6483 default:
6484 return NVME_INVALID_FIELD | NVME_DNR;
6485 }
6486
6487 if ((n->temperature >= n->features.temp_thresh_hi) ||
6488 (n->temperature <= n->features.temp_thresh_low)) {
6489 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6490 }
6491
6492 break;
6493 case NVME_ERROR_RECOVERY:
6494 if (nsid == NVME_NSID_BROADCAST) {
6495 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6496 ns = nvme_ns(n, i);
6497
6498 if (!ns) {
6499 continue;
6500 }
6501
6502 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6503 ns->features.err_rec = dw11;
6504 }
6505 }
6506
6507 break;
6508 }
6509
6510 assert(ns);
6511 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6512 ns->features.err_rec = dw11;
6513 }
6514 break;
6515 case NVME_VOLATILE_WRITE_CACHE:
6516 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6517 ns = nvme_ns(n, i);
6518 if (!ns) {
6519 continue;
6520 }
6521
6522 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6523 blk_flush(ns->blkconf.blk);
6524 }
6525
6526 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6527 }
6528
6529 break;
6530
6531 case NVME_NUMBER_OF_QUEUES:
6532 if (n->qs_created) {
6533 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6534 }
6535
6536 /*
6537 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6538 * and NSQR.
6539 */
6540 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6541 return NVME_INVALID_FIELD | NVME_DNR;
6542 }
6543
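/*
 * NSQA/NCQA in completion dword 0 are 0-based. The allocation stays fixed at
 * n->conf_ioqpairs regardless of the number of queues requested above.
 */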
6544 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6545 ((dw11 >> 16) & 0xffff) + 1,
6546 n->conf_ioqpairs,
6547 n->conf_ioqpairs);
6548 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6549 ((n->conf_ioqpairs - 1) << 16));
6550 break;
6551 case NVME_ASYNCHRONOUS_EVENT_CONF:
6552 n->features.async_config = dw11;
6553 break;
6554 case NVME_TIMESTAMP:
6555 return nvme_set_feature_timestamp(n, req);
6556 case NVME_HOST_BEHAVIOR_SUPPORT:
6557 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6558 sizeof(n->features.hbs), req);
6559 if (status) {
6560 return status;
6561 }
6562
6563 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6564 ns = nvme_ns(n, i);
6565
6566 if (!ns) {
6567 continue;
6568 }
6569
6570 ns->id_ns.nlbaf = ns->nlbaf - 1;
6571 if (!n->features.hbs.lbafee) {
6572 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6573 }
6574 }
6575
6576 return status;
6577 case NVME_COMMAND_SET_PROFILE:
6578 if (dw11 & 0x1ff) {
6579 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6580 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
6581 }
6582 break;
6583 case NVME_FDP_MODE:
6584 /* spec: abort with Command Sequence Error if the endurance group has one or more namespaces */
6585 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6586 case NVME_FDP_EVENTS:
6587 return nvme_set_feature_fdp_events(n, ns, req);
6588 case NVME_WRITE_ATOMICITY:
6589
6590 n->dn = 0x1 & dw11;
6591
6592 if (n->dn) {
6593 atomic->atomic_max_write_size = le16_to_cpu(id->awupf) + 1;
6594 } else {
6595 atomic->atomic_max_write_size = le16_to_cpu(id->awun) + 1;
6596 }
6597
6598 if (atomic->atomic_max_write_size == 1) {
6599 atomic->atomic_writes = 0;
6600 } else {
6601 atomic->atomic_writes = 1;
6602 }
6603 break;
6604 default:
6605 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6606 }
6607 return NVME_SUCCESS;
6608 }
6609
6610 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6611 {
6612 trace_pci_nvme_aer(nvme_cid(req));
6613
6614 if (n->outstanding_aers > n->params.aerl) {
6615 trace_pci_nvme_aer_aerl_exceeded();
6616 return NVME_AER_LIMIT_EXCEEDED;
6617 }
6618
6619 n->aer_reqs[n->outstanding_aers] = req;
6620 n->outstanding_aers++;
6621
6622 if (!QTAILQ_EMPTY(&n->aer_queue)) {
6623 nvme_process_aers(n);
6624 }
6625
6626 return NVME_NO_COMPLETE;
6627 }
6628
6629 static void nvme_update_dmrsl(NvmeCtrl *n)
6630 {
6631 int nsid;
6632
6633 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6634 NvmeNamespace *ns = nvme_ns(n, nsid);
6635 if (!ns) {
6636 continue;
6637 }
6638
6639 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6640 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6641 }
6642 }
6643
6644 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
6645 {
6646 uint32_t cc = ldl_le_p(&n->bar.cc);
6647
6648 ns->iocs = nvme_cse_iocs_none;
6649 switch (ns->csi) {
6650 case NVME_CSI_NVM:
6651 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
6652 ns->iocs = nvme_cse_iocs_nvm;
6653 }
6654 break;
6655 case NVME_CSI_ZONED:
6656 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
6657 ns->iocs = nvme_cse_iocs_zoned;
6658 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
6659 ns->iocs = nvme_cse_iocs_nvm;
6660 }
6661 break;
6662 }
6663 }
6664
6665 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6666 {
6667 NvmeNamespace *ns;
6668 NvmeCtrl *ctrl;
6669 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6670 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6671 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6672 uint8_t sel = dw10 & 0xf;
6673 uint16_t *nr_ids = &list[0];
6674 uint16_t *ids = &list[1];
6675 uint16_t ret;
6676 int i;
6677
6678 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6679
6680 if (!nvme_nsid_valid(n, nsid)) {
6681 return NVME_INVALID_NSID | NVME_DNR;
6682 }
6683
6684 ns = nvme_subsys_ns(n->subsys, nsid);
6685 if (!ns) {
6686 return NVME_INVALID_FIELD | NVME_DNR;
6687 }
6688
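/*
 * The payload is a 4096-byte Controller List: a 16-bit count followed by up
 * to 2047 controller identifiers.
 */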
6689 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6690 if (ret) {
6691 return ret;
6692 }
6693
6694 if (!*nr_ids) {
6695 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6696 }
6697
6698 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6699 for (i = 0; i < *nr_ids; i++) {
6700 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6701 if (!ctrl) {
6702 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6703 }
6704
6705 switch (sel) {
6706 case NVME_NS_ATTACHMENT_ATTACH:
6707 if (nvme_ns(ctrl, nsid)) {
6708 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6709 }
6710
6711 if (ns->attached && !ns->params.shared) {
6712 return NVME_NS_PRIVATE | NVME_DNR;
6713 }
6714
6715 nvme_attach_ns(ctrl, ns);
6716 nvme_select_iocs_ns(ctrl, ns);
6717
6718 break;
6719
6720 case NVME_NS_ATTACHMENT_DETACH:
6721 if (!nvme_ns(ctrl, nsid)) {
6722 return NVME_NS_NOT_ATTACHED | NVME_DNR;
6723 }
6724
6725 ctrl->namespaces[nsid] = NULL;
6726 ns->attached--;
6727
6728 nvme_update_dmrsl(ctrl);
6729
6730 break;
6731
6732 default:
6733 return NVME_INVALID_FIELD | NVME_DNR;
6734 }
6735
6736 /*
6737 * Add namespace id to the changed namespace id list for event clearing
6738 * via Get Log Page command.
6739 */
6740 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6741 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6742 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6743 NVME_LOG_CHANGED_NSLIST);
6744 }
6745 }
6746
6747 return NVME_SUCCESS;
6748 }
6749
6750 typedef struct NvmeFormatAIOCB {
6751 BlockAIOCB common;
6752 BlockAIOCB *aiocb;
6753 NvmeRequest *req;
6754 int ret;
6755
6756 NvmeNamespace *ns;
6757 uint32_t nsid;
6758 bool broadcast;
6759 int64_t offset;
6760
6761 uint8_t lbaf;
6762 uint8_t mset;
6763 uint8_t pi;
6764 uint8_t pil;
6765 } NvmeFormatAIOCB;
6766
6767 static void nvme_format_cancel(BlockAIOCB *aiocb)
6768 {
6769 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6770
6771 iocb->ret = -ECANCELED;
6772
6773 if (iocb->aiocb) {
6774 blk_aio_cancel_async(iocb->aiocb);
6775 iocb->aiocb = NULL;
6776 }
6777 }
6778
6779 static const AIOCBInfo nvme_format_aiocb_info = {
6780 .aiocb_size = sizeof(NvmeFormatAIOCB),
6781 .cancel_async = nvme_format_cancel,
6782 };
6783
6784 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6785 uint8_t pi, uint8_t pil)
6786 {
6787 uint8_t lbafl = lbaf & 0xf;
6788 uint8_t lbafu = lbaf >> 4;
6789
6790 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6791
6792 ns->id_ns.dps = (pil << 3) | pi;
6793 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6794
6795 nvme_ns_init_format(ns);
6796 }
6797
6798 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6799
6800 static void nvme_format_ns_cb(void *opaque, int ret)
6801 {
6802 NvmeFormatAIOCB *iocb = opaque;
6803 NvmeNamespace *ns = iocb->ns;
6804 int bytes;
6805
6806 if (iocb->ret < 0) {
6807 goto done;
6808 } else if (ret < 0) {
6809 iocb->ret = ret;
6810 goto done;
6811 }
6812
6813 assert(ns);
6814
6815 if (iocb->offset < ns->size) {
6816 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6817
6818 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6819 bytes, BDRV_REQ_MAY_UNMAP,
6820 nvme_format_ns_cb, iocb);
6821
6822 iocb->offset += bytes;
6823 return;
6824 }
6825
6826 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6827 ns->status = 0x0;
6828 iocb->ns = NULL;
6829 iocb->offset = 0;
6830
6831 done:
6832 nvme_do_format(iocb);
6833 }
6834
6835 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6836 {
6837 if (ns->params.zoned) {
6838 return NVME_INVALID_FORMAT | NVME_DNR;
6839 }
6840
6841 if (lbaf > ns->id_ns.nlbaf) {
6842 return NVME_INVALID_FORMAT | NVME_DNR;
6843 }
6844
6845 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6846 return NVME_INVALID_FORMAT | NVME_DNR;
6847 }
6848
6849 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6850 return NVME_INVALID_FIELD | NVME_DNR;
6851 }
6852
6853 return NVME_SUCCESS;
6854 }
6855
6856 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6857 {
6858 NvmeRequest *req = iocb->req;
6859 NvmeCtrl *n = nvme_ctrl(req);
6860 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6861 uint8_t lbaf = dw10 & 0xf;
6862 uint8_t pi = (dw10 >> 5) & 0x7;
6863 uint16_t status;
6864 int i;
6865
6866 if (iocb->ret < 0) {
6867 goto done;
6868 }
6869
6870 if (iocb->broadcast) {
6871 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6872 iocb->ns = nvme_ns(n, i);
6873 if (iocb->ns) {
6874 iocb->nsid = i;
6875 break;
6876 }
6877 }
6878 }
6879
6880 if (!iocb->ns) {
6881 goto done;
6882 }
6883
6884 status = nvme_format_check(iocb->ns, lbaf, pi);
6885 if (status) {
6886 req->status = status;
6887 goto done;
6888 }
6889
6890 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
6891 nvme_format_ns_cb(iocb, 0);
6892 return;
6893
6894 done:
6895 iocb->common.cb(iocb->common.opaque, iocb->ret);
6896 qemu_aio_unref(iocb);
6897 }
6898
6899 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
6900 {
6901 NvmeFormatAIOCB *iocb;
6902 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6903 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6904 uint8_t lbaf = dw10 & 0xf;
6905 uint8_t mset = (dw10 >> 4) & 0x1;
6906 uint8_t pi = (dw10 >> 5) & 0x7;
6907 uint8_t pil = (dw10 >> 8) & 0x1;
6908 uint8_t lbafu = (dw10 >> 12) & 0x3;
6909 uint16_t status;
6910
6911 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
6912
6913 iocb->req = req;
6914 iocb->ret = 0;
6915 iocb->ns = NULL;
6916 iocb->nsid = 0;
6917 iocb->lbaf = lbaf;
6918 iocb->mset = mset;
6919 iocb->pi = pi;
6920 iocb->pil = pil;
6921 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
6922 iocb->offset = 0;
6923
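/*
 * Bits 13:12 of dword 10 hold the upper bits of the LBA format index; they
 * are only honored when LBA Format Extension is enabled via the Host
 * Behavior Support feature.
 */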
6924 if (n->features.hbs.lbafee) {
6925 iocb->lbaf |= lbafu << 4;
6926 }
6927
6928 if (!iocb->broadcast) {
6929 if (!nvme_nsid_valid(n, nsid)) {
6930 status = NVME_INVALID_NSID | NVME_DNR;
6931 goto out;
6932 }
6933
6934 iocb->ns = nvme_ns(n, nsid);
6935 if (!iocb->ns) {
6936 status = NVME_INVALID_FIELD | NVME_DNR;
6937 goto out;
6938 }
6939 }
6940
6941 req->aiocb = &iocb->common;
6942 nvme_do_format(iocb);
6943
6944 return NVME_NO_COMPLETE;
6945
6946 out:
6947 qemu_aio_unref(iocb);
6948
6949 return status;
6950 }
6951
6952 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
6953 int *num_prim, int *num_sec)
6954 {
6955 *num_total = le32_to_cpu(rt ?
6956 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
6957 *num_prim = le16_to_cpu(rt ?
6958 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
6959 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
6960 }
6961
6962 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
6963 uint16_t cntlid, uint8_t rt,
6964 int nr)
6965 {
6966 int num_total, num_prim, num_sec;
6967
6968 if (cntlid != n->cntlid) {
6969 return NVME_INVALID_CTRL_ID | NVME_DNR;
6970 }
6971
6972 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6973
6974 if (nr > num_total) {
6975 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6976 }
6977
6978 if (nr > num_total - num_sec) {
6979 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6980 }
6981
6982 if (rt) {
6983 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
6984 } else {
6985 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
6986 }
6987
6988 req->cqe.result = cpu_to_le32(nr);
6989 return req->status;
6990 }
6991
6992 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
6993 uint8_t rt, int nr)
6994 {
6995 int prev_nr, prev_total;
6996
6997 if (rt) {
6998 prev_nr = le16_to_cpu(sctrl->nvi);
6999 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
7000 sctrl->nvi = cpu_to_le16(nr);
7001 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
7002 } else {
7003 prev_nr = le16_to_cpu(sctrl->nvq);
7004 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
7005 sctrl->nvq = cpu_to_le16(nr);
7006 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
7007 }
7008 }
7009
7010 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
7011 uint16_t cntlid, uint8_t rt, int nr)
7012 {
7013 int num_total, num_prim, num_sec, num_free, diff, limit;
7014 NvmeSecCtrlEntry *sctrl;
7015
7016 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7017 if (!sctrl) {
7018 return NVME_INVALID_CTRL_ID | NVME_DNR;
7019 }
7020
7021 if (sctrl->scs) {
7022 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7023 }
7024
7025 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
7026 if (nr > limit) {
7027 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7028 }
7029
7030 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7031 num_free = num_total - num_prim - num_sec;
7032 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
7033
7034 if (diff > num_free) {
7035 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7036 }
7037
7038 nvme_update_virt_res(n, sctrl, rt, nr);
7039 req->cqe.result = cpu_to_le32(nr);
7040
7041 return req->status;
7042 }
7043
7044 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
7045 {
7046 PCIDevice *pci = PCI_DEVICE(n);
7047 NvmeCtrl *sn = NULL;
7048 NvmeSecCtrlEntry *sctrl;
7049 int vf_index;
7050
7051 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7052 if (!sctrl) {
7053 return NVME_INVALID_CTRL_ID | NVME_DNR;
7054 }
7055
7056 if (!pci_is_vf(pci)) {
7057 vf_index = le16_to_cpu(sctrl->vfn) - 1;
7058 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
7059 }
7060
7061 if (online) {
7062 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
7063 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7064 }
7065
7066 if (!sctrl->scs) {
7067 sctrl->scs = 0x1;
7068 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7069 }
7070 } else {
7071 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7072 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7073
7074 if (sctrl->scs) {
7075 sctrl->scs = 0x0;
7076 if (sn) {
7077 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7078 }
7079 }
7080 }
7081
7082 return NVME_SUCCESS;
7083 }
7084
7085 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7086 {
7087 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7088 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7089 uint8_t act = dw10 & 0xf;
7090 uint8_t rt = (dw10 >> 8) & 0x7;
7091 uint16_t cntlid = (dw10 >> 16) & 0xffff;
7092 int nr = dw11 & 0xffff;
7093
7094 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7095
7096 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7097 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7098 }
7099
7100 switch (act) {
7101 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7102 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7103 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7104 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7105 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7106 return nvme_virt_set_state(n, cntlid, true);
7107 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7108 return nvme_virt_set_state(n, cntlid, false);
7109 default:
7110 return NVME_INVALID_FIELD | NVME_DNR;
7111 }
7112 }
7113
7114 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7115 {
7116 PCIDevice *pci = PCI_DEVICE(n);
7117 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7118 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7119 int i;
7120
7121 /* Address should be page aligned */
7122 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7123 return NVME_INVALID_FIELD | NVME_DNR;
7124 }
7125
7126 /* Save shadow buffer base addr for use during queue creation */
7127 n->dbbuf_dbs = dbs_addr;
7128 n->dbbuf_eis = eis_addr;
7129 n->dbbuf_enabled = true;
7130
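/*
 * Point any already-created queues at their shadow doorbell/eventidx slots
 * and seed the shadow values with the current tail/head via DMA.
 */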
7131 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7132 NvmeSQueue *sq = n->sq[i];
7133 NvmeCQueue *cq = n->cq[i];
7134
7135 if (sq) {
7136 /*
7137 * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
7138 * nvme_process_db() uses this hard-coded way to calculate
7139 * doorbell offsets. Be consistent with that here.
7140 */
7141 sq->db_addr = dbs_addr + (i << 3);
7142 sq->ei_addr = eis_addr + (i << 3);
7143 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7144
7145 if (n->params.ioeventfd && sq->sqid != 0) {
7146 if (!nvme_init_sq_ioeventfd(sq)) {
7147 sq->ioeventfd_enabled = true;
7148 }
7149 }
7150 }
7151
7152 if (cq) {
7153 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7154 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7155 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7156 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7157
7158 if (n->params.ioeventfd && cq->cqid != 0) {
7159 if (!nvme_init_cq_ioeventfd(cq)) {
7160 cq->ioeventfd_enabled = true;
7161 }
7162 }
7163 }
7164 }
7165
7166 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7167
7168 return NVME_SUCCESS;
7169 }
7170
7171 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7172 {
7173 return NVME_INVALID_FIELD | NVME_DNR;
7174 }
7175
7176 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7177 {
7178 NvmeNamespace *ns;
7179 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7180 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7181 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7182 uint8_t doper, dtype;
7183 uint32_t numd, trans_len;
7184 NvmeDirectiveIdentify id = {
7185 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7186 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7187 };
7188
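/* NUMD is a 0-based dword count, so the transfer length is (NUMD + 1) * 4 bytes. */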
7189 numd = dw10 + 1;
7190 doper = dw11 & 0xff;
7191 dtype = (dw11 >> 8) & 0xff;
7192
7193 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7194
7195 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7196 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7197 return NVME_INVALID_FIELD | NVME_DNR;
7198 }
7199
7200 ns = nvme_ns(n, nsid);
7201 if (!ns) {
7202 return NVME_INVALID_FIELD | NVME_DNR;
7203 }
7204
7205 switch (dtype) {
7206 case NVME_DIRECTIVE_IDENTIFY:
7207 switch (doper) {
7208 case NVME_DIRECTIVE_RETURN_PARAMS:
7209 if (ns->endgrp && ns->endgrp->fdp.enabled) {
7210 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7211 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7212 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7213 }
7214
7215 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7216
7217 default:
7218 return NVME_INVALID_FIELD | NVME_DNR;
7219 }
7220
7221 default:
7222 return NVME_INVALID_FIELD;
7223 }
7224 }
7225
7226 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7227 {
7228 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7229 nvme_adm_opc_str(req->cmd.opcode));
7230
7231 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7232 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7233 return NVME_INVALID_OPCODE | NVME_DNR;
7234 }
7235
7236 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7237 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7238 return NVME_INVALID_FIELD | NVME_DNR;
7239 }
7240
7241 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7242 return NVME_INVALID_FIELD;
7243 }
7244
7245 switch (req->cmd.opcode) {
7246 case NVME_ADM_CMD_DELETE_SQ:
7247 return nvme_del_sq(n, req);
7248 case NVME_ADM_CMD_CREATE_SQ:
7249 return nvme_create_sq(n, req);
7250 case NVME_ADM_CMD_GET_LOG_PAGE:
7251 return nvme_get_log(n, req);
7252 case NVME_ADM_CMD_DELETE_CQ:
7253 return nvme_del_cq(n, req);
7254 case NVME_ADM_CMD_CREATE_CQ:
7255 return nvme_create_cq(n, req);
7256 case NVME_ADM_CMD_IDENTIFY:
7257 return nvme_identify(n, req);
7258 case NVME_ADM_CMD_ABORT:
7259 return nvme_abort(n, req);
7260 case NVME_ADM_CMD_SET_FEATURES:
7261 return nvme_set_feature(n, req);
7262 case NVME_ADM_CMD_GET_FEATURES:
7263 return nvme_get_feature(n, req);
7264 case NVME_ADM_CMD_ASYNC_EV_REQ:
7265 return nvme_aer(n, req);
7266 case NVME_ADM_CMD_NS_ATTACHMENT:
7267 return nvme_ns_attachment(n, req);
7268 case NVME_ADM_CMD_VIRT_MNGMT:
7269 return nvme_virt_mngmt(n, req);
7270 case NVME_ADM_CMD_DBBUF_CONFIG:
7271 return nvme_dbbuf_config(n, req);
7272 case NVME_ADM_CMD_FORMAT_NVM:
7273 return nvme_format(n, req);
7274 case NVME_ADM_CMD_DIRECTIVE_SEND:
7275 return nvme_directive_send(n, req);
7276 case NVME_ADM_CMD_DIRECTIVE_RECV:
7277 return nvme_directive_receive(n, req);
7278 default:
7279 g_assert_not_reached();
7280 }
7281
7282 return NVME_INVALID_OPCODE | NVME_DNR;
7283 }
7284
7285 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7286 {
7287 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7288
7289 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7290 MEMTXATTRS_UNSPECIFIED);
7291 }
7292
7293 static void nvme_update_sq_tail(NvmeSQueue *sq)
7294 {
7295 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7296 MEMTXATTRS_UNSPECIFIED);
7297
7298 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7299 }
7300
7301 #define NVME_ATOMIC_NO_START 0
7302 #define NVME_ATOMIC_START_ATOMIC 1
7303 #define NVME_ATOMIC_START_NONATOMIC 2
7304
7305 static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd,
7306 NvmeAtomic *atomic)
7307 {
7308 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
7309 uint64_t slba = le64_to_cpu(rw->slba);
7310 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb);
7311 uint64_t elba = slba + nlb;
7312 bool cmd_atomic_wr = true;
7313 int i;
7314
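/*
 * NLB is 0-based, so elba is the last LBA touched by the command. Reads, and
 * writes larger than the current atomic write limit, are checked as
 * non-atomic commands.
 */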
7315 if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) &&
7316 ((rw->nlb + 1) > atomic->atomic_max_write_size))) {
7317 cmd_atomic_wr = false;
7318 }
7319
7320 /*
7321 * Walk the queues to see if there are any atomic conflicts.
7322 */
7323 for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
7324 NvmeSQueue *sq;
7325 NvmeRequest *req;
7326 NvmeRwCmd *req_rw;
7327 uint64_t req_slba;
7328 uint32_t req_nlb;
7329 uint64_t req_elba;
7330
7331 sq = n->sq[i];
7332 if (!sq) {
7333 continue;
7334 }
7335
7336 /*
7337 * Walk all the requests on a given queue.
7338 */
7339 QTAILQ_FOREACH(req, &sq->out_req_list, entry) {
7340 req_rw = (NvmeRwCmd *)&req->cmd;
7341
7342 if (((req_rw->opcode == NVME_CMD_WRITE) ||
7343 (req_rw->opcode == NVME_CMD_READ)) &&
7344 (cmd->nsid == req->ns->params.nsid)) {
7345 req_slba = le64_to_cpu(req_rw->slba);
7346 req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb);
7347 req_elba = req_slba + req_nlb;
7348
7349 if (cmd_atomic_wr) {
7350 if ((elba >= req_slba) && (slba <= req_elba)) {
7351 return NVME_ATOMIC_NO_START;
7352 }
7353 } else {
7354 if (req->atomic_write && ((elba >= req_slba) &&
7355 (slba <= req_elba))) {
7356 return NVME_ATOMIC_NO_START;
7357 }
7358 }
7359 }
7360 }
7361 }
7362 if (cmd_atomic_wr) {
7363 return NVME_ATOMIC_START_ATOMIC;
7364 }
7365 return NVME_ATOMIC_START_NONATOMIC;
7366 }
7367
7368 static NvmeAtomic *nvme_get_atomic(NvmeCtrl *n, NvmeCmd *cmd)
7369 {
7370 if (n->atomic.atomic_writes) {
7371 return &n->atomic;
7372 }
7373 return NULL;
7374 }
7375
7376 static void nvme_process_sq(void *opaque)
7377 {
7378 NvmeSQueue *sq = opaque;
7379 NvmeCtrl *n = sq->ctrl;
7380 NvmeCQueue *cq = n->cq[sq->cqid];
7381
7382 uint16_t status;
7383 hwaddr addr;
7384 NvmeCmd cmd;
7385 NvmeRequest *req;
7386
7387 if (n->dbbuf_enabled) {
7388 nvme_update_sq_tail(sq);
7389 }
7390
7391 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7392 NvmeAtomic *atomic;
7393 bool cmd_is_atomic;
7394
7395 addr = sq->dma_addr + (sq->head << NVME_SQES);
7396 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7397 trace_pci_nvme_err_addr_read(addr);
7398 trace_pci_nvme_err_cfs();
7399 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7400 break;
7401 }
7402
7403 atomic = nvme_get_atomic(n, &cmd);
7404
7405 cmd_is_atomic = false;
7406 if (sq->sqid && atomic) {
7407 int ret;
7408
7409 ret = nvme_atomic_write_check(n, &cmd, atomic);
7410 switch (ret) {
7411 case NVME_ATOMIC_NO_START:
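/*
 * The head pointer was not advanced, so the conflicting command is
 * re-fetched the next time the bottom half runs.
 */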
7412 qemu_bh_schedule(sq->bh);
7413 return;
7414 case NVME_ATOMIC_START_ATOMIC:
7415 cmd_is_atomic = true;
7416 break;
7417 case NVME_ATOMIC_START_NONATOMIC:
7418 default:
7419 break;
7420 }
7421 }
7422 nvme_inc_sq_head(sq);
7423
7424 req = QTAILQ_FIRST(&sq->req_list);
7425 QTAILQ_REMOVE(&sq->req_list, req, entry);
7426 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7427 nvme_req_clear(req);
7428 req->cqe.cid = cmd.cid;
7429 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7430
7431 if (sq->sqid && atomic) {
7432 req->atomic_write = cmd_is_atomic;
7433 }
7434
7435 status = sq->sqid ? nvme_io_cmd(n, req) :
7436 nvme_admin_cmd(n, req);
7437 if (status != NVME_NO_COMPLETE) {
7438 req->status = status;
7439 nvme_enqueue_req_completion(cq, req);
7440 }
7441
7442 if (n->dbbuf_enabled) {
7443 nvme_update_sq_eventidx(sq);
7444 nvme_update_sq_tail(sq);
7445 }
7446 }
7447 }
7448
7449 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7450 {
7451 uint8_t *config;
7452
7453 if (!msix_present(pci_dev)) {
7454 return;
7455 }
7456
7457 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7458
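/* The MSI-X Table Size field in Message Control is encoded as N - 1. */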
7459 config = pci_dev->config + pci_dev->msix_cap;
7460 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7461 table_size - 1);
7462 }
7463
7464 static void nvme_activate_virt_res(NvmeCtrl *n)
7465 {
7466 PCIDevice *pci_dev = PCI_DEVICE(n);
7467 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7468 NvmeSecCtrlEntry *sctrl;
7469
7470 /* -1 to account for the admin queue */
7471 if (pci_is_vf(pci_dev)) {
7472 sctrl = nvme_sctrl(n);
7473 cap->vqprt = sctrl->nvq;
7474 cap->viprt = sctrl->nvi;
7475 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7476 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7477 } else {
7478 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7479 cap->virfap = n->next_pri_ctrl_cap.virfap;
7480 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7481 le16_to_cpu(cap->vqrfap) - 1;
7482 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7483 le16_to_cpu(cap->virfap);
7484 }
7485 }
7486
7487 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7488 {
7489 PCIDevice *pci_dev = PCI_DEVICE(n);
7490 NvmeSecCtrlEntry *sctrl;
7491 NvmeNamespace *ns;
7492 int i;
7493
7494 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7495 ns = nvme_ns(n, i);
7496 if (!ns) {
7497 continue;
7498 }
7499
7500 nvme_ns_drain(ns);
7501 }
7502
7503 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7504 if (n->sq[i] != NULL) {
7505 nvme_free_sq(n->sq[i], n);
7506 }
7507 }
7508 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7509 if (n->cq[i] != NULL) {
7510 nvme_free_cq(n->cq[i], n);
7511 }
7512 }
7513
7514 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7515 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7516 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7517 g_free(event);
7518 }
7519
7520 if (n->params.sriov_max_vfs) {
7521 if (!pci_is_vf(pci_dev)) {
7522 for (i = 0; i < n->nr_sec_ctrls; i++) {
7523 sctrl = &n->sec_ctrl_list[i];
7524 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7525 }
7526 }
7527
7528 if (rst != NVME_RESET_CONTROLLER) {
7529 nvme_activate_virt_res(n);
7530 }
7531 }
7532
7533 n->aer_queued = 0;
7534 n->aer_mask = 0;
7535 n->outstanding_aers = 0;
7536 n->qs_created = false;
7537
7538 n->dn = n->params.atomic_dn; /* Set Disable Normal */
7539
7540 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7541
7542 if (pci_is_vf(pci_dev)) {
7543 sctrl = nvme_sctrl(n);
7544
7545 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7546 } else {
7547 stl_le_p(&n->bar.csts, 0);
7548 }
7549
7550 stl_le_p(&n->bar.intms, 0);
7551 stl_le_p(&n->bar.intmc, 0);
7552 stl_le_p(&n->bar.cc, 0);
7553
7554 n->dbbuf_dbs = 0;
7555 n->dbbuf_eis = 0;
7556 n->dbbuf_enabled = false;
7557 }
7558
7559 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7560 {
7561 NvmeNamespace *ns;
7562 int i;
7563
7564 if (n->pmr.dev) {
7565 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7566 }
7567
7568 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7569 ns = nvme_ns(n, i);
7570 if (!ns) {
7571 continue;
7572 }
7573
7574 nvme_ns_shutdown(ns);
7575 }
7576 }
7577
7578 static void nvme_select_iocs(NvmeCtrl *n)
7579 {
7580 NvmeNamespace *ns;
7581 int i;
7582
7583 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7584 ns = nvme_ns(n, i);
7585 if (!ns) {
7586 continue;
7587 }
7588
7589 nvme_select_iocs_ns(n, ns);
7590 }
7591 }
7592
7593 static int nvme_start_ctrl(NvmeCtrl *n)
7594 {
7595 uint64_t cap = ldq_le_p(&n->bar.cap);
7596 uint32_t cc = ldl_le_p(&n->bar.cc);
7597 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7598 uint64_t asq = ldq_le_p(&n->bar.asq);
7599 uint64_t acq = ldq_le_p(&n->bar.acq);
7600 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7601 uint32_t page_size = 1 << page_bits;
7602 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7603
7604 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7605 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7606 le16_to_cpu(sctrl->nvq));
7607 return -1;
7608 }
7609 if (unlikely(n->cq[0])) {
7610 trace_pci_nvme_err_startfail_cq();
7611 return -1;
7612 }
7613 if (unlikely(n->sq[0])) {
7614 trace_pci_nvme_err_startfail_sq();
7615 return -1;
7616 }
7617 if (unlikely(asq & (page_size - 1))) {
7618 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7619 return -1;
7620 }
7621 if (unlikely(acq & (page_size - 1))) {
7622 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7623 return -1;
7624 }
7625 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7626 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7627 return -1;
7628 }
7629 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7630 trace_pci_nvme_err_startfail_page_too_small(
7631 NVME_CC_MPS(cc),
7632 NVME_CAP_MPSMIN(cap));
7633 return -1;
7634 }
7635 if (unlikely(NVME_CC_MPS(cc) >
7636 NVME_CAP_MPSMAX(cap))) {
7637 trace_pci_nvme_err_startfail_page_too_large(
7638 NVME_CC_MPS(cc),
7639 NVME_CAP_MPSMAX(cap));
7640 return -1;
7641 }
7642 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7643 trace_pci_nvme_err_startfail_asqent_sz_zero();
7644 return -1;
7645 }
7646 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7647 trace_pci_nvme_err_startfail_acqent_sz_zero();
7648 return -1;
7649 }
7650
7651 n->page_bits = page_bits;
7652 n->page_size = page_size;
7653 n->max_prp_ents = n->page_size / sizeof(uint64_t);
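/* AQA.ASQS and AQA.ACQS are 0-based queue sizes, hence the +1 below. */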
7654 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7655 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7656
7657 nvme_set_timestamp(n, 0ULL);
7658
7659 nvme_select_iocs(n);
7660
7661 return 0;
7662 }
7663
7664 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7665 {
7666 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7667 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7668
7669 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7670 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7671 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7672 stl_le_p(&n->bar.cmbloc, cmbloc);
7673
7674 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7675 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7676 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7677 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7678 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7679 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7680 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7681 stl_le_p(&n->bar.cmbsz, cmbsz);
7682 }
7683
7684 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7685 unsigned size)
7686 {
7687 PCIDevice *pci = PCI_DEVICE(n);
7688 uint64_t cap = ldq_le_p(&n->bar.cap);
7689 uint32_t cc = ldl_le_p(&n->bar.cc);
7690 uint32_t intms = ldl_le_p(&n->bar.intms);
7691 uint32_t csts = ldl_le_p(&n->bar.csts);
7692 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7693
7694 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7695 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7696 "MMIO write not 32-bit aligned,"
7697 " offset=0x%"PRIx64"", offset);
7698 /* should be ignored, fall through for now */
7699 }
7700
7701 if (unlikely(size < sizeof(uint32_t))) {
7702 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7703 "MMIO write smaller than 32-bits,"
7704 " offset=0x%"PRIx64", size=%u",
7705 offset, size);
7706 /* should be ignored, fall through for now */
7707 }
7708
7709 switch (offset) {
7710 case NVME_REG_INTMS:
7711 if (unlikely(msix_enabled(pci))) {
7712 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7713 "undefined access to interrupt mask set"
7714 " when MSI-X is enabled");
7715 /* should be ignored, fall through for now */
7716 }
7717 intms |= data;
7718 stl_le_p(&n->bar.intms, intms);
7719 n->bar.intmc = n->bar.intms;
7720 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7721 nvme_irq_check(n);
7722 break;
7723 case NVME_REG_INTMC:
7724 if (unlikely(msix_enabled(pci))) {
7725 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7726 "undefined access to interrupt mask clr"
7727 " when MSI-X is enabled");
7728 /* should be ignored, fall through for now */
7729 }
7730 intms &= ~data;
7731 stl_le_p(&n->bar.intms, intms);
7732 n->bar.intmc = n->bar.intms;
7733 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7734 nvme_irq_check(n);
7735 break;
7736 case NVME_REG_CC:
7737 stl_le_p(&n->bar.cc, data);
7738
7739 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7740
7741 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7742 trace_pci_nvme_mmio_shutdown_set();
7743 nvme_ctrl_shutdown(n);
7744 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7745 csts |= NVME_CSTS_SHST_COMPLETE;
7746 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7747 trace_pci_nvme_mmio_shutdown_cleared();
7748 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7749 }
7750
7751 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7752 if (unlikely(nvme_start_ctrl(n))) {
7753 trace_pci_nvme_err_startfail();
7754 csts = NVME_CSTS_FAILED;
7755 } else {
7756 trace_pci_nvme_mmio_start_success();
7757 csts = NVME_CSTS_READY;
7758 }
7759 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7760 trace_pci_nvme_mmio_stopped();
7761 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7762
7763 break;
7764 }
7765
7766 stl_le_p(&n->bar.csts, csts);
7767
7768 break;
7769 case NVME_REG_CSTS:
7770 if (data & (1 << 4)) {
7771 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7772 "attempted to W1C CSTS.NSSRO"
7773 " but CAP.NSSRS is zero (not supported)");
7774 } else if (data != 0) {
7775 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7776 "attempted to set a read only bit"
7777 " of controller status");
7778 }
7779 break;
7780 case NVME_REG_NSSR:
7781 if (data == 0x4e564d65) {
7782 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7783 } else {
7784 /* The spec says that writes of other values have no effect */
7785 return;
7786 }
7787 break;
7788 case NVME_REG_AQA:
7789 stl_le_p(&n->bar.aqa, data);
7790 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7791 break;
7792 case NVME_REG_ASQ:
7793 stn_le_p(&n->bar.asq, size, data);
7794 trace_pci_nvme_mmio_asqaddr(data);
7795 break;
7796 case NVME_REG_ASQ + 4:
7797 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7798 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7799 break;
7800 case NVME_REG_ACQ:
7801 trace_pci_nvme_mmio_acqaddr(data);
7802 stn_le_p(&n->bar.acq, size, data);
7803 break;
7804 case NVME_REG_ACQ + 4:
7805 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7806 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7807 break;
7808 case NVME_REG_CMBLOC:
7809 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7810 "invalid write to reserved CMBLOC"
7811 " when CMBSZ is zero, ignored");
7812 return;
7813 case NVME_REG_CMBSZ:
7814 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7815 "invalid write to read only CMBSZ, ignored");
7816 return;
7817 case NVME_REG_CMBMSC:
7818 if (!NVME_CAP_CMBS(cap)) {
7819 return;
7820 }
7821
7822 stn_le_p(&n->bar.cmbmsc, size, data);
7823 n->cmb.cmse = false;
7824
7825 if (NVME_CMBMSC_CRE(data)) {
7826 nvme_cmb_enable_regs(n);
7827
7828 if (NVME_CMBMSC_CMSE(data)) {
7829 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7830 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7831 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7832 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7833 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7834 stl_le_p(&n->bar.cmbsts, cmbsts);
7835 return;
7836 }
7837
7838 n->cmb.cba = cba;
7839 n->cmb.cmse = true;
7840 }
7841 } else {
7842 n->bar.cmbsz = 0;
7843 n->bar.cmbloc = 0;
7844 }
7845
7846 return;
7847 case NVME_REG_CMBMSC + 4:
7848 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7849 return;
7850
7851 case NVME_REG_PMRCAP:
7852 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7853 "invalid write to PMRCAP register, ignored");
7854 return;
7855 case NVME_REG_PMRCTL:
7856 if (!NVME_CAP_PMRS(cap)) {
7857 return;
7858 }
7859
7860 stl_le_p(&n->bar.pmrctl, data);
7861 if (NVME_PMRCTL_EN(data)) {
7862 memory_region_set_enabled(&n->pmr.dev->mr, true);
7863 pmrsts = 0;
7864 } else {
7865 memory_region_set_enabled(&n->pmr.dev->mr, false);
7866 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7867 n->pmr.cmse = false;
7868 }
7869 stl_le_p(&n->bar.pmrsts, pmrsts);
7870 return;
7871 case NVME_REG_PMRSTS:
7872 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7873 "invalid write to PMRSTS register, ignored");
7874 return;
7875 case NVME_REG_PMREBS:
7876 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7877 "invalid write to PMREBS register, ignored");
7878 return;
7879 case NVME_REG_PMRSWTP:
7880 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7881 "invalid write to PMRSWTP register, ignored");
7882 return;
7883 case NVME_REG_PMRMSCL:
7884 if (!NVME_CAP_PMRS(cap)) {
7885 return;
7886 }
7887
7888 stl_le_p(&n->bar.pmrmscl, data);
7889 n->pmr.cmse = false;
7890
7891 if (NVME_PMRMSCL_CMSE(data)) {
7892 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
7893 hwaddr cba = pmrmscu << 32 |
7894 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
7895 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
7896 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
7897 stl_le_p(&n->bar.pmrsts, pmrsts);
7898 return;
7899 }
7900
7901 n->pmr.cmse = true;
7902 n->pmr.cba = cba;
7903 }
7904
7905 return;
7906 case NVME_REG_PMRMSCU:
7907 if (!NVME_CAP_PMRS(cap)) {
7908 return;
7909 }
7910
7911 stl_le_p(&n->bar.pmrmscu, data);
7912 return;
7913 default:
7914 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
7915 "invalid MMIO write,"
7916 " offset=0x%"PRIx64", data=%"PRIx64"",
7917 offset, data);
7918 break;
7919 }
7920 }
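/*
 * A minimal sketch of the register writes a guest driver issues against
 * nvme_write_bar() to bring the controller up; asq_lo/asq_hi/acq_lo/acq_hi
 * and the cc value are placeholders, not taken from this file:
 *
 *   nvme_write_bar(n, NVME_REG_AQA, 0x001f001f, 4);   32-entry admin SQ/CQ (0's based)
 *   nvme_write_bar(n, NVME_REG_ASQ, asq_lo, 4);
 *   nvme_write_bar(n, NVME_REG_ASQ + 4, asq_hi, 4);
 *   nvme_write_bar(n, NVME_REG_ACQ, acq_lo, 4);
 *   nvme_write_bar(n, NVME_REG_ACQ + 4, acq_hi, 4);
 *   nvme_write_bar(n, NVME_REG_CC, cc | 1, 4);        CC.EN (bit 0) set last
 *
 * The CC write that transitions EN from 0 to 1 calls nvme_start_ctrl() and,
 * on success, reports NVME_CSTS_READY; clearing EN resets the controller.
 */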
7921
7922 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
7923 {
7924 NvmeCtrl *n = (NvmeCtrl *)opaque;
7925 uint8_t *ptr = (uint8_t *)&n->bar;
7926
7927 trace_pci_nvme_mmio_read(addr, size);
7928
7929 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
7930 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
7931 "MMIO read not 32-bit aligned,"
7932 " offset=0x%"PRIx64"", addr);
7933 /* should RAZ, fall through for now */
7934 } else if (unlikely(size < sizeof(uint32_t))) {
7935 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
7936 "MMIO read smaller than 32-bits,"
7937 " offset=0x%"PRIx64"", addr);
7938 /* should RAZ, fall through for now */
7939 }
7940
7941 if (addr > sizeof(n->bar) - size) {
7942 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
7943 "MMIO read beyond last register,"
7944 " offset=0x%"PRIx64", returning 0", addr);
7945
7946 return 0;
7947 }
7948
7949 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7950 addr != NVME_REG_CSTS) {
7951 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7952 return 0;
7953 }
7954
7955 /*
7956 * When PMRWBM bit 1 is set, a read from PMRSTS
7957 * should ensure that prior writes have made it to
7958 * persistent media.
7959 */
7960 if (addr == NVME_REG_PMRSTS &&
7961 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
7962 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7963 }
7964
7965 return ldn_le_p(ptr + addr, size);
7966 }
7967
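/*
 * Doorbell decoding below assumes a 4-byte doorbell stride (CAP.DSTRD=0),
 * which is what the arithmetic implements: doorbells start at offset 1000h,
 * submission queue tails occupy even dwords and completion queue heads odd
 * dwords. For example:
 *
 *   addr 0x1000 -> SQ 0 tail (admin)      addr 0x1004 -> CQ 0 head (admin)
 *   addr 0x1008 -> SQ 1 tail              addr 0x100c -> CQ 1 head
 *
 * matching qid = (addr - 0x1000) >> 3 for SQs and
 * qid = (addr - 0x1004) >> 3 for CQs.
 */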
7968 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
7969 {
7970 PCIDevice *pci = PCI_DEVICE(n);
7971 uint32_t qid;
7972
7973 if (unlikely(addr & ((1 << 2) - 1))) {
7974 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
7975 "doorbell write not 32-bit aligned,"
7976 " offset=0x%"PRIx64", ignoring", addr);
7977 return;
7978 }
7979
7980 if (((addr - 0x1000) >> 2) & 1) {
7981 /* Completion queue doorbell write */
7982
7983 uint16_t new_head = val & 0xffff;
7984 int start_sqs;
7985 NvmeCQueue *cq;
7986
7987 qid = (addr - (0x1000 + (1 << 2))) >> 3;
7988 if (unlikely(nvme_check_cqid(n, qid))) {
7989 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
7990 "completion queue doorbell write"
7991 " for nonexistent queue,"
7992 " sqid=%"PRIu32", ignoring", qid);
7993
7994 /*
7995 * NVM Express v1.3d, Section 4.1 states: "If host software writes
7996 * an invalid value to the Submission Queue Tail Doorbell or
7997 * Completion Queue Head Doorbell register and an Asynchronous Event
7998 * Request command is outstanding, then an asynchronous event is
7999 * posted to the Admin Completion Queue with a status code of
8000 * Invalid Doorbell Write Value."
8001 *
8002 * Also note that the spec includes the "Invalid Doorbell Register"
8003 * status code, but nowhere does it specify when to use it.
8004 * However, it seems reasonable to use it here in a similar
8005 * fashion.
8006 */
8007 if (n->outstanding_aers) {
8008 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8009 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8010 NVME_LOG_ERROR_INFO);
8011 }
8012
8013 return;
8014 }
8015
8016 cq = n->cq[qid];
8017 if (unlikely(new_head >= cq->size)) {
8018 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
8019 "completion queue doorbell write value"
8020 " beyond queue size, sqid=%"PRIu32","
8021 " new_head=%"PRIu16", ignoring",
8022 qid, new_head);
8023
8024 if (n->outstanding_aers) {
8025 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8026 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8027 NVME_LOG_ERROR_INFO);
8028 }
8029
8030 return;
8031 }
8032
8033 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
8034
8035 start_sqs = nvme_cq_full(cq) ? 1 : 0;
8036 cq->head = new_head;
8037 if (!qid && n->dbbuf_enabled) {
8038 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
8039 }
8040 if (start_sqs) {
8041 NvmeSQueue *sq;
8042 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
8043 qemu_bh_schedule(sq->bh);
8044 }
8045 qemu_bh_schedule(cq->bh);
8046 }
8047
8048 if (cq->tail == cq->head) {
8049 if (cq->irq_enabled) {
8050 n->cq_pending--;
8051 }
8052
8053 nvme_irq_deassert(n, cq);
8054 }
8055 } else {
8056 /* Submission queue doorbell write */
8057
8058 uint16_t new_tail = val & 0xffff;
8059 NvmeSQueue *sq;
8060
8061 qid = (addr - 0x1000) >> 3;
8062 if (unlikely(nvme_check_sqid(n, qid))) {
8063 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
8064 "submission queue doorbell write"
8065 " for nonexistent queue,"
8066 " sqid=%"PRIu32", ignoring", qid);
8067
8068 if (n->outstanding_aers) {
8069 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8070 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8071 NVME_LOG_ERROR_INFO);
8072 }
8073
8074 return;
8075 }
8076
8077 sq = n->sq[qid];
8078 if (unlikely(new_tail >= sq->size)) {
8079 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
8080 "submission queue doorbell write value"
8081 " beyond queue size, sqid=%"PRIu32","
8082 " new_tail=%"PRIu16", ignoring",
8083 qid, new_tail);
8084
8085 if (n->outstanding_aers) {
8086 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8087 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8088 NVME_LOG_ERROR_INFO);
8089 }
8090
8091 return;
8092 }
8093
8094 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
8095
8096 sq->tail = new_tail;
8097 if (!qid && n->dbbuf_enabled) {
8098 /*
8099 * The spec states "the host shall also update the controller's
8100 * corresponding doorbell property to match the value of that entry
8101 * in the Shadow Doorbell buffer."
8102 *
8103 * Since this context is currently a VM trap, we can safely enforce
8104 * the requirement from the device side in case the host is
8105 * misbehaving.
8106 *
8107 * Note that we shouldn't have to do this, but various drivers,
8108 * including ones that run on Linux, are not updating Admin Queues,
8109 * so we can't trust reading it for an appropriate sq tail.
8110 */
8111 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
8112 }
8113
8114 qemu_bh_schedule(sq->bh);
8115 }
8116 }
8117
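/*
 * BAR0 dispatch: offsets below sizeof(NvmeBar) (the 1000h register page) are
 * handled as controller register writes by nvme_write_bar(); anything at or
 * above 1000h is treated as a doorbell write by nvme_process_db(). For
 * example, a 4-byte write of 3 to offset 1008h rings SQ 1's tail doorbell
 * with a new tail of 3.
 */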
8118 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
8119 unsigned size)
8120 {
8121 NvmeCtrl *n = (NvmeCtrl *)opaque;
8122
8123 trace_pci_nvme_mmio_write(addr, data, size);
8124
8125 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8126 addr != NVME_REG_CSTS) {
8127 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8128 return;
8129 }
8130
8131 if (addr < sizeof(n->bar)) {
8132 nvme_write_bar(n, addr, data, size);
8133 } else {
8134 nvme_process_db(n, addr, data);
8135 }
8136 }
8137
8138 static const MemoryRegionOps nvme_mmio_ops = {
8139 .read = nvme_mmio_read,
8140 .write = nvme_mmio_write,
8141 .endianness = DEVICE_LITTLE_ENDIAN,
8142 .impl = {
8143 .min_access_size = 2,
8144 .max_access_size = 8,
8145 },
8146 };
8147
8148 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
8149 unsigned size)
8150 {
8151 NvmeCtrl *n = (NvmeCtrl *)opaque;
8152 stn_le_p(&n->cmb.buf[addr], size, data);
8153 }
8154
8155 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
8156 {
8157 NvmeCtrl *n = (NvmeCtrl *)opaque;
8158 return ldn_le_p(&n->cmb.buf[addr], size);
8159 }
8160
8161 static const MemoryRegionOps nvme_cmb_ops = {
8162 .read = nvme_cmb_read,
8163 .write = nvme_cmb_write,
8164 .endianness = DEVICE_LITTLE_ENDIAN,
8165 .impl = {
8166 .min_access_size = 1,
8167 .max_access_size = 8,
8168 },
8169 };
8170
8171 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
8172 {
8173 NvmeParams *params = &n->params;
8174
8175 if (params->num_queues) {
8176 warn_report("num_queues is deprecated; please use max_ioqpairs "
8177 "instead");
8178
8179 params->max_ioqpairs = params->num_queues - 1;
8180 }
8181
8182 if (n->namespace.blkconf.blk && n->subsys) {
8183 error_setg(errp, "subsystem support is unavailable with legacy "
8184 "namespace ('drive' property)");
8185 return false;
8186 }
8187
8188 if (params->max_ioqpairs < 1 ||
8189 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8190 error_setg(errp, "max_ioqpairs must be between 1 and %d",
8191 NVME_MAX_IOQPAIRS);
8192 return false;
8193 }
8194
8195 if (params->msix_qsize < 1 ||
8196 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8197 error_setg(errp, "msix_qsize must be between 1 and %d",
8198 PCI_MSIX_FLAGS_QSIZE + 1);
8199 return false;
8200 }
8201
8202 if (!params->serial) {
8203 error_setg(errp, "serial property not set");
8204 return false;
8205 }
8206
8207 if (params->mqes < 1) {
8208 error_setg(errp, "mqes property cannot be less than 1");
8209 return false;
8210 }
8211
8212 if (n->pmr.dev) {
8213 if (params->msix_exclusive_bar) {
8214 error_setg(errp, "not enough BARs available to enable PMR");
8215 return false;
8216 }
8217
8218 if (host_memory_backend_is_mapped(n->pmr.dev)) {
8219 error_setg(errp, "can't use already busy memdev: %s",
8220 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8221 return false;
8222 }
8223
8224 if (!is_power_of_2(n->pmr.dev->size)) {
8225 error_setg(errp, "pmr backend size needs to be power of 2 in size");
8226 return false;
8227 }
8228
8229 host_memory_backend_set_mapped(n->pmr.dev, true);
8230 }
8231
8232 if (n->params.zasl > n->params.mdts) {
8233 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8234 "than or equal to mdts (Maximum Data Transfer Size)");
8235 return false;
8236 }
8237
8238 if (!n->params.vsl) {
8239 error_setg(errp, "vsl must be non-zero");
8240 return false;
8241 }
8242
8243 if (params->sriov_max_vfs) {
8244 if (!n->subsys) {
8245 error_setg(errp, "subsystem is required for the use of SR-IOV");
8246 return false;
8247 }
8248
8249 if (params->cmb_size_mb) {
8250 error_setg(errp, "CMB is not supported with SR-IOV");
8251 return false;
8252 }
8253
8254 if (n->pmr.dev) {
8255 error_setg(errp, "PMR is not supported with SR-IOV");
8256 return false;
8257 }
8258
8259 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8260 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8261 " must be set for the use of SR-IOV");
8262 return false;
8263 }
8264
8265 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8266 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8267 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8268 return false;
8269 }
8270
8271 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8272 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8273 " greater than or equal to 2");
8274 return false;
8275 }
8276
8277 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8278 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8279 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8280 return false;
8281 }
8282
8283 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8284 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8285 " greater than or equal to 1");
8286 return false;
8287 }
8288
8289 if (params->sriov_max_vi_per_vf &&
8290 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8291 error_setg(errp, "sriov_max_vi_per_vf must meet:"
8292 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8293 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8294 return false;
8295 }
8296
8297 if (params->sriov_max_vq_per_vf &&
8298 (params->sriov_max_vq_per_vf < 2 ||
8299 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8300 error_setg(errp, "sriov_max_vq_per_vf must meet:"
8301 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8302 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8303 return false;
8304 }
8305 }
8306
8307 return true;
8308 }
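/*
 * One parameter set that satisfies the SR-IOV checks above (values chosen
 * purely for illustration): sriov_max_vfs=2 with sriov_vq_flexible=4
 * (>= 2 * sriov_max_vfs), sriov_vi_flexible=2 (>= sriov_max_vfs), the
 * default max_ioqpairs=64 (>= sriov_vq_flexible + 2) and the default
 * msix_qsize=65 (>= sriov_vi_flexible + 1), together with a subsys link and
 * neither CMB nor PMR configured.
 */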
8309
8310 static void nvme_init_state(NvmeCtrl *n)
8311 {
8312 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8313 NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8314 NvmeSecCtrlEntry *sctrl;
8315 PCIDevice *pci = PCI_DEVICE(n);
8316 NvmeAtomic *atomic = &n->atomic;
8317 NvmeIdCtrl *id = &n->id_ctrl;
8318 uint8_t max_vfs;
8319 int i;
8320
8321 if (pci_is_vf(pci)) {
8322 sctrl = nvme_sctrl(n);
8323 max_vfs = 0;
8324 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8325 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8326 } else {
8327 max_vfs = n->params.sriov_max_vfs;
8328 n->conf_ioqpairs = n->params.max_ioqpairs;
8329 n->conf_msix_qsize = n->params.msix_qsize;
8330 }
8331
8332 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8333 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8334 n->temperature = NVME_TEMPERATURE;
8335 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8336 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8337 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8338 QTAILQ_INIT(&n->aer_queue);
8339
8340 n->nr_sec_ctrls = max_vfs;
8341 for (i = 0; i < max_vfs; i++) {
8342 sctrl = &list[i];
8343 sctrl->pcid = cpu_to_le16(n->cntlid);
8344 sctrl->vfn = cpu_to_le16(i + 1);
8345 }
8346
8347 cap->cntlid = cpu_to_le16(n->cntlid);
8348 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8349
8350 if (pci_is_vf(pci)) {
8351 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8352 } else {
8353 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8354 n->params.sriov_vq_flexible);
8355 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8356 cap->vqrfap = cap->vqfrt;
8357 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8358 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8359 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8360 cap->vqfrt / MAX(max_vfs, 1);
8361 }
8362
8363 if (pci_is_vf(pci)) {
8364 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8365 } else {
8366 cap->viprt = cpu_to_le16(n->params.msix_qsize -
8367 n->params.sriov_vi_flexible);
8368 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8369 cap->virfap = cap->vifrt;
8370 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8371 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8372 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8373 cap->vifrt / MAX(max_vfs, 1);
8374 }
8375
8376 /* Atomic Write */
8377 id->awun = cpu_to_le16(n->params.atomic_awun);
8378 id->awupf = cpu_to_le16(n->params.atomic_awupf);
8379 n->dn = n->params.atomic_dn;
8380
8381 if (id->awun || id->awupf) {
8382 if (id->awupf > id->awun) {
8383 id->awupf = 0;
8384 }
8385
8386 if (n->dn) {
8387 atomic->atomic_max_write_size = id->awupf + 1;
8388 } else {
8389 atomic->atomic_max_write_size = id->awun + 1;
8390 }
8391
8392 if (atomic->atomic_max_write_size == 1) {
8393 atomic->atomic_writes = 0;
8394 } else {
8395 atomic->atomic_writes = 1;
8396 }
8397 }
8398 }
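/*
 * Example of the atomic write sizing above, assuming the 0's based AWUN/AWUPF
 * convention from the specification: with atomic.awun=15, atomic.awupf=7 and
 * atomic.dn=off, atomic_max_write_size becomes awun + 1 = 16 logical blocks
 * and atomic_writes is enabled; with atomic.dn=on it shrinks to
 * awupf + 1 = 8 blocks. If awupf were larger than awun, it would be forced
 * back to 0 first.
 */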
8399
8400 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8401 {
8402 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8403 uint64_t cap = ldq_le_p(&n->bar.cap);
8404
8405 n->cmb.buf = g_malloc0(cmb_size);
8406 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8407 "nvme-cmb", cmb_size);
8408 pci_register_bar(pci_dev, NVME_CMB_BIR,
8409 PCI_BASE_ADDRESS_SPACE_MEMORY |
8410 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8411 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8412
8413 NVME_CAP_SET_CMBS(cap, 1);
8414 stq_le_p(&n->bar.cap, cap);
8415
8416 if (n->params.legacy_cmb) {
8417 nvme_cmb_enable_regs(n);
8418 n->cmb.cmse = true;
8419 }
8420 }
8421
8422 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8423 {
8424 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8425
8426 NVME_PMRCAP_SET_RDS(pmrcap, 1);
8427 NVME_PMRCAP_SET_WDS(pmrcap, 1);
8428 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8429 /* Turn on PMRWBM bit 1: reads from PMRSTS ensure prior PMR writes are persistent */
8430 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8431 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8432 stl_le_p(&n->bar.pmrcap, pmrcap);
8433
8434 pci_register_bar(pci_dev, NVME_PMR_BIR,
8435 PCI_BASE_ADDRESS_SPACE_MEMORY |
8436 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8437 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8438
8439 memory_region_set_enabled(&n->pmr.dev->mr, false);
8440 }
8441
8442 static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8443 unsigned *msix_table_offset,
8444 unsigned *msix_pba_offset)
8445 {
8446 uint64_t bar_size, msix_table_size;
8447
8448 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8449
8450 if (total_irqs == 0) {
8451 goto out;
8452 }
8453
8454 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8455
8456 if (msix_table_offset) {
8457 *msix_table_offset = bar_size;
8458 }
8459
8460 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8461 bar_size += msix_table_size;
8462 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8463
8464 if (msix_pba_offset) {
8465 *msix_pba_offset = bar_size;
8466 }
8467
8468 bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8469
8470 out:
8471 return pow2ceil(bar_size);
8472 }
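/*
 * A worked example of the sizing above for the default configuration
 * (max_ioqpairs=64 -> total_queues=65, msix_qsize=65 -> total_irqs=65),
 * assuming 4-byte doorbell registers (NVME_DB_SIZE == 4) and the 4 KiB
 * NvmeBar register page:
 *
 *   registers + doorbells: 4096 + 2 * 65 * 4 = 4616 -> 8192 (MSI-X table offset)
 *   MSI-X table:           8192 + 65 * 16    = 9232 -> 12288 (PBA offset)
 *   MSI-X PBA:             12288 + 16        = 12304
 *
 * pow2ceil() then rounds the result up to a 16 KiB BAR.
 */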
8473
8474 static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
8475 {
8476 uint16_t vf_dev_id = n->params.use_intel_id ?
8477 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8478 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8479 uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8480 le16_to_cpu(cap->vifrsm),
8481 NULL, NULL);
8482
8483 pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8484 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8485 NVME_VF_OFFSET, NVME_VF_STRIDE);
8486
8487 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8488 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8489 }
8490
8491 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8492 {
8493 Error *err = NULL;
8494 int ret;
8495
8496 ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
8497 PCI_PM_SIZEOF, &err);
8498 if (err) {
8499 error_report_err(err);
8500 return ret;
8501 }
8502
8503 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8504 PCI_PM_CAP_VER_1_2);
8505 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8506 PCI_PM_CTRL_NO_SOFT_RESET);
8507 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8508 PCI_PM_CTRL_STATE_MASK);
8509
8510 return 0;
8511 }
8512
8513 static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
8514 {
8515 void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
8516 uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
8517 void *rsp = doe_cap->read_mbox;
8518 uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
8519
8520 uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
8521 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
8522 req, req_len, rsp, rsp_len);
8523 doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);
8524
8525 return recvd != 0;
8526 }
8527
8528 static DOEProtocol doe_spdm_prot[] = {
8529 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
8530 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
8531 { }
8532 };
8533
8534 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8535 {
8536 ERRP_GUARD();
8537 uint8_t *pci_conf = pci_dev->config;
8538 uint64_t bar_size;
8539 unsigned msix_table_offset = 0, msix_pba_offset = 0;
8540 unsigned nr_vectors;
8541 int ret;
8542
8543 pci_conf[PCI_INTERRUPT_PIN] = 1;
8544 pci_config_set_prog_interface(pci_conf, 0x2);
8545
8546 if (n->params.use_intel_id) {
8547 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8548 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8549 } else {
8550 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8551 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8552 }
8553
8554 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8555 nvme_add_pm_capability(pci_dev, 0x60);
8556 pcie_endpoint_cap_init(pci_dev, 0x80);
8557 pcie_cap_flr_init(pci_dev);
8558 if (n->params.sriov_max_vfs) {
8559 pcie_ari_init(pci_dev, 0x100);
8560 }
8561
8562 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8563 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8564 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8565 bar_size);
8566 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8567 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8568 ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8569 } else {
8570 assert(n->params.msix_qsize >= 1);
8571
8572 /* add one to max_ioqpairs to account for the admin queue pair */
8573 if (!pci_is_vf(pci_dev)) {
8574 nr_vectors = n->params.msix_qsize;
8575 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8576 nr_vectors, &msix_table_offset,
8577 &msix_pba_offset);
8578 } else {
8579 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8580 NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8581
8582 nr_vectors = le16_to_cpu(cap->vifrsm);
8583 bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8584 &msix_table_offset, &msix_pba_offset);
8585 }
8586
8587 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8588 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8589 msix_table_offset);
8590 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8591
8592 if (pci_is_vf(pci_dev)) {
8593 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8594 } else {
8595 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8596 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8597 }
8598
8599 ret = msix_init(pci_dev, nr_vectors,
8600 &n->bar0, 0, msix_table_offset,
8601 &n->bar0, 0, msix_pba_offset, 0, errp);
8602 }
8603
8604 if (ret == -ENOTSUP) {
8605 /* report that msix is not supported, but do not error out */
8606 warn_report_err(*errp);
8607 *errp = NULL;
8608 } else if (ret < 0) {
8609 /* propagate error to caller */
8610 return false;
8611 }
8612
8613 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8614
8615 pcie_cap_deverr_init(pci_dev);
8616
8617 /* DOE Initialisation */
8618 if (pci_dev->spdm_port) {
8619 uint16_t doe_offset = n->params.sriov_max_vfs ?
8620 PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
8621 : PCI_CONFIG_SPACE_SIZE;
8622
8623 pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
8624 doe_spdm_prot, true, 0);
8625
8626 pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
8627 errp);
8628
8629 if (pci_dev->doe_spdm.spdm_socket < 0) {
8630 return false;
8631 }
8632 }
8633
8634 if (n->params.cmb_size_mb) {
8635 nvme_init_cmb(n, pci_dev);
8636 }
8637
8638 if (n->pmr.dev) {
8639 nvme_init_pmr(n, pci_dev);
8640 }
8641
8642 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8643 nvme_init_sriov(n, pci_dev, 0x120);
8644 }
8645
8646 return true;
8647 }
8648
8649 static void nvme_init_subnqn(NvmeCtrl *n)
8650 {
8651 NvmeSubsystem *subsys = n->subsys;
8652 NvmeIdCtrl *id = &n->id_ctrl;
8653
8654 if (!subsys) {
8655 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8656 "nqn.2019-08.org.qemu:%s", n->params.serial);
8657 } else {
8658 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8659 }
8660 }
8661
8662 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8663 {
8664 NvmeIdCtrl *id = &n->id_ctrl;
8665 uint8_t *pci_conf = pci_dev->config;
8666 uint64_t cap = ldq_le_p(&n->bar.cap);
8667 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8668 uint32_t ctratt;
8669
8670 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8671 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8672 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8673 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8674 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8675
8676 id->cntlid = cpu_to_le16(n->cntlid);
8677
8678 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8679
8680 ctratt = NVME_CTRATT_ELBAS;
8681 if (n->params.ctratt.mem) {
8682 ctratt |= NVME_CTRATT_MEM;
8683 }
8684
8685 id->rab = 6;
8686
8687 if (n->params.use_intel_id) {
8688 id->ieee[0] = 0xb3;
8689 id->ieee[1] = 0x02;
8690 id->ieee[2] = 0x00;
8691 } else {
8692 id->ieee[0] = 0x00;
8693 id->ieee[1] = 0x54;
8694 id->ieee[2] = 0x52;
8695 }
8696
8697 id->mdts = n->params.mdts;
8698 id->ver = cpu_to_le32(NVME_SPEC_VER);
8699 id->oacs =
8700 cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
8701 NVME_OACS_DIRECTIVES);
8702 id->cntrltype = 0x1;
8703
8704 /*
8705 * Because the controller always completes the Abort command immediately,
8706 * there can never be more than one concurrently executing Abort command,
8707 * so this value is never used for anything. Note that there can easily be
8708 * many Abort commands in the queues, but they are not considered
8709 * "executing" until processed by nvme_abort.
8710 *
8711 * The specification recommends a value of 3 for Abort Command Limit (four
8712 * concurrently outstanding Abort commands), so let's use that, though it is
8713 * inconsequential.
8714 */
8715 id->acl = 3;
8716 id->aerl = n->params.aerl;
8717 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8718 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8719
8720 /* recommended default value (~70 C) */
8721 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8722 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8723
8724 id->sqes = (NVME_SQES << 4) | NVME_SQES;
8725 id->cqes = (NVME_CQES << 4) | NVME_CQES;
8726 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8727 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8728 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8729 NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8730 NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8731
8732 /*
8733 * NOTE: If this device ever supports a command set that does NOT use 0x0
8734 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8735 * should probably be removed.
8736 *
8737 * See comment in nvme_io_cmd.
8738 */
8739 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8740
8741 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8742 NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8743 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
8744 NVME_CTRL_SGLS_MPTR_SGL);
8745
8746 nvme_init_subnqn(n);
8747
8748 id->psd[0].mp = cpu_to_le16(0x9c4);
8749 id->psd[0].enlat = cpu_to_le32(0x10);
8750 id->psd[0].exlat = cpu_to_le32(0x4);
8751
8752 if (n->subsys) {
8753 id->cmic |= NVME_CMIC_MULTI_CTRL;
8754 ctratt |= NVME_CTRATT_ENDGRPS;
8755
8756 id->endgidmax = cpu_to_le16(0x1);
8757
8758 if (n->subsys->endgrp.fdp.enabled) {
8759 ctratt |= NVME_CTRATT_FDPS;
8760 }
8761 }
8762
8763 id->ctratt = cpu_to_le32(ctratt);
8764
8765 NVME_CAP_SET_MQES(cap, n->params.mqes);
8766 NVME_CAP_SET_CQR(cap, 1);
8767 NVME_CAP_SET_TO(cap, 0xf);
8768 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
8769 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
8770 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
8771 NVME_CAP_SET_MPSMAX(cap, 4);
8772 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8773 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8774 stq_le_p(&n->bar.cap, cap);
8775
8776 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8777 n->bar.intmc = n->bar.intms = 0;
8778
8779 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8780 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8781 }
8782 }
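/*
 * With the default properties, the CAP fields set above decode (per the
 * register definitions in the specification) roughly as follows: MQES=7FFh
 * allows up to 2048 entries per queue (0's based), CQR=1 requires physically
 * contiguous queues, TO=Fh advertises a worst-case ready timeout of
 * 15 * 500 ms = 7.5 s, and MPSMAX=4 permits host page sizes up to
 * 2^(12 + 4) = 64 KiB.
 */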
8783
8784 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8785 {
8786 int cntlid;
8787
8788 if (!n->subsys) {
8789 return 0;
8790 }
8791
8792 cntlid = nvme_subsys_register_ctrl(n, errp);
8793 if (cntlid < 0) {
8794 return -1;
8795 }
8796
8797 n->cntlid = cntlid;
8798
8799 return 0;
8800 }
8801
8802 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8803 {
8804 uint32_t nsid = ns->params.nsid;
8805 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8806
8807 n->namespaces[nsid] = ns;
8808 ns->attached++;
8809
8810 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
8811 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
8812 }
8813
8814 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8815 {
8816 NvmeCtrl *n = NVME(pci_dev);
8817 DeviceState *dev = DEVICE(pci_dev);
8818 NvmeNamespace *ns;
8819 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8820
8821 if (pci_is_vf(pci_dev)) {
8822 /*
8823 * VFs derive settings from the parent. The PF's lifespan exceeds
8824 * that of its VFs.
8825 */
8826 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8827
8828 /*
8829 * Copy the PF's serial into newly allocated memory so that removing
8830 * a VF does not release the PF's 'serial' property object.
8831 */
8832 n->params.serial = g_strdup(pn->params.serial);
8833 n->subsys = pn->subsys;
8834 }
8835
8836 if (!nvme_check_params(n, errp)) {
8837 return;
8838 }
8839
8840 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8841
8842 if (nvme_init_subsys(n, errp)) {
8843 return;
8844 }
8845 nvme_init_state(n);
8846 if (!nvme_init_pci(n, pci_dev, errp)) {
8847 return;
8848 }
8849 nvme_init_ctrl(n, pci_dev);
8850
8851 /* setup a namespace if the controller drive property was given */
8852 if (n->namespace.blkconf.blk) {
8853 ns = &n->namespace;
8854 ns->params.nsid = 1;
8855
8856 if (nvme_ns_setup(ns, errp)) {
8857 return;
8858 }
8859
8860 nvme_attach_ns(n, ns);
8861 }
8862 }
8863
8864 static void nvme_exit(PCIDevice *pci_dev)
8865 {
8866 NvmeCtrl *n = NVME(pci_dev);
8867 NvmeNamespace *ns;
8868 int i;
8869
8870 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8871
8872 if (n->subsys) {
8873 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
8874 ns = nvme_ns(n, i);
8875 if (ns) {
8876 ns->attached--;
8877 }
8878 }
8879
8880 nvme_subsys_unregister_ctrl(n->subsys, n);
8881 }
8882
8883 g_free(n->cq);
8884 g_free(n->sq);
8885 g_free(n->aer_reqs);
8886
8887 if (n->params.cmb_size_mb) {
8888 g_free(n->cmb.buf);
8889 }
8890
8891 if (pci_dev->doe_spdm.spdm_socket > 0) {
8892 spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
8893 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
8894 }
8895
8896 if (n->pmr.dev) {
8897 host_memory_backend_set_mapped(n->pmr.dev, false);
8898 }
8899
8900 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8901 pcie_sriov_pf_exit(pci_dev);
8902 }
8903
8904 msix_uninit(pci_dev, &n->bar0, &n->bar0);
8905 memory_region_del_subregion(&n->bar0, &n->iomem);
8906 }
8907
8908 static Property nvme_props[] = {
8909 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
8910 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
8911 HostMemoryBackend *),
8912 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
8913 NvmeSubsystem *),
8914 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
8915 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
8916 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
8917 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
8918 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
8919 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
8920 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
8921 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
8922 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
8923 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
8924 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
8925 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
8926 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
8927 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
8928 params.auto_transition_zones, true),
8929 DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
8930 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
8931 params.sriov_vq_flexible, 0),
8932 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
8933 params.sriov_vi_flexible, 0),
8934 DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
8935 params.sriov_max_vi_per_vf, 0),
8936 DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
8937 params.sriov_max_vq_per_vf, 0),
8938 DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
8939 false),
8940 DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
8941 DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
8942 DEFINE_PROP_BOOL("ctratt.mem", NvmeCtrl, params.ctratt.mem, false),
8943 DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0),
8944 DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0),
8945 DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0),
8946 DEFINE_PROP_END_OF_LIST(),
8947 };
8948
8949 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
8950 void *opaque, Error **errp)
8951 {
8952 NvmeCtrl *n = NVME(obj);
8953 uint8_t value = n->smart_critical_warning;
8954
8955 visit_type_uint8(v, name, &value, errp);
8956 }
8957
8958 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
8959 void *opaque, Error **errp)
8960 {
8961 NvmeCtrl *n = NVME(obj);
8962 uint8_t value, old_value, cap = 0, index, event;
8963
8964 if (!visit_type_uint8(v, name, &value, errp)) {
8965 return;
8966 }
8967
8968 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
8969 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
8970 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
8971 cap |= NVME_SMART_PMR_UNRELIABLE;
8972 }
8973
8974 if ((value & cap) != value) {
8975 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
8976 value & ~cap);
8977 return;
8978 }
8979
8980 old_value = n->smart_critical_warning;
8981 n->smart_critical_warning = value;
8982
8983 /* only inject new bits of smart critical warning */
8984 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
8985 event = 1 << index;
8986 if (value & ~old_value & event)
8987 nvme_smart_event(n, event);
8988 }
8989 }
8990
8991 static void nvme_pci_reset(DeviceState *qdev)
8992 {
8993 PCIDevice *pci_dev = PCI_DEVICE(qdev);
8994 NvmeCtrl *n = NVME(pci_dev);
8995
8996 trace_pci_nvme_pci_reset();
8997 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8998 }
8999
9000 static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
9001 {
9002 NvmeCtrl *n = NVME(dev);
9003 NvmeSecCtrlEntry *sctrl;
9004 int i;
9005
9006 for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
9007 sctrl = &n->sec_ctrl_list[i];
9008 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
9009 }
9010 }
9011
9012 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
9013 uint32_t val, int len)
9014 {
9015 uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
9016
9017 if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9018 pcie_doe_write_config(&dev->doe_spdm, address, val, len);
9019 }
9020 pci_default_write_config(dev, address, val, len);
9021 pcie_cap_flr_write_config(dev, address, val, len);
9022 nvme_sriov_post_write_config(dev, old_num_vfs);
9023 }
9024
9025 static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
9026 {
9027 uint32_t val;
9028 if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9029 if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
9030 return val;
9031 }
9032 }
9033 return pci_default_read_config(dev, address, len);
9034 }
9035
9036 static const VMStateDescription nvme_vmstate = {
9037 .name = "nvme",
9038 .unmigratable = 1,
9039 };
9040
9041 static void nvme_class_init(ObjectClass *oc, void *data)
9042 {
9043 DeviceClass *dc = DEVICE_CLASS(oc);
9044 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
9045
9046 pc->realize = nvme_realize;
9047 pc->config_write = nvme_pci_write_config;
9048 pc->config_read = nvme_pci_read_config;
9049 pc->exit = nvme_exit;
9050 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
9051 pc->revision = 2;
9052
9053 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
9054 dc->desc = "Non-Volatile Memory Express";
9055 device_class_set_props(dc, nvme_props);
9056 dc->vmsd = &nvme_vmstate;
9057 device_class_set_legacy_reset(dc, nvme_pci_reset);
9058 }
9059
9060 static void nvme_instance_init(Object *obj)
9061 {
9062 NvmeCtrl *n = NVME(obj);
9063
9064 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
9065 "bootindex", "/namespace@1,0",
9066 DEVICE(obj));
9067
9068 object_property_add(obj, "smart_critical_warning", "uint8",
9069 nvme_get_smart_warning,
9070 nvme_set_smart_warning, NULL, NULL);
9071 }
9072
9073 static const TypeInfo nvme_info = {
9074 .name = TYPE_NVME,
9075 .parent = TYPE_PCI_DEVICE,
9076 .instance_size = sizeof(NvmeCtrl),
9077 .instance_init = nvme_instance_init,
9078 .class_init = nvme_class_init,
9079 .interfaces = (InterfaceInfo[]) {
9080 { INTERFACE_PCIE_DEVICE },
9081 { }
9082 },
9083 };
9084
9085 static const TypeInfo nvme_bus_info = {
9086 .name = TYPE_NVME_BUS,
9087 .parent = TYPE_BUS,
9088 .instance_size = sizeof(NvmeBus),
9089 };
9090
9091 static void nvme_register_types(void)
9092 {
9093 type_register_static(&nvme_info);
9094 type_register_static(&nvme_bus_info);
9095 }
9096
9097 type_init(nvme_register_types)
9098