/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (C) 2012-2014 Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/proc.h>

#include <dev/pci/pcivar.h>

#include "nvme_private.h"

typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
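/*
 * DNR value passed to nvme_qpair_manual_complete_tracker() when a command
 * completed by the driver must not be resubmitted; it mirrors the spec's
 * Do Not Retry status bit.
 */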
#define DO_NOT_RETRY	1

static void	_nvme_qpair_submit_request(struct nvme_qpair *qpair,
    struct nvme_request *req);
static void	nvme_qpair_destroy(struct nvme_qpair *qpair);

#define DEFAULT_INDEX	256
#define DEFAULT_ENTRY(x)	[DEFAULT_INDEX] = x
#define OPC_ENTRY(x)	[NVME_OPC_ ## x] = #x
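/*
 * Human-readable opcode names.  Each known opcode initializes its own slot
 * via the designated initializers above; get_opcode_string() falls back to
 * the DEFAULT_ENTRY string for any opcode without an entry.
 */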

static const char *admin_opcode[DEFAULT_INDEX + 1] = {
	OPC_ENTRY(DELETE_IO_SQ),
	OPC_ENTRY(CREATE_IO_SQ),
	OPC_ENTRY(GET_LOG_PAGE),
	OPC_ENTRY(DELETE_IO_CQ),
	OPC_ENTRY(CREATE_IO_CQ),
	OPC_ENTRY(IDENTIFY),
	OPC_ENTRY(ABORT),
	OPC_ENTRY(SET_FEATURES),
	OPC_ENTRY(GET_FEATURES),
	OPC_ENTRY(ASYNC_EVENT_REQUEST),
	OPC_ENTRY(NAMESPACE_MANAGEMENT),
	OPC_ENTRY(FIRMWARE_ACTIVATE),
	OPC_ENTRY(FIRMWARE_IMAGE_DOWNLOAD),
	OPC_ENTRY(DEVICE_SELF_TEST),
	OPC_ENTRY(NAMESPACE_ATTACHMENT),
	OPC_ENTRY(KEEP_ALIVE),
	OPC_ENTRY(DIRECTIVE_SEND),
	OPC_ENTRY(DIRECTIVE_RECEIVE),
	OPC_ENTRY(VIRTUALIZATION_MANAGEMENT),
	OPC_ENTRY(NVME_MI_SEND),
	OPC_ENTRY(NVME_MI_RECEIVE),
	OPC_ENTRY(CAPACITY_MANAGEMENT),
	OPC_ENTRY(LOCKDOWN),
	OPC_ENTRY(DOORBELL_BUFFER_CONFIG),
	OPC_ENTRY(FABRICS_COMMANDS),
	OPC_ENTRY(FORMAT_NVM),
	OPC_ENTRY(SECURITY_SEND),
	OPC_ENTRY(SECURITY_RECEIVE),
	OPC_ENTRY(SANITIZE),
	OPC_ENTRY(GET_LBA_STATUS),
	DEFAULT_ENTRY("ADMIN COMMAND"),
};

static const char *io_opcode[DEFAULT_INDEX + 1] = {
	OPC_ENTRY(FLUSH),
	OPC_ENTRY(WRITE),
	OPC_ENTRY(READ),
	OPC_ENTRY(WRITE_UNCORRECTABLE),
	OPC_ENTRY(COMPARE),
	OPC_ENTRY(WRITE_ZEROES),
	OPC_ENTRY(DATASET_MANAGEMENT),
	OPC_ENTRY(VERIFY),
	OPC_ENTRY(RESERVATION_REGISTER),
	OPC_ENTRY(RESERVATION_REPORT),
	OPC_ENTRY(RESERVATION_ACQUIRE),
	OPC_ENTRY(RESERVATION_RELEASE),
	OPC_ENTRY(COPY),
	DEFAULT_ENTRY("IO COMMAND"),
};

static const char *
get_opcode_string(const char *op[DEFAULT_INDEX + 1], uint16_t opc)
{
	const char *nm = opc < DEFAULT_INDEX ? op[opc] : op[DEFAULT_INDEX];

	return (nm != NULL ? nm : op[DEFAULT_INDEX]);
}

static const char *
get_admin_opcode_string(uint16_t opc)
{
	return (get_opcode_string(admin_opcode, opc));
}

static const char *
get_io_opcode_string(uint16_t opc)
{
	return (get_opcode_string(io_opcode, opc));
}

static void
nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{

	nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x "
	    "cdw10:%08x cdw11:%08x\n",
	    get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid,
	    le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11));
}

static void
nvme_io_qpair_print_command(struct nvme_qpair *qpair,
    struct nvme_command *cmd)
{

	switch (cmd->opc) {
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
	case NVME_OPC_WRITE_ZEROES:
	case NVME_OPC_VERIFY:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
		    "lba:%llu len:%d\n",
		    get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid),
		    ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
		    (le32toh(cmd->cdw12) & 0xFFFF) + 1);
		break;
	case NVME_OPC_FLUSH:
	case NVME_OPC_DATASET_MANAGEMENT:
	case NVME_OPC_RESERVATION_REGISTER:
	case NVME_OPC_RESERVATION_REPORT:
	case NVME_OPC_RESERVATION_ACQUIRE:
	case NVME_OPC_RESERVATION_RELEASE:
		nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
		    get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid));
		break;
	default:
		nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n",
		    get_io_opcode_string(cmd->opc), cmd->opc, qpair->id,
		    cmd->cid, le32toh(cmd->nsid));
		break;
	}
}

void
nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
{
	if (qpair->id == 0)
		nvme_admin_qpair_print_command(qpair, cmd);
	else
		nvme_io_qpair_print_command(qpair, cmd);
	if (nvme_verbose_cmd_dump) {
		nvme_printf(qpair->ctrlr,
		    "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
		    cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
		    (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
		nvme_printf(qpair->ctrlr,
		    "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
		    cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
		    cmd->cdw15);
	}
}

struct nvme_status_string {
	uint16_t	sc;
	const char	*str;
};

static struct nvme_status_string generic_status[] = {
	{ NVME_SC_SUCCESS, "SUCCESS" },
	{ NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
	{ NVME_SC_INVALID_FIELD, "INVALID FIELD" },
	{ NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
	{ NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
	{ NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
	{ NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
	{ NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
	{ NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
	{ NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
	{ NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
	{ NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
	{ NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
	{ NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" },
	{ NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" },
	{ NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
	{ NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
	{ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
	{ NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" },
	{ NVME_SC_PRP_OFFET_INVALID, "PRP OFFSET INVALID" },
	{ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
	{ NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
	{ NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" },
	{ NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" },
	{ NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" },
	{ NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" },
	{ NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" },
	{ NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
	{ NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
	{ NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL DATA BLOCK GRANULARITY INVALID" },
	{ NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
	{ NVME_SC_NAMESPACE_IS_WRITE_PROTECTED, "NAMESPACE IS WRITE PROTECTED" },
	{ NVME_SC_COMMAND_INTERRUPTED, "COMMAND INTERRUPTED" },
	{ NVME_SC_TRANSIENT_TRANSPORT_ERROR, "TRANSIENT TRANSPORT ERROR" },

	{ NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
	{ NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
	{ NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
	{ NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
	{ NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
	{ 0xFFFF, "GENERIC" }
};

static struct nvme_status_string command_specific_status[] = {
	{ NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
	{ NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
	{ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" },
	{ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
	{ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
	{ NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
	{ NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
	{ NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
	{ NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
	{ NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
	{ NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" },
	{ NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
	{ NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" },
	{ NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
	{ NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
	{ NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" },
	{ NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" },
	{ NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" },
	{ NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
	{ NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
	{ NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
	{ NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" },
	{ NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
	{ NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
	{ NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" },
	{ NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" },
	{ NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" },
	{ NVME_SC_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
	{ NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" },
	{ NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" },
	{ NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
	{ NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" },
	{ NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
	{ NVME_SC_SANITIZE_PROHIBITED_WPMRE, "SANITIZE PROHIBITED WRITE PERSISTENT MEMORY REGION ENABLED" },
	{ NVME_SC_ANA_GROUP_ID_INVALID, "ANA GROUP IDENTIFIER INVALID" },
	{ NVME_SC_ANA_ATTACH_FAILED, "ANA ATTACH FAILED" },

	{ NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
	{ NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
	{ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" },
	{ 0xFFFF, "COMMAND SPECIFIC" }
};

static struct nvme_status_string media_error_status[] = {
	{ NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
	{ NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
	{ NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
	{ NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
	{ NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" },
	{ 0xFFFF, "MEDIA ERROR" }
};

static struct nvme_status_string path_related_status[] = {
	{ NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
	{ NVME_SC_ASYMMETRIC_ACCESS_PERSISTENT_LOSS, "ASYMMETRIC ACCESS PERSISTENT LOSS" },
	{ NVME_SC_ASYMMETRIC_ACCESS_INACCESSIBLE, "ASYMMETRIC ACCESS INACCESSIBLE" },
	{ NVME_SC_ASYMMETRIC_ACCESS_TRANSITION, "ASYMMETRIC ACCESS TRANSITION" },
	{ NVME_SC_CONTROLLER_PATHING_ERROR, "CONTROLLER PATHING ERROR" },
	{ NVME_SC_HOST_PATHING_ERROR, "HOST PATHING ERROR" },
	{ NVME_SC_COMMAND_ABORTED_BY_HOST, "COMMAND ABORTED BY HOST" },
	{ 0xFFFF, "PATH RELATED" },
};

static const char *
get_status_string(uint16_t sct, uint16_t sc)
{
	struct nvme_status_string *entry;

	switch (sct) {
	case NVME_SCT_GENERIC:
		entry = generic_status;
		break;
	case NVME_SCT_COMMAND_SPECIFIC:
		entry = command_specific_status;
		break;
	case NVME_SCT_MEDIA_ERROR:
		entry = media_error_status;
		break;
	case NVME_SCT_PATH_RELATED:
		entry = path_related_status;
		break;
	case NVME_SCT_VENDOR_SPECIFIC:
		return ("VENDOR SPECIFIC");
	default:
		return ("RESERVED");
	}

	while (entry->sc != 0xFFFF) {
		if (entry->sc == sc)
			return (entry->str);
		entry++;
	}
	return (entry->str);
}

void
nvme_qpair_print_completion(struct nvme_qpair *qpair,
    struct nvme_completion *cpl)
{
	uint8_t sct, sc, crd, m, dnr, p;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	crd = NVME_STATUS_GET_CRD(cpl->status);
	m = NVME_STATUS_GET_M(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);
	p = NVME_STATUS_GET_P(cpl->status);

	nvme_printf(qpair->ctrlr, "%s (%02x/%02x) crd:%x m:%x dnr:%x p:%d "
	    "sqid:%d cid:%d cdw0:%x\n",
	    get_status_string(sct, sc), sct, sc, crd, m, dnr, p,
	    cpl->sqid, cpl->cid, cpl->cdw0);
}

static bool
nvme_completion_is_retry(const struct nvme_completion *cpl)
{
	uint8_t sct, sc, dnr;

	sct = NVME_STATUS_GET_SCT(cpl->status);
	sc = NVME_STATUS_GET_SC(cpl->status);
	dnr = NVME_STATUS_GET_DNR(cpl->status);	/* Do Not Retry bit */

	/*
	 * TODO: the spec is not clear how commands that are aborted due
	 * to TLER will be marked.  So for now, it seems
	 * NAMESPACE_NOT_READY is the only case where we should
	 * look at the DNR bit.  Requests failed with ABORTED_BY_REQUEST
	 * set the DNR bit correctly since the driver controls that.
	 */
	switch (sct) {
	case NVME_SCT_GENERIC:
		switch (sc) {
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_NAMESPACE_NOT_READY:
			if (dnr)
				return (false);
			else
				return (true);
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return (false);
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
		return (false);
	case NVME_SCT_PATH_RELATED:
		switch (sc) {
		case NVME_SC_INTERNAL_PATH_ERROR:
			if (dnr)
				return (false);
			else
				return (true);
		default:
			return (false);
		}
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return (false);
	}
}

static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
    struct nvme_completion *cpl, error_print_t print_on_error)
{
	struct nvme_qpair *qpair = tr->qpair;
	struct nvme_request *req;
	bool retry, error, retriable;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	req = tr->req;
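	/*
	 * Retry only when the completion is an error, the status code is
	 * considered retriable, and the request still has attempts left
	 * (nvme_retry_count is a driver-wide tunable).
	 */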
	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	retry = error && retriable && req->retries < nvme_retry_count;
	if (retry)
		qpair->num_retries++;
	if (error && req->retries >= nvme_retry_count && retriable)
		qpair->num_failures++;

	if (error && (print_on_error == ERROR_PRINT_ALL ||
	    (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	qpair->act_tr[cpl->cid] = NULL;

	KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));

	if (!retry) {
		if (req->payload_valid) {
			bus_dmamap_sync(qpair->dma_tag_payload,
			    tr->payload_dma_map,
			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		}
		if (req->cb_fn)
			req->cb_fn(req->cb_arg, cpl);
	}

	mtx_lock(&qpair->lock);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
	} else {
		if (req->payload_valid) {
			bus_dmamap_unload(qpair->dma_tag_payload,
			    tr->payload_dma_map);
		}

		nvme_free_request(req);
		tr->req = NULL;

		TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			_nvme_qpair_submit_request(qpair, req);
		}
	}

	mtx_unlock(&qpair->lock);
}

static void
nvme_qpair_manual_complete_tracker(
    struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
    error_print_t print_on_error)
{
	struct nvme_completion cpl;
	struct nvme_qpair *qpair = tr->qpair;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	memset(&cpl, 0, sizeof(cpl));

	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
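	/* NVMEF() packs each value into its named field of the status word. */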
	cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
	cpl.status |= NVMEF(NVME_STATUS_SC, sc);
	cpl.status |= NVMEF(NVME_STATUS_DNR, dnr);
	/* M=0 : this is artificial so no data in error log page */
	/* CRD=0 : this is artificial and no delayed retry support anyway */
	/* P=0 : phase not checked */
	nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
}

void
nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
    struct nvme_request *req, uint32_t sct, uint32_t sc)
{
	struct nvme_completion cpl;
	bool error;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.status |= NVMEF(NVME_STATUS_SCT, sct);
	cpl.status |= NVMEF(NVME_STATUS_SC, sc);

	error = nvme_completion_is_error(&cpl);

	if (error) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, &cpl);
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, &cpl);

	nvme_free_request(req);
}

/* Locked version of the completion processor. */
static bool
_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_completion cpl;
	bool done = false;
	bool in_panic = dumping || SCHEDULER_STOPPED();

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * The qpair is not enabled, likely because a controller reset is in
	 * progress.  Ignore the interrupt - any I/O that was associated with
	 * this interrupt will get retried when the reset is complete.  Any
	 * pending completions from when we're in startup will be completed
	 * as soon as initialization is complete and we start sending commands
	 * to the device.
	 */
	if (qpair->recovery_state != RECOVERY_NONE) {
		qpair->num_ignored++;
		return (false);
	}

	/*
	 * Sanity check initialization.  After we reset the hardware, the phase
	 * is defined to be 1.  So if we get here with zero prior calls and the
	 * phase is 0, it means that we've lost a race between the
	 * initialization and the ISR running.  With the phase wrong, we'll
	 * process a bunch of completions that aren't really completions,
	 * leading to the KASSERT below.
	 */
	KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
	    ("%s: Phase wrong for first interrupt call.",
	    device_get_nameunit(qpair->ctrlr->dev)));

	qpair->num_intr_handler_calls++;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	/*
	 * A panic can stop the CPU this routine is running on at any point.  If
	 * we're called during a panic, complete the sq_head wrap protocol for
	 * the case where we are interrupted just after the increment at 1
	 * below, but before we can reset cq_head to zero at 2.  Also cope with
	 * the case where we do the zero at 2, but may or may not have done the
	 * phase adjustment at step 3.  The panic machinery flushes all pending
	 * memory writes, so we can make these strong ordering assumptions
	 * that would otherwise be unwise if we were racing in real time.
	 */
	if (__predict_false(in_panic)) {
		if (qpair->cq_head == qpair->num_entries) {
			/*
			 * Here we know that we need to zero cq_head and then
			 * negate the phase, neither of which has happened
			 * yet: the atomic_store_rel at 2 guarantees cq_head
			 * would already be zero if the phase flip at 3 had
			 * run.
			 */
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		} else if (qpair->cq_head == 0) {
			/*
			 * In this case, we know that the assignment at 2
			 * below happened, but we don't know whether 3 did.
			 * To find out, we look at the last completion entry
			 * and set the phase to the opposite of its phase
			 * bit.  This gets us back in sync.
			 */
			cpl = qpair->cpl[qpair->num_entries - 1];
			nvme_completion_swapbytes(&cpl);
			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
		}
	}

	while (1) {
		uint16_t status;

		/*
		 * We need to do this dance to avoid a race between the host and
		 * the device where the device overtakes the host while the host
		 * is reading this record, leaving the status field 'new' and
		 * the sqhd and cid fields potentially stale.  If the phase
		 * doesn't match, that means status hasn't yet been updated and
		 * we'll get any pending changes next time.  It also means that
		 * the phase must be the same the second time.  We have to sync
		 * before reading to ensure any bouncing completes.
		 */
		status = le16toh(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase)
			break;

		bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		KASSERT(
		    NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
		    ("Phase unexpectedly inconsistent"));

		if (cpl.cid < qpair->num_trackers)
			tr = qpair->act_tr[cpl.cid];
		else
			tr = NULL;

		done = true;
		if (tr != NULL) {
			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
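			/*
			 * The completion records the controller's current
			 * submission queue head, telling us how many SQ
			 * entries the controller has consumed.
			 */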
			qpair->sq_head = cpl.sqhd;
		} else if (!in_panic) {
			/*
			 * A missing tracker is normally an error.  However, a
			 * panic can stop the CPU this routine is running on
			 * after completing an I/O but before updating
			 * qpair->cq_head at 1 below.  Later, we re-enter this
			 * routine to poll I/O associated with the kernel
			 * dump.  We find that the tr has been set to null before
			 * calling the completion routine.  If it hasn't
			 * completed (or it triggers a panic), then '1' below
			 * won't have updated cq_head.  Rather than panic again,
			 * ignore this condition because it's not unexpected.
			 */
			nvme_printf(qpair->ctrlr,
			    "cpl (cid = %u) does not map to outstanding cmd\n",
			    cpl.cid);
			nvme_qpair_print_completion(qpair,
			    &qpair->cpl[qpair->cq_head]);
			KASSERT(0, ("received completion for unknown cmd"));
		}

		/*
		 * There are a number of races with the following (see above)
		 * when the system panics.  We compensate for each one of them
		 * by using the atomic store to force strong ordering (at
		 * least when viewed in the aftermath of a panic).
		 */
		if (++qpair->cq_head == qpair->num_entries) {		/* 1 */
			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
			qpair->phase = !qpair->phase;			/* 3 */
		}
	}

	if (done) {
		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
		    qpair->cq_hdbl_off, qpair->cq_head);
	}

	return (done);
}

bool
nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	bool done;

	/*
	 * Interlock with the reset / recovery code.  This lock is usually
	 * uncontended.  Taking it makes sure that we drain out of the ISRs
	 * before we reset the card and prevents races with the recovery
	 * process called from a timeout context.
	 */
	if (!mtx_trylock(&qpair->recovery)) {
		qpair->num_recovery_nolock++;
		return (false);
	}

	done = _nvme_qpair_process_completions(qpair);

	mtx_unlock(&qpair->recovery);

	return (done);
}

static void
nvme_qpair_msi_handler(void *arg)
{
	struct nvme_qpair *qpair = arg;

	nvme_qpair_process_completions(qpair);
}

int
nvme_qpair_construct(struct nvme_qpair *qpair,
    uint32_t num_entries, uint32_t num_trackers,
    struct nvme_controller *ctrlr)
{
	struct nvme_tracker *tr;
	size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
	uint64_t queuemem_phys, prpmem_phys, list_phys;
	uint8_t *queuemem, *prpmem, *prp_list;
	int i, err;

	qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
	qpair->num_entries = num_entries;
	qpair->num_trackers = num_trackers;
	qpair->ctrlr = ctrlr;

	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
	mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);

	callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
	qpair->timer_armed = false;
	qpair->recovery_state = RECOVERY_WAITING;

	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
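	/*
	 * Payload tag parameters, in bus_dma_tag_create() argument order:
	 * 4-byte alignment, a boundary of page_size so no segment crosses an
	 * NVMe page, a maximum transfer of max_xfer_size split across at
	 * most pages + 1 segments of at most page_size bytes each.
	 */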
	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    4, ctrlr->page_size, BUS_SPACE_MAXADDR,
	    BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
	    ctrlr->page_size, 0,
	    NULL, NULL, &qpair->dma_tag_payload);
	if (err != 0) {
		nvme_printf(ctrlr, "payload tag create failed %d\n", err);
		goto out;
	}

	/*
	 * Each component must be page aligned, and individual PRP lists
	 * cannot cross a page boundary.
	 */
	cmdsz = qpair->num_entries * sizeof(struct nvme_command);
	cmdsz = roundup2(cmdsz, ctrlr->page_size);
	cplsz = qpair->num_entries * sizeof(struct nvme_completion);
	cplsz = roundup2(cplsz, ctrlr->page_size);
	/*
	 * For commands requiring more than 2 PRP entries, one PRP will be
	 * embedded in the command (prp1), and the rest of the PRP entries
	 * will be in a list pointed to by the command (prp2).
	 */
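	/*
	 * Sizing sketch, assuming a 4KB controller page and a 1MB maximum
	 * transfer: howmany(1MB, 4KB) = 256 PRP entries of 8 bytes each,
	 * i.e. 2KB of PRP list memory reserved per tracker.
	 */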
	prpsz = sizeof(uint64_t) *
	    howmany(ctrlr->max_xfer_size, ctrlr->page_size);
	prpmemsz = qpair->num_trackers * prpsz;
	allocsz = cmdsz + cplsz + prpmemsz;

	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
	    ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
	    allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
	if (err != 0) {
		nvme_printf(ctrlr, "tag create failed %d\n", err);
		goto out;
	}
	bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);

	if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
	    BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
		nvme_printf(ctrlr, "failed to alloc qpair memory\n");
		goto out;
	}

	if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
	    queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
		nvme_printf(ctrlr, "failed to load qpair memory\n");
		bus_dmamem_free(qpair->dma_tag, queuemem,
		    qpair->queuemem_map);
		goto out;
	}

	qpair->num_cmds = 0;
	qpair->num_intr_handler_calls = 0;
	qpair->num_retries = 0;
	qpair->num_failures = 0;
	qpair->num_ignored = 0;
	qpair->cmd = (struct nvme_command *)queuemem;
	qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
	prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
	qpair->cmd_bus_addr = queuemem_phys;
	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
	prpmem_phys = queuemem_phys + cmdsz + cplsz;

	/*
	 * Calculate the stride of the doorbell register.  Many emulators set
	 * this value to correspond to a cache line.  However, some hardware
	 * has set it to various small values.
	 */
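	/*
	 * A sketch of the offset math, assuming ctrlr->dstrd holds the log2
	 * of the doorbell stride in bytes (spec minimum 4, i.e. dstrd == 2):
	 * queue N's submission doorbell lands at 0x1000 + N * 2 * stride and
	 * its completion doorbell one stride beyond that.
	 */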
	qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1));
	qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
	    (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);

	TAILQ_INIT(&qpair->free_tr);
	TAILQ_INIT(&qpair->outstanding_tr);
	STAILQ_INIT(&qpair->queued_req);

	list_phys = prpmem_phys;
	prp_list = prpmem;
	for (i = 0; i < qpair->num_trackers; i++) {
		if (list_phys + prpsz > prpmem_phys + prpmemsz) {
			qpair->num_trackers = i;
			break;
		}

		/*
		 * Make sure that the PRP list for this tracker doesn't
		 * overflow to another nvme page.
		 */
		if (trunc_page(list_phys) !=
		    trunc_page(list_phys + prpsz - 1)) {
			list_phys = roundup2(list_phys, ctrlr->page_size);
			prp_list =
			    (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
		}

		tr = malloc_domainset(sizeof(*tr), M_NVME,
		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
		bus_dmamap_create(qpair->dma_tag_payload, 0,
		    &tr->payload_dma_map);
		tr->cid = i;
		tr->qpair = qpair;
		tr->prp = (uint64_t *)prp_list;
		tr->prp_bus_addr = list_phys;
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
		list_phys += prpsz;
		prp_list += prpsz;
	}

	if (qpair->num_trackers == 0) {
		nvme_printf(ctrlr, "failed to allocate enough trackers\n");
		goto out;
	}

	qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
	    qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
	    M_ZERO | M_WAITOK);

	if (ctrlr->msi_count > 1) {
		/*
		 * MSI-X vector resource IDs start at 1, so we add one to
		 * the queue's vector to get the corresponding rid to use.
		 */
		qpair->rid = qpair->vector + 1;

		qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
		    &qpair->rid, RF_ACTIVE);
		if (qpair->res == NULL) {
			nvme_printf(ctrlr, "unable to allocate MSI\n");
			goto out;
		}
		if (bus_setup_intr(ctrlr->dev, qpair->res,
		    INTR_TYPE_MISC | INTR_MPSAFE, NULL,
		    nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
			nvme_printf(ctrlr, "unable to setup MSI\n");
			goto out;
		}
		if (qpair->id == 0) {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "admin");
		} else {
			bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
			    "io%d", qpair->id - 1);
		}
	}

	return (0);

out:
	nvme_qpair_destroy(qpair);
	return (ENOMEM);
}

static void
nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	mtx_lock(&qpair->recovery);
	qpair->timer_armed = false;
	mtx_unlock(&qpair->recovery);
	callout_drain(&qpair->timer);

	if (qpair->tag) {
		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
		qpair->tag = NULL;
	}

	if (qpair->act_tr) {
		free(qpair->act_tr, M_NVME);
		qpair->act_tr = NULL;
	}

	while (!TAILQ_EMPTY(&qpair->free_tr)) {
		tr = TAILQ_FIRST(&qpair->free_tr);
		TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
		bus_dmamap_destroy(qpair->dma_tag_payload,
		    tr->payload_dma_map);
		free(tr, M_NVME);
	}

	if (qpair->cmd != NULL) {
		bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		qpair->cmd = NULL;
	}

	if (qpair->dma_tag) {
		bus_dma_tag_destroy(qpair->dma_tag);
		qpair->dma_tag = NULL;
	}

	if (qpair->dma_tag_payload) {
		bus_dma_tag_destroy(qpair->dma_tag_payload);
		qpair->dma_tag_payload = NULL;
	}

	if (mtx_initialized(&qpair->lock))
		mtx_destroy(&qpair->lock);
	if (mtx_initialized(&qpair->recovery))
		mtx_destroy(&qpair->recovery);

	if (qpair->res) {
		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(qpair->res), qpair->res);
		qpair->res = NULL;
	}
}

static void
nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	/*
	 * nvme_qpair_complete_tracker must be called without the qpair lock
	 * held.  It takes the lock to adjust the outstanding_tr list, so make
	 * sure we don't have it yet.  We need the lock to make the list
	 * traverse safe, but have to drop the lock to complete any AER.  We
	 * restart the list scan when we do this to make this safe.  There's
	 * interlock with the ISR so we know this tracker won't be completed
	 * twice.
	 */
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	mtx_lock(&qpair->lock);
	tr = TAILQ_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
			tr = TAILQ_NEXT(tr, tailq);
			continue;
		}
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
		    ERROR_PRINT_NONE);
		mtx_lock(&qpair->lock);
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
	}
	mtx_unlock(&qpair->lock);
}

void
nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
{
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	nvme_admin_qpair_abort_aers(qpair);
	nvme_qpair_destroy(qpair);
}

void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{

	nvme_qpair_destroy(qpair);
}

static void
nvme_abort_complete(void *arg, const struct nvme_completion *status)
{
	struct nvme_tracker *tr = arg;

	/*
	 * If cdw0 == 1, the controller was not able to abort the command
	 * we requested.  We still need to check the active tracker array,
	 * to cover the race where the I/O timed out at the same time the
	 * controller was completing it.
	 */
	if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
		/*
		 * An I/O has timed out, and the controller was unable to
		 * abort it for some reason.  Construct a fake completion
		 * status, and then complete the I/O's tracker manually.
		 */
		nvme_printf(tr->qpair->ctrlr,
		    "abort command failed, aborting command manually\n");
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
	}
}

static void
nvme_qpair_timeout(void *arg)
{
	struct nvme_qpair *qpair = arg;
	struct nvme_controller *ctrlr = qpair->ctrlr;
	struct nvme_tracker *tr;
	sbintime_t now;
	bool idle = false;
	bool needs_reset;
	uint32_t csts;
	uint8_t cfs;

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * If the controller is failed, then stop polling.  This ensures that
	 * any failure processing that races with the qpair timeout will fail
	 * safely.
	 */
	if (qpair->ctrlr->is_failed) {
		nvme_printf(qpair->ctrlr,
		    "Failed controller, stopping watchdog timeout.\n");
		qpair->timer_armed = false;
		return;
	}

	/*
	 * Shutdown condition: we set qpair->timer_armed to false in
	 * nvme_qpair_destroy before calling callout_drain.  When we call that,
	 * this routine might get called one last time.  Exit without setting a
	 * timeout; none of the watchdog work needs to be done since we're
	 * destroying the qpair.
	 */
	if (!qpair->timer_armed) {
		nvme_printf(qpair->ctrlr,
		    "Timeout fired during nvme_qpair_destroy\n");
		return;
	}

	switch (qpair->recovery_state) {
	case RECOVERY_NONE:
		/*
		 * Read csts to get the value of cfs - the controller fatal
		 * status.  If we are in the hot-plug or controller-failed
		 * state, proceed directly to reset.  We also bail early if
		 * the status reads all 1's or the controller fatal status
		 * bit is now 1.  The latter is always true when the former
		 * is true, but not vice versa.  The intent of the code is
		 * that if the card is gone (all 1's) or we've failed, then
		 * try to do a reset (which sometimes unwedges a card reading
		 * all 1's that's not gone away, but usually doesn't).
		 */
		csts = nvme_mmio_read_4(ctrlr, csts);
		cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
		if (csts == NVME_GONE || cfs == 1)
			goto do_reset;

		/*
		 * Process completions.  We already have the recovery lock, so
		 * call the locked version.
		 */
		_nvme_qpair_process_completions(qpair);

		/*
		 * Check to see if we need to time out any commands.  If we
		 * do, then we also enter a recovery phase.
		 */
		now = getsbinuptime();
		needs_reset = false;
		idle = true;
		mtx_lock(&qpair->lock);
		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
			/*
			 * Skip async commands, they are posted to the card for
			 * an indefinite amount of time and have no deadline.
			 */
			if (tr->deadline == SBT_MAX)
				continue;
			if (now > tr->deadline) {
				if (tr->req->cb_fn != nvme_abort_complete &&
				    ctrlr->enable_aborts) {
					/*
					 * This isn't an abort command, ask
					 * for a hardware abort.
					 */
					nvme_ctrlr_cmd_abort(ctrlr, tr->cid,
					    qpair->id, nvme_abort_complete, tr);
				} else {
					/*
					 * Otherwise we have a live command in
					 * the card (either one we couldn't
					 * abort, or aborts weren't enabled).
					 * The only safe way to proceed is to
					 * do a reset.
					 */
					needs_reset = true;
				}
			} else {
				idle = false;
			}
		}
		mtx_unlock(&qpair->lock);
		if (!needs_reset)
			break;

		/*
		 * We've had a command timeout that we weren't able to abort.
		 *
		 * If we get here due to a possible surprise hot-unplug event,
		 * then we let nvme_ctrlr_reset confirm and fail the
		 * controller.
		 */
	do_reset:
		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
		    (csts == 0xffffffff) ? " and possible hot unplug" :
		    (cfs ? " and fatal error status" : ""));
		qpair->recovery_state = RECOVERY_WAITING;
		nvme_ctrlr_reset(ctrlr);
		idle = false;			/* We want to keep polling */
		break;
	case RECOVERY_WAITING:
		/*
		 * These messages aren't interesting while we're suspended.
		 * We put the queues into the waiting state while suspending.
		 * Suspending takes a while, so we'll see these during that
		 * time and they aren't diagnostic.  At other times, they
		 * indicate a problem that's worth complaining about.
		 */
		if (!device_is_suspended(ctrlr->dev))
			nvme_printf(ctrlr, "Waiting for reset to complete\n");
		idle = false;			/* We want to keep polling */
		break;
	}

	/*
	 * Rearm the timeout: poll again in half a second (with half a second
	 * of slop) while commands remain outstanding or a reset is pending;
	 * otherwise let the watchdog go idle.
	 */
	if (!idle) {
		callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
	} else {
		qpair->timer_armed = false;
	}
}

/*
 * Submit the tracker to the hardware.  Must already be in the
 * outstanding queue when called.
 */
void
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_controller *ctrlr;
	int timeout;

	mtx_assert(&qpair->lock, MA_OWNED);

	req = tr->req;
	req->cmd.cid = tr->cid;
	qpair->act_tr[tr->cid] = tr;
	ctrlr = qpair->ctrlr;

	if (req->timeout) {
		if (req->cb_fn == nvme_completion_poll_cb)
			timeout = 1;
		else if (qpair->id == 0)
			timeout = ctrlr->admin_timeout_period;
		else
			timeout = ctrlr->timeout_period;
		tr->deadline = getsbinuptime() + timeout * SBT_1S;
		if (!qpair->timer_armed) {
			qpair->timer_armed = true;
			callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
			    nvme_qpair_timeout, qpair, qpair->cpu, 0);
		}
	} else
		tr->deadline = SBT_MAX;

	/* Copy the command from the tracker to the submission queue. */
	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));

	if (++qpair->sq_tail == qpair->num_entries)
		qpair->sq_tail = 0;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	bus_space_write_4(ctrlr->bus_tag, ctrlr->bus_handle,
	    qpair->sq_tdbl_off, qpair->sq_tail);
	qpair->num_cmds++;
}

static void
nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
{
	struct nvme_tracker *tr = arg;
	uint32_t cur_nseg;

	/*
	 * If the mapping operation failed, return immediately.  The caller
	 * is responsible for detecting the error status and failing the
	 * tracker manually.
	 */
	if (error != 0) {
		nvme_printf(tr->qpair->ctrlr,
		    "nvme_payload_map err %d\n", error);
		return;
	}

	/*
	 * Note that we specified ctrlr->page_size for alignment and max
	 * segment size when creating the bus dma tags.  So here we can safely
	 * just transfer each segment to its associated PRP entry.
	 */
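	/*
	 * For example (hypothetical three-segment transfer): prp1 holds
	 * segment 0's address, prp2 points at this tracker's PRP list, and
	 * the list entries hold the addresses of segments 1 and 2.
	 */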
	tr->req->cmd.prp1 = htole64(seg[0].ds_addr);

	if (nseg == 2) {
		tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
	} else if (nseg > 2) {
		cur_nseg = 1;
		tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
		while (cur_nseg < nseg) {
			tr->prp[cur_nseg - 1] =
			    htole64((uint64_t)seg[cur_nseg].ds_addr);
			cur_nseg++;
		}
	} else {
		/*
		 * prp2 should not be used by the controller
		 * since there is only one segment, but set
		 * it to 0 just to be safe.
		 */
		tr->req->cmd.prp2 = 0;
	}

	bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	nvme_qpair_submit_tracker(tr->qpair, tr);
}

static void
_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int err = 0;

	mtx_assert(&qpair->lock, MA_OWNED);

	tr = TAILQ_FIRST(&qpair->free_tr);
	req->qpair = qpair;

	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
		/*
		 * No tracker is available, or the qpair is disabled due to an
		 * in-progress controller-level reset.  If we lose the race
		 * with recovery_state, then we may add an extra request to
		 * the queue which will be resubmitted later.  We only set
		 * recovery_state to NONE with qpair->lock also held, so if we
		 * observe that the state is not NONE, we know it can't
		 * transition to NONE below when we've submitted the request
		 * to hardware.
		 *
		 * Also, as part of the failure process, we set recovery_state
		 * to RECOVERY_WAITING, so we check here to see if we've failed
		 * the controller.  We set it before we call the qpair_fail
		 * functions, which take out the lock before messing with
		 * queued_req.  Since we hold that lock, we know it's safe to
		 * either fail directly, or queue the failure should is_failed
		 * be stale.  If we lose the race reading is_failed, then
		 * nvme_qpair_fail will fail the queued request.
		 */

		if (qpair->ctrlr->is_failed) {
			/*
			 * The controller has failed, so fail the request.
			 */
			nvme_qpair_manual_complete_request(qpair, req,
			    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
		} else {
			/*
			 * Put the request on the qpair's request queue to be
			 * processed when a tracker frees up via a command
			 * completion or when the controller reset is
			 * completed.
			 */
			STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		}
		return;
	}

	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
	tr->deadline = SBT_MAX;
	tr->req = req;

	if (!req->payload_valid) {
		nvme_qpair_submit_tracker(tr->qpair, tr);
		return;
	}

	/*
	 * tr->deadline is updated when nvme_payload_map calls
	 * nvme_qpair_submit_tracker (we call it directly above
	 * when there's no payload to map).
	 */
	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
	if (err != 0) {
		/*
		 * The dmamap operation failed, so we manually fail the
		 * tracker here with DATA_TRANSFER_ERROR status.
		 *
		 * nvme_qpair_manual_complete_tracker must not be called
		 * with the qpair lock held.
		 */
		nvme_printf(qpair->ctrlr,
		    "bus_dmamap_load_mem returned 0x%x!\n", err);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}
}

void
nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{

	mtx_lock(&qpair->lock);
	_nvme_qpair_submit_request(qpair, req);
	mtx_unlock(&qpair->lock);
}

static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
	if (mtx_initialized(&qpair->recovery))
		mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
		mtx_assert(&qpair->lock, MA_OWNED);
	KASSERT(!qpair->ctrlr->is_failed,
	    ("Enabling a failed qpair\n"));

	qpair->recovery_state = RECOVERY_NONE;
}

void
nvme_qpair_reset(struct nvme_qpair *qpair)
{

	qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * The first time through the completion queue, HW will set the phase
	 * bit on completions to 1.  So set this to 1 here, indicating we're
	 * looking for a 1 to know which entries have completed.  We'll toggle
	 * the bit each time the completion queue rolls over.
	 */
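	/*
	 * Example with a 4-entry queue: the first pass of completions is
	 * written with P=1; after cq_head wraps back to 0 the controller
	 * writes the next pass with P=0, so the expected bit flips on every
	 * wrap.
	 */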
	qpair->phase = 1;

	memset(qpair->cmd, 0,
	    qpair->num_entries * sizeof(struct nvme_command));
	memset(qpair->cpl, 0,
	    qpair->num_entries * sizeof(struct nvme_completion));
}

void
nvme_admin_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_tracker *tr_temp;
	bool rpt;

	/*
	 * Manually abort each outstanding admin command.  Do not retry
	 * admin commands found here, since they will be left over from
	 * a controller reset and it's likely the context in which the
	 * command was issued no longer applies.
	 */
	rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "aborting outstanding admin command\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
	}
	if (rpt)
		nvme_printf(qpair->ctrlr,
		    "done aborting outstanding admin\n");

	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);
	nvme_qpair_enable(qpair);
	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

void
nvme_io_qpair_enable(struct nvme_qpair *qpair)
{
	STAILQ_HEAD(, nvme_request) temp;
	struct nvme_tracker *tr;
	struct nvme_tracker *tr_temp;
	struct nvme_request *req;
	bool report;

	/*
	 * Manually abort each outstanding I/O.  This normally results in a
	 * retry, unless the retry count on the associated request has
	 * reached its limit.
	 */
	report = !TAILQ_EMPTY(&qpair->outstanding_tr);
	if (report)
		nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");

	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);
	nvme_qpair_enable(qpair);

	STAILQ_INIT(&temp);
	STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);

	report = !STAILQ_EMPTY(&temp);
	if (report)
		nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
	while (!STAILQ_EMPTY(&temp)) {
		req = STAILQ_FIRST(&temp);
		STAILQ_REMOVE_HEAD(&temp, stailq);
		nvme_qpair_print_command(qpair, &req->cmd);
		_nvme_qpair_submit_request(qpair, req);
	}
	if (report)
		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");

	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

static void
nvme_qpair_disable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *tr_temp;

	if (mtx_initialized(&qpair->recovery))
		mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
		mtx_assert(&qpair->lock, MA_OWNED);

	qpair->recovery_state = RECOVERY_WAITING;
	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
		tr->deadline = SBT_MAX;
	}
}

void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);

	mtx_lock(&qpair->lock);
	nvme_qpair_disable(qpair);
	mtx_unlock(&qpair->lock);

	nvme_admin_qpair_abort_aers(qpair);

	mtx_unlock(&qpair->recovery);
}

void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);

	nvme_qpair_disable(qpair);

	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}

void
nvme_qpair_fail(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_request *req;

	if (!mtx_initialized(&qpair->lock))
		return;

	mtx_lock(&qpair->lock);

	if (!STAILQ_EMPTY(&qpair->queued_req)) {
		nvme_printf(qpair->ctrlr, "failing queued i/o\n");
	}
	while (!STAILQ_EMPTY(&qpair->queued_req)) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST);
		mtx_lock(&qpair->lock);
	}

	if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
		nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
	}
	/* Manually abort each outstanding I/O. */
	while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
		/*
		 * Do not remove the tracker.  The abort_tracker path will
		 * do that for us.
		 */
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}

	mtx_unlock(&qpair->lock);
}