// Low level NVMe disk access
//
// Copyright 2017 Amazon.com, Inc. or its affiliates.
//
// This file may be distributed under the terms of the GNU LGPLv3 license.

#include "blockcmd.h"
#include "malloc.h" // malloc_high
#include "output.h" // dprintf
#include "pci.h"
#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME
#include "pci_regs.h" // PCI_BASE_ADDRESS_0
#include "pcidevice.h" // foreachpci
#include "stacks.h" // yield
#include "std/disk.h" // DISK_RET_
#include "string.h" // memset
#include "util.h" // boot_add_hd
#include "x86.h" // readl

#include "nvme.h"
#include "nvme-int.h"

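/* Allocate a zeroed, page-aligned buffer from the given memory zone. */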
static void *
zalloc_page_aligned(struct zone_s *zone, u32 size)
{
    void *res = _malloc(zone, size, NVME_PAGE_SIZE);
    if (res) memset(res, 0, size);
    return res;
}

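/* Initialize the fields common to submission and completion queues and
   compute the queue's doorbell register address. */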
static void
nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx,
                       u16 length)
{
    memset(q, 0, sizeof(*q));
    q->dbl = (u32 *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride);
    dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl);
    q->mask = length - 1;
}

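/* Allocate and initialize a submission queue. Returns 0 on success. */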
static int
nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length,
             struct nvme_cq *cq)
{
    nvme_init_queue_common(ctrl, &sq->common, q_idx, length);
    sq->sqe = zalloc_page_aligned(&ZoneHigh, sizeof(*sq->sqe) * length);

    if (!sq->sqe) {
        warn_noalloc();
        return -1;
    }

    dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe);
    sq->cq   = cq;
    sq->head = 0;
    sq->tail = 0;

    return 0;
}

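/* Allocate and initialize a completion queue. Returns 0 on success. */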
static int
nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length)
{
    nvme_init_queue_common(ctrl, &cq->common, q_idx, length);
    cq->cqe = zalloc_page_aligned(&ZoneHigh, sizeof(*cq->cqe) * length);
    if (!cq->cqe) {
        warn_noalloc();
        return -1;
    }

    cq->head = 0;

    /* All CQE phase bits are initialized to zero. This means initially we wait
       for the host controller to set these to 1. */
    cq->phase = 1;

    return 0;
}

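/* Returns true if a new completion entry is ready at the queue head. */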
static int
nvme_poll_cq(struct nvme_cq *cq)
{
    u32 dw3 = readl(&cq->cqe[cq->head].dword[3]);
    return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase);
}

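/* Returns true if the completion queue entry reports a successful status code. */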
static int
nvme_is_cqe_success(struct nvme_cqe const *cqe)
{
    return ((cqe->status >> 1) & 0xFF) == 0;
}

static struct nvme_cqe
nvme_error_cqe(void)
{
    struct nvme_cqe r;

    /* 0xFF is a vendor specific status code != success. Should be okay for
       indicating failure. */
    memset(&r, 0xFF, sizeof(r));
    return r;
}

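/* Consume the completion entry at the queue head, advance the head (toggling
   the phase on wrap-around) and ring the completion doorbell. */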
static struct nvme_cqe
nvme_consume_cqe(struct nvme_sq *sq)
{
    struct nvme_cq *cq = sq->cq;

    if (!nvme_poll_cq(cq)) {
        /* Cannot consume a completion queue entry if there is none ready. */
        return nvme_error_cqe();
    }

    struct nvme_cqe *cqe = &cq->cqe[cq->head];
    u16 cq_next_head = (cq->head + 1) & cq->common.mask;
    dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
    if (cq_next_head < cq->head) {
        dprintf(3, "cq %p wrap\n", cq);
        cq->phase = ~cq->phase;
    }
    cq->head = cq_next_head;

    /* Update the submission queue head. */
    if (cqe->sq_head != sq->head) {
        sq->head = cqe->sq_head;
        dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
    }

    /* Tell the controller that we consumed the completion. */
    writel(cq->common.dbl, cq->head);

    return *cqe;
}

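/* Wait for a completion to arrive on the submission queue's completion queue
   and consume it. Returns an error CQE on timeout. */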
static struct nvme_cqe
nvme_wait(struct nvme_sq *sq)
{
    static const unsigned nvme_timeout = 5000 /* ms */;
    u32 to = timer_calc(nvme_timeout);
    while (!nvme_poll_cq(sq->cq)) {
        yield();

        if (timer_check(to)) {
            warn_timeout();
            return nvme_error_cqe();
        }
    }

    return nvme_consume_cqe(sq);
}

/* Returns the next submission queue entry (or NULL if the queue is full). It
   also fills out Command Dword 0 and clears the rest. */
static struct nvme_sqe *
nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data)
{
    if (((sq->head + 1) & sq->common.mask) == sq->tail) {
        dprintf(3, "submission queue is full\n");
        return NULL;
    }

    struct nvme_sqe *sqe = &sq->sqe[sq->tail];
    dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail);

    memset(sqe, 0, sizeof(*sqe));
    sqe->cdw0 = opc | (sq->tail << 16 /* CID */);
    sqe->mptr = (u32)metadata;
    sqe->dptr_prp1 = (u32)data;

    if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) {
        /* Data buffer not page aligned. */
        warn_internalerror();
    }

    return sqe;
}

/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */
static void
nvme_commit_sqe(struct nvme_sq *sq)
{
    dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail);
    sq->tail = (sq->tail + 1) & sq->common.mask;
    writel(sq->common.dbl, sq->tail);
}

/* Perform an identify command on the admin queue and return the resulting
   buffer. This may be a NULL pointer if something failed. This function
   cannot be used after initialization, because it uses buffers in the tmp zone. */
static union nvme_identify *
nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid)
{
    union nvme_identify *identify_buf = zalloc_page_aligned(&ZoneTmpHigh, 4096);
    if (!identify_buf) {
        /* Could not allocate identify buffer. */
        warn_internalerror();
        return NULL;
    }

    struct nvme_sqe *cmd_identify;
    cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq,
                                     NVME_SQE_OPC_ADMIN_IDENTIFY, NULL,
                                     identify_buf);

    if (!cmd_identify) {
        warn_internalerror();
        goto error;
    }

    cmd_identify->nsid = nsid;
    cmd_identify->dword[10] = cns;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        goto error;
    }

    return identify_buf;
 error:
    free(identify_buf);
    return NULL;
}

static struct nvme_identify_ctrl *
nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl;
}

static struct nvme_identify_ns *
nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS,
                                ns_id)->ns;
}

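/* Identify a namespace, fill in the namespace structure and, if it is usable,
   register it as a boot drive. */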
static void
nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id)
{
    ns->ctrl  = ctrl;
    ns->ns_id = ns_id;

    struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id);
    if (!id) {
        dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id);
        goto free_buffer;
    }

    u8 current_lba_format = id->flbas & 0xF;
    if (current_lba_format > id->nlbaf) {
        dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the "
                "namespace supports (%u)?\n",
                ns_id, current_lba_format, id->nlbaf + 1);
        goto free_buffer;
    }

    ns->lba_count = id->nsze;
    if (!ns->lba_count) {
        dprintf(2, "NVMe NS %u is inactive.\n", ns_id);
        goto free_buffer;
    }

    struct nvme_lba_format *fmt = &id->lbaf[current_lba_format];

    ns->block_size    = 1U << fmt->lbads;
    ns->metadata_size = fmt->ms;

    if (ns->block_size > NVME_PAGE_SIZE) {
        /* If we see devices that trigger this path, we need to increase our
           buffer size. */
        warn_internalerror();
        goto free_buffer;
    }

    ns->drive.cntl_id   = ns - ctrl->ns;
    ns->drive.removable = 0;
    ns->drive.type      = DTYPE_NVME;
    ns->drive.blksize   = ns->block_size;
    ns->drive.sectors   = ns->lba_count;

    ns->dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE);

    char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte "
                          "blocks + %u-byte metadata)\n",
                          ns_id, (ns->lba_count * ns->block_size) >> 20,
                          ns->lba_count, ns->block_size, ns->metadata_size);

    dprintf(3, "%s", desc);
    boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci));

free_buffer:
    free(id);
}


/* Release memory allocated for a completion queue */
static void
nvme_destroy_cq(struct nvme_cq *cq)
{
    free(cq->cqe);
    cq->cqe = NULL;
}

/* Release memory allocated for a submission queue */
static void
nvme_destroy_sq(struct nvme_sq *sq)
{
    free(sq->sqe);
    sq->sqe = NULL;
}

/* Returns 0 on success. */
static int
nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx)
{
    int rc;
    struct nvme_sqe *cmd_create_cq;
    u32 length = 1 + (ctrl->reg->cap & 0xffff);
    if (length > NVME_PAGE_SIZE / sizeof(struct nvme_cqe))
        length = NVME_PAGE_SIZE / sizeof(struct nvme_cqe);

    rc = nvme_init_cq(ctrl, cq, q_idx, length);
    if (rc) {
        goto err;
    }

    cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL,
                                      cq->cqe);
    if (!cmd_create_cq) {
        goto err_destroy_cq;
    }

    cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1);
    cmd_create_cq->dword[11] = 1 /* physically contiguous */;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io cq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        goto err_destroy_cq;
    }

    return 0;

err_destroy_cq:
    nvme_destroy_cq(cq);
err:
    return -1;
}

/* Returns 0 on success. */
static int
nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq)
{
    int rc;
    struct nvme_sqe *cmd_create_sq;
    u32 length = 1 + (ctrl->reg->cap & 0xffff);
    if (length > NVME_PAGE_SIZE / sizeof(struct nvme_cqe))
        length = NVME_PAGE_SIZE / sizeof(struct nvme_cqe);

    rc = nvme_init_sq(ctrl, sq, q_idx, length, cq);
    if (rc) {
        goto err;
    }

    cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL,
                                      sq->sqe);
    if (!cmd_create_sq) {
        goto err_destroy_sq;
    }

    cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1);
    cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */;
    dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq,
            cmd_create_sq->dword[10], cmd_create_sq->dword[11]);

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io sq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
        goto err_destroy_sq;
    }

    return 0;

err_destroy_sq:
    nvme_destroy_sq(sq);
err:
    return -1;
}

/* Reads or writes count sectors from/to buf. Returns DISK_RET_*. The buffer
   cannot cross page boundaries. */
static int
nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count,
                  int write)
{
    u32 buf_addr = (u32)buf;

    if ((buf_addr & 0x3) ||
        ((buf_addr & ~(NVME_PAGE_SIZE - 1)) !=
         ((buf_addr + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1)))) {
        /* Buffer is misaligned or crosses a page boundary. */
        warn_internalerror();
        return DISK_RET_EBADTRACK;
    }

    struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq,
                                                 write ? NVME_SQE_OPC_IO_WRITE
                                                       : NVME_SQE_OPC_IO_READ,
                                                 NULL, buf);
    io_read->nsid = ns->ns_id;
    io_read->dword[10] = (u32)lba;
    io_read->dword[11] = (u32)(lba >> 32);
    io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1);

    nvme_commit_sqe(&ns->ctrl->io_sq);

    struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "read io: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        return DISK_RET_EBADTRACK;
    }

    return DISK_RET_SUCCESS;
}

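/* Create the I/O completion/submission queue pair used for disk transfers.
   Returns 0 on success. */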
static int
nvme_create_io_queues(struct nvme_ctrl *ctrl)
{
    if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3))
        goto err;

    if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq))
        goto err_free_cq;

    return 0;

 err_free_cq:
    nvme_destroy_cq(&ctrl->io_cq);
 err:
    return -1;
}

static void
nvme_destroy_io_queues(struct nvme_ctrl *ctrl)
{
    nvme_destroy_sq(&ctrl->io_sq);
    nvme_destroy_cq(&ctrl->io_cq);
}

/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */
static int
nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy)
{
    u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU);
    u32 to = timer_calc(max_to);
    u32 csts;

    while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) {
        yield();

        if (csts & NVME_CSTS_FATAL) {
            dprintf(3, "NVMe fatal error during controller shutdown\n");
            return -1;
        }

        if (timer_check(to)) {
            warn_timeout();
            return -1;
        }
    }

    return 0;
}

/* Returns 0 on success. */
static int
nvme_controller_enable(struct nvme_ctrl *ctrl)
{
    int rc;

    pci_enable_busmaster(ctrl->pci);

    /* Turn the controller off. */
    ctrl->reg->cc = 0;
    if (nvme_wait_csts_rdy(ctrl, 0)) {
        dprintf(2, "NVMe fatal error during controller shutdown\n");
        return -1;
    }

    ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF);

    rc = nvme_init_cq(ctrl, &ctrl->admin_cq, 1,
                      NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
    if (rc) {
        return -1;
    }

    rc = nvme_init_sq(ctrl, &ctrl->admin_sq, 0,
                      NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq);
    if (rc) {
        goto err_destroy_admin_cq;
    }

    ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16
        | ctrl->admin_sq.common.mask;

    ctrl->reg->asq = (u32)ctrl->admin_sq.sqe;
    ctrl->reg->acq = (u32)ctrl->admin_cq.cqe;

    dprintf(3, "  admin submission queue: %p\n", ctrl->admin_sq.sqe);
    dprintf(3, "  admin completion queue: %p\n", ctrl->admin_cq.cqe);

    ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20)
        | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */);

    if (nvme_wait_csts_rdy(ctrl, 1)) {
        dprintf(2, "NVMe fatal error while enabling controller\n");
        goto err_destroy_admin_sq;
    }

    /* The admin queue is set up and the controller is ready. Let's figure out
       what namespaces we have. */

    struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl);

    if (!identify) {
        dprintf(2, "NVMe couldn't identify controller.\n");
        goto err_destroy_admin_sq;
    }

    dprintf(3, "NVMe has %u namespace%s.\n",
            identify->nn, (identify->nn == 1) ? "" : "s");

    ctrl->ns_count = identify->nn;
    free(identify);

    if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) {
        /* No point in continuing if the controller says it doesn't have
           namespaces or we couldn't create I/O queues. */
        goto err_destroy_admin_sq;
    }

    ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count);
    if (!ctrl->ns) {
        warn_noalloc();
        goto err_destroy_ioq;
    }
    memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count);

    /* Populate namespace IDs */
    int ns_idx;
    for (ns_idx = 0; ns_idx < ctrl->ns_count; ns_idx++) {
        nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_idx + 1);
    }

    dprintf(3, "NVMe initialization complete!\n");
    return 0;

 err_destroy_ioq:
    nvme_destroy_io_queues(ctrl);
 err_destroy_admin_sq:
    nvme_destroy_sq(&ctrl->admin_sq);
 err_destroy_admin_cq:
    nvme_destroy_cq(&ctrl->admin_cq);
    return -1;
}

/* Initialize an NVMe controller and detect its drives. */
static void
nvme_controller_setup(void *opaque)
{
    struct pci_device *pci = opaque;

    struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0);
    if (!reg)
        return;

    u32 version = reg->vs;
    dprintf(3, "Found NVMe controller with version %u.%u.%u.\n",
            version >> 16, (version >> 8) & 0xFF, version & 0xFF);
    dprintf(3, "  Capabilities %016llx\n", reg->cap);

    if (~reg->cap & NVME_CAP_CSS_NVME) {
        dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n");
        goto err;
    }

    struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl));
    if (!ctrl) {
        warn_noalloc();
        goto err;
    }

    memset(ctrl, 0, sizeof(*ctrl));

    ctrl->reg = reg;
    ctrl->pci = pci;

    if (nvme_controller_enable(ctrl)) {
        goto err_free_ctrl;
    }

    return;

 err_free_ctrl:
    free(ctrl);
 err:
    dprintf(2, "Failed to enable NVMe controller.\n");
}

// Locate and init NVMe controllers
static void
nvme_scan(void)
{
    // Scan PCI bus for NVMe adapters
    struct pci_device *pci;

    foreachpci(pci) {
        if (pci->class != PCI_CLASS_STORAGE_NVME)
            continue;
        if (pci->prog_if != 2 /* as of NVM 1.0e */) {
            dprintf(3, "Found incompatible NVMe: prog-if=%02x\n", pci->prog_if);
            continue;
        }

        run_thread(nvme_controller_setup, pci);
    }
}

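/* Transfer op->count sectors in page-sized chunks, bouncing the data through
   the namespace's page-aligned DMA buffer so nvme_io_readwrite's alignment
   constraints are met. */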
static int
nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write)
{
    int res = DISK_RET_SUCCESS;
    u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
    u16 i;

    for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
        u16 blocks_remaining = op->count - i;
        u16 blocks = blocks_remaining < max_blocks ? blocks_remaining
                                                   : max_blocks;
        char *op_buf = op->buf_fl + i * ns->block_size;

        if (write) {
            memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
        }

        res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
        dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
                                                                  : "read",
                op->lba + i, blocks, res);

        if (!write && res == DISK_RET_SUCCESS) {
            memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
        }

        i += blocks;
    }

    return res;
}

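/* Dispatch a disk_op_s request to the appropriate NVMe handler. */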
int
nvme_process_op(struct disk_op_s *op)
{
    if (!CONFIG_NVME)
        return DISK_RET_SUCCESS;

    struct nvme_namespace *ns = container_of(op->drive_fl, struct nvme_namespace,
                                             drive);

    switch (op->command) {
    case CMD_READ:
    case CMD_WRITE:
        return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
    default:
        return default_process_op(op);
    }
}

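/* Top-level NVMe initialization: scan the PCI bus and set up any controllers
   found. */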
void
nvme_setup(void)
{
    ASSERT32FLAT();
    if (!CONFIG_NVME)
        return;

    dprintf(3, "init nvme\n");
    nvme_scan();
}

/* EOF */