1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2009-2012 Spectra Logic Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions, and the following disclaimer,
12 * without modification.
13 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
14 * substantially similar to the "NO WARRANTY" disclaimer below
15 * ("Disclaimer") and any redistribution must be conditioned upon
16 * including a substantially similar Disclaimer requirement for further
17 * binary redistribution.
18 *
19 * NO WARRANTY
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
29 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGES.
31 *
32 * Authors: Justin T. Gibbs (Spectra Logic Corporation)
33 * Ken Merry (Spectra Logic Corporation)
34 */
35 #include <sys/cdefs.h>
36 /**
37 * \file blkback.c
38 *
39 * \brief Device driver supporting the vending of block storage from
40 * a FreeBSD domain to other domains.
41 */
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/malloc.h>
47
48 #include <sys/bio.h>
49 #include <sys/bus.h>
50 #include <sys/conf.h>
51 #include <sys/devicestat.h>
52 #include <sys/disk.h>
53 #include <sys/fcntl.h>
54 #include <sys/filedesc.h>
55 #include <sys/kdb.h>
56 #include <sys/module.h>
57 #include <sys/namei.h>
58 #include <sys/proc.h>
59 #include <sys/rman.h>
60 #include <sys/taskqueue.h>
61 #include <sys/types.h>
62 #include <sys/vnode.h>
63 #include <sys/mount.h>
64 #include <sys/sysctl.h>
65 #include <sys/bitstring.h>
66 #include <sys/sdt.h>
67
68 #include <geom/geom.h>
69
70 #include <machine/_inttypes.h>
71
72 #include <vm/vm.h>
73 #include <vm/vm_extern.h>
74 #include <vm/vm_kern.h>
75
76 #include <xen/xen-os.h>
77 #include <xen/blkif.h>
78 #include <xen/gnttab.h>
79 #include <xen/xen_intr.h>
80
81 #include <contrib/xen/event_channel.h>
82 #include <contrib/xen/grant_table.h>
83
84 #include <xen/xenbus/xenbusvar.h>
85
86 /*--------------------------- Compile-time Tunables --------------------------*/
87 /**
88 * The maximum number of shared memory ring pages we will allow in a
89 * negotiated block-front/back communication channel. Allow enough
90 * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
91 */
92 #define XBB_MAX_RING_PAGES 32
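/*
 * Illustrative sizing note (assuming 4 KiB pages): 32 ring pages allow a
 * negotiated shared ring of up to 32 * 4096 bytes = 128 KiB.
 */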
93
94 /**
95 * The maximum number of outstanding request blocks (request headers plus
96 * additional segment blocks) we will allow in a negotiated block-front/back
97 * communication channel.
98 */
99 #define XBB_MAX_REQUESTS \
100 __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES)
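/*
 * Note: __CONST_RING_SIZE() evaluates, at compile time, roughly how many
 * request/response slots fit in the given number of bytes of shared ring,
 * rounded down to a power of two; the exact count therefore depends on
 * PAGE_SIZE and on the size of the blkif request/response entries.
 */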
101
102 /**
103 * \brief Define to enable rudimentary request logging to the console.
104 */
105 #undef XBB_DEBUG
106
107 /*---------------------------------- Macros ----------------------------------*/
108 /**
109 * Custom malloc type for all driver allocations.
110 */
111 static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
112
113 #ifdef XBB_DEBUG
114 #define DPRINTF(fmt, args...) \
115 printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
116 #else
117 #define DPRINTF(fmt, args...) do {} while(0)
118 #endif
119
120 /**
121 * The maximum mapped region size per request we will allow in a negotiated
122 * block-front/back communication channel.
123 * Use old default of MAXPHYS == 128K.
124 */
125 #define XBB_MAX_REQUEST_SIZE \
126 MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
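/*
 * For example, assuming 4 KiB pages and the standard blkif value of 11 for
 * BLKIF_MAX_SEGMENTS_PER_REQUEST, this evaluates to
 * MIN(128 KiB, 11 * 4 KiB) = 44 KiB per request.
 */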
127
128 /**
129 * The maximum number of segments (within a request header and accompanying
130 * segment blocks) per request we will allow in a negotiated block-front/back
131 * communication channel.
132 */
133 #define XBB_MAX_SEGMENTS_PER_REQUEST \
134 (MIN(UIO_MAXIOV, \
135 MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
136 (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
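/*
 * Continuing the example above (4 KiB pages, 11 blkif segments): the inner
 * term is (44 KiB / 4 KiB) + 1 = 12, so with FreeBSD's UIO_MAXIOV of 1024
 * the result is MIN(1024, MIN(11, 12)) = 11 segments per request.
 */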
137
138 /**
139 * The maximum number of ring pages that we can allow per request list.
140 * We limit this to the maximum number of segments per request, because
141 * that is already a reasonable number of segments to aggregate. This
142 * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST,
143 * because that would leave situations where we can't dispatch even one
144 * large request.
145 */
146 #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST
147
148 /*--------------------------- Forward Declarations ---------------------------*/
149 struct xbb_softc;
150 struct xbb_xen_req;
151
152 static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
153 ...) __attribute__((format(printf, 3, 4)));
154 static int xbb_shutdown(struct xbb_softc *xbb);
155
156 /*------------------------------ Data Structures -----------------------------*/
157
158 STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req);
159
160 typedef enum {
161 XBB_REQLIST_NONE = 0x00,
162 XBB_REQLIST_MAPPED = 0x01
163 } xbb_reqlist_flags;
164
165 struct xbb_xen_reqlist {
166 /**
167 * Back reference to the parent block back instance for this
168 * request. Used during bio_done handling.
169 */
170 struct xbb_softc *xbb;
171
172 /**
173 * BLKIF_OP code for this request.
174 */
175 int operation;
176
177 /**
178 * Set to BLKIF_RSP_* to indicate request status.
179 *
180 * This field allows an error status to be recorded even if the
181 * delivery of this status must be deferred. Deferred reporting
182 * is necessary, for example, when an error is detected during
183 * completion processing of one bio when other bios for this
184 * request are still outstanding.
185 */
186 int status;
187
188 /**
189 * Number of 512 byte sectors not transferred.
190 */
191 int residual_512b_sectors;
192
193 /**
194 * Starting sector number of the first request in the list.
195 */
196 off_t starting_sector_number;
197
198 /**
199 * If we're going to coalesce, the next contiguous sector would be
200 * this one.
201 */
202 off_t next_contig_sector;
203
204 /**
205 * Number of child requests in the list.
206 */
207 int num_children;
208
209 /**
210 * Number of I/O requests still pending on the backend.
211 */
212 int pendcnt;
213
214 /**
215 * Total number of segments for requests in the list.
216 */
217 int nr_segments;
218
219 /**
220 * Flags for this particular request list.
221 */
222 xbb_reqlist_flags flags;
223
224 /**
225 * Kernel virtual address space reserved for this request
226 * list structure and used to map the remote domain's pages for
227 	 * this I/O into our domain's address space.
228 */
229 uint8_t *kva;
230
231 /**
232 	 * Base pseudo-physical address corresponding to the start
233 * of this request's kva region.
234 */
235 uint64_t gnt_base;
236
237 /**
238 * Array of grant handles (one per page) used to map this request.
239 */
240 grant_handle_t *gnt_handles;
241
242 /**
243 * Device statistics request ordering type (ordered or simple).
244 */
245 devstat_tag_type ds_tag_type;
246
247 /**
248 * Device statistics request type (read, write, no_data).
249 */
250 devstat_trans_flags ds_trans_type;
251
252 /**
253 * The start time for this request.
254 */
255 struct bintime ds_t0;
256
257 /**
258 * Linked list of contiguous requests with the same operation type.
259 */
260 struct xbb_xen_req_list contig_req_list;
261
262 /**
263 * Linked list links used to aggregate idle requests in the
264 * request list free pool (xbb->reqlist_free_stailq) and pending
265 * requests waiting for execution (xbb->reqlist_pending_stailq).
266 */
267 STAILQ_ENTRY(xbb_xen_reqlist) links;
268 };
269
270 STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist);
271
272 /**
273 * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
274 */
275 struct xbb_xen_req {
276 /**
277 * Linked list links used to aggregate requests into a reqlist
278 * and to store them in the request free pool.
279 */
280 STAILQ_ENTRY(xbb_xen_req) links;
281
282 /**
283 * The remote domain's identifier for this I/O request.
284 */
285 uint64_t id;
286
287 /**
288 * The number of pages currently mapped for this request.
289 */
290 int nr_pages;
291
292 /**
293 	 * The number of 512 byte sectors comprising this request.
294 */
295 int nr_512b_sectors;
296
297 /**
298 * BLKIF_OP code for this request.
299 */
300 int operation;
301
302 /**
303 * Storage used for non-native ring requests.
304 */
305 blkif_request_t ring_req_storage;
306
307 /**
308 * Pointer to the Xen request in the ring.
309 */
310 blkif_request_t *ring_req;
311
312 /**
313 * Consumer index for this request.
314 */
315 RING_IDX req_ring_idx;
316
317 /**
318 * The start time for this request.
319 */
320 struct bintime ds_t0;
321
322 /**
323 * Pointer back to our parent request list.
324 */
325 struct xbb_xen_reqlist *reqlist;
326 };
327 SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
328
329 /**
330 * \brief Configuration data for the shared memory request ring
331 * used to communicate with the front-end client of this
332  *        driver.
333 */
334 struct xbb_ring_config {
335 /** KVA address where ring memory is mapped. */
336 vm_offset_t va;
337
338 /** The pseudo-physical address where ring memory is mapped.*/
339 uint64_t gnt_addr;
340
341 /**
342 * Grant table handles, one per-ring page, returned by the
343 	 * hypervisor upon mapping of the ring and required to
344 * unmap it when a connection is torn down.
345 */
346 grant_handle_t handle[XBB_MAX_RING_PAGES];
347
348 /**
349 * The device bus address returned by the hypervisor when
350 * mapping the ring and required to unmap it when a connection
351 * is torn down.
352 */
353 uint64_t bus_addr[XBB_MAX_RING_PAGES];
354
355 /** The number of ring pages mapped for the current connection. */
356 u_int ring_pages;
357
358 /**
359 * The grant references, one per-ring page, supplied by the
360 * front-end, allowing us to reference the ring pages in the
361 * front-end's domain and to map these pages into our own domain.
362 */
363 grant_ref_t ring_ref[XBB_MAX_RING_PAGES];
364
365 	/** The interrupt driven event channel used to signal ring events. */
366 evtchn_port_t evtchn;
367 };
368
369 /**
370 * Per-instance connection state flags.
371 */
372 typedef enum
373 {
374 /**
375 * The front-end requested a read-only mount of the
376 * back-end device/file.
377 */
378 XBBF_READ_ONLY = 0x01,
379
380 /** Communication with the front-end has been established. */
381 XBBF_RING_CONNECTED = 0x02,
382
383 /**
384 * Front-end requests exist in the ring and are waiting for
385 * xbb_xen_req objects to free up.
386 */
387 XBBF_RESOURCE_SHORTAGE = 0x04,
388
389 /** Connection teardown in progress. */
390 XBBF_SHUTDOWN = 0x08,
391
392 /** A thread is already performing shutdown processing. */
393 XBBF_IN_SHUTDOWN = 0x10
394 } xbb_flag_t;
395
396 /** Backend device type. */
397 typedef enum {
398 /** Backend type unknown. */
399 XBB_TYPE_NONE = 0x00,
400
401 /**
402 * Backend type disk (access via cdev switch
403 * strategy routine).
404 */
405 XBB_TYPE_DISK = 0x01,
406
407 	/** Backend type file (access via vnode operations). */
408 XBB_TYPE_FILE = 0x02
409 } xbb_type;
410
411 /**
412 * \brief Structure used to memoize information about a per-request
413 * scatter-gather list.
414 *
415 * The chief benefit of using this data structure is it avoids having
416 * to reparse the possibly discontiguous S/G list in the original
417 * request. Due to the way that the mapping of the memory backing an
418 * I/O transaction is handled by Xen, a second pass is unavoidable.
419 * At least this way the second walk is a simple array traversal.
420 *
421 * \note A single Scatter/Gather element in the block interface covers
422 * at most 1 machine page. In this context a sector (blkif
423 * nomenclature, not what I'd choose) is a 512b aligned unit
424 * of mapping within the machine page referenced by an S/G
425 * element.
426 */
427 struct xbb_sg {
428 /** The number of 512b data chunks mapped in this S/G element. */
429 int16_t nsect;
430
431 /**
432 * The index (0 based) of the first 512b data chunk mapped
433 * in this S/G element.
434 */
435 uint8_t first_sect;
436
437 /**
438 * The index (0 based) of the last 512b data chunk mapped
439 * in this S/G element.
440 */
441 uint8_t last_sect;
442 };
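/*
 * Worked example (assuming a 4 KiB machine page): a segment covering bytes
 * 1024-4095 of its page maps 512b chunks 2 through 7, so first_sect = 2,
 * last_sect = 7, and nsect = 6.
 */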
443
444 /**
445 * Character device backend specific configuration data.
446 */
447 struct xbb_dev_data {
448 /** Cdev used for device backend access. */
449 struct cdev *cdev;
450
451 /** Cdev switch used for device backend access. */
452 struct cdevsw *csw;
453
454 /** Used to hold a reference on opened cdev backend devices. */
455 int dev_ref;
456 };
457
458 /**
459 * File backend specific configuration data.
460 */
461 struct xbb_file_data {
462 /** Credentials to use for vnode backed (file based) I/O. */
463 struct ucred *cred;
464
465 /**
466 * \brief Array of io vectors used to process file based I/O.
467 *
468 * Only a single file based request is outstanding per-xbb instance,
469 * so we only need one of these.
470 */
471 struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST];
472 };
473
474 /**
475 * Collection of backend type specific data.
476 */
477 union xbb_backend_data {
478 struct xbb_dev_data dev;
479 struct xbb_file_data file;
480 };
481
482 /**
483 * Function signature of backend specific I/O handlers.
484 */
485 typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb,
486 struct xbb_xen_reqlist *reqlist, int operation,
487 int flags);
488
489 /**
490 * Per-instance configuration data.
491 */
492 struct xbb_softc {
493 /**
494 * Task-queue used to process I/O requests.
495 */
496 struct taskqueue *io_taskqueue;
497
498 /**
499 * Single "run the request queue" task enqueued
500 * on io_taskqueue.
501 */
502 struct task io_task;
503
504 /** Device type for this instance. */
505 xbb_type device_type;
506
507 /** NewBus device corresponding to this instance. */
508 device_t dev;
509
510 /** Backend specific dispatch routine for this instance. */
511 xbb_dispatch_t dispatch_io;
512
513 /** The number of requests outstanding on the backend device/file. */
514 int active_request_count;
515
516 /** Free pool of request tracking structures. */
517 struct xbb_xen_req_list request_free_stailq;
518
519 /** Array, sized at connection time, of request tracking structures. */
520 struct xbb_xen_req *requests;
521
522 /** Free pool of request list structures. */
523 struct xbb_xen_reqlist_list reqlist_free_stailq;
524
525 /** List of pending request lists awaiting execution. */
526 struct xbb_xen_reqlist_list reqlist_pending_stailq;
527
528 /** Array, sized at connection time, of request list structures. */
529 struct xbb_xen_reqlist *request_lists;
530
531 /**
532 * Global pool of kva used for mapping remote domain ring
533 * and I/O transaction data.
534 */
535 vm_offset_t kva;
536
537 /** Pseudo-physical address corresponding to kva. */
538 uint64_t gnt_base_addr;
539
540 /** The size of the global kva pool. */
541 int kva_size;
542
543 /** The size of the KVA area used for request lists. */
544 int reqlist_kva_size;
545
546 /** The number of pages of KVA used for request lists */
547 int reqlist_kva_pages;
548
549 /** Bitmap of free KVA pages */
550 bitstr_t *kva_free;
551
552 /**
553 * \brief Cached value of the front-end's domain id.
554 *
555 	 * This value is used once for each mapped page in
556 	 * a transaction. We cache it to avoid incurring the
557 	 * cost of an ivar access every time this is needed.
558 */
559 domid_t otherend_id;
560
561 /**
562 * \brief The blkif protocol abi in effect.
563 *
564 * There are situations where the back and front ends can
565 * have a different, native abi (e.g. intel x86_64 and
566 * 32bit x86 domains on the same machine). The back-end
567 * always accommodates the front-end's native abi. That
568 * value is pulled from the XenStore and recorded here.
569 */
570 int abi;
571
572 /**
573 * \brief The maximum number of requests and request lists allowed
574 * to be in flight at a time.
575 *
576 * This value is negotiated via the XenStore.
577 */
578 u_int max_requests;
579
580 /**
581 * \brief The maximum number of segments (1 page per segment)
582 * that can be mapped by a request.
583 *
584 * This value is negotiated via the XenStore.
585 */
586 u_int max_request_segments;
587
588 /**
589 * \brief Maximum number of segments per request list.
590 *
591 * This value is derived from and will generally be larger than
592 * max_request_segments.
593 */
594 u_int max_reqlist_segments;
595
596 /**
597 * The maximum size of any request to this back-end
598 * device.
599 *
600 * This value is negotiated via the XenStore.
601 */
602 u_int max_request_size;
603
604 /**
605 * The maximum size of any request list. This is derived directly
606 * from max_reqlist_segments.
607 */
608 u_int max_reqlist_size;
609
610 /** Various configuration and state bit flags. */
611 xbb_flag_t flags;
612
613 /** Ring mapping and interrupt configuration data. */
614 struct xbb_ring_config ring_config;
615
616 /** Runtime, cross-abi safe, structures for ring access. */
617 blkif_back_rings_t rings;
618
619 /** IRQ mapping for the communication ring event channel. */
620 xen_intr_handle_t xen_intr_handle;
621
622 /**
623 * \brief Backend access mode flags (e.g. write, or read-only).
624 *
625 * This value is passed to us by the front-end via the XenStore.
626 */
627 char *dev_mode;
628
629 /**
630 * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
631 *
632 * This value is passed to us by the front-end via the XenStore.
633 * Currently unused.
634 */
635 char *dev_type;
636
637 /**
638 * \brief Backend device/file identifier.
639 *
640 * This value is passed to us by the front-end via the XenStore.
641 * We expect this to be a POSIX path indicating the file or
642 * device to open.
643 */
644 char *dev_name;
645
646 /**
647 * Vnode corresponding to the backend device node or file
648 	 * we are accessing.
649 */
650 struct vnode *vn;
651
652 union xbb_backend_data backend;
653
654 /** The native sector size of the backend. */
655 u_int sector_size;
656
657 /** log2 of sector_size. */
658 u_int sector_size_shift;
659
660 /** Size in bytes of the backend device or file. */
661 off_t media_size;
662
663 /**
664 * \brief media_size expressed in terms of the backend native
665 * sector size.
666 *
667 * (e.g. xbb->media_size >> xbb->sector_size_shift).
668 */
669 uint64_t media_num_sectors;
670
671 /**
672 * \brief Array of memoized scatter gather data computed during the
673 * conversion of blkif ring requests to internal xbb_xen_req
674 * structures.
675 *
676 * Ring processing is serialized so we only need one of these.
677 */
678 struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST];
679
680 /**
681 * Temporary grant table map used in xbb_dispatch_io(). When
682 * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the
683 * stack could cause a stack overflow.
684 */
685 struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST];
686
687 /** Mutex protecting per-instance data. */
688 struct mtx lock;
689
690 /**
691 * Resource representing allocated physical address space
692 * associated with our per-instance kva region.
693 */
694 struct resource *pseudo_phys_res;
695
696 /** Resource id for allocated physical address space. */
697 int pseudo_phys_res_id;
698
699 /**
700 * I/O statistics from BlockBack dispatch down. These are
701 * coalesced requests, and we start them right before execution.
702 */
703 struct devstat *xbb_stats;
704
705 /**
706 * I/O statistics coming into BlockBack. These are the requests as
707 * we get them from BlockFront. They are started as soon as we
708 * receive a request, and completed when the I/O is complete.
709 */
710 struct devstat *xbb_stats_in;
711
712 /** Disable sending flush to the backend */
713 int disable_flush;
714
715 /** Send a real flush for every N flush requests */
716 int flush_interval;
717
718 /** Count of flush requests in the interval */
719 int flush_count;
720
721 /** Don't coalesce requests if this is set */
722 int no_coalesce_reqs;
723
724 /** Number of requests we have received */
725 uint64_t reqs_received;
726
727 /** Number of requests we have completed*/
728 uint64_t reqs_completed;
729
730 /** Number of requests we queued but not pushed*/
731 uint64_t reqs_queued_for_completion;
732
733 /** Number of requests we completed with an error status*/
734 uint64_t reqs_completed_with_error;
735
736 /** How many forced dispatches (i.e. without coalescing) have happened */
737 uint64_t forced_dispatch;
738
739 /** How many normal dispatches have happened */
740 uint64_t normal_dispatch;
741
742 /** How many total dispatches have happened */
743 uint64_t total_dispatch;
744
745 /** How many times we have run out of KVA */
746 uint64_t kva_shortages;
747
748 /** How many times we have run out of request structures */
749 uint64_t request_shortages;
750
751 /** Watch to wait for hotplug script execution */
752 struct xs_watch hotplug_watch;
753
754 /** Got the needed data from hotplug scripts? */
755 bool hotplug_done;
756 };
757
758 /*---------------------------- Request Processing ----------------------------*/
759 /**
760 * Allocate an internal transaction tracking structure from the free pool.
761 *
762 * \param xbb Per-instance xbb configuration structure.
763 *
764 * \return On success, a pointer to the allocated xbb_xen_req structure.
765 * Otherwise NULL.
766 */
767 static inline struct xbb_xen_req *
768 xbb_get_req(struct xbb_softc *xbb)
769 {
770 struct xbb_xen_req *req;
771
772 req = NULL;
773
774 mtx_assert(&xbb->lock, MA_OWNED);
775
776 if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) {
777 STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links);
778 xbb->active_request_count++;
779 }
780
781 return (req);
782 }
783
784 /**
785 * Return an allocated transaction tracking structure to the free pool.
786 *
787 * \param xbb Per-instance xbb configuration structure.
788 * \param req The request structure to free.
789 */
790 static inline void
791 xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
792 {
793 mtx_assert(&xbb->lock, MA_OWNED);
794
795 STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links);
796 xbb->active_request_count--;
797
798 KASSERT(xbb->active_request_count >= 0,
799 ("xbb_release_req: negative active count"));
800 }
801
802 /**
803 * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool.
804 *
805 * \param xbb Per-instance xbb configuration structure.
806 * \param req_list The list of requests to free.
807 * \param nreqs The number of items in the list.
808 */
809 static inline void
810 xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list,
811 int nreqs)
812 {
813 mtx_assert(&xbb->lock, MA_OWNED);
814
815 STAILQ_CONCAT(&xbb->request_free_stailq, req_list);
816 xbb->active_request_count -= nreqs;
817
818 KASSERT(xbb->active_request_count >= 0,
819 ("xbb_release_reqs: negative active count"));
820 }
821
822 /**
823 * Given a page index and 512b sector offset within that page,
824 * calculate an offset into a request's kva region.
825 *
826 * \param reqlist The request structure whose kva region will be accessed.
827 * \param pagenr The page index used to compute the kva offset.
828 * \param sector The 512b sector index used to compute the page relative
829 * kva offset.
830 *
831 * \return The computed global KVA offset.
832 */
833 static inline uint8_t *
834 xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
835 {
836 return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9));
837 }
838
839 /**
840 * Given a page number and 512b sector offset within that page,
841 * calculate an offset into the request's memory region that the
842 * underlying backend device/file should use for I/O.
843 *
844 * \param reqlist The request structure whose I/O region will be accessed.
845 * \param pagenr The page index used to compute the I/O offset.
846 * \param sector The 512b sector index used to compute the page relative
847 * I/O offset.
848 *
849 * \return The computed global I/O address.
850 *
851 * Depending on configuration, this will either be a local bounce buffer
852 * or a pointer to the memory mapped in from the front-end domain for
853 * this request.
854 */
855 static inline uint8_t *
856 xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
857 {
858 return (xbb_reqlist_vaddr(reqlist, pagenr, sector));
859 }
860
861 /**
862 * Given a page index and 512b sector offset within that page, calculate
863 * an offset into the local pseudo-physical address space used to map a
864 * front-end's request data into a request.
865 *
866 * \param reqlist The request list structure whose pseudo-physical region
867 * will be accessed.
868 * \param pagenr The page index used to compute the pseudo-physical offset.
869 * \param sector The 512b sector index used to compute the page relative
870 * pseudo-physical offset.
871 *
872 	 * \return  The computed global pseudo-physical address.
873 *
874 * Depending on configuration, this will either be a local bounce buffer
875 * or a pointer to the memory mapped in from the front-end domain for
876 * this request.
877 */
878 static inline uintptr_t
879 xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector)
880 {
881 struct xbb_softc *xbb;
882
883 xbb = reqlist->xbb;
884
885 return ((uintptr_t)(xbb->gnt_base_addr +
886 (uintptr_t)(reqlist->kva - xbb->kva) +
887 (PAGE_SIZE * pagenr) + (sector << 9)));
888 }
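/*
 * Illustrative example (hypothetical values): for a reqlist whose kva lies
 * 3 pages past xbb->kva, xbb_get_gntaddr(reqlist, 1, 2) returns
 * xbb->gnt_base_addr + (3 + 1) * PAGE_SIZE + 2 * 512, i.e. the same page
 * and sector offsets applied to the pseudo-physical base instead of the
 * KVA base.
 */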
889
890 /**
891 * Get Kernel Virtual Address space for mapping requests.
892 *
893 * \param xbb Per-instance xbb configuration structure.
894 * \param nr_pages Number of pages needed.
897 *
898 * \return On success, a pointer to the allocated KVA region. Otherwise NULL.
899 *
900 * Note: This should be unnecessary once we have either chaining or
901 * scatter/gather support for struct bio. At that point we'll be able to
902 * put multiple addresses and lengths in one bio/bio chain and won't need
903 * to map everything into one virtual segment.
904 */
905 static uint8_t *
906 xbb_get_kva(struct xbb_softc *xbb, int nr_pages)
907 {
908 int first_clear;
909 int num_clear;
910 uint8_t *free_kva;
911 int i;
912
913 KASSERT(nr_pages != 0, ("xbb_get_kva of zero length"));
914
915 first_clear = 0;
916 free_kva = NULL;
917
918 mtx_lock(&xbb->lock);
919
920 /*
921 * Look for the first available page. If there are none, we're done.
922 */
923 bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear);
924
925 if (first_clear == -1)
926 goto bailout;
927
928 /*
929 * Starting at the first available page, look for consecutive free
930 * pages that will satisfy the user's request.
931 */
932 for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) {
933 /*
934 * If this is true, the page is used, so we have to reset
935 * the number of clear pages and the first clear page
936 * (since it pointed to a region with an insufficient number
937 * of clear pages).
938 */
939 if (bit_test(xbb->kva_free, i)) {
940 num_clear = 0;
941 first_clear = -1;
942 continue;
943 }
944
945 if (first_clear == -1)
946 first_clear = i;
947
948 /*
949 * If this is true, we've found a large enough free region
950 * to satisfy the request.
951 */
952 if (++num_clear == nr_pages) {
953 bit_nset(xbb->kva_free, first_clear,
954 first_clear + nr_pages - 1);
955
956 free_kva = xbb->kva +
957 (uint8_t *)((intptr_t)first_clear * PAGE_SIZE);
958
959 KASSERT(free_kva >= (uint8_t *)xbb->kva &&
960 free_kva + (nr_pages * PAGE_SIZE) <=
961 (uint8_t *)xbb->ring_config.va,
962 ("Free KVA %p len %d out of range, "
963 "kva = %#jx, ring VA = %#jx\n", free_kva,
964 nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva,
965 (uintmax_t)xbb->ring_config.va));
966 break;
967 }
968 }
969
970 bailout:
971
972 if (free_kva == NULL) {
973 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
974 xbb->kva_shortages++;
975 }
976
977 mtx_unlock(&xbb->lock);
978
979 return (free_kva);
980 }
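/*
 * Note: xbb_get_kva() is a simple first-fit scan of the kva_free bitstring;
 * the pages it marks as allocated are returned to the pool by
 * xbb_free_kva(), which is called with the lock held from
 * xbb_release_reqlist().
 */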
981
982 /**
983 * Free allocated KVA.
984 *
985 * \param xbb Per-instance xbb configuration structure.
986 * \param kva_ptr Pointer to allocated KVA region.
987 * \param nr_pages Number of pages in the KVA region.
988 */
989 static void
990 xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages)
991 {
992 intptr_t start_page;
993
994 mtx_assert(&xbb->lock, MA_OWNED);
995
996 start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT;
997 bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1);
998
999 }
1000
1001 /**
1002 * Unmap the front-end pages associated with this I/O request.
1003 *
1004 	 * \param reqlist  The request list structure to unmap.
1005 */
1006 static void
1007 xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist)
1008 {
1009 struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST];
1010 u_int i;
1011 u_int invcount;
1012 int error __diagused;
1013
1014 invcount = 0;
1015 for (i = 0; i < reqlist->nr_segments; i++) {
1016 if (reqlist->gnt_handles[i] == GRANT_REF_INVALID)
1017 continue;
1018
1019 unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0);
1020 unmap[invcount].dev_bus_addr = 0;
1021 unmap[invcount].handle = reqlist->gnt_handles[i];
1022 reqlist->gnt_handles[i] = GRANT_REF_INVALID;
1023 invcount++;
1024 }
1025
1026 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
1027 unmap, invcount);
1028 KASSERT(error == 0, ("Grant table operation failed"));
1029 }
1030
1031 /**
1032 * Allocate an internal transaction tracking structure from the free pool.
1033 *
1034 * \param xbb Per-instance xbb configuration structure.
1035 *
1036 * \return On success, a pointer to the allocated xbb_xen_reqlist structure.
1037 * Otherwise NULL.
1038 */
1039 static inline struct xbb_xen_reqlist *
1040 xbb_get_reqlist(struct xbb_softc *xbb)
1041 {
1042 struct xbb_xen_reqlist *reqlist;
1043
1044 reqlist = NULL;
1045
1046 mtx_assert(&xbb->lock, MA_OWNED);
1047
1048 if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) {
1049 STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links);
1050 reqlist->flags = XBB_REQLIST_NONE;
1051 reqlist->kva = NULL;
1052 reqlist->status = BLKIF_RSP_OKAY;
1053 reqlist->residual_512b_sectors = 0;
1054 reqlist->num_children = 0;
1055 reqlist->nr_segments = 0;
1056 STAILQ_INIT(&reqlist->contig_req_list);
1057 }
1058
1059 return (reqlist);
1060 }
1061
1062 /**
1063 * Return an allocated transaction tracking structure to the free pool.
1064 *
1065 * \param xbb Per-instance xbb configuration structure.
1066 	 * \param reqlist  The request list structure to free.
1067 	 * \param wakeup   If set, wake up the work thread if freeing this reqlist
1068 	 *                 during a resource shortage condition.
1069 */
1070 static inline void
1071 xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1072 int wakeup)
1073 {
1074
1075 mtx_assert(&xbb->lock, MA_OWNED);
1076
1077 if (wakeup) {
1078 wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE;
1079 xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
1080 }
1081
1082 if (reqlist->kva != NULL)
1083 xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments);
1084
1085 xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children);
1086
1087 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
1088
1089 if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1090 /*
1091 * Shutdown is in progress. See if we can
1092 * progress further now that one more request
1093 * has completed and been returned to the
1094 * free pool.
1095 */
1096 xbb_shutdown(xbb);
1097 }
1098
1099 if (wakeup != 0)
1100 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1101 }
1102
1103 /**
1104 * Request resources and do basic request setup.
1105 *
1106 * \param xbb Per-instance xbb configuration structure.
1107 * \param reqlist Pointer to reqlist pointer.
1108 * \param ring_req Pointer to a block ring request.
1109 	 * \param ring_idx   The ring index of this request.
1110 *
1111 * \return 0 for success, non-zero for failure.
1112 */
1113 static int
1114 xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist,
1115 blkif_request_t *ring_req, RING_IDX ring_idx)
1116 {
1117 struct xbb_xen_reqlist *nreqlist;
1118 struct xbb_xen_req *nreq;
1119
1120 nreqlist = NULL;
1121 nreq = NULL;
1122
1123 mtx_lock(&xbb->lock);
1124
1125 /*
1126 * We don't allow new resources to be allocated if we're in the
1127 * process of shutting down.
1128 */
1129 if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
1130 mtx_unlock(&xbb->lock);
1131 return (1);
1132 }
1133
1134 /*
1135 * Allocate a reqlist if the caller doesn't have one already.
1136 */
1137 if (*reqlist == NULL) {
1138 nreqlist = xbb_get_reqlist(xbb);
1139 if (nreqlist == NULL)
1140 goto bailout_error;
1141 }
1142
1143 /* We always allocate a request. */
1144 nreq = xbb_get_req(xbb);
1145 if (nreq == NULL)
1146 goto bailout_error;
1147
1148 mtx_unlock(&xbb->lock);
1149
1150 if (*reqlist == NULL) {
1151 *reqlist = nreqlist;
1152 nreqlist->operation = ring_req->operation;
1153 nreqlist->starting_sector_number = ring_req->sector_number;
1154 STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist,
1155 links);
1156 }
1157
1158 nreq->reqlist = *reqlist;
1159 nreq->req_ring_idx = ring_idx;
1160 nreq->id = ring_req->id;
1161 nreq->operation = ring_req->operation;
1162
1163 if (xbb->abi != BLKIF_PROTOCOL_NATIVE) {
1164 bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req));
1165 nreq->ring_req = &nreq->ring_req_storage;
1166 } else {
1167 nreq->ring_req = ring_req;
1168 }
1169
1170 binuptime(&nreq->ds_t0);
1171 devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0);
1172 STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links);
1173 (*reqlist)->num_children++;
1174 (*reqlist)->nr_segments += ring_req->nr_segments;
1175
1176 return (0);
1177
1178 bailout_error:
1179
1180 /*
1181 * We're out of resources, so set the shortage flag. The next time
1182 * a request is released, we'll try waking up the work thread to
1183 * see if we can allocate more resources.
1184 */
1185 xbb->flags |= XBBF_RESOURCE_SHORTAGE;
1186 xbb->request_shortages++;
1187
1188 if (nreq != NULL)
1189 xbb_release_req(xbb, nreq);
1190
1191 if (nreqlist != NULL)
1192 xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0);
1193
1194 mtx_unlock(&xbb->lock);
1195
1196 return (1);
1197 }
1198
1199 /**
1200 * Create and queue a response to a blkif request.
1201 *
1202 * \param xbb Per-instance xbb configuration structure.
1203 * \param req The request structure to which to respond.
1204 * \param status The status code to report. See BLKIF_RSP_*
1205 * in sys/contrib/xen/io/blkif.h.
1206 */
1207 static void
1208 xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
1209 {
1210 blkif_response_t *resp;
1211
1212 /*
1213 * The mutex is required here, and should be held across this call
1214 * until after the subsequent call to xbb_push_responses(). This
1215 * is to guarantee that another context won't queue responses and
1216 * push them while we're active.
1217 *
1218 * That could lead to the other end being notified of responses
1219 * before the resources have been freed on this end. The other end
1220 * would then be able to queue additional I/O, and we may run out
1221 * of resources because we haven't freed them all yet.
1222 */
1223 mtx_assert(&xbb->lock, MA_OWNED);
1224
1225 /*
1226 * Place on the response ring for the relevant domain.
1227 * For now, only the spacing between entries is different
1228 * in the different ABIs, not the response entry layout.
1229 */
1230 switch (xbb->abi) {
1231 case BLKIF_PROTOCOL_NATIVE:
1232 resp = RING_GET_RESPONSE(&xbb->rings.native,
1233 xbb->rings.native.rsp_prod_pvt);
1234 break;
1235 case BLKIF_PROTOCOL_X86_32:
1236 resp = (blkif_response_t *)
1237 RING_GET_RESPONSE(&xbb->rings.x86_32,
1238 xbb->rings.x86_32.rsp_prod_pvt);
1239 break;
1240 case BLKIF_PROTOCOL_X86_64:
1241 resp = (blkif_response_t *)
1242 RING_GET_RESPONSE(&xbb->rings.x86_64,
1243 xbb->rings.x86_64.rsp_prod_pvt);
1244 break;
1245 default:
1246 panic("Unexpected blkif protocol ABI.");
1247 }
1248
1249 resp->id = req->id;
1250 resp->operation = req->operation;
1251 resp->status = status;
1252
1253 if (status != BLKIF_RSP_OKAY)
1254 xbb->reqs_completed_with_error++;
1255
1256 xbb->rings.common.rsp_prod_pvt++;
1257
1258 xbb->reqs_queued_for_completion++;
1259
1260 }
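/*
 * A minimal sketch of the intended calling sequence (this is how
 * xbb_complete_reqlist() below uses these helpers):
 *
 *	mtx_lock(&xbb->lock);
 *	xbb_queue_response(xbb, nreq, status);	(repeated once per request)
 *	xbb_push_responses(xbb, &run_taskqueue, &notify);
 *	mtx_unlock(&xbb->lock);
 *	if (notify)
 *		xen_intr_signal(xbb->xen_intr_handle);
 */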
1261
1262 /**
1263 * Send queued responses to blkif requests.
1264 *
1265 * \param xbb Per-instance xbb configuration structure.
1266 * \param run_taskqueue Flag that is set to 1 if the taskqueue
1267 * should be run, 0 if it does not need to be run.
1268 * \param notify Flag that is set to 1 if the other end should be
1269 * notified via irq, 0 if the other end should not be
1270 * notified.
1271 */
1272 static void
1273 xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify)
1274 {
1275 int more_to_do;
1276
1277 /*
1278 * The mutex is required here.
1279 */
1280 mtx_assert(&xbb->lock, MA_OWNED);
1281
1282 more_to_do = 0;
1283
1284 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify);
1285
1286 if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
1287 /*
1288 * Tail check for pending requests. Allows frontend to avoid
1289 * notifications if requests are already in flight (lower
1290 * overheads and promotes batching).
1291 */
1292 RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
1293 } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
1294 more_to_do = 1;
1295 }
1296
1297 xbb->reqs_completed += xbb->reqs_queued_for_completion;
1298 xbb->reqs_queued_for_completion = 0;
1299
1300 *run_taskqueue = more_to_do;
1301 }
1302
1303 /**
1304 * Complete a request list.
1305 *
1306 * \param xbb Per-instance xbb configuration structure.
1307 * \param reqlist Allocated internal request list structure.
1308 */
1309 static void
1310 xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1311 {
1312 struct xbb_xen_req *nreq;
1313 off_t sectors_sent;
1314 int notify, run_taskqueue;
1315
1316 sectors_sent = 0;
1317
1318 if (reqlist->flags & XBB_REQLIST_MAPPED)
1319 xbb_unmap_reqlist(reqlist);
1320
1321 mtx_lock(&xbb->lock);
1322
1323 /*
1324 * All I/O is done, send the response. A lock is not necessary
1325 * to protect the request list, because all requests have
1326 * completed. Therefore this is the only context accessing this
1327 * reqlist right now. However, in order to make sure that no one
1328 * else queues responses onto the queue or pushes them to the other
1329 * side while we're active, we need to hold the lock across the
1330 * calls to xbb_queue_response() and xbb_push_responses().
1331 */
1332 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1333 off_t cur_sectors_sent;
1334
1335 /* Put this response on the ring, but don't push yet */
1336 xbb_queue_response(xbb, nreq, reqlist->status);
1337
1338 /* We don't report bytes sent if there is an error. */
1339 if (reqlist->status == BLKIF_RSP_OKAY)
1340 cur_sectors_sent = nreq->nr_512b_sectors;
1341 else
1342 cur_sectors_sent = 0;
1343
1344 sectors_sent += cur_sectors_sent;
1345
1346 devstat_end_transaction(xbb->xbb_stats_in,
1347 /*bytes*/cur_sectors_sent << 9,
1348 reqlist->ds_tag_type,
1349 reqlist->ds_trans_type,
1350 /*now*/NULL,
1351 /*then*/&nreq->ds_t0);
1352 }
1353
1354 /*
1355 * Take out any sectors not sent. If we wind up negative (which
1356 * might happen if an error is reported as well as a residual), just
1357 * report 0 sectors sent.
1358 */
1359 sectors_sent -= reqlist->residual_512b_sectors;
1360 if (sectors_sent < 0)
1361 sectors_sent = 0;
1362
1363 devstat_end_transaction(xbb->xbb_stats,
1364 /*bytes*/ sectors_sent << 9,
1365 reqlist->ds_tag_type,
1366 reqlist->ds_trans_type,
1367 /*now*/NULL,
1368 /*then*/&reqlist->ds_t0);
1369
1370 xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1);
1371
1372 	xbb_push_responses(xbb, &run_taskqueue, &notify);
1373
1374 mtx_unlock(&xbb->lock);
1375
1376 if (run_taskqueue)
1377 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1378
1379 if (notify)
1380 xen_intr_signal(xbb->xen_intr_handle);
1381 }
1382
1383 /**
1384 * Completion handler for buffer I/O requests issued by the device
1385 * backend driver.
1386 *
1387 * \param bio The buffer I/O request on which to perform completion
1388 * processing.
1389 */
1390 static void
1391 xbb_bio_done(struct bio *bio)
1392 {
1393 struct xbb_softc *xbb;
1394 struct xbb_xen_reqlist *reqlist;
1395
1396 reqlist = bio->bio_caller1;
1397 xbb = reqlist->xbb;
1398
1399 reqlist->residual_512b_sectors += bio->bio_resid >> 9;
1400
1401 /*
1402 * This is a bit imprecise. With aggregated I/O a single
1403 * request list can contain multiple front-end requests and
1404 	 * multiple bios may point to a single request. By carefully
1405 * walking the request list, we could map residuals and errors
1406 * back to the original front-end request, but the interface
1407 * isn't sufficiently rich for us to properly report the error.
1408 * So, we just treat the entire request list as having failed if an
1409 * error occurs on any part. And, if an error occurs, we treat
1410 * the amount of data transferred as 0.
1411 *
1412 * For residuals, we report it on the overall aggregated device,
1413 * but not on the individual requests, since we don't currently
1414 	 * do the work to determine which front-end request the
1415 	 * residual applies to.
1416 */
1417 if (bio->bio_error) {
1418 DPRINTF("BIO returned error %d for operation on device %s\n",
1419 bio->bio_error, xbb->dev_name);
1420 reqlist->status = BLKIF_RSP_ERROR;
1421
1422 if (bio->bio_error == ENXIO
1423 && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
1424 /*
1425 * Backend device has disappeared. Signal the
1426 * front-end that we (the device proxy) want to
1427 * go away.
1428 */
1429 xenbus_set_state(xbb->dev, XenbusStateClosing);
1430 }
1431 }
1432
1433 /*
1434 * Decrement the pending count for the request list. When we're
1435 * done with the requests, send status back for all of them.
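	 * Note that atomic_fetchadd_int() returns the value the counter
	 * held before the decrement, so a result of 1 means this bio was
	 * the last one outstanding for the request list.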
1436 */
1437 if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1)
1438 xbb_complete_reqlist(xbb, reqlist);
1439
1440 g_destroy_bio(bio);
1441 }
1442
1443 /**
1444 * Parse a blkif request into an internal request structure and send
1445 * it to the backend for processing.
1446 *
1447 * \param xbb Per-instance xbb configuration structure.
1448 * \param reqlist Allocated internal request list structure.
1449 *
1450 * \return On success, 0. For resource shortages, non-zero.
1451 *
1452 * This routine performs the backend common aspects of request parsing
1453 * including compiling an internal request structure, parsing the S/G
1454 * list and any secondary ring requests in which they may reside, and
1455 * the mapping of front-end I/O pages into our domain.
1456 */
1457 static int
1458 xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist)
1459 {
1460 struct xbb_sg *xbb_sg;
1461 struct gnttab_map_grant_ref *map;
1462 struct blkif_request_segment *sg;
1463 struct blkif_request_segment *last_block_sg;
1464 struct xbb_xen_req *nreq;
1465 u_int nseg;
1466 u_int seg_idx;
1467 u_int block_segs;
1468 int nr_sects;
1469 int total_sects;
1470 int operation;
1471 uint8_t bio_flags;
1472 int error;
1473
1474 reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1475 bio_flags = 0;
1476 total_sects = 0;
1477 nr_sects = 0;
1478
1479 /*
1480 * First determine whether we have enough free KVA to satisfy this
1481 * request list. If not, tell xbb_run_queue() so it can go to
1482 * sleep until we have more KVA.
1483 */
1484 reqlist->kva = NULL;
1485 if (reqlist->nr_segments != 0) {
1486 reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments);
1487 if (reqlist->kva == NULL) {
1488 /*
1489 * If we're out of KVA, return ENOMEM.
1490 */
1491 return (ENOMEM);
1492 }
1493 }
1494
1495 binuptime(&reqlist->ds_t0);
1496 devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0);
1497
1498 switch (reqlist->operation) {
1499 case BLKIF_OP_WRITE_BARRIER:
1500 bio_flags |= BIO_ORDERED;
1501 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1502 /* FALLTHROUGH */
1503 case BLKIF_OP_WRITE:
1504 operation = BIO_WRITE;
1505 reqlist->ds_trans_type = DEVSTAT_WRITE;
1506 if ((xbb->flags & XBBF_READ_ONLY) != 0) {
1507 DPRINTF("Attempt to write to read only device %s\n",
1508 xbb->dev_name);
1509 reqlist->status = BLKIF_RSP_ERROR;
1510 goto send_response;
1511 }
1512 break;
1513 case BLKIF_OP_READ:
1514 operation = BIO_READ;
1515 reqlist->ds_trans_type = DEVSTAT_READ;
1516 break;
1517 case BLKIF_OP_FLUSH_DISKCACHE:
1518 /*
1519 * If this is true, the user has requested that we disable
1520 * flush support. So we just complete the requests
1521 * successfully.
1522 */
1523 if (xbb->disable_flush != 0) {
1524 goto send_response;
1525 }
1526
1527 /*
1528 * The user has requested that we only send a real flush
1529 * for every N flush requests. So keep count, and either
1530 * complete the request immediately or queue it for the
1531 * backend.
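		 * For example, with flush_interval set to 32 (a value used
		 * here purely for illustration), thirty-one of every
		 * thirty-two flush requests are completed immediately and
		 * only the thirty-second is dispatched as a real BIO_FLUSH.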
1532 */
1533 if (xbb->flush_interval != 0) {
1534 if (++(xbb->flush_count) < xbb->flush_interval) {
1535 goto send_response;
1536 } else
1537 xbb->flush_count = 0;
1538 }
1539
1540 operation = BIO_FLUSH;
1541 reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED;
1542 reqlist->ds_trans_type = DEVSTAT_NO_DATA;
1543 goto do_dispatch;
1544 /*NOTREACHED*/
1545 default:
1546 DPRINTF("error: unknown block io operation [%d]\n",
1547 reqlist->operation);
1548 reqlist->status = BLKIF_RSP_ERROR;
1549 goto send_response;
1550 }
1551
1552 reqlist->xbb = xbb;
1553 xbb_sg = xbb->xbb_sgs;
1554 map = xbb->maps;
1555 seg_idx = 0;
1556
1557 STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) {
1558 blkif_request_t *ring_req;
1559
1560 ring_req = nreq->ring_req;
1561 nr_sects = 0;
1562 nseg = ring_req->nr_segments;
1563 nreq->nr_pages = nseg;
1564 nreq->nr_512b_sectors = 0;
1565 sg = NULL;
1566
1567 /* Check that number of segments is sane. */
1568 if (__predict_false(nseg == 0)
1569 || __predict_false(nseg > xbb->max_request_segments)) {
1570 DPRINTF("Bad number of segments in request (%d)\n",
1571 nseg);
1572 reqlist->status = BLKIF_RSP_ERROR;
1573 goto send_response;
1574 }
1575
1576 block_segs = nseg;
1577 sg = ring_req->seg;
1578 last_block_sg = sg + block_segs;
1579
1580 while (sg < last_block_sg) {
1581 KASSERT(seg_idx <
1582 XBB_MAX_SEGMENTS_PER_REQLIST,
1583 ("seg_idx %d is too large, max "
1584 "segs %d\n", seg_idx,
1585 XBB_MAX_SEGMENTS_PER_REQLIST));
1586
1587 xbb_sg->first_sect = sg->first_sect;
1588 xbb_sg->last_sect = sg->last_sect;
1589 xbb_sg->nsect =
1590 (int8_t)(sg->last_sect -
1591 sg->first_sect + 1);
1592
1593 if ((sg->last_sect >= (PAGE_SIZE >> 9))
1594 || (xbb_sg->nsect <= 0)) {
1595 reqlist->status = BLKIF_RSP_ERROR;
1596 goto send_response;
1597 }
1598
1599 nr_sects += xbb_sg->nsect;
1600 map->host_addr = xbb_get_gntaddr(reqlist,
1601 seg_idx, /*sector*/0);
1602 KASSERT(map->host_addr + PAGE_SIZE <=
1603 xbb->ring_config.gnt_addr,
1604 ("Host address %#jx len %d overlaps "
1605 "ring address %#jx\n",
1606 (uintmax_t)map->host_addr, PAGE_SIZE,
1607 (uintmax_t)xbb->ring_config.gnt_addr));
1608
1609 map->flags = GNTMAP_host_map;
1610 map->ref = sg->gref;
1611 map->dom = xbb->otherend_id;
1612 if (operation == BIO_WRITE)
1613 map->flags |= GNTMAP_readonly;
1614 sg++;
1615 map++;
1616 xbb_sg++;
1617 seg_idx++;
1618 }
1619
1620 /* Convert to the disk's sector size */
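		/*
		 * For example, on a backend with 4096-byte sectors
		 * (sector_size_shift == 12), eight 512b ring sectors
		 * collapse into a single backend sector here.
		 */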
1621 nreq->nr_512b_sectors = nr_sects;
1622 nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
1623 total_sects += nr_sects;
1624
1625 if ((nreq->nr_512b_sectors &
1626 ((xbb->sector_size >> 9) - 1)) != 0) {
1627 device_printf(xbb->dev, "%s: I/O size (%d) is not "
1628 "a multiple of the backing store sector "
1629 "size (%d)\n", __func__,
1630 nreq->nr_512b_sectors << 9,
1631 xbb->sector_size);
1632 reqlist->status = BLKIF_RSP_ERROR;
1633 goto send_response;
1634 }
1635 }
1636
1637 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
1638 xbb->maps, reqlist->nr_segments);
1639 if (error != 0)
1640 panic("Grant table operation failed (%d)", error);
1641
1642 reqlist->flags |= XBB_REQLIST_MAPPED;
1643
1644 for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments;
1645 seg_idx++, map++){
1646 if (__predict_false(map->status != 0)) {
1647 DPRINTF("invalid buffer -- could not remap "
1648 "it (%d)\n", map->status);
1649 DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags "
1650 "0x%x ref 0x%x, dom %d\n", seg_idx,
1651 map->host_addr, map->flags, map->ref,
1652 map->dom);
1653 reqlist->status = BLKIF_RSP_ERROR;
1654 goto send_response;
1655 }
1656
1657 reqlist->gnt_handles[seg_idx] = map->handle;
1658 }
1659 if (reqlist->starting_sector_number + total_sects >
1660 xbb->media_num_sectors) {
1661 DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
1662 "extends past end of device %s\n",
1663 operation == BIO_READ ? "read" : "write",
1664 reqlist->starting_sector_number,
1665 reqlist->starting_sector_number + total_sects,
1666 xbb->dev_name);
1667 reqlist->status = BLKIF_RSP_ERROR;
1668 goto send_response;
1669 }
1670
1671 do_dispatch:
1672
1673 error = xbb->dispatch_io(xbb,
1674 reqlist,
1675 operation,
1676 bio_flags);
1677
1678 if (error != 0) {
1679 reqlist->status = BLKIF_RSP_ERROR;
1680 goto send_response;
1681 }
1682
1683 return (0);
1684
1685 send_response:
1686
1687 xbb_complete_reqlist(xbb, reqlist);
1688
1689 return (0);
1690 }
1691
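/**
 * Sum the 512b sector counts of all segments in a ring request.
 *
 * \param ring_req The front-end request whose segments are examined.
 *
 * \return The total number of 512b sectors described by the request's
 *         segments. The scan stops early if a segment with a non-positive
 *         sector count is encountered.
 */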
1692 static __inline int
1693 xbb_count_sects(blkif_request_t *ring_req)
1694 {
1695 int i;
1696 int cur_size = 0;
1697
1698 for (i = 0; i < ring_req->nr_segments; i++) {
1699 int nsect;
1700
1701 nsect = (int8_t)(ring_req->seg[i].last_sect -
1702 ring_req->seg[i].first_sect + 1);
1703 if (nsect <= 0)
1704 break;
1705
1706 cur_size += nsect;
1707 }
1708
1709 return (cur_size);
1710 }
1711
1712 /**
1713 * Process incoming requests from the shared communication ring in response
1714 * to a signal on the ring's event channel.
1715 *
1716 	 * \param context  Callback argument registered during task initialization -
1717 * the xbb_softc for this instance.
1718 * \param pending The number of taskqueue_enqueue events that have
1719 * occurred since this handler was last run.
1720 */
1721 static void
1722 xbb_run_queue(void *context, int pending)
1723 {
1724 struct xbb_softc *xbb;
1725 blkif_back_rings_t *rings;
1726 RING_IDX rp;
1727 uint64_t cur_sector;
1728 int cur_operation;
1729 struct xbb_xen_reqlist *reqlist;
1730
1731 xbb = (struct xbb_softc *)context;
1732 rings = &xbb->rings;
1733
1734 /*
1735 * Work gather and dispatch loop. Note that we have a bias here
1736 * towards gathering I/O sent by blockfront. We first gather up
1737 * everything in the ring, as long as we have resources. Then we
1738 * dispatch one request, and then attempt to gather up any
1739 * additional requests that have come in while we were dispatching
1740 * the request.
1741 *
1742 * This allows us to get a clearer picture (via devstat) of how
1743 * many requests blockfront is queueing to us at any given time.
1744 */
1745 for (;;) {
1746 int retval;
1747
1748 /*
1749 * Initialize reqlist to the last element in the pending
1750 * queue, if there is one. This allows us to add more
1751 * requests to that request list, if we have room.
1752 */
1753 reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq,
1754 xbb_xen_reqlist, links);
1755 if (reqlist != NULL) {
1756 cur_sector = reqlist->next_contig_sector;
1757 cur_operation = reqlist->operation;
1758 } else {
1759 cur_operation = 0;
1760 cur_sector = 0;
1761 }
1762
1763 /*
1764 * Cache req_prod to avoid accessing a cache line shared
1765 * with the frontend.
1766 */
1767 rp = rings->common.sring->req_prod;
1768
1769 /* Ensure we see queued requests up to 'rp'. */
1770 rmb();
1771
1772 /**
1773 * Run so long as there is work to consume and the generation
1774 * of a response will not overflow the ring.
1775 *
1776 * @note There's a 1 to 1 relationship between requests and
1777 * responses, so an overflow should never occur. This
1778 * test is to protect our domain from digesting bogus
1779 * data. Shouldn't we log this?
1780 */
1781 while (rings->common.req_cons != rp
1782 && RING_REQUEST_CONS_OVERFLOW(&rings->common,
1783 rings->common.req_cons) == 0){
1784 blkif_request_t ring_req_storage;
1785 blkif_request_t *ring_req;
1786 int cur_size;
1787
1788 switch (xbb->abi) {
1789 case BLKIF_PROTOCOL_NATIVE:
1790 ring_req = RING_GET_REQUEST(&xbb->rings.native,
1791 rings->common.req_cons);
1792 break;
1793 case BLKIF_PROTOCOL_X86_32:
1794 {
1795 struct blkif_x86_32_request *ring_req32;
1796
1797 ring_req32 = RING_GET_REQUEST(
1798 &xbb->rings.x86_32, rings->common.req_cons);
1799 blkif_get_x86_32_req(&ring_req_storage,
1800 ring_req32);
1801 ring_req = &ring_req_storage;
1802 break;
1803 }
1804 case BLKIF_PROTOCOL_X86_64:
1805 {
1806 struct blkif_x86_64_request *ring_req64;
1807
1808 			ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
1809 rings->common.req_cons);
1810 blkif_get_x86_64_req(&ring_req_storage,
1811 ring_req64);
1812 ring_req = &ring_req_storage;
1813 break;
1814 }
1815 default:
1816 panic("Unexpected blkif protocol ABI.");
1817 /* NOTREACHED */
1818 }
1819
1820 /*
1821 * Check for situations that would require closing
1822 * off this I/O for further coalescing:
1823 * - Coalescing is turned off.
1824 * - Current I/O is out of sequence with the previous
1825 * I/O.
1826 * - Coalesced I/O would be too large.
1827 */
1828 if ((reqlist != NULL)
1829 && ((xbb->no_coalesce_reqs != 0)
1830 || ((xbb->no_coalesce_reqs == 0)
1831 && ((ring_req->sector_number != cur_sector)
1832 || (ring_req->operation != cur_operation)
1833 || ((ring_req->nr_segments + reqlist->nr_segments) >
1834 xbb->max_reqlist_segments))))) {
1835 reqlist = NULL;
1836 }
1837
1838 /*
1839 * Grab and check for all resources in one shot.
1840 * If we can't get all of the resources we need,
1841 * the shortage is noted and the thread will get
1842 * woken up when more resources are available.
1843 */
1844 retval = xbb_get_resources(xbb, &reqlist, ring_req,
1845 xbb->rings.common.req_cons);
1846
1847 if (retval != 0) {
1848 /*
1849 * Resource shortage has been recorded.
1850 * We'll be scheduled to run once a request
1851 * object frees up due to a completion.
1852 */
1853 break;
1854 }
1855
1856 /*
1857 * Signify that we can overwrite this request with
1858 * a response by incrementing our consumer index.
1859 * The response won't be generated until after
1860 * we've already consumed all necessary data out
1861 * of the version of the request in the ring buffer
1862 * (for native mode). We must update the consumer
1863 * index before issuing back-end I/O so there is
1864 * no possibility that it will complete and a
1865 * response be generated before we make room in
1866 * the queue for that response.
1867 */
1868 xbb->rings.common.req_cons++;
1869 xbb->reqs_received++;
1870
1871 cur_size = xbb_count_sects(ring_req);
1872 cur_sector = ring_req->sector_number + cur_size;
1873 reqlist->next_contig_sector = cur_sector;
1874 cur_operation = ring_req->operation;
1875 }
1876
1877 /* Check for I/O to dispatch */
1878 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1879 if (reqlist == NULL) {
1880 /*
1881 * We're out of work to do, put the task queue to
1882 * sleep.
1883 */
1884 break;
1885 }
1886
1887 /*
1888 * Grab the first request off the queue and attempt
1889 * to dispatch it.
1890 */
1891 STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links);
1892
1893 retval = xbb_dispatch_io(xbb, reqlist);
1894 if (retval != 0) {
1895 /*
1896 * xbb_dispatch_io() returns non-zero only when
1897 * there is a resource shortage. If that's the
1898 * case, re-queue this request on the head of the
1899 * queue, and go to sleep until we have more
1900 * resources.
1901 */
1902 STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq,
1903 reqlist, links);
1904 break;
1905 } else {
1906 /*
1907 * If we still have anything on the queue after
1908 * removing the head entry, that is because we
1909 * met one of the criteria to create a new
1910 * request list (outlined above), and we'll call
1911 * that a forced dispatch for statistical purposes.
1912 *
1913 * Otherwise, if there is only one element on the
1914 * queue, we coalesced everything available on
1915 * the ring and we'll call that a normal dispatch.
1916 */
1917 reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq);
1918
1919 if (reqlist != NULL)
1920 xbb->forced_dispatch++;
1921 else
1922 xbb->normal_dispatch++;
1923
1924 xbb->total_dispatch++;
1925 }
1926 }
1927 }
1928
1929 /**
1930 * Interrupt handler bound to the shared ring's event channel.
1931 *
1932 * \param arg Callback argument registered during event channel
1933 * binding - the xbb_softc for this instance.
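 *
 * \note Runs in interrupt filter context and therefore must not sleep
 *       or acquire the softc mutex; all ring processing is deferred to
 *       the xbb_run_queue task on the fast taskqueue.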
1934 */
1935 static int
1936 xbb_filter(void *arg)
1937 {
1938 struct xbb_softc *xbb;
1939
1940 /* Defer to taskqueue thread. */
1941 xbb = (struct xbb_softc *)arg;
1942 taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
1943
1944 return (FILTER_HANDLED);
1945 }
1946
1947 SDT_PROVIDER_DEFINE(xbb);
1948 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int");
1949 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t",
1950 "uint64_t");
1951 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int",
1952 "uint64_t", "uint64_t");
1953
1954 /*----------------------------- Backend Handlers -----------------------------*/
1955 /**
1956 * Backend handler for character device access.
1957 *
1958 * \param xbb Per-instance xbb configuration structure.
1959 * \param reqlist Allocated internal request list structure.
1960 * \param operation BIO_* I/O operation code.
1961 * \param bio_flags Additional bio_flag data to pass to any generated
1962 * bios (e.g. BIO_ORDERED).
1963 *
1964 * \return 0 for success, errno codes for failure.
1965 */
1966 static int
1967 xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
1968 int operation, int bio_flags)
1969 {
1970 struct xbb_dev_data *dev_data;
1971 struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST];
1972 off_t bio_offset;
1973 struct bio *bio;
1974 struct xbb_sg *xbb_sg;
1975 u_int nbio;
1976 u_int bio_idx;
1977 u_int nseg;
1978 u_int seg_idx;
1979 int error;
1980
1981 dev_data = &xbb->backend.dev;
1982 bio_offset = (off_t)reqlist->starting_sector_number
1983 << xbb->sector_size_shift;
1984 error = 0;
1985 nbio = 0;
1986 bio_idx = 0;
1987
1988 if (operation == BIO_FLUSH) {
1989 bio = g_new_bio();
1990 if (__predict_false(bio == NULL)) {
1991 DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
1992 error = ENOMEM;
1993 return (error);
1994 }
1995
1996 bio->bio_cmd = BIO_FLUSH;
1997 bio->bio_flags |= BIO_ORDERED;
1998 bio->bio_dev = dev_data->cdev;
1999 bio->bio_offset = 0;
2000 bio->bio_data = 0;
2001 bio->bio_done = xbb_bio_done;
2002 bio->bio_caller1 = reqlist;
2003 bio->bio_pblkno = 0;
2004
2005 reqlist->pendcnt = 1;
2006
2007 SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush,
2008 device_get_unit(xbb->dev));
2009
2010 (*dev_data->csw->d_strategy)(bio);
2011
2012 return (0);
2013 }
2014
2015 xbb_sg = xbb->xbb_sgs;
2016 bio = NULL;
2017 nseg = reqlist->nr_segments;
2018
2019 for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2020 /*
2021 * KVA will not be contiguous, so any additional
2022 * I/O will need to be represented in a new bio.
2023 */
2024 if ((bio != NULL)
2025 && (xbb_sg->first_sect != 0)) {
2026 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2027 printf("%s: Discontiguous I/O request "
2028 "from domain %d ends on "
2029 "non-sector boundary\n",
2030 __func__, xbb->otherend_id);
2031 error = EINVAL;
2032 goto fail_free_bios;
2033 }
2034 bio = NULL;
2035 }
2036
2037 if (bio == NULL) {
2038 /*
2039 * Make sure that the start of this bio is
2040 * aligned to a device sector.
2041 */
2042 if ((bio_offset & (xbb->sector_size - 1)) != 0) {
2043 printf("%s: Misaligned I/O request "
2044 "from domain %d\n", __func__,
2045 xbb->otherend_id);
2046 error = EINVAL;
2047 goto fail_free_bios;
2048 }
2049
2050 bio = bios[nbio++] = g_new_bio();
2051 if (__predict_false(bio == NULL)) {
2052 error = ENOMEM;
2053 goto fail_free_bios;
2054 }
2055 bio->bio_cmd = operation;
2056 bio->bio_flags |= bio_flags;
2057 bio->bio_dev = dev_data->cdev;
2058 bio->bio_offset = bio_offset;
2059 bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx,
2060 xbb_sg->first_sect);
2061 bio->bio_done = xbb_bio_done;
2062 bio->bio_caller1 = reqlist;
2063 bio->bio_pblkno = bio_offset >> xbb->sector_size_shift;
2064 }
2065
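/*
 * Blkif segment fields (xbb_sg->nsect, first_sect, last_sect) are in
 * 512-byte units, so shifting by 9 converts sectors to bytes.
 */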
2066 bio->bio_length += xbb_sg->nsect << 9;
2067 bio->bio_bcount = bio->bio_length;
2068 bio_offset += xbb_sg->nsect << 9;
2069
2070 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
2071 if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
2072 printf("%s: Discontiguous I/O request "
2073 "from domain %d ends on "
2074 "non-sector boundary\n",
2075 __func__, xbb->otherend_id);
2076 error = EINVAL;
2077 goto fail_free_bios;
2078 }
2079 /*
2080 * KVA will not be contiguous, so any additional
2081 * I/O will need to be represented in a new bio.
2082 */
2083 bio = NULL;
2084 }
2085 }
2086
2087 reqlist->pendcnt = nbio;
2088
2089 for (bio_idx = 0; bio_idx < nbio; bio_idx++)
2090 {
2091 if (operation == BIO_READ) {
2092 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read,
2093 device_get_unit(xbb->dev),
2094 bios[bio_idx]->bio_offset,
2095 bios[bio_idx]->bio_length);
2096 } else if (operation == BIO_WRITE) {
2097 SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write,
2098 device_get_unit(xbb->dev),
2099 bios[bio_idx]->bio_offset,
2100 bios[bio_idx]->bio_length);
2101 }
2102 (*dev_data->csw->d_strategy)(bios[bio_idx]);
2103 }
2104
2105 return (error);
2106
2107 fail_free_bios:
2108 	/* Free every bio allocated so far; the most recent slot may be NULL. */
2109 	for (bio_idx = 0; bio_idx < nbio; bio_idx++)
		if (bios[bio_idx] != NULL)
			g_destroy_bio(bios[bio_idx]);
2110
2111 return (error);
2112 }
2113
2114 SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int");
2115 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t",
2116 "uint64_t");
2117 SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int",
2118 "uint64_t", "uint64_t");
2119
2120 /**
2121 * Backend handler for file access.
2122 *
2123 * \param xbb Per-instance xbb configuration structure.
2124 * \param reqlist Allocated internal request list.
2125 * \param operation BIO_* I/O operation code.
2126 * \param flags Additional bio_flag data to pass to any generated bios
2127 * (e.g. BIO_ORDERED).
2128 *
2129 * \return 0 for success, errno codes for failure.
2130 */
2131 static int
2132 xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist,
2133 int operation, int flags)
2134 {
2135 struct xbb_file_data *file_data;
2136 u_int seg_idx;
2137 u_int nseg;
2138 struct uio xuio;
2139 struct xbb_sg *xbb_sg;
2140 struct iovec *xiovec;
2141 int error;
2142
2143 file_data = &xbb->backend.file;
2144 error = 0;
2145 bzero(&xuio, sizeof(xuio));
2146
2147 switch (operation) {
2148 case BIO_READ:
2149 xuio.uio_rw = UIO_READ;
2150 break;
2151 case BIO_WRITE:
2152 xuio.uio_rw = UIO_WRITE;
2153 break;
2154 case BIO_FLUSH: {
2155 struct mount *mountpoint;
2156
2157 SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush,
2158 device_get_unit(xbb->dev));
2159
2160 (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2161
2162 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2163 error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
2164 VOP_UNLOCK(xbb->vn);
2165
2166 vn_finished_write(mountpoint);
2167
2168 goto bailout_send_response;
2169 /* NOTREACHED */
2170 }
2171 default:
2172 panic("invalid operation %d", operation);
2173 /* NOTREACHED */
2174 }
2175 xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number
2176 << xbb->sector_size_shift;
2177 xuio.uio_segflg = UIO_SYSSPACE;
2178 xuio.uio_iov = file_data->xiovecs;
2179 xuio.uio_iovcnt = 0;
2180 xbb_sg = xbb->xbb_sgs;
2181 nseg = reqlist->nr_segments;
2182
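/*
 * Build the uio's iovec array from the segment list. As an
 * illustrative example, three full-page segments that are contiguous
 * in reqlist KVA collapse into a single iovec of 3 * PAGE_SIZE bytes,
 * while a segment with a non-zero first_sect or a short last_sect
 * forces a new iovec.
 */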
2183 for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) {
2184 /*
2185 * If the first sector is not 0, the KVA will
2186 * not be contiguous and we'll need to go on
2187 * to another segment.
2188 */
2189 if (xbb_sg->first_sect != 0)
2190 xiovec = NULL;
2191
2192 if (xiovec == NULL) {
2193 xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
2194 xiovec->iov_base = xbb_reqlist_ioaddr(reqlist,
2195 seg_idx, xbb_sg->first_sect);
2196 xiovec->iov_len = 0;
2197 xuio.uio_iovcnt++;
2198 }
2199
2200 xiovec->iov_len += xbb_sg->nsect << 9;
2201
2202 xuio.uio_resid += xbb_sg->nsect << 9;
2203
2204 /*
2205 * If the last sector is not the full page
2206 * size count, the next segment will not be
2207 * contiguous in KVA and we need a new iovec.
2208 */
2209 if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
2210 xiovec = NULL;
2211 }
2212
2213 xuio.uio_td = curthread;
2214
2215 switch (operation) {
2216 case BIO_READ:
2217
2218 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read,
2219 device_get_unit(xbb->dev), xuio.uio_offset,
2220 xuio.uio_resid);
2221
2222 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2223
2224 /*
2225 * UFS pays attention to IO_DIRECT for reads. If the
2226 * DIRECTIO option is configured into the kernel, it calls
2227 * ffs_rawread(). But that only works for single-segment
2228 * uios with user space addresses. In our case, with a
2229 * kernel uio, it still reads into the buffer cache, but it
2230 * will just try to release the buffer from the cache later
2231 * on in ffs_read().
2232 *
2233 * ZFS does not pay attention to IO_DIRECT for reads.
2234 *
2235 * UFS does not pay attention to IO_SYNC for reads.
2236 *
2237 * ZFS pays attention to IO_SYNC (which translates into the
2238 * Solaris define FRSYNC for zfs_read()) for reads. It
2239 * attempts to sync the file before reading.
2240 *
2241 * So, to attempt to provide some barrier semantics in the
2242 * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
2243 */
2244 error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2245 (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
2246
2247 VOP_UNLOCK(xbb->vn);
2248 break;
2249 case BIO_WRITE: {
2250 struct mount *mountpoint;
2251
2252 SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write,
2253 device_get_unit(xbb->dev), xuio.uio_offset,
2254 xuio.uio_resid);
2255
2256 (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
2257
2258 vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
2259
2260 /*
2261 * UFS pays attention to IO_DIRECT for writes. The write
2262 * is done asynchronously. (Normally the write would just
2263 * get put into the cache.)
2264 *
2265 * UFS pays attention to IO_SYNC for writes. It will
2266 * attempt to write the buffer out synchronously if that
2267 * flag is set.
2268 *
2269 * ZFS does not pay attention to IO_DIRECT for writes.
2270 *
2271 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
2272 * for writes. It will flush the transaction from the
2273 * cache before returning.
2274 *
2275 * So if we've got the BIO_ORDERED flag set, we want
2276 * IO_SYNC in either the UFS or ZFS case.
2277 */
2278 error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
2279 IO_SYNC : 0, file_data->cred);
2280 VOP_UNLOCK(xbb->vn);
2281
2282 vn_finished_write(mountpoint);
2283
2284 break;
2285 }
2286 default:
2287 panic("invalid operation %d", operation);
2288 /* NOTREACHED */
2289 }
2290
2291 bailout_send_response:
2292
2293 if (error != 0)
2294 reqlist->status = BLKIF_RSP_ERROR;
2295
2296 xbb_complete_reqlist(xbb, reqlist);
2297
2298 return (0);
2299 }
2300
2301 /*--------------------------- Backend Configuration --------------------------*/
2302 /**
2303 * Close and cleanup any backend device/file specific state for this
2304 * block back instance.
2305 *
2306 * \param xbb Per-instance xbb configuration structure.
2307 */
2308 static void
2309 xbb_close_backend(struct xbb_softc *xbb)
2310 {
2311 DROP_GIANT();
2312 DPRINTF("closing dev=%s\n", xbb->dev_name);
2313 if (xbb->vn) {
2314 int flags = FREAD;
2315
2316 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2317 flags |= FWRITE;
2318
2319 switch (xbb->device_type) {
2320 case XBB_TYPE_DISK:
2321 if (xbb->backend.dev.csw) {
2322 dev_relthread(xbb->backend.dev.cdev,
2323 xbb->backend.dev.dev_ref);
2324 xbb->backend.dev.csw = NULL;
2325 xbb->backend.dev.cdev = NULL;
2326 }
2327 break;
2328 case XBB_TYPE_FILE:
2329 break;
2330 case XBB_TYPE_NONE:
2331 default:
2332 panic("Unexpected backend type.");
2333 break;
2334 }
2335
2336 (void)vn_close(xbb->vn, flags, NOCRED, curthread);
2337 xbb->vn = NULL;
2338
2339 switch (xbb->device_type) {
2340 case XBB_TYPE_DISK:
2341 break;
2342 case XBB_TYPE_FILE:
2343 if (xbb->backend.file.cred != NULL) {
2344 crfree(xbb->backend.file.cred);
2345 xbb->backend.file.cred = NULL;
2346 }
2347 break;
2348 case XBB_TYPE_NONE:
2349 default:
2350 panic("Unexpected backend type.");
2351 break;
2352 }
2353 }
2354 PICKUP_GIANT();
2355 }
2356
2357 /**
2358 * Open a character device to be used for backend I/O.
2359 *
2360 * \param xbb Per-instance xbb configuration structure.
2361 *
2362 * \return 0 for success, errno codes for failure.
2363 */
2364 static int
2365 xbb_open_dev(struct xbb_softc *xbb)
2366 {
2367 struct vattr vattr;
2368 struct cdev *dev;
2369 struct cdevsw *devsw;
2370 int error;
2371
2372 xbb->device_type = XBB_TYPE_DISK;
2373 xbb->dispatch_io = xbb_dispatch_dev;
2374 xbb->backend.dev.cdev = xbb->vn->v_rdev;
2375 xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
2376 &xbb->backend.dev.dev_ref);
2377 if (xbb->backend.dev.csw == NULL)
2378 panic("Unable to retrieve device switch");
2379
2380 error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
2381 if (error) {
2382 xenbus_dev_fatal(xbb->dev, error, "error getting "
2383 "vnode attributes for device %s",
2384 xbb->dev_name);
2385 return (error);
2386 }
2387
2388 dev = xbb->vn->v_rdev;
2389 devsw = dev->si_devsw;
2390 if (!devsw->d_ioctl) {
2391 xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
2392 "device %s!", xbb->dev_name);
2393 return (ENODEV);
2394 }
2395
2396 error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
2397 (caddr_t)&xbb->sector_size, FREAD,
2398 curthread);
2399 if (error) {
2400 xenbus_dev_fatal(xbb->dev, error,
2401 "error calling ioctl DIOCGSECTORSIZE "
2402 "for device %s", xbb->dev_name);
2403 return (error);
2404 }
2405
2406 error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
2407 (caddr_t)&xbb->media_size, FREAD,
2408 curthread);
2409 if (error) {
2410 xenbus_dev_fatal(xbb->dev, error,
2411 "error calling ioctl DIOCGMEDIASIZE "
2412 "for device %s", xbb->dev_name);
2413 return (error);
2414 }
2415
2416 return (0);
2417 }
2418
2419 /**
2420 * Open a file to be used for backend I/O.
2421 *
2422 * \param xbb Per-instance xbb configuration structure.
2423 *
2424 * \return 0 for success, errno codes for failure.
2425 */
2426 static int
2427 xbb_open_file(struct xbb_softc *xbb)
2428 {
2429 struct xbb_file_data *file_data;
2430 struct vattr vattr;
2431 int error;
2432
2433 file_data = &xbb->backend.file;
2434 xbb->device_type = XBB_TYPE_FILE;
2435 xbb->dispatch_io = xbb_dispatch_file;
2436 error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
2437 if (error != 0) {
2438 xenbus_dev_fatal(xbb->dev, error,
2439 "error calling VOP_GETATTR()"
2440 "for file %s", xbb->dev_name);
2441 return (error);
2442 }
2443
2444 /*
2445 * Verify that we have the ability to upgrade to exclusive
2446 * access on this file so we can trap errors at open instead
2447 * of reporting them during first access.
2448 */
2449 if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
2450 vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
2451 if (VN_IS_DOOMED(xbb->vn)) {
2452 error = EBADF;
2453 xenbus_dev_fatal(xbb->dev, error,
2454 "error locking file %s",
2455 xbb->dev_name);
2456
2457 return (error);
2458 }
2459 }
2460
2461 file_data->cred = crhold(curthread->td_ucred);
2462 xbb->media_size = vattr.va_size;
2463
2464 /*
2465 * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
2466 * With ZFS, it is 131072 bytes. Block sizes that large don't work
2467 * with disklabel and UFS on FreeBSD at least. Large block sizes
2468 * may not work with other OSes as well. So just export a sector
2469 * size of 512 bytes, which should work with any OS or
2470 * application. Since our backing is a file, any block size will
2471 * work fine for the backing store.
2472 */
2473 #if 0
2474 xbb->sector_size = vattr.va_blocksize;
2475 #endif
2476 xbb->sector_size = 512;
2477
2478 /*
2479 * Sanity check. The media size has to be at least one
2480 * sector long.
2481 */
2482 if (xbb->media_size < xbb->sector_size) {
2483 error = EINVAL;
2484 xenbus_dev_fatal(xbb->dev, error,
2485 "file %s size %ju < block size %u",
2486 xbb->dev_name,
2487 (uintmax_t)xbb->media_size,
2488 xbb->sector_size);
2489 }
2490 return (error);
2491 }
2492
2493 /**
2494 * Open the backend provider for this connection.
2495 *
2496 * \param xbb Per-instance xbb configuration structure.
2497 *
2498 * \return 0 for success, errno codes for failure.
2499 */
2500 static int
2501 xbb_open_backend(struct xbb_softc *xbb)
2502 {
2503 struct nameidata nd;
2504 int flags;
2505 int error;
2506
2507 flags = FREAD;
2508 error = 0;
2509
2510 DPRINTF("opening dev=%s\n", xbb->dev_name);
2511
2512 if (rootvnode == NULL) {
2513 xenbus_dev_fatal(xbb->dev, ENOENT,
2514 "Root file system not mounted");
2515 return (ENOENT);
2516 }
2517
2518 if ((xbb->flags & XBBF_READ_ONLY) == 0)
2519 flags |= FWRITE;
2520
2521 pwd_ensure_dirs();
2522
2523 again:
2524 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name);
2525 error = vn_open(&nd, &flags, 0, NULL);
2526 if (error) {
2527 /*
2528 * This is the only reasonable guess we can make as to the
2529 * path if the user doesn't give us a fully qualified path.
2530 * If they want to specify a file, they need to provide the
2531 * full path.
2532 */
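/*
 * For example, an unqualified dev_name of "ada0s1" (illustrative) is
 * retried as "/dev/ada0s1".
 */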
2533 if (xbb->dev_name[0] != '/') {
2534 char *dev_path = "/dev/";
2535 char *dev_name;
2536
2537 /* Try adding device path at beginning of name */
2538 dev_name = malloc(strlen(xbb->dev_name)
2539 + strlen(dev_path) + 1,
2540 M_XENBLOCKBACK, M_NOWAIT);
2541 if (dev_name) {
2542 sprintf(dev_name, "%s%s", dev_path,
2543 xbb->dev_name);
2544 free(xbb->dev_name, M_XENBLOCKBACK);
2545 xbb->dev_name = dev_name;
2546 goto again;
2547 }
2548 }
2549 xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
2550 xbb->dev_name);
2551 return (error);
2552 }
2553
2554 NDFREE_PNBUF(&nd);
2555
2556 xbb->vn = nd.ni_vp;
2557
2558 /* We only support disks and files. */
2559 if (vn_isdisk_error(xbb->vn, &error)) {
2560 error = xbb_open_dev(xbb);
2561 } else if (xbb->vn->v_type == VREG) {
2562 error = xbb_open_file(xbb);
2563 } else {
2564 error = EINVAL;
2565 xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
2566 "or file", xbb->dev_name);
2567 }
2568 VOP_UNLOCK(xbb->vn);
2569
2570 if (error != 0) {
2571 xbb_close_backend(xbb);
2572 return (error);
2573 }
2574
2575 xbb->sector_size_shift = fls(xbb->sector_size) - 1;
2576 xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
2577
2578 DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
2579 (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
2580 xbb->dev_name, xbb->sector_size, xbb->media_size);
2581
2582 return (0);
2583 }
2584
2585 /*------------------------ Inter-Domain Communication ------------------------*/
2586 /**
2587 * Free dynamically allocated KVA or pseudo-physical address allocations.
2588 *
2589 * \param xbb Per-instance xbb configuration structure.
2590 */
2591 static void
2592 xbb_free_communication_mem(struct xbb_softc *xbb)
2593 {
2594 if (xbb->kva != 0) {
2595 if (xbb->pseudo_phys_res != NULL) {
2596 xenmem_free(xbb->dev, xbb->pseudo_phys_res_id,
2597 xbb->pseudo_phys_res);
2598 xbb->pseudo_phys_res = NULL;
2599 }
2600 }
2601 xbb->kva = 0;
2602 xbb->gnt_base_addr = 0;
2603 if (xbb->kva_free != NULL) {
2604 free(xbb->kva_free, M_XENBLOCKBACK);
2605 xbb->kva_free = NULL;
2606 }
2607 }
2608
2609 /**
2610 * Cleanup all inter-domain communication mechanisms.
2611 *
2612 * \param xbb Per-instance xbb configuration structure.
2613 */
2614 static int
2615 xbb_disconnect(struct xbb_softc *xbb)
2616 {
2617 DPRINTF("\n");
2618
2619 mtx_unlock(&xbb->lock);
2620 xen_intr_unbind(&xbb->xen_intr_handle);
2621 if (xbb->io_taskqueue != NULL)
2622 taskqueue_drain(xbb->io_taskqueue, &xbb->io_task);
2623 mtx_lock(&xbb->lock);
2624
2625 /*
2626 * No new interrupts can generate work, but we must wait
2627 * for all currently active requests to drain.
2628 */
2629 if (xbb->active_request_count != 0)
2630 return (EAGAIN);
2631
2632 if (xbb->flags & XBBF_RING_CONNECTED) {
2633 struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES];
2634 struct gnttab_unmap_grant_ref *op;
2635 unsigned int ring_idx;
2636 int error;
2637
2638 for (ring_idx = 0, op = ops;
2639 ring_idx < xbb->ring_config.ring_pages;
2640 ring_idx++, op++) {
2641 op->host_addr = xbb->ring_config.gnt_addr
2642 + (ring_idx * PAGE_SIZE);
2643 op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
2644 op->handle = xbb->ring_config.handle[ring_idx];
2645 }
2646
2647 error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
2648 xbb->ring_config.ring_pages);
2649 if (error != 0)
2650 panic("Grant table op failed (%d)", error);
2651
2652 xbb->flags &= ~XBBF_RING_CONNECTED;
2653 }
2654
2655 xbb_free_communication_mem(xbb);
2656
2657 if (xbb->requests != NULL) {
2658 free(xbb->requests, M_XENBLOCKBACK);
2659 xbb->requests = NULL;
2660 }
2661
2662 if (xbb->request_lists != NULL) {
2663 struct xbb_xen_reqlist *reqlist;
2664 int i;
2665
2666 /* There is one request list for every allocated request. */
2667 for (i = 0, reqlist = xbb->request_lists;
2668 i < xbb->max_requests; i++, reqlist++) {
2669 if (reqlist->gnt_handles != NULL) {
2670 free(reqlist->gnt_handles, M_XENBLOCKBACK);
2671 reqlist->gnt_handles = NULL;
2672 }
2673 }
2674 free(xbb->request_lists, M_XENBLOCKBACK);
2675 xbb->request_lists = NULL;
2676 }
2677
2678 return (0);
2679 }
2680
2681 /**
2682 * Map shared memory ring into domain local address space, initialize
2683 * ring control structures, and bind an interrupt to the event channel
2684 * used to notify us of ring changes.
2685 *
2686 * \param xbb Per-instance xbb configuration structure.
2687 */
2688 static int
2689 xbb_connect_ring(struct xbb_softc *xbb)
2690 {
2691 struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES];
2692 struct gnttab_map_grant_ref *gnt;
2693 u_int ring_idx;
2694 int error;
2695
2696 if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
2697 return (0);
2698
2699 /*
2700 * Kva for our ring is at the tail of the region of kva allocated
2701 * by xbb_alloc_communication_mem().
2702 */
2703 xbb->ring_config.va = xbb->kva
2704 + (xbb->kva_size
2705 - (xbb->ring_config.ring_pages * PAGE_SIZE));
2706 xbb->ring_config.gnt_addr = xbb->gnt_base_addr
2707 + (xbb->kva_size
2708 - (xbb->ring_config.ring_pages * PAGE_SIZE));
2709
2710 for (ring_idx = 0, gnt = gnts;
2711 ring_idx < xbb->ring_config.ring_pages;
2712 ring_idx++, gnt++) {
2713 gnt->host_addr = xbb->ring_config.gnt_addr
2714 + (ring_idx * PAGE_SIZE);
2715 gnt->flags = GNTMAP_host_map;
2716 gnt->ref = xbb->ring_config.ring_ref[ring_idx];
2717 gnt->dom = xbb->otherend_id;
2718 }
2719
2720 error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
2721 xbb->ring_config.ring_pages);
2722 if (error)
2723 panic("blkback: Ring page grant table op failed (%d)", error);
2724
2725 for (ring_idx = 0, gnt = gnts;
2726 ring_idx < xbb->ring_config.ring_pages;
2727 ring_idx++, gnt++) {
2728 if (gnt->status != 0) {
2729 struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES];
2730 unsigned int i, j;
2731
2732 xbb->ring_config.va = 0;
2733 xenbus_dev_fatal(xbb->dev, EACCES,
2734 "Ring shared page mapping failed. "
2735 "Status %d.", gnt->status);
2736
2737 /* Unmap everything to avoid leaking grant table maps */
2738 for (i = 0, j = 0; i < xbb->ring_config.ring_pages;
2739 i++) {
2740 if (gnts[i].status != GNTST_okay)
2741 continue;
2742
2743 unmap[j].host_addr = gnts[i].host_addr;
2744 unmap[j].dev_bus_addr = gnts[i].dev_bus_addr;
2745 unmap[j++].handle = gnts[i].handle;
2746 }
2747 if (j != 0) {
2748 error = HYPERVISOR_grant_table_op(
2749 GNTTABOP_unmap_grant_ref, unmap, j);
2750 if (error != 0)
2751 panic("Unable to unmap grants (%d)",
2752 error);
2753 }
2754 return (EACCES);
2755 }
2756 xbb->ring_config.handle[ring_idx] = gnt->handle;
2757 xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
2758 }
2759
2760 /* Initialize the ring based on ABI. */
2761 switch (xbb->abi) {
2762 case BLKIF_PROTOCOL_NATIVE:
2763 {
2764 blkif_sring_t *sring;
2765 sring = (blkif_sring_t *)xbb->ring_config.va;
2766 BACK_RING_INIT(&xbb->rings.native, sring,
2767 xbb->ring_config.ring_pages * PAGE_SIZE);
2768 break;
2769 }
2770 case BLKIF_PROTOCOL_X86_32:
2771 {
2772 blkif_x86_32_sring_t *sring_x86_32;
2773 sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
2774 BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
2775 xbb->ring_config.ring_pages * PAGE_SIZE);
2776 break;
2777 }
2778 case BLKIF_PROTOCOL_X86_64:
2779 {
2780 blkif_x86_64_sring_t *sring_x86_64;
2781 sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
2782 BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
2783 xbb->ring_config.ring_pages * PAGE_SIZE);
2784 break;
2785 }
2786 default:
2787 panic("Unexpected blkif protocol ABI.");
2788 }
2789
2790 xbb->flags |= XBBF_RING_CONNECTED;
2791
2792 error = xen_intr_bind_remote_port(xbb->dev,
2793 xbb->otherend_id,
2794 xbb->ring_config.evtchn,
2795 xbb_filter,
2796 /*ithread_handler*/NULL,
2797 /*arg*/xbb,
2798 INTR_TYPE_BIO | INTR_MPSAFE,
2799 &xbb->xen_intr_handle);
2800 if (error) {
2801 xenbus_dev_fatal(xbb->dev, error, "binding event channel");
2802 return (error);
2803 }
2804
2805 DPRINTF("rings connected!\n");
2806
2807 return (0);
2808 }
2809
2810 /**
2811 * Size KVA and pseudo-physical address allocations based on negotiated
2812 * values for the size and number of I/O requests, and the size of our
2813 * communication ring.
2814 *
2815 * \param xbb Per-instance xbb configuration structure.
2816 *
2817 * These address spaces are used to dynamically map pages in the
2818 * front-end's domain into our own.
2819 */
2820 static int
2821 xbb_alloc_communication_mem(struct xbb_softc *xbb)
2822 {
2823 xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments;
2824 xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE;
2825 xbb->kva_size = xbb->reqlist_kva_size +
2826 (xbb->ring_config.ring_pages * PAGE_SIZE);
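/*
 * Illustrative sizing, assuming the legacy defaults of 32 requests
 * with BLKIF_MAX_SEGMENTS_PER_REQUEST (11) segments each and a single
 * 4 KiB ring page: (32 * 11 + 1) pages, or roughly 1.4 MiB of KVA and
 * pseudo-physical address space.
 */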
2827
2828 xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT);
2829 if (xbb->kva_free == NULL)
2830 return (ENOMEM);
2831
2832 DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n",
2833 device_get_nameunit(xbb->dev), xbb->kva_size,
2834 xbb->reqlist_kva_size);
2835 /*
2836 * Reserve a range of pseudo physical memory that we can map
2837 * into kva. These pages will only be backed by machine
2838 * pages ("real memory") during the lifetime of front-end requests
2839 * via grant table operations.
2840 */
2841 xbb->pseudo_phys_res_id = 0;
2842 xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id,
2843 xbb->kva_size);
2844 if (xbb->pseudo_phys_res == NULL) {
2845 xbb->kva = 0;
2846 return (ENOMEM);
2847 }
2848 xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
2849 xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
2850
2851 DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n",
2852 device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva,
2853 (uintmax_t)xbb->gnt_base_addr);
2854 return (0);
2855 }
2856
2857 /**
2858 * Collect front-end information from the XenStore.
2859 *
2860 * \param xbb Per-instance xbb configuration structure.
2861 */
2862 static int
2863 xbb_collect_frontend_info(struct xbb_softc *xbb)
2864 {
2865 char protocol_abi[64];
2866 const char *otherend_path;
2867 int error;
2868 u_int ring_idx;
2869 u_int ring_page_order;
2870 size_t ring_size;
2871
2872 otherend_path = xenbus_get_otherend_path(xbb->dev);
2873
2874 /*
2875 * Protocol defaults valid even if all negotiation fails.
2876 */
2877 xbb->ring_config.ring_pages = 1;
2878 xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2879 xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
2880
2881 /*
2882 * Mandatory data (used in all versions of the protocol) first.
2883 */
2884 error = xs_scanf(XST_NIL, otherend_path,
2885 "event-channel", NULL, "%" PRIu32,
2886 &xbb->ring_config.evtchn);
2887 if (error != 0) {
2888 xenbus_dev_fatal(xbb->dev, error,
2889 "Unable to retrieve event-channel information "
2890 "from frontend %s. Unable to connect.",
2891 xenbus_get_otherend_path(xbb->dev));
2892 return (error);
2893 }
2894
2895 /*
2896 * These fields are initialized to legacy protocol defaults
2897 * so we only need to fail if reading the updated value succeeds
2898 * and the new value is outside of its allowed range.
2899 *
2900 * \note xs_gather() returns on the first encountered error, so
2901 * we must use independent calls in order to guarantee
2902 * we don't miss information in a sparsely populated front-end
2903 * tree.
2904 *
2905 * \note xs_scanf() does not update variables for unmatched
2906 * fields.
2907 */
2908 ring_page_order = 0;
2909 xbb->max_requests = 32;
2910
2911 (void)xs_scanf(XST_NIL, otherend_path,
2912 "ring-page-order", NULL, "%u",
2913 &ring_page_order);
2914 xbb->ring_config.ring_pages = 1 << ring_page_order;
2915 ring_size = PAGE_SIZE * xbb->ring_config.ring_pages;
2916 xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size);
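/*
 * For example, a frontend advertising ring-page-order 2 gets a 4-page
 * shared ring (16 KiB with 4 KiB pages), and max_requests becomes the
 * number of request slots that fit in a ring of that size.
 */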
2917
2918 if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
2919 xenbus_dev_fatal(xbb->dev, EINVAL,
2920 "Front-end specified ring-pages of %u "
2921 "exceeds backend limit of %u. "
2922 "Unable to connect.",
2923 xbb->ring_config.ring_pages,
2924 XBB_MAX_RING_PAGES);
2925 return (EINVAL);
2926 }
2927
2928 if (xbb->ring_config.ring_pages == 1) {
2929 error = xs_gather(XST_NIL, otherend_path,
2930 "ring-ref", "%" PRIu32,
2931 &xbb->ring_config.ring_ref[0],
2932 NULL);
2933 if (error != 0) {
2934 xenbus_dev_fatal(xbb->dev, error,
2935 "Unable to retrieve ring information "
2936 "from frontend %s. Unable to "
2937 "connect.",
2938 xenbus_get_otherend_path(xbb->dev));
2939 return (error);
2940 }
2941 } else {
2942 /* Multi-page ring format. */
2943 for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages;
2944 ring_idx++) {
2945 char ring_ref_name[] = "ring-refXX";
2946
2947 snprintf(ring_ref_name, sizeof(ring_ref_name),
2948 "ring-ref%u", ring_idx);
2949 error = xs_scanf(XST_NIL, otherend_path,
2950 ring_ref_name, NULL, "%" PRIu32,
2951 &xbb->ring_config.ring_ref[ring_idx]);
2952 if (error != 0) {
2953 xenbus_dev_fatal(xbb->dev, error,
2954 "Failed to retriev grant "
2955 "reference for page %u of "
2956 "shared ring. Unable "
2957 "to connect.", ring_idx);
2958 return (error);
2959 }
2960 }
2961 }
2962
2963 error = xs_gather(XST_NIL, otherend_path,
2964 "protocol", "%63s", protocol_abi,
2965 NULL);
2966 if (error != 0
2967 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
2968 /*
2969 * Assume native if the frontend has not
2970 * published ABI data or it has published and
2971 * matches our own ABI.
2972 */
2973 xbb->abi = BLKIF_PROTOCOL_NATIVE;
2974 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
2975 xbb->abi = BLKIF_PROTOCOL_X86_32;
2976 } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
2977 xbb->abi = BLKIF_PROTOCOL_X86_64;
2978 } else {
2979 xenbus_dev_fatal(xbb->dev, EINVAL,
2980 "Unknown protocol ABI (%s) published by "
2981 "frontend. Unable to connect.", protocol_abi);
2982 return (EINVAL);
2983 }
2984 return (0);
2985 }
2986
2987 /**
2988 * Allocate per-request data structures given request size and number
2989 * information negotiated with the front-end.
2990 *
2991 * \param xbb Per-instance xbb configuration structure.
2992 */
2993 static int
2994 xbb_alloc_requests(struct xbb_softc *xbb)
2995 {
2996 struct xbb_xen_req *req;
2997 struct xbb_xen_req *last_req;
2998
2999 /*
3000 * Allocate request bookkeeping data structures.
3001 */
3002 xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
3003 M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3004 if (xbb->requests == NULL) {
3005 xenbus_dev_fatal(xbb->dev, ENOMEM,
3006 "Unable to allocate request structures");
3007 return (ENOMEM);
3008 }
3009
3010 req = xbb->requests;
3011 last_req = &xbb->requests[xbb->max_requests - 1];
3012 STAILQ_INIT(&xbb->request_free_stailq);
3013 while (req <= last_req) {
3014 STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links);
3015 req++;
3016 }
3017 return (0);
3018 }
3019
3020 static int
3021 xbb_alloc_request_lists(struct xbb_softc *xbb)
3022 {
3023 struct xbb_xen_reqlist *reqlist;
3024 int i;
3025
3026 /*
3027 * If no requests can be merged, we need 1 request list per
3028 * in flight request.
3029 */
3030 xbb->request_lists = malloc(xbb->max_requests *
3031 sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3032 if (xbb->request_lists == NULL) {
3033 xenbus_dev_fatal(xbb->dev, ENOMEM,
3034 "Unable to allocate request list structures");
3035 return (ENOMEM);
3036 }
3037
3038 STAILQ_INIT(&xbb->reqlist_free_stailq);
3039 STAILQ_INIT(&xbb->reqlist_pending_stailq);
3040 for (i = 0; i < xbb->max_requests; i++) {
3041 int seg;
3042
3043 reqlist = &xbb->request_lists[i];
3044
3045 reqlist->xbb = xbb;
3046
3047 reqlist->gnt_handles = malloc(xbb->max_reqlist_segments *
3048 sizeof(*reqlist->gnt_handles),
3049 M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
3050 if (reqlist->gnt_handles == NULL) {
3051 xenbus_dev_fatal(xbb->dev, ENOMEM,
3052 "Unable to allocate request "
3053 "grant references");
3054 return (ENOMEM);
3055 }
3056
3057 for (seg = 0; seg < xbb->max_reqlist_segments; seg++)
3058 reqlist->gnt_handles[seg] = GRANT_REF_INVALID;
3059
3060 STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links);
3061 }
3062 return (0);
3063 }
3064
3065 /**
3066 * Supply information about the physical device to the frontend
3067 * via XenBus.
3068 *
3069 * \param xbb Per-instance xbb configuration structure.
3070 */
3071 static int
3072 xbb_publish_backend_info(struct xbb_softc *xbb)
3073 {
3074 struct xs_transaction xst;
3075 const char *our_path;
3076 const char *leaf;
3077 int error;
3078
3079 our_path = xenbus_get_node(xbb->dev);
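/*
 * XenStore transactions fail with EAGAIN when another writer races
 * with us, so retry until the transaction commits or fails with a
 * different error.
 */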
3080 while (1) {
3081 error = xs_transaction_start(&xst);
3082 if (error != 0) {
3083 xenbus_dev_fatal(xbb->dev, error,
3084 "Error publishing backend info "
3085 "(start transaction)");
3086 return (error);
3087 }
3088
3089 leaf = "sectors";
3090 error = xs_printf(xst, our_path, leaf,
3091 "%"PRIu64, xbb->media_num_sectors);
3092 if (error != 0)
3093 break;
3094
3095 /* XXX Support all VBD attributes here. */
3096 leaf = "info";
3097 error = xs_printf(xst, our_path, leaf, "%u",
3098 xbb->flags & XBBF_READ_ONLY
3099 ? VDISK_READONLY : 0);
3100 if (error != 0)
3101 break;
3102
3103 leaf = "sector-size";
3104 error = xs_printf(xst, our_path, leaf, "%u",
3105 xbb->sector_size);
3106 if (error != 0)
3107 break;
3108
3109 error = xs_transaction_end(xst, 0);
3110 if (error == 0) {
3111 return (0);
3112 } else if (error != EAGAIN) {
3113 xenbus_dev_fatal(xbb->dev, error, "ending transaction");
3114 return (error);
3115 }
3116 }
3117
3118 xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
3119 our_path, leaf);
3120 xs_transaction_end(xst, 1);
3121 return (error);
3122 }
3123
3124 /**
3125 * Connect to our blkfront peer now that it has completed publishing
3126 * its configuration into the XenStore.
3127 *
3128 * \param xbb Per-instance xbb configuration structure.
3129 */
3130 static void
3131 xbb_connect(struct xbb_softc *xbb)
3132 {
3133 int error;
3134
3135 if (!xbb->hotplug_done ||
3136 (xenbus_get_state(xbb->dev) != XenbusStateInitWait) ||
3137 (xbb_collect_frontend_info(xbb) != 0))
3138 return;
3139
3140 xbb->flags &= ~XBBF_SHUTDOWN;
3141
3142 /*
3143 * We limit the maximum number of reqlist segments to the maximum
3144 * number of segments in the ring, or our absolute maximum,
3145 * whichever is smaller.
3146 */
3147 xbb->max_reqlist_segments = MIN(xbb->max_request_segments *
3148 xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST);
3149
3150 /*
3151 * The maximum size is simply a function of the number of segments
3152 * we can handle.
3153 */
3154 xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE;
3155
3156 /* Allocate resources whose size depends on front-end configuration. */
3157 error = xbb_alloc_communication_mem(xbb);
3158 if (error != 0) {
3159 xenbus_dev_fatal(xbb->dev, error,
3160 "Unable to allocate communication memory");
3161 return;
3162 }
3163
3164 error = xbb_publish_backend_info(xbb);
3165 if (error != 0) {
3166 xenbus_dev_fatal(xbb->dev, error,
3167 "Unable to publish device information");
3168 return;
3169 }
3170
3171 error = xbb_alloc_requests(xbb);
3172 if (error != 0) {
3173 /* Specific errors are reported by xbb_alloc_requests(). */
3174 return;
3175 }
3176
3177 error = xbb_alloc_request_lists(xbb);
3178 if (error != 0) {
3179 /* Specific errors are reported by xbb_alloc_request_lists(). */
3180 return;
3181 }
3182
3183 /*
3184 * Connect communication channel.
3185 */
3186 error = xbb_connect_ring(xbb);
3187 if (error != 0) {
3188 /* Specific errors are reported by xbb_connect_ring(). */
3189 return;
3190 }
3191
3192 /* Ready for I/O. */
3193 xenbus_set_state(xbb->dev, XenbusStateConnected);
3194 }
3195
3196 /*-------------------------- Device Teardown Support -------------------------*/
3197 /**
3198 * Perform device shutdown functions.
3199 *
3200 * \param xbb Per-instance xbb configuration structure.
3201 *
3202 * Mark this instance as shutting down, wait for any active I/O on the
3203 * backend device/file to drain, disconnect from the front-end, and notify
3204 * any waiters (e.g. a thread invoking our detach method) that detach can
3205 * now proceed.
3206 */
3207 static int
3208 xbb_shutdown(struct xbb_softc *xbb)
3209 {
3210 XenbusState frontState;
3211 int error;
3212
3213 DPRINTF("\n");
3214
3215 /*
3216 * Due to the need to drop our mutex during some
3217 * xenbus operations, it is possible for two threads
3218 * to attempt to close out shutdown processing at
3219 * the same time. Tell the caller that hits this
3220 * race to try back later.
3221 */
3222 if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0)
3223 return (EAGAIN);
3224
3225 xbb->flags |= XBBF_IN_SHUTDOWN;
3226 mtx_unlock(&xbb->lock);
3227
3228 if (xbb->hotplug_watch.node != NULL) {
3229 xs_unregister_watch(&xbb->hotplug_watch);
3230 free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
3231 xbb->hotplug_watch.node = NULL;
3232 }
3233
3234 if (xenbus_get_state(xbb->dev) < XenbusStateClosing)
3235 xenbus_set_state(xbb->dev, XenbusStateClosing);
3236
3237 frontState = xenbus_get_otherend_state(xbb->dev);
3238 mtx_lock(&xbb->lock);
3239 xbb->flags &= ~XBBF_IN_SHUTDOWN;
3240
3241 /* Wait for the frontend to disconnect (if it's connected). */
3242 if (frontState == XenbusStateConnected)
3243 return (EAGAIN);
3244
3245 DPRINTF("\n");
3246
3247 /* Indicate shutdown is in progress. */
3248 xbb->flags |= XBBF_SHUTDOWN;
3249
3250 /* Disconnect from the front-end. */
3251 error = xbb_disconnect(xbb);
3252 if (error != 0) {
3253 /*
3254 * Requests still outstanding. We'll be called again
3255 * once they complete.
3256 */
3257 KASSERT(error == EAGAIN,
3258 ("%s: Unexpected xbb_disconnect() failure %d",
3259 __func__, error));
3260
3261 return (error);
3262 }
3263
3264 DPRINTF("\n");
3265
3266 /* Indicate to xbb_detach() that it is safe to proceed. */
3267 wakeup(xbb);
3268
3269 return (0);
3270 }
3271
3272 /**
3273 * Report an attach time error to the console and Xen, and cleanup
3274 * this instance by forcing immediate detach processing.
3275 *
3276 * \param xbb Per-instance xbb configuration structure.
3277 * \param err Errno describing the error.
3278 * \param fmt Printf style format and arguments
3279 */
3280 static void
3281 xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
3282 {
3283 va_list ap;
3284 va_list ap_hotplug;
3285
3286 va_start(ap, fmt);
3287 va_copy(ap_hotplug, ap);
3288 xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
3289 "hotplug-error", fmt, ap_hotplug);
3290 va_end(ap_hotplug);
3291 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3292 "hotplug-status", "error");
3293
3294 xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
3295 va_end(ap);
3296
3297 xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3298 "online", "0");
3299 mtx_lock(&xbb->lock);
3300 xbb_shutdown(xbb);
3301 mtx_unlock(&xbb->lock);
3302 }
3303
3304 /*---------------------------- NewBus Entrypoints ----------------------------*/
3305 /**
3306 * Inspect a XenBus device and claim it if it is of the appropriate type.
3307 *
3308 * \param dev NewBus device object representing a candidate XenBus device.
3309 *
3310 * \return 0 for success, errno codes for failure.
3311 */
3312 static int
3313 xbb_probe(device_t dev)
3314 {
3315
3316 if (strcmp(xenbus_get_type(dev), "vbd"))
3317 return (ENXIO);
3318
3319 /* Only attach if Xen creates IOMMU entries for grant mapped pages. */
3320 if (!xen_has_iommu_maps()) {
3321 static bool warned;
3322
3323 if (!warned) {
3324 warned = true;
3325 printf(
3326 "xen-blkback disabled due to grant maps lacking IOMMU entries\n");
3327 }
3328 return (ENXIO);
3329 }
3330
3331 device_set_desc(dev, "Backend Virtual Block Device");
3332 device_quiet(dev);
3333 return (0);
3334 }
3335
3336 /**
3337 * Setup sysctl variables to control various Block Back parameters.
3338 *
3339 * \param xbb Xen Block Back softc.
3340 *
3341 */
3342 static void
3343 xbb_setup_sysctl(struct xbb_softc *xbb)
3344 {
3345 struct sysctl_ctx_list *sysctl_ctx = NULL;
3346 struct sysctl_oid *sysctl_tree = NULL;
3347
3348 sysctl_ctx = device_get_sysctl_ctx(xbb->dev);
3349 if (sysctl_ctx == NULL)
3350 return;
3351
3352 sysctl_tree = device_get_sysctl_tree(xbb->dev);
3353 if (sysctl_tree == NULL)
3354 return;
3355
3356 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3357 "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0,
3358 "fake the flush command");
3359
3360 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3361 "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0,
3362 "send a real flush for N flush requests");
3363
3364 SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3365 "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0,
3366 "Don't coalesce contiguous requests");
3367
3368 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3369 "reqs_received", CTLFLAG_RW, &xbb->reqs_received,
3370 "how many I/O requests we have received");
3371
3372 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3373 "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed,
3374 "how many I/O requests have been completed");
3375
3376 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3377 "reqs_queued_for_completion", CTLFLAG_RW,
3378 &xbb->reqs_queued_for_completion,
3379 "how many I/O requests queued but not yet pushed");
3380
3381 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3382 "reqs_completed_with_error", CTLFLAG_RW,
3383 &xbb->reqs_completed_with_error,
3384 "how many I/O requests completed with error status");
3385
3386 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3387 "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch,
3388 "how many I/O dispatches were forced");
3389
3390 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3391 "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch,
3392 "how many I/O dispatches were normal");
3393
3394 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3395 "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch,
3396 "total number of I/O dispatches");
3397
3398 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3399 "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages,
3400 "how many times we have run out of KVA");
3401
3402 SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3403 "request_shortages", CTLFLAG_RW,
3404 &xbb->request_shortages,
3405 "how many times we have run out of requests");
3406
3407 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3408 "max_requests", CTLFLAG_RD, &xbb->max_requests, 0,
3409 "maximum outstanding requests (negotiated)");
3410
3411 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3412 "max_request_segments", CTLFLAG_RD,
3413 &xbb->max_request_segments, 0,
3414 "maximum number of pages per requests (negotiated)");
3415
3416 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3417 "max_request_size", CTLFLAG_RD,
3418 &xbb->max_request_size, 0,
3419 "maximum size in bytes of a request (negotiated)");
3420
3421 SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO,
3422 "ring_pages", CTLFLAG_RD,
3423 &xbb->ring_config.ring_pages, 0,
3424 "communication channel pages (negotiated)");
3425 }
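/*
 * The knobs above are per-instance. As a sketch of expected usage
 * (assuming the standard dev.<driver>.<unit> sysctl layout), they can
 * be read or tuned from userland with sysctl(8), e.g.:
 *
 *	sysctl dev.xbbd.0.no_coalesce_reqs=1
 *	sysctl dev.xbbd.0.reqs_completed
 */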
3426
3427 static void
3428 xbb_attach_disk(device_t dev)
3429 {
3430 struct xbb_softc *xbb;
3431 int error;
3432
3433 xbb = device_get_softc(dev);
3434
3435 KASSERT(xbb->hotplug_done, ("Missing hotplug execution"));
3436
3437 /* Parse fopen style mode flags. */
3438 if (strchr(xbb->dev_mode, 'w') == NULL)
3439 xbb->flags |= XBBF_READ_ONLY;
3440
3441 /*
3442 * Verify the physical device is present and can support
3443 * the desired I/O mode.
3444 */
3445 error = xbb_open_backend(xbb);
3446 if (error != 0) {
3447 xbb_attach_failed(xbb, error, "Unable to open %s",
3448 xbb->dev_name);
3449 return;
3450 }
3451
3452 /* Use devstat(9) for recording statistics. */
3453 xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
3454 xbb->sector_size,
3455 DEVSTAT_ALL_SUPPORTED,
3456 DEVSTAT_TYPE_DIRECT
3457 | DEVSTAT_TYPE_IF_OTHER,
3458 DEVSTAT_PRIORITY_OTHER);
3459
3460 xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev),
3461 xbb->sector_size,
3462 DEVSTAT_ALL_SUPPORTED,
3463 DEVSTAT_TYPE_DIRECT
3464 | DEVSTAT_TYPE_IF_OTHER,
3465 DEVSTAT_PRIORITY_OTHER);
3466 /*
3467 * Setup sysctl variables.
3468 */
3469 xbb_setup_sysctl(xbb);
3470
3471 /*
3472 * Create a taskqueue for doing work that must occur from a
3473 * thread context.
3474 */
3475 xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev),
3476 M_NOWAIT,
3477 taskqueue_thread_enqueue,
3478 /*context*/&xbb->io_taskqueue);
3479 if (xbb->io_taskqueue == NULL) {
3480 xbb_attach_failed(xbb, ENOMEM, "Unable to create taskqueue");
3481 return;
3482 }
3483
3484 taskqueue_start_threads(&xbb->io_taskqueue,
3485 /*num threads*/1,
3486 /*priority*/PWAIT,
3487 /*thread name*/
3488 "%s taskq", device_get_nameunit(dev));
3489
3490 /* Update hot-plug status to satisfy xend. */
3491 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3492 "hotplug-status", "connected");
3493 if (error) {
3494 xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
3495 xenbus_get_node(xbb->dev));
3496 return;
3497 }
3498
3499 /* The front end might be waiting for the backend; connect if so. */
3500 if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised)
3501 xbb_connect(xbb);
3502 }
3503
3504 static void
3505 xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len)
3506 {
3507 device_t dev;
3508 struct xbb_softc *xbb;
3509 int error;
3510
3511 dev = (device_t)watch->callback_data;
3512 xbb = device_get_softc(dev);
3513
3514 error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path",
3515 NULL, &xbb->dev_name, NULL);
3516 if (error != 0)
3517 return;
3518
3519 xs_unregister_watch(watch);
3520 free(watch->node, M_XENBLOCKBACK);
3521 watch->node = NULL;
3522 xbb->hotplug_done = true;
3523
3524 /* Collect physical device information. */
3525 error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type",
3526 NULL, &xbb->dev_type, NULL);
3527 if (error != 0)
3528 xbb->dev_type = NULL;
3529
3530 error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL,
3531 &xbb->dev_mode, NULL);
3532 if (error != 0) {
3533 xbb_attach_failed(xbb, error, "reading backend fields at %s",
3534 xenbus_get_node(dev));
3535 return;
3536 }
3537
3538 xbb_attach_disk(dev);
3539 }
3540
3541 /**
3542 * Attach to a XenBus device that has been claimed by our probe routine.
3543 *
3544 * \param dev NewBus device object representing this Xen Block Back instance.
3545 *
3546 * \return 0 for success, errno codes for failure.
3547 */
3548 static int
3549 xbb_attach(device_t dev)
3550 {
3551 struct xbb_softc *xbb;
3552 int error;
3553 u_int max_ring_page_order;
3554 struct sbuf *watch_path;
3555
3556 DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
3557
3558 /*
3559 * Basic initialization.
3560 * After this block it is safe to call xbb_detach()
3561 * to clean up any allocated data for this instance.
3562 */
3563 xbb = device_get_softc(dev);
3564 xbb->dev = dev;
3565 xbb->otherend_id = xenbus_get_otherend_id(dev);
3566 TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
3567 mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
3568
3569 /*
3570 * Publish protocol capabilities for consumption by the
3571 * front-end.
3572 */
3573 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3574 "feature-barrier", "1");
3575 if (error) {
3576 xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
3577 xenbus_get_node(xbb->dev));
3578 return (error);
3579 }
3580
3581 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3582 "feature-flush-cache", "1");
3583 if (error) {
3584 xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
3585 xenbus_get_node(xbb->dev));
3586 return (error);
3587 }
3588
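/*
 * Advertise the largest ring the backend supports; e.g., a limit of 32
 * ring pages is published as max-ring-page-order 5 (2^5 = 32).
 */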
3589 max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1;
3590 error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
3591 "max-ring-page-order", "%u", max_ring_page_order);
3592 if (error) {
3593 xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order",
3594 xenbus_get_node(xbb->dev));
3595 return (error);
3596 }
3597
3598 /* Tell the toolstack blkback has attached. */
3599 xenbus_set_state(dev, XenbusStateInitWait);
3600
3601 if (xbb->hotplug_done) {
3602 xbb_attach_disk(dev);
3603 return (0);
3604 }
3605
3606 /*
3607 * We need to wait for hotplug script execution before
3608 * moving forward.
3609 */
3610 watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path");
3611 xbb->hotplug_watch.callback_data = (uintptr_t)dev;
3612 xbb->hotplug_watch.callback = xbb_attach_cb;
3613 KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup"));
3614 xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK);
3615 /*
3616 * We don't care about the path that was updated, just about value
3617 * changes on that single node, hence there's no need to queue more
3618 * than one event.
3619 */
3620 xbb->hotplug_watch.max_pending = 1;
3621 sbuf_delete(watch_path);
3622 error = xs_register_watch(&xbb->hotplug_watch);
3623 if (error != 0) {
3624 xbb_attach_failed(xbb, error, "failed to create watch on %s",
3625 xbb->hotplug_watch.node);
3626 free(xbb->hotplug_watch.node, M_XENBLOCKBACK);
3627 return (error);
3628 }
3629
3630 return (0);
3631 }
3632
3633 /**
3634 * Detach from a block back device instance.
3635 *
3636 * \param dev NewBus device object representing this Xen Block Back instance.
3637 *
3638 * \return 0 for success, errno codes for failure.
3639 *
3640 * \note A block back device may be detached at any time in its life-cycle,
3641 * including part way through the attach process. For this reason,
3642 * initialization order and the initialization state checks in this
3643 * routine must be carefully coupled so that attach time failures
3644 * are gracefully handled.
3645 */
3646 static int
3647 xbb_detach(device_t dev)
3648 {
3649 struct xbb_softc *xbb;
3650
3651 DPRINTF("\n");
3652
3653 xbb = device_get_softc(dev);
3654 mtx_lock(&xbb->lock);
3655 while (xbb_shutdown(xbb) == EAGAIN) {
3656 msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
3657 "xbb_shutdown", 0);
3658 }
3659 mtx_unlock(&xbb->lock);
3660
3661 DPRINTF("\n");
3662
3663 if (xbb->io_taskqueue != NULL)
3664 taskqueue_free(xbb->io_taskqueue);
3665
3666 if (xbb->xbb_stats != NULL)
3667 devstat_remove_entry(xbb->xbb_stats);
3668
3669 if (xbb->xbb_stats_in != NULL)
3670 devstat_remove_entry(xbb->xbb_stats_in);
3671
3672 xbb_close_backend(xbb);
3673
3674 if (xbb->dev_mode != NULL) {
3675 free(xbb->dev_mode, M_XENSTORE);
3676 xbb->dev_mode = NULL;
3677 }
3678
3679 if (xbb->dev_type != NULL) {
3680 free(xbb->dev_type, M_XENSTORE);
3681 xbb->dev_type = NULL;
3682 }
3683
3684 if (xbb->dev_name != NULL) {
3685 free(xbb->dev_name, M_XENSTORE);
3686 xbb->dev_name = NULL;
3687 }
3688
3689 mtx_destroy(&xbb->lock);
3690 return (0);
3691 }
3692
3693 /**
3694 * Prepare this block back device for suspension of this VM.
3695 *
3696 * \param dev NewBus device object representing this Xen Block Back instance.
3697 *
3698 * \return 0 for success, errno codes for failure.
3699 */
3700 static int
3701 xbb_suspend(device_t dev)
3702 {
3703 #ifdef NOT_YET
3704 struct xbb_softc *sc = device_get_softc(dev);
3705
3706 /* Prevent new requests being issued until we fix things up. */
3707 mtx_lock(&sc->xb_io_lock);
3708 sc->connected = BLKIF_STATE_SUSPENDED;
3709 mtx_unlock(&sc->xb_io_lock);
3710 #endif
3711
3712 return (0);
3713 }
3714
3715 /**
3716 * Perform any processing required to recover from a suspended state.
3717 *
3718 * \param dev NewBus device object representing this Xen Block Back instance.
3719 *
3720 * \return 0 for success, errno codes for failure.
3721 */
3722 static int
3723 xbb_resume(device_t dev)
3724 {
3725 return (0);
3726 }
3727
3728 /**
3729 * Handle state changes expressed via the XenStore by our front-end peer.
3730 *
3731 * \param dev NewBus device object representing this Xen
3732 * Block Back instance.
3733 * \param frontend_state The new state of the front-end.
3736 */
3737 static void
3738 xbb_frontend_changed(device_t dev, XenbusState frontend_state)
3739 {
3740 struct xbb_softc *xbb = device_get_softc(dev);
3741
3742 DPRINTF("frontend_state=%s, xbb_state=%s\n",
3743 xenbus_strstate(frontend_state),
3744 xenbus_strstate(xenbus_get_state(xbb->dev)));
3745
3746 switch (frontend_state) {
3747 case XenbusStateInitialising:
3748 break;
3749 case XenbusStateInitialised:
3750 case XenbusStateConnected:
3751 xbb_connect(xbb);
3752 break;
3753 case XenbusStateClosing:
3754 case XenbusStateClosed:
3755 mtx_lock(&xbb->lock);
3756 xbb_shutdown(xbb);
3757 mtx_unlock(&xbb->lock);
3758 if (frontend_state == XenbusStateClosed)
3759 xenbus_set_state(xbb->dev, XenbusStateClosed);
3760 break;
3761 default:
3762 xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
3763 frontend_state);
3764 break;
3765 }
3766 }
3767
3768 /*---------------------------- NewBus Registration ---------------------------*/
3769 static device_method_t xbb_methods[] = {
3770 /* Device interface */
3771 DEVMETHOD(device_probe, xbb_probe),
3772 DEVMETHOD(device_attach, xbb_attach),
3773 DEVMETHOD(device_detach, xbb_detach),
3774 DEVMETHOD(device_shutdown, bus_generic_shutdown),
3775 DEVMETHOD(device_suspend, xbb_suspend),
3776 DEVMETHOD(device_resume, xbb_resume),
3777
3778 /* Xenbus interface */
3779 DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
3780
3781 DEVMETHOD_END
3782 };
3783
3784 static driver_t xbb_driver = {
3785 "xbbd",
3786 xbb_methods,
3787 sizeof(struct xbb_softc),
3788 };
3789
3790 DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, 0, 0);
3791