xref: /minix/minix/drivers/storage/ahci/ahci.c (revision 0a6a1f1d)
1 /* Advanced Host Controller Interface (AHCI) driver, by D.C. van Moolenbroek
2  * - Multithreading support by Arne Welzel
3  * - Native Command Queuing support by Raja Appuswamy
4  */
5 /*
6  * This driver is based on the following specifications:
7  * - Serial ATA Advanced Host Controller Interface (AHCI) 1.3
8  * - Serial ATA Revision 2.6
9  * - AT Attachment with Packet Interface 7 (ATA/ATAPI-7)
10  * - ATAPI Removable Rewritable Media Devices 1.3 (SFF-8070)
11  *
12  * The driver supports device hot-plug, active device status tracking,
13  * nonremovable ATA and removable ATAPI devices, custom logical sector sizes,
14  * sector-unaligned reads, native command queuing and parallel requests to
15  * different devices.
16  *
17  * It does not implement transparent failure recovery, power management, or
18  * port multiplier support.
19  */
20 /*
21  * An AHCI controller exposes a number of ports (up to 32), each of which may
22  * or may not have one device attached (port multipliers are not supported).
23  * Each port is maintained independently.
24  *
25  * The following figure depicts the possible transitions between port states.
26  * The NO_PORT state is not included; no transitions can be made from or to it.
27  *
28  *   +----------+                      +----------+
29  *   | SPIN_UP  | ------+      +-----> | BAD_DEV  | ------------------+
30  *   +----------+       |      |       +----------+                   |
31  *        |             |      |            ^                         |
32  *        v             v      |            |                         |
33  *   +----------+     +----------+     +----------+     +----------+  |
34  *   |  NO_DEV  | --> | WAIT_DEV | --> | WAIT_ID  | --> | GOOD_DEV |  |
35  *   +----------+     +----------+     +----------+     +----------+  |
36  *        ^                |                |                |        |
37  *        +----------------+----------------+----------------+--------+
38  *
39  * At driver startup, all physically present ports are put in SPIN_UP state.
40  * This state differs from NO_DEV in that BDEV_OPEN calls will be deferred
41  * until either the spin-up timer expires, or a device has been identified on
42  * that port. This prevents early BDEV_OPEN calls from failing erroneously at
43  * startup time if the device has not yet been able to announce its presence.
44  *
45  * If a device is detected, either at startup time or after hot-plug, its
46  * signature is checked and it is identified, after which it may be determined
47  * to be a usable ("good") device, which means that the device is considered to
48  * be in a working state. If these steps fail, the device is marked as unusable
49  * ("bad"). At any point in time, the device may be disconnected; the port is
50  * then put back into NO_DEV state.
51  *
52  * A device in working state (GOOD_DEV) may or may not have a medium. All ATA
53  * devices are assumed to be fixed; all ATAPI devices are assumed to have
54  * removable media. To prevent erroneous access to switched devices and media,
55  * the driver makes devices inaccessible until they are fully closed (the open
56  * count is zero) when a device (hot-plug) or medium change is detected.
57  * For hot-plug changes, access is prevented by setting the BARRIER flag until
58  * the device is fully closed and then reopened. For medium changes, access is
59  * prevented by not acknowledging the medium change until the device is fully
60  * closed and reopened. Removable media are not locked in the drive while
61  * opened, because the driver author is uncomfortable with that concept.
62  *
63  * Ports may leave the group of states where a device is connected (that is,
64  * WAIT_ID, GOOD_DEV, and BAD_DEV) in two ways: either due to a hot-unplug
65  * event, or due to a hard reset after a serious failure. For simplicity, we
66  * perform a hard reset after a hot-unplug event as well, so that the link
67  * to the device is broken. Thus, in both cases, a transition to NO_DEV is
68  * made, after which the link to the device may or may not be reestablished.
69  * In both cases, ongoing requests are cancelled and the BARRIER flag is set.
70  *
71  * The following table lists for each state, whether the port is started
72  * (PxCMD.ST is set), whether a timer is running, what the PxIE mask is to be
73  * set to, and what BDEV_OPEN calls on this port should return.
74  *
75  *   State       Started     Timer       PxIE        BDEV_OPEN
76  *   ---------   ---------   ---------   ---------   ---------
77  *   NO_PORT     no          no          (none)      ENXIO
78  *   SPIN_UP     no          yes         PCE         (wait)
79  *   NO_DEV      no          no          PCE         ENXIO
80  *   WAIT_DEV    no          yes         PCE         (wait)
81  *   BAD_DEV     no          no          PRCE        ENXIO
82  *   WAIT_ID     yes         yes         PRCE+       (wait)
83  *   GOOD_DEV    yes         per-command PRCE+       OK
84  *
85  * In order to continue deferred BDEV_OPEN calls, the BUSY flag must be unset
86  * when changing from SPIN_UP to any state but WAIT_DEV, and when changing from
87  * WAIT_DEV to any state but WAIT_ID, and when changing from WAIT_ID to any
88  * other state.
89  */
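/*
 * To make the BUSY rule above concrete, a hypothetical helper (not part of
 * this driver) deciding whether a state change must unset the BUSY flag, so
 * that deferred BDEV_OPEN calls can continue, could look as follows. The
 * STATE_* constants are assumed to be defined in ahci.h.
 *
 *	static int must_unset_busy(int old_state, int new_state)
 *	{
 *		switch (old_state) {
 *		case STATE_SPIN_UP:	return new_state != STATE_WAIT_DEV;
 *		case STATE_WAIT_DEV:	return new_state != STATE_WAIT_ID;
 *		case STATE_WAIT_ID:	return TRUE;
 *		default:		return FALSE;
 *		}
 *	}
 */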
90 /*
91  * The maximum byte size of a single transfer (MAX_TRANSFER) is currently set
92  * to 4MB. This limit has been chosen for a number of reasons:
93  * - The size that can be specified in a Physical Region Descriptor (PRD) is
94  *   limited to 4MB for AHCI. Limiting the total transfer size to at most this
95  *   size implies that no I/O vector element needs to be split up across PRDs.
96  *   This means that the maximum number of needed PRDs can be predetermined.
97  * - The limit is below what can be transferred in a single ATA request, namely
98  *   64k sectors (i.e., at least 32MB). This means that transfer requests need
99  *   never be split up into smaller chunks, reducing implementation complexity.
100  * - A single, static timeout can be used for transfers. Very large transfers
101  *   can legitimately take up to several minutes -- well beyond the appropriate
102  *   timeout range for small transfers. The limit obviates the need for a
103  *   timeout scheme that takes into account the transfer size.
104  * - Similarly, the transfer limit reduces the opportunity for buggy/malicious
105  *   clients to keep the driver busy for a long time with a single request.
106  * - The limit is high enough for all practical purposes. The transfer setup
107  *   overhead is already relatively negligible at this size, and even larger
108  *   requests will not help maximize throughput. As NR_IOREQS is currently set
109  *   to 64, the limit still allows file systems to perform I/O requests with
110  *   vectors completely filled with 64KB-blocks.
111  */
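/*
 * To illustrate the arithmetic behind the last point: assuming NR_IOREQS is
 * 64 and NR_PRDS is sized accordingly in ahci.h, a request vector completely
 * filled with 64KB elements stays within both limits, since
 *
 *	64 * 64KB = 4MB = MAX_TRANSFER
 *
 * and, because no vector element is ever split across PRDs, such a request
 * needs at most 64 data PRDs plus two padding PRDs (lead and trail) in
 * setup_prdt() below.
 */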
112 #include <minix/drivers.h>
113 #include <minix/blockdriver_mt.h>
114 #include <minix/drvlib.h>
115 #include <machine/pci.h>
116 #include <sys/ioc_disk.h>
117 #include <sys/mman.h>
118 #include <assert.h>
119 
120 #include "ahci.h"
121 
122 /* Host Bus Adapter (HBA) state. */
123 static struct {
124 	volatile u32_t *base;	/* base address of memory-mapped registers */
125 	size_t size;		/* size of memory-mapped register area */
126 
127 	int nr_ports;		/* addressable number of ports (1..NR_PORTS) */
128 	int nr_cmds;		/* maximum number of commands per port */
129 	int has_ncq;		/* NCQ support flag */
130 	int has_clo;		/* CLO support flag */
131 
132 	int irq;		/* IRQ number */
133 	int hook_id;		/* IRQ hook ID */
134 } hba_state;
135 
136 #define hba_read(r)		(hba_state.base[r])
137 #define hba_write(r, v)		(hba_state.base[r] = (v))
138 
139 /* Port state. */
140 static struct port_state {
141 	int state;		/* port state */
142 	unsigned int flags;	/* port flags */
143 
144 	volatile u32_t *reg;	/* memory-mapped port registers */
145 
146 	u8_t *mem_base;		/* primary memory buffer virtual address */
147 	phys_bytes mem_phys;	/* primary memory buffer physical address */
148 	vir_bytes mem_size;	/* primary memory buffer size */
149 
150 	/* the FIS, CL, CT[0] and TMP buffers are all in the primary buffer */
151 	u32_t *fis_base;	/* FIS receive buffer virtual address */
152 	phys_bytes fis_phys;	/* FIS receive buffer physical address */
153 	u32_t *cl_base;		/* command list buffer virtual address */
154 	phys_bytes cl_phys;	/* command list buffer physical address */
155 	u8_t *ct_base[NR_CMDS];	/* command table virtual address */
156 	phys_bytes ct_phys[NR_CMDS];	/* command table physical address */
157 	u8_t *tmp_base;		/* temporary storage buffer virtual address */
158 	phys_bytes tmp_phys;	/* temporary storage buffer physical address */
159 
160 	u8_t *pad_base;		/* sector padding buffer virtual address */
161 	phys_bytes pad_phys;	/* sector padding buffer physical address */
162 	vir_bytes pad_size;	/* sector padding buffer size */
163 
164 	u64_t lba_count;	/* number of valid Logical Block Addresses */
165 	u32_t sector_size;	/* medium sector size in bytes */
166 
167 	int open_count;		/* number of times this port is opened */
168 
169 	int device;		/* associated device number, or NO_DEVICE */
170 	struct device part[DEV_PER_DRIVE];	/* partition bases and sizes */
171 	struct device subpart[SUB_PER_DRIVE];	/* same for subpartitions */
172 
173 	minix_timer_t timer;		/* port-specific timeout timer */
174 	int left;		/* number of tries left before giving up */
175 				/* (only used for signature probing) */
176 
177 	int queue_depth;	/* NCQ queue depth */
178 	u32_t pend_mask;	/* commands not yet complete */
179 	struct {
180 		thread_id_t tid;/* ID of the worker thread */
181 		minix_timer_t timer;	/* timer associated with each request */
182 		int result;	/* success/failure result of the commands */
183 	} cmd_info[NR_CMDS];
184 } port_state[NR_PORTS];
185 
186 #define port_read(ps, r)	((ps)->reg[r])
187 #define port_write(ps, r, v)	((ps)->reg[r] = (v))
188 
189 static int ahci_instance;			/* driver instance number */
190 
191 static int ahci_verbose;			/* verbosity level (0..4) */
192 
193 /* Timeout-related values. */
194 static clock_t ahci_spinup_timeout;
195 static clock_t ahci_device_timeout;
196 static clock_t ahci_device_delay;
197 static unsigned int ahci_device_checks;
198 static clock_t ahci_command_timeout;
199 static clock_t ahci_transfer_timeout;
200 static clock_t ahci_flush_timeout;
201 
202 /* Timeout environment variable names and default values. */
203 static struct {
204 	char *name;				/* environment variable name */
205 	u32_t default_ms;			/* default in milliseconds */
206 	clock_t *ptr;				/* clock ticks value pointer */
207 } ahci_timevar[] = {
208 	{ "ahci_init_timeout",   SPINUP_TIMEOUT,    &ahci_spinup_timeout   },
209 	{ "ahci_device_timeout", DEVICE_TIMEOUT,    &ahci_device_timeout   },
210 	{ "ahci_cmd_timeout",    COMMAND_TIMEOUT,   &ahci_command_timeout  },
211 	{ "ahci_io_timeout",     TRANSFER_TIMEOUT,  &ahci_transfer_timeout },
212 	{ "ahci_flush_timeout",  FLUSH_TIMEOUT,     &ahci_flush_timeout    }
213 };
214 
215 static int ahci_map[MAX_DRIVES];		/* device-to-port mapping */
216 
217 static int ahci_exiting = FALSE;		/* exit after last close? */
218 
219 #define BUILD_ARG(port, tag)	(((port) << 8) | (tag))
220 #define GET_PORT(arg)		((arg) >> 8)
221 #define GET_TAG(arg)		((arg) & 0xFF)
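/* For example, BUILD_ARG(2, 5) yields 0x0205, from which GET_PORT() recovers
 * port 2 and GET_TAG() recovers tag 5. The tag must fit in the low eight
 * bits for the round trip to be lossless.
 */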
222 
223 #define dprintf(v,s) do {		\
224 	if (ahci_verbose >= (v))	\
225 		printf s;		\
226 } while (0)
227 
228 /* Convert milliseconds to clock ticks. Round up. */
229 #define millis_to_hz(ms)	(((ms) * sys_hz() + 999) / 1000)
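/* For example, at a system clock of 100Hz, millis_to_hz(15) yields
 * (15 * 100 + 999) / 1000 = 2 ticks; plain division would round down to 1.
 */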
230 
231 static void port_set_cmd(struct port_state *ps, int cmd, cmd_fis_t *fis,
232 	u8_t packet[ATAPI_PACKET_SIZE], prd_t *prdt, int nr_prds, int write);
233 static void port_issue(struct port_state *ps, int cmd, clock_t timeout);
234 static int port_exec(struct port_state *ps, int cmd, clock_t timeout);
235 static void port_timeout(minix_timer_t *tp);
236 static void port_disconnect(struct port_state *ps);
237 
238 static char *ahci_portname(struct port_state *ps);
239 static int ahci_open(devminor_t minor, int access);
240 static int ahci_close(devminor_t minor);
241 static ssize_t ahci_transfer(devminor_t minor, int do_write, u64_t position,
242 	endpoint_t endpt, iovec_t *iovec, unsigned int count, int flags);
243 static struct device *ahci_part(devminor_t minor);
244 static void ahci_alarm(clock_t stamp);
245 static int ahci_ioctl(devminor_t minor, unsigned long request,
246 	endpoint_t endpt, cp_grant_id_t grant, endpoint_t user_endpt);
247 static void ahci_intr(unsigned int mask);
248 static int ahci_device(devminor_t minor, device_id_t *id);
249 static struct port_state *ahci_get_port(devminor_t minor);
250 
251 /* AHCI driver table. */
252 static struct blockdriver ahci_dtab = {
253 	.bdr_type	= BLOCKDRIVER_TYPE_DISK,
254 	.bdr_open	= ahci_open,
255 	.bdr_close	= ahci_close,
256 	.bdr_transfer	= ahci_transfer,
257 	.bdr_ioctl	= ahci_ioctl,
258 	.bdr_part	= ahci_part,
259 	.bdr_intr	= ahci_intr,
260 	.bdr_alarm	= ahci_alarm,
261 	.bdr_device	= ahci_device
262 };
263 
264 /*===========================================================================*
265  *				atapi_exec				     *
266  *===========================================================================*/
267 static int atapi_exec(struct port_state *ps, int cmd,
268 	u8_t packet[ATAPI_PACKET_SIZE], size_t size, int write)
269 {
270 	/* Execute an ATAPI command. Return OK or error.
271 	 */
272 	cmd_fis_t fis;
273 	prd_t prd[1];
274 	int nr_prds = 0;
275 
276 	assert(size <= AHCI_TMP_SIZE);
277 
278 	/* Fill in the command table with a FIS, a packet, and if a data
279 	 * transfer is requested, also a PRD.
280 	 */
281 	memset(&fis, 0, sizeof(fis));
282 	fis.cf_cmd = ATA_CMD_PACKET;
283 
284 	if (size > 0) {
285 		fis.cf_feat = ATA_FEAT_PACKET_DMA;
286 		if (!write && (ps->flags & FLAG_USE_DMADIR))
287 			fis.cf_feat |= ATA_FEAT_PACKET_DMADIR;
288 
289 		prd[0].vp_addr = ps->tmp_phys;
290 		prd[0].vp_size = size;
291 		nr_prds++;
292 	}
293 
294 	/* Start the command, and wait for it to complete or fail. */
295 	port_set_cmd(ps, cmd, &fis, packet, prd, nr_prds, write);
296 
297 	return port_exec(ps, cmd, ahci_command_timeout);
298 }
299 
300 /*===========================================================================*
301  *				atapi_test_unit				     *
302  *===========================================================================*/
303 static int atapi_test_unit(struct port_state *ps, int cmd)
304 {
305 	/* Test whether the ATAPI device and medium are ready.
306 	 */
307 	u8_t packet[ATAPI_PACKET_SIZE];
308 
309 	memset(packet, 0, sizeof(packet));
310 	packet[0] = ATAPI_CMD_TEST_UNIT;
311 
312 	return atapi_exec(ps, cmd, packet, 0, FALSE);
313 }
314 
315 /*===========================================================================*
316  *				atapi_request_sense			     *
317  *===========================================================================*/
318 static int atapi_request_sense(struct port_state *ps, int cmd, int *sense)
319 {
320 	/* Request error (sense) information from an ATAPI device, and return
321 	 * the sense key. The additional sense codes are not used at this time.
322 	 */
323 	u8_t packet[ATAPI_PACKET_SIZE];
324 	int r;
325 
326 	memset(packet, 0, sizeof(packet));
327 	packet[0] = ATAPI_CMD_REQUEST_SENSE;
328 	packet[4] = ATAPI_REQUEST_SENSE_LEN;
329 
330 	r = atapi_exec(ps, cmd, packet, ATAPI_REQUEST_SENSE_LEN, FALSE);
331 
332 	if (r != OK)
333 		return r;
334 
335 	dprintf(V_REQ, ("%s: ATAPI SENSE: sense %x ASC %x ASCQ %x\n",
336 		ahci_portname(ps), ps->tmp_base[2] & 0xF, ps->tmp_base[12],
337 		ps->tmp_base[13]));
338 
339 	*sense = ps->tmp_base[2] & 0xF;
340 
341 	return OK;
342 }
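
/* A note on the returned sense key: the value 0x06 (unit attention, which is
 * what ATAPI_SENSE_UNIT_ATT is assumed to denote) is reported by a device
 * after a medium change, for instance; atapi_check_medium() below relies on
 * this to detect newly inserted media.
 */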
343 
344 /*===========================================================================*
345  *				atapi_load_eject			     *
346  *===========================================================================*/
347 static int atapi_load_eject(struct port_state *ps, int cmd, int load)
348 {
349 	/* Load or eject a medium in an ATAPI device.
350 	 */
351 	u8_t packet[ATAPI_PACKET_SIZE];
352 
353 	memset(packet, 0, sizeof(packet));
354 	packet[0] = ATAPI_CMD_START_STOP;
355 	packet[4] = load ? ATAPI_START_STOP_LOAD : ATAPI_START_STOP_EJECT;
356 
357 	return atapi_exec(ps, cmd, packet, 0, FALSE);
358 }
359 
360 /*===========================================================================*
361  *				atapi_read_capacity			     *
362  *===========================================================================*/
363 static int atapi_read_capacity(struct port_state *ps, int cmd)
364 {
365 	/* Retrieve the LBA count and sector size of an ATAPI medium.
366 	 */
367 	u8_t packet[ATAPI_PACKET_SIZE], *buf;
368 	int r;
369 
370 	memset(packet, 0, sizeof(packet));
371 	packet[0] = ATAPI_CMD_READ_CAPACITY;
372 
373 	r = atapi_exec(ps, cmd, packet, ATAPI_READ_CAPACITY_LEN, FALSE);
374 	if (r != OK)
375 		return r;
376 
377 	/* Store the number of LBA blocks and sector size. */
378 	buf = ps->tmp_base;
379 	ps->lba_count = (u64_t) (((u32_t) buf[0] << 24) | (buf[1] << 16) |
380 		(buf[2] << 8) | buf[3]) + 1;
381 	ps->sector_size =
382 		(buf[4] << 24) | (buf[5] << 16) | (buf[6] << 8) | buf[7];
383 
384 	if (ps->sector_size == 0 || (ps->sector_size & 1)) {
385 		dprintf(V_ERR, ("%s: invalid medium sector size %u\n",
386 			ahci_portname(ps), ps->sector_size));
387 
388 		return EINVAL;
389 	}
390 
391 	dprintf(V_INFO,
392 		("%s: medium detected (%u byte sectors, %llu MB size)\n",
393 		ahci_portname(ps), ps->sector_size,
394 		ps->lba_count * ps->sector_size / (1024*1024)));
395 
396 	return OK;
397 }
398 
399 /*===========================================================================*
400  *				atapi_check_medium			     *
401  *===========================================================================*/
402 static int atapi_check_medium(struct port_state *ps, int cmd)
403 {
404 	/* Check whether a medium is present in a removable-media ATAPI device.
405 	 * If a new medium is detected, get its capacity and sector size. Return
406 	 * OK only if a usable medium is present, and an error otherwise.
407 	 */
408 	int sense;
409 
410 	/* Perform a readiness check. */
411 	if (atapi_test_unit(ps, cmd) != OK) {
412 		ps->flags &= ~FLAG_HAS_MEDIUM;
413 
414 		/* If the check failed due to a unit attention condition, retry
415 		 * reading the medium capacity. Otherwise, assume that there is
416 		 * no medium available.
417 		 */
418 		if (atapi_request_sense(ps, cmd, &sense) != OK ||
419 				sense != ATAPI_SENSE_UNIT_ATT)
420 			return ENXIO;
421 	}
422 
423 	/* If a medium is newly detected, try reading its capacity now. */
424 	if (!(ps->flags & FLAG_HAS_MEDIUM)) {
425 		if (atapi_read_capacity(ps, cmd) != OK)
426 			return EIO;
427 
428 		ps->flags |= FLAG_HAS_MEDIUM;
429 	}
430 
431 	return OK;
432 }
433 
434 /*===========================================================================*
435  *				atapi_id_check				     *
436  *===========================================================================*/
437 static int atapi_id_check(struct port_state *ps, u16_t *buf)
438 {
439 	/* Determine whether we support this ATAPI device based on the
440 	 * identification data it returned, and store some of its properties.
441 	 */
442 
443 	/* The device must be an ATAPI device; it must have removable media;
444 	 * it must support DMA without DMADIR, or DMADIR for DMA.
445 	 */
446 	if ((buf[ATA_ID_GCAP] & (ATA_ID_GCAP_ATAPI_MASK |
447 		ATA_ID_GCAP_REMOVABLE | ATA_ID_GCAP_INCOMPLETE)) !=
448 		(ATA_ID_GCAP_ATAPI | ATA_ID_GCAP_REMOVABLE) ||
449 		((buf[ATA_ID_CAP] & ATA_ID_CAP_DMA) != ATA_ID_CAP_DMA &&
450 		(buf[ATA_ID_DMADIR] & (ATA_ID_DMADIR_DMADIR |
451 		ATA_ID_DMADIR_DMA)) != (ATA_ID_DMADIR_DMADIR |
452 		ATA_ID_DMADIR_DMA))) {
453 
454 		dprintf(V_ERR, ("%s: unsupported ATAPI device\n",
455 			ahci_portname(ps)));
456 
457 		dprintf(V_DEV, ("%s: GCAP %04x CAP %04x DMADIR %04x\n",
458 			ahci_portname(ps), buf[ATA_ID_GCAP], buf[ATA_ID_CAP],
459 			buf[ATA_ID_DMADIR]));
460 
461 		return FALSE;
462 	}
463 
464 	/* Remember whether to use the DMADIR flag when appropriate. */
465 	if (buf[ATA_ID_DMADIR] & ATA_ID_DMADIR_DMADIR)
466 		ps->flags |= FLAG_USE_DMADIR;
467 
468 	/* ATAPI CD-ROM devices are considered read-only. */
469 	if (((buf[ATA_ID_GCAP] & ATA_ID_GCAP_TYPE_MASK) >>
470 		ATA_ID_GCAP_TYPE_SHIFT) == ATAPI_TYPE_CDROM)
471 		ps->flags |= FLAG_READONLY;
472 
473 	if ((buf[ATA_ID_SUP1] & ATA_ID_SUP1_VALID_MASK) == ATA_ID_SUP1_VALID &&
474 		!(ps->flags & FLAG_READONLY)) {
475 		/* Save write cache related capabilities of the device. It is
476 		 * possible, although unlikely, that a device has support for
477 		 * either of these but not both.
478 		 */
479 		if (buf[ATA_ID_SUP0] & ATA_ID_SUP0_WCACHE)
480 			ps->flags |= FLAG_HAS_WCACHE;
481 
482 		if (buf[ATA_ID_SUP1] & ATA_ID_SUP1_FLUSH)
483 			ps->flags |= FLAG_HAS_FLUSH;
484 	}
485 
486 	return TRUE;
487 }
488 
489 /*===========================================================================*
490  *				atapi_transfer				     *
491  *===========================================================================*/
492 static int atapi_transfer(struct port_state *ps, int cmd, u64_t start_lba,
493 	unsigned int count, int write, prd_t *prdt, int nr_prds)
494 {
495 	/* Perform data transfer from or to an ATAPI device.
496 	 */
497 	cmd_fis_t fis;
498 	u8_t packet[ATAPI_PACKET_SIZE];
499 
500 	/* Fill in a Register Host to Device FIS. */
501 	memset(&fis, 0, sizeof(fis));
502 	fis.cf_cmd = ATA_CMD_PACKET;
503 	fis.cf_feat = ATA_FEAT_PACKET_DMA;
504 	if (!write && (ps->flags & FLAG_USE_DMADIR))
505 		fis.cf_feat |= ATA_FEAT_PACKET_DMADIR;
506 
507 	/* Fill in a packet. */
508 	memset(packet, 0, sizeof(packet));
509 	packet[0] = write ? ATAPI_CMD_WRITE : ATAPI_CMD_READ;
510 	packet[2] = (start_lba >> 24) & 0xFF;
511 	packet[3] = (start_lba >> 16) & 0xFF;
512 	packet[4] = (start_lba >>  8) & 0xFF;
513 	packet[5] = start_lba & 0xFF;
514 	packet[6] = (count >> 24) & 0xFF;
515 	packet[7] = (count >> 16) & 0xFF;
516 	packet[8] = (count >>  8) & 0xFF;
517 	packet[9] = count & 0xFF;
518 
519 	/* Start the command, and wait for it to complete or fail. */
520 	port_set_cmd(ps, cmd, &fis, packet, prdt, nr_prds, write);
521 
522 	return port_exec(ps, cmd, ahci_transfer_timeout);
523 }
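
/* The packet built above follows the 12-byte READ(12)/WRITE(12) CDB layout.
 * For example, reading 16 sectors starting at LBA 0x12345 would yield, byte
 * for byte (assuming ATAPI_CMD_READ is the standard READ(12) opcode 0xA8):
 *
 *	A8 00 00 01 23 45 00 00 00 10 00 00
 */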
524 
525 /*===========================================================================*
526  *				ata_id_check				     *
527  *===========================================================================*/
528 static int ata_id_check(struct port_state *ps, u16_t *buf)
529 {
530 	/* Determine whether we support this ATA device based on the
531 	 * identification data it returned, and store some of its properties.
532 	 */
533 
534 	/* This must be an ATA device; it must not have removable media;
535 	 * it must support LBA and DMA; it must support the FLUSH CACHE
536 	 * command; it must support 48-bit addressing.
537 	 */
538 	if ((buf[ATA_ID_GCAP] & (ATA_ID_GCAP_ATA_MASK | ATA_ID_GCAP_REMOVABLE |
539 		ATA_ID_GCAP_INCOMPLETE)) != ATA_ID_GCAP_ATA ||
540 		(buf[ATA_ID_CAP] & (ATA_ID_CAP_LBA | ATA_ID_CAP_DMA)) !=
541 		(ATA_ID_CAP_LBA | ATA_ID_CAP_DMA) ||
542 		(buf[ATA_ID_SUP1] & (ATA_ID_SUP1_VALID_MASK |
543 		ATA_ID_SUP1_FLUSH | ATA_ID_SUP1_LBA48)) !=
544 		(ATA_ID_SUP1_VALID | ATA_ID_SUP1_FLUSH | ATA_ID_SUP1_LBA48)) {
545 
546 		dprintf(V_ERR, ("%s: unsupported ATA device\n",
547 			ahci_portname(ps)));
548 
549 		dprintf(V_DEV, ("%s: GCAP %04x CAP %04x SUP1 %04x\n",
550 			ahci_portname(ps), buf[ATA_ID_GCAP], buf[ATA_ID_CAP],
551 			buf[ATA_ID_SUP1]));
552 
553 		return FALSE;
554 	}
555 
556 	/* Get number of LBA blocks, and sector size. */
557 	ps->lba_count = ((u64_t) buf[ATA_ID_LBA3] << 48) |
558 			((u64_t) buf[ATA_ID_LBA2] << 32) |
559 			((u64_t) buf[ATA_ID_LBA1] << 16) |
560 			 (u64_t) buf[ATA_ID_LBA0];
561 
562 	/* Determine the queue depth of the device. */
563 	if (hba_state.has_ncq &&
564 			(buf[ATA_ID_SATA_CAP] & ATA_ID_SATA_CAP_NCQ)) {
565 		ps->flags |= FLAG_HAS_NCQ;
566 		ps->queue_depth =
567 			(buf[ATA_ID_QDEPTH] & ATA_ID_QDEPTH_MASK) + 1;
568 		if (ps->queue_depth > hba_state.nr_cmds)
569 			ps->queue_depth = hba_state.nr_cmds;
570 	}
571 
572 	/* For now, we only support long logical sectors. Long physical sector
573 	 * support may be added later. Note that the given value is in words, not
	 * bytes.
574 	 */
575 	if ((buf[ATA_ID_PLSS] & (ATA_ID_PLSS_VALID_MASK | ATA_ID_PLSS_LLS)) ==
576 		(ATA_ID_PLSS_VALID | ATA_ID_PLSS_LLS))
577 		ps->sector_size =
578 			((buf[ATA_ID_LSS1] << 16) | buf[ATA_ID_LSS0]) << 1;
579 	else
580 		ps->sector_size = ATA_SECTOR_SIZE;
581 
582 	if (ps->sector_size < ATA_SECTOR_SIZE) {
583 		dprintf(V_ERR, ("%s: invalid sector size %u\n",
584 			ahci_portname(ps), ps->sector_size));
585 
586 		return FALSE;
587 	}
588 
589 	ps->flags |= FLAG_HAS_MEDIUM | FLAG_HAS_FLUSH;
590 
591 	/* FLUSH CACHE is mandatory for ATA devices; write caches are not. */
592 	if (buf[ATA_ID_SUP0] & ATA_ID_SUP0_WCACHE)
593 		ps->flags |= FLAG_HAS_WCACHE;
594 
595 	/* Check Force Unit Access capability of the device. */
596 	if ((buf[ATA_ID_ENA2] & (ATA_ID_ENA2_VALID_MASK | ATA_ID_ENA2_FUA)) ==
597 		(ATA_ID_ENA2_VALID | ATA_ID_ENA2_FUA))
598 		ps->flags |= FLAG_HAS_FUA;
599 
600 	return TRUE;
601 }
602 
603 /*===========================================================================*
604  *				ata_transfer				     *
605  *===========================================================================*/
606 static int ata_transfer(struct port_state *ps, int cmd, u64_t start_lba,
607 	unsigned int count, int write, int force, prd_t *prdt, int nr_prds)
608 {
609 	/* Perform data transfer from or to an ATA device.
610 	 */
611 	cmd_fis_t fis;
612 
613 	assert(count <= ATA_MAX_SECTORS);
614 
615 	/* Special case for sector counts: 65536 is specified as 0. */
616 	if (count == ATA_MAX_SECTORS)
617 		count = 0;
618 
619 	memset(&fis, 0, sizeof(fis));
620 	fis.cf_dev = ATA_DEV_LBA;
621 	if (ps->flags & FLAG_HAS_NCQ) {
622 		if (write) {
623 			if (force && (ps->flags & FLAG_HAS_FUA))
624 				fis.cf_dev |= ATA_DEV_FUA;
625 
626 			fis.cf_cmd = ATA_CMD_WRITE_FPDMA_QUEUED;
627 		} else {
628 			fis.cf_cmd = ATA_CMD_READ_FPDMA_QUEUED;
629 		}
630 	}
631 	else {
632 		if (write) {
633 			if (force && (ps->flags & FLAG_HAS_FUA))
634 				fis.cf_cmd = ATA_CMD_WRITE_DMA_FUA_EXT;
635 			else
636 				fis.cf_cmd = ATA_CMD_WRITE_DMA_EXT;
637 		}
638 		else {
639 			fis.cf_cmd = ATA_CMD_READ_DMA_EXT;
640 		}
641 	}
642 	fis.cf_lba = start_lba & 0x00FFFFFFUL;
643 	fis.cf_lba_exp = (start_lba >> 24) & 0x00FFFFFFUL;
644 	fis.cf_sec = count & 0xFF;
645 	fis.cf_sec_exp = (count >> 8) & 0xFF;
646 
647 	/* Start the command, and wait for it to complete or fail. */
648 	port_set_cmd(ps, cmd, &fis, NULL /*packet*/, prdt, nr_prds, write);
649 
650 	return port_exec(ps, cmd, ahci_transfer_timeout);
651 }
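
/* As an example of the 48-bit LBA split above: for start_lba 0x123456789,
 * cf_lba receives the low 24 bits (0x456789) and cf_lba_exp the next 24 bits
 * (0x000123), matching the LBA and LBA-exp fields that ct_set_fis() places
 * into the H2D FIS.
 */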
652 
653 /*===========================================================================*
654  *				gen_identify				     *
655  *===========================================================================*/
656 static int gen_identify(struct port_state *ps, int blocking)
657 {
658 	/* Identify an ATA or ATAPI device. If the blocking flag is set, block
659 	 * until the command has completed; otherwise return immediately.
660 	 */
661 	cmd_fis_t fis;
662 	prd_t prd;
663 
664 	/* Set up a command, and a single PRD for the result. */
665 	memset(&fis, 0, sizeof(fis));
666 
667 	if (ps->flags & FLAG_ATAPI)
668 		fis.cf_cmd = ATA_CMD_IDENTIFY_PACKET;
669 	else
670 		fis.cf_cmd = ATA_CMD_IDENTIFY;
671 
672 	prd.vp_addr = ps->tmp_phys;
673 	prd.vp_size = ATA_ID_SIZE;
674 
675 	/* Start the command, and possibly wait for the result. */
676 	port_set_cmd(ps, 0, &fis, NULL /*packet*/, &prd, 1, FALSE /*write*/);
677 
678 	if (blocking)
679 		return port_exec(ps, 0, ahci_command_timeout);
680 
681 	port_issue(ps, 0, ahci_command_timeout);
682 
683 	return OK;
684 }
685 
686 /*===========================================================================*
687  *				gen_flush_wcache			     *
688  *===========================================================================*/
689 static int gen_flush_wcache(struct port_state *ps)
690 {
691 	/* Flush the device's write cache.
692 	 */
693 	cmd_fis_t fis;
694 
695 	/* The FLUSH CACHE command may not be supported by all (writable ATAPI)
696 	 * devices.
697 	 */
698 	if (!(ps->flags & FLAG_HAS_FLUSH))
699 		return EINVAL;
700 
701 	/* Use the FLUSH CACHE command for both ATA and ATAPI. We are not
702 	 * interested in the disk location of a failure, so there is no reason
703 	 * to use the ATA-only FLUSH CACHE EXT command. Either way, the command
704 	 * may indeed fail due to a disk error, in which case it should be
705 	 * repeated. For now, we shift this responsibility onto the caller.
706 	 */
707 	memset(&fis, 0, sizeof(fis));
708 	fis.cf_cmd = ATA_CMD_FLUSH_CACHE;
709 
710 	/* Start the command, and wait for it to complete or fail.
711 	 * The flush command may take longer than regular I/O commands.
712 	 */
713 	port_set_cmd(ps, 0, &fis, NULL /*packet*/, NULL /*prdt*/, 0,
714 		FALSE /*write*/);
715 
716 	return port_exec(ps, 0, ahci_flush_timeout);
717 }
718 
719 /*===========================================================================*
720  *				gen_get_wcache				     *
721  *===========================================================================*/
722 static int gen_get_wcache(struct port_state *ps, int *val)
723 {
724 	/* Retrieve the status of the device's write cache.
725 	 */
726 	int r;
727 
728 	/* Write caches are not mandatory. */
729 	if (!(ps->flags & FLAG_HAS_WCACHE))
730 		return EINVAL;
731 
732 	/* Retrieve information about the device. */
733 	if ((r = gen_identify(ps, TRUE /*blocking*/)) != OK)
734 		return r;
735 
736 	/* Return the current setting. */
737 	*val = !!(((u16_t *) ps->tmp_base)[ATA_ID_ENA0] & ATA_ID_ENA0_WCACHE);
738 
739 	return OK;
740 }
741 
742 /*===========================================================================*
743  *				gen_set_wcache				     *
744  *===========================================================================*/
745 static int gen_set_wcache(struct port_state *ps, int enable)
746 {
747 	/* Enable or disable the device's write cache.
748 	 */
749 	cmd_fis_t fis;
750 	clock_t timeout;
751 
752 	/* Write caches are not mandatory. */
753 	if (!(ps->flags & FLAG_HAS_WCACHE))
754 		return EINVAL;
755 
756 	/* Disabling the write cache causes a (blocking) cache flush. Cache
757 	 * flushes may take much longer than regular commands.
758 	 */
759 	timeout = enable ? ahci_command_timeout : ahci_flush_timeout;
760 
761 	/* Set up a command. */
762 	memset(&fis, 0, sizeof(fis));
763 	fis.cf_cmd = ATA_CMD_SET_FEATURES;
764 	fis.cf_feat = enable ? ATA_SF_EN_WCACHE : ATA_SF_DI_WCACHE;
765 
766 	/* Start the command, and wait for it to complete or fail. */
767 	port_set_cmd(ps, 0, &fis, NULL /*packet*/, NULL /*prdt*/, 0,
768 		FALSE /*write*/);
769 
770 	return port_exec(ps, 0, timeout);
771 }
772 
773 /*===========================================================================*
774  *				ct_set_fis				     *
775  *===========================================================================*/
776 static vir_bytes ct_set_fis(u8_t *ct, cmd_fis_t *fis, unsigned int tag)
777 {
778 	/* Fill in the Frame Information Structure part of a command table,
779 	 * and return the resulting FIS size (in bytes). We only support the
780 	 * command Register - Host to Device FIS type.
781 	 */
782 
783 	memset(ct, 0, ATA_H2D_SIZE);
784 	ct[ATA_FIS_TYPE] = ATA_FIS_TYPE_H2D;
785 	ct[ATA_H2D_FLAGS] = ATA_H2D_FLAGS_C;
786 	ct[ATA_H2D_CMD] = fis->cf_cmd;
787 	ct[ATA_H2D_LBA_LOW] = fis->cf_lba & 0xFF;
788 	ct[ATA_H2D_LBA_MID] = (fis->cf_lba >> 8) & 0xFF;
789 	ct[ATA_H2D_LBA_HIGH] = (fis->cf_lba >> 16) & 0xFF;
790 	ct[ATA_H2D_DEV] = fis->cf_dev;
791 	ct[ATA_H2D_LBA_LOW_EXP] = fis->cf_lba_exp & 0xFF;
792 	ct[ATA_H2D_LBA_MID_EXP] = (fis->cf_lba_exp >> 8) & 0xFF;
793 	ct[ATA_H2D_LBA_HIGH_EXP] = (fis->cf_lba_exp >> 16) & 0xFF;
794 	ct[ATA_H2D_CTL] = fis->cf_ctl;
795 
796 	if (ATA_IS_FPDMA_CMD(fis->cf_cmd)) {
797 		ct[ATA_H2D_FEAT] = fis->cf_sec;	/* NCQ: sector count in FEAT */
798 		ct[ATA_H2D_FEAT_EXP] = fis->cf_sec_exp;
799 		ct[ATA_H2D_SEC] = tag << ATA_SEC_TAG_SHIFT;	/* NCQ: tag in SEC */
800 		ct[ATA_H2D_SEC_EXP] = 0;
801 	} else {
802 		ct[ATA_H2D_FEAT] = fis->cf_feat;
803 		ct[ATA_H2D_FEAT_EXP] = fis->cf_feat_exp;
804 		ct[ATA_H2D_SEC] = fis->cf_sec;
805 		ct[ATA_H2D_SEC_EXP] = fis->cf_sec_exp;
806 	}
807 
808 	return ATA_H2D_SIZE;
809 }
810 
811 /*===========================================================================*
812  *				ct_set_packet				     *
813  *===========================================================================*/
814 static void ct_set_packet(u8_t *ct, u8_t packet[ATAPI_PACKET_SIZE])
815 {
816 	/* Fill in the packet part of a command table.
817 	 */
818 
819 	memcpy(&ct[AHCI_CT_PACKET_OFF], packet, ATAPI_PACKET_SIZE);
820 }
821 
822 /*===========================================================================*
823  *				ct_set_prdt				     *
824  *===========================================================================*/
825 static void ct_set_prdt(u8_t *ct, prd_t *prdt, int nr_prds)
826 {
827 	/* Fill in the PRDT part of a command table.
828 	 */
829 	u32_t *p;
830 	int i;
831 
832 	p = (u32_t *) &ct[AHCI_CT_PRDT_OFF];
833 
834 	for (i = 0; i < nr_prds; i++, prdt++) {
835 		*p++ = prdt->vp_addr;		/* data base address (low 32 bits) */
836 		*p++ = 0;			/* data base address (high 32 bits) */
837 		*p++ = 0;			/* reserved */
838 		*p++ = prdt->vp_size - 1;	/* data byte count, zero-based */
839 	}
840 }
841 
842 /*===========================================================================*
843  *				port_set_cmd				     *
844  *===========================================================================*/
845 static void port_set_cmd(struct port_state *ps, int cmd, cmd_fis_t *fis,
846 	u8_t packet[ATAPI_PACKET_SIZE], prd_t *prdt, int nr_prds, int write)
847 {
848 	/* Prepare the given command for execution, by constructing a command
849 	 * table and setting up a command list entry pointing to the table.
850 	 */
851 	u8_t *ct;
852 	u32_t *cl;
853 	vir_bytes size;
854 
855 	/* Set a port-specific flag that tells us if the command being
856 	 * processed is a NCQ command or not.
857 	 */
858 	if (ATA_IS_FPDMA_CMD(fis->cf_cmd)) {
859 		ps->flags |= FLAG_NCQ_MODE;
860 	} else {
861 		assert(!ps->pend_mask);
862 		ps->flags &= ~FLAG_NCQ_MODE;
863 	}
864 
865 	/* Construct a command table, consisting of a command FIS, optionally
866 	 * a packet, and optionally a number of PRDs (making up the actual PRD
867 	 * table).
868 	 */
869 	ct = ps->ct_base[cmd];
870 
871 	assert(ct != NULL);
872 	assert(nr_prds <= NR_PRDS);
873 
874 	size = ct_set_fis(ct, fis, cmd);
875 
876 	if (packet != NULL)
877 		ct_set_packet(ct, packet);
878 
879 	ct_set_prdt(ct, prdt, nr_prds);
880 
881 	/* Construct a command list entry, pointing to the command's table.
882 	 * Current assumptions: callers always provide a Register - Host to
883 	 * Device type FIS, and all non-NCQ commands are prefetchable.
884 	 */
885 	cl = &ps->cl_base[cmd * AHCI_CL_ENTRY_DWORDS];
886 
887 	memset(cl, 0, AHCI_CL_ENTRY_SIZE);
888 	cl[0] = (nr_prds << AHCI_CL_PRDTL_SHIFT) |
889 		((!ATA_IS_FPDMA_CMD(fis->cf_cmd) &&
890 		(nr_prds > 0 || packet != NULL)) ? AHCI_CL_PREFETCHABLE : 0) |
891 		(write ? AHCI_CL_WRITE : 0) |
892 		((packet != NULL) ? AHCI_CL_ATAPI : 0) |
893 		((size / sizeof(u32_t)) << AHCI_CL_CFL_SHIFT);
894 	cl[2] = ps->ct_phys[cmd];
895 }
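
/* For instance, a non-NCQ read with two PRDs and a register H2D FIS (20
 * bytes, i.e. five dwords, assuming ATA_H2D_SIZE is 20) would set cl[0] to
 *
 *	(2 << AHCI_CL_PRDTL_SHIFT) | AHCI_CL_PREFETCHABLE |
 *		(5 << AHCI_CL_CFL_SHIFT)
 *
 * with the write and ATAPI bits left clear.
 */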
896 
897 /*===========================================================================*
898  *				port_finish_cmd				     *
899  *===========================================================================*/
900 static void port_finish_cmd(struct port_state *ps, int cmd, int result)
901 {
902 	/* Finish a command that has either succeeded or failed.
903 	 */
904 
905 	assert(cmd < ps->queue_depth);
906 
907 	dprintf(V_REQ, ("%s: command %d %s\n", ahci_portname(ps),
908 		cmd, (result == RESULT_SUCCESS) ? "succeeded" : "failed"));
909 
910 	/* Update the command result, and clear it from the pending list. */
911 	ps->cmd_info[cmd].result = result;
912 
913 	assert(ps->pend_mask & (1 << cmd));
914 	ps->pend_mask &= ~(1 << cmd);
915 
916 	/* Wake up the thread, unless it is the main thread. This can happen
917 	 * during initialization, as the gen_identify function is called by the
918 	 * main thread itself.
919 	 */
920 	if (ps->state != STATE_WAIT_ID)
921 		blockdriver_mt_wakeup(ps->cmd_info[cmd].tid);
922 }
923 
924 /*===========================================================================*
925  *				port_fail_cmds				     *
926  *===========================================================================*/
927 static void port_fail_cmds(struct port_state *ps)
928 {
929 	/* Fail all ongoing commands for a device.
930 	 */
931 	int i;
932 
933 	for (i = 0; ps->pend_mask != 0 && i < ps->queue_depth; i++)
934 		if (ps->pend_mask & (1 << i))
935 			port_finish_cmd(ps, i, RESULT_FAILURE);
936 }
937 
938 /*===========================================================================*
939  *				port_check_cmds				     *
940  *===========================================================================*/
941 static void port_check_cmds(struct port_state *ps)
942 {
943 	/* Check what commands have completed, and finish them.
944 	 */
945 	u32_t mask, done;
946 	int i;
947 
948 	/* See which commands have completed. */
949 	if (ps->flags & FLAG_NCQ_MODE)
950 		mask = port_read(ps, AHCI_PORT_SACT);
951 	else
952 		mask = port_read(ps, AHCI_PORT_CI);
953 
954 	/* Wake up threads corresponding to completed commands. */
955 	done = ps->pend_mask & ~mask;
956 
957 	for (i = 0; i < ps->queue_depth; i++)
958 		if (done & (1 << i))
959 			port_finish_cmd(ps, i, RESULT_SUCCESS);
960 }
961 
962 /*===========================================================================*
963  *				port_find_cmd				     *
964  *===========================================================================*/
965 static int port_find_cmd(struct port_state *ps)
966 {
967 	/* Find a free command tag to queue the current request.
968 	 */
969 	int i;
970 
971 	for (i = 0; i < ps->queue_depth; i++)
972 		if (!(ps->pend_mask & (1 << i)))
973 			break;
974 
975 	/* We should always be able to find a free slot, since a worker thread
976 	 * runs only when a command slot is available for it.
977 	 */
978 	assert(i < ps->queue_depth);
979 
980 	return i;
981 }
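
/* For example, with a queue depth of 4 and a pend_mask of 0x0B (binary 1011,
 * i.e. tags 0, 1 and 3 busy), port_find_cmd() returns tag 2.
 */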
982 
983 /*===========================================================================*
984  *				port_get_padbuf				     *
985  *===========================================================================*/
986 static int port_get_padbuf(struct port_state *ps, size_t size)
987 {
988 	/* Make available a temporary buffer for use by this port. Enlarge the
989 	 * previous buffer if applicable and necessary, potentially changing
990 	 * its physical address.
991 	 */
992 
993 	if (ps->pad_base != NULL && ps->pad_size >= size)
994 		return OK;
995 
996 	if (ps->pad_base != NULL)
997 		free_contig(ps->pad_base, ps->pad_size);
998 
999 	ps->pad_size = size;
1000 	ps->pad_base = alloc_contig(ps->pad_size, 0, &ps->pad_phys);
1001 
1002 	if (ps->pad_base == NULL) {
1003 		dprintf(V_ERR, ("%s: unable to allocate a padding buffer of "
1004 			"size %lu\n", ahci_portname(ps),
1005 			(unsigned long) size));
1006 
1007 		return ENOMEM;
1008 	}
1009 
1010 	dprintf(V_INFO, ("%s: allocated padding buffer of size %lu\n",
1011 		ahci_portname(ps), (unsigned long) size));
1012 
1013 	return OK;
1014 }
1015 
1016 /*===========================================================================*
1017  *				sum_iovec				     *
1018  *===========================================================================*/
1019 static int sum_iovec(struct port_state *ps, endpoint_t endpt,
1020 	iovec_s_t *iovec, int nr_req, vir_bytes *total)
1021 {
1022 	/* Retrieve the total size of the given I/O vector. Check for alignment
1023 	 * requirements along the way. Return OK (and the total request size)
1024 	 * or an error.
1025 	 */
1026 	vir_bytes size, bytes;
1027 	int i;
1028 
1029 	bytes = 0;
1030 
1031 	for (i = 0; i < nr_req; i++) {
1032 		size = iovec[i].iov_size;
1033 
1034 		if (size == 0 || (size & 1) || size > LONG_MAX) {
1035 			dprintf(V_ERR, ("%s: bad size %lu in iovec from %d\n",
1036 				ahci_portname(ps), size, endpt));
1037 			return EINVAL;
1038 		}
1039 
1040 		bytes += size;
1041 
1042 		if (bytes > LONG_MAX) {
1043 			dprintf(V_ERR, ("%s: iovec size overflow from %d\n",
1044 				ahci_portname(ps), endpt));
1045 			return EINVAL;
1046 		}
1047 	}
1048 
1049 	*total = bytes;
1050 	return OK;
1051 }
1052 
1053 /*===========================================================================*
1054  *				setup_prdt				     *
1055  *===========================================================================*/
1056 static int setup_prdt(struct port_state *ps, endpoint_t endpt,
1057 	iovec_s_t *iovec, int nr_req, vir_bytes size, vir_bytes lead,
1058 	int write, prd_t *prdt)
1059 {
1060 	/* Convert (the first part of) an I/O vector to a Physical Region
1061 	 * Descriptor (PRD) array that can later be used to set up the
1062 	 * command's real PRDT. The resulting table as a whole should be
1063 	 * sector-aligned; leading and trailing local buffers may have to be
1064 	 * used for padding as appropriate. Return the number of PRD entries,
1065 	 * or a negative error code.
1066 	 */
1067 	struct vumap_vir vvec[NR_PRDS];
1068 	size_t bytes, trail;
1069 	int i, r, pcount, nr_prds = 0;
1070 
1071 	if (lead > 0) {
1072 		/* Allocate a buffer for the data we don't want. */
1073 		if ((r = port_get_padbuf(ps, ps->sector_size)) != OK)
1074 			return r;
1075 
1076 		prdt[nr_prds].vp_addr = ps->pad_phys;
1077 		prdt[nr_prds].vp_size = lead;
1078 		nr_prds++;
1079 	}
1080 
1081 	/* The sum of lead, size, and trail has to be sector-aligned. */
1082 	trail = (ps->sector_size - (lead + size)) % ps->sector_size;
1083 
1084 	/* Get the physical addresses of the given buffers. */
1085 	for (i = 0; i < nr_req && size > 0; i++) {
1086 		bytes = MIN(iovec[i].iov_size, size);
1087 
1088 		if (endpt == SELF)
1089 			vvec[i].vv_addr = (vir_bytes) iovec[i].iov_grant;
1090 		else
1091 			vvec[i].vv_grant = iovec[i].iov_grant;
1092 
1093 		vvec[i].vv_size = bytes;
1094 
1095 		size -= bytes;
1096 	}
1097 
1098 	pcount = i;
1099 
1100 	if ((r = sys_vumap(endpt, vvec, i, 0, write ? VUA_READ : VUA_WRITE,
1101 			&prdt[nr_prds], &pcount)) != OK) {
1102 		dprintf(V_ERR, ("%s: unable to map memory from %d (%d)\n",
1103 			ahci_portname(ps), endpt, r));
1104 		return r;
1105 	}
1106 
1107 	assert(pcount > 0 && pcount <= i);
1108 
1109 	/* Make sure all buffers are physically contiguous and word-aligned. */
1110 	for (i = 0; i < pcount; i++) {
1111 		if (vvec[i].vv_size != prdt[nr_prds].vp_size) {
1112 			dprintf(V_ERR, ("%s: non-contiguous memory from %d\n",
1113 				ahci_portname(ps), endpt));
1114 			return EINVAL;
1115 		}
1116 
1117 		if (prdt[nr_prds].vp_addr & 1) {
1118 			dprintf(V_ERR, ("%s: bad physical address from %d\n",
1119 				ahci_portname(ps), endpt));
1120 			return EINVAL;
1121 		}
1122 
1123 		nr_prds++;
1124 	}
1125 
1126 	if (trail > 0) {
1127 		assert(nr_prds < NR_PRDS);
1128 		prdt[nr_prds].vp_addr = ps->pad_phys + lead;
1129 		prdt[nr_prds].vp_size = trail;
1130 		nr_prds++;
1131 	}
1132 
1133 	return nr_prds;
1134 }
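
/* A worked example of the padding logic, assuming 512-byte sectors: a read
 * of 800 bytes starting 300 bytes into a sector gets lead = 300 and
 * trail = (512 - (300 + 800)) % 512 = 436, so that lead + size + trail adds
 * up to 1536 bytes, i.e. exactly three sectors. The lead and trail bytes are
 * directed to the port's padding buffer and discarded afterwards.
 */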
1135 
1136 /*===========================================================================*
1137  *				port_transfer				     *
1138  *===========================================================================*/
1139 static ssize_t port_transfer(struct port_state *ps, u64_t pos, u64_t eof,
1140 	endpoint_t endpt, iovec_s_t *iovec, int nr_req, int write, int flags)
1141 {
1142 	/* Perform an I/O transfer on a port.
1143 	 */
1144 	prd_t prdt[NR_PRDS];
1145 	vir_bytes size, lead;
1146 	unsigned int count, nr_prds;
1147 	u64_t start_lba;
1148 	int r, cmd;
1149 
1150 	/* Get the total request size from the I/O vector. */
1151 	if ((r = sum_iovec(ps, endpt, iovec, nr_req, &size)) != OK)
1152 		return r;
1153 
1154 	dprintf(V_REQ, ("%s: %s for %lu bytes at pos %llx\n",
1155 		ahci_portname(ps), write ? "write" : "read", size, pos));
1156 
1157 	assert(ps->state == STATE_GOOD_DEV);
1158 	assert(ps->flags & FLAG_HAS_MEDIUM);
1159 	assert(ps->sector_size > 0);
1160 
1161 	/* Limit the maximum size of a single transfer.
1162 	 * See the comments at the top of this file for details.
1163 	 */
1164 	if (size > MAX_TRANSFER)
1165 		size = MAX_TRANSFER;
1166 
1167 	/* If necessary, reduce the request size so that the request does not
1168 	 * extend beyond the end of the partition. The caller already
1169 	 * guarantees that the starting position lies within the partition.
1170 	 */
1171 	if (pos + size > eof)
1172 		size = (vir_bytes) (eof - pos);
1173 
1174 	start_lba = pos / ps->sector_size;
1175 	lead = (vir_bytes) (pos % ps->sector_size);
1176 	count = (lead + size + ps->sector_size - 1) / ps->sector_size;
1177 
1178 	/* Position must be word-aligned for read requests, and sector-aligned
1179 	 * for write requests. We do not support read-modify-write for writes.
1180 	 */
1181 	if ((lead & 1) || (write && lead != 0)) {
1182 		dprintf(V_ERR, ("%s: unaligned position from %d\n",
1183 			ahci_portname(ps), endpt));
1184 		return EINVAL;
1185 	}
1186 
1187 	/* Write requests must be sector-aligned. Word alignment of the size is
1188 	 * already guaranteed by sum_iovec().
1189 	 */
1190 	if (write && (size % ps->sector_size) != 0) {
1191 		dprintf(V_ERR, ("%s: unaligned size %lu from %d\n",
1192 			ahci_portname(ps), size, endpt));
1193 		return EINVAL;
1194 	}
1195 
1196 	/* Create a vector of physical addresses and sizes for the transfer. */
1197 	nr_prds = r = setup_prdt(ps, endpt, iovec, nr_req, size, lead, write,
1198 		prdt);
1199 
1200 	if (r < 0) return r;
1201 
1202 	/* Perform the actual transfer. */
1203 	cmd = port_find_cmd(ps);
1204 
1205 	if (ps->flags & FLAG_ATAPI)
1206 		r = atapi_transfer(ps, cmd, start_lba, count, write, prdt,
1207 			nr_prds);
1208 	else
1209 		r = ata_transfer(ps, cmd, start_lba, count, write,
1210 			!!(flags & BDEV_FORCEWRITE), prdt, nr_prds);
1211 
1212 	if (r != OK) return r;
1213 
1214 	return size;
1215 }
1216 
1217 /*===========================================================================*
1218  *				port_hardreset				     *
1219  *===========================================================================*/
1220 static void port_hardreset(struct port_state *ps)
1221 {
1222 	/* Perform a port-level (hard) reset on the given port.
1223 	 */
1224 
1225 	port_write(ps, AHCI_PORT_SCTL, AHCI_PORT_SCTL_DET_INIT);
1226 
1227 	micro_delay(COMRESET_DELAY * 1000);	/* COMRESET_DELAY is in ms */
1228 
1229 	port_write(ps, AHCI_PORT_SCTL, AHCI_PORT_SCTL_DET_NONE);
1230 }
1231 
1232 /*===========================================================================*
1233  *				port_override				     *
1234  *===========================================================================*/
1235 static void port_override(struct port_state *ps)
1236 {
1237 	/* Override the port's BSY and/or DRQ flags. This may only be done
1238 	 * prior to starting the port.
1239 	 */
1240 	u32_t cmd;
1241 
1242 	cmd = port_read(ps, AHCI_PORT_CMD);
1243 	port_write(ps, AHCI_PORT_CMD, cmd | AHCI_PORT_CMD_CLO);
1244 
1245 	SPIN_UNTIL(!(port_read(ps, AHCI_PORT_CMD) & AHCI_PORT_CMD_CLO),
1246 		PORTREG_DELAY);
1247 
1248 	dprintf(V_INFO, ("%s: overridden\n", ahci_portname(ps)));
1249 }
1250 
1251 /*===========================================================================*
1252  *				port_start				     *
1253  *===========================================================================*/
1254 static void port_start(struct port_state *ps)
1255 {
1256 	/* Start the given port, allowing for the execution of commands and the
1257 	 * transfer of data on that port.
1258 	 */
1259 	u32_t cmd;
1260 
1261 	/* Reset status registers. */
1262 	port_write(ps, AHCI_PORT_SERR, ~0);
1263 	port_write(ps, AHCI_PORT_IS, ~0);
1264 
1265 	/* Start the port. */
1266 	cmd = port_read(ps, AHCI_PORT_CMD);
1267 	port_write(ps, AHCI_PORT_CMD, cmd | AHCI_PORT_CMD_ST);
1268 
1269 	dprintf(V_INFO, ("%s: started\n", ahci_portname(ps)));
1270 }
1271 
1272 /*===========================================================================*
1273  *				port_stop				     *
1274  *===========================================================================*/
1275 static void port_stop(struct port_state *ps)
1276 {
1277 	/* Stop the given port, if not already stopped.
1278 	 */
1279 	u32_t cmd;
1280 
1281 	cmd = port_read(ps, AHCI_PORT_CMD);
1282 
1283 	if (cmd & (AHCI_PORT_CMD_CR | AHCI_PORT_CMD_ST)) {
1284 		port_write(ps, AHCI_PORT_CMD, cmd & ~AHCI_PORT_CMD_ST);
1285 
1286 		SPIN_UNTIL(!(port_read(ps, AHCI_PORT_CMD) & AHCI_PORT_CMD_CR),
1287 			PORTREG_DELAY);
1288 
1289 		dprintf(V_INFO, ("%s: stopped\n", ahci_portname(ps)));
1290 	}
1291 }
1292 
1293 /*===========================================================================*
1294  *				port_restart				     *
1295  *===========================================================================*/
1296 static void port_restart(struct port_state *ps)
1297 {
1298 	/* Restart a port after a fatal error has occurred.
1299 	 */
1300 
1301 	/* Fail all outstanding commands. */
1302 	port_fail_cmds(ps);
1303 
1304 	/* Stop the port. */
1305 	port_stop(ps);
1306 
1307 	/* If the BSY and/or DRQ flags are set, reset the port. */
1308 	if (port_read(ps, AHCI_PORT_TFD) &
1309 		(AHCI_PORT_TFD_STS_BSY | AHCI_PORT_TFD_STS_DRQ)) {
1310 
1311 		dprintf(V_ERR, ("%s: port reset\n", ahci_portname(ps)));
1312 
1313 		/* To keep this driver simple, we do not transparently recover
1314 		 * ongoing requests. Instead, we mark the failing device as
1315 		 * disconnected, and reset it. If the reset succeeds, the
1316 		 * device (or, perhaps, eventually, another device) will come
1317 		 * back up. Any current and future requests to this port will
1318 		 * be failed until the port is fully closed and reopened.
1319 		 */
1320 		port_disconnect(ps);
1321 
1322 		/* Trigger a port reset. */
1323 		port_hardreset(ps);
1324 
1325 		return;
1326 	}
1327 
1328 	/* Start the port. */
1329 	port_start(ps);
1330 }
1331 
1332 /*===========================================================================*
1333  *				print_string				     *
1334  *===========================================================================*/
1335 static void print_string(u16_t *buf, int start, int end)
1336 {
1337 	/* Print a string that is stored as little-endian words and padded with
1338 	 * trailing spaces.
1339 	 */
1340 	int i, last = 0;
1341 
1342 	while (end >= start && buf[end] == 0x2020) end--;
1343 
1344 	if (end >= start && (buf[end] & 0xFF) == 0x20) end--, last++;
1345 
1346 	for (i = start; i <= end; i++)
1347 		printf("%c%c", buf[i] >> 8, buf[i] & 0xFF);
1348 
1349 	if (last)
1350 		printf("%c", buf[i] >> 8);
1351 }
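
/* For example, the ATA identify string "ABC " occupies two words, 0x4142 and
 * 0x4320, with the first character of each pair in the high byte. The
 * last-character case above ensures that the final 'C' of such an odd-length
 * string is printed while its padding space is dropped.
 */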
1352 
1353 /*===========================================================================*
1354  *				port_id_check				     *
1355  *===========================================================================*/
1356 static void port_id_check(struct port_state *ps, int success)
1357 {
1358 	/* The device identification command has either completed or timed out.
1359 	 * Decide whether this device is usable or not, and store some of its
1360 	 * properties.
1361 	 */
1362 	u16_t *buf;
1363 
1364 	assert(ps->state == STATE_WAIT_ID);
1365 
1366 	ps->flags &= ~FLAG_BUSY;
1367 	cancel_timer(&ps->cmd_info[0].timer);
1368 
1369 	if (!success) {
1370 		if (!(ps->flags & FLAG_ATAPI) &&
1371 				port_read(ps, AHCI_PORT_SIG) != ATA_SIG_ATA) {
1372 			dprintf(V_INFO, ("%s: may not be ATA, trying ATAPI\n",
1373 				ahci_portname(ps)));
1374 
1375 			ps->flags |= FLAG_ATAPI;
1376 
1377 			(void) gen_identify(ps, FALSE /*blocking*/);
1378 			return;
1379 		}
1380 
1381 		dprintf(V_ERR,
1382 			("%s: unable to identify\n", ahci_portname(ps)));
1383 	}
1384 
1385 	/* If the identify command itself succeeded, check the results and
1386 	 * store some properties.
1387 	 */
1388 	if (success) {
1389 		buf = (u16_t *) ps->tmp_base;
1390 
1391 		if (ps->flags & FLAG_ATAPI)
1392 			success = atapi_id_check(ps, buf);
1393 		else
1394 			success = ata_id_check(ps, buf);
1395 	}
1396 
1397 	/* If the device has not been identified successfully, mark it as an
1398 	 * unusable device.
1399 	 */
1400 	if (!success) {
1401 		port_stop(ps);
1402 
1403 		ps->state = STATE_BAD_DEV;
1404 		port_write(ps, AHCI_PORT_IE, AHCI_PORT_IE_PRCE);
1405 
1406 		return;
1407 	}
1408 
1409 	/* The device has been identified successfully, and hence usable. */
1410 	ps->state = STATE_GOOD_DEV;
1411 
1412 	/* Print some information about the device. */
1413 	if (ahci_verbose >= V_INFO) {
1414 		printf("%s: ATA%s, ", ahci_portname(ps),
1415 			(ps->flags & FLAG_ATAPI) ? "PI" : "");
1416 		print_string(buf, 27, 46);
1417 		if (ahci_verbose >= V_DEV) {
1418 			printf(" (");
1419 			print_string(buf, 10, 19);
1420 			printf(", ");
1421 			print_string(buf, 23, 26);
1422 			printf(")");
1423 		}
1424 
1425 		if (ps->flags & FLAG_HAS_MEDIUM)
1426 			printf(", %u byte sectors, %llu MB size",
1427 				ps->sector_size,
1428 				ps->lba_count * ps->sector_size / (1024*1024));
1429 
1430 		printf("\n");
1431 	}
1432 }
1433 
1434 /*===========================================================================*
1435  *				port_connect				     *
1436  *===========================================================================*/
1437 static void port_connect(struct port_state *ps)
1438 {
1439 	/* A device has been found to be attached to this port. Start the port,
1440 	 * and do timed polling for its signature to become available.
1441 	 */
1442 	u32_t status, sig;
1443 
1444 	dprintf(V_INFO, ("%s: device connected\n", ahci_portname(ps)));
1445 
1446 	port_start(ps);
1447 
1448 	/* The next check covers a purely hypothetical race condition, where
1449 	 * the device would disappear right before we try to start it. This is
1450 	 * possible because we have to clear PxSERR, and with that, the DIAG.N
1451 	 * bit. Double-check the port status, and if it is not as we expect,
1452 	 * infer a disconnection.
1453 	 */
1454 	status = port_read(ps, AHCI_PORT_SSTS) & AHCI_PORT_SSTS_DET_MASK;
1455 
1456 	if (status != AHCI_PORT_SSTS_DET_PHY) {
1457 		dprintf(V_ERR, ("%s: device vanished!\n", ahci_portname(ps)));
1458 
1459 		port_stop(ps);
1460 
1461 		ps->state = STATE_NO_DEV;
1462 		ps->flags &= ~FLAG_BUSY;
1463 
1464 		return;
1465 	}
1466 
1467 	/* Clear all state flags except the busy flag, which may be relevant if
1468 	 * a BDEV_OPEN call is waiting for the device to become ready; the
1469 	 * barrier flag, which prevents access to the device until it is
1470 	 * completely closed and (re)opened; and, the thread suspension flag.
1471 	 */
1472 	ps->flags &= (FLAG_BUSY | FLAG_BARRIER | FLAG_SUSPENDED);

	/* Check the port's signature. We only use the signature to speed up
	 * identification; we will try both ATA and ATAPI if the signature is
	 * neither ATA nor ATAPI.
	 */
	sig = port_read(ps, AHCI_PORT_SIG);

	if (sig == ATA_SIG_ATAPI)
		ps->flags |= FLAG_ATAPI;

	/* Attempt to identify the device. Do this using continuation, because
	 * we may already be called from port_wait() here, and could end up
	 * confusing the timer expiration procedure.
	 */
	ps->state = STATE_WAIT_ID;
	port_write(ps, AHCI_PORT_IE, AHCI_PORT_IE_MASK);

	(void) gen_identify(ps, FALSE /*blocking*/);
}

/*===========================================================================*
 *				port_disconnect				     *
 *===========================================================================*/
static void port_disconnect(struct port_state *ps)
{
	/* The device has detached from this port. It has already been stopped.
	 */

	dprintf(V_INFO, ("%s: device disconnected\n", ahci_portname(ps)));

	ps->state = STATE_NO_DEV;
	port_write(ps, AHCI_PORT_IE, AHCI_PORT_IE_PCE);
	ps->flags &= ~FLAG_BUSY;

	/* Fail any ongoing request. The caller may already have done this. */
	port_fail_cmds(ps);

	/* Block any further access until the device is completely closed and
	 * reopened. This prevents arbitrary I/O to a newly plugged-in device
	 * without upper layers noticing.
	 */
	ps->flags |= FLAG_BARRIER;

	/* Inform the blockdriver library to reduce the number of threads. */
	blockdriver_mt_set_workers(ps->device, 1);
}

/*===========================================================================*
 *				port_dev_check				     *
 *===========================================================================*/
static void port_dev_check(struct port_state *ps)
{
	/* Perform device detection by means of polling.
	 */
	u32_t status, tfd;

	assert(ps->state == STATE_WAIT_DEV);

	status = port_read(ps, AHCI_PORT_SSTS) & AHCI_PORT_SSTS_DET_MASK;

	dprintf(V_DEV, ("%s: polled status %u\n", ahci_portname(ps), status));

	switch (status) {
	case AHCI_PORT_SSTS_DET_PHY:
		tfd = port_read(ps, AHCI_PORT_TFD);

		/* If a Phy connection has been established, and the BSY and
		 * DRQ flags are cleared, the device is ready.
		 */
		if (!(tfd & (AHCI_PORT_TFD_STS_BSY | AHCI_PORT_TFD_STS_DRQ))) {
			port_connect(ps);

			return;
		}

		/* fall-through */
	case AHCI_PORT_SSTS_DET_DET:
		/* A device has been detected, but it is not ready yet. Try for
		 * a while before giving up. This may take seconds.
		 */
		if (ps->left > 0) {
			ps->left--;
			set_timer(&ps->cmd_info[0].timer, ahci_device_delay,
				port_timeout, BUILD_ARG(ps - port_state, 0));
			return;
		}
	}

	dprintf(V_INFO, ("%s: device not ready\n", ahci_portname(ps)));

	/* We get here on timeout, or if the HBA reports that there is no
	 * device present at all. In either case, we change to another state.
	 */
	if (status == AHCI_PORT_SSTS_DET_PHY) {
		/* Some devices may not correctly clear BSY/DRQ. Upon timeout,
		 * if we can override these flags, do so and start the
		 * identification process anyway.
		 */
		if (hba_state.has_clo) {
			port_override(ps);

			port_connect(ps);

			return;
		}

		/* A device is present and initialized, but not ready. */
		ps->state = STATE_BAD_DEV;
		port_write(ps, AHCI_PORT_IE, AHCI_PORT_IE_PRCE);
	} else {
		/* A device may or may not be present, but it does not appear
		 * to be ready in any case. Ignore it until the next device
		 * initialization event.
		 */
		ps->state = STATE_NO_DEV;
		ps->flags &= ~FLAG_BUSY;
	}
}

/*===========================================================================*
 *				port_intr				     *
 *===========================================================================*/
static void port_intr(struct port_state *ps)
{
	/* Process an interrupt on this port.
	 */
	u32_t smask, emask;
	int success;

	if (ps->state == STATE_NO_PORT) {
		dprintf(V_ERR, ("%s: interrupt for invalid port!\n",
			ahci_portname(ps)));

		return;
	}

	smask = port_read(ps, AHCI_PORT_IS);
	emask = smask & port_read(ps, AHCI_PORT_IE);

	/* Clear the interrupt flags that we saw were set. */
	port_write(ps, AHCI_PORT_IS, smask);

	dprintf(V_REQ, ("%s: interrupt (%08x)\n", ahci_portname(ps), smask));

	/* Check if any commands have completed. */
	port_check_cmds(ps);

	if (emask & AHCI_PORT_IS_PCS) {
		/* Clear the X diagnostics bit to clear this interrupt. */
		port_write(ps, AHCI_PORT_SERR, AHCI_PORT_SERR_DIAG_X);

		dprintf(V_DEV, ("%s: device attached\n", ahci_portname(ps)));

		switch (ps->state) {
		case STATE_SPIN_UP:
		case STATE_NO_DEV:
			/* Reportedly, a device has shown up. Start polling its
			 * status until it has become ready.
			 */

			if (ps->state == STATE_SPIN_UP)
				cancel_timer(&ps->cmd_info[0].timer);

			ps->state = STATE_WAIT_DEV;
			ps->left = ahci_device_checks;

			port_dev_check(ps);

			break;

		case STATE_WAIT_DEV:
			/* Nothing else to do. */
			break;

		default:
			/* Impossible. */
			assert(0);
		}
	} else if (emask & AHCI_PORT_IS_PRCS) {
		/* Clear the N diagnostics bit to clear this interrupt. */
		port_write(ps, AHCI_PORT_SERR, AHCI_PORT_SERR_DIAG_N);

		dprintf(V_DEV, ("%s: device detached\n", ahci_portname(ps)));

		switch (ps->state) {
		case STATE_WAIT_ID:
		case STATE_GOOD_DEV:
			/* The device is no longer ready. Stop the port, cancel
			 * ongoing requests, and disconnect the device.
			 */
			port_stop(ps);

			/* fall-through */
		case STATE_BAD_DEV:
			port_disconnect(ps);

			/* The device has become unusable to us at this point.
			 * Reset the port to make sure that once the device (or
			 * another device) becomes usable again, we will get a
			 * PCS interrupt as well.
			 */
			port_hardreset(ps);

			break;

		default:
			/* Impossible. */
			assert(0);
		}
	} else if (smask & AHCI_PORT_IS_MASK) {
		/* We assume that any other interrupt indicates command
		 * completion or (command or device) failure. Unfortunately, if
		 * an NCQ command failed, we cannot easily determine which one
		 * it was. For that reason, after completing all successfully
		 * finished commands (above), we fail all other outstanding
		 * commands and restart the port. This can possibly be improved
		 * later by obtaining per-command status results from the HBA.
		 */

		success = !(port_read(ps, AHCI_PORT_TFD) &
			(AHCI_PORT_TFD_STS_ERR | AHCI_PORT_TFD_STS_DF));

		/* Check now for failure. There are fatal failures, and there
		 * are failures that set the TFD.STS.ERR field using a D2H
		 * FIS. In both cases, we just restart the port, failing all
		 * commands in the process.
		 */
		if (!success || (smask & AHCI_PORT_IS_RESTART))
			port_restart(ps);

		/* If we were waiting for ID verification, check now. */
		if (ps->state == STATE_WAIT_ID)
			port_id_check(ps, success);
	}
}

/*===========================================================================*
 *				port_timeout				     *
 *===========================================================================*/
static void port_timeout(minix_timer_t *tp)
{
	/* A timeout has occurred on this port. Figure out what the timeout is
	 * for, and take appropriate action.
	 */
	struct port_state *ps;
	int port, cmd;

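	/* The timer argument packs the port number and the command tag into a
	 * single integer (see BUILD_ARG), so that one handler can serve all
	 * per-command timers.
	 */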
	port = GET_PORT(tmr_arg(tp)->ta_int);
	cmd = GET_TAG(tmr_arg(tp)->ta_int);

	assert(port >= 0 && port < hba_state.nr_ports);

	ps = &port_state[port];

	/* Regardless of the outcome of this timeout, wake up the thread if it
	 * is suspended. This applies only during initialization.
	 */
	if (ps->flags & FLAG_SUSPENDED) {
		assert(cmd == 0);
		blockdriver_mt_wakeup(ps->cmd_info[0].tid);
	}

	/* If detection of a device after startup timed out, give up on initial
	 * detection and only look for hot plug events from now on.
	 */
	if (ps->state == STATE_SPIN_UP) {
		/* One exception: if the PCS interrupt bit is set here, then we
		 * are probably running on VirtualBox, which is currently not
		 * always raising interrupts when setting interrupt bits (!).
		 */
		if (port_read(ps, AHCI_PORT_IS) & AHCI_PORT_IS_PCS) {
			dprintf(V_INFO, ("%s: bad controller, no interrupt\n",
				ahci_portname(ps)));

			ps->state = STATE_WAIT_DEV;
			ps->left = ahci_device_checks;

			port_dev_check(ps);

			return;
		} else {
			dprintf(V_INFO, ("%s: spin-up timeout\n",
				ahci_portname(ps)));

			/* If the busy flag is set, a BDEV_OPEN request is
			 * waiting for the detection to finish; clear the busy
			 * flag to return an error to the caller.
			 */
			ps->state = STATE_NO_DEV;
			ps->flags &= ~FLAG_BUSY;
		}

		return;
	}

	/* If we are waiting for a device to become connected and initialized,
	 * check now.
	 */
	if (ps->state == STATE_WAIT_DEV) {
		port_dev_check(ps);

		return;
	}

	dprintf(V_ERR, ("%s: timeout\n", ahci_portname(ps)));

	/* Restart the port, failing all current commands. */
	port_restart(ps);

	/* Finish up the identify operation. */
	if (ps->state == STATE_WAIT_ID)
		port_id_check(ps, FALSE);
}

/*===========================================================================*
 *				port_wait				     *
 *===========================================================================*/
static void port_wait(struct port_state *ps)
{
	/* Suspend the current thread until the given port is no longer busy,
	 * due to either command completion or timeout.
	 */

	ps->flags |= FLAG_SUSPENDED;

	while (ps->flags & FLAG_BUSY)
		blockdriver_mt_sleep();

	ps->flags &= ~FLAG_SUSPENDED;
}

/*===========================================================================*
 *				port_issue				     *
 *===========================================================================*/
static void port_issue(struct port_state *ps, int cmd, clock_t timeout)
{
	/* Issue a command to the port, and set a timer to trigger a timeout
	 * if the command takes too long to complete.
	 */

	/* Set the corresponding NCQ command bit, if applicable. */
	if (ps->flags & FLAG_HAS_NCQ)
		port_write(ps, AHCI_PORT_SACT, 1 << cmd);
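	/* (For queued commands, the AHCI specification requires that the bit
	 * in PxSACT be set before the corresponding bit in PxCI.)
	 */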

	/* Make sure that the compiler does not delay any previous write
	 * operations until after the write to the command issue register.
	 */
	__insn_barrier();

	/* Tell the controller that a new command is ready. */
	port_write(ps, AHCI_PORT_CI, 1 << cmd);

	/* Update pending commands. */
	ps->pend_mask |= 1 << cmd;

	/* Set a timer in case the command does not complete at all. */
	set_timer(&ps->cmd_info[cmd].timer, timeout, port_timeout,
		BUILD_ARG(ps - port_state, cmd));
}

/*===========================================================================*
 *				port_exec				     *
 *===========================================================================*/
static int port_exec(struct port_state *ps, int cmd, clock_t timeout)
{
	/* Execute a command on a port, wait for the command to complete or for
	 * a timeout, and return whether the command succeeded or not.
	 */

	port_issue(ps, cmd, timeout);

	/* Put the thread to sleep until a timeout or a command completion
	 * happens. We used to call port_wait() here, but that sets the
	 * suspended flag, which would have to work on a per-thread (and hence
	 * per-tag) basis rather than a per-port basis. That call is therefore
	 * retained only to defer open calls during device/driver
	 * initialization; here, we register the thread and then go to sleep
	 * directly.
	 */
	ps->cmd_info[cmd].tid = blockdriver_mt_get_tid();

	blockdriver_mt_sleep();

	/* Cancelling a timer that has just triggered does no harm. */
	cancel_timer(&ps->cmd_info[cmd].timer);

	assert(!(ps->flags & FLAG_BUSY));

	dprintf(V_REQ, ("%s: end of command -- %s\n", ahci_portname(ps),
		(ps->cmd_info[cmd].result == RESULT_FAILURE) ?
		"failure" : "success"));

	if (ps->cmd_info[cmd].result == RESULT_FAILURE)
		return EIO;

	return OK;
}

/*===========================================================================*
 *				port_alloc				     *
 *===========================================================================*/
static void port_alloc(struct port_state *ps)
{
	/* Allocate memory for the given port, and enable FIS receipt. We try
	 * to cram everything into one 4K-page in order to limit memory usage
	 * as much as possible. More memory may be allocated on demand later,
	 * but allocation failure should be fatal only here. Note that we do
	 * not allocate memory for sector padding here, because we do not know
	 * the device's sector size yet.
	 */
	size_t fis_off, tmp_off, ct_off;
	size_t ct_offs[NR_CMDS];
	int i;
	u32_t cmd;

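	/* All structures share one contiguous allocation. Each offset below is
	 * rounded up to the alignment of the structure that follows it, using
	 * the usual idiom of adding (alignment - 1) and then subtracting the
	 * remainder modulo the alignment.
	 */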
	fis_off = AHCI_CL_SIZE + AHCI_FIS_SIZE - 1;
	fis_off -= fis_off % AHCI_FIS_SIZE;

	tmp_off = fis_off + AHCI_FIS_SIZE + AHCI_TMP_ALIGN - 1;
	tmp_off -= tmp_off % AHCI_TMP_ALIGN;

	/* Allocate memory for all the commands. */
	ct_off = tmp_off + AHCI_TMP_SIZE;
	for (i = 0; i < NR_CMDS; i++) {
		ct_off += AHCI_CT_ALIGN - 1;
		ct_off -= ct_off % AHCI_CT_ALIGN;
		ct_offs[i] = ct_off;
		ps->mem_size = ct_off + AHCI_CT_SIZE;
		ct_off = ps->mem_size;
	}

	ps->mem_base = alloc_contig(ps->mem_size, AC_ALIGN4K, &ps->mem_phys);
	if (ps->mem_base == NULL)
		panic("unable to allocate port memory");
	memset(ps->mem_base, 0, ps->mem_size);

	ps->cl_base = (u32_t *) ps->mem_base;
	ps->cl_phys = ps->mem_phys;
	assert(ps->cl_phys % AHCI_CL_SIZE == 0);

	ps->fis_base = (u32_t *) (ps->mem_base + fis_off);
	ps->fis_phys = ps->mem_phys + fis_off;
	assert(ps->fis_phys % AHCI_FIS_SIZE == 0);

	ps->tmp_base = (u8_t *) (ps->mem_base + tmp_off);
	ps->tmp_phys = ps->mem_phys + tmp_off;
	assert(ps->tmp_phys % AHCI_TMP_ALIGN == 0);

	for (i = 0; i < NR_CMDS; i++) {
		ps->ct_base[i] = ps->mem_base + ct_offs[i];
		ps->ct_phys[i] = ps->mem_phys + ct_offs[i];
		assert(ps->ct_phys[i] % AHCI_CT_ALIGN == 0);
	}

	/* Tell the controller about some of the physical addresses. */
	port_write(ps, AHCI_PORT_FBU, 0);
	port_write(ps, AHCI_PORT_FB, ps->fis_phys);

	port_write(ps, AHCI_PORT_CLBU, 0);
	port_write(ps, AHCI_PORT_CLB, ps->cl_phys);

	/* Enable FIS receive. */
	cmd = port_read(ps, AHCI_PORT_CMD);
	port_write(ps, AHCI_PORT_CMD, cmd | AHCI_PORT_CMD_FRE);

	ps->pad_base = NULL;
	ps->pad_size = 0;
}

/*===========================================================================*
 *				port_free				     *
 *===========================================================================*/
static void port_free(struct port_state *ps)
{
	/* Disable FIS receipt for the given port, and free previously
	 * allocated memory.
	 */
	u32_t cmd;

	/* Disable FIS receive. */
	cmd = port_read(ps, AHCI_PORT_CMD);

	if (cmd & (AHCI_PORT_CMD_FR | AHCI_PORT_CMD_FRE)) {
		port_write(ps, AHCI_PORT_CMD, cmd & ~AHCI_PORT_CMD_FRE);

		SPIN_UNTIL(!(port_read(ps, AHCI_PORT_CMD) & AHCI_PORT_CMD_FR),
			PORTREG_DELAY);
	}

	if (ps->pad_base != NULL)
		free_contig(ps->pad_base, ps->pad_size);

	free_contig(ps->mem_base, ps->mem_size);
}

/*===========================================================================*
 *				port_init				     *
 *===========================================================================*/
static void port_init(struct port_state *ps)
{
	/* Initialize the given port.
	 */
	u32_t cmd;
	int i;

	/* Initialize the port state structure. */
	ps->queue_depth = 1;
	ps->state = STATE_SPIN_UP;
	ps->flags = FLAG_BUSY;
	ps->sector_size = 0;
	ps->open_count = 0;
	ps->pend_mask = 0;
	for (i = 0; i < NR_CMDS; i++)
		init_timer(&ps->cmd_info[i].timer);

	ps->reg = (u32_t *) ((u8_t *) hba_state.base +
		AHCI_MEM_BASE_SIZE + AHCI_MEM_PORT_SIZE * (ps - port_state));

	/* Allocate memory for the port. */
	port_alloc(ps);

	/* Just listen for device connection events for now. */
	port_write(ps, AHCI_PORT_IE, AHCI_PORT_IE_PCE);

	/* Enable device spin-up for HBAs that support staggered spin-up.
	 * This is a no-op for HBAs that do not support it.
	 */
	cmd = port_read(ps, AHCI_PORT_CMD);
	port_write(ps, AHCI_PORT_CMD, cmd | AHCI_PORT_CMD_SUD);

	/* Trigger a port reset. */
	port_hardreset(ps);

	set_timer(&ps->cmd_info[0].timer, ahci_spinup_timeout,
		port_timeout, BUILD_ARG(ps - port_state, 0));
}

/*===========================================================================*
 *				ahci_probe				     *
 *===========================================================================*/
static int ahci_probe(int skip)
{
	/* Find a matching PCI device.
	 */
	int r, devind;
	u16_t vid, did;

	pci_init();

	r = pci_first_dev(&devind, &vid, &did);
	if (r <= 0)
		return -1;

	while (skip--) {
		r = pci_next_dev(&devind, &vid, &did);
		if (r <= 0)
			return -1;
	}

	pci_reserve(devind);

	return devind;
}

/*===========================================================================*
 *				ahci_reset				     *
 *===========================================================================*/
static void ahci_reset(void)
{
	/* Reset the HBA. Do not enable AHCI mode afterwards.
	 */
	u32_t ghc;

	ghc = hba_read(AHCI_HBA_GHC);

	hba_write(AHCI_HBA_GHC, ghc | AHCI_HBA_GHC_AE);

	hba_write(AHCI_HBA_GHC, ghc | AHCI_HBA_GHC_AE | AHCI_HBA_GHC_HR);

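	/* The HBA clears the GHC.HR bit by itself once the reset completes. */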
	SPIN_UNTIL(!(hba_read(AHCI_HBA_GHC) & AHCI_HBA_GHC_HR), RESET_DELAY);

	if (hba_read(AHCI_HBA_GHC) & AHCI_HBA_GHC_HR)
		panic("unable to reset HBA");
}

/*===========================================================================*
 *				ahci_init				     *
 *===========================================================================*/
static void ahci_init(int devind)
{
	/* Initialize the device.
	 */
	u32_t base, size, cap, ghc, mask;
	int r, port, ioflag;

	if ((r = pci_get_bar(devind, PCI_BAR_6, &base, &size, &ioflag)) != OK)
		panic("unable to retrieve BAR: %d", r);

	if (ioflag)
		panic("invalid BAR type");

	/* There must be at least one port, and at most NR_PORTS ports. Limit
	 * the actual total number of ports to the size of the exposed area.
	 */
	if (size < AHCI_MEM_BASE_SIZE + AHCI_MEM_PORT_SIZE)
		panic("HBA memory size too small: %u", size);

	size = MIN(size, AHCI_MEM_BASE_SIZE + AHCI_MEM_PORT_SIZE * NR_PORTS);

	hba_state.nr_ports = (size - AHCI_MEM_BASE_SIZE) / AHCI_MEM_PORT_SIZE;
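	/* (In the AHCI specification, the generic host control area takes
	 * 0x100 bytes and each port register set takes 0x80 bytes, which is
	 * what AHCI_MEM_BASE_SIZE and AHCI_MEM_PORT_SIZE are expected to
	 * reflect.)
	 */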

	/* Map the register area into local memory. */
	hba_state.base = (u32_t *) vm_map_phys(SELF, (void *) base, size);
	hba_state.size = size;
	if (hba_state.base == MAP_FAILED)
		panic("unable to map HBA memory");

	/* Retrieve, allocate and enable the controller's IRQ. */
	hba_state.irq = pci_attr_r8(devind, PCI_ILR);
	hba_state.hook_id = 0;

	if ((r = sys_irqsetpolicy(hba_state.irq, 0, &hba_state.hook_id)) != OK)
		panic("unable to register IRQ: %d", r);

	if ((r = sys_irqenable(&hba_state.hook_id)) != OK)
		panic("unable to enable IRQ: %d", r);

	/* Reset the HBA. */
	ahci_reset();

	/* Enable AHCI and interrupts. */
	ghc = hba_read(AHCI_HBA_GHC);
	hba_write(AHCI_HBA_GHC, ghc | AHCI_HBA_GHC_AE | AHCI_HBA_GHC_IE);

	/* Retrieve the controller's capabilities, and limit the maximum number
	 * of commands to what the controller supports.
	 */
	cap = hba_read(AHCI_HBA_CAP);
	hba_state.has_ncq = !!(cap & AHCI_HBA_CAP_SNCQ);
	hba_state.has_clo = !!(cap & AHCI_HBA_CAP_SCLO);
	hba_state.nr_cmds = MIN(NR_CMDS,
		((cap >> AHCI_HBA_CAP_NCS_SHIFT) & AHCI_HBA_CAP_NCS_MASK) + 1);

	dprintf(V_INFO, ("AHCI%u: HBA v%d.%d%d, %ld ports, %ld commands, "
		"%s queuing, IRQ %d\n",
		ahci_instance,
		(int) (hba_read(AHCI_HBA_VS) >> 16),
		(int) ((hba_read(AHCI_HBA_VS) >> 8) & 0xFF),
		(int) (hba_read(AHCI_HBA_VS) & 0xFF),
		((cap >> AHCI_HBA_CAP_NP_SHIFT) & AHCI_HBA_CAP_NP_MASK) + 1,
		((cap >> AHCI_HBA_CAP_NCS_SHIFT) & AHCI_HBA_CAP_NCS_MASK) + 1,
		hba_state.has_ncq ? "supports" : "no", hba_state.irq));

	dprintf(V_INFO, ("AHCI%u: CAP %08x, CAP2 %08x, PI %08x\n",
		ahci_instance, cap, hba_read(AHCI_HBA_CAP2),
		hba_read(AHCI_HBA_PI)));

	/* Initialize each of the implemented ports. We ignore CAP.NP. */
	mask = hba_read(AHCI_HBA_PI);

	for (port = 0; port < hba_state.nr_ports; port++) {
		port_state[port].device = NO_DEVICE;
		port_state[port].state = STATE_NO_PORT;

		if (mask & (1 << port))
			port_init(&port_state[port]);
	}
}

/*===========================================================================*
 *				ahci_stop				     *
 *===========================================================================*/
static void ahci_stop(void)
{
	/* Disable AHCI, and clean up resources to the extent possible.
	 */
	struct port_state *ps;
	int r, port;

	for (port = 0; port < hba_state.nr_ports; port++) {
		ps = &port_state[port];

		if (ps->state != STATE_NO_PORT) {
			port_stop(ps);

			port_free(ps);
		}
	}

	ahci_reset();

	if ((r = vm_unmap_phys(SELF, (void *) hba_state.base,
			hba_state.size)) != OK)
		panic("unable to unmap HBA memory: %d", r);

	if ((r = sys_irqrmpolicy(&hba_state.hook_id)) != OK)
		panic("unable to deregister IRQ: %d", r);
}

/*===========================================================================*
 *				ahci_alarm				     *
 *===========================================================================*/
static void ahci_alarm(clock_t stamp)
{
	/* Process an alarm.
	 */

	/* Call the port-specific handler for each port that timed out. */
	expire_timers(stamp);
}

/*===========================================================================*
 *				ahci_intr				     *
 *===========================================================================*/
static void ahci_intr(unsigned int UNUSED(mask))
{
	/* Process an interrupt.
	 */
	struct port_state *ps;
	u32_t mask;
	int r, port;

	/* Handle an interrupt for each port that has the interrupt bit set. */
	mask = hba_read(AHCI_HBA_IS);

	for (port = 0; port < hba_state.nr_ports; port++) {
		if (mask & (1 << port)) {
			ps = &port_state[port];

			port_intr(ps);

			/* After processing an interrupt, wake up the device
			 * thread if it is suspended and now no longer busy.
			 */
			if ((ps->flags & (FLAG_SUSPENDED | FLAG_BUSY)) ==
					FLAG_SUSPENDED)
				blockdriver_mt_wakeup(ps->cmd_info[0].tid);
		}
	}

	/* Clear the bits that we processed. */
	hba_write(AHCI_HBA_IS, mask);
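	/* (The interrupt status bits are write-1-to-clear, so writing back the
	 * mask we read clears exactly the bits that we just handled.)
	 */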

	/* Reenable the interrupt. */
	if ((r = sys_irqenable(&hba_state.hook_id)) != OK)
		panic("unable to enable IRQ: %d", r);
}

/*===========================================================================*
 *				ahci_get_params				     *
 *===========================================================================*/
static void ahci_get_params(void)
{
	/* Retrieve and parse parameters passed to this driver, except the
	 * device-to-port mapping, which has to be parsed later.
	 */
	long v;
	unsigned int i;

	/* Find out which driver instance we are. */
	v = 0;
	(void) env_parse("instance", "d", 0, &v, 0, 255);
	ahci_instance = (int) v;

	/* Initialize the verbosity level. */
	v = V_ERR;
	(void) env_parse("ahci_verbose", "d", 0, &v, V_NONE, V_REQ);
	ahci_verbose = (int) v;

	/* Initialize timeout-related values. */
	for (i = 0; i < sizeof(ahci_timevar) / sizeof(ahci_timevar[0]); i++) {
		v = ahci_timevar[i].default_ms;

		(void) env_parse(ahci_timevar[i].name, "d", 0, &v, 1,
			LONG_MAX);

		*ahci_timevar[i].ptr = millis_to_hz(v);
	}

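	/* Compute the number of device readiness checks as the device timeout
	 * divided by the delay between checks, rounded up.
	 */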
	ahci_device_delay = millis_to_hz(DEVICE_DELAY);
	ahci_device_checks = (ahci_device_timeout + ahci_device_delay - 1) /
		ahci_device_delay;
}

/*===========================================================================*
 *				ahci_set_mapping			     *
 *===========================================================================*/
static void ahci_set_mapping(void)
{
	/* Construct a mapping from device nodes to port numbers.
	 */
	char key[16], val[32], *p;
	unsigned int port;
	int i, j;

	/* Start off with a mapping that includes implemented ports only, in
	 * order. We choose this mapping over an identity mapping to maximize
	 * the chance that the user will be able to access the first MAX_DRIVES
	 * devices. Note that we can only do this after initializing the HBA.
	 */
	for (i = j = 0; i < NR_PORTS && j < MAX_DRIVES; i++)
		if (port_state[i].state != STATE_NO_PORT)
			ahci_map[j++] = i;

	for ( ; j < MAX_DRIVES; j++)
		ahci_map[j] = NO_PORT;

	/* See if the user specified a custom mapping. Unlike all other
	 * configuration options, this is a per-instance setting.
	 */
	strlcpy(key, "ahci0_map", sizeof(key));
	key[4] += ahci_instance;
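	/* (For driver instance 2, the above turns the key name "ahci0_map"
	 * into "ahci2_map".)
	 */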

	if (env_get_param(key, val, sizeof(val)) == OK) {
		/* Parse the mapping, which is assumed to be a comma-separated
		 * list of zero-based port numbers.
		 */
		p = val;

		for (i = 0; i < MAX_DRIVES; i++) {
			if (*p) {
				port = (unsigned int) strtoul(p, &p, 0);

				if (*p) p++;

				ahci_map[i] = port % NR_PORTS;
			}
			else ahci_map[i] = NO_PORT;
		}
	}

	/* Create a reverse mapping. */
	for (i = 0; i < MAX_DRIVES; i++)
		if ((j = ahci_map[i]) != NO_PORT)
			port_state[j].device = i;
}

/*===========================================================================*
 *				sef_cb_init_fresh			     *
 *===========================================================================*/
static int sef_cb_init_fresh(int type, sef_init_info_t *UNUSED(info))
{
	/* Initialize the driver.
	 */
	int devind;

	/* Get command line parameters. */
	ahci_get_params();

	/* Probe for recognized devices, skipping matches as appropriate. */
	devind = ahci_probe(ahci_instance);

	if (devind < 0)
		panic("no matching device found");

	/* Initialize the device we found. */
	ahci_init(devind);

	/* Create a mapping from device nodes to port numbers. */
	ahci_set_mapping();

	/* Announce that we are up. */
	blockdriver_announce(type);

	return OK;
}

/*===========================================================================*
 *				sef_cb_signal_handler			     *
 *===========================================================================*/
static void sef_cb_signal_handler(int signo)
{
	/* In case of a termination signal, shut down this driver.
	 */
	int port;

	if (signo != SIGTERM) return;

	/* If any ports are still opened, assume that the system is being shut
	 * down, and stay up until the last device has been closed.
	 */
	ahci_exiting = TRUE;

	for (port = 0; port < hba_state.nr_ports; port++)
		if (port_state[port].open_count > 0)
			return;

	/* If not, stop the driver and exit immediately. */
	ahci_stop();

	exit(0);
}

/*===========================================================================*
 *				sef_local_startup			     *
 *===========================================================================*/
static void sef_local_startup(void)
{
	/* Set callbacks and initialize the System Event Framework (SEF).
	 */

	/* Register init callbacks. */
	sef_setcb_init_fresh(sef_cb_init_fresh);

	/* Register signal callbacks. */
	sef_setcb_signal_handler(sef_cb_signal_handler);

	/* Enable support for live update. */
	blockdriver_mt_support_lu();

	/* Let SEF perform startup. */
	sef_startup();
}

/*===========================================================================*
 *				ahci_portname				     *
 *===========================================================================*/
static char *ahci_portname(struct port_state *ps)
{
	/* Return a printable name for the given port. Whenever we can, print a
	 * "Dx" device number rather than a "Pxx" port number, because the user
	 * may not be aware of the mapping currently in use.
	 */
	static char name[] = "AHCI0-P00";
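	/* (The buffer is patched in place: for example, port 1 of instance 0
	 * yields "AHCI0-P01", or "AHCI0-D1" if a device mapping exists.)
	 */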

	name[4] = '0' + ahci_instance;

	if (ps->device == NO_DEVICE) {
		name[6] = 'P';
		name[7] = '0' + (ps - port_state) / 10;
		name[8] = '0' + (ps - port_state) % 10;
	}
	else {
		name[6] = 'D';
		name[7] = '0' + ps->device;
		name[8] = 0;
	}

	return name;
}

/*===========================================================================*
 *				ahci_map_minor				     *
 *===========================================================================*/
static struct port_state *ahci_map_minor(devminor_t minor, struct device **dvp)
{
	/* Map a minor device number to a port and a pointer to the partition's
	 * device structure. Return NULL if this minor device number does not
	 * identify an actual device.
	 */
	struct port_state *ps;
	int port;

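	/* Minors below NR_MINORS address drives and their primary partitions,
	 * with DEV_PER_DRIVE minors per drive; minors starting at MINOR_d0p0s0
	 * address subpartitions, with SUB_PER_DRIVE minors per drive.
	 */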
	ps = NULL;

	if (minor >= 0 && minor < NR_MINORS) {
		port = ahci_map[minor / DEV_PER_DRIVE];

		if (port == NO_PORT)
			return NULL;

		ps = &port_state[port];
		*dvp = &ps->part[minor % DEV_PER_DRIVE];
	}
	else if ((unsigned) (minor -= MINOR_d0p0s0) < NR_SUBDEVS) {
		port = ahci_map[minor / SUB_PER_DRIVE];

		if (port == NO_PORT)
			return NULL;

		ps = &port_state[port];
		*dvp = &ps->subpart[minor % SUB_PER_DRIVE];
	}

	return ps;
}

/*===========================================================================*
 *				ahci_part				     *
 *===========================================================================*/
static struct device *ahci_part(devminor_t minor)
{
	/* Return a pointer to the partition information structure of the given
	 * minor device.
	 */
	struct device *dv;

	if (ahci_map_minor(minor, &dv) == NULL)
		return NULL;

	return dv;
}

/*===========================================================================*
 *				ahci_open				     *
 *===========================================================================*/
static int ahci_open(devminor_t minor, int access)
{
	/* Open a device.
	 */
	struct port_state *ps;
	int r;

	ps = ahci_get_port(minor);

	/* Only one open request can be processed at a time, because opening is
	 * an exclusive operation. The thread that handles this call can
	 * therefore freely register itself at slot zero.
	 */
	ps->cmd_info[0].tid = blockdriver_mt_get_tid();

	/* If we are still in the process of initializing this port or device,
	 * wait for completion of that phase first.
	 */
	if (ps->flags & FLAG_BUSY)
		port_wait(ps);

	/* The device may only be opened if it is now properly functioning. */
	if (ps->state != STATE_GOOD_DEV)
		return ENXIO;

	/* Some devices may only be opened in read-only mode. */
	if ((ps->flags & FLAG_READONLY) && (access & BDEV_W_BIT))
		return EACCES;

	if (ps->open_count == 0) {
		/* The first open request. Clear the barrier flag, if set. */
		ps->flags &= ~FLAG_BARRIER;

		/* Recheck media only when nobody is using the device. */
		if ((ps->flags & FLAG_ATAPI) &&
			(r = atapi_check_medium(ps, 0)) != OK)
			return r;

		/* After rechecking the media, the partition table must always
		 * be read. This is also a convenient time to do it for
		 * nonremovable devices. Start by resetting the partition
		 * tables and setting the working size of the entire device.
		 */
		memset(ps->part, 0, sizeof(ps->part));
		memset(ps->subpart, 0, sizeof(ps->subpart));

		ps->part[0].dv_size = ps->lba_count * ps->sector_size;

		partition(&ahci_dtab, ps->device * DEV_PER_DRIVE, P_PRIMARY,
			!!(ps->flags & FLAG_ATAPI));
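		/* (partition(), provided by libblockdriver, reads in the
		 * partition tables through this driver's own transfer
		 * interface and fills in ps->part and ps->subpart.)
		 */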

		blockdriver_mt_set_workers(ps->device, ps->queue_depth);
	}
	else {
		/* If the barrier flag is set, deny new open requests until the
		 * device is fully closed first.
		 */
		if (ps->flags & FLAG_BARRIER)
			return ENXIO;
	}

	ps->open_count++;

	return OK;
}

/*===========================================================================*
 *				ahci_close				     *
 *===========================================================================*/
static int ahci_close(devminor_t minor)
{
	/* Close a device.
	 */
	struct port_state *ps;
	int port;

	ps = ahci_get_port(minor);

	/* Decrease the open count. */
	if (ps->open_count <= 0) {
		dprintf(V_ERR, ("%s: closing already-closed port\n",
			ahci_portname(ps)));

		return EINVAL;
	}

	ps->open_count--;

	if (ps->open_count > 0)
		return OK;

	/* The device is now fully closed. That also means that the threads for
	 * this device are not needed anymore, so we reduce the count to one.
	 */
	blockdriver_mt_set_workers(ps->device, 1);

	if (ps->state == STATE_GOOD_DEV && !(ps->flags & FLAG_BARRIER)) {
		dprintf(V_INFO, ("%s: flushing write cache\n",
			ahci_portname(ps)));

		(void) gen_flush_wcache(ps);
	}

	/* If the entire driver has been told to terminate, check whether all
	 * devices are now closed. If so, tell libblockdriver to quit after
	 * replying to the close request.
	 */
	if (ahci_exiting) {
		for (port = 0; port < hba_state.nr_ports; port++)
			if (port_state[port].open_count > 0)
				break;

		if (port == hba_state.nr_ports) {
			ahci_stop();

			blockdriver_mt_terminate();
		}
	}

	return OK;
}

/*===========================================================================*
 *				ahci_transfer				     *
 *===========================================================================*/
static ssize_t ahci_transfer(devminor_t minor, int do_write, u64_t position,
	endpoint_t endpt, iovec_t *iovec, unsigned int count, int flags)
{
	/* Perform data transfer on the selected device.
	 */
	struct port_state *ps;
	struct device *dv;
	u64_t pos, eof;

	ps = ahci_get_port(minor);
	dv = ahci_part(minor);

	if (ps->state != STATE_GOOD_DEV || (ps->flags & FLAG_BARRIER))
		return EIO;

	if (count > NR_IOREQS)
		return EINVAL;

	/* Check for basic end-of-partition condition: if the start position of
	 * the request is outside the partition, return success immediately.
	 * The size of the request is obtained, and possibly reduced, later.
	 */
	if (position >= dv->dv_size)
		return OK;

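	/* Translate the partition-relative position to device-absolute byte
	 * offsets: pos is where the transfer starts, eof is where the
	 * partition ends.
	 */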
	pos = dv->dv_base + position;
	eof = dv->dv_base + dv->dv_size;

	return port_transfer(ps, pos, eof, endpt, (iovec_s_t *) iovec, count,
		do_write, flags);
}

/*===========================================================================*
 *				ahci_ioctl				     *
 *===========================================================================*/
static int ahci_ioctl(devminor_t minor, unsigned long request,
	endpoint_t endpt, cp_grant_id_t grant, endpoint_t UNUSED(user_endpt))
{
	/* Process I/O control requests.
	 */
	struct port_state *ps;
	int r, val;

	ps = ahci_get_port(minor);

	switch (request) {
	case DIOCEJECT:
		if (ps->state != STATE_GOOD_DEV || (ps->flags & FLAG_BARRIER))
			return EIO;

		if (!(ps->flags & FLAG_ATAPI))
			return EINVAL;

		return atapi_load_eject(ps, 0, FALSE /*load*/);

	case DIOCOPENCT:
		return sys_safecopyto(endpt, grant, 0,
			(vir_bytes) &ps->open_count, sizeof(ps->open_count));

	case DIOCFLUSH:
		if (ps->state != STATE_GOOD_DEV || (ps->flags & FLAG_BARRIER))
			return EIO;

		return gen_flush_wcache(ps);

	case DIOCSETWC:
		if (ps->state != STATE_GOOD_DEV || (ps->flags & FLAG_BARRIER))
			return EIO;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &val,
			sizeof(val))) != OK)
			return r;

		return gen_set_wcache(ps, val);

	case DIOCGETWC:
		if (ps->state != STATE_GOOD_DEV || (ps->flags & FLAG_BARRIER))
			return EIO;

		if ((r = gen_get_wcache(ps, &val)) != OK)
			return r;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes) &val,
			sizeof(val));
	}

	return ENOTTY;
}

/*===========================================================================*
 *				ahci_device				     *
 *===========================================================================*/
static int ahci_device(devminor_t minor, device_id_t *id)
{
	/* Map a minor device number to a device ID.
	 */
	struct port_state *ps;
	struct device *dv;

	if ((ps = ahci_map_minor(minor, &dv)) == NULL)
		return ENXIO;

	*id = ps->device;

	return OK;
}

/*===========================================================================*
 *				ahci_get_port				     *
 *===========================================================================*/
static struct port_state *ahci_get_port(devminor_t minor)
{
	/* Get the port structure associated with the given minor device.
	 * Called only from worker threads, so the minor device is already
	 * guaranteed to map to a port.
	 */
	struct port_state *ps;
	struct device *dv;

	if ((ps = ahci_map_minor(minor, &dv)) == NULL)
		panic("device mapping for minor %d disappeared", minor);

	return ps;
}

/*===========================================================================*
 *				main					     *
 *===========================================================================*/
int main(int argc, char **argv)
{
	/* Driver task.
	 */

	env_setargs(argc, argv);
	sef_local_startup();

	blockdriver_mt_task(&ahci_dtab);

	return 0;
}