xref: /qemu/block/qcow.c (revision 019d6b8f)
1*019d6b8fSAnthony Liguori /*
2*019d6b8fSAnthony Liguori  * Block driver for the QCOW format
3*019d6b8fSAnthony Liguori  *
4*019d6b8fSAnthony Liguori  * Copyright (c) 2004-2006 Fabrice Bellard
5*019d6b8fSAnthony Liguori  *
6*019d6b8fSAnthony Liguori  * Permission is hereby granted, free of charge, to any person obtaining a copy
7*019d6b8fSAnthony Liguori  * of this software and associated documentation files (the "Software"), to deal
8*019d6b8fSAnthony Liguori  * in the Software without restriction, including without limitation the rights
9*019d6b8fSAnthony Liguori  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10*019d6b8fSAnthony Liguori  * copies of the Software, and to permit persons to whom the Software is
11*019d6b8fSAnthony Liguori  * furnished to do so, subject to the following conditions:
12*019d6b8fSAnthony Liguori  *
13*019d6b8fSAnthony Liguori  * The above copyright notice and this permission notice shall be included in
14*019d6b8fSAnthony Liguori  * all copies or substantial portions of the Software.
15*019d6b8fSAnthony Liguori  *
16*019d6b8fSAnthony Liguori  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17*019d6b8fSAnthony Liguori  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18*019d6b8fSAnthony Liguori  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19*019d6b8fSAnthony Liguori  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20*019d6b8fSAnthony Liguori  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21*019d6b8fSAnthony Liguori  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22*019d6b8fSAnthony Liguori  * THE SOFTWARE.
23*019d6b8fSAnthony Liguori  */
24*019d6b8fSAnthony Liguori #include "qemu-common.h"
25*019d6b8fSAnthony Liguori #include "block_int.h"
26*019d6b8fSAnthony Liguori #include "module.h"
27*019d6b8fSAnthony Liguori #include <zlib.h>
28*019d6b8fSAnthony Liguori #include "aes.h"
29*019d6b8fSAnthony Liguori 
30*019d6b8fSAnthony Liguori /**************************************************************/
31*019d6b8fSAnthony Liguori /* QEMU COW block driver with compression and encryption support */
32*019d6b8fSAnthony Liguori 
/* Magic value stored (big-endian) in the first header field: 'Q' 'F' 'I' 0xfb. */
#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
/* The only header version accepted by this driver. */
#define QCOW_VERSION 1

/* Values of the header's crypt_method field. */
#define QCOW_CRYPT_NONE 0
#define QCOW_CRYPT_AES  1

/*
 * Top bit of an L2 entry: set when the entry describes a compressed
 * cluster. Built from an unsigned constant because shifting a signed 1
 * into bit 63 is undefined behaviour in C; every use in this file
 * combines the flag with a uint64_t, so the value is unchanged.
 */
#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
40*019d6b8fSAnthony Liguori 
/*
 * Image header as read from offset 0 of the image file. qcow_open()
 * reads the file bytes straight into this struct and then converts the
 * multi-byte fields from big-endian, so the field order and sizes must
 * not change.
 */
typedef struct QCowHeader {
    uint32_t magic;               /* must equal QCOW_MAGIC */
    uint32_t version;             /* must equal QCOW_VERSION */
    uint64_t backing_file_offset; /* file offset of the backing file name, 0 if none */
    uint32_t backing_file_size;   /* length in bytes of the backing file name */
    uint32_t mtime;
    uint64_t size;                /* virtual disk size in bytes */
    uint8_t cluster_bits;         /* cluster size is 1 << cluster_bits */
    uint8_t l2_bits;              /* an L2 table holds 1 << l2_bits entries */
    uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint64_t l1_table_offset;     /* file offset of the L1 table */
} QCowHeader;
53*019d6b8fSAnthony Liguori 
54*019d6b8fSAnthony Liguori #define L2_CACHE_SIZE 16
55*019d6b8fSAnthony Liguori 
56*019d6b8fSAnthony Liguori typedef struct BDRVQcowState {
57*019d6b8fSAnthony Liguori     BlockDriverState *hd;
58*019d6b8fSAnthony Liguori     int cluster_bits;
59*019d6b8fSAnthony Liguori     int cluster_size;
60*019d6b8fSAnthony Liguori     int cluster_sectors;
61*019d6b8fSAnthony Liguori     int l2_bits;
62*019d6b8fSAnthony Liguori     int l2_size;
63*019d6b8fSAnthony Liguori     int l1_size;
64*019d6b8fSAnthony Liguori     uint64_t cluster_offset_mask;
65*019d6b8fSAnthony Liguori     uint64_t l1_table_offset;
66*019d6b8fSAnthony Liguori     uint64_t *l1_table;
67*019d6b8fSAnthony Liguori     uint64_t *l2_cache;
68*019d6b8fSAnthony Liguori     uint64_t l2_cache_offsets[L2_CACHE_SIZE];
69*019d6b8fSAnthony Liguori     uint32_t l2_cache_counts[L2_CACHE_SIZE];
70*019d6b8fSAnthony Liguori     uint8_t *cluster_cache;
71*019d6b8fSAnthony Liguori     uint8_t *cluster_data;
72*019d6b8fSAnthony Liguori     uint64_t cluster_cache_offset;
73*019d6b8fSAnthony Liguori     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
74*019d6b8fSAnthony Liguori     uint32_t crypt_method_header;
75*019d6b8fSAnthony Liguori     AES_KEY aes_encrypt_key;
76*019d6b8fSAnthony Liguori     AES_KEY aes_decrypt_key;
77*019d6b8fSAnthony Liguori } BDRVQcowState;
78*019d6b8fSAnthony Liguori 
79*019d6b8fSAnthony Liguori static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
80*019d6b8fSAnthony Liguori 
81*019d6b8fSAnthony Liguori static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
82*019d6b8fSAnthony Liguori {
83*019d6b8fSAnthony Liguori     const QCowHeader *cow_header = (const void *)buf;
84*019d6b8fSAnthony Liguori 
85*019d6b8fSAnthony Liguori     if (buf_size >= sizeof(QCowHeader) &&
86*019d6b8fSAnthony Liguori         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
87*019d6b8fSAnthony Liguori         be32_to_cpu(cow_header->version) == QCOW_VERSION)
88*019d6b8fSAnthony Liguori         return 100;
89*019d6b8fSAnthony Liguori     else
90*019d6b8fSAnthony Liguori         return 0;
91*019d6b8fSAnthony Liguori }
92*019d6b8fSAnthony Liguori 
93*019d6b8fSAnthony Liguori static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
94*019d6b8fSAnthony Liguori {
95*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
96*019d6b8fSAnthony Liguori     int len, i, shift, ret;
97*019d6b8fSAnthony Liguori     QCowHeader header;
98*019d6b8fSAnthony Liguori 
99*019d6b8fSAnthony Liguori     ret = bdrv_file_open(&s->hd, filename, flags);
100*019d6b8fSAnthony Liguori     if (ret < 0)
101*019d6b8fSAnthony Liguori         return ret;
102*019d6b8fSAnthony Liguori     if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
103*019d6b8fSAnthony Liguori         goto fail;
104*019d6b8fSAnthony Liguori     be32_to_cpus(&header.magic);
105*019d6b8fSAnthony Liguori     be32_to_cpus(&header.version);
106*019d6b8fSAnthony Liguori     be64_to_cpus(&header.backing_file_offset);
107*019d6b8fSAnthony Liguori     be32_to_cpus(&header.backing_file_size);
108*019d6b8fSAnthony Liguori     be32_to_cpus(&header.mtime);
109*019d6b8fSAnthony Liguori     be64_to_cpus(&header.size);
110*019d6b8fSAnthony Liguori     be32_to_cpus(&header.crypt_method);
111*019d6b8fSAnthony Liguori     be64_to_cpus(&header.l1_table_offset);
112*019d6b8fSAnthony Liguori 
113*019d6b8fSAnthony Liguori     if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
114*019d6b8fSAnthony Liguori         goto fail;
115*019d6b8fSAnthony Liguori     if (header.size <= 1 || header.cluster_bits < 9)
116*019d6b8fSAnthony Liguori         goto fail;
117*019d6b8fSAnthony Liguori     if (header.crypt_method > QCOW_CRYPT_AES)
118*019d6b8fSAnthony Liguori         goto fail;
119*019d6b8fSAnthony Liguori     s->crypt_method_header = header.crypt_method;
120*019d6b8fSAnthony Liguori     if (s->crypt_method_header)
121*019d6b8fSAnthony Liguori         bs->encrypted = 1;
122*019d6b8fSAnthony Liguori     s->cluster_bits = header.cluster_bits;
123*019d6b8fSAnthony Liguori     s->cluster_size = 1 << s->cluster_bits;
124*019d6b8fSAnthony Liguori     s->cluster_sectors = 1 << (s->cluster_bits - 9);
125*019d6b8fSAnthony Liguori     s->l2_bits = header.l2_bits;
126*019d6b8fSAnthony Liguori     s->l2_size = 1 << s->l2_bits;
127*019d6b8fSAnthony Liguori     bs->total_sectors = header.size / 512;
128*019d6b8fSAnthony Liguori     s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
129*019d6b8fSAnthony Liguori 
130*019d6b8fSAnthony Liguori     /* read the level 1 table */
131*019d6b8fSAnthony Liguori     shift = s->cluster_bits + s->l2_bits;
132*019d6b8fSAnthony Liguori     s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
133*019d6b8fSAnthony Liguori 
134*019d6b8fSAnthony Liguori     s->l1_table_offset = header.l1_table_offset;
135*019d6b8fSAnthony Liguori     s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
136*019d6b8fSAnthony Liguori     if (!s->l1_table)
137*019d6b8fSAnthony Liguori         goto fail;
138*019d6b8fSAnthony Liguori     if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
139*019d6b8fSAnthony Liguori         s->l1_size * sizeof(uint64_t))
140*019d6b8fSAnthony Liguori         goto fail;
141*019d6b8fSAnthony Liguori     for(i = 0;i < s->l1_size; i++) {
142*019d6b8fSAnthony Liguori         be64_to_cpus(&s->l1_table[i]);
143*019d6b8fSAnthony Liguori     }
144*019d6b8fSAnthony Liguori     /* alloc L2 cache */
145*019d6b8fSAnthony Liguori     s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
146*019d6b8fSAnthony Liguori     if (!s->l2_cache)
147*019d6b8fSAnthony Liguori         goto fail;
148*019d6b8fSAnthony Liguori     s->cluster_cache = qemu_malloc(s->cluster_size);
149*019d6b8fSAnthony Liguori     if (!s->cluster_cache)
150*019d6b8fSAnthony Liguori         goto fail;
151*019d6b8fSAnthony Liguori     s->cluster_data = qemu_malloc(s->cluster_size);
152*019d6b8fSAnthony Liguori     if (!s->cluster_data)
153*019d6b8fSAnthony Liguori         goto fail;
154*019d6b8fSAnthony Liguori     s->cluster_cache_offset = -1;
155*019d6b8fSAnthony Liguori 
156*019d6b8fSAnthony Liguori     /* read the backing file name */
157*019d6b8fSAnthony Liguori     if (header.backing_file_offset != 0) {
158*019d6b8fSAnthony Liguori         len = header.backing_file_size;
159*019d6b8fSAnthony Liguori         if (len > 1023)
160*019d6b8fSAnthony Liguori             len = 1023;
161*019d6b8fSAnthony Liguori         if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
162*019d6b8fSAnthony Liguori             goto fail;
163*019d6b8fSAnthony Liguori         bs->backing_file[len] = '\0';
164*019d6b8fSAnthony Liguori     }
165*019d6b8fSAnthony Liguori     return 0;
166*019d6b8fSAnthony Liguori 
167*019d6b8fSAnthony Liguori  fail:
168*019d6b8fSAnthony Liguori     qemu_free(s->l1_table);
169*019d6b8fSAnthony Liguori     qemu_free(s->l2_cache);
170*019d6b8fSAnthony Liguori     qemu_free(s->cluster_cache);
171*019d6b8fSAnthony Liguori     qemu_free(s->cluster_data);
172*019d6b8fSAnthony Liguori     bdrv_delete(s->hd);
173*019d6b8fSAnthony Liguori     return -1;
174*019d6b8fSAnthony Liguori }
175*019d6b8fSAnthony Liguori 
176*019d6b8fSAnthony Liguori static int qcow_set_key(BlockDriverState *bs, const char *key)
177*019d6b8fSAnthony Liguori {
178*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
179*019d6b8fSAnthony Liguori     uint8_t keybuf[16];
180*019d6b8fSAnthony Liguori     int len, i;
181*019d6b8fSAnthony Liguori 
182*019d6b8fSAnthony Liguori     memset(keybuf, 0, 16);
183*019d6b8fSAnthony Liguori     len = strlen(key);
184*019d6b8fSAnthony Liguori     if (len > 16)
185*019d6b8fSAnthony Liguori         len = 16;
186*019d6b8fSAnthony Liguori     /* XXX: we could compress the chars to 7 bits to increase
187*019d6b8fSAnthony Liguori        entropy */
188*019d6b8fSAnthony Liguori     for(i = 0;i < len;i++) {
189*019d6b8fSAnthony Liguori         keybuf[i] = key[i];
190*019d6b8fSAnthony Liguori     }
191*019d6b8fSAnthony Liguori     s->crypt_method = s->crypt_method_header;
192*019d6b8fSAnthony Liguori 
193*019d6b8fSAnthony Liguori     if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
194*019d6b8fSAnthony Liguori         return -1;
195*019d6b8fSAnthony Liguori     if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
196*019d6b8fSAnthony Liguori         return -1;
197*019d6b8fSAnthony Liguori #if 0
198*019d6b8fSAnthony Liguori     /* test */
199*019d6b8fSAnthony Liguori     {
200*019d6b8fSAnthony Liguori         uint8_t in[16];
201*019d6b8fSAnthony Liguori         uint8_t out[16];
202*019d6b8fSAnthony Liguori         uint8_t tmp[16];
203*019d6b8fSAnthony Liguori         for(i=0;i<16;i++)
204*019d6b8fSAnthony Liguori             in[i] = i;
205*019d6b8fSAnthony Liguori         AES_encrypt(in, tmp, &s->aes_encrypt_key);
206*019d6b8fSAnthony Liguori         AES_decrypt(tmp, out, &s->aes_decrypt_key);
207*019d6b8fSAnthony Liguori         for(i = 0; i < 16; i++)
208*019d6b8fSAnthony Liguori             printf(" %02x", tmp[i]);
209*019d6b8fSAnthony Liguori         printf("\n");
210*019d6b8fSAnthony Liguori         for(i = 0; i < 16; i++)
211*019d6b8fSAnthony Liguori             printf(" %02x", out[i]);
212*019d6b8fSAnthony Liguori         printf("\n");
213*019d6b8fSAnthony Liguori     }
214*019d6b8fSAnthony Liguori #endif
215*019d6b8fSAnthony Liguori     return 0;
216*019d6b8fSAnthony Liguori }
217*019d6b8fSAnthony Liguori 
218*019d6b8fSAnthony Liguori /* The crypt function is compatible with the linux cryptoloop
219*019d6b8fSAnthony Liguori    algorithm for < 4 GB images. NOTE: out_buf == in_buf is
220*019d6b8fSAnthony Liguori    supported */
221*019d6b8fSAnthony Liguori static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
222*019d6b8fSAnthony Liguori                             uint8_t *out_buf, const uint8_t *in_buf,
223*019d6b8fSAnthony Liguori                             int nb_sectors, int enc,
224*019d6b8fSAnthony Liguori                             const AES_KEY *key)
225*019d6b8fSAnthony Liguori {
226*019d6b8fSAnthony Liguori     union {
227*019d6b8fSAnthony Liguori         uint64_t ll[2];
228*019d6b8fSAnthony Liguori         uint8_t b[16];
229*019d6b8fSAnthony Liguori     } ivec;
230*019d6b8fSAnthony Liguori     int i;
231*019d6b8fSAnthony Liguori 
232*019d6b8fSAnthony Liguori     for(i = 0; i < nb_sectors; i++) {
233*019d6b8fSAnthony Liguori         ivec.ll[0] = cpu_to_le64(sector_num);
234*019d6b8fSAnthony Liguori         ivec.ll[1] = 0;
235*019d6b8fSAnthony Liguori         AES_cbc_encrypt(in_buf, out_buf, 512, key,
236*019d6b8fSAnthony Liguori                         ivec.b, enc);
237*019d6b8fSAnthony Liguori         sector_num++;
238*019d6b8fSAnthony Liguori         in_buf += 512;
239*019d6b8fSAnthony Liguori         out_buf += 512;
240*019d6b8fSAnthony Liguori     }
241*019d6b8fSAnthony Liguori }
242*019d6b8fSAnthony Liguori 
243*019d6b8fSAnthony Liguori /* 'allocate' is:
244*019d6b8fSAnthony Liguori  *
245*019d6b8fSAnthony Liguori  * 0 to not allocate.
246*019d6b8fSAnthony Liguori  *
247*019d6b8fSAnthony Liguori  * 1 to allocate a normal cluster (for sector indexes 'n_start' to
248*019d6b8fSAnthony Liguori  * 'n_end')
249*019d6b8fSAnthony Liguori  *
250*019d6b8fSAnthony Liguori  * 2 to allocate a compressed cluster of size
251*019d6b8fSAnthony Liguori  * 'compressed_size'. 'compressed_size' must be > 0 and <
252*019d6b8fSAnthony Liguori  * cluster_size
253*019d6b8fSAnthony Liguori  *
254*019d6b8fSAnthony Liguori  * return 0 if not allocated.
255*019d6b8fSAnthony Liguori  */
256*019d6b8fSAnthony Liguori static uint64_t get_cluster_offset(BlockDriverState *bs,
257*019d6b8fSAnthony Liguori                                    uint64_t offset, int allocate,
258*019d6b8fSAnthony Liguori                                    int compressed_size,
259*019d6b8fSAnthony Liguori                                    int n_start, int n_end)
260*019d6b8fSAnthony Liguori {
261*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
262*019d6b8fSAnthony Liguori     int min_index, i, j, l1_index, l2_index;
263*019d6b8fSAnthony Liguori     uint64_t l2_offset, *l2_table, cluster_offset, tmp;
264*019d6b8fSAnthony Liguori     uint32_t min_count;
265*019d6b8fSAnthony Liguori     int new_l2_table;
266*019d6b8fSAnthony Liguori 
267*019d6b8fSAnthony Liguori     l1_index = offset >> (s->l2_bits + s->cluster_bits);
268*019d6b8fSAnthony Liguori     l2_offset = s->l1_table[l1_index];
269*019d6b8fSAnthony Liguori     new_l2_table = 0;
270*019d6b8fSAnthony Liguori     if (!l2_offset) {
271*019d6b8fSAnthony Liguori         if (!allocate)
272*019d6b8fSAnthony Liguori             return 0;
273*019d6b8fSAnthony Liguori         /* allocate a new l2 entry */
274*019d6b8fSAnthony Liguori         l2_offset = bdrv_getlength(s->hd);
275*019d6b8fSAnthony Liguori         /* round to cluster size */
276*019d6b8fSAnthony Liguori         l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
277*019d6b8fSAnthony Liguori         /* update the L1 entry */
278*019d6b8fSAnthony Liguori         s->l1_table[l1_index] = l2_offset;
279*019d6b8fSAnthony Liguori         tmp = cpu_to_be64(l2_offset);
280*019d6b8fSAnthony Liguori         if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
281*019d6b8fSAnthony Liguori                         &tmp, sizeof(tmp)) != sizeof(tmp))
282*019d6b8fSAnthony Liguori             return 0;
283*019d6b8fSAnthony Liguori         new_l2_table = 1;
284*019d6b8fSAnthony Liguori     }
285*019d6b8fSAnthony Liguori     for(i = 0; i < L2_CACHE_SIZE; i++) {
286*019d6b8fSAnthony Liguori         if (l2_offset == s->l2_cache_offsets[i]) {
287*019d6b8fSAnthony Liguori             /* increment the hit count */
288*019d6b8fSAnthony Liguori             if (++s->l2_cache_counts[i] == 0xffffffff) {
289*019d6b8fSAnthony Liguori                 for(j = 0; j < L2_CACHE_SIZE; j++) {
290*019d6b8fSAnthony Liguori                     s->l2_cache_counts[j] >>= 1;
291*019d6b8fSAnthony Liguori                 }
292*019d6b8fSAnthony Liguori             }
293*019d6b8fSAnthony Liguori             l2_table = s->l2_cache + (i << s->l2_bits);
294*019d6b8fSAnthony Liguori             goto found;
295*019d6b8fSAnthony Liguori         }
296*019d6b8fSAnthony Liguori     }
297*019d6b8fSAnthony Liguori     /* not found: load a new entry in the least used one */
298*019d6b8fSAnthony Liguori     min_index = 0;
299*019d6b8fSAnthony Liguori     min_count = 0xffffffff;
300*019d6b8fSAnthony Liguori     for(i = 0; i < L2_CACHE_SIZE; i++) {
301*019d6b8fSAnthony Liguori         if (s->l2_cache_counts[i] < min_count) {
302*019d6b8fSAnthony Liguori             min_count = s->l2_cache_counts[i];
303*019d6b8fSAnthony Liguori             min_index = i;
304*019d6b8fSAnthony Liguori         }
305*019d6b8fSAnthony Liguori     }
306*019d6b8fSAnthony Liguori     l2_table = s->l2_cache + (min_index << s->l2_bits);
307*019d6b8fSAnthony Liguori     if (new_l2_table) {
308*019d6b8fSAnthony Liguori         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
309*019d6b8fSAnthony Liguori         if (bdrv_pwrite(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
310*019d6b8fSAnthony Liguori             s->l2_size * sizeof(uint64_t))
311*019d6b8fSAnthony Liguori             return 0;
312*019d6b8fSAnthony Liguori     } else {
313*019d6b8fSAnthony Liguori         if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
314*019d6b8fSAnthony Liguori             s->l2_size * sizeof(uint64_t))
315*019d6b8fSAnthony Liguori             return 0;
316*019d6b8fSAnthony Liguori     }
317*019d6b8fSAnthony Liguori     s->l2_cache_offsets[min_index] = l2_offset;
318*019d6b8fSAnthony Liguori     s->l2_cache_counts[min_index] = 1;
319*019d6b8fSAnthony Liguori  found:
320*019d6b8fSAnthony Liguori     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
321*019d6b8fSAnthony Liguori     cluster_offset = be64_to_cpu(l2_table[l2_index]);
322*019d6b8fSAnthony Liguori     if (!cluster_offset ||
323*019d6b8fSAnthony Liguori         ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
324*019d6b8fSAnthony Liguori         if (!allocate)
325*019d6b8fSAnthony Liguori             return 0;
326*019d6b8fSAnthony Liguori         /* allocate a new cluster */
327*019d6b8fSAnthony Liguori         if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
328*019d6b8fSAnthony Liguori             (n_end - n_start) < s->cluster_sectors) {
329*019d6b8fSAnthony Liguori             /* if the cluster is already compressed, we must
330*019d6b8fSAnthony Liguori                decompress it in the case it is not completely
331*019d6b8fSAnthony Liguori                overwritten */
332*019d6b8fSAnthony Liguori             if (decompress_cluster(s, cluster_offset) < 0)
333*019d6b8fSAnthony Liguori                 return 0;
334*019d6b8fSAnthony Liguori             cluster_offset = bdrv_getlength(s->hd);
335*019d6b8fSAnthony Liguori             cluster_offset = (cluster_offset + s->cluster_size - 1) &
336*019d6b8fSAnthony Liguori                 ~(s->cluster_size - 1);
337*019d6b8fSAnthony Liguori             /* write the cluster content */
338*019d6b8fSAnthony Liguori             if (bdrv_pwrite(s->hd, cluster_offset, s->cluster_cache, s->cluster_size) !=
339*019d6b8fSAnthony Liguori                 s->cluster_size)
340*019d6b8fSAnthony Liguori                 return -1;
341*019d6b8fSAnthony Liguori         } else {
342*019d6b8fSAnthony Liguori             cluster_offset = bdrv_getlength(s->hd);
343*019d6b8fSAnthony Liguori             if (allocate == 1) {
344*019d6b8fSAnthony Liguori                 /* round to cluster size */
345*019d6b8fSAnthony Liguori                 cluster_offset = (cluster_offset + s->cluster_size - 1) &
346*019d6b8fSAnthony Liguori                     ~(s->cluster_size - 1);
347*019d6b8fSAnthony Liguori                 bdrv_truncate(s->hd, cluster_offset + s->cluster_size);
348*019d6b8fSAnthony Liguori                 /* if encrypted, we must initialize the cluster
349*019d6b8fSAnthony Liguori                    content which won't be written */
350*019d6b8fSAnthony Liguori                 if (s->crypt_method &&
351*019d6b8fSAnthony Liguori                     (n_end - n_start) < s->cluster_sectors) {
352*019d6b8fSAnthony Liguori                     uint64_t start_sect;
353*019d6b8fSAnthony Liguori                     start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
354*019d6b8fSAnthony Liguori                     memset(s->cluster_data + 512, 0x00, 512);
355*019d6b8fSAnthony Liguori                     for(i = 0; i < s->cluster_sectors; i++) {
356*019d6b8fSAnthony Liguori                         if (i < n_start || i >= n_end) {
357*019d6b8fSAnthony Liguori                             encrypt_sectors(s, start_sect + i,
358*019d6b8fSAnthony Liguori                                             s->cluster_data,
359*019d6b8fSAnthony Liguori                                             s->cluster_data + 512, 1, 1,
360*019d6b8fSAnthony Liguori                                             &s->aes_encrypt_key);
361*019d6b8fSAnthony Liguori                             if (bdrv_pwrite(s->hd, cluster_offset + i * 512,
362*019d6b8fSAnthony Liguori                                             s->cluster_data, 512) != 512)
363*019d6b8fSAnthony Liguori                                 return -1;
364*019d6b8fSAnthony Liguori                         }
365*019d6b8fSAnthony Liguori                     }
366*019d6b8fSAnthony Liguori                 }
367*019d6b8fSAnthony Liguori             } else if (allocate == 2) {
368*019d6b8fSAnthony Liguori                 cluster_offset |= QCOW_OFLAG_COMPRESSED |
369*019d6b8fSAnthony Liguori                     (uint64_t)compressed_size << (63 - s->cluster_bits);
370*019d6b8fSAnthony Liguori             }
371*019d6b8fSAnthony Liguori         }
372*019d6b8fSAnthony Liguori         /* update L2 table */
373*019d6b8fSAnthony Liguori         tmp = cpu_to_be64(cluster_offset);
374*019d6b8fSAnthony Liguori         l2_table[l2_index] = tmp;
375*019d6b8fSAnthony Liguori         if (bdrv_pwrite(s->hd,
376*019d6b8fSAnthony Liguori                         l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
377*019d6b8fSAnthony Liguori             return 0;
378*019d6b8fSAnthony Liguori     }
379*019d6b8fSAnthony Liguori     return cluster_offset;
380*019d6b8fSAnthony Liguori }
381*019d6b8fSAnthony Liguori 
382*019d6b8fSAnthony Liguori static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
383*019d6b8fSAnthony Liguori                              int nb_sectors, int *pnum)
384*019d6b8fSAnthony Liguori {
385*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
386*019d6b8fSAnthony Liguori     int index_in_cluster, n;
387*019d6b8fSAnthony Liguori     uint64_t cluster_offset;
388*019d6b8fSAnthony Liguori 
389*019d6b8fSAnthony Liguori     cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
390*019d6b8fSAnthony Liguori     index_in_cluster = sector_num & (s->cluster_sectors - 1);
391*019d6b8fSAnthony Liguori     n = s->cluster_sectors - index_in_cluster;
392*019d6b8fSAnthony Liguori     if (n > nb_sectors)
393*019d6b8fSAnthony Liguori         n = nb_sectors;
394*019d6b8fSAnthony Liguori     *pnum = n;
395*019d6b8fSAnthony Liguori     return (cluster_offset != 0);
396*019d6b8fSAnthony Liguori }
397*019d6b8fSAnthony Liguori 
398*019d6b8fSAnthony Liguori static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
399*019d6b8fSAnthony Liguori                              const uint8_t *buf, int buf_size)
400*019d6b8fSAnthony Liguori {
401*019d6b8fSAnthony Liguori     z_stream strm1, *strm = &strm1;
402*019d6b8fSAnthony Liguori     int ret, out_len;
403*019d6b8fSAnthony Liguori 
404*019d6b8fSAnthony Liguori     memset(strm, 0, sizeof(*strm));
405*019d6b8fSAnthony Liguori 
406*019d6b8fSAnthony Liguori     strm->next_in = (uint8_t *)buf;
407*019d6b8fSAnthony Liguori     strm->avail_in = buf_size;
408*019d6b8fSAnthony Liguori     strm->next_out = out_buf;
409*019d6b8fSAnthony Liguori     strm->avail_out = out_buf_size;
410*019d6b8fSAnthony Liguori 
411*019d6b8fSAnthony Liguori     ret = inflateInit2(strm, -12);
412*019d6b8fSAnthony Liguori     if (ret != Z_OK)
413*019d6b8fSAnthony Liguori         return -1;
414*019d6b8fSAnthony Liguori     ret = inflate(strm, Z_FINISH);
415*019d6b8fSAnthony Liguori     out_len = strm->next_out - out_buf;
416*019d6b8fSAnthony Liguori     if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
417*019d6b8fSAnthony Liguori         out_len != out_buf_size) {
418*019d6b8fSAnthony Liguori         inflateEnd(strm);
419*019d6b8fSAnthony Liguori         return -1;
420*019d6b8fSAnthony Liguori     }
421*019d6b8fSAnthony Liguori     inflateEnd(strm);
422*019d6b8fSAnthony Liguori     return 0;
423*019d6b8fSAnthony Liguori }
424*019d6b8fSAnthony Liguori 
425*019d6b8fSAnthony Liguori static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
426*019d6b8fSAnthony Liguori {
427*019d6b8fSAnthony Liguori     int ret, csize;
428*019d6b8fSAnthony Liguori     uint64_t coffset;
429*019d6b8fSAnthony Liguori 
430*019d6b8fSAnthony Liguori     coffset = cluster_offset & s->cluster_offset_mask;
431*019d6b8fSAnthony Liguori     if (s->cluster_cache_offset != coffset) {
432*019d6b8fSAnthony Liguori         csize = cluster_offset >> (63 - s->cluster_bits);
433*019d6b8fSAnthony Liguori         csize &= (s->cluster_size - 1);
434*019d6b8fSAnthony Liguori         ret = bdrv_pread(s->hd, coffset, s->cluster_data, csize);
435*019d6b8fSAnthony Liguori         if (ret != csize)
436*019d6b8fSAnthony Liguori             return -1;
437*019d6b8fSAnthony Liguori         if (decompress_buffer(s->cluster_cache, s->cluster_size,
438*019d6b8fSAnthony Liguori                               s->cluster_data, csize) < 0) {
439*019d6b8fSAnthony Liguori             return -1;
440*019d6b8fSAnthony Liguori         }
441*019d6b8fSAnthony Liguori         s->cluster_cache_offset = coffset;
442*019d6b8fSAnthony Liguori     }
443*019d6b8fSAnthony Liguori     return 0;
444*019d6b8fSAnthony Liguori }
445*019d6b8fSAnthony Liguori 
#if 0

/*
 * Synchronous read of 'nb_sectors' sectors starting at 'sector_num'
 * into 'buf' (currently compiled out).
 *
 * The request is processed one cluster at a time: unallocated clusters
 * come from the backing image or read as zeroes, compressed clusters
 * are served from the decompression cache, and plain clusters are read
 * from the file and decrypted when a crypt method is active.
 *
 * Returns 0 on success, -1 on error.
 */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;

    while (nb_sectors > 0) {
        uint64_t cluster_offset =
            get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
        int index_in_cluster = sector_num & (s->cluster_sectors - 1);
        /* never cross a cluster boundary within one step */
        int n = s->cluster_sectors - index_in_cluster;

        if (n > nb_sectors)
            n = nb_sectors;

        if (cluster_offset == 0) {
            if (!bs->backing_hd) {
                memset(buf, 0, 512 * n);
            } else if (bdrv_read(bs->backing_hd, sector_num, buf, n) < 0) {
                /* read from the base image failed */
                return -1;
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            if (decompress_cluster(s, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            if (bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512,
                           buf, n * 512) != n * 512)
                return -1;
            if (s->crypt_method)
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
        }

        buf += n * 512;
        sector_num += n;
        nb_sectors -= n;
    }
    return 0;
}
#endif
490*019d6b8fSAnthony Liguori 
491*019d6b8fSAnthony Liguori static int qcow_write(BlockDriverState *bs, int64_t sector_num,
492*019d6b8fSAnthony Liguori                      const uint8_t *buf, int nb_sectors)
493*019d6b8fSAnthony Liguori {
494*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
495*019d6b8fSAnthony Liguori     int ret, index_in_cluster, n;
496*019d6b8fSAnthony Liguori     uint64_t cluster_offset;
497*019d6b8fSAnthony Liguori 
498*019d6b8fSAnthony Liguori     while (nb_sectors > 0) {
499*019d6b8fSAnthony Liguori         index_in_cluster = sector_num & (s->cluster_sectors - 1);
500*019d6b8fSAnthony Liguori         n = s->cluster_sectors - index_in_cluster;
501*019d6b8fSAnthony Liguori         if (n > nb_sectors)
502*019d6b8fSAnthony Liguori             n = nb_sectors;
503*019d6b8fSAnthony Liguori         cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
504*019d6b8fSAnthony Liguori                                             index_in_cluster,
505*019d6b8fSAnthony Liguori                                             index_in_cluster + n);
506*019d6b8fSAnthony Liguori         if (!cluster_offset)
507*019d6b8fSAnthony Liguori             return -1;
508*019d6b8fSAnthony Liguori         if (s->crypt_method) {
509*019d6b8fSAnthony Liguori             encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
510*019d6b8fSAnthony Liguori                             &s->aes_encrypt_key);
511*019d6b8fSAnthony Liguori             ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
512*019d6b8fSAnthony Liguori                               s->cluster_data, n * 512);
513*019d6b8fSAnthony Liguori         } else {
514*019d6b8fSAnthony Liguori             ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
515*019d6b8fSAnthony Liguori         }
516*019d6b8fSAnthony Liguori         if (ret != n * 512)
517*019d6b8fSAnthony Liguori             return -1;
518*019d6b8fSAnthony Liguori         nb_sectors -= n;
519*019d6b8fSAnthony Liguori         sector_num += n;
520*019d6b8fSAnthony Liguori         buf += n * 512;
521*019d6b8fSAnthony Liguori     }
522*019d6b8fSAnthony Liguori     s->cluster_cache_offset = -1; /* disable compressed cache */
523*019d6b8fSAnthony Liguori     return 0;
524*019d6b8fSAnthony Liguori }
525*019d6b8fSAnthony Liguori 
526*019d6b8fSAnthony Liguori typedef struct QCowAIOCB {
527*019d6b8fSAnthony Liguori     BlockDriverAIOCB common;
528*019d6b8fSAnthony Liguori     int64_t sector_num;
529*019d6b8fSAnthony Liguori     QEMUIOVector *qiov;
530*019d6b8fSAnthony Liguori     uint8_t *buf;
531*019d6b8fSAnthony Liguori     void *orig_buf;
532*019d6b8fSAnthony Liguori     int nb_sectors;
533*019d6b8fSAnthony Liguori     int n;
534*019d6b8fSAnthony Liguori     uint64_t cluster_offset;
535*019d6b8fSAnthony Liguori     uint8_t *cluster_data;
536*019d6b8fSAnthony Liguori     struct iovec hd_iov;
537*019d6b8fSAnthony Liguori     QEMUIOVector hd_qiov;
538*019d6b8fSAnthony Liguori     BlockDriverAIOCB *hd_aiocb;
539*019d6b8fSAnthony Liguori } QCowAIOCB;
540*019d6b8fSAnthony Liguori 
541*019d6b8fSAnthony Liguori static void qcow_aio_read_cb(void *opaque, int ret)
542*019d6b8fSAnthony Liguori {
543*019d6b8fSAnthony Liguori     QCowAIOCB *acb = opaque;
544*019d6b8fSAnthony Liguori     BlockDriverState *bs = acb->common.bs;
545*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
546*019d6b8fSAnthony Liguori     int index_in_cluster;
547*019d6b8fSAnthony Liguori 
548*019d6b8fSAnthony Liguori     acb->hd_aiocb = NULL;
549*019d6b8fSAnthony Liguori     if (ret < 0)
550*019d6b8fSAnthony Liguori         goto done;
551*019d6b8fSAnthony Liguori 
552*019d6b8fSAnthony Liguori  redo:
553*019d6b8fSAnthony Liguori     /* post process the read buffer */
554*019d6b8fSAnthony Liguori     if (!acb->cluster_offset) {
555*019d6b8fSAnthony Liguori         /* nothing to do */
556*019d6b8fSAnthony Liguori     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
557*019d6b8fSAnthony Liguori         /* nothing to do */
558*019d6b8fSAnthony Liguori     } else {
559*019d6b8fSAnthony Liguori         if (s->crypt_method) {
560*019d6b8fSAnthony Liguori             encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
561*019d6b8fSAnthony Liguori                             acb->n, 0,
562*019d6b8fSAnthony Liguori                             &s->aes_decrypt_key);
563*019d6b8fSAnthony Liguori         }
564*019d6b8fSAnthony Liguori     }
565*019d6b8fSAnthony Liguori 
566*019d6b8fSAnthony Liguori     acb->nb_sectors -= acb->n;
567*019d6b8fSAnthony Liguori     acb->sector_num += acb->n;
568*019d6b8fSAnthony Liguori     acb->buf += acb->n * 512;
569*019d6b8fSAnthony Liguori 
570*019d6b8fSAnthony Liguori     if (acb->nb_sectors == 0) {
571*019d6b8fSAnthony Liguori         /* request completed */
572*019d6b8fSAnthony Liguori         ret = 0;
573*019d6b8fSAnthony Liguori         goto done;
574*019d6b8fSAnthony Liguori     }
575*019d6b8fSAnthony Liguori 
576*019d6b8fSAnthony Liguori     /* prepare next AIO request */
577*019d6b8fSAnthony Liguori     acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
578*019d6b8fSAnthony Liguori                                              0, 0, 0, 0);
579*019d6b8fSAnthony Liguori     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
580*019d6b8fSAnthony Liguori     acb->n = s->cluster_sectors - index_in_cluster;
581*019d6b8fSAnthony Liguori     if (acb->n > acb->nb_sectors)
582*019d6b8fSAnthony Liguori         acb->n = acb->nb_sectors;
583*019d6b8fSAnthony Liguori 
584*019d6b8fSAnthony Liguori     if (!acb->cluster_offset) {
585*019d6b8fSAnthony Liguori         if (bs->backing_hd) {
586*019d6b8fSAnthony Liguori             /* read from the base image */
587*019d6b8fSAnthony Liguori             acb->hd_iov.iov_base = (void *)acb->buf;
588*019d6b8fSAnthony Liguori             acb->hd_iov.iov_len = acb->n * 512;
589*019d6b8fSAnthony Liguori             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
590*019d6b8fSAnthony Liguori             acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
591*019d6b8fSAnthony Liguori                 &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
592*019d6b8fSAnthony Liguori             if (acb->hd_aiocb == NULL)
593*019d6b8fSAnthony Liguori                 goto done;
594*019d6b8fSAnthony Liguori         } else {
595*019d6b8fSAnthony Liguori             /* Note: in this case, no need to wait */
596*019d6b8fSAnthony Liguori             memset(acb->buf, 0, 512 * acb->n);
597*019d6b8fSAnthony Liguori             goto redo;
598*019d6b8fSAnthony Liguori         }
599*019d6b8fSAnthony Liguori     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
600*019d6b8fSAnthony Liguori         /* add AIO support for compressed blocks ? */
601*019d6b8fSAnthony Liguori         if (decompress_cluster(s, acb->cluster_offset) < 0)
602*019d6b8fSAnthony Liguori             goto done;
603*019d6b8fSAnthony Liguori         memcpy(acb->buf,
604*019d6b8fSAnthony Liguori                s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
605*019d6b8fSAnthony Liguori         goto redo;
606*019d6b8fSAnthony Liguori     } else {
607*019d6b8fSAnthony Liguori         if ((acb->cluster_offset & 511) != 0) {
608*019d6b8fSAnthony Liguori             ret = -EIO;
609*019d6b8fSAnthony Liguori             goto done;
610*019d6b8fSAnthony Liguori         }
611*019d6b8fSAnthony Liguori         acb->hd_iov.iov_base = (void *)acb->buf;
612*019d6b8fSAnthony Liguori         acb->hd_iov.iov_len = acb->n * 512;
613*019d6b8fSAnthony Liguori         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
614*019d6b8fSAnthony Liguori         acb->hd_aiocb = bdrv_aio_readv(s->hd,
615*019d6b8fSAnthony Liguori                             (acb->cluster_offset >> 9) + index_in_cluster,
616*019d6b8fSAnthony Liguori                             &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
617*019d6b8fSAnthony Liguori         if (acb->hd_aiocb == NULL)
618*019d6b8fSAnthony Liguori             goto done;
619*019d6b8fSAnthony Liguori     }
620*019d6b8fSAnthony Liguori 
621*019d6b8fSAnthony Liguori     return;
622*019d6b8fSAnthony Liguori 
623*019d6b8fSAnthony Liguori done:
624*019d6b8fSAnthony Liguori     if (acb->qiov->niov > 1) {
625*019d6b8fSAnthony Liguori         qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
626*019d6b8fSAnthony Liguori         qemu_vfree(acb->orig_buf);
627*019d6b8fSAnthony Liguori     }
628*019d6b8fSAnthony Liguori     acb->common.cb(acb->common.opaque, ret);
629*019d6b8fSAnthony Liguori     qemu_aio_release(acb);
630*019d6b8fSAnthony Liguori }
631*019d6b8fSAnthony Liguori 
632*019d6b8fSAnthony Liguori static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
633*019d6b8fSAnthony Liguori         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
634*019d6b8fSAnthony Liguori         BlockDriverCompletionFunc *cb, void *opaque)
635*019d6b8fSAnthony Liguori {
636*019d6b8fSAnthony Liguori     QCowAIOCB *acb;
637*019d6b8fSAnthony Liguori 
638*019d6b8fSAnthony Liguori     acb = qemu_aio_get(bs, cb, opaque);
639*019d6b8fSAnthony Liguori     if (!acb)
640*019d6b8fSAnthony Liguori         return NULL;
641*019d6b8fSAnthony Liguori     acb->hd_aiocb = NULL;
642*019d6b8fSAnthony Liguori     acb->sector_num = sector_num;
643*019d6b8fSAnthony Liguori     acb->qiov = qiov;
644*019d6b8fSAnthony Liguori     if (qiov->niov > 1)
645*019d6b8fSAnthony Liguori         acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
646*019d6b8fSAnthony Liguori     else
647*019d6b8fSAnthony Liguori         acb->buf = (uint8_t *)qiov->iov->iov_base;
648*019d6b8fSAnthony Liguori     acb->nb_sectors = nb_sectors;
649*019d6b8fSAnthony Liguori     acb->n = 0;
650*019d6b8fSAnthony Liguori     acb->cluster_offset = 0;
651*019d6b8fSAnthony Liguori 
652*019d6b8fSAnthony Liguori     qcow_aio_read_cb(acb, 0);
653*019d6b8fSAnthony Liguori     return &acb->common;
654*019d6b8fSAnthony Liguori }
655*019d6b8fSAnthony Liguori 
656*019d6b8fSAnthony Liguori static void qcow_aio_write_cb(void *opaque, int ret)
657*019d6b8fSAnthony Liguori {
658*019d6b8fSAnthony Liguori     QCowAIOCB *acb = opaque;
659*019d6b8fSAnthony Liguori     BlockDriverState *bs = acb->common.bs;
660*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
661*019d6b8fSAnthony Liguori     int index_in_cluster;
662*019d6b8fSAnthony Liguori     uint64_t cluster_offset;
663*019d6b8fSAnthony Liguori     const uint8_t *src_buf;
664*019d6b8fSAnthony Liguori 
665*019d6b8fSAnthony Liguori     acb->hd_aiocb = NULL;
666*019d6b8fSAnthony Liguori 
667*019d6b8fSAnthony Liguori     if (ret < 0)
668*019d6b8fSAnthony Liguori         goto done;
669*019d6b8fSAnthony Liguori 
670*019d6b8fSAnthony Liguori     acb->nb_sectors -= acb->n;
671*019d6b8fSAnthony Liguori     acb->sector_num += acb->n;
672*019d6b8fSAnthony Liguori     acb->buf += acb->n * 512;
673*019d6b8fSAnthony Liguori 
674*019d6b8fSAnthony Liguori     if (acb->nb_sectors == 0) {
675*019d6b8fSAnthony Liguori         /* request completed */
676*019d6b8fSAnthony Liguori         ret = 0;
677*019d6b8fSAnthony Liguori         goto done;
678*019d6b8fSAnthony Liguori     }
679*019d6b8fSAnthony Liguori 
680*019d6b8fSAnthony Liguori     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
681*019d6b8fSAnthony Liguori     acb->n = s->cluster_sectors - index_in_cluster;
682*019d6b8fSAnthony Liguori     if (acb->n > acb->nb_sectors)
683*019d6b8fSAnthony Liguori         acb->n = acb->nb_sectors;
684*019d6b8fSAnthony Liguori     cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
685*019d6b8fSAnthony Liguori                                         index_in_cluster,
686*019d6b8fSAnthony Liguori                                         index_in_cluster + acb->n);
687*019d6b8fSAnthony Liguori     if (!cluster_offset || (cluster_offset & 511) != 0) {
688*019d6b8fSAnthony Liguori         ret = -EIO;
689*019d6b8fSAnthony Liguori         goto done;
690*019d6b8fSAnthony Liguori     }
691*019d6b8fSAnthony Liguori     if (s->crypt_method) {
692*019d6b8fSAnthony Liguori         if (!acb->cluster_data) {
693*019d6b8fSAnthony Liguori             acb->cluster_data = qemu_mallocz(s->cluster_size);
694*019d6b8fSAnthony Liguori             if (!acb->cluster_data) {
695*019d6b8fSAnthony Liguori                 ret = -ENOMEM;
696*019d6b8fSAnthony Liguori                 goto done;
697*019d6b8fSAnthony Liguori             }
698*019d6b8fSAnthony Liguori         }
699*019d6b8fSAnthony Liguori         encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
700*019d6b8fSAnthony Liguori                         acb->n, 1, &s->aes_encrypt_key);
701*019d6b8fSAnthony Liguori         src_buf = acb->cluster_data;
702*019d6b8fSAnthony Liguori     } else {
703*019d6b8fSAnthony Liguori         src_buf = acb->buf;
704*019d6b8fSAnthony Liguori     }
705*019d6b8fSAnthony Liguori 
706*019d6b8fSAnthony Liguori     acb->hd_iov.iov_base = (void *)src_buf;
707*019d6b8fSAnthony Liguori     acb->hd_iov.iov_len = acb->n * 512;
708*019d6b8fSAnthony Liguori     qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
709*019d6b8fSAnthony Liguori     acb->hd_aiocb = bdrv_aio_writev(s->hd,
710*019d6b8fSAnthony Liguori                                     (cluster_offset >> 9) + index_in_cluster,
711*019d6b8fSAnthony Liguori                                     &acb->hd_qiov, acb->n,
712*019d6b8fSAnthony Liguori                                     qcow_aio_write_cb, acb);
713*019d6b8fSAnthony Liguori     if (acb->hd_aiocb == NULL)
714*019d6b8fSAnthony Liguori         goto done;
715*019d6b8fSAnthony Liguori     return;
716*019d6b8fSAnthony Liguori 
717*019d6b8fSAnthony Liguori done:
718*019d6b8fSAnthony Liguori     if (acb->qiov->niov > 1)
719*019d6b8fSAnthony Liguori         qemu_vfree(acb->orig_buf);
720*019d6b8fSAnthony Liguori     acb->common.cb(acb->common.opaque, ret);
721*019d6b8fSAnthony Liguori     qemu_aio_release(acb);
722*019d6b8fSAnthony Liguori }
723*019d6b8fSAnthony Liguori 
724*019d6b8fSAnthony Liguori static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
725*019d6b8fSAnthony Liguori         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
726*019d6b8fSAnthony Liguori         BlockDriverCompletionFunc *cb, void *opaque)
727*019d6b8fSAnthony Liguori {
728*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
729*019d6b8fSAnthony Liguori     QCowAIOCB *acb;
730*019d6b8fSAnthony Liguori 
731*019d6b8fSAnthony Liguori     s->cluster_cache_offset = -1; /* disable compressed cache */
732*019d6b8fSAnthony Liguori 
733*019d6b8fSAnthony Liguori     acb = qemu_aio_get(bs, cb, opaque);
734*019d6b8fSAnthony Liguori     if (!acb)
735*019d6b8fSAnthony Liguori         return NULL;
736*019d6b8fSAnthony Liguori     acb->hd_aiocb = NULL;
737*019d6b8fSAnthony Liguori     acb->sector_num = sector_num;
738*019d6b8fSAnthony Liguori     acb->qiov = qiov;
739*019d6b8fSAnthony Liguori     if (qiov->niov > 1) {
740*019d6b8fSAnthony Liguori         acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
741*019d6b8fSAnthony Liguori         qemu_iovec_to_buffer(qiov, acb->buf);
742*019d6b8fSAnthony Liguori     } else {
743*019d6b8fSAnthony Liguori         acb->buf = (uint8_t *)qiov->iov->iov_base;
744*019d6b8fSAnthony Liguori     }
745*019d6b8fSAnthony Liguori     acb->nb_sectors = nb_sectors;
746*019d6b8fSAnthony Liguori     acb->n = 0;
747*019d6b8fSAnthony Liguori 
748*019d6b8fSAnthony Liguori     qcow_aio_write_cb(acb, 0);
749*019d6b8fSAnthony Liguori     return &acb->common;
750*019d6b8fSAnthony Liguori }
751*019d6b8fSAnthony Liguori 
752*019d6b8fSAnthony Liguori static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
753*019d6b8fSAnthony Liguori {
754*019d6b8fSAnthony Liguori     QCowAIOCB *acb = (QCowAIOCB *)blockacb;
755*019d6b8fSAnthony Liguori     if (acb->hd_aiocb)
756*019d6b8fSAnthony Liguori         bdrv_aio_cancel(acb->hd_aiocb);
757*019d6b8fSAnthony Liguori     qemu_aio_release(acb);
758*019d6b8fSAnthony Liguori }
759*019d6b8fSAnthony Liguori 
760*019d6b8fSAnthony Liguori static void qcow_close(BlockDriverState *bs)
761*019d6b8fSAnthony Liguori {
762*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
763*019d6b8fSAnthony Liguori     qemu_free(s->l1_table);
764*019d6b8fSAnthony Liguori     qemu_free(s->l2_cache);
765*019d6b8fSAnthony Liguori     qemu_free(s->cluster_cache);
766*019d6b8fSAnthony Liguori     qemu_free(s->cluster_data);
767*019d6b8fSAnthony Liguori     bdrv_delete(s->hd);
768*019d6b8fSAnthony Liguori }
769*019d6b8fSAnthony Liguori 
770*019d6b8fSAnthony Liguori static int qcow_create(const char *filename, int64_t total_size,
771*019d6b8fSAnthony Liguori                       const char *backing_file, int flags)
772*019d6b8fSAnthony Liguori {
773*019d6b8fSAnthony Liguori     int fd, header_size, backing_filename_len, l1_size, i, shift;
774*019d6b8fSAnthony Liguori     QCowHeader header;
775*019d6b8fSAnthony Liguori     uint64_t tmp;
776*019d6b8fSAnthony Liguori 
777*019d6b8fSAnthony Liguori     fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
778*019d6b8fSAnthony Liguori     if (fd < 0)
779*019d6b8fSAnthony Liguori         return -1;
780*019d6b8fSAnthony Liguori     memset(&header, 0, sizeof(header));
781*019d6b8fSAnthony Liguori     header.magic = cpu_to_be32(QCOW_MAGIC);
782*019d6b8fSAnthony Liguori     header.version = cpu_to_be32(QCOW_VERSION);
783*019d6b8fSAnthony Liguori     header.size = cpu_to_be64(total_size * 512);
784*019d6b8fSAnthony Liguori     header_size = sizeof(header);
785*019d6b8fSAnthony Liguori     backing_filename_len = 0;
786*019d6b8fSAnthony Liguori     if (backing_file) {
787*019d6b8fSAnthony Liguori         if (strcmp(backing_file, "fat:")) {
788*019d6b8fSAnthony Liguori             header.backing_file_offset = cpu_to_be64(header_size);
789*019d6b8fSAnthony Liguori             backing_filename_len = strlen(backing_file);
790*019d6b8fSAnthony Liguori             header.backing_file_size = cpu_to_be32(backing_filename_len);
791*019d6b8fSAnthony Liguori             header_size += backing_filename_len;
792*019d6b8fSAnthony Liguori         } else {
793*019d6b8fSAnthony Liguori             /* special backing file for vvfat */
794*019d6b8fSAnthony Liguori             backing_file = NULL;
795*019d6b8fSAnthony Liguori         }
796*019d6b8fSAnthony Liguori         header.cluster_bits = 9; /* 512 byte cluster to avoid copying
797*019d6b8fSAnthony Liguori                                     unmodifyed sectors */
798*019d6b8fSAnthony Liguori         header.l2_bits = 12; /* 32 KB L2 tables */
799*019d6b8fSAnthony Liguori     } else {
800*019d6b8fSAnthony Liguori         header.cluster_bits = 12; /* 4 KB clusters */
801*019d6b8fSAnthony Liguori         header.l2_bits = 9; /* 4 KB L2 tables */
802*019d6b8fSAnthony Liguori     }
803*019d6b8fSAnthony Liguori     header_size = (header_size + 7) & ~7;
804*019d6b8fSAnthony Liguori     shift = header.cluster_bits + header.l2_bits;
805*019d6b8fSAnthony Liguori     l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
806*019d6b8fSAnthony Liguori 
807*019d6b8fSAnthony Liguori     header.l1_table_offset = cpu_to_be64(header_size);
808*019d6b8fSAnthony Liguori     if (flags & BLOCK_FLAG_ENCRYPT) {
809*019d6b8fSAnthony Liguori         header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
810*019d6b8fSAnthony Liguori     } else {
811*019d6b8fSAnthony Liguori         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
812*019d6b8fSAnthony Liguori     }
813*019d6b8fSAnthony Liguori 
814*019d6b8fSAnthony Liguori     /* write all the data */
815*019d6b8fSAnthony Liguori     write(fd, &header, sizeof(header));
816*019d6b8fSAnthony Liguori     if (backing_file) {
817*019d6b8fSAnthony Liguori         write(fd, backing_file, backing_filename_len);
818*019d6b8fSAnthony Liguori     }
819*019d6b8fSAnthony Liguori     lseek(fd, header_size, SEEK_SET);
820*019d6b8fSAnthony Liguori     tmp = 0;
821*019d6b8fSAnthony Liguori     for(i = 0;i < l1_size; i++) {
822*019d6b8fSAnthony Liguori         write(fd, &tmp, sizeof(tmp));
823*019d6b8fSAnthony Liguori     }
824*019d6b8fSAnthony Liguori     close(fd);
825*019d6b8fSAnthony Liguori     return 0;
826*019d6b8fSAnthony Liguori }
827*019d6b8fSAnthony Liguori 
828*019d6b8fSAnthony Liguori static int qcow_make_empty(BlockDriverState *bs)
829*019d6b8fSAnthony Liguori {
830*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
831*019d6b8fSAnthony Liguori     uint32_t l1_length = s->l1_size * sizeof(uint64_t);
832*019d6b8fSAnthony Liguori     int ret;
833*019d6b8fSAnthony Liguori 
834*019d6b8fSAnthony Liguori     memset(s->l1_table, 0, l1_length);
835*019d6b8fSAnthony Liguori     if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
836*019d6b8fSAnthony Liguori 	return -1;
837*019d6b8fSAnthony Liguori     ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
838*019d6b8fSAnthony Liguori     if (ret < 0)
839*019d6b8fSAnthony Liguori         return ret;
840*019d6b8fSAnthony Liguori 
841*019d6b8fSAnthony Liguori     memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
842*019d6b8fSAnthony Liguori     memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
843*019d6b8fSAnthony Liguori     memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
844*019d6b8fSAnthony Liguori 
845*019d6b8fSAnthony Liguori     return 0;
846*019d6b8fSAnthony Liguori }
847*019d6b8fSAnthony Liguori 
848*019d6b8fSAnthony Liguori /* XXX: put compressed sectors first, then all the cluster aligned
849*019d6b8fSAnthony Liguori    tables to avoid losing bytes in alignment */
850*019d6b8fSAnthony Liguori static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
851*019d6b8fSAnthony Liguori                                  const uint8_t *buf, int nb_sectors)
852*019d6b8fSAnthony Liguori {
853*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
854*019d6b8fSAnthony Liguori     z_stream strm;
855*019d6b8fSAnthony Liguori     int ret, out_len;
856*019d6b8fSAnthony Liguori     uint8_t *out_buf;
857*019d6b8fSAnthony Liguori     uint64_t cluster_offset;
858*019d6b8fSAnthony Liguori 
859*019d6b8fSAnthony Liguori     if (nb_sectors != s->cluster_sectors)
860*019d6b8fSAnthony Liguori         return -EINVAL;
861*019d6b8fSAnthony Liguori 
862*019d6b8fSAnthony Liguori     out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
863*019d6b8fSAnthony Liguori     if (!out_buf)
864*019d6b8fSAnthony Liguori         return -1;
865*019d6b8fSAnthony Liguori 
866*019d6b8fSAnthony Liguori     /* best compression, small window, no zlib header */
867*019d6b8fSAnthony Liguori     memset(&strm, 0, sizeof(strm));
868*019d6b8fSAnthony Liguori     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
869*019d6b8fSAnthony Liguori                        Z_DEFLATED, -12,
870*019d6b8fSAnthony Liguori                        9, Z_DEFAULT_STRATEGY);
871*019d6b8fSAnthony Liguori     if (ret != 0) {
872*019d6b8fSAnthony Liguori         qemu_free(out_buf);
873*019d6b8fSAnthony Liguori         return -1;
874*019d6b8fSAnthony Liguori     }
875*019d6b8fSAnthony Liguori 
876*019d6b8fSAnthony Liguori     strm.avail_in = s->cluster_size;
877*019d6b8fSAnthony Liguori     strm.next_in = (uint8_t *)buf;
878*019d6b8fSAnthony Liguori     strm.avail_out = s->cluster_size;
879*019d6b8fSAnthony Liguori     strm.next_out = out_buf;
880*019d6b8fSAnthony Liguori 
881*019d6b8fSAnthony Liguori     ret = deflate(&strm, Z_FINISH);
882*019d6b8fSAnthony Liguori     if (ret != Z_STREAM_END && ret != Z_OK) {
883*019d6b8fSAnthony Liguori         qemu_free(out_buf);
884*019d6b8fSAnthony Liguori         deflateEnd(&strm);
885*019d6b8fSAnthony Liguori         return -1;
886*019d6b8fSAnthony Liguori     }
887*019d6b8fSAnthony Liguori     out_len = strm.next_out - out_buf;
888*019d6b8fSAnthony Liguori 
889*019d6b8fSAnthony Liguori     deflateEnd(&strm);
890*019d6b8fSAnthony Liguori 
891*019d6b8fSAnthony Liguori     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
892*019d6b8fSAnthony Liguori         /* could not compress: write normal cluster */
893*019d6b8fSAnthony Liguori         qcow_write(bs, sector_num, buf, s->cluster_sectors);
894*019d6b8fSAnthony Liguori     } else {
895*019d6b8fSAnthony Liguori         cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
896*019d6b8fSAnthony Liguori                                             out_len, 0, 0);
897*019d6b8fSAnthony Liguori         cluster_offset &= s->cluster_offset_mask;
898*019d6b8fSAnthony Liguori         if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
899*019d6b8fSAnthony Liguori             qemu_free(out_buf);
900*019d6b8fSAnthony Liguori             return -1;
901*019d6b8fSAnthony Liguori         }
902*019d6b8fSAnthony Liguori     }
903*019d6b8fSAnthony Liguori 
904*019d6b8fSAnthony Liguori     qemu_free(out_buf);
905*019d6b8fSAnthony Liguori     return 0;
906*019d6b8fSAnthony Liguori }
907*019d6b8fSAnthony Liguori 
908*019d6b8fSAnthony Liguori static void qcow_flush(BlockDriverState *bs)
909*019d6b8fSAnthony Liguori {
910*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
911*019d6b8fSAnthony Liguori     bdrv_flush(s->hd);
912*019d6b8fSAnthony Liguori }
913*019d6b8fSAnthony Liguori 
914*019d6b8fSAnthony Liguori static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
915*019d6b8fSAnthony Liguori {
916*019d6b8fSAnthony Liguori     BDRVQcowState *s = bs->opaque;
917*019d6b8fSAnthony Liguori     bdi->cluster_size = s->cluster_size;
918*019d6b8fSAnthony Liguori     return 0;
919*019d6b8fSAnthony Liguori }
920*019d6b8fSAnthony Liguori 
921*019d6b8fSAnthony Liguori static BlockDriver bdrv_qcow = {
922*019d6b8fSAnthony Liguori     .format_name	= "qcow",
923*019d6b8fSAnthony Liguori     .instance_size	= sizeof(BDRVQcowState),
924*019d6b8fSAnthony Liguori     .bdrv_probe		= qcow_probe,
925*019d6b8fSAnthony Liguori     .bdrv_open		= qcow_open,
926*019d6b8fSAnthony Liguori     .bdrv_close		= qcow_close,
927*019d6b8fSAnthony Liguori     .bdrv_create	= qcow_create,
928*019d6b8fSAnthony Liguori     .bdrv_flush		= qcow_flush,
929*019d6b8fSAnthony Liguori     .bdrv_is_allocated	= qcow_is_allocated,
930*019d6b8fSAnthony Liguori     .bdrv_set_key	= qcow_set_key,
931*019d6b8fSAnthony Liguori     .bdrv_make_empty	= qcow_make_empty,
932*019d6b8fSAnthony Liguori     .bdrv_aio_readv	= qcow_aio_readv,
933*019d6b8fSAnthony Liguori     .bdrv_aio_writev	= qcow_aio_writev,
934*019d6b8fSAnthony Liguori     .bdrv_aio_cancel	= qcow_aio_cancel,
935*019d6b8fSAnthony Liguori     .aiocb_size		= sizeof(QCowAIOCB),
936*019d6b8fSAnthony Liguori     .bdrv_write_compressed = qcow_write_compressed,
937*019d6b8fSAnthony Liguori     .bdrv_get_info	= qcow_get_info,
938*019d6b8fSAnthony Liguori };
939*019d6b8fSAnthony Liguori 
940*019d6b8fSAnthony Liguori static void bdrv_qcow_init(void)
941*019d6b8fSAnthony Liguori {
942*019d6b8fSAnthony Liguori     bdrv_register(&bdrv_qcow);
943*019d6b8fSAnthony Liguori }
944*019d6b8fSAnthony Liguori 
945*019d6b8fSAnthony Liguori block_init(bdrv_qcow_init);
946