1 /*
2 * block_copy API
3 *
4 * Copyright (C) 2013 Proxmox Server Solutions
5 * Copyright (c) 2019 Virtuozzo International GmbH.
6 *
7 * Authors:
8 * Dietmar Maurer (dietmar@proxmox.com)
9 * Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
13 */
14
15 #include "qemu/osdep.h"
16
17 #include "trace.h"
18 #include "qapi/error.h"
19 #include "block/block-copy.h"
20 #include "sysemu/block-backend.h"
21 #include "qemu/units.h"
22
23 #define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
24 #define BLOCK_COPY_MAX_BUFFER (1 * MiB)
25 #define BLOCK_COPY_MAX_MEM (128 * MiB)
26
/*
 * One in-flight copy request over a byte range of the source.
 * Live requests are linked into BlockCopyState.inflight_reqs so that
 * overlapping operations can find each other and serialize.
 */
typedef struct BlockCopyInFlightReq {
    int64_t offset;                       /* start of the range being copied */
    int64_t bytes;                        /* length of the range being copied */
    QLIST_ENTRY(BlockCopyInFlightReq) list; /* entry in s->inflight_reqs */
    CoQueue wait_queue; /* coroutines blocked on this request */
} BlockCopyInFlightReq;
33
typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by block-copy user and user is responsible for appropriate
     * permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    /* Tracks clusters still to be copied; bits are cleared as work begins. */
    BdrvDirtyBitmap *copy_bitmap;
    /* Bytes claimed by currently running requests (see inflight_reqs). */
    int64_t in_flight_bytes;
    int64_t cluster_size;
    /* Whether to attempt bdrv_co_copy_range() before falling back to r/w. */
    bool use_copy_range;
    /* Preferred size of a single copy operation (see block_copy_state_new). */
    int64_t copy_size;
    /* Size of the copied region, from bdrv_dirty_bitmap_size(). */
    uint64_t len;
    QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;

    /* Flags passed through to target writes (e.g. compression, serialize). */
    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap. During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    /* Shared-resource throttle limiting total bounce-buffer memory. */
    SharedResource *mem;
} BlockCopyState;
75
find_conflicting_inflight_req(BlockCopyState * s,int64_t offset,int64_t bytes)76 static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
77 int64_t offset,
78 int64_t bytes)
79 {
80 BlockCopyInFlightReq *req;
81
82 QLIST_FOREACH(req, &s->inflight_reqs, list) {
83 if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
84 return req;
85 }
86 }
87
88 return NULL;
89 }
90
91 /*
92 * If there are no intersecting requests return false. Otherwise, wait for the
93 * first found intersecting request to finish and return true.
94 */
block_copy_wait_one(BlockCopyState * s,int64_t offset,int64_t bytes)95 static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
96 int64_t bytes)
97 {
98 BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);
99
100 if (!req) {
101 return false;
102 }
103
104 qemu_co_queue_wait(&req->wait_queue, NULL);
105
106 return true;
107 }
108
109 /* Called only on full-dirty region */
block_copy_inflight_req_begin(BlockCopyState * s,BlockCopyInFlightReq * req,int64_t offset,int64_t bytes)110 static void block_copy_inflight_req_begin(BlockCopyState *s,
111 BlockCopyInFlightReq *req,
112 int64_t offset, int64_t bytes)
113 {
114 assert(!find_conflicting_inflight_req(s, offset, bytes));
115
116 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
117 s->in_flight_bytes += bytes;
118
119 req->offset = offset;
120 req->bytes = bytes;
121 qemu_co_queue_init(&req->wait_queue);
122 QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
123 }
124
125 /*
126 * block_copy_inflight_req_shrink
127 *
128 * Drop the tail of the request to be handled later. Set dirty bits back and
129 * wake up all requests waiting for us (may be some of them are not intersecting
130 * with shrunk request)
131 */
block_copy_inflight_req_shrink(BlockCopyState * s,BlockCopyInFlightReq * req,int64_t new_bytes)132 static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
133 BlockCopyInFlightReq *req, int64_t new_bytes)
134 {
135 if (new_bytes == req->bytes) {
136 return;
137 }
138
139 assert(new_bytes > 0 && new_bytes < req->bytes);
140
141 s->in_flight_bytes -= req->bytes - new_bytes;
142 bdrv_set_dirty_bitmap(s->copy_bitmap,
143 req->offset + new_bytes, req->bytes - new_bytes);
144
145 req->bytes = new_bytes;
146 qemu_co_queue_restart_all(&req->wait_queue);
147 }
148
block_copy_inflight_req_end(BlockCopyState * s,BlockCopyInFlightReq * req,int ret)149 static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
150 BlockCopyInFlightReq *req,
151 int ret)
152 {
153 s->in_flight_bytes -= req->bytes;
154 if (ret < 0) {
155 bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
156 }
157 QLIST_REMOVE(req, list);
158 qemu_co_queue_restart_all(&req->wait_queue);
159 }
160
/* Release all resources owned by @s. Accepts NULL as a no-op. */
void block_copy_state_free(BlockCopyState *s)
{
    if (s) {
        bdrv_release_dirty_bitmap(s->copy_bitmap);
        shres_destroy(s->mem);
        g_free(s);
    }
}
171
block_copy_max_transfer(BdrvChild * source,BdrvChild * target)172 static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
173 {
174 return MIN_NON_ZERO(INT_MAX,
175 MIN_NON_ZERO(source->bs->bl.max_transfer,
176 target->bs->bl.max_transfer));
177 }
178
block_copy_state_new(BdrvChild * source,BdrvChild * target,int64_t cluster_size,BdrvRequestFlags write_flags,Error ** errp)179 BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
180 int64_t cluster_size,
181 BdrvRequestFlags write_flags, Error **errp)
182 {
183 BlockCopyState *s;
184 BdrvDirtyBitmap *copy_bitmap;
185
186 copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
187 errp);
188 if (!copy_bitmap) {
189 return NULL;
190 }
191 bdrv_disable_dirty_bitmap(copy_bitmap);
192
193 s = g_new(BlockCopyState, 1);
194 *s = (BlockCopyState) {
195 .source = source,
196 .target = target,
197 .copy_bitmap = copy_bitmap,
198 .cluster_size = cluster_size,
199 .len = bdrv_dirty_bitmap_size(copy_bitmap),
200 .write_flags = write_flags,
201 .mem = shres_create(BLOCK_COPY_MAX_MEM),
202 };
203
204 if (block_copy_max_transfer(source, target) < cluster_size) {
205 /*
206 * copy_range does not respect max_transfer. We don't want to bother
207 * with requests smaller than block-copy cluster size, so fallback to
208 * buffered copying (read and write respect max_transfer on their
209 * behalf).
210 */
211 s->use_copy_range = false;
212 s->copy_size = cluster_size;
213 } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
214 /* Compression supports only cluster-size writes and no copy-range. */
215 s->use_copy_range = false;
216 s->copy_size = cluster_size;
217 } else {
218 /*
219 * We enable copy-range, but keep small copy_size, until first
220 * successful copy_range (look at block_copy_do_copy).
221 */
222 s->use_copy_range = true;
223 s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
224 }
225
226 QLIST_INIT(&s->inflight_reqs);
227
228 return s;
229 }
230
/*
 * Install the callback (and its opaque cookie) invoked whenever a chunk of
 * copying progress is made.
 */
void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_opaque = progress_opaque;
    s->progress_bytes_callback = progress_bytes_callback;
}
239
/* Attach the progress meter that tracks remaining/done bytes. */
void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}
244
/*
 * block_copy_do_copy
 *
 * Do copy of cluster-aligned chunk. Requested region is allowed to exceed
 * s->len only to cover last cluster when s->len is not aligned to clusters.
 *
 * No synchronization here: neither bitmap nor intersecting-request handling,
 * only the copy itself.
 *
 * On failure, *error_is_read (if non-NULL) is set to true when the read side
 * failed and false when the write/zero side failed.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    /* Clamp to s->len: only the tail of the last cluster may extend past it. */
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        /* Source is known-zero: write zeroes directly, skipping any read.
         * Compression flag is dropped as it doesn't apply to zero writes. */
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            /* Disable copy-range for all future requests on this state and
             * shrink copy_size back to the buffered-copy limit. */
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fallback to read+write with allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size.  copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when
                 * parallel block-copy request unsets it during previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * In case of failed copy_range request above, we may proceed with buffered
     * request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
     * be properly limited, so don't care too much. Moreover the most likely
     * case (copy_range is unsupported for the configuration, so the very first
     * copy_range request fails) is handled by setting large copy_size only
     * after first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    /* qemu_vfree(NULL) is a no-op, so this is safe on the copy-range path. */
    qemu_vfree(bounce_buffer);

    return ret;
}
347
/*
 * Query the allocation/zero status of the source at @offset, storing in @pnum
 * a cluster-aligned number of bytes that share that status (at least one
 * cluster, possibly rounding up into the final short cluster).
 *
 * When skip_unallocated is set, status is taken above the backing file so
 * that backing-only data reads as unallocated. On any status failure the
 * range is conservatively reported as one allocated data cluster.
 */
static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int ret;
    int64_t num;
    BlockDriverState *base = NULL;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error or if failed to obtain large enough chunk just fallback to
         * copy one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        /* The status run reaches EOF: round up to cover the short tail. */
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}
379
/*
 * Check if the cluster starting at offset is allocated or not.
 * return via pnum the number of contiguous clusters sharing this allocation.
 *
 * Returns 1 (allocated), 0 (unallocated) or negative errno. The loop
 * accumulates bdrv_is_allocated() runs until the answer for the first
 * cluster is settled:
 *  - an allocated run anywhere in the first (partial) cluster makes the
 *    cluster count as allocated;
 *  - a full cluster of unallocated data makes it count as unallocated.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            /* Report only whole unallocated clusters (round down). */
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}
421
422 /*
423 * Reset bits in copy_bitmap starting at offset if they represent unallocated
424 * data in the image. May reset subsequent contiguous bits.
425 * @return 0 when the cluster at @offset was unallocated,
426 * 1 otherwise, and -ret on error.
427 */
block_copy_reset_unallocated(BlockCopyState * s,int64_t offset,int64_t * count)428 int64_t block_copy_reset_unallocated(BlockCopyState *s,
429 int64_t offset, int64_t *count)
430 {
431 int ret;
432 int64_t clusters, bytes;
433
434 ret = block_copy_is_cluster_allocated(s, offset, &clusters);
435 if (ret < 0) {
436 return ret;
437 }
438
439 bytes = clusters * s->cluster_size;
440
441 if (!ret) {
442 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
443 progress_set_remaining(s->progress,
444 bdrv_get_dirty_count(s->copy_bitmap) +
445 s->in_flight_bytes);
446 }
447
448 *count = bytes;
449 return ret;
450 }
451
/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in @offset/@bytes range.
 * Returns 1 if dirty clusters found and successfully copied, 0 if no dirty
 * clusters found and -errno on failure.
 *
 * For each dirty chunk: clear its bits and publish an in-flight request,
 * possibly shrink the request after querying block status, then either skip
 * it (unallocated with skip_unallocated) or copy it via block_copy_do_copy.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * block_copy() user is responsible for keeping source and target in same
     * aio context
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyInFlightReq req;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        /* Candidate chunk: bounded by the remaining range and copy_size. */
        cur_bytes = MIN(bytes, s->copy_size);

        /* Trim the chunk at the first clean cluster, if any. */
        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_inflight_req_begin(s, &req, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        /* The status run may be shorter than the chunk: drop the tail. */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_inflight_req_shrink(s, &req, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            /* Nothing to copy: end the request cleanly and advance past the
             * whole unallocated status run (which may exceed cur_bytes). */
            block_copy_inflight_req_end(s, &req, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        /* Throttle bounce-buffer memory across parallel block-copy calls. */
        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_inflight_req_end(s, &req, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}
534
535 /*
536 * block_copy
537 *
538 * Copy requested region, accordingly to dirty bitmap.
539 * Collaborate with parallel block_copy requests: if they succeed it will help
540 * us. If they fail, we will retry not-copied regions. So, if we return error,
541 * it means that some I/O operation failed in context of _this_ block_copy call,
542 * not some parallel operation.
543 */
block_copy(BlockCopyState * s,int64_t offset,int64_t bytes,bool * error_is_read)544 int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
545 bool *error_is_read)
546 {
547 int ret;
548
549 do {
550 ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);
551
552 if (ret == 0) {
553 ret = block_copy_wait_one(s, offset, bytes);
554 }
555
556 /*
557 * We retry in two cases:
558 * 1. Some progress done
559 * Something was copied, which means that there were yield points
560 * and some new dirty bits may have appeared (due to failed parallel
561 * block-copy requests).
562 * 2. We have waited for some intersecting block-copy request
563 * It may have failed and produced new dirty bits.
564 */
565 } while (ret > 0);
566
567 return ret;
568 }
569
block_copy_dirty_bitmap(BlockCopyState * s)570 BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
571 {
572 return s->copy_bitmap;
573 }
574
/* Toggle skipping of source-unallocated regions (see BlockCopyState docs). */
void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}
579