1aead9dc9SPaolo Bonzini /* 2aead9dc9SPaolo Bonzini * Graph lock: rwlock to protect block layer graph manipulations (add/remove 3aead9dc9SPaolo Bonzini * edges and nodes) 4aead9dc9SPaolo Bonzini * 5aead9dc9SPaolo Bonzini * Copyright (c) 2022 Red Hat 6aead9dc9SPaolo Bonzini * 7aead9dc9SPaolo Bonzini * This library is free software; you can redistribute it and/or 8aead9dc9SPaolo Bonzini * modify it under the terms of the GNU Lesser General Public 9aead9dc9SPaolo Bonzini * License as published by the Free Software Foundation; either 10aead9dc9SPaolo Bonzini * version 2.1 of the License, or (at your option) any later version. 11aead9dc9SPaolo Bonzini * 12aead9dc9SPaolo Bonzini * This library is distributed in the hope that it will be useful, 13aead9dc9SPaolo Bonzini * but WITHOUT ANY WARRANTY; without even the implied warranty of 14aead9dc9SPaolo Bonzini * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15aead9dc9SPaolo Bonzini * Lesser General Public License for more details. 16aead9dc9SPaolo Bonzini * 17aead9dc9SPaolo Bonzini * You should have received a copy of the GNU Lesser General Public 18aead9dc9SPaolo Bonzini * License along with this library; if not, see <http://www.gnu.org/licenses/>. 19aead9dc9SPaolo Bonzini */ 20aead9dc9SPaolo Bonzini 21aead9dc9SPaolo Bonzini #include "qemu/osdep.h" 22aead9dc9SPaolo Bonzini #include "qemu/main-loop.h" 23aead9dc9SPaolo Bonzini #include "block/graph-lock.h" 24aead9dc9SPaolo Bonzini #include "block/block.h" 25aead9dc9SPaolo Bonzini #include "block/block_int.h" 26aead9dc9SPaolo Bonzini 274002ffdcSKevin Wolf /* Dummy lock object to use for Thread Safety Analysis (TSA) */ 284002ffdcSKevin Wolf BdrvGraphLock graph_lock; 294002ffdcSKevin Wolf 30aead9dc9SPaolo Bonzini /* Protects the list of aiocontext and orphaned_reader_count */ 31aead9dc9SPaolo Bonzini static QemuMutex aio_context_list_lock; 32aead9dc9SPaolo Bonzini 33aead9dc9SPaolo Bonzini /* Written and read with atomic operations. */ 34aead9dc9SPaolo Bonzini static int has_writer; 35aead9dc9SPaolo Bonzini 36aead9dc9SPaolo Bonzini /* 37aead9dc9SPaolo Bonzini * A reader coroutine could move from an AioContext to another. 38aead9dc9SPaolo Bonzini * If this happens, there is no problem from the point of view of 39aead9dc9SPaolo Bonzini * counters. The problem is that the total count becomes 40aead9dc9SPaolo Bonzini * unbalanced if one of the two AioContexts gets deleted. 41aead9dc9SPaolo Bonzini * The count of readers must remain correct, so the AioContext's 42aead9dc9SPaolo Bonzini * balance is transferred to this glboal variable. 43aead9dc9SPaolo Bonzini * Protected by aio_context_list_lock. 44aead9dc9SPaolo Bonzini */ 45aead9dc9SPaolo Bonzini static uint32_t orphaned_reader_count; 46aead9dc9SPaolo Bonzini 47aead9dc9SPaolo Bonzini /* Queue of readers waiting for the writer to finish */ 48aead9dc9SPaolo Bonzini static CoQueue reader_queue; 49aead9dc9SPaolo Bonzini 50aead9dc9SPaolo Bonzini struct BdrvGraphRWlock { 51aead9dc9SPaolo Bonzini /* How many readers are currently reading the graph. */ 52aead9dc9SPaolo Bonzini uint32_t reader_count; 53aead9dc9SPaolo Bonzini 54aead9dc9SPaolo Bonzini /* 55aead9dc9SPaolo Bonzini * List of BdrvGraphRWlock kept in graph-lock.c 56aead9dc9SPaolo Bonzini * Protected by aio_context_list_lock 57aead9dc9SPaolo Bonzini */ 58aead9dc9SPaolo Bonzini QTAILQ_ENTRY(BdrvGraphRWlock) next_aio; 59aead9dc9SPaolo Bonzini }; 60aead9dc9SPaolo Bonzini 61aead9dc9SPaolo Bonzini /* 62aead9dc9SPaolo Bonzini * List of BdrvGraphRWlock. This list ensures that each BdrvGraphRWlock 63aead9dc9SPaolo Bonzini * can safely modify only its own counter, avoid reading/writing 64aead9dc9SPaolo Bonzini * others and thus improving performances by avoiding cacheline bounces. 65aead9dc9SPaolo Bonzini */ 66aead9dc9SPaolo Bonzini static QTAILQ_HEAD(, BdrvGraphRWlock) aio_context_list = 67aead9dc9SPaolo Bonzini QTAILQ_HEAD_INITIALIZER(aio_context_list); 68aead9dc9SPaolo Bonzini 69aead9dc9SPaolo Bonzini static void __attribute__((__constructor__)) bdrv_init_graph_lock(void) 70aead9dc9SPaolo Bonzini { 71aead9dc9SPaolo Bonzini qemu_mutex_init(&aio_context_list_lock); 72aead9dc9SPaolo Bonzini qemu_co_queue_init(&reader_queue); 73aead9dc9SPaolo Bonzini } 74aead9dc9SPaolo Bonzini 75aead9dc9SPaolo Bonzini void register_aiocontext(AioContext *ctx) 76aead9dc9SPaolo Bonzini { 77aead9dc9SPaolo Bonzini ctx->bdrv_graph = g_new0(BdrvGraphRWlock, 1); 78aead9dc9SPaolo Bonzini QEMU_LOCK_GUARD(&aio_context_list_lock); 79aead9dc9SPaolo Bonzini assert(ctx->bdrv_graph->reader_count == 0); 80aead9dc9SPaolo Bonzini QTAILQ_INSERT_TAIL(&aio_context_list, ctx->bdrv_graph, next_aio); 81aead9dc9SPaolo Bonzini } 82aead9dc9SPaolo Bonzini 83aead9dc9SPaolo Bonzini void unregister_aiocontext(AioContext *ctx) 84aead9dc9SPaolo Bonzini { 85aead9dc9SPaolo Bonzini QEMU_LOCK_GUARD(&aio_context_list_lock); 86aead9dc9SPaolo Bonzini orphaned_reader_count += ctx->bdrv_graph->reader_count; 87aead9dc9SPaolo Bonzini QTAILQ_REMOVE(&aio_context_list, ctx->bdrv_graph, next_aio); 88aead9dc9SPaolo Bonzini g_free(ctx->bdrv_graph); 89aead9dc9SPaolo Bonzini } 90aead9dc9SPaolo Bonzini 91aead9dc9SPaolo Bonzini static uint32_t reader_count(void) 92aead9dc9SPaolo Bonzini { 93aead9dc9SPaolo Bonzini BdrvGraphRWlock *brdv_graph; 94aead9dc9SPaolo Bonzini uint32_t rd; 95aead9dc9SPaolo Bonzini 96aead9dc9SPaolo Bonzini QEMU_LOCK_GUARD(&aio_context_list_lock); 97aead9dc9SPaolo Bonzini 983202d8e4SMichael Tokarev /* rd can temporarily be negative, but the total will *always* be >= 0 */ 99aead9dc9SPaolo Bonzini rd = orphaned_reader_count; 100aead9dc9SPaolo Bonzini QTAILQ_FOREACH(brdv_graph, &aio_context_list, next_aio) { 101aead9dc9SPaolo Bonzini rd += qatomic_read(&brdv_graph->reader_count); 102aead9dc9SPaolo Bonzini } 103aead9dc9SPaolo Bonzini 104aead9dc9SPaolo Bonzini /* shouldn't overflow unless there are 2^31 readers */ 105aead9dc9SPaolo Bonzini assert((int32_t)rd >= 0); 106aead9dc9SPaolo Bonzini return rd; 107aead9dc9SPaolo Bonzini } 108aead9dc9SPaolo Bonzini 10931b2ddfeSKevin Wolf void bdrv_graph_wrlock(BlockDriverState *bs) 110aead9dc9SPaolo Bonzini { 11131b2ddfeSKevin Wolf AioContext *ctx = NULL; 11231b2ddfeSKevin Wolf 113aead9dc9SPaolo Bonzini GLOBAL_STATE_CODE(); 114aead9dc9SPaolo Bonzini assert(!qatomic_read(&has_writer)); 115aead9dc9SPaolo Bonzini 11631b2ddfeSKevin Wolf /* 11731b2ddfeSKevin Wolf * Release only non-mainloop AioContext. The mainloop often relies on the 11831b2ddfeSKevin Wolf * BQL and doesn't lock the main AioContext before doing things. 11931b2ddfeSKevin Wolf */ 12031b2ddfeSKevin Wolf if (bs) { 12131b2ddfeSKevin Wolf ctx = bdrv_get_aio_context(bs); 12231b2ddfeSKevin Wolf if (ctx != qemu_get_aio_context()) { 12331b2ddfeSKevin Wolf aio_context_release(ctx); 12431b2ddfeSKevin Wolf } else { 12531b2ddfeSKevin Wolf ctx = NULL; 12631b2ddfeSKevin Wolf } 12731b2ddfeSKevin Wolf } 12831b2ddfeSKevin Wolf 129aead9dc9SPaolo Bonzini /* Make sure that constantly arriving new I/O doesn't cause starvation */ 130aead9dc9SPaolo Bonzini bdrv_drain_all_begin_nopoll(); 131aead9dc9SPaolo Bonzini 132aead9dc9SPaolo Bonzini /* 133aead9dc9SPaolo Bonzini * reader_count == 0: this means writer will read has_reader as 1 134aead9dc9SPaolo Bonzini * reader_count >= 1: we don't know if writer read has_writer == 0 or 1, 135aead9dc9SPaolo Bonzini * but we need to wait. 136aead9dc9SPaolo Bonzini * Wait by allowing other coroutine (and possible readers) to continue. 137aead9dc9SPaolo Bonzini */ 138aead9dc9SPaolo Bonzini do { 139aead9dc9SPaolo Bonzini /* 140aead9dc9SPaolo Bonzini * has_writer must be 0 while polling, otherwise we get a deadlock if 141aead9dc9SPaolo Bonzini * any callback involved during AIO_WAIT_WHILE() tries to acquire the 142aead9dc9SPaolo Bonzini * reader lock. 143aead9dc9SPaolo Bonzini */ 144aead9dc9SPaolo Bonzini qatomic_set(&has_writer, 0); 145d805d8a2SStefan Hajnoczi AIO_WAIT_WHILE_UNLOCKED(NULL, reader_count() >= 1); 146aead9dc9SPaolo Bonzini qatomic_set(&has_writer, 1); 147aead9dc9SPaolo Bonzini 148aead9dc9SPaolo Bonzini /* 149aead9dc9SPaolo Bonzini * We want to only check reader_count() after has_writer = 1 is visible 150aead9dc9SPaolo Bonzini * to other threads. That way no more readers can sneak in after we've 151aead9dc9SPaolo Bonzini * determined reader_count() == 0. 152aead9dc9SPaolo Bonzini */ 153aead9dc9SPaolo Bonzini smp_mb(); 154aead9dc9SPaolo Bonzini } while (reader_count() >= 1); 155aead9dc9SPaolo Bonzini 156aead9dc9SPaolo Bonzini bdrv_drain_all_end(); 15731b2ddfeSKevin Wolf 15831b2ddfeSKevin Wolf if (ctx) { 15931b2ddfeSKevin Wolf aio_context_acquire(bdrv_get_aio_context(bs)); 16031b2ddfeSKevin Wolf } 161aead9dc9SPaolo Bonzini } 162aead9dc9SPaolo Bonzini 163aead9dc9SPaolo Bonzini void bdrv_graph_wrunlock(void) 164aead9dc9SPaolo Bonzini { 165aead9dc9SPaolo Bonzini GLOBAL_STATE_CODE(); 166aead9dc9SPaolo Bonzini QEMU_LOCK_GUARD(&aio_context_list_lock); 167aead9dc9SPaolo Bonzini assert(qatomic_read(&has_writer)); 168aead9dc9SPaolo Bonzini 169aead9dc9SPaolo Bonzini /* 170aead9dc9SPaolo Bonzini * No need for memory barriers, this works in pair with 171aead9dc9SPaolo Bonzini * the slow path of rdlock() and both take the lock. 172aead9dc9SPaolo Bonzini */ 173aead9dc9SPaolo Bonzini qatomic_store_release(&has_writer, 0); 174aead9dc9SPaolo Bonzini 175aead9dc9SPaolo Bonzini /* Wake up all coroutine that are waiting to read the graph */ 176aead9dc9SPaolo Bonzini qemu_co_enter_all(&reader_queue, &aio_context_list_lock); 177aead9dc9SPaolo Bonzini } 178aead9dc9SPaolo Bonzini 179aead9dc9SPaolo Bonzini void coroutine_fn bdrv_graph_co_rdlock(void) 180aead9dc9SPaolo Bonzini { 181aead9dc9SPaolo Bonzini BdrvGraphRWlock *bdrv_graph; 182aead9dc9SPaolo Bonzini bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; 183aead9dc9SPaolo Bonzini 184aead9dc9SPaolo Bonzini for (;;) { 185aead9dc9SPaolo Bonzini qatomic_set(&bdrv_graph->reader_count, 186aead9dc9SPaolo Bonzini bdrv_graph->reader_count + 1); 187aead9dc9SPaolo Bonzini /* make sure writer sees reader_count before we check has_writer */ 188aead9dc9SPaolo Bonzini smp_mb(); 189aead9dc9SPaolo Bonzini 190aead9dc9SPaolo Bonzini /* 191aead9dc9SPaolo Bonzini * has_writer == 0: this means writer will read reader_count as >= 1 192aead9dc9SPaolo Bonzini * has_writer == 1: we don't know if writer read reader_count == 0 193aead9dc9SPaolo Bonzini * or > 0, but we need to wait anyways because 194aead9dc9SPaolo Bonzini * it will write. 195aead9dc9SPaolo Bonzini */ 196aead9dc9SPaolo Bonzini if (!qatomic_read(&has_writer)) { 197aead9dc9SPaolo Bonzini break; 198aead9dc9SPaolo Bonzini } 199aead9dc9SPaolo Bonzini 200aead9dc9SPaolo Bonzini /* 201aead9dc9SPaolo Bonzini * Synchronize access with reader_count() in bdrv_graph_wrlock(). 202aead9dc9SPaolo Bonzini * Case 1: 203aead9dc9SPaolo Bonzini * If this critical section gets executed first, reader_count will 204aead9dc9SPaolo Bonzini * decrease and the reader will go to sleep. 205aead9dc9SPaolo Bonzini * Then the writer will read reader_count that does not take into 206aead9dc9SPaolo Bonzini * account this reader, and if there's no other reader it will 207aead9dc9SPaolo Bonzini * enter the write section. 208aead9dc9SPaolo Bonzini * Case 2: 209aead9dc9SPaolo Bonzini * If reader_count() critical section gets executed first, 210aead9dc9SPaolo Bonzini * then writer will read reader_count >= 1. 211aead9dc9SPaolo Bonzini * It will wait in AIO_WAIT_WHILE(), but once it releases the lock 212aead9dc9SPaolo Bonzini * we will enter this critical section and call aio_wait_kick(). 213aead9dc9SPaolo Bonzini */ 214aead9dc9SPaolo Bonzini WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) { 215aead9dc9SPaolo Bonzini /* 216aead9dc9SPaolo Bonzini * Additional check when we use the above lock to synchronize 217aead9dc9SPaolo Bonzini * with bdrv_graph_wrunlock(). 218aead9dc9SPaolo Bonzini * Case 1: 219aead9dc9SPaolo Bonzini * If this gets executed first, has_writer is still 1, so we reduce 220aead9dc9SPaolo Bonzini * reader_count and go to sleep. 221aead9dc9SPaolo Bonzini * Then the writer will set has_writer to 0 and wake up all readers, 222aead9dc9SPaolo Bonzini * us included. 223aead9dc9SPaolo Bonzini * Case 2: 224aead9dc9SPaolo Bonzini * If bdrv_graph_wrunlock() critical section gets executed first, 225aead9dc9SPaolo Bonzini * then it will set has_writer to 0 and wake up all other readers. 226aead9dc9SPaolo Bonzini * Then we execute this critical section, and therefore must check 227aead9dc9SPaolo Bonzini * again for has_writer, otherwise we sleep without any writer 228aead9dc9SPaolo Bonzini * actually running. 229aead9dc9SPaolo Bonzini */ 230aead9dc9SPaolo Bonzini if (!qatomic_read(&has_writer)) { 231aead9dc9SPaolo Bonzini return; 232aead9dc9SPaolo Bonzini } 233aead9dc9SPaolo Bonzini 234aead9dc9SPaolo Bonzini /* slow path where reader sleeps */ 235aead9dc9SPaolo Bonzini bdrv_graph->reader_count--; 236aead9dc9SPaolo Bonzini aio_wait_kick(); 237aead9dc9SPaolo Bonzini qemu_co_queue_wait(&reader_queue, &aio_context_list_lock); 238aead9dc9SPaolo Bonzini } 239aead9dc9SPaolo Bonzini } 240aead9dc9SPaolo Bonzini } 241aead9dc9SPaolo Bonzini 242aead9dc9SPaolo Bonzini void coroutine_fn bdrv_graph_co_rdunlock(void) 243aead9dc9SPaolo Bonzini { 244aead9dc9SPaolo Bonzini BdrvGraphRWlock *bdrv_graph; 245aead9dc9SPaolo Bonzini bdrv_graph = qemu_get_current_aio_context()->bdrv_graph; 246aead9dc9SPaolo Bonzini 247aead9dc9SPaolo Bonzini qatomic_store_release(&bdrv_graph->reader_count, 248aead9dc9SPaolo Bonzini bdrv_graph->reader_count - 1); 249aead9dc9SPaolo Bonzini /* make sure writer sees reader_count before we check has_writer */ 250aead9dc9SPaolo Bonzini smp_mb(); 251aead9dc9SPaolo Bonzini 252aead9dc9SPaolo Bonzini /* 253aead9dc9SPaolo Bonzini * has_writer == 0: this means reader will read reader_count decreased 254aead9dc9SPaolo Bonzini * has_writer == 1: we don't know if writer read reader_count old or 255aead9dc9SPaolo Bonzini * new. Therefore, kick again so on next iteration 256aead9dc9SPaolo Bonzini * writer will for sure read the updated value. 257aead9dc9SPaolo Bonzini */ 258aead9dc9SPaolo Bonzini if (qatomic_read(&has_writer)) { 259aead9dc9SPaolo Bonzini aio_wait_kick(); 260aead9dc9SPaolo Bonzini } 261aead9dc9SPaolo Bonzini } 262aead9dc9SPaolo Bonzini 263aead9dc9SPaolo Bonzini void bdrv_graph_rdlock_main_loop(void) 264aead9dc9SPaolo Bonzini { 265aead9dc9SPaolo Bonzini GLOBAL_STATE_CODE(); 266aead9dc9SPaolo Bonzini assert(!qemu_in_coroutine()); 267aead9dc9SPaolo Bonzini } 268aead9dc9SPaolo Bonzini 269aead9dc9SPaolo Bonzini void bdrv_graph_rdunlock_main_loop(void) 270aead9dc9SPaolo Bonzini { 271aead9dc9SPaolo Bonzini GLOBAL_STATE_CODE(); 272aead9dc9SPaolo Bonzini assert(!qemu_in_coroutine()); 273aead9dc9SPaolo Bonzini } 2743f35f82eSEmanuele Giuseppe Esposito 2753f35f82eSEmanuele Giuseppe Esposito void assert_bdrv_graph_readable(void) 2763f35f82eSEmanuele Giuseppe Esposito { 27758a2e3f5SStefan Hajnoczi /* reader_count() is slow due to aio_context_list_lock lock contention */ 27858a2e3f5SStefan Hajnoczi #ifdef CONFIG_DEBUG_GRAPH_LOCK 2793f35f82eSEmanuele Giuseppe Esposito assert(qemu_in_main_thread() || reader_count()); 28058a2e3f5SStefan Hajnoczi #endif 2813f35f82eSEmanuele Giuseppe Esposito } 2823f35f82eSEmanuele Giuseppe Esposito 2833f35f82eSEmanuele Giuseppe Esposito void assert_bdrv_graph_writable(void) 2843f35f82eSEmanuele Giuseppe Esposito { 2853f35f82eSEmanuele Giuseppe Esposito assert(qemu_in_main_thread()); 2863f35f82eSEmanuele Giuseppe Esposito assert(qatomic_read(&has_writer)); 2873f35f82eSEmanuele Giuseppe Esposito } 288