1aead9dc9SPaolo Bonzini /*
2aead9dc9SPaolo Bonzini * Graph lock: rwlock to protect block layer graph manipulations (add/remove
3aead9dc9SPaolo Bonzini * edges and nodes)
4aead9dc9SPaolo Bonzini *
5aead9dc9SPaolo Bonzini * Copyright (c) 2022 Red Hat
6aead9dc9SPaolo Bonzini *
7aead9dc9SPaolo Bonzini * This library is free software; you can redistribute it and/or
8aead9dc9SPaolo Bonzini * modify it under the terms of the GNU Lesser General Public
9aead9dc9SPaolo Bonzini * License as published by the Free Software Foundation; either
10aead9dc9SPaolo Bonzini * version 2.1 of the License, or (at your option) any later version.
11aead9dc9SPaolo Bonzini *
12aead9dc9SPaolo Bonzini * This library is distributed in the hope that it will be useful,
13aead9dc9SPaolo Bonzini * but WITHOUT ANY WARRANTY; without even the implied warranty of
14aead9dc9SPaolo Bonzini * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15aead9dc9SPaolo Bonzini * Lesser General Public License for more details.
16aead9dc9SPaolo Bonzini *
17aead9dc9SPaolo Bonzini * You should have received a copy of the GNU Lesser General Public
18aead9dc9SPaolo Bonzini * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19aead9dc9SPaolo Bonzini */
20aead9dc9SPaolo Bonzini
21aead9dc9SPaolo Bonzini #include "qemu/osdep.h"
22aead9dc9SPaolo Bonzini #include "qemu/main-loop.h"
23aead9dc9SPaolo Bonzini #include "block/graph-lock.h"
24aead9dc9SPaolo Bonzini #include "block/block.h"
25aead9dc9SPaolo Bonzini #include "block/block_int.h"
26aead9dc9SPaolo Bonzini
/* Dummy lock object to use for Thread Safety Analysis (TSA) */
BdrvGraphLock graph_lock;

/* Protects the list of aiocontext and orphaned_reader_count */
static QemuMutex aio_context_list_lock;

/*
 * Global writer flag for the graph rwlock.
 * Written and read with atomic operations.
 */
static int has_writer;

/*
 * A reader coroutine could move from an AioContext to another.
 * If this happens, there is no problem from the point of view of
 * counters. The problem is that the total count becomes
 * unbalanced if one of the two AioContexts gets deleted.
 * The count of readers must remain correct, so the AioContext's
 * balance is transferred to this global variable.
 * Protected by aio_context_list_lock.
 */
static uint32_t orphaned_reader_count;

/* Queue of readers waiting for the writer to finish */
static CoQueue reader_queue;
49aead9dc9SPaolo Bonzini
/*
 * Per-AioContext share of the graph rwlock's reader count.  Each
 * AioContext increments/decrements only its own counter; the writer sums
 * them all (plus orphaned_reader_count) in reader_count().
 */
struct BdrvGraphRWlock {
    /* How many readers are currently reading the graph. */
    uint32_t reader_count;

    /*
     * List of BdrvGraphRWlock kept in graph-lock.c
     * Protected by aio_context_list_lock
     */
    QTAILQ_ENTRY(BdrvGraphRWlock) next_aio;
};
60aead9dc9SPaolo Bonzini
/*
 * List of BdrvGraphRWlock. This list ensures that each BdrvGraphRWlock
 * can safely modify only its own counter, avoiding reads/writes of the
 * others and thus improving performance by avoiding cacheline bounces.
 */
static QTAILQ_HEAD(, BdrvGraphRWlock) aio_context_list =
    QTAILQ_HEAD_INITIALIZER(aio_context_list);
68aead9dc9SPaolo Bonzini
bdrv_init_graph_lock(void)69aead9dc9SPaolo Bonzini static void __attribute__((__constructor__)) bdrv_init_graph_lock(void)
70aead9dc9SPaolo Bonzini {
71aead9dc9SPaolo Bonzini qemu_mutex_init(&aio_context_list_lock);
72aead9dc9SPaolo Bonzini qemu_co_queue_init(&reader_queue);
73aead9dc9SPaolo Bonzini }
74aead9dc9SPaolo Bonzini
/*
 * Allocate a fresh per-AioContext reader counter for @ctx and link it into
 * aio_context_list under aio_context_list_lock.
 */
void register_aiocontext(AioContext *ctx)
{
    BdrvGraphRWlock *rwlock = g_new0(BdrvGraphRWlock, 1);

    ctx->bdrv_graph = rwlock;
    QEMU_LOCK_GUARD(&aio_context_list_lock);
    /* g_new0 zero-initialised the counter; a new context has no readers */
    assert(rwlock->reader_count == 0);
    QTAILQ_INSERT_TAIL(&aio_context_list, rwlock, next_aio);
}
82aead9dc9SPaolo Bonzini
/*
 * Unlink and free @ctx's reader counter.  Any readers still accounted to
 * this context are moved into orphaned_reader_count so that the global
 * total stays balanced.
 */
void unregister_aiocontext(AioContext *ctx)
{
    BdrvGraphRWlock *rwlock = ctx->bdrv_graph;

    QEMU_LOCK_GUARD(&aio_context_list_lock);
    orphaned_reader_count += rwlock->reader_count;
    QTAILQ_REMOVE(&aio_context_list, rwlock, next_aio);
    g_free(rwlock);
}
90aead9dc9SPaolo Bonzini
reader_count(void)91aead9dc9SPaolo Bonzini static uint32_t reader_count(void)
92aead9dc9SPaolo Bonzini {
93aead9dc9SPaolo Bonzini BdrvGraphRWlock *brdv_graph;
94aead9dc9SPaolo Bonzini uint32_t rd;
95aead9dc9SPaolo Bonzini
96aead9dc9SPaolo Bonzini QEMU_LOCK_GUARD(&aio_context_list_lock);
97aead9dc9SPaolo Bonzini
983202d8e4SMichael Tokarev /* rd can temporarily be negative, but the total will *always* be >= 0 */
99aead9dc9SPaolo Bonzini rd = orphaned_reader_count;
100aead9dc9SPaolo Bonzini QTAILQ_FOREACH(brdv_graph, &aio_context_list, next_aio) {
101aead9dc9SPaolo Bonzini rd += qatomic_read(&brdv_graph->reader_count);
102aead9dc9SPaolo Bonzini }
103aead9dc9SPaolo Bonzini
104aead9dc9SPaolo Bonzini /* shouldn't overflow unless there are 2^31 readers */
105aead9dc9SPaolo Bonzini assert((int32_t)rd >= 0);
106aead9dc9SPaolo Bonzini return rd;
107aead9dc9SPaolo Bonzini }
108aead9dc9SPaolo Bonzini
/*
 * Acquire the graph writer lock.  Drains all I/O first, then polls until no
 * reader holds the lock.  Must run in the main loop, outside coroutine
 * context (it polls with AIO_WAIT_WHILE).
 */
void no_coroutine_fn bdrv_graph_wrlock(void)
{
    GLOBAL_STATE_CODE();
    assert(!qatomic_read(&has_writer));
    assert(!qemu_in_coroutine());

    /* Make sure that constantly arriving new I/O doesn't cause starvation */
    bdrv_drain_all_begin_nopoll();

    /*
     * After setting has_writer = 1 below:
     * reader_count() == 0: any new reader will see has_writer == 1 and wait,
     *                      so we own the lock.
     * reader_count() >= 1: we can't tell whether those readers saw
     *                      has_writer as 0 or 1, so we must wait for them.
     * Wait by allowing other coroutines (and possible readers) to continue.
     */
    do {
        /*
         * has_writer must be 0 while polling, otherwise we get a deadlock if
         * any callback involved during AIO_WAIT_WHILE() tries to acquire the
         * reader lock.
         */
        qatomic_set(&has_writer, 0);
        AIO_WAIT_WHILE_UNLOCKED(NULL, reader_count() >= 1);
        qatomic_set(&has_writer, 1);

        /*
         * We want to only check reader_count() after has_writer = 1 is visible
         * to other threads. That way no more readers can sneak in after we've
         * determined reader_count() == 0.
         */
        smp_mb();
    } while (reader_count() >= 1);

    bdrv_drain_all_end();
}
144aead9dc9SPaolo Bonzini
bdrv_graph_wrunlock(void)145*6bc30f19SStefan Hajnoczi void no_coroutine_fn bdrv_graph_wrunlock(void)
146aead9dc9SPaolo Bonzini {
147aead9dc9SPaolo Bonzini GLOBAL_STATE_CODE();
148aead9dc9SPaolo Bonzini assert(qatomic_read(&has_writer));
149aead9dc9SPaolo Bonzini
150ac2ae233SKevin Wolf WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
151aead9dc9SPaolo Bonzini /*
152aead9dc9SPaolo Bonzini * No need for memory barriers, this works in pair with
153aead9dc9SPaolo Bonzini * the slow path of rdlock() and both take the lock.
154aead9dc9SPaolo Bonzini */
155aead9dc9SPaolo Bonzini qatomic_store_release(&has_writer, 0);
156aead9dc9SPaolo Bonzini
157ac2ae233SKevin Wolf /* Wake up all coroutines that are waiting to read the graph */
158aead9dc9SPaolo Bonzini qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
159aead9dc9SPaolo Bonzini }
160aead9dc9SPaolo Bonzini
161ac2ae233SKevin Wolf /*
162ac2ae233SKevin Wolf * Run any BHs that were scheduled during the wrlock section and that
163ac2ae233SKevin Wolf * callers might expect to have finished (in particular, this is important
164ac2ae233SKevin Wolf * for bdrv_schedule_unref()).
165ac2ae233SKevin Wolf *
166ac2ae233SKevin Wolf * Do this only after restarting coroutines so that nested event loops in
167ac2ae233SKevin Wolf * BHs don't deadlock if their condition relies on the coroutine making
168ac2ae233SKevin Wolf * progress.
169ac2ae233SKevin Wolf */
170ac2ae233SKevin Wolf aio_bh_poll(qemu_get_aio_context());
171ac2ae233SKevin Wolf }
172ac2ae233SKevin Wolf
/*
 * Acquire the graph reader lock from coroutine context.  Optimistically
 * bumps this AioContext's reader_count; if a writer is (or may be) active,
 * backs the increment out and sleeps on reader_queue until
 * bdrv_graph_wrunlock() wakes the readers, then retries.
 */
void coroutine_fn bdrv_graph_co_rdlock(void)
{
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

    for (;;) {
        /* Optimistic fast path: announce ourselves as a reader first */
        qatomic_set(&bdrv_graph->reader_count,
                    bdrv_graph->reader_count + 1);
        /* make sure writer sees reader_count before we check has_writer */
        smp_mb();

        /*
         * has_writer == 0: this means writer will read reader_count as >= 1
         * has_writer == 1: we don't know if writer read reader_count == 0
         *                  or > 0, but we need to wait anyways because
         *                  it will write.
         */
        if (!qatomic_read(&has_writer)) {
            break;
        }

        /*
         * Synchronize access with reader_count() in bdrv_graph_wrlock().
         * Case 1:
         * If this critical section gets executed first, reader_count will
         * decrease and the reader will go to sleep.
         * Then the writer will read reader_count that does not take into
         * account this reader, and if there's no other reader it will
         * enter the write section.
         * Case 2:
         * If reader_count() critical section gets executed first,
         * then writer will read reader_count >= 1.
         * It will wait in AIO_WAIT_WHILE(), but once it releases the lock
         * we will enter this critical section and call aio_wait_kick().
         */
        WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
            /*
             * Additional check when we use the above lock to synchronize
             * with bdrv_graph_wrunlock().
             * Case 1:
             * If this gets executed first, has_writer is still 1, so we reduce
             * reader_count and go to sleep.
             * Then the writer will set has_writer to 0 and wake up all readers,
             * us included.
             * Case 2:
             * If bdrv_graph_wrunlock() critical section gets executed first,
             * then it will set has_writer to 0 and wake up all other readers.
             * Then we execute this critical section, and therefore must check
             * again for has_writer, otherwise we sleep without any writer
             * actually running.
             */
            if (!qatomic_read(&has_writer)) {
                return;
            }

            /* slow path where reader sleeps */
            bdrv_graph->reader_count--;
            aio_wait_kick();
            qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
        }
    }
}
235aead9dc9SPaolo Bonzini
/*
 * Release the graph reader lock from coroutine context: decrement this
 * AioContext's reader_count and, if a writer is polling in
 * bdrv_graph_wrlock(), kick it so it re-reads the updated count.
 */
void coroutine_fn bdrv_graph_co_rdunlock(void)
{
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

    qatomic_store_release(&bdrv_graph->reader_count,
                          bdrv_graph->reader_count - 1);
    /* make sure writer sees reader_count before we check has_writer */
    smp_mb();

    /*
     * has_writer == 0: this means reader will read reader_count decreased
     * has_writer == 1: we don't know if writer read reader_count old or
     *                  new. Therefore, kick again so on next iteration
     *                  writer will for sure read the updated value.
     */
    if (qatomic_read(&has_writer)) {
        aio_wait_kick();
    }
}
256aead9dc9SPaolo Bonzini
/*
 * Main-loop flavour of the reader lock.  Nothing is actually locked here;
 * the function only asserts the caller's context: not inside a coroutine,
 * and satisfying GLOBAL_STATE_CODE().
 */
void bdrv_graph_rdlock_main_loop(void)
{
    assert(!qemu_in_coroutine());
    GLOBAL_STATE_CODE();
}
262aead9dc9SPaolo Bonzini
/*
 * Counterpart of bdrv_graph_rdlock_main_loop().  No state to release;
 * only asserts the caller's context: not inside a coroutine, and
 * satisfying GLOBAL_STATE_CODE().
 */
void bdrv_graph_rdunlock_main_loop(void)
{
    assert(!qemu_in_coroutine());
    GLOBAL_STATE_CODE();
}
2683f35f82eSEmanuele Giuseppe Esposito
/*
 * Assert that the caller is allowed to read the graph: either it runs in
 * the main thread or some reader currently holds the lock.  The check is
 * compiled in only with CONFIG_DEBUG_GRAPH_LOCK because reader_count()
 * takes aio_context_list_lock and is therefore expensive on hot paths.
 */
void assert_bdrv_graph_readable(void)
{
    /* reader_count() is slow due to aio_context_list_lock lock contention */
#ifdef CONFIG_DEBUG_GRAPH_LOCK
    assert(qemu_in_main_thread() || reader_count());
#endif
}
2763f35f82eSEmanuele Giuseppe Esposito
assert_bdrv_graph_writable(void)2773f35f82eSEmanuele Giuseppe Esposito void assert_bdrv_graph_writable(void)
2783f35f82eSEmanuele Giuseppe Esposito {
2793f35f82eSEmanuele Giuseppe Esposito assert(qemu_in_main_thread());
2803f35f82eSEmanuele Giuseppe Esposito assert(qatomic_read(&has_writer));
2813f35f82eSEmanuele Giuseppe Esposito }
282