xref: /qemu/block/graph-lock.c (revision 6bc30f19)
1aead9dc9SPaolo Bonzini /*
2aead9dc9SPaolo Bonzini  * Graph lock: rwlock to protect block layer graph manipulations (add/remove
3aead9dc9SPaolo Bonzini  * edges and nodes)
4aead9dc9SPaolo Bonzini  *
5aead9dc9SPaolo Bonzini  *  Copyright (c) 2022 Red Hat
6aead9dc9SPaolo Bonzini  *
7aead9dc9SPaolo Bonzini  * This library is free software; you can redistribute it and/or
8aead9dc9SPaolo Bonzini  * modify it under the terms of the GNU Lesser General Public
9aead9dc9SPaolo Bonzini  * License as published by the Free Software Foundation; either
10aead9dc9SPaolo Bonzini  * version 2.1 of the License, or (at your option) any later version.
11aead9dc9SPaolo Bonzini  *
12aead9dc9SPaolo Bonzini  * This library is distributed in the hope that it will be useful,
13aead9dc9SPaolo Bonzini  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14aead9dc9SPaolo Bonzini  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15aead9dc9SPaolo Bonzini  * Lesser General Public License for more details.
16aead9dc9SPaolo Bonzini  *
17aead9dc9SPaolo Bonzini  * You should have received a copy of the GNU Lesser General Public
18aead9dc9SPaolo Bonzini  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19aead9dc9SPaolo Bonzini  */
20aead9dc9SPaolo Bonzini 
21aead9dc9SPaolo Bonzini #include "qemu/osdep.h"
22aead9dc9SPaolo Bonzini #include "qemu/main-loop.h"
23aead9dc9SPaolo Bonzini #include "block/graph-lock.h"
24aead9dc9SPaolo Bonzini #include "block/block.h"
25aead9dc9SPaolo Bonzini #include "block/block_int.h"
26aead9dc9SPaolo Bonzini 
274002ffdcSKevin Wolf /* Dummy lock object to use for Thread Safety Analysis (TSA) */
284002ffdcSKevin Wolf BdrvGraphLock graph_lock;
294002ffdcSKevin Wolf 
30aead9dc9SPaolo Bonzini /* Protects the list of aiocontext and orphaned_reader_count */
31aead9dc9SPaolo Bonzini static QemuMutex aio_context_list_lock;
32aead9dc9SPaolo Bonzini 
33aead9dc9SPaolo Bonzini /* Written and read with atomic operations. */
34aead9dc9SPaolo Bonzini static int has_writer;
35aead9dc9SPaolo Bonzini 
36aead9dc9SPaolo Bonzini /*
37aead9dc9SPaolo Bonzini  * A reader coroutine could move from an AioContext to another.
38aead9dc9SPaolo Bonzini  * If this happens, there is no problem from the point of view of
39aead9dc9SPaolo Bonzini  * counters. The problem is that the total count becomes
40aead9dc9SPaolo Bonzini  * unbalanced if one of the two AioContexts gets deleted.
41aead9dc9SPaolo Bonzini  * The count of readers must remain correct, so the AioContext's
 * balance is transferred to this global variable.
43aead9dc9SPaolo Bonzini  * Protected by aio_context_list_lock.
44aead9dc9SPaolo Bonzini  */
45aead9dc9SPaolo Bonzini static uint32_t orphaned_reader_count;
46aead9dc9SPaolo Bonzini 
47aead9dc9SPaolo Bonzini /* Queue of readers waiting for the writer to finish */
48aead9dc9SPaolo Bonzini static CoQueue reader_queue;
49aead9dc9SPaolo Bonzini 
50aead9dc9SPaolo Bonzini struct BdrvGraphRWlock {
51aead9dc9SPaolo Bonzini     /* How many readers are currently reading the graph. */
52aead9dc9SPaolo Bonzini     uint32_t reader_count;
53aead9dc9SPaolo Bonzini 
54aead9dc9SPaolo Bonzini     /*
55aead9dc9SPaolo Bonzini      * List of BdrvGraphRWlock kept in graph-lock.c
56aead9dc9SPaolo Bonzini      * Protected by aio_context_list_lock
57aead9dc9SPaolo Bonzini      */
58aead9dc9SPaolo Bonzini     QTAILQ_ENTRY(BdrvGraphRWlock) next_aio;
59aead9dc9SPaolo Bonzini };
60aead9dc9SPaolo Bonzini 
61aead9dc9SPaolo Bonzini /*
62aead9dc9SPaolo Bonzini  * List of BdrvGraphRWlock. This list ensures that each BdrvGraphRWlock
 * can safely modify only its own counter, avoiding reads and writes of
 * the others and thus improving performance by avoiding cacheline bounces.
65aead9dc9SPaolo Bonzini  */
66aead9dc9SPaolo Bonzini static QTAILQ_HEAD(, BdrvGraphRWlock) aio_context_list =
67aead9dc9SPaolo Bonzini     QTAILQ_HEAD_INITIALIZER(aio_context_list);
68aead9dc9SPaolo Bonzini 
bdrv_init_graph_lock(void)69aead9dc9SPaolo Bonzini static void __attribute__((__constructor__)) bdrv_init_graph_lock(void)
70aead9dc9SPaolo Bonzini {
71aead9dc9SPaolo Bonzini     qemu_mutex_init(&aio_context_list_lock);
72aead9dc9SPaolo Bonzini     qemu_co_queue_init(&reader_queue);
73aead9dc9SPaolo Bonzini }
74aead9dc9SPaolo Bonzini 
/*
 * Allocate a zeroed per-AioContext reader counter for @ctx and publish it
 * on the global list, so that reader_count() starts including it.
 */
void register_aiocontext(AioContext *ctx)
{
    BdrvGraphRWlock *bdrv_graph = g_new0(BdrvGraphRWlock, 1);

    ctx->bdrv_graph = bdrv_graph;
    WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
        assert(bdrv_graph->reader_count == 0);
        QTAILQ_INSERT_TAIL(&aio_context_list, bdrv_graph, next_aio);
    }
}
82aead9dc9SPaolo Bonzini 
/*
 * Unpublish and free @ctx's reader counter.  Any readers it still accounts
 * for are folded into orphaned_reader_count, so the global total computed
 * by reader_count() stays balanced even though the AioContext goes away.
 */
void unregister_aiocontext(AioContext *ctx)
{
    BdrvGraphRWlock *bdrv_graph = ctx->bdrv_graph;

    WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
        orphaned_reader_count += bdrv_graph->reader_count;
        QTAILQ_REMOVE(&aio_context_list, bdrv_graph, next_aio);
        g_free(bdrv_graph);
    }
}
90aead9dc9SPaolo Bonzini 
reader_count(void)91aead9dc9SPaolo Bonzini static uint32_t reader_count(void)
92aead9dc9SPaolo Bonzini {
93aead9dc9SPaolo Bonzini     BdrvGraphRWlock *brdv_graph;
94aead9dc9SPaolo Bonzini     uint32_t rd;
95aead9dc9SPaolo Bonzini 
96aead9dc9SPaolo Bonzini     QEMU_LOCK_GUARD(&aio_context_list_lock);
97aead9dc9SPaolo Bonzini 
983202d8e4SMichael Tokarev     /* rd can temporarily be negative, but the total will *always* be >= 0 */
99aead9dc9SPaolo Bonzini     rd = orphaned_reader_count;
100aead9dc9SPaolo Bonzini     QTAILQ_FOREACH(brdv_graph, &aio_context_list, next_aio) {
101aead9dc9SPaolo Bonzini         rd += qatomic_read(&brdv_graph->reader_count);
102aead9dc9SPaolo Bonzini     }
103aead9dc9SPaolo Bonzini 
104aead9dc9SPaolo Bonzini     /* shouldn't overflow unless there are 2^31 readers */
105aead9dc9SPaolo Bonzini     assert((int32_t)rd >= 0);
106aead9dc9SPaolo Bonzini     return rd;
107aead9dc9SPaolo Bonzini }
108aead9dc9SPaolo Bonzini 
/*
 * Acquire the graph lock exclusively.  Main-loop-only, non-coroutine code:
 * polls until every reader has finished, with new I/O held off by a
 * no-poll drain section so arriving requests cannot starve the writer.
 */
void no_coroutine_fn bdrv_graph_wrlock(void)
{
    GLOBAL_STATE_CODE();
    assert(!qatomic_read(&has_writer));
    assert(!qemu_in_coroutine());

    /* Make sure that constantly arriving new I/O doesn't cause starvation */
    bdrv_drain_all_begin_nopoll();

    /*
     * Wait for all readers to finish; this pairs with the checks in
     * bdrv_graph_co_rdlock():
     * reader_count == 0: any reader arriving from now on will observe
     *                    has_writer == 1 and go to sleep.
     * reader_count >= 1: we can't tell whether those readers saw
     *                    has_writer as 0 or 1, so we must wait for them.
     * Wait by allowing other coroutines (and possible readers) to continue.
     */
    do {
        /*
         * has_writer must be 0 while polling, otherwise we get a deadlock if
         * any callback involved during AIO_WAIT_WHILE() tries to acquire the
         * reader lock.
         */
        qatomic_set(&has_writer, 0);
        AIO_WAIT_WHILE_UNLOCKED(NULL, reader_count() >= 1);
        qatomic_set(&has_writer, 1);

        /*
         * We want to only check reader_count() after has_writer = 1 is visible
         * to other threads. That way no more readers can sneak in after we've
         * determined reader_count() == 0.
         */
        smp_mb();
    } while (reader_count() >= 1);

    bdrv_drain_all_end();
}
144aead9dc9SPaolo Bonzini 
/*
 * Release the exclusive graph lock: clear has_writer, wake every reader
 * coroutine sleeping on reader_queue, then run pending main-context BHs.
 * The ordering of those three steps is deliberate; see comments below.
 */
void no_coroutine_fn bdrv_graph_wrunlock(void)
{
    GLOBAL_STATE_CODE();
    assert(qatomic_read(&has_writer));

    WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
        /*
         * No need for memory barriers, this works in pair with
         * the slow path of rdlock() and both take the lock.
         */
        qatomic_store_release(&has_writer, 0);

        /* Wake up all coroutines that are waiting to read the graph */
        qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
    }

    /*
     * Run any BHs that were scheduled during the wrlock section and that
     * callers might expect to have finished (in particular, this is important
     * for bdrv_schedule_unref()).
     *
     * Do this only after restarting coroutines so that nested event loops in
     * BHs don't deadlock if their condition relies on the coroutine making
     * progress.
     */
    aio_bh_poll(qemu_get_aio_context());
}
172ac2ae233SKevin Wolf 
/*
 * Take the graph rdlock from a coroutine.  Fast path: bump this
 * AioContext's reader counter, issue a barrier, and check has_writer —
 * no lock taken.  Slow path (writer active or pending): undo the
 * increment and sleep on reader_queue until bdrv_graph_wrunlock() wakes
 * us, then retry.
 */
void coroutine_fn bdrv_graph_co_rdlock(void)
{
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

    for (;;) {
        /* Plain read-modify-write: only this AioContext writes its counter */
        qatomic_set(&bdrv_graph->reader_count,
                    bdrv_graph->reader_count + 1);
        /* make sure writer sees reader_count before we check has_writer */
        smp_mb();

        /*
         * has_writer == 0: this means writer will read reader_count as >= 1
         * has_writer == 1: we don't know if writer read reader_count == 0
         *                  or > 0, but we need to wait anyways because
         *                  it will write.
         */
        if (!qatomic_read(&has_writer)) {
            break;
        }

        /*
         * Synchronize access with reader_count() in bdrv_graph_wrlock().
         * Case 1:
         * If this critical section gets executed first, reader_count will
         * decrease and the reader will go to sleep.
         * Then the writer will read reader_count that does not take into
         * account this reader, and if there's no other reader it will
         * enter the write section.
         * Case 2:
         * If reader_count() critical section gets executed first,
         * then writer will read reader_count >= 1.
         * It will wait in AIO_WAIT_WHILE(), but once it releases the lock
         * we will enter this critical section and call aio_wait_kick().
         */
        WITH_QEMU_LOCK_GUARD(&aio_context_list_lock) {
            /*
             * Additional check when we use the above lock to synchronize
             * with bdrv_graph_wrunlock().
             * Case 1:
             * If this gets executed first, has_writer is still 1, so we reduce
             * reader_count and go to sleep.
             * Then the writer will set has_writer to 0 and wake up all readers,
             * us included.
             * Case 2:
             * If bdrv_graph_wrunlock() critical section gets executed first,
             * then it will set has_writer to 0 and wake up all other readers.
             * Then we execute this critical section, and therefore must check
             * again for has_writer, otherwise we sleep without any writer
             * actually running.
             */
            if (!qatomic_read(&has_writer)) {
                return;
            }

            /* slow path where reader sleeps */
            bdrv_graph->reader_count--;
            aio_wait_kick();
            qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
        }
    }
}
235aead9dc9SPaolo Bonzini 
/*
 * Drop the graph rdlock from a coroutine: decrement the current
 * AioContext's reader counter and, if a writer is polling in
 * bdrv_graph_wrlock(), kick its AIO_WAIT_WHILE() loop so it re-reads the
 * counters.  Note this may run in a different AioContext than the one
 * that took the lock; reader_count() sums across all contexts (plus
 * orphaned_reader_count), so the total stays correct.
 */
void coroutine_fn bdrv_graph_co_rdunlock(void)
{
    BdrvGraphRWlock *bdrv_graph;
    bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;

    qatomic_store_release(&bdrv_graph->reader_count,
                          bdrv_graph->reader_count - 1);
    /* make sure writer sees reader_count before we check has_writer */
    smp_mb();

    /*
     * has_writer == 0: this means reader will read reader_count decreased
     * has_writer == 1: we don't know if writer read reader_count old or
     *                  new. Therefore, kick again so on next iteration
     *                  writer will for sure read the updated value.
     */
    if (qatomic_read(&has_writer)) {
        aio_wait_kick();
    }
}
256aead9dc9SPaolo Bonzini 
/*
 * Rdlock variant for non-coroutine main loop code.  No runtime state is
 * touched: the writer (bdrv_graph_wrlock()) itself runs only in the main
 * loop outside coroutines, so code meeting these same conditions cannot
 * race with it — asserting them is sufficient.  The function presumably
 * exists mainly so Thread Safety Analysis sees a lock acquisition
 * (cf. the dummy graph_lock object above) — TODO confirm against
 * block/graph-lock.h annotations.
 */
void bdrv_graph_rdlock_main_loop(void)
{
    GLOBAL_STATE_CODE();
    assert(!qemu_in_coroutine());
}
262aead9dc9SPaolo Bonzini 
/*
 * Counterpart of bdrv_graph_rdlock_main_loop().  Like the lock side, this
 * is assertion-only: no counter was taken, so there is nothing to release.
 */
void bdrv_graph_rdunlock_main_loop(void)
{
    GLOBAL_STATE_CODE();
    assert(!qemu_in_coroutine());
}
2683f35f82eSEmanuele Giuseppe Esposito 
/*
 * Debug check that the caller may read the graph: either it runs in the
 * main thread, or at least one reader holds the rdlock.  Compiled out
 * unless CONFIG_DEBUG_GRAPH_LOCK is set, because reader_count() takes
 * aio_context_list_lock and is too contended for hot paths.
 */
void assert_bdrv_graph_readable(void)
{
    /* reader_count() is slow due to aio_context_list_lock lock contention */
#ifdef CONFIG_DEBUG_GRAPH_LOCK
    assert(qemu_in_main_thread() || reader_count());
#endif
}
2763f35f82eSEmanuele Giuseppe Esposito 
/*
 * Debug check that the caller holds the graph wrlock: writers run only in
 * the main thread, and has_writer stays set for the whole wrlock section.
 */
void assert_bdrv_graph_writable(void)
{
    assert(qemu_in_main_thread());
    assert(qatomic_read(&has_writer));
}
282