xref: /minix/minix/lib/libbdev/ipc.c (revision 83133719)
1 /* libbdev - IPC and recovery functions */
2 
3 #include <minix/drivers.h>
4 #include <minix/bdev.h>
5 #include <assert.h>
6 
7 #include "const.h"
8 #include "type.h"
9 #include "proto.h"
10 
11 static void bdev_cancel(dev_t dev)
12 {
13 /* Recovering the driver for the given device has failed repeatedly. Mark it as
14  * permanently unusable, and clean up any associated calls and resources.
15  */
16   bdev_call_t *call, *next;
17 
18   printf("bdev: giving up on major %d\n", major(dev));
19 
20   /* Cancel all pending asynchronous requests. */
21   call = NULL;
22 
23   while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL)
24 	bdev_callback_asyn(call, EDEADSRCDST);
25 
26   /* Mark the driver as unusable. */
27   bdev_driver_clear(dev);
28 }
29 
30 static int bdev_recover(dev_t dev, int update_endpt)
31 {
32 /* The IPC subsystem has signaled an error communicating to the driver
33  * associated with the given device. Try to recover. If 'update_endpt' is set,
34  * we need to find the new endpoint of the driver first. Return TRUE iff
35  * recovery has been successful.
36  */
37   bdev_call_t *call, *next;
38   endpoint_t endpt;
39   int r, active, nr_tries;
40 
41   /* Only print output if there is something to recover. Some drivers may be
42    * shut down and later restarted legitimately, and if they were not in use
43    * while that happened, there is no need to flood the console with messages.
44    */
45   active = bdev_minor_is_open(dev) || bdev_call_iter_maj(dev, NULL, &next);
46 
47   if (active)
48 	printf("bdev: recovering from a driver restart on major %d\n",
49 		major(dev));
50 
51   for (nr_tries = 0; nr_tries < RECOVER_TRIES; nr_tries++) {
52 	/* First update the endpoint, if necessary. */
53 	if (update_endpt)
54 		(void) bdev_driver_update(dev);
55 
56 	if ((endpt = bdev_driver_get(dev)) == NONE)
57 		break;
58 
59 	/* If anything goes wrong, update the endpoint again next time. */
60 	update_endpt = TRUE;
61 
62 	/* Reopen all minor devices on the new driver. */
63 	if ((r = bdev_minor_reopen(dev)) != OK) {
64 		/* If the driver died again, we may give it another try. */
65 		if (r == EDEADSRCDST)
66 			continue;
67 
68 		/* If another error occurred, we cannot continue using the
69 		 * driver as is, but we also cannot force it to restart.
70 		 */
71 		break;
72 	}
73 
74 	/* Resend all asynchronous requests. */
75 	call = NULL;
76 
77 	while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL) {
78 		/* It is not strictly necessary that we manage to reissue all
79 		 * asynchronous requests successfully. We can fail them on an
80 		 * individual basis here, without affecting the overall
81 		 * recovery. Note that we will never get new IPC failures here.
82 		 */
83 		if ((r = bdev_restart_asyn(call)) != OK)
84 			bdev_callback_asyn(call, r);
85 	}
86 
87 	/* Recovery seems successful. We can now reissue the current
88 	 * synchronous request (if any), and continue normal operation.
89 	 */
90 	if (active)
91 		printf("bdev: recovery successful, new driver at %d\n", endpt);
92 
93 	return TRUE;
94   }
95 
96   /* Recovery failed repeatedly. Give up on this driver. */
97   bdev_cancel(dev);
98 
99   return FALSE;
100 }
101 
102 void bdev_update(dev_t dev, char *label)
103 {
104 /* Set the endpoint for a driver. Perform recovery if necessary.
105  */
106   endpoint_t endpt, old_endpt;
107 
108   old_endpt = bdev_driver_get(dev);
109 
110   endpt = bdev_driver_set(dev, label);
111 
112   /* If updating the driver causes an endpoint change, we need to perform
113    * recovery, but not update the endpoint yet again.
114    */
115   if (old_endpt != NONE && old_endpt != endpt)
116 	bdev_recover(dev, FALSE /*update_endpt*/);
117 }
118 
119 int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t id)
120 {
121 /* Send an asynchronous request for the given device. This function will never
122  * get any new IPC errors sending to the driver. If sending an asynchronous
123  * request fails, we will find out through other ways later.
124  */
125   endpoint_t endpt;
126   message m;
127   int r;
128 
129   /* If we have no usable driver endpoint, fail instantly. */
130   if ((endpt = bdev_driver_get(dev)) == NONE)
131 	return EDEADSRCDST;
132 
133   m = *m_orig;
134   m.m_lbdev_lblockdriver_msg.id = id;
135 
136   r = asynsend(endpt, &m);
137 
138   if (r != OK)
139 	printf("bdev: asynsend to driver (%d) failed (%d)\n", endpt, r);
140 
141   return r;
142 }
143 
144 int bdev_sendrec(dev_t dev, const message *m_orig)
145 {
146 /* Send a synchronous request for the given device, and wait for the reply.
147  * Return ERESTART if the caller should try to reissue the request.
148  */
149   endpoint_t endpt;
150   message m;
151   int r;
152 
153   /* If we have no usable driver endpoint, fail instantly. */
154   if ((endpt = bdev_driver_get(dev)) == NONE)
155 	return EDEADSRCDST;
156 
157   /* Send the request and block until we receive a reply. */
158   m = *m_orig;
159   m.m_lbdev_lblockdriver_msg.id = NO_ID;
160 
161   r = ipc_sendrec(endpt, &m);
162 
163   /* If communication failed, the driver has died. We assume it will be
164    * restarted soon after, so we attempt recovery. Upon success, we let the
165    * caller reissue the synchronous request.
166    */
167   if (r == EDEADSRCDST) {
168 	if (!bdev_recover(dev, TRUE /*update_endpt*/))
169 		return EDEADSRCDST;
170 
171 	return ERESTART;
172   }
173 
174   if (r != OK) {
175 	printf("bdev: IPC to driver (%d) failed (%d)\n", endpt, r);
176 	return r;
177   }
178 
179   if (m.m_type != BDEV_REPLY) {
180 	printf("bdev: driver (%d) sent weird response (%d)\n",
181 		endpt, m.m_type);
182 	return EINVAL;
183   }
184 
185   /* The protocol contract states that no asynchronous reply can satisfy a
186    * synchronous SENDREC call, so we can never get an asynchronous reply here.
187    */
188   if (m.m_lblockdriver_lbdev_reply.id != NO_ID) {
189 	printf("bdev: driver (%d) sent invalid ID (%d)\n", endpt,
190 		m.m_lblockdriver_lbdev_reply.id);
191 	return EINVAL;
192   }
193 
194   /* Unless the caller is misusing libbdev, we will only get ERESTART if we
195    * have managed to resend a raw block I/O request to the driver after a
196    * restart, but before VFS has had a chance to reopen the associated device
197    * first. This is highly exceptional, and hard to deal with correctly. We
198    * take the easiest route: sleep for a while so that VFS can reopen the
199    * device, and then resend the request. If the call keeps failing, the caller
200    * will eventually give up.
201    */
202   if (m.m_lblockdriver_lbdev_reply.status == ERESTART) {
203 	printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
204 		endpt);
205 
206 	micro_delay(1000);
207 
208 	return ERESTART;
209   }
210 
211   /* Return the result of our request. */
212   return m.m_lblockdriver_lbdev_reply.status;
213 }
214 
215 static int bdev_receive(dev_t dev, message *m)
216 {
217 /* Receive one valid message.
218  */
219   endpoint_t endpt;
220   int r, nr_tries = 0;
221 
222   for (;;) {
223 	/* Retrieve and check the driver endpoint on every try, as it will
224 	 * change with each driver restart.
225 	 */
226 	if ((endpt = bdev_driver_get(dev)) == NONE)
227 		return EDEADSRCDST;
228 
229 	r = sef_receive(endpt, m);
230 
231 	if (r == EDEADSRCDST) {
232 		/* If we reached the maximum number of retries, give up. */
233 		if (++nr_tries == DRIVER_TRIES)
234 			break;
235 
236 		/* Attempt recovery. If successful, all asynchronous requests
237 		 * will have been resent, and we can retry receiving a reply.
238 		 */
239 		if (!bdev_recover(dev, TRUE /*update_endpt*/))
240 			return EDEADSRCDST;
241 
242 		continue;
243 	}
244 
245 	if (r != OK) {
246 		printf("bdev: IPC to driver (%d) failed (%d)\n", endpt, r);
247 
248 		return r;
249 	}
250 
251 	if (m->m_type != BDEV_REPLY) {
252 		printf("bdev: driver (%d) sent weird response (%d)\n",
253 			endpt, m->m_type);
254 		return EINVAL;
255 	}
256 
257 	/* The caller is responsible for checking the ID and status. */
258 	return OK;
259   }
260 
261   /* All tries failed, even though all recovery attempts succeeded. In this
262    * case, we let the caller recheck whether it wants to keep calling us,
263    * returning ERESTART to indicate we can be called again but did not actually
264    * receive a message.
265    */
266   return ERESTART;
267 }
268 
269 void bdev_reply_asyn(message *m)
270 {
271 /* A reply has come in from a disk driver.
272  */
273   bdev_call_t *call;
274   endpoint_t endpt;
275   bdev_id_t id;
276   int r;
277 
278   /* This is a requirement for the caller. */
279   assert(m->m_type == BDEV_REPLY);
280 
281   /* Get the corresponding asynchronous call structure. */
282   id = m->m_lblockdriver_lbdev_reply.id;
283 
284   if ((call = bdev_call_get(id)) == NULL) {
285 	printf("bdev: driver (%d) replied to unknown request (%d)\n",
286 		m->m_source, m->m_lblockdriver_lbdev_reply.id);
287 	return;
288   }
289 
290   /* Make sure the reply was sent from the right endpoint. */
291   endpt = bdev_driver_get(call->dev);
292 
293   if (m->m_source != endpt) {
294 	/* If the endpoint is NONE, this may be a stray reply. */
295 	if (endpt != NONE)
296 		printf("bdev: driver (%d) replied to request not sent to it\n",
297 			m->m_source);
298 	return;
299   }
300 
301   /* See the ERESTART comment in bdev_sendrec(). */
302   if (m->m_lblockdriver_lbdev_reply.status == ERESTART) {
303 	printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
304 		endpt);
305 
306 	micro_delay(1000);
307 
308 	if ((r = bdev_restart_asyn(call)) != OK)
309 		bdev_callback_asyn(call, r);
310 
311 	return;
312   }
313 
314   bdev_callback_asyn(call, m->m_lblockdriver_lbdev_reply.status);
315 }
316 
317 int bdev_wait_asyn(bdev_id_t id)
318 {
319 /* Wait for an asynchronous request to complete.
320  */
321   bdev_call_t *call;
322   dev_t dev;
323   message m;
324   int r;
325 
326   if ((call = bdev_call_get(id)) == NULL)
327 	return ENOENT;
328 
329   dev = call->dev;
330 
331   do {
332 	if ((r = bdev_receive(dev, &m)) != OK && r != ERESTART)
333 		return r;
334 
335 	/* Processing the reply will free up the call structure as a side
336 	 * effect. If we repeatedly get ERESTART, we will repeatedly resend the
337 	 * asynchronous request, which will then eventually hit the retry limit
338 	 * and we will break out of the loop.
339 	 */
340 	if (r == OK)
341 		bdev_reply_asyn(&m);
342 
343   } while (bdev_call_get(id) != NULL);
344 
345   return OK;
346 }
347