xref: /dragonfly/sys/vfs/hammer/hammer_pfs.c (revision f2c43266)
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * HAMMER PFS ioctls - Manage pseudo-fs configurations
36  */
37 
38 #include "hammer.h"
39 
40 static int hammer_pfs_autodetect(struct hammer_ioc_pseudofs_rw *pfs,
41 				hammer_inode_t ip);
42 static int hammer_pfs_rollback(hammer_transaction_t trans,
43 				hammer_pseudofs_inmem_t pfsm,
44 				hammer_tid_t trunc_tid);
45 static int hammer_pfs_delete_at_cursor(hammer_cursor_t cursor,
46 				hammer_tid_t trunc_tid);
47 
48 /*
49  * Get mirroring/pseudo-fs information
50  *
51  * NOTE: The ip used for ioctl is not necessarily related to the PFS
52  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
53  */
54 int
55 hammer_ioc_get_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
56 			struct hammer_ioc_pseudofs_rw *pfs)
57 {
58 	hammer_pseudofs_inmem_t pfsm;
59 	uint32_t localization;
60 	int error;
61 
62 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
63 		return(error);
64 	localization = pfs_to_lo(pfs->pfs_id);
65 	pfs->bytes = sizeof(struct hammer_pseudofs_data);
66 	pfs->version = HAMMER_IOC_PSEUDOFS_VERSION;
67 
68 	pfsm = hammer_load_pseudofs(trans, localization, &error);
69 	if (error) {
70 		hammer_rel_pseudofs(trans->hmp, pfsm);
71 		return(error);
72 	}
73 
74 	/*
75 	 * If the PFS is a master the sync tid is set by normal operation
76 	 * rather than the mirroring code, and will always track the
77 	 * real HAMMER filesystem.
78 	 *
79 	 * We use flush_tid1, which is the highest fully committed TID.
80 	 * flush_tid2 is the TID most recently flushed, but the UNDO hasn't
81 	 * caught up to it yet so a crash will roll us back to flush_tid1.
82 	 */
83 	if (hammer_is_pfs_master(&pfsm->pfsd))
84 		pfsm->pfsd.sync_end_tid = trans->hmp->flush_tid1;
85 
86 	/*
87 	 * Copy out to userland.
88 	 */
89 	error = 0;
90 	if (pfs->ondisk && error == 0)
91 		error = copyout(&pfsm->pfsd, pfs->ondisk, sizeof(pfsm->pfsd));
92 	hammer_rel_pseudofs(trans->hmp, pfsm);
93 	return(error);
94 }
95 
96 /*
97  * Set mirroring/pseudo-fs information
98  *
99  * NOTE: The ip used for ioctl is not necessarily related to the PFS
100  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
101  */
102 int
103 hammer_ioc_set_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
104 			struct ucred *cred, struct hammer_ioc_pseudofs_rw *pfs)
105 {
106 	hammer_pseudofs_inmem_t pfsm;
107 	uint32_t localization;
108 	int error;
109 
110 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
111 		return(error);
112 	localization = pfs_to_lo(pfs->pfs_id);
113 	if (pfs->version != HAMMER_IOC_PSEUDOFS_VERSION)
114 		error = EINVAL;
115 
116 	if (error == 0 && pfs->ondisk) {
117 		/*
118 		 * Load the PFS so we can modify our in-core copy.  Ignore
119 		 * ENOENT errors.
120 		 */
121 		pfsm = hammer_load_pseudofs(trans, localization, &error);
122 		error = copyin(pfs->ondisk, &pfsm->pfsd, sizeof(pfsm->pfsd));
123 
124 		/*
125 		 * Save it back, create a root inode if we are in master
126 		 * mode and no root exists.
127 		 *
128 		 * We do not create root inodes for slaves, the root inode
129 		 * must be mirrored from the master.
130 		 */
131 		if (error == 0 && hammer_is_pfs_master(&pfsm->pfsd)) {
132 			error = hammer_mkroot_pseudofs(trans, cred, pfsm, ip);
133 		}
134 		if (error == 0)
135 			error = hammer_save_pseudofs(trans, pfsm);
136 
137 		/*
138 		 * Wakeup anyone waiting for a TID update for this PFS
139 		 */
140 		wakeup(&pfsm->pfsd.sync_end_tid);
141 		hammer_rel_pseudofs(trans->hmp, pfsm);
142 	}
143 	return(error);
144 }
145 
146 /*
147  * Upgrade a slave to a master
148  *
149  * This is fairly easy to do, but we must physically undo any partial syncs
150  * for transaction ids > sync_end_tid.  Effective, we must do a partial
151  * rollback.
152  *
153  * NOTE: The ip used for ioctl is not necessarily related to the PFS
154  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
155  */
156 int
157 hammer_ioc_upgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
158 			struct hammer_ioc_pseudofs_rw *pfs)
159 {
160 	hammer_pseudofs_inmem_t pfsm;
161 	uint32_t localization;
162 	int error;
163 
164 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
165 		return(error);
166 	localization = pfs_to_lo(pfs->pfs_id);
167 	if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
168 		return(error);
169 
170 	/*
171 	 * A master id must be set when upgrading
172 	 */
173 	pfsm = hammer_load_pseudofs(trans, localization, &error);
174 	if (error == 0) {
175 		if (hammer_is_pfs_slave(&pfsm->pfsd)) {
176 			error = hammer_pfs_rollback(trans, pfsm,
177 					    pfsm->pfsd.sync_end_tid + 1);
178 			if (error == 0) {
179 				pfsm->pfsd.mirror_flags &= ~HAMMER_PFSD_SLAVE;
180 				error = hammer_save_pseudofs(trans, pfsm);
181 			}
182 		}
183 	}
184 	hammer_rel_pseudofs(trans->hmp, pfsm);
185 	if (error == EINTR) {
186 		pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
187 		error = 0;
188 	}
189 	return (error);
190 }
191 
192 /*
193  * Downgrade a master to a slave
194  *
195  * This is really easy to do, just set the SLAVE flag and update sync_end_tid.
196  *
197  * We previously did not update sync_end_tid in consideration for a slave
198  * upgraded to a master and then downgraded again, but this completely breaks
199  * the case where one starts with a master and then downgrades to a slave,
200  * then upgrades again.
201  *
202  * NOTE: The ip used for ioctl is not necessarily related to the PFS
203  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
204  */
205 int
206 hammer_ioc_downgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
207 			struct hammer_ioc_pseudofs_rw *pfs)
208 {
209 	hammer_mount_t hmp = trans->hmp;
210 	hammer_pseudofs_inmem_t pfsm;
211 	uint32_t localization;
212 	int error;
213 
214 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
215 		return(error);
216 	localization = pfs_to_lo(pfs->pfs_id);
217 	if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
218 		return(error);
219 
220 	pfsm = hammer_load_pseudofs(trans, localization, &error);
221 	if (error == 0) {
222 		if (hammer_is_pfs_master(&pfsm->pfsd)) {
223 			pfsm->pfsd.mirror_flags |= HAMMER_PFSD_SLAVE;
224 			if (pfsm->pfsd.sync_end_tid < hmp->flush_tid1)
225 				pfsm->pfsd.sync_end_tid = hmp->flush_tid1;
226 			error = hammer_save_pseudofs(trans, pfsm);
227 		}
228 	}
229 	hammer_rel_pseudofs(trans->hmp, pfsm);
230 	return (error);
231 }
232 
233 /*
234  * Destroy a PFS
235  *
236  * We can destroy a PFS by scanning and deleting all of its records in the
237  * B-Tree.  The hammer utility will delete the softlink in the primary
238  * filesystem.
239  *
240  * NOTE: The ip used for ioctl is not necessarily related to the PFS
241  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
242  */
243 int
244 hammer_ioc_destroy_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
245 			struct hammer_ioc_pseudofs_rw *pfs)
246 {
247 	hammer_pseudofs_inmem_t pfsm;
248 	uint32_t localization;
249 	int error;
250 
251 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
252 		return(error);
253 	localization = pfs_to_lo(pfs->pfs_id);
254 
255 	if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
256 		return(error);
257 
258 	pfsm = hammer_load_pseudofs(trans, localization, &error);
259 	if (error == 0) {
260 		error = hammer_pfs_rollback(trans, pfsm, 0);
261 		if (error == 0) {
262 			pfsm->pfsd.mirror_flags |= HAMMER_PFSD_DELETED;
263 			error = hammer_save_pseudofs(trans, pfsm);
264 		}
265 	}
266 	hammer_rel_pseudofs(trans->hmp, pfsm);
267 	if (error == EINTR) {
268 		pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
269 		error = 0;
270 	}
271 	return(error);
272 }
273 
274 /*
275  * Wait for the PFS to sync past the specified TID
276  */
277 int
278 hammer_ioc_wait_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
279 			 struct hammer_ioc_pseudofs_rw *pfs)
280 {
281 	hammer_pseudofs_inmem_t pfsm;
282 	struct hammer_pseudofs_data pfsd;
283 	uint32_t localization;
284 	hammer_tid_t tid;
285 	void *waitp;
286 	int error;
287 
288 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
289 		return(error);
290 	localization = pfs_to_lo(pfs->pfs_id);
291 
292 	if ((error = copyin(pfs->ondisk, &pfsd, sizeof(pfsd))) != 0)
293 		return(error);
294 
295 	pfsm = hammer_load_pseudofs(trans, localization, &error);
296 	if (error == 0) {
297 		if (hammer_is_pfs_slave(&pfsm->pfsd)) {
298 			tid = pfsm->pfsd.sync_end_tid;
299 			waitp = &pfsm->pfsd.sync_end_tid;
300 		} else {
301 			tid = trans->hmp->flush_tid1;
302 			waitp = &trans->hmp->flush_tid1;
303 		}
304 		if (tid <= pfsd.sync_end_tid)
305 			tsleep(waitp, PCATCH, "hmrmwt", 0);
306 	}
307 	hammer_rel_pseudofs(trans->hmp, pfsm);
308 	if (error == EINTR) {
309 		pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
310 		error = 0;
311 	}
312 	return(error);
313 }
314 
315 /*
316  * Iterate PFS ondisk data.
317  * This function basically does the same as hammer_load_pseudofs()
318  * except that the purpose of this function is to retrieve data.
319  *
320  * NOTE: The ip used for ioctl is not necessarily related to the PFS
321  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
322  */
323 int
324 hammer_ioc_iterate_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
325 			struct hammer_ioc_pfs_iterate *pi)
326 {
327 	struct hammer_cursor cursor;
328 	struct hammer_ioc_pseudofs_rw pfs;
329 	hammer_inode_t dip;
330 	uint32_t localization;
331 	int error;
332 
333 	/*
334 	 * struct hammer_ioc_pfs_iterate was never necessary.
335 	 * This ioctl needs extra code only to do conversion.
336 	 * The name pi->pos is misleading, but it's been exposed
337 	 * to userspace header..
338 	 */
339 	bzero(&pfs, sizeof(pfs));
340 	pfs.pfs_id = pi->pos;
341 	pfs.bytes = sizeof(struct hammer_pseudofs_data);  /* dummy */
342 	if ((error = hammer_pfs_autodetect(&pfs, ip)) != 0)
343 		return(error);
344 	pi->pos = pfs.pfs_id;
345 	localization = pfs_to_lo(pi->pos);
346 
347 	dip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
348 		HAMMER_DEF_LOCALIZATION, 0, &error);
349 
350 	error = hammer_init_cursor(trans, &cursor,
351 		(dip ? &dip->cache[1] : NULL), dip);
352 	if (error)
353 		goto out;
354 
355 	cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION |
356 				      HAMMER_LOCALIZE_MISC;
357 	cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
358 	cursor.key_beg.create_tid = 0;
359 	cursor.key_beg.delete_tid = 0;
360 	cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
361 	cursor.key_beg.obj_type = 0;
362 	cursor.key_beg.key = localization;
363 	cursor.asof = HAMMER_MAX_TID;
364 	cursor.flags |= HAMMER_CURSOR_ASOF;
365 
366 	error = hammer_ip_lookup(&cursor);
367 	if (error == 0) {
368 		error = hammer_ip_resolve_data(&cursor);
369 		if (error == 0) {
370 			if (pi->ondisk)
371 				copyout(cursor.data, pi->ondisk, cursor.leaf->data_len);
372 			localization = cursor.leaf->base.key;
373 			pi->pos = lo_to_pfs(localization);
374 			/*
375 			 * Caller needs to increment pi->pos each time calling
376 			 * this ioctl. This ioctl only restores current PFS id.
377 			 */
378 		}
379 	}
380 out:
381 	hammer_done_cursor(&cursor);
382 	if (dip)
383 		hammer_rel_inode(dip, 0);
384 	return(error);
385 }
386 
387 /*
388  * Auto-detect the pseudofs and do basic bounds checking.
389  */
390 static
391 int
392 hammer_pfs_autodetect(struct hammer_ioc_pseudofs_rw *pfs, hammer_inode_t ip)
393 {
394 	int error = 0;
395 
396 	if (pfs->pfs_id == -1)
397 		pfs->pfs_id = lo_to_pfs(ip->obj_localization);
398 	if (pfs->pfs_id < 0 || pfs->pfs_id >= HAMMER_MAX_PFS)
399 		error = EINVAL;
400 	if (pfs->bytes < sizeof(struct hammer_pseudofs_data))
401 		error = EINVAL;
402 	return(error);
403 }
404 
405 /*
406  * Rollback the specified PFS to (trunc_tid - 1), removing everything
407  * greater or equal to trunc_tid.  The PFS must not have been in no-mirror
408  * mode or the MIRROR_FILTERED scan will not work properly.
409  *
410  * This is typically used to remove any partial syncs when upgrading a
411  * slave to a master.  It can theoretically also be used to rollback
412  * any PFS, including PFS#0, BUT ONLY TO POINTS THAT HAVE NOT YET BEEN
413  * PRUNED, and to points that are older only if they are on a retained
414  * (pruning softlink) boundary.
415  *
416  * Rollbacks destroy information.  If you don't mind inode numbers changing
417  * a better way would be to cpdup a snapshot back onto the master.
418  */
419 static
420 int
421 hammer_pfs_rollback(hammer_transaction_t trans,
422 		    hammer_pseudofs_inmem_t pfsm,
423 		    hammer_tid_t trunc_tid)
424 {
425 	struct hammer_cmirror cmirror;
426 	struct hammer_cursor cursor;
427 	struct hammer_base_elm key_cur;
428 	int error;
429 	int seq;
430 
431 	bzero(&cmirror, sizeof(cmirror));
432 	bzero(&key_cur, sizeof(key_cur));
433 	key_cur.localization = HAMMER_MIN_LOCALIZATION | pfsm->localization;
434 	key_cur.obj_id = HAMMER_MIN_OBJID;
435 	key_cur.key = HAMMER_MIN_KEY;
436 	key_cur.create_tid = 1;
437 	key_cur.rec_type = HAMMER_MIN_RECTYPE;
438 
439 	seq = trans->hmp->flusher.done;
440 
441 retry:
442 	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
443 	if (error) {
444 		hammer_done_cursor(&cursor);
445 		goto failed;
446 	}
447 	cursor.key_beg = key_cur;
448 	cursor.key_end.localization = HAMMER_MAX_LOCALIZATION |
449 				      pfsm->localization;
450 	cursor.key_end.obj_id = HAMMER_MAX_OBJID;
451 	cursor.key_end.key = HAMMER_MAX_KEY;
452 	cursor.key_end.create_tid = HAMMER_MAX_TID;
453 	cursor.key_end.rec_type = HAMMER_MAX_RECTYPE;
454 
455 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
456 	cursor.flags |= HAMMER_CURSOR_BACKEND;
457 
458 	/*
459 	 * Do an optimized scan of only records created or modified
460 	 * >= trunc_tid, so we can fix up those records.  We must
461 	 * still check the TIDs but this greatly reduces the size of
462 	 * the scan.
463 	 */
464 	cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
465 	cursor.cmirror = &cmirror;
466 	cmirror.mirror_tid = trunc_tid;
467 
468 	error = hammer_btree_first(&cursor);
469 	while (error == 0) {
470 		/*
471 		 * Abort the rollback.
472 		 */
473 		if (error == 0) {
474 			error = hammer_signal_check(trans->hmp);
475 			if (error)
476 				break;
477 		}
478 
479 		/*
480 		 * We only care about leafs.  Internal nodes can be returned
481 		 * in mirror-filtered mode (they are used to generate SKIP
482 		 * mrecords), but we don't need them for this code.
483 		 *
484 		 * WARNING: See warnings in hammer_unlock_cursor() function.
485 		 */
486 		cursor.flags |= HAMMER_CURSOR_ATEDISK;
487 		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF) {
488 			key_cur = cursor.node->ondisk->elms[cursor.index].base;
489 			error = hammer_pfs_delete_at_cursor(&cursor, trunc_tid);
490 		}
491 
492 		while (hammer_flusher_meta_halflimit(trans->hmp) ||
493 		       hammer_flusher_undo_exhausted(trans, 2)) {
494 			hammer_unlock_cursor(&cursor);
495 			hammer_flusher_wait(trans->hmp, seq);
496 			hammer_lock_cursor(&cursor);
497 			seq = hammer_flusher_async_one(trans->hmp);
498 		}
499 
500 		if (error == 0)
501 			error = hammer_btree_iterate(&cursor);
502 	}
503 	if (error == ENOENT)
504 		error = 0;
505 	hammer_done_cursor(&cursor);
506 	if (error == EDEADLK)
507 		goto retry;
508 failed:
509 	return(error);
510 }
511 
512 /*
513  * Helper function - perform rollback on a B-Tree element given trunc_tid.
514  *
515  * If create_tid >= trunc_tid the record is physically destroyed.
516  * If delete_tid >= trunc_tid it will be set to 0, undeleting the record.
517  */
518 static
519 int
520 hammer_pfs_delete_at_cursor(hammer_cursor_t cursor, hammer_tid_t trunc_tid)
521 {
522 	hammer_btree_leaf_elm_t elm;
523 	int error;
524 
525 	elm = &cursor->node->ondisk->elms[cursor->index].leaf;
526 	if (elm->base.create_tid < trunc_tid &&
527 	    elm->base.delete_tid < trunc_tid) {
528 		return(0);
529 	}
530 
531 	if (elm->base.create_tid >= trunc_tid) {
532 		error = hammer_delete_at_cursor(
533 				cursor, HAMMER_DELETE_DESTROY,
534 				cursor->trans->tid, cursor->trans->time32,
535 				1, NULL);
536 	} else if (elm->base.delete_tid >= trunc_tid) {
537 		error = hammer_delete_at_cursor(
538 				cursor, HAMMER_DELETE_ADJUST,
539 				0, 0,
540 				1, NULL);
541 	} else {
542 		error = 0;
543 	}
544 	return(error);
545 }
546 
547