xref: /dragonfly/sys/vfs/hammer/hammer_pfs.c (revision 55358b98)
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * HAMMER PFS ioctls - Manage pseudo-fs configurations
36  */
37 
38 #include "hammer.h"
39 
40 static int hammer_pfs_autodetect(struct hammer_ioc_pseudofs_rw *pfs,
41 				hammer_inode_t ip);
42 static int hammer_pfs_rollback(hammer_transaction_t trans,
43 				hammer_pseudofs_inmem_t pfsm,
44 				hammer_tid_t trunc_tid);
45 static int hammer_pfs_delete_at_cursor(hammer_cursor_t cursor,
46 				hammer_tid_t trunc_tid);
47 
48 /*
49  * Get mirroring/pseudo-fs information
50  *
51  * NOTE: The ip used for ioctl is not necessarily related to the PFS
52  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
53  */
54 int
55 hammer_ioc_get_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
56 			struct hammer_ioc_pseudofs_rw *pfs)
57 {
58 	hammer_pseudofs_inmem_t pfsm;
59 	uint32_t localization;
60 	int error;
61 
62 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
63 		return(error);
64 	localization = pfs_to_lo(pfs->pfs_id);
65 	pfs->bytes = sizeof(struct hammer_pseudofs_data);
66 	pfs->version = HAMMER_IOC_PSEUDOFS_VERSION;
67 
68 	pfsm = hammer_load_pseudofs(trans, localization, &error);
69 	if (error) {
70 		hammer_rel_pseudofs(trans->hmp, pfsm);
71 		return(error);
72 	}
73 
74 	/*
75 	 * If the PFS is a master the sync tid is set by normal operation
76 	 * rather than the mirroring code, and will always track the
77 	 * real HAMMER filesystem.
78 	 *
79 	 * We use flush_tid1, which is the highest fully committed TID.
80 	 * flush_tid2 is the TID most recently flushed, but the UNDO hasn't
81 	 * caught up to it yet so a crash will roll us back to flush_tid1.
82 	 */
83 	if ((pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0)
84 		pfsm->pfsd.sync_end_tid = trans->hmp->flush_tid1;
85 
86 	/*
87 	 * Copy out to userland.
88 	 */
89 	error = 0;
90 	if (pfs->ondisk && error == 0)
91 		error = copyout(&pfsm->pfsd, pfs->ondisk, sizeof(pfsm->pfsd));
92 	hammer_rel_pseudofs(trans->hmp, pfsm);
93 	return(error);
94 }
95 
96 /*
97  * Set mirroring/pseudo-fs information
98  *
99  * NOTE: The ip used for ioctl is not necessarily related to the PFS
100  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
101  */
102 int
103 hammer_ioc_set_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
104 			struct ucred *cred, struct hammer_ioc_pseudofs_rw *pfs)
105 {
106 	hammer_pseudofs_inmem_t pfsm;
107 	uint32_t localization;
108 	int error;
109 
110 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
111 		return(error);
112 	localization = pfs_to_lo(pfs->pfs_id);
113 	if (pfs->version != HAMMER_IOC_PSEUDOFS_VERSION)
114 		error = EINVAL;
115 
116 	/*
117 	 * Make sure a caller isn't creating a PFS from non-root PFS.
118 	 */
119 	if (lo_to_pfs(ip->obj_localization) != HAMMER_ROOT_PFSID) {
120 		hmkprintf(trans->hmp,
121 			"Creating a PFS from non-root PFS is not allowed\n");
122 		return(EINVAL);
123 	}
124 
125 	if (error == 0 && pfs->ondisk) {
126 		/*
127 		 * Load the PFS so we can modify our in-core copy.  Ignore
128 		 * ENOENT errors.
129 		 */
130 		pfsm = hammer_load_pseudofs(trans, localization, &error);
131 		error = copyin(pfs->ondisk, &pfsm->pfsd, sizeof(pfsm->pfsd));
132 
133 		/*
134 		 * Save it back, create a root inode if we are in master
135 		 * mode and no root exists.
136 		 *
137 		 * We do not create root inodes for slaves, the root inode
138 		 * must be mirrored from the master.
139 		 */
140 		if (error == 0 &&
141 		    (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0) {
142 			error = hammer_mkroot_pseudofs(trans, cred, pfsm);
143 		}
144 		if (error == 0)
145 			error = hammer_save_pseudofs(trans, pfsm);
146 
147 		/*
148 		 * Wakeup anyone waiting for a TID update for this PFS
149 		 */
150 		wakeup(&pfsm->pfsd.sync_end_tid);
151 		hammer_rel_pseudofs(trans->hmp, pfsm);
152 	}
153 	return(error);
154 }
155 
156 /*
157  * Upgrade a slave to a master
158  *
159  * This is fairly easy to do, but we must physically undo any partial syncs
160  * for transaction ids > sync_end_tid.  Effective, we must do a partial
161  * rollback.
162  *
163  * NOTE: The ip used for ioctl is not necessarily related to the PFS
164  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
165  */
166 int
167 hammer_ioc_upgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
168 			struct hammer_ioc_pseudofs_rw *pfs)
169 {
170 	hammer_pseudofs_inmem_t pfsm;
171 	uint32_t localization;
172 	int error;
173 
174 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
175 		return(error);
176 	localization = pfs_to_lo(pfs->pfs_id);
177 	if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
178 		return(error);
179 
180 	/*
181 	 * A master id must be set when upgrading
182 	 */
183 	pfsm = hammer_load_pseudofs(trans, localization, &error);
184 	if (error == 0) {
185 		if ((pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) != 0) {
186 			error = hammer_pfs_rollback(trans, pfsm,
187 					    pfsm->pfsd.sync_end_tid + 1);
188 			if (error == 0) {
189 				pfsm->pfsd.mirror_flags &= ~HAMMER_PFSD_SLAVE;
190 				error = hammer_save_pseudofs(trans, pfsm);
191 			}
192 		}
193 	}
194 	hammer_rel_pseudofs(trans->hmp, pfsm);
195 	if (error == EINTR) {
196 		pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
197 		error = 0;
198 	}
199 	return (error);
200 }
201 
202 /*
203  * Downgrade a master to a slave
204  *
205  * This is really easy to do, just set the SLAVE flag and update sync_end_tid.
206  *
207  * We previously did not update sync_end_tid in consideration for a slave
208  * upgraded to a master and then downgraded again, but this completely breaks
209  * the case where one starts with a master and then downgrades to a slave,
210  * then upgrades again.
211  *
212  * NOTE: The ip used for ioctl is not necessarily related to the PFS
213  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
214  */
215 int
216 hammer_ioc_downgrade_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
217 			struct hammer_ioc_pseudofs_rw *pfs)
218 {
219 	hammer_mount_t hmp = trans->hmp;
220 	hammer_pseudofs_inmem_t pfsm;
221 	uint32_t localization;
222 	int error;
223 
224 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
225 		return(error);
226 	localization = pfs_to_lo(pfs->pfs_id);
227 	if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
228 		return(error);
229 
230 	pfsm = hammer_load_pseudofs(trans, localization, &error);
231 	if (error == 0) {
232 		if ((pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) == 0) {
233 			pfsm->pfsd.mirror_flags |= HAMMER_PFSD_SLAVE;
234 			if (pfsm->pfsd.sync_end_tid < hmp->flush_tid1)
235 				pfsm->pfsd.sync_end_tid = hmp->flush_tid1;
236 			error = hammer_save_pseudofs(trans, pfsm);
237 		}
238 	}
239 	hammer_rel_pseudofs(trans->hmp, pfsm);
240 	return (error);
241 }
242 
243 /*
244  * Destroy a PFS
245  *
246  * We can destroy a PFS by scanning and deleting all of its records in the
247  * B-Tree.  The hammer utility will delete the softlink in the primary
248  * filesystem.
249  *
250  * NOTE: The ip used for ioctl is not necessarily related to the PFS
251  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
252  */
253 int
254 hammer_ioc_destroy_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
255 			struct hammer_ioc_pseudofs_rw *pfs)
256 {
257 	hammer_pseudofs_inmem_t pfsm;
258 	uint32_t localization;
259 	int error;
260 
261 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
262 		return(error);
263 	localization = pfs_to_lo(pfs->pfs_id);
264 
265 	if ((error = hammer_unload_pseudofs(trans, localization)) != 0)
266 		return(error);
267 
268 	pfsm = hammer_load_pseudofs(trans, localization, &error);
269 	if (error == 0) {
270 		error = hammer_pfs_rollback(trans, pfsm, 0);
271 		if (error == 0) {
272 			pfsm->pfsd.mirror_flags |= HAMMER_PFSD_DELETED;
273 			error = hammer_save_pseudofs(trans, pfsm);
274 		}
275 	}
276 	hammer_rel_pseudofs(trans->hmp, pfsm);
277 	if (error == EINTR) {
278 		pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
279 		error = 0;
280 	}
281 	return(error);
282 }
283 
284 /*
285  * Wait for the PFS to sync past the specified TID
286  */
287 int
288 hammer_ioc_wait_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
289 			 struct hammer_ioc_pseudofs_rw *pfs)
290 {
291 	hammer_pseudofs_inmem_t pfsm;
292 	struct hammer_pseudofs_data pfsd;
293 	uint32_t localization;
294 	hammer_tid_t tid;
295 	void *waitp;
296 	int error;
297 
298 	if ((error = hammer_pfs_autodetect(pfs, ip)) != 0)
299 		return(error);
300 	localization = pfs_to_lo(pfs->pfs_id);
301 
302 	if ((error = copyin(pfs->ondisk, &pfsd, sizeof(pfsd))) != 0)
303 		return(error);
304 
305 	pfsm = hammer_load_pseudofs(trans, localization, &error);
306 	if (error == 0) {
307 		if (pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE) {
308 			tid = pfsm->pfsd.sync_end_tid;
309 			waitp = &pfsm->pfsd.sync_end_tid;
310 		} else {
311 			tid = trans->hmp->flush_tid1;
312 			waitp = &trans->hmp->flush_tid1;
313 		}
314 		if (tid <= pfsd.sync_end_tid)
315 			tsleep(waitp, PCATCH, "hmrmwt", 0);
316 	}
317 	hammer_rel_pseudofs(trans->hmp, pfsm);
318 	if (error == EINTR) {
319 		pfs->head.flags |= HAMMER_IOC_HEAD_INTR;
320 		error = 0;
321 	}
322 	return(error);
323 }
324 
325 /*
326  * Iterate PFS ondisk data.
327  * This function basically does the same as hammer_load_pseudofs()
328  * except that the purpose of this function is to retrieve data.
329  *
330  * NOTE: The ip used for ioctl is not necessarily related to the PFS
331  * since this ioctl only requires PFS id (or upper 16 bits of ip localization).
332  */
333 int
334 hammer_ioc_iterate_pseudofs(hammer_transaction_t trans, hammer_inode_t ip,
335 			struct hammer_ioc_pfs_iterate *pi)
336 {
337 	struct hammer_cursor cursor;
338 	struct hammer_ioc_pseudofs_rw pfs;
339 	hammer_inode_t dip;
340 	uint32_t localization;
341 	int error;
342 
343 	/*
344 	 * struct hammer_ioc_pfs_iterate was never necessary.
345 	 * This ioctl needs extra code only to do conversion.
346 	 * The name pi->pos is misleading, but it's been exposed
347 	 * to userspace header..
348 	 */
349 	bzero(&pfs, sizeof(pfs));
350 	pfs.pfs_id = pi->pos;
351 	pfs.bytes = sizeof(struct hammer_pseudofs_data);  /* dummy */
352 	if ((error = hammer_pfs_autodetect(&pfs, ip)) != 0)
353 		return(error);
354 	pi->pos = pfs.pfs_id;
355 	localization = pfs_to_lo(pi->pos);
356 
357 	dip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
358 		HAMMER_DEF_LOCALIZATION, 0, &error);
359 
360 	error = hammer_init_cursor(trans, &cursor,
361 		(dip ? &dip->cache[1] : NULL), dip);
362 	if (error)
363 		goto out;
364 
365 	cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION |
366 				      HAMMER_LOCALIZE_MISC;
367 	cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
368 	cursor.key_beg.create_tid = 0;
369 	cursor.key_beg.delete_tid = 0;
370 	cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
371 	cursor.key_beg.obj_type = 0;
372 	cursor.key_beg.key = localization;
373 	cursor.asof = HAMMER_MAX_TID;
374 	cursor.flags |= HAMMER_CURSOR_ASOF;
375 
376 	error = hammer_ip_lookup(&cursor);
377 	if (error == 0) {
378 		error = hammer_ip_resolve_data(&cursor);
379 		if (error == 0) {
380 			if (pi->ondisk)
381 				copyout(cursor.data, pi->ondisk, cursor.leaf->data_len);
382 			localization = cursor.leaf->base.key;
383 			pi->pos = lo_to_pfs(localization);
384 			/*
385 			 * Caller needs to increment pi->pos each time calling
386 			 * this ioctl. This ioctl only restores current PFS id.
387 			 */
388 		}
389 	}
390 out:
391 	hammer_done_cursor(&cursor);
392 	if (dip)
393 		hammer_rel_inode(dip, 0);
394 	return(error);
395 }
396 
397 /*
398  * Auto-detect the pseudofs and do basic bounds checking.
399  */
400 static
401 int
402 hammer_pfs_autodetect(struct hammer_ioc_pseudofs_rw *pfs, hammer_inode_t ip)
403 {
404 	int error = 0;
405 
406 	if (pfs->pfs_id == -1)
407 		pfs->pfs_id = lo_to_pfs(ip->obj_localization);
408 	if (pfs->pfs_id < 0 || pfs->pfs_id >= HAMMER_MAX_PFS)
409 		error = EINVAL;
410 	if (pfs->bytes < sizeof(struct hammer_pseudofs_data))
411 		error = EINVAL;
412 	return(error);
413 }
414 
415 /*
416  * Rollback the specified PFS to (trunc_tid - 1), removing everything
417  * greater or equal to trunc_tid.  The PFS must not have been in no-mirror
418  * mode or the MIRROR_FILTERED scan will not work properly.
419  *
420  * This is typically used to remove any partial syncs when upgrading a
421  * slave to a master.  It can theoretically also be used to rollback
422  * any PFS, including PFS#0, BUT ONLY TO POINTS THAT HAVE NOT YET BEEN
423  * PRUNED, and to points that are older only if they are on a retained
424  * (pruning softlink) boundary.
425  *
426  * Rollbacks destroy information.  If you don't mind inode numbers changing
427  * a better way would be to cpdup a snapshot back onto the master.
428  */
429 static
430 int
431 hammer_pfs_rollback(hammer_transaction_t trans,
432 		    hammer_pseudofs_inmem_t pfsm,
433 		    hammer_tid_t trunc_tid)
434 {
435 	struct hammer_cmirror cmirror;
436 	struct hammer_cursor cursor;
437 	struct hammer_base_elm key_cur;
438 	int error;
439 	int seq;
440 
441 	bzero(&cmirror, sizeof(cmirror));
442 	bzero(&key_cur, sizeof(key_cur));
443 	key_cur.localization = HAMMER_MIN_LOCALIZATION | pfsm->localization;
444 	key_cur.obj_id = HAMMER_MIN_OBJID;
445 	key_cur.key = HAMMER_MIN_KEY;
446 	key_cur.create_tid = 1;
447 	key_cur.rec_type = HAMMER_MIN_RECTYPE;
448 
449 	seq = trans->hmp->flusher.done;
450 
451 retry:
452 	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
453 	if (error) {
454 		hammer_done_cursor(&cursor);
455 		goto failed;
456 	}
457 	cursor.key_beg = key_cur;
458 	cursor.key_end.localization = HAMMER_MAX_LOCALIZATION |
459 				      pfsm->localization;
460 	cursor.key_end.obj_id = HAMMER_MAX_OBJID;
461 	cursor.key_end.key = HAMMER_MAX_KEY;
462 	cursor.key_end.create_tid = HAMMER_MAX_TID;
463 	cursor.key_end.rec_type = HAMMER_MAX_RECTYPE;
464 
465 	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
466 	cursor.flags |= HAMMER_CURSOR_BACKEND;
467 
468 	/*
469 	 * Do an optimized scan of only records created or modified
470 	 * >= trunc_tid, so we can fix up those records.  We must
471 	 * still check the TIDs but this greatly reduces the size of
472 	 * the scan.
473 	 */
474 	cursor.flags |= HAMMER_CURSOR_MIRROR_FILTERED;
475 	cursor.cmirror = &cmirror;
476 	cmirror.mirror_tid = trunc_tid;
477 
478 	error = hammer_btree_first(&cursor);
479 	while (error == 0) {
480 		/*
481 		 * Abort the rollback.
482 		 */
483 		if (error == 0) {
484 			error = hammer_signal_check(trans->hmp);
485 			if (error)
486 				break;
487 		}
488 
489 		/*
490 		 * We only care about leafs.  Internal nodes can be returned
491 		 * in mirror-filtered mode (they are used to generate SKIP
492 		 * mrecords), but we don't need them for this code.
493 		 *
494 		 * WARNING: See warnings in hammer_unlock_cursor() function.
495 		 */
496 		cursor.flags |= HAMMER_CURSOR_ATEDISK;
497 		if (cursor.node->ondisk->type == HAMMER_BTREE_TYPE_LEAF) {
498 			key_cur = cursor.node->ondisk->elms[cursor.index].base;
499 			error = hammer_pfs_delete_at_cursor(&cursor, trunc_tid);
500 		}
501 
502 		while (hammer_flusher_meta_halflimit(trans->hmp) ||
503 		       hammer_flusher_undo_exhausted(trans, 2)) {
504 			hammer_unlock_cursor(&cursor);
505 			hammer_flusher_wait(trans->hmp, seq);
506 			hammer_lock_cursor(&cursor);
507 			seq = hammer_flusher_async_one(trans->hmp);
508 		}
509 
510 		if (error == 0)
511 			error = hammer_btree_iterate(&cursor);
512 	}
513 	if (error == ENOENT)
514 		error = 0;
515 	hammer_done_cursor(&cursor);
516 	if (error == EDEADLK)
517 		goto retry;
518 failed:
519 	return(error);
520 }
521 
522 /*
523  * Helper function - perform rollback on a B-Tree element given trunc_tid.
524  *
525  * If create_tid >= trunc_tid the record is physically destroyed.
526  * If delete_tid >= trunc_tid it will be set to 0, undeleting the record.
527  */
528 static
529 int
530 hammer_pfs_delete_at_cursor(hammer_cursor_t cursor, hammer_tid_t trunc_tid)
531 {
532 	hammer_btree_leaf_elm_t elm;
533 	int error;
534 
535 	elm = &cursor->node->ondisk->elms[cursor->index].leaf;
536 	if (elm->base.create_tid < trunc_tid &&
537 	    elm->base.delete_tid < trunc_tid) {
538 		return(0);
539 	}
540 
541 	if (elm->base.create_tid >= trunc_tid) {
542 		error = hammer_delete_at_cursor(
543 				cursor, HAMMER_DELETE_DESTROY,
544 				cursor->trans->tid, cursor->trans->time32,
545 				1, NULL);
546 	} else if (elm->base.delete_tid >= trunc_tid) {
547 		error = hammer_delete_at_cursor(
548 				cursor, HAMMER_DELETE_ADJUST,
549 				0, 0,
550 				1, NULL);
551 	} else {
552 		error = 0;
553 	}
554 	return(error);
555 }
556 
557