1 /* 2 * Copyright (c) 2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.19 2008/09/23 21:03:52 dillon Exp $ 35 */ 36 37 #include "hammer.h" 38 39 /* 40 * Iterate through the specified range of object ids and remove any 41 * deleted records that fall entirely within a prune modulo. 42 * 43 * A reverse iteration is used to prevent overlapping records from being 44 * created during the iteration due to alignments. This also allows us 45 * to adjust alignments without blowing up the B-Tree. 46 */ 47 static int prune_should_delete(struct hammer_ioc_prune *prune, 48 hammer_btree_leaf_elm_t elm); 49 static void prune_check_nlinks(hammer_cursor_t cursor, 50 hammer_btree_leaf_elm_t elm); 51 52 int 53 hammer_ioc_prune(hammer_transaction_t trans, hammer_inode_t ip, 54 struct hammer_ioc_prune *prune) 55 { 56 struct hammer_cursor cursor; 57 hammer_btree_leaf_elm_t elm; 58 struct hammer_ioc_prune_elm *copy_elms; 59 struct hammer_ioc_prune_elm *user_elms; 60 int error; 61 int isdir; 62 int elm_array_size; 63 int seq; 64 65 if (prune->nelms < 0 || prune->nelms > HAMMER_MAX_PRUNE_ELMS) 66 return(EINVAL); 67 if ((prune->key_beg.localization | prune->key_end.localization) & 68 HAMMER_LOCALIZE_PSEUDOFS_MASK) { 69 return(EINVAL); 70 } 71 if (prune->key_beg.localization > prune->key_end.localization) 72 return(EINVAL); 73 if (prune->key_beg.localization == prune->key_end.localization) { 74 if (prune->key_beg.obj_id > prune->key_end.obj_id) 75 return(EINVAL); 76 /* key-space limitations - no check needed */ 77 } 78 if ((prune->head.flags & HAMMER_IOC_PRUNE_ALL) && prune->nelms) 79 return(EINVAL); 80 81 prune->key_cur.localization = (prune->key_end.localization & 82 HAMMER_LOCALIZE_MASK) + 83 ip->obj_localization; 84 prune->key_cur.obj_id = prune->key_end.obj_id; 85 prune->key_cur.key = HAMMER_MAX_KEY; 86 87 /* 88 * Copy element array from userland 89 */ 90 elm_array_size = sizeof(*copy_elms) * prune->nelms; 91 user_elms = prune->elms; 92 copy_elms = kmalloc(elm_array_size, M_TEMP, M_WAITOK); 93 if ((error = copyin(user_elms, copy_elms, elm_array_size)) != 0) 94 goto failed; 95 prune->elms = copy_elms; 96 97 seq = trans->hmp->flusher.act; 98 99 /* 100 * Scan backwards. Retries typically occur if a deadlock is detected. 101 */ 102 retry: 103 error = hammer_init_cursor(trans, &cursor, NULL, NULL); 104 if (error) { 105 hammer_done_cursor(&cursor); 106 goto failed; 107 } 108 cursor.key_beg.localization = (prune->key_beg.localization & 109 HAMMER_LOCALIZE_MASK) + 110 ip->obj_localization; 111 cursor.key_beg.obj_id = prune->key_beg.obj_id; 112 cursor.key_beg.key = HAMMER_MIN_KEY; 113 cursor.key_beg.create_tid = 1; 114 cursor.key_beg.delete_tid = 0; 115 cursor.key_beg.rec_type = HAMMER_MIN_RECTYPE; 116 cursor.key_beg.obj_type = 0; 117 118 cursor.key_end.localization = prune->key_cur.localization; 119 cursor.key_end.obj_id = prune->key_cur.obj_id; 120 cursor.key_end.key = prune->key_cur.key; 121 cursor.key_end.create_tid = HAMMER_MAX_TID - 1; 122 cursor.key_end.delete_tid = 0; 123 cursor.key_end.rec_type = HAMMER_MAX_RECTYPE; 124 cursor.key_end.obj_type = 0; 125 126 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; 127 cursor.flags |= HAMMER_CURSOR_BACKEND; 128 129 /* 130 * This flag allows the B-Tree code to clean up loose ends. At 131 * the moment (XXX) it also means we have to hold the sync lock 132 * through the iteration. 133 */ 134 cursor.flags |= HAMMER_CURSOR_PRUNING; 135 136 hammer_sync_lock_sh(trans); 137 error = hammer_btree_last(&cursor); 138 hammer_sync_unlock(trans); 139 140 while (error == 0) { 141 /* 142 * Check for work 143 */ 144 elm = &cursor.node->ondisk->elms[cursor.index].leaf; 145 prune->key_cur = elm->base; 146 147 /* 148 * Yield to more important tasks 149 */ 150 if ((error = hammer_signal_check(trans->hmp)) != 0) 151 break; 152 153 if (prune->stat_oldest_tid > elm->base.create_tid) 154 prune->stat_oldest_tid = elm->base.create_tid; 155 156 if (hammer_debug_general & 0x0200) { 157 kprintf("check %016llx %016llx cre=%016llx del=%016llx\n", 158 elm->base.obj_id, 159 elm->base.key, 160 elm->base.create_tid, 161 elm->base.delete_tid); 162 } 163 164 if (prune_should_delete(prune, elm)) { 165 if (hammer_debug_general & 0x0200) { 166 kprintf("check %016llx %016llx: DELETE\n", 167 elm->base.obj_id, elm->base.key); 168 } 169 170 /* 171 * NOTE: This can return EDEADLK 172 * 173 * Acquiring the sync lock guarantees that the 174 * operation will not cross a synchronization 175 * boundary (see the flusher). 176 * 177 * We dont need to track inodes or next_tid when 178 * we are destroying deleted records. 179 */ 180 isdir = (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY); 181 182 hammer_sync_lock_sh(trans); 183 error = hammer_delete_at_cursor(&cursor, 184 HAMMER_DELETE_DESTROY, 185 cursor.trans->tid, 186 cursor.trans->time32, 187 0, &prune->stat_bytes); 188 hammer_sync_unlock(trans); 189 if (error) 190 break; 191 192 if (isdir) 193 ++prune->stat_dirrecords; 194 else 195 ++prune->stat_rawrecords; 196 197 /* 198 * The current record might now be the one after 199 * the one we deleted, set ATEDISK to force us 200 * to skip it (since we are iterating backwards). 201 */ 202 cursor.flags |= HAMMER_CURSOR_ATEDISK; 203 } else { 204 /* 205 * Nothing to delete, but we may have to check other 206 * things. 207 */ 208 prune_check_nlinks(&cursor, elm); 209 cursor.flags |= HAMMER_CURSOR_ATEDISK; 210 if (hammer_debug_general & 0x0100) { 211 kprintf("check %016llx %016llx: SKIP\n", 212 elm->base.obj_id, elm->base.key); 213 } 214 } 215 ++prune->stat_scanrecords; 216 217 while (hammer_flusher_meta_halflimit(trans->hmp) || 218 hammer_flusher_undo_exhausted(trans, 2)) { 219 hammer_unlock_cursor(&cursor); 220 hammer_flusher_wait(trans->hmp, seq); 221 hammer_lock_cursor(&cursor); 222 seq = hammer_flusher_async_one(trans->hmp); 223 } 224 hammer_sync_lock_sh(trans); 225 error = hammer_btree_iterate_reverse(&cursor); 226 hammer_sync_unlock(trans); 227 } 228 if (error == ENOENT) 229 error = 0; 230 hammer_done_cursor(&cursor); 231 if (error == EDEADLK) 232 goto retry; 233 if (error == EINTR) { 234 prune->head.flags |= HAMMER_IOC_HEAD_INTR; 235 error = 0; 236 } 237 failed: 238 prune->key_cur.localization &= HAMMER_LOCALIZE_MASK; 239 prune->elms = user_elms; 240 kfree(copy_elms, M_TEMP); 241 return(error); 242 } 243 244 /* 245 * Check pruning list. The list must be sorted in descending order. 246 * 247 * Return non-zero if the record should be deleted. 248 */ 249 static int 250 prune_should_delete(struct hammer_ioc_prune *prune, hammer_btree_leaf_elm_t elm) 251 { 252 struct hammer_ioc_prune_elm *scan; 253 int i; 254 255 /* 256 * If pruning everything remove all records with a non-zero 257 * delete_tid. 258 */ 259 if (prune->head.flags & HAMMER_IOC_PRUNE_ALL) { 260 if (elm->base.delete_tid != 0) 261 return(1); 262 return(0); 263 } 264 265 for (i = 0; i < prune->nelms; ++i) { 266 scan = &prune->elms[i]; 267 268 /* 269 * Check for loop termination. 270 */ 271 if (elm->base.create_tid >= scan->end_tid || 272 elm->base.delete_tid > scan->end_tid) { 273 break; 274 } 275 276 /* 277 * Determine if we can delete the record. 278 */ 279 if (elm->base.delete_tid && 280 elm->base.create_tid >= scan->beg_tid && 281 elm->base.delete_tid <= scan->end_tid && 282 (elm->base.create_tid - scan->beg_tid) / scan->mod_tid == 283 (elm->base.delete_tid - scan->beg_tid) / scan->mod_tid) { 284 return(1); 285 } 286 } 287 return(0); 288 } 289 290 /* 291 * Dangling inodes can occur if processes are holding open descriptors on 292 * deleted files as-of when a machine crashes. When we find one simply 293 * acquire the inode and release it. The inode handling code will then 294 * do the right thing. 295 */ 296 static 297 void 298 prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm) 299 { 300 hammer_inode_t ip; 301 int error; 302 303 if (elm->base.rec_type != HAMMER_RECTYPE_INODE) 304 return; 305 if (elm->base.delete_tid != 0) 306 return; 307 if (hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA)) 308 return; 309 if (cursor->data->inode.nlinks) 310 return; 311 hammer_cursor_downgrade(cursor); 312 ip = hammer_get_inode(cursor->trans, NULL, elm->base.obj_id, 313 HAMMER_MAX_TID, 314 elm->base.localization & HAMMER_LOCALIZE_PSEUDOFS_MASK, 315 0, &error); 316 if (ip) { 317 if (hammer_debug_general & 0x0001) { 318 kprintf("pruning disconnected inode %016llx\n", 319 elm->base.obj_id); 320 } 321 hammer_rel_inode(ip, 0); 322 hammer_inode_waitreclaims(cursor->trans->hmp); 323 } else { 324 kprintf("unable to prune disconnected inode %016llx\n", 325 elm->base.obj_id); 326 } 327 } 328 329 #if 0 330 331 /* 332 * NOTE: THIS CODE HAS BEEN REMOVED! Pruning no longer attempts to realign 333 * adjacent records because it seriously interferes with every 334 * mirroring algorithm I could come up with. 335 * 336 * This means that historical accesses beyond the first snapshot 337 * softlink should be on snapshot boundaries only. Historical 338 * accesses from "now" to the first snapshot softlink continue to 339 * be fine-grained. 340 * 341 * NOTE: It also looks like there's a bug in the removed code. It is believed 342 * that create_tid can sometimes get set to 0xffffffffffffffff. Just as 343 * well we no longer try to do this fancy shit. Probably the attempt to 344 * correct the rhb is blowing up the cursor's indexing or addressing mapping. 345 * 346 * Align the record to cover any gaps created through the deletion of 347 * records within the pruning space. If we were to just delete the records 348 * there would be gaps which in turn would cause a snapshot that is NOT on 349 * a pruning boundary to appear corrupt to the user. Forcing alignment 350 * of the create_tid and delete_tid for retained records 'reconnects' 351 * the previously contiguous space, making it contiguous again after the 352 * deletions. 353 * 354 * The use of a reverse iteration allows us to safely align the records and 355 * related elements without creating temporary overlaps. XXX we should 356 * add ordering dependancies for record buffers to guarantee consistency 357 * during recovery. 358 */ 359 static int 360 realign_prune(struct hammer_ioc_prune *prune, 361 hammer_cursor_t cursor, int realign_cre, int realign_del) 362 { 363 struct hammer_ioc_prune_elm *scan; 364 hammer_btree_elm_t elm; 365 hammer_tid_t delta; 366 hammer_tid_t tid; 367 int error; 368 369 hammer_cursor_downgrade(cursor); 370 371 elm = &cursor->node->ondisk->elms[cursor->index]; 372 ++prune->stat_realignments; 373 374 /* 375 * Align the create_tid. By doing a reverse iteration we guarantee 376 * that all records after our current record have already been 377 * aligned, allowing us to safely correct the right-hand-boundary 378 * (because no record to our right is otherwise exactly matching 379 * will have a create_tid to the left of our aligned create_tid). 380 */ 381 error = 0; 382 if (realign_cre >= 0) { 383 scan = &prune->elms[realign_cre]; 384 385 delta = (elm->leaf.base.create_tid - scan->beg_tid) % 386 scan->mod_tid; 387 if (delta) { 388 tid = elm->leaf.base.create_tid - delta + scan->mod_tid; 389 390 /* can EDEADLK */ 391 error = hammer_btree_correct_rhb(cursor, tid + 1); 392 if (error == 0) { 393 error = hammer_btree_extract(cursor, 394 HAMMER_CURSOR_GET_LEAF); 395 } 396 if (error == 0) { 397 /* can EDEADLK */ 398 error = hammer_cursor_upgrade(cursor); 399 } 400 if (error == 0) { 401 hammer_modify_node(cursor->trans, cursor->node, 402 &elm->leaf.base.create_tid, 403 sizeof(elm->leaf.base.create_tid)); 404 elm->leaf.base.create_tid = tid; 405 hammer_modify_node_done(cursor->node); 406 } 407 } 408 } 409 410 /* 411 * Align the delete_tid. This only occurs if the record is historical 412 * was deleted at some point. Realigning the delete_tid does not 413 * move the record within the B-Tree but may cause it to temporarily 414 * overlap a record that has not yet been pruned. 415 */ 416 if (error == 0 && realign_del >= 0) { 417 scan = &prune->elms[realign_del]; 418 419 delta = (elm->leaf.base.delete_tid - scan->beg_tid) % 420 scan->mod_tid; 421 if (delta) { 422 error = hammer_btree_extract(cursor, 423 HAMMER_CURSOR_GET_LEAF); 424 if (error == 0) { 425 hammer_modify_node(cursor->trans, cursor->node, 426 &elm->leaf.base.delete_tid, 427 sizeof(elm->leaf.base.delete_tid)); 428 elm->leaf.base.delete_tid = 429 elm->leaf.base.delete_tid - 430 delta + scan->mod_tid; 431 hammer_modify_node_done(cursor->node); 432 } 433 } 434 } 435 return (error); 436 } 437 438 #endif 439