/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_prune.c,v 1.19 2008/09/23 21:03:52 dillon Exp $
 */

#include "hammer.h"

/*
 * Iterate through the specified range of object ids and remove any
 * deleted records that fall entirely within a prune modulo.
 *
 * A reverse iteration is used to prevent overlapping records from being
 * created during the iteration due to alignments.  This also allows us
 * to adjust alignments without blowing up the B-Tree.
 */
static int prune_should_delete(struct hammer_ioc_prune *prune,
			hammer_btree_leaf_elm_t elm);
static void prune_check_nlinks(hammer_cursor_t cursor,
			hammer_btree_leaf_elm_t elm);

/*
 * Implement the HAMMERIOC_PRUNE ioctl: reverse-scan the B-Tree over the
 * caller-specified localization/obj_id range and destroy deleted records
 * that prune_should_delete() matches against the user-supplied element
 * list (or all deleted records when HAMMER_IOC_PRUNE_ALL is set).
 *
 * trans  - active HAMMER transaction for the scan.
 * ip     - inode whose obj_localization anchors the pruned PFS.
 * prune  - in/out ioctl structure; key_cur and the stat_* fields are
 *          updated as the scan progresses so an interrupted operation
 *          can report where it stopped.
 *
 * Returns 0 on success or a kernel errno (EINVAL on bad arguments, or an
 * error from copyin()/cursor operations).  EDEADLK restarts the scan
 * internally; EINTR is converted to success with HAMMER_IOC_HEAD_INTR set.
 */
int
hammer_ioc_prune(hammer_transaction_t trans, hammer_inode_t ip,
		 struct hammer_ioc_prune *prune)
{
	struct hammer_cursor cursor;
	hammer_btree_leaf_elm_t elm;
	struct hammer_ioc_prune_elm *copy_elms;	/* kernel copy of elm array */
	struct hammer_ioc_prune_elm *user_elms;	/* saved userland pointer */
	int error;
	int isdir;
	int elm_array_size;
	int seq;

	/*
	 * Validate the ioctl arguments: element count in range, no
	 * pseudofs bits in the localization keys, and begin <= end.
	 * PRUNE_ALL and an explicit element list are mutually exclusive.
	 */
	if (prune->nelms < 0 || prune->nelms > HAMMER_MAX_PRUNE_ELMS)
		return(EINVAL);
	if ((prune->key_beg.localization | prune->key_end.localization) &
	    HAMMER_LOCALIZE_PSEUDOFS_MASK) {
		return(EINVAL);
	}
	if (prune->key_beg.localization > prune->key_end.localization)
		return(EINVAL);
	if (prune->key_beg.localization == prune->key_end.localization) {
		if (prune->key_beg.obj_id > prune->key_end.obj_id)
			return(EINVAL);
		/* key-space limitations - no check needed */
	}
	if ((prune->head.flags & HAMMER_IOC_PRUNE_ALL) && prune->nelms)
		return(EINVAL);

	/*
	 * Start the cursor at the end of the range; the scan runs in
	 * reverse.  key_cur tracks progress and is copied back to the
	 * user on completion/interrupt.
	 */
	prune->key_cur.localization = (prune->key_end.localization &
				       HAMMER_LOCALIZE_MASK) +
				      ip->obj_localization;
	prune->key_cur.obj_id = prune->key_end.obj_id;
	prune->key_cur.key = HAMMER_MAX_KEY;

	/*
	 * Copy element array from userland
	 */
	elm_array_size = sizeof(*copy_elms) * prune->nelms;
	user_elms = prune->elms;
	copy_elms = kmalloc(elm_array_size, M_TEMP, M_WAITOK);
	if ((error = copyin(user_elms, copy_elms, elm_array_size)) != 0)
		goto failed;
	prune->elms = copy_elms;

	seq = trans->hmp->flusher.act;

	/*
	 * Scan backwards.  Retries typically occur if a deadlock is detected.
	 */
retry:
	error = hammer_init_cursor(trans, &cursor, NULL, NULL);
	if (error) {
		hammer_done_cursor(&cursor);
		goto failed;
	}
	cursor.key_beg.localization = (prune->key_beg.localization &
				       HAMMER_LOCALIZE_MASK) +
				      ip->obj_localization;
	cursor.key_beg.obj_id = prune->key_beg.obj_id;
	cursor.key_beg.key = HAMMER_MIN_KEY;
	cursor.key_beg.create_tid = 1;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_MIN_RECTYPE;
	cursor.key_beg.obj_type = 0;

	/*
	 * key_end picks up from key_cur so a retry after EDEADLK resumes
	 * where the previous pass left off rather than rescanning.
	 */
	cursor.key_end.localization = prune->key_cur.localization;
	cursor.key_end.obj_id = prune->key_cur.obj_id;
	cursor.key_end.key = prune->key_cur.key;
	cursor.key_end.create_tid = HAMMER_MAX_TID - 1;
	cursor.key_end.delete_tid = 0;
	cursor.key_end.rec_type = HAMMER_MAX_RECTYPE;
	cursor.key_end.obj_type = 0;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * This flag allows the B-Tree code to clean up loose ends.  At
	 * the moment (XXX) it also means we have to hold the sync lock
	 * through the iteration.
	 */
	cursor.flags |= HAMMER_CURSOR_PRUNING;

	hammer_sync_lock_sh(trans);
	error = hammer_btree_last(&cursor);
	hammer_sync_unlock(trans);

	while (error == 0) {
		/*
		 * Check for work
		 */
		elm = &cursor.node->ondisk->elms[cursor.index].leaf;
		prune->key_cur = elm->base;

		/*
		 * Yield to more important tasks
		 */
		if ((error = hammer_signal_check(trans->hmp)) != 0)
			break;

		/* track the oldest create_tid seen for the stats report */
		if (prune->stat_oldest_tid > elm->base.create_tid)
			prune->stat_oldest_tid = elm->base.create_tid;

		if (hammer_debug_general & 0x0200) {
			kprintf("check %016llx %016llx cre=%016llx del=%016llx\n",
				(long long)elm->base.obj_id,
				(long long)elm->base.key,
				(long long)elm->base.create_tid,
				(long long)elm->base.delete_tid);
		}

		if (prune_should_delete(prune, elm)) {
			if (hammer_debug_general & 0x0200) {
				kprintf("check %016llx %016llx: DELETE\n",
					(long long)elm->base.obj_id,
					(long long)elm->base.key);
			}

			/*
			 * NOTE: This can return EDEADLK
			 *
			 * Acquiring the sync lock guarantees that the
			 * operation will not cross a synchronization
			 * boundary (see the flusher).
			 *
			 * We dont need to track inodes or next_tid when
			 * we are destroying deleted records.
			 */
			isdir = (elm->base.rec_type == HAMMER_RECTYPE_DIRENTRY);

			hammer_sync_lock_sh(trans);
			error = hammer_delete_at_cursor(&cursor,
							HAMMER_DELETE_DESTROY,
							cursor.trans->tid,
							cursor.trans->time32,
							0, &prune->stat_bytes);
			hammer_sync_unlock(trans);
			if (error)
				break;

			if (isdir)
				++prune->stat_dirrecords;
			else
				++prune->stat_rawrecords;

			/*
			 * The current record might now be the one after
			 * the one we deleted, set ATEDISK to force us
			 * to skip it (since we are iterating backwards).
			 */
			cursor.flags |= HAMMER_CURSOR_ATEDISK;
		} else {
			/*
			 * Nothing to delete, but we may have to check other
			 * things.
			 */
			prune_check_nlinks(&cursor, elm);
			cursor.flags |= HAMMER_CURSOR_ATEDISK;
			if (hammer_debug_general & 0x0100) {
				kprintf("check %016llx %016llx: SKIP\n",
					(long long)elm->base.obj_id,
					(long long)elm->base.key);
			}
		}
		++prune->stat_scanrecords;

		/*
		 * Throttle against the flusher: if meta-data buffers or
		 * the UNDO FIFO are filling up, unlock the cursor and wait
		 * for a flush cycle before continuing.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor() function.
		 */
		while (hammer_flusher_meta_halflimit(trans->hmp) ||
		       hammer_flusher_undo_exhausted(trans, 2)) {
			hammer_unlock_cursor(&cursor);
			hammer_flusher_wait(trans->hmp, seq);
			hammer_lock_cursor(&cursor);
			seq = hammer_flusher_async_one(trans->hmp);
		}
		hammer_sync_lock_sh(trans);
		error = hammer_btree_iterate_reverse(&cursor);
		hammer_sync_unlock(trans);
	}
	/* ENOENT simply means the iteration ran off the start of the range */
	if (error == ENOENT)
		error = 0;
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == EINTR) {
		prune->head.flags |= HAMMER_IOC_HEAD_INTR;
		error = 0;
	}
failed:
	/*
	 * Restore the caller's element pointer, strip the kernel-side
	 * localization bits out of key_cur, and free our temporary copy.
	 */
	prune->key_cur.localization &= HAMMER_LOCALIZE_MASK;
	prune->elms = user_elms;
	kfree(copy_elms, M_TEMP);
	return(error);
}

/*
 * Check pruning list.  The list must be sorted in descending order.
 *
 * Return non-zero if the record should be deleted.
 *
 * A record is prunable when it has been deleted (delete_tid != 0) and its
 * entire create_tid..delete_tid lifetime falls within a single modulo
 * interval of one of the prune elements, i.e. no retained snapshot
 * boundary passes through the record's visibility window.
 */
static int
prune_should_delete(struct hammer_ioc_prune *prune, hammer_btree_leaf_elm_t elm)
{
	struct hammer_ioc_prune_elm *scan;
	int i;

	/*
	 * If pruning everything remove all records with a non-zero
	 * delete_tid.
	 */
	if (prune->head.flags & HAMMER_IOC_PRUNE_ALL) {
		if (elm->base.delete_tid != 0)
			return(1);
		return(0);
	}

	for (i = 0; i < prune->nelms; ++i) {
		scan = &prune->elms[i];

		/*
		 * Check for loop termination.  Since elements are sorted
		 * in descending order, once the record extends past this
		 * element's end_tid no later element can match either.
		 */
		if (elm->base.create_tid >= scan->end_tid ||
		    elm->base.delete_tid > scan->end_tid) {
			break;
		}

		/*
		 * Determine if we can delete the record.  The division
		 * compares which mod_tid "bucket" the create and delete
		 * TIDs fall in; equal buckets mean no snapshot boundary
		 * intersects the record's lifetime.
		 */
		if (elm->base.delete_tid &&
		    elm->base.create_tid >= scan->beg_tid &&
		    elm->base.delete_tid <= scan->end_tid &&
		    (elm->base.create_tid - scan->beg_tid) / scan->mod_tid ==
		    (elm->base.delete_tid - scan->beg_tid) / scan->mod_tid) {
			return(1);
		}
	}
	return(0);
}

/*
 * Dangling inodes can occur if processes are holding open descriptors on
 * deleted files as-of when a machine crashes.  When we find one simply
 * acquire the inode and release it.  The inode handling code will then
 * do the right thing.
 *
 * Only live (delete_tid == 0) inode records with nlinks == 0 qualify;
 * everything else returns without side effects.  Failure to acquire the
 * inode is logged but not treated as an error for the scan.
 */
static
void
prune_check_nlinks(hammer_cursor_t cursor, hammer_btree_leaf_elm_t elm)
{
	hammer_inode_t ip;
	int error;

	if (elm->base.rec_type != HAMMER_RECTYPE_INODE)
		return;
	if (elm->base.delete_tid != 0)
		return;
	if (hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA))
		return;
	if (cursor->data->inode.nlinks)
		return;
	/* downgrade so hammer_get_inode() can operate while we hold it */
	hammer_cursor_downgrade(cursor);
	ip = hammer_get_inode(cursor->trans, NULL, elm->base.obj_id,
		      HAMMER_MAX_TID,
		      elm->base.localization & HAMMER_LOCALIZE_PSEUDOFS_MASK,
		      0, &error);
	if (ip) {
		if (hammer_debug_general & 0x0001) {
			kprintf("pruning disconnected inode %016llx\n",
				(long long)elm->base.obj_id);
		}
		hammer_rel_inode(ip, 0);
		hammer_inode_waitreclaims(cursor->trans);
	} else {
		kprintf("unable to prune disconnected inode %016llx\n",
			(long long)elm->base.obj_id);
	}
}

#if 0

/*
 * NOTE: THIS CODE HAS BEEN REMOVED!  Pruning no longer attempts to realign
 *	 adjacent records because it seriously interferes with every
 *	 mirroring algorithm I could come up with.
 *
 *	 This means that historical accesses beyond the first snapshot
 *	 softlink should be on snapshot boundaries only.  Historical
 *	 accesses from "now" to the first snapshot softlink continue to
 *	 be fine-grained.
 *
 * NOTE: It also looks like there's a bug in the removed code.  It is believed
 *	 that create_tid can sometimes get set to 0xffffffffffffffff.  Just as
 *	 well we no longer try to do this fancy shit.  Probably the attempt to
 *	 correct the rhb is blowing up the cursor's indexing or addressing
 *	 mapping.
 *
 * Align the record to cover any gaps created through the deletion of
 * records within the pruning space.  If we were to just delete the records
 * there would be gaps which in turn would cause a snapshot that is NOT on
 * a pruning boundary to appear corrupt to the user.  Forcing alignment
 * of the create_tid and delete_tid for retained records 'reconnects'
 * the previously contiguous space, making it contiguous again after the
 * deletions.
 *
 * The use of a reverse iteration allows us to safely align the records and
 * related elements without creating temporary overlaps.  XXX we should
 * add ordering dependancies for record buffers to guarantee consistency
 * during recovery.
 */
static int
realign_prune(struct hammer_ioc_prune *prune,
	      hammer_cursor_t cursor, int realign_cre, int realign_del)
{
	struct hammer_ioc_prune_elm *scan;
	hammer_btree_elm_t elm;
	hammer_tid_t delta;
	hammer_tid_t tid;
	int error;

	hammer_cursor_downgrade(cursor);

	elm = &cursor->node->ondisk->elms[cursor->index];
	++prune->stat_realignments;

	/*
	 * Align the create_tid.  By doing a reverse iteration we guarantee
	 * that all records after our current record have already been
	 * aligned, allowing us to safely correct the right-hand-boundary
	 * (because no record to our right is otherwise exactly matching
	 * will have a create_tid to the left of our aligned create_tid).
	 */
	error = 0;
	if (realign_cre >= 0) {
		scan = &prune->elms[realign_cre];

		delta = (elm->leaf.base.create_tid - scan->beg_tid) %
			scan->mod_tid;
		if (delta) {
			/* round create_tid up to the next modulo boundary */
			tid = elm->leaf.base.create_tid - delta + scan->mod_tid;

			/* can EDEADLK */
			error = hammer_btree_correct_rhb(cursor, tid + 1);
			if (error == 0) {
				error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			}
			if (error == 0) {
				/* can EDEADLK */
				error = hammer_cursor_upgrade(cursor);
			}
			if (error == 0) {
				hammer_modify_node(cursor->trans, cursor->node,
					    &elm->leaf.base.create_tid,
					    sizeof(elm->leaf.base.create_tid));
				elm->leaf.base.create_tid = tid;
				hammer_modify_node_done(cursor->node);
			}
		}
	}

	/*
	 * Align the delete_tid.  This only occurs if the record is historical
	 * was deleted at some point.  Realigning the delete_tid does not
	 * move the record within the B-Tree but may cause it to temporarily
	 * overlap a record that has not yet been pruned.
	 */
	if (error == 0 && realign_del >= 0) {
		scan = &prune->elms[realign_del];

		delta = (elm->leaf.base.delete_tid - scan->beg_tid) %
			scan->mod_tid;
		if (delta) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			if (error == 0) {
				hammer_modify_node(cursor->trans, cursor->node,
					    &elm->leaf.base.delete_tid,
					    sizeof(elm->leaf.base.delete_tid));
				elm->leaf.base.delete_tid =
					    elm->leaf.base.delete_tid -
					    delta + scan->mod_tid;
				hammer_modify_node_done(cursor->node);
			}
		}
	}
	return (error);
}

#endif