// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/sort.h>
#include <linux/slab.h>
#include <linux/iversion.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/decode.h>

/* unused map expires after 5 minutes */
#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)

/*
 * Snapshots in ceph are driven in large part by cooperation from the
 * client.  In contrast to local file systems or file servers that
 * implement snapshots at a single point in the system, ceph's
 * distributed access to storage requires clients to help decide
 * whether a write logically occurs before or after a recently created
 * snapshot.
 *
 * This provides a perfect instantaneous client-wide snapshot.  Between
 * clients, however, snapshots may appear to be applied at slightly
 * different points in time, depending on delays in delivering the
 * snapshot notification.
 *
 * Snapshots are _not_ file system-wide.  Instead, each snapshot
 * applies to the subdirectory nested beneath some directory.  This
 * effectively divides the hierarchy into multiple "realms," where all
 * of the files contained by each realm share the same set of
 * snapshots.  An individual realm's snap set contains snapshots
 * explicitly created on that realm, as well as any snaps in its
 * parent's snap set _after_ the point at which the parent became its
 * parent (due to, say, a rename).  Similarly, snaps from prior parents
 * during the time intervals during which they were the parent are included.
 *
 * The client is spared most of this detail, fortunately... it must only
 * maintain a hierarchy of realms reflecting the current parent/child
 * realm relationship, and for each realm keep an explicit list of snaps
 * inherited from prior parents.
 *
 * A snap_realm struct is maintained for realms containing every inode
 * with an open cap in the system.  (The needed snap realm information is
 * provided by the MDS whenever a cap is issued, i.e., on open.)  A 'seq'
 * version number is used to ensure that as realm parameters change (new
 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
 *
 * The realm hierarchy drives the generation of a 'snap context' for each
 * realm, which simply lists the resulting set of snaps for the realm.  This
 * is attached to any writes sent to OSDs.
 */
/*
 * Unfortunately error handling is a bit mixed here.  If we get a snap
 * update, but don't have enough memory to update our realm hierarchy,
 * it's not clear what we can do about it (besides complaining to the
 * console).
57963b61ebSSage Weil */ 58963b61ebSSage Weil 59963b61ebSSage Weil 60963b61ebSSage Weil /* 61963b61ebSSage Weil * increase ref count for the realm 62963b61ebSSage Weil * 63df2c0cb7SJeff Layton * caller must hold snap_rwsem. 64963b61ebSSage Weil */ 65963b61ebSSage Weil void ceph_get_snap_realm(struct ceph_mds_client *mdsc, 66963b61ebSSage Weil struct ceph_snap_realm *realm) 67963b61ebSSage Weil { 68df2c0cb7SJeff Layton lockdep_assert_held(&mdsc->snap_rwsem); 69a6862e67SJeff Layton 70963b61ebSSage Weil /* 718434ffe7SJeff Layton * The 0->1 and 1->0 transitions must take the snap_empty_lock 728434ffe7SJeff Layton * atomically with the refcount change. Go ahead and bump the 738434ffe7SJeff Layton * nref here, unless it's 0, in which case we take the spinlock 748434ffe7SJeff Layton * and then do the increment and remove it from the list. 75963b61ebSSage Weil */ 768434ffe7SJeff Layton if (atomic_inc_not_zero(&realm->nref)) 778434ffe7SJeff Layton return; 788434ffe7SJeff Layton 79963b61ebSSage Weil spin_lock(&mdsc->snap_empty_lock); 808434ffe7SJeff Layton if (atomic_inc_return(&realm->nref) == 1) 81963b61ebSSage Weil list_del_init(&realm->empty_item); 82963b61ebSSage Weil spin_unlock(&mdsc->snap_empty_lock); 83963b61ebSSage Weil } 84963b61ebSSage Weil 85a105f00cSSage Weil static void __insert_snap_realm(struct rb_root *root, 86a105f00cSSage Weil struct ceph_snap_realm *new) 87a105f00cSSage Weil { 88a105f00cSSage Weil struct rb_node **p = &root->rb_node; 89a105f00cSSage Weil struct rb_node *parent = NULL; 90a105f00cSSage Weil struct ceph_snap_realm *r = NULL; 91a105f00cSSage Weil 92a105f00cSSage Weil while (*p) { 93a105f00cSSage Weil parent = *p; 94a105f00cSSage Weil r = rb_entry(parent, struct ceph_snap_realm, node); 95a105f00cSSage Weil if (new->ino < r->ino) 96a105f00cSSage Weil p = &(*p)->rb_left; 97a105f00cSSage Weil else if (new->ino > r->ino) 98a105f00cSSage Weil p = &(*p)->rb_right; 99a105f00cSSage Weil else 100a105f00cSSage Weil BUG(); 101a105f00cSSage Weil } 
102a105f00cSSage Weil 103a105f00cSSage Weil rb_link_node(&new->node, parent, p); 104a105f00cSSage Weil rb_insert_color(&new->node, root); 105a105f00cSSage Weil } 106a105f00cSSage Weil 107963b61ebSSage Weil /* 108963b61ebSSage Weil * create and get the realm rooted at @ino and bump its ref count. 109963b61ebSSage Weil * 110963b61ebSSage Weil * caller must hold snap_rwsem for write. 111963b61ebSSage Weil */ 112963b61ebSSage Weil static struct ceph_snap_realm *ceph_create_snap_realm( 113963b61ebSSage Weil struct ceph_mds_client *mdsc, 114963b61ebSSage Weil u64 ino) 115963b61ebSSage Weil { 116963b61ebSSage Weil struct ceph_snap_realm *realm; 117963b61ebSSage Weil 118a6862e67SJeff Layton lockdep_assert_held_write(&mdsc->snap_rwsem); 119a6862e67SJeff Layton 120963b61ebSSage Weil realm = kzalloc(sizeof(*realm), GFP_NOFS); 121963b61ebSSage Weil if (!realm) 122963b61ebSSage Weil return ERR_PTR(-ENOMEM); 123963b61ebSSage Weil 1245ed91587SXiubo Li /* Do not release the global dummy snaprealm until unmouting */ 1255ed91587SXiubo Li if (ino == CEPH_INO_GLOBAL_SNAPREALM) 1265ed91587SXiubo Li atomic_set(&realm->nref, 2); 1275ed91587SXiubo Li else 1285ed91587SXiubo Li atomic_set(&realm->nref, 1); 129963b61ebSSage Weil realm->ino = ino; 130963b61ebSSage Weil INIT_LIST_HEAD(&realm->children); 131963b61ebSSage Weil INIT_LIST_HEAD(&realm->child_item); 132963b61ebSSage Weil INIT_LIST_HEAD(&realm->empty_item); 133ae00d4f3SSage Weil INIT_LIST_HEAD(&realm->dirty_item); 13474a31df4SXiubo Li INIT_LIST_HEAD(&realm->rebuild_item); 135963b61ebSSage Weil INIT_LIST_HEAD(&realm->inodes_with_caps); 136963b61ebSSage Weil spin_lock_init(&realm->inodes_with_caps_lock); 137a105f00cSSage Weil __insert_snap_realm(&mdsc->snap_realms, realm); 13881c5a148SYan, Zheng mdsc->num_snap_realms++; 13981c5a148SYan, Zheng 140ad5255c1SXiubo Li dout("%s %llx %p\n", __func__, realm->ino, realm); 141963b61ebSSage Weil return realm; 142963b61ebSSage Weil } 143963b61ebSSage Weil 144963b61ebSSage Weil /* 145a105f00cSSage 
Weil * lookup the realm rooted at @ino. 146963b61ebSSage Weil * 147df2c0cb7SJeff Layton * caller must hold snap_rwsem. 148963b61ebSSage Weil */ 149982d6011SYan, Zheng static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, 150963b61ebSSage Weil u64 ino) 151963b61ebSSage Weil { 152a105f00cSSage Weil struct rb_node *n = mdsc->snap_realms.rb_node; 153a105f00cSSage Weil struct ceph_snap_realm *r; 154963b61ebSSage Weil 155df2c0cb7SJeff Layton lockdep_assert_held(&mdsc->snap_rwsem); 156a6862e67SJeff Layton 157a105f00cSSage Weil while (n) { 158a105f00cSSage Weil r = rb_entry(n, struct ceph_snap_realm, node); 159a105f00cSSage Weil if (ino < r->ino) 160a105f00cSSage Weil n = n->rb_left; 161a105f00cSSage Weil else if (ino > r->ino) 162a105f00cSSage Weil n = n->rb_right; 163a105f00cSSage Weil else { 164ad5255c1SXiubo Li dout("%s %llx %p\n", __func__, r->ino, r); 165a105f00cSSage Weil return r; 166a105f00cSSage Weil } 167a105f00cSSage Weil } 168a105f00cSSage Weil return NULL; 169963b61ebSSage Weil } 170963b61ebSSage Weil 171982d6011SYan, Zheng struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 172982d6011SYan, Zheng u64 ino) 173982d6011SYan, Zheng { 174982d6011SYan, Zheng struct ceph_snap_realm *r; 175982d6011SYan, Zheng r = __lookup_snap_realm(mdsc, ino); 176982d6011SYan, Zheng if (r) 177982d6011SYan, Zheng ceph_get_snap_realm(mdsc, r); 178982d6011SYan, Zheng return r; 179982d6011SYan, Zheng } 180982d6011SYan, Zheng 181963b61ebSSage Weil static void __put_snap_realm(struct ceph_mds_client *mdsc, 182963b61ebSSage Weil struct ceph_snap_realm *realm); 183963b61ebSSage Weil 184963b61ebSSage Weil /* 185963b61ebSSage Weil * called with snap_rwsem (write) 186963b61ebSSage Weil */ 187963b61ebSSage Weil static void __destroy_snap_realm(struct ceph_mds_client *mdsc, 188963b61ebSSage Weil struct ceph_snap_realm *realm) 189963b61ebSSage Weil { 190a6862e67SJeff Layton lockdep_assert_held_write(&mdsc->snap_rwsem); 191a6862e67SJeff Layton 
192ad5255c1SXiubo Li dout("%s %p %llx\n", __func__, realm, realm->ino); 193963b61ebSSage Weil 194a105f00cSSage Weil rb_erase(&realm->node, &mdsc->snap_realms); 19581c5a148SYan, Zheng mdsc->num_snap_realms--; 196963b61ebSSage Weil 197963b61ebSSage Weil if (realm->parent) { 198963b61ebSSage Weil list_del_init(&realm->child_item); 199963b61ebSSage Weil __put_snap_realm(mdsc, realm->parent); 200963b61ebSSage Weil } 201963b61ebSSage Weil 202963b61ebSSage Weil kfree(realm->prior_parent_snaps); 203963b61ebSSage Weil kfree(realm->snaps); 204963b61ebSSage Weil ceph_put_snap_context(realm->cached_context); 205963b61ebSSage Weil kfree(realm); 206963b61ebSSage Weil } 207963b61ebSSage Weil 208963b61ebSSage Weil /* 209963b61ebSSage Weil * caller holds snap_rwsem (write) 210963b61ebSSage Weil */ 211963b61ebSSage Weil static void __put_snap_realm(struct ceph_mds_client *mdsc, 212963b61ebSSage Weil struct ceph_snap_realm *realm) 213963b61ebSSage Weil { 214a6862e67SJeff Layton lockdep_assert_held_write(&mdsc->snap_rwsem); 215a6862e67SJeff Layton 2168434ffe7SJeff Layton /* 2178434ffe7SJeff Layton * We do not require the snap_empty_lock here, as any caller that 2188434ffe7SJeff Layton * increments the value must hold the snap_rwsem. 2198434ffe7SJeff Layton */ 220963b61ebSSage Weil if (atomic_dec_and_test(&realm->nref)) 221963b61ebSSage Weil __destroy_snap_realm(mdsc, realm); 222963b61ebSSage Weil } 223963b61ebSSage Weil 224963b61ebSSage Weil /* 2258434ffe7SJeff Layton * See comments in ceph_get_snap_realm. Caller needn't hold any locks. 
226963b61ebSSage Weil */ 227963b61ebSSage Weil void ceph_put_snap_realm(struct ceph_mds_client *mdsc, 228963b61ebSSage Weil struct ceph_snap_realm *realm) 229963b61ebSSage Weil { 2308434ffe7SJeff Layton if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock)) 231963b61ebSSage Weil return; 232963b61ebSSage Weil 233963b61ebSSage Weil if (down_write_trylock(&mdsc->snap_rwsem)) { 2348434ffe7SJeff Layton spin_unlock(&mdsc->snap_empty_lock); 235963b61ebSSage Weil __destroy_snap_realm(mdsc, realm); 236963b61ebSSage Weil up_write(&mdsc->snap_rwsem); 237963b61ebSSage Weil } else { 238a26a185dSHenry C Chang list_add(&realm->empty_item, &mdsc->snap_empty); 239963b61ebSSage Weil spin_unlock(&mdsc->snap_empty_lock); 240963b61ebSSage Weil } 241963b61ebSSage Weil } 242963b61ebSSage Weil 243963b61ebSSage Weil /* 244963b61ebSSage Weil * Clean up any realms whose ref counts have dropped to zero. Note 245963b61ebSSage Weil * that this does not include realms who were created but not yet 246963b61ebSSage Weil * used. 
247963b61ebSSage Weil * 248963b61ebSSage Weil * Called under snap_rwsem (write) 249963b61ebSSage Weil */ 250963b61ebSSage Weil static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) 251963b61ebSSage Weil { 252963b61ebSSage Weil struct ceph_snap_realm *realm; 253963b61ebSSage Weil 254a6862e67SJeff Layton lockdep_assert_held_write(&mdsc->snap_rwsem); 255a6862e67SJeff Layton 256963b61ebSSage Weil spin_lock(&mdsc->snap_empty_lock); 257963b61ebSSage Weil while (!list_empty(&mdsc->snap_empty)) { 258963b61ebSSage Weil realm = list_first_entry(&mdsc->snap_empty, 259963b61ebSSage Weil struct ceph_snap_realm, empty_item); 260963b61ebSSage Weil list_del(&realm->empty_item); 261963b61ebSSage Weil spin_unlock(&mdsc->snap_empty_lock); 262963b61ebSSage Weil __destroy_snap_realm(mdsc, realm); 263963b61ebSSage Weil spin_lock(&mdsc->snap_empty_lock); 264963b61ebSSage Weil } 265963b61ebSSage Weil spin_unlock(&mdsc->snap_empty_lock); 266963b61ebSSage Weil } 267963b61ebSSage Weil 2685ed91587SXiubo Li void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) 269963b61ebSSage Weil { 2705ed91587SXiubo Li struct ceph_snap_realm *global_realm; 2715ed91587SXiubo Li 272963b61ebSSage Weil down_write(&mdsc->snap_rwsem); 2735ed91587SXiubo Li global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); 2745ed91587SXiubo Li if (global_realm) 2755ed91587SXiubo Li ceph_put_snap_realm(mdsc, global_realm); 276963b61ebSSage Weil __cleanup_empty_realms(mdsc); 277963b61ebSSage Weil up_write(&mdsc->snap_rwsem); 278963b61ebSSage Weil } 279963b61ebSSage Weil 280963b61ebSSage Weil /* 281963b61ebSSage Weil * adjust the parent realm of a given @realm. adjust child list, and parent 282963b61ebSSage Weil * pointers, and ref counts appropriately. 283963b61ebSSage Weil * 284963b61ebSSage Weil * return true if parent was changed, 0 if unchanged, <0 on error. 285963b61ebSSage Weil * 286963b61ebSSage Weil * caller must hold snap_rwsem for write. 
287963b61ebSSage Weil */ 288963b61ebSSage Weil static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, 289963b61ebSSage Weil struct ceph_snap_realm *realm, 290963b61ebSSage Weil u64 parentino) 291963b61ebSSage Weil { 292963b61ebSSage Weil struct ceph_snap_realm *parent; 293963b61ebSSage Weil 294a6862e67SJeff Layton lockdep_assert_held_write(&mdsc->snap_rwsem); 295a6862e67SJeff Layton 296963b61ebSSage Weil if (realm->parent_ino == parentino) 297963b61ebSSage Weil return 0; 298963b61ebSSage Weil 299963b61ebSSage Weil parent = ceph_lookup_snap_realm(mdsc, parentino); 300963b61ebSSage Weil if (!parent) { 301963b61ebSSage Weil parent = ceph_create_snap_realm(mdsc, parentino); 302963b61ebSSage Weil if (IS_ERR(parent)) 303963b61ebSSage Weil return PTR_ERR(parent); 304963b61ebSSage Weil } 305ad5255c1SXiubo Li dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, 306ad5255c1SXiubo Li realm, realm->parent_ino, realm->parent, parentino, parent); 307963b61ebSSage Weil if (realm->parent) { 308963b61ebSSage Weil list_del_init(&realm->child_item); 309963b61ebSSage Weil ceph_put_snap_realm(mdsc, realm->parent); 310963b61ebSSage Weil } 311963b61ebSSage Weil realm->parent_ino = parentino; 312963b61ebSSage Weil realm->parent = parent; 313963b61ebSSage Weil list_add(&realm->child_item, &parent->children); 314963b61ebSSage Weil return 1; 315963b61ebSSage Weil } 316963b61ebSSage Weil 317963b61ebSSage Weil 318963b61ebSSage Weil static int cmpu64_rev(const void *a, const void *b) 319963b61ebSSage Weil { 320963b61ebSSage Weil if (*(u64 *)a < *(u64 *)b) 321963b61ebSSage Weil return 1; 322963b61ebSSage Weil if (*(u64 *)a > *(u64 *)b) 323963b61ebSSage Weil return -1; 324963b61ebSSage Weil return 0; 325963b61ebSSage Weil } 326963b61ebSSage Weil 32797c85a82SYan, Zheng 328963b61ebSSage Weil /* 329963b61ebSSage Weil * build the snap context for a given realm. 
330963b61ebSSage Weil */ 3313ae0bebcSYan, Zheng static int build_snap_context(struct ceph_snap_realm *realm, 33274a31df4SXiubo Li struct list_head *realm_queue, 3333ae0bebcSYan, Zheng struct list_head *dirty_realms) 334963b61ebSSage Weil { 335963b61ebSSage Weil struct ceph_snap_realm *parent = realm->parent; 336963b61ebSSage Weil struct ceph_snap_context *snapc; 337963b61ebSSage Weil int err = 0; 338aa711ee3SAlex Elder u32 num = realm->num_prior_parent_snaps + realm->num_snaps; 339963b61ebSSage Weil 340963b61ebSSage Weil /* 341963b61ebSSage Weil * build parent context, if it hasn't been built. 342963b61ebSSage Weil * conservatively estimate that all parent snaps might be 343963b61ebSSage Weil * included by us. 344963b61ebSSage Weil */ 345963b61ebSSage Weil if (parent) { 346963b61ebSSage Weil if (!parent->cached_context) { 34774a31df4SXiubo Li /* add to the queue head */ 34874a31df4SXiubo Li list_add(&parent->rebuild_item, realm_queue); 34974a31df4SXiubo Li return 1; 350963b61ebSSage Weil } 351963b61ebSSage Weil num += parent->cached_context->num_snaps; 352963b61ebSSage Weil } 353963b61ebSSage Weil 354963b61ebSSage Weil /* do i actually need to update? not if my context seq 355963b61ebSSage Weil matches realm seq, and my parents' does to. (this works 356963b61ebSSage Weil because we rebuild_snap_realms() works _downward_ in 357963b61ebSSage Weil hierarchy after each update.) 
*/ 358963b61ebSSage Weil if (realm->cached_context && 359ec4318bcSSage Weil realm->cached_context->seq == realm->seq && 360963b61ebSSage Weil (!parent || 361ec4318bcSSage Weil realm->cached_context->seq >= parent->cached_context->seq)) { 362ad5255c1SXiubo Li dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", 363ad5255c1SXiubo Li __func__, realm->ino, realm, realm->cached_context, 364963b61ebSSage Weil realm->cached_context->seq, 365aa711ee3SAlex Elder (unsigned int)realm->cached_context->num_snaps); 366963b61ebSSage Weil return 0; 367963b61ebSSage Weil } 368963b61ebSSage Weil 369963b61ebSSage Weil /* alloc new snap context */ 370963b61ebSSage Weil err = -ENOMEM; 371a3860c1cSXi Wang if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) 372963b61ebSSage Weil goto fail; 373812164f8SAlex Elder snapc = ceph_create_snap_context(num, GFP_NOFS); 374963b61ebSSage Weil if (!snapc) 375963b61ebSSage Weil goto fail; 376963b61ebSSage Weil 377963b61ebSSage Weil /* build (reverse sorted) snap vector */ 378963b61ebSSage Weil num = 0; 379963b61ebSSage Weil snapc->seq = realm->seq; 380963b61ebSSage Weil if (parent) { 381aa711ee3SAlex Elder u32 i; 382aa711ee3SAlex Elder 38325985edcSLucas De Marchi /* include any of parent's snaps occurring _after_ my 384963b61ebSSage Weil parent became my parent */ 385963b61ebSSage Weil for (i = 0; i < parent->cached_context->num_snaps; i++) 386963b61ebSSage Weil if (parent->cached_context->snaps[i] >= 387963b61ebSSage Weil realm->parent_since) 388963b61ebSSage Weil snapc->snaps[num++] = 389963b61ebSSage Weil parent->cached_context->snaps[i]; 390963b61ebSSage Weil if (parent->cached_context->seq > snapc->seq) 391963b61ebSSage Weil snapc->seq = parent->cached_context->seq; 392963b61ebSSage Weil } 393963b61ebSSage Weil memcpy(snapc->snaps + num, realm->snaps, 394963b61ebSSage Weil sizeof(u64)*realm->num_snaps); 395963b61ebSSage Weil num += realm->num_snaps; 396963b61ebSSage Weil memcpy(snapc->snaps + num, realm->prior_parent_snaps, 397963b61ebSSage 
Weil sizeof(u64)*realm->num_prior_parent_snaps); 398963b61ebSSage Weil num += realm->num_prior_parent_snaps; 399963b61ebSSage Weil 400963b61ebSSage Weil sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); 401963b61ebSSage Weil snapc->num_snaps = num; 402ad5255c1SXiubo Li dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, 403ad5255c1SXiubo Li realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); 404963b61ebSSage Weil 405963b61ebSSage Weil ceph_put_snap_context(realm->cached_context); 4069f4057fcSYan, Zheng realm->cached_context = snapc; 4073ae0bebcSYan, Zheng /* queue realm for cap_snap creation */ 4083ae0bebcSYan, Zheng list_add_tail(&realm->dirty_item, dirty_realms); 409963b61ebSSage Weil return 0; 410963b61ebSSage Weil 411963b61ebSSage Weil fail: 412963b61ebSSage Weil /* 413963b61ebSSage Weil * if we fail, clear old (incorrect) cached_context... hopefully 414963b61ebSSage Weil * we'll have better luck building it later 415963b61ebSSage Weil */ 416963b61ebSSage Weil if (realm->cached_context) { 417963b61ebSSage Weil ceph_put_snap_context(realm->cached_context); 418963b61ebSSage Weil realm->cached_context = NULL; 419963b61ebSSage Weil } 420ad5255c1SXiubo Li pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); 421963b61ebSSage Weil return err; 422963b61ebSSage Weil } 423963b61ebSSage Weil 424963b61ebSSage Weil /* 425963b61ebSSage Weil * rebuild snap context for the given realm and all of its children. 
426963b61ebSSage Weil */ 4273ae0bebcSYan, Zheng static void rebuild_snap_realms(struct ceph_snap_realm *realm, 4283ae0bebcSYan, Zheng struct list_head *dirty_realms) 429963b61ebSSage Weil { 43074a31df4SXiubo Li LIST_HEAD(realm_queue); 43174a31df4SXiubo Li int last = 0; 43274a31df4SXiubo Li bool skip = false; 433963b61ebSSage Weil 43474a31df4SXiubo Li list_add_tail(&realm->rebuild_item, &realm_queue); 435963b61ebSSage Weil 43674a31df4SXiubo Li while (!list_empty(&realm_queue)) { 43774a31df4SXiubo Li struct ceph_snap_realm *_realm, *child; 43874a31df4SXiubo Li 43974a31df4SXiubo Li _realm = list_first_entry(&realm_queue, 44074a31df4SXiubo Li struct ceph_snap_realm, 44174a31df4SXiubo Li rebuild_item); 44274a31df4SXiubo Li 44374a31df4SXiubo Li /* 44474a31df4SXiubo Li * If the last building failed dues to memory 44574a31df4SXiubo Li * issue, just empty the realm_queue and return 44674a31df4SXiubo Li * to avoid infinite loop. 44774a31df4SXiubo Li */ 44874a31df4SXiubo Li if (last < 0) { 44974a31df4SXiubo Li list_del_init(&_realm->rebuild_item); 45074a31df4SXiubo Li continue; 45174a31df4SXiubo Li } 45274a31df4SXiubo Li 45374a31df4SXiubo Li last = build_snap_context(_realm, &realm_queue, dirty_realms); 454ad5255c1SXiubo Li dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, 45574a31df4SXiubo Li last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); 45674a31df4SXiubo Li 45774a31df4SXiubo Li /* is any child in the list ? 
*/ 45874a31df4SXiubo Li list_for_each_entry(child, &_realm->children, child_item) { 45974a31df4SXiubo Li if (!list_empty(&child->rebuild_item)) { 46074a31df4SXiubo Li skip = true; 46174a31df4SXiubo Li break; 46274a31df4SXiubo Li } 46374a31df4SXiubo Li } 46474a31df4SXiubo Li 46574a31df4SXiubo Li if (!skip) { 46674a31df4SXiubo Li list_for_each_entry(child, &_realm->children, child_item) 46774a31df4SXiubo Li list_add_tail(&child->rebuild_item, &realm_queue); 46874a31df4SXiubo Li } 46974a31df4SXiubo Li 47074a31df4SXiubo Li /* last == 1 means need to build parent first */ 47174a31df4SXiubo Li if (last <= 0) 47274a31df4SXiubo Li list_del_init(&_realm->rebuild_item); 47374a31df4SXiubo Li } 474963b61ebSSage Weil } 475963b61ebSSage Weil 476963b61ebSSage Weil 477963b61ebSSage Weil /* 478963b61ebSSage Weil * helper to allocate and decode an array of snapids. free prior 479963b61ebSSage Weil * instance, if any. 480963b61ebSSage Weil */ 481aa711ee3SAlex Elder static int dup_array(u64 **dst, __le64 *src, u32 num) 482963b61ebSSage Weil { 483aa711ee3SAlex Elder u32 i; 484963b61ebSSage Weil 485963b61ebSSage Weil kfree(*dst); 486963b61ebSSage Weil if (num) { 487963b61ebSSage Weil *dst = kcalloc(num, sizeof(u64), GFP_NOFS); 488963b61ebSSage Weil if (!*dst) 489963b61ebSSage Weil return -ENOMEM; 490963b61ebSSage Weil for (i = 0; i < num; i++) 491963b61ebSSage Weil (*dst)[i] = get_unaligned_le64(src + i); 492963b61ebSSage Weil } else { 493963b61ebSSage Weil *dst = NULL; 494963b61ebSSage Weil } 495963b61ebSSage Weil return 0; 496963b61ebSSage Weil } 497963b61ebSSage Weil 49886056090SYan, Zheng static bool has_new_snaps(struct ceph_snap_context *o, 49986056090SYan, Zheng struct ceph_snap_context *n) 50086056090SYan, Zheng { 50186056090SYan, Zheng if (n->num_snaps == 0) 50286056090SYan, Zheng return false; 50386056090SYan, Zheng /* snaps are in descending order */ 50486056090SYan, Zheng return n->snaps[0] > o->seq; 50586056090SYan, Zheng } 506963b61ebSSage Weil 507963b61ebSSage Weil /* 
508963b61ebSSage Weil * When a snapshot is applied, the size/mtime inode metadata is queued 509963b61ebSSage Weil * in a ceph_cap_snap (one for each snapshot) until writeback 510963b61ebSSage Weil * completes and the metadata can be flushed back to the MDS. 511963b61ebSSage Weil * 512963b61ebSSage Weil * However, if a (sync) write is currently in-progress when we apply 513963b61ebSSage Weil * the snapshot, we have to wait until the write succeeds or fails 514963b61ebSSage Weil * (and a final size/mtime is known). In this case the 515963b61ebSSage Weil * cap_snap->writing = 1, and is said to be "pending." When the write 516963b61ebSSage Weil * finishes, we __ceph_finish_cap_snap(). 517963b61ebSSage Weil * 518963b61ebSSage Weil * Caller must hold snap_rwsem for read (i.e., the realm topology won't 519963b61ebSSage Weil * change). 520963b61ebSSage Weil */ 5211ab36c9dSXiubo Li static void ceph_queue_cap_snap(struct ceph_inode_info *ci, 5221ab36c9dSXiubo Li struct ceph_cap_snap **pcapsnap) 523963b61ebSSage Weil { 524874c8ca1SDavid Howells struct inode *inode = &ci->netfs.inode; 52586056090SYan, Zheng struct ceph_snap_context *old_snapc, *new_snapc; 5261ab36c9dSXiubo Li struct ceph_cap_snap *capsnap = *pcapsnap; 52712fe3ddaSLuis Henriques struct ceph_buffer *old_blob = NULL; 5284a625be4SSage Weil int used, dirty; 529963b61ebSSage Weil 530be655596SSage Weil spin_lock(&ci->i_ceph_lock); 531963b61ebSSage Weil used = __ceph_caps_used(ci); 5324a625be4SSage Weil dirty = __ceph_caps_dirty(ci); 533af0ed569SSage Weil 5345dda377cSYan, Zheng old_snapc = ci->i_head_snapc; 53586056090SYan, Zheng new_snapc = ci->i_snap_realm->cached_context; 5365dda377cSYan, Zheng 537af0ed569SSage Weil /* 538af0ed569SSage Weil * If there is a write in progress, treat that as a dirty Fw, 539af0ed569SSage Weil * even though it hasn't completed yet; by the time we finish 540af0ed569SSage Weil * up this capsnap it will be. 
541af0ed569SSage Weil */ 542af0ed569SSage Weil if (used & CEPH_CAP_FILE_WR) 543af0ed569SSage Weil dirty |= CEPH_CAP_FILE_WR; 544af0ed569SSage Weil 545963b61ebSSage Weil if (__ceph_have_pending_cap_snap(ci)) { 546963b61ebSSage Weil /* there is no point in queuing multiple "pending" cap_snaps, 547963b61ebSSage Weil as no new writes are allowed to start when pending, so any 548963b61ebSSage Weil writes in progress now were started before the previous 549963b61ebSSage Weil cap_snap. lucky us. */ 550ad5255c1SXiubo Li dout("%s %p %llx.%llx already pending\n", 551ad5255c1SXiubo Li __func__, inode, ceph_vinop(inode)); 5525dda377cSYan, Zheng goto update_snapc; 5535dda377cSYan, Zheng } 55486056090SYan, Zheng if (ci->i_wrbuffer_ref_head == 0 && 55586056090SYan, Zheng !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { 556ad5255c1SXiubo Li dout("%s %p %llx.%llx nothing dirty|writing\n", 557ad5255c1SXiubo Li __func__, inode, ceph_vinop(inode)); 5585dda377cSYan, Zheng goto update_snapc; 5595dda377cSYan, Zheng } 560fc837c8fSSage Weil 5615dda377cSYan, Zheng BUG_ON(!old_snapc); 562af0ed569SSage Weil 56386056090SYan, Zheng /* 56486056090SYan, Zheng * There is no need to send FLUSHSNAP message to MDS if there is 56586056090SYan, Zheng * no new snapshot. But when there is dirty pages or on-going 56686056090SYan, Zheng * writes, we still need to create cap_snap. cap_snap is needed 56786056090SYan, Zheng * by the write path and page writeback path. 
56886056090SYan, Zheng * 56986056090SYan, Zheng * also see ceph_try_drop_cap_snap() 57086056090SYan, Zheng */ 57186056090SYan, Zheng if (has_new_snaps(old_snapc, new_snapc)) { 57286056090SYan, Zheng if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) 57386056090SYan, Zheng capsnap->need_flush = true; 57486056090SYan, Zheng } else { 57586056090SYan, Zheng if (!(used & CEPH_CAP_FILE_WR) && 57686056090SYan, Zheng ci->i_wrbuffer_ref_head == 0) { 577ad5255c1SXiubo Li dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", 578ad5255c1SXiubo Li __func__, inode, ceph_vinop(inode)); 57986056090SYan, Zheng goto update_snapc; 58086056090SYan, Zheng } 58186056090SYan, Zheng } 58286056090SYan, Zheng 583ad5255c1SXiubo Li dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", 584ad5255c1SXiubo Li __func__, inode, ceph_vinop(inode), capsnap, old_snapc, 585ad5255c1SXiubo Li ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); 5860444d76aSDave Chinner ihold(inode); 587963b61ebSSage Weil 5885dda377cSYan, Zheng capsnap->follows = old_snapc->seq; 589963b61ebSSage Weil capsnap->issued = __ceph_caps_issued(ci, NULL); 5904a625be4SSage Weil capsnap->dirty = dirty; 591963b61ebSSage Weil 592963b61ebSSage Weil capsnap->mode = inode->i_mode; 593963b61ebSSage Weil capsnap->uid = inode->i_uid; 594963b61ebSSage Weil capsnap->gid = inode->i_gid; 595963b61ebSSage Weil 5964a625be4SSage Weil if (dirty & CEPH_CAP_XATTR_EXCL) { 59712fe3ddaSLuis Henriques old_blob = __ceph_build_xattrs_blob(ci); 5984a625be4SSage Weil capsnap->xattr_blob = 5994a625be4SSage Weil ceph_buffer_get(ci->i_xattrs.blob); 6004a625be4SSage Weil capsnap->xattr_version = ci->i_xattrs.version; 6014a625be4SSage Weil } else { 602963b61ebSSage Weil capsnap->xattr_blob = NULL; 6034a625be4SSage Weil capsnap->xattr_version = 0; 6044a625be4SSage Weil } 605963b61ebSSage Weil 606e20d258dSYan, Zheng capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 607e20d258dSYan, Zheng 608963b61ebSSage Weil /* dirty page count 
moved from _head to this cap_snap; 609963b61ebSSage Weil all subsequent writes page dirties occur _after_ this 610963b61ebSSage Weil snapshot. */ 611963b61ebSSage Weil capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 612963b61ebSSage Weil ci->i_wrbuffer_ref_head = 0; 6135dda377cSYan, Zheng capsnap->context = old_snapc; 614963b61ebSSage Weil list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 615963b61ebSSage Weil 616963b61ebSSage Weil if (used & CEPH_CAP_FILE_WR) { 617ad5255c1SXiubo Li dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," 618ad5255c1SXiubo Li " now pending\n", __func__, inode, ceph_vinop(inode), 6195dda377cSYan, Zheng capsnap, old_snapc, old_snapc->seq); 620963b61ebSSage Weil capsnap->writing = 1; 621963b61ebSSage Weil } else { 622963b61ebSSage Weil /* note mtime, size NOW. */ 623963b61ebSSage Weil __ceph_finish_cap_snap(ci, capsnap); 624963b61ebSSage Weil } 6251ab36c9dSXiubo Li *pcapsnap = NULL; 626fce85157SYan, Zheng old_snapc = NULL; 627963b61ebSSage Weil 6285dda377cSYan, Zheng update_snapc: 62937659182SYan, Zheng if (ci->i_wrbuffer_ref_head == 0 && 63037659182SYan, Zheng ci->i_wr_ref == 0 && 63137659182SYan, Zheng ci->i_dirty_caps == 0 && 63237659182SYan, Zheng ci->i_flushing_caps == 0) { 63337659182SYan, Zheng ci->i_head_snapc = NULL; 63437659182SYan, Zheng } else { 63586056090SYan, Zheng ci->i_head_snapc = ceph_get_snap_context(new_snapc); 63686056090SYan, Zheng dout(" new snapc is %p\n", new_snapc); 6375dda377cSYan, Zheng } 638be655596SSage Weil spin_unlock(&ci->i_ceph_lock); 6395dda377cSYan, Zheng 64012fe3ddaSLuis Henriques ceph_buffer_put(old_blob); 6415dda377cSYan, Zheng ceph_put_snap_context(old_snapc); 642963b61ebSSage Weil } 643963b61ebSSage Weil 644963b61ebSSage Weil /* 645963b61ebSSage Weil * Finalize the size, mtime for a cap_snap.. that is, settle on final values 646963b61ebSSage Weil * to be used for the snapshot, to be flushed back to the mds. 
647963b61ebSSage Weil * 648963b61ebSSage Weil * If capsnap can now be flushed, add to snap_flush list, and return 1. 649963b61ebSSage Weil * 650be655596SSage Weil * Caller must hold i_ceph_lock. 651963b61ebSSage Weil */ 652963b61ebSSage Weil int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 653963b61ebSSage Weil struct ceph_cap_snap *capsnap) 654963b61ebSSage Weil { 655874c8ca1SDavid Howells struct inode *inode = &ci->netfs.inode; 6562678da88SXiubo Li struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 657963b61ebSSage Weil 658963b61ebSSage Weil BUG_ON(capsnap->writing); 6592d6795fbSJeff Layton capsnap->size = i_size_read(inode); 6609bbeab41SArnd Bergmann capsnap->mtime = inode->i_mtime; 6619bbeab41SArnd Bergmann capsnap->atime = inode->i_atime; 6629bbeab41SArnd Bergmann capsnap->ctime = inode->i_ctime; 663ec62b894SJeff Layton capsnap->btime = ci->i_btime; 664176c77c9SJeff Layton capsnap->change_attr = inode_peek_iversion_raw(inode); 665963b61ebSSage Weil capsnap->time_warp_seq = ci->i_time_warp_seq; 6665f743e45SYan, Zheng capsnap->truncate_size = ci->i_truncate_size; 6675f743e45SYan, Zheng capsnap->truncate_seq = ci->i_truncate_seq; 668963b61ebSSage Weil if (capsnap->dirty_pages) { 669ad5255c1SXiubo Li dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " 670ad5255c1SXiubo Li "still has %d dirty pages\n", __func__, inode, 671ad5255c1SXiubo Li ceph_vinop(inode), capsnap, capsnap->context, 672ad5255c1SXiubo Li capsnap->context->seq, ceph_cap_string(capsnap->dirty), 673ad5255c1SXiubo Li capsnap->size, capsnap->dirty_pages); 674963b61ebSSage Weil return 0; 675963b61ebSSage Weil } 67670220ac8SYan, Zheng 677558b4510SXiubo Li /* Fb cap still in use, delay it */ 678558b4510SXiubo Li if (ci->i_wb_ref) { 679ad5255c1SXiubo Li dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " 680ad5255c1SXiubo Li "used WRBUFFER, delaying\n", __func__, inode, 681ad5255c1SXiubo Li ceph_vinop(inode), capsnap, capsnap->context, 682ad5255c1SXiubo Li 
capsnap->context->seq, ceph_cap_string(capsnap->dirty), 683ad5255c1SXiubo Li capsnap->size); 684558b4510SXiubo Li capsnap->writing = 1; 685558b4510SXiubo Li return 0; 686558b4510SXiubo Li } 687558b4510SXiubo Li 68870220ac8SYan, Zheng ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; 689ad5255c1SXiubo Li dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", 690ad5255c1SXiubo Li __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, 691819ccbfaSSage Weil capsnap->context->seq, ceph_cap_string(capsnap->dirty), 692819ccbfaSSage Weil capsnap->size); 693963b61ebSSage Weil 694963b61ebSSage Weil spin_lock(&mdsc->snap_flush_lock); 69504242ff3SYan, Zheng if (list_empty(&ci->i_snap_flush_item)) 696963b61ebSSage Weil list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); 697963b61ebSSage Weil spin_unlock(&mdsc->snap_flush_lock); 698963b61ebSSage Weil return 1; /* caller may want to ceph_flush_snaps */ 699963b61ebSSage Weil } 700963b61ebSSage Weil 701ed326044SSage Weil /* 702ed326044SSage Weil * Queue cap_snaps for snap writeback for this realm and its children. 703ed326044SSage Weil * Called under snap_rwsem, so realm topology won't change. 
704ed326044SSage Weil */ 705ed326044SSage Weil static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) 706ed326044SSage Weil { 707ed326044SSage Weil struct ceph_inode_info *ci; 708ed326044SSage Weil struct inode *lastinode = NULL; 7091ab36c9dSXiubo Li struct ceph_cap_snap *capsnap = NULL; 710ed326044SSage Weil 711ad5255c1SXiubo Li dout("%s %p %llx inode\n", __func__, realm, realm->ino); 712ed326044SSage Weil 713ed326044SSage Weil spin_lock(&realm->inodes_with_caps_lock); 7143ae0bebcSYan, Zheng list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { 715874c8ca1SDavid Howells struct inode *inode = igrab(&ci->netfs.inode); 716ed326044SSage Weil if (!inode) 717ed326044SSage Weil continue; 718ed326044SSage Weil spin_unlock(&realm->inodes_with_caps_lock); 71923c2c76eSJeff Layton iput(lastinode); 720ed326044SSage Weil lastinode = inode; 7211ab36c9dSXiubo Li 7221ab36c9dSXiubo Li /* 7231ab36c9dSXiubo Li * Allocate the capsnap memory outside of ceph_queue_cap_snap() 7241ab36c9dSXiubo Li * to reduce very possible but unnecessary frequently memory 7251ab36c9dSXiubo Li * allocate/free in this loop. 
7261ab36c9dSXiubo Li */ 7271ab36c9dSXiubo Li if (!capsnap) { 7281ab36c9dSXiubo Li capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); 7291ab36c9dSXiubo Li if (!capsnap) { 7301ab36c9dSXiubo Li pr_err("ENOMEM allocating ceph_cap_snap on %p\n", 7311ab36c9dSXiubo Li inode); 7321ab36c9dSXiubo Li return; 7331ab36c9dSXiubo Li } 7341ab36c9dSXiubo Li } 7351ab36c9dSXiubo Li capsnap->cap_flush.is_capsnap = true; 7361ab36c9dSXiubo Li refcount_set(&capsnap->nref, 1); 7371ab36c9dSXiubo Li INIT_LIST_HEAD(&capsnap->cap_flush.i_list); 7381ab36c9dSXiubo Li INIT_LIST_HEAD(&capsnap->cap_flush.g_list); 7391ab36c9dSXiubo Li INIT_LIST_HEAD(&capsnap->ci_item); 7401ab36c9dSXiubo Li 7411ab36c9dSXiubo Li ceph_queue_cap_snap(ci, &capsnap); 742ed326044SSage Weil spin_lock(&realm->inodes_with_caps_lock); 743ed326044SSage Weil } 744ed326044SSage Weil spin_unlock(&realm->inodes_with_caps_lock); 74523c2c76eSJeff Layton iput(lastinode); 746ed326044SSage Weil 7471ab36c9dSXiubo Li if (capsnap) 7481ab36c9dSXiubo Li kmem_cache_free(ceph_cap_snap_cachep, capsnap); 749ad5255c1SXiubo Li dout("%s %p %llx done\n", __func__, realm, realm->ino); 750ed326044SSage Weil } 751963b61ebSSage Weil 752963b61ebSSage Weil /* 753963b61ebSSage Weil * Parse and apply a snapblob "snap trace" from the MDS. This specifies 754963b61ebSSage Weil * the snap realm parameters from a given realm and all of its ancestors, 755963b61ebSSage Weil * up to the root. 756963b61ebSSage Weil * 757963b61ebSSage Weil * Caller must hold snap_rwsem for write. 
758963b61ebSSage Weil */ 759963b61ebSSage Weil int ceph_update_snap_trace(struct ceph_mds_client *mdsc, 760982d6011SYan, Zheng void *p, void *e, bool deletion, 761982d6011SYan, Zheng struct ceph_snap_realm **realm_ret) 762963b61ebSSage Weil { 763963b61ebSSage Weil struct ceph_mds_snap_realm *ri; /* encoded */ 764963b61ebSSage Weil __le64 *snaps; /* encoded */ 765963b61ebSSage Weil __le64 *prior_parent_snaps; /* encoded */ 766*51884d15SXiubo Li struct ceph_snap_realm *realm; 767982d6011SYan, Zheng struct ceph_snap_realm *first_realm = NULL; 7682e586641SXiubo Li struct ceph_snap_realm *realm_to_rebuild = NULL; 7692e586641SXiubo Li int rebuild_snapcs; 770963b61ebSSage Weil int err = -ENOMEM; 771ae00d4f3SSage Weil LIST_HEAD(dirty_realms); 772963b61ebSSage Weil 773a6862e67SJeff Layton lockdep_assert_held_write(&mdsc->snap_rwsem); 774a6862e67SJeff Layton 775ad5255c1SXiubo Li dout("%s deletion=%d\n", __func__, deletion); 776963b61ebSSage Weil more: 777*51884d15SXiubo Li realm = NULL; 7782e586641SXiubo Li rebuild_snapcs = 0; 779963b61ebSSage Weil ceph_decode_need(&p, e, sizeof(*ri), bad); 780963b61ebSSage Weil ri = p; 781963b61ebSSage Weil p += sizeof(*ri); 782963b61ebSSage Weil ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + 783963b61ebSSage Weil le32_to_cpu(ri->num_prior_parent_snaps)), bad); 784963b61ebSSage Weil snaps = p; 785963b61ebSSage Weil p += sizeof(u64) * le32_to_cpu(ri->num_snaps); 786963b61ebSSage Weil prior_parent_snaps = p; 787963b61ebSSage Weil p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); 788963b61ebSSage Weil 789963b61ebSSage Weil realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); 790963b61ebSSage Weil if (!realm) { 791963b61ebSSage Weil realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); 792963b61ebSSage Weil if (IS_ERR(realm)) { 793963b61ebSSage Weil err = PTR_ERR(realm); 794963b61ebSSage Weil goto fail; 795963b61ebSSage Weil } 796963b61ebSSage Weil } 797963b61ebSSage Weil 798963b61ebSSage Weil /* ensure 
the parent is correct */ 799963b61ebSSage Weil err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); 800963b61ebSSage Weil if (err < 0) 801963b61ebSSage Weil goto fail; 8022e586641SXiubo Li rebuild_snapcs += err; 803963b61ebSSage Weil 804963b61ebSSage Weil if (le64_to_cpu(ri->seq) > realm->seq) { 805ad5255c1SXiubo Li dout("%s updating %llx %p %lld -> %lld\n", __func__, 806ae00d4f3SSage Weil realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); 807963b61ebSSage Weil /* update realm parameters, snap lists */ 808963b61ebSSage Weil realm->seq = le64_to_cpu(ri->seq); 809963b61ebSSage Weil realm->created = le64_to_cpu(ri->created); 810963b61ebSSage Weil realm->parent_since = le64_to_cpu(ri->parent_since); 811963b61ebSSage Weil 812963b61ebSSage Weil realm->num_snaps = le32_to_cpu(ri->num_snaps); 813963b61ebSSage Weil err = dup_array(&realm->snaps, snaps, realm->num_snaps); 814963b61ebSSage Weil if (err < 0) 815963b61ebSSage Weil goto fail; 816963b61ebSSage Weil 817963b61ebSSage Weil realm->num_prior_parent_snaps = 818963b61ebSSage Weil le32_to_cpu(ri->num_prior_parent_snaps); 819963b61ebSSage Weil err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, 820963b61ebSSage Weil realm->num_prior_parent_snaps); 821963b61ebSSage Weil if (err < 0) 822963b61ebSSage Weil goto fail; 823963b61ebSSage Weil 824affbc19aSYan, Zheng if (realm->seq > mdsc->last_snap_seq) 825affbc19aSYan, Zheng mdsc->last_snap_seq = realm->seq; 826ae00d4f3SSage Weil 8272e586641SXiubo Li rebuild_snapcs = 1; 828963b61ebSSage Weil } else if (!realm->cached_context) { 829ad5255c1SXiubo Li dout("%s %llx %p seq %lld new\n", __func__, 830ae00d4f3SSage Weil realm->ino, realm, realm->seq); 8312e586641SXiubo Li rebuild_snapcs = 1; 832ae00d4f3SSage Weil } else { 833ad5255c1SXiubo Li dout("%s %llx %p seq %lld unchanged\n", __func__, 834ae00d4f3SSage Weil realm->ino, realm, realm->seq); 835963b61ebSSage Weil } 836963b61ebSSage Weil 8372e586641SXiubo Li dout("done with %llx %p, rebuild_snapcs=%d, 
%p %p\n", realm->ino, 8382e586641SXiubo Li realm, rebuild_snapcs, p, e); 839963b61ebSSage Weil 8402e586641SXiubo Li /* 8412e586641SXiubo Li * this will always track the uppest parent realm from which 8422e586641SXiubo Li * we need to rebuild the snapshot contexts _downward_ in 8432e586641SXiubo Li * hierarchy. 8442e586641SXiubo Li */ 8452e586641SXiubo Li if (rebuild_snapcs) 8462e586641SXiubo Li realm_to_rebuild = realm; 8472e586641SXiubo Li 8482e586641SXiubo Li /* rebuild_snapcs when we reach the _end_ (root) of the trace */ 8492e586641SXiubo Li if (realm_to_rebuild && p >= e) 8502e586641SXiubo Li rebuild_snap_realms(realm_to_rebuild, &dirty_realms); 851982d6011SYan, Zheng 852982d6011SYan, Zheng if (!first_realm) 853982d6011SYan, Zheng first_realm = realm; 854982d6011SYan, Zheng else 855982d6011SYan, Zheng ceph_put_snap_realm(mdsc, realm); 856982d6011SYan, Zheng 857963b61ebSSage Weil if (p < e) 858963b61ebSSage Weil goto more; 859963b61ebSSage Weil 860ae00d4f3SSage Weil /* 861ae00d4f3SSage Weil * queue cap snaps _after_ we've built the new snap contexts, 862ae00d4f3SSage Weil * so that i_head_snapc can be set appropriately. 
863ae00d4f3SSage Weil */ 864e8e1ba96SSage Weil while (!list_empty(&dirty_realms)) { 865e8e1ba96SSage Weil realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, 866e8e1ba96SSage Weil dirty_item); 8673ae0bebcSYan, Zheng list_del_init(&realm->dirty_item); 868ae00d4f3SSage Weil queue_realm_cap_snaps(realm); 869ae00d4f3SSage Weil } 870ae00d4f3SSage Weil 871982d6011SYan, Zheng if (realm_ret) 872982d6011SYan, Zheng *realm_ret = first_realm; 873982d6011SYan, Zheng else 874982d6011SYan, Zheng ceph_put_snap_realm(mdsc, first_realm); 875982d6011SYan, Zheng 876963b61ebSSage Weil __cleanup_empty_realms(mdsc); 877963b61ebSSage Weil return 0; 878963b61ebSSage Weil 879963b61ebSSage Weil bad: 880f3fd3ea6SJeff Layton err = -EIO; 881963b61ebSSage Weil fail: 882982d6011SYan, Zheng if (realm && !IS_ERR(realm)) 883982d6011SYan, Zheng ceph_put_snap_realm(mdsc, realm); 884982d6011SYan, Zheng if (first_realm) 885982d6011SYan, Zheng ceph_put_snap_realm(mdsc, first_realm); 886ad5255c1SXiubo Li pr_err("%s error %d\n", __func__, err); 887963b61ebSSage Weil return err; 888963b61ebSSage Weil } 889963b61ebSSage Weil 890963b61ebSSage Weil 891963b61ebSSage Weil /* 892963b61ebSSage Weil * Send any cap_snaps that are queued for flush. Try to carry 893963b61ebSSage Weil * s_mutex across multiple snap flushes to avoid locking overhead. 894963b61ebSSage Weil * 895963b61ebSSage Weil * Caller holds no locks. 
896963b61ebSSage Weil */ 897963b61ebSSage Weil static void flush_snaps(struct ceph_mds_client *mdsc) 898963b61ebSSage Weil { 899963b61ebSSage Weil struct ceph_inode_info *ci; 900963b61ebSSage Weil struct inode *inode; 901963b61ebSSage Weil struct ceph_mds_session *session = NULL; 902963b61ebSSage Weil 903ad5255c1SXiubo Li dout("%s\n", __func__); 904963b61ebSSage Weil spin_lock(&mdsc->snap_flush_lock); 905963b61ebSSage Weil while (!list_empty(&mdsc->snap_flush_list)) { 906963b61ebSSage Weil ci = list_first_entry(&mdsc->snap_flush_list, 907963b61ebSSage Weil struct ceph_inode_info, i_snap_flush_item); 908874c8ca1SDavid Howells inode = &ci->netfs.inode; 90970b666c3SSage Weil ihold(inode); 910963b61ebSSage Weil spin_unlock(&mdsc->snap_flush_lock); 911ed9b430cSYan, Zheng ceph_flush_snaps(ci, &session); 91223c2c76eSJeff Layton iput(inode); 913963b61ebSSage Weil spin_lock(&mdsc->snap_flush_lock); 914963b61ebSSage Weil } 915963b61ebSSage Weil spin_unlock(&mdsc->snap_flush_lock); 916963b61ebSSage Weil 917963b61ebSSage Weil ceph_put_mds_session(session); 918ad5255c1SXiubo Li dout("%s done\n", __func__); 919963b61ebSSage Weil } 920963b61ebSSage Weil 9210ba92e1cSJeff Layton /** 9220ba92e1cSJeff Layton * ceph_change_snap_realm - change the snap_realm for an inode 9230ba92e1cSJeff Layton * @inode: inode to move to new snap realm 9240ba92e1cSJeff Layton * @realm: new realm to move inode into (may be NULL) 9250ba92e1cSJeff Layton * 9260ba92e1cSJeff Layton * Detach an inode from its old snaprealm (if any) and attach it to 9270ba92e1cSJeff Layton * the new snaprealm (if any). The old snap realm reference held by 9280ba92e1cSJeff Layton * the inode is put. If realm is non-NULL, then the caller's reference 9290ba92e1cSJeff Layton * to it is taken over by the inode. 
9300ba92e1cSJeff Layton */ 9310ba92e1cSJeff Layton void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm) 9320ba92e1cSJeff Layton { 9330ba92e1cSJeff Layton struct ceph_inode_info *ci = ceph_inode(inode); 9340ba92e1cSJeff Layton struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 9350ba92e1cSJeff Layton struct ceph_snap_realm *oldrealm = ci->i_snap_realm; 9360ba92e1cSJeff Layton 9370ba92e1cSJeff Layton lockdep_assert_held(&ci->i_ceph_lock); 9380ba92e1cSJeff Layton 9390ba92e1cSJeff Layton if (oldrealm) { 9400ba92e1cSJeff Layton spin_lock(&oldrealm->inodes_with_caps_lock); 9410ba92e1cSJeff Layton list_del_init(&ci->i_snap_realm_item); 9420ba92e1cSJeff Layton if (oldrealm->ino == ci->i_vino.ino) 9430ba92e1cSJeff Layton oldrealm->inode = NULL; 9440ba92e1cSJeff Layton spin_unlock(&oldrealm->inodes_with_caps_lock); 9450ba92e1cSJeff Layton ceph_put_snap_realm(mdsc, oldrealm); 9460ba92e1cSJeff Layton } 9470ba92e1cSJeff Layton 9480ba92e1cSJeff Layton ci->i_snap_realm = realm; 9490ba92e1cSJeff Layton 9500ba92e1cSJeff Layton if (realm) { 9510ba92e1cSJeff Layton spin_lock(&realm->inodes_with_caps_lock); 9520ba92e1cSJeff Layton list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); 9530ba92e1cSJeff Layton if (realm->ino == ci->i_vino.ino) 9540ba92e1cSJeff Layton realm->inode = inode; 9550ba92e1cSJeff Layton spin_unlock(&realm->inodes_with_caps_lock); 9560ba92e1cSJeff Layton } 9570ba92e1cSJeff Layton } 958963b61ebSSage Weil 959963b61ebSSage Weil /* 960963b61ebSSage Weil * Handle a snap notification from the MDS. 961963b61ebSSage Weil * 962963b61ebSSage Weil * This can take two basic forms: the simplest is just a snap creation 963963b61ebSSage Weil * or deletion notification on an existing realm. This should update the 964963b61ebSSage Weil * realm and its children. 
965963b61ebSSage Weil * 966963b61ebSSage Weil * The more difficult case is realm creation, due to snap creation at a 967963b61ebSSage Weil * new point in the file hierarchy, or due to a rename that moves a file or 968963b61ebSSage Weil * directory into another realm. 969963b61ebSSage Weil */ 970963b61ebSSage Weil void ceph_handle_snap(struct ceph_mds_client *mdsc, 9712600d2ddSSage Weil struct ceph_mds_session *session, 972963b61ebSSage Weil struct ceph_msg *msg) 973963b61ebSSage Weil { 9743d14c5d2SYehuda Sadeh struct super_block *sb = mdsc->fsc->sb; 9752600d2ddSSage Weil int mds = session->s_mds; 976963b61ebSSage Weil u64 split; 977963b61ebSSage Weil int op; 978963b61ebSSage Weil int trace_len; 979963b61ebSSage Weil struct ceph_snap_realm *realm = NULL; 980963b61ebSSage Weil void *p = msg->front.iov_base; 981963b61ebSSage Weil void *e = p + msg->front.iov_len; 982963b61ebSSage Weil struct ceph_mds_snap_head *h; 983963b61ebSSage Weil int num_split_inos, num_split_realms; 984963b61ebSSage Weil __le64 *split_inos = NULL, *split_realms = NULL; 985963b61ebSSage Weil int i; 986963b61ebSSage Weil int locked_rwsem = 0; 987963b61ebSSage Weil 988963b61ebSSage Weil /* decode */ 989963b61ebSSage Weil if (msg->front.iov_len < sizeof(*h)) 990963b61ebSSage Weil goto bad; 991963b61ebSSage Weil h = p; 992963b61ebSSage Weil op = le32_to_cpu(h->op); 993963b61ebSSage Weil split = le64_to_cpu(h->split); /* non-zero if we are splitting an 994963b61ebSSage Weil * existing realm */ 995963b61ebSSage Weil num_split_inos = le32_to_cpu(h->num_split_inos); 996963b61ebSSage Weil num_split_realms = le32_to_cpu(h->num_split_realms); 997963b61ebSSage Weil trace_len = le32_to_cpu(h->trace_len); 998963b61ebSSage Weil p += sizeof(*h); 999963b61ebSSage Weil 1000ad5255c1SXiubo Li dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, 1001ad5255c1SXiubo Li mds, ceph_snap_op_name(op), split, trace_len); 1002963b61ebSSage Weil 1003963b61ebSSage Weil mutex_lock(&session->s_mutex); 100462575e27SJeff 
Layton inc_session_sequence(session); 1005963b61ebSSage Weil mutex_unlock(&session->s_mutex); 1006963b61ebSSage Weil 1007963b61ebSSage Weil down_write(&mdsc->snap_rwsem); 1008963b61ebSSage Weil locked_rwsem = 1; 1009963b61ebSSage Weil 1010963b61ebSSage Weil if (op == CEPH_SNAP_OP_SPLIT) { 1011963b61ebSSage Weil struct ceph_mds_snap_realm *ri; 1012963b61ebSSage Weil 1013963b61ebSSage Weil /* 1014963b61ebSSage Weil * A "split" breaks part of an existing realm off into 1015963b61ebSSage Weil * a new realm. The MDS provides a list of inodes 1016963b61ebSSage Weil * (with caps) and child realms that belong to the new 1017963b61ebSSage Weil * child. 1018963b61ebSSage Weil */ 1019963b61ebSSage Weil split_inos = p; 1020963b61ebSSage Weil p += sizeof(u64) * num_split_inos; 1021963b61ebSSage Weil split_realms = p; 1022963b61ebSSage Weil p += sizeof(u64) * num_split_realms; 1023963b61ebSSage Weil ceph_decode_need(&p, e, sizeof(*ri), bad); 1024963b61ebSSage Weil /* we will peek at realm info here, but will _not_ 1025963b61ebSSage Weil * advance p, as the realm update will occur below in 1026963b61ebSSage Weil * ceph_update_snap_trace. 
*/ 1027963b61ebSSage Weil ri = p; 1028963b61ebSSage Weil 1029963b61ebSSage Weil realm = ceph_lookup_snap_realm(mdsc, split); 1030963b61ebSSage Weil if (!realm) { 1031963b61ebSSage Weil realm = ceph_create_snap_realm(mdsc, split); 1032963b61ebSSage Weil if (IS_ERR(realm)) 1033963b61ebSSage Weil goto out; 1034963b61ebSSage Weil } 1035963b61ebSSage Weil 1036963b61ebSSage Weil dout("splitting snap_realm %llx %p\n", realm->ino, realm); 1037963b61ebSSage Weil for (i = 0; i < num_split_inos; i++) { 1038963b61ebSSage Weil struct ceph_vino vino = { 1039963b61ebSSage Weil .ino = le64_to_cpu(split_inos[i]), 1040963b61ebSSage Weil .snap = CEPH_NOSNAP, 1041963b61ebSSage Weil }; 1042963b61ebSSage Weil struct inode *inode = ceph_find_inode(sb, vino); 1043963b61ebSSage Weil struct ceph_inode_info *ci; 1044963b61ebSSage Weil 1045963b61ebSSage Weil if (!inode) 1046963b61ebSSage Weil continue; 1047963b61ebSSage Weil ci = ceph_inode(inode); 1048963b61ebSSage Weil 1049be655596SSage Weil spin_lock(&ci->i_ceph_lock); 1050963b61ebSSage Weil if (!ci->i_snap_realm) 1051963b61ebSSage Weil goto skip_inode; 1052963b61ebSSage Weil /* 1053963b61ebSSage Weil * If this inode belongs to a realm that was 1054963b61ebSSage Weil * created after our new realm, we experienced 1055963b61ebSSage Weil * a race (due to another split notifications 1056963b61ebSSage Weil * arriving from a different MDS). So skip 1057963b61ebSSage Weil * this inode. 
1058963b61ebSSage Weil */ 1059963b61ebSSage Weil if (ci->i_snap_realm->created > 1060963b61ebSSage Weil le64_to_cpu(ri->created)) { 1061ad5255c1SXiubo Li dout(" leaving %p %llx.%llx in newer realm %llx %p\n", 1062ad5255c1SXiubo Li inode, ceph_vinop(inode), ci->i_snap_realm->ino, 1063963b61ebSSage Weil ci->i_snap_realm); 1064963b61ebSSage Weil goto skip_inode; 1065963b61ebSSage Weil } 1066ad5255c1SXiubo Li dout(" will move %p %llx.%llx to split realm %llx %p\n", 1067ad5255c1SXiubo Li inode, ceph_vinop(inode), realm->ino, realm); 1068963b61ebSSage Weil 1069ae00d4f3SSage Weil ceph_get_snap_realm(mdsc, realm); 10700ba92e1cSJeff Layton ceph_change_snap_realm(inode, realm); 10710ba92e1cSJeff Layton spin_unlock(&ci->i_ceph_lock); 107223c2c76eSJeff Layton iput(inode); 1073963b61ebSSage Weil continue; 1074963b61ebSSage Weil 1075963b61ebSSage Weil skip_inode: 1076be655596SSage Weil spin_unlock(&ci->i_ceph_lock); 107723c2c76eSJeff Layton iput(inode); 1078963b61ebSSage Weil } 1079963b61ebSSage Weil 1080963b61ebSSage Weil /* we may have taken some of the old realm's children. */ 1081963b61ebSSage Weil for (i = 0; i < num_split_realms; i++) { 1082963b61ebSSage Weil struct ceph_snap_realm *child = 1083982d6011SYan, Zheng __lookup_snap_realm(mdsc, 1084963b61ebSSage Weil le64_to_cpu(split_realms[i])); 1085963b61ebSSage Weil if (!child) 1086963b61ebSSage Weil continue; 1087963b61ebSSage Weil adjust_snap_realm_parent(mdsc, child, realm->ino); 1088963b61ebSSage Weil } 1089963b61ebSSage Weil } 1090963b61ebSSage Weil 1091963b61ebSSage Weil /* 1092963b61ebSSage Weil * update using the provided snap trace. if we are deleting a 1093963b61ebSSage Weil * snap, we can avoid queueing cap_snaps. 
1094963b61ebSSage Weil */ 1095963b61ebSSage Weil ceph_update_snap_trace(mdsc, p, e, 1096982d6011SYan, Zheng op == CEPH_SNAP_OP_DESTROY, NULL); 1097963b61ebSSage Weil 1098ae00d4f3SSage Weil if (op == CEPH_SNAP_OP_SPLIT) 1099963b61ebSSage Weil /* we took a reference when we created the realm, above */ 1100963b61ebSSage Weil ceph_put_snap_realm(mdsc, realm); 1101963b61ebSSage Weil 1102963b61ebSSage Weil __cleanup_empty_realms(mdsc); 1103963b61ebSSage Weil 1104963b61ebSSage Weil up_write(&mdsc->snap_rwsem); 1105963b61ebSSage Weil 1106963b61ebSSage Weil flush_snaps(mdsc); 1107963b61ebSSage Weil return; 1108963b61ebSSage Weil 1109963b61ebSSage Weil bad: 1110ad5255c1SXiubo Li pr_err("%s corrupt snap message from mds%d\n", __func__, mds); 11119ec7cab1SSage Weil ceph_msg_dump(msg); 1112963b61ebSSage Weil out: 1113963b61ebSSage Weil if (locked_rwsem) 1114963b61ebSSage Weil up_write(&mdsc->snap_rwsem); 1115963b61ebSSage Weil return; 1116963b61ebSSage Weil } 111775c9627eSYan, Zheng 111875c9627eSYan, Zheng struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, 111975c9627eSYan, Zheng u64 snap) 112075c9627eSYan, Zheng { 112175c9627eSYan, Zheng struct ceph_snapid_map *sm, *exist; 112275c9627eSYan, Zheng struct rb_node **p, *parent; 112375c9627eSYan, Zheng int ret; 112475c9627eSYan, Zheng 112575c9627eSYan, Zheng exist = NULL; 112675c9627eSYan, Zheng spin_lock(&mdsc->snapid_map_lock); 112775c9627eSYan, Zheng p = &mdsc->snapid_map_tree.rb_node; 112875c9627eSYan, Zheng while (*p) { 112975c9627eSYan, Zheng exist = rb_entry(*p, struct ceph_snapid_map, node); 113075c9627eSYan, Zheng if (snap > exist->snap) { 113175c9627eSYan, Zheng p = &(*p)->rb_left; 113275c9627eSYan, Zheng } else if (snap < exist->snap) { 113375c9627eSYan, Zheng p = &(*p)->rb_right; 113475c9627eSYan, Zheng } else { 113575c9627eSYan, Zheng if (atomic_inc_return(&exist->ref) == 1) 113675c9627eSYan, Zheng list_del_init(&exist->lru); 113775c9627eSYan, Zheng break; 113875c9627eSYan, Zheng } 
113975c9627eSYan, Zheng exist = NULL; 114075c9627eSYan, Zheng } 114175c9627eSYan, Zheng spin_unlock(&mdsc->snapid_map_lock); 114275c9627eSYan, Zheng if (exist) { 1143ad5255c1SXiubo Li dout("%s found snapid map %llx -> %x\n", __func__, 1144ad5255c1SXiubo Li exist->snap, exist->dev); 114575c9627eSYan, Zheng return exist; 114675c9627eSYan, Zheng } 114775c9627eSYan, Zheng 114875c9627eSYan, Zheng sm = kmalloc(sizeof(*sm), GFP_NOFS); 114975c9627eSYan, Zheng if (!sm) 115075c9627eSYan, Zheng return NULL; 115175c9627eSYan, Zheng 115275c9627eSYan, Zheng ret = get_anon_bdev(&sm->dev); 115375c9627eSYan, Zheng if (ret < 0) { 115475c9627eSYan, Zheng kfree(sm); 115575c9627eSYan, Zheng return NULL; 115675c9627eSYan, Zheng } 115775c9627eSYan, Zheng 115875c9627eSYan, Zheng INIT_LIST_HEAD(&sm->lru); 115975c9627eSYan, Zheng atomic_set(&sm->ref, 1); 116075c9627eSYan, Zheng sm->snap = snap; 116175c9627eSYan, Zheng 116275c9627eSYan, Zheng exist = NULL; 116375c9627eSYan, Zheng parent = NULL; 116475c9627eSYan, Zheng p = &mdsc->snapid_map_tree.rb_node; 116575c9627eSYan, Zheng spin_lock(&mdsc->snapid_map_lock); 116675c9627eSYan, Zheng while (*p) { 116775c9627eSYan, Zheng parent = *p; 116875c9627eSYan, Zheng exist = rb_entry(*p, struct ceph_snapid_map, node); 116975c9627eSYan, Zheng if (snap > exist->snap) 117075c9627eSYan, Zheng p = &(*p)->rb_left; 117175c9627eSYan, Zheng else if (snap < exist->snap) 117275c9627eSYan, Zheng p = &(*p)->rb_right; 117375c9627eSYan, Zheng else 117475c9627eSYan, Zheng break; 117575c9627eSYan, Zheng exist = NULL; 117675c9627eSYan, Zheng } 117775c9627eSYan, Zheng if (exist) { 117875c9627eSYan, Zheng if (atomic_inc_return(&exist->ref) == 1) 117975c9627eSYan, Zheng list_del_init(&exist->lru); 118075c9627eSYan, Zheng } else { 118175c9627eSYan, Zheng rb_link_node(&sm->node, parent, p); 118275c9627eSYan, Zheng rb_insert_color(&sm->node, &mdsc->snapid_map_tree); 118375c9627eSYan, Zheng } 118475c9627eSYan, Zheng spin_unlock(&mdsc->snapid_map_lock); 118575c9627eSYan, Zheng 
if (exist) { 118675c9627eSYan, Zheng free_anon_bdev(sm->dev); 118775c9627eSYan, Zheng kfree(sm); 1188ad5255c1SXiubo Li dout("%s found snapid map %llx -> %x\n", __func__, 1189ad5255c1SXiubo Li exist->snap, exist->dev); 119075c9627eSYan, Zheng return exist; 119175c9627eSYan, Zheng } 119275c9627eSYan, Zheng 1193ad5255c1SXiubo Li dout("%s create snapid map %llx -> %x\n", __func__, 1194ad5255c1SXiubo Li sm->snap, sm->dev); 119575c9627eSYan, Zheng return sm; 119675c9627eSYan, Zheng } 119775c9627eSYan, Zheng 119875c9627eSYan, Zheng void ceph_put_snapid_map(struct ceph_mds_client* mdsc, 119975c9627eSYan, Zheng struct ceph_snapid_map *sm) 120075c9627eSYan, Zheng { 120175c9627eSYan, Zheng if (!sm) 120275c9627eSYan, Zheng return; 120375c9627eSYan, Zheng if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { 120475c9627eSYan, Zheng if (!RB_EMPTY_NODE(&sm->node)) { 120575c9627eSYan, Zheng sm->last_used = jiffies; 120675c9627eSYan, Zheng list_add_tail(&sm->lru, &mdsc->snapid_map_lru); 120775c9627eSYan, Zheng spin_unlock(&mdsc->snapid_map_lock); 120875c9627eSYan, Zheng } else { 120975c9627eSYan, Zheng /* already cleaned up by 121075c9627eSYan, Zheng * ceph_cleanup_snapid_map() */ 121175c9627eSYan, Zheng spin_unlock(&mdsc->snapid_map_lock); 121275c9627eSYan, Zheng kfree(sm); 121375c9627eSYan, Zheng } 121475c9627eSYan, Zheng } 121575c9627eSYan, Zheng } 121675c9627eSYan, Zheng 121775c9627eSYan, Zheng void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) 121875c9627eSYan, Zheng { 121975c9627eSYan, Zheng struct ceph_snapid_map *sm; 122075c9627eSYan, Zheng unsigned long now; 122175c9627eSYan, Zheng LIST_HEAD(to_free); 122275c9627eSYan, Zheng 122375c9627eSYan, Zheng spin_lock(&mdsc->snapid_map_lock); 122475c9627eSYan, Zheng now = jiffies; 122575c9627eSYan, Zheng 122675c9627eSYan, Zheng while (!list_empty(&mdsc->snapid_map_lru)) { 122775c9627eSYan, Zheng sm = list_first_entry(&mdsc->snapid_map_lru, 122875c9627eSYan, Zheng struct ceph_snapid_map, lru); 122975c9627eSYan, Zheng if 
(time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) 123075c9627eSYan, Zheng break; 123175c9627eSYan, Zheng 123275c9627eSYan, Zheng rb_erase(&sm->node, &mdsc->snapid_map_tree); 123375c9627eSYan, Zheng list_move(&sm->lru, &to_free); 123475c9627eSYan, Zheng } 123575c9627eSYan, Zheng spin_unlock(&mdsc->snapid_map_lock); 123675c9627eSYan, Zheng 123775c9627eSYan, Zheng while (!list_empty(&to_free)) { 123875c9627eSYan, Zheng sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); 123975c9627eSYan, Zheng list_del(&sm->lru); 124075c9627eSYan, Zheng dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); 124175c9627eSYan, Zheng free_anon_bdev(sm->dev); 124275c9627eSYan, Zheng kfree(sm); 124375c9627eSYan, Zheng } 124475c9627eSYan, Zheng } 124575c9627eSYan, Zheng 124675c9627eSYan, Zheng void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) 124775c9627eSYan, Zheng { 124875c9627eSYan, Zheng struct ceph_snapid_map *sm; 124975c9627eSYan, Zheng struct rb_node *p; 125075c9627eSYan, Zheng LIST_HEAD(to_free); 125175c9627eSYan, Zheng 125275c9627eSYan, Zheng spin_lock(&mdsc->snapid_map_lock); 125375c9627eSYan, Zheng while ((p = rb_first(&mdsc->snapid_map_tree))) { 125475c9627eSYan, Zheng sm = rb_entry(p, struct ceph_snapid_map, node); 125575c9627eSYan, Zheng rb_erase(p, &mdsc->snapid_map_tree); 125675c9627eSYan, Zheng RB_CLEAR_NODE(p); 125775c9627eSYan, Zheng list_move(&sm->lru, &to_free); 125875c9627eSYan, Zheng } 125975c9627eSYan, Zheng spin_unlock(&mdsc->snapid_map_lock); 126075c9627eSYan, Zheng 126175c9627eSYan, Zheng while (!list_empty(&to_free)) { 126275c9627eSYan, Zheng sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); 126375c9627eSYan, Zheng list_del(&sm->lru); 126475c9627eSYan, Zheng free_anon_bdev(sm->dev); 126575c9627eSYan, Zheng if (WARN_ON_ONCE(atomic_read(&sm->ref))) { 126675c9627eSYan, Zheng pr_err("snapid map %llx -> %x still in use\n", 126775c9627eSYan, Zheng sm->snap, sm->dev); 126875c9627eSYan, Zheng } 1269c8d6ee01SLuis Henriques 
kfree(sm); 127075c9627eSYan, Zheng } 127175c9627eSYan, Zheng } 1272