/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#include "memcached.h"
#include "bipbuffer.h"
#include "slab_automove.h"
#include "storage.h"
#ifdef EXTSTORE
#include "slab_automove_extstore.h"
#endif
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include <unistd.h>
#include <poll.h>

/* Forward Declarations */
static void item_link_q(item *it);
static void item_unlink_q(item *it);

static unsigned int lru_type_map[4] = {HOT_LRU, WARM_LRU, COLD_LRU, TEMP_LRU};

#define LARGEST_ID POWER_LARGEST
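/* Each slab class has one queue per LRU type above. Queue ids are the slab
 * class id with the LRU type OR'd in, e.g. class n's cold queue is indexed
 * as itemstats[n | COLD_LRU]. A sketch of the round trip (illustrative):
 *
 *     unsigned int id = slabs_clsid(ntotal);  // plain slab class id
 *     id |= HOT_LRU;                          // queue id for lru_locks etc.
 *     unsigned int cls = ITEM_clsid(it);      // masks the LRU bits back off
 */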
typedef struct {
    uint64_t evicted;
    uint64_t evicted_nonzero;
    uint64_t reclaimed;
    uint64_t outofmemory;
    uint64_t tailrepairs;
    uint64_t expired_unfetched; /* items reclaimed but never touched */
    uint64_t evicted_unfetched; /* items evicted but never touched */
    uint64_t evicted_active; /* items evicted that should have been shuffled */
    uint64_t crawler_reclaimed;
    uint64_t crawler_items_checked;
    uint64_t lrutail_reflocked;
    uint64_t moves_to_cold;
    uint64_t moves_to_warm;
    uint64_t moves_within_lru;
    uint64_t direct_reclaims;
    uint64_t hits_to_hot;
    uint64_t hits_to_warm;
    uint64_t hits_to_cold;
    uint64_t hits_to_temp;
    uint64_t mem_requested;
    rel_time_t evicted_time;
} itemstats_t;

static item *heads[LARGEST_ID];
static item *tails[LARGEST_ID];
static itemstats_t itemstats[LARGEST_ID];
static unsigned int sizes[LARGEST_ID];
static uint64_t sizes_bytes[LARGEST_ID];
static unsigned int *stats_sizes_hist = NULL;
static uint64_t stats_sizes_cas_min = 0;
static int stats_sizes_buckets = 0;
static uint64_t cas_id = 0;

static volatile int do_run_lru_maintainer_thread = 0;
static int lru_maintainer_initialized = 0;
static pthread_mutex_t lru_maintainer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cas_id_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t stats_sizes_lock = PTHREAD_MUTEX_INITIALIZER;

void item_stats_reset(void) {
    int i;
    for (i = 0; i < LARGEST_ID; i++) {
        pthread_mutex_lock(&lru_locks[i]);
        memset(&itemstats[i], 0, sizeof(itemstats_t));
        pthread_mutex_unlock(&lru_locks[i]);
    }
}

/* called with class lru lock held */
void do_item_stats_add_crawl(const int i, const uint64_t reclaimed,
        const uint64_t unfetched, const uint64_t checked) {
    itemstats[i].crawler_reclaimed += reclaimed;
    itemstats[i].expired_unfetched += unfetched;
    itemstats[i].crawler_items_checked += checked;
}

typedef struct _lru_bump_buf {
    struct _lru_bump_buf *prev;
    struct _lru_bump_buf *next;
    pthread_mutex_t mutex;
    bipbuf_t *buf;
    uint64_t dropped;
} lru_bump_buf;

typedef struct {
    item *it;
    uint32_t hv;
} lru_bump_entry;

static lru_bump_buf *bump_buf_head = NULL;
static lru_bump_buf *bump_buf_tail = NULL;
static pthread_mutex_t bump_buf_lock = PTHREAD_MUTEX_INITIALIZER;
/* TODO: tunable? Need bench results */
#define LRU_BUMP_BUF_SIZE 8192
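/* Rough sizing (illustrative): on a typical 64-bit build an lru_bump_entry
 * is an 8-byte item pointer plus a 4-byte hash, padded to 16 bytes, so the
 * bipbuf created in item_lru_bump_buf_create() holds roughly
 * 16 * 8192 = 128KB of queued bumps per buffer before further bumps are
 * dropped (and counted in b->dropped). */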

static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv);
static uint64_t lru_total_bumps_dropped(void);

/* Get the next CAS id for a new item. */
/* TODO: refactor some atomics for this. */
uint64_t get_cas_id(void) {
    pthread_mutex_lock(&cas_id_lock);
    uint64_t next_id = ++cas_id;
    pthread_mutex_unlock(&cas_id_lock);
    return next_id;
}
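/* One possible shape for the atomics TODO above; a sketch only, assuming
 * C11 <stdatomic.h> is acceptable here, and not what this file does today:
 *
 *     static _Atomic uint64_t cas_id;
 *     uint64_t get_cas_id(void) {
 *         // relaxed is enough: we only need uniqueness, not ordering.
 *         return atomic_fetch_add_explicit(&cas_id, 1, memory_order_relaxed) + 1;
 *     }
 */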

void set_cas_id(uint64_t new_cas) {
    pthread_mutex_lock(&cas_id_lock);
    cas_id = new_cas;
    pthread_mutex_unlock(&cas_id_lock);
}

int item_is_flushed(item *it) {
    rel_time_t oldest_live = settings.oldest_live;
    uint64_t cas = ITEM_get_cas(it);
    uint64_t oldest_cas = settings.oldest_cas;
    if (oldest_live == 0 || oldest_live > current_time)
        return 0;
    if ((it->time <= oldest_live)
            || (oldest_cas != 0 && cas != 0 && cas < oldest_cas)) {
        return 1;
    }
    return 0;
}

/* must be locked before call */
unsigned int do_get_lru_size(uint32_t id) {
    return sizes[id];
}

/* Enable this for reference-count debugging. */
#if 0
# define DEBUG_REFCNT(it,op) \
                fprintf(stderr, "item %p refcnt(%c) %d %c%c\n", \
                        (void *)it, op, it->refcount, \
                        (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
                        (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
#else
# define DEBUG_REFCNT(it,op) while(0)
#endif

/**
 * Generates the variable-sized part of the header for an object.
 *
 * nkey    - The length of the key
 * flags   - client flags
 * nbytes  - Number of bytes to hold the value and an additional CRLF terminator
 * suffix  - scratch buffer, unused by this version of the function
 * nsuffix - set to the space needed to store the client flags
 *           (0 when flags == 0)
 *
 * Returns the total size of the header.
 */
static size_t item_make_header(const uint8_t nkey, const unsigned int flags, const int nbytes,
                     char *suffix, uint8_t *nsuffix) {
    if (flags == 0) {
        *nsuffix = 0;
    } else {
        *nsuffix = sizeof(flags);
    }
    return sizeof(item) + nkey + *nsuffix + nbytes;
}
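/* Worked example (illustrative; assumes sizeof(item) is 48, as on a typical
 * 64-bit build): a 3-byte key arrives as nkey + 1 = 4, nonzero flags cost
 * *nsuffix = 4, and a 5-byte value arrives as nbytes = 7 (value + "\r\n"),
 * giving 48 + 4 + 4 + 7 = 63 bytes, before the optional sizeof(uint64_t)
 * CAS suffix the callers add. */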

item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
    item *it = NULL;
    int i;
    /* If no memory is available, attempt a direct LRU juggle/eviction */
    /* This is a race in order to simplify lru_pull_tail; in cases where
     * locked items are on the tail, you want them to fall out and cause
     * occasional OOM's, rather than internally work around them.
     * This also gives one fewer code path for slab alloc/free
     */
    for (i = 0; i < 10; i++) {
        /* Try to reclaim memory first */
        if (!settings.lru_segmented) {
            lru_pull_tail(id, COLD_LRU, 0, 0, 0, NULL);
        }
        it = slabs_alloc(ntotal, id, 0);

        if (it == NULL) {
            // We send '0' in for "total_bytes" as this routine is always
            // pulling to evict, or forcing HOT -> COLD migration.
            // As of this writing, total_bytes isn't at all used with COLD_LRU.
            if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0, NULL) <= 0) {
                if (settings.lru_segmented) {
                    lru_pull_tail(id, HOT_LRU, 0, 0, 0, NULL);
                } else {
                    break;
                }
            }
        } else {
            break;
        }
    }

    if (i > 0) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].direct_reclaims += i;
        pthread_mutex_unlock(&lru_locks[id]);
    }

    return it;
}

/* Chain another chunk onto this chunk. */
/* slab mover: if it finds a chunk without ITEM_CHUNK flag, and no ITEM_LINKED
 * flag, it counts as busy and skips.
 * I think it might still not be safe to do linking outside of the slab lock
 */
item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain) {
    // TODO: Should be a cleaner way of finding real size with slabber calls
    size_t size = bytes_remain + sizeof(item_chunk);
    if (size > settings.slab_chunk_size_max)
        size = settings.slab_chunk_size_max;
    unsigned int id = slabs_clsid(size);

    item_chunk *nch = (item_chunk *) do_item_alloc_pull(size, id);
    if (nch == NULL)
        return NULL;

    // link in.
    // ITEM_CHUNK[ED] bits need to be protected by the slabs lock.
    slabs_mlock();
    nch->head = ch->head;
    ch->next = nch;
    nch->prev = ch;
    nch->next = 0;
    nch->used = 0;
    nch->slabs_clsid = id;
    nch->size = size - sizeof(item_chunk);
    nch->it_flags |= ITEM_CHUNK;
    slabs_munlock();
    return nch;
}

item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
                    const rel_time_t exptime, const int nbytes) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    // Avoid potential underflows.
    if (nbytes < 2)
        return 0;

    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    unsigned int hdr_id = 0;
    if (id == 0)
        return 0;

    /* This is a large item. Allocate a header object now, lazily allocate
     *  chunks while reading the upload.
     */
    if (ntotal > settings.slab_chunk_size_max) {
        /* We still link this item into the LRU for the larger slab class, but
         * we're pulling a header from an entirely different slab class. The
         * free routines handle large items specifically.
         */
        int htotal = nkey + 1 + nsuffix + sizeof(item) + sizeof(item_chunk);
        if (settings.use_cas) {
            htotal += sizeof(uint64_t);
        }
#ifdef NEED_ALIGN
        // header chunk needs to be padded on some systems
        int remain = htotal % 8;
        if (remain != 0) {
            htotal += 8 - remain;
        }
#endif
        hdr_id = slabs_clsid(htotal);
        it = do_item_alloc_pull(htotal, hdr_id);
        /* setting ITEM_CHUNKED is fine here because we aren't LINKED yet. */
        if (it != NULL)
            it->it_flags |= ITEM_CHUNKED;
    } else {
        it = do_item_alloc_pull(ntotal, id);
    }

    if (it == NULL) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].outofmemory++;
        pthread_mutex_unlock(&lru_locks[id]);
        return NULL;
    }

    assert(it->it_flags == 0 || it->it_flags == ITEM_CHUNKED);
    //assert(it != heads[id]);

    /* Refcount is seeded to 1 by slabs_alloc() */
    it->next = it->prev = 0;

    /* Items are initially loaded into the HOT_LRU. This is '0' but I want at
     * least a note here. Compiler (hopefully?) optimizes this out.
     */
    if (settings.temp_lru &&
            exptime - current_time <= settings.temporary_ttl) {
        id |= TEMP_LRU;
    } else if (settings.lru_segmented) {
        id |= HOT_LRU;
    } else {
        /* There is only COLD in compat-mode */
        id |= COLD_LRU;
    }
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags |= settings.use_cas ? ITEM_CAS : 0;
    it->it_flags |= nsuffix != 0 ? ITEM_CFLAGS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    if (nsuffix > 0) {
        memcpy(ITEM_suffix(it), &flags, sizeof(flags));
    }

    /* Initialize internal chunk. */
    if (it->it_flags & ITEM_CHUNKED) {
        item_chunk *chunk = (item_chunk *) ITEM_schunk(it);

        chunk->next = 0;
        chunk->prev = 0;
        chunk->used = 0;
        chunk->size = 0;
        chunk->head = it;
        chunk->orig_clsid = hdr_id;
    }
    it->h_next = 0;

    return it;
}

void item_free(item *it) {
    size_t ntotal = ITEM_ntotal(it);
    unsigned int clsid;
    assert((it->it_flags & ITEM_LINKED) == 0);
    assert(it != heads[it->slabs_clsid]);
    assert(it != tails[it->slabs_clsid]);
    assert(it->refcount == 0);

    /* so slab size changer can tell later if item is already free or not */
    clsid = ITEM_clsid(it);
    DEBUG_REFCNT(it, 'F');
    slabs_free(it, ntotal, clsid);
}

/**
 * Returns true if an item will fit in the cache (its size does not exceed
 * the maximum for a cache entry.)
 */
bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
    char prefix[40];
    uint8_t nsuffix;
    if (nbytes < 2)
        return false;

    size_t ntotal = item_make_header(nkey + 1, flags, nbytes,
                                     prefix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    return slabs_clsid(ntotal) != 0;
}

/* fixing stats/references during warm start */
void do_item_link_fixup(item *it) {
    item **head, **tail;
    int ntotal = ITEM_ntotal(it);
    uint32_t hv = hash(ITEM_key(it), it->nkey);
    assoc_insert(it, hv);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    if (it->prev == 0 && *head == 0) *head = it;
    if (it->next == 0 && *tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
    sizes_bytes[it->slabs_clsid] += ntotal;

    STATS_LOCK();
    stats_state.curr_bytes += ntotal;
    stats_state.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    item_stats_sizes_add(it);

    return;
}

static void do_item_link_q(item *it) { /* item is the new head */
    item **head, **tail;
    assert((it->it_flags & ITEM_SLABBED) == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    assert(it != *head);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = 0;
    it->next = *head;
    if (it->next) it->next->prev = it;
    *head = it;
    if (*tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
#ifdef EXTSTORE
    if (it->it_flags & ITEM_HDR) {
        sizes_bytes[it->slabs_clsid] += (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
    } else {
        sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
    }
#else
    sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
#endif

    return;
}

static void item_link_q(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_link_q(it);
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

static void item_link_q_warm(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_link_q(it);
    itemstats[it->slabs_clsid].moves_to_warm++;
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}
static void do_item_unlink_q(item *it) {
    item **head, **tail;
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    sizes[it->slabs_clsid]--;
#ifdef EXTSTORE
    if (it->it_flags & ITEM_HDR) {
        sizes_bytes[it->slabs_clsid] -= (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
    } else {
        sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
    }
#else
    sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
#endif

    return;
}

static void item_unlink_q(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_unlink_q(it);
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

int do_item_link(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;

    STATS_LOCK();
    stats_state.curr_bytes += ITEM_ntotal(it);
    stats_state.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
    assoc_insert(it, hv);
    item_link_q(it);
    refcount_incr(it);
    item_stats_sizes_add(it);

    return 1;
}

void do_item_unlink(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats_state.curr_bytes -= ITEM_ntotal(it);
        stats_state.curr_items -= 1;
        STATS_UNLOCK();
        item_stats_sizes_remove(it);
        assoc_delete(ITEM_key(it), it->nkey, hv);
        item_unlink_q(it);
        do_item_remove(it);
    }
}

/* FIXME: Is it necessary to keep this copy/pasted code? */
void do_item_unlink_nolock(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats_state.curr_bytes -= ITEM_ntotal(it);
        stats_state.curr_items -= 1;
        STATS_UNLOCK();
        item_stats_sizes_remove(it);
        assoc_delete(ITEM_key(it), it->nkey, hv);
        do_item_unlink_q(it);
        do_item_remove(it);
    }
}

void do_item_remove(item *it) {
    MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);
    assert(it->refcount > 0);

    if (refcount_decr(it) == 0) {
        item_free(it);
    }
}

/* Bump the last accessed time, or relink if we're in compat mode */
void do_item_update(item *it) {
    MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);

    /* Hits to COLD_LRU immediately move to WARM. */
    if (settings.lru_segmented) {
        assert((it->it_flags & ITEM_SLABBED) == 0);
        if ((it->it_flags & ITEM_LINKED) != 0) {
            if (ITEM_lruid(it) == COLD_LRU && (it->it_flags & ITEM_ACTIVE)) {
                it->time = current_time;
                item_unlink_q(it);
                it->slabs_clsid = ITEM_clsid(it);
                it->slabs_clsid |= WARM_LRU;
                it->it_flags &= ~ITEM_ACTIVE;
                item_link_q_warm(it);
            } else {
                it->time = current_time;
            }
        }
    } else if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
        assert((it->it_flags & ITEM_SLABBED) == 0);

        if ((it->it_flags & ITEM_LINKED) != 0) {
            it->time = current_time;
            item_unlink_q(it);
            item_link_q(it);
        }
    }
}
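/* Summary of the two modes above (illustrative):
 * - segmented: an ACTIVE item hit in COLD is re-tagged WARM and relinked;
 *   any other linked item just gets its access time refreshed in place.
 * - compat (flat LRU): the item is unlinked and relinked at the head, but
 *   only if its last bump is older than ITEM_UPDATE_INTERVAL, which rate
 *   limits queue churn for very hot keys. */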

int do_item_replace(item *it, item *new_it, const uint32_t hv) {
    MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
                           ITEM_key(new_it), new_it->nkey, new_it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    do_item_unlink(it, hv);
    return do_item_link(new_it, hv);
}

/*@null@*/
/* This is walking the line of violating lock order, but I think it's safe.
 * If the LRU lock is held, an item in the LRU cannot be wiped and freed.
 * The data could possibly be overwritten, but this is only accessing the
 * headers.
 * It may not be the best idea to leave it like this, but for now it's safe.
 */
char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
    unsigned int memlimit = 2 * 1024 * 1024;   /* 2MB max response size */
    char *buffer;
    unsigned int bufcurr;
    item *it;
    unsigned int len;
    unsigned int shown = 0;
    char key_temp[KEY_MAX_LENGTH + 1];
    char temp[512];
    unsigned int id = slabs_clsid;
    id |= COLD_LRU;

    pthread_mutex_lock(&lru_locks[id]);
    it = heads[id];

    buffer = malloc((size_t)memlimit);
    if (buffer == 0) {
        pthread_mutex_unlock(&lru_locks[id]);
        return NULL;
    }
    bufcurr = 0;

    while (it != NULL && (limit == 0 || shown < limit)) {
        assert(it->nkey <= KEY_MAX_LENGTH);
        // protect from printing binary keys.
        if ((it->nbytes == 0 && it->nkey == 0) || (it->it_flags & ITEM_KEY_BINARY)) {
            it = it->next;
            continue;
        }
        /* Copy the key since it may not be null-terminated in the struct */
        strncpy(key_temp, ITEM_key(it), it->nkey);
        key_temp[it->nkey] = 0x00; /* terminate */
        len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %llu s]\r\n",
                       key_temp, it->nbytes - 2,
                       it->exptime == 0 ? 0 :
                       (unsigned long long)it->exptime + process_started);
        if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
            break;
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;
        shown++;
        it = it->next;
    }

    memcpy(buffer + bufcurr, "END\r\n", 6);
    bufcurr += 5;

    *bytes = bufcurr;
    pthread_mutex_unlock(&lru_locks[id]);
    return buffer;
}
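/* Example response (illustrative keys and timestamps; "stats cachedump 5 2"
 * against a class holding "foo" and "bar"):
 *
 *     ITEM bar [6 b; 0 s]
 *     ITEM foo [4 b; 1700000000 s]
 *     END
 *
 * The reported size is it->nbytes - 2 (the stored CRLF is excluded), and the
 * trailing NUL copied along with "END\r\n" is intentionally not counted in
 * *bytes (bufcurr += 5 after a 6-byte memcpy). */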

/* With refactoring of the various stats code the automover won't need a
 * custom function here.
 */
void fill_item_stats_automove(item_stats_automove *am) {
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        item_stats_automove *cur = &am[n];

        // outofmemory records into HOT
        int i = n | HOT_LRU;
        pthread_mutex_lock(&lru_locks[i]);
        cur->outofmemory = itemstats[i].outofmemory;
        pthread_mutex_unlock(&lru_locks[i]);

        // evictions and tail age are from COLD
        i = n | COLD_LRU;
        pthread_mutex_lock(&lru_locks[i]);
        cur->evicted = itemstats[i].evicted;
        if (tails[i]) {
            cur->age = current_time - tails[i]->time;
        } else {
            cur->age = 0;
        }
        pthread_mutex_unlock(&lru_locks[i]);
    }
}

void item_stats_totals(ADD_STAT add_stats, void *c) {
    itemstats_t totals;
    memset(&totals, 0, sizeof(itemstats_t));
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        int x;
        int i;
        for (x = 0; x < 4; x++) {
            i = n | lru_type_map[x];
            pthread_mutex_lock(&lru_locks[i]);
            totals.expired_unfetched += itemstats[i].expired_unfetched;
            totals.evicted_unfetched += itemstats[i].evicted_unfetched;
            totals.evicted_active += itemstats[i].evicted_active;
            totals.evicted += itemstats[i].evicted;
            totals.reclaimed += itemstats[i].reclaimed;
            totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
            totals.crawler_items_checked += itemstats[i].crawler_items_checked;
            totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
            totals.moves_to_cold += itemstats[i].moves_to_cold;
            totals.moves_to_warm += itemstats[i].moves_to_warm;
            totals.moves_within_lru += itemstats[i].moves_within_lru;
            totals.direct_reclaims += itemstats[i].direct_reclaims;
            pthread_mutex_unlock(&lru_locks[i]);
        }
    }
    APPEND_STAT("expired_unfetched", "%llu",
                (unsigned long long)totals.expired_unfetched);
    APPEND_STAT("evicted_unfetched", "%llu",
                (unsigned long long)totals.evicted_unfetched);
    if (settings.lru_maintainer_thread) {
        APPEND_STAT("evicted_active", "%llu",
                    (unsigned long long)totals.evicted_active);
    }
    APPEND_STAT("evictions", "%llu",
                (unsigned long long)totals.evicted);
    APPEND_STAT("reclaimed", "%llu",
                (unsigned long long)totals.reclaimed);
    APPEND_STAT("crawler_reclaimed", "%llu",
                (unsigned long long)totals.crawler_reclaimed);
    APPEND_STAT("crawler_items_checked", "%llu",
                (unsigned long long)totals.crawler_items_checked);
    APPEND_STAT("lrutail_reflocked", "%llu",
                (unsigned long long)totals.lrutail_reflocked);
    if (settings.lru_maintainer_thread) {
        APPEND_STAT("moves_to_cold", "%llu",
                    (unsigned long long)totals.moves_to_cold);
        APPEND_STAT("moves_to_warm", "%llu",
                    (unsigned long long)totals.moves_to_warm);
        APPEND_STAT("moves_within_lru", "%llu",
                    (unsigned long long)totals.moves_within_lru);
        APPEND_STAT("direct_reclaims", "%llu",
                    (unsigned long long)totals.direct_reclaims);
        APPEND_STAT("lru_bumps_dropped", "%llu",
                    (unsigned long long)lru_total_bumps_dropped());
    }
}

void item_stats(ADD_STAT add_stats, void *c) {
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);
    itemstats_t totals;
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        memset(&totals, 0, sizeof(itemstats_t));
        int x;
        int i;
        unsigned int size = 0;
        unsigned int age  = 0;
        unsigned int age_hot = 0;
        unsigned int age_warm = 0;
        unsigned int lru_size_map[4];
        const char *fmt = "items:%d:%s";
        char key_str[STAT_KEY_LEN];
        char val_str[STAT_VAL_LEN];
        int klen = 0, vlen = 0;
        for (x = 0; x < 4; x++) {
            i = n | lru_type_map[x];
            pthread_mutex_lock(&lru_locks[i]);
            totals.evicted += itemstats[i].evicted;
            totals.evicted_nonzero += itemstats[i].evicted_nonzero;
            totals.outofmemory += itemstats[i].outofmemory;
            totals.tailrepairs += itemstats[i].tailrepairs;
            totals.reclaimed += itemstats[i].reclaimed;
            totals.expired_unfetched += itemstats[i].expired_unfetched;
            totals.evicted_unfetched += itemstats[i].evicted_unfetched;
            totals.evicted_active += itemstats[i].evicted_active;
            totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
            totals.crawler_items_checked += itemstats[i].crawler_items_checked;
            totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
            totals.moves_to_cold += itemstats[i].moves_to_cold;
            totals.moves_to_warm += itemstats[i].moves_to_warm;
            totals.moves_within_lru += itemstats[i].moves_within_lru;
            totals.direct_reclaims += itemstats[i].direct_reclaims;
            totals.mem_requested += sizes_bytes[i];
            size += sizes[i];
            lru_size_map[x] = sizes[i];
            if (lru_type_map[x] == COLD_LRU && tails[i] != NULL) {
                age = current_time - tails[i]->time;
            } else if (lru_type_map[x] == HOT_LRU && tails[i] != NULL) {
                age_hot = current_time - tails[i]->time;
            } else if (lru_type_map[x] == WARM_LRU && tails[i] != NULL) {
                age_warm = current_time - tails[i]->time;
            }
            if (lru_type_map[x] == COLD_LRU)
                totals.evicted_time = itemstats[i].evicted_time;
            switch (lru_type_map[x]) {
                case HOT_LRU:
                    totals.hits_to_hot = thread_stats.lru_hits[i];
                    break;
                case WARM_LRU:
                    totals.hits_to_warm = thread_stats.lru_hits[i];
                    break;
                case COLD_LRU:
                    totals.hits_to_cold = thread_stats.lru_hits[i];
                    break;
                case TEMP_LRU:
                    totals.hits_to_temp = thread_stats.lru_hits[i];
                    break;
            }
            pthread_mutex_unlock(&lru_locks[i]);
        }
        if (size == 0)
            continue;
        APPEND_NUM_FMT_STAT(fmt, n, "number", "%u", size);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "number_hot", "%u", lru_size_map[0]);
            APPEND_NUM_FMT_STAT(fmt, n, "number_warm", "%u", lru_size_map[1]);
            APPEND_NUM_FMT_STAT(fmt, n, "number_cold", "%u", lru_size_map[2]);
            if (settings.temp_lru) {
                APPEND_NUM_FMT_STAT(fmt, n, "number_temp", "%u", lru_size_map[3]);
            }
            APPEND_NUM_FMT_STAT(fmt, n, "age_hot", "%u", age_hot);
            APPEND_NUM_FMT_STAT(fmt, n, "age_warm", "%u", age_warm);
        }
        APPEND_NUM_FMT_STAT(fmt, n, "age", "%u", age);
        APPEND_NUM_FMT_STAT(fmt, n, "mem_requested", "%llu", (unsigned long long)totals.mem_requested);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted",
                            "%llu", (unsigned long long)totals.evicted);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_nonzero",
                            "%llu", (unsigned long long)totals.evicted_nonzero);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_time",
                            "%u", totals.evicted_time);
        APPEND_NUM_FMT_STAT(fmt, n, "outofmemory",
                            "%llu", (unsigned long long)totals.outofmemory);
        APPEND_NUM_FMT_STAT(fmt, n, "tailrepairs",
                            "%llu", (unsigned long long)totals.tailrepairs);
        APPEND_NUM_FMT_STAT(fmt, n, "reclaimed",
                            "%llu", (unsigned long long)totals.reclaimed);
        APPEND_NUM_FMT_STAT(fmt, n, "expired_unfetched",
                            "%llu", (unsigned long long)totals.expired_unfetched);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_unfetched",
                            "%llu", (unsigned long long)totals.evicted_unfetched);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "evicted_active",
                                "%llu", (unsigned long long)totals.evicted_active);
        }
        APPEND_NUM_FMT_STAT(fmt, n, "crawler_reclaimed",
                            "%llu", (unsigned long long)totals.crawler_reclaimed);
        APPEND_NUM_FMT_STAT(fmt, n, "crawler_items_checked",
                            "%llu", (unsigned long long)totals.crawler_items_checked);
        APPEND_NUM_FMT_STAT(fmt, n, "lrutail_reflocked",
                            "%llu", (unsigned long long)totals.lrutail_reflocked);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "moves_to_cold",
                                "%llu", (unsigned long long)totals.moves_to_cold);
            APPEND_NUM_FMT_STAT(fmt, n, "moves_to_warm",
                                "%llu", (unsigned long long)totals.moves_to_warm);
            APPEND_NUM_FMT_STAT(fmt, n, "moves_within_lru",
                                "%llu", (unsigned long long)totals.moves_within_lru);
            APPEND_NUM_FMT_STAT(fmt, n, "direct_reclaims",
                                "%llu", (unsigned long long)totals.direct_reclaims);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_hot",
                                "%llu", (unsigned long long)totals.hits_to_hot);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_warm",
                                "%llu", (unsigned long long)totals.hits_to_warm);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_cold",
                                "%llu", (unsigned long long)totals.hits_to_cold);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_temp",
                                "%llu", (unsigned long long)totals.hits_to_temp);
        }
    }

    /* getting here means both ascii and binary terminators fit */
    add_stats(NULL, 0, NULL, 0, c);
}

bool item_stats_sizes_status(void) {
    bool ret = false;
    mutex_lock(&stats_sizes_lock);
    if (stats_sizes_hist != NULL)
        ret = true;
    mutex_unlock(&stats_sizes_lock);
    return ret;
}

void item_stats_sizes_init(void) {
    if (stats_sizes_hist != NULL)
        return;
    stats_sizes_buckets = settings.item_size_max / 32 + 1;
    stats_sizes_hist = calloc(stats_sizes_buckets, sizeof(int));
    stats_sizes_cas_min = (settings.use_cas) ? get_cas_id() : 0;
}

void item_stats_sizes_enable(ADD_STAT add_stats, void *c) {
    mutex_lock(&stats_sizes_lock);
    if (!settings.use_cas) {
        APPEND_STAT("sizes_status", "error", "");
        APPEND_STAT("sizes_error", "cas_support_disabled", "");
    } else if (stats_sizes_hist == NULL) {
        item_stats_sizes_init();
        if (stats_sizes_hist != NULL) {
            APPEND_STAT("sizes_status", "enabled", "");
        } else {
            APPEND_STAT("sizes_status", "error", "");
            APPEND_STAT("sizes_error", "no_memory", "");
        }
    } else {
        APPEND_STAT("sizes_status", "enabled", "");
    }
    mutex_unlock(&stats_sizes_lock);
}

void item_stats_sizes_disable(ADD_STAT add_stats, void *c) {
    mutex_lock(&stats_sizes_lock);
    if (stats_sizes_hist != NULL) {
        free(stats_sizes_hist);
        stats_sizes_hist = NULL;
    }
    APPEND_STAT("sizes_status", "disabled", "");
    mutex_unlock(&stats_sizes_lock);
}

void item_stats_sizes_add(item *it) {
    if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
        return;
    int ntotal = ITEM_ntotal(it);
    int bucket = ntotal / 32;
    if ((ntotal % 32) != 0) bucket++;
    if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]++;
}
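/* Worked example (illustrative): an item with ntotal = 100 gives
 * 100 / 32 = 3 with a nonzero remainder, so it lands in bucket 4, the
 * histogram row item_stats_sizes() prints with key "128" (4 * 32). An exact
 * multiple such as ntotal = 96 stays in bucket 3 ("96"). */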

/* I think there's no way for this to be accurate without using the CAS
 * value, since items that get their time value bumped would still pass this
 * validation.
 */
void item_stats_sizes_remove(item *it) {
    if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
        return;
    int ntotal = ITEM_ntotal(it);
    int bucket = ntotal / 32;
    if ((ntotal % 32) != 0) bucket++;
    if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]--;
}

/** dumps out a list of objects of each size, with granularity of 32 bytes */
/*@null@*/
/* Locks are correct based on a technicality. Holds LRU lock while doing the
 * work, so items can't go invalid, and it's only looking at header sizes
 * which don't change.
 */
void item_stats_sizes(ADD_STAT add_stats, void *c) {
    mutex_lock(&stats_sizes_lock);

    if (stats_sizes_hist != NULL) {
        int i;
        for (i = 0; i < stats_sizes_buckets; i++) {
            if (stats_sizes_hist[i] != 0) {
                char key[12];
                snprintf(key, sizeof(key), "%d", i * 32);
                APPEND_STAT(key, "%u", stats_sizes_hist[i]);
            }
        }
    } else {
        APPEND_STAT("sizes_status", "disabled", "");
    }

    add_stats(NULL, 0, NULL, 0, c);
    mutex_unlock(&stats_sizes_lock);
}

/** wrapper around assoc_find which does the lazy expiration logic */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c, const bool do_update) {
    item *it = assoc_find(key, nkey, hv);
    if (it != NULL) {
        refcount_incr(it);
        /* Optimization for slab reassignment. prevents popular items from
         * jamming in busy wait. Can only do this here to satisfy lock order
         * of item_lock, slabs_lock. */
        /* This was made unsafe by removal of the cache_lock:
         * slab_rebalance_signal and slab_rebal.* are modified in a separate
         * thread under slabs_lock. If slab_rebalance_signal = 1, slab_start =
         * NULL (0), but slab_end is still equal to some value, this would end
         * up unlinking every item fetched.
         * This is either an acceptable loss, or if slab_rebalance_signal is
         * true, slab_start/slab_end should be put behind the slabs_lock.
         * Which would cause a huge potential slowdown.
         * Could also use a specific lock for slab_rebal.* and
         * slab_rebalance_signal (shorter lock?)
         */
        /*if (slab_rebalance_signal &&
            ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
        }*/
    }
    int was_found = 0;

    if (settings.verbose > 2) {
        int ii;
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND ");
        } else {
            fprintf(stderr, "> FOUND KEY ");
        }
        for (ii = 0; ii < nkey; ++ii) {
            fprintf(stderr, "%c", key[ii]);
        }
    }

    if (it != NULL) {
        was_found = 1;
        if (item_is_flushed(it)) {
            do_item_unlink(it, hv);
            STORAGE_delete(c->thread->storage, it);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&c->thread->stats.mutex);
            c->thread->stats.get_flushed++;
            pthread_mutex_unlock(&c->thread->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by flush");
            }
            was_found = 2;
        } else if (it->exptime != 0 && it->exptime <= current_time) {
            do_item_unlink(it, hv);
            STORAGE_delete(c->thread->storage, it);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&c->thread->stats.mutex);
            c->thread->stats.get_expired++;
            pthread_mutex_unlock(&c->thread->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by expire");
            }
            was_found = 3;
        } else {
            if (do_update) {
                do_item_bump(c, it, hv);
            }
            DEBUG_REFCNT(it, '+');
        }
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");
    /* For now this is in addition to the above verbose logging. */
    LOGGER_LOG(c->thread->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key,
               nkey, (it) ? it->nbytes : 0, (it) ? ITEM_clsid(it) : 0, c->sfd);

    return it;
}

// Requires lock held for item.
// Split out of do_item_get() to allow mget functions to look through header
// data before losing state modified via the bump function.
void do_item_bump(conn *c, item *it, const uint32_t hv) {
    /* We update the hit markers only during fetches.
     * An item needs to be hit twice overall to be considered
     * ACTIVE, but only needs a single hit to maintain activity
     * afterward.
     * FETCHED tells if an item has ever been active.
     */
    if (settings.lru_segmented) {
        if ((it->it_flags & ITEM_ACTIVE) == 0) {
            if ((it->it_flags & ITEM_FETCHED) == 0) {
                it->it_flags |= ITEM_FETCHED;
            } else {
                it->it_flags |= ITEM_ACTIVE;
                if (ITEM_lruid(it) != COLD_LRU) {
                    it->time = current_time; // only need to bump time.
                } else if (!lru_bump_async(c->thread->lru_bump_buf, it, hv)) {
                    // add flag before async bump to avoid race.
                    it->it_flags &= ~ITEM_ACTIVE;
                }
            }
        }
    } else {
        it->it_flags |= ITEM_FETCHED;
        do_item_update(it);
    }
}
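/* The two-hit ladder above, spelled out (illustrative):
 *   hit 1: no flags set        -> ITEM_FETCHED
 *   hit 2: ITEM_FETCHED set    -> ITEM_FETCHED|ITEM_ACTIVE (bump time, or
 *          queue an async bump via the worker's lru_bump_buf when in COLD)
 *   hit 3+: ITEM_ACTIVE set    -> no-op until the maintainer strips ACTIVE */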

item *do_item_touch(const char *key, size_t nkey, uint32_t exptime,
                    const uint32_t hv, conn *c) {
    item *it = do_item_get(key, nkey, hv, c, DO_UPDATE);
    if (it != NULL) {
        it->exptime = exptime;
    }
    return it;
}

/*** LRU MAINTENANCE THREAD ***/

/* Returns the number of items removed, expired, or evicted.
 * Callable from worker threads or the LRU maintainer thread */
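/* Example invocations found elsewhere in this file (for orientation):
 *   lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0, NULL);   // evict for alloc
 *   lru_pull_tail(slabs_clsid, TEMP_LRU, 0, 0, 0, NULL);       // reclaim only
 *   lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes,
 *                 LRU_PULL_CRAWL_BLOCKS, hot_age, NULL);       // maintainer juggle
 */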
int lru_pull_tail(const int orig_id, const int cur_lru,
        const uint64_t total_bytes, const uint8_t flags, const rel_time_t max_age,
        struct lru_pull_tail_return *ret_it) {
    item *it = NULL;
    int id = orig_id;
    int removed = 0;
    if (id == 0)
        return 0;

    int tries = 5;
    item *search;
    item *next_it;
    void *hold_lock = NULL;
    unsigned int move_to_lru = 0;
    uint64_t limit = 0;

    id |= cur_lru;
    pthread_mutex_lock(&lru_locks[id]);
    search = tails[id];
    /* We walk up *only* for locked items, and if bottom is expired. */
    for (; tries > 0 && search != NULL; tries--, search=next_it) {
        /* we might relink search mid-loop, so search->prev isn't reliable */
        next_it = search->prev;
        if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
            /* We are a crawler, ignore it. */
            if (flags & LRU_PULL_CRAWL_BLOCKS) {
                pthread_mutex_unlock(&lru_locks[id]);
                return 0;
            }
            tries++;
            continue;
        }
        uint32_t hv = hash(ITEM_key(search), search->nkey);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount. Also skip ourselves. */
        if ((hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(search) != 2) {
            /* Note pathological case with ref'ed items in tail.
             * Can still unlink the item, but it won't be reusable yet */
            itemstats[id].lrutail_reflocked++;
            /* In case of refcount leaks, enable for quick workaround. */
            /* WARNING: This can cause terrible corruption */
            if (settings.tail_repair_time &&
                    search->time + settings.tail_repair_time < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                /* This will call item_remove -> item_free since refcnt is 1 */
                STORAGE_delete(ext_storage, search);
                do_item_unlink_nolock(search, hv);
                item_trylock_unlock(hold_lock);
                continue;
            }
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)
            || item_is_flushed(search)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            /* refcnt 2 -> 1 */
            do_item_unlink_nolock(search, hv);
            STORAGE_delete(ext_storage, search);
            /* refcnt 1 -> 0 -> item_free */
            do_item_remove(search);
            item_trylock_unlock(hold_lock);
            removed++;

            /* If all we're finding are expired, can keep going */
            continue;
        }

        /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU.
         * If we're COLD_LRU, send to WARM_LRU unless we need to evict
         */
        switch (cur_lru) {
            case HOT_LRU:
                limit = total_bytes * settings.hot_lru_pct / 100;
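                /* intentional fallthrough: HOT_LRU shares WARM_LRU's
                 * move/evict logic below, using the limit computed above. */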
            case WARM_LRU:
                if (limit == 0)
                    limit = total_bytes * settings.warm_lru_pct / 100;
                /* Rescue ACTIVE items aggressively */
                if ((search->it_flags & ITEM_ACTIVE) != 0) {
                    search->it_flags &= ~ITEM_ACTIVE;
                    removed++;
                    if (cur_lru == WARM_LRU) {
                        itemstats[id].moves_within_lru++;
                        do_item_unlink_q(search);
                        do_item_link_q(search);
                        do_item_remove(search);
                        item_trylock_unlock(hold_lock);
                    } else {
                        /* Active HOT_LRU items flow to WARM */
                        itemstats[id].moves_to_warm++;
                        move_to_lru = WARM_LRU;
                        do_item_unlink_q(search);
                        it = search;
                    }
                } else if (sizes_bytes[id] > limit ||
                           current_time - search->time > max_age) {
                    itemstats[id].moves_to_cold++;
                    move_to_lru = COLD_LRU;
                    do_item_unlink_q(search);
                    it = search;
                    removed++;
                    break;
                } else {
                    /* Don't want to move to COLD, not active, bail out */
                    it = search;
                }
                break;
            case COLD_LRU:
                it = search; /* No matter what, we're stopping */
                if (flags & LRU_PULL_EVICT) {
                    if (settings.evict_to_free == 0) {
                        /* Don't think we need a counter for this. It'll OOM.  */
                        break;
                    }
                    itemstats[id].evicted++;
                    itemstats[id].evicted_time = current_time - search->time;
                    if (search->exptime != 0)
                        itemstats[id].evicted_nonzero++;
                    if ((search->it_flags & ITEM_FETCHED) == 0) {
                        itemstats[id].evicted_unfetched++;
                    }
                    if ((search->it_flags & ITEM_ACTIVE)) {
                        itemstats[id].evicted_active++;
                    }
                    LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search);
                    STORAGE_delete(ext_storage, search);
                    do_item_unlink_nolock(search, hv);
                    removed++;
                    if (settings.slab_automove == 2) {
                        slabs_reassign(-1, orig_id);
                    }
                } else if (flags & LRU_PULL_RETURN_ITEM) {
                    /* Keep a reference to this item and return it. */
                    ret_it->it = it;
                    ret_it->hv = hv;
                } else if ((search->it_flags & ITEM_ACTIVE) != 0
                        && settings.lru_segmented) {
                    itemstats[id].moves_to_warm++;
                    search->it_flags &= ~ITEM_ACTIVE;
                    move_to_lru = WARM_LRU;
                    do_item_unlink_q(search);
                    removed++;
                }
                break;
            case TEMP_LRU:
                it = search; /* Kill the loop. Parent only interested in reclaims */
                break;
        }
        if (it != NULL)
            break;
    }

    pthread_mutex_unlock(&lru_locks[id]);

    if (it != NULL) {
        if (move_to_lru) {
            it->slabs_clsid = ITEM_clsid(it);
            it->slabs_clsid |= move_to_lru;
            item_link_q(it);
        }
        if ((flags & LRU_PULL_RETURN_ITEM) == 0) {
            do_item_remove(it);
            item_trylock_unlock(hold_lock);
        }
    }

    return removed;
}


/* TODO: Third place this code needs to be deduped */
static void lru_bump_buf_link_q(lru_bump_buf *b) {
    pthread_mutex_lock(&bump_buf_lock);
    assert(b != bump_buf_head);

    b->prev = 0;
    b->next = bump_buf_head;
    if (b->next) b->next->prev = b;
    bump_buf_head = b;
    if (bump_buf_tail == 0) bump_buf_tail = b;
    pthread_mutex_unlock(&bump_buf_lock);
    return;
}

void *item_lru_bump_buf_create(void) {
    lru_bump_buf *b = calloc(1, sizeof(lru_bump_buf));
    if (b == NULL) {
        return NULL;
    }

    b->buf = bipbuf_new(sizeof(lru_bump_entry) * LRU_BUMP_BUF_SIZE);
    if (b->buf == NULL) {
        free(b);
        return NULL;
    }

    pthread_mutex_init(&b->mutex, NULL);

    lru_bump_buf_link_q(b);
    return b;
}

static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv) {
    bool ret = true;
    refcount_incr(it);
    pthread_mutex_lock(&b->mutex);
    lru_bump_entry *be = (lru_bump_entry *) bipbuf_request(b->buf, sizeof(lru_bump_entry));
    if (be != NULL) {
        be->it = it;
        be->hv = hv;
        if (bipbuf_push(b->buf, sizeof(lru_bump_entry)) == 0) {
            ret = false;
            b->dropped++;
        }
    } else {
        ret = false;
        b->dropped++;
    }
    if (!ret) {
        refcount_decr(it);
    }
    pthread_mutex_unlock(&b->mutex);
    return ret;
}

/* TODO: Might be worth a micro-optimization of having bump buffers link
 * themselves back into the central queue when queue goes from zero to
 * non-zero, then remove from list if zero more than N times.
 * If very few hits on cold this would avoid extra memory barriers from LRU
 * maintainer thread. If many hits, they'll just stay in the list.
 */
static bool lru_maintainer_bumps(void) {
    lru_bump_buf *b;
    lru_bump_entry *be;
    unsigned int size;
    unsigned int todo;
    bool bumped = false;
    pthread_mutex_lock(&bump_buf_lock);
    for (b = bump_buf_head; b != NULL; b=b->next) {
        pthread_mutex_lock(&b->mutex);
        be = (lru_bump_entry *) bipbuf_peek_all(b->buf, &size);
        pthread_mutex_unlock(&b->mutex);

        if (be == NULL) {
            continue;
        }
        todo = size;
        bumped = true;

        while (todo) {
            item_lock(be->hv);
            do_item_update(be->it);
            do_item_remove(be->it);
            item_unlock(be->hv);
            be++;
            todo -= sizeof(lru_bump_entry);
        }

        pthread_mutex_lock(&b->mutex);
        be = (lru_bump_entry *) bipbuf_poll(b->buf, size);
        pthread_mutex_unlock(&b->mutex);
    }
    pthread_mutex_unlock(&bump_buf_lock);
    return bumped;
}
1353 
lru_total_bumps_dropped(void)1354 static uint64_t lru_total_bumps_dropped(void) {
1355     uint64_t total = 0;
1356     lru_bump_buf *b;
1357     pthread_mutex_lock(&bump_buf_lock);
1358     for (b = bump_buf_head; b != NULL; b=b->next) {
1359         pthread_mutex_lock(&b->mutex);
1360         total += b->dropped;
1361         pthread_mutex_unlock(&b->mutex);
1362     }
1363     pthread_mutex_unlock(&bump_buf_lock);
1364     return total;
1365 }
1366 
1367 /* Loop up to N times:
1368  * If too many items are in HOT_LRU, push to COLD_LRU
1369  * If too many items are in WARM_LRU, push to COLD_LRU
1370  * If too many items are in COLD_LRU, poke COLD_LRU tail
1371  * 1000 loops with 1ms min sleep gives us under 1m items shifted/sec. The
1372  * locks can't handle much more than that. Leaving a TODO for how to
1373  * autoadjust in the future.
1374  */
static int lru_maintainer_juggle(const int slabs_clsid) {
    int i;
    int did_moves = 0;
    uint64_t total_bytes = 0;
    unsigned int chunks_perslab = 0;
    //unsigned int chunks_free = 0;
    /* TODO: if free_chunks below high watermark, increase aggressiveness */
    slabs_available_chunks(slabs_clsid, NULL,
            &chunks_perslab);
    if (settings.temp_lru) {
        /* Only looking for reclaims. Run before we size the LRU. */
        for (i = 0; i < 500; i++) {
            if (lru_pull_tail(slabs_clsid, TEMP_LRU, 0, 0, 0, NULL) <= 0) {
                break;
            } else {
                did_moves++;
            }
        }
    }

    rel_time_t cold_age = 0;
    rel_time_t hot_age = 0;
    rel_time_t warm_age = 0;
    /* If LRU is in flat mode, force items to drain into COLD via max age of 0 */
    if (settings.lru_segmented) {
        pthread_mutex_lock(&lru_locks[slabs_clsid|COLD_LRU]);
        if (tails[slabs_clsid|COLD_LRU]) {
            cold_age = current_time - tails[slabs_clsid|COLD_LRU]->time;
        }
        // Also build up total_bytes for the classes.
        total_bytes += sizes_bytes[slabs_clsid|COLD_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|COLD_LRU]);

        hot_age = cold_age * settings.hot_max_factor;
        warm_age = cold_age * settings.warm_max_factor;

        // total_bytes doesn't have to be exact. cache it for the juggles.
        pthread_mutex_lock(&lru_locks[slabs_clsid|HOT_LRU]);
        total_bytes += sizes_bytes[slabs_clsid|HOT_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|HOT_LRU]);

        pthread_mutex_lock(&lru_locks[slabs_clsid|WARM_LRU]);
        total_bytes += sizes_bytes[slabs_clsid|WARM_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|WARM_LRU]);
    }

    /* Juggle HOT/WARM up to N times */
    for (i = 0; i < 500; i++) {
        int do_more = 0;
        if (lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, hot_age, NULL) ||
            lru_pull_tail(slabs_clsid, WARM_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, warm_age, NULL)) {
            do_more++;
        }
        if (settings.lru_segmented) {
            do_more += lru_pull_tail(slabs_clsid, COLD_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, 0, NULL);
        }
        if (do_more == 0)
            break;
        did_moves++;
    }
    return did_moves;
}
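
/* Worked example of the age math above, with hypothetical numbers: if the
 * COLD tail item was last touched 1000 seconds ago and hot_max_factor /
 * warm_max_factor are 0.2 / 2.0, then hot_age is 200s and warm_age is
 * 2000s, so HOT items idle longer than 200s (and WARM items idle longer
 * than 2000s) become pull candidates even when those LRUs are under their
 * byte limits. */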

/* Will crawl all slab classes a minimum of once per hour */
#define MAX_MAINTCRAWL_WAIT (60 * 60)

/* Hoping user input will improve this function. This is all a wild guess.
 * Operation: Kicks the crawler for each slab id. The crawler gathers
 * statistics on items with nonzero expirations, then buckets how many of
 * them will expire per minute for the next hour.
 * This function checks the results of a run, and if it thinks more than 1%
 * of expirable objects are ready to go, kicks the crawler again to reap.
 * It will also kick the crawler once per minute regardless, waiting a
 * minute longer each time it has no work to do, up to an hour between
 * runs. The escalation starts at one minute so newly started daemons don't
 * wait too long before retrying a crawl.
 */
static void lru_maintainer_crawler_check(struct crawler_expired_data *cdata, logger *l) {
    int i;
    static rel_time_t next_crawls[POWER_LARGEST];
    static rel_time_t next_crawl_wait[POWER_LARGEST];
    uint8_t todo[POWER_LARGEST];
    memset(todo, 0, sizeof(uint8_t) * POWER_LARGEST);
    bool do_run = false;
    unsigned int tocrawl_limit = 0;

    // TODO: If not segmented LRU, skip non-cold
    for (i = POWER_SMALLEST; i < POWER_LARGEST; i++) {
        crawlerstats_t *s = &cdata->crawlerstats[i];
        /* A crawl has completed; use its results to recompute this class's
         * next crawl wait. */
        if (s->run_complete) {
            char *lru_name = "na";
            pthread_mutex_lock(&cdata->lock);
            int x;
            /* Should we crawl again? */
            uint64_t possible_reclaims = s->seen - s->noexp;
            uint64_t available_reclaims = 0;
            /* Need to think we can free at least 1% of the items before
             * crawling. */
            /* FIXME: Configurable? */
            uint64_t low_watermark = (possible_reclaims / 100) + 1;
            rel_time_t since_run = current_time - s->end_time;
            /* Don't bother if the payoff is too low. */
            for (x = 0; x < 60; x++) {
                available_reclaims += s->histo[x];
                if (available_reclaims > low_watermark) {
                    if (next_crawl_wait[i] < (x * 60)) {
                        next_crawl_wait[i] += 60;
                    } else if (next_crawl_wait[i] >= 60) {
                        next_crawl_wait[i] -= 60;
                    }
                    break;
                }
            }

            if (available_reclaims == 0) {
                next_crawl_wait[i] += 60;
            }

            if (next_crawl_wait[i] > MAX_MAINTCRAWL_WAIT) {
                next_crawl_wait[i] = MAX_MAINTCRAWL_WAIT;
            }

            next_crawls[i] = current_time + next_crawl_wait[i] + 5;
            switch (GET_LRU(i)) {
                case HOT_LRU:
                    lru_name = "hot";
                    break;
                case WARM_LRU:
                    lru_name = "warm";
                    break;
                case COLD_LRU:
                    lru_name = "cold";
                    break;
                case TEMP_LRU:
                    lru_name = "temp";
                    break;
            }
            LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_CRAWLER_STATUS, NULL,
                    CLEAR_LRU(i),
                    lru_name,
                    (unsigned long long)low_watermark,
                    (unsigned long long)available_reclaims,
                    (unsigned int)since_run,
                    next_crawls[i] - current_time,
                    s->end_time - s->start_time,
                    s->seen,
                    s->reclaimed);
            // Got our calculation, avoid running until next actual run.
            s->run_complete = false;
            pthread_mutex_unlock(&cdata->lock);
        }
        if (current_time > next_crawls[i]) {
            pthread_mutex_lock(&lru_locks[i]);
            if (sizes[i] > tocrawl_limit) {
                tocrawl_limit = sizes[i];
            }
            pthread_mutex_unlock(&lru_locks[i]);
            todo[i] = 1;
            do_run = true;
            next_crawls[i] = current_time + 5; // minimum retry wait.
        }
    }
    if (do_run) {
        if (settings.lru_crawler_tocrawl && settings.lru_crawler_tocrawl < tocrawl_limit) {
            tocrawl_limit = settings.lru_crawler_tocrawl;
        }
        lru_crawler_start(todo, tocrawl_limit, CRAWLER_AUTOEXPIRE, cdata, NULL, 0);
    }
}
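
/* Worked example of the wait adjustment above, with hypothetical numbers:
 * suppose a crawl saw 100000 expirable items (low_watermark = 1001) and the
 * histogram says 500 items expire in the first minute and 600 in the
 * second. The loop crosses the watermark at x == 1; if next_crawl_wait was
 * 300s, 300 is not less than x * 60 == 60, so the wait shrinks to 240s. If
 * the payoff instead only appeared at x == 10 (600s out), 300 < 600 and the
 * wait grows by 60s, drifting toward the point where reaping pays off. */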

slab_automove_reg_t slab_automove_default = {
    .init = slab_automove_init,
    .free = slab_automove_free,
    .run = slab_automove_run
};
#ifdef EXTSTORE
slab_automove_reg_t slab_automove_extstore = {
    .init = slab_automove_extstore_init,
    .free = slab_automove_extstore_free,
    .run = slab_automove_extstore_run
};
#endif
static pthread_t lru_maintainer_tid;

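/* Thread sleep bounds, in microseconds. */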
#define MAX_LRU_MAINTAINER_SLEEP 1000000
#define MIN_LRU_MAINTAINER_SLEEP 1000

static void *lru_maintainer_thread(void *arg) {
    slab_automove_reg_t *sam = &slab_automove_default;
#ifdef EXTSTORE
    void *storage = arg;
    if (storage != NULL)
        sam = &slab_automove_extstore;
#endif
    int i;
    useconds_t to_sleep = MIN_LRU_MAINTAINER_SLEEP;
    useconds_t last_sleep = MIN_LRU_MAINTAINER_SLEEP;
    rel_time_t last_crawler_check = 0;
    rel_time_t last_automove_check = 0;
    useconds_t next_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
    useconds_t backoff_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
    struct crawler_expired_data *cdata =
        calloc(1, sizeof(struct crawler_expired_data));
    if (cdata == NULL) {
        fprintf(stderr, "Failed to allocate crawler data for LRU maintainer thread\n");
        abort();
    }
    pthread_mutex_init(&cdata->lock, NULL);
    cdata->crawl_complete = true; // kick off the crawler.
    logger *l = logger_create();
    if (l == NULL) {
        fprintf(stderr, "Failed to allocate logger for LRU maintainer thread\n");
        abort();
    }

    double last_ratio = settings.slab_automove_ratio;
    void *am = sam->init(&settings);

    pthread_mutex_lock(&lru_maintainer_lock);
    if (settings.verbose > 2)
        fprintf(stderr, "Starting LRU maintainer background thread\n");
    while (do_run_lru_maintainer_thread) {
        pthread_mutex_unlock(&lru_maintainer_lock);
        if (to_sleep)
            usleep(to_sleep);
        pthread_mutex_lock(&lru_maintainer_lock);
        /* A sleep of zero counts as a minimum of a 1ms wait */
        last_sleep = to_sleep > 1000 ? to_sleep : 1000;
        to_sleep = MAX_LRU_MAINTAINER_SLEEP;

        STATS_LOCK();
        stats.lru_maintainer_juggles++;
        STATS_UNLOCK();

        /* Each slab class gets its own sleep to avoid hammering locks */
        for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
            next_juggles[i] = next_juggles[i] > last_sleep ? next_juggles[i] - last_sleep : 0;

            if (next_juggles[i] > 0) {
                // Sleep the thread just for the minimum amount (or not at all)
                if (next_juggles[i] < to_sleep)
                    to_sleep = next_juggles[i];
                continue;
            }

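            /* Juggle this class, then adjust its backoff: an idle pass
             * grows the interval by 12.5% (starting from MIN, capped at
             * MAX); a pass that moved items halves it, dropping to zero
             * once it falls below MIN. */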
            int did_moves = lru_maintainer_juggle(i);
            if (did_moves == 0) {
                if (backoff_juggles[i] != 0) {
                    backoff_juggles[i] += backoff_juggles[i] / 8;
                } else {
                    backoff_juggles[i] = MIN_LRU_MAINTAINER_SLEEP;
                }
                if (backoff_juggles[i] > MAX_LRU_MAINTAINER_SLEEP)
                    backoff_juggles[i] = MAX_LRU_MAINTAINER_SLEEP;
            } else if (backoff_juggles[i] > 0) {
                backoff_juggles[i] /= 2;
                if (backoff_juggles[i] < MIN_LRU_MAINTAINER_SLEEP) {
                    backoff_juggles[i] = 0;
                }
            }
            next_juggles[i] = backoff_juggles[i];
            if (next_juggles[i] < to_sleep)
                to_sleep = next_juggles[i];
        }

        /* Minimize the sleep if we had async LRU bumps to process */
        if (settings.lru_segmented && lru_maintainer_bumps() && to_sleep > 1000) {
            to_sleep = 1000;
        }

        /* Once per second at most */
        if (settings.lru_crawler && last_crawler_check != current_time) {
            lru_maintainer_crawler_check(cdata, l);
            last_crawler_check = current_time;
        }

        if (settings.slab_automove == 1 && last_automove_check != current_time) {
            if (last_ratio != settings.slab_automove_ratio) {
                sam->free(am);
                am = sam->init(&settings);
                last_ratio = settings.slab_automove_ratio;
            }
            int src, dst;
            sam->run(am, &src, &dst);
            if (src != -1 && dst != -1) {
                slabs_reassign(src, dst);
                LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL,
                        src, dst);
            }
            // dst == 0 means reclaim to global pool, be more aggressive
            if (dst != 0) {
                last_automove_check = current_time;
            } else {
                // also ensure we minimize the thread sleep
                to_sleep = 1000;
            }
        }
    }
    pthread_mutex_unlock(&lru_maintainer_lock);
    sam->free(am);
    // The LRU crawler *must* already be stopped before cdata is freed here.
    free(cdata);
    if (settings.verbose > 2)
        fprintf(stderr, "LRU maintainer thread stopping\n");

    return NULL;
}

int stop_lru_maintainer_thread(void) {
    int ret;
    pthread_mutex_lock(&lru_maintainer_lock);
    /* LRU thread is a sleep loop, will die on its own */
    do_run_lru_maintainer_thread = 0;
    pthread_mutex_unlock(&lru_maintainer_lock);
    if ((ret = pthread_join(lru_maintainer_tid, NULL)) != 0) {
        fprintf(stderr, "Failed to stop LRU maintainer thread: %s\n", strerror(ret));
        return -1;
    }
    settings.lru_maintainer_thread = false;
    return 0;
}

int start_lru_maintainer_thread(void *arg) {
    int ret;

    pthread_mutex_lock(&lru_maintainer_lock);
    do_run_lru_maintainer_thread = 1;
    settings.lru_maintainer_thread = true;
    if ((ret = pthread_create(&lru_maintainer_tid, NULL,
        lru_maintainer_thread, arg)) != 0) {
        fprintf(stderr, "Can't create LRU maintainer thread: %s\n",
            strerror(ret));
        pthread_mutex_unlock(&lru_maintainer_lock);
        return -1;
    }
    pthread_mutex_unlock(&lru_maintainer_lock);

    return 0;
}
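
/* Illustrative lifecycle sketch (kept out of the build): start the
 * maintainer once at daemon init, bracket LRU-sensitive work with
 * pause/resume, and join the thread at shutdown. The arg is only
 * meaningful under EXTSTORE; pass NULL otherwise. */
#if 0
static void lru_maintainer_lifecycle_example(void) {
    if (start_lru_maintainer_thread(NULL) != 0) {
        /* thread creation failed; the reason went to stderr */
        return;
    }
    /* While paused, the maintainer can't run, so the crawler can't be
     * kicked from it. */
    lru_maintainer_pause();
    /* ... work that must not race with LRU juggling ... */
    lru_maintainer_resume();
    if (stop_lru_maintainer_thread() != 0) {
        /* join failed */
    }
}
#endif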

/* If we hold this lock, crawler can't wake up or move */
void lru_maintainer_pause(void) {
    pthread_mutex_lock(&lru_maintainer_lock);
}

void lru_maintainer_resume(void) {
    pthread_mutex_unlock(&lru_maintainer_lock);
}

int init_lru_maintainer(void) {
    lru_maintainer_initialized = 1;
    return 0;
}

/* Tail linkers and queue shuffling for the LRU crawler. */
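/* These operate on the crawler's placeholder items (nbytes == 0), which
 * are spliced in at the tail of a class's LRU and then walked toward the
 * head, one swap per call, by do_item_crawl_q(). */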
void do_item_linktail_q(item *it) { /* item is the new tail */
    item **head, **tail;
    assert(it->it_flags == 1);
    assert(it->nbytes == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    //assert(*tail != 0);
    assert(it != *tail);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = *tail;
    it->next = 0;
    if (it->prev) {
        assert(it->prev->next == 0);
        it->prev->next = it;
    }
    *tail = it;
    if (*head == 0) *head = it;
    return;
}

void do_item_unlinktail_q(item *it) {
    item **head, **tail;
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    return;
}

/* This is too convoluted, but it's a difficult shuffle. Try to rewrite it
 * more clearly. */
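/* In short: each call swaps the crawler item with its prev, stepping it
 * one node closer to the head and returning the item it just stepped over:
 *
 *   before:  head ... PP <-> P <-> C <-> N ... tail
 *   after:   head ... PP <-> C <-> P <-> N ... tail   (returns P)
 *
 * Once the crawler reaches the head (prev == 0) it is popped off the list
 * and NULL is returned to end the pass. */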
item *do_item_crawl_q(item *it) {
    item **head, **tail;
    assert(it->it_flags == 1);
    assert(it->nbytes == 0);
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    /* We've hit the head, pop off */
    if (it->prev == 0) {
        assert(*head == it);
        if (it->next) {
            *head = it->next;
            assert(it->next->prev == it);
            it->next->prev = 0;
        }
        return NULL; /* Done */
    }

    /* Swing ourselves in front of the next item */
    /* NB: If there is a prev, we can't be the head */
    assert(it->prev != it);
    if (it->prev) {
        if (*head == it->prev) {
            /* Prev was the head, now we're the head */
            *head = it;
        }
        if (*tail == it) {
            /* We are the tail, now they are the tail */
            *tail = it->prev;
        }
        assert(it->next != it);
        if (it->next) {
            assert(it->prev->next == it);
            it->prev->next = it->next;
            it->next->prev = it->prev;
        } else {
            /* Tail. Move this above? */
            it->prev->next = 0;
        }
        /* prev->prev's next is it->prev */
        it->next = it->prev;
        it->prev = it->next->prev;
        it->next->prev = it;
        /* New it->prev now, if we're not at the head. */
        if (it->prev) {
            it->prev->next = it;
        }
    }
    assert(it->next != it);
    assert(it->prev != it);

    return it->next; /* success */
}