/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#include "memcached.h"
#include "bipbuffer.h"
#include "slab_automove.h"
#include "storage.h"
#ifdef EXTSTORE
#include "slab_automove_extstore.h"
#endif
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include <unistd.h>
#include <poll.h>

/* Forward Declarations */
static void item_link_q(item *it);
static void item_unlink_q(item *it);

static unsigned int lru_type_map[4] = {HOT_LRU, WARM_LRU, COLD_LRU, TEMP_LRU};

#define LARGEST_ID POWER_LARGEST
typedef struct {
    uint64_t evicted;
    uint64_t evicted_nonzero;
    uint64_t reclaimed;
    uint64_t outofmemory;
    uint64_t tailrepairs;
    uint64_t expired_unfetched; /* items reclaimed but never touched */
    uint64_t evicted_unfetched; /* items evicted but never touched */
    uint64_t evicted_active; /* items evicted that should have been shuffled */
    uint64_t crawler_reclaimed;
    uint64_t crawler_items_checked;
    uint64_t lrutail_reflocked;
    uint64_t moves_to_cold;
    uint64_t moves_to_warm;
    uint64_t moves_within_lru;
    uint64_t direct_reclaims;
    uint64_t hits_to_hot;
    uint64_t hits_to_warm;
    uint64_t hits_to_cold;
    uint64_t hits_to_temp;
    uint64_t mem_requested;
    rel_time_t evicted_time;
} itemstats_t;

static item *heads[LARGEST_ID];
static item *tails[LARGEST_ID];
static itemstats_t itemstats[LARGEST_ID];
static unsigned int sizes[LARGEST_ID];
static uint64_t sizes_bytes[LARGEST_ID];
static unsigned int *stats_sizes_hist = NULL;
static uint64_t stats_sizes_cas_min = 0;
static int stats_sizes_buckets = 0;
static uint64_t cas_id = 0;

static volatile int do_run_lru_maintainer_thread = 0;
static int lru_maintainer_initialized = 0;
static pthread_mutex_t lru_maintainer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cas_id_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t stats_sizes_lock = PTHREAD_MUTEX_INITIALIZER;

void item_stats_reset(void) {
    int i;
    for (i = 0; i < LARGEST_ID; i++) {
        pthread_mutex_lock(&lru_locks[i]);
        memset(&itemstats[i], 0, sizeof(itemstats_t));
        pthread_mutex_unlock(&lru_locks[i]);
    }
}

/* called with class lru lock held */
void do_item_stats_add_crawl(const int i, const uint64_t reclaimed,
        const uint64_t unfetched, const uint64_t checked) {
    itemstats[i].crawler_reclaimed += reclaimed;
    itemstats[i].expired_unfetched += unfetched;
    itemstats[i].crawler_items_checked += checked;
}

typedef struct _lru_bump_buf {
    struct _lru_bump_buf *prev;
    struct _lru_bump_buf *next;
    pthread_mutex_t mutex;
    bipbuf_t *buf;
    uint64_t dropped;
} lru_bump_buf;

typedef struct {
    item *it;
    uint32_t hv;
} lru_bump_entry;

static lru_bump_buf *bump_buf_head = NULL;
static lru_bump_buf *bump_buf_tail = NULL;
static pthread_mutex_t bump_buf_lock = PTHREAD_MUTEX_INITIALIZER;
/* TODO: tunable? Need bench results */
#define LRU_BUMP_BUF_SIZE 8192
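
/* Sizing note (illustrative estimate): each worker's bipbuffer holds up to
 * LRU_BUMP_BUF_SIZE pending bump entries; at sizeof(item *) +
 * sizeof(uint32_t) per entry (16 bytes with typical 64-bit padding), that is
 * roughly 128KB per worker thread. */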

static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv);
static uint64_t lru_total_bumps_dropped(void);

/* Get the next CAS id for a new item. */
/* TODO: refactor some atomics for this. */
uint64_t get_cas_id(void) {
    pthread_mutex_lock(&cas_id_lock);
    uint64_t next_id = ++cas_id;
    pthread_mutex_unlock(&cas_id_lock);
    return next_id;
}
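
/* A minimal sketch of the atomics refactor the TODO above suggests (C11),
 * kept disabled. It assumes <stdatomic.h> is available and that cas_id would
 * become _Atomic; relaxed ordering suffices because the id only needs to be
 * unique and monotonic, not ordered against other memory. */
#if 0
#include <stdatomic.h>
static _Atomic uint64_t cas_id_a = 0;
uint64_t get_cas_id(void) {
    /* returns the post-increment value, matching the mutex version above */
    return atomic_fetch_add_explicit(&cas_id_a, 1, memory_order_relaxed) + 1;
}
#endif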

void set_cas_id(uint64_t new_cas) {
    pthread_mutex_lock(&cas_id_lock);
    cas_id = new_cas;
    pthread_mutex_unlock(&cas_id_lock);
}

int item_is_flushed(item *it) {
    rel_time_t oldest_live = settings.oldest_live;
    uint64_t cas = ITEM_get_cas(it);
    uint64_t oldest_cas = settings.oldest_cas;
    if (oldest_live == 0 || oldest_live > current_time)
        return 0;
    if ((it->time <= oldest_live)
            || (oldest_cas != 0 && cas != 0 && cas < oldest_cas)) {
        return 1;
    }
    return 0;
}

/* must be locked before call */
unsigned int do_get_lru_size(uint32_t id) {
    return sizes[id];
}

/* Enable this for reference-count debugging. */
#if 0
# define DEBUG_REFCNT(it,op) \
                fprintf(stderr, "item %p refcnt(%c) %d %c%c\n", \
                        (void *)it, op, it->refcount, \
                        (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
                        (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
#else
# define DEBUG_REFCNT(it,op) while(0)
#endif

/**
 * Generates the variable-sized part of the header for an object.
 *
 * nkey    - The length of the key
 * flags   - key flags
 * nbytes  - Number of bytes to hold value and additional CRLF terminator
 * suffix  - Buffer for the "VALUE" line suffix (flags, size).
 * nsuffix - The length of the suffix is stored here.
 *
 * Returns the total size of the header.
 */
static size_t item_make_header(const uint8_t nkey, const unsigned int flags, const int nbytes,
                               char *suffix, uint8_t *nsuffix) {
    if (flags == 0) {
        *nsuffix = 0;
    } else {
        *nsuffix = sizeof(flags);
    }
    return sizeof(item) + nkey + *nsuffix + nbytes;
}
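
/* Worked example (illustrative numbers): for an 8-byte key, callers pass
 * nkey == 9 (key length + 1); with nonzero flags, *nsuffix becomes
 * sizeof(unsigned int) (typically 4), so a 12-byte value plus its CRLF
 * (nbytes == 14) yields sizeof(item) + 9 + 4 + 14. do_item_alloc() adds
 * sizeof(uint64_t) on top of that when CAS is enabled. */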

item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
    item *it = NULL;
    int i;
    /* If no memory is available, attempt a direct LRU juggle/eviction */
    /* This is a race in order to simplify lru_pull_tail; in cases where
     * locked items are on the tail, you want them to fall out and cause
     * occasional OOM's, rather than internally work around them.
     * This also gives one fewer code path for slab alloc/free
     */
    for (i = 0; i < 10; i++) {
        /* Try to reclaim memory first */
        if (!settings.lru_segmented) {
            lru_pull_tail(id, COLD_LRU, 0, 0, 0, NULL);
        }
        it = slabs_alloc(ntotal, id, 0);

        if (it == NULL) {
            // We send '0' in for "total_bytes" as this routine is always
            // pulling to evict, or forcing HOT -> COLD migration.
            // As of this writing, total_bytes isn't at all used with COLD_LRU.
            if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0, NULL) <= 0) {
                if (settings.lru_segmented) {
                    lru_pull_tail(id, HOT_LRU, 0, 0, 0, NULL);
                } else {
                    break;
                }
            }
        } else {
            break;
        }
    }

    if (i > 0) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].direct_reclaims += i;
        pthread_mutex_unlock(&lru_locks[id]);
    }

    return it;
}

/* Chain another chunk onto this chunk. */
/* slab mover: if it finds a chunk without ITEM_CHUNK flag, and no ITEM_LINKED
 * flag, it counts as busy and skips.
 * I think it might still not be safe to do linking outside of the slab lock
 */
item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain) {
    // TODO: Should be a cleaner way of finding real size with slabber calls
    size_t size = bytes_remain + sizeof(item_chunk);
    if (size > settings.slab_chunk_size_max)
        size = settings.slab_chunk_size_max;
    unsigned int id = slabs_clsid(size);

    item_chunk *nch = (item_chunk *) do_item_alloc_pull(size, id);
    if (nch == NULL)
        return NULL;

    // link in.
    // ITEM_CHUNK[ED] bits need to be protected by the slabs lock.
    slabs_mlock();
    nch->head = ch->head;
    ch->next = nch;
    nch->prev = ch;
    nch->next = 0;
    nch->used = 0;
    nch->slabs_clsid = id;
    nch->size = size - sizeof(item_chunk);
    nch->it_flags |= ITEM_CHUNK;
    slabs_munlock();
    return nch;
}

item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
                    const rel_time_t exptime, const int nbytes) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    // Avoid potential underflows.
    if (nbytes < 2)
        return 0;

    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    unsigned int hdr_id = 0;
    if (id == 0)
        return 0;

    /* This is a large item. Allocate a header object now, lazily allocate
     * chunks while reading the upload.
     */
    if (ntotal > settings.slab_chunk_size_max) {
        /* We still link this item into the LRU for the larger slab class, but
         * we're pulling a header from an entirely different slab class. The
         * free routines handle large items specifically.
         */
        int htotal = nkey + 1 + nsuffix + sizeof(item) + sizeof(item_chunk);
        if (settings.use_cas) {
            htotal += sizeof(uint64_t);
        }
#ifdef NEED_ALIGN
        // header chunk needs to be padded on some systems
        int remain = htotal % 8;
        if (remain != 0) {
            htotal += 8 - remain;
        }
#endif
        hdr_id = slabs_clsid(htotal);
        it = do_item_alloc_pull(htotal, hdr_id);
        /* setting ITEM_CHUNKED is fine here because we aren't LINKED yet. */
        if (it != NULL)
            it->it_flags |= ITEM_CHUNKED;
    } else {
        it = do_item_alloc_pull(ntotal, id);
    }

    if (it == NULL) {
        pthread_mutex_lock(&lru_locks[id]);
        itemstats[id].outofmemory++;
        pthread_mutex_unlock(&lru_locks[id]);
        return NULL;
    }

    assert(it->it_flags == 0 || it->it_flags == ITEM_CHUNKED);
    //assert(it != heads[id]);

    /* Refcount is seeded to 1 by slabs_alloc() */
    it->next = it->prev = 0;

    /* Items are initially loaded into the HOT_LRU. This is '0' but I want at
     * least a note here. Compiler (hopefully?) optimizes this out.
     */
    if (settings.temp_lru &&
            exptime - current_time <= settings.temporary_ttl) {
        id |= TEMP_LRU;
    } else if (settings.lru_segmented) {
        id |= HOT_LRU;
    } else {
        /* There is only COLD in compat-mode */
        id |= COLD_LRU;
    }
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags |= settings.use_cas ? ITEM_CAS : 0;
    it->it_flags |= nsuffix != 0 ? ITEM_CFLAGS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    if (nsuffix > 0) {
        memcpy(ITEM_suffix(it), &flags, sizeof(flags));
    }

    /* Initialize internal chunk. */
    if (it->it_flags & ITEM_CHUNKED) {
        item_chunk *chunk = (item_chunk *) ITEM_schunk(it);

        chunk->next = 0;
        chunk->prev = 0;
        chunk->used = 0;
        chunk->size = 0;
        chunk->head = it;
        chunk->orig_clsid = hdr_id;
    }
    it->h_next = 0;

    return it;
}
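
/* A minimal caller sketch (hypothetical values): item_alloc() is the locking
 * wrapper around this function, and realtime() stands for the TTL-to-
 * rel_time_t conversion the protocol code performs:
 *
 *     // 5 data bytes plus the trailing "\r\n" -> nbytes == 7
 *     item *it = item_alloc("hello", 5, 0, realtime(60), 7);
 *     if (it != NULL) {
 *         memcpy(ITEM_data(it), "world\r\n", 7);
 *     }
 */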

void item_free(item *it) {
    size_t ntotal = ITEM_ntotal(it);
    unsigned int clsid;
    assert((it->it_flags & ITEM_LINKED) == 0);
    assert(it != heads[it->slabs_clsid]);
    assert(it != tails[it->slabs_clsid]);
    assert(it->refcount == 0);

    /* so slab size changer can tell later if item is already free or not */
    clsid = ITEM_clsid(it);
    DEBUG_REFCNT(it, 'F');
    slabs_free(it, ntotal, clsid);
}

/**
 * Returns true if an item will fit in the cache (its size does not exceed
 * the maximum for a cache entry.)
 */
bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
    char prefix[40];
    uint8_t nsuffix;
    if (nbytes < 2)
        return false;

    size_t ntotal = item_make_header(nkey + 1, flags, nbytes,
                                     prefix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    return slabs_clsid(ntotal) != 0;
}

/* fixing stats/references during warm start */
void do_item_link_fixup(item *it) {
    item **head, **tail;
    int ntotal = ITEM_ntotal(it);
    uint32_t hv = hash(ITEM_key(it), it->nkey);
    assoc_insert(it, hv);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    if (it->prev == 0 && *head == 0) *head = it;
    if (it->next == 0 && *tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
    sizes_bytes[it->slabs_clsid] += ntotal;

    STATS_LOCK();
    stats_state.curr_bytes += ntotal;
    stats_state.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    item_stats_sizes_add(it);

    return;
}

static void do_item_link_q(item *it) { /* item is the new head */
    item **head, **tail;
    assert((it->it_flags & ITEM_SLABBED) == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    assert(it != *head);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = 0;
    it->next = *head;
    if (it->next) it->next->prev = it;
    *head = it;
    if (*tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
#ifdef EXTSTORE
    if (it->it_flags & ITEM_HDR) {
        sizes_bytes[it->slabs_clsid] += (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
    } else {
        sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
    }
#else
    sizes_bytes[it->slabs_clsid] += ITEM_ntotal(it);
#endif

    return;
}

static void item_link_q(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_link_q(it);
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

static void item_link_q_warm(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_link_q(it);
    itemstats[it->slabs_clsid].moves_to_warm++;
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

static void do_item_unlink_q(item *it) {
    item **head, **tail;
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    sizes[it->slabs_clsid]--;
#ifdef EXTSTORE
    if (it->it_flags & ITEM_HDR) {
        sizes_bytes[it->slabs_clsid] -= (ITEM_ntotal(it) - it->nbytes) + sizeof(item_hdr);
    } else {
        sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
    }
#else
    sizes_bytes[it->slabs_clsid] -= ITEM_ntotal(it);
#endif

    return;
}

static void item_unlink_q(item *it) {
    pthread_mutex_lock(&lru_locks[it->slabs_clsid]);
    do_item_unlink_q(it);
    pthread_mutex_unlock(&lru_locks[it->slabs_clsid]);
}

int do_item_link(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;

    STATS_LOCK();
    stats_state.curr_bytes += ITEM_ntotal(it);
    stats_state.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
    assoc_insert(it, hv);
    item_link_q(it);
    refcount_incr(it);
    item_stats_sizes_add(it);

    return 1;
}

void do_item_unlink(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats_state.curr_bytes -= ITEM_ntotal(it);
        stats_state.curr_items -= 1;
        STATS_UNLOCK();
        item_stats_sizes_remove(it);
        assoc_delete(ITEM_key(it), it->nkey, hv);
        item_unlink_q(it);
        do_item_remove(it);
    }
}

/* FIXME: Is it necessary to keep this copy/pasted code? */
void do_item_unlink_nolock(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats_state.curr_bytes -= ITEM_ntotal(it);
        stats_state.curr_items -= 1;
        STATS_UNLOCK();
        item_stats_sizes_remove(it);
        assoc_delete(ITEM_key(it), it->nkey, hv);
        do_item_unlink_q(it);
        do_item_remove(it);
    }
}

void do_item_remove(item *it) {
    MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);
    assert(it->refcount > 0);

    if (refcount_decr(it) == 0) {
        item_free(it);
    }
}

/* Bump the last accessed time, or relink if we're in compat mode */
void do_item_update(item *it) {
    MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);

    /* Hits to COLD_LRU immediately move to WARM. */
    if (settings.lru_segmented) {
        assert((it->it_flags & ITEM_SLABBED) == 0);
        if ((it->it_flags & ITEM_LINKED) != 0) {
            if (ITEM_lruid(it) == COLD_LRU && (it->it_flags & ITEM_ACTIVE)) {
                it->time = current_time;
                item_unlink_q(it);
                it->slabs_clsid = ITEM_clsid(it);
                it->slabs_clsid |= WARM_LRU;
                it->it_flags &= ~ITEM_ACTIVE;
                item_link_q_warm(it);
            } else {
                it->time = current_time;
            }
        }
    } else if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
        assert((it->it_flags & ITEM_SLABBED) == 0);

        if ((it->it_flags & ITEM_LINKED) != 0) {
            it->time = current_time;
            item_unlink_q(it);
            item_link_q(it);
        }
    }
}

int do_item_replace(item *it, item *new_it, const uint32_t hv) {
    MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
                           ITEM_key(new_it), new_it->nkey, new_it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    do_item_unlink(it, hv);
    return do_item_link(new_it, hv);
}

/*@null@*/
/* This is walking the line of violating lock order, but I think it's safe.
 * If the LRU lock is held, an item in the LRU cannot be wiped and freed.
 * The data could possibly be overwritten, but this is only accessing the
 * headers.
 * It may not be the best idea to leave it like this, but for now it's safe.
 */
char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
    unsigned int memlimit = 2 * 1024 * 1024; /* 2MB max response size */
    char *buffer;
    unsigned int bufcurr;
    item *it;
    unsigned int len;
    unsigned int shown = 0;
    char key_temp[KEY_MAX_LENGTH + 1];
    char temp[512];
    unsigned int id = slabs_clsid;
    id |= COLD_LRU;

    pthread_mutex_lock(&lru_locks[id]);
    it = heads[id];

    buffer = malloc((size_t)memlimit);
    if (buffer == 0) {
        pthread_mutex_unlock(&lru_locks[id]);
        return NULL;
    }
    bufcurr = 0;

    while (it != NULL && (limit == 0 || shown < limit)) {
        assert(it->nkey <= KEY_MAX_LENGTH);
        // protect from printing binary keys.
        if ((it->nbytes == 0 && it->nkey == 0) || (it->it_flags & ITEM_KEY_BINARY)) {
            it = it->next;
            continue;
        }
        /* Copy the key since it may not be null-terminated in the struct */
        strncpy(key_temp, ITEM_key(it), it->nkey);
        key_temp[it->nkey] = 0x00; /* terminate */
        len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %llu s]\r\n",
                       key_temp, it->nbytes - 2,
                       it->exptime == 0 ? 0 :
                       (unsigned long long)it->exptime + process_started);
        if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
            break;
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;
        shown++;
        it = it->next;
    }

    memcpy(buffer + bufcurr, "END\r\n", 6);
    bufcurr += 5;

    *bytes = bufcurr;
    pthread_mutex_unlock(&lru_locks[id]);
    return buffer;
}

/* With refactoring of the various stats code the automover won't need a
 * custom function here.
 */
void fill_item_stats_automove(item_stats_automove *am) {
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        item_stats_automove *cur = &am[n];

        // outofmemory records into HOT
        int i = n | HOT_LRU;
        pthread_mutex_lock(&lru_locks[i]);
        cur->outofmemory = itemstats[i].outofmemory;
        pthread_mutex_unlock(&lru_locks[i]);

        // evictions and tail age are from COLD
        i = n | COLD_LRU;
        pthread_mutex_lock(&lru_locks[i]);
        cur->evicted = itemstats[i].evicted;
        if (tails[i]) {
            cur->age = current_time - tails[i]->time;
        } else {
            cur->age = 0;
        }
        pthread_mutex_unlock(&lru_locks[i]);
    }
}

void item_stats_totals(ADD_STAT add_stats, void *c) {
    itemstats_t totals;
    memset(&totals, 0, sizeof(itemstats_t));
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        int x;
        int i;
        for (x = 0; x < 4; x++) {
            i = n | lru_type_map[x];
            pthread_mutex_lock(&lru_locks[i]);
            totals.expired_unfetched += itemstats[i].expired_unfetched;
            totals.evicted_unfetched += itemstats[i].evicted_unfetched;
            totals.evicted_active += itemstats[i].evicted_active;
            totals.evicted += itemstats[i].evicted;
            totals.reclaimed += itemstats[i].reclaimed;
            totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
            totals.crawler_items_checked += itemstats[i].crawler_items_checked;
            totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
            totals.moves_to_cold += itemstats[i].moves_to_cold;
            totals.moves_to_warm += itemstats[i].moves_to_warm;
            totals.moves_within_lru += itemstats[i].moves_within_lru;
            totals.direct_reclaims += itemstats[i].direct_reclaims;
            pthread_mutex_unlock(&lru_locks[i]);
        }
    }
    APPEND_STAT("expired_unfetched", "%llu",
                (unsigned long long)totals.expired_unfetched);
    APPEND_STAT("evicted_unfetched", "%llu",
                (unsigned long long)totals.evicted_unfetched);
    if (settings.lru_maintainer_thread) {
        APPEND_STAT("evicted_active", "%llu",
                    (unsigned long long)totals.evicted_active);
    }
    APPEND_STAT("evictions", "%llu",
                (unsigned long long)totals.evicted);
    APPEND_STAT("reclaimed", "%llu",
                (unsigned long long)totals.reclaimed);
    APPEND_STAT("crawler_reclaimed", "%llu",
                (unsigned long long)totals.crawler_reclaimed);
    APPEND_STAT("crawler_items_checked", "%llu",
                (unsigned long long)totals.crawler_items_checked);
    APPEND_STAT("lrutail_reflocked", "%llu",
                (unsigned long long)totals.lrutail_reflocked);
    if (settings.lru_maintainer_thread) {
        APPEND_STAT("moves_to_cold", "%llu",
                    (unsigned long long)totals.moves_to_cold);
        APPEND_STAT("moves_to_warm", "%llu",
                    (unsigned long long)totals.moves_to_warm);
        APPEND_STAT("moves_within_lru", "%llu",
                    (unsigned long long)totals.moves_within_lru);
        APPEND_STAT("direct_reclaims", "%llu",
                    (unsigned long long)totals.direct_reclaims);
        APPEND_STAT("lru_bumps_dropped", "%llu",
                    (unsigned long long)lru_total_bumps_dropped());
    }
}

void item_stats(ADD_STAT add_stats, void *c) {
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);
    itemstats_t totals;
    int n;
    for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        memset(&totals, 0, sizeof(itemstats_t));
        int x;
        int i;
        unsigned int size = 0;
        unsigned int age = 0;
        unsigned int age_hot = 0;
        unsigned int age_warm = 0;
        unsigned int lru_size_map[4];
        const char *fmt = "items:%d:%s";
        char key_str[STAT_KEY_LEN];
        char val_str[STAT_VAL_LEN];
        int klen = 0, vlen = 0;
        for (x = 0; x < 4; x++) {
            i = n | lru_type_map[x];
            pthread_mutex_lock(&lru_locks[i]);
            totals.evicted += itemstats[i].evicted;
            totals.evicted_nonzero += itemstats[i].evicted_nonzero;
            totals.outofmemory += itemstats[i].outofmemory;
            totals.tailrepairs += itemstats[i].tailrepairs;
            totals.reclaimed += itemstats[i].reclaimed;
            totals.expired_unfetched += itemstats[i].expired_unfetched;
            totals.evicted_unfetched += itemstats[i].evicted_unfetched;
            totals.evicted_active += itemstats[i].evicted_active;
            totals.crawler_reclaimed += itemstats[i].crawler_reclaimed;
            totals.crawler_items_checked += itemstats[i].crawler_items_checked;
            totals.lrutail_reflocked += itemstats[i].lrutail_reflocked;
            totals.moves_to_cold += itemstats[i].moves_to_cold;
            totals.moves_to_warm += itemstats[i].moves_to_warm;
            totals.moves_within_lru += itemstats[i].moves_within_lru;
            totals.direct_reclaims += itemstats[i].direct_reclaims;
            totals.mem_requested += sizes_bytes[i];
            size += sizes[i];
            lru_size_map[x] = sizes[i];
            if (lru_type_map[x] == COLD_LRU && tails[i] != NULL) {
                age = current_time - tails[i]->time;
            } else if (lru_type_map[x] == HOT_LRU && tails[i] != NULL) {
                age_hot = current_time - tails[i]->time;
            } else if (lru_type_map[x] == WARM_LRU && tails[i] != NULL) {
                age_warm = current_time - tails[i]->time;
            }
            if (lru_type_map[x] == COLD_LRU)
                totals.evicted_time = itemstats[i].evicted_time;
            switch (lru_type_map[x]) {
                case HOT_LRU:
                    totals.hits_to_hot = thread_stats.lru_hits[i];
                    break;
                case WARM_LRU:
                    totals.hits_to_warm = thread_stats.lru_hits[i];
                    break;
                case COLD_LRU:
                    totals.hits_to_cold = thread_stats.lru_hits[i];
                    break;
                case TEMP_LRU:
                    totals.hits_to_temp = thread_stats.lru_hits[i];
                    break;
            }
            pthread_mutex_unlock(&lru_locks[i]);
        }
        if (size == 0)
            continue;
        APPEND_NUM_FMT_STAT(fmt, n, "number", "%u", size);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "number_hot", "%u", lru_size_map[0]);
            APPEND_NUM_FMT_STAT(fmt, n, "number_warm", "%u", lru_size_map[1]);
            APPEND_NUM_FMT_STAT(fmt, n, "number_cold", "%u", lru_size_map[2]);
            if (settings.temp_lru) {
                APPEND_NUM_FMT_STAT(fmt, n, "number_temp", "%u", lru_size_map[3]);
            }
            APPEND_NUM_FMT_STAT(fmt, n, "age_hot", "%u", age_hot);
            APPEND_NUM_FMT_STAT(fmt, n, "age_warm", "%u", age_warm);
        }
        APPEND_NUM_FMT_STAT(fmt, n, "age", "%u", age);
        APPEND_NUM_FMT_STAT(fmt, n, "mem_requested", "%llu", (unsigned long long)totals.mem_requested);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted",
                            "%llu", (unsigned long long)totals.evicted);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_nonzero",
                            "%llu", (unsigned long long)totals.evicted_nonzero);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_time",
                            "%u", totals.evicted_time);
        APPEND_NUM_FMT_STAT(fmt, n, "outofmemory",
                            "%llu", (unsigned long long)totals.outofmemory);
        APPEND_NUM_FMT_STAT(fmt, n, "tailrepairs",
                            "%llu", (unsigned long long)totals.tailrepairs);
        APPEND_NUM_FMT_STAT(fmt, n, "reclaimed",
                            "%llu", (unsigned long long)totals.reclaimed);
        APPEND_NUM_FMT_STAT(fmt, n, "expired_unfetched",
                            "%llu", (unsigned long long)totals.expired_unfetched);
        APPEND_NUM_FMT_STAT(fmt, n, "evicted_unfetched",
                            "%llu", (unsigned long long)totals.evicted_unfetched);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "evicted_active",
                                "%llu", (unsigned long long)totals.evicted_active);
        }
        APPEND_NUM_FMT_STAT(fmt, n, "crawler_reclaimed",
                            "%llu", (unsigned long long)totals.crawler_reclaimed);
        APPEND_NUM_FMT_STAT(fmt, n, "crawler_items_checked",
                            "%llu", (unsigned long long)totals.crawler_items_checked);
        APPEND_NUM_FMT_STAT(fmt, n, "lrutail_reflocked",
                            "%llu", (unsigned long long)totals.lrutail_reflocked);
        if (settings.lru_maintainer_thread) {
            APPEND_NUM_FMT_STAT(fmt, n, "moves_to_cold",
                                "%llu", (unsigned long long)totals.moves_to_cold);
            APPEND_NUM_FMT_STAT(fmt, n, "moves_to_warm",
                                "%llu", (unsigned long long)totals.moves_to_warm);
            APPEND_NUM_FMT_STAT(fmt, n, "moves_within_lru",
                                "%llu", (unsigned long long)totals.moves_within_lru);
            APPEND_NUM_FMT_STAT(fmt, n, "direct_reclaims",
                                "%llu", (unsigned long long)totals.direct_reclaims);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_hot",
                                "%llu", (unsigned long long)totals.hits_to_hot);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_warm",
                                "%llu", (unsigned long long)totals.hits_to_warm);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_cold",
                                "%llu", (unsigned long long)totals.hits_to_cold);
            APPEND_NUM_FMT_STAT(fmt, n, "hits_to_temp",
                                "%llu", (unsigned long long)totals.hits_to_temp);
        }
    }

    /* getting here means both ascii and binary terminators fit */
    add_stats(NULL, 0, NULL, 0, c);
}

bool item_stats_sizes_status(void) {
    bool ret = false;
    mutex_lock(&stats_sizes_lock);
    if (stats_sizes_hist != NULL)
        ret = true;
    mutex_unlock(&stats_sizes_lock);
    return ret;
}

void item_stats_sizes_init(void) {
    if (stats_sizes_hist != NULL)
        return;
    stats_sizes_buckets = settings.item_size_max / 32 + 1;
    stats_sizes_hist = calloc(stats_sizes_buckets, sizeof(int));
    stats_sizes_cas_min = (settings.use_cas) ? get_cas_id() : 0;
}

void item_stats_sizes_enable(ADD_STAT add_stats, void *c) {
    mutex_lock(&stats_sizes_lock);
    if (!settings.use_cas) {
        APPEND_STAT("sizes_status", "error", "");
        APPEND_STAT("sizes_error", "cas_support_disabled", "");
    } else if (stats_sizes_hist == NULL) {
        item_stats_sizes_init();
        if (stats_sizes_hist != NULL) {
            APPEND_STAT("sizes_status", "enabled", "");
        } else {
            APPEND_STAT("sizes_status", "error", "");
            APPEND_STAT("sizes_error", "no_memory", "");
        }
    } else {
        APPEND_STAT("sizes_status", "enabled", "");
    }
    mutex_unlock(&stats_sizes_lock);
}

void item_stats_sizes_disable(ADD_STAT add_stats, void *c) {
    mutex_lock(&stats_sizes_lock);
    if (stats_sizes_hist != NULL) {
        free(stats_sizes_hist);
        stats_sizes_hist = NULL;
    }
    APPEND_STAT("sizes_status", "disabled", "");
    mutex_unlock(&stats_sizes_lock);
}

void item_stats_sizes_add(item *it) {
    if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
        return;
    int ntotal = ITEM_ntotal(it);
    int bucket = ntotal / 32;
    if ((ntotal % 32) != 0) bucket++;
    if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]++;
}
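
/* Bucket math, illustrated: an item with ntotal == 100 lands in bucket
 * 100 / 32 + 1 == 4 (sizes round up to the next 32-byte boundary), which
 * item_stats_sizes() below reports under the key "128". */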

/* I think there's no way for this to be accurate without using the CAS value,
 * since items getting their time value bumped will still pass this validation.
 */
void item_stats_sizes_remove(item *it) {
    if (stats_sizes_hist == NULL || stats_sizes_cas_min > ITEM_get_cas(it))
        return;
    int ntotal = ITEM_ntotal(it);
    int bucket = ntotal / 32;
    if ((ntotal % 32) != 0) bucket++;
    if (bucket < stats_sizes_buckets) stats_sizes_hist[bucket]--;
}

/** dumps out a list of objects of each size, with granularity of 32 bytes */
/*@null@*/
/* Locks are correct based on a technicality. Holds LRU lock while doing the
 * work, so items can't go invalid, and it's only looking at header sizes
 * which don't change.
 */
void item_stats_sizes(ADD_STAT add_stats, void *c) {
    mutex_lock(&stats_sizes_lock);

    if (stats_sizes_hist != NULL) {
        int i;
        for (i = 0; i < stats_sizes_buckets; i++) {
            if (stats_sizes_hist[i] != 0) {
                char key[12];
                snprintf(key, sizeof(key), "%d", i * 32);
                APPEND_STAT(key, "%u", stats_sizes_hist[i]);
            }
        }
    } else {
        APPEND_STAT("sizes_status", "disabled", "");
    }

    add_stats(NULL, 0, NULL, 0, c);
    mutex_unlock(&stats_sizes_lock);
}

/** wrapper around assoc_find which does the lazy expiration logic */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c, const bool do_update) {
    item *it = assoc_find(key, nkey, hv);
    if (it != NULL) {
        refcount_incr(it);
        /* Optimization for slab reassignment. prevents popular items from
         * jamming in busy wait. Can only do this here to satisfy lock order
         * of item_lock, slabs_lock. */
        /* This was made unsafe by removal of the cache_lock:
         * slab_rebalance_signal and slab_rebal.* are modified in a separate
         * thread under slabs_lock. If slab_rebalance_signal = 1, slab_start =
         * NULL (0), but slab_end is still equal to some value, this would end
         * up unlinking every item fetched.
         * This is either an acceptable loss, or if slab_rebalance_signal is
         * true, slab_start/slab_end should be put behind the slabs_lock.
         * Which would cause a huge potential slowdown.
         * Could also use a specific lock for slab_rebal.* and
         * slab_rebalance_signal (shorter lock?)
         */
        /*if (slab_rebalance_signal &&
            ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) {
            do_item_unlink(it, hv);
            do_item_remove(it);
            it = NULL;
        }*/
    }
    int was_found = 0;

    if (settings.verbose > 2) {
        int ii;
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND ");
        } else {
            fprintf(stderr, "> FOUND KEY ");
        }
        for (ii = 0; ii < nkey; ++ii) {
            fprintf(stderr, "%c", key[ii]);
        }
    }

    if (it != NULL) {
        was_found = 1;
        if (item_is_flushed(it)) {
            do_item_unlink(it, hv);
            STORAGE_delete(c->thread->storage, it);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&c->thread->stats.mutex);
            c->thread->stats.get_flushed++;
            pthread_mutex_unlock(&c->thread->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by flush");
            }
            was_found = 2;
        } else if (it->exptime != 0 && it->exptime <= current_time) {
            do_item_unlink(it, hv);
            STORAGE_delete(c->thread->storage, it);
            do_item_remove(it);
            it = NULL;
            pthread_mutex_lock(&c->thread->stats.mutex);
            c->thread->stats.get_expired++;
            pthread_mutex_unlock(&c->thread->stats.mutex);
            if (settings.verbose > 2) {
                fprintf(stderr, " -nuked by expire");
            }
            was_found = 3;
        } else {
            if (do_update) {
                do_item_bump(c, it, hv);
            }
            DEBUG_REFCNT(it, '+');
        }
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");
    /* For now this is in addition to the above verbose logging. */
    LOGGER_LOG(c->thread->l, LOG_FETCHERS, LOGGER_ITEM_GET, NULL, was_found, key,
               nkey, (it) ? it->nbytes : 0, (it) ? ITEM_clsid(it) : 0, c->sfd);

    return it;
}

// Requires lock held for item.
// Split out of do_item_get() to allow mget functions to look through header
// data before losing state modified via the bump function.
void do_item_bump(conn *c, item *it, const uint32_t hv) {
    /* We update the hit markers only during fetches.
     * An item needs to be hit twice overall to be considered
     * ACTIVE, but only needs a single hit to maintain activity
     * afterward.
     * FETCHED tells if an item has ever been active.
     */
    if (settings.lru_segmented) {
        if ((it->it_flags & ITEM_ACTIVE) == 0) {
            if ((it->it_flags & ITEM_FETCHED) == 0) {
                it->it_flags |= ITEM_FETCHED;
            } else {
                it->it_flags |= ITEM_ACTIVE;
                if (ITEM_lruid(it) != COLD_LRU) {
                    it->time = current_time; // only need to bump time.
                } else if (!lru_bump_async(c->thread->lru_bump_buf, it, hv)) {
                    // add flag before async bump to avoid race.
                    it->it_flags &= ~ITEM_ACTIVE;
                }
            }
        }
    } else {
        it->it_flags |= ITEM_FETCHED;
        do_item_update(it);
    }
}
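
/* The flag transitions above, illustrated: a fresh item carries neither
 * flag; its first fetch sets ITEM_FETCHED and its second sets ITEM_ACTIVE.
 * ITEM_ACTIVE is cleared again when lru_pull_tail() rescues the item, when
 * do_item_update() moves it COLD -> WARM, or when a COLD bump can't be
 * queued here. */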

item *do_item_touch(const char *key, size_t nkey, uint32_t exptime,
                    const uint32_t hv, conn *c) {
    item *it = do_item_get(key, nkey, hv, c, DO_UPDATE);
    if (it != NULL) {
        it->exptime = exptime;
    }
    return it;
}

/*** LRU MAINTENANCE THREAD ***/

/* Returns the number of items removed, expired, or evicted.
 * Callable from worker threads or the LRU maintainer thread */
int lru_pull_tail(const int orig_id, const int cur_lru,
        const uint64_t total_bytes, const uint8_t flags, const rel_time_t max_age,
        struct lru_pull_tail_return *ret_it) {
    item *it = NULL;
    int id = orig_id;
    int removed = 0;
    if (id == 0)
        return 0;

    int tries = 5;
    item *search;
    item *next_it;
    void *hold_lock = NULL;
    unsigned int move_to_lru = 0;
    uint64_t limit = 0;

    id |= cur_lru;
    pthread_mutex_lock(&lru_locks[id]);
    search = tails[id];
    /* We walk up *only* for locked items, and if bottom is expired. */
    for (; tries > 0 && search != NULL; tries--, search=next_it) {
        /* we might relink search mid-loop, so search->prev isn't reliable */
        next_it = search->prev;
        if (search->nbytes == 0 && search->nkey == 0 && search->it_flags == 1) {
            /* We are a crawler, ignore it. */
            if (flags & LRU_PULL_CRAWL_BLOCKS) {
                pthread_mutex_unlock(&lru_locks[id]);
                return 0;
            }
            tries++;
            continue;
        }
        uint32_t hv = hash(ITEM_key(search), search->nkey);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount. Also skip ourselves. */
        if ((hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(search) != 2) {
            /* Note pathological case with ref'ed items in tail.
             * Can still unlink the item, but it won't be reusable yet */
            itemstats[id].lrutail_reflocked++;
            /* In case of refcount leaks, enable for quick workaround. */
            /* WARNING: This can cause terrible corruption */
            if (settings.tail_repair_time &&
                    search->time + settings.tail_repair_time < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                /* This will call item_remove -> item_free since refcnt is 1 */
                STORAGE_delete(ext_storage, search);
                do_item_unlink_nolock(search, hv);
                item_trylock_unlock(hold_lock);
                continue;
            }
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)
            || item_is_flushed(search)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            /* refcnt 2 -> 1 */
            do_item_unlink_nolock(search, hv);
            STORAGE_delete(ext_storage, search);
            /* refcnt 1 -> 0 -> item_free */
            do_item_remove(search);
            item_trylock_unlock(hold_lock);
            removed++;

            /* If all we're finding are expired, can keep going */
            continue;
        }

        /* If we're HOT_LRU or WARM_LRU and over size limit, send to COLD_LRU.
         * If we're COLD_LRU, send to WARM_LRU unless we need to evict
         */
        switch (cur_lru) {
            case HOT_LRU:
                limit = total_bytes * settings.hot_lru_pct / 100;
                /* FALLTHRU: HOT shares the WARM logic below; limit was just set. */
            case WARM_LRU:
                if (limit == 0)
                    limit = total_bytes * settings.warm_lru_pct / 100;
                /* Rescue ACTIVE items aggressively */
                if ((search->it_flags & ITEM_ACTIVE) != 0) {
                    search->it_flags &= ~ITEM_ACTIVE;
                    removed++;
                    if (cur_lru == WARM_LRU) {
                        itemstats[id].moves_within_lru++;
                        do_item_unlink_q(search);
                        do_item_link_q(search);
                        do_item_remove(search);
                        item_trylock_unlock(hold_lock);
                    } else {
                        /* Active HOT_LRU items flow to WARM */
                        itemstats[id].moves_to_warm++;
                        move_to_lru = WARM_LRU;
                        do_item_unlink_q(search);
                        it = search;
                    }
                } else if (sizes_bytes[id] > limit ||
                           current_time - search->time > max_age) {
                    itemstats[id].moves_to_cold++;
                    move_to_lru = COLD_LRU;
                    do_item_unlink_q(search);
                    it = search;
                    removed++;
                    break;
                } else {
                    /* Don't want to move to COLD, not active, bail out */
                    it = search;
                }
                break;
            case COLD_LRU:
                it = search; /* No matter what, we're stopping */
                if (flags & LRU_PULL_EVICT) {
                    if (settings.evict_to_free == 0) {
                        /* Don't think we need a counter for this. It'll OOM. */
                        break;
                    }
                    itemstats[id].evicted++;
                    itemstats[id].evicted_time = current_time - search->time;
                    if (search->exptime != 0)
                        itemstats[id].evicted_nonzero++;
                    if ((search->it_flags & ITEM_FETCHED) == 0) {
                        itemstats[id].evicted_unfetched++;
                    }
                    if ((search->it_flags & ITEM_ACTIVE)) {
                        itemstats[id].evicted_active++;
                    }
                    LOGGER_LOG(NULL, LOG_EVICTIONS, LOGGER_EVICTION, search);
                    STORAGE_delete(ext_storage, search);
                    do_item_unlink_nolock(search, hv);
                    removed++;
                    if (settings.slab_automove == 2) {
                        slabs_reassign(-1, orig_id);
                    }
                } else if (flags & LRU_PULL_RETURN_ITEM) {
                    /* Keep a reference to this item and return it. */
                    ret_it->it = it;
                    ret_it->hv = hv;
                } else if ((search->it_flags & ITEM_ACTIVE) != 0
                        && settings.lru_segmented) {
                    itemstats[id].moves_to_warm++;
                    search->it_flags &= ~ITEM_ACTIVE;
                    move_to_lru = WARM_LRU;
                    do_item_unlink_q(search);
                    removed++;
                }
                break;
            case TEMP_LRU:
                it = search; /* Kill the loop. Parent only interested in reclaims */
                break;
        }
        if (it != NULL)
            break;
    }

    pthread_mutex_unlock(&lru_locks[id]);

    if (it != NULL) {
        if (move_to_lru) {
            it->slabs_clsid = ITEM_clsid(it);
            it->slabs_clsid |= move_to_lru;
            item_link_q(it);
        }
        if ((flags & LRU_PULL_RETURN_ITEM) == 0) {
            do_item_remove(it);
            item_trylock_unlock(hold_lock);
        }
    }

    return removed;
}


/* TODO: Third place this code needs to be deduped */
static void lru_bump_buf_link_q(lru_bump_buf *b) {
    pthread_mutex_lock(&bump_buf_lock);
    assert(b != bump_buf_head);

    b->prev = 0;
    b->next = bump_buf_head;
    if (b->next) b->next->prev = b;
    bump_buf_head = b;
    if (bump_buf_tail == 0) bump_buf_tail = b;
    pthread_mutex_unlock(&bump_buf_lock);
    return;
}

void *item_lru_bump_buf_create(void) {
    lru_bump_buf *b = calloc(1, sizeof(lru_bump_buf));
    if (b == NULL) {
        return NULL;
    }

    b->buf = bipbuf_new(sizeof(lru_bump_entry) * LRU_BUMP_BUF_SIZE);
    if (b->buf == NULL) {
        free(b);
        return NULL;
    }

    pthread_mutex_init(&b->mutex, NULL);

    lru_bump_buf_link_q(b);
    return b;
}

static bool lru_bump_async(lru_bump_buf *b, item *it, uint32_t hv) {
    bool ret = true;
    refcount_incr(it);
    pthread_mutex_lock(&b->mutex);
    lru_bump_entry *be = (lru_bump_entry *) bipbuf_request(b->buf, sizeof(lru_bump_entry));
    if (be != NULL) {
        be->it = it;
        be->hv = hv;
        if (bipbuf_push(b->buf, sizeof(lru_bump_entry)) == 0) {
            ret = false;
            b->dropped++;
        }
    } else {
        ret = false;
        b->dropped++;
    }
    if (!ret) {
        refcount_decr(it);
    }
    pthread_mutex_unlock(&b->mutex);
    return ret;
}
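
/* Note on lifetime: the refcount_incr() above keeps a queued item alive
 * until lru_maintainer_bumps() drains the entry and drops the reference via
 * do_item_remove(); when the buffer is full the reference is released
 * immediately and the bump is simply dropped (counted in b->dropped). */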

/* TODO: Might be worth a micro-optimization of having bump buffers link
 * themselves back into the central queue when queue goes from zero to
 * non-zero, then remove from list if zero more than N times.
 * If very few hits on cold this would avoid extra memory barriers from LRU
 * maintainer thread. If many hits, they'll just stay in the list.
 */
static bool lru_maintainer_bumps(void) {
    lru_bump_buf *b;
    lru_bump_entry *be;
    unsigned int size;
    unsigned int todo;
    bool bumped = false;
    pthread_mutex_lock(&bump_buf_lock);
    for (b = bump_buf_head; b != NULL; b=b->next) {
        pthread_mutex_lock(&b->mutex);
        be = (lru_bump_entry *) bipbuf_peek_all(b->buf, &size);
        pthread_mutex_unlock(&b->mutex);

        if (be == NULL) {
            continue;
        }
        todo = size;
        bumped = true;

        while (todo) {
            item_lock(be->hv);
            do_item_update(be->it);
            do_item_remove(be->it);
            item_unlock(be->hv);
            be++;
            todo -= sizeof(lru_bump_entry);
        }

        pthread_mutex_lock(&b->mutex);
        be = (lru_bump_entry *) bipbuf_poll(b->buf, size);
        pthread_mutex_unlock(&b->mutex);
    }
    pthread_mutex_unlock(&bump_buf_lock);
    return bumped;
}

static uint64_t lru_total_bumps_dropped(void) {
    uint64_t total = 0;
    lru_bump_buf *b;
    pthread_mutex_lock(&bump_buf_lock);
    for (b = bump_buf_head; b != NULL; b=b->next) {
        pthread_mutex_lock(&b->mutex);
        total += b->dropped;
        pthread_mutex_unlock(&b->mutex);
    }
    pthread_mutex_unlock(&bump_buf_lock);
    return total;
}

/* Loop up to N times:
 * If too many items are in HOT_LRU, push to COLD_LRU
 * If too many items are in WARM_LRU, push to COLD_LRU
 * If too many items are in COLD_LRU, poke COLD_LRU tail
 * 1000 loops with 1ms min sleep gives us under 1m items shifted/sec. The
 * locks can't handle much more than that. Leaving a TODO for how to
 * autoadjust in the future.
 */
static int lru_maintainer_juggle(const int slabs_clsid) {
    int i;
    int did_moves = 0;
    uint64_t total_bytes = 0;
    unsigned int chunks_perslab = 0;
    //unsigned int chunks_free = 0;
    /* TODO: if free_chunks below high watermark, increase aggressiveness */
    slabs_available_chunks(slabs_clsid, NULL,
            &chunks_perslab);
    if (settings.temp_lru) {
        /* Only looking for reclaims. Run before we size the LRU. */
        for (i = 0; i < 500; i++) {
            if (lru_pull_tail(slabs_clsid, TEMP_LRU, 0, 0, 0, NULL) <= 0) {
                break;
            } else {
                did_moves++;
            }
        }
    }

    rel_time_t cold_age = 0;
    rel_time_t hot_age = 0;
    rel_time_t warm_age = 0;
    /* If LRU is in flat mode, force items to drain into COLD via max age of 0 */
    if (settings.lru_segmented) {
        pthread_mutex_lock(&lru_locks[slabs_clsid|COLD_LRU]);
        if (tails[slabs_clsid|COLD_LRU]) {
            cold_age = current_time - tails[slabs_clsid|COLD_LRU]->time;
        }
        // Also build up total_bytes for the classes.
        total_bytes += sizes_bytes[slabs_clsid|COLD_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|COLD_LRU]);

        hot_age = cold_age * settings.hot_max_factor;
        warm_age = cold_age * settings.warm_max_factor;

        // total_bytes doesn't have to be exact. cache it for the juggles.
        pthread_mutex_lock(&lru_locks[slabs_clsid|HOT_LRU]);
        total_bytes += sizes_bytes[slabs_clsid|HOT_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|HOT_LRU]);

        pthread_mutex_lock(&lru_locks[slabs_clsid|WARM_LRU]);
        total_bytes += sizes_bytes[slabs_clsid|WARM_LRU];
        pthread_mutex_unlock(&lru_locks[slabs_clsid|WARM_LRU]);
    }

    /* Juggle HOT/WARM up to N times */
    for (i = 0; i < 500; i++) {
        int do_more = 0;
        if (lru_pull_tail(slabs_clsid, HOT_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, hot_age, NULL) ||
            lru_pull_tail(slabs_clsid, WARM_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, warm_age, NULL)) {
            do_more++;
        }
        if (settings.lru_segmented) {
            do_more += lru_pull_tail(slabs_clsid, COLD_LRU, total_bytes, LRU_PULL_CRAWL_BLOCKS, 0, NULL);
        }
        if (do_more == 0)
            break;
        did_moves++;
    }
    return did_moves;
}
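
/* Age factors, illustrated (hypothetical numbers): if the COLD tail is 1000
 * seconds old and hot_max_factor is 0.2, HOT items older than 200s are
 * shuffled out even while HOT is under its size limit; a warm_max_factor of
 * 2.0 would let WARM items linger up to 2000s. */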

/* Will crawl all slab classes a minimum of once per hour */
#define MAX_MAINTCRAWL_WAIT (60 * 60)

/* Hoping user input will improve this function. This is all a wild guess.
 * Operation: Kicks crawler for each slab id. Crawlers take some statistics as
 * to items with nonzero expirations. It then buckets how many items will
 * expire per minute for the next hour.
 * This function checks the results of a run, and if it thinks more than 1% of
 * expirable objects are ready to go, kicks the crawler again to reap.
 * It will also kick the crawler once per minute regardless, waiting a minute
 * longer for each time it has no work to do, up to an hour wait time.
 * The latter is to keep newly started daemons from waiting too long before
 * retrying a crawl.
 */
static void lru_maintainer_crawler_check(struct crawler_expired_data *cdata, logger *l) {
    int i;
    static rel_time_t next_crawls[POWER_LARGEST];
    static rel_time_t next_crawl_wait[POWER_LARGEST];
    uint8_t todo[POWER_LARGEST];
    memset(todo, 0, sizeof(uint8_t) * POWER_LARGEST);
    bool do_run = false;
    unsigned int tocrawl_limit = 0;

    // TODO: If not segmented LRU, skip non-cold
    for (i = POWER_SMALLEST; i < POWER_LARGEST; i++) {
        crawlerstats_t *s = &cdata->crawlerstats[i];
        /* We've not successfully kicked off a crawl yet. */
        if (s->run_complete) {
            char *lru_name = "na";
            pthread_mutex_lock(&cdata->lock);
            int x;
            /* Should we crawl again? */
            uint64_t possible_reclaims = s->seen - s->noexp;
            uint64_t available_reclaims = 0;
            /* Need to think we can free at least 1% of the items before
             * crawling. */
            /* FIXME: Configurable? */
            uint64_t low_watermark = (possible_reclaims / 100) + 1;
            rel_time_t since_run = current_time - s->end_time;
            /* Don't bother if the payoff is too low. */
            for (x = 0; x < 60; x++) {
                available_reclaims += s->histo[x];
                if (available_reclaims > low_watermark) {
                    if (next_crawl_wait[i] < (x * 60)) {
                        next_crawl_wait[i] += 60;
                    } else if (next_crawl_wait[i] >= 60) {
                        next_crawl_wait[i] -= 60;
                    }
                    break;
                }
            }

            if (available_reclaims == 0) {
                next_crawl_wait[i] += 60;
            }

            if (next_crawl_wait[i] > MAX_MAINTCRAWL_WAIT) {
                next_crawl_wait[i] = MAX_MAINTCRAWL_WAIT;
            }

            next_crawls[i] = current_time + next_crawl_wait[i] + 5;
            switch (GET_LRU(i)) {
                case HOT_LRU:
                    lru_name = "hot";
                    break;
                case WARM_LRU:
                    lru_name = "warm";
                    break;
                case COLD_LRU:
                    lru_name = "cold";
                    break;
                case TEMP_LRU:
                    lru_name = "temp";
                    break;
            }
            LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_CRAWLER_STATUS, NULL,
                    CLEAR_LRU(i),
                    lru_name,
                    (unsigned long long)low_watermark,
                    (unsigned long long)available_reclaims,
                    (unsigned int)since_run,
                    next_crawls[i] - current_time,
                    s->end_time - s->start_time,
                    s->seen,
                    s->reclaimed);
            // Got our calculation, avoid running until next actual run.
            s->run_complete = false;
            pthread_mutex_unlock(&cdata->lock);
        }
        if (current_time > next_crawls[i]) {
            pthread_mutex_lock(&lru_locks[i]);
            if (sizes[i] > tocrawl_limit) {
                tocrawl_limit = sizes[i];
            }
            pthread_mutex_unlock(&lru_locks[i]);
            todo[i] = 1;
            do_run = true;
            next_crawls[i] = current_time + 5; // minimum retry wait.
        }
    }
    if (do_run) {
        if (settings.lru_crawler_tocrawl && settings.lru_crawler_tocrawl < tocrawl_limit) {
            tocrawl_limit = settings.lru_crawler_tocrawl;
        }
        lru_crawler_start(todo, tocrawl_limit, CRAWLER_AUTOEXPIRE, cdata, NULL, 0);
    }
}
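
/* The adaptive wait, illustrated (hypothetical numbers): if the first
 * histogram bucket that pushes available_reclaims past the 1% watermark is
 * x == 10 (reclaims ~10 minutes out) and next_crawl_wait is 120s, the wait
 * grows by 60s toward that horizon; once it meets or exceeds the horizon it
 * shrinks by 60s per run. With nothing reclaimable it grows by 60s each
 * time, capped at MAX_MAINTCRAWL_WAIT. */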

slab_automove_reg_t slab_automove_default = {
    .init = slab_automove_init,
    .free = slab_automove_free,
    .run = slab_automove_run
};
#ifdef EXTSTORE
slab_automove_reg_t slab_automove_extstore = {
    .init = slab_automove_extstore_init,
    .free = slab_automove_extstore_free,
    .run = slab_automove_extstore_run
};
#endif
static pthread_t lru_maintainer_tid;

#define MAX_LRU_MAINTAINER_SLEEP 1000000
#define MIN_LRU_MAINTAINER_SLEEP 1000

static void *lru_maintainer_thread(void *arg) {
    slab_automove_reg_t *sam = &slab_automove_default;
#ifdef EXTSTORE
    void *storage = arg;
    if (storage != NULL)
        sam = &slab_automove_extstore;
#endif
    int i;
    useconds_t to_sleep = MIN_LRU_MAINTAINER_SLEEP;
    useconds_t last_sleep = MIN_LRU_MAINTAINER_SLEEP;
    rel_time_t last_crawler_check = 0;
    rel_time_t last_automove_check = 0;
    useconds_t next_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
    useconds_t backoff_juggles[MAX_NUMBER_OF_SLAB_CLASSES] = {0};
    struct crawler_expired_data *cdata =
        calloc(1, sizeof(struct crawler_expired_data));
    if (cdata == NULL) {
        fprintf(stderr, "Failed to allocate crawler data for LRU maintainer thread\n");
        abort();
    }
    pthread_mutex_init(&cdata->lock, NULL);
    cdata->crawl_complete = true; // kick off the crawler.
    logger *l = logger_create();
    if (l == NULL) {
        fprintf(stderr, "Failed to allocate logger for LRU maintainer thread\n");
        abort();
    }

    double last_ratio = settings.slab_automove_ratio;
    void *am = sam->init(&settings);

    pthread_mutex_lock(&lru_maintainer_lock);
    if (settings.verbose > 2)
        fprintf(stderr, "Starting LRU maintainer background thread\n");
    while (do_run_lru_maintainer_thread) {
        pthread_mutex_unlock(&lru_maintainer_lock);
        if (to_sleep)
            usleep(to_sleep);
        pthread_mutex_lock(&lru_maintainer_lock);
        /* A sleep of zero counts as a minimum of a 1ms wait */
        last_sleep = to_sleep > 1000 ? to_sleep : 1000;
        to_sleep = MAX_LRU_MAINTAINER_SLEEP;

        STATS_LOCK();
        stats.lru_maintainer_juggles++;
        STATS_UNLOCK();

        /* Each slab class gets its own sleep to avoid hammering locks */
        for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
            next_juggles[i] = next_juggles[i] > last_sleep ? next_juggles[i] - last_sleep : 0;

            if (next_juggles[i] > 0) {
                // Sleep the thread just for the minimum amount (or not at all)
                if (next_juggles[i] < to_sleep)
                    to_sleep = next_juggles[i];
                continue;
            }

            int did_moves = lru_maintainer_juggle(i);
            if (did_moves == 0) {
                if (backoff_juggles[i] != 0) {
                    backoff_juggles[i] += backoff_juggles[i] / 8;
                } else {
                    backoff_juggles[i] = MIN_LRU_MAINTAINER_SLEEP;
                }
                if (backoff_juggles[i] > MAX_LRU_MAINTAINER_SLEEP)
                    backoff_juggles[i] = MAX_LRU_MAINTAINER_SLEEP;
            } else if (backoff_juggles[i] > 0) {
                backoff_juggles[i] /= 2;
                if (backoff_juggles[i] < MIN_LRU_MAINTAINER_SLEEP) {
                    backoff_juggles[i] = 0;
                }
            }
            next_juggles[i] = backoff_juggles[i];
            if (next_juggles[i] < to_sleep)
                to_sleep = next_juggles[i];
        }

        /* Minimize the sleep if we had async LRU bumps to process */
        if (settings.lru_segmented && lru_maintainer_bumps() && to_sleep > 1000) {
            to_sleep = 1000;
        }

        /* Once per second at most */
        if (settings.lru_crawler && last_crawler_check != current_time) {
            lru_maintainer_crawler_check(cdata, l);
            last_crawler_check = current_time;
        }

        if (settings.slab_automove == 1 && last_automove_check != current_time) {
            if (last_ratio != settings.slab_automove_ratio) {
                sam->free(am);
                am = sam->init(&settings);
                last_ratio = settings.slab_automove_ratio;
            }
            int src, dst;
            sam->run(am, &src, &dst);
            if (src != -1 && dst != -1) {
                slabs_reassign(src, dst);
                LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL,
                        src, dst);
            }
            // dst == 0 means reclaim to global pool, be more aggressive
            if (dst != 0) {
                last_automove_check = current_time;
            } else if (dst == 0) {
                // also ensure we minimize the thread sleep
                to_sleep = 1000;
            }
        }
    }
    pthread_mutex_unlock(&lru_maintainer_lock);
    sam->free(am);
    // LRU crawler *must* be stopped.
    free(cdata);
    if (settings.verbose > 2)
        fprintf(stderr, "LRU maintainer thread stopping\n");

    return NULL;
}

int stop_lru_maintainer_thread(void) {
    int ret;
    pthread_mutex_lock(&lru_maintainer_lock);
    /* LRU thread is a sleep loop, will die on its own */
    do_run_lru_maintainer_thread = 0;
    pthread_mutex_unlock(&lru_maintainer_lock);
    if ((ret = pthread_join(lru_maintainer_tid, NULL)) != 0) {
        fprintf(stderr, "Failed to stop LRU maintainer thread: %s\n", strerror(ret));
        return -1;
    }
    settings.lru_maintainer_thread = false;
    return 0;
}

int start_lru_maintainer_thread(void *arg) {
    int ret;

    pthread_mutex_lock(&lru_maintainer_lock);
    do_run_lru_maintainer_thread = 1;
    settings.lru_maintainer_thread = true;
    if ((ret = pthread_create(&lru_maintainer_tid, NULL,
        lru_maintainer_thread, arg)) != 0) {
        fprintf(stderr, "Can't create LRU maintainer thread: %s\n",
            strerror(ret));
        pthread_mutex_unlock(&lru_maintainer_lock);
        return -1;
    }
    pthread_mutex_unlock(&lru_maintainer_lock);

    return 0;
}

/* If we hold this lock, crawler can't wake up or move */
void lru_maintainer_pause(void) {
    pthread_mutex_lock(&lru_maintainer_lock);
}

void lru_maintainer_resume(void) {
    pthread_mutex_unlock(&lru_maintainer_lock);
}

int init_lru_maintainer(void) {
    lru_maintainer_initialized = 1;
    return 0;
}

/* Tail linkers and crawler for the LRU crawler. */
void do_item_linktail_q(item *it) { /* item is the new tail */
    item **head, **tail;
    assert(it->it_flags == 1);
    assert(it->nbytes == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    //assert(*tail != 0);
    assert(it != *tail);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = *tail;
    it->next = 0;
    if (it->prev) {
        assert(it->prev->next == 0);
        it->prev->next = it;
    }
    *tail = it;
    if (*head == 0) *head = it;
    return;
}

void do_item_unlinktail_q(item *it) {
    item **head, **tail;
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    return;
}

/* This is too convoluted, but it's a difficult shuffle. Try to rewrite it
 * more clearly. */
item *do_item_crawl_q(item *it) {
    item **head, **tail;
    assert(it->it_flags == 1);
    assert(it->nbytes == 0);
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    /* We've hit the head, pop off */
    if (it->prev == 0) {
        assert(*head == it);
        if (it->next) {
            *head = it->next;
            assert(it->next->prev == it);
            it->next->prev = 0;
        }
        return NULL; /* Done */
    }

    /* Swing ourselves in front of the next item */
    /* NB: If there is a prev, we can't be the head */
    assert(it->prev != it);
    if (it->prev) {
        if (*head == it->prev) {
            /* Prev was the head, now we're the head */
            *head = it;
        }
        if (*tail == it) {
            /* We are the tail, now they are the tail */
            *tail = it->prev;
        }
        assert(it->next != it);
        if (it->next) {
            assert(it->prev->next == it);
            it->prev->next = it->next;
            it->next->prev = it->prev;
        } else {
            /* Tail. Move this above? */
            it->prev->next = 0;
        }
        /* prev->prev's next is it->prev */
        it->next = it->prev;
        it->prev = it->next->prev;
        it->next->prev = it;
        /* New it->prev now, if we're not at the head. */
        if (it->prev) {
            it->prev->next = it;
        }
    }
    assert(it->next != it);
    assert(it->prev != it);

    return it->next; /* success */
}
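
/* The shuffle, illustrated: given ... P <-> A <-> C <-> B ... (head side on
 * the left, C being the crawler pseudo-item), one call relinks the list to
 * ... P <-> C <-> A <-> B ... and returns A. The crawler thus walks one step
 * toward the head per call, handing back each item it steps over, and
 * returns NULL once it pops off the head. */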