/*  Copyright 2017 Facebook.
 *
 *  Use and distribution licensed under the BSD license.  See
 *  the LICENSE file for full text.
 */

/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#include "memcached.h"
#include "slab_automove_extstore.h"
#include <stdlib.h>
#include <string.h>

#define MIN_PAGES_FOR_SOURCE 2    // class must own more than this many pages to donate one
#define MIN_PAGES_FOR_RECLAIM 2.5 // free chunks must exceed this many pages' worth to reclaim
#define MIN_PAGES_FREE 1.5        // floor on each class's free-chunk reserve, in pages' worth
#define MEMCHECK_PERIOD 60        // seconds between reserve recalculations in memcheck()

struct window_data {
    uint64_t age;             // snapshot of the class's item age this tick
    uint64_t dirty;           // evictions, OOMs, or page growth seen this tick
    uint64_t evicted;         // evictions or OOMs seen this tick
    unsigned int excess_free; // free chunks exceeded double the reserve this tick
    unsigned int relaxed;     // class was recently skipped or fed by the age balancer
};

struct window_global {
    uint32_t pool_low;  // global pool fell below half its target
    uint32_t pool_high; // global pool exceeded its target
};

typedef struct {
    struct window_data *window_data;     // per-class sample ring, laid out class-major
    struct window_global *window_global; // sample ring for the global page pool
    struct settings *settings;
    uint32_t window_size;
    uint32_t window_cur;                 // monotonic tick count; current slot is cur % size
    uint32_t item_size;                  // ext_item_size; classes with smaller chunks are "small"
    rel_time_t last_memcheck_run;
    double max_age_ratio;
    double free_ratio;                   // fraction of a class's chunks to hold free
    bool pool_filled_once;               // gates updates to ext_free_memchunks
    unsigned int free_mem[MAX_NUMBER_OF_SLAB_CLASSES]; // reserves: [0] in pages, [n] in chunks
    item_stats_automove iam_before[MAX_NUMBER_OF_SLAB_CLASSES];
    item_stats_automove iam_after[MAX_NUMBER_OF_SLAB_CLASSES];
    slab_stats_automove sam_before[MAX_NUMBER_OF_SLAB_CLASSES];
    slab_stats_automove sam_after[MAX_NUMBER_OF_SLAB_CLASSES];
} slab_automove;

void *slab_automove_extstore_init(struct settings *settings) {
    uint32_t window_size = settings->slab_automove_window;
    double max_age_ratio = settings->slab_automove_ratio;
    slab_automove *a = calloc(1, sizeof(slab_automove));
    if (a == NULL)
        return NULL;
    a->window_data = calloc(window_size * MAX_NUMBER_OF_SLAB_CLASSES, sizeof(struct window_data));
    a->window_global = calloc(window_size, sizeof(struct window_global));
    a->window_size = window_size;
    a->max_age_ratio = max_age_ratio;
    a->free_ratio = settings->slab_automove_freeratio;
    a->item_size = settings->ext_item_size;
    a->last_memcheck_run = 0;
    a->settings = settings;
    a->pool_filled_once = false;
    if (a->window_data == NULL || a->window_global == NULL) {
        free(a->window_data);   // free(NULL) is a safe no-op
        free(a->window_global);
        free(a);
        return NULL;
    }

    // do a dry run to fill the before structs
    fill_item_stats_automove(a->iam_before);
    fill_slab_stats_automove(a->sam_before);

    return (void *)a;
}

void slab_automove_extstore_free(void *arg) {
    slab_automove *a = (slab_automove *)arg;
    free(a->window_data);
    free(a->window_global);
    free(a);
}

static void window_sum(struct window_data *wd, struct window_data *w,
        uint32_t size) {
    for (uint32_t x = 0; x < size; x++) {
        struct window_data *d = &wd[x];
        w->age += d->age;
        w->dirty += d->dirty;
        w->evicted += d->evicted;
        w->excess_free += d->excess_free;
        w->relaxed += d->relaxed;
    }
}

/* This could potentially merge with above */
static void window_global_sum(struct window_global *wg,
        struct window_global *w, uint32_t size) {
    for (uint32_t x = 0; x < size; x++) {
        struct window_global *d = &wg[x];
        w->pool_high += d->pool_high;
        w->pool_low += d->pool_low;
    }
}

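/* Samples the global page pool into the current window slot. An illustrative
 * sketch with assumed numbers: if free_mem[0] is 10 pages, a pool of 4 pages
 * (below 10 / 2) records pool_low, 12 pages (above 10) records pool_high,
 * and 5..10 pages records neither. Before the memory limit is reached the
 * slot is left zeroed, since the pool legitimately runs dry during initial
 * fill.
 */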
static void global_pool_check(slab_automove *a) {
    bool mem_limit_reached;
    uint32_t free = a->free_mem[0];
    struct window_global *wg = &a->window_global[a->window_cur % a->window_size];
    unsigned int count = global_page_pool_size(&mem_limit_reached);
    memset(wg, 0, sizeof(struct window_global));
    if (!mem_limit_reached)
        return;
    if (count < free / 2) {
        wg->pool_low = 1;
        a->pool_filled_once = true;
    } else if (count > free) {
        wg->pool_high = 1;
    } else {
        a->pool_filled_once = true;
    }
}

/* A percentage of memory is configured to be held "free" as a buffer for the
 * external storage system:
 * - a % of global memory is kept in the global page pool.
 * - each slab class keeps a % of its chunks free, scaled by how much memory
 *   is currently in the class. This gives extstore time to flush data when
 *   spikes or waves of set traffic arrive.
 * The global page pool reserve acts as a secondary buffer for any slab class,
 * which helps absorb shifts in which class is active.
 */
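/* Worked example with assumed numbers (not defaults): say free_ratio is 0.01
 * and a class has 10,240 chunks per page. At 200 pages the reserve is
 * 200 * 10240 * 0.01 = 20480 free chunks (two pages' worth), which clears the
 * MIN_PAGES_FREE floor of 10240 * 1.5 = 15360. At 10 pages the computed
 * reserve would be only 1024 chunks, so the floor raises it to 15360.
 */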
static void memcheck(slab_automove *a) {
    unsigned int total_pages = 0;
    if (current_time < a->last_memcheck_run + MEMCHECK_PERIOD)
        return;
    a->last_memcheck_run = current_time;
    for (int n = 1; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        slab_stats_automove *sam = &a->sam_after[n];
        total_pages += sam->total_pages;
        unsigned int hold_free = (sam->total_pages * sam->chunks_per_page)
            * a->free_ratio;
        if (sam->chunks_per_page * MIN_PAGES_FREE > hold_free)
            hold_free = sam->chunks_per_page * MIN_PAGES_FREE;
        a->free_mem[n] = hold_free;
        if (a->settings->ext_free_memchunks[n] != hold_free && a->pool_filled_once) {
            a->settings->ext_free_memchunks[n] = hold_free;
        }
    }
    // remember to add what remains in the global pool.
    total_pages += a->sam_after[0].total_pages;
    a->free_mem[0] = total_pages * a->free_ratio;
}

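/* Illustrative indexing sketch, assuming window_size is 30: class 5's samples
 * occupy window_data[150..179], and tick 42 writes slot 150 + (42 % 30) = 162.
 */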
static struct window_data *get_window_data(slab_automove *a, int class) {
    int w_offset = class * a->window_size;
    return &a->window_data[w_offset + (a->window_cur % a->window_size)];
}

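/* Decision pass, run once per automove tick: samples stats and the global
 * pool, then picks at most one page move (*src -> *dst, class 0 being the
 * global pool). Roughly: a persistently high pool feeds the youngest class;
 * classes that stayed over-free for a full window give pages back to global;
 * a low pool pulls from the oldest class; otherwise a page moves to the
 * youngest class when the max_age_ratio check is violated. */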
void slab_automove_extstore_run(void *arg, int *src, int *dst) {
    slab_automove *a = (slab_automove *)arg;
    int n;
    struct window_data w_sum;
    int oldest = -1;
    uint64_t oldest_age = 0;
    int youngest = -1;
    uint64_t youngest_age = ~0;
    bool too_free = false;
    *src = -1;
    *dst = -1;

    global_pool_check(a);
    struct window_global wg_sum;
    memset(&wg_sum, 0, sizeof(struct window_global));
    window_global_sum(a->window_global, &wg_sum, a->window_size);
    // fill the after structs
    fill_item_stats_automove(a->iam_after);
    fill_slab_stats_automove(a->sam_after);
    a->window_cur++;

    memcheck(a);

    // iterate slabs
    for (n = POWER_SMALLEST; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
        bool small_slab = a->sam_before[n].chunk_size < a->item_size;
        bool free_enough = false;
        struct window_data *wd = get_window_data(a, n);
        // summarize the window up to now.
        memset(&w_sum, 0, sizeof(struct window_data));
        int w_offset = n * a->window_size;
        window_sum(&a->window_data[w_offset], &w_sum, a->window_size);
        memset(wd, 0, sizeof(struct window_data));

        // on an eviction, OOM, or page delta, mark the window dirty:
        // classes marked dirty cannot donate memory back to the global pool.
        if (a->iam_after[n].evicted - a->iam_before[n].evicted > 0 ||
            a->iam_after[n].outofmemory - a->iam_before[n].outofmemory > 0) {
            wd->evicted = 1;
            wd->dirty = 1;
        }
        if (a->sam_after[n].total_pages - a->sam_before[n].total_pages > 0) {
            wd->dirty = 1;
        }
        // Mark excess free if we're over the free mem limit for too long.
        // "free_enough" means the class is either wobbling, recently received
        // a new page of memory, or the crawler is freeing memory.
        if (a->sam_after[n].free_chunks > a->free_mem[n]) {
            free_enough = true;
        }
        // double the free requirement means we may have memory we can
        // reclaim to global, if it stays this way for the whole window.
        if (a->sam_after[n].free_chunks > (a->free_mem[n] * 2) && a->free_mem[n] > 0) {
            wd->excess_free = 1;
        }
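        // Illustrative numbers (assumed): with a reserve of 15360 chunks,
        // 20000 free chunks counts as free_enough; anything over twice the
        // reserve (30720) also sets excess_free for this tick.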

        // set age into window
        wd->age = a->iam_after[n].age;

        // grab age as average of window total
        uint64_t age = w_sum.age / a->window_size;

        // if > N free chunks and not dirty, reclaim memory.
        // small slab classes aren't age balanced and rely more on the global
        // pool, so reclaim them more aggressively.
        if (a->sam_after[n].free_chunks > a->sam_after[n].chunks_per_page * MIN_PAGES_FOR_RECLAIM
                && w_sum.dirty == 0) {
            if (small_slab) {
                *src = n;
                *dst = 0;
                too_free = true;
            } else if (w_sum.excess_free >= a->window_size) {
                // If a large slab's free chunks haven't decreased for a full
                // window, reclaim pages.
                *src = n;
                *dst = 0;
                too_free = true;
            }
        }

        if (!small_slab) {
            // the oldest class with more than MIN_PAGES_FOR_SOURCE pages
            // becomes the donation candidate.
            if (age > oldest_age
                    && a->sam_after[n].total_pages > MIN_PAGES_FOR_SOURCE) {
                oldest = n;
                oldest_age = age;
            }

            // don't count as youngest if it hasn't been using new chunks.
            // (if it was relaxed recently, and is currently "free enough")
            if (age < youngest_age && a->sam_after[n].total_pages != 0
                    && w_sum.excess_free < a->window_size
                    && !(w_sum.relaxed && free_enough)) {
                youngest = n;
                youngest_age = age;
            }
        }
    }

    memcpy(a->iam_before, a->iam_after,
            sizeof(item_stats_automove) * MAX_NUMBER_OF_SLAB_CLASSES);
    memcpy(a->sam_before, a->sam_after,
            sizeof(slab_stats_automove) * MAX_NUMBER_OF_SLAB_CLASSES);
    // only make decisions once the window has filled.
    if (a->window_cur < a->window_size)
        return;

    if (wg_sum.pool_high >= a->window_size && !wg_sum.pool_low && youngest != -1) {
        if (a->sam_after[youngest].free_chunks <= a->free_mem[youngest]) {
            *src = 0;
            *dst = youngest;
        }
        struct window_data *wd = get_window_data(a, youngest);
        // "relaxing" here and below allows us to skip classes which will
        // never grow or are growing slowly, more quickly finding other
        // classes which violate the age ratio.
        wd->relaxed = 1;
    } else if (!too_free && wg_sum.pool_low && oldest != -1) {
        *src = oldest;
        *dst = 0;
    } else if (!too_free && youngest != -1 && oldest != -1 && youngest != oldest) {
        // if we have a youngest and oldest, and the youngest violates the
        // age ratio, move global pool memory toward the youngest.
        if (youngest_age < ((double)oldest_age * a->max_age_ratio)) {
            struct window_data *wd = get_window_data(a, youngest);
            wd->relaxed = 1;
            // only actually assign more memory if it's absorbed what it has
            if (a->sam_after[youngest].free_chunks <= a->free_mem[youngest]) {
                *src = 0;
                *dst = youngest;
            }
        }
    }
}