1 /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3  *  memcached - memory caching daemon
4  *
5  *       https://www.memcached.org/
6  *
7  *  Copyright 2003 Danga Interactive, Inc.  All rights reserved.
8  *
9  *  Use and distribution licensed under the BSD license.  See
10  *  the LICENSE file for full text.
11  *
12  *  Authors:
13  *      Anatoly Vorobey <mellon@pobox.com>
14  *      Brad Fitzpatrick <brad@danga.com>
15  */
16 #include "memcached.h"
17 #include "storage.h"
18 #include "authfile.h"
19 #include "restart.h"
20 #include <sys/stat.h>
21 #include <sys/socket.h>
22 #include <sys/un.h>
23 #include <signal.h>
24 #include <sys/param.h>
25 #include <sys/resource.h>
26 #include <sys/uio.h>
27 #include <ctype.h>
28 #include <stdarg.h>
29 
30 /* some POSIX systems need the following definition
31  * to get mlockall flags out of sys/mman.h.  */
32 #ifndef _P1003_1B_VISIBLE
33 #define _P1003_1B_VISIBLE
34 #endif
35 #include <pwd.h>
36 #include <sys/mman.h>
37 #include <fcntl.h>
38 #include <netinet/tcp.h>
39 #include <arpa/inet.h>
40 #include <errno.h>
41 #include <stdlib.h>
42 #include <stdio.h>
43 #include <string.h>
44 #include <time.h>
45 #include <assert.h>
46 #include <sysexits.h>
47 #include <stddef.h>
48 
49 #ifdef HAVE_GETOPT_LONG
50 #include <getopt.h>
51 #endif
52 
53 #ifdef TLS
54 #include "tls.h"
55 #endif
56 
57 #include "proto_text.h"
58 #include "proto_bin.h"
59 
60 #if defined(__FreeBSD__)
61 #include <sys/sysctl.h>
62 #endif
63 
/*
 * forward declarations
 */
static void drive_machine(conn *c);
static int new_socket(struct addrinfo *ai);
/* Plain-socket I/O methods. conn_new() installs these as c->read,
 * c->sendmsg and c->write when it is not given an SSL object. */
static ssize_t tcp_read(conn *arg, void *buf, size_t count);
static ssize_t tcp_sendmsg(conn *arg, struct msghdr *msg, int flags);
static ssize_t tcp_write(conn *arg, void *buf, size_t count);
72 
/* Result codes for the network read helpers; this is the return type of
 * try_read_network() and try_read_udp(). */
enum try_read_result {
    READ_DATA_RECEIVED,
    READ_NO_DATA_RECEIVED,
    READ_ERROR,            /** an error occurred (on the socket) (or client closed connection) */
    READ_MEMORY_ERROR      /** failed to allocate more memory */
};
79 
/* Command readers that conn_new() can install as c->try_read_command
 * (negotiating-protocol and UDP connections respectively). */
static int try_read_command_negotiate(conn *c);
static int try_read_command_udp(conn *c);

static enum try_read_result try_read_network(conn *c);
static enum try_read_result try_read_udp(conn *c);

/* Starts the idle-connection timeout thread; returns 0 on success and -1
 * when idle timeouts are disabled or the thread cannot be created. */
static int start_conn_timeout_thread();


/* stats */
static void stats_init(void);
static void conn_to_str(const conn *c, char *addr, char *svr_addr);

/* defaults */
static void settings_init(void);

/* event handling, network IO */
static void event_handler(const evutil_socket_t fd, const short which, void *arg);
static void conn_close(conn *c);
static void conn_init(void);
static bool update_event(conn *c, const int new_flags);
static void complete_nread(conn *c);

static void conn_free(conn *c);
104 
/** exported globals **/
struct stats stats;
struct stats_state stats_state;
struct settings settings;
time_t process_started;     /* process start time, deliberately backdated by stats_init() */
conn **conns;               /* connection table indexed by file descriptor; allocated in conn_init() */

struct slab_rebalance slab_rebal;
volatile int slab_rebalance_signal;
#ifdef EXTSTORE
/* hoping this is temporary; I'd prefer to cut globals, but will complete this
 * battle another day.
 */
void *ext_storage = NULL;
#endif
/** file scope variables **/
static conn *listen_conn = NULL;
static int max_fds;         /* number of slots in conns[]; computed in conn_init() */
static struct event_base *main_base;
124 
/* Result codes returned by transmit(). */
enum transmit_result {
    TRANSMIT_COMPLETE,   /** All done writing. */
    TRANSMIT_INCOMPLETE, /** More data remaining to write. */
    TRANSMIT_SOFT_ERROR, /** Can't write any more right now. */
    TRANSMIT_HARD_ERROR  /** Can't write (c->state is set to conn_closing) */
};
131 
132 /* Default methods to read from/ write to a socket */
tcp_read(conn * c,void * buf,size_t count)133 ssize_t tcp_read(conn *c, void *buf, size_t count) {
134     assert (c != NULL);
135     return read(c->sfd, buf, count);
136 }
137 
/* Sends the message described by msg on the connection's descriptor and
 * returns sendmsg(2)'s result unchanged. */
ssize_t tcp_sendmsg(conn *c, struct msghdr *msg, int flags) {
    assert (c != NULL);
    const int fd = c->sfd;
    return sendmsg(fd, msg, flags);
}
142 
/* Writes up to count bytes from buf to the connection's descriptor and
 * returns write(2)'s result unchanged. */
ssize_t tcp_write(conn *c, void *buf, size_t count) {
    assert (c != NULL);
    const int fd = c->sfd;
    return write(fd, buf, count);
}
147 
static enum transmit_result transmit(conn *c);

/* This reduces the latency without adding lots of extra wiring to be able to
 * notify the listener thread of when to listen again.
 * Also, the clock timer could be broken out into its own thread and we
 * can block the listener via a condition.
 */
/* Polled by maxconns_handler(); conn_close() sets it back to true while
 * holding conn_lock. */
static volatile bool allow_new_conns = true;
static int stop_main_loop = NOT_STOP;
/* Timer event that maxconns_handler() re-arms every 10ms while it polls. */
static struct event maxconnsevent;
maxconns_handler(const evutil_socket_t fd,const short which,void * arg)158 static void maxconns_handler(const evutil_socket_t fd, const short which, void *arg) {
159     struct timeval t = {.tv_sec = 0, .tv_usec = 10000};
160 
161     if (fd == -42 || allow_new_conns == false) {
162         /* reschedule in 10ms if we need to keep polling */
163         evtimer_set(&maxconnsevent, maxconns_handler, 0);
164         event_base_set(main_base, &maxconnsevent);
165         evtimer_add(&maxconnsevent, &t);
166     } else {
167         evtimer_del(&maxconnsevent);
168         accept_new_conns(true);
169     }
170 }
171 
172 /*
173  * given time value that's either unix time or delta from current unix time, return
174  * unix time. Use the fact that delta can't exceed one month (and real time value can't
175  * be that low).
176  */
realtime(const time_t exptime)177 rel_time_t realtime(const time_t exptime) {
178     /* no. of seconds in 30 days - largest possible delta exptime */
179 
180     if (exptime == 0) return 0; /* 0 means never expire */
181 
182     if (exptime > REALTIME_MAXDELTA) {
183         /* if item expiration is at/before the server started, give it an
184            expiration time of 1 second after the server started.
185            (because 0 means don't expire).  without this, we'd
186            underflow and wrap around to some large value way in the
187            future, effectively making items expiring in the past
188            really expiring never */
189         if (exptime <= process_started)
190             return (rel_time_t)1;
191         return (rel_time_t)(exptime - process_started);
192     } else {
193         return (rel_time_t)(exptime + current_time);
194     }
195 }
196 
stats_init(void)197 static void stats_init(void) {
198     memset(&stats, 0, sizeof(struct stats));
199     memset(&stats_state, 0, sizeof(struct stats_state));
200     stats_state.accepting_conns = true; /* assuming we start in this state. */
201 
202     /* make the time we started always be 2 seconds before we really
203        did, so time(0) - time.started is never zero.  if so, things
204        like 'settings.oldest_live' which act as booleans as well as
205        values are now false in boolean context... */
206     process_started = time(0) - ITEM_UPDATE_INTERVAL - 2;
207     stats_prefix_init(settings.prefix_delimiter);
208 }
209 
stats_reset(void)210 void stats_reset(void) {
211     STATS_LOCK();
212     memset(&stats, 0, sizeof(struct stats));
213     stats_prefix_clear();
214     STATS_UNLOCK();
215     threadlocal_stats_reset();
216     item_stats_reset();
217 }
218 
settings_init(void)219 static void settings_init(void) {
220     settings.use_cas = true;
221     settings.access = 0700;
222     settings.port = 11211;
223     settings.udpport = 0;
224 #ifdef TLS
225     settings.ssl_enabled = false;
226     settings.ssl_ctx = NULL;
227     settings.ssl_chain_cert = NULL;
228     settings.ssl_key = NULL;
229     settings.ssl_verify_mode = SSL_VERIFY_NONE;
230     settings.ssl_keyformat = SSL_FILETYPE_PEM;
231     settings.ssl_ciphers = NULL;
232     settings.ssl_ca_cert = NULL;
233     settings.ssl_last_cert_refresh_time = current_time;
234     settings.ssl_wbuf_size = 16 * 1024; // default is 16KB (SSL max frame size is 17KB)
235     settings.ssl_session_cache = false;
236     settings.ssl_min_version = TLS1_2_VERSION;
237 #endif
238     /* By default this string should be NULL for getaddrinfo() */
239     settings.inter = NULL;
240     settings.maxbytes = 64 * 1024 * 1024; /* default is 64MB */
241     settings.maxconns = 1024;         /* to limit connections-related memory to about 5MB */
242     settings.verbose = 0;
243     settings.oldest_live = 0;
244     settings.oldest_cas = 0;          /* supplements accuracy of oldest_live */
245     settings.evict_to_free = 1;       /* push old items out of cache when memory runs out */
246     settings.socketpath = NULL;       /* by default, not using a unix socket */
247     settings.auth_file = NULL;        /* by default, not using ASCII authentication tokens */
248     settings.factor = 1.25;
249     settings.chunk_size = 48;         /* space for a modest key and value */
250     settings.num_threads = 4;         /* N workers */
251     settings.num_threads_per_udp = 0;
252     settings.prefix_delimiter = ':';
253     settings.detail_enabled = 0;
254     settings.reqs_per_event = 20;
255     settings.backlog = 1024;
256     settings.binding_protocol = negotiating_prot;
257     settings.item_size_max = 1024 * 1024; /* The famous 1MB upper limit. */
258     settings.slab_page_size = 1024 * 1024; /* chunks are split from 1MB pages. */
259     settings.slab_chunk_size_max = settings.slab_page_size / 2;
260     settings.sasl = false;
261     settings.maxconns_fast = true;
262     settings.lru_crawler = false;
263     settings.lru_crawler_sleep = 100;
264     settings.lru_crawler_tocrawl = 0;
265     settings.lru_maintainer_thread = false;
266     settings.lru_segmented = true;
267     settings.hot_lru_pct = 20;
268     settings.warm_lru_pct = 40;
269     settings.hot_max_factor = 0.2;
270     settings.warm_max_factor = 2.0;
271     settings.temp_lru = false;
272     settings.temporary_ttl = 61;
273     settings.idle_timeout = 0; /* disabled */
274     settings.hashpower_init = 0;
275     settings.slab_reassign = true;
276     settings.slab_automove = 1;
277     settings.slab_automove_ratio = 0.8;
278     settings.slab_automove_window = 30;
279     settings.shutdown_command = false;
280     settings.tail_repair_time = TAIL_REPAIR_TIME_DEFAULT;
281     settings.flush_enabled = true;
282     settings.dump_enabled = true;
283     settings.crawls_persleep = 1000;
284     settings.logger_watcher_buf_size = LOGGER_WATCHER_BUF_SIZE;
285     settings.logger_buf_size = LOGGER_BUF_SIZE;
286     settings.drop_privileges = false;
287     settings.watch_enabled = true;
288     settings.read_buf_mem_limit = 0;
289 #ifdef MEMCACHED_DEBUG
290     settings.relaxed_privileges = false;
291 #endif
292     settings.num_napi_ids = 0;
293     settings.memory_file = NULL;
294 }
295 
extern pthread_mutex_t conn_lock;

/* Connection timeout thread bits */
static pthread_t conn_timeout_tid;
/* Run flag: set to 1 by start_conn_timeout_thread() and cleared by
 * stop_conn_timeout_thread(). */
static int do_run_conn_timeout_thread;
/* Signalled by stop_conn_timeout_thread() to wake the thread from its timed wait. */
static pthread_cond_t conn_timeout_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t conn_timeout_lock = PTHREAD_MUTEX_INITIALIZER;

/* The timeout thread pauses (usleep) once per this many conns[] slots scanned. */
#define CONNS_PER_SLICE 100
conn_timeout_thread(void * arg)305 static void *conn_timeout_thread(void *arg) {
306     int i;
307     conn *c;
308     rel_time_t oldest_last_cmd;
309     int sleep_time;
310     int sleep_slice = max_fds / CONNS_PER_SLICE;
311     if (sleep_slice == 0)
312         sleep_slice = CONNS_PER_SLICE;
313 
314     useconds_t timeslice = 1000000 / sleep_slice;
315 
316     mutex_lock(&conn_timeout_lock);
317     while(do_run_conn_timeout_thread) {
318         if (settings.verbose > 2)
319             fprintf(stderr, "idle timeout thread at top of connection list\n");
320 
321         oldest_last_cmd = current_time;
322 
323         for (i = 0; i < max_fds; i++) {
324             if ((i % CONNS_PER_SLICE) == 0) {
325                 if (settings.verbose > 2)
326                     fprintf(stderr, "idle timeout thread sleeping for %ulus\n",
327                         (unsigned int)timeslice);
328                 usleep(timeslice);
329             }
330 
331             if (!conns[i])
332                 continue;
333 
334             c = conns[i];
335 
336             if (!IS_TCP(c->transport))
337                 continue;
338 
339             if (c->state != conn_new_cmd && c->state != conn_read)
340                 continue;
341 
342             if ((current_time - c->last_cmd_time) > settings.idle_timeout) {
343                 timeout_conn(c);
344             } else {
345                 if (c->last_cmd_time < oldest_last_cmd)
346                     oldest_last_cmd = c->last_cmd_time;
347             }
348         }
349 
350         /* This is the soonest we could have another connection time out */
351         sleep_time = settings.idle_timeout - (current_time - oldest_last_cmd) + 1;
352         if (sleep_time <= 0)
353             sleep_time = 1;
354 
355         if (settings.verbose > 2)
356             fprintf(stderr,
357                     "idle timeout thread finished pass, sleeping for %ds\n",
358                     sleep_time);
359 
360         struct timeval now;
361         struct timespec to_sleep;
362         gettimeofday(&now, NULL);
363         to_sleep.tv_sec = now.tv_sec + sleep_time;
364         to_sleep.tv_nsec = 0;
365 
366         pthread_cond_timedwait(&conn_timeout_cond, &conn_timeout_lock, &to_sleep);
367     }
368 
369     mutex_unlock(&conn_timeout_lock);
370     return NULL;
371 }
372 
start_conn_timeout_thread()373 static int start_conn_timeout_thread() {
374     int ret;
375 
376     if (settings.idle_timeout == 0)
377         return -1;
378 
379     do_run_conn_timeout_thread = 1;
380     if ((ret = pthread_create(&conn_timeout_tid, NULL,
381         conn_timeout_thread, NULL)) != 0) {
382         fprintf(stderr, "Can't create idle connection timeout thread: %s\n",
383             strerror(ret));
384         return -1;
385     }
386 
387     return 0;
388 }
389 
stop_conn_timeout_thread(void)390 int stop_conn_timeout_thread(void) {
391     if (!do_run_conn_timeout_thread)
392         return -1;
393     mutex_lock(&conn_timeout_lock);
394     do_run_conn_timeout_thread = 0;
395     pthread_cond_signal(&conn_timeout_cond);
396     mutex_unlock(&conn_timeout_lock);
397     pthread_join(conn_timeout_tid, NULL);
398     return 0;
399 }
400 
401 /*
402  * read buffer cache helper functions
403  */
/*
 * Gives back a connection's read buffer once it holds no unread bytes.
 * A malloc'ed buffer is freed; otherwise it goes back to the worker
 * thread's rbuf_cache. UDP connections are left alone.
 */
static void rbuf_release(conn *c) {
    /* Nothing to release, data still pending, or UDP: leave it alone. */
    if (c->rbuf == NULL || c->rbytes != 0 || IS_UDP(c->transport))
        return;

    if (!c->rbuf_malloced) {
        do_cache_free(c->thread->rbuf_cache, c->rbuf);
    } else {
        free(c->rbuf);
        c->rbuf_malloced = false;
    }

    c->rbuf = NULL;
    c->rcurr = NULL;
    c->rsize = 0;
}
417 
/*
 * Ensures the connection has a read buffer, taking one from the worker
 * thread's rbuf_cache if it has none.
 *
 * Returns true when a buffer is present afterwards, false when the cache
 * could not supply one (the thread's read_buf_oom counter is bumped).
 */
static bool rbuf_alloc(conn *c) {
    if (c->rbuf != NULL)
        return true;

    c->rbuf = do_cache_alloc(c->thread->rbuf_cache);
    if (c->rbuf == NULL) {
        THR_STATS_LOCK(c);
        c->thread->stats.read_buf_oom++;
        THR_STATS_UNLOCK(c);
        return false;
    }

    c->rcurr = c->rbuf;
    c->rsize = READ_BUFFER_SIZE;
    return true;
}
432 
433 // Just for handling huge ASCII multigets.
434 // The previous system was essentially the same; realloc'ing until big enough,
435 // then realloc'ing back down after the request finished.
rbuf_switch_to_malloc(conn * c)436 bool rbuf_switch_to_malloc(conn *c) {
437     // Might as well start with x2 and work from there.
438     size_t size = c->rsize * 2;
439     char *tmp = malloc(size);
440     if (!tmp)
441         return false;
442 
443     do_cache_free(c->thread->rbuf_cache, c->rbuf);
444     memcpy(tmp, c->rcurr, c->rbytes);
445 
446     c->rcurr = c->rbuf = tmp;
447     c->rsize = size;
448     c->rbuf_malloced = true;
449     return true;
450 }
451 
452 /*
453  * Initializes the connections array. We don't actually allocate connection
454  * structures until they're needed, so as to avoid wasting memory when the
455  * maximum connection count is much higher than the actual number of
456  * connections.
457  *
458  * This does end up wasting a few pointers' worth of memory for FDs that are
459  * used for things other than connections, but that's worth it in exchange for
460  * being able to directly index the conns array by FD.
461  */
conn_init(void)462 static void conn_init(void) {
463     /* We're unlikely to see an FD much higher than maxconns. */
464     int next_fd = dup(1);
465     if (next_fd < 0) {
466         perror("Failed to duplicate file descriptor\n");
467         exit(1);
468     }
469     int headroom = 10;      /* account for extra unexpected open FDs */
470     struct rlimit rl;
471 
472     max_fds = settings.maxconns + headroom + next_fd;
473 
474     /* But if possible, get the actual highest FD we can possibly ever see. */
475     if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
476         max_fds = rl.rlim_max;
477     } else {
478         fprintf(stderr, "Failed to query maximum file descriptor; "
479                         "falling back to maxconns\n");
480     }
481 
482     close(next_fd);
483 
484     if ((conns = calloc(max_fds, sizeof(conn *))) == NULL) {
485         fprintf(stderr, "Failed to allocate connection structures\n");
486         /* This is unrecoverable so bail out early. */
487         exit(1);
488     }
489 }
490 
prot_text(enum protocol prot)491 static const char *prot_text(enum protocol prot) {
492     char *rv = "unknown";
493     switch(prot) {
494         case ascii_prot:
495             rv = "ascii";
496             break;
497         case binary_prot:
498             rv = "binary";
499             break;
500         case negotiating_prot:
501             rv = "auto-negotiate";
502             break;
503     }
504     return rv;
505 }
506 
/*
 * Closes a connection that has been idle for longer than
 * settings.idle_timeout.
 *
 * Does nothing when idle timeouts are disabled or the connection has not
 * been idle long enough. A connection that is not in conn_new_cmd or
 * conn_read state is left alone (with a diagnostic at verbose > 1).
 * Otherwise the thread's idle_kicks counter is bumped, the close reason is
 * recorded, and the state machine is driven from conn_closing.
 */
void conn_close_idle(conn *c) {
    if (settings.idle_timeout > 0 &&
        (current_time - c->last_cmd_time) > settings.idle_timeout) {
        if (c->state != conn_new_cmd && c->state != conn_read) {
            /* Newline-terminated like every other diagnostic, so the next
             * log line does not run into this one. */
            if (settings.verbose > 1)
                fprintf(stderr,
                    "fd %d wants to timeout, but isn't in read state\n", c->sfd);
            return;
        }

        if (settings.verbose > 1)
            fprintf(stderr, "Closing idle fd %d\n", c->sfd);

        pthread_mutex_lock(&c->thread->stats.mutex);
        c->thread->stats.idle_kicks++;
        pthread_mutex_unlock(&c->thread->stats.mutex);

        c->close_reason = IDLE_TIMEOUT_CLOSE;

        conn_set_state(c, conn_closing);
        drive_machine(c);
    }
}
530 
531 /* bring conn back from a sidethread. could have had its event base moved. */
conn_worker_readd(conn * c)532 void conn_worker_readd(conn *c) {
533     if (c->state == conn_io_queue) {
534         c->io_queues_submitted--;
535         // If we're still waiting for other queues to return, don't re-add the
536         // connection yet.
537         if (c->io_queues_submitted != 0) {
538             return;
539         }
540     }
541     c->ev_flags = EV_READ | EV_PERSIST;
542     event_set(&c->event, c->sfd, c->ev_flags, event_handler, (void *)c);
543     event_base_set(c->thread->base, &c->event);
544 
545     // TODO: call conn_cleanup/fail/etc
546     if (event_add(&c->event, 0) == -1) {
547         perror("event_add");
548     }
549 
550     // side thread wanted us to close immediately.
551     if (c->state == conn_closing) {
552         drive_machine(c);
553         return;
554     } else if (c->state == conn_io_queue) {
555         // machine will know how to return based on secondary state.
556         drive_machine(c);
557     } else {
558         conn_set_state(c, conn_new_cmd);
559     }
560 }
561 
/*
 * Registers an IO queue type on a worker thread by filling the first unused
 * slot (type == IO_QUEUE_NONE) of t->io_queues with the context and
 * callbacks.
 *
 * No bounds check is performed: the caller must guarantee a free slot
 * exists.
 */
void thread_io_queue_add(LIBEVENT_THREAD *t, int type, void *ctx, io_queue_stack_cb cb, io_queue_stack_cb com_cb, io_queue_cb ret_cb, io_queue_cb fin_cb) {
    io_queue_cb_t *slot;

    /* Walk past the entries that are already taken. */
    for (slot = t->io_queues; slot->type != IO_QUEUE_NONE; slot++) {
        /* nothing */
    }

    slot->type = type;
    slot->ctx = ctx;
    slot->submit_cb = cb;
    slot->complete_cb = com_cb;
    slot->return_cb = ret_cb;
    slot->finalize_cb = fin_cb;
}
575 
/*
 * Mirrors the worker thread's registered IO queues into the connection's
 * own queue array: type and ctx are copied slot by slot, and each
 * connection-side queue starts with no stack context and a zero count.
 */
void conn_io_queue_setup(conn *c) {
    io_queue_t *dst = c->io_queues;

    for (io_queue_cb_t *src = c->thread->io_queues;
         src->type != IO_QUEUE_NONE;
         src++, dst++) {
        dst->type = src->type;
        dst->ctx = src->ctx;
        dst->stack_ctx = NULL;
        dst->count = 0;
    }
}
588 
589 // To be called from conn_release_items to ensure the stack ptrs are reset.
conn_io_queue_reset(conn * c)590 static void conn_io_queue_reset(conn *c) {
591     for (io_queue_t *q = c->io_queues; q->type != IO_QUEUE_NONE; q++) {
592         assert(q->count == 0);
593         q->stack_ctx = NULL;
594     }
595 }
596 
thread_io_queue_get(LIBEVENT_THREAD * t,int type)597 io_queue_cb_t *thread_io_queue_get(LIBEVENT_THREAD *t, int type) {
598     io_queue_cb_t *q = t->io_queues;
599     while (q->type != IO_QUEUE_NONE) {
600         if (q->type == type) {
601             return q;
602         }
603         q++;
604     }
605     return NULL;
606 }
607 
conn_io_queue_get(conn * c,int type)608 io_queue_t *conn_io_queue_get(conn *c, int type) {
609     io_queue_t *q = c->io_queues;
610     while (q->type != IO_QUEUE_NONE) {
611         if (q->type == type) {
612             return q;
613         }
614         q++;
615     }
616     return NULL;
617 }
618 
619 // called after returning to the main worker thread.
620 // users of the queue need to distinguish if the IO was actually consumed or
621 // not and handle appropriately.
conn_io_queue_complete(conn * c)622 static void conn_io_queue_complete(conn *c) {
623     io_queue_t *q = c->io_queues;
624     io_queue_cb_t *qcb = c->thread->io_queues;
625     while (q->type != IO_QUEUE_NONE) {
626         if (q->stack_ctx) {
627             qcb->complete_cb(q);
628         }
629         qcb++;
630         q++;
631     }
632 }
633 
634 // called to return a single IO object to the original worker thread.
conn_io_queue_return(io_pending_t * io)635 void conn_io_queue_return(io_pending_t *io) {
636     io_queue_cb_t *q = thread_io_queue_get(io->thread, io->io_queue_type);
637     q->return_cb(io);
638     return;
639 }
640 
conn_new(const int sfd,enum conn_states init_state,const int event_flags,const int read_buffer_size,enum network_transport transport,struct event_base * base,void * ssl)641 conn *conn_new(const int sfd, enum conn_states init_state,
642                 const int event_flags,
643                 const int read_buffer_size, enum network_transport transport,
644                 struct event_base *base, void *ssl) {
645     conn *c;
646 
647     assert(sfd >= 0 && sfd < max_fds);
648     c = conns[sfd];
649 
650     if (NULL == c) {
651         if (!(c = (conn *)calloc(1, sizeof(conn)))) {
652             STATS_LOCK();
653             stats.malloc_fails++;
654             STATS_UNLOCK();
655             fprintf(stderr, "Failed to allocate connection object\n");
656             return NULL;
657         }
658         MEMCACHED_CONN_CREATE(c);
659         c->read = NULL;
660         c->sendmsg = NULL;
661         c->write = NULL;
662         c->rbuf = NULL;
663 
664         c->rsize = read_buffer_size;
665 
666         // UDP connections use a persistent static buffer.
667         if (c->rsize) {
668             c->rbuf = (char *)malloc((size_t)c->rsize);
669         }
670 
671         if (c->rsize && c->rbuf == NULL) {
672             conn_free(c);
673             STATS_LOCK();
674             stats.malloc_fails++;
675             STATS_UNLOCK();
676             fprintf(stderr, "Failed to allocate buffers for connection\n");
677             return NULL;
678         }
679 
680 
681         STATS_LOCK();
682         stats_state.conn_structs++;
683         STATS_UNLOCK();
684 
685         c->sfd = sfd;
686         conns[sfd] = c;
687     }
688 
689     c->transport = transport;
690     c->protocol = settings.binding_protocol;
691 
692     /* unix socket mode doesn't need this, so zeroed out.  but why
693      * is this done for every command?  presumably for UDP
694      * mode.  */
695     if (!settings.socketpath) {
696         c->request_addr_size = sizeof(c->request_addr);
697     } else {
698         c->request_addr_size = 0;
699     }
700 
701     if (transport == tcp_transport && init_state == conn_new_cmd) {
702         if (getpeername(sfd, (struct sockaddr *) &c->request_addr,
703                         &c->request_addr_size)) {
704             perror("getpeername");
705             memset(&c->request_addr, 0, sizeof(c->request_addr));
706         }
707     }
708 
709     if (init_state == conn_new_cmd) {
710         LOGGER_LOG(NULL, LOG_CONNEVENTS, LOGGER_CONNECTION_NEW, NULL,
711                 &c->request_addr, c->request_addr_size, c->transport, 0, sfd);
712     }
713 
714     if (settings.verbose > 1) {
715         if (init_state == conn_listening) {
716             fprintf(stderr, "<%d server listening (%s)\n", sfd,
717                 prot_text(c->protocol));
718         } else if IS_UDP(transport) {
719             fprintf(stderr, "<%d server listening (udp)\n", sfd);
720         } else if (c->protocol == negotiating_prot) {
721             fprintf(stderr, "<%d new auto-negotiating client connection\n",
722                     sfd);
723         } else if (c->protocol == ascii_prot) {
724             fprintf(stderr, "<%d new ascii client connection.\n", sfd);
725         } else if (c->protocol == binary_prot) {
726             fprintf(stderr, "<%d new binary client connection.\n", sfd);
727         } else {
728             fprintf(stderr, "<%d new unknown (%d) client connection\n",
729                 sfd, c->protocol);
730             assert(false);
731         }
732     }
733 
734 #ifdef TLS
735     c->ssl = NULL;
736     c->ssl_wbuf = NULL;
737     c->ssl_enabled = false;
738 #endif
739     c->state = init_state;
740     c->rlbytes = 0;
741     c->cmd = -1;
742     c->rbytes = 0;
743     c->rcurr = c->rbuf;
744     c->ritem = 0;
745     c->rbuf_malloced = false;
746     c->item_malloced = false;
747     c->sasl_started = false;
748     c->set_stale = false;
749     c->mset_res = false;
750     c->close_after_write = false;
751     c->last_cmd_time = current_time; /* initialize for idle kicker */
752     // wipe all queues.
753     memset(c->io_queues, 0, sizeof(c->io_queues));
754     c->io_queues_submitted = 0;
755 
756     c->item = 0;
757 
758     c->noreply = false;
759 
760 #ifdef TLS
761     if (ssl) {
762         c->ssl = (SSL*)ssl;
763         c->read = ssl_read;
764         c->sendmsg = ssl_sendmsg;
765         c->write = ssl_write;
766         c->ssl_enabled = true;
767         SSL_set_info_callback(c->ssl, ssl_callback);
768     } else
769 #else
770     // This must be NULL if TLS is not enabled.
771     assert(ssl == NULL);
772 #endif
773     {
774         c->read = tcp_read;
775         c->sendmsg = tcp_sendmsg;
776         c->write = tcp_write;
777     }
778 
779     if IS_UDP(transport) {
780         c->try_read_command = try_read_command_udp;
781     } else {
782         switch (c->protocol) {
783             case ascii_prot:
784                 if (settings.auth_file == NULL) {
785                     c->authenticated = true;
786                     c->try_read_command = try_read_command_ascii;
787                 } else {
788                     c->authenticated = false;
789                     c->try_read_command = try_read_command_asciiauth;
790                 }
791                 break;
792             case binary_prot:
793                 // binprot handles its own authentication via SASL parsing.
794                 c->authenticated = false;
795                 c->try_read_command = try_read_command_binary;
796                 break;
797             case negotiating_prot:
798                 c->try_read_command = try_read_command_negotiate;
799                 break;
800         }
801     }
802 
803     event_set(&c->event, sfd, event_flags, event_handler, (void *)c);
804     event_base_set(base, &c->event);
805     c->ev_flags = event_flags;
806 
807     if (event_add(&c->event, 0) == -1) {
808         perror("event_add");
809         return NULL;
810     }
811 
812     STATS_LOCK();
813     stats_state.curr_conns++;
814     stats.total_conns++;
815     STATS_UNLOCK();
816 
817     MEMCACHED_CONN_ALLOCATE(c->sfd);
818 
819     return c;
820 }
821 
/*
 * Drops everything a connection is still holding on to: the in-flight item
 * (freed if it was malloc'ed, otherwise released via item_remove()) and any
 * responses that were queued but never sent. If responses were queued, the
 * IO queue stack pointers are reset afterwards.
 */
void conn_release_items(conn *c) {
    assert(c != NULL);

    if (c->item) {
        if (!c->item_malloced) {
            item_remove(c->item);
        } else {
            free(c->item);
            c->item_malloced = false;
        }
        c->item = 0;
    }

    // Nothing queued: no responses to cull and no queues to reset.
    if (!c->resp_head)
        return;

    // Cull any unsent responses.
    // r_f() handles the chain maintenance.
    for (mc_resp *resp = c->resp_head; resp; resp = resp_finish(c, resp)) {
        // temporary by default. hide behind a debug flag in the future:
        // double free detection. Transmit loops can drop out early, but
        // here we could infinite loop.
        if (resp->free) {
            fprintf(stderr, "ERROR: double free detected during conn_release_items(): [%d] [%s]\n",
                    c->sfd, c->protocol == binary_prot ? "binary" : "ascii");
            // Since this is a critical failure, just leak the memory.
            // If these errors are seen, an abort() can be used instead.
            c->resp_head = NULL;
            c->resp = NULL;
            break;
        }
    }

    conn_io_queue_reset(c);
}
857 
/*
 * Releases per-request resources of a connection: held items and queued
 * responses, then any SASL context. UDP connections are put back into
 * conn_read state.
 */
static void conn_cleanup(conn *c) {
    assert(c != NULL);

    conn_release_items(c);

    if (c->sasl_conn) {
        /* A SASL context can only exist when SASL is enabled. */
        assert(settings.sasl);
        sasl_dispose(&c->sasl_conn);
        c->sasl_conn = NULL;
    }

    if (IS_UDP(c->transport)) {
        conn_set_state(c, conn_read);
    }
}
873 
874 /*
875  * Frees a connection.
876  */
conn_free(conn * c)877 void conn_free(conn *c) {
878     if (c) {
879         assert(c != NULL);
880         assert(c->sfd >= 0 && c->sfd < max_fds);
881 
882         MEMCACHED_CONN_DESTROY(c);
883         conns[c->sfd] = NULL;
884         if (c->rbuf)
885             free(c->rbuf);
886 #ifdef TLS
887         if (c->ssl_wbuf)
888             c->ssl_wbuf = NULL;
889 #endif
890 
891         free(c);
892     }
893 }
894 
/*
 * Closes a connection: removes its event, releases its per-request
 * resources and read buffer, shuts down TLS if present, closes the
 * descriptor and updates the connection accounting.
 *
 * The conn structure itself is not freed here; it stays in conns[] in
 * conn_closed state.
 */
static void conn_close(conn *c) {
    assert(c != NULL);

    /* Only connections attached to a worker thread have a logger to use. */
    if (c->thread) {
        LOGGER_LOG(c->thread->l, LOG_CONNEVENTS, LOGGER_CONNECTION_CLOSE, NULL,
                &c->request_addr, c->request_addr_size, c->transport,
                c->close_reason, c->sfd);
    }

    /* delete the event, the socket and the conn */
    event_del(&c->event);

    if (settings.verbose > 1)
        fprintf(stderr, "<%d connection closed.\n", c->sfd);

    conn_cleanup(c);

    // force release of read buffer.
    // rbuf_release() only acts when rbytes == 0, hence the reset first.
    if (c->thread) {
        c->rbytes = 0;
        rbuf_release(c);
    }

    MEMCACHED_CONN_RELEASE(c->sfd);
    conn_set_state(c, conn_closed);
#ifdef TLS
    if (c->ssl) {
        SSL_shutdown(c->ssl);
        SSL_free(c->ssl);
    }
#endif
    close(c->sfd);
    c->close_reason = 0;
    /* A slot just opened up: let maxconns_handler() resume accepting. */
    pthread_mutex_lock(&conn_lock);
    allow_new_conns = true;
    pthread_mutex_unlock(&conn_lock);

    STATS_LOCK();
    stats_state.curr_conns--;
    STATS_UNLOCK();

    return;
}
938 
939 // Since some connections might be off on side threads and some are managed as
940 // listeners we need to walk through them all from a central point.
941 // Must be called with all worker threads hung or in the process of closing.
conn_close_all(void)942 void conn_close_all(void) {
943     int i;
944     for (i = 0; i < max_fds; i++) {
945         if (conns[i] && conns[i]->state != conn_closed) {
946             conn_close(conns[i]);
947         }
948     }
949 }
950 
951 /**
952  * Convert a state name to a human readable form.
953  */
state_text(enum conn_states state)954 static const char *state_text(enum conn_states state) {
955     const char* const statenames[] = { "conn_listening",
956                                        "conn_new_cmd",
957                                        "conn_waiting",
958                                        "conn_read",
959                                        "conn_parse_cmd",
960                                        "conn_write",
961                                        "conn_nread",
962                                        "conn_swallow",
963                                        "conn_closing",
964                                        "conn_mwrite",
965                                        "conn_closed",
966                                        "conn_watch",
967                                        "conn_io_queue" };
968     return statenames[state];
969 }
970 
971 /*
972  * Sets a connection's current state in the state machine. Any special
973  * processing that needs to happen on certain state transitions can
974  * happen here.
975  */
conn_set_state(conn * c,enum conn_states state)976 void conn_set_state(conn *c, enum conn_states state) {
977     assert(c != NULL);
978     assert(state >= conn_listening && state < conn_max_state);
979 
980     if (state != c->state) {
981         if (settings.verbose > 2) {
982             fprintf(stderr, "%d: going from %s to %s\n",
983                     c->sfd, state_text(c->state),
984                     state_text(state));
985         }
986 
987         if (state == conn_write || state == conn_mwrite) {
988             MEMCACHED_PROCESS_COMMAND_END(c->sfd, c->resp->wbuf, c->resp->wbytes);
989         }
990         c->state = state;
991     }
992 }
993 
994 /*
995  * response object helper functions
996  */
// Puts a response object back into an empty state: drops the item reference
// and the write_and_free buffer if either is held, then clears the byte/iovec
// counters and the skip flag.
void resp_reset(mc_resp *resp) {
    if (resp->item != NULL) {
        item_remove(resp->item);
        resp->item = NULL;
    }

    // free(NULL) is a no-op, so no guard is needed.
    free(resp->write_and_free);
    resp->write_and_free = NULL;

    resp->skip = false;
    resp->iovcnt = 0;
    resp->tosend = 0;
    resp->wbytes = 0;
    resp->chunked_total = 0;
    resp->chunked_data_iov = 0;
}
1013 
// Appends one buffer to the response's iov array and adds its length to the
// tosend total. Only the pointer is stored; the data is not copied.
void resp_add_iov(mc_resp *resp, const void *buf, int len) {
    assert(resp->iovcnt < MC_RESP_IOVCOUNT);
    const int slot = resp->iovcnt++;
    resp->iov[slot].iov_base = (void *)buf;
    resp->iov[slot].iov_len = len;
    resp->tosend += len;
}
1022 
1023 // Notes that an IOV should be handled as a chunked item header.
1024 // TODO: I'm hoping this isn't a permanent abstraction while I learn what the
1025 // API should be.
resp_add_chunked_iov(mc_resp * resp,const void * buf,int len)1026 void resp_add_chunked_iov(mc_resp *resp, const void *buf, int len) {
1027     resp->chunked_data_iov = resp->iovcnt;
1028     resp->chunked_total = len;
1029     resp_add_iov(resp, buf, len);
1030 }
1031 
1032 // resp_allocate and resp_free are a wrapper around read buffers which makes
1033 // read buffers the only network memory to track.
1034 // Normally this would be too excessive. In this case it allows end users to
1035 // track a single memory limit for ephemeral connection buffers.
1036 // Fancy bit twiddling tricks are avoided to help keep this straightforward.
resp_allocate(conn * c)1037 static mc_resp* resp_allocate(conn *c) {
1038     LIBEVENT_THREAD *th = c->thread;
1039     mc_resp *resp = NULL;
1040     mc_resp_bundle *b = th->open_bundle;
1041 
1042     if (b != NULL) {
1043         for (int i = 0; i < MAX_RESP_PER_BUNDLE; i++) {
1044             // loop around starting from the most likely to be free
1045             int x = (i + b->next_check) % MAX_RESP_PER_BUNDLE;
1046             if (b->r[x].free) {
1047                 resp = &b->r[x];
1048                 b->next_check = x+1;
1049                 break;
1050             }
1051         }
1052 
1053         if (resp != NULL) {
1054             b->refcount++;
1055             resp->free = false;
1056             if (b->refcount == MAX_RESP_PER_BUNDLE) {
1057                 assert(b->prev == NULL);
1058                 // We only allocate off the head. Assign new head.
1059                 th->open_bundle = b->next;
1060                 // Remove ourselves from the list.
1061                 if (b->next) {
1062                     b->next->prev = 0;
1063                     b->next = 0;
1064                 }
1065             }
1066         }
1067     }
1068 
1069     if (resp == NULL) {
1070         assert(th->open_bundle == NULL);
1071         b = do_cache_alloc(th->rbuf_cache);
1072         if (b) {
1073             THR_STATS_LOCK(c);
1074             c->thread->stats.response_obj_bytes += READ_BUFFER_SIZE;
1075             THR_STATS_UNLOCK(c);
1076             b->next_check = 1;
1077             b->refcount = 1;
1078             for (int i = 0; i < MAX_RESP_PER_BUNDLE; i++) {
1079                 b->r[i].bundle = b;
1080                 b->r[i].free = true;
1081             }
1082             b->next = 0;
1083             b->prev = 0;
1084             th->open_bundle = b;
1085             resp = &b->r[0];
1086             resp->free = false;
1087         } else {
1088             return NULL;
1089         }
1090     }
1091 
1092     return resp;
1093 }
1094 
// Returns a response slot to its bundle. A bundle whose last slot was just
// released is either kept for reuse (when it is the only open bundle) or
// unlinked and handed back to the thread's rbuf cache. A bundle that still
// has slots in use is linked at the head of the open list if it is not
// already part of it.
static void resp_free(conn *c, mc_resp *resp) {
    LIBEVENT_THREAD *th = c->thread;
    mc_resp_bundle *b = resp->bundle;

    resp->free = true;
    b->refcount--;

    if (b->refcount != 0) {
        // NOTE: since we're not tracking tail, latest free ends up in head.
        // If the bundle is already linked, leave it in place to save CPU.
        bool linked = (b == th->open_bundle) || b->prev || b->next;
        if (!linked) {
            // Non-zero refcount, need to link into the freelist.
            b->prev = 0;
            b->next = th->open_bundle;
            if (b->next) {
                b->next->prev = b;
            }
            th->open_bundle = b;
        }
        return;
    }

    if (b == th->open_bundle && b->next == 0) {
        // This is the final bundle. Just hold and reuse to skip init loop
        assert(b->prev == 0);
        b->next_check = 0;
        return;
    }

    // Assert that we're either in the list or at the head.
    assert((b->next || b->prev) || b == th->open_bundle);

    // Unlink from the list; the tail is not tracked.
    if (th->open_bundle == b) {
        th->open_bundle = b->next;
    }
    assert(b->next != b && b->prev != b);
    if (b->next) {
        b->next->prev = b->prev;
    }
    if (b->prev) {
        b->prev->next = b->next;
    }

    // Now completely done with this buffer.
    do_cache_free(th->rbuf_cache, b);
    THR_STATS_LOCK(c);
    c->thread->stats.response_obj_bytes -= READ_BUFFER_SIZE;
    THR_STATS_UNLOCK(c);
}
1140 
// Allocates a response object, zeroes it (except the leading bundle pointer)
// and appends it to the connection's response chain, making it the current
// response (c->resp). Returns false, after counting a response_obj_oom, when
// no object could be allocated.
bool resp_start(conn *c) {
    mc_resp *resp = resp_allocate(c);
    if (resp == NULL) {
        THR_STATS_LOCK(c);
        c->thread->stats.response_obj_oom++;
        THR_STATS_UNLOCK(c);
        return false;
    }

    // handling the stats counters here to simplify testing
    THR_STATS_LOCK(c);
    c->thread->stats.response_obj_count++;
    THR_STATS_UNLOCK(c);

    // Skip zeroing the bundle pointer at the start.
    // TODO: this line is here temporarily to make the code easy to disable.
    // when it's more mature, move the memset into resp_allocate() and have it
    // set the bundle pointer on allocate so this line isn't as complex.
    memset((char *)resp + sizeof(mc_resp_bundle*), 0, sizeof(*resp) - sizeof(mc_resp_bundle*));
    // TODO: this next line works. memset _does_ show up significantly under
    // perf reports due to zeroing out the entire resp->wbuf. before swapping
    // the lines more validation work should be done to ensure wbuf's aren't
    // accidentally reused without being written to.
    //memset((char *)resp + sizeof(mc_resp_bundle*), 0, offsetof(mc_resp, wbuf));

    // Append to the chain: first object becomes the head, later ones are
    // linked after the current tail.
    if (c->resp_head == NULL) {
        c->resp_head = resp;
    }
    if (c->resp != NULL) {
        c->resp->next = resp;
    }
    c->resp = resp;

    if (IS_UDP(c->transport)) {
        // need to hold on to some data for async responses.
        resp->request_id = c->request_id;
        resp->request_addr = c->request_addr;
        resp->request_addr_size = c->request_addr_size;
    }
    return true;
}
1180 
1181 // returns next response in chain.
resp_finish(conn * c,mc_resp * resp)1182 mc_resp* resp_finish(conn *c, mc_resp *resp) {
1183     mc_resp *next = resp->next;
1184     if (resp->item) {
1185         // TODO: cache hash value in resp obj?
1186         item_remove(resp->item);
1187         resp->item = NULL;
1188     }
1189     if (resp->write_and_free) {
1190         free(resp->write_and_free);
1191     }
1192     if (resp->io_pending) {
1193         // If we had a pending IO, tell it to internally clean up then return
1194         // the main object back to our thread cache.
1195         io_queue_cb_t *qcb = thread_io_queue_get(c->thread, resp->io_pending->io_queue_type);
1196         qcb->finalize_cb(resp->io_pending);
1197         do_cache_free(c->thread->io_cache, resp->io_pending);
1198         resp->io_pending = NULL;
1199     }
1200     if (c->resp_head == resp) {
1201         c->resp_head = next;
1202     }
1203     if (c->resp == resp) {
1204         c->resp = NULL;
1205     }
1206     resp_free(c, resp);
1207     THR_STATS_LOCK(c);
1208     c->thread->stats.response_obj_count--;
1209     THR_STATS_UNLOCK(c);
1210     return next;
1211 }
1212 
1213 // tells if connection has a depth of response objects to process.
resp_has_stack(conn * c)1214 bool resp_has_stack(conn *c) {
1215     return c->resp_head->next != NULL ? true : false;
1216 }
1217 
// Writes a single line (str followed by "\r\n") into the current response
// object and moves the connection to conn_new_cmd. When c->noreply is set the
// response is marked as skipped and nothing is written.
void out_string(conn *c, const char *str) {
    assert(c != NULL);
    mc_resp *resp = c->resp;
    const bool verbose = settings.verbose > 1;

    // If the response was originally filled with something, but we're now
    // writing out an error or similar, the object has to be reset first.
    // TODO: since this is often redundant with allocation, how many callers
    // are actually requiring it be reset? Can we fast test by just looking at
    // tosend and reset if nonzero?
    resp_reset(resp);

    if (c->noreply) {
        // TODO: just invalidate the response since nothing's been attempted
        // to send yet?
        resp->skip = true;
        if (verbose) {
            fprintf(stderr, ">%d NOREPLY %s\n", c->sfd, str);
        }
        conn_set_state(c, conn_new_cmd);
        return;
    }

    if (verbose) {
        fprintf(stderr, ">%d %s\n", c->sfd, str);
    }

    // Fill response object with static string.
    size_t len = strlen(str);
    if ((len + 2) > WRITE_BUFFER_SIZE) {
        /* ought to be always enough. just fail for simplicity */
        str = "SERVER_ERROR output line too long";
        len = strlen(str);
    }

    memcpy(resp->wbuf, str, len);
    memcpy(resp->wbuf + len, "\r\n", 2);
    resp_add_iov(resp, resp->wbuf, len + 2);

    conn_set_state(c, conn_new_cmd);
}
1259 
1260 // For metaget-style ASCII commands. Ignores noreply, ensuring clients see
1261 // protocol level errors.
out_errstring(conn * c,const char * str)1262 void out_errstring(conn *c, const char *str) {
1263     c->noreply = false;
1264     out_string(c, str);
1265 }
1266 
1267 /*
1268  * Outputs a protocol-specific "out of memory" error. For ASCII clients,
1269  * this is equivalent to out_string().
1270  */
out_of_memory(conn * c,char * ascii_error)1271 void out_of_memory(conn *c, char *ascii_error) {
1272     const static char error_prefix[] = "SERVER_ERROR ";
1273     const static int error_prefix_len = sizeof(error_prefix) - 1;
1274 
1275     if (c->protocol == binary_prot) {
1276         /* Strip off the generic error prefix; it's irrelevant in binary */
1277         if (!strncmp(ascii_error, error_prefix, error_prefix_len)) {
1278             ascii_error += error_prefix_len;
1279         }
1280         write_bin_error(c, PROTOCOL_BINARY_RESPONSE_ENOMEM, ascii_error, 0);
1281     } else {
1282         out_string(c, ascii_error);
1283     }
1284 }
1285 
/*
 * Appends one binary-protocol STAT response (header, then key, then value) to
 * the connection's stats buffer at the current offset and advances the
 * offset. No bounds check is done here; the caller must have made room.
 * The value is only copied when a key is present.
 */
static void append_bin_stats(const char *key, const uint16_t klen,
                             const char *val, const uint32_t vlen,
                             conn *c) {
    char *out = c->stats.buffer + c->stats.offset;
    const uint32_t bodylen = klen + vlen;
    protocol_binary_response_header header = {
        .response.magic = (uint8_t)PROTOCOL_BINARY_RES,
        .response.opcode = PROTOCOL_BINARY_CMD_STAT,
        .response.keylen = (uint16_t)htons(klen),
        .response.datatype = (uint8_t)PROTOCOL_BINARY_RAW_BYTES,
        .response.bodylen = htonl(bodylen),
        .response.opaque = c->opaque
    };
    const size_t hdrlen = sizeof(header.response);

    memcpy(out, header.bytes, hdrlen);
    out += hdrlen;

    if (klen > 0) {
        memcpy(out, key, klen);
        if (vlen > 0) {
            memcpy(out + klen, val, vlen);
        }
    }

    c->stats.offset += hdrlen + bodylen;
}
1314 
/*
 * Appends one ASCII stats line to the connection's stats buffer at the
 * current offset and advances the offset by the number of bytes written.
 *
 *   klen == 0 && vlen == 0  ->  "END\r\n"
 *   vlen == 0               ->  "STAT <key>\r\n"
 *   otherwise               ->  "STAT <key> <val>\r\n"
 *
 * key and val are formatted with %s and must be NUL-terminated. If the line
 * does not fit in the remaining space it is truncated, and the offset only
 * advances over the bytes that were actually stored.
 */
static void append_ascii_stats(const char *key, const uint16_t klen,
                               const char *val, const uint32_t vlen,
                               conn *c) {
    char *pos = c->stats.buffer + c->stats.offset;
    int remaining = c->stats.size - c->stats.offset;
    int room = remaining - 1;
    int written;

    // A non-positive size must never reach snprintf(): it would be converted
    // to a huge size_t (negative) or write nothing while still reporting a
    // length (zero).
    if (room <= 0) {
        return;
    }

    if (klen == 0 && vlen == 0) {
        written = snprintf(pos, room, "END\r\n");
    } else if (vlen == 0) {
        written = snprintf(pos, room, "STAT %s\r\n", key);
    } else {
        written = snprintf(pos, room, "STAT %s %s\r\n", key, val);
    }

    if (written < 0) {
        // Output error: nothing usable was produced.
        return;
    }
    if (written >= room) {
        // snprintf() reports the length it wanted to write; on truncation
        // only room - 1 characters (plus the terminator) were stored.
        written = room - 1;
    }

    c->stats.offset += written;
}
1333 
/*
 * Makes sure the connection's stats buffer has at least `needed` bytes free
 * past the current offset, doubling the buffer size until it does. With no
 * buffer yet, size and offset are reset to 0 and sizing starts from 1024.
 *
 * Returns true on success. On realloc() failure the existing buffer is left
 * untouched, stats.malloc_fails is incremented and false is returned.
 */
static bool grow_stats_buf(conn *c, size_t needed) {
    size_t nsize = c->stats.size;
    size_t available = nsize - c->stats.offset;

    /* Special case: No buffer -- need to allocate fresh */
    if (c->stats.buffer == NULL) {
        nsize = 1024;
        c->stats.offset = 0;
        c->stats.size = 0;
        available = 0;
    }

    while (needed > available) {
        assert(nsize > 0);
        nsize <<= 1;
        available = nsize - c->stats.offset;
    }

    if (nsize == c->stats.size) {
        // Already large enough.
        return true;
    }

    char *grown = realloc(c->stats.buffer, nsize);
    if (grown == NULL) {
        STATS_LOCK();
        stats.malloc_fails++;
        STATS_UNLOCK();
        return false;
    }

    c->stats.buffer = grown;
    c->stats.size = nsize;
    return true;
}
1366 
append_stats(const char * key,const uint16_t klen,const char * val,const uint32_t vlen,const void * cookie)1367 void append_stats(const char *key, const uint16_t klen,
1368                   const char *val, const uint32_t vlen,
1369                   const void *cookie)
1370 {
1371     /* value without a key is invalid */
1372     if (klen == 0 && vlen > 0) {
1373         return;
1374     }
1375 
1376     conn *c = (conn*)cookie;
1377 
1378     if (c->protocol == binary_prot) {
1379         size_t needed = vlen + klen + sizeof(protocol_binary_response_header);
1380         if (!grow_stats_buf(c, needed)) {
1381             return;
1382         }
1383         append_bin_stats(key, klen, val, vlen, c);
1384     } else {
1385         size_t needed = vlen + klen + 10; // 10 == "STAT = \r\n"
1386         if (!grow_stats_buf(c, needed)) {
1387             return;
1388         }
1389         append_ascii_stats(key, klen, val, vlen, c);
1390     }
1391 
1392     assert(c->stats.offset <= c->stats.size);
1393 }
1394 
// Clears the per-command state of a connection (cmd, substate and any item
// still attached) and selects the next state: parse more input if bytes are
// buffered, otherwise flush queued responses, otherwise wait.
static void reset_cmd_handler(conn *c) {
    c->cmd = -1;
    c->substate = bin_no_state;

    if (c->item != NULL) {
        // TODO: Any other way to get here?
        // SASL auth was mistakenly using it. Nothing else should?
        if (!c->item_malloced) {
            item_remove(c->item);
        } else {
            free(c->item);
            c->item_malloced = false;
        }
        c->item = NULL;
    }

    enum conn_states next;
    if (c->rbytes > 0) {
        next = conn_parse_cmd;
    } else if (c->resp_head != NULL) {
        next = conn_mwrite;
    } else {
        next = conn_waiting;
    }
    conn_set_state(c, next);
}
1417 
// Dispatches to the ASCII or binary nread completion handler according to
// the connection's protocol. Any other protocol value is asserted against
// and otherwise ignored.
static void complete_nread(conn *c) {
    assert(c != NULL);
    assert(c->protocol == ascii_prot || c->protocol == binary_prot);

    if (c->protocol == ascii_prot) {
        complete_nread_ascii(c);
        return;
    }
    if (c->protocol == binary_prot) {
        complete_nread_binary(c);
    }
}
1429 
1430 /* Destination must always be chunked */
1431 /* This should be part of item.c */
_store_item_copy_chunks(item * d_it,item * s_it,const int len)1432 static int _store_item_copy_chunks(item *d_it, item *s_it, const int len) {
1433     item_chunk *dch = (item_chunk *) ITEM_schunk(d_it);
1434     /* Advance dch until we find free space */
1435     while (dch->size == dch->used) {
1436         if (dch->next) {
1437             dch = dch->next;
1438         } else {
1439             break;
1440         }
1441     }
1442 
1443     if (s_it->it_flags & ITEM_CHUNKED) {
1444         int remain = len;
1445         item_chunk *sch = (item_chunk *) ITEM_schunk(s_it);
1446         int copied = 0;
1447         /* Fills dch's to capacity, not straight copy sch in case data is
1448          * being added or removed (ie append/prepend)
1449          */
1450         while (sch && dch && remain) {
1451             assert(dch->used <= dch->size);
1452             int todo = (dch->size - dch->used < sch->used - copied)
1453                 ? dch->size - dch->used : sch->used - copied;
1454             if (remain < todo)
1455                 todo = remain;
1456             memcpy(dch->data + dch->used, sch->data + copied, todo);
1457             dch->used += todo;
1458             copied += todo;
1459             remain -= todo;
1460             assert(dch->used <= dch->size);
1461             if (dch->size == dch->used) {
1462                 item_chunk *tch = do_item_alloc_chunk(dch, remain);
1463                 if (tch) {
1464                     dch = tch;
1465                 } else {
1466                     return -1;
1467                 }
1468             }
1469             assert(copied <= sch->used);
1470             if (copied == sch->used) {
1471                 copied = 0;
1472                 sch = sch->next;
1473             }
1474         }
1475         /* assert that the destination had enough space for the source */
1476         assert(remain == 0);
1477     } else {
1478         int done = 0;
1479         /* Fill dch's via a non-chunked item. */
1480         while (len > done && dch) {
1481             int todo = (dch->size - dch->used < len - done)
1482                 ? dch->size - dch->used : len - done;
1483             //assert(dch->size - dch->used != 0);
1484             memcpy(dch->data + dch->used, ITEM_data(s_it) + done, todo);
1485             done += todo;
1486             dch->used += todo;
1487             assert(dch->used <= dch->size);
1488             if (dch->size == dch->used) {
1489                 item_chunk *tch = do_item_alloc_chunk(dch, len - done);
1490                 if (tch) {
1491                     dch = tch;
1492                 } else {
1493                     return -1;
1494                 }
1495             }
1496         }
1497         assert(len == done);
1498     }
1499     return 0;
1500 }
1501 
/*
 * Builds the data of new_it by concatenating old_it and add_it: old then add
 * for NREAD_APPEND, add then old for any other comm value (NREAD_PREPEND).
 * The last 2 bytes (CRLF) of the first part are dropped/overwritten by the
 * second part. Returns 0 on success, -1 if a chunked copy fails.
 */
static int _store_item_copy_data(int comm, item *old_it, item *new_it, item *add_it) {
    const bool appending = (comm == NREAD_APPEND);
    item *first = appending ? old_it : add_it;
    item *second = appending ? add_it : old_it;

    if (new_it->it_flags & ITEM_CHUNKED) {
        if (_store_item_copy_chunks(new_it, first, first->nbytes - 2) == -1 ||
            _store_item_copy_chunks(new_it, second, second->nbytes) == -1) {
            return -1;
        }
        return 0;
    }

    memcpy(ITEM_data(new_it), ITEM_data(first), first->nbytes);
    memcpy(ITEM_data(new_it) + first->nbytes - 2 /* CRLF */, ITEM_data(second), second->nbytes);
    return 0;
}
1527 
/*
 * Stores an item in the cache according to the semantics of one of the set
 * commands. Protected by the item lock.
 *
 * it   - the item carrying the new data; its key is used for the lookup.
 * comm - one of the NREAD_* commands (ADD, SET, REPLACE, APPEND, PREPEND,
 *        CAS) selecting the store semantics.
 * c    - the requesting connection; c->set_stale is consulted for the CAS
 *        comparison, per-thread stats are updated through c->thread, and
 *        c->cas receives the stored item's CAS value on success.
 * hv   - passed through to do_item_get(), item_replace() and do_item_link()
 *        (presumably the hash of the key -- confirm against callers).
 *
 * Returns the state of storage: STORED, EXISTS, NOT_FOUND, or the default
 * NOT_STORED.
 */
enum store_item_type do_store_item(item *it, int comm, conn *c, const uint32_t hv) {
    char *key = ITEM_key(it);
    // Look up any existing item under the same key; a non-NULL result is
    // released again with do_item_remove() below.
    item *old_it = do_item_get(key, it->nkey, hv, c, DONT_UPDATE);
    enum store_item_type stored = NOT_STORED;

    enum cas_result { CAS_NONE, CAS_MATCH, CAS_BADVAL, CAS_STALE, CAS_MISS };

    // Only set by append/prepend, which build a combined replacement item.
    item *new_it = NULL;
    uint32_t flags;

    /* Do the CAS test up front so we can apply to all store modes */
    enum cas_result cas_res = CAS_NONE;

    bool do_store = false;
    if (old_it != NULL) {
        // Most of the CAS work requires something to compare to.
        uint64_t it_cas = ITEM_get_cas(it);
        uint64_t old_cas = ITEM_get_cas(old_it);
        if (it_cas == 0) {
            cas_res = CAS_NONE;
        } else if (it_cas == old_cas) {
            cas_res = CAS_MATCH;
        } else if (c->set_stale && it_cas < old_cas) {
            cas_res = CAS_STALE;
        } else {
            cas_res = CAS_BADVAL;
        }

        switch (comm) {
            case NREAD_ADD:
                /* add only adds a nonexistent item, but promote to head of LRU */
                do_item_update(old_it);
                break;
            case NREAD_CAS:
                if (cas_res == CAS_MATCH) {
                    // cas validates
                    // it and old_it may belong to different classes.
                    // I'm updating the stats for the one that's getting pushed out
                    pthread_mutex_lock(&c->thread->stats.mutex);
                    c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++;
                    pthread_mutex_unlock(&c->thread->stats.mutex);
                    do_store = true;
                } else if (cas_res == CAS_STALE) {
                    // if we're allowed to set a stale value, CAS must be lower than
                    // the current item's CAS.
                    // This replaces the value, but should preserve TTL, and stale
                    // item marker bit + token sent if exists.
                    it->exptime = old_it->exptime;
                    it->it_flags |= ITEM_STALE;
                    if (old_it->it_flags & ITEM_TOKEN_SENT) {
                        it->it_flags |= ITEM_TOKEN_SENT;
                    }

                    pthread_mutex_lock(&c->thread->stats.mutex);
                    c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_hits++;
                    pthread_mutex_unlock(&c->thread->stats.mutex);
                    do_store = true;
                } else {
                    // NONE or BADVAL are the same for CAS cmd
                    pthread_mutex_lock(&c->thread->stats.mutex);
                    c->thread->stats.slab_stats[ITEM_clsid(old_it)].cas_badval++;
                    pthread_mutex_unlock(&c->thread->stats.mutex);

                    if (settings.verbose > 1) {
                        fprintf(stderr, "CAS:  failure: expected %llu, got %llu\n",
                                (unsigned long long)ITEM_get_cas(old_it),
                                (unsigned long long)ITEM_get_cas(it));
                    }
                    stored = EXISTS;
                }
                break;
            case NREAD_APPEND:
            case NREAD_PREPEND:
                // A supplied CAS value must match for append/prepend.
                if (cas_res != CAS_NONE && cas_res != CAS_MATCH) {
                    stored = EXISTS;
                    break;
                }
#ifdef EXTSTORE
                if ((old_it->it_flags & ITEM_HDR) != 0) {
                    /* block append/prepend from working with extstore-d items.
                     * leave response code to NOT_STORED default */
                    break;
                }
#endif
                /* we have it and old_it here - alloc memory to hold both */
                FLAGS_CONV(old_it, flags);
                new_it = do_item_alloc(key, it->nkey, flags, old_it->exptime, it->nbytes + old_it->nbytes - 2 /* CRLF */);

                // OOM trying to copy.
                if (new_it == NULL)
                    break;
                /* copy data from it and old_it to new_it */
                if (_store_item_copy_data(comm, old_it, new_it, it) == -1) {
                    // failed data copy
                    break;
                } else {
                    // refcount of new_it is 1 here. will end up 2 after link.
                    // it's original ref is managed outside of this function
                    it = new_it;
                    do_store = true;
                }
                break;
            case NREAD_REPLACE:
            case NREAD_SET:
                do_store = true;
                break;
        }

        if (do_store) {
            STORAGE_delete(c->thread->storage, old_it);
            item_replace(old_it, it, hv);
            stored = STORED;
        }

        do_item_remove(old_it);         /* release our reference */
        if (new_it != NULL) {
            // append/prepend end up with an extra reference for new_it.
            // This also runs when the data copy failed and new_it was never
            // stored.
            do_item_remove(new_it);
        }
    } else {
        /* No pre-existing item to replace or compare to. */
        if (ITEM_get_cas(it) != 0) {
            /* Asked for a CAS match but nothing to compare it to. */
            cas_res = CAS_MISS;
        }

        switch (comm) {
            case NREAD_ADD:
            case NREAD_SET:
                do_store = true;
                break;
            case NREAD_CAS:
                // LRU expired
                stored = NOT_FOUND;
                pthread_mutex_lock(&c->thread->stats.mutex);
                c->thread->stats.cas_misses++;
                pthread_mutex_unlock(&c->thread->stats.mutex);
                break;
            case NREAD_REPLACE:
            case NREAD_APPEND:
            case NREAD_PREPEND:
                /* Requires an existing item. */
                break;
        }

        if (do_store) {
            do_item_link(it, hv);
            stored = STORED;
        }
    }

    // `it` refers to the combined item here when append/prepend succeeded.
    if (stored == STORED) {
        c->cas = ITEM_get_cas(it);
    }
    LOGGER_LOG(c->thread->l, LOG_MUTATIONS, LOGGER_ITEM_STORE, NULL,
            stored, comm, ITEM_key(it), it->nkey, it->nbytes, it->exptime,
            ITEM_clsid(it), c->sfd);

    return stored;
}
1694 
1695 /* set up a connection to write a buffer then free it, used for stats */
write_and_free(conn * c,char * buf,int bytes)1696 void write_and_free(conn *c, char *buf, int bytes) {
1697     if (buf) {
1698         mc_resp *resp = c->resp;
1699         resp->write_and_free = buf;
1700         resp_add_iov(resp, buf, bytes);
1701         conn_set_state(c, conn_new_cmd);
1702     } else {
1703         out_of_memory(c, "SERVER_ERROR out of memory writing stats");
1704     }
1705 }
1706 
/*
 * Formats a stat value printf-style into a local buffer of STAT_VAL_LEN bytes
 * and passes name/value to the add_stats callback for connection c.
 *
 * name      - NUL-terminated stat name; its strlen() is passed as key length.
 * add_stats - callback receiving (name, name length, value, value length, c).
 * c         - connection handed through to the callback.
 * fmt, ...  - printf-style format and arguments producing the value.
 *
 * Values too long for the buffer are truncated. The length handed to the
 * callback always matches the bytes actually present in the buffer.
 */
void append_stat(const char *name, ADD_STAT add_stats, conn *c,
                 const char *fmt, ...) {
    char val_str[STAT_VAL_LEN];
    int vlen;
    va_list ap;

    assert(name);
    assert(add_stats);
    assert(c);
    assert(fmt);

    va_start(ap, fmt);
    vlen = vsnprintf(val_str, sizeof(val_str) - 1, fmt, ap);
    va_end(ap);

    if (vlen < 0) {
        // Formatting failed: report an empty value rather than passing a
        // negative length (which the callback would see as a huge unsigned).
        val_str[0] = '\0';
        vlen = 0;
    } else if ((size_t)vlen >= sizeof(val_str) - 1) {
        // vsnprintf() returns the untruncated length; only size - 1
        // characters were stored, so clamp to what is really in the buffer.
        vlen = (int)(sizeof(val_str) - 2);
    }

    add_stats(name, strlen(name), val_str, vlen, c);
}
1724 
1725 /* return server specific stats only */
server_stats(ADD_STAT add_stats,conn * c)1726 void server_stats(ADD_STAT add_stats, conn *c) {
1727     pid_t pid = getpid();
1728     rel_time_t now = current_time;
1729 
1730     struct thread_stats thread_stats;
1731     threadlocal_stats_aggregate(&thread_stats);
1732     struct slab_stats slab_stats;
1733     slab_stats_aggregate(&thread_stats, &slab_stats);
1734 #ifndef WIN32
1735     struct rusage usage;
1736     getrusage(RUSAGE_SELF, &usage);
1737 #endif /* !WIN32 */
1738 
1739     STATS_LOCK();
1740 
1741     APPEND_STAT("pid", "%lu", (long)pid);
1742     APPEND_STAT("uptime", "%u", now - ITEM_UPDATE_INTERVAL);
1743     APPEND_STAT("time", "%ld", now + (long)process_started);
1744     APPEND_STAT("version", "%s", VERSION);
1745     APPEND_STAT("libevent", "%s", event_get_version());
1746     APPEND_STAT("pointer_size", "%d", (int)(8 * sizeof(void *)));
1747 
1748 #ifndef WIN32
1749     append_stat("rusage_user", add_stats, c, "%ld.%06ld",
1750                 (long)usage.ru_utime.tv_sec,
1751                 (long)usage.ru_utime.tv_usec);
1752     append_stat("rusage_system", add_stats, c, "%ld.%06ld",
1753                 (long)usage.ru_stime.tv_sec,
1754                 (long)usage.ru_stime.tv_usec);
1755 #endif /* !WIN32 */
1756 
1757     APPEND_STAT("max_connections", "%d", settings.maxconns);
1758     APPEND_STAT("curr_connections", "%llu", (unsigned long long)stats_state.curr_conns - 1);
1759     APPEND_STAT("total_connections", "%llu", (unsigned long long)stats.total_conns);
1760     if (settings.maxconns_fast) {
1761         APPEND_STAT("rejected_connections", "%llu", (unsigned long long)stats.rejected_conns);
1762     }
1763     APPEND_STAT("connection_structures", "%u", stats_state.conn_structs);
1764     APPEND_STAT("response_obj_oom", "%llu", (unsigned long long)thread_stats.response_obj_oom);
1765     APPEND_STAT("response_obj_count", "%llu", (unsigned long long)thread_stats.response_obj_count);
1766     APPEND_STAT("response_obj_bytes", "%llu", (unsigned long long)thread_stats.response_obj_bytes);
1767     APPEND_STAT("read_buf_count", "%llu", (unsigned long long)thread_stats.read_buf_count);
1768     APPEND_STAT("read_buf_bytes", "%llu", (unsigned long long)thread_stats.read_buf_bytes);
1769     APPEND_STAT("read_buf_bytes_free", "%llu", (unsigned long long)thread_stats.read_buf_bytes_free);
1770     APPEND_STAT("read_buf_oom", "%llu", (unsigned long long)thread_stats.read_buf_oom);
1771     APPEND_STAT("reserved_fds", "%u", stats_state.reserved_fds);
1772     APPEND_STAT("cmd_get", "%llu", (unsigned long long)thread_stats.get_cmds);
1773     APPEND_STAT("cmd_set", "%llu", (unsigned long long)slab_stats.set_cmds);
1774     APPEND_STAT("cmd_flush", "%llu", (unsigned long long)thread_stats.flush_cmds);
1775     APPEND_STAT("cmd_touch", "%llu", (unsigned long long)thread_stats.touch_cmds);
1776     APPEND_STAT("cmd_meta", "%llu", (unsigned long long)thread_stats.meta_cmds);
1777     APPEND_STAT("get_hits", "%llu", (unsigned long long)slab_stats.get_hits);
1778     APPEND_STAT("get_misses", "%llu", (unsigned long long)thread_stats.get_misses);
1779     APPEND_STAT("get_expired", "%llu", (unsigned long long)thread_stats.get_expired);
1780     APPEND_STAT("get_flushed", "%llu", (unsigned long long)thread_stats.get_flushed);
1781 #ifdef EXTSTORE
1782     if (c->thread->storage) {
1783         APPEND_STAT("get_extstore", "%llu", (unsigned long long)thread_stats.get_extstore);
1784         APPEND_STAT("get_aborted_extstore", "%llu", (unsigned long long)thread_stats.get_aborted_extstore);
1785         APPEND_STAT("get_oom_extstore", "%llu", (unsigned long long)thread_stats.get_oom_extstore);
1786         APPEND_STAT("recache_from_extstore", "%llu", (unsigned long long)thread_stats.recache_from_extstore);
1787         APPEND_STAT("miss_from_extstore", "%llu", (unsigned long long)thread_stats.miss_from_extstore);
1788         APPEND_STAT("badcrc_from_extstore", "%llu", (unsigned long long)thread_stats.badcrc_from_extstore);
1789     }
1790 #endif
1791     APPEND_STAT("delete_misses", "%llu", (unsigned long long)thread_stats.delete_misses);
1792     APPEND_STAT("delete_hits", "%llu", (unsigned long long)slab_stats.delete_hits);
1793     APPEND_STAT("incr_misses", "%llu", (unsigned long long)thread_stats.incr_misses);
1794     APPEND_STAT("incr_hits", "%llu", (unsigned long long)slab_stats.incr_hits);
1795     APPEND_STAT("decr_misses", "%llu", (unsigned long long)thread_stats.decr_misses);
1796     APPEND_STAT("decr_hits", "%llu", (unsigned long long)slab_stats.decr_hits);
1797     APPEND_STAT("cas_misses", "%llu", (unsigned long long)thread_stats.cas_misses);
1798     APPEND_STAT("cas_hits", "%llu", (unsigned long long)slab_stats.cas_hits);
1799     APPEND_STAT("cas_badval", "%llu", (unsigned long long)slab_stats.cas_badval);
1800     APPEND_STAT("touch_hits", "%llu", (unsigned long long)slab_stats.touch_hits);
1801     APPEND_STAT("touch_misses", "%llu", (unsigned long long)thread_stats.touch_misses);
1802     APPEND_STAT("auth_cmds", "%llu", (unsigned long long)thread_stats.auth_cmds);
1803     APPEND_STAT("auth_errors", "%llu", (unsigned long long)thread_stats.auth_errors);
1804     if (settings.idle_timeout) {
1805         APPEND_STAT("idle_kicks", "%llu", (unsigned long long)thread_stats.idle_kicks);
1806     }
1807     APPEND_STAT("bytes_read", "%llu", (unsigned long long)thread_stats.bytes_read);
1808     APPEND_STAT("bytes_written", "%llu", (unsigned long long)thread_stats.bytes_written);
1809     APPEND_STAT("limit_maxbytes", "%llu", (unsigned long long)settings.maxbytes);
1810     APPEND_STAT("accepting_conns", "%u", stats_state.accepting_conns);
1811     APPEND_STAT("listen_disabled_num", "%llu", (unsigned long long)stats.listen_disabled_num);
1812     APPEND_STAT("time_in_listen_disabled_us", "%llu", stats.time_in_listen_disabled_us);
1813     APPEND_STAT("threads", "%d", settings.num_threads);
1814     APPEND_STAT("conn_yields", "%llu", (unsigned long long)thread_stats.conn_yields);
1815     APPEND_STAT("hash_power_level", "%u", stats_state.hash_power_level);
1816     APPEND_STAT("hash_bytes", "%llu", (unsigned long long)stats_state.hash_bytes);
1817     APPEND_STAT("hash_is_expanding", "%u", stats_state.hash_is_expanding);
1818     if (settings.slab_reassign) {
1819         APPEND_STAT("slab_reassign_rescues", "%llu", stats.slab_reassign_rescues);
1820         APPEND_STAT("slab_reassign_chunk_rescues", "%llu", stats.slab_reassign_chunk_rescues);
1821         APPEND_STAT("slab_reassign_evictions_nomem", "%llu", stats.slab_reassign_evictions_nomem);
1822         APPEND_STAT("slab_reassign_inline_reclaim", "%llu", stats.slab_reassign_inline_reclaim);
1823         APPEND_STAT("slab_reassign_busy_items", "%llu", stats.slab_reassign_busy_items);
1824         APPEND_STAT("slab_reassign_busy_deletes", "%llu", stats.slab_reassign_busy_deletes);
1825         APPEND_STAT("slab_reassign_running", "%u", stats_state.slab_reassign_running);
1826         APPEND_STAT("slabs_moved", "%llu", stats.slabs_moved);
1827     }
1828     if (settings.lru_crawler) {
1829         APPEND_STAT("lru_crawler_running", "%u", stats_state.lru_crawler_running);
1830         APPEND_STAT("lru_crawler_starts", "%u", stats.lru_crawler_starts);
1831     }
1832     if (settings.lru_maintainer_thread) {
1833         APPEND_STAT("lru_maintainer_juggles", "%llu", (unsigned long long)stats.lru_maintainer_juggles);
1834     }
1835     APPEND_STAT("malloc_fails", "%llu",
1836                 (unsigned long long)stats.malloc_fails);
1837     APPEND_STAT("log_worker_dropped", "%llu", (unsigned long long)stats.log_worker_dropped);
1838     APPEND_STAT("log_worker_written", "%llu", (unsigned long long)stats.log_worker_written);
1839     APPEND_STAT("log_watcher_skipped", "%llu", (unsigned long long)stats.log_watcher_skipped);
1840     APPEND_STAT("log_watcher_sent", "%llu", (unsigned long long)stats.log_watcher_sent);
1841     APPEND_STAT("log_watchers", "%llu", (unsigned long long)stats_state.log_watchers);
1842     STATS_UNLOCK();
1843 #ifdef EXTSTORE
1844     storage_stats(add_stats, c);
1845 #endif
1846 #ifdef TLS
1847     if (settings.ssl_enabled) {
1848         if (settings.ssl_session_cache) {
1849             APPEND_STAT("ssl_new_sessions", "%llu", (unsigned long long)stats.ssl_new_sessions);
1850         }
1851         APPEND_STAT("ssl_handshake_errors", "%llu", (unsigned long long)stats.ssl_handshake_errors);
1852         APPEND_STAT("time_since_server_cert_refresh", "%u", now - settings.ssl_last_cert_refresh_time);
1853     }
1854 #endif
1855     APPEND_STAT("unexpected_napi_ids", "%llu", (unsigned long long)stats.unexpected_napi_ids);
1856     APPEND_STAT("round_robin_fallback", "%llu", (unsigned long long)stats.round_robin_fallback);
1857 }
1858 
/*
 * Emit the current runtime settings as name/value pairs through add_stats.
 *
 * add_stats  callback that receives every name/value pair; must not be NULL
 * c          opaque connection cookie handed through to the callback
 *
 * String settings that may be unset are reported as "NULL" rather than being
 * passed to %s as a null pointer.
 */
void process_stat_settings(ADD_STAT add_stats, void *c) {
    assert(add_stats);
    APPEND_STAT("maxbytes", "%llu", (unsigned long long)settings.maxbytes);
    APPEND_STAT("maxconns", "%d", settings.maxconns);
    APPEND_STAT("tcpport", "%d", settings.port);
    APPEND_STAT("udpport", "%d", settings.udpport);
    APPEND_STAT("inter", "%s", settings.inter ? settings.inter : "NULL");
    APPEND_STAT("verbosity", "%d", settings.verbose);
    APPEND_STAT("oldest", "%lu", (unsigned long)settings.oldest_live);
    APPEND_STAT("evictions", "%s", settings.evict_to_free ? "on" : "off");
    APPEND_STAT("domain_socket", "%s",
                settings.socketpath ? settings.socketpath : "NULL");
    APPEND_STAT("umask", "%o", settings.access);
    APPEND_STAT("shutdown_command", "%s",
                settings.shutdown_command ? "yes" : "no");
    APPEND_STAT("growth_factor", "%.2f", settings.factor);
    APPEND_STAT("chunk_size", "%d", settings.chunk_size);
    APPEND_STAT("num_threads", "%d", settings.num_threads);
    APPEND_STAT("num_threads_per_udp", "%d", settings.num_threads_per_udp);
    APPEND_STAT("stat_key_prefix", "%c", settings.prefix_delimiter);
    APPEND_STAT("detail_enabled", "%s",
                settings.detail_enabled ? "yes" : "no");
    APPEND_STAT("reqs_per_event", "%d", settings.reqs_per_event);
    APPEND_STAT("cas_enabled", "%s", settings.use_cas ? "yes" : "no");
    APPEND_STAT("tcp_backlog", "%d", settings.backlog);
    APPEND_STAT("binding_protocol", "%s",
                prot_text(settings.binding_protocol));
    APPEND_STAT("auth_enabled_sasl", "%s", settings.sasl ? "yes" : "no");
    APPEND_STAT("auth_enabled_ascii", "%s", settings.auth_file ? settings.auth_file : "no");
    APPEND_STAT("item_size_max", "%d", settings.item_size_max);
    APPEND_STAT("maxconns_fast", "%s", settings.maxconns_fast ? "yes" : "no");
    APPEND_STAT("hashpower_init", "%d", settings.hashpower_init);
    APPEND_STAT("slab_reassign", "%s", settings.slab_reassign ? "yes" : "no");
    APPEND_STAT("slab_automove", "%d", settings.slab_automove);
    APPEND_STAT("slab_automove_ratio", "%.2f", settings.slab_automove_ratio);
    APPEND_STAT("slab_automove_window", "%u", settings.slab_automove_window);
    APPEND_STAT("slab_chunk_max", "%d", settings.slab_chunk_size_max);
    APPEND_STAT("lru_crawler", "%s", settings.lru_crawler ? "yes" : "no");
    APPEND_STAT("lru_crawler_sleep", "%d", settings.lru_crawler_sleep);
    APPEND_STAT("lru_crawler_tocrawl", "%lu", (unsigned long)settings.lru_crawler_tocrawl);
    APPEND_STAT("tail_repair_time", "%d", settings.tail_repair_time);
    APPEND_STAT("flush_enabled", "%s", settings.flush_enabled ? "yes" : "no");
    APPEND_STAT("dump_enabled", "%s", settings.dump_enabled ? "yes" : "no");
    APPEND_STAT("hash_algorithm", "%s", settings.hash_algorithm);
    APPEND_STAT("lru_maintainer_thread", "%s", settings.lru_maintainer_thread ? "yes" : "no");
    APPEND_STAT("lru_segmented", "%s", settings.lru_segmented ? "yes" : "no");
    APPEND_STAT("hot_lru_pct", "%d", settings.hot_lru_pct);
    APPEND_STAT("warm_lru_pct", "%d", settings.warm_lru_pct);
    APPEND_STAT("hot_max_factor", "%.2f", settings.hot_max_factor);
    APPEND_STAT("warm_max_factor", "%.2f", settings.warm_max_factor);
    APPEND_STAT("temp_lru", "%s", settings.temp_lru ? "yes" : "no");
    APPEND_STAT("temporary_ttl", "%u", settings.temporary_ttl);
    APPEND_STAT("idle_timeout", "%d", settings.idle_timeout);
    APPEND_STAT("watcher_logbuf_size", "%u", settings.logger_watcher_buf_size);
    APPEND_STAT("worker_logbuf_size", "%u", settings.logger_buf_size);
    APPEND_STAT("read_buf_mem_limit", "%u", settings.read_buf_mem_limit);
    APPEND_STAT("track_sizes", "%s", item_stats_sizes_status() ? "yes" : "no");
    APPEND_STAT("inline_ascii_response", "%s", "no"); // setting is dead, cannot be yes.
#ifdef HAVE_DROP_PRIVILEGES
    APPEND_STAT("drop_privileges", "%s", settings.drop_privileges ? "yes" : "no");
#endif
#ifdef EXTSTORE
    APPEND_STAT("ext_item_size", "%u", settings.ext_item_size);
    APPEND_STAT("ext_item_age", "%u", settings.ext_item_age);
    APPEND_STAT("ext_low_ttl", "%u", settings.ext_low_ttl);
    APPEND_STAT("ext_recache_rate", "%u", settings.ext_recache_rate);
    APPEND_STAT("ext_wbuf_size", "%u", settings.ext_wbuf_size);
    APPEND_STAT("ext_compact_under", "%u", settings.ext_compact_under);
    APPEND_STAT("ext_drop_under", "%u", settings.ext_drop_under);
    APPEND_STAT("ext_max_frag", "%.2f", settings.ext_max_frag);
    APPEND_STAT("slab_automove_freeratio", "%.3f", settings.slab_automove_freeratio);
    APPEND_STAT("ext_drop_unread", "%s", settings.ext_drop_unread ? "yes" : "no");
#endif
#ifdef TLS
    APPEND_STAT("ssl_enabled", "%s", settings.ssl_enabled ? "yes" : "no");
    /* Guarded like the other optional paths: passing NULL to %s is
     * undefined behavior. */
    APPEND_STAT("ssl_chain_cert", "%s", settings.ssl_chain_cert ? settings.ssl_chain_cert : "NULL");
    APPEND_STAT("ssl_key", "%s", settings.ssl_key ? settings.ssl_key : "NULL");
    APPEND_STAT("ssl_verify_mode", "%d", settings.ssl_verify_mode);
    APPEND_STAT("ssl_keyformat", "%d", settings.ssl_keyformat);
    APPEND_STAT("ssl_ciphers", "%s", settings.ssl_ciphers ? settings.ssl_ciphers : "NULL");
    APPEND_STAT("ssl_ca_cert", "%s", settings.ssl_ca_cert ? settings.ssl_ca_cert : "NULL");
    APPEND_STAT("ssl_wbuf_size", "%u", settings.ssl_wbuf_size);
    APPEND_STAT("ssl_session_cache", "%s", settings.ssl_session_cache ? "yes" : "no");
    APPEND_STAT("ssl_min_version", "%s", ssl_proto_text(settings.ssl_min_version));
#endif
    /* NOTE(review): num_napi_ids is a numeric count (an int in the settings
     * struct, declared outside this block); formatting it with %s would
     * treat the number as a pointer. */
    APPEND_STAT("num_napi_ids", "%d", settings.num_napi_ids);
    APPEND_STAT("memory_file", "%s", settings.memory_file ? settings.memory_file : "NULL");
}
1947 
/*
 * Compare a length-delimited string (not necessarily NUL-terminated) with a
 * NUL-terminated one.
 *
 * nzlength  number of bytes in nz
 * nz        length-delimited string
 * z         NUL-terminated string to compare against
 *
 * Returns 0 when both have the same length and the same bytes, -1 otherwise.
 */
static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int expected = strlen(z);

    if (expected != nzlength) {
        return -1;
    }
    if (strncmp(nz, z, expected) != 0) {
        return -1;
    }
    return 0;
}
1952 
/*
 * Produce the stats for one "stats" sub-command.
 *
 * stat_type  sub-command name (length-delimited by nkey), or NULL for the
 *            general statistics
 * nkey       number of bytes in stat_type
 * add_stats  callback that receives every name/value pair
 * c          opaque connection cookie handed through to the callback
 *
 * Returns true when the request was handled, false when add_stats is NULL
 * or stat_type is not one of the sub-commands known here.
 */
bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    if (add_stats == NULL) {
        return false;
    }

    if (stat_type == NULL) {
        /* prepare general statistics for the engine */
        STATS_LOCK();
        APPEND_STAT("bytes", "%llu", (unsigned long long)stats_state.curr_bytes);
        APPEND_STAT("curr_items", "%llu", (unsigned long long)stats_state.curr_items);
        APPEND_STAT("total_items", "%llu", (unsigned long long)stats.total_items);
        STATS_UNLOCK();
        APPEND_STAT("slab_global_page_pool", "%u", global_page_pool_size(NULL));
        item_stats_totals(add_stats, c);
        return true;
    }

    if (nz_strcmp(nkey, stat_type, "items") == 0) {
        item_stats(add_stats, c);
        return true;
    }
    if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
        slabs_stats(add_stats, c);
        return true;
    }
    if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
        item_stats_sizes(add_stats, c);
        return true;
    }
    if (nz_strcmp(nkey, stat_type, "sizes_enable") == 0) {
        item_stats_sizes_enable(add_stats, c);
        return true;
    }
    if (nz_strcmp(nkey, stat_type, "sizes_disable") == 0) {
        item_stats_sizes_disable(add_stats, c);
        return true;
    }

    /* Unknown sub-command. */
    return false;
}
1985 
get_conn_text(const conn * c,const int af,char * addr,struct sockaddr * sock_addr)1986 static inline void get_conn_text(const conn *c, const int af,
1987                 char* addr, struct sockaddr *sock_addr) {
1988     char addr_text[MAXPATHLEN];
1989     addr_text[0] = '\0';
1990     const char *protoname = "?";
1991     unsigned short port = 0;
1992 
1993     switch (af) {
1994         case AF_INET:
1995             (void) inet_ntop(af,
1996                     &((struct sockaddr_in *)sock_addr)->sin_addr,
1997                     addr_text,
1998                     sizeof(addr_text) - 1);
1999             port = ntohs(((struct sockaddr_in *)sock_addr)->sin_port);
2000             protoname = IS_UDP(c->transport) ? "udp" : "tcp";
2001             break;
2002 
2003         case AF_INET6:
2004             addr_text[0] = '[';
2005             addr_text[1] = '\0';
2006             if (inet_ntop(af,
2007                     &((struct sockaddr_in6 *)sock_addr)->sin6_addr,
2008                     addr_text + 1,
2009                     sizeof(addr_text) - 2)) {
2010                 strcat(addr_text, "]");
2011             }
2012             port = ntohs(((struct sockaddr_in6 *)sock_addr)->sin6_port);
2013             protoname = IS_UDP(c->transport) ? "udp6" : "tcp6";
2014             break;
2015 
2016 #ifndef DISABLE_UNIX_SOCKET
2017         case AF_UNIX:
2018         {
2019             size_t pathlen = 0;
2020             // this strncpy call originally could piss off an address
2021             // sanitizer; we supplied the size of the dest buf as a limiter,
2022             // but optimized versions of strncpy could read past the end of
2023             // *src while looking for a null terminator. Since buf and
2024             // sun_path here are both on the stack they could even overlap,
2025             // which is "undefined". In all OSS versions of strncpy I could
2026             // find this has no effect; it'll still only copy until the first null
2027             // terminator is found. Thus it's possible to get the OS to
2028             // examine past the end of sun_path but it's unclear to me if this
2029             // can cause any actual problem.
2030             //
2031             // We need a safe_strncpy util function but I'll punt on figuring
2032             // that out for now.
2033             pathlen = sizeof(((struct sockaddr_un *)sock_addr)->sun_path);
2034             if (MAXPATHLEN <= pathlen) {
2035                 pathlen = MAXPATHLEN - 1;
2036             }
2037             strncpy(addr_text,
2038                     ((struct sockaddr_un *)sock_addr)->sun_path,
2039                     pathlen);
2040             addr_text[pathlen] = '\0';
2041             protoname = "unix";
2042         }
2043             break;
2044 #endif /* #ifndef DISABLE_UNIX_SOCKET */
2045     }
2046 
2047     if (strlen(addr_text) < 2) {
2048         /* Most likely this is a connected UNIX-domain client which
2049          * has no peer socket address, but there's no portable way
2050          * to tell for sure.
2051          */
2052         sprintf(addr_text, "<AF %d>", af);
2053     }
2054 
2055     if (port) {
2056         sprintf(addr, "%s:%s:%u", protoname, addr_text, port);
2057     } else {
2058         sprintf(addr, "%s:%s", protoname, addr_text);
2059     }
2060 }
2061 
conn_to_str(const conn * c,char * addr,char * svr_addr)2062 static void conn_to_str(const conn *c, char *addr, char *svr_addr) {
2063     if (!c) {
2064         strcpy(addr, "<null>");
2065     } else if (c->state == conn_closed) {
2066         strcpy(addr, "<closed>");
2067     } else {
2068         struct sockaddr_in6 local_addr;
2069         struct sockaddr *sock_addr = (void *)&c->request_addr;
2070 
2071         /* For listen ports and idle UDP ports, show listen address */
2072         if (c->state == conn_listening ||
2073                 (IS_UDP(c->transport) &&
2074                  c->state == conn_read)) {
2075             socklen_t local_addr_len = sizeof(local_addr);
2076 
2077             if (getsockname(c->sfd,
2078                         (struct sockaddr *)&local_addr,
2079                         &local_addr_len) == 0) {
2080                 sock_addr = (struct sockaddr *)&local_addr;
2081             }
2082         }
2083         get_conn_text(c, sock_addr->sa_family, addr, sock_addr);
2084 
2085         if (c->state != conn_listening && !(IS_UDP(c->transport) &&
2086                  c->state == conn_read)) {
2087             struct sockaddr_storage svr_sock_addr;
2088             socklen_t svr_addr_len = sizeof(svr_sock_addr);
2089             getsockname(c->sfd, (struct sockaddr *)&svr_sock_addr, &svr_addr_len);
2090             get_conn_text(c, svr_sock_addr.ss_family, svr_addr, (struct sockaddr *)&svr_sock_addr);
2091         }
2092     }
2093 }
2094 
/*
 * Emit per-connection stats ("<fd>:addr", "<fd>:state", ...) for every
 * connection slot that is in use.
 *
 * add_stats  callback that receives every name/value pair; must not be NULL
 * c          opaque connection cookie handed through to the callback
 */
void process_stats_conns(ADD_STAT add_stats, void *c) {
    /* key_str, val_str, klen and vlen are not referenced directly below;
     * presumably they are the scratch variables APPEND_NUM_STAT expands to
     * (the macro is defined outside this block). */
    char key_str[STAT_KEY_LEN];
    char val_str[STAT_VAL_LEN];
    int klen = 0, vlen = 0;
    /* Room for the "unix:" style prefix and a port number on top of the
     * address text itself. */
    char addr[MAXPATHLEN + sizeof("unix:") + sizeof("65535")];
    char svr_addr[MAXPATHLEN + sizeof("unix:") + sizeof("65535")];
    int fd;

    assert(add_stats);

    for (fd = 0; fd < max_fds; fd++) {
        /* This is safe to do unlocked because conns are never freed; the
         * worst that'll happen will be a minor inconsistency in the
         * output -- not worth the complexity of the locking that'd be
         * required to prevent it.
         */
        conn *cur = conns[fd];

        if (!cur) {
            continue;
        }

        if (IS_UDP(cur->transport)) {
            APPEND_NUM_STAT(fd, "UDP", "%s", "UDP");
        }
        if (cur->state == conn_closed) {
            continue;
        }

        conn_to_str(cur, addr, svr_addr);

        APPEND_NUM_STAT(fd, "addr", "%s", addr);
        /* The listen address is reported only for connections that are not
         * themselves listening sockets or idle UDP sockets. */
        if (!(cur->state == conn_listening ||
              (IS_UDP(cur->transport) && cur->state == conn_read))) {
            APPEND_NUM_STAT(fd, "listen_addr", "%s", svr_addr);
        }
        APPEND_NUM_STAT(fd, "state", "%s",
                state_text(cur->state));
        APPEND_NUM_STAT(fd, "secs_since_last_cmd", "%d",
                current_time - cur->last_cmd_time);
    }
}
2132 
2133 #define IT_REFCOUNT_LIMIT 60000
limited_get(char * key,size_t nkey,conn * c,uint32_t exptime,bool should_touch,bool do_update,bool * overflow)2134 item* limited_get(char *key, size_t nkey, conn *c, uint32_t exptime, bool should_touch, bool do_update, bool *overflow) {
2135     item *it;
2136     if (should_touch) {
2137         it = item_touch(key, nkey, exptime, c);
2138     } else {
2139         it = item_get(key, nkey, c, do_update);
2140     }
2141     if (it && it->refcount > IT_REFCOUNT_LIMIT) {
2142         item_remove(it);
2143         it = NULL;
2144         *overflow = true;
2145     } else {
2146         *overflow = false;
2147     }
2148     return it;
2149 }
2150 
2151 // Semantics are different than limited_get; since the item is returned
2152 // locked, caller can directly change what it needs.
2153 // though it might eventually be a better interface to sink it all into
2154 // items.c.
limited_get_locked(char * key,size_t nkey,conn * c,bool do_update,uint32_t * hv,bool * overflow)2155 item* limited_get_locked(char *key, size_t nkey, conn *c, bool do_update, uint32_t *hv, bool *overflow) {
2156     item *it;
2157     it = item_get_locked(key, nkey, c, do_update, hv);
2158     if (it && it->refcount > IT_REFCOUNT_LIMIT) {
2159         do_item_remove(it);
2160         it = NULL;
2161         item_unlock(*hv);
2162         *overflow = true;
2163     } else {
2164         *overflow = false;
2165     }
2166     return it;
2167 }
2168 
/*
 * adds a delta value to a numeric item.
 *
 * c       connection requesting the operation
 * key     key of the item to adjust
 * nkey    length of key
 * incr    true to increment value, false to decrement
 * delta   amount to adjust value by
 * buf     buffer that receives the new value as text (written by itoa_u64)
 * cas     optional; when non-NULL and *cas is non-zero the item's CAS must
 *         match it. On success *cas is overwritten with the item's CAS.
 * hv      hash value of key, passed to do_item_get() and item_replace()
 * it_ret  optional; when non-NULL it receives the item on success and the
 *         reference is not released here
 *
 * returns OK on success, DELTA_ITEM_NOT_FOUND when the key is missing (or
 * the fetched item turned out to be unlinked), NON_NUMERIC when the value
 * cannot be treated as a number, DELTA_ITEM_CAS_MISMATCH on a CAS mismatch
 * and EOM when a replacement item could not be allocated.
 */
enum delta_result_type do_add_delta(conn *c, const char *key, const size_t nkey,
                                    const bool incr, const int64_t delta,
                                    char *buf, uint64_t *cas,
                                    const uint32_t hv,
                                    item **it_ret) {
    char *ptr;
    uint64_t value;
    int res;
    item *it;

    it = do_item_get(key, nkey, hv, c, DONT_UPDATE);
    if (!it) {
        return DELTA_ITEM_NOT_FOUND;
    }

    /* Can't delta zero byte values. 2-byte are the "\r\n" */
    /* Also can't delta for chunked items. Too large to be a number */
#ifdef EXTSTORE
    if (it->nbytes <= 2 || (it->it_flags & (ITEM_CHUNKED|ITEM_HDR)) != 0) {
#else
    if (it->nbytes <= 2 || (it->it_flags & (ITEM_CHUNKED)) != 0) {
#endif
        do_item_remove(it);
        return NON_NUMERIC;
    }

    /* A CAS of zero (or no CAS pointer at all) means "don't check". */
    if (cas != NULL && *cas != 0 && ITEM_get_cas(it) != *cas) {
        do_item_remove(it);
        return DELTA_ITEM_CAS_MISMATCH;
    }

    ptr = ITEM_data(it);

    if (!safe_strtoull(ptr, &value)) {
        do_item_remove(it);
        return NON_NUMERIC;
    }

    if (incr) {
        /* Unsigned 64-bit addition: wraps around on overflow. */
        value += delta;
        MEMCACHED_COMMAND_INCR(c->sfd, ITEM_key(it), it->nkey, value);
    } else {
        /* Decrement clamps at zero instead of wrapping. delta is converted
         * to uint64_t for this comparison. */
        if(delta > value) {
            value = 0;
        } else {
            value -= delta;
        }
        MEMCACHED_COMMAND_DECR(c->sfd, ITEM_key(it), it->nkey, value);
    }

    /* Count the hit against the item's slab class in the thread stats. */
    pthread_mutex_lock(&c->thread->stats.mutex);
    if (incr) {
        c->thread->stats.slab_stats[ITEM_clsid(it)].incr_hits++;
    } else {
        c->thread->stats.slab_stats[ITEM_clsid(it)].decr_hits++;
    }
    pthread_mutex_unlock(&c->thread->stats.mutex);

    itoa_u64(value, buf);
    res = strlen(buf);
    /* refcount == 2 means we are the only ones holding the item, and it is
     * linked. We hold the item's lock in this function, so refcount cannot
     * increase. */
    if (res + 2 <= it->nbytes && it->refcount == 2) { /* replace in-place */
        /* When changing the value without replacing the item, we
           need to update the CAS on the existing item. */
        /* We also need to fiddle it in the sizes tracker in case the tracking
         * was enabled at runtime, since it relies on the CAS value to know
         * whether to remove an item or not. */
        item_stats_sizes_remove(it);
        ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
        item_stats_sizes_add(it);
        /* Write the new digits and pad the rest of the value (up to the
         * trailing two bytes) with spaces. */
        memcpy(ITEM_data(it), buf, res);
        memset(ITEM_data(it) + res, ' ', it->nbytes - res - 2);
        do_item_update(it);
    } else if (it->refcount > 1) {
        /* The new text does not fit in place, or someone else also holds
         * the item: build a replacement item and swap it in. */
        item *new_it;
        uint32_t flags;
        FLAGS_CONV(it, flags);
        new_it = do_item_alloc(ITEM_key(it), it->nkey, flags, it->exptime, res + 2);
        if (new_it == 0) {
            do_item_remove(it);
            return EOM;
        }
        memcpy(ITEM_data(new_it), buf, res);
        memcpy(ITEM_data(new_it) + res, "\r\n", 2);
        item_replace(it, new_it, hv);
        // Overwrite the older item's CAS with our new CAS since we're
        // returning the CAS of the old item below.
        ITEM_set_cas(it, (settings.use_cas) ? ITEM_get_cas(new_it) : 0);
        do_item_remove(new_it);       /* release our reference */
    } else {
        /* Should never get here. This means we somehow fetched an unlinked
         * item. TODO: Add a counter? */
        if (settings.verbose) {
            fprintf(stderr, "Tried to do incr/decr on invalid item\n");
        }
        if (it->refcount == 1)
            do_item_remove(it);
        return DELTA_ITEM_NOT_FOUND;
    }

    if (cas) {
        *cas = ITEM_get_cas(it);    /* swap the incoming CAS value */
    }
    if (it_ret != NULL) {
        /* Hand the item to the caller without releasing it here. */
        *it_ret = it;
    } else {
        do_item_remove(it);         /* release our reference */
    }
    return OK;
}
2292 
2293 static int try_read_command_negotiate(conn *c) {
2294     assert(c->protocol == negotiating_prot);
2295     assert(c != NULL);
2296     assert(c->rcurr <= (c->rbuf + c->rsize));
2297     assert(c->rbytes > 0);
2298 
2299     if ((unsigned char)c->rbuf[0] == (unsigned char)PROTOCOL_BINARY_REQ) {
2300         c->protocol = binary_prot;
2301         c->try_read_command = try_read_command_binary;
2302     } else {
2303         // authentication doesn't work with negotiated protocol.
2304         c->protocol = ascii_prot;
2305         c->try_read_command = try_read_command_ascii;
2306     }
2307 
2308     if (settings.verbose > 1) {
2309         fprintf(stderr, "%d: Client using the %s protocol\n", c->sfd,
2310                 prot_text(c->protocol));
2311     }
2312 
2313     return c->try_read_command(c);
2314 }
2315 
2316 static int try_read_command_udp(conn *c) {
2317     assert(c != NULL);
2318     assert(c->rcurr <= (c->rbuf + c->rsize));
2319     assert(c->rbytes > 0);
2320 
2321     if ((unsigned char)c->rbuf[0] == (unsigned char)PROTOCOL_BINARY_REQ) {
2322         c->protocol = binary_prot;
2323         return try_read_command_binary(c);
2324     } else {
2325         c->protocol = ascii_prot;
2326         return try_read_command_ascii(c);
2327     }
2328 }
2329 
2330 /*
2331  * read a UDP request.
2332  */
2333 static enum try_read_result try_read_udp(conn *c) {
2334     int res;
2335 
2336     assert(c != NULL);
2337 
2338     c->request_addr_size = sizeof(c->request_addr);
2339     res = recvfrom(c->sfd, c->rbuf, c->rsize,
2340                    0, (struct sockaddr *)&c->request_addr,
2341                    &c->request_addr_size);
2342     if (res > 8) {
2343         unsigned char *buf = (unsigned char *)c->rbuf;
2344         pthread_mutex_lock(&c->thread->stats.mutex);
2345         c->thread->stats.bytes_read += res;
2346         pthread_mutex_unlock(&c->thread->stats.mutex);
2347 
2348         /* Beginning of UDP packet is the request ID; save it. */
2349         c->request_id = buf[0] * 256 + buf[1];
2350 
2351         /* If this is a multi-packet request, drop it. */
2352         if (buf[4] != 0 || buf[5] != 1) {
2353             return READ_NO_DATA_RECEIVED;
2354         }
2355 
2356         /* Don't care about any of the rest of the header. */
2357         res -= 8;
2358         memmove(c->rbuf, c->rbuf + 8, res);
2359 
2360         c->rbytes = res;
2361         c->rcurr = c->rbuf;
2362         return READ_DATA_RECEIVED;
2363     }
2364     return READ_NO_DATA_RECEIVED;
2365 }
2366 
2367 /*
2368  * read from network as much as we can, handle buffer overflow and connection
2369  * close.
2370  * before reading, move the remaining incomplete fragment of a command
2371  * (if any) to the beginning of the buffer.
2372  *
2373  * To protect us from someone flooding a connection with bogus data causing
2374  * the connection to eat up all available memory, break out and start looking
2375  * at the data I've got after a number of reallocs...
2376  *
2377  * @return enum try_read_result
2378  */
2379 static enum try_read_result try_read_network(conn *c) {
2380     enum try_read_result gotdata = READ_NO_DATA_RECEIVED;
2381     int res;
2382     int num_allocs = 0;
2383     assert(c != NULL);
2384 
2385     if (c->rcurr != c->rbuf) {
2386         if (c->rbytes != 0) /* otherwise there's nothing to copy */
2387             memmove(c->rbuf, c->rcurr, c->rbytes);
2388         c->rcurr = c->rbuf;
2389     }
2390 
2391     while (1) {
2392         // TODO: move to rbuf_* func?
2393         if (c->rbytes >= c->rsize && c->rbuf_malloced) {
2394             if (num_allocs == 4) {
2395                 return gotdata;
2396             }
2397             ++num_allocs;
2398             char *new_rbuf = realloc(c->rbuf, c->rsize * 2);
2399             if (!new_rbuf) {
2400                 STATS_LOCK();
2401                 stats.malloc_fails++;
2402                 STATS_UNLOCK();
2403                 if (settings.verbose > 0) {
2404                     fprintf(stderr, "Couldn't realloc input buffer\n");
2405                 }
2406                 c->rbytes = 0; /* ignore what we read */
2407                 out_of_memory(c, "SERVER_ERROR out of memory reading request");
2408                 c->close_after_write = true;
2409                 return READ_MEMORY_ERROR;
2410             }
2411             c->rcurr = c->rbuf = new_rbuf;
2412             c->rsize *= 2;
2413         }
2414 
2415         int avail = c->rsize - c->rbytes;
2416         res = c->read(c, c->rbuf + c->rbytes, avail);
2417         if (res > 0) {
2418             pthread_mutex_lock(&c->thread->stats.mutex);
2419             c->thread->stats.bytes_read += res;
2420             pthread_mutex_unlock(&c->thread->stats.mutex);
2421             gotdata = READ_DATA_RECEIVED;
2422             c->rbytes += res;
2423             if (res == avail && c->rbuf_malloced) {
2424                 // Resize rbuf and try a few times if huge ascii multiget.
2425                 continue;
2426             } else {
2427                 break;
2428             }
2429         }
2430         if (res == 0) {
2431             c->close_reason = NORMAL_CLOSE;
2432             return READ_ERROR;
2433         }
2434         if (res == -1) {
2435             if (errno == EAGAIN || errno == EWOULDBLOCK) {
2436                 break;
2437             }
2438             return READ_ERROR;
2439         }
2440     }
2441     return gotdata;
2442 }
2443 
2444 static bool update_event(conn *c, const int new_flags) {
2445     assert(c != NULL);
2446 
2447     struct event_base *base = c->event.ev_base;
2448     if (c->ev_flags == new_flags)
2449         return true;
2450     if (event_del(&c->event) == -1) return false;
2451     event_set(&c->event, c->sfd, new_flags, event_handler, (void *)c);
2452     event_base_set(base, &c->event);
2453     c->ev_flags = new_flags;
2454     if (event_add(&c->event, 0) == -1) return false;
2455     return true;
2456 }
2457 
2458 /*
2459  * Sets whether we are listening for new connections or not.
2460  */
2461 void do_accept_new_conns(const bool do_accept) {
2462     conn *next;
2463 
2464     for (next = listen_conn; next; next = next->next) {
2465         if (do_accept) {
2466             update_event(next, EV_READ | EV_PERSIST);
2467             if (listen(next->sfd, settings.backlog) != 0) {
2468                 perror("listen");
2469             }
2470         }
2471         else {
2472             update_event(next, 0);
2473             if (listen(next->sfd, 0) != 0) {
2474                 perror("listen");
2475             }
2476         }
2477     }
2478 
2479     if (do_accept) {
2480         struct timeval maxconns_exited;
2481         uint64_t elapsed_us;
2482         gettimeofday(&maxconns_exited,NULL);
2483         STATS_LOCK();
2484         elapsed_us =
2485             (maxconns_exited.tv_sec - stats.maxconns_entered.tv_sec) * 1000000
2486             + (maxconns_exited.tv_usec - stats.maxconns_entered.tv_usec);
2487         stats.time_in_listen_disabled_us += elapsed_us;
2488         stats_state.accepting_conns = true;
2489         STATS_UNLOCK();
2490     } else {
2491         STATS_LOCK();
2492         stats_state.accepting_conns = false;
2493         gettimeofday(&stats.maxconns_entered,NULL);
2494         stats.listen_disabled_num++;
2495         STATS_UNLOCK();
2496         allow_new_conns = false;
2497         maxconns_handler(-42, 0, 0);
2498     }
2499 }
2500 
2501 #define TRANSMIT_ONE_RESP true
2502 #define TRANSMIT_ALL_RESP false
2503 static int _transmit_pre(conn *c, struct iovec *iovs, int iovused, bool one_resp) {
2504     mc_resp *resp = c->resp_head;
2505     while (resp && iovused + resp->iovcnt < IOV_MAX-1) {
2506         if (resp->skip) {
2507             // Don't actually unchain the resp obj here since it's singly-linked.
2508             // Just let the post function handle it linearly.
2509             resp = resp->next;
2510             continue;
2511         }
2512         if (resp->chunked_data_iov) {
2513             // Handle chunked items specially.
2514             // They spend much more time in send so we can be a bit wasteful
2515             // in rebuilding iovecs for them.
2516             item_chunk *ch = (item_chunk *)ITEM_schunk((item *)resp->iov[resp->chunked_data_iov].iov_base);
2517             int x;
2518             for (x = 0; x < resp->iovcnt; x++) {
2519                 // This iov is tracking how far we've copied so far.
2520                 if (x == resp->chunked_data_iov) {
2521                     int done = resp->chunked_total - resp->iov[x].iov_len;
2522                     // Start from the len to allow binprot to cut the \r\n
2523                     int todo = resp->iov[x].iov_len;
2524                     while (ch && todo > 0 && iovused < IOV_MAX-1) {
2525                         int skip = 0;
2526                         if (!ch->used) {
2527                             ch = ch->next;
2528                             continue;
2529                         }
2530                         // Skip parts we've already sent.
2531                         if (done >= ch->used) {
2532                             done -= ch->used;
2533                             ch = ch->next;
2534                             continue;
2535                         } else if (done) {
2536                             skip = done;
2537                             done = 0;
2538                         }
2539                         iovs[iovused].iov_base = ch->data + skip;
2540                         // Stupid binary protocol makes this go negative.
2541                         iovs[iovused].iov_len = ch->used - skip > todo ? todo : ch->used - skip;
2542                         iovused++;
2543                         todo -= ch->used - skip;
2544                         ch = ch->next;
2545                     }
2546                 } else {
2547                     iovs[iovused].iov_base = resp->iov[x].iov_base;
2548                     iovs[iovused].iov_len = resp->iov[x].iov_len;
2549                     iovused++;
2550                 }
2551                 if (iovused >= IOV_MAX-1)
2552                     break;
2553             }
2554         } else {
2555             memcpy(&iovs[iovused], resp->iov, sizeof(struct iovec)*resp->iovcnt);
2556             iovused += resp->iovcnt;
2557         }
2558 
2559         // done looking at first response, walk down the chain.
2560         resp = resp->next;
2561         // used for UDP mode: UDP cannot send multiple responses per packet.
2562         if (one_resp)
2563             break;
2564     }
2565     return iovused;
2566 }
2567 
2568 /*
2569  * Decrements and completes responses based on how much data was transmitted.
2570  * Takes the connection and current result bytes.
2571  */
2572 static void _transmit_post(conn *c, ssize_t res) {
2573     // We've written some of the data. Remove the completed
2574     // responses from the list of pending writes.
2575     mc_resp *resp = c->resp_head;
2576     while (resp) {
2577         int x;
2578         if (resp->skip) {
2579             resp = resp_finish(c, resp);
2580             continue;
2581         }
2582 
2583         // fastpath check. all small responses should cut here.
2584         if (res >= resp->tosend) {
2585             res -= resp->tosend;
2586             resp = resp_finish(c, resp);
2587             continue;
2588         }
2589 
2590         // it's fine to re-check iov's that were zeroed out before.
2591         for (x = 0; x < resp->iovcnt; x++) {
2592             struct iovec *iov = &resp->iov[x];
2593             if (res >= iov->iov_len) {
2594                 resp->tosend -= iov->iov_len;
2595                 res -= iov->iov_len;
2596                 iov->iov_len = 0;
2597             } else {
2598                 // Dumb special case for chunked items. Currently tracking
2599                 // where to inject the chunked item via iov_base.
2600                 // Extra not-great since chunked items can't be the first
2601                 // index, so we have to check for non-zero c_d_iov first.
2602                 if (!resp->chunked_data_iov || x != resp->chunked_data_iov) {
2603                     iov->iov_base = (char *)iov->iov_base + res;
2604                 }
2605                 iov->iov_len -= res;
2606                 resp->tosend -= res;
2607                 res = 0;
2608                 break;
2609             }
2610         }
2611 
2612         // are we done with this response object?
2613         if (resp->tosend == 0) {
2614             resp = resp_finish(c, resp);
2615         } else {
2616             // Jammed up here. This is the new head.
2617             break;
2618         }
2619     }
2620 }
2621 
2622 /*
2623  * Transmit the next chunk of data from our list of msgbuf structures.
2624  *
2625  * Returns:
2626  *   TRANSMIT_COMPLETE   All done writing.
2627  *   TRANSMIT_INCOMPLETE More data remaining to write.
2628  *   TRANSMIT_SOFT_ERROR Can't write any more right now.
2629  *   TRANSMIT_HARD_ERROR Can't write (c->state is set to conn_closing)
2630  */
2631 static enum transmit_result transmit(conn *c) {
2632     assert(c != NULL);
2633     struct iovec iovs[IOV_MAX];
2634     struct msghdr msg;
2635     int iovused = 0;
2636 
2637     // init the msg.
2638     memset(&msg, 0, sizeof(struct msghdr));
2639     msg.msg_iov = iovs;
2640 
2641     iovused = _transmit_pre(c, iovs, iovused, TRANSMIT_ALL_RESP);
2642     if (iovused == 0) {
2643         // Avoid the syscall if we're only handling a noreply.
2644         // Return the response object.
2645         _transmit_post(c, 0);
2646         return TRANSMIT_COMPLETE;
2647     }
2648 
2649     // Alright, send.
2650     ssize_t res;
2651     msg.msg_iovlen = iovused;
2652     res = c->sendmsg(c, &msg, 0);
2653     if (res >= 0) {
2654         pthread_mutex_lock(&c->thread->stats.mutex);
2655         c->thread->stats.bytes_written += res;
2656         pthread_mutex_unlock(&c->thread->stats.mutex);
2657 
2658         // Decrement any partial IOV's and complete any finished resp's.
2659         _transmit_post(c, res);
2660 
2661         if (c->resp_head) {
2662             return TRANSMIT_INCOMPLETE;
2663         } else {
2664             return TRANSMIT_COMPLETE;
2665         }
2666     }
2667 
2668     if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
2669         if (!update_event(c, EV_WRITE | EV_PERSIST)) {
2670             if (settings.verbose > 0)
2671                 fprintf(stderr, "Couldn't update event\n");
2672             conn_set_state(c, conn_closing);
2673             return TRANSMIT_HARD_ERROR;
2674         }
2675         return TRANSMIT_SOFT_ERROR;
2676     }
2677     /* if res == -1 and error is not EAGAIN or EWOULDBLOCK,
2678        we have a real error, on which we close the connection */
2679     if (settings.verbose > 0)
2680         perror("Failed to write, and not due to blocking");
2681 
2682     conn_set_state(c, conn_closing);
2683     return TRANSMIT_HARD_ERROR;
2684 }
2685 
2686 static void build_udp_header(unsigned char *hdr, mc_resp *resp) {
2687     // We need to communicate the total number of packets
2688     // If this isn't set, it's the first time this response is building a udp
2689     // header, so "tosend" must be static.
2690     if (!resp->udp_total) {
2691         uint32_t total;
2692         total = resp->tosend / UDP_DATA_SIZE;
2693         if (resp->tosend % UDP_DATA_SIZE)
2694             total++;
2695         // The spec doesn't really say what we should do here. It's _probably_
2696         // better to bail out?
2697         if (total > USHRT_MAX) {
2698             total = USHRT_MAX;
2699         }
2700         resp->udp_total = total;
2701     }
2702 
2703     // TODO: why wasn't this hto*'s and casts?
2704     // this ends up sending UDP hdr data specifically in host byte order.
2705     *hdr++ = resp->request_id / 256;
2706     *hdr++ = resp->request_id % 256;
2707     *hdr++ = resp->udp_sequence / 256;
2708     *hdr++ = resp->udp_sequence % 256;
2709     *hdr++ = resp->udp_total / 256;
2710     *hdr++ = resp->udp_total % 256;
2711     *hdr++ = 0;
2712     *hdr++ = 0;
2713     resp->udp_sequence++;
2714 }
2715 
2716 /*
2717  * UDP specific transmit function. Uses its own function rather than check
2718  * IS_UDP() five times. If we ever implement sendmmsg or similar support they
2719  * will diverge even more.
2720  * Does not use TLS.
2721  *
2722  * Returns:
2723  *   TRANSMIT_COMPLETE   All done writing.
2724  *   TRANSMIT_INCOMPLETE More data remaining to write.
2725  *   TRANSMIT_SOFT_ERROR Can't write any more right now.
2726  *   TRANSMIT_HARD_ERROR Can't write (c->state is set to conn_closing)
2727  */
2728 static enum transmit_result transmit_udp(conn *c) {
2729     assert(c != NULL);
2730     struct iovec iovs[IOV_MAX];
2731     struct msghdr msg;
2732     mc_resp *resp;
2733     int iovused = 0;
2734     unsigned char udp_hdr[UDP_HEADER_SIZE];
2735 
2736     // We only send one UDP packet per call (ugh), so we can only operate on a
2737     // single response at a time.
2738     resp = c->resp_head;
2739 
2740     if (!resp) {
2741         return TRANSMIT_COMPLETE;
2742     }
2743 
2744     if (resp->skip) {
2745         resp = resp_finish(c, resp);
2746         return TRANSMIT_INCOMPLETE;
2747     }
2748 
2749     // clear the message and initialize it.
2750     memset(&msg, 0, sizeof(struct msghdr));
2751     msg.msg_iov = iovs;
2752 
2753     // the UDP source to return to.
2754     msg.msg_name = &resp->request_addr;
2755     msg.msg_namelen = resp->request_addr_size;
2756 
2757     // First IOV is the custom UDP header.
2758     iovs[0].iov_base = (void *)udp_hdr;
2759     iovs[0].iov_len = UDP_HEADER_SIZE;
2760     build_udp_header(udp_hdr, resp);
2761     iovused++;
2762 
2763     // Fill the IOV's the standard way.
2764     // TODO: might get a small speedup if we let it break early with a length
2765     // limit.
2766     iovused = _transmit_pre(c, iovs, iovused, TRANSMIT_ONE_RESP);
2767 
2768     // Clip the IOV's to the max UDP packet size.
2769     // If we add support for send_mmsg, this can be where we split msg's.
2770     {
2771         int x = 0;
2772         int len = 0;
2773         for (x = 0; x < iovused; x++) {
2774             if (len + iovs[x].iov_len >= UDP_MAX_PAYLOAD_SIZE) {
2775                 iovs[x].iov_len = UDP_MAX_PAYLOAD_SIZE - len;
2776                 x++;
2777                 break;
2778             } else {
2779                 len += iovs[x].iov_len;
2780             }
2781         }
2782         iovused = x;
2783     }
2784 
2785     ssize_t res;
2786     msg.msg_iovlen = iovused;
2787     // NOTE: uses system sendmsg since we have no support for indirect UDP.
2788     res = sendmsg(c->sfd, &msg, 0);
2789     if (res >= 0) {
2790         pthread_mutex_lock(&c->thread->stats.mutex);
2791         c->thread->stats.bytes_written += res;
2792         pthread_mutex_unlock(&c->thread->stats.mutex);
2793 
2794         // Ignore the header size from forwarding the IOV's
2795         res -= UDP_HEADER_SIZE;
2796 
2797         // Decrement any partial IOV's and complete any finished resp's.
2798         _transmit_post(c, res);
2799 
2800         if (c->resp_head) {
2801             return TRANSMIT_INCOMPLETE;
2802         } else {
2803             return TRANSMIT_COMPLETE;
2804         }
2805     }
2806 
2807     if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
2808         if (!update_event(c, EV_WRITE | EV_PERSIST)) {
2809             if (settings.verbose > 0)
2810                 fprintf(stderr, "Couldn't update event\n");
2811             conn_set_state(c, conn_closing);
2812             return TRANSMIT_HARD_ERROR;
2813         }
2814         return TRANSMIT_SOFT_ERROR;
2815     }
2816     /* if res == -1 and error is not EAGAIN or EWOULDBLOCK,
2817        we have a real error, on which we close the connection */
2818     if (settings.verbose > 0)
2819         perror("Failed to write, and not due to blocking");
2820 
2821     conn_set_state(c, conn_read);
2822     return TRANSMIT_HARD_ERROR;
2823 }
2824 
2825 
2826 /* Does a looped read to fill data chunks */
2827 /* TODO: restrict number of times this can loop.
2828  * Also, benchmark using readv's.
2829  */
2830 static int read_into_chunked_item(conn *c) {
2831     int total = 0;
2832     int res;
2833     assert(c->rcurr != c->ritem);
2834 
2835     while (c->rlbytes > 0) {
2836         item_chunk *ch = (item_chunk *)c->ritem;
2837         if (ch->size == ch->used) {
2838             // FIXME: ch->next is currently always 0. remove this?
2839             if (ch->next) {
2840                 c->ritem = (char *) ch->next;
2841             } else {
2842                 /* Allocate next chunk. Binary protocol needs 2b for \r\n */
2843                 c->ritem = (char *) do_item_alloc_chunk(ch, c->rlbytes +
2844                        ((c->protocol == binary_prot) ? 2 : 0));
2845                 if (!c->ritem) {
2846                     // We failed an allocation. Let caller handle cleanup.
2847                     total = -2;
2848                     break;
2849                 }
2850                 // ritem has new chunk, restart the loop.
2851                 continue;
2852                 //assert(c->rlbytes == 0);
2853             }
2854         }
2855 
2856         int unused = ch->size - ch->used;
2857         /* first check if we have leftovers in the conn_read buffer */
2858         if (c->rbytes > 0) {
2859             total = 0;
2860             int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes;
2861             tocopy = tocopy > unused ? unused : tocopy;
2862             if (c->ritem != c->rcurr) {
2863                 memmove(ch->data + ch->used, c->rcurr, tocopy);
2864             }
2865             total += tocopy;
2866             c->rlbytes -= tocopy;
2867             c->rcurr += tocopy;
2868             c->rbytes -= tocopy;
2869             ch->used += tocopy;
2870             if (c->rlbytes == 0) {
2871                 break;
2872             }
2873         } else {
2874             /*  now try reading from the socket */
2875             res = c->read(c, ch->data + ch->used,
2876                     (unused > c->rlbytes ? c->rlbytes : unused));
2877             if (res > 0) {
2878                 pthread_mutex_lock(&c->thread->stats.mutex);
2879                 c->thread->stats.bytes_read += res;
2880                 pthread_mutex_unlock(&c->thread->stats.mutex);
2881                 ch->used += res;
2882                 total += res;
2883                 c->rlbytes -= res;
2884             } else {
2885                 /* Reset total to the latest result so caller can handle it */
2886                 total = res;
2887                 break;
2888             }
2889         }
2890     }
2891 
2892     /* At some point I will be able to ditch the \r\n from item storage and
2893        remove all of these kludges.
2894        The above binprot check ensures inline space for \r\n, but if we do
2895        exactly enough allocs there will be no additional chunk for \r\n.
2896      */
2897     if (c->rlbytes == 0 && c->protocol == binary_prot && total >= 0) {
2898         item_chunk *ch = (item_chunk *)c->ritem;
2899         if (ch->size - ch->used < 2) {
2900             c->ritem = (char *) do_item_alloc_chunk(ch, 2);
2901             if (!c->ritem) {
2902                 total = -2;
2903             }
2904         }
2905     }
2906     return total;
2907 }
2908 
2909 static void drive_machine(conn *c) {
2910     bool stop = false;
2911     int sfd;
2912     socklen_t addrlen;
2913     struct sockaddr_storage addr;
2914     int nreqs = settings.reqs_per_event;
2915     int res;
2916     const char *str;
2917 #ifdef HAVE_ACCEPT4
2918     static int  use_accept4 = 1;
2919 #else
2920     static int  use_accept4 = 0;
2921 #endif
2922 
2923     assert(c != NULL);
2924 
2925     while (!stop) {
2926 
2927         switch(c->state) {
2928         case conn_listening:
2929             addrlen = sizeof(addr);
2930 #ifdef HAVE_ACCEPT4
2931             if (use_accept4) {
2932                 sfd = accept4(c->sfd, (struct sockaddr *)&addr, &addrlen, SOCK_NONBLOCK);
2933             } else {
2934                 sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen);
2935             }
2936 #else
2937             sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen);
2938 #endif
2939             if (sfd == -1) {
2940                 if (use_accept4 && errno == ENOSYS) {
2941                     use_accept4 = 0;
2942                     continue;
2943                 }
2944                 perror(use_accept4 ? "accept4()" : "accept()");
2945                 if (errno == EAGAIN || errno == EWOULDBLOCK) {
2946                     /* these are transient, so don't log anything */
2947                     stop = true;
2948                 } else if (errno == EMFILE) {
2949                     if (settings.verbose > 0)
2950                         fprintf(stderr, "Too many open connections\n");
2951                     accept_new_conns(false);
2952                     stop = true;
2953                 } else {
2954                     perror("accept()");
2955                     stop = true;
2956                 }
2957                 break;
2958             }
2959             if (!use_accept4) {
2960                 if (fcntl(sfd, F_SETFL, fcntl(sfd, F_GETFL) | O_NONBLOCK) < 0) {
2961                     perror("setting O_NONBLOCK");
2962                     close(sfd);
2963                     break;
2964                 }
2965             }
2966 
2967             bool reject;
2968             if (settings.maxconns_fast) {
2969                 reject = sfd >= settings.maxconns - 1;
2970                 if (reject) {
2971                     STATS_LOCK();
2972                     stats.rejected_conns++;
2973                     STATS_UNLOCK();
2974                 }
2975             } else {
2976                 reject = false;
2977             }
2978 
2979             if (reject) {
2980                 str = "ERROR Too many open connections\r\n";
2981                 res = write(sfd, str, strlen(str));
2982                 close(sfd);
2983             } else {
2984                 void *ssl_v = NULL;
2985 #ifdef TLS
2986                 SSL *ssl = NULL;
2987                 if (c->ssl_enabled) {
2988                     assert(IS_TCP(c->transport) && settings.ssl_enabled);
2989 
2990                     if (settings.ssl_ctx == NULL) {
2991                         if (settings.verbose) {
2992                             fprintf(stderr, "SSL context is not initialized\n");
2993                         }
2994                         close(sfd);
2995                         break;
2996                     }
2997                     SSL_LOCK();
2998                     ssl = SSL_new(settings.ssl_ctx);
2999                     SSL_UNLOCK();
3000                     if (ssl == NULL) {
3001                         if (settings.verbose) {
3002                             fprintf(stderr, "Failed to created the SSL object\n");
3003                         }
3004                         close(sfd);
3005                         break;
3006                     }
3007                     SSL_set_fd(ssl, sfd);
3008                     int ret = SSL_accept(ssl);
3009                     if (ret <= 0) {
3010                         int err = SSL_get_error(ssl, ret);
3011                         if (err == SSL_ERROR_SYSCALL || err == SSL_ERROR_SSL) {
3012                             if (settings.verbose) {
3013                                 fprintf(stderr, "SSL connection failed with error code : %d : %s\n", err, strerror(errno));
3014                             }
3015                             SSL_free(ssl);
3016                             close(sfd);
3017                             STATS_LOCK();
3018                             stats.ssl_handshake_errors++;
3019                             STATS_UNLOCK();
3020                             break;
3021                         }
3022                     }
3023                 }
3024                 ssl_v = (void*) ssl;
3025 #endif
3026 
3027                 dispatch_conn_new(sfd, conn_new_cmd, EV_READ | EV_PERSIST,
3028                                      READ_BUFFER_CACHED, c->transport, ssl_v);
3029             }
3030 
3031             stop = true;
3032             break;
3033 
3034         case conn_waiting:
3035             rbuf_release(c);
3036             if (!update_event(c, EV_READ | EV_PERSIST)) {
3037                 if (settings.verbose > 0)
3038                     fprintf(stderr, "Couldn't update event\n");
3039                 conn_set_state(c, conn_closing);
3040                 break;
3041             }
3042 
3043             conn_set_state(c, conn_read);
3044             stop = true;
3045             break;
3046 
3047         case conn_read:
3048             if (!IS_UDP(c->transport)) {
3049                 // Assign a read buffer if necessary.
3050                 if (!rbuf_alloc(c)) {
3051                     // TODO: Some way to allow for temporary failures.
3052                     conn_set_state(c, conn_closing);
3053                     break;
3054                 }
3055                 res = try_read_network(c);
3056             } else {
3057                 // UDP connections always have a static buffer.
3058                 res = try_read_udp(c);
3059             }
3060 
3061             switch (res) {
3062             case READ_NO_DATA_RECEIVED:
3063                 conn_set_state(c, conn_waiting);
3064                 break;
3065             case READ_DATA_RECEIVED:
3066                 conn_set_state(c, conn_parse_cmd);
3067                 break;
3068             case READ_ERROR:
3069                 conn_set_state(c, conn_closing);
3070                 break;
3071             case READ_MEMORY_ERROR: /* Failed to allocate more memory */
3072                 /* State already set by try_read_network */
3073                 break;
3074             }
3075             break;
3076 
3077         case conn_parse_cmd:
3078             c->noreply = false;
3079             if (c->try_read_command(c) == 0) {
3080                 /* we need more data! */
3081                 if (c->resp_head) {
3082                     // Buffered responses waiting, flush in the meantime.
3083                     conn_set_state(c, conn_mwrite);
3084                 } else {
3085                     conn_set_state(c, conn_waiting);
3086                 }
3087             }
3088 
3089             break;
3090 
3091         case conn_new_cmd:
3092             /* Only process nreqs at a time to avoid starving other
3093                connections */
3094 
3095             --nreqs;
3096             if (nreqs >= 0) {
3097                 reset_cmd_handler(c);
3098             } else if (c->resp_head) {
3099                 // flush response pipe on yield.
3100                 conn_set_state(c, conn_mwrite);
3101             } else {
3102                 pthread_mutex_lock(&c->thread->stats.mutex);
3103                 c->thread->stats.conn_yields++;
3104                 pthread_mutex_unlock(&c->thread->stats.mutex);
3105                 if (c->rbytes > 0) {
3106                     /* We have already read in data into the input buffer,
3107                        so libevent will most likely not signal read events
3108                        on the socket (unless more data is available. As a
3109                        hack we should just put in a request to write data,
3110                        because that should be possible ;-)
3111                     */
3112                     if (!update_event(c, EV_WRITE | EV_PERSIST)) {
3113                         if (settings.verbose > 0)
3114                             fprintf(stderr, "Couldn't update event\n");
3115                         conn_set_state(c, conn_closing);
3116                         break;
3117                     }
3118                 }
3119                 stop = true;
3120             }
3121             break;
3122 
3123         case conn_nread:
3124             if (c->rlbytes == 0) {
3125                 complete_nread(c);
3126                 break;
3127             }
3128 
3129             /* Check if rbytes < 0, to prevent crash */
3130             if (c->rlbytes < 0) {
3131                 if (settings.verbose) {
3132                     fprintf(stderr, "Invalid rlbytes to read: len %d\n", c->rlbytes);
3133                 }
3134                 conn_set_state(c, conn_closing);
3135                 break;
3136             }
3137 
3138             if (c->item_malloced || ((((item *)c->item)->it_flags & ITEM_CHUNKED) == 0) ) {
3139                 /* first check if we have leftovers in the conn_read buffer */
3140                 if (c->rbytes > 0) {
3141                     int tocopy = c->rbytes > c->rlbytes ? c->rlbytes : c->rbytes;
3142                     memmove(c->ritem, c->rcurr, tocopy);
3143                     c->ritem += tocopy;
3144                     c->rlbytes -= tocopy;
3145                     c->rcurr += tocopy;
3146                     c->rbytes -= tocopy;
3147                     if (c->rlbytes == 0) {
3148                         break;
3149                     }
3150                 }
3151 
3152                 /*  now try reading from the socket */
3153                 res = c->read(c, c->ritem, c->rlbytes);
3154                 if (res > 0) {
3155                     pthread_mutex_lock(&c->thread->stats.mutex);
3156                     c->thread->stats.bytes_read += res;
3157                     pthread_mutex_unlock(&c->thread->stats.mutex);
3158                     if (c->rcurr == c->ritem) {
3159                         c->rcurr += res;
3160                     }
3161                     c->ritem += res;
3162                     c->rlbytes -= res;
3163                     break;
3164                 }
3165             } else {
3166                 res = read_into_chunked_item(c);
3167                 if (res > 0)
3168                     break;
3169             }
3170 
3171             if (res == 0) { /* end of stream */
3172                 c->close_reason = NORMAL_CLOSE;
3173                 conn_set_state(c, conn_closing);
3174                 break;
3175             }
3176 
3177             if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
3178                 if (!update_event(c, EV_READ | EV_PERSIST)) {
3179                     if (settings.verbose > 0)
3180                         fprintf(stderr, "Couldn't update event\n");
3181                     conn_set_state(c, conn_closing);
3182                     break;
3183                 }
3184                 stop = true;
3185                 break;
3186             }
3187 
3188             /* Memory allocation failure */
3189             if (res == -2) {
3190                 out_of_memory(c, "SERVER_ERROR Out of memory during read");
3191                 c->sbytes = c->rlbytes;
3192                 conn_set_state(c, conn_swallow);
3193                 // Ensure this flag gets cleared. It gets killed on conn_new()
3194                 // so any conn_closing is fine, calling complete_nread is
                // fine. This swallow seems to be the only other case.
3196                 c->set_stale = false;
3197                 c->mset_res = false;
3198                 break;
3199             }
3200             /* otherwise we have a real error, on which we close the connection */
3201             if (settings.verbose > 0) {
3202                 fprintf(stderr, "Failed to read, and not due to blocking:\n"
3203                         "errno: %d %s \n"
3204                         "rcurr=%p ritem=%p rbuf=%p rlbytes=%d rsize=%d\n",
3205                         errno, strerror(errno),
3206                         (void *)c->rcurr, (void *)c->ritem, (void *)c->rbuf,
3207                         (int)c->rlbytes, (int)c->rsize);
3208             }
3209             conn_set_state(c, conn_closing);
3210             break;
3211 
3212         case conn_swallow:
3213             /* we are reading sbytes and throwing them away */
3214             if (c->sbytes <= 0) {
3215                 conn_set_state(c, conn_new_cmd);
3216                 break;
3217             }
3218 
3219             /* first check if we have leftovers in the conn_read buffer */
3220             if (c->rbytes > 0) {
3221                 int tocopy = c->rbytes > c->sbytes ? c->sbytes : c->rbytes;
3222                 c->sbytes -= tocopy;
3223                 c->rcurr += tocopy;
3224                 c->rbytes -= tocopy;
3225                 break;
3226             }
3227 
3228             /*  now try reading from the socket */
3229             res = c->read(c, c->rbuf, c->rsize > c->sbytes ? c->sbytes : c->rsize);
3230             if (res > 0) {
3231                 pthread_mutex_lock(&c->thread->stats.mutex);
3232                 c->thread->stats.bytes_read += res;
3233                 pthread_mutex_unlock(&c->thread->stats.mutex);
3234                 c->sbytes -= res;
3235                 break;
3236             }
3237             if (res == 0) { /* end of stream */
3238                 c->close_reason = NORMAL_CLOSE;
3239                 conn_set_state(c, conn_closing);
3240                 break;
3241             }
3242             if (res == -1 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
3243                 if (!update_event(c, EV_READ | EV_PERSIST)) {
3244                     if (settings.verbose > 0)
3245                         fprintf(stderr, "Couldn't update event\n");
3246                     conn_set_state(c, conn_closing);
3247                     break;
3248                 }
3249                 stop = true;
3250                 break;
3251             }
3252             /* otherwise we have a real error, on which we close the connection */
3253             if (settings.verbose > 0)
3254                 fprintf(stderr, "Failed to read, and not due to blocking\n");
3255             conn_set_state(c, conn_closing);
3256             break;
3257 
3258         case conn_write:
3259         case conn_mwrite:
3260             /* have side IO's that must process before transmit() can run.
3261              * remove the connection from the worker thread and dispatch the
3262              * IO queue
3263              */
3264             assert(c->io_queues_submitted == 0);
3265 
3266             for (io_queue_t *q = c->io_queues; q->type != IO_QUEUE_NONE; q++) {
3267                 if (q->stack_ctx != NULL) {
3268                     io_queue_cb_t *qcb = thread_io_queue_get(c->thread, q->type);
3269                     qcb->submit_cb(q);
3270                     c->io_queues_submitted++;
3271                 }
3272             }
3273             if (c->io_queues_submitted != 0) {
3274                 conn_set_state(c, conn_io_queue);
3275                 event_del(&c->event);
3276 
3277                 stop = true;
3278                 break;
3279             }
3280 
3281             switch (!IS_UDP(c->transport) ? transmit(c) : transmit_udp(c)) {
3282             case TRANSMIT_COMPLETE:
3283                 if (c->state == conn_mwrite) {
3284                     // Free up IO wraps and any half-uploaded items.
3285                     conn_release_items(c);
3286                     conn_set_state(c, conn_new_cmd);
3287                     if (c->close_after_write) {
3288                         conn_set_state(c, conn_closing);
3289                     }
3290                 } else {
3291                     if (settings.verbose > 0)
3292                         fprintf(stderr, "Unexpected state %d\n", c->state);
3293                     conn_set_state(c, conn_closing);
3294                 }
3295                 break;
3296 
3297             case TRANSMIT_INCOMPLETE:
3298             case TRANSMIT_HARD_ERROR:
3299                 break;                   /* Continue in state machine. */
3300 
3301             case TRANSMIT_SOFT_ERROR:
3302                 stop = true;
3303                 break;
3304             }
3305             break;
3306 
3307         case conn_closing:
3308             if IS_UDP(c->transport)
3309                 conn_cleanup(c);
3310             else
3311                 conn_close(c);
3312             stop = true;
3313             break;
3314 
3315         case conn_closed:
3316             /* This only happens if dormando is an idiot. */
3317             abort();
3318             break;
3319 
3320         case conn_watch:
3321             /* We handed off our connection to the logger thread. */
3322             stop = true;
3323             break;
3324         case conn_io_queue:
3325             /* Complete our queued IO's from within the worker thread. */
3326             conn_io_queue_complete(c);
3327             conn_set_state(c, conn_mwrite);
3328             break;
3329         case conn_max_state:
3330             assert(false);
3331             break;
3332         }
3333     }
3334 
3335     return;
3336 }
3337 
3338 void event_handler(const evutil_socket_t fd, const short which, void *arg) {
3339     conn *c;
3340 
3341     c = (conn *)arg;
3342     assert(c != NULL);
3343 
3344     c->which = which;
3345 
3346     /* sanity */
3347     if (fd != c->sfd) {
3348         if (settings.verbose > 0)
3349             fprintf(stderr, "Catastrophic: event fd doesn't match conn fd!\n");
3350         conn_close(c);
3351         return;
3352     }
3353 
3354     drive_machine(c);
3355 
3356     /* wait for next event */
3357     return;
3358 }
3359 
3360 static int new_socket(struct addrinfo *ai) {
3361     int sfd;
3362     int flags;
3363 
3364     if ((sfd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) == -1) {
3365         return -1;
3366     }
3367 
3368     if ((flags = fcntl(sfd, F_GETFL, 0)) < 0 ||
3369         fcntl(sfd, F_SETFL, flags | O_NONBLOCK) < 0) {
3370         perror("setting O_NONBLOCK");
3371         close(sfd);
3372         return -1;
3373     }
3374     return sfd;
3375 }
3376 
3377 
3378 /*
3379  * Sets a socket's send buffer size to the maximum allowed by the system.
3380  */
3381 static void maximize_sndbuf(const int sfd) {
3382     socklen_t intsize = sizeof(int);
3383     int last_good = 0;
3384     int min, max, avg;
3385     int old_size;
3386 
3387     /* Start with the default size. */
3388 #ifdef _WIN32
3389     if (getsockopt((SOCKET)sfd, SOL_SOCKET, SO_SNDBUF, (char *)&old_size, &intsize) != 0) {
3390 #else
3391     if (getsockopt(sfd, SOL_SOCKET, SO_SNDBUF, &old_size, &intsize) != 0) {
3392 #endif /* #ifdef _WIN32 */
3393         if (settings.verbose > 0)
3394             perror("getsockopt(SO_SNDBUF)");
3395         return;
3396     }
3397 
3398     /* Binary-search for the real maximum. */
3399     min = old_size;
3400     max = MAX_SENDBUF_SIZE;
3401 
3402     while (min <= max) {
3403         avg = ((unsigned int)(min + max)) / 2;
3404         if (setsockopt(sfd, SOL_SOCKET, SO_SNDBUF, (void *)&avg, intsize) == 0) {
3405             last_good = avg;
3406             min = avg + 1;
3407         } else {
3408             max = avg - 1;
3409         }
3410     }
3411 
3412     if (settings.verbose > 1)
3413         fprintf(stderr, "<%d send buffer was %d, now %d\n", sfd, old_size, last_good);
3414 }
3415 
/**
 * Create a socket and bind it to a specific port number
 *
 * Every address getaddrinfo() returns for the interface/port pair is tried.
 * UDP sockets are handed to worker threads via dispatch_conn_new(); TCP
 * sockets are put into listen mode and pushed onto the listen_conn list.
 *
 * @param interface the interface to bind to (passed to getaddrinfo() as-is,
 *        NULL included)
 * @param port the port number to bind to; -1 is treated as 0
 * @param transport the transport protocol (TCP / UDP)
 * @param portnumber_file A filepointer to write the port numbers to
 *        when they are successfully added to the list of ports we
 *        listen on. May be NULL, in which case nothing is written.
 * @param ssl_enabled stored on each TCP listening connection when built
 *        with TLS; must be false otherwise (asserted)
 * @return 0 if at least one address was bound; 1 if getaddrinfo() failed,
 *         no address could be bound, or bind()/listen() failed hard
 *         (bind() errors other than EADDRINUSE). Several unrecoverable
 *         conditions (out of fds, NAPI option unsupported, dup() or
 *         conn_new() failure) terminate the process instead.
 */
static int server_socket(const char *interface,
                         int port,
                         enum network_transport transport,
                         FILE *portnumber_file, bool ssl_enabled) {
    int sfd;
    struct linger ling = {0, 0};
    struct addrinfo *ai;
    struct addrinfo *next;
    struct addrinfo hints = { .ai_flags = AI_PASSIVE,
                              .ai_family = AF_UNSPEC };
    char port_buf[NI_MAXSERV];
    int error;
    int success = 0;
    int flags =1;

    hints.ai_socktype = IS_UDP(transport) ? SOCK_DGRAM : SOCK_STREAM;

    if (port == -1) {
        port = 0;
    }
    snprintf(port_buf, sizeof(port_buf), "%d", port);
    error= getaddrinfo(interface, port_buf, &hints, &ai);
    if (error != 0) {
        if (error != EAI_SYSTEM)
          fprintf(stderr, "getaddrinfo(): %s\n", gai_strerror(error));
        else
          perror("getaddrinfo()");
        return 1;
    }

    for (next= ai; next; next= next->ai_next) {
        conn *listen_conn_add;
        if ((sfd = new_socket(next)) == -1) {
            /* getaddrinfo can return "junk" addresses,
             * we make sure at least one works before erroring.
             */
            if (errno == EMFILE) {
                /* ...unless we're out of fds */
                perror("server_socket");
                exit(EX_OSERR);
            }
            continue;
        }

        if (settings.num_napi_ids) {
            /* Probe only: the value is discarded, we just need to know the
             * option exists. The length must describe the int buffer that
             * receives the option value. */
            int napi_id;
            socklen_t len = sizeof(napi_id);
            error = getsockopt(sfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len);
            if (error != 0) {
                fprintf(stderr, "-N <num_napi_ids> option not supported\n");
                exit(EXIT_FAILURE);
            }
        }

#ifdef IPV6_V6ONLY
        if (next->ai_family == AF_INET6) {
            error = setsockopt(sfd, IPPROTO_IPV6, IPV6_V6ONLY, (char *) &flags, sizeof(flags));
            if (error != 0) {
                perror("setsockopt");
                close(sfd);
                continue;
            }
        }
#endif

        setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, (void *)&flags, sizeof(flags));
        if IS_UDP(transport) {
            maximize_sndbuf(sfd);
        } else {
            /* TCP tuning is best effort: failures are reported, not fatal. */
            error = setsockopt(sfd, SOL_SOCKET, SO_KEEPALIVE, (void *)&flags, sizeof(flags));
            if (error != 0)
                perror("setsockopt");

            error = setsockopt(sfd, SOL_SOCKET, SO_LINGER, (void *)&ling, sizeof(ling));
            if (error != 0)
                perror("setsockopt");

            error = setsockopt(sfd, IPPROTO_TCP, TCP_NODELAY, (void *)&flags, sizeof(flags));
            if (error != 0)
                perror("setsockopt");
        }

        if (bind(sfd, next->ai_addr, next->ai_addrlen) == -1) {
            /* An address already in use is skipped; anything else is fatal
             * for this call. */
            if (errno != EADDRINUSE) {
                perror("bind()");
                close(sfd);
                freeaddrinfo(ai);
                return 1;
            }
            close(sfd);
            continue;
        } else {
            success++;
            if (!IS_UDP(transport) && listen(sfd, settings.backlog) == -1) {
                perror("listen()");
                close(sfd);
                freeaddrinfo(ai);
                return 1;
            }
            if (portnumber_file != NULL &&
                (next->ai_addr->sa_family == AF_INET ||
                 next->ai_addr->sa_family == AF_INET6)) {
                /* Ask the socket which port it actually got and record it. */
                union {
                    struct sockaddr_in in;
                    struct sockaddr_in6 in6;
                } my_sockaddr;
                socklen_t len = sizeof(my_sockaddr);
                if (getsockname(sfd, (struct sockaddr*)&my_sockaddr, &len)==0) {
                    if (next->ai_addr->sa_family == AF_INET) {
                        fprintf(portnumber_file, "%s INET: %u\n",
                                IS_UDP(transport) ? "UDP" : "TCP",
                                ntohs(my_sockaddr.in.sin_port));
                    } else {
                        fprintf(portnumber_file, "%s INET6: %u\n",
                                IS_UDP(transport) ? "UDP" : "TCP",
                                ntohs(my_sockaddr.in6.sin6_port));
                    }
                }
            }
        }

        if IS_UDP(transport) {
            int c;

            for (c = 0; c < settings.num_threads_per_udp; c++) {
                /* Allocate one UDP file descriptor per worker thread;
                 * this allows "stats conns" to separately list multiple
                 * parallel UDP requests in progress.
                 *
                 * The dispatch code round-robins new connection requests
                 * among threads, so this is guaranteed to assign one
                 * FD to each thread.
                 */
                int per_thread_fd;
                if (c == 0) {
                    per_thread_fd = sfd;
                } else {
                    per_thread_fd = dup(sfd);
                    if (per_thread_fd < 0) {
                        perror("Failed to duplicate file descriptor");
                        exit(EXIT_FAILURE);
                    }
                }
                dispatch_conn_new(per_thread_fd, conn_read,
                                  EV_READ | EV_PERSIST,
                                  UDP_READ_BUFFER_SIZE, transport, NULL);
            }
        } else {
            if (!(listen_conn_add = conn_new(sfd, conn_listening,
                                             EV_READ | EV_PERSIST, 1,
                                             transport, main_base, NULL))) {
                fprintf(stderr, "failed to create listening connection\n");
                exit(EXIT_FAILURE);
            }
#ifdef TLS
            listen_conn_add->ssl_enabled = ssl_enabled;
#else
            assert(ssl_enabled == false);
#endif
            /* Push onto the head of the global list of listening conns. */
            listen_conn_add->next = listen_conn;
            listen_conn = listen_conn_add;
        }
    }

    freeaddrinfo(ai);

    /* Return zero iff at least one address was bound successfully */
    return success == 0;
}
3594 
3595 static int server_sockets(int port, enum network_transport transport,
3596                           FILE *portnumber_file) {
3597     bool ssl_enabled = false;
3598 
3599 #ifdef TLS
3600     const char *notls = "notls";
3601     ssl_enabled = settings.ssl_enabled;
3602 #endif
3603 
3604     if (settings.inter == NULL) {
3605         return server_socket(settings.inter, port, transport, portnumber_file, ssl_enabled);
3606     } else {
3607         // tokenize them and bind to each one of them..
3608         char *b;
3609         int ret = 0;
3610         char *list = strdup(settings.inter);
3611 
3612         if (list == NULL) {
3613             fprintf(stderr, "Failed to allocate memory for parsing server interface string\n");
3614             return 1;
3615         }
3616         for (char *p = strtok_r(list, ";,", &b);
3617             p != NULL;
3618             p = strtok_r(NULL, ";,", &b)) {
3619             int the_port = port;
3620 #ifdef TLS
3621             ssl_enabled = settings.ssl_enabled;
3622             // "notls" option is valid only when memcached is run with SSL enabled.
3623             if (strncmp(p, notls, strlen(notls)) == 0) {
3624                 if (!settings.ssl_enabled) {
3625                     fprintf(stderr, "'notls' option is valid only when SSL is enabled\n");
3626                     free(list);
3627                     return 1;
3628                 }
3629                 ssl_enabled = false;
3630                 p += strlen(notls) + 1;
3631             }
3632 #endif
3633 
3634             char *h = NULL;
3635             if (*p == '[') {
3636                 // expecting it to be an IPv6 address enclosed in []
3637                 // i.e. RFC3986 style recommended by RFC5952
3638                 char *e = strchr(p, ']');
3639                 if (e == NULL) {
3640                     fprintf(stderr, "Invalid IPV6 address: \"%s\"", p);
3641                     free(list);
3642                     return 1;
3643                 }
3644                 h = ++p; // skip the opening '['
3645                 *e = '\0';
3646                 p = ++e; // skip the closing ']'
3647             }
3648 
3649             char *s = strchr(p, ':');
3650             if (s != NULL) {
3651                 // If no more semicolons - attempt to treat as port number.
3652                 // Otherwise the only valid option is an unenclosed IPv6 without port, until
3653                 // of course there was an RFC3986 IPv6 address previously specified -
3654                 // in such a case there is no good option, will just send it to fail as port number.
3655                 if (strchr(s + 1, ':') == NULL || h != NULL) {
3656                     *s = '\0';
3657                     ++s;
3658                     if (!safe_strtol(s, &the_port)) {
3659                         fprintf(stderr, "Invalid port number: \"%s\"", s);
3660                         free(list);
3661                         return 1;
3662                     }
3663                 }
3664             }
3665 
3666             if (h != NULL)
3667                 p = h;
3668 
3669             if (strcmp(p, "*") == 0) {
3670                 p = NULL;
3671             }
3672             ret |= server_socket(p, the_port, transport, portnumber_file, ssl_enabled);
3673         }
3674         free(list);
3675         return ret;
3676     }
3677 }
3678 
3679 #ifndef DISABLE_UNIX_SOCKET
/*
 * Creates a non-blocking AF_UNIX stream socket.
 *
 * Returns the descriptor on success. On failure the error is reported with
 * perror(), any descriptor that was opened is closed, and -1 is returned.
 */
static int new_socket_unix(void) {
    const int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sfd == -1) {
        perror("socket()");
        return -1;
    }

    /* Read the current status flags, then add O_NONBLOCK to them. */
    const int flags = fcntl(sfd, F_GETFL, 0);
    if (flags >= 0 && fcntl(sfd, F_SETFL, flags | O_NONBLOCK) >= 0) {
        return sfd;
    }

    perror("setting O_NONBLOCK");
    close(sfd);
    return -1;
}
3697 
3698 static int server_socket_unix(const char *path, int access_mask) {
3699     int sfd;
3700     struct linger ling = {0, 0};
3701     struct sockaddr_un addr;
3702     struct stat tstat;
3703     int flags =1;
3704     int old_umask;
3705 
3706     if (!path) {
3707         return 1;
3708     }
3709 
3710     if ((sfd = new_socket_unix()) == -1) {
3711         return 1;
3712     }
3713 
3714     /*
3715      * Clean up a previous socket file if we left it around
3716      */
3717     if (lstat(path, &tstat) == 0) {
3718         if (S_ISSOCK(tstat.st_mode))
3719             unlink(path);
3720     }
3721 
3722     setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, (void *)&flags, sizeof(flags));
3723     setsockopt(sfd, SOL_SOCKET, SO_KEEPALIVE, (void *)&flags, sizeof(flags));
3724     setsockopt(sfd, SOL_SOCKET, SO_LINGER, (void *)&ling, sizeof(ling));
3725 
3726     /*
3727      * the memset call clears nonstandard fields in some implementations
3728      * that otherwise mess things up.
3729      */
3730     memset(&addr, 0, sizeof(addr));
3731 
3732     addr.sun_family = AF_UNIX;
3733     strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
3734     assert(strcmp(addr.sun_path, path) == 0);
3735     old_umask = umask( ~(access_mask&0777));
3736     if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
3737         perror("bind()");
3738         close(sfd);
3739         umask(old_umask);
3740         return 1;
3741     }
3742     umask(old_umask);
3743     if (listen(sfd, settings.backlog) == -1) {
3744         perror("listen()");
3745         close(sfd);
3746         return 1;
3747     }
3748     if (!(listen_conn = conn_new(sfd, conn_listening,
3749                                  EV_READ | EV_PERSIST, 1,
3750                                  local_transport, main_base, NULL))) {
3751         fprintf(stderr, "failed to create listening connection\n");
3752         exit(EXIT_FAILURE);
3753     }
3754 
3755     return 0;
3756 }
3757 #else
3758 #define server_socket_unix(path, access_mask)   -1
3759 #endif /* #ifndef DISABLE_UNIX_SOCKET */
3760 
/*
 * We keep the current time of day in a global variable that's updated by a
 * timer event. This saves us a bunch of time() system calls (we really only
 * need to get the time once a second, whereas there can be tens of thousands
 * of requests a second) and allows us to use server-start-relative timestamps
 * rather than absolute UNIX timestamps, a space savings on systems where
 * sizeof(time_t) > sizeof(unsigned int).
 */
/* Seconds elapsed relative to the start reference; written by clock_handler(). */
volatile rel_time_t current_time;
/* Timer event that clock_handler() re-arms with a one second timeout. */
static struct event clockevent;
#ifdef MEMCACHED_DEBUG
/* Debug builds only: while set, clock_handler() leaves current_time untouched. */
volatile bool is_paused;
/* Debug builds only: offset in seconds added to the computed current_time. */
volatile int64_t delta;
#endif
#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_MONOTONIC)
/* When true, clock_handler() derives the time from clock_gettime(CLOCK_MONOTONIC)
 * instead of gettimeofday(). */
static bool monotonic = false;
/* Monotonic-clock seconds value subtracted to obtain current_time. */
static int64_t monotonic_start;
#endif
3779 
3780 /* libevent uses a monotonic clock when available for event scheduling. Aside
3781  * from jitter, simply ticking our internal timer here is accurate enough.
3782  * Note that users who are setting explicit dates for expiration times *must*
3783  * ensure their clocks are correct before starting memcached. */
3784 static void clock_handler(const evutil_socket_t fd, const short which, void *arg) {
3785     struct timeval t = {.tv_sec = 1, .tv_usec = 0};
3786     static bool initialized = false;
3787 
3788     if (initialized) {
3789         /* only delete the event if it's actually there. */
3790         evtimer_del(&clockevent);
3791     } else {
3792         initialized = true;
3793     }
3794 
3795     // While we're here, check for hash table expansion.
3796     // This function should be quick to avoid delaying the timer.
3797     assoc_start_expand(stats_state.curr_items);
3798     // also, if HUP'ed we need to do some maintenance.
3799     // for now that's just the authfile reload.
3800     if (settings.sig_hup) {
3801         settings.sig_hup = false;
3802 
3803         authfile_load(settings.auth_file);
3804     }
3805 
3806     evtimer_set(&clockevent, clock_handler, 0);
3807     event_base_set(main_base, &clockevent);
3808     evtimer_add(&clockevent, &t);
3809 
3810 #ifdef MEMCACHED_DEBUG
3811     if (is_paused) return;
3812 #endif
3813 
3814 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_MONOTONIC)
3815     if (monotonic) {
3816         struct timespec ts;
3817         if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
3818             return;
3819 #ifdef MEMCACHED_DEBUG
3820         current_time = (rel_time_t) (ts.tv_sec - monotonic_start + delta);
3821 #else
3822         current_time = (rel_time_t) (ts.tv_sec - monotonic_start);
3823 #endif
3824         return;
3825     }
3826 #endif
3827     {
3828         struct timeval tv;
3829         gettimeofday(&tv, NULL);
3830 #ifdef MEMCACHED_DEBUG
3831         current_time = (rel_time_t) (tv.tv_sec - process_started + delta);
3832 #else
3833         current_time = (rel_time_t) (tv.tv_sec - process_started);
3834 #endif
3835     }
3836 }
3837 
/* Maps a boolean setting to the word printed for it in the help text. */
static const char* flag_enabled_disabled(bool flag) {
    if (flag) {
        return "enabled";
    }
    return "disabled";
}
3841 
/*
 * Guards a default value that is described in the help text.
 *
 * param     - name of the setting, used in the message
 * condition - true when the setting still has the documented default
 *
 * Returns normally when condition holds; otherwise prints a reminder to
 * update the help text and exits with EXIT_FAILURE.
 */
static void verify_default(const char* param, bool condition) {
    if (condition) {
        return;
    }

    printf("Default value of [%s] has changed."
        " Modify the help text and default value check.\n", param);
    exit(EXIT_FAILURE);
}
3849 
3850 static void usage(void) {
3851     printf(PACKAGE " " VERSION "\n");
3852     printf("-p, --port=<num>          TCP port to listen on (default: %d)\n"
3853            "-U, --udp-port=<num>      UDP port to listen on (default: %d, off)\n",
3854            settings.port, settings.udpport);
3855 #ifndef DISABLE_UNIX_SOCKET
3856     printf("-s, --unix-socket=<file>  UNIX socket to listen on (disables network support)\n");
3857     printf("-a, --unix-mask=<mask>    access mask for UNIX socket, in octal (default: %o)\n",
3858             settings.access);
3859 #endif /* #ifndef DISABLE_UNIX_SOCKET */
3860     printf("-A, --enable-shutdown     enable ascii \"shutdown\" command\n");
3861     printf("-l, --listen=<addr>       interface to listen on (default: INADDR_ANY)\n");
3862 #ifdef TLS
3863     printf("                          if TLS/SSL is enabled, 'notls' prefix can be used to\n"
3864            "                          disable for specific listeners (-l notls:<ip>:<port>) \n");
3865 #endif
3866     printf("-d, --daemon              run as a daemon\n"
3867            "-r, --enable-coredumps    maximize core file limit\n"
3868            "-u, --user=<user>         assume identity of <username> (only when run as root)\n"
3869            "-m, --memory-limit=<num>  item memory in megabytes (default: %lu)\n"
3870            "-M, --disable-evictions   return error on memory exhausted instead of evicting\n"
3871            "-c, --conn-limit=<num>    max simultaneous connections (default: %d)\n"
3872            "-k, --lock-memory         lock down all paged memory\n"
3873            "-v, --verbose             verbose (print errors/warnings while in event loop)\n"
3874            "-vv                       very verbose (also print client commands/responses)\n"
3875            "-vvv                      extremely verbose (internal state transitions)\n"
3876            "-h, --help                print this help and exit\n"
3877            "-i, --license             print memcached and libevent license\n"
3878            "-V, --version             print version and exit\n"
3879            "-P, --pidfile=<file>      save PID in <file>, only used with -d option\n"
3880            "-f, --slab-growth-factor=<num> chunk size growth factor (default: %2.2f)\n"
3881            "-n, --slab-min-size=<bytes> min space used for key+value+flags (default: %d)\n",
3882            (unsigned long) settings.maxbytes / (1 << 20),
3883            settings.maxconns, settings.factor, settings.chunk_size);
3884     verify_default("udp-port",settings.udpport == 0);
3885     printf("-L, --enable-largepages  try to use large memory pages (if available)\n");
3886     printf("-D <char>     Use <char> as the delimiter between key prefixes and IDs.\n"
3887            "              This is used for per-prefix stats reporting. The default is\n"
3888            "              \"%c\" (colon). If this option is specified, stats collection\n"
3889            "              is turned on automatically; if not, then it may be turned on\n"
3890            "              by sending the \"stats detail on\" command to the server.\n",
3891            settings.prefix_delimiter);
3892     printf("-t, --threads=<num>       number of threads to use (default: %d)\n", settings.num_threads);
3893     printf("-R, --max-reqs-per-event  maximum number of requests per event, limits the\n"
3894            "                          requests processed per connection to prevent \n"
3895            "                          starvation (default: %d)\n", settings.reqs_per_event);
3896     printf("-C, --disable-cas         disable use of CAS\n");
3897     printf("-b, --listen-backlog=<num> set the backlog queue limit (default: %d)\n", settings.backlog);
3898     printf("-B, --protocol=<name>     protocol - one of ascii, binary, or auto (default: %s)\n",
3899            prot_text(settings.binding_protocol));
3900     printf("-I, --max-item-size=<num> adjusts max item size\n"
3901            "                          (default: %dm, min: %dk, max: %dm)\n",
3902            settings.item_size_max/ (1 << 20), ITEM_SIZE_MAX_LOWER_LIMIT / (1 << 10),  ITEM_SIZE_MAX_UPPER_LIMIT / (1 << 20));
3903 #ifdef ENABLE_SASL
3904     printf("-S, --enable-sasl         turn on Sasl authentication\n");
3905 #endif
3906     printf("-F, --disable-flush-all   disable flush_all command\n");
3907     printf("-X, --disable-dumping     disable stats cachedump and lru_crawler metadump\n");
3908     printf("-W  --disable-watch       disable watch commands (live logging)\n");
3909     printf("-Y, --auth-file=<file>    (EXPERIMENTAL) enable ASCII protocol authentication. format:\n"
3910            "                          user:pass\\nuser2:pass2\\n\n");
3911     printf("-e, --memory-file=<file>  (EXPERIMENTAL) mmap a file for item memory.\n"
3912            "                          use only in ram disks or persistent memory mounts!\n"
3913            "                          enables restartable cache (stop with SIGUSR1)\n");
3914 #ifdef TLS
3915     printf("-Z, --enable-ssl          enable TLS/SSL\n");
3916 #endif
3917     printf("-o, --extended            comma separated list of extended options\n"
3918            "                          most options have a 'no_' prefix to disable\n"
3919            "   - maxconns_fast:       immediately close new connections after limit (default: %s)\n"
3920            "   - hashpower:           an integer multiplier for how large the hash\n"
3921            "                          table should be. normally grows at runtime. (default starts at: %d)\n"
3922            "                          set based on \"STAT hash_power_level\"\n"
3923            "   - tail_repair_time:    time in seconds for how long to wait before\n"
3924            "                          forcefully killing LRU tail item.\n"
3925            "                          disabled by default; very dangerous option.\n"
3926            "   - hash_algorithm:      the hash table algorithm\n"
3927            "                          default is murmur3 hash. options: jenkins, murmur3, xxh3\n"
3928            "   - no_lru_crawler:      disable LRU Crawler background thread.\n"
3929            "   - lru_crawler_sleep:   microseconds to sleep between items\n"
3930            "                          default is %d.\n"
3931            "   - lru_crawler_tocrawl: max items to crawl per slab per run\n"
3932            "                          default is %u (unlimited)\n",
3933            flag_enabled_disabled(settings.maxconns_fast), settings.hashpower_init,
3934            settings.lru_crawler_sleep, settings.lru_crawler_tocrawl);
3935     printf("   - read_buf_mem_limit:  limit in megabytes for connection read/response buffers.\n"
3936            "                          do not adjust unless you have high (20k+) conn. limits.\n"
3937            "                          0 means unlimited (default: %u)\n",
3938            settings.read_buf_mem_limit);
3939     verify_default("read_buf_mem_limit", settings.read_buf_mem_limit == 0);
3940     printf("   - no_lru_maintainer:   disable new LRU system + background thread.\n"
3941            "   - hot_lru_pct:         pct of slab memory to reserve for hot lru.\n"
3942            "                          (requires lru_maintainer, default pct: %d)\n"
3943            "   - warm_lru_pct:        pct of slab memory to reserve for warm lru.\n"
3944            "                          (requires lru_maintainer, default pct: %d)\n"
3945            "   - hot_max_factor:      items idle > cold lru age * drop from hot lru. (default: %.2f)\n"
3946            "   - warm_max_factor:     items idle > cold lru age * this drop from warm. (default: %.2f)\n"
3947            "   - temporary_ttl:       TTL's below get separate LRU, can't be evicted.\n"
3948            "                          (requires lru_maintainer, default: %d)\n"
3949            "   - idle_timeout:        timeout for idle connections. (default: %d, no timeout)\n",
3950            settings.hot_lru_pct, settings.warm_lru_pct, settings.hot_max_factor, settings.warm_max_factor,
3951            settings.temporary_ttl, settings.idle_timeout);
3952     printf("   - slab_chunk_max:      (EXPERIMENTAL) maximum slab size in kilobytes. use extreme care. (default: %d)\n"
3953            "   - watcher_logbuf_size: size in kilobytes of per-watcher write buffer. (default: %u)\n"
3954            "   - worker_logbuf_size:  size in kilobytes of per-worker-thread buffer\n"
3955            "                          read by background thread, then written to watchers. (default: %u)\n"
3956            "   - track_sizes:         enable dynamic reports for 'stats sizes' command.\n"
3957            "   - no_hashexpand:       disables hash table expansion (dangerous)\n"
3958            "   - modern:              enables options which will be default in future.\n"
3959            "                          currently: nothing\n"
3960            "   - no_modern:           uses defaults of previous major version (1.4.x)\n",
3961            settings.slab_chunk_size_max / (1 << 10), settings.logger_watcher_buf_size / (1 << 10),
3962            settings.logger_buf_size / (1 << 10));
3963     verify_default("tail_repair_time", settings.tail_repair_time == TAIL_REPAIR_TIME_DEFAULT);
3964     verify_default("lru_crawler_tocrawl", settings.lru_crawler_tocrawl == 0);
3965     verify_default("idle_timeout", settings.idle_timeout == 0);
3966 #ifdef HAVE_DROP_PRIVILEGES
3967     printf("   - drop_privileges:     enable dropping extra syscall privileges\n"
3968            "   - no_drop_privileges:  disable drop_privileges in case it causes issues with\n"
3969            "                          some customisation.\n"
3970            "                          (default is no_drop_privileges)\n");
3971     verify_default("drop_privileges", !settings.drop_privileges);
3972 #ifdef MEMCACHED_DEBUG
3973     printf("   - relaxed_privileges:  running tests requires extra privileges. (default: %s)\n",
3974            flag_enabled_disabled(settings.relaxed_privileges));
3975 #endif
3976 #endif
3977 #ifdef EXTSTORE
3978     printf("\n   - External storage (ext_*) related options (see: https://memcached.org/extstore)\n");
3979     printf("   - ext_path:            file to write to for external storage.\n"
3980            "                          ie: ext_path=/mnt/d1/extstore:1G\n"
3981            "   - ext_page_size:       size in megabytes of storage pages. (default: %u)\n"
3982            "   - ext_wbuf_size:       size in megabytes of page write buffers. (default: %u)\n"
3983            "   - ext_threads:         number of IO threads to run. (default: %u)\n"
3984            "   - ext_item_size:       store items larger than this (bytes, default %u)\n"
3985            "   - ext_item_age:        store items idle at least this long (seconds, default: no age limit)\n"
3986            "   - ext_low_ttl:         consider TTLs lower than this specially (default: %u)\n"
3987            "   - ext_drop_unread:     don't re-write unread values during compaction (default: %s)\n"
3988            "   - ext_recache_rate:    recache an item every N accesses (default: %u)\n"
3989            "   - ext_compact_under:   compact when fewer than this many free pages\n"
3990            "                          (default: 1/4th of the assigned storage)\n"
3991            "   - ext_drop_under:      drop COLD items when fewer than this many free pages\n"
3992            "                          (default: 1/4th of the assigned storage)\n"
3993            "   - ext_max_frag:        max page fragmentation to tolerate (default: %.2f)\n"
3994            "   - slab_automove_freeratio: ratio of memory to hold free as buffer.\n"
3995            "                          (see doc/storage.txt for more info, default: %.3f)\n",
3996            settings.ext_page_size / (1 << 20), settings.ext_wbuf_size / (1 << 20), settings.ext_io_threadcount,
3997            settings.ext_item_size, settings.ext_low_ttl,
3998            flag_enabled_disabled(settings.ext_drop_unread), settings.ext_recache_rate,
3999            settings.ext_max_frag, settings.slab_automove_freeratio);
4000     verify_default("ext_item_age", settings.ext_item_age == UINT_MAX);
4001 #endif
4002 #ifdef TLS
4003     printf("   - ssl_chain_cert:      certificate chain file in PEM format\n"
4004            "   - ssl_key:             private key, if not part of the -ssl_chain_cert\n"
4005            "   - ssl_keyformat:       private key format (PEM, DER or ENGINE) (default: PEM)\n");
4006     printf("   - ssl_verify_mode:     peer certificate verification mode, default is 0(None).\n"
4007            "                          valid values are 0(None), 1(Request), 2(Require)\n"
4008            "                          or 3(Once)\n");
4009     printf("   - ssl_ciphers:         specify cipher list to be used\n"
4010            "   - ssl_ca_cert:         PEM format file of acceptable client CA's\n"
4011            "   - ssl_wbuf_size:       size in kilobytes of per-connection SSL output buffer\n"
4012            "                          (default: %u)\n", settings.ssl_wbuf_size / (1 << 10));
4013     printf("   - ssl_session_cache:   enable server-side SSL session cache, to support session\n"
4014            "                          resumption\n"
4015            "   - ssl_min_version:     minimum protocol version to accept (default: %s)\n"
4016 #if OPENSSL_VERSION_NUMBER >= 0x10101000L
4017            "                          valid values are 0(%s), 1(%s), 2(%s), or 3(%s).\n",
4018            ssl_proto_text(settings.ssl_min_version),
4019            ssl_proto_text(TLS1_VERSION), ssl_proto_text(TLS1_1_VERSION),
4020            ssl_proto_text(TLS1_2_VERSION), ssl_proto_text(TLS1_3_VERSION));
4021 #else
4022            "                          valid values are 0(%s), 1(%s), or 2(%s).\n",
4023            ssl_proto_text(settings.ssl_min_version),
4024            ssl_proto_text(TLS1_VERSION), ssl_proto_text(TLS1_1_VERSION),
4025            ssl_proto_text(TLS1_2_VERSION));
4026 #endif
4027     verify_default("ssl_keyformat", settings.ssl_keyformat == SSL_FILETYPE_PEM);
4028     verify_default("ssl_verify_mode", settings.ssl_verify_mode == SSL_VERIFY_NONE);
4029     verify_default("ssl_min_version", settings.ssl_min_version == TLS1_2_VERSION);
4030 #endif
4031     printf("-N, --napi_ids            number of napi ids. see doc/napi_ids.txt for more details\n");
4032     return;
4033 }
4034 
4035 static void usage_license(void) {
4036     printf(PACKAGE " " VERSION "\n\n");
4037     printf(
4038     "Copyright (c) 2003, Danga Interactive, Inc. <http://www.danga.com/>\n"
4039     "All rights reserved.\n"
4040     "\n"
4041     "Redistribution and use in source and binary forms, with or without\n"
4042     "modification, are permitted provided that the following conditions are\n"
4043     "met:\n"
4044     "\n"
4045     "    * Redistributions of source code must retain the above copyright\n"
4046     "notice, this list of conditions and the following disclaimer.\n"
4047     "\n"
4048     "    * Redistributions in binary form must reproduce the above\n"
4049     "copyright notice, this list of conditions and the following disclaimer\n"
4050     "in the documentation and/or other materials provided with the\n"
4051     "distribution.\n"
4052     "\n"
4053     "    * Neither the name of the Danga Interactive nor the names of its\n"
4054     "contributors may be used to endorse or promote products derived from\n"
4055     "this software without specific prior written permission.\n"
4056     "\n"
4057     "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n"
4058     "\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n"
4059     "LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n"
4060     "A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n"
4061     "OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n"
4062     "SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n"
4063     "LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
4064     "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
4065     "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
4066     "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n"
4067     "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
4068     "\n"
4069     "\n"
4070     "This product includes software developed by Niels Provos.\n"
4071     "\n"
4072     "[ libevent ]\n"
4073     "\n"
4074     "Copyright 2000-2003 Niels Provos <provos@citi.umich.edu>\n"
4075     "All rights reserved.\n"
4076     "\n"
4077     "Redistribution and use in source and binary forms, with or without\n"
4078     "modification, are permitted provided that the following conditions\n"
4079     "are met:\n"
4080     "1. Redistributions of source code must retain the above copyright\n"
4081     "   notice, this list of conditions and the following disclaimer.\n"
4082     "2. Redistributions in binary form must reproduce the above copyright\n"
4083     "   notice, this list of conditions and the following disclaimer in the\n"
4084     "   documentation and/or other materials provided with the distribution.\n"
4085     "3. All advertising materials mentioning features or use of this software\n"
4086     "   must display the following acknowledgement:\n"
4087     "      This product includes software developed by Niels Provos.\n"
4088     "4. The name of the author may not be used to endorse or promote products\n"
4089     "   derived from this software without specific prior written permission.\n"
4090     "\n"
4091     "THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR\n"
4092     "IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES\n"
4093     "OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\n"
4094     "IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,\n"
4095     "INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT\n"
4096     "NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n"
4097     "DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n"
4098     "THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n"
4099     "(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF\n"
4100     "THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
4101     );
4102 
4103     return;
4104 }
4105 
/*
 * Record this process's pid in pid_file.
 *
 * If pid_file already exists and its first line parses as a pid for which
 * kill(pid, 0) succeeds, a warning is printed to stderr; the file is still
 * replaced afterwards. The pid is written to "<pid_file>.tmp" and then
 * renamed over pid_file. Failures are reported and otherwise ignored;
 * nothing is returned to the caller.
 */
static void save_pid(const char *pid_file) {
    FILE *fp;
    if (access(pid_file, F_OK) == 0) {
        if ((fp = fopen(pid_file, "r")) != NULL) {
            char buffer[1024];
            if (fgets(buffer, sizeof(buffer), fp) != NULL) {
                unsigned int pid;
                /* kill() with signal 0 only probes whether the pid exists. */
                if (safe_strtoul(buffer, &pid) && kill((pid_t)pid, 0) == 0) {
                    fprintf(stderr, "WARNING: The pid file contained the following (running) pid: %u\n", pid);
                }
            }
            fclose(fp);
        }
    }

    /* Create the pid file first with a temporary name, then
     * atomically move the file to the real name to avoid a race with
     * another process opening the file to read the pid, but finding
     * it empty.
     */
    char tmp_pid_file[1024];
    int tmp_len = snprintf(tmp_pid_file, sizeof(tmp_pid_file), "%s.tmp", pid_file);
    /* A truncated temporary name would make us create and rename the wrong
     * path, so refuse to continue instead of silently using it. */
    if (tmp_len < 0 || (size_t)tmp_len >= sizeof(tmp_pid_file)) {
        fprintf(stderr, "Could not create the pid file: path %s is too long\n", pid_file);
        return;
    }

    if ((fp = fopen(tmp_pid_file, "w")) == NULL) {
        vperror("Could not open the pid file %s for writing", tmp_pid_file);
        return;
    }

    fprintf(fp,"%ld\n", (long)getpid());
    if (fclose(fp) == -1) {
        vperror("Could not close the pid file %s", tmp_pid_file);
    }

    if (rename(tmp_pid_file, pid_file) != 0) {
        vperror("Could not rename the pid file from %s to %s",
                tmp_pid_file, pid_file);
    }
}
4144 
/*
 * Unlink the pid file. A NULL pid_file is a no-op; an unlink() failure is
 * reported through vperror() and otherwise ignored.
 */
static void remove_pidfile(const char *pid_file) {
    if (pid_file != NULL && unlink(pid_file) != 0) {
        vperror("Could not remove the pid file %s", pid_file);
    }
}
4154 
/*
 * Handler for a termination-style signal: sets stop_main_loop to
 * EXIT_NORMALLY and prints the signal's description to stdout.
 * NOTE(review): printf() and strsignal() are not on POSIX's list of
 * async-signal-safe functions; confirm this is acceptable for wherever
 * this handler gets installed.
 */
static void sig_handler(const int sig) {
    stop_main_loop = EXIT_NORMALLY;
    printf("Signal handled: %s.\n", strsignal(sig));
}
4159 
/*
 * Handler that only raises the settings.sig_hup flag; the signal number
 * argument is not used. Whatever reacts to the flag lives outside this
 * function.
 */
static void sighup_handler(const int sig) {
    settings.sig_hup = true;
}
4163 
/*
 * Handler for the graceful-shutdown signal: prints the signal's description
 * to stdout and sets stop_main_loop to GRACE_STOP.
 * NOTE(review): printf() and strsignal() are not on POSIX's list of
 * async-signal-safe functions; confirm this is acceptable for wherever
 * this handler gets installed.
 */
static void sig_usrhandler(const int sig) {
    printf("Graceful shutdown signal handled: %s.\n", strsignal(sig));
    stop_main_loop = GRACE_STOP;
}
4168 
4169 /*
4170  * On systems that supports multiple page sizes we may reduce the
4171  * number of TLB-misses by using the biggest available page size
4172  */
4173 static int enable_large_pages(void) {
4174 #if defined(HAVE_GETPAGESIZES) && defined(HAVE_MEMCNTL)
4175     int ret = -1;
4176     size_t sizes[32];
4177     int avail = getpagesizes(sizes, 32);
4178     if (avail != -1) {
4179         size_t max = sizes[0];
4180         struct memcntl_mha arg = {0};
4181         int ii;
4182 
4183         for (ii = 1; ii < avail; ++ii) {
4184             if (max < sizes[ii]) {
4185                 max = sizes[ii];
4186             }
4187         }
4188 
4189         arg.mha_flags   = 0;
4190         arg.mha_pagesize = max;
4191         arg.mha_cmd = MHA_MAPSIZE_BSSBRK;
4192 
4193         if (memcntl(0, 0, MC_HAT_ADVISE, (caddr_t)&arg, 0, 0) == -1) {
4194             fprintf(stderr, "Failed to set large pages: %s\n",
4195                     strerror(errno));
4196             fprintf(stderr, "Will use default page size\n");
4197         } else {
4198             ret = 0;
4199         }
4200     } else {
4201         fprintf(stderr, "Failed to get supported pagesizes: %s\n",
4202                 strerror(errno));
4203         fprintf(stderr, "Will use default page size\n");
4204     }
4205 
4206     return ret;
4207 #elif defined(__linux__) && defined(MADV_HUGEPAGE)
4208     /* check if transparent hugepages is compiled into the kernel */
4209     struct stat st;
4210     int ret = stat("/sys/kernel/mm/transparent_hugepage/enabled", &st);
4211     if (ret || !(st.st_mode & S_IFREG)) {
4212         fprintf(stderr, "Transparent huge pages support not detected.\n");
4213         fprintf(stderr, "Will use default page size.\n");
4214         return -1;
4215     }
4216     return 0;
4217 #elif defined(__FreeBSD__)
4218     int spages;
4219     size_t spagesl = sizeof(spages);
4220 
4221     if (sysctlbyname("vm.pmap.pg_ps_enabled", &spages,
4222     &spagesl, NULL, 0) != 0) {
4223         fprintf(stderr, "Could not evaluate the presence of superpages features.");
4224         return -1;
4225     }
4226     if (spages != 1) {
4227         fprintf(stderr, "Superpages support not detected.\n");
4228         fprintf(stderr, "Will use default page size.\n");
4229         return -1;
4230     }
4231     return 0;
4232 #else
4233     return -1;
4234 #endif
4235 }
4236 
4237 /**
4238  * Do basic sanity check of the runtime environment
4239  * @return true if no errors found, false if we can't use this env
4240  */
4241 static bool sanitycheck(void) {
4242     /* One of our biggest problems is old and bogus libevents */
4243     const char *ever = event_get_version();
4244     if (ever != NULL) {
4245         if (strncmp(ever, "1.", 2) == 0) {
4246             fprintf(stderr, "You are using libevent %s.\nPlease upgrade to 2.x"
4247                         " or newer\n", event_get_version());
4248             return false;
4249         }
4250     }
4251 
4252     return true;
4253 }
4254 
4255 static bool _parse_slab_sizes(char *s, uint32_t *slab_sizes) {
4256     char *b = NULL;
4257     uint32_t size = 0;
4258     int i = 0;
4259     uint32_t last_size = 0;
4260 
4261     if (strlen(s) < 1)
4262         return false;
4263 
4264     for (char *p = strtok_r(s, "-", &b);
4265          p != NULL;
4266          p = strtok_r(NULL, "-", &b)) {
4267         if (!safe_strtoul(p, &size) || size < settings.chunk_size
4268              || size > settings.slab_chunk_size_max) {
4269             fprintf(stderr, "slab size %u is out of valid range\n", size);
4270             return false;
4271         }
4272         if (last_size >= size) {
4273             fprintf(stderr, "slab size %u cannot be lower than or equal to a previous class size\n", size);
4274             return false;
4275         }
4276         if (size <= last_size + CHUNK_ALIGN_BYTES) {
4277             fprintf(stderr, "slab size %u must be at least %d bytes larger than previous class\n",
4278                     size, CHUNK_ALIGN_BYTES);
4279             return false;
4280         }
4281         slab_sizes[i++] = size;
4282         last_size = size;
4283         if (i >= MAX_NUMBER_OF_SLAB_CLASSES-1) {
4284             fprintf(stderr, "too many slab classes specified\n");
4285             return false;
4286         }
4287     }
4288 
4289     slab_sizes[i] = 0;
4290     return true;
4291 }
4292 
// State shared between main() and the restart metadata save/load callbacks.
struct _mc_meta_data {
    void *mmap_base; // written out as "mmap_oldbase" by the save callback.
    uint64_t old_base; // "mmap_oldbase" value parsed back in by the load callback.
    char *slab_config; // string containing either factor or custom slab list.
    int64_t time_delta; // set by the load callback: current tv_sec minus the saved "stop_time".
    uint64_t process_started; // saved "process_started" value, filled in by the load callback.
    uint32_t current_time; // saved "current_time" value, filled in by the load callback.
};
4301 
4302 // We need to remember a combination of configuration settings and global
4303 // state for restart viability and resumption of internal services.
4304 // Compared to the number of tunables and state values, relatively little
4305 // does need to be remembered.
4306 // Time is the hardest; we have to assume the sys clock is correct and re-sync for
4307 // the lost time after restart.
4308 static int _mc_meta_save_cb(const char *tag, void *ctx, void *data) {
4309     struct _mc_meta_data *meta = (struct _mc_meta_data *)data;
4310 
4311     // Settings to remember.
4312     // TODO: should get a version of version which is numeric, else
4313     // comparisons for compat reasons are difficult.
4314     // it may be possible to punt on this for now; since we can test for the
4315     // absence of another key... such as the new numeric version.
4316     //restart_set_kv(ctx, "version", "%s", VERSION);
4317     // We hold the original factor or subopts _string_
4318     // it can be directly compared without roundtripping through floats or
4319     // serializing/deserializing the long options list.
4320     restart_set_kv(ctx, "slab_config", "%s", meta->slab_config);
4321     restart_set_kv(ctx, "maxbytes", "%llu", (unsigned long long) settings.maxbytes);
4322     restart_set_kv(ctx, "chunk_size", "%d", settings.chunk_size);
4323     restart_set_kv(ctx, "item_size_max", "%d", settings.item_size_max);
4324     restart_set_kv(ctx, "slab_chunk_size_max", "%d", settings.slab_chunk_size_max);
4325     restart_set_kv(ctx, "slab_page_size", "%d", settings.slab_page_size);
4326     restart_set_kv(ctx, "use_cas", "%s", settings.use_cas ? "true" : "false");
4327     restart_set_kv(ctx, "slab_reassign", "%s", settings.slab_reassign ? "true" : "false");
4328 
4329     // Online state to remember.
4330 
4331     // current time is tough. we need to rely on the clock being correct to
4332     // pull the delta between stop and start times. we also need to know the
4333     // delta between start time and now to restore monotonic clocks.
4334     // for non-monotonic clocks (some OS?), process_started is the only
4335     // important one.
4336     restart_set_kv(ctx, "current_time", "%u", current_time);
4337     // types are great until... this. some systems time_t could be big, but
4338     // I'm assuming never negative.
4339     restart_set_kv(ctx, "process_started", "%llu", (unsigned long long) process_started);
4340     {
4341         struct timeval tv;
4342         gettimeofday(&tv, NULL);
4343         restart_set_kv(ctx, "stop_time", "%lu", tv.tv_sec);
4344     }
4345 
4346     // Might as well just fetch the next CAS value to use than tightly
4347     // coupling the internal variable into the restart system.
4348     restart_set_kv(ctx, "current_cas", "%llu", (unsigned long long) get_cas_id());
4349     restart_set_kv(ctx, "oldest_cas", "%llu", (unsigned long long) settings.oldest_cas);
4350     restart_set_kv(ctx, "logger_gid", "%llu", logger_get_gid());
4351     restart_set_kv(ctx, "hashpower", "%u", stats_state.hash_power_level);
4352     // NOTE: oldest_live is a rel_time_t, which aliases for unsigned int.
4353     // should future proof this with a 64bit upcast, or fetch value from a
4354     // converter function/macro?
4355     restart_set_kv(ctx, "oldest_live", "%u", settings.oldest_live);
4356     // TODO: use uintptr_t etc? is it portable enough?
4357     restart_set_kv(ctx, "mmap_oldbase", "%p", meta->mmap_base);
4358 
4359     return 0;
4360 }
4361 
4362 // We must see at least this number of checked lines. Else empty/missing lines
4363 // could cause a false-positive.
4364 // TODO: Once crc32'ing of the metadata file is done this could be ensured better by
4365 // the restart module itself (crc32 + count of lines must match on the
4366 // backend)
4367 #define RESTART_REQUIRED_META 17
4368 
4369 // With this callback we make a decision on if the current configuration
4370 // matches up enough to allow reusing the cache.
4371 // We also re-load important runtime information.
4372 static int _mc_meta_load_cb(const char *tag, void *ctx, void *data) {
4373     struct _mc_meta_data *meta = (struct _mc_meta_data *)data;
4374     char *key;
4375     char *val;
4376     int reuse_mmap = 0;
4377     meta->process_started = 0;
4378     meta->time_delta = 0;
4379     meta->current_time = 0;
4380     int lines_seen = 0;
4381 
4382     // TODO: not sure this is any better than just doing an if/else tree with
4383     // strcmp's...
4384     enum {
4385         R_MMAP_OLDBASE = 0,
4386         R_MAXBYTES,
4387         R_CHUNK_SIZE,
4388         R_ITEM_SIZE_MAX,
4389         R_SLAB_CHUNK_SIZE_MAX,
4390         R_SLAB_PAGE_SIZE,
4391         R_SLAB_CONFIG,
4392         R_USE_CAS,
4393         R_SLAB_REASSIGN,
4394         R_CURRENT_CAS,
4395         R_OLDEST_CAS,
4396         R_OLDEST_LIVE,
4397         R_LOGGER_GID,
4398         R_CURRENT_TIME,
4399         R_STOP_TIME,
4400         R_PROCESS_STARTED,
4401         R_HASHPOWER,
4402     };
4403 
4404     const char *opts[] = {
4405         [R_MMAP_OLDBASE] = "mmap_oldbase",
4406         [R_MAXBYTES] = "maxbytes",
4407         [R_CHUNK_SIZE] = "chunk_size",
4408         [R_ITEM_SIZE_MAX] = "item_size_max",
4409         [R_SLAB_CHUNK_SIZE_MAX] = "slab_chunk_size_max",
4410         [R_SLAB_PAGE_SIZE] = "slab_page_size",
4411         [R_SLAB_CONFIG] = "slab_config",
4412         [R_USE_CAS] = "use_cas",
4413         [R_SLAB_REASSIGN] = "slab_reassign",
4414         [R_CURRENT_CAS] = "current_cas",
4415         [R_OLDEST_CAS] = "oldest_cas",
4416         [R_OLDEST_LIVE] = "oldest_live",
4417         [R_LOGGER_GID] = "logger_gid",
4418         [R_CURRENT_TIME] = "current_time",
4419         [R_STOP_TIME] = "stop_time",
4420         [R_PROCESS_STARTED] = "process_started",
4421         [R_HASHPOWER] = "hashpower",
4422         NULL
4423     };
4424 
4425     while (restart_get_kv(ctx, &key, &val) == RESTART_OK) {
4426         int type = 0;
4427         int32_t val_int = 0;
4428         uint32_t val_uint = 0;
4429         int64_t bigval_int = 0;
4430         uint64_t bigval_uint = 0;
4431 
4432         while (opts[type] != NULL && strcmp(key, opts[type]) != 0) {
4433             type++;
4434         }
4435         if (opts[type] == NULL) {
4436             fprintf(stderr, "[restart] unknown/unhandled key: %s\n", key);
4437             continue;
4438         }
4439         lines_seen++;
4440 
4441         // helper for any boolean checkers.
4442         bool val_bool = false;
4443         bool is_bool = true;
4444         if (strcmp(val, "false") == 0) {
4445             val_bool = false;
4446         } else if (strcmp(val, "true") == 0) {
4447             val_bool = true;
4448         } else {
4449             is_bool = false;
4450         }
4451 
4452         switch (type) {
4453         case R_MMAP_OLDBASE:
4454             if (!safe_strtoull_hex(val, &meta->old_base)) {
4455                 fprintf(stderr, "[restart] failed to parse %s: %s\n", key, val);
4456                 reuse_mmap = -1;
4457             }
4458             break;
4459         case R_MAXBYTES:
4460             if (!safe_strtoll(val, &bigval_int) || settings.maxbytes != bigval_int) {
4461                 reuse_mmap = -1;
4462             }
4463             break;
4464         case R_CHUNK_SIZE:
4465             if (!safe_strtol(val, &val_int) || settings.chunk_size != val_int) {
4466                 reuse_mmap = -1;
4467             }
4468             break;
4469         case R_ITEM_SIZE_MAX:
4470             if (!safe_strtol(val, &val_int) || settings.item_size_max != val_int) {
4471                 reuse_mmap = -1;
4472             }
4473             break;
4474         case R_SLAB_CHUNK_SIZE_MAX:
4475             if (!safe_strtol(val, &val_int) || settings.slab_chunk_size_max != val_int) {
4476                 reuse_mmap = -1;
4477             }
4478             break;
4479         case R_SLAB_PAGE_SIZE:
4480             if (!safe_strtol(val, &val_int) || settings.slab_page_size != val_int) {
4481                 reuse_mmap = -1;
4482             }
4483             break;
4484         case R_SLAB_CONFIG:
4485             if (strcmp(val, meta->slab_config) != 0) {
4486                 reuse_mmap = -1;
4487             }
4488             break;
4489         case R_USE_CAS:
4490             if (!is_bool || settings.use_cas != val_bool) {
4491                 reuse_mmap = -1;
4492             }
4493             break;
4494         case R_SLAB_REASSIGN:
4495             if (!is_bool || settings.slab_reassign != val_bool) {
4496                 reuse_mmap = -1;
4497             }
4498             break;
4499         case R_CURRENT_CAS:
4500             // FIXME: do we need to fail if these values _aren't_ found?
4501             if (!safe_strtoull(val, &bigval_uint)) {
4502                 reuse_mmap = -1;
4503             } else {
4504                 set_cas_id(bigval_uint);
4505             }
4506             break;
4507         case R_OLDEST_CAS:
4508             if (!safe_strtoull(val, &bigval_uint)) {
4509                 reuse_mmap = -1;
4510             } else {
4511                 settings.oldest_cas = bigval_uint;
4512             }
4513             break;
4514         case R_OLDEST_LIVE:
4515             if (!safe_strtoul(val, &val_uint)) {
4516                 reuse_mmap = -1;
4517             } else {
4518                 settings.oldest_live = val_uint;
4519             }
4520             break;
4521         case R_LOGGER_GID:
4522             if (!safe_strtoull(val, &bigval_uint)) {
4523                 reuse_mmap = -1;
4524             } else {
4525                 logger_set_gid(bigval_uint);
4526             }
4527             break;
4528         case R_PROCESS_STARTED:
4529             if (!safe_strtoull(val, &bigval_uint)) {
4530                 reuse_mmap = -1;
4531             } else {
4532                 meta->process_started = bigval_uint;
4533             }
4534             break;
4535         case R_CURRENT_TIME:
4536             if (!safe_strtoul(val, &val_uint)) {
4537                 reuse_mmap = -1;
4538             } else {
4539                 meta->current_time = val_uint;
4540             }
4541             break;
4542         case R_STOP_TIME:
4543             if (!safe_strtoll(val, &bigval_int)) {
4544                 reuse_mmap = -1;
4545             } else {
4546                 struct timeval t;
4547                 gettimeofday(&t, NULL);
4548                 meta->time_delta = t.tv_sec - bigval_int;
4549                 // clock has done something crazy.
4550                 // there are _lots_ of ways the clock can go wrong here, but
4551                 // this is a safe sanity check since there's nothing else we
4552                 // can realistically do.
4553                 if (meta->time_delta <= 0) {
4554                     reuse_mmap = -1;
4555                 }
4556             }
4557             break;
4558         case R_HASHPOWER:
4559             if (!safe_strtoul(val, &val_uint)) {
4560                 reuse_mmap = -1;
4561             } else {
4562                 settings.hashpower_init = val_uint;
4563             }
4564             break;
4565         default:
4566             fprintf(stderr, "[restart] unhandled key: %s\n", key);
4567         }
4568 
4569         if (reuse_mmap != 0) {
4570             fprintf(stderr, "[restart] restart incompatible due to setting for [%s] [old value: %s]\n", key, val);
4571             break;
4572         }
4573     }
4574 
4575     if (lines_seen < RESTART_REQUIRED_META) {
4576         fprintf(stderr, "[restart] missing some metadata lines\n");
4577         reuse_mmap = -1;
4578     }
4579 
4580     return reuse_mmap;
4581 }
4582 
4583 int main (int argc, char **argv) {
4584     int c;
4585     bool lock_memory = false;
4586     bool do_daemonize = false;
4587     bool preallocate = false;
4588     int maxcore = 0;
4589     char *username = NULL;
4590     char *pid_file = NULL;
4591     struct passwd *pw;
4592     struct rlimit rlim;
4593     char *buf;
4594     char unit = '\0';
4595     int size_max = 0;
4596     int retval = EXIT_SUCCESS;
4597     bool protocol_specified = false;
4598     bool tcp_specified = false;
4599     bool udp_specified = false;
4600     bool start_lru_maintainer = true;
4601     bool start_lru_crawler = true;
4602     bool start_assoc_maint = true;
4603     enum hashfunc_type hash_type = MURMUR3_HASH;
4604     uint32_t tocrawl;
4605     uint32_t slab_sizes[MAX_NUMBER_OF_SLAB_CLASSES];
4606     bool use_slab_sizes = false;
4607     char *slab_sizes_unparsed = NULL;
4608     bool slab_chunk_size_changed = false;
    // struct for restart code. Allocated up here so we can carry
    // important settings to save or validate.
4611     struct _mc_meta_data *meta = malloc(sizeof(struct _mc_meta_data));
4612     meta->slab_config = NULL;
4613     char *subopts, *subopts_orig;
4614     char *subopts_value;
4615     enum {
4616         MAXCONNS_FAST = 0,
4617         HASHPOWER_INIT,
4618         NO_HASHEXPAND,
4619         SLAB_REASSIGN,
4620         SLAB_AUTOMOVE,
4621         SLAB_AUTOMOVE_RATIO,
4622         SLAB_AUTOMOVE_WINDOW,
4623         TAIL_REPAIR_TIME,
4624         HASH_ALGORITHM,
4625         LRU_CRAWLER,
4626         LRU_CRAWLER_SLEEP,
4627         LRU_CRAWLER_TOCRAWL,
4628         LRU_MAINTAINER,
4629         HOT_LRU_PCT,
4630         WARM_LRU_PCT,
4631         HOT_MAX_FACTOR,
4632         WARM_MAX_FACTOR,
4633         TEMPORARY_TTL,
4634         IDLE_TIMEOUT,
4635         WATCHER_LOGBUF_SIZE,
4636         WORKER_LOGBUF_SIZE,
4637         SLAB_SIZES,
4638         SLAB_CHUNK_MAX,
4639         TRACK_SIZES,
4640         NO_INLINE_ASCII_RESP,
4641         MODERN,
4642         NO_MODERN,
4643         NO_CHUNKED_ITEMS,
4644         NO_SLAB_REASSIGN,
4645         NO_SLAB_AUTOMOVE,
4646         NO_MAXCONNS_FAST,
4647         INLINE_ASCII_RESP,
4648         NO_LRU_CRAWLER,
4649         NO_LRU_MAINTAINER,
4650         NO_DROP_PRIVILEGES,
4651         DROP_PRIVILEGES,
4652         RESP_OBJ_MEM_LIMIT,
4653         READ_BUF_MEM_LIMIT,
4654         META_RESPONSE_OLD,
4655 #ifdef TLS
4656         SSL_CERT,
4657         SSL_KEY,
4658         SSL_VERIFY_MODE,
4659         SSL_KEYFORM,
4660         SSL_CIPHERS,
4661         SSL_CA_CERT,
4662         SSL_WBUF_SIZE,
4663         SSL_SESSION_CACHE,
4664         SSL_MIN_VERSION,
4665 #endif
4666 #ifdef MEMCACHED_DEBUG
4667         RELAXED_PRIVILEGES,
4668 #endif
4669     };
4670     char *const subopts_tokens[] = {
4671         [MAXCONNS_FAST] = "maxconns_fast",
4672         [HASHPOWER_INIT] = "hashpower",
4673         [NO_HASHEXPAND] = "no_hashexpand",
4674         [SLAB_REASSIGN] = "slab_reassign",
4675         [SLAB_AUTOMOVE] = "slab_automove",
4676         [SLAB_AUTOMOVE_RATIO] = "slab_automove_ratio",
4677         [SLAB_AUTOMOVE_WINDOW] = "slab_automove_window",
4678         [TAIL_REPAIR_TIME] = "tail_repair_time",
4679         [HASH_ALGORITHM] = "hash_algorithm",
4680         [LRU_CRAWLER] = "lru_crawler",
4681         [LRU_CRAWLER_SLEEP] = "lru_crawler_sleep",
4682         [LRU_CRAWLER_TOCRAWL] = "lru_crawler_tocrawl",
4683         [LRU_MAINTAINER] = "lru_maintainer",
4684         [HOT_LRU_PCT] = "hot_lru_pct",
4685         [WARM_LRU_PCT] = "warm_lru_pct",
4686         [HOT_MAX_FACTOR] = "hot_max_factor",
4687         [WARM_MAX_FACTOR] = "warm_max_factor",
4688         [TEMPORARY_TTL] = "temporary_ttl",
4689         [IDLE_TIMEOUT] = "idle_timeout",
4690         [WATCHER_LOGBUF_SIZE] = "watcher_logbuf_size",
4691         [WORKER_LOGBUF_SIZE] = "worker_logbuf_size",
4692         [SLAB_SIZES] = "slab_sizes",
4693         [SLAB_CHUNK_MAX] = "slab_chunk_max",
4694         [TRACK_SIZES] = "track_sizes",
4695         [NO_INLINE_ASCII_RESP] = "no_inline_ascii_resp",
4696         [MODERN] = "modern",
4697         [NO_MODERN] = "no_modern",
4698         [NO_CHUNKED_ITEMS] = "no_chunked_items",
4699         [NO_SLAB_REASSIGN] = "no_slab_reassign",
4700         [NO_SLAB_AUTOMOVE] = "no_slab_automove",
4701         [NO_MAXCONNS_FAST] = "no_maxconns_fast",
4702         [INLINE_ASCII_RESP] = "inline_ascii_resp",
4703         [NO_LRU_CRAWLER] = "no_lru_crawler",
4704         [NO_LRU_MAINTAINER] = "no_lru_maintainer",
4705         [NO_DROP_PRIVILEGES] = "no_drop_privileges",
4706         [DROP_PRIVILEGES] = "drop_privileges",
4707         [RESP_OBJ_MEM_LIMIT] = "resp_obj_mem_limit",
4708         [READ_BUF_MEM_LIMIT] = "read_buf_mem_limit",
4709         [META_RESPONSE_OLD] = "meta_response_old",
4710 #ifdef TLS
4711         [SSL_CERT] = "ssl_chain_cert",
4712         [SSL_KEY] = "ssl_key",
4713         [SSL_VERIFY_MODE] = "ssl_verify_mode",
4714         [SSL_KEYFORM] = "ssl_keyformat",
4715         [SSL_CIPHERS] = "ssl_ciphers",
4716         [SSL_CA_CERT] = "ssl_ca_cert",
4717         [SSL_WBUF_SIZE] = "ssl_wbuf_size",
4718         [SSL_SESSION_CACHE] = "ssl_session_cache",
4719         [SSL_MIN_VERSION] = "ssl_min_version",
4720 #endif
4721 #ifdef MEMCACHED_DEBUG
4722         [RELAXED_PRIVILEGES] = "relaxed_privileges",
4723 #endif
4724         NULL
4725     };
4726 
4727     if (!sanitycheck()) {
4728         free(meta);
4729         return EX_OSERR;
4730     }
4731 
    /* install handlers for SIGINT, SIGTERM, SIGHUP and SIGUSR1 */
4733     signal(SIGINT, sig_handler);
4734     signal(SIGTERM, sig_handler);
4735     signal(SIGHUP, sighup_handler);
4736     signal(SIGUSR1, sig_usrhandler);
4737 
4738     /* init settings */
4739     settings_init();
4740     verify_default("hash_algorithm", hash_type == MURMUR3_HASH);
4741 #ifdef EXTSTORE
4742     void *storage = NULL;
4743     void *storage_cf = storage_init_config(&settings);
4744     bool storage_enabled = false;
4745     if (storage_cf == NULL) {
4746         fprintf(stderr, "failed to allocate extstore config\n");
4747         return 1;
4748     }
4749 #endif
4750 
4751     /* Run regardless of initializing it later */
4752     init_lru_maintainer();
4753 
4754     /* set stderr non-buffering (for running under, say, daemontools) */
4755     setbuf(stderr, NULL);
4756 
4757     char *shortopts =
4758           "a:"  /* access mask for unix socket */
4759           "A"   /* enable admin shutdown command */
4760           "Z"   /* enable SSL */
4761           "p:"  /* TCP port number to listen on */
4762           "s:"  /* unix socket path to listen on */
4763           "U:"  /* UDP port number to listen on */
4764           "m:"  /* max memory to use for items in megabytes */
4765           "M"   /* return error on memory exhausted */
4766           "c:"  /* max simultaneous connections */
4767           "k"   /* lock down all paged memory */
4768           "hiV" /* help, licence info, version */
4769           "r"   /* maximize core file limit */
4770           "v"   /* verbose */
4771           "d"   /* daemon mode */
4772           "l:"  /* interface to listen on */
4773           "u:"  /* user identity to run as */
4774           "P:"  /* save PID in file */
4775           "f:"  /* factor? */
4776           "n:"  /* minimum space allocated for key+value+flags */
4777           "t:"  /* threads */
4778           "D:"  /* prefix delimiter? */
4779           "L"   /* Large memory pages */
4780           "R:"  /* max requests per event */
4781           "C"   /* Disable use of CAS */
4782           "b:"  /* backlog queue limit */
4783           "B:"  /* Binding protocol */
4784           "I:"  /* Max item size */
4785           "S"   /* Sasl ON */
4786           "F"   /* Disable flush_all */
4787           "X"   /* Disable dump commands */
4788           "W"   /* Disable watch commands */
4789           "Y:"   /* Enable token auth */
4790           "e:"  /* mmap path for external item memory */
4791           "o:"  /* Extended generic options */
4792           "N:"  /* NAPI ID based thread selection */
4793           ;
4794 
4795     /* process arguments */
4796 #ifdef HAVE_GETOPT_LONG
4797     const struct option longopts[] = {
4798         {"unix-mask", required_argument, 0, 'a'},
4799         {"enable-shutdown", no_argument, 0, 'A'},
4800         {"enable-ssl", no_argument, 0, 'Z'},
4801         {"port", required_argument, 0, 'p'},
4802         {"unix-socket", required_argument, 0, 's'},
4803         {"udp-port", required_argument, 0, 'U'},
4804         {"memory-limit", required_argument, 0, 'm'},
4805         {"disable-evictions", no_argument, 0, 'M'},
4806         {"conn-limit", required_argument, 0, 'c'},
4807         {"lock-memory", no_argument, 0, 'k'},
4808         {"help", no_argument, 0, 'h'},
4809         {"license", no_argument, 0, 'i'},
4810         {"version", no_argument, 0, 'V'},
4811         {"enable-coredumps", no_argument, 0, 'r'},
4812         {"verbose", optional_argument, 0, 'v'},
4813         {"daemon", no_argument, 0, 'd'},
4814         {"listen", required_argument, 0, 'l'},
4815         {"user", required_argument, 0, 'u'},
4816         {"pidfile", required_argument, 0, 'P'},
4817         {"slab-growth-factor", required_argument, 0, 'f'},
4818         {"slab-min-size", required_argument, 0, 'n'},
4819         {"threads", required_argument, 0, 't'},
4820         {"enable-largepages", no_argument, 0, 'L'},
4821         {"max-reqs-per-event", required_argument, 0, 'R'},
4822         {"disable-cas", no_argument, 0, 'C'},
4823         {"listen-backlog", required_argument, 0, 'b'},
4824         {"protocol", required_argument, 0, 'B'},
4825         {"max-item-size", required_argument, 0, 'I'},
4826         {"enable-sasl", no_argument, 0, 'S'},
4827         {"disable-flush-all", no_argument, 0, 'F'},
4828         {"disable-dumping", no_argument, 0, 'X'},
4829         {"disable-watch", no_argument, 0, 'W'},
4830         {"auth-file", required_argument, 0, 'Y'},
4831         {"memory-file", required_argument, 0, 'e'},
4832         {"extended", required_argument, 0, 'o'},
4833         {"napi-ids", required_argument, 0, 'N'},
4834         {0, 0, 0, 0}
4835     };
4836     int optindex;
4837     while (-1 != (c = getopt_long(argc, argv, shortopts,
4838                     longopts, &optindex))) {
4839 #else
4840     while (-1 != (c = getopt(argc, argv, shortopts))) {
4841 #endif
4842         switch (c) {
4843         case 'A':
4844             /* enables "shutdown" command */
4845             settings.shutdown_command = true;
4846             break;
4847         case 'Z':
4848             /* enable secure communication*/
4849 #ifdef TLS
4850             settings.ssl_enabled = true;
4851 #else
4852             fprintf(stderr, "This server is not built with TLS support.\n");
4853             exit(EX_USAGE);
4854 #endif
4855             break;
4856         case 'a':
4857 #ifndef DISABLE_UNIX_SOCKET
4858             /* access for unix domain socket, as octal mask (like chmod)*/
4859             settings.access= strtol(optarg,NULL,8);
4860 #else
4861             fprintf(stderr, "This server is not built with unix socket support.\n");
4862             exit(EX_USAGE);
4863 #endif /* #ifndef DISABLE_UNIX_SOCKET */
4864             break;
4865         case 'U':
4866             settings.udpport = atoi(optarg);
4867             udp_specified = true;
4868             break;
4869         case 'p':
4870             settings.port = atoi(optarg);
4871             tcp_specified = true;
4872             break;
4873         case 's':
4874 #ifndef DISABLE_UNIX_SOCKET
4875             settings.socketpath = optarg;
4876 #else
4877             fprintf(stderr, "This server is not built with unix socket support.\n");
4878             exit(EX_USAGE);
4879 #endif /* #ifndef DISABLE_UNIX_SOCKET */
4880             break;
4881         case 'm':
4882             settings.maxbytes = ((size_t)atoi(optarg)) * 1024 * 1024;
4883             break;
4884         case 'M':
4885             settings.evict_to_free = 0;
4886             break;
4887         case 'c':
4888             settings.maxconns = atoi(optarg);
4889             if (settings.maxconns <= 0) {
4890                 fprintf(stderr, "Maximum connections must be greater than 0\n");
4891                 return 1;
4892             }
4893             break;
4894         case 'h':
4895             usage();
4896             exit(EXIT_SUCCESS);
4897         case 'i':
4898             usage_license();
4899             exit(EXIT_SUCCESS);
4900         case 'V':
4901             printf(PACKAGE " " VERSION "\n");
4902             exit(EXIT_SUCCESS);
4903         case 'k':
4904             lock_memory = true;
4905             break;
4906         case 'v':
4907             settings.verbose++;
4908             break;
4909         case 'l':
4910             if (settings.inter != NULL) {
4911                 if (strstr(settings.inter, optarg) != NULL) {
4912                     break;
4913                 }
4914                 size_t len = strlen(settings.inter) + strlen(optarg) + 2;
4915                 char *p = malloc(len);
4916                 if (p == NULL) {
4917                     fprintf(stderr, "Failed to allocate memory\n");
4918                     return 1;
4919                 }
4920                 snprintf(p, len, "%s,%s", settings.inter, optarg);
4921                 free(settings.inter);
4922                 settings.inter = p;
4923             } else {
4924                 settings.inter= strdup(optarg);
4925             }
4926             break;
4927         case 'd':
4928             do_daemonize = true;
4929             break;
4930         case 'r':
4931             maxcore = 1;
4932             break;
4933         case 'R':
4934             settings.reqs_per_event = atoi(optarg);
4935             if (settings.reqs_per_event == 0) {
4936                 fprintf(stderr, "Number of requests per event must be greater than 0\n");
4937                 return 1;
4938             }
4939             break;
4940         case 'u':
4941             username = optarg;
4942             break;
4943         case 'P':
4944             pid_file = optarg;
4945             break;
4946         case 'e':
4947             settings.memory_file = optarg;
4948             break;
4949         case 'f':
4950             settings.factor = atof(optarg);
4951             if (settings.factor <= 1.0) {
4952                 fprintf(stderr, "Factor must be greater than 1\n");
4953                 return 1;
4954             }
4955             meta->slab_config = strdup(optarg);
4956             break;
4957         case 'n':
4958             settings.chunk_size = atoi(optarg);
4959             if (settings.chunk_size == 0) {
4960                 fprintf(stderr, "Chunk size must be greater than 0\n");
4961                 return 1;
4962             }
4963             break;
4964         case 't':
4965             settings.num_threads = atoi(optarg);
4966             if (settings.num_threads <= 0) {
4967                 fprintf(stderr, "Number of threads must be greater than 0\n");
4968                 return 1;
4969             }
4970             /* There're other problems when you get above 64 threads.
4971              * In the future we should portably detect # of cores for the
4972              * default.
4973              */
4974             if (settings.num_threads > 64) {
4975                 fprintf(stderr, "WARNING: Setting a high number of worker"
4976                                 "threads is not recommended.\n"
4977                                 " Set this value to the number of cores in"
4978                                 " your machine or less.\n");
4979             }
4980             break;
4981         case 'D':
4982             if (! optarg || ! optarg[0]) {
4983                 fprintf(stderr, "No delimiter specified\n");
4984                 return 1;
4985             }
4986             settings.prefix_delimiter = optarg[0];
4987             settings.detail_enabled = 1;
4988             break;
4989         case 'L' :
4990             if (enable_large_pages() == 0) {
4991                 preallocate = true;
4992             } else {
4993                 fprintf(stderr, "Cannot enable large pages on this system\n"
4994                     "(There is no support as of this version)\n");
4995                 return 1;
4996             }
4997             break;
4998         case 'C' :
4999             settings.use_cas = false;
5000             break;
5001         case 'b' :
5002             settings.backlog = atoi(optarg);
5003             break;
5004         case 'B':
5005             protocol_specified = true;
5006             if (strcmp(optarg, "auto") == 0) {
5007                 settings.binding_protocol = negotiating_prot;
5008             } else if (strcmp(optarg, "binary") == 0) {
5009                 settings.binding_protocol = binary_prot;
5010             } else if (strcmp(optarg, "ascii") == 0) {
5011                 settings.binding_protocol = ascii_prot;
5012             } else {
5013                 fprintf(stderr, "Invalid value for binding protocol: %s\n"
5014                         " -- should be one of auto, binary, or ascii\n", optarg);
5015                 exit(EX_USAGE);
5016             }
5017             break;
5018         case 'I':
5019             buf = strdup(optarg);
5020             unit = buf[strlen(buf)-1];
5021             if (unit == 'k' || unit == 'm' ||
5022                 unit == 'K' || unit == 'M') {
5023                 buf[strlen(buf)-1] = '\0';
5024                 size_max = atoi(buf);
5025                 if (unit == 'k' || unit == 'K')
5026                     size_max *= 1024;
5027                 if (unit == 'm' || unit == 'M')
5028                     size_max *= 1024 * 1024;
5029                 settings.item_size_max = size_max;
5030             } else {
5031                 settings.item_size_max = atoi(buf);
5032             }
5033             free(buf);
5034             break;
5035         case 'S': /* set Sasl authentication to true. Default is false */
5036 #ifndef ENABLE_SASL
5037             fprintf(stderr, "This server is not built with SASL support.\n");
5038             exit(EX_USAGE);
5039 #endif
5040             settings.sasl = true;
5041             break;
5042        case 'F' :
5043             settings.flush_enabled = false;
5044             break;
5045        case 'X' :
5046             settings.dump_enabled = false;
5047             break;
5048        case 'W' :
5049             settings.watch_enabled = false;
5050             break;
5051        case 'Y' :
5052             // dupe the file path now just in case the options get mangled.
5053             settings.auth_file = strdup(optarg);
5054             break;
5055        case 'N':
5056             settings.num_napi_ids = atoi(optarg);
5057             if (settings.num_napi_ids <= 0) {
5058                 fprintf(stderr, "Maximum number of NAPI IDs must be greater than 0\n");
5059                 return 1;
5060             }
5061             break;
5062         case 'o': /* It's sub-opts time! */
5063             subopts_orig = subopts = strdup(optarg); /* getsubopt() changes the original args */
5064 
5065             while (*subopts != '\0') {
5066             // BSD getsubopt (at least) has undefined behavior on -1, so
5067             // if we want to retry the getsubopt call in submodules we
5068             // need an extra layer of string copies.
5069             char *subopts_temp_o = NULL;
5070             char *subopts_temp = subopts_temp_o = strdup(subopts);
5071 
5072             switch (getsubopt(&subopts, subopts_tokens, &subopts_value)) {
5073             case MAXCONNS_FAST:
5074                 settings.maxconns_fast = true;
5075                 break;
5076             case HASHPOWER_INIT:
5077                 if (subopts_value == NULL) {
5078                     fprintf(stderr, "Missing numeric argument for hashpower\n");
5079                     return 1;
5080                 }
5081                 settings.hashpower_init = atoi(subopts_value);
5082                 if (settings.hashpower_init < 12) {
5083                     fprintf(stderr, "Initial hashtable multiplier of %d is too low\n",
5084                         settings.hashpower_init);
5085                     return 1;
5086                 } else if (settings.hashpower_init > 32) {
5087                     fprintf(stderr, "Initial hashtable multiplier of %d is too high\n"
5088                         "Choose a value based on \"STAT hash_power_level\" from a running instance\n",
5089                         settings.hashpower_init);
5090                     return 1;
5091                 }
5092                 break;
5093             case NO_HASHEXPAND:
5094                 start_assoc_maint = false;
5095                 break;
5096             case SLAB_REASSIGN:
5097                 settings.slab_reassign = true;
5098                 break;
5099             case SLAB_AUTOMOVE:
5100                 if (subopts_value == NULL) {
5101                     settings.slab_automove = 1;
5102                     break;
5103                 }
5104                 settings.slab_automove = atoi(subopts_value);
5105                 if (settings.slab_automove < 0 || settings.slab_automove > 2) {
5106                     fprintf(stderr, "slab_automove must be between 0 and 2\n");
5107                     return 1;
5108                 }
5109                 break;
5110             case SLAB_AUTOMOVE_RATIO:
5111                 if (subopts_value == NULL) {
5112                     fprintf(stderr, "Missing slab_automove_ratio argument\n");
5113                     return 1;
5114                 }
5115                 settings.slab_automove_ratio = atof(subopts_value);
5116                 if (settings.slab_automove_ratio <= 0 || settings.slab_automove_ratio > 1) {
5117                     fprintf(stderr, "slab_automove_ratio must be > 0 and < 1\n");
5118                     return 1;
5119                 }
5120                 break;
5121             case SLAB_AUTOMOVE_WINDOW:
5122                 if (subopts_value == NULL) {
5123                     fprintf(stderr, "Missing slab_automove_window argument\n");
5124                     return 1;
5125                 }
5126                 settings.slab_automove_window = atoi(subopts_value);
5127                 if (settings.slab_automove_window < 3) {
5128                     fprintf(stderr, "slab_automove_window must be > 2\n");
5129                     return 1;
5130                 }
5131                 break;
5132             case TAIL_REPAIR_TIME:
5133                 if (subopts_value == NULL) {
5134                     fprintf(stderr, "Missing numeric argument for tail_repair_time\n");
5135                     return 1;
5136                 }
5137                 settings.tail_repair_time = atoi(subopts_value);
5138                 if (settings.tail_repair_time < 10) {
5139                     fprintf(stderr, "Cannot set tail_repair_time to less than 10 seconds\n");
5140                     return 1;
5141                 }
5142                 break;
5143             case HASH_ALGORITHM:
5144                 if (subopts_value == NULL) {
5145                     fprintf(stderr, "Missing hash_algorithm argument\n");
5146                     return 1;
5147                 };
5148                 if (strcmp(subopts_value, "jenkins") == 0) {
5149                     hash_type = JENKINS_HASH;
5150                 } else if (strcmp(subopts_value, "murmur3") == 0) {
5151                     hash_type = MURMUR3_HASH;
5152                 } else if (strcmp(subopts_value, "xxh3") == 0) {
5153                     hash_type = XXH3_HASH;
5154                 } else {
5155                     fprintf(stderr, "Unknown hash_algorithm option (jenkins, murmur3, xxh3)\n");
5156                     return 1;
5157                 }
5158                 break;
5159             case LRU_CRAWLER:
5160                 start_lru_crawler = true;
5161                 break;
5162             case LRU_CRAWLER_SLEEP:
5163                 if (subopts_value == NULL) {
5164                     fprintf(stderr, "Missing lru_crawler_sleep value\n");
5165                     return 1;
5166                 }
5167                 settings.lru_crawler_sleep = atoi(subopts_value);
5168                 if (settings.lru_crawler_sleep > 1000000 || settings.lru_crawler_sleep < 0) {
5169                     fprintf(stderr, "LRU crawler sleep must be between 0 and 1 second\n");
5170                     return 1;
5171                 }
5172                 break;
5173             case LRU_CRAWLER_TOCRAWL:
5174                 if (subopts_value == NULL) {
5175                     fprintf(stderr, "Missing lru_crawler_tocrawl value\n");
5176                     return 1;
5177                 }
5178                 if (!safe_strtoul(subopts_value, &tocrawl)) {
5179                     fprintf(stderr, "lru_crawler_tocrawl takes a numeric 32bit value\n");
5180                     return 1;
5181                 }
5182                 settings.lru_crawler_tocrawl = tocrawl;
5183                 break;
5184             case LRU_MAINTAINER:
5185                 start_lru_maintainer = true;
5186                 settings.lru_segmented = true;
5187                 break;
5188             case HOT_LRU_PCT:
5189                 if (subopts_value == NULL) {
5190                     fprintf(stderr, "Missing hot_lru_pct argument\n");
5191                     return 1;
5192                 }
5193                 settings.hot_lru_pct = atoi(subopts_value);
5194                 if (settings.hot_lru_pct < 1 || settings.hot_lru_pct >= 80) {
5195                     fprintf(stderr, "hot_lru_pct must be > 1 and < 80\n");
5196                     return 1;
5197                 }
5198                 break;
5199             case WARM_LRU_PCT:
5200                 if (subopts_value == NULL) {
5201                     fprintf(stderr, "Missing warm_lru_pct argument\n");
5202                     return 1;
5203                 }
5204                 settings.warm_lru_pct = atoi(subopts_value);
5205                 if (settings.warm_lru_pct < 1 || settings.warm_lru_pct >= 80) {
5206                     fprintf(stderr, "warm_lru_pct must be > 1 and < 80\n");
5207                     return 1;
5208                 }
5209                 break;
5210             case HOT_MAX_FACTOR:
5211                 if (subopts_value == NULL) {
5212                     fprintf(stderr, "Missing hot_max_factor argument\n");
5213                     return 1;
5214                 }
5215                 settings.hot_max_factor = atof(subopts_value);
5216                 if (settings.hot_max_factor <= 0) {
5217                     fprintf(stderr, "hot_max_factor must be > 0\n");
5218                     return 1;
5219                 }
5220                 break;
5221             case WARM_MAX_FACTOR:
5222                 if (subopts_value == NULL) {
5223                     fprintf(stderr, "Missing warm_max_factor argument\n");
5224                     return 1;
5225                 }
5226                 settings.warm_max_factor = atof(subopts_value);
5227                 if (settings.warm_max_factor <= 0) {
5228                     fprintf(stderr, "warm_max_factor must be > 0\n");
5229                     return 1;
5230                 }
5231                 break;
5232             case TEMPORARY_TTL:
5233                 if (subopts_value == NULL) {
5234                     fprintf(stderr, "Missing temporary_ttl argument\n");
5235                     return 1;
5236                 }
5237                 settings.temp_lru = true;
5238                 settings.temporary_ttl = atoi(subopts_value);
5239                 break;
5240             case IDLE_TIMEOUT:
5241                 if (subopts_value == NULL) {
5242                     fprintf(stderr, "Missing numeric argument for idle_timeout\n");
5243                     return 1;
5244                 }
5245                 settings.idle_timeout = atoi(subopts_value);
5246                 break;
5247             case WATCHER_LOGBUF_SIZE:
5248                 if (subopts_value == NULL) {
5249                     fprintf(stderr, "Missing watcher_logbuf_size argument\n");
5250                     return 1;
5251                 }
5252                 if (!safe_strtoul(subopts_value, &settings.logger_watcher_buf_size)) {
5253                     fprintf(stderr, "could not parse argument to watcher_logbuf_size\n");
5254                     return 1;
5255                 }
5256                 settings.logger_watcher_buf_size *= 1024; /* kilobytes */
5257                 break;
5258             case WORKER_LOGBUF_SIZE:
5259                 if (subopts_value == NULL) {
5260                     fprintf(stderr, "Missing worker_logbuf_size argument\n");
5261                     return 1;
5262                 }
5263                 if (!safe_strtoul(subopts_value, &settings.logger_buf_size)) {
5264                     fprintf(stderr, "could not parse argument to worker_logbuf_size\n");
5265                     return 1;
5266                 }
5267                 settings.logger_buf_size *= 1024; /* kilobytes */
5268             case SLAB_SIZES:
5269                 slab_sizes_unparsed = strdup(subopts_value);
5270                 break;
5271             case SLAB_CHUNK_MAX:
5272                 if (subopts_value == NULL) {
5273                     fprintf(stderr, "Missing slab_chunk_max argument\n");
5274                 }
5275                 if (!safe_strtol(subopts_value, &settings.slab_chunk_size_max)) {
5276                     fprintf(stderr, "could not parse argument to slab_chunk_max\n");
5277                 }
5278                 slab_chunk_size_changed = true;
5279                 break;
5280             case TRACK_SIZES:
5281                 item_stats_sizes_init();
5282                 break;
5283             case NO_INLINE_ASCII_RESP:
5284                 break;
5285             case INLINE_ASCII_RESP:
5286                 break;
5287             case NO_CHUNKED_ITEMS:
5288                 settings.slab_chunk_size_max = settings.slab_page_size;
5289                 break;
5290             case NO_SLAB_REASSIGN:
5291                 settings.slab_reassign = false;
5292                 break;
5293             case NO_SLAB_AUTOMOVE:
5294                 settings.slab_automove = 0;
5295                 break;
5296             case NO_MAXCONNS_FAST:
5297                 settings.maxconns_fast = false;
5298                 break;
5299             case NO_LRU_CRAWLER:
5300                 settings.lru_crawler = false;
5301                 start_lru_crawler = false;
5302                 break;
5303             case NO_LRU_MAINTAINER:
5304                 start_lru_maintainer = false;
5305                 settings.lru_segmented = false;
5306                 break;
5307             case META_RESPONSE_OLD:
5308                 settings.meta_response_old = true;
5309                 break;
5310 #ifdef TLS
5311             case SSL_CERT:
5312                 if (subopts_value == NULL) {
5313                     fprintf(stderr, "Missing ssl_chain_cert argument\n");
5314                     return 1;
5315                 }
5316                 settings.ssl_chain_cert = strdup(subopts_value);
5317                 break;
5318             case SSL_KEY:
5319                 if (subopts_value == NULL) {
5320                     fprintf(stderr, "Missing ssl_key argument\n");
5321                     return 1;
5322                 }
5323                 settings.ssl_key = strdup(subopts_value);
5324                 break;
5325             case SSL_VERIFY_MODE:
5326             {
5327                 if (subopts_value == NULL) {
5328                     fprintf(stderr, "Missing ssl_verify_mode argument\n");
5329                     return 1;
5330                 }
5331                 int verify  = 0;
5332                 if (!safe_strtol(subopts_value, &verify)) {
5333                     fprintf(stderr, "could not parse argument to ssl_verify_mode\n");
5334                     return 1;
5335                 }
5336                 switch(verify) {
5337                     case 0:
5338                         settings.ssl_verify_mode = SSL_VERIFY_NONE;
5339                         break;
5340                     case 1:
5341                         settings.ssl_verify_mode = SSL_VERIFY_PEER;
5342                         break;
5343                     case 2:
5344                         settings.ssl_verify_mode = SSL_VERIFY_PEER |
5345                                                     SSL_VERIFY_FAIL_IF_NO_PEER_CERT;
5346                         break;
5347                     case 3:
5348                         settings.ssl_verify_mode = SSL_VERIFY_PEER |
5349                                                     SSL_VERIFY_FAIL_IF_NO_PEER_CERT |
5350                                                     SSL_VERIFY_CLIENT_ONCE;
5351                         break;
5352                     default:
5353                         fprintf(stderr, "Invalid ssl_verify_mode. Use help to see valid options.\n");
5354                         return 1;
5355                 }
5356                 break;
5357             }
5358             case SSL_KEYFORM:
5359                 if (subopts_value == NULL) {
5360                     fprintf(stderr, "Missing ssl_keyformat argument\n");
5361                     return 1;
5362                 }
5363                 if (!safe_strtol(subopts_value, &settings.ssl_keyformat)) {
5364                     fprintf(stderr, "could not parse argument to ssl_keyformat\n");
5365                     return 1;
5366                 }
5367                 break;
5368             case SSL_CIPHERS:
5369                 if (subopts_value == NULL) {
5370                     fprintf(stderr, "Missing ssl_ciphers argument\n");
5371                     return 1;
5372                 }
5373                 settings.ssl_ciphers = strdup(subopts_value);
5374                 break;
5375             case SSL_CA_CERT:
5376                 if (subopts_value == NULL) {
5377                     fprintf(stderr, "Missing ssl_ca_cert argument\n");
5378                     return 1;
5379                 }
5380                 settings.ssl_ca_cert = strdup(subopts_value);
5381                 break;
5382             case SSL_WBUF_SIZE:
5383                 if (subopts_value == NULL) {
5384                     fprintf(stderr, "Missing ssl_wbuf_size argument\n");
5385                     return 1;
5386                 }
5387                 if (!safe_strtoul(subopts_value, &settings.ssl_wbuf_size)) {
5388                     fprintf(stderr, "could not parse argument to ssl_wbuf_size\n");
5389                     return 1;
5390                 }
5391                 settings.ssl_wbuf_size *= 1024; /* kilobytes */
5392                 break;
5393             case SSL_SESSION_CACHE:
5394                 settings.ssl_session_cache = true;
5395                 break;
5396             case SSL_MIN_VERSION: {
5397                 int min_version;
5398                 if (subopts_value == NULL) {
5399                     fprintf(stderr, "Missing ssl_min_version argument\n");
5400                     return 1;
5401                 }
5402                 if (!safe_strtol(subopts_value, &min_version)) {
5403                     fprintf(stderr, "could not parse argument to ssl_min_version\n");
5404                     return 1;
5405                 }
5406                 switch (min_version) {
5407                     case 0:
5408                         settings.ssl_min_version = TLS1_VERSION;
5409                         break;
5410                     case 1:
5411                         settings.ssl_min_version = TLS1_1_VERSION;
5412                         break;
5413                     case 2:
5414                         settings.ssl_min_version = TLS1_2_VERSION;
5415                         break;
5416 #if OPENSSL_VERSION_NUMBER >= 0x10101000L
5417                     case 3:
5418                         settings.ssl_min_version = TLS1_3_VERSION;
5419                         break;
5420 #endif
5421                     default:
5422                         fprintf(stderr, "Invalid ssl_min_version. Use help to see valid options.\n");
5423                         return 1;
5424                 }
5425                 break;
5426             }
5427 #endif
5428             case MODERN:
5429                 /* currently no new defaults */
5430                 break;
5431             case NO_MODERN:
5432                 if (!slab_chunk_size_changed) {
5433                     settings.slab_chunk_size_max = settings.slab_page_size;
5434                 }
5435                 settings.slab_reassign = false;
5436                 settings.slab_automove = 0;
5437                 settings.maxconns_fast = false;
5438                 settings.lru_segmented = false;
5439                 hash_type = JENKINS_HASH;
5440                 start_lru_crawler = false;
5441                 start_lru_maintainer = false;
5442                 break;
5443             case NO_DROP_PRIVILEGES:
5444                 settings.drop_privileges = false;
5445                 break;
5446             case DROP_PRIVILEGES:
5447                 settings.drop_privileges = true;
5448                 break;
5449             case RESP_OBJ_MEM_LIMIT:
5450                 // TODO: Remove at some point in the future.
5451                 fprintf(stderr, "DEPRECATED: resp_obj_mem_limit no longer used. See read_buf_mem_limit,\n");
5452                 break;
5453             case READ_BUF_MEM_LIMIT:
5454                 if (subopts_value == NULL) {
5455                     fprintf(stderr, "Missing read_buf_mem_limit argument\n");
5456                     return 1;
5457                 }
5458                 if (!safe_strtoul(subopts_value, &settings.read_buf_mem_limit)) {
5459                     fprintf(stderr, "could not parse argument to read_buf_mem_limit\n");
5460                     return 1;
5461                 }
5462                 settings.read_buf_mem_limit *= 1024 * 1024; /* megabytes */
5463                 break;
5464 #ifdef MEMCACHED_DEBUG
5465             case RELAXED_PRIVILEGES:
5466                 settings.relaxed_privileges = true;
5467                 break;
5468 #endif
5469             default:
5470 #ifdef EXTSTORE
5471                 // TODO: differentiating response code.
5472                 if (storage_read_config(storage_cf, &subopts_temp)) {
5473                     return 1;
5474                 }
5475 #else
5476                 printf("Illegal suboption \"%s\"\n", subopts_temp);
5477                 return 1;
5478 #endif
5479             } // switch
5480             if (subopts_temp_o) {
5481                 free(subopts_temp_o);
5482             }
5483 
5484             } // while
5485             free(subopts_orig);
5486             break;
5487         default:
5488             fprintf(stderr, "Illegal argument \"%c\"\n", c);
5489             return 1;
5490         }
5491     }
5492 
5493     if (settings.num_napi_ids > settings.num_threads) {
5494         fprintf(stderr, "Number of napi_ids(%d) cannot be greater than number of threads(%d)\n",
5495                 settings.num_napi_ids, settings.num_threads);
5496         exit(EX_USAGE);
5497     }
5498 
5499     if (settings.item_size_max < ITEM_SIZE_MAX_LOWER_LIMIT) {
5500         fprintf(stderr, "Item max size cannot be less than 1024 bytes.\n");
5501         exit(EX_USAGE);
5502     }
5503     if (settings.item_size_max > (settings.maxbytes / 2)) {
5504         fprintf(stderr, "Cannot set item size limit higher than 1/2 of memory max.\n");
5505         exit(EX_USAGE);
5506     }
5507     if (settings.item_size_max > (ITEM_SIZE_MAX_UPPER_LIMIT)) {
5508         fprintf(stderr, "Cannot set item size limit higher than a gigabyte.\n");
5509         exit(EX_USAGE);
5510     }
5511     if (settings.item_size_max > 1024 * 1024) {
5512         if (!slab_chunk_size_changed) {
5513             // Ideal new default is 16k, but needs stitching.
5514             settings.slab_chunk_size_max = settings.slab_page_size / 2;
5515         }
5516     }
5517 
5518     if (settings.slab_chunk_size_max > settings.item_size_max) {
5519         fprintf(stderr, "slab_chunk_max (bytes: %d) cannot be larger than -I (item_size_max %d)\n",
5520                 settings.slab_chunk_size_max, settings.item_size_max);
5521         exit(EX_USAGE);
5522     }
5523 
5524     if (settings.item_size_max % settings.slab_chunk_size_max != 0) {
5525         fprintf(stderr, "-I (item_size_max: %d) must be evenly divisible by slab_chunk_max (bytes: %d)\n",
5526                 settings.item_size_max, settings.slab_chunk_size_max);
5527         exit(EX_USAGE);
5528     }
5529 
5530     if (settings.slab_page_size % settings.slab_chunk_size_max != 0) {
5531         fprintf(stderr, "slab_chunk_max (bytes: %d) must divide evenly into %d (slab_page_size)\n",
5532                 settings.slab_chunk_size_max, settings.slab_page_size);
5533         exit(EX_USAGE);
5534     }
5535 #ifdef EXTSTORE
5536     switch (storage_check_config(storage_cf)) {
5537         case 0:
5538             storage_enabled = true;
5539             break;
5540         case 1:
5541             exit(EX_USAGE);
5542             break;
5543     }
5544 #endif
5545     // Reserve this for the new default. If factor size hasn't changed, use
5546     // new default.
5547     /*if (settings.slab_chunk_size_max == 16384 && settings.factor == 1.25) {
5548         settings.factor = 1.08;
5549     }*/
5550 
5551     if (slab_sizes_unparsed != NULL) {
5552         // want the unedited string for restart code.
5553         char *temp = strdup(slab_sizes_unparsed);
5554         if (_parse_slab_sizes(slab_sizes_unparsed, slab_sizes)) {
5555             use_slab_sizes = true;
5556             if (meta->slab_config) {
5557                 free(meta->slab_config);
5558             }
5559             meta->slab_config = temp;
5560         } else {
5561             exit(EX_USAGE);
5562         }
5563     } else if (!meta->slab_config) {
5564         // using the default factor.
5565         meta->slab_config = "1.25";
5566     }
5567 
5568     if (settings.hot_lru_pct + settings.warm_lru_pct > 80) {
5569         fprintf(stderr, "hot_lru_pct + warm_lru_pct cannot be more than 80%% combined\n");
5570         exit(EX_USAGE);
5571     }
5572 
5573     if (settings.temp_lru && !start_lru_maintainer) {
5574         fprintf(stderr, "temporary_ttl requires lru_maintainer to be enabled\n");
5575         exit(EX_USAGE);
5576     }
5577 
5578     if (hash_init(hash_type) != 0) {
5579         fprintf(stderr, "Failed to initialize hash_algorithm!\n");
5580         exit(EX_USAGE);
5581     }
5582 
5583     /*
5584      * Use one workerthread to serve each UDP port if the user specified
5585      * multiple ports
5586      */
5587     if (settings.inter != NULL && strchr(settings.inter, ',')) {
5588         settings.num_threads_per_udp = 1;
5589     } else {
5590         settings.num_threads_per_udp = settings.num_threads;
5591     }
5592 
5593     if (settings.sasl) {
5594         if (!protocol_specified) {
5595             settings.binding_protocol = binary_prot;
5596         } else {
5597             if (settings.binding_protocol != binary_prot) {
5598                 fprintf(stderr, "ERROR: You cannot allow the ASCII protocol while using SASL.\n");
5599                 exit(EX_USAGE);
5600             }
5601         }
5602 
5603         if (settings.udpport) {
5604             fprintf(stderr, "ERROR: Cannot enable UDP while using binary SASL authentication.\n");
5605             exit(EX_USAGE);
5606         }
5607     }
5608 
5609     if (settings.auth_file) {
5610         if (!protocol_specified) {
5611             settings.binding_protocol = ascii_prot;
5612         } else {
5613             if (settings.binding_protocol != ascii_prot) {
5614                 fprintf(stderr, "ERROR: You cannot allow the BINARY protocol while using ascii authentication tokens.\n");
5615                 exit(EX_USAGE);
5616             }
5617         }
5618     }
5619 
5620     if (udp_specified && settings.udpport != 0 && !tcp_specified) {
5621         settings.port = settings.udpport;
5622     }
5623 
5624 
5625 #ifdef TLS
5626     /*
5627      * Setup SSL if enabled
5628      */
5629     if (settings.ssl_enabled) {
5630         if (!settings.port) {
5631             fprintf(stderr, "ERROR: You cannot enable SSL without a TCP port.\n");
5632             exit(EX_USAGE);
5633         }
5634         // openssl init methods.
5635         SSL_load_error_strings();
5636         SSLeay_add_ssl_algorithms();
5637         // Initiate the SSL context.
5638         ssl_init();
5639     }
5640 #endif
5641 
5642     if (maxcore != 0) {
5643         struct rlimit rlim_new;
5644         /*
5645          * First try raising to infinity; if that fails, try bringing
5646          * the soft limit to the hard.
5647          */
5648         if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
5649             rlim_new.rlim_cur = rlim_new.rlim_max = RLIM_INFINITY;
5650             if (setrlimit(RLIMIT_CORE, &rlim_new)!= 0) {
5651                 /* failed. try raising just to the old max */
5652                 rlim_new.rlim_cur = rlim_new.rlim_max = rlim.rlim_max;
5653                 (void)setrlimit(RLIMIT_CORE, &rlim_new);
5654             }
5655         }
5656         /*
5657          * getrlimit again to see what we ended up with. Only fail if
5658          * the soft limit ends up 0, because then no core files will be
5659          * created at all.
5660          */
5661 
5662         if ((getrlimit(RLIMIT_CORE, &rlim) != 0) || rlim.rlim_cur == 0) {
5663             fprintf(stderr, "failed to ensure corefile creation\n");
5664             exit(EX_OSERR);
5665         }
5666     }
5667 
5668     /*
5669      * If needed, increase rlimits to allow as many connections
5670      * as needed.
5671      */
5672 
5673     if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
5674         fprintf(stderr, "failed to getrlimit number of files\n");
5675         exit(EX_OSERR);
5676     } else {
5677 #ifdef MEMCACHED_DEBUG
5678         if (rlim.rlim_cur < settings.maxconns || rlim.rlim_max < settings.maxconns) {
5679 #endif
5680         rlim.rlim_cur = settings.maxconns;
5681         rlim.rlim_max = settings.maxconns;
5682         if (setrlimit(RLIMIT_NOFILE, &rlim) != 0) {
5683             fprintf(stderr, "failed to set rlimit for open files. Try starting as root or requesting smaller maxconns value.\n");
5684             exit(EX_OSERR);
5685         }
5686 #ifdef MEMCACHED_DEBUG
5687         }
5688 #endif
5689     }
5690 
5691     /* lose root privileges if we have them */
5692     if (getuid() == 0 || geteuid() == 0) {
5693         if (username == 0 || *username == '\0') {
5694             fprintf(stderr, "can't run as root without the -u switch\n");
5695             exit(EX_USAGE);
5696         }
5697         if ((pw = getpwnam(username)) == 0) {
5698             fprintf(stderr, "can't find the user %s to switch to\n", username);
5699             exit(EX_NOUSER);
5700         }
5701         if (setgroups(0, NULL) < 0) {
5702             /* setgroups may fail with EPERM, indicating we are already in a
5703              * minimally-privileged state. In that case we continue. For all
5704              * other failure codes we exit.
5705              *
5706              * Note that errno is stored here because fprintf may change it.
5707              */
5708             bool should_exit = errno != EPERM;
5709             fprintf(stderr, "failed to drop supplementary groups: %s\n",
5710                     strerror(errno));
5711             if (should_exit) {
5712                 exit(EX_OSERR);
5713             }
5714         }
5715         if (setgid(pw->pw_gid) < 0 || setuid(pw->pw_uid) < 0) {
5716             fprintf(stderr, "failed to assume identity of user %s\n", username);
5717             exit(EX_OSERR);
5718         }
5719     }
5720 
5721     /* Initialize Sasl if -S was specified */
5722     if (settings.sasl) {
5723         init_sasl();
5724     }
5725 
5726     /* daemonize if requested */
5727     /* if we want to ensure our ability to dump core, don't chdir to / */
5728     if (do_daemonize) {
5729         if (signal(SIGHUP, SIG_IGN) == SIG_ERR) {
5730             perror("Failed to ignore SIGHUP");
5731         }
5732         if (daemonize(maxcore, settings.verbose) == -1) {
5733             fprintf(stderr, "failed to daemon() in order to daemonize\n");
5734             exit(EXIT_FAILURE);
5735         }
5736     }
5737 
5738     /* lock paged memory if needed */
5739     if (lock_memory) {
5740 #ifdef HAVE_MLOCKALL
5741         int res = mlockall(MCL_CURRENT | MCL_FUTURE);
5742         if (res != 0) {
5743             fprintf(stderr, "warning: -k invalid, mlockall() failed: %s\n",
5744                     strerror(errno));
5745         }
5746 #else
5747         fprintf(stderr, "warning: -k invalid, mlockall() not supported on this platform.  proceeding without.\n");
5748 #endif
5749     }
5750 
5751     /* initialize main thread libevent instance */
5752 #if defined(LIBEVENT_VERSION_NUMBER) && LIBEVENT_VERSION_NUMBER >= 0x02000101
5753     /* If libevent version is larger/equal to 2.0.2-alpha, use newer version */
5754     struct event_config *ev_config;
5755     ev_config = event_config_new();
5756     event_config_set_flag(ev_config, EVENT_BASE_FLAG_NOLOCK);
5757     main_base = event_base_new_with_config(ev_config);
5758     event_config_free(ev_config);
5759 #else
5760     /* Otherwise, use older API */
5761     main_base = event_init();
5762 #endif
5763 
5764     /* Load initial auth file if required */
5765     if (settings.auth_file) {
5766         if (settings.udpport) {
5767             fprintf(stderr, "Cannot use UDP with ascii authentication enabled (-U 0 to disable)\n");
5768             exit(EX_USAGE);
5769         }
5770 
5771         switch (authfile_load(settings.auth_file)) {
5772             case AUTHFILE_STATFAIL:
5773                 vperror("Could not stat authfile [%s], error %s", settings.auth_file
5774                                                             , strerror(errno));
5775                 exit(EXIT_FAILURE);
5776                 break;
5777             case AUTHFILE_OPENFAIL:
5778                 vperror("Could not open authfile [%s] for reading, error %s", settings.auth_file
5779                                                                            , strerror(errno));
5780                 exit(EXIT_FAILURE);
5781                 break;
5782             case AUTHFILE_OOM:
5783                 fprintf(stderr, "Out of memory reading password file: %s", settings.auth_file);
5784                 exit(EXIT_FAILURE);
5785                 break;
5786             case AUTHFILE_MALFORMED:
5787                 fprintf(stderr, "Authfile [%s] has a malformed entry. Should be 'user:password'", settings.auth_file);
5788                 exit(EXIT_FAILURE);
5789                 break;
5790             case AUTHFILE_OK:
5791                 break;
5792         }
5793     }
5794 
5795     /* initialize other stuff */
5796     stats_init();
5797     logger_init();
5798     conn_init();
5799     bool reuse_mem = false;
5800     void *mem_base = NULL;
5801     bool prefill = false;
5802     if (settings.memory_file != NULL) {
5803         preallocate = true;
5804         // Easier to manage memory if we prefill the global pool when reusing.
5805         prefill = true;
5806         restart_register("main", _mc_meta_load_cb, _mc_meta_save_cb, meta);
5807         reuse_mem = restart_mmap_open(settings.maxbytes,
5808                         settings.memory_file,
5809                         &mem_base);
5810         // The "save" callback gets called when we're closing out the mmap,
5811         // but we don't know what the mmap_base is until after we call open.
5812         // So we pass the struct above but have to fill it in here so the
5813         // data's available during the save routine.
5814         meta->mmap_base = mem_base;
5815         // Also, the callbacks for load() run before _open returns, so we
5816         // should have the old base in 'meta' as of here.
5817     }
5818     // Initialize the hash table _after_ checking restart metadata.
5819     // We override the hash table start argument with what was live
5820     // previously, to avoid filling a huge set of items into a tiny hash
5821     // table.
5822     assoc_init(settings.hashpower_init);
5823 #ifdef EXTSTORE
5824     if (storage_enabled && reuse_mem) {
5825         fprintf(stderr, "[restart] memory restart with extstore not presently supported.\n");
5826         reuse_mem = false;
5827     }
5828 #endif
5829     slabs_init(settings.maxbytes, settings.factor, preallocate,
5830             use_slab_sizes ? slab_sizes : NULL, mem_base, reuse_mem);
5831 #ifdef EXTSTORE
5832     if (storage_enabled) {
5833         storage = storage_init(storage_cf);
5834         if (storage == NULL) {
5835             exit(EXIT_FAILURE);
5836         }
5837         ext_storage = storage;
5838         /* page mover algorithm for extstore needs memory prefilled */
5839         prefill = true;
5840     }
5841 #endif
5842 
5843     if (settings.drop_privileges) {
5844         setup_privilege_violations_handler();
5845     }
5846 
5847     if (prefill)
5848         slabs_prefill_global();
5849     /* In restartable mode and we've decided to issue a fixup on memory */
5850     if (settings.memory_file != NULL && reuse_mem) {
5851         mc_ptr_t old_base = meta->old_base;
5852         assert(old_base == meta->old_base);
5853 
5854         // should've pulled in process_started from meta file.
5855         process_started = meta->process_started;
5856         // TODO: must be a more canonical way of serializing/deserializing
5857         // pointers? passing through uint64_t should work, and we're not
5858         // annotating the pointer with anything, but it's still slightly
5859         // insane.
5860         restart_fixup((void *)old_base);
5861     }
5862     /*
5863      * ignore SIGPIPE signals; we can use errno == EPIPE if we
5864      * need that information
5865      */
5866     if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
5867         perror("failed to ignore SIGPIPE; sigaction");
5868         exit(EX_OSERR);
5869     }
5870     /* start up worker threads if MT mode */
5871 #ifdef EXTSTORE
5872     slabs_set_storage(storage);
5873     memcached_thread_init(settings.num_threads, storage);
5874     init_lru_crawler(storage);
5875 #else
5876     memcached_thread_init(settings.num_threads, NULL);
5877     init_lru_crawler(NULL);
5878 #endif
5879 
5880     if (start_assoc_maint && start_assoc_maintenance_thread() == -1) {
5881         exit(EXIT_FAILURE);
5882     }
5883     if (start_lru_crawler && start_item_crawler_thread() != 0) {
5884         fprintf(stderr, "Failed to enable LRU crawler thread\n");
5885         exit(EXIT_FAILURE);
5886     }
5887 #ifdef EXTSTORE
5888     if (storage && start_storage_compact_thread(storage) != 0) {
5889         fprintf(stderr, "Failed to start storage compaction thread\n");
5890         exit(EXIT_FAILURE);
5891     }
5892     if (storage && start_storage_write_thread(storage) != 0) {
5893         fprintf(stderr, "Failed to start storage writer thread\n");
5894         exit(EXIT_FAILURE);
5895     }
5896 
5897     if (start_lru_maintainer && start_lru_maintainer_thread(storage) != 0) {
5898 #else
5899     if (start_lru_maintainer && start_lru_maintainer_thread(NULL) != 0) {
5900 #endif
5901         fprintf(stderr, "Failed to enable LRU maintainer thread\n");
5902         free(meta);
5903         return 1;
5904     }
5905 
5906     if (settings.slab_reassign &&
5907         start_slab_maintenance_thread() == -1) {
5908         exit(EXIT_FAILURE);
5909     }
5910 
5911     if (settings.idle_timeout && start_conn_timeout_thread() == -1) {
5912         exit(EXIT_FAILURE);
5913     }
5914 
5915     /* initialise clock event */
5916 #if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_MONOTONIC)
5917     {
5918         struct timespec ts;
5919         if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
5920             monotonic = true;
5921             monotonic_start = ts.tv_sec;
5922             // Monotonic clock needs special handling for restarts.
5923             // We get a start time at an arbitrary place, so we need to
5924             // restore the original time delta, which is always "now" - _start
5925             if (reuse_mem) {
5926                 // the running timespan at stop time + the time we think we
5927                 // were stopped.
5928                 monotonic_start -= meta->current_time + meta->time_delta;
5929             } else {
5930                 monotonic_start -= ITEM_UPDATE_INTERVAL + 2;
5931             }
5932         }
5933     }
5934 #endif
5935     clock_handler(0, 0, 0);
5936 
5937     /* create unix mode sockets after dropping privileges */
5938     if (settings.socketpath != NULL) {
5939         errno = 0;
5940         if (server_socket_unix(settings.socketpath,settings.access)) {
5941             vperror("failed to listen on UNIX socket: %s", settings.socketpath);
5942             exit(EX_OSERR);
5943         }
5944     }
5945 
5946     /* create the listening socket, bind it, and init */
5947     if (settings.socketpath == NULL) {
5948         const char *portnumber_filename = getenv("MEMCACHED_PORT_FILENAME");
5949         char *temp_portnumber_filename = NULL;
5950         size_t len;
5951         FILE *portnumber_file = NULL;
5952 
5953         if (portnumber_filename != NULL) {
5954             len = strlen(portnumber_filename)+4+1;
5955             temp_portnumber_filename = malloc(len);
5956             snprintf(temp_portnumber_filename,
5957                      len,
5958                      "%s.lck", portnumber_filename);
5959 
5960             portnumber_file = fopen(temp_portnumber_filename, "a");
5961             if (portnumber_file == NULL) {
5962                 fprintf(stderr, "Failed to open \"%s\": %s\n",
5963                         temp_portnumber_filename, strerror(errno));
5964             }
5965         }
5966 
5967         errno = 0;
5968         if (settings.port && server_sockets(settings.port, tcp_transport,
5969                                            portnumber_file)) {
5970             vperror("failed to listen on TCP port %d", settings.port);
5971             exit(EX_OSERR);
5972         }
5973 
5974         /*
5975          * initialization order: first create the listening sockets
5976          * (may need root on low ports), then drop root if needed,
5977          * then daemonize if needed, then init libevent (in some cases
5978          * descriptors created by libevent wouldn't survive forking).
5979          */
5980 
5981         /* create the UDP listening socket and bind it */
5982         errno = 0;
5983         if (settings.udpport && server_sockets(settings.udpport, udp_transport,
5984                                               portnumber_file)) {
5985             vperror("failed to listen on UDP port %d", settings.udpport);
5986             exit(EX_OSERR);
5987         }
5988 
5989         if (portnumber_file) {
5990             fclose(portnumber_file);
5991             rename(temp_portnumber_filename, portnumber_filename);
5992         }
5993         if (temp_portnumber_filename)
5994             free(temp_portnumber_filename);
5995     }
5996 
5997     /* Give the sockets a moment to open. I know this is dumb, but the error
5998      * is only an advisory.
5999      */
6000     usleep(1000);
6001     if (stats_state.curr_conns + stats_state.reserved_fds >= settings.maxconns - 1) {
6002         fprintf(stderr, "Maxconns setting is too low, use -c to increase.\n");
6003         exit(EXIT_FAILURE);
6004     }
6005 
6006     if (pid_file != NULL) {
6007         save_pid(pid_file);
6008     }
6009 
6010     /* Drop privileges no longer needed */
6011     if (settings.drop_privileges) {
6012         drop_privileges();
6013     }
6014 
6015     /* Initialize the uriencode lookup table. */
6016     uriencode_init();
6017 
6018     /* enter the event loop */
6019     while (!stop_main_loop) {
6020         if (event_base_loop(main_base, EVLOOP_ONCE) != 0) {
6021             retval = EXIT_FAILURE;
6022             break;
6023         }
6024     }
6025 
6026     switch (stop_main_loop) {
6027         case GRACE_STOP:
6028             fprintf(stderr, "Gracefully stopping\n");
6029         break;
6030         case EXIT_NORMALLY:
6031             // Don't need to print anything to STDERR for a normal shutdown.
6032         break;
6033         default:
6034             fprintf(stderr, "Exiting on error\n");
6035         break;
6036     }
6037 
6038     stop_threads();
6039     if (settings.memory_file != NULL && stop_main_loop == GRACE_STOP) {
6040         restart_mmap_close();
6041     }
6042 
6043     /* remove the PID file if we're a daemon */
6044     if (do_daemonize)
6045         remove_pidfile(pid_file);
6046     /* Clean up strdup() call for bind() address */
6047     if (settings.inter)
6048       free(settings.inter);
6049 
6050     /* cleanup base */
6051     event_base_free(main_base);
6052 
6053     free(meta);
6054 
6055     return retval;
6056 }
6057