1 /*
2  * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *   * Redistributions of source code must retain the above copyright notice,
9  *     this list of conditions and the following disclaimer.
10  *   * Redistributions in binary form must reproduce the above copyright
11  *     notice, this list of conditions and the following disclaimer in the
12  *     documentation and/or other materials provided with the distribution.
13  *   * Neither the name of Redis nor the names of its contributors may be used
14  *     to endorse or promote products derived from this software without
15  *     specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27  * POSSIBILITY OF SUCH DAMAGE.
28  */
29 
30 #include "server.h"
31 #include "lzf.h"    /* LZF compression library */
32 #include "zipmap.h"
33 #include "endianconv.h"
34 #include "stream.h"
35 
36 #include <math.h>
37 #include <sys/types.h>
38 #include <sys/time.h>
39 #include <sys/resource.h>
40 #include <sys/wait.h>
41 #include <arpa/inet.h>
42 #include <sys/stat.h>
43 #include <sys/param.h>
44 
/* Report a corrupted RDB and terminate: expands to a call to
 * rdbCheckThenExit() passing the current source line (__LINE__) followed by
 * the printf-style arguments given to the macro. */
#define rdbExitReportCorruptRDB(...) rdbCheckThenExit(__LINE__,__VA_ARGS__)

/* Symbols declared here but not defined in the part of the file visible
 * below. rdbCheckMode is tested by the loading code to decide whether errors
 * are reported through rdbCheckError() / rdbCheckSetError(). */
extern int rdbCheckMode;
void rdbCheckError(const char *fmt, ...);
void rdbCheckSetError(const char *fmt, ...);
50 
rdbCheckThenExit(int linenum,char * reason,...)51 void rdbCheckThenExit(int linenum, char *reason, ...) {
52     va_list ap;
53     char msg[1024];
54     int len;
55 
56     len = snprintf(msg,sizeof(msg),
57         "Internal error in RDB reading function at rdb.c:%d -> ", linenum);
58     va_start(ap,reason);
59     vsnprintf(msg+len,sizeof(msg)-len,reason,ap);
60     va_end(ap);
61 
62     if (!rdbCheckMode) {
63         serverLog(LL_WARNING, "%s", msg);
64         char *argv[2] = {"",server.rdb_filename};
65         redis_check_rdb_main(2,argv,NULL);
66     } else {
67         rdbCheckError("%s",msg);
68     }
69     exit(1);
70 }
71 
/* Write 'len' bytes starting at 'p' to the 'rdb' stream.
 *
 * When 'rdb' is NULL nothing is written and 'len' is returned as is.
 *
 * Returns 'len' on success, -1 if rioWrite() reports a failure. The return
 * type is ssize_t (and not int) so that lengths greater than INT_MAX are not
 * truncated, and cannot be confused with the -1 error value, when size_t is
 * wider than int. */
static ssize_t rdbWriteRaw(rio *rdb, void *p, size_t len) {
    if (rdb != NULL && rioWrite(rdb,p,len) == 0) return -1;
    return (ssize_t)len;
}
77 
78 /* This is just a wrapper for the low level function rioRead() that will
79  * automatically abort if it is not possible to read the specified amount
80  * of bytes. */
rdbLoadRaw(rio * rdb,void * buf,uint64_t len)81 void rdbLoadRaw(rio *rdb, void *buf, uint64_t len) {
82     if (rioRead(rdb,buf,len) == 0) {
83         rdbExitReportCorruptRDB(
84             "Impossible to read %llu bytes in rdbLoadRaw()",
85             (unsigned long long) len);
86         return; /* Not reached. */
87     }
88 }
89 
/* Write the single byte 'type' to the RDB stream. The return value is the
 * one of rdbWriteRaw(): -1 on write error, otherwise the number of bytes
 * accounted for (1). */
int rdbSaveType(rio *rdb, unsigned char type) {
    unsigned char byte = type;
    return rdbWriteRaw(rdb,&byte,1);
}
93 
94 /* Load a "type" in RDB format, that is a one byte unsigned integer.
95  * This function is not only used to load object types, but also special
96  * "types" like the end-of-file type, the EXPIRE type, and so forth. */
rdbLoadType(rio * rdb)97 int rdbLoadType(rio *rdb) {
98     unsigned char type;
99     if (rioRead(rdb,&type,1) == 0) return -1;
100     return type;
101 }
102 
103 /* This is only used to load old databases stored with the RDB_OPCODE_EXPIRETIME
104  * opcode. New versions of Redis store using the RDB_OPCODE_EXPIRETIME_MS
105  * opcode. */
rdbLoadTime(rio * rdb)106 time_t rdbLoadTime(rio *rdb) {
107     int32_t t32;
108     rdbLoadRaw(rdb,&t32,4);
109     return (time_t)t32;
110 }
111 
/* Write the millisecond time 't' as a 64 bit integer in little endian byte
 * order. Returns the value of rdbWriteRaw(): -1 on error, 8 otherwise. */
int rdbSaveMillisecondTime(rio *rdb, long long t) {
    int64_t ms = (int64_t) t;
    memrev64ifbe(&ms); /* On disk format is little endian. */
    return rdbWriteRaw(rdb,&ms,sizeof(ms));
}
117 
118 /* This function loads a time from the RDB file. It gets the version of the
119  * RDB because, unfortunately, before Redis 5 (RDB version 9), the function
120  * failed to convert data to/from little endian, so RDB files with keys having
121  * expires could not be shared between big endian and little endian systems
122  * (because the expire time will be totally wrong). The fix for this is just
123  * to call memrev64ifbe(), however if we fix this for all the RDB versions,
124  * this call will introduce an incompatibility for big endian systems:
125  * after upgrading to Redis version 5 they will no longer be able to load their
126  * own old RDB files. Because of that, we instead fix the function only for new
127  * RDB versions, and load older RDB versions as we used to do in the past,
128  * allowing big endian systems to load their own old RDB files. */
rdbLoadMillisecondTime(rio * rdb,int rdbver)129 long long rdbLoadMillisecondTime(rio *rdb, int rdbver) {
130     int64_t t64;
131     rdbLoadRaw(rdb,&t64,8);
132     if (rdbver >= 9) /* Check the top comment of this function. */
133         memrev64ifbe(&t64); /* Convert in big endian if the system is BE. */
134     return (long long)t64;
135 }
136 
137 /* Saves an encoded length. The first two bits in the first byte are used to
138  * hold the encoding type. See the RDB_* definitions for more information
139  * on the types of encoding. */
rdbSaveLen(rio * rdb,uint64_t len)140 int rdbSaveLen(rio *rdb, uint64_t len) {
141     unsigned char buf[2];
142     size_t nwritten;
143 
144     if (len < (1<<6)) {
145         /* Save a 6 bit len */
146         buf[0] = (len&0xFF)|(RDB_6BITLEN<<6);
147         if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
148         nwritten = 1;
149     } else if (len < (1<<14)) {
150         /* Save a 14 bit len */
151         buf[0] = ((len>>8)&0xFF)|(RDB_14BITLEN<<6);
152         buf[1] = len&0xFF;
153         if (rdbWriteRaw(rdb,buf,2) == -1) return -1;
154         nwritten = 2;
155     } else if (len <= UINT32_MAX) {
156         /* Save a 32 bit len */
157         buf[0] = RDB_32BITLEN;
158         if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
159         uint32_t len32 = htonl(len);
160         if (rdbWriteRaw(rdb,&len32,4) == -1) return -1;
161         nwritten = 1+4;
162     } else {
163         /* Save a 64 bit len */
164         buf[0] = RDB_64BITLEN;
165         if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
166         len = htonu64(len);
167         if (rdbWriteRaw(rdb,&len,8) == -1) return -1;
168         nwritten = 1+8;
169     }
170     return nwritten;
171 }
172 
173 
174 /* Load an encoded length. If the loaded length is a normal length as stored
175  * with rdbSaveLen(), the read length is set to '*lenptr'. If instead the
176  * loaded length describes a special encoding that follows, then '*isencoded'
177  * is set to 1 and the encoding format is stored at '*lenptr'.
178  *
179  * See the RDB_ENC_* definitions in rdb.h for more information on special
180  * encodings.
181  *
182  * The function returns -1 on error, 0 on success. */
rdbLoadLenByRef(rio * rdb,int * isencoded,uint64_t * lenptr)183 int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr) {
184     unsigned char buf[2];
185     int type;
186 
187     if (isencoded) *isencoded = 0;
188     if (rioRead(rdb,buf,1) == 0) return -1;
189     type = (buf[0]&0xC0)>>6;
190     if (type == RDB_ENCVAL) {
191         /* Read a 6 bit encoding type. */
192         if (isencoded) *isencoded = 1;
193         *lenptr = buf[0]&0x3F;
194     } else if (type == RDB_6BITLEN) {
195         /* Read a 6 bit len. */
196         *lenptr = buf[0]&0x3F;
197     } else if (type == RDB_14BITLEN) {
198         /* Read a 14 bit len. */
199         if (rioRead(rdb,buf+1,1) == 0) return -1;
200         *lenptr = ((buf[0]&0x3F)<<8)|buf[1];
201     } else if (buf[0] == RDB_32BITLEN) {
202         /* Read a 32 bit len. */
203         uint32_t len;
204         if (rioRead(rdb,&len,4) == 0) return -1;
205         *lenptr = ntohl(len);
206     } else if (buf[0] == RDB_64BITLEN) {
207         /* Read a 64 bit len. */
208         uint64_t len;
209         if (rioRead(rdb,&len,8) == 0) return -1;
210         *lenptr = ntohu64(len);
211     } else {
212         rdbExitReportCorruptRDB(
213             "Unknown length encoding %d in rdbLoadLen()",type);
214         return -1; /* Never reached. */
215     }
216     return 0;
217 }
218 
219 /* This is like rdbLoadLenByRef() but directly returns the value read
220  * from the RDB stream, signaling an error by returning RDB_LENERR
221  * (since it is a too large count to be applicable in any Redis data
222  * structure). */
rdbLoadLen(rio * rdb,int * isencoded)223 uint64_t rdbLoadLen(rio *rdb, int *isencoded) {
224     uint64_t len;
225 
226     if (rdbLoadLenByRef(rdb,isencoded,&len) == -1) return RDB_LENERR;
227     return len;
228 }
229 
230 /* Encodes the "value" argument as integer when it fits in the supported ranges
231  * for encoded types. If the function successfully encodes the integer, the
232  * representation is stored in the buffer pointer to by "enc" and the string
233  * length is returned. Otherwise 0 is returned. */
rdbEncodeInteger(long long value,unsigned char * enc)234 int rdbEncodeInteger(long long value, unsigned char *enc) {
235     if (value >= -(1<<7) && value <= (1<<7)-1) {
236         enc[0] = (RDB_ENCVAL<<6)|RDB_ENC_INT8;
237         enc[1] = value&0xFF;
238         return 2;
239     } else if (value >= -(1<<15) && value <= (1<<15)-1) {
240         enc[0] = (RDB_ENCVAL<<6)|RDB_ENC_INT16;
241         enc[1] = value&0xFF;
242         enc[2] = (value>>8)&0xFF;
243         return 3;
244     } else if (value >= -((long long)1<<31) && value <= ((long long)1<<31)-1) {
245         enc[0] = (RDB_ENCVAL<<6)|RDB_ENC_INT32;
246         enc[1] = value&0xFF;
247         enc[2] = (value>>8)&0xFF;
248         enc[3] = (value>>16)&0xFF;
249         enc[4] = (value>>24)&0xFF;
250         return 5;
251     } else {
252         return 0;
253     }
254 }
255 
256 /* Loads an integer-encoded object with the specified encoding type "enctype".
257  * The returned value changes according to the flags, see
258  * rdbGenerincLoadStringObject() for more info. */
rdbLoadIntegerObject(rio * rdb,int enctype,int flags,size_t * lenptr)259 void *rdbLoadIntegerObject(rio *rdb, int enctype, int flags, size_t *lenptr) {
260     int plain = flags & RDB_LOAD_PLAIN;
261     int sds = flags & RDB_LOAD_SDS;
262     int encode = flags & RDB_LOAD_ENC;
263     unsigned char enc[4];
264     long long val;
265 
266     if (enctype == RDB_ENC_INT8) {
267         if (rioRead(rdb,enc,1) == 0) return NULL;
268         val = (signed char)enc[0];
269     } else if (enctype == RDB_ENC_INT16) {
270         uint16_t v;
271         if (rioRead(rdb,enc,2) == 0) return NULL;
272         v = enc[0]|(enc[1]<<8);
273         val = (int16_t)v;
274     } else if (enctype == RDB_ENC_INT32) {
275         uint32_t v;
276         if (rioRead(rdb,enc,4) == 0) return NULL;
277         v = enc[0]|(enc[1]<<8)|(enc[2]<<16)|(enc[3]<<24);
278         val = (int32_t)v;
279     } else {
280         val = 0; /* anti-warning */
281         rdbExitReportCorruptRDB("Unknown RDB integer encoding type %d",enctype);
282     }
283     if (plain || sds) {
284         char buf[LONG_STR_SIZE], *p;
285         int len = ll2string(buf,sizeof(buf),val);
286         if (lenptr) *lenptr = len;
287         p = plain ? zmalloc(len) : sdsnewlen(SDS_NOINIT,len);
288         memcpy(p,buf,len);
289         return p;
290     } else if (encode) {
291         return createStringObjectFromLongLongForValue(val);
292     } else {
293         return createObject(OBJ_STRING,sdsfromlonglong(val));
294     }
295 }
296 
/* String objects in the form "2391" "-100" without any space and with a
 * range of values that can fit in an 8, 16 or 32 bit signed value can be
 * encoded as integers to save space.
 *
 * 's' points to 'len' bytes that are NOT required to be null terminated.
 * On success the encoding is written to 'enc' and its length is returned
 * (see rdbEncodeInteger()), otherwise 0 is returned. */
int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
    long long value;
    char *endptr, buf[32], tmp[32];

    /* The input must be identical to what ll2string() writes into 'buf':
     * that is never an empty string and, since the original code calls
     * strlen() on it, it is expected to be null terminated inside the 32
     * bytes of the buffer. Any other length can't match, so reject it
     * before touching the bytes. */
    if (len == 0 || len >= sizeof(tmp)) return 0;

    /* strtoll() needs a null terminated string, while callers may pass a
     * buffer that is not terminated: parse a bounded, terminated copy so
     * that we never read past 'len' bytes of the input. */
    memcpy(tmp,s,len);
    tmp[len] = '\0';

    /* Check if it's possible to encode this value as a number */
    value = strtoll(tmp, &endptr, 10);
    if (endptr[0] != '\0') return 0;
    ll2string(buf,32,value);

    /* If the number converted back into a string is not identical
     * then it's not possible to encode the string as integer */
    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;

    return rdbEncodeInteger(value,enc);
}
315 
/* Write an already LZF compressed blob to the RDB stream. The layout is:
 * the RDB_ENC_LZF marker byte, the compressed length, the original
 * (uncompressed) length, and finally 'compress_len' bytes taken from 'data'.
 *
 * Returns the total number of bytes written, or -1 on write error. */
ssize_t rdbSaveLzfBlob(rio *rdb, void *data, size_t compress_len,
                       size_t original_len) {
    unsigned char marker = (RDB_ENCVAL<<6)|RDB_ENC_LZF;
    ssize_t total = 0, n;

    /* Encoding marker. */
    n = rdbWriteRaw(rdb,&marker,1);
    if (n == -1) return -1;
    total += n;

    /* Compressed size, then uncompressed size. */
    n = rdbSaveLen(rdb,compress_len);
    if (n == -1) return -1;
    total += n;

    n = rdbSaveLen(rdb,original_len);
    if (n == -1) return -1;
    total += n;

    /* The compressed payload itself. */
    n = rdbWriteRaw(rdb,data,compress_len);
    if (n == -1) return -1;
    total += n;

    return total;
}
340 
/* Try to LZF compress the 'len' bytes at 's' and write the result with
 * rdbSaveLzfBlob().
 *
 * Returns the number of bytes written, -1 on write error, or 0 if the string
 * was not saved at all: too short, allocation failure, or lzf_compress()
 * unable to produce an output at least four bytes smaller than the input. */
ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) {
    /* At least four bytes must be saved for the compression to be worth it. */
    if (len <= 4) return 0;

    size_t outlen = len-4;
    void *out = zmalloc(outlen+1);
    if (out == NULL) return 0;

    ssize_t nwritten = 0;
    size_t comprlen = lzf_compress(s, len, out, outlen);
    if (comprlen != 0) nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len);

    zfree(out);
    return nwritten;
}
358 
359 /* Load an LZF compressed string in RDB format. The returned value
360  * changes according to 'flags'. For more info check the
361  * rdbGenericLoadStringObject() function. */
rdbLoadLzfStringObject(rio * rdb,int flags,size_t * lenptr)362 void *rdbLoadLzfStringObject(rio *rdb, int flags, size_t *lenptr) {
363     int plain = flags & RDB_LOAD_PLAIN;
364     int sds = flags & RDB_LOAD_SDS;
365     uint64_t len, clen;
366     unsigned char *c = NULL;
367     char *val = NULL;
368 
369     if ((clen = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL;
370     if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL;
371     if ((c = zmalloc(clen)) == NULL) goto err;
372 
373     /* Allocate our target according to the uncompressed size. */
374     if (plain) {
375         val = zmalloc(len);
376     } else {
377         val = sdsnewlen(SDS_NOINIT,len);
378     }
379     if (lenptr) *lenptr = len;
380 
381     /* Load the compressed representation and uncompress it to target. */
382     if (rioRead(rdb,c,clen) == 0) goto err;
383     if (lzf_decompress(c,clen,val,len) == 0) {
384         if (rdbCheckMode) rdbCheckSetError("Invalid LZF compressed string");
385         goto err;
386     }
387     zfree(c);
388 
389     if (plain || sds) {
390         return val;
391     } else {
392         return createObject(OBJ_STRING,val);
393     }
394 err:
395     zfree(c);
396     if (plain)
397         zfree(val);
398     else
399         sdsfree(val);
400     return NULL;
401 }
402 
403 /* Save a string object as [len][data] on disk. If the object is a string
404  * representation of an integer value we try to save it in a special form */
rdbSaveRawString(rio * rdb,unsigned char * s,size_t len)405 ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len) {
406     int enclen;
407     ssize_t n, nwritten = 0;
408 
409     /* Try integer encoding */
410     if (len <= 11) {
411         unsigned char buf[5];
412         if ((enclen = rdbTryIntegerEncoding((char*)s,len,buf)) > 0) {
413             if (rdbWriteRaw(rdb,buf,enclen) == -1) return -1;
414             return enclen;
415         }
416     }
417 
418     /* Try LZF compression - under 20 bytes it's unable to compress even
419      * aaaaaaaaaaaaaaaaaa so skip it */
420     if (server.rdb_compression && len > 20) {
421         n = rdbSaveLzfStringObject(rdb,s,len);
422         if (n == -1) return -1;
423         if (n > 0) return n;
424         /* Return value of 0 means data can't be compressed, save the old way */
425     }
426 
427     /* Store verbatim */
428     if ((n = rdbSaveLen(rdb,len)) == -1) return -1;
429     nwritten += n;
430     if (len > 0) {
431         if (rdbWriteRaw(rdb,s,len) == -1) return -1;
432         nwritten += len;
433     }
434     return nwritten;
435 }
436 
437 /* Save a long long value as either an encoded string or a string. */
rdbSaveLongLongAsStringObject(rio * rdb,long long value)438 ssize_t rdbSaveLongLongAsStringObject(rio *rdb, long long value) {
439     unsigned char buf[32];
440     ssize_t n, nwritten = 0;
441     int enclen = rdbEncodeInteger(value,buf);
442     if (enclen > 0) {
443         return rdbWriteRaw(rdb,buf,enclen);
444     } else {
445         /* Encode as string */
446         enclen = ll2string((char*)buf,32,value);
447         serverAssert(enclen < 32);
448         if ((n = rdbSaveLen(rdb,enclen)) == -1) return -1;
449         nwritten += n;
450         if ((n = rdbWriteRaw(rdb,buf,enclen)) == -1) return -1;
451         nwritten += n;
452     }
453     return nwritten;
454 }
455 
456 /* Like rdbSaveRawString() gets a Redis object instead. */
rdbSaveStringObject(rio * rdb,robj * obj)457 ssize_t rdbSaveStringObject(rio *rdb, robj *obj) {
458     /* Avoid to decode the object, then encode it again, if the
459      * object is already integer encoded. */
460     if (obj->encoding == OBJ_ENCODING_INT) {
461         return rdbSaveLongLongAsStringObject(rdb,(long)obj->ptr);
462     } else {
463         serverAssertWithInfo(NULL,obj,sdsEncodedObject(obj));
464         return rdbSaveRawString(rdb,obj->ptr,sdslen(obj->ptr));
465     }
466 }
467 
468 /* Load a string object from an RDB file according to flags:
469  *
470  * RDB_LOAD_NONE (no flags): load an RDB object, unencoded.
471  * RDB_LOAD_ENC: If the returned type is a Redis object, try to
472  *               encode it in a special way to be more memory
473  *               efficient. When this flag is passed the function
474  *               no longer guarantees that obj->ptr is an SDS string.
475  * RDB_LOAD_PLAIN: Return a plain string allocated with zmalloc()
476  *                 instead of a Redis object with an sds in it.
477  * RDB_LOAD_SDS: Return an SDS string instead of a Redis object.
478  *
479  * On I/O error NULL is returned.
480  */
rdbGenericLoadStringObject(rio * rdb,int flags,size_t * lenptr)481 void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr) {
482     int encode = flags & RDB_LOAD_ENC;
483     int plain = flags & RDB_LOAD_PLAIN;
484     int sds = flags & RDB_LOAD_SDS;
485     int isencoded;
486     uint64_t len;
487 
488     len = rdbLoadLen(rdb,&isencoded);
489     if (isencoded) {
490         switch(len) {
491         case RDB_ENC_INT8:
492         case RDB_ENC_INT16:
493         case RDB_ENC_INT32:
494             return rdbLoadIntegerObject(rdb,len,flags,lenptr);
495         case RDB_ENC_LZF:
496             return rdbLoadLzfStringObject(rdb,flags,lenptr);
497         default:
498             rdbExitReportCorruptRDB("Unknown RDB string encoding type %d",len);
499         }
500     }
501 
502     if (len == RDB_LENERR) return NULL;
503     if (plain || sds) {
504         void *buf = plain ? zmalloc(len) : sdsnewlen(SDS_NOINIT,len);
505         if (lenptr) *lenptr = len;
506         if (len && rioRead(rdb,buf,len) == 0) {
507             if (plain)
508                 zfree(buf);
509             else
510                 sdsfree(buf);
511             return NULL;
512         }
513         return buf;
514     } else {
515         robj *o = encode ? createStringObject(SDS_NOINIT,len) :
516                            createRawStringObject(SDS_NOINIT,len);
517         if (len && rioRead(rdb,o->ptr,len) == 0) {
518             decrRefCount(o);
519             return NULL;
520         }
521         return o;
522     }
523 }
524 
rdbLoadStringObject(rio * rdb)525 robj *rdbLoadStringObject(rio *rdb) {
526     return rdbGenericLoadStringObject(rdb,RDB_LOAD_NONE,NULL);
527 }
528 
rdbLoadEncodedStringObject(rio * rdb)529 robj *rdbLoadEncodedStringObject(rio *rdb) {
530     return rdbGenericLoadStringObject(rdb,RDB_LOAD_ENC,NULL);
531 }
532 
533 /* Save a double value. Doubles are saved as strings prefixed by an unsigned
534  * 8 bit integer specifying the length of the representation.
535  * This 8 bit integer has special values in order to specify the following
536  * conditions:
537  * 253: not a number
538  * 254: + inf
539  * 255: - inf
540  */
rdbSaveDoubleValue(rio * rdb,double val)541 int rdbSaveDoubleValue(rio *rdb, double val) {
542     unsigned char buf[128];
543     int len;
544 
545     if (isnan(val)) {
546         buf[0] = 253;
547         len = 1;
548     } else if (!isfinite(val)) {
549         len = 1;
550         buf[0] = (val < 0) ? 255 : 254;
551     } else {
552 #if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
553         /* Check if the float is in a safe range to be casted into a
554          * long long. We are assuming that long long is 64 bit here.
555          * Also we are assuming that there are no implementations around where
556          * double has precision < 52 bit.
557          *
558          * Under this assumptions we test if a double is inside an interval
559          * where casting to long long is safe. Then using two castings we
560          * make sure the decimal part is zero. If all this is true we use
561          * integer printing function that is much faster. */
562         double min = -4503599627370495; /* (2^52)-1 */
563         double max = 4503599627370496; /* -(2^52) */
564         if (val > min && val < max && val == ((double)((long long)val)))
565             ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
566         else
567 #endif
568             snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
569         buf[0] = strlen((char*)buf+1);
570         len = buf[0]+1;
571     }
572     return rdbWriteRaw(rdb,buf,len);
573 }
574 
575 /* For information about double serialization check rdbSaveDoubleValue() */
rdbLoadDoubleValue(rio * rdb,double * val)576 int rdbLoadDoubleValue(rio *rdb, double *val) {
577     char buf[256];
578     unsigned char len;
579 
580     if (rioRead(rdb,&len,1) == 0) return -1;
581     switch(len) {
582     case 255: *val = R_NegInf; return 0;
583     case 254: *val = R_PosInf; return 0;
584     case 253: *val = R_Nan; return 0;
585     default:
586         if (rioRead(rdb,buf,len) == 0) return -1;
587         buf[len] = '\0';
588         sscanf(buf, "%lg", val);
589         return 0;
590     }
591 }
592 
593 /* Saves a double for RDB 8 or greater, where IE754 binary64 format is assumed.
594  * We just make sure the integer is always stored in little endian, otherwise
595  * the value is copied verbatim from memory to disk.
596  *
597  * Return -1 on error, the size of the serialized value on success. */
rdbSaveBinaryDoubleValue(rio * rdb,double val)598 int rdbSaveBinaryDoubleValue(rio *rdb, double val) {
599     memrev64ifbe(&val);
600     return rdbWriteRaw(rdb,&val,sizeof(val));
601 }
602 
603 /* Loads a double from RDB 8 or greater. See rdbSaveBinaryDoubleValue() for
604  * more info. On error -1 is returned, otherwise 0. */
rdbLoadBinaryDoubleValue(rio * rdb,double * val)605 int rdbLoadBinaryDoubleValue(rio *rdb, double *val) {
606     if (rioRead(rdb,val,sizeof(*val)) == 0) return -1;
607     memrev64ifbe(val);
608     return 0;
609 }
610 
611 /* Like rdbSaveBinaryDoubleValue() but single precision. */
rdbSaveBinaryFloatValue(rio * rdb,float val)612 int rdbSaveBinaryFloatValue(rio *rdb, float val) {
613     memrev32ifbe(&val);
614     return rdbWriteRaw(rdb,&val,sizeof(val));
615 }
616 
617 /* Like rdbLoadBinaryDoubleValue() but single precision. */
rdbLoadBinaryFloatValue(rio * rdb,float * val)618 int rdbLoadBinaryFloatValue(rio *rdb, float *val) {
619     if (rioRead(rdb,val,sizeof(*val)) == 0) return -1;
620     memrev32ifbe(val);
621     return 0;
622 }
623 
624 /* Save the object type of object "o". */
rdbSaveObjectType(rio * rdb,robj * o)625 int rdbSaveObjectType(rio *rdb, robj *o) {
626     switch (o->type) {
627     case OBJ_STRING:
628         return rdbSaveType(rdb,RDB_TYPE_STRING);
629     case OBJ_LIST:
630         if (o->encoding == OBJ_ENCODING_QUICKLIST)
631             return rdbSaveType(rdb,RDB_TYPE_LIST_QUICKLIST);
632         else
633             serverPanic("Unknown list encoding");
634     case OBJ_SET:
635         if (o->encoding == OBJ_ENCODING_INTSET)
636             return rdbSaveType(rdb,RDB_TYPE_SET_INTSET);
637         else if (o->encoding == OBJ_ENCODING_HT)
638             return rdbSaveType(rdb,RDB_TYPE_SET);
639         else
640             serverPanic("Unknown set encoding");
641     case OBJ_ZSET:
642         if (o->encoding == OBJ_ENCODING_ZIPLIST)
643             return rdbSaveType(rdb,RDB_TYPE_ZSET_ZIPLIST);
644         else if (o->encoding == OBJ_ENCODING_SKIPLIST)
645             return rdbSaveType(rdb,RDB_TYPE_ZSET_2);
646         else
647             serverPanic("Unknown sorted set encoding");
648     case OBJ_HASH:
649         if (o->encoding == OBJ_ENCODING_ZIPLIST)
650             return rdbSaveType(rdb,RDB_TYPE_HASH_ZIPLIST);
651         else if (o->encoding == OBJ_ENCODING_HT)
652             return rdbSaveType(rdb,RDB_TYPE_HASH);
653         else
654             serverPanic("Unknown hash encoding");
655     case OBJ_STREAM:
656         return rdbSaveType(rdb,RDB_TYPE_STREAM_LISTPACKS);
657     case OBJ_MODULE:
658         return rdbSaveType(rdb,RDB_TYPE_MODULE_2);
659     default:
660         serverPanic("Unknown object type");
661     }
662     return -1; /* avoid warning */
663 }
664 
665 /* Use rdbLoadType() to load a TYPE in RDB format, but returns -1 if the
666  * type is not specifically a valid Object Type. */
rdbLoadObjectType(rio * rdb)667 int rdbLoadObjectType(rio *rdb) {
668     int type;
669     if ((type = rdbLoadType(rdb)) == -1) return -1;
670     if (!rdbIsObjectType(type)) return -1;
671     return type;
672 }
673 
674 /* This helper function serializes a consumer group Pending Entries List (PEL)
675  * into the RDB file. The 'nacks' argument tells the function if also persist
676  * the informations about the not acknowledged message, or if to persist
677  * just the IDs: this is useful because for the global consumer group PEL
678  * we serialized the NACKs as well, but when serializing the local consumer
679  * PELs we just add the ID, that will be resolved inside the global PEL to
680  * put a reference to the same structure. */
rdbSaveStreamPEL(rio * rdb,rax * pel,int nacks)681 ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, int nacks) {
682     ssize_t n, nwritten = 0;
683 
684     /* Number of entries in the PEL. */
685     if ((n = rdbSaveLen(rdb,raxSize(pel))) == -1) return -1;
686     nwritten += n;
687 
688     /* Save each entry. */
689     raxIterator ri;
690     raxStart(&ri,pel);
691     raxSeek(&ri,"^",NULL,0);
692     while(raxNext(&ri)) {
693         /* We store IDs in raw form as 128 big big endian numbers, like
694          * they are inside the radix tree key. */
695         if ((n = rdbWriteRaw(rdb,ri.key,sizeof(streamID))) == -1) return -1;
696         nwritten += n;
697 
698         if (nacks) {
699             streamNACK *nack = ri.data;
700             if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1)
701                 return -1;
702             nwritten += n;
703             if ((n = rdbSaveLen(rdb,nack->delivery_count)) == -1) return -1;
704             nwritten += n;
705             /* We don't save the consumer name: we'll save the pending IDs
706              * for each consumer in the consumer PEL, and resolve the consumer
707              * at loading time. */
708         }
709     }
710     raxStop(&ri);
711     return nwritten;
712 }
713 
714 /* Serialize the consumers of a stream consumer group into the RDB. Helper
715  * function for the stream data type serialization. What we do here is to
716  * persist the consumer metadata, and it's PEL, for each consumer. */
rdbSaveStreamConsumers(rio * rdb,streamCG * cg)717 size_t rdbSaveStreamConsumers(rio *rdb, streamCG *cg) {
718     ssize_t n, nwritten = 0;
719 
720     /* Number of consumers in this consumer group. */
721     if ((n = rdbSaveLen(rdb,raxSize(cg->consumers))) == -1) return -1;
722     nwritten += n;
723 
724     /* Save each consumer. */
725     raxIterator ri;
726     raxStart(&ri,cg->consumers);
727     raxSeek(&ri,"^",NULL,0);
728     while(raxNext(&ri)) {
729         streamConsumer *consumer = ri.data;
730 
731         /* Consumer name. */
732         if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1) return -1;
733         nwritten += n;
734 
735         /* Last seen time. */
736         if ((n = rdbSaveMillisecondTime(rdb,consumer->seen_time)) == -1)
737             return -1;
738         nwritten += n;
739 
740         /* Consumer PEL, without the ACKs (see last parameter of the function
741          * passed with value of 0), at loading time we'll lookup the ID
742          * in the consumer group global PEL and will put a reference in the
743          * consumer local PEL. */
744         if ((n = rdbSaveStreamPEL(rdb,consumer->pel,0)) == -1)
745             return -1;
746         nwritten += n;
747     }
748     raxStop(&ri);
749     return nwritten;
750 }
751 
752 /* Save a Redis object.
753  * Returns -1 on error, number of bytes written on success. */
rdbSaveObject(rio * rdb,robj * o,robj * key)754 ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key) {
755     ssize_t n = 0, nwritten = 0;
756 
757     if (o->type == OBJ_STRING) {
758         /* Save a string value */
759         if ((n = rdbSaveStringObject(rdb,o)) == -1) return -1;
760         nwritten += n;
761     } else if (o->type == OBJ_LIST) {
762         /* Save a list value */
763         if (o->encoding == OBJ_ENCODING_QUICKLIST) {
764             quicklist *ql = o->ptr;
765             quicklistNode *node = ql->head;
766 
767             if ((n = rdbSaveLen(rdb,ql->len)) == -1) return -1;
768             nwritten += n;
769 
770             while(node) {
771                 if (quicklistNodeIsCompressed(node)) {
772                     void *data;
773                     size_t compress_len = quicklistGetLzf(node, &data);
774                     if ((n = rdbSaveLzfBlob(rdb,data,compress_len,node->sz)) == -1) return -1;
775                     nwritten += n;
776                 } else {
777                     if ((n = rdbSaveRawString(rdb,node->zl,node->sz)) == -1) return -1;
778                     nwritten += n;
779                 }
780                 node = node->next;
781             }
782         } else {
783             serverPanic("Unknown list encoding");
784         }
785     } else if (o->type == OBJ_SET) {
786         /* Save a set value */
787         if (o->encoding == OBJ_ENCODING_HT) {
788             dict *set = o->ptr;
789             dictIterator *di = dictGetIterator(set);
790             dictEntry *de;
791 
792             if ((n = rdbSaveLen(rdb,dictSize(set))) == -1) {
793                 dictReleaseIterator(di);
794                 return -1;
795             }
796             nwritten += n;
797 
798             while((de = dictNext(di)) != NULL) {
799                 sds ele = dictGetKey(de);
800                 if ((n = rdbSaveRawString(rdb,(unsigned char*)ele,sdslen(ele)))
801                     == -1)
802                 {
803                     dictReleaseIterator(di);
804                     return -1;
805                 }
806                 nwritten += n;
807             }
808             dictReleaseIterator(di);
809         } else if (o->encoding == OBJ_ENCODING_INTSET) {
810             size_t l = intsetBlobLen((intset*)o->ptr);
811 
812             if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1;
813             nwritten += n;
814         } else {
815             serverPanic("Unknown set encoding");
816         }
817     } else if (o->type == OBJ_ZSET) {
818         /* Save a sorted set value */
819         if (o->encoding == OBJ_ENCODING_ZIPLIST) {
820             size_t l = ziplistBlobLen((unsigned char*)o->ptr);
821 
822             if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1;
823             nwritten += n;
824         } else if (o->encoding == OBJ_ENCODING_SKIPLIST) {
825             zset *zs = o->ptr;
826             zskiplist *zsl = zs->zsl;
827 
828             if ((n = rdbSaveLen(rdb,zsl->length)) == -1) return -1;
829             nwritten += n;
830 
831             /* We save the skiplist elements from the greatest to the smallest
832              * (that's trivial since the elements are already ordered in the
833              * skiplist): this improves the load process, since the next loaded
834              * element will always be the smaller, so adding to the skiplist
835              * will always immediately stop at the head, making the insertion
836              * O(1) instead of O(log(N)). */
837             zskiplistNode *zn = zsl->tail;
838             while (zn != NULL) {
839                 if ((n = rdbSaveRawString(rdb,
840                     (unsigned char*)zn->ele,sdslen(zn->ele))) == -1)
841                 {
842                     return -1;
843                 }
844                 nwritten += n;
845                 if ((n = rdbSaveBinaryDoubleValue(rdb,zn->score)) == -1)
846                     return -1;
847                 nwritten += n;
848                 zn = zn->backward;
849             }
850         } else {
851             serverPanic("Unknown sorted set encoding");
852         }
853     } else if (o->type == OBJ_HASH) {
854         /* Save a hash value */
855         if (o->encoding == OBJ_ENCODING_ZIPLIST) {
856             size_t l = ziplistBlobLen((unsigned char*)o->ptr);
857 
858             if ((n = rdbSaveRawString(rdb,o->ptr,l)) == -1) return -1;
859             nwritten += n;
860 
861         } else if (o->encoding == OBJ_ENCODING_HT) {
862             dictIterator *di = dictGetIterator(o->ptr);
863             dictEntry *de;
864 
865             if ((n = rdbSaveLen(rdb,dictSize((dict*)o->ptr))) == -1) {
866                 dictReleaseIterator(di);
867                 return -1;
868             }
869             nwritten += n;
870 
871             while((de = dictNext(di)) != NULL) {
872                 sds field = dictGetKey(de);
873                 sds value = dictGetVal(de);
874 
875                 if ((n = rdbSaveRawString(rdb,(unsigned char*)field,
876                         sdslen(field))) == -1)
877                 {
878                     dictReleaseIterator(di);
879                     return -1;
880                 }
881                 nwritten += n;
882                 if ((n = rdbSaveRawString(rdb,(unsigned char*)value,
883                         sdslen(value))) == -1)
884                 {
885                     dictReleaseIterator(di);
886                     return -1;
887                 }
888                 nwritten += n;
889             }
890             dictReleaseIterator(di);
891         } else {
892             serverPanic("Unknown hash encoding");
893         }
894     } else if (o->type == OBJ_STREAM) {
895         /* Store how many listpacks we have inside the radix tree. */
896         stream *s = o->ptr;
897         rax *rax = s->rax;
898         if ((n = rdbSaveLen(rdb,raxSize(rax))) == -1) return -1;
899         nwritten += n;
900 
901         /* Serialize all the listpacks inside the radix tree as they are,
902          * when loading back, we'll use the first entry of each listpack
903          * to insert it back into the radix tree. */
904         raxIterator ri;
905         raxStart(&ri,rax);
906         raxSeek(&ri,"^",NULL,0);
907         while (raxNext(&ri)) {
908             unsigned char *lp = ri.data;
909             size_t lp_bytes = lpBytes(lp);
910             if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1) return -1;
911             nwritten += n;
912             if ((n = rdbSaveRawString(rdb,lp,lp_bytes)) == -1) return -1;
913             nwritten += n;
914         }
915         raxStop(&ri);
916 
917         /* Save the number of elements inside the stream. We cannot obtain
918          * this easily later, since our macro nodes should be checked for
919          * number of items: not a great CPU / space tradeoff. */
920         if ((n = rdbSaveLen(rdb,s->length)) == -1) return -1;
921         nwritten += n;
922         /* Save the last entry ID. */
923         if ((n = rdbSaveLen(rdb,s->last_id.ms)) == -1) return -1;
924         nwritten += n;
925         if ((n = rdbSaveLen(rdb,s->last_id.seq)) == -1) return -1;
926         nwritten += n;
927 
928         /* The consumer groups and their clients are part of the stream
929          * type, so serialize every consumer group. */
930 
931         /* Save the number of groups. */
932         size_t num_cgroups = s->cgroups ? raxSize(s->cgroups) : 0;
933         if ((n = rdbSaveLen(rdb,num_cgroups)) == -1) return -1;
934         nwritten += n;
935 
936         if (num_cgroups) {
937             /* Serialize each consumer group. */
938             raxStart(&ri,s->cgroups);
939             raxSeek(&ri,"^",NULL,0);
940             while(raxNext(&ri)) {
941                 streamCG *cg = ri.data;
942 
943                 /* Save the group name. */
944                 if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1)
945                     return -1;
946                 nwritten += n;
947 
948                 /* Last ID. */
949                 if ((n = rdbSaveLen(rdb,cg->last_id.ms)) == -1) return -1;
950                 nwritten += n;
951                 if ((n = rdbSaveLen(rdb,cg->last_id.seq)) == -1) return -1;
952                 nwritten += n;
953 
954                 /* Save the global PEL. */
955                 if ((n = rdbSaveStreamPEL(rdb,cg->pel,1)) == -1) return -1;
956                 nwritten += n;
957 
958                 /* Save the consumers of this group. */
959                 if ((n = rdbSaveStreamConsumers(rdb,cg)) == -1) return -1;
960                 nwritten += n;
961             }
962             raxStop(&ri);
963         }
964     } else if (o->type == OBJ_MODULE) {
965         /* Save a module-specific value. */
966         RedisModuleIO io;
967         moduleValue *mv = o->ptr;
968         moduleType *mt = mv->type;
969 
970         /* Write the "module" identifier as prefix, so that we'll be able
971          * to call the right module during loading. */
972         int retval = rdbSaveLen(rdb,mt->id);
973         if (retval == -1) return -1;
974         io.bytes += retval;
975 
976         /* Then write the module-specific representation + EOF marker. */
977         moduleInitIOContext(io,mt,rdb,key);
978         mt->rdb_save(&io,mv->value);
979         retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_EOF);
980         if (retval == -1)
981             io.error = 1;
982         else
983             io.bytes += retval;
984 
985         if (io.ctx) {
986             moduleFreeContext(io.ctx);
987             zfree(io.ctx);
988         }
989         return io.error ? -1 : (ssize_t)io.bytes;
990     } else {
991         serverPanic("Unknown object type");
992     }
993     return nwritten;
994 }
995 
996 /* Return the length the object will have on disk if saved with
997  * the rdbSaveObject() function. Currently we use a trick to get
998  * this length with very little changes to the code. In the future
999  * we could switch to a faster solution. */
rdbSavedObjectLen(robj * o)1000 size_t rdbSavedObjectLen(robj *o) {
1001     ssize_t len = rdbSaveObject(NULL,o,NULL);
1002     serverAssertWithInfo(NULL,o,len != -1);
1003     return len;
1004 }
1005 
1006 /* Save a key-value pair, with expire time, type, key, value.
1007  * On error -1 is returned.
1008  * On success if the key was actually saved 1 is returned, otherwise 0
1009  * is returned (the key was already expired). */
rdbSaveKeyValuePair(rio * rdb,robj * key,robj * val,long long expiretime)1010 int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime) {
1011     int savelru = server.maxmemory_policy & MAXMEMORY_FLAG_LRU;
1012     int savelfu = server.maxmemory_policy & MAXMEMORY_FLAG_LFU;
1013 
1014     /* Save the expire time */
1015     if (expiretime != -1) {
1016         if (rdbSaveType(rdb,RDB_OPCODE_EXPIRETIME_MS) == -1) return -1;
1017         if (rdbSaveMillisecondTime(rdb,expiretime) == -1) return -1;
1018     }
1019 
1020     /* Save the LRU info. */
1021     if (savelru) {
1022         uint64_t idletime = estimateObjectIdleTime(val);
1023         idletime /= 1000; /* Using seconds is enough and requires less space.*/
1024         if (rdbSaveType(rdb,RDB_OPCODE_IDLE) == -1) return -1;
1025         if (rdbSaveLen(rdb,idletime) == -1) return -1;
1026     }
1027 
1028     /* Save the LFU info. */
1029     if (savelfu) {
1030         uint8_t buf[1];
1031         buf[0] = LFUDecrAndReturn(val);
1032         /* We can encode this in exactly two bytes: the opcode and an 8
1033          * bit counter, since the frequency is logarithmic with a 0-255 range.
1034          * Note that we do not store the halving time because to reset it
1035          * a single time when loading does not affect the frequency much. */
1036         if (rdbSaveType(rdb,RDB_OPCODE_FREQ) == -1) return -1;
1037         if (rdbWriteRaw(rdb,buf,1) == -1) return -1;
1038     }
1039 
1040     /* Save type, key, value */
1041     if (rdbSaveObjectType(rdb,val) == -1) return -1;
1042     if (rdbSaveStringObject(rdb,key) == -1) return -1;
1043     if (rdbSaveObject(rdb,val,key) == -1) return -1;
1044     return 1;
1045 }
1046 
1047 /* Save an AUX field. */
rdbSaveAuxField(rio * rdb,void * key,size_t keylen,void * val,size_t vallen)1048 ssize_t rdbSaveAuxField(rio *rdb, void *key, size_t keylen, void *val, size_t vallen) {
1049     ssize_t ret, len = 0;
1050     if ((ret = rdbSaveType(rdb,RDB_OPCODE_AUX)) == -1) return -1;
1051     len += ret;
1052     if ((ret = rdbSaveRawString(rdb,key,keylen)) == -1) return -1;
1053     len += ret;
1054     if ((ret = rdbSaveRawString(rdb,val,vallen)) == -1) return -1;
1055     len += ret;
1056     return len;
1057 }
1058 
1059 /* Wrapper for rdbSaveAuxField() used when key/val length can be obtained
1060  * with strlen(). */
rdbSaveAuxFieldStrStr(rio * rdb,char * key,char * val)1061 ssize_t rdbSaveAuxFieldStrStr(rio *rdb, char *key, char *val) {
1062     return rdbSaveAuxField(rdb,key,strlen(key),val,strlen(val));
1063 }
1064 
1065 /* Wrapper for strlen(key) + integer type (up to long long range). */
rdbSaveAuxFieldStrInt(rio * rdb,char * key,long long val)1066 ssize_t rdbSaveAuxFieldStrInt(rio *rdb, char *key, long long val) {
1067     char buf[LONG_STR_SIZE];
1068     int vlen = ll2string(buf,sizeof(buf),val);
1069     return rdbSaveAuxField(rdb,key,strlen(key),buf,vlen);
1070 }
1071 
1072 /* Save a few default AUX fields with information about the RDB generated. */
rdbSaveInfoAuxFields(rio * rdb,int flags,rdbSaveInfo * rsi)1073 int rdbSaveInfoAuxFields(rio *rdb, int flags, rdbSaveInfo *rsi) {
1074     int redis_bits = (sizeof(void*) == 8) ? 64 : 32;
1075     int aof_preamble = (flags & RDB_SAVE_AOF_PREAMBLE) != 0;
1076 
1077     /* Add a few fields about the state when the RDB was created. */
1078     if (rdbSaveAuxFieldStrStr(rdb,"redis-ver",REDIS_VERSION) == -1) return -1;
1079     if (rdbSaveAuxFieldStrInt(rdb,"redis-bits",redis_bits) == -1) return -1;
1080     if (rdbSaveAuxFieldStrInt(rdb,"ctime",time(NULL)) == -1) return -1;
1081     if (rdbSaveAuxFieldStrInt(rdb,"used-mem",zmalloc_used_memory()) == -1) return -1;
1082 
1083     /* Handle saving options that generate aux fields. */
1084     if (rsi) {
1085         if (rdbSaveAuxFieldStrInt(rdb,"repl-stream-db",rsi->repl_stream_db)
1086             == -1) return -1;
1087         if (rdbSaveAuxFieldStrStr(rdb,"repl-id",server.replid)
1088             == -1) return -1;
1089         if (rdbSaveAuxFieldStrInt(rdb,"repl-offset",server.master_repl_offset)
1090             == -1) return -1;
1091     }
1092     if (rdbSaveAuxFieldStrInt(rdb,"aof-preamble",aof_preamble) == -1) return -1;
1093     return 1;
1094 }
1095 
/* Save the auxiliary (out of keyspace) data of a single module type.
 *
 * What gets written, in order: the RDB_OPCODE_MODULE_AUX type byte, the
 * module type id, an UINT opcode followed by 'when', whatever the module
 * emits from its aux_save() callback, and the module EOF opcode.
 *
 * 'when' is forwarded as-is to the module callback and stored in the file
 * so that it can be provided again on loading.
 *
 * Returns the number of bytes accounted in the I/O context, or -1 on
 * error. */
ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt) {
    /* Save a module-specific aux value. */
    RedisModuleIO io;
    int retval = rdbSaveType(rdb, RDB_OPCODE_MODULE_AUX);
    if (retval == -1) return -1;

    /* 'io' has no initializer: set the context up before the first
     * io.bytes accumulation, so that the header bytes below are added to a
     * defined counter and are part of the returned total. */
    moduleInitIOContext(io,mt,rdb,NULL);
    io.bytes += retval;

    /* Write the "module" identifier as prefix, so that we'll be able
     * to call the right module during loading. */
    retval = rdbSaveLen(rdb,mt->id);
    if (retval == -1) return -1;
    io.bytes += retval;

    /* write the 'when' so that we can provide it on loading. add a UINT opcode
     * for backwards compatibility, everything after the MT needs to be prefixed
     * by an opcode. */
    retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_UINT);
    if (retval == -1) return -1;
    io.bytes += retval;
    retval = rdbSaveLen(rdb,when);
    if (retval == -1) return -1;
    io.bytes += retval;

    /* Then write the module-specific representation + EOF marker. */
    mt->aux_save(&io,when);
    retval = rdbSaveLen(rdb,RDB_MODULE_OPCODE_EOF);
    if (retval == -1)
        io.error = 1;
    else
        io.bytes += retval;

    /* Release the module context if one was attached to 'io'. */
    if (io.ctx) {
        moduleFreeContext(io.ctx);
        zfree(io.ctx);
    }
    if (io.error)
        return -1;
    return io.bytes;
}
1136 
1137 /* Produces a dump of the database in RDB format sending it to the specified
1138  * Redis I/O channel. On success C_OK is returned, otherwise C_ERR
1139  * is returned and part of the output, or all the output, can be
1140  * missing because of I/O errors.
1141  *
1142  * When the function returns C_ERR and if 'error' is not NULL, the
1143  * integer pointed by 'error' is set to the value of errno just after the I/O
1144  * error. */
rdbSaveRio(rio * rdb,int * error,int flags,rdbSaveInfo * rsi)1145 int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) {
1146     dictIterator *di = NULL;
1147     dictEntry *de;
1148     char magic[10];
1149     int j;
1150     uint64_t cksum;
1151     size_t processed = 0;
1152 
1153     if (server.rdb_checksum)
1154         rdb->update_cksum = rioGenericUpdateChecksum;
1155     snprintf(magic,sizeof(magic),"REDIS%04d",RDB_VERSION);
1156     if (rdbWriteRaw(rdb,magic,9) == -1) goto werr;
1157     if (rdbSaveInfoAuxFields(rdb,flags,rsi) == -1) goto werr;
1158     if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_BEFORE_RDB) == -1) goto werr;
1159 
1160     for (j = 0; j < server.dbnum; j++) {
1161         redisDb *db = server.db+j;
1162         dict *d = db->dict;
1163         if (dictSize(d) == 0) continue;
1164         di = dictGetSafeIterator(d);
1165 
1166         /* Write the SELECT DB opcode */
1167         if (rdbSaveType(rdb,RDB_OPCODE_SELECTDB) == -1) goto werr;
1168         if (rdbSaveLen(rdb,j) == -1) goto werr;
1169 
1170         /* Write the RESIZE DB opcode. We trim the size to UINT32_MAX, which
1171          * is currently the largest type we are able to represent in RDB sizes.
1172          * However this does not limit the actual size of the DB to load since
1173          * these sizes are just hints to resize the hash tables. */
1174         uint64_t db_size, expires_size;
1175         db_size = dictSize(db->dict);
1176         expires_size = dictSize(db->expires);
1177         if (rdbSaveType(rdb,RDB_OPCODE_RESIZEDB) == -1) goto werr;
1178         if (rdbSaveLen(rdb,db_size) == -1) goto werr;
1179         if (rdbSaveLen(rdb,expires_size) == -1) goto werr;
1180 
1181         /* Iterate this DB writing every entry */
1182         while((de = dictNext(di)) != NULL) {
1183             sds keystr = dictGetKey(de);
1184             robj key, *o = dictGetVal(de);
1185             long long expire;
1186 
1187             initStaticStringObject(key,keystr);
1188             expire = getExpire(db,&key);
1189             if (rdbSaveKeyValuePair(rdb,&key,o,expire) == -1) goto werr;
1190 
1191             /* When this RDB is produced as part of an AOF rewrite, move
1192              * accumulated diff from parent to child while rewriting in
1193              * order to have a smaller final write. */
1194             if (flags & RDB_SAVE_AOF_PREAMBLE &&
1195                 rdb->processed_bytes > processed+AOF_READ_DIFF_INTERVAL_BYTES)
1196             {
1197                 processed = rdb->processed_bytes;
1198                 aofReadDiffFromParent();
1199             }
1200         }
1201         dictReleaseIterator(di);
1202         di = NULL; /* So that we don't release it again on error. */
1203     }
1204 
1205     /* If we are storing the replication information on disk, persist
1206      * the script cache as well: on successful PSYNC after a restart, we need
1207      * to be able to process any EVALSHA inside the replication backlog the
1208      * master will send us. */
1209     if (rsi && dictSize(server.lua_scripts)) {
1210         di = dictGetIterator(server.lua_scripts);
1211         while((de = dictNext(di)) != NULL) {
1212             robj *body = dictGetVal(de);
1213             if (rdbSaveAuxField(rdb,"lua",3,body->ptr,sdslen(body->ptr)) == -1)
1214                 goto werr;
1215         }
1216         dictReleaseIterator(di);
1217         di = NULL; /* So that we don't release it again on error. */
1218     }
1219 
1220     if (rdbSaveModulesAux(rdb, REDISMODULE_AUX_AFTER_RDB) == -1) goto werr;
1221 
1222     /* EOF opcode */
1223     if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr;
1224 
1225     /* CRC64 checksum. It will be zero if checksum computation is disabled, the
1226      * loading code skips the check in this case. */
1227     cksum = rdb->cksum;
1228     memrev64ifbe(&cksum);
1229     if (rioWrite(rdb,&cksum,8) == 0) goto werr;
1230     return C_OK;
1231 
1232 werr:
1233     if (error) *error = errno;
1234     if (di) dictReleaseIterator(di);
1235     return C_ERR;
1236 }
1237 
1238 /* This is just a wrapper to rdbSaveRio() that additionally adds a prefix
1239  * and a suffix to the generated RDB dump. The prefix is:
1240  *
1241  * $EOF:<40 bytes unguessable hex string>\r\n
1242  *
1243  * While the suffix is the 40 bytes hex string we announced in the prefix.
1244  * This way processes receiving the payload can understand when it ends
1245  * without doing any processing of the content. */
rdbSaveRioWithEOFMark(rio * rdb,int * error,rdbSaveInfo * rsi)1246 int rdbSaveRioWithEOFMark(rio *rdb, int *error, rdbSaveInfo *rsi) {
1247     char eofmark[RDB_EOF_MARK_SIZE];
1248 
1249     getRandomHexChars(eofmark,RDB_EOF_MARK_SIZE);
1250     if (error) *error = 0;
1251     if (rioWrite(rdb,"$EOF:",5) == 0) goto werr;
1252     if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr;
1253     if (rioWrite(rdb,"\r\n",2) == 0) goto werr;
1254     if (rdbSaveRio(rdb,error,RDB_SAVE_NONE,rsi) == C_ERR) goto werr;
1255     if (rioWrite(rdb,eofmark,RDB_EOF_MARK_SIZE) == 0) goto werr;
1256     return C_OK;
1257 
1258 werr: /* Write error. */
1259     /* Set 'error' only if not already set by rdbSaveRio() call. */
1260     if (error && *error == 0) *error = errno;
1261     return C_ERR;
1262 }
1263 
1264 /* Save the DB on disk. Return C_ERR on error, C_OK on success. */
rdbSave(char * filename,rdbSaveInfo * rsi)1265 int rdbSave(char *filename, rdbSaveInfo *rsi) {
1266     char tmpfile[256];
1267     char cwd[MAXPATHLEN]; /* Current working dir path for error messages. */
1268     FILE *fp;
1269     rio rdb;
1270     int error = 0;
1271 
1272     snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
1273     fp = fopen(tmpfile,"w");
1274     if (!fp) {
1275         char *cwdp = getcwd(cwd,MAXPATHLEN);
1276         serverLog(LL_WARNING,
1277             "Failed opening the RDB file %s (in server root dir %s) "
1278             "for saving: %s",
1279             filename,
1280             cwdp ? cwdp : "unknown",
1281             strerror(errno));
1282         return C_ERR;
1283     }
1284 
1285     rioInitWithFile(&rdb,fp);
1286 
1287     if (server.rdb_save_incremental_fsync)
1288         rioSetAutoSync(&rdb,REDIS_AUTOSYNC_BYTES);
1289 
1290     if (rdbSaveRio(&rdb,&error,RDB_SAVE_NONE,rsi) == C_ERR) {
1291         errno = error;
1292         goto werr;
1293     }
1294 
1295     /* Make sure data will not remain on the OS's output buffers */
1296     if (fflush(fp) == EOF) goto werr;
1297     if (fsync(fileno(fp)) == -1) goto werr;
1298     if (fclose(fp) == EOF) goto werr;
1299 
1300     /* Use RENAME to make sure the DB file is changed atomically only
1301      * if the generate DB file is ok. */
1302     if (rename(tmpfile,filename) == -1) {
1303         char *cwdp = getcwd(cwd,MAXPATHLEN);
1304         serverLog(LL_WARNING,
1305             "Error moving temp DB file %s on the final "
1306             "destination %s (in server root dir %s): %s",
1307             tmpfile,
1308             filename,
1309             cwdp ? cwdp : "unknown",
1310             strerror(errno));
1311         unlink(tmpfile);
1312         return C_ERR;
1313     }
1314 
1315     serverLog(LL_NOTICE,"DB saved on disk");
1316     server.dirty = 0;
1317     server.lastsave = time(NULL);
1318     server.lastbgsave_status = C_OK;
1319     return C_OK;
1320 
1321 werr:
1322     serverLog(LL_WARNING,"Write error saving DB on disk: %s", strerror(errno));
1323     fclose(fp);
1324     unlink(tmpfile);
1325     return C_ERR;
1326 }
1327 
/* Start saving the DB to 'filename' from a forked child process.
 *
 * Nothing is started, and C_ERR is returned, if an AOF or RDB child is
 * already registered in the server state. Otherwise the process forks:
 * the child runs rdbSave(), on success reports its copy-on-write size to
 * the parent, and exits with status 0 on success or 1 on failure. The
 * parent records the fork statistics and, if the fork succeeded, the child
 * pid, child type and save start time.
 *
 * Returns C_OK in the parent when the child was created, C_ERR when the
 * save could not be started. */
int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
    pid_t pid;
    long long fork_start;

    /* Only proceed when neither kind of child is active. */
    if (!(server.aof_child_pid == -1 && server.rdb_child_pid == -1))
        return C_ERR;

    server.dirty_before_bgsave = server.dirty;
    server.lastbgsave_try = time(NULL);
    openChildInfoPipe();

    fork_start = ustime();
    pid = fork();
    if (pid == 0) {
        /* Child */
        int status;

        closeClildUnusedResourceAfterFork();
        redisSetProcTitle("redis-rdb-bgsave");
        status = rdbSave(filename,rsi);
        if (status == C_OK) {
            size_t cow_bytes = zmalloc_get_private_dirty(-1);

            if (cow_bytes) {
                serverLog(LL_NOTICE,
                    "RDB: %zu MB of memory used by copy-on-write",
                    cow_bytes/(1024*1024));
            }

            server.child_info_data.cow_size = cow_bytes;
            sendChildInfo(CHILD_INFO_TYPE_RDB);
        }
        exitFromChild((status == C_OK) ? 0 : 1);
    } else {
        /* Parent: fork statistics are updated whether or not the fork
         * succeeded. */
        server.stat_fork_time = ustime()-fork_start;
        server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
        latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
        if (pid == -1) {
            closeChildInfoPipe();
            server.lastbgsave_status = C_ERR;
            serverLog(LL_WARNING,"Can't save in background: fork: %s",
                strerror(errno));
            return C_ERR;
        }
        serverLog(LL_NOTICE,"Background saving started by pid %d",pid);
        server.rdb_save_time_start = time(NULL);
        server.rdb_child_pid = pid;
        server.rdb_child_type = RDB_CHILD_TYPE_DISK;
        updateDictResizePolicy();
        return C_OK;
    }
    return C_OK; /* unreached */
}
1380 
/* Remove the temporary RDB file named "temp-<childpid>.rdb" from the
 * current working directory. The result of unlink() is not checked, so a
 * missing file is silently ignored. */
void rdbRemoveTempFile(pid_t childpid) {
    char path[256];

    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
    unlink(path);
}
1387 
1388 /* This function is called by rdbLoadObject() when the code is in RDB-check
1389  * mode and we find a module value of type 2 that can be parsed without
1390  * the need of the actual module. The value is parsed for errors, finally
1391  * a dummy redis object is returned just to conform to the API. */
rdbLoadCheckModuleValue(rio * rdb,char * modulename)1392 robj *rdbLoadCheckModuleValue(rio *rdb, char *modulename) {
1393     uint64_t opcode;
1394     while((opcode = rdbLoadLen(rdb,NULL)) != RDB_MODULE_OPCODE_EOF) {
1395         if (opcode == RDB_MODULE_OPCODE_SINT ||
1396             opcode == RDB_MODULE_OPCODE_UINT)
1397         {
1398             uint64_t len;
1399             if (rdbLoadLenByRef(rdb,NULL,&len) == -1) {
1400                 rdbExitReportCorruptRDB(
1401                     "Error reading integer from module %s value", modulename);
1402             }
1403         } else if (opcode == RDB_MODULE_OPCODE_STRING) {
1404             robj *o = rdbGenericLoadStringObject(rdb,RDB_LOAD_NONE,NULL);
1405             if (o == NULL) {
1406                 rdbExitReportCorruptRDB(
1407                     "Error reading string from module %s value", modulename);
1408             }
1409             decrRefCount(o);
1410         } else if (opcode == RDB_MODULE_OPCODE_FLOAT) {
1411             float val;
1412             if (rdbLoadBinaryFloatValue(rdb,&val) == -1) {
1413                 rdbExitReportCorruptRDB(
1414                     "Error reading float from module %s value", modulename);
1415             }
1416         } else if (opcode == RDB_MODULE_OPCODE_DOUBLE) {
1417             double val;
1418             if (rdbLoadBinaryDoubleValue(rdb,&val) == -1) {
1419                 rdbExitReportCorruptRDB(
1420                     "Error reading double from module %s value", modulename);
1421             }
1422         }
1423     }
1424     return createStringObject("module-dummy-value",18);
1425 }
1426 
1427 /* Load a Redis object of the specified type from the specified file.
1428  * On success a newly allocated object is returned, otherwise NULL. */
rdbLoadObject(int rdbtype,rio * rdb,robj * key)1429 robj *rdbLoadObject(int rdbtype, rio *rdb, robj *key) {
1430     robj *o = NULL, *ele, *dec;
1431     uint64_t len;
1432     unsigned int i;
1433 
1434     if (rdbtype == RDB_TYPE_STRING) {
1435         /* Read string value */
1436         if ((o = rdbLoadEncodedStringObject(rdb)) == NULL) return NULL;
1437         o = tryObjectEncoding(o);
1438     } else if (rdbtype == RDB_TYPE_LIST) {
1439         /* Read list value */
1440         if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL;
1441 
1442         o = createQuicklistObject();
1443         quicklistSetOptions(o->ptr, server.list_max_ziplist_size,
1444                             server.list_compress_depth);
1445 
1446         /* Load every single element of the list */
1447         while(len--) {
1448             if ((ele = rdbLoadEncodedStringObject(rdb)) == NULL) return NULL;
1449             dec = getDecodedObject(ele);
1450             size_t len = sdslen(dec->ptr);
1451             quicklistPushTail(o->ptr, dec->ptr, len);
1452             decrRefCount(dec);
1453             decrRefCount(ele);
1454         }
1455     } else if (rdbtype == RDB_TYPE_SET) {
1456         /* Read Set value */
1457         if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL;
1458 
1459         /* Use a regular set when there are too many entries. */
1460         size_t max_entries = server.set_max_intset_entries;
1461         if (max_entries >= 1<<30) max_entries = 1<<30;
1462         if (len > max_entries) {
1463             o = createSetObject();
1464             /* It's faster to expand the dict to the right size asap in order
1465              * to avoid rehashing */
1466             if (len > DICT_HT_INITIAL_SIZE)
1467                 dictExpand(o->ptr,len);
1468         } else {
1469             o = createIntsetObject();
1470         }
1471 
1472         /* Load every single element of the set */
1473         for (i = 0; i < len; i++) {
1474             long long llval;
1475             sds sdsele;
1476 
1477             if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
1478                 == NULL) return NULL;
1479 
1480             if (o->encoding == OBJ_ENCODING_INTSET) {
1481                 /* Fetch integer value from element. */
1482                 if (isSdsRepresentableAsLongLong(sdsele,&llval) == C_OK) {
1483                     o->ptr = intsetAdd(o->ptr,llval,NULL);
1484                 } else {
1485                     setTypeConvert(o,OBJ_ENCODING_HT);
1486                     dictExpand(o->ptr,len);
1487                 }
1488             }
1489 
1490             /* This will also be called when the set was just converted
1491              * to a regular hash table encoded set. */
1492             if (o->encoding == OBJ_ENCODING_HT) {
1493                 dictAdd((dict*)o->ptr,sdsele,NULL);
1494             } else {
1495                 sdsfree(sdsele);
1496             }
1497         }
1498     } else if (rdbtype == RDB_TYPE_ZSET_2 || rdbtype == RDB_TYPE_ZSET) {
1499         /* Read list/set value. */
1500         uint64_t zsetlen;
1501         size_t maxelelen = 0, totelelen = 0;
1502         zset *zs;
1503 
1504         if ((zsetlen = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL;
1505         o = createZsetObject();
1506         zs = o->ptr;
1507 
1508         if (zsetlen > DICT_HT_INITIAL_SIZE)
1509             dictExpand(zs->dict,zsetlen);
1510 
1511         /* Load every single element of the sorted set. */
1512         while(zsetlen--) {
1513             sds sdsele;
1514             double score;
1515             zskiplistNode *znode;
1516 
1517             if ((sdsele = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
1518                 == NULL) return NULL;
1519 
1520             if (rdbtype == RDB_TYPE_ZSET_2) {
1521                 if (rdbLoadBinaryDoubleValue(rdb,&score) == -1) return NULL;
1522             } else {
1523                 if (rdbLoadDoubleValue(rdb,&score) == -1) return NULL;
1524             }
1525 
1526             /* Don't care about integer-encoded strings. */
1527             if (sdslen(sdsele) > maxelelen) maxelelen = sdslen(sdsele);
1528             totelelen += sdslen(sdsele);
1529 
1530             znode = zslInsert(zs->zsl,score,sdsele);
1531             dictAdd(zs->dict,sdsele,&znode->score);
1532         }
1533 
1534         /* Convert *after* loading, since sorted sets are not stored ordered. */
1535         if (zsetLength(o) <= server.zset_max_ziplist_entries &&
1536             maxelelen <= server.zset_max_ziplist_value &&
1537             ziplistSafeToAdd(NULL, totelelen))
1538         {
1539             zsetConvert(o,OBJ_ENCODING_ZIPLIST);
1540         }
1541     } else if (rdbtype == RDB_TYPE_HASH) {
1542         uint64_t len;
1543         int ret;
1544         sds field, value;
1545 
1546         len = rdbLoadLen(rdb, NULL);
1547         if (len == RDB_LENERR) return NULL;
1548 
1549         o = createHashObject();
1550 
1551         /* Too many entries? Use a hash table. */
1552         if (len > server.hash_max_ziplist_entries)
1553             hashTypeConvert(o, OBJ_ENCODING_HT);
1554 
1555         /* Load every field and value into the ziplist */
1556         while (o->encoding == OBJ_ENCODING_ZIPLIST && len > 0) {
1557             len--;
1558             /* Load raw strings */
1559             if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
1560                 == NULL) return NULL;
1561             if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
1562                 == NULL) return NULL;
1563 
1564             /* Convert to hash table if size threshold is exceeded */
1565             if (sdslen(field) > server.hash_max_ziplist_value ||
1566                 sdslen(value) > server.hash_max_ziplist_value ||
1567                 !ziplistSafeToAdd(o->ptr, sdslen(field)+sdslen(value)))
1568             {
1569                 hashTypeConvert(o, OBJ_ENCODING_HT);
1570                 ret = dictAdd((dict*)o->ptr, field, value);
1571                 if (ret == DICT_ERR) {
1572                     rdbExitReportCorruptRDB("Duplicate hash fields detected");
1573                 }
1574                 break;
1575             }
1576 
1577             /* Add pair to ziplist */
1578             o->ptr = ziplistPush(o->ptr, (unsigned char*)field,
1579                     sdslen(field), ZIPLIST_TAIL);
1580             o->ptr = ziplistPush(o->ptr, (unsigned char*)value,
1581                     sdslen(value), ZIPLIST_TAIL);
1582 
1583             sdsfree(field);
1584             sdsfree(value);
1585         }
1586 
1587         if (o->encoding == OBJ_ENCODING_HT && len > DICT_HT_INITIAL_SIZE)
1588             dictExpand(o->ptr,len);
1589 
1590         /* Load remaining fields and values into the hash table */
1591         while (o->encoding == OBJ_ENCODING_HT && len > 0) {
1592             len--;
1593             /* Load encoded strings */
1594             if ((field = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
1595                 == NULL) return NULL;
1596             if ((value = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL))
1597                 == NULL) return NULL;
1598 
1599             /* Add pair to hash table */
1600             ret = dictAdd((dict*)o->ptr, field, value);
1601             if (ret == DICT_ERR) {
1602                 rdbExitReportCorruptRDB("Duplicate keys detected");
1603             }
1604         }
1605 
1606         /* All pairs should be read by now */
1607         serverAssert(len == 0);
1608     } else if (rdbtype == RDB_TYPE_LIST_QUICKLIST) {
1609         if ((len = rdbLoadLen(rdb,NULL)) == RDB_LENERR) return NULL;
1610         o = createQuicklistObject();
1611         quicklistSetOptions(o->ptr, server.list_max_ziplist_size,
1612                             server.list_compress_depth);
1613 
1614         while (len--) {
1615             unsigned char *zl =
1616                 rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL);
1617             if (zl == NULL) return NULL;
1618             quicklistAppendZiplist(o->ptr, zl);
1619         }
1620     } else if (rdbtype == RDB_TYPE_HASH_ZIPMAP  ||
1621                rdbtype == RDB_TYPE_LIST_ZIPLIST ||
1622                rdbtype == RDB_TYPE_SET_INTSET   ||
1623                rdbtype == RDB_TYPE_ZSET_ZIPLIST ||
1624                rdbtype == RDB_TYPE_HASH_ZIPLIST)
1625     {
1626         unsigned char *encoded =
1627             rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL);
1628         if (encoded == NULL) return NULL;
1629         o = createObject(OBJ_STRING,encoded); /* Obj type fixed below. */
1630 
1631         /* Fix the object encoding, and make sure to convert the encoded
1632          * data type into the base type if accordingly to the current
1633          * configuration there are too many elements in the encoded data
1634          * type. Note that we only check the length and not max element
1635          * size as this is an O(N) scan. Eventually everything will get
1636          * converted. */
1637         switch(rdbtype) {
1638             case RDB_TYPE_HASH_ZIPMAP:
1639                 /* Convert to ziplist encoded hash. This must be deprecated
1640                  * when loading dumps created by Redis 2.4 gets deprecated. */
1641                 {
1642                     unsigned char *zl = ziplistNew();
1643                     unsigned char *zi = zipmapRewind(o->ptr);
1644                     unsigned char *fstr, *vstr;
1645                     unsigned int flen, vlen;
1646                     unsigned int maxlen = 0;
1647 
1648                     while ((zi = zipmapNext(zi, &fstr, &flen, &vstr, &vlen)) != NULL) {
1649                         if (flen > maxlen) maxlen = flen;
1650                         if (vlen > maxlen) maxlen = vlen;
1651                         if (!ziplistSafeToAdd(zl, (size_t)flen + vlen)) {
1652                             rdbExitReportCorruptRDB("Hash zipmap too big (%u)", flen);
1653                         }
1654 
1655                         zl = ziplistPush(zl, fstr, flen, ZIPLIST_TAIL);
1656                         zl = ziplistPush(zl, vstr, vlen, ZIPLIST_TAIL);
1657                     }
1658 
1659                     zfree(o->ptr);
1660                     o->ptr = zl;
1661                     o->type = OBJ_HASH;
1662                     o->encoding = OBJ_ENCODING_ZIPLIST;
1663 
1664                     if (hashTypeLength(o) > server.hash_max_ziplist_entries ||
1665                         maxlen > server.hash_max_ziplist_value)
1666                     {
1667                         hashTypeConvert(o, OBJ_ENCODING_HT);
1668                     }
1669                 }
1670                 break;
1671             case RDB_TYPE_LIST_ZIPLIST:
1672                 o->type = OBJ_LIST;
1673                 o->encoding = OBJ_ENCODING_ZIPLIST;
1674                 listTypeConvert(o,OBJ_ENCODING_QUICKLIST);
1675                 break;
1676             case RDB_TYPE_SET_INTSET:
1677                 o->type = OBJ_SET;
1678                 o->encoding = OBJ_ENCODING_INTSET;
1679                 if (intsetLen(o->ptr) > server.set_max_intset_entries)
1680                     setTypeConvert(o,OBJ_ENCODING_HT);
1681                 break;
1682             case RDB_TYPE_ZSET_ZIPLIST:
1683                 o->type = OBJ_ZSET;
1684                 o->encoding = OBJ_ENCODING_ZIPLIST;
1685                 if (zsetLength(o) > server.zset_max_ziplist_entries)
1686                     zsetConvert(o,OBJ_ENCODING_SKIPLIST);
1687                 break;
1688             case RDB_TYPE_HASH_ZIPLIST:
1689                 o->type = OBJ_HASH;
1690                 o->encoding = OBJ_ENCODING_ZIPLIST;
1691                 if (hashTypeLength(o) > server.hash_max_ziplist_entries)
1692                     hashTypeConvert(o, OBJ_ENCODING_HT);
1693                 break;
1694             default:
1695                 rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype);
1696                 break;
1697         }
1698     } else if (rdbtype == RDB_TYPE_STREAM_LISTPACKS) {
1699         o = createStreamObject();
1700         stream *s = o->ptr;
1701         uint64_t listpacks = rdbLoadLen(rdb,NULL);
1702 
1703         while(listpacks--) {
1704             /* Get the master ID, the one we'll use as key of the radix tree
1705              * node: the entries inside the listpack itself are delta-encoded
1706              * relatively to this ID. */
1707             sds nodekey = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL);
1708             if (nodekey == NULL) {
1709                 rdbExitReportCorruptRDB("Stream master ID loading failed: invalid encoding or I/O error.");
1710             }
1711             if (sdslen(nodekey) != sizeof(streamID)) {
1712                 rdbExitReportCorruptRDB("Stream node key entry is not the "
1713                                         "size of a stream ID");
1714             }
1715 
1716             /* Load the listpack. */
1717             unsigned char *lp =
1718                 rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL);
1719             if (lp == NULL) return NULL;
1720             unsigned char *first = lpFirst(lp);
1721             if (first == NULL) {
1722                 /* Serialized listpacks should never be empty, since on
1723                  * deletion we should remove the radix tree key if the
1724                  * resulting listpack is empty. */
1725                 rdbExitReportCorruptRDB("Empty listpack inside stream");
1726             }
1727 
1728             /* Insert the key in the radix tree. */
1729             int retval = raxInsert(s->rax,
1730                 (unsigned char*)nodekey,sizeof(streamID),lp,NULL);
1731             sdsfree(nodekey);
1732             if (!retval)
1733                 rdbExitReportCorruptRDB("Listpack re-added with existing key");
1734         }
1735         /* Load total number of items inside the stream. */
1736         s->length = rdbLoadLen(rdb,NULL);
1737         /* Load the last entry ID. */
1738         s->last_id.ms = rdbLoadLen(rdb,NULL);
1739         s->last_id.seq = rdbLoadLen(rdb,NULL);
1740 
1741         /* Consumer groups loading */
1742         size_t cgroups_count = rdbLoadLen(rdb,NULL);
1743         while(cgroups_count--) {
1744             /* Get the consumer group name and ID. We can then create the
1745              * consumer group ASAP and populate its structure as
1746              * we read more data. */
1747             streamID cg_id;
1748             sds cgname = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL);
1749             if (cgname == NULL) {
1750                 rdbExitReportCorruptRDB(
1751                     "Error reading the consumer group name from Stream");
1752             }
1753             cg_id.ms = rdbLoadLen(rdb,NULL);
1754             cg_id.seq = rdbLoadLen(rdb,NULL);
1755             streamCG *cgroup = streamCreateCG(s,cgname,sdslen(cgname),&cg_id);
1756             if (cgroup == NULL)
1757                 rdbExitReportCorruptRDB("Duplicated consumer group name %s",
1758                                          cgname);
1759             sdsfree(cgname);
1760 
1761             /* Load the global PEL for this consumer group, however we'll
1762              * not yet populate the NACK structures with the message
1763              * owner, since consumers for this group and their messages will
1764              * be read as a next step. So for now leave them not resolved
1765              * and later populate it. */
1766             size_t pel_size = rdbLoadLen(rdb,NULL);
1767             while(pel_size--) {
1768                 unsigned char rawid[sizeof(streamID)];
1769                 rdbLoadRaw(rdb,rawid,sizeof(rawid));
1770                 streamNACK *nack = streamCreateNACK(NULL);
1771                 nack->delivery_time = rdbLoadMillisecondTime(rdb,RDB_VERSION);
1772                 nack->delivery_count = rdbLoadLen(rdb,NULL);
1773                 if (!raxInsert(cgroup->pel,rawid,sizeof(rawid),nack,NULL))
1774                     rdbExitReportCorruptRDB("Duplicated gobal PEL entry "
1775                                             "loading stream consumer group");
1776             }
1777 
1778             /* Now that we loaded our global PEL, we need to load the
1779              * consumers and their local PELs. */
1780             size_t consumers_num = rdbLoadLen(rdb,NULL);
1781             while(consumers_num--) {
1782                 sds cname = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL);
1783                 if (cname == NULL) {
1784                     rdbExitReportCorruptRDB(
1785                         "Error reading the consumer name from Stream group");
1786                 }
1787                 streamConsumer *consumer =
1788                     streamLookupConsumer(cgroup,cname,SLC_NONE);
1789                 sdsfree(cname);
1790                 consumer->seen_time = rdbLoadMillisecondTime(rdb,RDB_VERSION);
1791 
1792                 /* Load the PEL about entries owned by this specific
1793                  * consumer. */
1794                 pel_size = rdbLoadLen(rdb,NULL);
1795                 while(pel_size--) {
1796                     unsigned char rawid[sizeof(streamID)];
1797                     rdbLoadRaw(rdb,rawid,sizeof(rawid));
1798                     streamNACK *nack = raxFind(cgroup->pel,rawid,sizeof(rawid));
1799                     if (nack == raxNotFound)
1800                         rdbExitReportCorruptRDB("Consumer entry not found in "
1801                                                 "group global PEL");
1802 
1803                     /* Set the NACK consumer, that was left to NULL when
1804                      * loading the global PEL. Then set the same shared
1805                      * NACK structure also in the consumer-specific PEL. */
1806                     nack->consumer = consumer;
1807                     if (!raxInsert(consumer->pel,rawid,sizeof(rawid),nack,NULL))
1808                         rdbExitReportCorruptRDB("Duplicated consumer PEL entry "
1809                                                 " loading a stream consumer "
1810                                                 "group");
1811                 }
1812             }
1813         }
1814     } else if (rdbtype == RDB_TYPE_MODULE || rdbtype == RDB_TYPE_MODULE_2) {
1815         uint64_t moduleid = rdbLoadLen(rdb,NULL);
1816         moduleType *mt = moduleTypeLookupModuleByID(moduleid);
1817         char name[10];
1818 
1819         if (rdbCheckMode && rdbtype == RDB_TYPE_MODULE_2) {
1820             moduleTypeNameByID(name,moduleid);
1821             return rdbLoadCheckModuleValue(rdb,name);
1822         }
1823 
1824         if (mt == NULL) {
1825             moduleTypeNameByID(name,moduleid);
1826             serverLog(LL_WARNING,"The RDB file contains module data I can't load: no matching module '%s'", name);
1827             exit(1);
1828         }
1829         RedisModuleIO io;
1830         moduleInitIOContext(io,mt,rdb,key);
1831         io.ver = (rdbtype == RDB_TYPE_MODULE) ? 1 : 2;
1832         /* Call the rdb_load method of the module providing the 10 bit
1833          * encoding version in the lower 10 bits of the module ID. */
1834         void *ptr = mt->rdb_load(&io,moduleid&1023);
1835         if (io.ctx) {
1836             moduleFreeContext(io.ctx);
1837             zfree(io.ctx);
1838         }
1839 
1840         /* Module v2 serialization has an EOF mark at the end. */
1841         if (io.ver == 2) {
1842             uint64_t eof = rdbLoadLen(rdb,NULL);
1843             if (eof != RDB_MODULE_OPCODE_EOF) {
1844                 serverLog(LL_WARNING,"The RDB file contains module data for the module '%s' that is not terminated by the proper module value EOF marker", name);
1845                 exit(1);
1846             }
1847         }
1848 
1849         if (ptr == NULL) {
1850             moduleTypeNameByID(name,moduleid);
1851             serverLog(LL_WARNING,"The RDB file contains module data for the module type '%s', that the responsible module is not able to load. Check for modules log above for additional clues.", name);
1852             exit(1);
1853         }
1854         o = createModuleObject(mt,ptr);
1855     } else {
1856         rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype);
1857     }
1858     return o;
1859 }
1860 
1861 /* Mark that we are loading in the global state and setup the fields
1862  * needed to provide loading stats. */
startLoading(FILE * fp)1863 void startLoading(FILE *fp) {
1864     struct stat sb;
1865 
1866     /* Load the DB */
1867     server.loading = 1;
1868     server.loading_start_time = time(NULL);
1869     server.loading_loaded_bytes = 0;
1870     if (fstat(fileno(fp), &sb) == -1) {
1871         server.loading_total_bytes = 0;
1872     } else {
1873         server.loading_total_bytes = sb.st_size;
1874     }
1875 }
1876 
1877 /* Refresh the loading progress info */
loadingProgress(off_t pos)1878 void loadingProgress(off_t pos) {
1879     server.loading_loaded_bytes = pos;
1880     if (server.stat_peak_memory < zmalloc_used_memory())
1881         server.stat_peak_memory = zmalloc_used_memory();
1882 }
1883 
1884 /* Loading finished */
stopLoading(void)1885 void stopLoading(void) {
1886     server.loading = 0;
1887 }
1888 
1889 /* Track loading progress in order to serve client's from time to time
1890    and if needed calculate rdb checksum  */
rdbLoadProgressCallback(rio * r,const void * buf,size_t len)1891 void rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
1892     if (server.rdb_checksum)
1893         rioGenericUpdateChecksum(r, buf, len);
1894     if (server.loading_process_events_interval_bytes &&
1895         (r->processed_bytes + len)/server.loading_process_events_interval_bytes > r->processed_bytes/server.loading_process_events_interval_bytes)
1896     {
1897         /* The DB can take some non trivial amount of time to load. Update
1898          * our cached time since it is used to create and update the last
1899          * interaction time with clients and for other important things. */
1900         updateCachedTime(0);
1901         if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER)
1902             replicationSendNewlineToMaster();
1903         loadingProgress(r->processed_bytes);
1904         processEventsWhileBlocked();
1905     }
1906 }
1907 
1908 /* Load an RDB file from the rio stream 'rdb'. On success C_OK is returned,
1909  * otherwise C_ERR is returned and 'errno' is set accordingly. */
rdbLoadRio(rio * rdb,rdbSaveInfo * rsi,int loading_aof)1910 int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi, int loading_aof) {
1911     uint64_t dbid;
1912     int type, rdbver;
1913     redisDb *db = server.db+0;
1914     char buf[1024];
1915 
1916     rdb->update_cksum = rdbLoadProgressCallback;
1917     rdb->max_processing_chunk = server.loading_process_events_interval_bytes;
1918     if (rioRead(rdb,buf,9) == 0) goto eoferr;
1919     buf[9] = '\0';
1920     if (memcmp(buf,"REDIS",5) != 0) {
1921         serverLog(LL_WARNING,"Wrong signature trying to load DB from file");
1922         errno = EINVAL;
1923         return C_ERR;
1924     }
1925     rdbver = atoi(buf+5);
1926     if (rdbver < 1 || rdbver > RDB_VERSION) {
1927         serverLog(LL_WARNING,"Can't handle RDB format version %d",rdbver);
1928         errno = EINVAL;
1929         return C_ERR;
1930     }
1931 
1932     /* Key-specific attributes, set by opcodes before the key type. */
1933     long long lru_idle = -1, lfu_freq = -1, expiretime = -1, now = mstime();
1934     long long lru_clock = LRU_CLOCK();
1935 
1936     while(1) {
1937         robj *key, *val;
1938 
1939         /* Read type. */
1940         if ((type = rdbLoadType(rdb)) == -1) goto eoferr;
1941 
1942         /* Handle special types. */
1943         if (type == RDB_OPCODE_EXPIRETIME) {
1944             /* EXPIRETIME: load an expire associated with the next key
1945              * to load. Note that after loading an expire we need to
1946              * load the actual type, and continue. */
1947             expiretime = rdbLoadTime(rdb);
1948             expiretime *= 1000;
1949             continue; /* Read next opcode. */
1950         } else if (type == RDB_OPCODE_EXPIRETIME_MS) {
1951             /* EXPIRETIME_MS: milliseconds precision expire times introduced
1952              * with RDB v3. Like EXPIRETIME but no with more precision. */
1953             expiretime = rdbLoadMillisecondTime(rdb,rdbver);
1954             continue; /* Read next opcode. */
1955         } else if (type == RDB_OPCODE_FREQ) {
1956             /* FREQ: LFU frequency. */
1957             uint8_t byte;
1958             if (rioRead(rdb,&byte,1) == 0) goto eoferr;
1959             lfu_freq = byte;
1960             continue; /* Read next opcode. */
1961         } else if (type == RDB_OPCODE_IDLE) {
1962             /* IDLE: LRU idle time. */
1963             uint64_t qword;
1964             if ((qword = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr;
1965             lru_idle = qword;
1966             continue; /* Read next opcode. */
1967         } else if (type == RDB_OPCODE_EOF) {
1968             /* EOF: End of file, exit the main loop. */
1969             break;
1970         } else if (type == RDB_OPCODE_SELECTDB) {
1971             /* SELECTDB: Select the specified database. */
1972             if ((dbid = rdbLoadLen(rdb,NULL)) == RDB_LENERR) goto eoferr;
1973             if (dbid >= (unsigned)server.dbnum) {
1974                 serverLog(LL_WARNING,
1975                     "FATAL: Data file was created with a Redis "
1976                     "server configured to handle more than %d "
1977                     "databases. Exiting\n", server.dbnum);
1978                 exit(1);
1979             }
1980             db = server.db+dbid;
1981             continue; /* Read next opcode. */
1982         } else if (type == RDB_OPCODE_RESIZEDB) {
1983             /* RESIZEDB: Hint about the size of the keys in the currently
1984              * selected data base, in order to avoid useless rehashing. */
1985             uint64_t db_size, expires_size;
1986             if ((db_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
1987                 goto eoferr;
1988             if ((expires_size = rdbLoadLen(rdb,NULL)) == RDB_LENERR)
1989                 goto eoferr;
1990             dictExpand(db->dict,db_size);
1991             dictExpand(db->expires,expires_size);
1992             continue; /* Read next opcode. */
1993         } else if (type == RDB_OPCODE_AUX) {
1994             /* AUX: generic string-string fields. Use to add state to RDB
1995              * which is backward compatible. Implementations of RDB loading
1996              * are requierd to skip AUX fields they don't understand.
1997              *
1998              * An AUX field is composed of two strings: key and value. */
1999             robj *auxkey, *auxval;
2000             if ((auxkey = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
2001             if ((auxval = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
2002 
2003             if (((char*)auxkey->ptr)[0] == '%') {
2004                 /* All the fields with a name staring with '%' are considered
2005                  * information fields and are logged at startup with a log
2006                  * level of NOTICE. */
2007                 serverLog(LL_NOTICE,"RDB '%s': %s",
2008                     (char*)auxkey->ptr,
2009                     (char*)auxval->ptr);
2010             } else if (!strcasecmp(auxkey->ptr,"repl-stream-db")) {
2011                 if (rsi) rsi->repl_stream_db = atoi(auxval->ptr);
2012             } else if (!strcasecmp(auxkey->ptr,"repl-id")) {
2013                 if (rsi && sdslen(auxval->ptr) == CONFIG_RUN_ID_SIZE) {
2014                     memcpy(rsi->repl_id,auxval->ptr,CONFIG_RUN_ID_SIZE+1);
2015                     rsi->repl_id_is_set = 1;
2016                 }
2017             } else if (!strcasecmp(auxkey->ptr,"repl-offset")) {
2018                 if (rsi) rsi->repl_offset = strtoll(auxval->ptr,NULL,10);
2019             } else if (!strcasecmp(auxkey->ptr,"lua")) {
2020                 /* Load the script back in memory. */
2021                 if (luaCreateFunction(NULL,server.lua,auxval) == NULL) {
2022                     rdbExitReportCorruptRDB(
2023                         "Can't load Lua script from RDB file! "
2024                         "BODY: %s", auxval->ptr);
2025                 }
2026             } else {
2027                 /* We ignore fields we don't understand, as by AUX field
2028                  * contract. */
2029                 serverLog(LL_DEBUG,"Unrecognized RDB AUX field: '%s'",
2030                     (char*)auxkey->ptr);
2031             }
2032 
2033             decrRefCount(auxkey);
2034             decrRefCount(auxval);
2035             continue; /* Read type again. */
2036         } else if (type == RDB_OPCODE_MODULE_AUX) {
2037             /* Load module data that is not related to the Redis key space.
2038              * Such data can be potentially be stored both before and after the
2039              * RDB keys-values section. */
2040             uint64_t moduleid = rdbLoadLen(rdb,NULL);
2041             int when_opcode = rdbLoadLen(rdb,NULL);
2042             int when = rdbLoadLen(rdb,NULL);
2043             if (when_opcode != RDB_MODULE_OPCODE_UINT)
2044                 rdbExitReportCorruptRDB("bad when_opcode");
2045             moduleType *mt = moduleTypeLookupModuleByID(moduleid);
2046             char name[10];
2047             moduleTypeNameByID(name,moduleid);
2048 
2049             if (!rdbCheckMode && mt == NULL) {
2050                 /* Unknown module. */
2051                 serverLog(LL_WARNING,"The RDB file contains AUX module data I can't load: no matching module '%s'", name);
2052                 exit(1);
2053             } else if (!rdbCheckMode && mt != NULL) {
2054                 if (!mt->aux_load) {
2055                     /* Module doesn't support AUX. */
2056                     serverLog(LL_WARNING,"The RDB file contains module AUX data, but the module '%s' doesn't seem to support it.", name);
2057                     exit(1);
2058                 }
2059 
2060                 RedisModuleIO io;
2061                 moduleInitIOContext(io,mt,rdb,NULL);
2062                 io.ver = 2;
2063                 /* Call the rdb_load method of the module providing the 10 bit
2064                  * encoding version in the lower 10 bits of the module ID. */
2065                 if (mt->aux_load(&io,moduleid&1023, when) || io.error) {
2066                     moduleTypeNameByID(name,moduleid);
2067                     serverLog(LL_WARNING,"The RDB file contains module AUX data for the module type '%s', that the responsible module is not able to load. Check for modules log above for additional clues.", name);
2068                     exit(1);
2069                 }
2070                 if (io.ctx) {
2071                     moduleFreeContext(io.ctx);
2072                     zfree(io.ctx);
2073                 }
2074                 uint64_t eof = rdbLoadLen(rdb,NULL);
2075                 if (eof != RDB_MODULE_OPCODE_EOF) {
2076                     serverLog(LL_WARNING,"The RDB file contains module AUX data for the module '%s' that is not terminated by the proper module value EOF marker", name);
2077                     exit(1);
2078                 }
2079                 continue;
2080             } else {
2081                 /* RDB check mode. */
2082                 robj *aux = rdbLoadCheckModuleValue(rdb,name);
2083                 decrRefCount(aux);
2084                 continue; /* Read next opcode. */
2085             }
2086         }
2087 
2088         /* Read key */
2089         if ((key = rdbLoadStringObject(rdb)) == NULL) goto eoferr;
2090         /* Read value */
2091         if ((val = rdbLoadObject(type,rdb,key)) == NULL) goto eoferr;
2092         /* Check if the key already expired. This function is used when loading
2093          * an RDB file from disk, either at startup, or when an RDB was
2094          * received from the master. In the latter case, the master is
2095          * responsible for key expiry. If we would expire keys here, the
2096          * snapshot taken by the master may not be reflected on the slave. */
2097         if (server.masterhost == NULL && !loading_aof && expiretime != -1 && expiretime < now) {
2098             decrRefCount(key);
2099             decrRefCount(val);
2100         } else {
2101             /* Add the new object in the hash table */
2102             dbAdd(db,key,val);
2103 
2104             /* Set the expire time if needed */
2105             if (expiretime != -1) setExpire(NULL,db,key,expiretime);
2106 
2107             /* Set usage information (for eviction). */
2108             objectSetLRUOrLFU(val,lfu_freq,lru_idle,lru_clock);
2109 
2110             /* Decrement the key refcount since dbAdd() will take its
2111              * own reference. */
2112             decrRefCount(key);
2113         }
2114 
2115         /* Reset the state that is key-specified and is populated by
2116          * opcodes before the key, so that we start from scratch again. */
2117         expiretime = -1;
2118         lfu_freq = -1;
2119         lru_idle = -1;
2120     }
2121     /* Verify the checksum if RDB version is >= 5 */
2122     if (rdbver >= 5) {
2123         uint64_t cksum, expected = rdb->cksum;
2124 
2125         if (rioRead(rdb,&cksum,8) == 0) goto eoferr;
2126         if (server.rdb_checksum) {
2127             memrev64ifbe(&cksum);
2128             if (cksum == 0) {
2129                 serverLog(LL_WARNING,"RDB file was saved with checksum disabled: no check performed.");
2130             } else if (cksum != expected) {
2131                 serverLog(LL_WARNING,"Wrong RDB checksum. Aborting now.");
2132                 rdbExitReportCorruptRDB("RDB CRC error");
2133             }
2134         }
2135     }
2136     return C_OK;
2137 
2138 eoferr: /* unexpected end of file is handled here with a fatal exit */
2139     serverLog(LL_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
2140     rdbExitReportCorruptRDB("Unexpected EOF reading RDB file");
2141     return C_ERR; /* Just to avoid warning */
2142 }
2143 
2144 /* Like rdbLoadRio() but takes a filename instead of a rio stream. The
2145  * filename is open for reading and a rio stream object created in order
2146  * to do the actual loading. Moreover the ETA displayed in the INFO
2147  * output is initialized and finalized.
2148  *
2149  * If you pass an 'rsi' structure initialied with RDB_SAVE_OPTION_INIT, the
2150  * loading code will fiil the information fields in the structure. */
rdbLoad(char * filename,rdbSaveInfo * rsi)2151 int rdbLoad(char *filename, rdbSaveInfo *rsi) {
2152     FILE *fp;
2153     rio rdb;
2154     int retval;
2155 
2156     if ((fp = fopen(filename,"r")) == NULL) return C_ERR;
2157     startLoading(fp);
2158     rioInitWithFile(&rdb,fp);
2159     retval = rdbLoadRio(&rdb,rsi,0);
2160     fclose(fp);
2161     stopLoading();
2162     return retval;
2163 }
2164 
2165 /* A background saving child (BGSAVE) terminated its work. Handle this.
2166  * This function covers the case of actual BGSAVEs. */
backgroundSaveDoneHandlerDisk(int exitcode,int bysignal)2167 void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
2168     if (!bysignal && exitcode == 0) {
2169         serverLog(LL_NOTICE,
2170             "Background saving terminated with success");
2171         server.dirty = server.dirty - server.dirty_before_bgsave;
2172         server.lastsave = time(NULL);
2173         server.lastbgsave_status = C_OK;
2174     } else if (!bysignal && exitcode != 0) {
2175         serverLog(LL_WARNING, "Background saving error");
2176         server.lastbgsave_status = C_ERR;
2177     } else {
2178         mstime_t latency;
2179 
2180         serverLog(LL_WARNING,
2181             "Background saving terminated by signal %d", bysignal);
2182         latencyStartMonitor(latency);
2183         rdbRemoveTempFile(server.rdb_child_pid);
2184         latencyEndMonitor(latency);
2185         latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency);
2186         /* SIGUSR1 is whitelisted, so we have a way to kill a child without
2187          * tirggering an error condition. */
2188         if (bysignal != SIGUSR1)
2189             server.lastbgsave_status = C_ERR;
2190     }
2191     server.rdb_child_pid = -1;
2192     server.rdb_child_type = RDB_CHILD_TYPE_NONE;
2193     server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start;
2194     server.rdb_save_time_start = -1;
2195     /* Possibly there are slaves waiting for a BGSAVE in order to be served
2196      * (the first stage of SYNC is a bulk transfer of dump.rdb) */
2197     updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_DISK);
2198 }
2199 
2200 /* A background saving child (BGSAVE) terminated its work. Handle this.
2201  * This function covers the case of RDB -> Salves socket transfers for
2202  * diskless replication. */
backgroundSaveDoneHandlerSocket(int exitcode,int bysignal)2203 void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
2204     uint64_t *ok_slaves;
2205 
2206     if (!bysignal && exitcode == 0) {
2207         serverLog(LL_NOTICE,
2208             "Background RDB transfer terminated with success");
2209     } else if (!bysignal && exitcode != 0) {
2210         serverLog(LL_WARNING, "Background transfer error");
2211     } else {
2212         serverLog(LL_WARNING,
2213             "Background transfer terminated by signal %d", bysignal);
2214     }
2215     server.rdb_child_pid = -1;
2216     server.rdb_child_type = RDB_CHILD_TYPE_NONE;
2217     server.rdb_save_time_start = -1;
2218 
2219     /* If the child returns an OK exit code, read the set of slave client
2220      * IDs and the associated status code. We'll terminate all the slaves
2221      * in error state.
2222      *
2223      * If the process returned an error, consider the list of slaves that
2224      * can continue to be empty, so that it's just a special case of the
2225      * normal code path. */
2226     ok_slaves = zmalloc(sizeof(uint64_t)); /* Make space for the count. */
2227     ok_slaves[0] = 0;
2228     if (!bysignal && exitcode == 0) {
2229         int readlen = sizeof(uint64_t);
2230 
2231         if (read(server.rdb_pipe_read_result_from_child, ok_slaves, readlen) ==
2232                  readlen)
2233         {
2234             readlen = ok_slaves[0]*sizeof(uint64_t)*2;
2235 
2236             /* Make space for enough elements as specified by the first
2237              * uint64_t element in the array. */
2238             ok_slaves = zrealloc(ok_slaves,sizeof(uint64_t)+readlen);
2239             if (readlen &&
2240                 read(server.rdb_pipe_read_result_from_child, ok_slaves+1,
2241                      readlen) != readlen)
2242             {
2243                 ok_slaves[0] = 0;
2244             }
2245         }
2246     }
2247 
2248     close(server.rdb_pipe_read_result_from_child);
2249     close(server.rdb_pipe_write_result_to_parent);
2250 
2251     /* We can continue the replication process with all the slaves that
2252      * correctly received the full payload. Others are terminated. */
2253     listNode *ln;
2254     listIter li;
2255 
2256     listRewind(server.slaves,&li);
2257     while((ln = listNext(&li))) {
2258         client *slave = ln->value;
2259 
2260         if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END) {
2261             uint64_t j;
2262             int errorcode = 0;
2263 
2264             /* Search for the slave ID in the reply. In order for a slave to
2265              * continue the replication process, we need to find it in the list,
2266              * and it must have an error code set to 0 (which means success). */
2267             for (j = 0; j < ok_slaves[0]; j++) {
2268                 if (slave->id == ok_slaves[2*j+1]) {
2269                     errorcode = ok_slaves[2*j+2];
2270                     break; /* Found in slaves list. */
2271                 }
2272             }
2273             if (j == ok_slaves[0] || errorcode != 0) {
2274                 serverLog(LL_WARNING,
2275                 "Closing slave %s: child->slave RDB transfer failed: %s",
2276                     replicationGetSlaveName(slave),
2277                     (errorcode == 0) ? "RDB transfer child aborted"
2278                                      : strerror(errorcode));
2279                 freeClient(slave);
2280             } else {
2281                 serverLog(LL_WARNING,
2282                 "Slave %s correctly received the streamed RDB file.",
2283                     replicationGetSlaveName(slave));
2284                 /* Restore the socket as non-blocking. */
2285                 anetNonBlock(NULL,slave->fd);
2286                 anetSendTimeout(NULL,slave->fd,0);
2287             }
2288         }
2289     }
2290     zfree(ok_slaves);
2291 
2292     updateSlavesWaitingBgsave((!bysignal && exitcode == 0) ? C_OK : C_ERR, RDB_CHILD_TYPE_SOCKET);
2293 }
2294 
2295 /* When a background RDB saving/transfer terminates, call the right handler. */
backgroundSaveDoneHandler(int exitcode,int bysignal)2296 void backgroundSaveDoneHandler(int exitcode, int bysignal) {
2297     switch(server.rdb_child_type) {
2298     case RDB_CHILD_TYPE_DISK:
2299         backgroundSaveDoneHandlerDisk(exitcode,bysignal);
2300         break;
2301     case RDB_CHILD_TYPE_SOCKET:
2302         backgroundSaveDoneHandlerSocket(exitcode,bysignal);
2303         break;
2304     default:
2305         serverPanic("Unknown RDB child type.");
2306         break;
2307     }
2308 }
2309 
2310 /* Spawn an RDB child that writes the RDB to the sockets of the slaves
2311  * that are currently in SLAVE_STATE_WAIT_BGSAVE_START state. */
rdbSaveToSlavesSockets(rdbSaveInfo * rsi)2312 int rdbSaveToSlavesSockets(rdbSaveInfo *rsi) {
2313     int *fds;
2314     uint64_t *clientids;
2315     int numfds;
2316     listNode *ln;
2317     listIter li;
2318     pid_t childpid;
2319     long long start;
2320     int pipefds[2];
2321 
2322     if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
2323 
2324     /* Before to fork, create a pipe that will be used in order to
2325      * send back to the parent the IDs of the slaves that successfully
2326      * received all the writes. */
2327     if (pipe(pipefds) == -1) return C_ERR;
2328     server.rdb_pipe_read_result_from_child = pipefds[0];
2329     server.rdb_pipe_write_result_to_parent = pipefds[1];
2330 
2331     /* Collect the file descriptors of the slaves we want to transfer
2332      * the RDB to, which are i WAIT_BGSAVE_START state. */
2333     fds = zmalloc(sizeof(int)*listLength(server.slaves));
2334     /* We also allocate an array of corresponding client IDs. This will
2335      * be useful for the child process in order to build the report
2336      * (sent via unix pipe) that will be sent to the parent. */
2337     clientids = zmalloc(sizeof(uint64_t)*listLength(server.slaves));
2338     numfds = 0;
2339 
2340     listRewind(server.slaves,&li);
2341     while((ln = listNext(&li))) {
2342         client *slave = ln->value;
2343 
2344         if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
2345             clientids[numfds] = slave->id;
2346             fds[numfds++] = slave->fd;
2347             replicationSetupSlaveForFullResync(slave,getPsyncInitialOffset());
2348             /* Put the socket in blocking mode to simplify RDB transfer.
2349              * We'll restore it when the children returns (since duped socket
2350              * will share the O_NONBLOCK attribute with the parent). */
2351             anetBlock(NULL,slave->fd);
2352             anetSendTimeout(NULL,slave->fd,server.repl_timeout*1000);
2353         }
2354     }
2355 
2356     /* Create the child process. */
2357     openChildInfoPipe();
2358     start = ustime();
2359     if ((childpid = fork()) == 0) {
2360         /* Child */
2361         int retval;
2362         rio slave_sockets;
2363 
2364         rioInitWithFdset(&slave_sockets,fds,numfds);
2365         zfree(fds);
2366 
2367         closeClildUnusedResourceAfterFork();
2368         redisSetProcTitle("redis-rdb-to-slaves");
2369 
2370         retval = rdbSaveRioWithEOFMark(&slave_sockets,NULL,rsi);
2371         if (retval == C_OK && rioFlush(&slave_sockets) == 0)
2372             retval = C_ERR;
2373 
2374         if (retval == C_OK) {
2375             size_t private_dirty = zmalloc_get_private_dirty(-1);
2376 
2377             if (private_dirty) {
2378                 serverLog(LL_NOTICE,
2379                     "RDB: %zu MB of memory used by copy-on-write",
2380                     private_dirty/(1024*1024));
2381             }
2382 
2383             server.child_info_data.cow_size = private_dirty;
2384             sendChildInfo(CHILD_INFO_TYPE_RDB);
2385 
2386             /* If we are returning OK, at least one slave was served
2387              * with the RDB file as expected, so we need to send a report
2388              * to the parent via the pipe. The format of the message is:
2389              *
2390              * <len> <slave[0].id> <slave[0].error> ...
2391              *
2392              * len, slave IDs, and slave errors, are all uint64_t integers,
2393              * so basically the reply is composed of 64 bits for the len field
2394              * plus 2 additional 64 bit integers for each entry, for a total
2395              * of 'len' entries.
2396              *
2397              * The 'id' represents the slave's client ID, so that the master
2398              * can match the report with a specific slave, and 'error' is
2399              * set to 0 if the replication process terminated with a success
2400              * or the error code if an error occurred. */
2401             void *msg = zmalloc(sizeof(uint64_t)*(1+2*numfds));
2402             uint64_t *len = msg;
2403             uint64_t *ids = len+1;
2404             int j, msglen;
2405 
2406             *len = numfds;
2407             for (j = 0; j < numfds; j++) {
2408                 *ids++ = clientids[j];
2409                 *ids++ = slave_sockets.io.fdset.state[j];
2410             }
2411 
2412             /* Write the message to the parent. If we have no good slaves or
2413              * we are unable to transfer the message to the parent, we exit
2414              * with an error so that the parent will abort the replication
2415              * process with all the childre that were waiting. */
2416             msglen = sizeof(uint64_t)*(1+2*numfds);
2417             if (*len == 0 ||
2418                 write(server.rdb_pipe_write_result_to_parent,msg,msglen)
2419                 != msglen)
2420             {
2421                 retval = C_ERR;
2422             }
2423             zfree(msg);
2424         }
2425         zfree(clientids);
2426         rioFreeFdset(&slave_sockets);
2427         exitFromChild((retval == C_OK) ? 0 : 1);
2428     } else {
2429         /* Parent */
2430         if (childpid == -1) {
2431             serverLog(LL_WARNING,"Can't save in background: fork: %s",
2432                 strerror(errno));
2433 
2434             /* Undo the state change. The caller will perform cleanup on
2435              * all the slaves in BGSAVE_START state, but an early call to
2436              * replicationSetupSlaveForFullResync() turned it into BGSAVE_END */
2437             listRewind(server.slaves,&li);
2438             while((ln = listNext(&li))) {
2439                 client *slave = ln->value;
2440                 int j;
2441 
2442                 for (j = 0; j < numfds; j++) {
2443                     if (slave->id == clientids[j]) {
2444                         slave->replstate = SLAVE_STATE_WAIT_BGSAVE_START;
2445                         break;
2446                     }
2447                 }
2448             }
2449             close(pipefds[0]);
2450             close(pipefds[1]);
2451             closeChildInfoPipe();
2452         } else {
2453             server.stat_fork_time = ustime()-start;
2454             server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
2455             latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
2456 
2457             serverLog(LL_NOTICE,"Background RDB transfer started by pid %d",
2458                 childpid);
2459             server.rdb_save_time_start = time(NULL);
2460             server.rdb_child_pid = childpid;
2461             server.rdb_child_type = RDB_CHILD_TYPE_SOCKET;
2462             updateDictResizePolicy();
2463         }
2464         zfree(clientids);
2465         zfree(fds);
2466         return (childpid == -1) ? C_ERR : C_OK;
2467     }
2468     return C_OK; /* Unreached. */
2469 }
2470 
/* SAVE
 *
 * Save the dataset to server.rdb_filename by calling rdbSave() directly
 * from the command handler. The command is refused while an RDB child is
 * active. Replies with OK on success, with a generic error otherwise. */
void saveCommand(client *c) {
    rdbSaveInfo rsi;

    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
        return;
    }

    rdbSaveInfo *rsiptr = rdbPopulateSaveInfo(&rsi);
    int saved = rdbSave(server.rdb_filename,rsiptr);

    addReply(c, (saved == C_OK) ? shared.ok : shared.err);
}
2484 
2485 /* BGSAVE [SCHEDULE] */
bgsaveCommand(client * c)2486 void bgsaveCommand(client *c) {
2487     int schedule = 0;
2488 
2489     /* The SCHEDULE option changes the behavior of BGSAVE when an AOF rewrite
2490      * is in progress. Instead of returning an error a BGSAVE gets scheduled. */
2491     if (c->argc > 1) {
2492         if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"schedule")) {
2493             schedule = 1;
2494         } else {
2495             addReply(c,shared.syntaxerr);
2496             return;
2497         }
2498     }
2499 
2500     rdbSaveInfo rsi, *rsiptr;
2501     rsiptr = rdbPopulateSaveInfo(&rsi);
2502 
2503     if (server.rdb_child_pid != -1) {
2504         addReplyError(c,"Background save already in progress");
2505     } else if (server.aof_child_pid != -1) {
2506         if (schedule) {
2507             server.rdb_bgsave_scheduled = 1;
2508             addReplyStatus(c,"Background saving scheduled");
2509         } else {
2510             addReplyError(c,
2511                 "An AOF log rewriting in progress: can't BGSAVE right now. "
2512                 "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
2513                 "possible.");
2514         }
2515     } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) {
2516         addReplyStatus(c,"Background saving started");
2517     } else {
2518         addReply(c,shared.err);
2519     }
2520 }
2521 
2522 /* Populate the rdbSaveInfo structure used to persist the replication
2523  * information inside the RDB file. Currently the structure explicitly
2524  * contains just the currently selected DB from the master stream, however
2525  * if the rdbSave*() family functions receive a NULL rsi structure also
2526  * the Replication ID/offset is not saved. The function popultes 'rsi'
2527  * that is normally stack-allocated in the caller, returns the populated
2528  * pointer if the instance has a valid master client, otherwise NULL
2529  * is returned, and the RDB saving will not persist any replication related
2530  * information. */
rdbPopulateSaveInfo(rdbSaveInfo * rsi)2531 rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) {
2532     rdbSaveInfo rsi_init = RDB_SAVE_INFO_INIT;
2533     *rsi = rsi_init;
2534 
2535     /* If the instance is a master, we can populate the replication info
2536      * only when repl_backlog is not NULL. If the repl_backlog is NULL,
2537      * it means that the instance isn't in any replication chains. In this
2538      * scenario the replication info is useless, because when a slave
2539      * connects to us, the NULL repl_backlog will trigger a full
2540      * synchronization, at the same time we will use a new replid and clear
2541      * replid2. */
2542     if (!server.masterhost && server.repl_backlog) {
2543         /* Note that when server.slaveseldb is -1, it means that this master
2544          * didn't apply any write commands after a full synchronization.
2545          * So we can let repl_stream_db be 0, this allows a restarted slave
2546          * to reload replication ID/offset, it's safe because the next write
2547          * command must generate a SELECT statement. */
2548         rsi->repl_stream_db = server.slaveseldb == -1 ? 0 : server.slaveseldb;
2549         return rsi;
2550     }
2551 
2552     /* If the instance is a slave we need a connected master
2553      * in order to fetch the currently selected DB. */
2554     if (server.master) {
2555         rsi->repl_stream_db = server.master->db->id;
2556         return rsi;
2557     }
2558 
2559     /* If we have a cached master we can use it in order to populate the
2560      * replication selected DB info inside the RDB file: the slave can
2561      * increment the master_repl_offset only from data arriving from the
2562      * master, so if we are disconnected the offset in the cached master
2563      * is valid. */
2564     if (server.cached_master) {
2565         rsi->repl_stream_db = server.cached_master->db->id;
2566         return rsi;
2567     }
2568     return NULL;
2569 }
2570