#ifndef __CLUSTER_H
#define __CLUSTER_H

/*-----------------------------------------------------------------------------
 * Redis cluster data structures, defines, exported API.
 *
 * This header contains no #include lines of its own. It uses mstime_t, sds,
 * list, dict, rax, client, robj, NET_IP_STR_LEN and the fixed-width integer
 * types (uint8_t ... uint64_t), none of which are declared here, so those
 * must already be visible at the point where this file is included.
 *----------------------------------------------------------------------------*/

#define CLUSTER_SLOTS 16384
#define CLUSTER_OK 0 /* Everything looks ok */
#define CLUSTER_FAIL 1 /* The cluster can't work */
#define CLUSTER_NAMELEN 40 /* sha1 hex length */
#define CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */

/* The following defines are amounts of time, sometimes expressed as
 * multipliers of the node timeout value (when ending with MULT). */
#define CLUSTER_DEFAULT_NODE_TIMEOUT 15000
#define CLUSTER_DEFAULT_SLAVE_VALIDITY 10 /* Slave max data age factor. */
#define CLUSTER_DEFAULT_REQUIRE_FULL_COVERAGE 1
#define CLUSTER_DEFAULT_SLAVE_NO_FAILOVER 0 /* Failover by default. */
#define CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */
#define CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
#define CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
#define CLUSTER_FAILOVER_DELAY 5 /* Seconds */
#define CLUSTER_DEFAULT_MIGRATION_BARRIER 1
#define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */
#define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */
#define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */

/* Redirection errors returned by getNodeByQuery(). */
#define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */
#define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. */
#define CLUSTER_REDIR_UNSTABLE 2 /* -TRYAGAIN redirection required */
#define CLUSTER_REDIR_ASK 3 /* -ASK redirection required. */
#define CLUSTER_REDIR_MOVED 4 /* -MOVED redirection required. */
#define CLUSTER_REDIR_DOWN_STATE 5 /* -CLUSTERDOWN, global state. */
#define CLUSTER_REDIR_DOWN_UNBOUND 6 /* -CLUSTERDOWN, unbound slot. */

/* Forward declaration: clusterLink holds a pointer to a clusterNode, whose
 * full definition appears later in this file. */
struct clusterNode;

/* clusterLink encapsulates everything needed to talk with a remote node. */
typedef struct clusterLink {
    mstime_t ctime;             /* Link creation time */
    int fd;                     /* TCP socket file descriptor */
    sds sndbuf;                 /* Packet send buffer */
    sds rcvbuf;                 /* Packet reception buffer */
    struct clusterNode *node;   /* Node related to this link if any, or NULL */
} clusterLink;

/* Cluster node flags and macros.
 * Every flag value is a distinct single bit, so flags can be combined in
 * clusterNode.flags and tested with the macros below. */
#define CLUSTER_NODE_MASTER 1     /* The node is a master */
#define CLUSTER_NODE_SLAVE 2      /* The node is a slave */
#define CLUSTER_NODE_PFAIL 4      /* Failure? Need acknowledge */
#define CLUSTER_NODE_FAIL 8       /* The node is believed to be malfunctioning */
#define CLUSTER_NODE_MYSELF 16    /* This node is myself */
#define CLUSTER_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */
#define CLUSTER_NODE_NOADDR   64  /* We don't know the address of this node */
#define CLUSTER_NODE_MEET 128     /* Send a MEET message to this node */
#define CLUSTER_NODE_MIGRATE_TO 256 /* Master eligible for replica migration. */
#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */
/* A node name made only of zero bytes. */
#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"

/* Flag test helpers. `n` is a pointer to a structure with a `flags` member.
 * Except for nodeHasAddr() (which yields 0 or 1 because of the logical
 * negation), each macro expands to a bitwise AND, so the result is either 0
 * or the value of the tested bit, not necessarily 1. */
#define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER)
#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE)
#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE)
#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR))
#define nodeWithoutAddr(n) ((n)->flags & CLUSTER_NODE_NOADDR)
#define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL)
#define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL)
#define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER)

/* Reasons why a slave is not able to failover. */
#define CLUSTER_CANT_FAILOVER_NONE 0
#define CLUSTER_CANT_FAILOVER_DATA_AGE 1
#define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
#define CLUSTER_CANT_FAILOVER_EXPIRED 3
#define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (60*5) /* seconds. */

/* clusterState todo_before_sleep flags. */
#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0)
#define CLUSTER_TODO_UPDATE_STATE (1<<1)
#define CLUSTER_TODO_SAVE_CONFIG (1<<2)
#define CLUSTER_TODO_FSYNC_CONFIG (1<<3)

/* Message types.
 *
 * Note that the PING, PONG and MEET messages are actually the same exact
 * kind of packet. PONG is the reply to ping, in the exact format as a PING,
 * while MEET is a special PING that forces the receiver to add the sender
 * as a node (if it is not already in the list). */
#define CLUSTERMSG_TYPE_PING 0          /* Ping */
#define CLUSTERMSG_TYPE_PONG 1          /* Pong (reply to Ping) */
#define CLUSTERMSG_TYPE_MEET 2          /* Meet "let's join" message */
#define CLUSTERMSG_TYPE_FAIL 3          /* Mark node xxx as failing */
#define CLUSTERMSG_TYPE_PUBLISH 4       /* Pub/Sub Publish propagation */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */
#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6     /* Yes, you have my vote */
#define CLUSTERMSG_TYPE_UPDATE 7        /* Another node slots configuration */
#define CLUSTERMSG_TYPE_MFSTART 8       /* Pause clients for manual failover */
#define CLUSTERMSG_TYPE_MODULE 9        /* Module cluster API message. */
#define CLUSTERMSG_TYPE_COUNT 10        /* Total number of message types. */

/* Flags that a module can set in order to prevent certain Redis Cluster
 * features from being enabled. Useful when implementing a different
 * distributed system on top of the Redis Cluster message bus, using modules.
 * Note that bit 0 is not assigned to any flag below. */
#define CLUSTER_MODULE_FLAG_NONE 0
#define CLUSTER_MODULE_FLAG_NO_FAILOVER (1<<1)
#define CLUSTER_MODULE_FLAG_NO_REDIRECTION (1<<2)

/* This structure represents elements of node->fail_reports. */
typedef struct clusterNodeFailReport {
    struct clusterNode *node;  /* Node reporting the failure condition. */
    mstime_t time;             /* Time of the last report from this node. */
} clusterNodeFailReport;

/* What this instance knows about one node of the cluster. */
typedef struct clusterNode {
    mstime_t ctime; /* Node object creation time. */
    char name[CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */
    int flags;      /* CLUSTER_NODE_... */
    uint64_t configEpoch; /* Last configEpoch observed for this node */
    unsigned char slots[CLUSTER_SLOTS/8]; /* slots handled by this node */
    int numslots;   /* Number of slots handled by this node */
    int numslaves;  /* Number of slave nodes, if this is a master */
    struct clusterNode **slaves; /* pointers to slave nodes */
    struct clusterNode *slaveof; /* pointer to the master node. Note that it
                                    may be NULL even if the node is a slave
                                    if we don't have the master node in our
                                    tables. */
    mstime_t ping_sent;      /* Unix time we sent latest ping */
    mstime_t pong_received;  /* Unix time we received the pong */
    mstime_t data_received;  /* Unix time we received any data */
    mstime_t fail_time;      /* Unix time when FAIL flag was set */
    mstime_t voted_time;     /* Last time we voted for a slave of this master */
    mstime_t repl_offset_time;  /* Unix time we received offset for this node */
    mstime_t orphaned_time;     /* Starting time of orphaned master condition */
    long long repl_offset;      /* Last known repl offset for this node. */
    char ip[NET_IP_STR_LEN];  /* Latest known IP address of this node */
    int port;                   /* Latest known clients port of this node */
    int cport;                  /* Latest known cluster port of this node. */
    clusterLink *link;          /* TCP/IP link with this node */
    list *fail_reports;         /* List of nodes signaling this as failing */
} clusterNode;

/* Cluster-wide state as seen by this instance. The three clusterNode*
 * arrays and slots_keys_count each have one entry per slot (CLUSTER_SLOTS). */
typedef struct clusterState {
    clusterNode *myself;  /* This node */
    uint64_t currentEpoch;
    int state;            /* CLUSTER_OK, CLUSTER_FAIL, ... */
    int size;             /* Num of master nodes with at least one slot */
    dict *nodes;          /* Hash table of name -> clusterNode structures */
    dict *nodes_black_list; /* Nodes we don't re-add for a few seconds. */
    clusterNode *migrating_slots_to[CLUSTER_SLOTS];
    clusterNode *importing_slots_from[CLUSTER_SLOTS];
    clusterNode *slots[CLUSTER_SLOTS];
    uint64_t slots_keys_count[CLUSTER_SLOTS];
    rax *slots_to_keys;
    /* The following fields are used to take the slave state on elections. */
    mstime_t failover_auth_time; /* Time of previous or next election. */
    int failover_auth_count;    /* Number of votes received so far. */
    int failover_auth_sent;     /* True if we already asked for votes. */
    int failover_auth_rank;     /* This slave rank for current auth request. */
    uint64_t failover_auth_epoch; /* Epoch of the current election. */
    int cant_failover_reason;   /* Why a slave is currently not able to
                                   failover. See the CANT_FAILOVER_* macros. */
    /* Manual failover state in common. */
    mstime_t mf_end;            /* Manual failover time limit (ms unixtime).
                                   It is zero if there is no MF in progress. */
    /* Manual failover state of master. */
    clusterNode *mf_slave;      /* Slave performing the manual failover. */
    /* Manual failover state of slave. */
    long long mf_master_offset; /* Master offset the slave needs to start MF
                                   or zero if still not received. */
    int mf_can_start;           /* If non-zero signal that the manual failover
                                   can start requesting masters vote. */
    /* The following fields are used by masters to take state on elections. */
    uint64_t lastVoteEpoch;     /* Epoch of the last vote granted. */
    int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
    /* Messages received and sent by type. */
    long long stats_bus_messages_sent[CLUSTERMSG_TYPE_COUNT];
    long long stats_bus_messages_received[CLUSTERMSG_TYPE_COUNT];
    long long stats_pfail_nodes;    /* Number of nodes in PFAIL status,
                                       excluding nodes without address. */
} clusterState;

/* Redis cluster messages header */

/* Initially we don't know our "name", but we'll find it once we connect
 * to the first node, using the getsockname() function. Then we'll use this
 * address for all the next messages.
 *
 * NOTE(review): the paragraph above does not appear to describe the gossip
 * structure that follows; it may be stale -- confirm before relying on it. */
typedef struct {
    char nodename[CLUSTER_NAMELEN];
    uint32_t ping_sent;
    uint32_t pong_received;
    char ip[NET_IP_STR_LEN];  /* IP address last time it was seen */
    uint16_t port;              /* base port last time it was seen */
    uint16_t cport;             /* cluster port last time it was seen */
    uint16_t flags;             /* node->flags copy */
    uint32_t notused1;
} clusterMsgDataGossip;

/* Payload of a FAIL message (see union clusterMsgData). */
typedef struct {
    char nodename[CLUSTER_NAMELEN];
} clusterMsgDataFail;

/* Payload of a PUBLISH message (see union clusterMsgData). */
typedef struct {
    uint32_t channel_len;
    uint32_t message_len;
    unsigned char bulk_data[8]; /* 8 bytes just as placeholder. */
} clusterMsgDataPublish;

/* Payload of an UPDATE message (see union clusterMsgData). */
typedef struct {
    uint64_t configEpoch; /* Config epoch of the specified instance. */
    char nodename[CLUSTER_NAMELEN]; /* Name of the slots owner. */
    unsigned char slots[CLUSTER_SLOTS/8]; /* Slots bitmap. */
} clusterMsgDataUpdate;

/* Payload of a MODULE message (see union clusterMsgData). */
typedef struct {
    uint64_t module_id;     /* ID of the sender module. */
    uint32_t len;           /* NOTE(review): the previous comment here was a
                               copy of the module_id one. Presumably the
                               length in bytes of the module payload carried
                               in bulk_data -- confirm against cluster.c. */
    uint8_t type;           /* Type from 0 to 255. */
    unsigned char bulk_data[3]; /* 3 bytes just as placeholder. */
} clusterMsgModule;

/* Type-specific payload that follows the fixed fields of clusterMsg. One
 * member per family of message types. */
union clusterMsgData {
    /* PING, MEET and PONG */
    struct {
        /* Array of N clusterMsgDataGossip structures */
        clusterMsgDataGossip gossip[1];
    } ping;

    /* FAIL */
    struct {
        clusterMsgDataFail about;
    } fail;

    /* PUBLISH */
    struct {
        clusterMsgDataPublish msg;
    } publish;

    /* UPDATE */
    struct {
        clusterMsgDataUpdate nodecfg;
    } update;

    /* MODULE */
    struct {
        clusterMsgModule msg;
    } module;
};

#define CLUSTER_PROTO_VER 1 /* Cluster bus protocol version. */

/* A cluster bus message: fixed fields followed by the type-specific `data`
 * union. Field order and sizes define the message layout, so they must not
 * be changed casually. */
typedef struct {
    char sig[4];        /* Signature "RCmb" (Redis Cluster message bus). */
    uint32_t totlen;    /* Total length of this message */
    uint16_t ver;       /* Protocol version, currently set to 1. */
    uint16_t port;      /* TCP base port number. */
    uint16_t type;      /* Message type */
    uint16_t count;     /* Only used for some kind of messages. */
    uint64_t currentEpoch;  /* The epoch accordingly to the sending node. */
    uint64_t configEpoch;   /* The config epoch if it's a master, or the last
                               epoch advertised by its master if it is a
                               slave. */
    uint64_t offset;    /* Master replication offset if node is a master or
                           processed replication offset if node is a slave. */
    char sender[CLUSTER_NAMELEN]; /* Name of the sender node */
    unsigned char myslots[CLUSTER_SLOTS/8];
    char slaveof[CLUSTER_NAMELEN];
    char myip[NET_IP_STR_LEN];    /* Sender IP, if not all zeroed. */
    char notused1[34];  /* 34 bytes reserved for future usage. */
    uint16_t cport;      /* Sender TCP cluster bus port */
    uint16_t flags;      /* Sender node flags */
    unsigned char state; /* Cluster state from the POV of the sender */
    unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */
    union clusterMsgData data;
} clusterMsg;

/* sizeof(clusterMsg) with the size of the trailing data union subtracted. */
#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData))

/* Message flags better specify the packet content or are used to
 * provide some information about the node state. */
#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */
#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if
                                            master is up. */

/* ---------------------- API exported outside cluster.c -------------------- */
/* The functions below are only declared here. getNodeByQuery() reports one of
 * the CLUSTER_REDIR_* codes defined above; NOTE(review): presumably through
 * one of its int* out-parameters -- confirm in cluster.c. */
clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask);
int clusterRedirectBlockedClientIfNeeded(client *c);
void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code);

#endif /* __CLUSTER_H */