/*
 * Consistent Hash implementation
 * Please consult this very detailed article for more information:
 * http://www.spiteful.com/2008/03/17/programmers-toolbox-part-3-consistent-hashing/
 *
 * Our implementation has to support both weighted hashing and weighted round
 * robin because we'll use it to replace the previous map-based implementation
 * which offered both algorithms.
 *
 * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */

#include <common/compat.h>
#include <common/config.h>
#include <common/debug.h>
#include <common/standard.h>
#include <eb32tree.h>

#include <types/global.h>
#include <types/server.h>

#include <proto/backend.h>
#include <proto/log.h>
#include <proto/queue.h>

/* Return next tree node after <node> which must still be in the tree, or be
 * NULL. Lookup wraps around the end to the beginning. If the next node is the
 * same node, return NULL. This is designed to find a valid next node before
 * deleting one from the tree.
 */
static inline struct eb32_node *chash_skip_node(struct eb_root *root, struct eb32_node *node)
{
	struct eb32_node *stop = node;

	if (!node)
		return NULL;
	node = eb32_next(node);
	if (!node)
		node = eb32_first(root);
	if (node == stop)
		return NULL;
	return node;
}

/* Remove all of a server's entries from its tree. This may be used when
 * setting a server down.
 */
static inline void chash_dequeue_srv(struct server *s)
{
	while (s->lb_nodes_now > 0) {
		if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
			s->lb_nodes_now = s->lb_nodes_tot;
		s->lb_nodes_now--;
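		/* If the ring cursor used by chash_get_next_server() points to
		 * the occurrence we are about to delete, advance it to the
		 * next valid node first so it never references a removed
		 * entry.
		 */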
		if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
			s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
		eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
	}
}

/* Adjust the number of entries of a server in its tree. The server must appear
 * as many times as its weight indicates. If it's there too often, we remove
 * the last occurrences. If it's not there enough, we add more occurrences. To
 * remove a server from the tree, normally call this with eweight=0.
 *
 * The server's lock and the lbprm's lock must be held.
 */
static inline void chash_queue_dequeue_srv(struct server *s)
{
	while (s->lb_nodes_now > s->next_eweight) {
		if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
			s->lb_nodes_now = s->lb_nodes_tot;
		s->lb_nodes_now--;
		if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
			s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
		eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
	}

	/* Attempt to increase the total number of nodes, if the user
	 * increased the weight beyond the original weight
	 */
	if (s->lb_nodes_tot < s->next_eweight) {
		struct tree_occ *new_nodes;

		/* First we need to remove all of the server's entries from its
		 * tree because the realloc will change all node pointers */
		chash_dequeue_srv(s);

		new_nodes = realloc(s->lb_nodes, s->next_eweight * sizeof(*new_nodes));
		if (new_nodes) {
			unsigned int j;

			s->lb_nodes = new_nodes;
			memset(&s->lb_nodes[s->lb_nodes_tot], 0,
			       (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes));
			for (j = s->lb_nodes_tot; j < s->next_eweight; j++) {
				s->lb_nodes[j].server = s;
				s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j);
			}
			s->lb_nodes_tot = s->next_eweight;
		}
	}
	while (s->lb_nodes_now < s->next_eweight) {
		if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
			break;
		if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
			s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree, s->proxy->lbprm.chash.last);
		eb32_insert(s->lb_tree, &s->lb_nodes[s->lb_nodes_now].node);
		s->lb_nodes_now++;
	}
}

/* This function updates the server trees according to server <srv>'s new
 * state. It should be called when server <srv>'s status changes to down.
 * It is not important whether the server was already down or not. It is not
 * important either that the new state is completely down (the caller may not
 * know all the variables of a server's state).
 *
 * The server's lock must be held. The lbprm lock will be used.
 */
static void chash_set_server_status_down(struct server *srv)
{
	struct proxy *p = srv->proxy;

	if (!srv_lb_status_changed(srv))
		return;

	HA_SPIN_LOCK(LBPRM_LOCK, &p->lbprm.lock);

	if (srv_willbe_usable(srv))
		goto out_update_state;

	if (!srv_currently_usable(srv))
		/* server was already down */
		goto out_update_backend;

	if (srv->flags & SRV_F_BACKUP) {
		p->lbprm.tot_wbck -= srv->cur_eweight;
		p->srv_bck--;

		if (srv == p->lbprm.fbck) {
			/* we lost the first backup server in a single-backup
			 * configuration, we must search for another one.
			 */
			struct server *srv2 = p->lbprm.fbck;
			do {
				srv2 = srv2->next;
			} while (srv2 &&
				 !((srv2->flags & SRV_F_BACKUP) &&
				   srv_willbe_usable(srv2)));
			p->lbprm.fbck = srv2;
		}
	} else {
		p->lbprm.tot_wact -= srv->cur_eweight;
		p->srv_act--;
	}

	chash_dequeue_srv(srv);

 out_update_backend:
	/* check/update tot_used, tot_weight */
	update_backend_weight(p);
 out_update_state:
	srv_lb_commit_status(srv);

	HA_SPIN_UNLOCK(LBPRM_LOCK, &p->lbprm.lock);
}

/* This function updates the server trees according to server <srv>'s new
 * state. It should be called when server <srv>'s status changes to up.
 * It is not important whether the server was already down or not. It is not
 * important either that the new state is completely UP (the caller may not
 * know all the variables of a server's state). This function will not change
 * the weight of a server which was already up.
 *
 * The server's lock must be held. The lbprm lock will be used.
 */
static void chash_set_server_status_up(struct server *srv)
{
	struct proxy *p = srv->proxy;

	if (!srv_lb_status_changed(srv))
		return;

	HA_SPIN_LOCK(LBPRM_LOCK, &p->lbprm.lock);

	if (!srv_willbe_usable(srv))
		goto out_update_state;

	if (srv_currently_usable(srv))
		/* server was already up */
		goto out_update_backend;

	if (srv->flags & SRV_F_BACKUP) {
		p->lbprm.tot_wbck += srv->next_eweight;
		p->srv_bck++;

		if (!(p->options & PR_O_USE_ALL_BK)) {
			if (!p->lbprm.fbck) {
				/* there was no backup server anymore */
				p->lbprm.fbck = srv;
			} else {
				/* we may have restored a backup server prior to fbck,
				 * in which case it should replace it.
				 */
				struct server *srv2 = srv;
				do {
					srv2 = srv2->next;
				} while (srv2 && (srv2 != p->lbprm.fbck));
				if (srv2)
					p->lbprm.fbck = srv;
			}
		}
	} else {
		p->lbprm.tot_wact += srv->next_eweight;
		p->srv_act++;
	}

	/* note that eweight cannot be 0 here */
	chash_queue_dequeue_srv(srv);

 out_update_backend:
	/* check/update tot_used, tot_weight */
	update_backend_weight(p);
 out_update_state:
	srv_lb_commit_status(srv);

	HA_SPIN_UNLOCK(LBPRM_LOCK, &p->lbprm.lock);
}

/* This function must be called after an update to server <srv>'s effective
 * weight. It may be called after a state change too.
 *
 * The server's lock must be held. The lbprm lock may be used.
 */
static void chash_update_server_weight(struct server *srv)
{
	int old_state, new_state;
	struct proxy *p = srv->proxy;

	if (!srv_lb_status_changed(srv))
		return;

	/* If changing the server's weight changes its state, we simply apply
	 * the procedures we already have for status change. If the state
	 * remains down, the server is not in any tree, so it's as easy as
	 * updating its values. If the state remains up with different weights,
	 * there are some computations to perform to find a new place and
	 * possibly a new tree for this server.
	 */

	old_state = srv_currently_usable(srv);
	new_state = srv_willbe_usable(srv);

	if (!old_state && !new_state) {
		srv_lb_commit_status(srv);
		return;
	}
	else if (!old_state && new_state) {
		chash_set_server_status_up(srv);
		return;
	}
	else if (old_state && !new_state) {
		chash_set_server_status_down(srv);
		return;
	}

	HA_SPIN_LOCK(LBPRM_LOCK, &p->lbprm.lock);

	/* only adjust the server's presence in the tree */
	chash_queue_dequeue_srv(srv);

	if (srv->flags & SRV_F_BACKUP)
		p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight;
	else
		p->lbprm.tot_wact += srv->next_eweight - srv->cur_eweight;

	update_backend_weight(p);
	srv_lb_commit_status(srv);

	HA_SPIN_UNLOCK(LBPRM_LOCK, &p->lbprm.lock);
}

/*
 * This function implements the "Consistent Hashing with Bounded Loads" algorithm
 * of Mirrokni, Thorup, and Zadimoghaddam (arxiv:1608.01350), adapted for use with
 * unequal server weights.
 */
int chash_server_is_eligible(struct server *s)
{
	/* The total number of slots to allocate is the total number of outstanding requests
	 * (including the one we're about to make) times the load-balance-factor, rounded up.
	 */
	unsigned tot_slots = ((s->proxy->served + 1) * s->proxy->lbprm.hash_balance_factor + 99) / 100;
	unsigned slots_per_weight = tot_slots / s->proxy->lbprm.tot_weight;
	unsigned remainder = tot_slots % s->proxy->lbprm.tot_weight;

	/* Allocate a whole number of slots per weight unit... */
	unsigned slots = s->cur_eweight * slots_per_weight;

	/* And then distribute the rest among servers proportionally to their weight. */
	slots += ((s->cumulative_weight + s->cur_eweight) * remainder) / s->proxy->lbprm.tot_weight
		- (s->cumulative_weight * remainder) / s->proxy->lbprm.tot_weight;
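	/* Illustrative example (hypothetical numbers): with p->served = 10,
	 * hash-balance-factor 150 and tot_weight 4, tot_slots is
	 * (11 * 150 + 99) / 100 = 17, so slots_per_weight = 4 and
	 * remainder = 1. A server with cur_eweight 1 and cumulative_weight 0
	 * then gets 4 slots and remains eligible while it is serving fewer
	 * than 4 requests.
	 */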

	/* But never leave a server with 0. */
	if (slots == 0)
		slots = 1;

	return s->served < slots;
}

/*
 * This function returns the running server from the CHASH tree, which is at
 * the closest distance from the value of <hash>. Doing so ensures that even
 * with a badly imbalanced hash, if some servers are close to each other, they
 * will still both receive traffic. If any server is found, it will be returned.
 * It will also skip server <avoid> if the hash result ends on this one.
 * If no valid server is found, NULL is returned.
 */
struct server *chash_get_server_hash(struct proxy *p, unsigned int hash, const struct server *avoid)
{
	struct eb32_node *next, *prev;
	struct server *nsrv, *psrv;
	struct eb_root *root;
	unsigned int dn, dp;
	int loop;

	HA_SPIN_LOCK(LBPRM_LOCK, &p->lbprm.lock);

	if (p->srv_act)
		root = &p->lbprm.chash.act;
	else if (p->lbprm.fbck) {
		nsrv = p->lbprm.fbck;
		goto out;
	}
	else if (p->srv_bck)
		root = &p->lbprm.chash.bck;
	else {
		nsrv = NULL;
		goto out;
	}

	/* find the node after and the node before */
	next = eb32_lookup_ge(root, hash);
	if (!next)
		next = eb32_first(root);
	if (!next) {
		nsrv = NULL; /* tree is empty */
		goto out;
	}

	prev = eb32_prev(next);
	if (!prev)
		prev = eb32_last(root);

	nsrv = eb32_entry(next, struct tree_occ, node)->server;
	psrv = eb32_entry(prev, struct tree_occ, node)->server;

	/* OK we're located between two servers, let's
	 * compare distances between hash and the two servers
	 * and select the closest server.
	 */
	dp = hash - prev->key;
	dn = next->key - hash;
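	/* Both distances are computed with unsigned 32-bit arithmetic, so
	 * when <next> or <prev> wrapped around the tree above, the
	 * subtraction still yields the correct distance modulo 2^32 along
	 * the hash ring.
	 */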

	if (dp <= dn) {
		next = prev;
		nsrv = psrv;
	}

	loop = 0;
	while (nsrv == avoid || (p->lbprm.hash_balance_factor && !chash_server_is_eligible(nsrv))) {
		next = eb32_next(next);
		if (!next) {
			next = eb32_first(root);
			if (++loop > 1) // protection against accidental loop
				break;
		}
		nsrv = eb32_entry(next, struct tree_occ, node)->server;
	}

 out:
	HA_SPIN_UNLOCK(LBPRM_LOCK, &p->lbprm.lock);
	return nsrv;
}

/* Return next server from the CHASH tree in backend <p>. If the tree is empty,
 * return NULL. Saturated servers are skipped.
 */
struct server *chash_get_next_server(struct proxy *p, struct server *srvtoavoid)
{
	struct server *srv, *avoided;
	struct eb32_node *node, *stop, *avoided_node;
	struct eb_root *root;

	srv = avoided = NULL;
	avoided_node = NULL;

	HA_SPIN_LOCK(LBPRM_LOCK, &p->lbprm.lock);
	if (p->srv_act)
		root = &p->lbprm.chash.act;
	else if (p->lbprm.fbck) {
		srv = p->lbprm.fbck;
		goto out;
	}
	else if (p->srv_bck)
		root = &p->lbprm.chash.bck;
	else {
		srv = NULL;
		goto out;
	}

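	/* <chash.last> is used as a cursor into the ring: each call resumes
	 * the walk right after the node returned by the previous call, which
	 * gives a round-robin behaviour between the hashed occurrences.
	 */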
	stop = node = p->lbprm.chash.last;
	do {
		struct server *s;

		if (node)
			node = eb32_next(node);
		if (!node)
			node = eb32_first(root);

		p->lbprm.chash.last = node;
		if (!node) {
			/* no node is available */
			srv = NULL;
			goto out;
		}

		/* Note: if we came here after a down/up cycle with no last
		 * pointer, and after a redispatch (srvtoavoid is set), we
		 * must set stop to non-null otherwise we can loop forever.
		 */
		if (!stop)
			stop = node;

		/* OK, we have a server. However, it may be saturated, in which
		 * case we don't want to reconsider it for now, so we'll simply
		 * skip it. Same if it's the server we try to avoid, in which
		 * case we simply remember it for later use if needed.
		 */
		s = eb32_entry(node, struct tree_occ, node)->server;
		if (!s->maxconn || (!s->nbpend && s->served < srv_dynamic_maxconn(s))) {
			if (s != srvtoavoid) {
				srv = s;
				break;
			}
			avoided = s;
			avoided_node = node;
		}
	} while (node != stop);

	if (!srv) {
		srv = avoided;
		p->lbprm.chash.last = avoided_node;
	}

 out:
	HA_SPIN_UNLOCK(LBPRM_LOCK, &p->lbprm.lock);
	return srv;
}

/* This function is responsible for building the active and backup trees for
 * consistent hashing. The servers receive an array of initialized nodes
 * with their assigned keys. It also sets p->lbprm.wdiv to the eweight to
 * uweight ratio.
 * Return 0 in case of success, -1 in case of allocation failure.
 */
int chash_init_server_tree(struct proxy *p)
{
	struct server *srv;
	struct eb_root init_head = EB_ROOT;
	int node;

	p->lbprm.set_server_status_up = chash_set_server_status_up;
	p->lbprm.set_server_status_down = chash_set_server_status_down;
	p->lbprm.update_server_eweight = chash_update_server_weight;
	p->lbprm.server_take_conn = NULL;
	p->lbprm.server_drop_conn = NULL;

	p->lbprm.wdiv = BE_WEIGHT_SCALE;
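	/* each server's effective weight is its user weight scaled by
	 * wdiv/wmult, rounded up so that a non-null configured weight never
	 * maps to a null effective weight
	 */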
	for (srv = p->srv; srv; srv = srv->next) {
		srv->next_eweight = (srv->uweight * p->lbprm.wdiv + p->lbprm.wmult - 1) / p->lbprm.wmult;
		srv_lb_commit_status(srv);
	}

	recount_servers(p);
	update_backend_weight(p);

	p->lbprm.chash.act = init_head;
	p->lbprm.chash.bck = init_head;
	p->lbprm.chash.last = NULL;

	/* queue active and backup servers in two distinct groups */
	for (srv = p->srv; srv; srv = srv->next) {
		srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.chash.bck : &p->lbprm.chash.act;
		srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE;
		srv->lb_nodes_now = 0;
		srv->lb_nodes = calloc(srv->lb_nodes_tot, sizeof(struct tree_occ));
		if (!srv->lb_nodes) {
			ha_alert("failed to allocate lb_nodes for server %s.\n", srv->id);
			return -1;
		}
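		/* each occurrence receives a key derived from the server's
		 * puid and the occurrence index, spread over the 32-bit key
		 * space by full_hash() so that the server's nodes are
		 * scattered across the ring
		 */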
		for (node = 0; node < srv->lb_nodes_tot; node++) {
			srv->lb_nodes[node].server = srv;
			srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE + node);
		}

		if (srv_currently_usable(srv))
			chash_queue_dequeue_srv(srv);
	}
	return 0;
}