1 /*
2    ctdb ip takeover code
3 
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Martin Schwenke  2011
7 
8    This program is free software; you can redistribute it and/or modify
9    it under the terms of the GNU General Public License as published by
10    the Free Software Foundation; either version 3 of the License, or
11    (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, see <http://www.gnu.org/licenses/>.
20 */
21 
22 #include "replace.h"
23 #include "system/network.h"
24 
25 #include "lib/util/debug.h"
26 #include "common/logging.h"
27 
28 #include "protocol/protocol_util.h"
29 
30 #include "server/ipalloc_private.h"
31 
32 /*
33  * This is the length of the longtest common prefix between the IPs.
34  * It is calculated by XOR-ing the 2 IPs together and counting the
35  * number of leading zeroes.  The implementation means that all
36  * addresses end up being 128 bits long.
37  *
38  * FIXME? Should we consider IPv4 and IPv6 separately given that the
39  * 12 bytes of 0 prefix padding will hurt the algorithm if there are
40  * lots of nodes and IP addresses?
41  */
ip_distance(ctdb_sock_addr * ip1,ctdb_sock_addr * ip2)42 static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
43 {
44 	uint32_t ip1_k[IP_KEYLEN];
45 	uint32_t *t;
46 	int i;
47 	uint32_t x;
48 
49 	uint32_t distance = 0;
50 
51 	memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
52 	t = ip_key(ip2);
53 	for (i=0; i<IP_KEYLEN; i++) {
54 		x = ip1_k[i] ^ t[i];
55 		if (x == 0) {
56 			distance += 32;
57 		} else {
58 			/* Count number of leading zeroes.
59 			 * FIXME? This could be optimised...
60 			 */
61 			while ((x & ((uint32_t)1 << 31)) == 0) {
62 				x <<= 1;
63 				distance += 1;
64 			}
65 		}
66 	}
67 
68 	return distance;
69 }
70 
71 /* Calculate the IP distance for the given IP relative to IPs on the
72    given node.  The ips argument is generally the all_ips variable
73    used in the main part of the algorithm.
74  */
ip_distance_2_sum(ctdb_sock_addr * ip,struct public_ip_list * ips,unsigned int pnn)75 static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
76 				  struct public_ip_list *ips,
77 				  unsigned int pnn)
78 {
79 	struct public_ip_list *t;
80 	uint32_t d;
81 
82 	uint32_t sum = 0;
83 
84 	for (t = ips; t != NULL; t = t->next) {
85 		if (t->pnn != pnn) {
86 			continue;
87 		}
88 
89 		/* Optimisation: We never calculate the distance
90 		 * between an address and itself.  This allows us to
91 		 * calculate the effect of removing an address from a
92 		 * node by simply calculating the distance between
93 		 * that address and all of the exitsing addresses.
94 		 * Moreover, we assume that we're only ever dealing
95 		 * with addresses from all_ips so we can identify an
96 		 * address via a pointer rather than doing a more
97 		 * expensive address comparison. */
98 		if (&(t->addr) == ip) {
99 			continue;
100 		}
101 
102 		d = ip_distance(ip, &(t->addr));
103 		sum += d * d;  /* Cheaper than pulling in math.h :-) */
104 	}
105 
106 	return sum;
107 }
108 
109 /* Return the LCP2 imbalance metric for addresses currently assigned
110    to the given node.
111  */
lcp2_imbalance(struct public_ip_list * all_ips,unsigned int pnn)112 static uint32_t lcp2_imbalance(struct public_ip_list * all_ips,
113 			       unsigned int pnn)
114 {
115 	struct public_ip_list *t;
116 
117 	uint32_t imbalance = 0;
118 
119 	for (t = all_ips; t != NULL; t = t->next) {
120 		if (t->pnn != pnn) {
121 			continue;
122 		}
123 		/* Pass the rest of the IPs rather than the whole
124 		   all_ips input list.
125 		*/
126 		imbalance += ip_distance_2_sum(&(t->addr), t->next, pnn);
127 	}
128 
129 	return imbalance;
130 }
131 
lcp2_init(struct ipalloc_state * ipalloc_state,uint32_t ** lcp2_imbalances,bool ** rebalance_candidates)132 static bool lcp2_init(struct ipalloc_state *ipalloc_state,
133 		      uint32_t **lcp2_imbalances,
134 		      bool **rebalance_candidates)
135 {
136 	unsigned int i, numnodes;
137 	struct public_ip_list *t;
138 
139 	numnodes = ipalloc_state->num;
140 
141 	*rebalance_candidates = talloc_array(ipalloc_state, bool, numnodes);
142 	if (*rebalance_candidates == NULL) {
143 		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
144 		return false;
145 	}
146 	*lcp2_imbalances = talloc_array(ipalloc_state, uint32_t, numnodes);
147 	if (*lcp2_imbalances == NULL) {
148 		DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
149 		return false;
150 	}
151 
152 	for (i=0; i<numnodes; i++) {
153 		(*lcp2_imbalances)[i] =
154 			lcp2_imbalance(ipalloc_state->all_ips, i);
155 		/* First step: assume all nodes are candidates */
156 		(*rebalance_candidates)[i] = true;
157 	}
158 
159 	/* 2nd step: if a node has IPs assigned then it must have been
160 	 * healthy before, so we remove it from consideration.  This
161 	 * is overkill but is all we have because we don't maintain
162 	 * state between takeover runs.  An alternative would be to
163 	 * keep state and invalidate it every time the recovery master
164 	 * changes.
165 	 */
166 	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
167 		if (t->pnn != CTDB_UNKNOWN_PNN) {
168 			(*rebalance_candidates)[t->pnn] = false;
169 		}
170 	}
171 
172 	/* 3rd step: if a node is forced to re-balance then
173 	   we allow failback onto the node */
174 	if (ipalloc_state->force_rebalance_nodes == NULL) {
175 		return true;
176 	}
177 	for (i = 0;
178 	     i < talloc_array_length(ipalloc_state->force_rebalance_nodes);
179 	     i++) {
180 		uint32_t pnn = ipalloc_state->force_rebalance_nodes[i];
181 		if (pnn >= numnodes) {
182 			DEBUG(DEBUG_ERR,
183 			      (__location__ "unknown node %u\n", pnn));
184 			continue;
185 		}
186 
187 		DEBUG(DEBUG_NOTICE,
188 		      ("Forcing rebalancing of IPs to node %u\n", pnn));
189 		(*rebalance_candidates)[pnn] = true;
190 	}
191 
192 	return true;
193 }
194 
195 /* Allocate any unassigned addresses using the LCP2 algorithm to find
196  * the IP/node combination that will cost the least.
197  */
lcp2_allocate_unassigned(struct ipalloc_state * ipalloc_state,uint32_t * lcp2_imbalances)198 static void lcp2_allocate_unassigned(struct ipalloc_state *ipalloc_state,
199 				     uint32_t *lcp2_imbalances)
200 {
201 	struct public_ip_list *t;
202 	unsigned int dstnode, numnodes;
203 
204 	unsigned int minnode;
205 	uint32_t mindsum, dstdsum, dstimbl;
206 	uint32_t minimbl = 0;
207 	struct public_ip_list *minip;
208 
209 	bool should_loop = true;
210 	bool have_unassigned = true;
211 
212 	numnodes = ipalloc_state->num;
213 
214 	while (have_unassigned && should_loop) {
215 		should_loop = false;
216 
217 		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
218 		DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
219 
220 		minnode = CTDB_UNKNOWN_PNN;
221 		mindsum = 0;
222 		minip = NULL;
223 
224 		/* loop over each unassigned ip. */
225 		for (t = ipalloc_state->all_ips; t != NULL ; t = t->next) {
226 			if (t->pnn != CTDB_UNKNOWN_PNN) {
227 				continue;
228 			}
229 
230 			for (dstnode = 0; dstnode < numnodes; dstnode++) {
231 				/* only check nodes that can actually takeover this ip */
232 				if (!can_node_takeover_ip(ipalloc_state,
233 							  dstnode,
234 							  t)) {
235 					/* no it couldnt   so skip to the next node */
236 					continue;
237 				}
238 
239 				dstdsum = ip_distance_2_sum(&(t->addr),
240 							    ipalloc_state->all_ips,
241 							    dstnode);
242 				dstimbl = lcp2_imbalances[dstnode] + dstdsum;
243 				DEBUG(DEBUG_DEBUG,
244 				      (" %s -> %d [+%d]\n",
245 				       ctdb_sock_addr_to_string(ipalloc_state,
246 								&(t->addr),
247 								false),
248 				       dstnode,
249 				       dstimbl - lcp2_imbalances[dstnode]));
250 
251 
252 				if (minnode == CTDB_UNKNOWN_PNN ||
253 				    dstdsum < mindsum) {
254 					minnode = dstnode;
255 					minimbl = dstimbl;
256 					mindsum = dstdsum;
257 					minip = t;
258 					should_loop = true;
259 				}
260 			}
261 		}
262 
263 		DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
264 
265 		/* If we found one then assign it to the given node. */
266 		if (minnode != CTDB_UNKNOWN_PNN) {
267 			minip->pnn = minnode;
268 			lcp2_imbalances[minnode] = minimbl;
269 			DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
270 					  ctdb_sock_addr_to_string(
271 						  ipalloc_state,
272 						  &(minip->addr), false),
273 					  minnode,
274 					  mindsum));
275 		}
276 
277 		/* There might be a better way but at least this is clear. */
278 		have_unassigned = false;
279 		for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
280 			if (t->pnn == CTDB_UNKNOWN_PNN) {
281 				have_unassigned = true;
282 			}
283 		}
284 	}
285 
286 	/* We know if we have an unassigned addresses so we might as
287 	 * well optimise.
288 	 */
289 	if (have_unassigned) {
290 		for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
291 			if (t->pnn == CTDB_UNKNOWN_PNN) {
292 				DEBUG(DEBUG_WARNING,
293 				      ("Failed to find node to cover ip %s\n",
294 				       ctdb_sock_addr_to_string(ipalloc_state,
295 								&t->addr,
296 								false)));
297 			}
298 		}
299 	}
300 }
301 
302 /* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
303  * to move IPs from, determines the best IP/destination node
304  * combination to move from the source node.
305  */
lcp2_failback_candidate(struct ipalloc_state * ipalloc_state,unsigned int srcnode,uint32_t * lcp2_imbalances,bool * rebalance_candidates)306 static bool lcp2_failback_candidate(struct ipalloc_state *ipalloc_state,
307 				    unsigned int srcnode,
308 				    uint32_t *lcp2_imbalances,
309 				    bool *rebalance_candidates)
310 {
311 	unsigned int dstnode, mindstnode, numnodes;
312 	uint32_t srcdsum, dstimbl, dstdsum;
313 	uint32_t minsrcimbl, mindstimbl;
314 	struct public_ip_list *minip;
315 	struct public_ip_list *t;
316 
317 	/* Find an IP and destination node that best reduces imbalance. */
318 	minip = NULL;
319 	minsrcimbl = 0;
320 	mindstnode = CTDB_UNKNOWN_PNN;
321 	mindstimbl = 0;
322 
323 	numnodes = ipalloc_state->num;
324 
325 	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
326 	DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n",
327 			   srcnode, lcp2_imbalances[srcnode]));
328 
329 	for (t = ipalloc_state->all_ips; t != NULL; t = t->next) {
330 		uint32_t srcimbl;
331 
332 		/* Only consider addresses on srcnode. */
333 		if (t->pnn != srcnode) {
334 			continue;
335 		}
336 
337 		/* What is this IP address costing the source node? */
338 		srcdsum = ip_distance_2_sum(&(t->addr),
339 					    ipalloc_state->all_ips,
340 					    srcnode);
341 		srcimbl = lcp2_imbalances[srcnode] - srcdsum;
342 
343 		/* Consider this IP address would cost each potential
344 		 * destination node.  Destination nodes are limited to
345 		 * those that are newly healthy, since we don't want
346 		 * to do gratuitous failover of IPs just to make minor
347 		 * balance improvements.
348 		 */
349 		for (dstnode = 0; dstnode < numnodes; dstnode++) {
350 			if (!rebalance_candidates[dstnode]) {
351 				continue;
352 			}
353 
354 			/* only check nodes that can actually takeover this ip */
355 			if (!can_node_takeover_ip(ipalloc_state, dstnode,
356 						  t)) {
357 				/* no it couldnt   so skip to the next node */
358 				continue;
359 			}
360 
361 			dstdsum = ip_distance_2_sum(&(t->addr),
362 						    ipalloc_state->all_ips,
363 						    dstnode);
364 			dstimbl = lcp2_imbalances[dstnode] + dstdsum;
365 			DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
366 					   srcnode, -srcdsum,
367 					   ctdb_sock_addr_to_string(
368 						   ipalloc_state,
369 						   &(t->addr), false),
370 					   dstnode, dstdsum));
371 
372 			if ((dstimbl < lcp2_imbalances[srcnode]) &&
373 			    (dstdsum < srcdsum) &&			\
374 			    ((mindstnode == CTDB_UNKNOWN_PNN) ||				\
375 			     ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
376 
377 				minip = t;
378 				minsrcimbl = srcimbl;
379 				mindstnode = dstnode;
380 				mindstimbl = dstimbl;
381 			}
382 		}
383 	}
384 	DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
385 
386         if (mindstnode != CTDB_UNKNOWN_PNN) {
387 		/* We found a move that makes things better... */
388 		DEBUG(DEBUG_INFO,
389 		      ("%d [%d] -> %s -> %d [+%d]\n",
390 		       srcnode, minsrcimbl - lcp2_imbalances[srcnode],
391 		       ctdb_sock_addr_to_string(ipalloc_state,
392 						&(minip->addr), false),
393 		       mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
394 
395 
396 		lcp2_imbalances[srcnode] = minsrcimbl;
397 		lcp2_imbalances[mindstnode] = mindstimbl;
398 		minip->pnn = mindstnode;
399 
400 		return true;
401 	}
402 
403         return false;
404 }
405 
406 struct lcp2_imbalance_pnn {
407 	uint32_t imbalance;
408 	unsigned int pnn;
409 };
410 
lcp2_cmp_imbalance_pnn(const void * a,const void * b)411 static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
412 {
413 	const struct lcp2_imbalance_pnn * lipa = (const struct lcp2_imbalance_pnn *) a;
414 	const struct lcp2_imbalance_pnn * lipb = (const struct lcp2_imbalance_pnn *) b;
415 
416 	if (lipa->imbalance > lipb->imbalance) {
417 		return -1;
418 	} else if (lipa->imbalance == lipb->imbalance) {
419 		return 0;
420 	} else {
421 		return 1;
422 	}
423 }
424 
425 /* LCP2 algorithm for rebalancing the cluster.  This finds the source
426  * node with the highest LCP2 imbalance, and then determines the best
427  * IP/destination node combination to move from the source node.
428  */
lcp2_failback(struct ipalloc_state * ipalloc_state,uint32_t * lcp2_imbalances,bool * rebalance_candidates)429 static void lcp2_failback(struct ipalloc_state *ipalloc_state,
430 			  uint32_t *lcp2_imbalances,
431 			  bool *rebalance_candidates)
432 {
433 	int i, numnodes;
434 	struct lcp2_imbalance_pnn * lips;
435 	bool again;
436 
437 	numnodes = ipalloc_state->num;
438 
439 try_again:
440 	/* Put the imbalances and nodes into an array, sort them and
441 	 * iterate through candidates.  Usually the 1st one will be
442 	 * used, so this doesn't cost much...
443 	 */
444 	DEBUG(DEBUG_DEBUG,("+++++++++++++++++++++++++++++++++++++++++\n"));
445 	DEBUG(DEBUG_DEBUG,("Selecting most imbalanced node from:\n"));
446 	lips = talloc_array(ipalloc_state, struct lcp2_imbalance_pnn, numnodes);
447 	for (i = 0; i < numnodes; i++) {
448 		lips[i].imbalance = lcp2_imbalances[i];
449 		lips[i].pnn = i;
450 		DEBUG(DEBUG_DEBUG,(" %d [%d]\n", i, lcp2_imbalances[i]));
451 	}
452 	qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
453 	      lcp2_cmp_imbalance_pnn);
454 
455 	again = false;
456 	for (i = 0; i < numnodes; i++) {
457 		/* This means that all nodes had 0 or 1 addresses, so
458 		 * can't be imbalanced.
459 		 */
460 		if (lips[i].imbalance == 0) {
461 			break;
462 		}
463 
464 		if (lcp2_failback_candidate(ipalloc_state,
465 					    lips[i].pnn,
466 					    lcp2_imbalances,
467 					    rebalance_candidates)) {
468 			again = true;
469 			break;
470 		}
471 	}
472 
473 	talloc_free(lips);
474 	if (again) {
475 		goto try_again;
476 	}
477 }
478 
ipalloc_lcp2(struct ipalloc_state * ipalloc_state)479 bool ipalloc_lcp2(struct ipalloc_state *ipalloc_state)
480 {
481 	uint32_t *lcp2_imbalances;
482 	bool *rebalance_candidates;
483 	int numnodes, i;
484 	bool have_rebalance_candidates;
485 	bool ret = true;
486 
487 	unassign_unsuitable_ips(ipalloc_state);
488 
489 	if (!lcp2_init(ipalloc_state,
490 		       &lcp2_imbalances, &rebalance_candidates)) {
491 		ret = false;
492 		goto finished;
493 	}
494 
495 	lcp2_allocate_unassigned(ipalloc_state, lcp2_imbalances);
496 
497 	/* If we don't want IPs to fail back then don't rebalance IPs. */
498 	if (ipalloc_state->no_ip_failback) {
499 		goto finished;
500 	}
501 
502 	/* It is only worth continuing if we have suitable target
503 	 * nodes to transfer IPs to.  This check is much cheaper than
504 	 * continuing on...
505 	 */
506 	numnodes = ipalloc_state->num;
507 	have_rebalance_candidates = false;
508 	for (i=0; i<numnodes; i++) {
509 		if (rebalance_candidates[i]) {
510 			have_rebalance_candidates = true;
511 			break;
512 		}
513 	}
514 	if (!have_rebalance_candidates) {
515 		goto finished;
516 	}
517 
518 	/* Now, try to make sure the ip adresses are evenly distributed
519 	   across the nodes.
520 	*/
521 	lcp2_failback(ipalloc_state, lcp2_imbalances, rebalance_candidates);
522 
523 finished:
524 	return ret;
525 }
526