1 /*
2  * Copyright (C) 2021 Jakub Kruszona-Zawadzki, Core Technology Sp. z o.o.
3  *
4  * This file is part of MooseFS.
5  *
6  * MooseFS is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, version 2 (only).
9  *
10  * MooseFS is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with MooseFS; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02111-1301, USA
18  * or visit http://www.gnu.org/licenses/gpl-2.0.html
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include <stdio.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <syslog.h>
31 #include <errno.h>
32 #include <inttypes.h>
33 
34 #include "itree.h"
35 
36 #include "main.h"
37 #include "cfg.h"
38 #include "slogger.h"
39 #include "massert.h"
40 #include "mfsalloc.h"
41 
/* Interval tree (see itree.h) that maps IP ranges to rack ids. */
static void *racktree = NULL;
/* Heap-allocated path of the topology file currently in use. */
static char *TopologyFileName = NULL;
44 
45 
46 
47 // ************* NAME <-> ID MAP ** BEGIN ***************
48 
/* Number of buckets in the rack-name hash table. */
#define HASHTABSIZE 4096

/* One rack name together with the numeric id assigned to it. */
typedef struct _rackhashentry {
	char *rackname;                 /* heap copy of the rack name */
	uint32_t rackid;                /* id assigned to this name */
	uint32_t hash;                  /* full hash of rackname, kept to speed up lookups */
	struct _rackhashentry *next;    /* next entry in the same hash bucket */
} rackhashentry;

/* name -> entry: chained hash table indexed by (hash % HASHTABSIZE) */
static rackhashentry *rackhashtab[HASHTABSIZE];
/* id -> entry: growable array indexed by rack id */
static rackhashentry **rackidtab = NULL;
/* number of slots currently allocated in rackidtab */
static uint32_t rackidtabsize = 0;
/* next rack id to be handed out */
static uint32_t rackidnext = 0;
62 
/*
 * Hashes a NUL-terminated rack name.
 * Starts from 55821 and, for every byte (taken as unsigned), computes
 * acc = acc*33 + byte using 32-bit wrap-around arithmetic.
 */
static inline uint32_t topology_rackname_hash(char *rackname) {
	uint32_t acc = 55821;

	for ( ; *rackname!=0 ; rackname++) {
		acc = acc*33 + (uint8_t)(*rackname);
	}
	return acc;
}
72 
topology_get_next_free_rackid(void)73 static inline uint32_t topology_get_next_free_rackid(void) {
74 	uint32_t i;
75 	i = rackidtabsize;
76 	if (rackidtabsize==0) {
77 		rackidtabsize = 1024;
78 		rackidtab = malloc(sizeof(rackhashentry*)*rackidtabsize);
79 		passert(rackidtab);
80 		rackidnext = 1; // skip rackid=0
81 	} else if (rackidnext>=rackidtabsize) {
82 		rackidtabsize = rackidtabsize*3/2;
83 		rackidtab = mfsrealloc(rackidtab,sizeof(rackhashentry*)*rackidtabsize);
84 		passert(rackidtab);
85 	}
86 	while (i<rackidtabsize) {
87 		rackidtab[i] = NULL;
88 		i++;
89 	}
90 	return rackidnext++;
91 }
92 
topology_rackname_to_rackid(char * rackname)93 static uint32_t topology_rackname_to_rackid(char *rackname) {
94 	uint32_t hash,hashpos;
95 	rackhashentry *rhe;
96 
97 	hash = topology_rackname_hash(rackname);
98 	hashpos = hash % HASHTABSIZE;
99 	for (rhe = rackhashtab[hashpos] ; rhe != NULL ; rhe = rhe->next) {
100 		if (rhe->hash==hash && strcmp(rhe->rackname,rackname)==0) {
101 			return rhe->rackid;
102 		}
103 	}
104 
105 	rhe = malloc(sizeof(rackhashentry));
106 	rhe->rackname = strdup(rackname);
107 	rhe->rackid = topology_get_next_free_rackid();
108 	rhe->hash = hash;
109 	rhe->next = rackhashtab[hashpos];
110 	rackhashtab[hashpos] = rhe;
111 	rackidtab[rhe->rackid] = rhe;
112 	return rhe->rackid;
113 }
114 
topology_rackid_to_rackname(uint32_t rackid)115 static char* topology_rackid_to_rackname(uint32_t rackid) {
116 	if (rackid==0) {
117 		return "";
118 	}
119 	if (rackid<rackidnext) {
120 		return rackidtab[rackid]->rackname;
121 	}
122 	return NULL;
123 }
124 
topology_rackname_init(void)125 static void topology_rackname_init(void) {
126 	uint32_t i;
127 	rackidtab = NULL;
128 	rackidtabsize = 0;
129 	rackidnext = 0;
130 	for (i=0 ; i<HASHTABSIZE ; i++) {
131 		rackhashtab[i] = NULL;
132 	}
133 }
134 
topology_rackname_cleanup(void)135 static void topology_rackname_cleanup(void) {
136 	uint32_t i;
137 	for (i=1 ; i<rackidnext ; i++) {
138 		free(rackidtab[i]->rackname);
139 		free(rackidtab[i]);
140 	}
141 	free(rackidtab);
142 	topology_rackname_init();
143 }
144 
145 static rackhashentry* rackhashtab_stash[HASHTABSIZE];
146 static rackhashentry** rackidtab_stash = NULL;
147 static uint32_t rackidtabsize_stash = 0;
148 static uint32_t rackidnext_stash = 0;
149 
150 
topology_rackname_stash(void)151 static void topology_rackname_stash(void) {
152 	uint32_t i;
153 	for (i=0 ; i<HASHTABSIZE ; i++) {
154 		rackhashtab_stash[i] = rackhashtab[i];
155 		rackhashtab[i] = NULL;
156 	}
157 	rackidtab_stash = rackidtab;
158 	rackidtab = NULL;
159 	rackidtabsize_stash = rackidtabsize;
160 	rackidtabsize = 0;
161 	rackidnext_stash = rackidnext;
162 	rackidnext = 0;
163 }
164 
topology_rackname_restore(void)165 static void topology_rackname_restore(void) {
166 	topology_rackname_cleanup();
167 	uint32_t i;
168 	for (i=0 ; i<HASHTABSIZE ; i++) {
169 		rackhashtab[i] = rackhashtab_stash[i];
170 		rackhashtab_stash[i] = NULL;
171 	}
172 	rackidtab = rackidtab_stash;
173 	rackidtab_stash = NULL;
174 	rackidtabsize = rackidtabsize_stash;
175 	rackidtabsize_stash = 0;
176 	rackidnext = rackidnext_stash;
177 	rackidnext_stash = 0;
178 }
179 
topology_rackname_cleanupstash(void)180 static void topology_rackname_cleanupstash(void) {
181 	uint32_t i;
182 	for (i=1 ; i<rackidnext_stash ; i++) {
183 		free(rackidtab_stash[i]->rackname);
184 		free(rackidtab_stash[i]);
185 	}
186 	free(rackidtab_stash);
187 	rackidtab_stash = NULL;
188 	rackidtabsize_stash = 0;
189 	rackidnext_stash = 0;
190 	for (i=0 ; i<HASHTABSIZE ; i++) {
191 		rackhashtab_stash[i] = NULL;
192 	}
193 }
194 
195 // ************* NAME <-> ID MAP ** END *****************
196 
197 
198 
199 
/*
 * Parses one decimal octet (0-255) at *pos.
 * On success stores the value in *octet, advances *pos past the digits and returns 0.
 * Returns -1 (with *pos untouched) when *pos is not a digit or the value exceeds 255.
 */
static int topology_parse_octet(char **pos,uint32_t *octet) {
	char *p = *pos;
	uint32_t value = 0;

	if (*p<'0' || *p>'9') {
		return -1;
	}
	while (*p>='0' && *p<='9') {
		value = value*10 + (uint32_t)(*p-'0');
		p++;
		if (value>255) {
			return -1;
		}
	}
	*pos = p;
	*octet = value;
	return 0;
}

/*
 * Parses `count` consecutive ".octet" groups at *pos, shifting each octet into *ip.
 * Returns 0 on success, -1 on a missing dot or an invalid octet.
 */
static int topology_parse_more_octets(char **pos,uint32_t count,uint32_t *ip) {
	uint32_t octet;

	while (count>0) {
		if (**pos!='.') {
			return -1;
		}
		(*pos)++;
		if (topology_parse_octet(pos,&octet)<0) {
			return -1;
		}
		*ip = (*ip)*256 + octet;
		count--;
	}
	return 0;
}

/*
 * Parses a full dotted quad "a.b.c.d" at *pos into *ip (first octet in the most significant byte).
 * Returns 0 on success, -1 on malformed input.
 */
static int topology_parse_quad(char **pos,uint32_t *ip) {
	if (topology_parse_octet(pos,ip)<0) {
		return -1;
	}
	return topology_parse_more_octets(pos,3,ip);
}

/*
 * Parses a network definition into an inclusive IP range [*fromip,*toip].
 *
 * Accepted forms:
 *   *             - every address (0 .. 0xFFFFFFFF)
 *   ip            - single address
 *   ip/bits       - prefix length 0..32
 *   ip/mask       - dotted-quad netmask
 *   ip1-ip2       - explicit range (the order of ip1 and ip2 is not verified)
 *
 * Returns 0 on success and -1 on malformed input; on failure the outputs
 * must not be relied upon.
 */
int topology_parsenet(char *net,uint32_t *fromip,uint32_t *toip) {
	uint32_t ip,other,octet;

	if (net[0]=='*' && net[1]==0) {
		*fromip = 0;
		*toip = 0xFFFFFFFFU;
		return 0;
	}
	if (topology_parse_quad(&net,&ip)<0) {
		return -1;
	}
	if (*net==0) {
		*fromip = ip;
		*toip = ip;
		return 0;
	}
	if (*net=='/') {	// ip/bits and ip/mask
		net++;
		if (topology_parse_octet(&net,&octet)<0) {
			return -1;
		}
		if (*net==0 && octet<=32) {	// bits -> convert to mask
			if (octet==0) {
				// shifting a 32-bit value by 32 is undefined - handle /0 explicitly
				other = 0;
			} else {
				other = 0xFFFFFFFFU;
				if (octet<32) {
					other <<= 32-octet;
				}
			}
		} else {	// dotted-quad mask
			other = octet;
			if (topology_parse_more_octets(&net,3,&other)<0) {
				return -1;
			}
			if (*net!=0) {
				return -1;
			}
		}
		*fromip = ip & other;
		*toip = *fromip | (other ^ 0xFFFFFFFFU);
		return 0;
	}
	if (*net=='-') {	// ip1-ip2
		net++;
		if (topology_parse_quad(&net,&other)<0) {
			return -1;
		}
		if (*net!=0) {
			return -1;
		}
		*fromip = ip;
		*toip = other;
		return 0;
	}
	return -1;
}
312 
topology_get_rackid(uint32_t ip)313 uint32_t topology_get_rackid(uint32_t ip) {
314 	return itree_find(racktree,ip);
315 }
316 
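// distance values:
//
// 0 - same machine (identical ip)
// 1 - same rack, different machines
// 2 and more - different racks; the value grows with the number of rack path
//              levels ('|' separators) that the two racks do not share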
322 
topology_distance(uint32_t ip1,uint32_t ip2)323 uint8_t topology_distance(uint32_t ip1,uint32_t ip2) {
324 	uint32_t rid1,rid2;
325 	char *rname1,*rname2;
326 	int pos,lastbar;
327 	uint8_t l1,l2;
328 
329 	if (ip1==ip2) {
330 		return 0;
331 	}
332 	rid1 = itree_find(racktree,ip1);
333 	rid2 = itree_find(racktree,ip2);
334 	if (rid1==rid2) {
335 		return 1;
336 	}
337 	rname1 = topology_rackid_to_rackname(rid1);
338 	rname2 = topology_rackid_to_rackname(rid2);
339 
340 	if (rname1==NULL && rname2==NULL) { // safety guard - this may only happen when both rid1 and rid2 are 0 - it shouldn't pass rid1==rid2 condition
341 		return 1;
342 	}
343 
344 	lastbar = 0;
345 	if (rname1!=NULL && rname2!=NULL) {
346 		pos = 0;
347 		while (1) {
348 			if ((rname1[pos]==0 && rname2[pos]=='|') || (rname1[pos]=='|' && rname2[pos]==0)) {
349 				lastbar = pos;
350 				break;
351 			}
352 			if (rname1[pos] != rname2[pos]) {
353 				break;
354 			}
355 			if (rname1[pos]=='|') {
356 				lastbar = pos;
357 			}
358 			if (rname1[pos] == 0) { // safety guard - this means that strings are identical - if that then they should have the same rackid
359 				return 1;
360 			}
361 			pos++;
362 		}
363 	}
364 	l1 = 0;
365 	l2 = 0;
366 	if (rname1!=NULL) {
367 		if (rname1[lastbar]=='|') {
368 			pos = lastbar+1;
369 		} else {
370 			pos = lastbar;
371 		}
372 		for ( ; rname1[pos] ; pos++) {
373 			if (rname1[pos]=='|') {
374 				l1++;
375 			}
376 		}
377 	}
378 	if (rname2!=NULL) {
379 		if (rname2[lastbar]=='|') {
380 			pos = lastbar+1;
381 		} else {
382 			pos = lastbar;
383 		}
384 		for ( ; rname2[pos] ; pos++) {
385 			if (rname2[pos]=='|') {
386 				l2++;
387 			}
388 		}
389 	}
390 	if (l1>l2) {
391 		return 2+l1;
392 	} else {
393 		return 2+l2;
394 	}
395 }
396 
// format:
// network	rackid


// format (3.0.104+)
// network	rack_path_separated_by_vertical_bar
403 
/*
 * Parses one line of the topology file: "<network> <rack path> [# comment]".
 *
 * line   - NUL-terminated, writable buffer; the whitespace after the network
 *          field is overwritten with NUL
 * lineno - line number used in diagnostics
 * fip,tip - receive the inclusive IP range described by the network field
 * rid    - receives the rack id assigned to the rack path
 *
 * Returns 0 when the line holds a valid definition. Returns -1 for empty or
 * comment-only lines (silently) and for malformed lines (after logging a
 * warning to syslog and stderr).
 *
 * The rack name is registered in the name <-> id map only after the whole line
 * has been validated, so rejected lines do not consume rack ids.
 */
int topology_parseline(char *line,uint32_t lineno,uint32_t *fip,uint32_t *tip,uint32_t *rid) {
	char c,*net,*rackname,*rackend;
	char *p;

	p = line;
	while (*p==' ' || *p=='\t') {
		p++;
	}
	if (*p==0 || *p=='#') { // empty line or line with comment only
		return -1;
	}
	net = p;
	while (*p && *p!=' ' && *p!='\t') {
		p++;
	}
	if (*p==0) {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: incomplete definition in line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: incomplete definition in line: %"PRIu32"\n",lineno);
		return -1;
	}
	*p=0;
	p++;
	if (topology_parsenet(net,fip,tip)<0) {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: incorrect ip/network definition in line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: incorrect ip/network definition in line: %"PRIu32"\n",lineno);
		return -1;
	}

	while (*p==' ' || *p=='\t') {
		p++;
	}

	if (*p==0 || *p=='#') {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: incorrect rack id in line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: incorrect rack id in line: %"PRIu32"\n",lineno);
		return -1;
	}

	rackname = p;

	while (*p && *p!=' ' && *p!='\t') {
		p++;
	}
	rackend = p;

	while (*p==' ' || *p=='\t') {
		p++;
	}

	// only a comment may follow the rack name
	if (*p && *p!='#') {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: garbage found at the end of line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: garbage found at the end of line: %"PRIu32"\n",lineno);
		return -1;
	}

	// terminate the rack name just for the lookup, then put the original character back
	c = *rackend;
	*rackend = 0;
	*rid = topology_rackname_to_rackid(rackname);
	*rackend = c;
	return 0;
}
464 
topology_load(void)465 void topology_load(void) {
466 	FILE *fd;
467 	char linebuff[10000];
468 	uint32_t s,lineno;
469 	uint32_t fip,tip,rid;
470 	void *newtree;
471 
472 	fd = fopen(TopologyFileName,"r");
473 	if (fd==NULL) {
474 		if (errno==ENOENT) {
475 			if (racktree) {
476 				syslog(LOG_WARNING,"mfstopology configuration file (%s) not found - network topology not changed",TopologyFileName);
477 			} else {
478 				syslog(LOG_WARNING,"mfstopology configuration file (%s) not found - network topology not defined",TopologyFileName);
479 			}
480 			fprintf(stderr,"mfstopology configuration file (%s) not found - using defaults\n",TopologyFileName);
481 		} else {
482 			if (racktree) {
483 				mfs_arg_errlog(LOG_WARNING,"can't open mfstopology configuration file (%s) - network topology not changed, error",TopologyFileName);
484 			} else {
485 				mfs_arg_errlog(LOG_WARNING,"can't open mfstopology configuration file (%s) - network topology not defined, error",TopologyFileName);
486 			}
487 		}
488 		return;
489 	}
490 
491 	topology_rackname_stash();
492 	newtree = NULL;
493 	lineno = 1;
494 	while (fgets(linebuff,10000,fd)) {
495 		linebuff[9999]=0;
496 		s=strlen(linebuff);
497 		while (s>0 && (linebuff[s-1]=='\r' || linebuff[s-1]=='\n' || linebuff[s-1]=='\t' || linebuff[s-1]==' ')) {
498 			s--;
499 		}
500 		if (s>0) {
501 			linebuff[s]=0;
502 			if (topology_parseline(linebuff,lineno,&fip,&tip,&rid)>=0) {
503 				newtree = itree_add_interval(newtree,fip,tip,rid);
504 //				while (fip<=tip) {
505 //					hash_insert(fip,rid);
506 //					fip++;
507 //				}
508 			}
509 		}
510 		lineno++;
511 	}
512 	if (ferror(fd)) {
513 		fclose(fd);
514 		if (racktree) {
515 			syslog(LOG_WARNING,"error reading mfstopology file - network topology not changed");
516 		} else {
517 			syslog(LOG_WARNING,"error reading mfstopology file - network topology not defined");
518 		}
519 		itree_freeall(newtree);
520 		topology_rackname_restore();
521 		fprintf(stderr,"error reading mfstopology file - network topology not defined (using defaults)\n");
522 		return;
523 	}
524 	fclose(fd);
525 	topology_rackname_cleanupstash();
526 	itree_freeall(racktree);
527 	racktree = newtree;
528 	if (racktree) {
529 		racktree = itree_rebalance(racktree);
530 	}
531 	mfs_syslog(LOG_NOTICE,"topology file has been loaded");
532 }
533 
534 //int topology_init(void) {
535 //	TopologyFileName = strdup("mfstopology.cfg");
536 //	racktree = NULL;
537 //	topology_load();
538 //	itree_show(racktree);
539 //	return 0;
540 //}
541 
topology_reload(void)542 void topology_reload(void) {
543 	int fd;
544 	if (TopologyFileName) {
545 		free(TopologyFileName);
546 	}
547 	if (!cfg_isdefined("TOPOLOGY_FILENAME")) {
548 		TopologyFileName = strdup(ETC_PATH "/mfs/mfstopology.cfg");
549 		passert(TopologyFileName);
550 		if ((fd = open(TopologyFileName,O_RDONLY))<0 && errno==ENOENT) {
551 			char *tmpname;
552 			tmpname = strdup(ETC_PATH "/mfstopology.cfg");
553 			if ((fd = open(tmpname,O_RDONLY))>=0) {
554 				free(TopologyFileName);
555 				TopologyFileName = tmpname;
556 				mfs_syslog(LOG_WARNING,"default sysconf path has changed - please move mfstopology.cfg from "ETC_PATH"/ to "ETC_PATH"/mfs/");
557 			} else {
558 				free(tmpname);
559 			}
560 		}
561 		if (fd>=0) {
562 			close(fd);
563 		}
564 	} else {
565 		TopologyFileName = cfg_getstr("TOPOLOGY_FILENAME",ETC_PATH "/mfs/mfstopology.cfg");
566 	}
567 	topology_load();
568 }
569 
topology_term(void)570 void topology_term(void) {
571 	itree_freeall(racktree);
572 	if (TopologyFileName) {
573 		free(TopologyFileName);
574 	}
575 	topology_rackname_cleanup();
576 }
577 
topology_init(void)578 int topology_init(void) {
579 	TopologyFileName = NULL;
580 	racktree = NULL;
581 	topology_rackname_init();
582 	topology_reload();
583 	main_reload_register(topology_reload);
584 	main_destruct_register(topology_term);
585 	return 0;
586 }
587