1 /*
2 * Copyright (C) 2021 Jakub Kruszona-Zawadzki, Core Technology Sp. z o.o.
3 *
4 * This file is part of MooseFS.
5 *
6 * MooseFS is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, version 2 (only).
9 *
10 * MooseFS is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with MooseFS; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02111-1301, USA
18 * or visit http://www.gnu.org/licenses/gpl-2.0.html
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
25 #include <stdio.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <syslog.h>
31 #include <errno.h>
32 #include <inttypes.h>
33
34 #include "itree.h"
35
36 #include "main.h"
37 #include "cfg.h"
38 #include "slogger.h"
39 #include "massert.h"
40 #include "mfsalloc.h"
41
// root of the interval tree that maps IP ranges to rack ids (replaced by topology_load)
static void *racktree = NULL;
// path of the topology configuration file (chosen by topology_reload)
static char *TopologyFileName = NULL;
44
45
46
47 // ************* NAME <-> ID MAP ** BEGIN ***************
48
// number of buckets in the rack name hash table
#define HASHTABSIZE 4096

// one rack name <-> rack id association; the same entry is reachable from both lookup tables
typedef struct _rackhashentry {
	char *rackname;                 // heap copy of the rack name (made with strdup)
	uint32_t rackid;                // numeric id assigned to this name
	uint32_t hash;                  // full (unreduced) hash of rackname
	struct _rackhashentry *next;    // next entry in the same hash bucket
} rackhashentry;

// name -> entry: chained buckets indexed by hash % HASHTABSIZE
static rackhashentry *rackhashtab[HASHTABSIZE];
// id -> entry: array indexed by rack id (id 0 is never handed out)
static rackhashentry **rackidtab = NULL;
// number of slots currently allocated in rackidtab
static uint32_t rackidtabsize = 0;
// next rack id to hand out
static uint32_t rackidnext = 0;
62
// Computes a 32-bit multiplicative hash (seed 55821, multiplier 33) of a NUL-terminated
// rack name. Bytes are consumed as unsigned values; arithmetic wraps modulo 2^32.
static inline uint32_t topology_rackname_hash(char *rackname) {
	const uint8_t *byte;
	uint32_t acc;

	acc = 55821;
	for (byte = (const uint8_t*)rackname ; *byte != 0 ; byte++) {
		acc = acc * 33 + *byte;
	}
	return acc;
}
72
topology_get_next_free_rackid(void)73 static inline uint32_t topology_get_next_free_rackid(void) {
74 uint32_t i;
75 i = rackidtabsize;
76 if (rackidtabsize==0) {
77 rackidtabsize = 1024;
78 rackidtab = malloc(sizeof(rackhashentry*)*rackidtabsize);
79 passert(rackidtab);
80 rackidnext = 1; // skip rackid=0
81 } else if (rackidnext>=rackidtabsize) {
82 rackidtabsize = rackidtabsize*3/2;
83 rackidtab = mfsrealloc(rackidtab,sizeof(rackhashentry*)*rackidtabsize);
84 passert(rackidtab);
85 }
86 while (i<rackidtabsize) {
87 rackidtab[i] = NULL;
88 i++;
89 }
90 return rackidnext++;
91 }
92
topology_rackname_to_rackid(char * rackname)93 static uint32_t topology_rackname_to_rackid(char *rackname) {
94 uint32_t hash,hashpos;
95 rackhashentry *rhe;
96
97 hash = topology_rackname_hash(rackname);
98 hashpos = hash % HASHTABSIZE;
99 for (rhe = rackhashtab[hashpos] ; rhe != NULL ; rhe = rhe->next) {
100 if (rhe->hash==hash && strcmp(rhe->rackname,rackname)==0) {
101 return rhe->rackid;
102 }
103 }
104
105 rhe = malloc(sizeof(rackhashentry));
106 rhe->rackname = strdup(rackname);
107 rhe->rackid = topology_get_next_free_rackid();
108 rhe->hash = hash;
109 rhe->next = rackhashtab[hashpos];
110 rackhashtab[hashpos] = rhe;
111 rackidtab[rhe->rackid] = rhe;
112 return rhe->rackid;
113 }
114
topology_rackid_to_rackname(uint32_t rackid)115 static char* topology_rackid_to_rackname(uint32_t rackid) {
116 if (rackid==0) {
117 return "";
118 }
119 if (rackid<rackidnext) {
120 return rackidtab[rackid]->rackname;
121 }
122 return NULL;
123 }
124
topology_rackname_init(void)125 static void topology_rackname_init(void) {
126 uint32_t i;
127 rackidtab = NULL;
128 rackidtabsize = 0;
129 rackidnext = 0;
130 for (i=0 ; i<HASHTABSIZE ; i++) {
131 rackhashtab[i] = NULL;
132 }
133 }
134
topology_rackname_cleanup(void)135 static void topology_rackname_cleanup(void) {
136 uint32_t i;
137 for (i=1 ; i<rackidnext ; i++) {
138 free(rackidtab[i]->rackname);
139 free(rackidtab[i]);
140 }
141 free(rackidtab);
142 topology_rackname_init();
143 }
144
145 static rackhashentry* rackhashtab_stash[HASHTABSIZE];
146 static rackhashentry** rackidtab_stash = NULL;
147 static uint32_t rackidtabsize_stash = 0;
148 static uint32_t rackidnext_stash = 0;
149
150
topology_rackname_stash(void)151 static void topology_rackname_stash(void) {
152 uint32_t i;
153 for (i=0 ; i<HASHTABSIZE ; i++) {
154 rackhashtab_stash[i] = rackhashtab[i];
155 rackhashtab[i] = NULL;
156 }
157 rackidtab_stash = rackidtab;
158 rackidtab = NULL;
159 rackidtabsize_stash = rackidtabsize;
160 rackidtabsize = 0;
161 rackidnext_stash = rackidnext;
162 rackidnext = 0;
163 }
164
topology_rackname_restore(void)165 static void topology_rackname_restore(void) {
166 topology_rackname_cleanup();
167 uint32_t i;
168 for (i=0 ; i<HASHTABSIZE ; i++) {
169 rackhashtab[i] = rackhashtab_stash[i];
170 rackhashtab_stash[i] = NULL;
171 }
172 rackidtab = rackidtab_stash;
173 rackidtab_stash = NULL;
174 rackidtabsize = rackidtabsize_stash;
175 rackidtabsize_stash = 0;
176 rackidnext = rackidnext_stash;
177 rackidnext_stash = 0;
178 }
179
topology_rackname_cleanupstash(void)180 static void topology_rackname_cleanupstash(void) {
181 uint32_t i;
182 for (i=1 ; i<rackidnext_stash ; i++) {
183 free(rackidtab_stash[i]->rackname);
184 free(rackidtab_stash[i]);
185 }
186 free(rackidtab_stash);
187 rackidtab_stash = NULL;
188 rackidtabsize_stash = 0;
189 rackidnext_stash = 0;
190 for (i=0 ; i<HASHTABSIZE ; i++) {
191 rackhashtab_stash[i] = NULL;
192 }
193 }
194
195 // ************* NAME <-> ID MAP ** END *****************
196
197
198
199
// Parses one decimal octet (0..255) starting at *pos.
// On success stores the value in *octet, advances *pos past the digits and returns 0.
// Returns -1 (leaving *pos and *octet untouched) when there is no digit or the value exceeds 255.
static int topology_parsenet_octet(char **pos,uint32_t *octet) {
	char *p = *pos;
	uint32_t value = 0;

	if (*p<'0' || *p>'9') {
		return -1;
	}
	while (*p>='0' && *p<='9') {
		value = value*10 + (uint32_t)(*p-'0');
		p++;
		if (value>255) { // checked after every digit, so 'value' can never overflow
			return -1;
		}
	}
	*pos = p;
	*octet = value;
	return 0;
}

// Parses a dotted quad (a.b.c.d) starting at *pos.
// On success stores the address (first octet in the most significant byte) in *ip, leaves *pos
// on the first character after the last octet and returns 0. Returns -1 on malformed input.
static int topology_parsenet_quad(char **pos,uint32_t *ip) {
	uint32_t i,octet,value;

	value = 0;
	for (i=0 ; i<4 ; i++) {
		if (i>0) {
			if (**pos!='.') {
				return -1;
			}
			(*pos)++;
		}
		if (topology_parsenet_octet(pos,&octet)<0) {
			return -1;
		}
		value = value*256 + octet;
	}
	*ip = value;
	return 0;
}

// Converts a textual network definition into an inclusive IP range.
//
// Accepted forms:
//   *              - all addresses (0.0.0.0 - 255.255.255.255)
//   a.b.c.d        - single address
//   a.b.c.d/bits   - prefix length, bits in 0..32
//   a.b.c.d/m.m.m.m - dotted netmask
//   a.b.c.d-e.f.g.h - explicit range (the order of both ends is not verified)
//
// net    - NUL-terminated definition (not modified)
// fromip - receives the first address of the range
// toip   - receives the last address of the range
// returns 0 on success, -1 on syntax error (outputs are not written in that case)
int topology_parsenet(char *net,uint32_t *fromip,uint32_t *toip) {
	uint32_t ip,mask,bits,last;
	char *look;

	if (net[0]=='*' && net[1]==0) {
		*fromip = 0;
		*toip = 0xFFFFFFFFU;
		return 0;
	}
	if (topology_parsenet_quad(&net,&ip)<0) {
		return -1;
	}
	if (*net==0) {
		*fromip = ip;
		*toip = ip;
		return 0;
	}
	if (*net=='/') { // ip/bits and ip/mask
		net++;
		look = net;
		if (topology_parsenet_octet(&look,&bits)==0 && *look==0 && bits<=32) {
			// prefix length; shifting a 32-bit value by 32 is undefined, so /0 is handled explicitly
			if (bits==0) {
				mask = 0;
			} else {
				mask = 0xFFFFFFFFU;
				mask <<= 32-bits;
			}
		} else {
			// not a bare prefix length - it has to be a complete dotted mask
			if (topology_parsenet_quad(&net,&mask)<0 || *net!=0) {
				return -1;
			}
		}
		*fromip = ip & mask;
		*toip = (ip & mask) | (mask ^ 0xFFFFFFFFU);
		return 0;
	}
	if (*net=='-') { // ip1-ip2
		net++;
		if (topology_parsenet_quad(&net,&last)<0 || *net!=0) {
			return -1;
		}
		*fromip = ip;
		*toip = last;
		return 0;
	}
	return -1;
}
312
topology_get_rackid(uint32_t ip)313 uint32_t topology_get_rackid(uint32_t ip) {
314 return itree_find(racktree,ip);
315 }
316
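// distance between two IPs, as for now:
//
// 0 - same IP (same machine)
// 1 - same rack, different machines
// 2+N - different racks; N is the larger number of '|' separators left in either
//       rack path after the leading path components they have in common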
322
topology_distance(uint32_t ip1,uint32_t ip2)323 uint8_t topology_distance(uint32_t ip1,uint32_t ip2) {
324 uint32_t rid1,rid2;
325 char *rname1,*rname2;
326 int pos,lastbar;
327 uint8_t l1,l2;
328
329 if (ip1==ip2) {
330 return 0;
331 }
332 rid1 = itree_find(racktree,ip1);
333 rid2 = itree_find(racktree,ip2);
334 if (rid1==rid2) {
335 return 1;
336 }
337 rname1 = topology_rackid_to_rackname(rid1);
338 rname2 = topology_rackid_to_rackname(rid2);
339
340 if (rname1==NULL && rname2==NULL) { // safety guard - this may only happen when both rid1 and rid2 are 0 - it shouldn't pass rid1==rid2 condition
341 return 1;
342 }
343
344 lastbar = 0;
345 if (rname1!=NULL && rname2!=NULL) {
346 pos = 0;
347 while (1) {
348 if ((rname1[pos]==0 && rname2[pos]=='|') || (rname1[pos]=='|' && rname2[pos]==0)) {
349 lastbar = pos;
350 break;
351 }
352 if (rname1[pos] != rname2[pos]) {
353 break;
354 }
355 if (rname1[pos]=='|') {
356 lastbar = pos;
357 }
358 if (rname1[pos] == 0) { // safety guard - this means that strings are identical - if that then they should have the same rackid
359 return 1;
360 }
361 pos++;
362 }
363 }
364 l1 = 0;
365 l2 = 0;
366 if (rname1!=NULL) {
367 if (rname1[lastbar]=='|') {
368 pos = lastbar+1;
369 } else {
370 pos = lastbar;
371 }
372 for ( ; rname1[pos] ; pos++) {
373 if (rname1[pos]=='|') {
374 l1++;
375 }
376 }
377 }
378 if (rname2!=NULL) {
379 if (rname2[lastbar]=='|') {
380 pos = lastbar+1;
381 } else {
382 pos = lastbar;
383 }
384 for ( ; rname2[pos] ; pos++) {
385 if (rname2[pos]=='|') {
386 l2++;
387 }
388 }
389 }
390 if (l1>l2) {
391 return 2+l1;
392 } else {
393 return 2+l2;
394 }
395 }
396
397 // format:
398 // network rackid
399
400
401 // format (3.0.104+)
// network rack_path_separated_by_vertical_bar
403
// Parses one line of the topology file: "<network> <rack name> [# comment]".
//
// line   - NUL-terminated line; it is modified in place (the network token gets terminated)
// lineno - line number used only in diagnostics
// fip    - receives the first IP of the parsed network
// tip    - receives the last IP of the parsed network
// rid    - receives the rack id assigned to the rack name
// returns 0 when a complete definition was parsed, -1 for empty/comment-only lines (silently)
//         and for malformed lines (after logging a warning to syslog and stderr)
//
// Note: the rack name is registered before the trailing-garbage check is made.
int topology_parseline(char *line,uint32_t lineno,uint32_t *fip,uint32_t *tip,uint32_t *rid) {
	char *cur,*netdef,*rackdef;
	char saved;

	for (cur = line ; *cur==' ' || *cur=='\t' ; cur++) {}
	if (*cur==0 || *cur=='#') { // empty line or line with comment only
		return -1;
	}

	// first token - network definition
	netdef = cur;
	while (*cur!=0 && *cur!=' ' && *cur!='\t') {
		cur++;
	}
	if (*cur==0) {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: incomplete definition in line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: incomplete definition in line: %"PRIu32"\n",lineno);
		return -1;
	}
	*cur++ = 0;
	if (topology_parsenet(netdef,fip,tip)<0) {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: incorrect ip/network definition in line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: incorrect ip/network definition in line: %"PRIu32"\n",lineno);
		return -1;
	}

	// second token - rack name
	for ( ; *cur==' ' || *cur=='\t' ; cur++) {}
	if (*cur==0 || *cur=='#') {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: incorrect rack id in line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: incorrect rack id in line: %"PRIu32"\n",lineno);
		return -1;
	}
	rackdef = cur;
	while (*cur!=0 && *cur!=' ' && *cur!='\t') {
		cur++;
	}

	// terminate the name only for the duration of the lookup, then put the character back
	saved = *cur;
	*cur = 0;
	*rid = topology_rackname_to_rackid(rackdef);
	*cur = saved;

	// only blanks or a comment may follow
	for ( ; *cur==' ' || *cur=='\t' ; cur++) {}
	if (*cur!=0 && *cur!='#') {
		mfs_arg_syslog(LOG_WARNING,"mfstopology: garbage found at the end of line: %"PRIu32,lineno);
		fprintf(stderr,"mfstopology: garbage found at the end of line: %"PRIu32"\n",lineno);
		return -1;
	}
	return 0;
}
464
topology_load(void)465 void topology_load(void) {
466 FILE *fd;
467 char linebuff[10000];
468 uint32_t s,lineno;
469 uint32_t fip,tip,rid;
470 void *newtree;
471
472 fd = fopen(TopologyFileName,"r");
473 if (fd==NULL) {
474 if (errno==ENOENT) {
475 if (racktree) {
476 syslog(LOG_WARNING,"mfstopology configuration file (%s) not found - network topology not changed",TopologyFileName);
477 } else {
478 syslog(LOG_WARNING,"mfstopology configuration file (%s) not found - network topology not defined",TopologyFileName);
479 }
480 fprintf(stderr,"mfstopology configuration file (%s) not found - using defaults\n",TopologyFileName);
481 } else {
482 if (racktree) {
483 mfs_arg_errlog(LOG_WARNING,"can't open mfstopology configuration file (%s) - network topology not changed, error",TopologyFileName);
484 } else {
485 mfs_arg_errlog(LOG_WARNING,"can't open mfstopology configuration file (%s) - network topology not defined, error",TopologyFileName);
486 }
487 }
488 return;
489 }
490
491 topology_rackname_stash();
492 newtree = NULL;
493 lineno = 1;
494 while (fgets(linebuff,10000,fd)) {
495 linebuff[9999]=0;
496 s=strlen(linebuff);
497 while (s>0 && (linebuff[s-1]=='\r' || linebuff[s-1]=='\n' || linebuff[s-1]=='\t' || linebuff[s-1]==' ')) {
498 s--;
499 }
500 if (s>0) {
501 linebuff[s]=0;
502 if (topology_parseline(linebuff,lineno,&fip,&tip,&rid)>=0) {
503 newtree = itree_add_interval(newtree,fip,tip,rid);
504 // while (fip<=tip) {
505 // hash_insert(fip,rid);
506 // fip++;
507 // }
508 }
509 }
510 lineno++;
511 }
512 if (ferror(fd)) {
513 fclose(fd);
514 if (racktree) {
515 syslog(LOG_WARNING,"error reading mfstopology file - network topology not changed");
516 } else {
517 syslog(LOG_WARNING,"error reading mfstopology file - network topology not defined");
518 }
519 itree_freeall(newtree);
520 topology_rackname_restore();
521 fprintf(stderr,"error reading mfstopology file - network topology not defined (using defaults)\n");
522 return;
523 }
524 fclose(fd);
525 topology_rackname_cleanupstash();
526 itree_freeall(racktree);
527 racktree = newtree;
528 if (racktree) {
529 racktree = itree_rebalance(racktree);
530 }
531 mfs_syslog(LOG_NOTICE,"topology file has been loaded");
532 }
533
534 //int topology_init(void) {
535 // TopologyFileName = strdup("mfstopology.cfg");
536 // racktree = NULL;
537 // topology_load();
538 // itree_show(racktree);
539 // return 0;
540 //}
541
topology_reload(void)542 void topology_reload(void) {
543 int fd;
544 if (TopologyFileName) {
545 free(TopologyFileName);
546 }
547 if (!cfg_isdefined("TOPOLOGY_FILENAME")) {
548 TopologyFileName = strdup(ETC_PATH "/mfs/mfstopology.cfg");
549 passert(TopologyFileName);
550 if ((fd = open(TopologyFileName,O_RDONLY))<0 && errno==ENOENT) {
551 char *tmpname;
552 tmpname = strdup(ETC_PATH "/mfstopology.cfg");
553 if ((fd = open(tmpname,O_RDONLY))>=0) {
554 free(TopologyFileName);
555 TopologyFileName = tmpname;
556 mfs_syslog(LOG_WARNING,"default sysconf path has changed - please move mfstopology.cfg from "ETC_PATH"/ to "ETC_PATH"/mfs/");
557 } else {
558 free(tmpname);
559 }
560 }
561 if (fd>=0) {
562 close(fd);
563 }
564 } else {
565 TopologyFileName = cfg_getstr("TOPOLOGY_FILENAME",ETC_PATH "/mfs/mfstopology.cfg");
566 }
567 topology_load();
568 }
569
topology_term(void)570 void topology_term(void) {
571 itree_freeall(racktree);
572 if (TopologyFileName) {
573 free(TopologyFileName);
574 }
575 topology_rackname_cleanup();
576 }
577
topology_init(void)578 int topology_init(void) {
579 TopologyFileName = NULL;
580 racktree = NULL;
581 topology_rackname_init();
582 topology_reload();
583 main_reload_register(topology_reload);
584 main_destruct_register(topology_term);
585 return 0;
586 }
587