1 /*
2  * Copyright (C) 2021 Jakub Kruszona-Zawadzki, Core Technology Sp. z o.o.
3  *
4  * This file is part of MooseFS.
5  *
6  * MooseFS is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, version 2 (only).
9  *
10  * MooseFS is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with MooseFS; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02111-1301, USA
18  * or visit http://www.gnu.org/licenses/gpl-2.0.html
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include <stdlib.h>
26 #include <inttypes.h>
27 
28 #include "MFSCommunication.h"
29 #include "massert.h"
30 
31 #ifndef MFSTEST
32 
33 #include "matoclserv.h"
34 #include "openfiles.h"
35 #include "metadata.h"
36 #include "main.h"
37 #include "changelog.h"
38 #include "datapack.h"
39 #include "bio.h"
40 #include "slogger.h"
41 
42 #endif
43 
44 // ranges are closed-open: <start,end)
45 
/*
 * One locked byte range <start,end) with a single lock type.
 * Ranges of one lock holder are chained through 'next'.
 */
typedef struct _range {
	uint64_t start;	// first offset covered by the range (inclusive)
	uint64_t end;	// first offset past the range (exclusive)
	uint8_t type;	// lock type (POSIX_LOCK_* constant)
	struct _range *next;	// next range of the same holder, NULL at the end
} range;
52 
53 #ifndef MFSTEST
54 
/*
 * Active (granted) locks of one holder on one inode.
 * A holder is matched by the pair (owner, sessionid).
 */
typedef struct _alock {
	uint64_t owner;	// lock owner id supplied with the request
	uint32_t sessionid;	// client session that holds the locks
	uint32_t pid;	// pid stored when the record was created; reported by GET and by lists
	range *ranges;	// list of ranges held by this holder
	struct _alock *next;	// next holder on the same inode
} alock;
62 
/*
 * A lock request that could not be granted yet and waits in the per-inode queue.
 */
typedef struct _wlock {
	uint64_t owner;	// lock owner id of the requester
	uint32_t sessionid;	// session of the requester
	uint32_t pid;	// pid passed with the request
	uint32_t msgid;	// passed to matoclserv_fuse_posix_lock_wake_up when the request is finished
	uint32_t reqid;	// matched (together with sessionid) by posix_lock_interrupt
	uint64_t start;	// requested range: first offset (inclusive)
	uint64_t end;	// requested range: end offset (exclusive)
	uint8_t type;	// requested lock type
	// 'prev' points at the pointer that points at this node
	// (the previous node's 'next' field or the queue head)
	struct _wlock *next,**prev;
} wlock;
74 
/*
 * All lock state kept for one inode; records are chained in hash buckets.
 */
typedef struct _inodelocks {
	uint32_t inode;	// inode number (hash key)
	alock *active;	// holders with granted locks
	// queue of waiting requests; 'waiting_tail' points at the last 'next' field,
	// or at 'waiting_head' itself when the queue is empty
	wlock *waiting_head,**waiting_tail;
	struct _inodelocks *next;	// next record in the same hash bucket
} inodelocks;
81 
// number of buckets in the inode hash table
#define POSIX_LOCK_INODE_HASHSIZE 1024

// bucket index for an inode: multiply by a constant and reduce modulo the table size
#define POSIX_LOCK_INODE_HASH(inode) (((inode)*0x738A2379)%(POSIX_LOCK_INODE_HASHSIZE))

// array of bucket heads; allocated and cleared in posix_lock_init
static inodelocks **inodehash;
87 
88 #if 0
89 static inline void posix_lock_dump(void) {
90 	uint32_t h;
91 	inodelocks *il;
92 	alock *al;
93 	wlock *wl,**wlptr;
94 	range *r;
95 	syslog(LOG_NOTICE,"posix lock dump:");
96 	for (h=0 ; h<POSIX_LOCK_INODE_HASHSIZE ; h++) {
97 		for (il = inodehash[h] ; il ; il=il->next) {
98 			syslog(LOG_NOTICE,"  inode: %"PRIu32" (active:%s,waiting:%s)",il->inode,il->active?"yes":"no",il->waiting_head?"yes":"no");
99 			for (al = il->active ; al ; al=al->next) {
100 				syslog(LOG_NOTICE,"    active lock: session:%"PRIu32",owner:%"PRIu64",pid:%"PRIu32,al->sessionid,al->owner,al->pid);
101 				if (al->ranges==NULL) {
102 					syslog(LOG_WARNING,"      no lock ranges !!!");
103 				}
104 				for (r = al->ranges ; r ; r=r->next) {
105 					syslog(LOG_NOTICE,"      range: start:%"PRIu64",end:%"PRIu64",type:%c",r->start,r->end,(r->type==POSIX_LOCK_RDLCK)?'R':(r->type==POSIX_LOCK_WRLCK)?'W':'?');
106 				}
107 			}
108 			wlptr = &(il->waiting_head);
109 			for (wl = il->waiting_head ; wl ; wl=wl->next) {
110 				syslog(LOG_NOTICE,"    waiting lock: session:%"PRIu32",owner:%"PRIu64",pid:%"PRIu32",start:%"PRIu64",end:%"PRIu64",type:%c",wl->sessionid,wl->owner,wl->pid,wl->start,wl->end,wl->type);
111 				wlptr = &(wl->next);
112 			}
113 			if (il->waiting_tail != wlptr) {
114 				syslog(LOG_WARNING,"    wrong tail pointer !!!");
115 			}
116 		}
117 	}
118 }
119 #endif
120 
121 #endif
122 
/*
 * Looks in the range list 'r' for the first range that conflicts with the
 * lock described by *type and <*start,*end).
 * Two locks conflict when at least one of them is POSIX_LOCK_WRLCK and
 * their intervals have common bytes.
 * On conflict the description of the conflicting range is written back to
 * *type, *start and *end and 1 is returned; otherwise 0 is returned and the
 * arguments are left untouched.
 */
static inline int posix_lock_test_wlock(range *r,uint8_t *type,uint64_t *start,uint64_t *end) {
	range *cur;

	for (cur = r ; cur != NULL ; cur = cur->next) {
		if (*type != POSIX_LOCK_WRLCK && cur->type != POSIX_LOCK_WRLCK) {
			continue; // neither side is a write lock
		}
		if (*end <= cur->start || *start >= cur->end) {
			continue; // intervals have no common bytes
		}
		*type = cur->type;
		*start = cur->start;
		*end = cur->end;
		return 1;
	}
	return 0;
}
137 
/*
 * Applies an operation of the given 'type' on the interval <start,end) to the
 * range list *rptr (list of one lock holder).
 *
 * - type other than POSIX_LOCK_UNLCK: after the call the interval is covered by
 *   a range of that type; overlapping or touching ranges of the same type are
 *   merged with it, ranges of a different type are cut or removed.
 * - POSIX_LOCK_UNLCK: the interval is cut out of every range it overlaps;
 *   nothing is inserted.
 *
 * The loop walks the list from the head and stops at the first range that lies
 * entirely past 'end', so it relies on the list being ordered by offset.
 * In the diagrams below 'wl' is the applied interval and 'r' the current range.
 * When compiled with MFSTEST every handled case (and every malloc/free) is
 * traced with printf.
 */
static inline void posix_lock_apply_range(range **rptr,uint8_t type,uint64_t start,uint64_t end) {
	range *nr,*r;
	uint8_t added; // set when the operation is complete and the walk can stop

	added = 0;
	while (added==0 && (r=*rptr)) {
		if (r->end < start) {
			// r ends before the interval (and does not touch it) - skip it
			// wl:      |-----|
			// r:  |--|
#ifdef MFSTEST
			printf("case 1\n");
#endif
			rptr = &(r->next);
		} else if (r->start > end) {
			// r begins after the interval (and does not touch it) - insert new range before r
			// wl: |-----|
			// r:          |--|
			if (type!=POSIX_LOCK_UNLCK) {
#ifdef MFSTEST
				printf("case 2a\n");
				printf("malloc\n");
#endif
				nr = malloc(sizeof(range));
				passert(nr);
				nr->start = start;
				nr->end = end;
				nr->type = type;
				nr->next = *rptr;
				*rptr = nr;
#ifdef MFSTEST
			} else {
				printf("case 2b\n");
#endif
			}
			added = 1;
		} else if (start <= r->start && end >= r->end) {
			// r lies completely inside the interval - drop it and keep walking
			// wl: |-----|   |-----|
			// r:    |--|    |-----|
#ifdef MFSTEST
			printf("case 3\n");
			printf("free\n");
#endif
			*rptr = r->next;
			free(r);
		} else if (r->start < start && r->end <= end) {
			// r begins before the interval and ends inside it (or exactly at its start)
			// wl:   |-----|     |-----|
			// r:  |---|       |-------|
			if (r->type == type) {
				// same type - extend the interval to the left and drop r
#ifdef MFSTEST
				printf("case 4a\n");
				printf("free\n");
#endif
				start = r->start;
				*rptr = r->next;
				free(r);
			} else {
				// different type - cut r so that it ends where the interval begins
#ifdef MFSTEST
				printf("case 4b\n");
#endif
				r->end = start;
				rptr = &(r->next);
			}
		} else if (r->start >= start && r->end > end) {
			// r begins inside the interval (or exactly at its end) and ends after it
			// wl:  |-----|        |-----|
			// r:       |---|      |-------|
			if (r->type == type) {
				// same type - extend r to the left, nothing to insert
#ifdef MFSTEST
				printf("case 5a\n");
#endif
				r->start = start;
				added = 1;
			} else {
				// different type - cut r so that it begins where the interval ends
				r->start = end;
				if (type!=POSIX_LOCK_UNLCK) {
#ifdef MFSTEST
					printf("case 5b\n");
					printf("malloc\n");
#endif
					nr = malloc(sizeof(range));
					passert(nr);
					nr->start = start;
					nr->end = end;
					nr->type = type;
					nr->next = r;
					*rptr = nr;
#ifdef MFSTEST
				} else {
					printf("case 5c\n");
#endif
				}
				added = 1;
			}
		} else {
			// r strictly contains the interval
			// wl:   |-----|
			// r:  |---------|
			if (r->type != type) {
				// split r: first create the right remainder <end,r->end) ...
				nr = malloc(sizeof(range));
				passert(nr);
				nr->start = end;
				nr->end = r->end;
				nr->type = r->type;
				nr->next = r->next;
				r->next = nr;
				if (type!=POSIX_LOCK_UNLCK) {
					// ... then put the new range between both remainders
#ifdef MFSTEST
					printf("case 6a\n");
					printf("malloc\n");
					printf("malloc\n");
#endif
					nr = malloc(sizeof(range));
					passert(nr);
					nr->start = start;
					nr->end = end;
					nr->type = type;
					nr->next = r->next;
					r->next = nr;
#ifdef MFSTEST
				} else {
					printf("case 6b\n");
					printf("malloc\n");
#endif
				}
				// ... and finally shrink r to the left remainder
				r->end = start;
#ifdef MFSTEST
			} else {
				// same type - the interval is already covered
				printf("case 6c\n");
#endif
			}
			added = 1;
		}
	}
	// end of the list reached without finishing - append the new range at the tail
	if (added==0 && type!=POSIX_LOCK_UNLCK) {
#ifdef MFSTEST
		printf("case 7\n");
		printf("malloc\n");
#endif
		nr = malloc(sizeof(range));
		passert(nr);
		nr->start = start;
		nr->end = end;
		nr->type = type;
		nr->next = NULL;
		*rptr = nr;
	}
}
282 
283 #ifndef MFSTEST
284 
posix_lock_inode_find(uint32_t inode)285 static inline inodelocks* posix_lock_inode_find(uint32_t inode) {
286 	inodelocks *il;
287 
288 	for (il = inodehash[POSIX_LOCK_INODE_HASH(inode)] ; il ; il=il->next) {
289 		if (il->inode==inode) {
290 			return il;
291 		}
292 	}
293 	return NULL;
294 }
295 
posix_lock_inode_new(uint32_t inode)296 static inline inodelocks* posix_lock_inode_new(uint32_t inode) {
297 	inodelocks *il;
298 	uint32_t hash;
299 
300 	il = malloc(sizeof(inodelocks));
301 	passert(il);
302 	il->inode = inode;
303 	il->active = NULL;
304 	il->waiting_head = NULL;
305 	il->waiting_tail = &(il->waiting_head);
306 	hash = POSIX_LOCK_INODE_HASH(inode);
307 	il->next = inodehash[hash];
308 	inodehash[hash] = il;
309 	return il;
310 }
311 
posix_lock_inode_remove(uint32_t inode)312 static inline void posix_lock_inode_remove(uint32_t inode) {
313 	inodelocks *il,**ilp;
314 	uint32_t hash;
315 
316 	hash = POSIX_LOCK_INODE_HASH(inode);
317 	ilp = inodehash + hash;
318 	while ((il=*ilp)) {
319 		if (il->inode==inode) {
320 			massert(il->active==NULL && il->waiting_head==NULL,"inode posix lock record not empty !!!");
321 			*ilp = il->next;
322 			free(il);
323 		} else {
324 			ilp = &(il->next);
325 		}
326 	}
327 }
328 
/*
 * Unlinks the waiting request 'wl' from the queue of 'il' and frees it.
 */
static inline void posix_lock_remove_lock(inodelocks *il,wlock *wl) {
	wlock *succ = wl->next;
	wlock **back = wl->prev;

	if (succ != NULL) {
		succ->prev = back;
	} else {
		// last element removed - the tail moves back
		il->waiting_tail = back;
	}
	*back = succ;
	free(wl);
}
338 
/*
 * Searches active locks of all holders other than (owner,sessionid) for a
 * range conflicting with the lock described by *type, <*start,*end).
 * When found: *type, *start and *end receive the conflicting range, *pid
 * receives the holder's pid if the holder belongs to the same session
 * (0 otherwise) and 1 is returned. Returns 0 when there is no conflict.
 */
static inline int posix_lock_get_offensive_lock(inodelocks *il,uint32_t sessionid,uint64_t owner,uint8_t *type,uint64_t *start,uint64_t *end,uint32_t *pid) {
	alock *al;

	al = il->active;
	while (al != NULL) {
		if ((al->owner != owner || al->sessionid != sessionid) && posix_lock_test_wlock(al->ranges,type,start,end)) {
			*pid = (al->sessionid == sessionid) ? al->pid : 0;
			return 1;
		}
		al = al->next;
	}
	return 0;
}
355 
/*
 * Returns 1 when any holder other than (owner,sessionid) has an active range
 * conflicting with a lock of 'type' on <start,end), 0 otherwise.
 * Arguments are passed by value, so nothing is reported back to the caller.
 */
static inline int posix_lock_find_offensive_lock(inodelocks *il,uint32_t sessionid,uint64_t owner,uint8_t type,uint64_t start,uint64_t end) {
	alock *al;

	al = il->active;
	while (al != NULL) {
		if ((al->owner != owner || al->sessionid != sessionid) && posix_lock_test_wlock(al->ranges,&type,&start,&end)) {
			return 1;
		}
		al = al->next;
	}
	return 0;
}
367 
/*
 * Applies the operation (type, <start,end)) to the ranges of the holder
 * (owner,sessionid) on the given inode record.
 * - existing holder: its range list is updated; the holder record is freed
 *   when no ranges are left;
 * - unknown holder: for POSIX_LOCK_UNLCK nothing happens, otherwise a new
 *   holder record (with 'pid') is appended at the end of the active list.
 * No conflict checking is done here.
 */
static inline void posix_lock_do_apply_lock(inodelocks *il,uint32_t sessionid,uint64_t owner,uint8_t type,uint64_t start,uint64_t end,uint32_t pid) {
	alock *al,**link;

	for (link = &(il->active) ; (al = *link) != NULL ; link = &(al->next)) {
		if (al->owner != owner || al->sessionid != sessionid) {
			continue;
		}
		posix_lock_apply_range(&(al->ranges),type,start,end);
		if (al->ranges == NULL) {
			*link = al->next;
			free(al);
		}
		return;
	}
	// holder not found; 'link' now points at the terminating NULL of the list
	if (type != POSIX_LOCK_UNLCK) {
		al = malloc(sizeof(alock));
		passert(al);
		al->sessionid = sessionid;
		al->owner = owner;
		al->pid = pid;
		al->ranges = NULL;
		al->next = NULL;
		*link = al;
		posix_lock_apply_range(&(al->ranges),type,start,end);
	}
}
395 
/*
 * Writes a POSIXLOCK changelog entry describing the operation and then
 * applies it with posix_lock_do_apply_lock.
 * The type is logged as 'R', 'W' or (for any other value) 'U'.
 */
static inline void posix_lock_apply_lock(inodelocks *il,uint32_t sessionid,uint64_t owner,uint8_t type,uint64_t start,uint64_t end,uint32_t pid) {
	char typechar;

	if (type == POSIX_LOCK_RDLCK) {
		typechar = 'R';
	} else if (type == POSIX_LOCK_WRLCK) {
		typechar = 'W';
	} else {
		typechar = 'U';
	}
	changelog("%"PRIu32"|POSIXLOCK(%"PRIu32",%"PRIu32",%"PRIu64",%c,%"PRIu64",%"PRIu64",%"PRIu32")",main_time(),il->inode,sessionid,owner,typechar,start,end,pid);
	posix_lock_do_apply_lock(il,sessionid,owner,type,start,end,pid);
}
400 
/*
 * Creates a waiting request from the given parameters and appends it at the
 * end of the waiting queue of 'il'.
 */
static inline void posix_lock_append_lock(inodelocks *il,uint32_t sessionid,uint32_t msgid,uint32_t reqid,uint64_t owner,uint8_t type,uint64_t start,uint64_t end,uint32_t pid) {
	wlock *wl;

	wl = malloc(sizeof(wlock));
	passert(wl);

	// who is asking
	wl->sessionid = sessionid;
	wl->owner = owner;
	wl->pid = pid;
	wl->msgid = msgid;
	wl->reqid = reqid;

	// what is requested
	wl->type = type;
	wl->start = start;
	wl->end = end;

	// link as the last element of the queue
	wl->prev = il->waiting_tail;
	wl->next = NULL;
	*(wl->prev) = wl;
	il->waiting_tail = &(wl->next);
}
418 
/*
 * Cancels the first waiting request matching (sessionid,reqid): the waiter is
 * woken up with MFS_ERROR_EINTR and the request is removed from the queue.
 * Does nothing when no such request is queued.
 */
static inline void posix_lock_interrupt(inodelocks *il,uint32_t sessionid,uint32_t reqid) {
	wlock *wl;

	wl = il->waiting_head;
	while (wl != NULL && (wl->sessionid != sessionid || wl->reqid != reqid)) {
		wl = wl->next;
	}
	if (wl != NULL) {
		matoclserv_fuse_posix_lock_wake_up(sessionid,wl->msgid,MFS_ERROR_EINTR);
		posix_lock_remove_lock(il,wl);
	}
}
429 
/*
 * Called after active locks of 'il' have changed.
 * An entirely empty record is removed from the hash table (and freed).
 * Otherwise the waiting queue is scanned in order and every request that no
 * longer conflicts with active locks of other holders is granted: the lock is
 * applied (with a changelog entry), the waiter is woken up with MFS_STATUS_OK
 * and the request is removed from the queue.
 */
static inline void posix_lock_check_waiting(inodelocks *il) {
	wlock *wl,*following;

	if (il->active == NULL && il->waiting_head == NULL) {
		posix_lock_inode_remove(il->inode);
		return;
	}
	for (wl = il->waiting_head ; wl != NULL ; wl = following) {
		following = wl->next; // remember the successor - 'wl' may be freed below
		if (posix_lock_find_offensive_lock(il,wl->sessionid,wl->owner,wl->type,wl->start,wl->end)) {
			continue; // still blocked
		}
		posix_lock_apply_lock(il,wl->sessionid,wl->owner,wl->type,wl->start,wl->end,wl->pid);
		matoclserv_fuse_posix_lock_wake_up(wl->sessionid,wl->msgid,MFS_STATUS_OK);
		posix_lock_remove_lock(il,wl);
	}
}
447 
posix_lock_cmd(uint32_t sessionid,uint32_t msgid,uint32_t reqid,uint32_t inode,uint64_t owner,uint8_t op,uint8_t * type,uint64_t * start,uint64_t * end,uint32_t * pid)448 uint8_t posix_lock_cmd(uint32_t sessionid,uint32_t msgid,uint32_t reqid,uint32_t inode,uint64_t owner,uint8_t op,uint8_t *type,uint64_t *start,uint64_t *end,uint32_t *pid) {
449 	inodelocks *il;
450 	uint8_t i_type;
451 	uint64_t i_start;
452 	uint64_t i_end;
453 	uint32_t i_pid;
454 
455 	i_type = *type;
456 	i_start = *start;
457 	i_end = *end;
458 	i_pid = *pid;
459 
460 //	posix_lock_dump();
461 //	syslog(LOG_NOTICE,"new lock cmd: sessionid:%"PRIu32",msgid:%"PRIu32",reqid:%"PRIu32",inode:%"PRIu32",owner:%"PRIX64",op:%c,type:%c,start:%"PRIu64",end:%"PRIu64",pid:%"PRIu32,sessionid,msgid,reqid,inode,owner,(op==POSIX_LOCK_CMD_INT)?'I':(op==POSIX_LOCK_CMD_GET)?'G':(op==POSIX_LOCK_CMD_SET)?'S':(op==POSIX_LOCK_CMD_TRY)?'T':'?',(i_type==POSIX_LOCK_RDLCK)?'R':(i_type==POSIX_LOCK_WRLCK)?'W':(i_type==POSIX_LOCK_UNLCK)?'U':'?',i_start,i_end,i_pid);
462 
463 	if ((op==POSIX_LOCK_CMD_SET || op==POSIX_LOCK_CMD_TRY) && i_type!=POSIX_LOCK_UNLCK) {
464 		if (of_checknode(sessionid,inode)==0) {
465 			return MFS_ERROR_NOTOPENED;
466 		}
467 	}
468 
469 	il = posix_lock_inode_find(inode);
470 
471 	if (op==POSIX_LOCK_CMD_INT) {
472 		if (il==NULL) {
473 			return MFS_STATUS_OK;
474 		}
475 		posix_lock_interrupt(il,sessionid,reqid);
476 		return MFS_STATUS_OK;
477 	}
478 	if (op==POSIX_LOCK_CMD_GET) {
479 		if (il!=NULL && i_type!=POSIX_LOCK_UNLCK) {
480 			if (posix_lock_get_offensive_lock(il,sessionid,owner,type,start,end,pid)) {
481 				return MFS_STATUS_OK;
482 			}
483 		}
484 		*type = POSIX_LOCK_UNLCK;
485 		*start = 0;
486 		*end = 0;
487 		*pid = 0;
488 		return MFS_STATUS_OK;
489 	}
490 	if (il!=NULL && i_type!=POSIX_LOCK_UNLCK) {
491 		if (posix_lock_find_offensive_lock(il,sessionid,owner,i_type,i_start,i_end)) {
492 			if (op==POSIX_LOCK_CMD_TRY) {
493 				return MFS_ERROR_EAGAIN;
494 			} else {
495 				posix_lock_append_lock(il,sessionid,msgid,reqid,owner,i_type,i_start,i_end,i_pid);
496 				return MFS_ERROR_WAITING;
497 			}
498 		}
499 	}
500 	if (i_type==POSIX_LOCK_UNLCK) {
501 		if (il==NULL) {
502 			return MFS_STATUS_OK;
503 		}
504 		posix_lock_apply_lock(il,sessionid,owner,i_type,i_start,i_end,i_pid);
505 		posix_lock_check_waiting(il);
506 		return MFS_STATUS_OK;
507 	}
508 	if (il==NULL) {
509 		il = posix_lock_inode_new(inode);
510 	}
511 	if (posix_lock_find_offensive_lock(il,sessionid,owner,i_type,i_start,i_end)) {
512 		posix_lock_append_lock(il,sessionid,msgid,reqid,owner,i_type,i_start,i_end,i_pid);
513 		return MFS_ERROR_WAITING;
514 	}
515 	posix_lock_apply_lock(il,sessionid,owner,i_type,i_start,i_end,i_pid);
516 	posix_lock_check_waiting(il);
517 	return MFS_STATUS_OK;
518 }
519 
posix_lock_file_closed(uint32_t sessionid,uint32_t inode)520 void posix_lock_file_closed(uint32_t sessionid,uint32_t inode) {
521 	inodelocks *il;
522 	wlock *wl,*nwl;
523 	alock *al,**alptr;
524 	uint8_t changed;
525 
526 	il = posix_lock_inode_find(inode);
527 	if (il==NULL) {
528 		return;
529 	}
530 
531 	wl = il->waiting_head;
532 	while (wl) {
533 		nwl = wl->next;
534 		if (wl->sessionid==sessionid) {
535 			posix_lock_remove_lock(il,wl);
536 		}
537 		wl = nwl;
538 	}
539 
540 	changed = 0;
541 	alptr = &(il->active);
542 	while ((al=*alptr)) {
543 		if (al->sessionid==sessionid) {
544 			posix_lock_apply_range(&(al->ranges),POSIX_LOCK_UNLCK,0,UINT64_MAX);
545 			massert(al->ranges==NULL,"locks axists after unlocking everything !!!");
546 			*alptr = al->next;
547 			free(al);
548 			changed = 1;
549 		} else {
550 			alptr = &(al->next);
551 		}
552 	}
553 
554 	if (changed) {
555 		posix_lock_check_waiting(il);
556 	} else if (il->active==NULL && il->waiting_head==NULL) {
557 		posix_lock_inode_remove(il->inode);
558 	}
559 }
560 
posix_lock_list(uint32_t inode,uint8_t * buff)561 uint32_t posix_lock_list(uint32_t inode,uint8_t *buff) {
562 	inodelocks *il;
563 	alock *al;
564 	range *r;
565 	uint32_t h;
566 	uint32_t ret=0;
567 
568 	if (inode==0) {
569 		for (h=0 ; h<POSIX_LOCK_INODE_HASHSIZE ; h++) {
570 			for (il = inodehash[h] ; il ; il=il->next) {
571 				for (al=il->active ; al ; al=al->next) {
572 					for (r=al->ranges ; r ; r=r->next) {
573 						if (buff==NULL) {
574 							ret+=37;
575 						} else {
576 							put32bit(&buff,il->inode);
577 							put32bit(&buff,al->sessionid);
578 							put64bit(&buff,al->owner);
579 							put32bit(&buff,al->pid);
580 							put64bit(&buff,r->start);
581 							put64bit(&buff,r->end);
582 							switch (r->type) {
583 								case POSIX_LOCK_RDLCK:
584 									put8bit(&buff,1);
585 									break;
586 								case POSIX_LOCK_WRLCK:
587 									put8bit(&buff,2);
588 									break;
589 								default:
590 									put8bit(&buff,0);
591 							}
592 						}
593 					}
594 				}
595 			}
596 		}
597 	} else {
598 		il = posix_lock_inode_find(inode);
599 		if (il!=NULL) {
600 			for (al=il->active ; al ; al=al->next) {
601 				for (r=al->ranges ; r ; r=r->next) {
602 					if (buff==NULL) {
603 						ret+=33;
604 					} else {
605 						put32bit(&buff,al->sessionid);
606 						put64bit(&buff,al->owner);
607 						put32bit(&buff,al->pid);
608 						put64bit(&buff,r->start);
609 						put64bit(&buff,r->end);
610 						switch (r->type) {
611 							case POSIX_LOCK_RDLCK:
612 								put8bit(&buff,1);
613 								break;
614 							case POSIX_LOCK_WRLCK:
615 								put8bit(&buff,2);
616 								break;
617 							default:
618 								put8bit(&buff,0);
619 						}
620 					}
621 				}
622 			}
623 		}
624 	}
625 	return ret;
626 }
627 
posix_lock_mr_change(uint32_t inode,uint32_t sessionid,uint64_t owner,char cmd,uint64_t start,uint64_t end,uint32_t pid)628 uint8_t posix_lock_mr_change(uint32_t inode,uint32_t sessionid,uint64_t owner,char cmd,uint64_t start,uint64_t end,uint32_t pid) {
629 	inodelocks *il;
630 	uint8_t type;
631 
632 	if (cmd=='U' || cmd=='u') {
633 		il = posix_lock_inode_find(inode);
634 		if (il==NULL) {
635 			return MFS_ERROR_MISMATCH;
636 		}
637 		type = POSIX_LOCK_UNLCK;
638 	} else if (cmd=='R' || cmd=='r' || cmd=='S' || cmd=='s') {
639 		il = posix_lock_inode_find(inode);
640 		if (il==NULL) {
641 			il = posix_lock_inode_new(inode);
642 		}
643 		type = POSIX_LOCK_RDLCK;
644 	} else if (cmd=='W' || cmd=='w' || cmd=='E' || cmd=='e') {
645 		il = posix_lock_inode_find(inode);
646 		if (il==NULL) {
647 			il = posix_lock_inode_new(inode);
648 		}
649 		type = POSIX_LOCK_WRLCK;
650 	} else {
651 		return MFS_ERROR_EINVAL;
652 	}
653 	if (type!=POSIX_LOCK_UNLCK && posix_lock_find_offensive_lock(il,sessionid,owner,type,start,end)) {
654 		return MFS_ERROR_MISMATCH;
655 	}
656 	posix_lock_do_apply_lock(il,sessionid,owner,type,start,end,pid);
657 	meta_version_inc();
658 	return MFS_STATUS_OK;
659 }
660 
// size of one stored lock record:
// inode:32 owner:64 sessionid:32 pid:32 start:64 end:64 type:8 = 4+8+4+4+8+8+1 bytes
#define POSIX_LOCK_REC_SIZE 37
662 
/*
 * Writes all active lock ranges to 'fd' as fixed-size records
 * (inode, owner, sessionid, pid, start, end, type), followed by one
 * all-zero record that terminates the section.
 * Waiting requests are not stored.
 *
 * Returns 0 on success and 0xFF on write error.
 * Called with fd==NULL it writes nothing and returns 0x10 - presumably the
 * version of the section format (posix_lock_load accepts only mver 0x10).
 */
uint8_t posix_lock_store(bio *fd) {
	uint8_t storebuff[POSIX_LOCK_REC_SIZE];
	uint8_t *ptr;
	uint32_t h;
	inodelocks *il;
	alock *al;
	range *r;

	if (fd == NULL) {
		return 0x10;
	}

	for (h = 0 ; h < POSIX_LOCK_INODE_HASHSIZE ; h++) {
		for (il = inodehash[h] ; il != NULL ; il = il->next) {
			for (al = il->active ; al != NULL ; al = al->next) {
				for (r = al->ranges ; r != NULL ; r = r->next) {
					ptr = storebuff;
					put32bit(&ptr,il->inode);
					put64bit(&ptr,al->owner);
					put32bit(&ptr,al->sessionid);
					put32bit(&ptr,al->pid);
					put64bit(&ptr,r->start);
					put64bit(&ptr,r->end);
					put8bit(&ptr,r->type);
					if (bio_write(fd,storebuff,POSIX_LOCK_REC_SIZE) == POSIX_LOCK_REC_SIZE) {
						continue;
					}
					return 0xFF;
				}
			}
		}
	}

	// terminating record
	memset(storebuff,0,POSIX_LOCK_REC_SIZE);
	return (bio_write(fd,storebuff,POSIX_LOCK_REC_SIZE) != POSIX_LOCK_REC_SIZE) ? 0xFF : 0;
}
699 
/*
 * Reads lock records written by posix_lock_store from 'fd' and rebuilds the
 * in-memory structures.
 *
 * Records are read until the terminator (inode, owner and sessionid all zero).
 * Consecutive records with the same inode go to one inode record, consecutive
 * records with the same (sessionid,owner) go to one holder, and ranges are
 * appended to the holder in the order they are read.
 *
 * mver       - section version; only 0x10 is accepted.
 * ignoreflag - when non-zero, inconsistent records are logged and skipped
 *              instead of failing the whole load.
 *
 * Returns 0 when the terminator has been read, -1 on unsupported version,
 * short read or (with ignoreflag==0) an inconsistent record. Structures
 * created before a failure are not released here.
 */
int posix_lock_load(bio *fd,uint8_t mver,uint8_t ignoreflag) {
	uint8_t loadbuff[POSIX_LOCK_REC_SIZE];
	const uint8_t *ptr;
	int32_t l;
	uint32_t inode,lastinode,sessionid,lastsessionid,pid;
	uint64_t owner,lastowner,start,end,lastend;
	uint8_t type,lasttype;
	uint8_t fino,fses; // "first inode" / "first session" flags - force creation of new records
	inodelocks *il;
	alock *al,**altail;
	range *r,**rtail;

	if (mver!=0x10) {
		return -1;
	}

	fino = 1;
	fses = 1;
	lastinode = 0;
	lastsessionid = 0;
	lastowner = 0;
	lasttype = 0; // make gcc happy
	lastend = 0; // make gcc happy
	il = NULL; // make gcc happy
	al = NULL; // make gcc happy
	r = NULL; // make gcc happy
	altail = NULL; // make gcc happy
	rtail = NULL; // make gcc happy
	for (;;) {
		l = bio_read(fd,loadbuff,POSIX_LOCK_REC_SIZE);
		if (l!=POSIX_LOCK_REC_SIZE) {
			return -1;
		}
		// unpack the record - field order must match posix_lock_store
		ptr = loadbuff;
		inode = get32bit(&ptr);
		owner = get64bit(&ptr);
		sessionid = get32bit(&ptr);
		pid = get32bit(&ptr);
		start = get64bit(&ptr);
		end = get64bit(&ptr);
		type = get8bit(&ptr);
		if (inode==0 && owner==0 && sessionid==0) {
			// terminator
			return 0;
		}
		// of_checknode is consulted once per (inode,session) group; a zero result
		// is reported as a lock on a closed file
		if (inode!=lastinode || sessionid!=lastsessionid || fino || fses) {
			if (of_checknode(sessionid,inode)==0) {
				if (ignoreflag) {
					mfs_syslog(LOG_ERR,"loading posix_locks: lock on closed file !!! (ignoring)");
					continue;
				} else {
					mfs_syslog(LOG_ERR,"loading posix_locks: lock on closed file !!!");
					return -1;
				}
			}
		}
		// add lock
		if (inode!=lastinode || fino) {
			// new inode group - find or create its record and restart the holder list
			lastinode = inode;
			lastsessionid = 0;
			lastowner = 0;
			fses = 1;
			il = posix_lock_inode_find(inode);
			if (il==NULL) {
				il = posix_lock_inode_new(inode);
			}
			// NOTE(review): holders are appended starting at the head pointer, which assumes
			// the record found above has no active holders yet - confirm
			altail = &(il->active);
			fino = 0;
		}
		if (sessionid!=lastsessionid || owner!=lastowner || fses) {
			// new holder group - create the holder and restart range order tracking
			lastsessionid = sessionid;
			lastowner = owner;
			lastend = 0;
			lasttype = POSIX_LOCK_UNLCK;
			al = malloc(sizeof(alock));
			passert(al);
			al->owner = owner;
			al->sessionid = sessionid;
			al->pid = pid;
			al->ranges = NULL;
			al->next = NULL;
			*altail = al;
			altail = &(al->next);
			rtail = &(al->ranges);
			fses = 0;
		}
		// consistency checks against the previous range of the same holder
		// (skipped for the first range, when lasttype is POSIX_LOCK_UNLCK)
		if (lasttype!=POSIX_LOCK_UNLCK) {
			if (start<lastend) {
				// range begins before the previous one ends
				if (ignoreflag) {
					mfs_syslog(LOG_ERR,"loading posix_locks: lock range not in order !!! (ignoring)");
					continue;
				} else {
					mfs_syslog(LOG_ERR,"loading posix_locks: lock range not in order !!!");
					return -1;
				}
			}
			if (type==lasttype && start==lastend) {
				// same type and starting exactly where the previous range ended -
				// posix_lock_apply_range merges such ranges into one
				if (ignoreflag) {
					mfs_syslog(LOG_ERR,"loading posix_locks: lock range not connected !!! (ignoring)");
					continue;
				} else {
					mfs_syslog(LOG_ERR,"loading posix_locks: lock range not connected !!!");
					return -1;
				}
			}
		}
		// append the range at the end of the holder's list
		r = malloc(sizeof(range));
		passert(r);
		r->start = start;
		r->end = end;
		r->type = type;
		r->next = NULL;
		*rtail = r;
		rtail = &(r->next);
		lastend = end;
		lasttype = type;
	}
	return 0; // unreachable
}
818 
posix_lock_cleanup(void)819 void posix_lock_cleanup(void) {
820 	uint32_t h;
821 	inodelocks *il,*nil;
822 	wlock *wl,*nwl;
823 	alock *al,*nal;
824 	range *r,*nr;
825 
826 	for (h=0 ; h<POSIX_LOCK_INODE_HASHSIZE ; h++) {
827 		il = inodehash[h];
828 		while (il) {
829 			nil = il->next;
830 			wl = il->waiting_head;
831 			while (wl) {
832 				nwl = wl->next;
833 				free(wl);
834 				wl = nwl;
835 			}
836 			al = il->active;
837 			while (al) {
838 				nal = al->next;
839 				r = al->ranges;
840 				while (r) {
841 					nr = r->next;
842 					free(r);
843 					r = nr;
844 				}
845 				free(al);
846 				al = nal;
847 			}
848 			free(il);
849 			il = nil;
850 		}
851 		inodehash[h] = NULL;
852 	}
853 }
854 
posix_lock_init(void)855 int posix_lock_init(void) {
856 	uint32_t i;
857 	inodehash = malloc(sizeof(inodelocks*)*POSIX_LOCK_INODE_HASHSIZE);
858 	passert(inodehash);
859 	for (i=0 ; i<POSIX_LOCK_INODE_HASHSIZE ; i++) {
860 		inodehash[i] = NULL;
861 	}
862 	return 0;
863 }
864 
865 #endif
866 
867 #ifdef MFSTEST
868 
869 #include <stdio.h>
870 
/*
 * Test helper: prints the range list in two forms -
 * a textual list ("R:<start,end) ; W:<start,end)") and a 260-character map of
 * offsets 0..259 ('.' - unlocked, 'o' - read lock, 'O' - write lock).
 * Prints "empty" for an empty list.
 */
void posix_lock_print_ranges(range *r) {
	uint64_t pos;
	range *cur;

	if (r == NULL) {
		printf("empty\n");
		return;
	}

	for (cur = r ; cur != NULL ; cur = cur->next) {
		printf("%c:<%"PRIu64",%"PRIu64")%s",(cur->type==POSIX_LOCK_RDLCK)?'R':(cur->type==POSIX_LOCK_WRLCK)?'W':'?',cur->start,cur->end,(cur->next!=NULL)?" ; ":"\n");
	}

	cur = r;
	for (pos = 0 ; pos < 260 ; pos++) {
		// move to the first range that ends after 'pos'
		while (cur != NULL && pos >= cur->end) {
			cur = cur->next;
		}
		if (cur != NULL && pos >= cur->start) {
			printf("%c",(cur->type==POSIX_LOCK_RDLCK)?'o':(cur->type==POSIX_LOCK_WRLCK)?'O':'?');
		} else {
			printf(".");
		}
	}
	printf("\n");
}
896 
/*
 * Test helper: prints the operation about to be applied (as text and as a
 * 260-character map of offsets 0..259, '-' outside the interval) and then
 * applies it with posix_lock_apply_range.
 */
void posix_lock_verbose_apply_range(range **rptr,uint8_t type,uint64_t start,uint64_t end) {
	uint64_t pos;
	char label,fill;

	if (type == POSIX_LOCK_RDLCK) {
		label = 'R';
		fill = 'o';
	} else if (type == POSIX_LOCK_WRLCK) {
		label = 'W';
		fill = 'O';
	} else if (type == POSIX_LOCK_UNLCK) {
		label = 'U';
		fill = '.';
	} else {
		label = '?';
		fill = '.';
	}

	printf(" + %c:<%"PRIu64",%"PRIu64")\n",label,start,end);
	for (pos = 0 ; pos < 260 ; pos++) {
		if (pos >= start && pos < end) {
			printf("%c",fill);
		} else {
			printf("-");
		}
	}
	printf("\n");
	posix_lock_apply_range(rptr,type,start,end);
}
910 
main(int argc,char ** argv)911 int main(int argc,char **argv) {
912 	range *r;
913 	r = NULL;
914 
915 	if (argc<=1) {
916 		printf("usage: %s 1|2\n",argv[0]);
917 		return 1;
918 	}
919 	if (argv[1][0]=='1') {
920 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,20,25);
921 		posix_lock_print_ranges(r);
922 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,30,35);
923 		posix_lock_print_ranges(r);
924 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,10,15);
925 		posix_lock_print_ranges(r);
926 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,19,26);
927 		posix_lock_print_ranges(r);
928 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,18,25);
929 		posix_lock_print_ranges(r);
930 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,20,27);
931 		posix_lock_print_ranges(r);
932 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,20,25);
933 		posix_lock_print_ranges(r);
934 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,11,34);
935 		posix_lock_print_ranges(r);
936 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_WRLCK,20,25);
937 		posix_lock_print_ranges(r);
938 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_UNLCK,15,20);
939 		posix_lock_print_ranges(r);
940 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_UNLCK,25,30);
941 		posix_lock_print_ranges(r);
942 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_WRLCK,15,20);
943 		posix_lock_print_ranges(r);
944 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_WRLCK,25,30);
945 		posix_lock_print_ranges(r);
946 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,15,20);
947 		posix_lock_print_ranges(r);
948 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,25,30);
949 		posix_lock_print_ranges(r);
950 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_RDLCK,20,25);
951 		posix_lock_print_ranges(r);
952 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_UNLCK,25,30);
953 		posix_lock_print_ranges(r);
954 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_UNLCK,15,20);
955 		posix_lock_print_ranges(r);
956 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_UNLCK,0,5);
957 		posix_lock_print_ranges(r);
958 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_UNLCK,0,UINT64_MAX);
959 		posix_lock_print_ranges(r);
960 	}
961 	if (argv[1][0]=='2') {
962 		uint16_t x,start,end;
963 		uint8_t type;
964 		uint32_t i;
965 		for (i=0 ; i<1000 ; i++) {
966 			do {
967 				start = random()%250;
968 				end = random()%250;
969 			} while (start==end);
970 			if (start>end) {
971 				x = start;
972 				start = end;
973 				end = x;
974 			}
975 			switch (random()&3) {
976 				case 0:
977 					type = POSIX_LOCK_RDLCK;
978 					break;
979 				case 1:
980 					type = POSIX_LOCK_WRLCK;
981 					break;
982 				case 2:
983 					if (r==NULL) {
984 						type = POSIX_LOCK_RDLCK;
985 					} else {
986 						type = POSIX_LOCK_UNLCK;
987 					}
988 					break;
989 				case 3:
990 					if (r==NULL) {
991 						type = POSIX_LOCK_WRLCK;
992 					} else {
993 						type = POSIX_LOCK_UNLCK;
994 					}
995 					break;
996 			}
997 			posix_lock_verbose_apply_range(&r,type,start,end);
998 			posix_lock_print_ranges(r);
999 		}
1000 		posix_lock_verbose_apply_range(&r,POSIX_LOCK_UNLCK,0,UINT64_MAX);
1001 		posix_lock_print_ranges(r);
1002 	}
1003 }
1004 #endif
1005