xref: /reactos/base/services/nfsd/pnfs_layout.c (revision 5100859e)
1 /* NFSv4.1 client for Windows
 * Copyright © 2012 The Regents of the University of Michigan
3  *
4  * Olga Kornievskaia <aglo@umich.edu>
5  * Casey Bodley <cbodley@umich.edu>
6  *
7  * This library is free software; you can redistribute it and/or modify it
8  * under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation; either version 2.1 of the License, or (at
10  * your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful, but
13  * without any warranty; without even the implied warranty of merchantability
14  * or fitness for a particular purpose.  See the GNU Lesser General Public
15  * License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with this library; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20  */
21 
22 #include <stdio.h>
23 
24 #include "nfs41_ops.h"
25 #include "nfs41_callback.h"
26 #include "util.h"
27 #include "daemon_debug.h"
28 
29 
#define FLLVL 2 /* dprintf level passed to every file layout log message in this file */
31 
32 
/* pnfs_layout_list: the set of pnfs_layout_state objects kept for a client.
 * entries are linked through pnfs_layout_state.entry (see state_entry()),
 * and the functions below take 'lock' around every search, insertion
 * and removal on 'head' */
struct pnfs_layout_list {
    struct list_entry       head;   /* list of pnfs_layout_state entries */
    CRITICAL_SECTION        lock;   /* serializes access to 'head' */
};
38 
/* helpers that map an embedded list_entry pointer back to its container:
 *  state_entry()       -> pnfs_layout_state, linked via 'entry'
 *  layout_entry()      -> pnfs_layout, linked via 'entry'
 *  file_layout_entry() -> pnfs_file_layout, linked via 'layout.entry' */
#define state_entry(pos) list_container(pos, pnfs_layout_state, entry)
#define layout_entry(pos) list_container(pos, pnfs_layout, entry)
#define file_layout_entry(pos) list_container(pos, pnfs_file_layout, layout.entry)
42 
43 static enum pnfs_status layout_state_create(
44     IN const nfs41_fh *meta_fh,
45     OUT pnfs_layout_state **layout_out)
46 {
47     pnfs_layout_state *layout;
48     enum pnfs_status status = PNFS_SUCCESS;
49 
50     layout = calloc(1, sizeof(pnfs_layout_state));
51     if (layout == NULL) {
52         status = PNFSERR_RESOURCES;
53         goto out;
54     }
55 
56     fh_copy(&layout->meta_fh, meta_fh);
57     list_init(&layout->layouts);
58     list_init(&layout->recalls);
59     InitializeSRWLock(&layout->lock);
60     InitializeConditionVariable(&layout->cond);
61 
62     *layout_out = layout;
63 out:
64     return status;
65 }
66 
67 static void file_layout_free(
68     IN pnfs_file_layout *layout)
69 {
70     if (layout->device) pnfs_file_device_put(layout->device);
71     free(layout->filehandles.arr);
72     free(layout);
73 }
74 
75 static void layout_state_free_layouts(
76     IN pnfs_layout_state *state)
77 {
78     struct list_entry *entry, *tmp;
79     list_for_each_tmp(entry, tmp, &state->layouts)
80         file_layout_free(file_layout_entry(entry));
81     list_init(&state->layouts);
82 }
83 
84 static void layout_state_free_recalls(
85     IN pnfs_layout_state *state)
86 {
87     struct list_entry *entry, *tmp;
88     list_for_each_tmp(entry, tmp, &state->recalls)
89         free(layout_entry(entry));
90     list_init(&state->recalls);
91 }
92 
93 static void layout_state_free(
94     IN pnfs_layout_state *state)
95 {
96     layout_state_free_layouts(state);
97     layout_state_free_recalls(state);
98     free(state);
99 }
100 
101 static int layout_entry_compare(
102     IN const struct list_entry *entry,
103     IN const void *value)
104 {
105     const pnfs_layout_state *layout = state_entry(entry);
106     const nfs41_fh *meta_fh = (const nfs41_fh*)value;
107     const nfs41_fh *layout_fh = (const nfs41_fh*)&layout->meta_fh;
108     const uint32_t diff = layout_fh->len - meta_fh->len;
109     return diff ? diff : memcmp(layout_fh->fh, meta_fh->fh, meta_fh->len);
110 }
111 
112 static enum pnfs_status layout_entry_find(
113     IN struct pnfs_layout_list *layouts,
114     IN const nfs41_fh *meta_fh,
115     OUT struct list_entry **entry_out)
116 {
117     *entry_out = list_search(&layouts->head, meta_fh, layout_entry_compare);
118     return *entry_out ? PNFS_SUCCESS : PNFSERR_NO_LAYOUT;
119 }
120 
121 enum pnfs_status pnfs_layout_list_create(
122     OUT struct pnfs_layout_list **layouts_out)
123 {
124     struct pnfs_layout_list *layouts;
125     enum pnfs_status status = PNFS_SUCCESS;
126 
127     layouts = calloc(1, sizeof(struct pnfs_layout_list));
128     if (layouts == NULL) {
129         status = PNFSERR_RESOURCES;
130         goto out;
131     }
132     list_init(&layouts->head);
133     InitializeCriticalSection(&layouts->lock);
134     *layouts_out = layouts;
135 out:
136     return status;
137 }
138 
139 void pnfs_layout_list_free(
140     IN struct pnfs_layout_list *layouts)
141 {
142     struct list_entry *entry, *tmp;
143 
144     EnterCriticalSection(&layouts->lock);
145 
146     list_for_each_tmp(entry, tmp, &layouts->head)
147         layout_state_free(state_entry(entry));
148 
149     LeaveCriticalSection(&layouts->lock);
150     DeleteCriticalSection(&layouts->lock);
151     free(layouts);
152 }
153 
154 static enum pnfs_status layout_state_find_or_create(
155     IN struct pnfs_layout_list *layouts,
156     IN const nfs41_fh *meta_fh,
157     OUT pnfs_layout_state **layout_out)
158 {
159     struct list_entry *entry;
160     enum pnfs_status status;
161 
162     dprintf(FLLVL, "--> layout_state_find_or_create()\n");
163 
164     EnterCriticalSection(&layouts->lock);
165 
166     /* search for an existing layout */
167     status = layout_entry_find(layouts, meta_fh, &entry);
168     if (status) {
169         /* create a new layout */
170         pnfs_layout_state *layout;
171         status = layout_state_create(meta_fh, &layout);
172         if (status == PNFS_SUCCESS) {
173             /* add it to the list */
174             list_add_head(&layouts->head, &layout->entry);
175             *layout_out = layout;
176 
177             dprintf(FLLVL, "<-- layout_state_find_or_create() "
178                 "returning new layout %p\n", layout);
179         } else {
180             dprintf(FLLVL, "<-- layout_state_find_or_create() "
181                 "returning %s\n", pnfs_error_string(status));
182         }
183     } else {
184         *layout_out = state_entry(entry);
185 
186         dprintf(FLLVL, "<-- layout_state_find_or_create() "
187             "returning existing layout %p\n", *layout_out);
188     }
189 
190     LeaveCriticalSection(&layouts->lock);
191     return status;
192 }
193 
194 static enum pnfs_status layout_state_find_and_delete(
195     IN struct pnfs_layout_list *layouts,
196     IN const nfs41_fh *meta_fh)
197 {
198     struct list_entry *entry;
199     enum pnfs_status status;
200 
201     dprintf(FLLVL, "--> layout_state_find_and_delete()\n");
202 
203     EnterCriticalSection(&layouts->lock);
204 
205     status = layout_entry_find(layouts, meta_fh, &entry);
206     if (status == PNFS_SUCCESS) {
207         list_remove(entry);
208         layout_state_free(state_entry(entry));
209     }
210 
211     LeaveCriticalSection(&layouts->lock);
212 
213     dprintf(FLLVL, "<-- layout_state_find_and_delete() "
214         "returning %s\n", pnfs_error_string(status));
215     return status;
216 }
217 
218 
219 /* pnfs_file_layout */
220 static uint64_t range_max(
221     IN const pnfs_layout *layout)
222 {
223     uint64_t result = layout->offset + layout->length;
224     return result < layout->offset ? NFS4_UINT64_MAX : result;
225 }
226 
227 static bool_t layout_sanity_check(
228     IN pnfs_file_layout *layout)
229 {
230     /* prevent div/0 */
231     if (layout->layout.length == 0 ||
232         layout->layout.iomode < PNFS_IOMODE_READ ||
233         layout->layout.iomode > PNFS_IOMODE_RW ||
234         layout_unit_size(layout) == 0)
235         return FALSE;
236 
237     /* put a cap on layout.length to prevent overflow */
238     layout->layout.length = range_max(&layout->layout) - layout->layout.offset;
239     return TRUE;
240 }
241 
242 static int layout_filehandles_cmp(
243     IN const pnfs_file_layout_handles *lhs,
244     IN const pnfs_file_layout_handles *rhs)
245 {
246     const uint32_t diff = rhs->count - lhs->count;
247     return diff ? diff : memcmp(rhs->arr, lhs->arr,
248         rhs->count * sizeof(nfs41_path_fh));
249 }
250 
251 static bool_t layout_merge_segments(
252     IN pnfs_file_layout *to,
253     IN pnfs_file_layout *from)
254 {
255     const uint64_t to_max = range_max(&to->layout);
256     const uint64_t from_max = range_max(&from->layout);
257 
258     /* cannot merge a segment with itself */
259     if (to == from)
260         return FALSE;
261 
262     /* the ranges must meet or overlap */
263     if (to_max < from->layout.offset || from_max < to->layout.offset)
264         return FALSE;
265 
266     /* the following fields must match: */
267     if (to->layout.iomode != from->layout.iomode ||
268         to->layout.type != from->layout.type ||
269         layout_filehandles_cmp(&to->filehandles, &from->filehandles) != 0 ||
270         memcmp(to->deviceid, from->deviceid, PNFS_DEVICEID_SIZE) != 0 ||
271         to->pattern_offset != from->pattern_offset ||
272         to->first_index != from->first_index ||
273         to->util != from->util)
274         return FALSE;
275 
276     dprintf(FLLVL, "merging layout range {%llu, %llu} with {%llu, %llu}\n",
277         to->layout.offset, to->layout.length,
278         from->layout.offset, from->layout.length);
279 
280     /* calculate the union of the two ranges */
281     to->layout.offset = min(to->layout.offset, from->layout.offset);
282     to->layout.length = max(to_max, from_max) - to->layout.offset;
283     return TRUE;
284 }
285 
286 static enum pnfs_status layout_state_merge(
287     IN pnfs_layout_state *state,
288     IN pnfs_file_layout *from)
289 {
290     struct list_entry *entry, *tmp;
291     pnfs_file_layout *to;
292     enum pnfs_status status = PNFSERR_NO_LAYOUT;
293 
294     /* attempt to merge the new segment with each existing segment */
295     list_for_each_tmp(entry, tmp, &state->layouts) {
296         to = file_layout_entry(entry);
297         if (!layout_merge_segments(to, from))
298             continue;
299 
300         /* on success, remove/free the new segment */
301         list_remove(&from->layout.entry);
302         file_layout_free(from);
303         status = PNFS_SUCCESS;
304 
305         /* because the existing segment 'to' has grown, we may
306          * be able to merge it with later segments */
307         from = to;
308 
309         /* but if there could be io threads referencing this segment,
310          * we can't free it until io is finished */
311         if (state->io_count)
312             break;
313     }
314     return status;
315 }
316 
317 static void layout_ordered_insert(
318     IN pnfs_layout_state *state,
319     IN pnfs_layout *layout)
320 {
321     struct list_entry *entry;
322     list_for_each(entry, &state->layouts) {
323         pnfs_layout *existing = layout_entry(entry);
324 
325         /* maintain an order of increasing offset */
326         if (existing->offset < layout->offset)
327             continue;
328 
329         /* when offsets are equal, prefer a longer segment first */
330         if (existing->offset == layout->offset &&
331             existing->length > layout->length)
332             continue;
333 
334         list_add(&layout->entry, existing->entry.prev, &existing->entry);
335         return;
336     }
337 
338     list_add_tail(&state->layouts, &layout->entry);
339 }
340 
341 static enum pnfs_status layout_update_range(
342     IN OUT pnfs_layout_state *state,
343     IN const struct list_entry *layouts)
344 {
345     struct list_entry *entry, *tmp;
346     pnfs_file_layout *layout;
347     enum pnfs_status status = PNFSERR_NO_LAYOUT;
348 
349     list_for_each_tmp(entry, tmp, layouts) {
350         layout = file_layout_entry(entry);
351 
352         /* don't know what to do with non-file layouts */
353         if (layout->layout.type != PNFS_LAYOUTTYPE_FILE)
354             continue;
355 
356         if (!layout_sanity_check(layout)) {
357             file_layout_free(layout);
358             continue;
359         }
360 
361         /* attempt to merge the range with existing segments */
362         status = layout_state_merge(state, layout);
363         if (status) {
364             dprintf(FLLVL, "saving new layout:\n");
365             dprint_layout(FLLVL, layout);
366 
367             layout_ordered_insert(state, &layout->layout);
368             status = PNFS_SUCCESS;
369         }
370     }
371     return status;
372 }
373 
374 static enum pnfs_status layout_update_stateid(
375     IN OUT pnfs_layout_state *state,
376     IN const stateid4 *stateid)
377 {
378     enum pnfs_status status = PNFS_SUCCESS;
379 
380     if (state->stateid.seqid == 0) {
381         /* save a new layout stateid */
382         memcpy(&state->stateid, stateid, sizeof(stateid4));
383     } else if (memcmp(&state->stateid.other, stateid->other,
384                         NFS4_STATEID_OTHER) == 0) {
385         /* update an existing layout stateid */
386         state->stateid.seqid = stateid->seqid;
387     } else {
388         status = PNFSERR_NO_LAYOUT;
389     }
390     return status;
391 }
392 
393 static enum pnfs_status layout_update(
394     IN OUT pnfs_layout_state *state,
395     IN const pnfs_layoutget_res_ok *layoutget_res)
396 {
397     enum pnfs_status status;
398 
399     /* update the layout ranges held by the client */
400     status = layout_update_range(state, &layoutget_res->layouts);
401     if (status) {
402         eprintf("LAYOUTGET didn't return any file layouts\n");
403         goto out;
404     }
405     /* update the layout stateid */
406     status = layout_update_stateid(state, &layoutget_res->stateid);
407     if (status) {
408         eprintf("LAYOUTGET returned a new stateid when we already had one\n");
409         goto out;
410     }
411     /* if a previous LAYOUTGET set return_on_close, don't overwrite it */
412     if (!state->return_on_close)
413         state->return_on_close = layoutget_res->return_on_close;
414 out:
415     return status;
416 }
417 
418 static enum pnfs_status file_layout_fetch(
419     IN OUT pnfs_layout_state *state,
420     IN nfs41_session *session,
421     IN nfs41_path_fh *meta_file,
422     IN stateid_arg *stateid,
423     IN enum pnfs_iomode iomode,
424     IN uint64_t offset,
425     IN uint64_t minlength,
426     IN uint64_t length)
427 {
428     pnfs_layoutget_res_ok layoutget_res = { 0 };
429     enum pnfs_status pnfsstat = PNFS_SUCCESS;
430     enum nfsstat4 nfsstat;
431 
432     dprintf(FLLVL, "--> file_layout_fetch(%s, seqid=%u)\n",
433         pnfs_iomode_string(iomode), state->stateid.seqid);
434 
435     list_init(&layoutget_res.layouts);
436 
437     /* drop the lock during the rpc call */
438     ReleaseSRWLockExclusive(&state->lock);
439     nfsstat = pnfs_rpc_layoutget(session, meta_file, stateid,
440         iomode, offset, minlength, length, &layoutget_res);
441     AcquireSRWLockExclusive(&state->lock);
442 
443     if (nfsstat) {
444         dprintf(FLLVL, "pnfs_rpc_layoutget() failed with %s\n",
445             nfs_error_string(nfsstat));
446         pnfsstat = PNFSERR_NOT_SUPPORTED;
447     }
448 
449     switch (nfsstat) {
450     case NFS4_OK:
451         /* use the LAYOUTGET results to update our view of the layout */
452         pnfsstat = layout_update(state, &layoutget_res);
453         break;
454 
455     case NFS4ERR_BADIOMODE:
456         /* don't try RW again */
457         if (iomode == PNFS_IOMODE_RW)
458             state->status |= PNFS_LAYOUT_NOT_RW;
459         break;
460 
461     case NFS4ERR_LAYOUTUNAVAILABLE:
462     case NFS4ERR_UNKNOWN_LAYOUTTYPE:
463     case NFS4ERR_BADLAYOUT:
464         /* don't try again at all */
465         state->status |= PNFS_LAYOUT_UNAVAILABLE;
466         break;
467     }
468 
469     dprintf(FLLVL, "<-- file_layout_fetch() returning %s\n",
470         pnfs_error_string(pnfsstat));
471     return pnfsstat;
472 }
473 
474 /* returns PNFS_SUCCESS if the client holds valid layouts that cover
475  * the entire range requested.  otherwise, returns PNFS_PENDING and
476  * sets 'offset_missing' to the lowest offset that is not covered */
477 static enum pnfs_status layout_coverage_status(
478     IN pnfs_layout_state *state,
479     IN enum pnfs_iomode iomode,
480     IN uint64_t offset,
481     IN uint64_t length,
482     OUT uint64_t *offset_missing)
483 {
484     uint64_t position = offset;
485     struct list_entry *entry;
486 
487     list_for_each(entry, &state->layouts) {
488         /* if the current position intersects with a compatible
489          * layout, move the position to the end of that layout */
490         pnfs_layout *layout = layout_entry(entry);
491         if (layout->iomode >= iomode &&
492             layout->offset <= position &&
493             position < layout->offset + layout->length)
494             position = layout->offset + layout->length;
495     }
496 
497     if (position >= offset + length)
498         return PNFS_SUCCESS;
499 
500     *offset_missing = position;
501     return PNFS_PENDING;
502 }
503 
504 static enum pnfs_status layout_fetch(
505     IN pnfs_layout_state *state,
506     IN nfs41_session *session,
507     IN nfs41_path_fh *meta_file,
508     IN stateid_arg *stateid,
509     IN enum pnfs_iomode iomode,
510     IN uint64_t offset,
511     IN uint64_t length)
512 {
513     stateid_arg layout_stateid = { 0 };
514     enum pnfs_status status = PNFS_PENDING;
515 
516     /* check for previous errors from LAYOUTGET */
517     if ((state->status & PNFS_LAYOUT_UNAVAILABLE) ||
518         ((state->status & PNFS_LAYOUT_NOT_RW) && iomode == PNFS_IOMODE_RW)) {
519         status = PNFSERR_NO_LAYOUT;
520         goto out;
521     }
522 
523     /* wait for any pending LAYOUTGETs/LAYOUTRETURNs */
524     while (state->pending)
525         SleepConditionVariableSRW(&state->cond, &state->lock, INFINITE, 0);
526     state->pending = TRUE;
527 
528     /* if there's an existing layout stateid, use it */
529     if (state->stateid.seqid) {
530         memcpy(&layout_stateid.stateid, &state->stateid, sizeof(stateid4));
531         layout_stateid.type = STATEID_LAYOUT;
532         stateid = &layout_stateid;
533     }
534 
535     if ((state->status & PNFS_LAYOUT_NOT_RW) == 0) {
536         /* try to get a RW layout first */
537         status = file_layout_fetch(state, session, meta_file,
538             stateid, PNFS_IOMODE_RW, offset, length, NFS4_UINT64_MAX);
539     }
540 
541     if (status && iomode == PNFS_IOMODE_READ) {
542         /* fall back on READ if necessary */
543         status = file_layout_fetch(state, session, meta_file,
544             stateid, iomode, offset, length, NFS4_UINT64_MAX);
545     }
546 
547     state->pending = FALSE;
548     WakeConditionVariable(&state->cond);
549 out:
550     return status;
551 }
552 
553 static enum pnfs_status device_status(
554     IN pnfs_layout_state *state,
555     IN uint64_t offset,
556     IN uint64_t length,
557     OUT unsigned char *deviceid)
558 {
559     struct list_entry *entry;
560     enum pnfs_status status = PNFS_SUCCESS;
561 
562     list_for_each(entry, &state->layouts) {
563         pnfs_file_layout *layout = file_layout_entry(entry);
564 
565         if (layout->device == NULL) {
566             /* copy missing deviceid */
567             memcpy(deviceid, layout->deviceid, PNFS_DEVICEID_SIZE);
568             status = PNFS_PENDING;
569             break;
570         }
571     }
572     return status;
573 }
574 
575 static void device_assign(
576     IN pnfs_layout_state *state,
577     IN const unsigned char *deviceid,
578     IN pnfs_file_device *device)
579 {
580     struct list_entry *entry;
581     list_for_each(entry, &state->layouts) {
582         pnfs_file_layout *layout = file_layout_entry(entry);
583 
584         /* assign the device to any matching layouts */
585         if (layout->device == NULL &&
586             memcmp(layout->deviceid, deviceid, PNFS_DEVICEID_SIZE) == 0) {
587             layout->device = device;
588 
589             /* XXX: only assign the device to a single segment, because
590              * pnfs_file_device_get() only gives us a single reference */
591             break;
592         }
593     }
594 }
595 
596 static enum pnfs_status device_fetch(
597     IN pnfs_layout_state *state,
598     IN nfs41_session *session,
599     IN unsigned char *deviceid)
600 {
601     pnfs_file_device *device;
602     enum pnfs_status status;
603 
604     /* drop the layoutstate lock for the rpc call */
605     ReleaseSRWLockExclusive(&state->lock);
606     status = pnfs_file_device_get(session,
607         session->client->devices, deviceid, &device);
608     AcquireSRWLockExclusive(&state->lock);
609 
610     if (status == PNFS_SUCCESS)
611         device_assign(state, deviceid, device);
612     return status;
613 }
614 
615 
616 /* nfs41_open_state */
617 static enum pnfs_status client_supports_pnfs(
618     IN nfs41_client *client)
619 {
620     enum pnfs_status status;
621     AcquireSRWLockShared(&client->exid_lock);
622     status = client->roles & EXCHGID4_FLAG_USE_PNFS_MDS
623         ? PNFS_SUCCESS : PNFSERR_NOT_SUPPORTED;
624     ReleaseSRWLockShared(&client->exid_lock);
625     return status;
626 }
627 
628 static enum pnfs_status fs_supports_layout(
629     IN const nfs41_superblock *superblock,
630     IN enum pnfs_layout_type type)
631 {
632     const uint32_t flag = 1 << (type - 1);
633     return (superblock->layout_types & flag) == 0
634         ? PNFSERR_NOT_SUPPORTED : PNFS_SUCCESS;
635 }
636 
637 static enum pnfs_status open_state_layout_cached(
638     IN nfs41_open_state *state,
639     OUT pnfs_layout_state **layout_out)
640 {
641     enum pnfs_status status = PNFSERR_NO_LAYOUT;
642 
643     if (state->layout) {
644         status = PNFS_SUCCESS;
645         *layout_out = state->layout;
646 
647         dprintf(FLLVL, "pnfs_open_state_layout() found "
648             "cached layout %p\n", *layout_out);
649     }
650     return status;
651 }
652 
653 enum pnfs_status pnfs_layout_state_open(
654     IN nfs41_open_state *state,
655     OUT pnfs_layout_state **layout_out)
656 {
657     struct pnfs_layout_list *layouts = state->session->client->layouts;
658     nfs41_session *session = state->session;
659     pnfs_layout_state *layout;
660     enum pnfs_status status;
661 
662     dprintf(FLLVL, "--> pnfs_layout_state_open()\n");
663 
664     status = client_supports_pnfs(session->client);
665     if (status)
666         goto out;
667     status = fs_supports_layout(state->file.fh.superblock, PNFS_LAYOUTTYPE_FILE);
668     if (status)
669         goto out;
670 
671     /* under shared lock, check open state for cached layouts */
672     AcquireSRWLockShared(&state->lock);
673     status = open_state_layout_cached(state, &layout);
674     ReleaseSRWLockShared(&state->lock);
675 
676     if (status) {
677         /* under exclusive lock, find or create a layout for this file */
678         AcquireSRWLockExclusive(&state->lock);
679 
680         status = open_state_layout_cached(state, &layout);
681         if (status) {
682             status = layout_state_find_or_create(layouts, &state->file.fh, &layout);
683             if (status == PNFS_SUCCESS) {
684                 LONG open_count = InterlockedIncrement(&layout->open_count);
685                 state->layout = layout;
686 
687                 dprintf(FLLVL, "pnfs_layout_state_open() caching layout %p "
688                     "(%u opens)\n", state->layout, open_count);
689             }
690         }
691 
692         ReleaseSRWLockExclusive(&state->lock);
693 
694         if (status)
695             goto out;
696     }
697 
698     *layout_out = layout;
699 out:
700     dprintf(FLLVL, "<-- pnfs_layout_state_open() returning %s\n",
701         pnfs_error_string(status));
702     return status;
703 }
704 
705 /* expects caller to hold an exclusive lock on pnfs_layout_state */
706 enum pnfs_status pnfs_layout_state_prepare(
707     IN pnfs_layout_state *state,
708     IN nfs41_session *session,
709     IN nfs41_path_fh *meta_file,
710     IN stateid_arg *stateid,
711     IN enum pnfs_iomode iomode,
712     IN uint64_t offset,
713     IN uint64_t length)
714 {
715     unsigned char deviceid[PNFS_DEVICEID_SIZE];
716     struct list_entry *entry;
717     uint64_t missing;
718     enum pnfs_status status;
719 
720     /* fail if the range intersects any pending recalls */
721     list_for_each(entry, &state->recalls) {
722         const pnfs_layout *recall = layout_entry(entry);
723         if (offset <= recall->offset + recall->length
724             && recall->offset <= offset + length) {
725             status = PNFSERR_LAYOUT_RECALLED;
726             goto out;
727         }
728     }
729 
730     /* if part of the given range is not covered by a layout,
731      * attempt to fetch it with LAYOUTGET */
732     status = layout_coverage_status(state, iomode, offset, length, &missing);
733     if (status == PNFS_PENDING) {
734         status = layout_fetch(state, session, meta_file, stateid,
735             iomode, missing, offset + length - missing);
736 
737         /* return pending because layout_fetch() dropped the lock */
738         if (status == PNFS_SUCCESS)
739             status = PNFS_PENDING;
740         goto out;
741     }
742 
743     /* if any layouts in the range are missing device info,
744      * fetch them with GETDEVICEINFO */
745     status = device_status(state, offset, length, deviceid);
746     if (status == PNFS_PENDING) {
747         status = device_fetch(state, session, deviceid);
748 
749         /* return pending because device_fetch() dropped the lock */
750         if (status == PNFS_SUCCESS)
751             status = PNFS_PENDING;
752         goto out;
753     }
754 out:
755     return status;
756 }
757 
758 static enum pnfs_status layout_return_status(
759     IN const pnfs_layout_state *state)
760 {
761     /* return the layout if we have a stateid */
762     return state->stateid.seqid ? PNFS_SUCCESS : PNFS_PENDING;
763 }
764 
765 static enum pnfs_status file_layout_return(
766     IN nfs41_session *session,
767     IN nfs41_path_fh *file,
768     IN pnfs_layout_state *state)
769 {
770     enum pnfs_status status;
771     enum nfsstat4 nfsstat;
772 
773     dprintf(FLLVL, "--> file_layout_return()\n");
774 
775     /* under shared lock, determine whether we need to return the layout */
776     AcquireSRWLockShared(&state->lock);
777     status = layout_return_status(state);
778     ReleaseSRWLockShared(&state->lock);
779 
780     if (status != PNFS_PENDING)
781         goto out;
782 
783     /* under exclusive lock, return the layout and reset status flags */
784     AcquireSRWLockExclusive(&state->lock);
785 
786     /* wait for any pending LAYOUTGETs/LAYOUTRETURNs */
787     while (state->pending)
788         SleepConditionVariableSRW(&state->cond, &state->lock, INFINITE, 0);
789     state->pending = TRUE;
790 
791     status = layout_return_status(state);
792     if (status == PNFS_PENDING) {
793         pnfs_layoutreturn_res layoutreturn_res = { 0 };
794         stateid4 stateid;
795         memcpy(&stateid, &state->stateid, sizeof(stateid));
796 
797         /* drop the lock during the rpc call */
798         ReleaseSRWLockExclusive(&state->lock);
799         nfsstat = pnfs_rpc_layoutreturn(session, file, PNFS_LAYOUTTYPE_FILE,
800             PNFS_IOMODE_ANY, 0, NFS4_UINT64_MAX, &stateid, &layoutreturn_res);
801         AcquireSRWLockExclusive(&state->lock);
802 
803         if (nfsstat) {
804             eprintf("pnfs_rpc_layoutreturn() failed with %s\n",
805                 nfs_error_string(nfsstat));
806             status = PNFSERR_NO_LAYOUT;
807         } else {
808             status = PNFS_SUCCESS;
809 
810             /* update the layout range held by the client */
811             layout_state_free_layouts(state);
812 
813             /* 12.5.3. Layout Stateid: Once a client has no more
814              * layouts on a file, the layout stateid is no longer
815              * valid and MUST NOT be used. */
816             ZeroMemory(&state->stateid, sizeof(stateid4));
817         }
818     }
819 
820     state->pending = FALSE;
821     WakeConditionVariable(&state->cond);
822     ReleaseSRWLockExclusive(&state->lock);
823 
824 out:
825     dprintf(FLLVL, "<-- file_layout_return() returning %s\n",
826         pnfs_error_string(status));
827     return status;
828 }
829 
830 void pnfs_layout_state_close(
831     IN nfs41_session *session,
832     IN nfs41_open_state *state,
833     IN bool_t remove)
834 {
835     pnfs_layout_state *layout;
836     bool_t return_layout;
837     enum pnfs_status status;
838 
839     AcquireSRWLockExclusive(&state->lock);
840     layout = state->layout;
841     state->layout = NULL;
842     ReleaseSRWLockExclusive(&state->lock);
843 
844     if (layout) {
845         LONG open_count = InterlockedDecrement(&layout->open_count);
846 
847         AcquireSRWLockShared(&layout->lock);
848         /* only return on close if it's the last close */
849         return_layout = layout->return_on_close && (open_count <= 0);
850         ReleaseSRWLockShared(&layout->lock);
851 
852         if (return_layout) {
853             status = file_layout_return(session, &state->file, layout);
854             if (status)
855                 eprintf("file_layout_return() failed with %s\n",
856                     pnfs_error_string(status));
857         }
858     }
859 
860     if (remove && session->client->layouts) {
861         /* free the layout when the file is removed */
862         layout_state_find_and_delete(session->client->layouts, &state->file.fh);
863     }
864 }
865 
866 
867 /* pnfs_layout_recall */
/* a recalled layout range, linked into a list through layout.entry */
struct layout_recall {
    pnfs_layout layout;     /* range, type and iomode being recalled */
    bool_t changed;         /* NOTE(review): meaning not visible in this part of the file */
};
/* map an embedded list_entry pointer back to its struct layout_recall */
#define recall_entry(pos) list_container(pos, struct layout_recall, layout.entry)
873 
874 static bool_t layout_recall_compatible(
875     IN const pnfs_layout *layout,
876     IN const pnfs_layout *recall)
877 {
878     return layout->type == recall->type
879         && layout->offset <= (recall->offset + recall->length)
880         && recall->offset <= (layout->offset + layout->length)
881         && (recall->iomode == PNFS_IOMODE_ANY ||
882             layout->iomode == recall->iomode);
883 }
884 
885 static pnfs_file_layout* layout_allocate_copy(
886     IN const pnfs_file_layout *existing)
887 {
888     /* allocate a segment to cover the end of the range */
889     pnfs_file_layout *layout = calloc(1, sizeof(pnfs_file_layout));
890     if (layout == NULL)
891         goto out;
892 
893     memcpy(layout, existing, sizeof(pnfs_file_layout));
894 
895     /* XXX: don't use the device from existing layout;
896      * we need to get a reference for ourselves */
897     layout->device = NULL;
898 
899     /* allocate a copy of the filehandle array */
900     layout->filehandles.arr = calloc(layout->filehandles.count,
901         sizeof(nfs41_path_fh));
902     if (layout->filehandles.arr == NULL)
903         goto out_free;
904 
905     memcpy(layout->filehandles.arr, existing->filehandles.arr,
906         layout->filehandles.count * sizeof(nfs41_path_fh));
907 out:
908     return layout;
909 
910 out_free:
911     file_layout_free(layout);
912     layout = NULL;
913     goto out;
914 }
915 
916 static void layout_recall_range(
917     IN pnfs_layout_state *state,
918     IN const pnfs_layout *recall)
919 {
920     struct list_entry *entry, *tmp;
921     list_for_each_tmp(entry, tmp, &state->layouts) {
922         pnfs_file_layout *layout = file_layout_entry(entry);
923         const uint64_t layout_end = layout->layout.offset + layout->layout.length;
924 
925         if (!layout_recall_compatible(&layout->layout, recall))
926             continue;
927 
928         if (recall->offset > layout->layout.offset) {
929             /* segment starts before recall; shrink length */
930             layout->layout.length = recall->offset - layout->layout.offset;
931 
932             if (layout_end > recall->offset + recall->length) {
933                 /* middle chunk of the segment is recalled;
934                  * allocate a new segment to cover the end */
935                 pnfs_file_layout *remainder = layout_allocate_copy(layout);
936                 if (remainder == NULL) {
937                     /* silently ignore allocation errors here. behave
938                      * as if we 'forgot' this last segment */
939                 } else {
940                     layout->layout.offset = recall->offset + recall->length;
941                     layout->layout.length = layout_end - layout->layout.offset;
942                     layout_ordered_insert(state, &remainder->layout);
943                 }
944             }
945         } else {
946             /* segment starts after recall */
947             if (layout_end <= recall->offset + recall->length) {
948                 /* entire segment is recalled */
949                 list_remove(&layout->layout.entry);
950                 file_layout_free(layout);
951             } else {
952                 /* beginning of segment is recalled; shrink offset/length */
953                 layout->layout.offset = recall->offset + recall->length;
954                 layout->layout.length = layout_end - layout->layout.offset;
955             }
956         }
957     }
958 }
959 
960 static void layout_state_deferred_recalls(
961     IN pnfs_layout_state *state)
962 {
963     struct list_entry *entry, *tmp;
964     list_for_each_tmp(entry, tmp, &state->recalls) {
965         /* process each deferred layout recall */
966         pnfs_layout *recall = layout_entry(entry);
967         layout_recall_range(state, recall);
968 
969         /* remove/free the recall entry */
970         list_remove(&recall->entry);
971         free(recall);
972     }
973 }
974 
975 static void layout_recall_entry_init(
976     OUT struct layout_recall *lrc,
977     IN const struct cb_layoutrecall_args *recall)
978 {
979     list_init(&lrc->layout.entry);
980     if (recall->recall.type == PNFS_RETURN_FILE) {
981         lrc->layout.offset = recall->recall.args.file.offset;
982         lrc->layout.length = recall->recall.args.file.length;
983     } else {
984         lrc->layout.offset = 0;
985         lrc->layout.length = NFS4_UINT64_MAX;
986     }
987     lrc->layout.iomode = recall->iomode;
988     lrc->layout.type = PNFS_LAYOUTTYPE_FILE;
989     lrc->changed = recall->changed;
990 }
991 
992 static enum pnfs_status layout_recall_merge(
993     IN struct list_entry *list,
994     IN pnfs_layout *from)
995 {
996     struct list_entry *entry, *tmp;
997     enum pnfs_status status = PNFSERR_NO_LAYOUT;
998 
999     /* attempt to merge the new recall with each existing recall */
1000     list_for_each_tmp(entry, tmp, list) {
1001         pnfs_layout *to = layout_entry(entry);
1002         const uint64_t to_max = to->offset + to->length;
1003         const uint64_t from_max = from->offset + from->length;
1004 
1005         /* the ranges must meet or overlap */
1006         if (to_max < from->offset || from_max < to->offset)
1007             continue;
1008 
1009         /* the following fields must match: */
1010         if (to->iomode != from->iomode || to->type != from->type)
1011             continue;
1012 
1013         dprintf(FLLVL, "merging recalled range {%llu, %llu} with {%llu, %llu}\n",
1014             to->offset, to->length, from->offset, from->length);
1015 
1016         /* calculate the union of the two ranges */
1017         to->offset = min(to->offset, from->offset);
1018         to->length = max(to_max, from_max) - to->offset;
1019 
1020         /* on success, remove/free the new segment */
1021         list_remove(&from->entry);
1022         free(from);
1023         status = PNFS_SUCCESS;
1024 
1025         /* because the existing segment 'to' has grown, we may
1026          * be able to merge it with later segments */
1027         from = to;
1028     }
1029     return status;
1030 }
1031 
1032 static enum pnfs_status file_layout_recall(
1033     IN pnfs_layout_state *state,
1034     IN const struct cb_layoutrecall_args *recall)
1035 {
1036     const stateid4 *stateid = &recall->recall.args.file.stateid;
1037     enum pnfs_status status = PNFS_SUCCESS;
1038 
1039     /* under an exclusive lock, flag the layout as recalled */
1040     AcquireSRWLockExclusive(&state->lock);
1041 
1042     if (state->stateid.seqid == 0) {
1043         /* return NOMATCHINGLAYOUT if it wasn't actually granted */
1044         status = PNFSERR_NO_LAYOUT;
1045         goto out;
1046     }
1047 
1048     if (recall->recall.type == PNFS_RETURN_FILE) {
1049         /* detect races between CB_LAYOUTRECALL and LAYOUTGET/LAYOUTRETURN */
1050         if (stateid->seqid > state->stateid.seqid + 1) {
1051             /* the server has processed an outstanding LAYOUTGET or
1052              * LAYOUTRETURN; we must return ERR_DELAY until we get the
1053              * response and update our view of the layout */
1054             status = PNFS_PENDING;
1055             goto out;
1056         }
1057 
1058         /* save the updated seqid */
1059         state->stateid.seqid = stateid->seqid;
1060     }
1061 
1062     if (state->io_count) {
1063         /* save an entry for this recall, and process it once io finishes */
1064         struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall));
1065         if (lrc == NULL) {
1066             /* on failure to allocate, we'll have to respond
1067              * to the CB_LAYOUTRECALL with NFS4ERR_DELAY */
1068             status = PNFS_PENDING;
1069             goto out;
1070         }
1071         layout_recall_entry_init(lrc, recall);
1072         if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS)
1073             list_add_tail(&state->recalls, &lrc->layout.entry);
1074     } else {
1075         /* if there is no pending io, process the recall immediately */
1076         struct layout_recall lrc = { 0 };
1077         layout_recall_entry_init(&lrc, recall);
1078         layout_recall_range(state, &lrc.layout);
1079     }
1080 out:
1081     ReleaseSRWLockExclusive(&state->lock);
1082     return status;
1083 }
1084 
1085 static enum pnfs_status file_layout_recall_file(
1086     IN nfs41_client *client,
1087     IN const struct cb_layoutrecall_args *recall)
1088 {
1089     struct list_entry *entry;
1090     enum pnfs_status status;
1091 
1092     dprintf(FLLVL, "--> file_layout_recall_file()\n");
1093 
1094     EnterCriticalSection(&client->layouts->lock);
1095 
1096     status = layout_entry_find(client->layouts, &recall->recall.args.file.fh, &entry);
1097     if (status == PNFS_SUCCESS)
1098         status = file_layout_recall(state_entry(entry), recall);
1099 
1100     LeaveCriticalSection(&client->layouts->lock);
1101 
1102     dprintf(FLLVL, "<-- file_layout_recall_file() returning %s\n",
1103         pnfs_error_string(status));
1104     return status;
1105 }
1106 
1107 static bool_t fsid_matches(
1108     IN const nfs41_fsid *lhs,
1109     IN const nfs41_fsid *rhs)
1110 {
1111     return lhs->major == rhs->major && lhs->minor == rhs->minor;
1112 }
1113 
1114 static enum pnfs_status file_layout_recall_fsid(
1115     IN nfs41_client *client,
1116     IN const struct cb_layoutrecall_args *recall)
1117 {
1118     struct list_entry *entry;
1119     pnfs_layout_state *state;
1120     nfs41_fh *fh;
1121     enum pnfs_status status = PNFSERR_NO_LAYOUT;
1122 
1123     dprintf(FLLVL, "--> file_layout_recall_fsid(%llu, %llu)\n",
1124         recall->recall.args.fsid.major, recall->recall.args.fsid.minor);
1125 
1126     EnterCriticalSection(&client->layouts->lock);
1127 
1128     list_for_each(entry, &client->layouts->head) {
1129         state = state_entry(entry);
1130         /* no locks needed to read layout.meta_fh or superblock.fsid,
1131          * because they are only written once on creation */
1132         fh = &state->meta_fh;
1133         if (fsid_matches(&recall->recall.args.fsid, &fh->superblock->fsid))
1134             status = file_layout_recall(state, recall);
1135     }
1136 
1137     LeaveCriticalSection(&client->layouts->lock);
1138 
1139     /* bulk recalls require invalidation of cached device info */
1140     pnfs_file_device_list_invalidate(client->devices);
1141 
1142     dprintf(FLLVL, "<-- file_layout_recall_fsid() returning %s\n",
1143         pnfs_error_string(status));
1144     return status;
1145 }
1146 
1147 static enum pnfs_status file_layout_recall_all(
1148     IN nfs41_client *client,
1149     IN const struct cb_layoutrecall_args *recall)
1150 {
1151     struct list_entry *entry;
1152     enum pnfs_status status = PNFSERR_NO_LAYOUT;
1153 
1154     dprintf(FLLVL, "--> file_layout_recall_all()\n");
1155 
1156     EnterCriticalSection(&client->layouts->lock);
1157 
1158     list_for_each(entry, &client->layouts->head)
1159         status = file_layout_recall(state_entry(entry), recall);
1160 
1161     LeaveCriticalSection(&client->layouts->lock);
1162 
1163     /* bulk recalls require invalidation of cached device info */
1164     pnfs_file_device_list_invalidate(client->devices);
1165 
1166     dprintf(FLLVL, "<-- file_layout_recall_all() returning %s\n",
1167         pnfs_error_string(status));
1168     return status;
1169 }
1170 
1171 enum pnfs_status pnfs_file_layout_recall(
1172     IN nfs41_client *client,
1173     IN const struct cb_layoutrecall_args *recall)
1174 {
1175     enum pnfs_status status = PNFS_SUCCESS;
1176 
1177     dprintf(FLLVL, "--> pnfs_file_layout_recall(%u, %s, %u)\n",
1178         recall->recall.type, pnfs_iomode_string(recall->iomode),
1179         recall->changed);
1180 
1181     if (recall->type != PNFS_LAYOUTTYPE_FILE) {
1182         dprintf(FLLVL, "invalid layout type %u (%s)!\n",
1183             recall->type, pnfs_layout_type_string(recall->type));
1184         status = PNFSERR_NOT_SUPPORTED;
1185         goto out;
1186     }
1187 
1188     switch (recall->recall.type) {
1189     case PNFS_RETURN_FILE:
1190         status = file_layout_recall_file(client, recall);
1191         break;
1192     case PNFS_RETURN_FSID:
1193         status = file_layout_recall_fsid(client, recall);
1194         break;
1195     case PNFS_RETURN_ALL:
1196         status = file_layout_recall_all(client, recall);
1197         break;
1198 
1199     default:
1200         dprintf(FLLVL, "invalid return type %u!\n", recall->recall);
1201         status = PNFSERR_NOT_SUPPORTED;
1202         goto out;
1203     }
1204 out:
1205     dprintf(FLLVL, "<-- pnfs_file_layout_recall() returning %s\n",
1206         pnfs_error_string(status));
1207     return status;
1208 }
1209 
1210 /* expects caller to hold a shared lock on pnfs_layout_state */
1211 enum pnfs_status pnfs_layout_recall_status(
1212     IN const pnfs_layout_state *state,
1213     IN const pnfs_layout *layout)
1214 {
1215     struct list_entry *entry;
1216     enum pnfs_status status = PNFS_SUCCESS;
1217 
1218     /* search for a pending recall that intersects with the given segment */
1219     list_for_each(entry, &state->recalls) {
1220         const struct layout_recall *recall = recall_entry(entry);
1221         if (!layout_recall_compatible(layout, &recall->layout))
1222             continue;
1223 
1224         if (recall->changed)
1225             status = PNFSERR_LAYOUT_CHANGED;
1226         else
1227             status = PNFSERR_LAYOUT_RECALLED;
1228         break;
1229     }
1230     return status;
1231 }
1232 
1233 void pnfs_layout_recall_fenced(
1234     IN pnfs_layout_state *state,
1235     IN const pnfs_layout *layout)
1236 {
1237     struct layout_recall *lrc = calloc(1, sizeof(struct layout_recall));
1238     if (lrc == NULL)
1239         return;
1240 
1241     AcquireSRWLockExclusive(&state->lock);
1242 
1243     list_init(&lrc->layout.entry);
1244     lrc->layout.offset = layout->offset;
1245     lrc->layout.length = layout->length;
1246     lrc->layout.iomode = layout->iomode;
1247     lrc->layout.type = layout->type;
1248     lrc->changed = TRUE;
1249 
1250     if (layout_recall_merge(&state->recalls, &lrc->layout) != PNFS_SUCCESS)
1251         list_add_tail(&state->recalls, &lrc->layout.entry);
1252 
1253     ReleaseSRWLockExclusive(&state->lock);
1254 }
1255 
1256 /* expects caller to hold an exclusive lock on pnfs_layout_state */
1257 void pnfs_layout_io_start(
1258     IN pnfs_layout_state *state)
1259 {
1260     /* take a reference on the layout, so that it won't be recalled
1261      * until all io is finished */
1262     state->io_count++;
1263     dprintf(FLLVL, "pnfs_layout_io_start(): count -> %u\n",
1264         state->io_count);
1265 }
1266 
1267 void pnfs_layout_io_finished(
1268     IN pnfs_layout_state *state)
1269 {
1270     AcquireSRWLockExclusive(&state->lock);
1271 
1272     /* return the reference to signify that an io request is finished */
1273     state->io_count--;
1274     dprintf(FLLVL, "pnfs_layout_io_finished() count -> %u\n",
1275         state->io_count);
1276 
1277     if (state->io_count > 0) /* more io pending */
1278         goto out_unlock;
1279 
1280     /* once all io is finished, process any layout recalls */
1281     layout_state_deferred_recalls(state);
1282 
1283     /* finish any segment merging that was delayed during io */
1284     if (!list_empty(&state->layouts))
1285         layout_state_merge(state, file_layout_entry(state->layouts.next));
1286 
1287 out_unlock:
1288     ReleaseSRWLockExclusive(&state->lock);
1289 }
1290