1 /*
2  * Copyright (c) 2021 Klara Systems, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/mutex.h>
29 #include <sys/sysmacros.h>
30 #include <sys/kmem.h>
31 #include <linux/file.h>
32 #include <linux/magic.h>
33 #include <sys/zone.h>
34 
35 #if defined(CONFIG_USER_NS)
36 #include <linux/statfs.h>
37 #include <linux/proc_ns.h>
38 #endif
39 
/*
 * Held by zone_dataset_attach(), zone_dataset_detach() and
 * zone_dataset_visible() while they walk or modify the zone_datasets list.
 */
static kmutex_t zone_datasets_lock;
/* List of zone_datasets_t entries, linked through their zds_list member. */
static struct list_head zone_datasets;
42 
/*
 * One entry per user namespace that has datasets attached. Entries are
 * linked on the file-scope zone_datasets list and each one carries its own
 * list of zone_dataset_t records.
 */
typedef struct zone_datasets {
	struct list_head zds_list;	/* zone_datasets linkage */
	struct user_namespace *zds_userns; /* namespace reference */
	struct list_head zds_datasets;	/* datasets for the namespace */
} zone_datasets_t;
48 
49 typedef struct zone_dataset {
50 	struct list_head zd_list;	/* zone_dataset linkage */
51 	size_t zd_dsnamelen;		/* length of name */
52 	char zd_dsname[0];		/* name of the member dataset */
53 } zone_dataset_t;
54 
55 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
56 /*
57  * Returns:
58  * - 0 on success
59  * - EBADF if it cannot open the provided file descriptor
60  * - ENOTTY if the file itself is a not a user namespace file. We want to
61  *   intercept this error in the ZFS layer. We cannot just return one of the
62  *   ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS
63  *   and the SPL layers.
64  */
65 static int
66 user_ns_get(int fd, struct user_namespace **userns)
67 {
68 	struct kstatfs st;
69 	struct file *nsfile;
70 	struct ns_common *ns;
71 	int error;
72 
73 	if ((nsfile = fget(fd)) == NULL)
74 		return (EBADF);
75 	if (vfs_statfs(&nsfile->f_path, &st) != 0) {
76 		error = ENOTTY;
77 		goto done;
78 	}
79 	if (st.f_type != NSFS_MAGIC) {
80 		error = ENOTTY;
81 		goto done;
82 	}
83 	ns = get_proc_ns(file_inode(nsfile));
84 	if (ns->ops->type != CLONE_NEWUSER) {
85 		error = ENOTTY;
86 		goto done;
87 	}
88 	*userns = container_of(ns, struct user_namespace, ns);
89 
90 	error = 0;
91 done:
92 	fput(nsfile);
93 
94 	return (error);
95 }
96 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
97 
98 static unsigned int
99 user_ns_zoneid(struct user_namespace *user_ns)
100 {
101 	unsigned int r;
102 
103 #if defined(HAVE_USER_NS_COMMON_INUM)
104 	r = user_ns->ns.inum;
105 #else
106 	r = user_ns->proc_inum;
107 #endif
108 
109 	return (r);
110 }
111 
112 static struct zone_datasets *
113 zone_datasets_lookup(unsigned int nsinum)
114 {
115 	zone_datasets_t *zds;
116 
117 	list_for_each_entry(zds, &zone_datasets, zds_list) {
118 		if (user_ns_zoneid(zds->zds_userns) == nsinum)
119 			return (zds);
120 	}
121 	return (NULL);
122 }
123 
124 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
125 static struct zone_dataset *
126 zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
127 {
128 	zone_dataset_t *zd;
129 
130 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
131 		if (zd->zd_dsnamelen != dsnamelen)
132 			continue;
133 		if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
134 			return (zd);
135 	}
136 
137 	return (NULL);
138 }
139 
140 static int
141 zone_dataset_cred_check(cred_t *cred)
142 {
143 
144 	if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
145 		return (EPERM);
146 
147 	return (0);
148 }
149 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
150 
/*
 * Validate a dataset name and compute its effective length.
 *
 * An empty name, or one that starts with '/', is rejected with ENOENT and
 * *dsnamelen is left untouched. Otherwise 0 is returned and *dsnamelen is
 * set to the length of the name, not counting one trailing '/' if present.
 */
static int
zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
{
	size_t len;

	switch (dataset[0]) {
	case '\0':
	case '/':
		return (ENOENT);
	default:
		break;
	}

	len = strlen(dataset);
	/* Ignore trailing slash, if supplied. */
	*dsnamelen = (dataset[len - 1] == '/') ? len - 1 : len;

	return (0);
}
165 
166 int
167 zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
168 {
169 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
170 	struct user_namespace *userns;
171 	zone_datasets_t *zds;
172 	zone_dataset_t *zd;
173 	int error;
174 	size_t dsnamelen;
175 
176 	if ((error = zone_dataset_cred_check(cred)) != 0)
177 		return (error);
178 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
179 		return (error);
180 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
181 		return (error);
182 
183 	mutex_enter(&zone_datasets_lock);
184 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
185 	if (zds == NULL) {
186 		zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
187 		INIT_LIST_HEAD(&zds->zds_list);
188 		INIT_LIST_HEAD(&zds->zds_datasets);
189 		zds->zds_userns = userns;
190 		/*
191 		 * Lock the namespace by incresing its refcount to prevent
192 		 * the namespace ID from being reused.
193 		 */
194 		get_user_ns(userns);
195 		list_add_tail(&zds->zds_list, &zone_datasets);
196 	} else {
197 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
198 		if (zd != NULL) {
199 			mutex_exit(&zone_datasets_lock);
200 			return (EEXIST);
201 		}
202 	}
203 
204 	zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
205 	zd->zd_dsnamelen = dsnamelen;
206 	strncpy(zd->zd_dsname, dataset, dsnamelen);
207 	zd->zd_dsname[dsnamelen] = '\0';
208 	INIT_LIST_HEAD(&zd->zd_list);
209 	list_add_tail(&zd->zd_list, &zds->zds_datasets);
210 
211 	mutex_exit(&zone_datasets_lock);
212 	return (0);
213 #else
214 	return (ENXIO);
215 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
216 }
217 EXPORT_SYMBOL(zone_dataset_attach);
218 
219 int
220 zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
221 {
222 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
223 	struct user_namespace *userns;
224 	zone_datasets_t *zds;
225 	zone_dataset_t *zd;
226 	int error;
227 	size_t dsnamelen;
228 
229 	if ((error = zone_dataset_cred_check(cred)) != 0)
230 		return (error);
231 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
232 		return (error);
233 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
234 		return (error);
235 
236 	mutex_enter(&zone_datasets_lock);
237 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
238 	if (zds != NULL)
239 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
240 	if (zds == NULL || zd == NULL) {
241 		mutex_exit(&zone_datasets_lock);
242 		return (ENOENT);
243 	}
244 
245 	list_del(&zd->zd_list);
246 	kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
247 
248 	/* Prune the namespace entry if it has no more delegations. */
249 	if (list_empty(&zds->zds_datasets)) {
250 		/*
251 		 * Decrease the refcount now that the namespace is no longer
252 		 * used. It is no longer necessary to prevent the namespace ID
253 		 * from being reused.
254 		 */
255 		put_user_ns(userns);
256 		list_del(&zds->zds_list);
257 		kmem_free(zds, sizeof (*zds));
258 	}
259 
260 	mutex_exit(&zone_datasets_lock);
261 	return (0);
262 #else
263 	return (ENXIO);
264 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
265 }
266 EXPORT_SYMBOL(zone_dataset_detach);
267 
268 /*
269  * A dataset is visible if:
270  * - It is a parent of a namespace entry.
271  * - It is one of the namespace entries.
272  * - It is a child of a namespace entry.
273  *
274  * A dataset is writable if:
275  * - It is one of the namespace entries.
276  * - It is a child of a namespace entry.
277  *
278  * The parent datasets of namespace entries are visible and
279  * read-only to provide a path back to the root of the pool.
280  */
281 int
282 zone_dataset_visible(const char *dataset, int *write)
283 {
284 	zone_datasets_t *zds;
285 	zone_dataset_t *zd;
286 	size_t dsnamelen, zd_len;
287 	int visible;
288 
289 	/* Default to read-only, in case visible is returned. */
290 	if (write != NULL)
291 		*write = 0;
292 	if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
293 		return (0);
294 	if (INGLOBALZONE(curproc)) {
295 		if (write != NULL)
296 			*write = 1;
297 		return (1);
298 	}
299 
300 	mutex_enter(&zone_datasets_lock);
301 	zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
302 	if (zds == NULL) {
303 		mutex_exit(&zone_datasets_lock);
304 		return (0);
305 	}
306 
307 	visible = 0;
308 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
309 		zd_len = strlen(zd->zd_dsname);
310 		if (zd_len > dsnamelen) {
311 			/*
312 			 * The name of the namespace entry is longer than that
313 			 * of the dataset, so it could be that the dataset is a
314 			 * parent of the namespace entry.
315 			 */
316 			visible = memcmp(zd->zd_dsname, dataset,
317 			    dsnamelen) == 0 &&
318 			    zd->zd_dsname[dsnamelen] == '/';
319 			if (visible)
320 				break;
321 		} else if (zd_len == dsnamelen) {
322 			/*
323 			 * The name of the namespace entry is as long as that
324 			 * of the dataset, so perhaps the dataset itself is the
325 			 * namespace entry.
326 			 */
327 			visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
328 			if (visible) {
329 				if (write != NULL)
330 					*write = 1;
331 				break;
332 			}
333 		} else {
334 			/*
335 			 * The name of the namespace entry is shorter than that
336 			 * of the dataset, so perhaps the dataset is a child of
337 			 * the namespace entry.
338 			 */
339 			visible = memcmp(zd->zd_dsname, dataset,
340 			    zd_len) == 0 && dataset[zd_len] == '/';
341 			if (visible) {
342 				if (write != NULL)
343 					*write = 1;
344 				break;
345 			}
346 		}
347 	}
348 
349 	mutex_exit(&zone_datasets_lock);
350 	return (visible);
351 }
352 EXPORT_SYMBOL(zone_dataset_visible);
353 
/*
 * Return the zone ID of the initial user namespace (init_user_ns), or 0
 * when built without CONFIG_USER_NS.
 */
unsigned int
global_zoneid(void)
{
#if defined(CONFIG_USER_NS)
	return (user_ns_zoneid(&init_user_ns));
#else
	return (0);
#endif
}
EXPORT_SYMBOL(global_zoneid);
366 
367 unsigned int
368 crgetzoneid(const cred_t *cr)
369 {
370 	unsigned int r = 0;
371 
372 #if defined(CONFIG_USER_NS)
373 	r = user_ns_zoneid(cr->user_ns);
374 #endif
375 
376 	return (r);
377 }
378 EXPORT_SYMBOL(crgetzoneid);
379 
380 boolean_t
381 inglobalzone(proc_t *proc)
382 {
383 #if defined(CONFIG_USER_NS)
384 	return (proc->cred->user_ns == &init_user_ns);
385 #else
386 	return (B_TRUE);
387 #endif
388 }
389 EXPORT_SYMBOL(inglobalzone);
390 
391 int
392 spl_zone_init(void)
393 {
394 	mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
395 	INIT_LIST_HEAD(&zone_datasets);
396 	return (0);
397 }
398 
399 void
400 spl_zone_fini(void)
401 {
402 	zone_datasets_t *zds;
403 	zone_dataset_t *zd;
404 
405 	/*
406 	 * It would be better to assert an empty zone_datasets, but since
407 	 * there's no automatic mechanism for cleaning them up if the user
408 	 * namespace is destroyed, just do it here, since spl is about to go
409 	 * out of context.
410 	 */
411 	while (!list_empty(&zone_datasets)) {
412 		zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
413 		while (!list_empty(&zds->zds_datasets)) {
414 			zd = list_entry(zds->zds_datasets.next,
415 			    zone_dataset_t, zd_list);
416 			list_del(&zd->zd_list);
417 			kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
418 			put_user_ns(zds->zds_userns);
419 		}
420 		list_del(&zds->zds_list);
421 		kmem_free(zds, sizeof (*zds));
422 	}
423 	mutex_destroy(&zone_datasets_lock);
424 }
425