1 /*
2  * Copyright (c) 2021 Klara Systems, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/sysmacros.h>
29 #include <sys/kmem.h>
30 #include <linux/file.h>
31 #include <linux/magic.h>
32 #include <sys/zone.h>
33 
34 #if defined(CONFIG_USER_NS)
35 #include <linux/statfs.h>
36 #include <linux/proc_ns.h>
37 #endif
38 
39 #include <sys/mutex.h>
40 
41 static kmutex_t zone_datasets_lock;
42 static struct list_head zone_datasets;
43 
44 typedef struct zone_datasets {
45 	struct list_head zds_list;	/* zone_datasets linkage */
46 	struct user_namespace *zds_userns; /* namespace reference */
47 	struct list_head zds_datasets;	/* datasets for the namespace */
48 } zone_datasets_t;
49 
50 typedef struct zone_dataset {
51 	struct list_head zd_list;	/* zone_dataset linkage */
52 	size_t zd_dsnamelen;		/* length of name */
53 	char zd_dsname[];		/* name of the member dataset */
54 } zone_dataset_t;
55 
56 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
57 /*
58  * Returns:
59  * - 0 on success
60  * - EBADF if it cannot open the provided file descriptor
61  * - ENOTTY if the file itself is a not a user namespace file. We want to
62  *   intercept this error in the ZFS layer. We cannot just return one of the
63  *   ZFS_ERR_* errors here as we want to preserve the seperation of the ZFS
64  *   and the SPL layers.
65  */
66 static int
67 user_ns_get(int fd, struct user_namespace **userns)
68 {
69 	struct kstatfs st;
70 	struct file *nsfile;
71 	struct ns_common *ns;
72 	int error;
73 
74 	if ((nsfile = fget(fd)) == NULL)
75 		return (EBADF);
76 	if (vfs_statfs(&nsfile->f_path, &st) != 0) {
77 		error = ENOTTY;
78 		goto done;
79 	}
80 	if (st.f_type != NSFS_MAGIC) {
81 		error = ENOTTY;
82 		goto done;
83 	}
84 	ns = get_proc_ns(file_inode(nsfile));
85 	if (ns->ops->type != CLONE_NEWUSER) {
86 		error = ENOTTY;
87 		goto done;
88 	}
89 	*userns = container_of(ns, struct user_namespace, ns);
90 
91 	error = 0;
92 done:
93 	fput(nsfile);
94 
95 	return (error);
96 }
97 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
98 
99 static unsigned int
100 user_ns_zoneid(struct user_namespace *user_ns)
101 {
102 	unsigned int r;
103 
104 #if defined(HAVE_USER_NS_COMMON_INUM)
105 	r = user_ns->ns.inum;
106 #else
107 	r = user_ns->proc_inum;
108 #endif
109 
110 	return (r);
111 }
112 
113 static struct zone_datasets *
114 zone_datasets_lookup(unsigned int nsinum)
115 {
116 	zone_datasets_t *zds;
117 
118 	list_for_each_entry(zds, &zone_datasets, zds_list) {
119 		if (user_ns_zoneid(zds->zds_userns) == nsinum)
120 			return (zds);
121 	}
122 	return (NULL);
123 }
124 
125 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
126 static struct zone_dataset *
127 zone_dataset_lookup(zone_datasets_t *zds, const char *dataset, size_t dsnamelen)
128 {
129 	zone_dataset_t *zd;
130 
131 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
132 		if (zd->zd_dsnamelen != dsnamelen)
133 			continue;
134 		if (strncmp(zd->zd_dsname, dataset, dsnamelen) == 0)
135 			return (zd);
136 	}
137 
138 	return (NULL);
139 }
140 
141 static int
142 zone_dataset_cred_check(cred_t *cred)
143 {
144 
145 	if (!uid_eq(cred->uid, GLOBAL_ROOT_UID))
146 		return (EPERM);
147 
148 	return (0);
149 }
150 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
151 
/*
 * Validate a dataset name and compute its effective length.
 *
 * Returns ENOENT, leaving *dsnamelen untouched, when the name is empty or
 * begins with a slash. Otherwise returns 0 and stores in *dsnamelen the
 * length of the name with one trailing slash, if present, left out.
 */
static int
zone_dataset_name_check(const char *dataset, size_t *dsnamelen)
{
	size_t len;

	switch (dataset[0]) {
	case '\0':
	case '/':
		return (ENOENT);
	default:
		break;
	}

	/* The name is non-empty here, so len - 1 is a valid index. */
	len = strlen(dataset);
	if (dataset[len - 1] == '/')
		len--;
	*dsnamelen = len;

	return (0);
}
166 
167 int
168 zone_dataset_attach(cred_t *cred, const char *dataset, int userns_fd)
169 {
170 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
171 	struct user_namespace *userns;
172 	zone_datasets_t *zds;
173 	zone_dataset_t *zd;
174 	int error;
175 	size_t dsnamelen;
176 
177 	if ((error = zone_dataset_cred_check(cred)) != 0)
178 		return (error);
179 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
180 		return (error);
181 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
182 		return (error);
183 
184 	mutex_enter(&zone_datasets_lock);
185 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
186 	if (zds == NULL) {
187 		zds = kmem_alloc(sizeof (zone_datasets_t), KM_SLEEP);
188 		INIT_LIST_HEAD(&zds->zds_list);
189 		INIT_LIST_HEAD(&zds->zds_datasets);
190 		zds->zds_userns = userns;
191 		/*
192 		 * Lock the namespace by incresing its refcount to prevent
193 		 * the namespace ID from being reused.
194 		 */
195 		get_user_ns(userns);
196 		list_add_tail(&zds->zds_list, &zone_datasets);
197 	} else {
198 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
199 		if (zd != NULL) {
200 			mutex_exit(&zone_datasets_lock);
201 			return (EEXIST);
202 		}
203 	}
204 
205 	zd = kmem_alloc(sizeof (zone_dataset_t) + dsnamelen + 1, KM_SLEEP);
206 	zd->zd_dsnamelen = dsnamelen;
207 	strlcpy(zd->zd_dsname, dataset, dsnamelen + 1);
208 	INIT_LIST_HEAD(&zd->zd_list);
209 	list_add_tail(&zd->zd_list, &zds->zds_datasets);
210 
211 	mutex_exit(&zone_datasets_lock);
212 	return (0);
213 #else
214 	return (ENXIO);
215 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
216 }
217 EXPORT_SYMBOL(zone_dataset_attach);
218 
219 int
220 zone_dataset_detach(cred_t *cred, const char *dataset, int userns_fd)
221 {
222 #if defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM)
223 	struct user_namespace *userns;
224 	zone_datasets_t *zds;
225 	zone_dataset_t *zd;
226 	int error;
227 	size_t dsnamelen;
228 
229 	if ((error = zone_dataset_cred_check(cred)) != 0)
230 		return (error);
231 	if ((error = zone_dataset_name_check(dataset, &dsnamelen)) != 0)
232 		return (error);
233 	if ((error = user_ns_get(userns_fd, &userns)) != 0)
234 		return (error);
235 
236 	mutex_enter(&zone_datasets_lock);
237 	zds = zone_datasets_lookup(user_ns_zoneid(userns));
238 	if (zds != NULL)
239 		zd = zone_dataset_lookup(zds, dataset, dsnamelen);
240 	if (zds == NULL || zd == NULL) {
241 		mutex_exit(&zone_datasets_lock);
242 		return (ENOENT);
243 	}
244 
245 	list_del(&zd->zd_list);
246 	kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
247 
248 	/* Prune the namespace entry if it has no more delegations. */
249 	if (list_empty(&zds->zds_datasets)) {
250 		/*
251 		 * Decrease the refcount now that the namespace is no longer
252 		 * used. It is no longer necessary to prevent the namespace ID
253 		 * from being reused.
254 		 */
255 		put_user_ns(userns);
256 		list_del(&zds->zds_list);
257 		kmem_free(zds, sizeof (*zds));
258 	}
259 
260 	mutex_exit(&zone_datasets_lock);
261 	return (0);
262 #else
263 	return (ENXIO);
264 #endif /* defined(CONFIG_USER_NS) && defined(HAVE_USER_NS_COMMON_INUM) */
265 }
266 EXPORT_SYMBOL(zone_dataset_detach);
267 
268 /*
269  * A dataset is visible if:
270  * - It is a parent of a namespace entry.
271  * - It is one of the namespace entries.
272  * - It is a child of a namespace entry.
273  *
274  * A dataset is writable if:
275  * - It is one of the namespace entries.
276  * - It is a child of a namespace entry.
277  *
278  * The parent datasets of namespace entries are visible and
279  * read-only to provide a path back to the root of the pool.
280  */
281 int
282 zone_dataset_visible(const char *dataset, int *write)
283 {
284 	zone_datasets_t *zds;
285 	zone_dataset_t *zd;
286 	size_t dsnamelen, zd_len;
287 	int visible;
288 
289 	/* Default to read-only, in case visible is returned. */
290 	if (write != NULL)
291 		*write = 0;
292 	if (zone_dataset_name_check(dataset, &dsnamelen) != 0)
293 		return (0);
294 	if (INGLOBALZONE(curproc)) {
295 		if (write != NULL)
296 			*write = 1;
297 		return (1);
298 	}
299 
300 	mutex_enter(&zone_datasets_lock);
301 	zds = zone_datasets_lookup(crgetzoneid(curproc->cred));
302 	if (zds == NULL) {
303 		mutex_exit(&zone_datasets_lock);
304 		return (0);
305 	}
306 
307 	visible = 0;
308 	list_for_each_entry(zd, &zds->zds_datasets, zd_list) {
309 		zd_len = strlen(zd->zd_dsname);
310 		if (zd_len > dsnamelen) {
311 			/*
312 			 * The name of the namespace entry is longer than that
313 			 * of the dataset, so it could be that the dataset is a
314 			 * parent of the namespace entry.
315 			 */
316 			visible = memcmp(zd->zd_dsname, dataset,
317 			    dsnamelen) == 0 &&
318 			    zd->zd_dsname[dsnamelen] == '/';
319 			if (visible)
320 				break;
321 		} else if (zd_len == dsnamelen) {
322 			/*
323 			 * The name of the namespace entry is as long as that
324 			 * of the dataset, so perhaps the dataset itself is the
325 			 * namespace entry.
326 			 */
327 			visible = memcmp(zd->zd_dsname, dataset, zd_len) == 0;
328 			if (visible) {
329 				if (write != NULL)
330 					*write = 1;
331 				break;
332 			}
333 		} else {
334 			/*
335 			 * The name of the namespace entry is shorter than that
336 			 * of the dataset, so perhaps the dataset is a child of
337 			 * the namespace entry.
338 			 */
339 			visible = memcmp(zd->zd_dsname, dataset,
340 			    zd_len) == 0 && dataset[zd_len] == '/';
341 			if (visible) {
342 				if (write != NULL)
343 					*write = 1;
344 				break;
345 			}
346 		}
347 	}
348 
349 	mutex_exit(&zone_datasets_lock);
350 	return (visible);
351 }
352 EXPORT_SYMBOL(zone_dataset_visible);
353 
/*
 * Return the zone ID of the global zone: the zone ID of init_user_ns when
 * CONFIG_USER_NS is defined, 0 otherwise.
 */
unsigned int
global_zoneid(void)
{
#if defined(CONFIG_USER_NS)
	return (user_ns_zoneid(&init_user_ns));
#else
	return (0);
#endif
}
365 EXPORT_SYMBOL(global_zoneid);
366 
367 unsigned int
368 crgetzoneid(const cred_t *cr)
369 {
370 	unsigned int r = 0;
371 
372 #if defined(CONFIG_USER_NS)
373 	r = user_ns_zoneid(cr->user_ns);
374 #endif
375 
376 	return (r);
377 }
378 EXPORT_SYMBOL(crgetzoneid);
379 
380 boolean_t
381 inglobalzone(proc_t *proc)
382 {
383 #if defined(CONFIG_USER_NS)
384 	return (proc->cred->user_ns == &init_user_ns);
385 #else
386 	return (B_TRUE);
387 #endif
388 }
389 EXPORT_SYMBOL(inglobalzone);
390 
391 int
392 spl_zone_init(void)
393 {
394 	mutex_init(&zone_datasets_lock, NULL, MUTEX_DEFAULT, NULL);
395 	INIT_LIST_HEAD(&zone_datasets);
396 	return (0);
397 }
398 
399 void
400 spl_zone_fini(void)
401 {
402 	zone_datasets_t *zds;
403 	zone_dataset_t *zd;
404 
405 	/*
406 	 * It would be better to assert an empty zone_datasets, but since
407 	 * there's no automatic mechanism for cleaning them up if the user
408 	 * namespace is destroyed, just do it here, since spl is about to go
409 	 * out of context.
410 	 */
411 	while (!list_empty(&zone_datasets)) {
412 		zds = list_entry(zone_datasets.next, zone_datasets_t, zds_list);
413 		while (!list_empty(&zds->zds_datasets)) {
414 			zd = list_entry(zds->zds_datasets.next,
415 			    zone_dataset_t, zd_list);
416 			list_del(&zd->zd_list);
417 			kmem_free(zd, sizeof (*zd) + zd->zd_dsnamelen + 1);
418 		}
419 		put_user_ns(zds->zds_userns);
420 		list_del(&zds->zds_list);
421 		kmem_free(zds, sizeof (*zds));
422 	}
423 	mutex_destroy(&zone_datasets_lock);
424 }
425