xref: /netbsd/sys/kern/sys_memfd.c (revision 15113430)
1 /*	$NetBSD: sys_memfd.c,v 1.8 2023/07/29 23:59:59 rin Exp $	*/
2 
3 /*-
4  * Copyright (c) 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Theodore Preduta.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.8 2023/07/29 23:59:59 rin Exp $");
34 
35 #include <sys/param.h>
36 #include <sys/types.h>
37 
38 #include <sys/fcntl.h>
39 #include <sys/file.h>
40 #include <sys/filedesc.h>
41 #include <sys/memfd.h>
42 #include <sys/mman.h>
43 #include <sys/syscallargs.h>
44 
45 #include <uvm/uvm_extern.h>
46 #include <uvm/uvm_object.h>
47 
/*
 * Seals that forbid modification: memfd_write() and writable shared
 * mappings are refused while either of these is set.
 */
#define F_SEAL_ANY_WRITE	(F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)

/* Every seal bit accepted by F_ADD_SEALS; anything else is EINVAL. */
#define MFD_KNOWN_SEALS \
	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_ANY_WRITE)

/* Prefix stored in mfd_name in front of the name supplied by the caller. */
static const char memfd_prefix[] = "memfd:";
53 
54 static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
55 static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
56 static int memfd_ioctl(file_t *, u_long, void *);
57 static int memfd_fcntl(file_t *, u_int, void *);
58 static int memfd_stat(file_t *, struct stat *);
59 static int memfd_close(file_t *);
60 static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
61     struct uvm_object **, int *);
62 static int memfd_seek(file_t *, off_t, int, off_t *, int);
63 static int memfd_truncate_locked(file_t *, off_t);
64 static int memfd_truncate(file_t *, off_t);
65 
66 static const struct fileops memfd_fileops = {
67 	.fo_name = "memfd",
68 	.fo_read = memfd_read,
69 	.fo_write = memfd_write,
70 	.fo_ioctl = memfd_ioctl,
71 	.fo_fcntl = memfd_fcntl,
72 	.fo_poll = fnullop_poll,
73 	.fo_stat = memfd_stat,
74 	.fo_close = memfd_close,
75 	.fo_kqfilter = fnullop_kqfilter,
76 	.fo_restart = fnullop_restart,
77 	.fo_mmap = memfd_mmap,
78 	.fo_seek = memfd_seek,
79 	.fo_fpathconf = (void *)eopnotsupp,
80 	.fo_posix_fadvise = (void *)eopnotsupp,
81 	.fo_truncate = memfd_truncate,
82 };
83 
84 /*
85  * memfd_create(2).  Creat a file descriptor associated with anonymous
86  * memory.
87  */
88 int
sys_memfd_create(struct lwp * l,const struct sys_memfd_create_args * uap,register_t * retval)89 sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
90     register_t *retval)
91 {
92 	/* {
93 		syscallarg(const char *) name;
94 		syscallarg(unsigned int) flags;
95 	} */
96 	int error, fd;
97 	file_t *fp;
98 	struct memfd *mfd;
99 	struct proc *p = l->l_proc;
100 	const unsigned int flags = SCARG(uap, flags);
101 
102 	if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
103 		return EINVAL;
104 
105 	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
106 	mfd->mfd_size = 0;
107 	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
108 
109 	CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */
110 	strcpy(mfd->mfd_name, memfd_prefix);
111 	error = copyinstr(SCARG(uap, name),
112 	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
113 	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
114 	if (error != 0)
115  		goto leave;
116 
117 	getnanotime(&mfd->mfd_btime);
118 
119 	if ((flags & MFD_ALLOW_SEALING) == 0)
120 		mfd->mfd_seals |= F_SEAL_SEAL;
121 
122 	error = fd_allocfile(&fp, &fd);
123 	if (error != 0)
124 		goto leave;
125 
126 	fp->f_flag = FREAD|FWRITE;
127 	fp->f_type = DTYPE_MEMFD;
128 	fp->f_ops = &memfd_fileops;
129 	fp->f_memfd = mfd;
130 	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
131 	fd_affix(p, fp, fd);
132 
133 	*retval = fd;
134 	return 0;
135 
136 leave:
137 	uao_detach(mfd->mfd_uobj);
138 	kmem_free(mfd, sizeof(*mfd));
139 	return error;
140 }
141 
142 static int
memfd_read(file_t * fp,off_t * offp,struct uio * uio,kauth_cred_t cred,int flags)143 memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
144     int flags)
145 {
146 	int error;
147 	vsize_t todo;
148 	struct memfd *mfd = fp->f_memfd;
149 
150 	mutex_enter(&fp->f_lock);
151 
152 	if (*offp < 0) {
153 		error = EINVAL;
154 		goto leave;
155 	}
156 
157 	/* Trying to read past the end does nothing. */
158 	if (*offp >= mfd->mfd_size) {
159 		error = 0;
160 		goto leave;
161 	}
162 
163 	uio->uio_offset = *offp;
164 	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
165 	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
166 	    UBC_READ|UBC_PARTIALOK);
167 	if (flags & FOF_UPDATE_OFFSET)
168 		*offp = uio->uio_offset;
169 
170 leave:
171 	getnanotime(&mfd->mfd_atime);
172 
173 
174 	mutex_exit(&fp->f_lock);
175 
176 	return error;
177 }
178 
179 static int
memfd_write(file_t * fp,off_t * offp,struct uio * uio,kauth_cred_t cred,int flags)180 memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
181     int flags)
182 {
183 	int error;
184 	vsize_t todo;
185 	struct memfd *mfd = fp->f_memfd;
186 
187 	mutex_enter(&fp->f_lock);
188 
189 	if (mfd->mfd_seals & F_SEAL_ANY_WRITE) {
190 		error = EPERM;
191 		goto leave;
192 	}
193 
194 	if (*offp < 0) {
195 		error = EINVAL;
196 		goto leave;
197 	}
198 
199 	uio->uio_offset = *offp;
200 	todo = uio->uio_resid;
201 
202 	if (mfd->mfd_seals & F_SEAL_GROW) {
203 		if (*offp >= mfd->mfd_size) {
204 			error = EPERM;
205 			goto leave;
206 		}
207 
208 		/* Truncate the write to fit in mfd_size */
209 		if (*offp + uio->uio_resid >= mfd->mfd_size)
210 			todo = mfd->mfd_size - *offp;
211 	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
212 		/* Grow to accommodate the write request. */
213 		error = memfd_truncate_locked(fp, *offp + uio->uio_resid);
214 		if (error != 0)
215 			goto leave;
216 	}
217 
218 	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
219 	    UBC_WRITE|UBC_PARTIALOK);
220 	if (flags & FOF_UPDATE_OFFSET)
221 		*offp = uio->uio_offset;
222 
223 	getnanotime(&mfd->mfd_mtime);
224 
225 leave:
226 	mutex_exit(&fp->f_lock);
227 
228 	return error;
229 }
230 
231 static int
memfd_ioctl(file_t * fp,u_long cmd,void * data)232 memfd_ioctl(file_t *fp, u_long cmd, void *data)
233 {
234 
235 	return EINVAL;
236 }
237 
238 static int
memfd_fcntl(file_t * fp,u_int cmd,void * data)239 memfd_fcntl(file_t *fp, u_int cmd, void *data)
240 {
241 	struct memfd *mfd = fp->f_memfd;
242 	int error = 0;
243 
244 	switch (cmd) {
245 	case F_ADD_SEALS:
246 		mutex_enter(&fp->f_lock);
247 
248 		if (mfd->mfd_seals & F_SEAL_SEAL) {
249 		        error = EPERM;
250 			goto leave_add_seals;
251 		}
252 
253 		if (*(int *)data & ~MFD_KNOWN_SEALS) {
254 		        error = EINVAL;
255 			goto leave_add_seals;
256 		}
257 
258 		/*
259 		 * Can only add F_SEAL_WRITE if there are no currently
260 		 * open mmaps.
261 		 *
262 		 * XXX should only disallow if there are no currently
263 		 * open mmaps with PROT_WRITE.
264 		 */
265 		if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
266 		    (*(int *)data & F_SEAL_WRITE) != 0 &&
267 		    mfd->mfd_uobj->uo_refs > 1)
268 		{
269 			error = EBUSY;
270 			goto leave_add_seals;
271 		}
272 
273 		mfd->mfd_seals |= *(int *)data;
274 
275 	leave_add_seals:
276 		mutex_exit(&fp->f_lock);
277 		return error;
278 
279 	case F_GET_SEALS:
280 		mutex_enter(&fp->f_lock);
281 		*(int *)data = mfd->mfd_seals;
282 		mutex_exit(&fp->f_lock);
283 		return 0;
284 
285 	default:
286 		return EINVAL;
287 	}
288 }
289 
290 static int
memfd_stat(file_t * fp,struct stat * st)291 memfd_stat(file_t *fp, struct stat *st)
292 {
293 	struct memfd *mfd = fp->f_memfd;
294 
295 	mutex_enter(&fp->f_lock);
296 
297 	memset(st, 0, sizeof(*st));
298 	st->st_uid = kauth_cred_geteuid(fp->f_cred);
299 	st->st_gid = kauth_cred_getegid(fp->f_cred);
300 	st->st_size = mfd->mfd_size;
301 
302 	st->st_mode = S_IREAD;
303 	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
304 		st->st_mode |= S_IWRITE;
305 
306 	st->st_birthtimespec = mfd->mfd_btime;
307 	st->st_ctimespec = mfd->mfd_mtime;
308 	st->st_atimespec = mfd->mfd_atime;
309 	st->st_mtimespec = mfd->mfd_mtime;
310 
311 	mutex_exit(&fp->f_lock);
312 
313 	return 0;
314 }
315 
316 static int
memfd_close(file_t * fp)317 memfd_close(file_t *fp)
318 {
319 	struct memfd *mfd = fp->f_memfd;
320 
321 	uao_detach(mfd->mfd_uobj);
322 
323 	kmem_free(mfd, sizeof(*mfd));
324 	fp->f_memfd = NULL;
325 
326 	return 0;
327 }
328 
329 static int
memfd_mmap(file_t * fp,off_t * offp,size_t size,int prot,int * flagsp,int * advicep,struct uvm_object ** uobjp,int * maxprotp)330 memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
331     int *advicep, struct uvm_object **uobjp, int *maxprotp)
332 {
333 	struct memfd *mfd = fp->f_memfd;
334 	int error = 0;
335 
336 	/* uvm_mmap guarantees page-aligned offset and size.  */
337 	KASSERT(*offp == round_page(*offp));
338 	KASSERT(size == round_page(size));
339 	KASSERT(size > 0);
340 
341 	mutex_enter(&fp->f_lock);
342 
343 	if (*offp < 0) {
344 		error = EINVAL;
345 		goto leave;
346 	}
347 	if (*offp + size > mfd->mfd_size) {
348 		error = EINVAL;
349 		goto leave;
350 	}
351 
352 	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
353 	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) {
354 		error = EPERM;
355 		goto leave;
356 	}
357 
358 	uao_reference(fp->f_memfd->mfd_uobj);
359 	*uobjp = fp->f_memfd->mfd_uobj;
360 
361 	*maxprotp = prot;
362 	*advicep = UVM_ADV_RANDOM;
363 
364 leave:
365 	mutex_exit(&fp->f_lock);
366 
367 	return error;
368 }
369 
370 static int
memfd_seek(file_t * fp,off_t delta,int whence,off_t * newoffp,int flags)371 memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
372     int flags)
373 {
374 	off_t newoff;
375 	int error = 0;
376 
377 	mutex_enter(&fp->f_lock);
378 
379 	switch (whence) {
380 	case SEEK_CUR:
381 		newoff = fp->f_offset + delta;
382 		break;
383 
384 	case SEEK_END:
385 		newoff = fp->f_memfd->mfd_size + delta;
386 		break;
387 
388 	case SEEK_SET:
389 		newoff = delta;
390 		break;
391 
392 	default:
393 		error = EINVAL;
394 		goto leave;
395 	}
396 
397 	if (newoffp)
398 		*newoffp = newoff;
399 	if (flags & FOF_UPDATE_OFFSET)
400 		fp->f_offset = newoff;
401 
402 leave:
403 	mutex_exit(&fp->f_lock);
404 
405 	return error;
406 }
407 
408 static int
memfd_truncate_locked(file_t * fp,off_t length)409 memfd_truncate_locked(file_t *fp, off_t length)
410 {
411 	struct memfd *mfd = fp->f_memfd;
412 	voff_t start, end;
413 	int error = 0;
414 
415 	KASSERT(mutex_owned(&fp->f_lock));
416 
417 	if (length < 0)
418 		return EINVAL;
419 	if (length == mfd->mfd_size)
420 		return 0;
421 
422 	if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
423 		return EPERM;
424 	if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
425 		return EPERM;
426 
427 	if (length > mfd->mfd_size)
428 		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
429 		    length - mfd->mfd_size, 0);
430 	else {
431 		/* length < mfd->mfd_size, so try to get rid of excess pages */
432 		start = round_page(length);
433 		end = round_page(mfd->mfd_size);
434 
435 		if (start < end) { /* we actually have pages to remove */
436 			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
437 			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
438 			    start, end, PGO_FREE);
439 			/* pgo_put drops vmobjlock */
440 		}
441 	}
442 
443 	getnanotime(&mfd->mfd_mtime);
444 	mfd->mfd_size = length;
445 
446 	return error;
447 }
448 
449 static int
memfd_truncate(file_t * fp,off_t length)450 memfd_truncate(file_t *fp, off_t length)
451 {
452 	int error;
453 
454 	mutex_enter(&fp->f_lock);
455 	error = memfd_truncate_locked(fp, length);
456 	mutex_exit(&fp->f_lock);
457 	return error;
458 }
459