xref: /linux/kernel/acct.c (revision 54a4d58a)
1 /*
2  *  linux/kernel/acct.c
3  *
4  *  BSD Process Accounting for Linux
5  *
6  *  Author: Marco van Wieringen <mvw@planets.elm.net>
7  *
8  *  Some code based on ideas and code from:
9  *  Thomas K. Dyas <tdyas@eden.rutgers.edu>
10  *
11  *  This file implements BSD-style process accounting. Whenever any
12  *  process exits, an accounting record of type "struct acct" is
13  *  written to the file specified with the acct() system call. It is
14  *  up to user-level programs to do useful things with the accounting
15  *  log. The kernel just provides the raw accounting information.
16  *
17  * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.
18  *
19  *  Plugged two leaks. 1) It didn't return acct_file into the free_filps if
20  *  the file happened to be read-only. 2) If the accounting was suspended
21  *  due to the lack of space it happily allowed to reopen it and completely
22  *  lost the old acct_file. 3/10/98, Al Viro.
23  *
24  *  Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
25  *  XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
26  *
27  *  Fixed a nasty interaction with sys_umount(). If the accounting
28  *  was suspended we failed to stop it on umount(). Messy.
29  *  Another one: remount to readonly didn't stop accounting.
30  *	Question: what should we do if we have CAP_SYS_ADMIN but not
31  *  CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY
32  *  unless we are messing with the root. In that case we are getting a
33  *  real mess with do_remount_sb(). 9/11/98, AV.
34  *
35  *  Fixed a bunch of races (and pair of leaks). Probably not the best way,
36  *  but this one obviously doesn't introduce deadlocks. Later. BTW, found
37  *  one race (and leak) in BSD implementation.
38  *  OK, that's better. ANOTHER race and leak in BSD variant. There always
39  *  is one more bug... 10/11/98, AV.
40  *
41  *	Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
42  * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
43  * a struct file opened for write. Fixed. 2/6/2000, AV.
44  */
45 
46 #include <linux/mm.h>
47 #include <linux/slab.h>
48 #include <linux/acct.h>
49 #include <linux/capability.h>
50 #include <linux/file.h>
51 #include <linux/tty.h>
52 #include <linux/security.h>
53 #include <linux/vfs.h>
54 #include <linux/jiffies.h>
55 #include <linux/times.h>
56 #include <linux/syscalls.h>
57 #include <linux/mount.h>
58 #include <linux/uaccess.h>
59 #include <asm/div64.h>
60 #include <linux/blkdev.h> /* sector_div */
61 #include <linux/pid_namespace.h>
62 
/*
 * Tunables for the free-space watchdog: the free-space percentages at
 * which process accounting is resumed and suspended, and the delay
 * between two consecutive checks.
 * Turned into sysctl-controllable parameters. AV, 12/11/98
 */

int acct_parm[3] = { 4, 2, 30 };

/* Resume once available blocks are at or above this % of total blocks. */
#define RESUME		(acct_parm[0])
/* Suspend once available blocks are at or below this % of total blocks. */
#define SUSPEND		(acct_parm[1])
/* Seconds between two free-space checks. */
#define ACCT_TIMEOUT	(acct_parm[2])
74 
75 /*
76  * External references and all of the globals.
77  */
78 static void do_acct_process(struct bsd_acct_struct *acct);
79 
80 struct bsd_acct_struct {
81 	long			count;
82 	struct mutex		lock;
83 	int			active;
84 	unsigned long		needcheck;
85 	struct file		*file;
86 	struct pid_namespace	*ns;
87 	struct list_head	list;
88 };
89 
90 static DEFINE_SPINLOCK(acct_lock);
91 static LIST_HEAD(acct_list);
92 
93 /*
94  * Check the amount of free space and suspend/resume accordingly.
95  */
96 static int check_free_space(struct bsd_acct_struct *acct)
97 {
98 	struct kstatfs sbuf;
99 
100 	if (time_is_before_jiffies(acct->needcheck))
101 		goto out;
102 
103 	/* May block */
104 	if (vfs_statfs(&acct->file->f_path, &sbuf))
105 		goto out;
106 
107 	if (acct->active) {
108 		u64 suspend = sbuf.f_blocks * SUSPEND;
109 		do_div(suspend, 100);
110 		if (sbuf.f_bavail <= suspend) {
111 			acct->active = 0;
112 			printk(KERN_INFO "Process accounting paused\n");
113 		}
114 	} else {
115 		u64 resume = sbuf.f_blocks * RESUME;
116 		do_div(resume, 100);
117 		if (sbuf.f_bavail >= resume) {
118 			acct->active = 1;
119 			printk(KERN_INFO "Process accounting resumed\n");
120 		}
121 	}
122 
123 	acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
124 out:
125 	return acct->active;
126 }
127 
128 static void acct_put(struct bsd_acct_struct *p)
129 {
130 	spin_lock(&acct_lock);
131 	if (!--p->count)
132 		kfree(p);
133 	spin_unlock(&acct_lock);
134 }
135 
136 static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p)
137 {
138 	struct bsd_acct_struct *res;
139 	spin_lock(&acct_lock);
140 again:
141 	res = *p;
142 	if (res)
143 		res->count++;
144 	spin_unlock(&acct_lock);
145 	if (res) {
146 		mutex_lock(&res->lock);
147 		if (!res->ns) {
148 			mutex_unlock(&res->lock);
149 			spin_lock(&acct_lock);
150 			if (!--res->count)
151 				kfree(res);
152 			goto again;
153 		}
154 	}
155 	return res;
156 }
157 
158 static void acct_kill(struct bsd_acct_struct *acct,
159 		      struct bsd_acct_struct *new)
160 {
161 	if (acct) {
162 		struct file *file = acct->file;
163 		struct pid_namespace *ns = acct->ns;
164 		spin_lock(&acct_lock);
165 		list_del(&acct->list);
166 		mnt_unpin(file->f_path.mnt);
167 		spin_unlock(&acct_lock);
168 		do_acct_process(acct);
169 		filp_close(file, NULL);
170 		spin_lock(&acct_lock);
171 		ns->bacct = new;
172 		if (new) {
173 			mnt_pin(new->file->f_path.mnt);
174 			list_add(&new->list, &acct_list);
175 		}
176 		acct->ns = NULL;
177 		mutex_unlock(&acct->lock);
178 		if (!(acct->count -= 2))
179 			kfree(acct);
180 		spin_unlock(&acct_lock);
181 	}
182 }
183 
184 static int acct_on(struct filename *pathname)
185 {
186 	struct file *file;
187 	struct vfsmount *mnt;
188 	struct pid_namespace *ns = task_active_pid_ns(current);
189 	struct bsd_acct_struct *acct, *old;
190 
191 	acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
192 	if (!acct)
193 		return -ENOMEM;
194 
195 	/* Difference from BSD - they don't do O_APPEND */
196 	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
197 	if (IS_ERR(file)) {
198 		kfree(acct);
199 		return PTR_ERR(file);
200 	}
201 
202 	if (!S_ISREG(file_inode(file)->i_mode)) {
203 		kfree(acct);
204 		filp_close(file, NULL);
205 		return -EACCES;
206 	}
207 
208 	if (!file->f_op->write) {
209 		kfree(acct);
210 		filp_close(file, NULL);
211 		return -EIO;
212 	}
213 
214 	acct->count = 1;
215 	acct->file = file;
216 	acct->needcheck = jiffies;
217 	acct->ns = ns;
218 	mutex_init(&acct->lock);
219 	mnt = file->f_path.mnt;
220 
221 	old = acct_get(&ns->bacct);
222 	if (old) {
223 		acct_kill(old, acct);
224 	} else {
225 		spin_lock(&acct_lock);
226 		ns->bacct = acct;
227 		mnt_pin(mnt);
228 		list_add(&acct->list, &acct_list);
229 		spin_unlock(&acct_lock);
230 	}
231 	mntput(mnt); /* it's pinned, now give up active reference */
232 	return 0;
233 }
234 
235 static DEFINE_MUTEX(acct_on_mutex);
236 
237 /**
238  * sys_acct - enable/disable process accounting
239  * @name: file name for accounting records or NULL to shutdown accounting
240  *
241  * Returns 0 for success or negative errno values for failure.
242  *
243  * sys_acct() is the only system call needed to implement process
244  * accounting. It takes the name of the file where accounting records
245  * should be written. If the filename is NULL, accounting will be
246  * shutdown.
247  */
248 SYSCALL_DEFINE1(acct, const char __user *, name)
249 {
250 	int error = 0;
251 
252 	if (!capable(CAP_SYS_PACCT))
253 		return -EPERM;
254 
255 	if (name) {
256 		struct filename *tmp = getname(name);
257 		if (IS_ERR(tmp))
258 			return PTR_ERR(tmp);
259 		mutex_lock(&acct_on_mutex);
260 		error = acct_on(tmp);
261 		mutex_unlock(&acct_on_mutex);
262 		putname(tmp);
263 	} else {
264 		acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL);
265 	}
266 
267 	return error;
268 }
269 
270 /**
271  * acct_auto_close - turn off a filesystem's accounting if it is on
272  * @m: vfsmount being shut down
273  *
274  * If the accounting is turned on for a file in the subtree pointed to
275  * to by m, turn accounting off.  Done when m is about to die.
276  */
277 void acct_auto_close_mnt(struct vfsmount *m)
278 {
279 	struct bsd_acct_struct *acct;
280 
281 	spin_lock(&acct_lock);
282 restart:
283 	list_for_each_entry(acct, &acct_list, list)
284 		if (acct->file->f_path.mnt == m) {
285 			acct->count++;
286 			spin_unlock(&acct_lock);
287 			mutex_lock(&acct->lock);
288 			if (!acct->ns) {
289 				mutex_unlock(&acct->lock);
290 				spin_lock(&acct_lock);
291 				if (!--acct->count)
292 					kfree(acct);
293 				goto restart;
294 			}
295 			acct_kill(acct, NULL);
296 			spin_lock(&acct_lock);
297 			goto restart;
298 		}
299 	spin_unlock(&acct_lock);
300 }
301 
302 /**
303  * acct_auto_close - turn off a filesystem's accounting if it is on
304  * @sb: super block for the filesystem
305  *
306  * If the accounting is turned on for a file in the filesystem pointed
307  * to by sb, turn accounting off.
308  */
309 void acct_auto_close(struct super_block *sb)
310 {
311 	struct bsd_acct_struct *acct;
312 
313 	spin_lock(&acct_lock);
314 restart:
315 	list_for_each_entry(acct, &acct_list, list)
316 		if (acct->file->f_path.dentry->d_sb == sb) {
317 			acct->count++;
318 			spin_unlock(&acct_lock);
319 			mutex_lock(&acct->lock);
320 			if (!acct->ns) {
321 				mutex_unlock(&acct->lock);
322 				spin_lock(&acct_lock);
323 				if (!--acct->count)
324 					kfree(acct);
325 				goto restart;
326 			}
327 			acct_kill(acct, NULL);
328 			spin_lock(&acct_lock);
329 			goto restart;
330 		}
331 	spin_unlock(&acct_lock);
332 }
333 
334 void acct_exit_ns(struct pid_namespace *ns)
335 {
336 	acct_kill(acct_get(&ns->bacct), NULL);
337 }
338 
339 /*
340  *  encode an unsigned long into a comp_t
341  *
342  *  This routine has been adopted from the encode_comp_t() function in
343  *  the kern_acct.c file of the FreeBSD operating system. The encoding
344  *  is a 13-bit fraction with a 3-bit (base 8) exponent.
345  */
346 
347 #define	MANTSIZE	13			/* 13 bit mantissa. */
348 #define	EXPSIZE		3			/* Base 8 (3 bit) exponent. */
349 #define	MAXFRACT	((1 << MANTSIZE) - 1)	/* Maximum fractional value. */
350 
351 static comp_t encode_comp_t(unsigned long value)
352 {
353 	int exp, rnd;
354 
355 	exp = rnd = 0;
356 	while (value > MAXFRACT) {
357 		rnd = value & (1 << (EXPSIZE - 1));	/* Round up? */
358 		value >>= EXPSIZE;	/* Base 8 exponent == 3 bit shift. */
359 		exp++;
360 	}
361 
362 	/*
363 	 * If we need to round up, do it (and handle overflow correctly).
364 	 */
365 	if (rnd && (++value > MAXFRACT)) {
366 		value >>= EXPSIZE;
367 		exp++;
368 	}
369 
370 	/*
371 	 * Clean it up and polish it off.
372 	 */
373 	exp <<= MANTSIZE;		/* Shift the exponent into place */
374 	exp += value;			/* and add on the mantissa. */
375 	return exp;
376 }
377 
378 #if ACCT_VERSION==1 || ACCT_VERSION==2
379 /*
380  * encode an u64 into a comp2_t (24 bits)
381  *
382  * Format: 5 bit base 2 exponent, 20 bits mantissa.
383  * The leading bit of the mantissa is not stored, but implied for
384  * non-zero exponents.
385  * Largest encodable value is 50 bits.
386  */
387 
388 #define MANTSIZE2       20                      /* 20 bit mantissa. */
389 #define EXPSIZE2        5                       /* 5 bit base 2 exponent. */
390 #define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
391 #define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */
392 
393 static comp2_t encode_comp2_t(u64 value)
394 {
395 	int exp, rnd;
396 
397 	exp = (value > (MAXFRACT2>>1));
398 	rnd = 0;
399 	while (value > MAXFRACT2) {
400 		rnd = value & 1;
401 		value >>= 1;
402 		exp++;
403 	}
404 
405 	/*
406 	 * If we need to round up, do it (and handle overflow correctly).
407 	 */
408 	if (rnd && (++value > MAXFRACT2)) {
409 		value >>= 1;
410 		exp++;
411 	}
412 
413 	if (exp > MAXEXP2) {
414 		/* Overflow. Return largest representable number instead. */
415 		return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1;
416 	} else {
417 		return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
418 	}
419 }
420 #endif
421 
422 #if ACCT_VERSION==3
423 /*
424  * encode an u64 into a 32 bit IEEE float
425  */
426 static u32 encode_float(u64 value)
427 {
428 	unsigned exp = 190;
429 	unsigned u;
430 
431 	if (value==0) return 0;
432 	while ((s64)value > 0){
433 		value <<= 1;
434 		exp--;
435 	}
436 	u = (u32)(value >> 40) & 0x7fffffu;
437 	return u | (exp << 23);
438 }
439 #endif
440 
441 /*
442  *  Write an accounting entry for an exiting process
443  *
444  *  The acct_process() call is the workhorse of the process
445  *  accounting system. The struct acct is built here and then written
446  *  into the accounting file. This function should only be called from
447  *  do_exit() or when switching to a different output file.
448  */
449 
450 static void fill_ac(acct_t *ac)
451 {
452 	struct pacct_struct *pacct = &current->signal->pacct;
453 	u64 elapsed, run_time;
454 	struct tty_struct *tty;
455 
456 	/*
457 	 * Fill the accounting struct with the needed info as recorded
458 	 * by the different kernel functions.
459 	 */
460 	memset(ac, 0, sizeof(acct_t));
461 
462 	ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
463 	strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
464 
465 	/* calculate run_time in nsec*/
466 	run_time = ktime_get_ns();
467 	run_time -= current->group_leader->start_time;
468 	/* convert nsec -> AHZ */
469 	elapsed = nsec_to_AHZ(run_time);
470 #if ACCT_VERSION==3
471 	ac->ac_etime = encode_float(elapsed);
472 #else
473 	ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
474 	                       (unsigned long) elapsed : (unsigned long) -1l);
475 #endif
476 #if ACCT_VERSION==1 || ACCT_VERSION==2
477 	{
478 		/* new enlarged etime field */
479 		comp2_t etime = encode_comp2_t(elapsed);
480 		ac->ac_etime_hi = etime >> 16;
481 		ac->ac_etime_lo = (u16) etime;
482 	}
483 #endif
484 	do_div(elapsed, AHZ);
485 	ac->ac_btime = get_seconds() - elapsed;
486 #if ACCT_VERSION==2
487 	ac->ac_ahz = AHZ;
488 #endif
489 
490 	spin_lock_irq(&current->sighand->siglock);
491 	tty = current->signal->tty;	/* Safe as we hold the siglock */
492 	ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
493 	ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
494 	ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
495 	ac->ac_flag = pacct->ac_flag;
496 	ac->ac_mem = encode_comp_t(pacct->ac_mem);
497 	ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
498 	ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
499 	ac->ac_exitcode = pacct->ac_exitcode;
500 	spin_unlock_irq(&current->sighand->siglock);
501 }
502 /*
503  *  do_acct_process does all actual work. Caller holds the reference to file.
504  */
505 static void do_acct_process(struct bsd_acct_struct *acct)
506 {
507 	acct_t ac;
508 	unsigned long flim;
509 	const struct cred *orig_cred;
510 	struct pid_namespace *ns = acct->ns;
511 	struct file *file = acct->file;
512 
513 	/*
514 	 * Accounting records are not subject to resource limits.
515 	 */
516 	flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
517 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
518 	/* Perform file operations on behalf of whoever enabled accounting */
519 	orig_cred = override_creds(file->f_cred);
520 
521 	/*
522 	 * First check to see if there is enough free_space to continue
523 	 * the process accounting system.
524 	 */
525 	if (!check_free_space(acct))
526 		goto out;
527 
528 	fill_ac(&ac);
529 	/* we really need to bite the bullet and change layout */
530 	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
531 	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
532 #if ACCT_VERSION==1 || ACCT_VERSION==2
533 	/* backward-compatible 16 bit fields */
534 	ac.ac_uid16 = ac.ac_uid;
535 	ac.ac_gid16 = ac.ac_gid;
536 #endif
537 #if ACCT_VERSION==3
538 	ac.ac_pid = task_tgid_nr_ns(current, ns);
539 	rcu_read_lock();
540 	ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
541 	rcu_read_unlock();
542 #endif
543 	/*
544 	 * Get freeze protection. If the fs is frozen, just skip the write
545 	 * as we could deadlock the system otherwise.
546 	 */
547 	if (file_start_write_trylock(file)) {
548 		/* it's been opened O_APPEND, so position is irrelevant */
549 		loff_t pos = 0;
550 		__kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
551 		file_end_write(file);
552 	}
553 out:
554 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
555 	revert_creds(orig_cred);
556 }
557 
558 /**
559  * acct_collect - collect accounting information into pacct_struct
560  * @exitcode: task exit code
561  * @group_dead: not 0, if this thread is the last one in the process.
562  */
563 void acct_collect(long exitcode, int group_dead)
564 {
565 	struct pacct_struct *pacct = &current->signal->pacct;
566 	cputime_t utime, stime;
567 	unsigned long vsize = 0;
568 
569 	if (group_dead && current->mm) {
570 		struct vm_area_struct *vma;
571 		down_read(&current->mm->mmap_sem);
572 		vma = current->mm->mmap;
573 		while (vma) {
574 			vsize += vma->vm_end - vma->vm_start;
575 			vma = vma->vm_next;
576 		}
577 		up_read(&current->mm->mmap_sem);
578 	}
579 
580 	spin_lock_irq(&current->sighand->siglock);
581 	if (group_dead)
582 		pacct->ac_mem = vsize / 1024;
583 	if (thread_group_leader(current)) {
584 		pacct->ac_exitcode = exitcode;
585 		if (current->flags & PF_FORKNOEXEC)
586 			pacct->ac_flag |= AFORK;
587 	}
588 	if (current->flags & PF_SUPERPRIV)
589 		pacct->ac_flag |= ASU;
590 	if (current->flags & PF_DUMPCORE)
591 		pacct->ac_flag |= ACORE;
592 	if (current->flags & PF_SIGNALED)
593 		pacct->ac_flag |= AXSIG;
594 	task_cputime(current, &utime, &stime);
595 	pacct->ac_utime += utime;
596 	pacct->ac_stime += stime;
597 	pacct->ac_minflt += current->min_flt;
598 	pacct->ac_majflt += current->maj_flt;
599 	spin_unlock_irq(&current->sighand->siglock);
600 }
601 
602 static void slow_acct_process(struct pid_namespace *ns)
603 {
604 	for ( ; ns; ns = ns->parent) {
605 		struct bsd_acct_struct *acct = acct_get(&ns->bacct);
606 		if (acct) {
607 			do_acct_process(acct);
608 			mutex_unlock(&acct->lock);
609 			acct_put(acct);
610 		}
611 	}
612 }
613 
614 /**
615  * acct_process
616  *
617  * handles process accounting for an exiting task
618  */
619 void acct_process(void)
620 {
621 	struct pid_namespace *ns;
622 
623 	/*
624 	 * This loop is safe lockless, since current is still
625 	 * alive and holds its namespace, which in turn holds
626 	 * its parent.
627 	 */
628 	for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
629 		if (ns->bacct)
630 			break;
631 	}
632 	if (unlikely(ns))
633 		slow_acct_process(ns);
634 }
635