xref: /illumos-gate/usr/src/uts/common/os/exacct.c (revision 48847494)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/exacct.h>
27 #include <sys/exacct_catalog.h>
28 #include <sys/disp.h>
29 #include <sys/task.h>
30 #include <sys/proc.h>
31 #include <sys/cmn_err.h>
32 #include <sys/kmem.h>
33 #include <sys/project.h>
34 #include <sys/systm.h>
35 #include <sys/vnode.h>
36 #include <sys/file.h>
37 #include <sys/acctctl.h>
38 #include <sys/time.h>
39 #include <sys/utsname.h>
40 #include <sys/session.h>
41 #include <sys/sysmacros.h>
42 #include <sys/bitmap.h>
43 #include <sys/msacct.h>
44 #include <sys/mac.h>
45 
46 /*
47  * exacct usage and recording routines
48  *
49  * wracct(2), getacct(2), and the records written at process or task
50  * termination are constructed using the exacct_assemble_[task,proc]_usage()
51  * functions, which take a callback that takes the appropriate action on
52  * the packed exacct record for the task or process.  For the process-related
53  * actions, we partition the routines such that the data collecting component
54  * can be performed while holding p_lock, and all sleeping or blocking
55  * operations can be performed without acquiring p_lock.
56  *
57  * putacct(2), which allows an application to construct a customized record
58  * associated with an existing process or task, has its own entry points:
59  * exacct_tag_task() and exacct_tag_proc().
60  */
61 
62 taskq_t *exacct_queue;
63 kmem_cache_t *exacct_object_cache;
64 
65 zone_key_t exacct_zone_key = ZONE_KEY_UNINITIALIZED;
66 
67 static const uint32_t exacct_version = EXACCT_VERSION;
68 static const char exacct_header[] = "exacct";
69 static const char exacct_creator[] = "SunOS";
70 
71 ea_object_t *
72 ea_alloc_item(ea_catalog_t catalog, void *buf, size_t bufsz)
73 {
74 	ea_object_t *item;
75 
76 	item = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
77 	bzero(item, sizeof (ea_object_t));
78 	(void) ea_set_item(item, catalog, buf, bufsz);
79 	return (item);
80 }
81 
82 ea_object_t *
83 ea_alloc_group(ea_catalog_t catalog)
84 {
85 	ea_object_t *group;
86 
87 	group = kmem_cache_alloc(exacct_object_cache, KM_SLEEP);
88 	bzero(group, sizeof (ea_object_t));
89 	(void) ea_set_group(group, catalog);
90 	return (group);
91 }
92 
93 ea_object_t *
94 ea_attach_item(ea_object_t *grp, void *buf, size_t bufsz, ea_catalog_t catalog)
95 {
96 	ea_object_t *item;
97 
98 	item = ea_alloc_item(catalog, buf, bufsz);
99 	(void) ea_attach_to_group(grp, item);
100 	return (item);
101 }
102 
103 /*
104  * exacct_add_task_mstate() and exacct_sub_task_mstate() add and subtract
105  * microstate accounting data and resource usage counters from one task_usage_t
106  * from those supplied in another. These functions do not operate on *all*
107  * members of a task_usage_t: for some (e.g. tu_anctaskid) it would not make
108  * sense.
109  */
110 static void
111 exacct_add_task_mstate(task_usage_t *tu, task_usage_t *delta)
112 {
113 	tu->tu_utime  += delta->tu_utime;
114 	tu->tu_stime  += delta->tu_stime;
115 	tu->tu_minflt += delta->tu_minflt;
116 	tu->tu_majflt += delta->tu_majflt;
117 	tu->tu_sndmsg += delta->tu_sndmsg;
118 	tu->tu_rcvmsg += delta->tu_rcvmsg;
119 	tu->tu_ioch   += delta->tu_ioch;
120 	tu->tu_iblk   += delta->tu_iblk;
121 	tu->tu_oblk   += delta->tu_oblk;
122 	tu->tu_vcsw   += delta->tu_vcsw;
123 	tu->tu_icsw   += delta->tu_icsw;
124 	tu->tu_nsig   += delta->tu_nsig;
125 	tu->tu_nswp   += delta->tu_nswp;
126 	tu->tu_nscl   += delta->tu_nscl;
127 }
128 
129 /*
130  * See the comments for exacct_add_task_mstate(), above.
131  */
132 static void
133 exacct_sub_task_mstate(task_usage_t *tu, task_usage_t *delta)
134 {
135 	tu->tu_utime  -= delta->tu_utime;
136 	tu->tu_stime  -= delta->tu_stime;
137 	tu->tu_minflt -= delta->tu_minflt;
138 	tu->tu_majflt -= delta->tu_majflt;
139 	tu->tu_sndmsg -= delta->tu_sndmsg;
140 	tu->tu_rcvmsg -= delta->tu_rcvmsg;
141 	tu->tu_ioch   -= delta->tu_ioch;
142 	tu->tu_iblk   -= delta->tu_iblk;
143 	tu->tu_oblk   -= delta->tu_oblk;
144 	tu->tu_vcsw   -= delta->tu_vcsw;
145 	tu->tu_icsw   -= delta->tu_icsw;
146 	tu->tu_nsig   -= delta->tu_nsig;
147 	tu->tu_nswp   -= delta->tu_nswp;
148 	tu->tu_nscl   -= delta->tu_nscl;
149 }
150 
151 /*
152  * Wrapper for vn_rdwr() used by exacct_vn_write() and exacct_write_header()
153  * to write to the accounting file without corrupting it in case of an I/O or
154  * filesystem error.
155  */
156 static int
157 exacct_vn_write_impl(ac_info_t *info, void *buf, ssize_t bufsize)
158 {
159 	int error;
160 	ssize_t resid;
161 	struct vattr va;
162 
163 	ASSERT(info != NULL);
164 	ASSERT(info->ac_vnode != NULL);
165 	ASSERT(MUTEX_HELD(&info->ac_lock));
166 
167 	/*
168 	 * Save the size. If vn_rdwr fails, reset the size to avoid corrupting
169 	 * the present accounting file.
170 	 */
171 	va.va_mask = AT_SIZE;
172 	error = VOP_GETATTR(info->ac_vnode, &va, 0, kcred, NULL);
173 	if (error == 0) {
174 		error = vn_rdwr(UIO_WRITE, info->ac_vnode, (caddr_t)buf,
175 		    bufsize, 0LL, UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFFSET_T,
176 		    kcred, &resid);
177 		if (error) {
178 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
179 		} else if (resid != 0) {
180 			(void) VOP_SETATTR(info->ac_vnode, &va, 0, kcred, NULL);
181 			error = ENOSPC;
182 		}
183 	}
184 	return (error);
185 }
186 
187 /*
188  * exacct_vn_write() safely writes to an accounting file.  acctctl() prevents
189  * the two accounting vnodes from being equal, and the appropriate ac_lock is
190  * held across the call, so we're single threaded through this code for each
191  * file.
192  */
193 static int
194 exacct_vn_write(ac_info_t *info, void *buf, ssize_t bufsize)
195 {
196 	int error;
197 
198 	if (info == NULL)
199 		return (0);
200 
201 	mutex_enter(&info->ac_lock);
202 
203 	/*
204 	 * Don't do anything unless accounting file is set.
205 	 */
206 	if (info->ac_vnode == NULL) {
207 		mutex_exit(&info->ac_lock);
208 		return (0);
209 	}
210 	error = exacct_vn_write_impl(info, buf, bufsize);
211 	mutex_exit(&info->ac_lock);
212 
213 	return (error);
214 }
215 
216 /*
217  * void *exacct_create_header(size_t *)
218  *
219  * Overview
220  *   exacct_create_header() constructs an exacct file header identifying the
221  *   accounting file as the output of the kernel.  exacct_create_header() and
222  *   the static write_header() and verify_header() routines in libexacct must
223  *   remain synchronized.
224  *
225  * Return values
226  *   A pointer to a packed exacct buffer containing the appropriate header is
227  *   returned; the size of the buffer is placed in the location indicated by
228  *   sizep.
229  *
230  * Caller's context
231  *   Suitable for KM_SLEEP allocations.
232  */
233 void *
234 exacct_create_header(size_t *sizep)
235 {
236 	ea_object_t *hdr_grp;
237 	uint32_t bskip;
238 	void *buf;
239 	size_t bufsize;
240 
241 	hdr_grp = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_HEADER);
242 	(void) ea_attach_item(hdr_grp, (void *)&exacct_version, 0,
243 	    EXT_UINT32 | EXC_DEFAULT | EXD_VERSION);
244 	(void) ea_attach_item(hdr_grp, (void *)exacct_header, 0,
245 	    EXT_STRING | EXC_DEFAULT | EXD_FILETYPE);
246 	(void) ea_attach_item(hdr_grp, (void *)exacct_creator, 0,
247 	    EXT_STRING | EXC_DEFAULT | EXD_CREATOR);
248 	(void) ea_attach_item(hdr_grp, uts_nodename(), 0,
249 	    EXT_STRING | EXC_DEFAULT | EXD_HOSTNAME);
250 
251 	bufsize = ea_pack_object(hdr_grp, NULL, 0);
252 	buf = kmem_alloc(bufsize, KM_SLEEP);
253 	(void) ea_pack_object(hdr_grp, buf, bufsize);
254 	ea_free_object(hdr_grp, EUP_ALLOC);
255 
256 	/*
257 	 * To prevent reading the header when reading the file backwards,
258 	 * set the large backskip of the header group to 0 (last 4 bytes).
259 	 */
260 	bskip = 0;
261 	exacct_order32(&bskip);
262 	bcopy(&bskip, (char *)buf + bufsize - sizeof (bskip),
263 	    sizeof (bskip));
264 
265 	*sizep = bufsize;
266 	return (buf);
267 }
268 
269 /*
270  * int exacct_write_header(ac_info_t *, void *, size_t)
271  *
272  * Overview
273  *   exacct_write_header() writes the given header buffer to the indicated
274  *   vnode.
275  *
276  * Return values
277  *   The result of the write operation is returned.
278  *
279  * Caller's context
280  *   Caller must hold the ac_lock of the appropriate accounting file
281  *   information block (ac_info_t).
282  */
283 int
284 exacct_write_header(ac_info_t *info, void *hdr, size_t hdrsize)
285 {
286 	if (info != NULL && info->ac_vnode != NULL)
287 		return (exacct_vn_write_impl(info, hdr, hdrsize));
288 
289 	return (0);
290 }
291 
292 static void
293 exacct_get_interval_task_usage(task_t *tk, task_usage_t *tu,
294     task_usage_t **tu_buf)
295 {
296 	task_usage_t *oldtu, *newtu;
297 	task_usage_t **prevusage;
298 
299 	ASSERT(MUTEX_HELD(&tk->tk_usage_lock));
300 	if (getzoneid() != GLOBAL_ZONEID) {
301 		prevusage = &tk->tk_zoneusage;
302 	} else {
303 		prevusage = &tk->tk_prevusage;
304 	}
305 	if ((oldtu = *prevusage) != NULL) {
306 		/*
307 		 * In case we have any accounting information
308 		 * saved from the previous interval record.
309 		 */
310 		newtu = *tu_buf;
311 		bcopy(tu, newtu, sizeof (task_usage_t));
312 		tu->tu_minflt	-= oldtu->tu_minflt;
313 		tu->tu_majflt	-= oldtu->tu_majflt;
314 		tu->tu_sndmsg	-= oldtu->tu_sndmsg;
315 		tu->tu_rcvmsg	-= oldtu->tu_rcvmsg;
316 		tu->tu_ioch	-= oldtu->tu_ioch;
317 		tu->tu_iblk	-= oldtu->tu_iblk;
318 		tu->tu_oblk	-= oldtu->tu_oblk;
319 		tu->tu_vcsw	-= oldtu->tu_vcsw;
320 		tu->tu_icsw	-= oldtu->tu_icsw;
321 		tu->tu_nsig	-= oldtu->tu_nsig;
322 		tu->tu_nswp	-= oldtu->tu_nswp;
323 		tu->tu_nscl	-= oldtu->tu_nscl;
324 		tu->tu_utime	-= oldtu->tu_utime;
325 		tu->tu_stime	-= oldtu->tu_stime;
326 
327 		tu->tu_startsec = oldtu->tu_finishsec;
328 		tu->tu_startnsec = oldtu->tu_finishnsec;
329 		/*
330 		 * Copy the data from our temporary storage to the task's
331 		 * previous interval usage structure for future reference.
332 		 */
333 		bcopy(newtu, oldtu, sizeof (task_usage_t));
334 	} else {
335 		/*
336 		 * Store current statistics in the task's previous interval
337 		 * usage structure for future references.
338 		 */
339 		*prevusage = *tu_buf;
340 		bcopy(tu, *prevusage, sizeof (task_usage_t));
341 		*tu_buf = NULL;
342 	}
343 }
344 
345 static void
346 exacct_snapshot_task_usage(task_t *tk, task_usage_t *tu)
347 {
348 	timestruc_t ts;
349 	proc_t *p;
350 
351 	ASSERT(MUTEX_HELD(&pidlock));
352 
353 	if ((p = tk->tk_memb_list) == NULL)
354 		return;
355 
356 	/*
357 	 * exacct_snapshot_task_usage() provides an approximate snapshot of the
358 	 * usage of the potentially many members of the task.  Since we don't
359 	 * guarantee exactness, we don't acquire the p_lock of any of the member
360 	 * processes.
361 	 */
362 	do {
363 		mutex_enter(&p->p_lock);
364 		tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
365 		tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
366 		mutex_exit(&p->p_lock);
367 		tu->tu_minflt	+= p->p_ru.minflt;
368 		tu->tu_majflt	+= p->p_ru.majflt;
369 		tu->tu_sndmsg	+= p->p_ru.msgsnd;
370 		tu->tu_rcvmsg	+= p->p_ru.msgrcv;
371 		tu->tu_ioch	+= p->p_ru.ioch;
372 		tu->tu_iblk	+= p->p_ru.inblock;
373 		tu->tu_oblk	+= p->p_ru.oublock;
374 		tu->tu_vcsw	+= p->p_ru.nvcsw;
375 		tu->tu_icsw	+= p->p_ru.nivcsw;
376 		tu->tu_nsig	+= p->p_ru.nsignals;
377 		tu->tu_nswp	+= p->p_ru.nswap;
378 		tu->tu_nscl	+= p->p_ru.sysc;
379 	} while ((p = p->p_tasknext) != tk->tk_memb_list);
380 
381 	/*
382 	 * The resource usage accounted for so far will include that
383 	 * contributed by the task's first process. If this process
384 	 * came from another task, then its accumulated resource usage
385 	 * will include a contribution from work performed there.
386 	 * We must therefore subtract any resource usage that was
387 	 * inherited with the first process.
388 	 */
389 	exacct_sub_task_mstate(tu, tk->tk_inherited);
390 
391 	gethrestime(&ts);
392 	tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
393 	tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
394 }
395 
396 /*
397  * void exacct_update_task_mstate(proc_t *)
398  *
399  * Overview
400  *   exacct_update_task_mstate() updates the task usage; it is intended
401  *   to be called from proc_exit().
402  *
403  * Return values
404  *   None.
405  *
406  * Caller's context
407  *   p_lock must be held at entry.
408  */
409 void
410 exacct_update_task_mstate(proc_t *p)
411 {
412 	task_usage_t *tu;
413 
414 	mutex_enter(&p->p_task->tk_usage_lock);
415 	tu = p->p_task->tk_usage;
416 	tu->tu_utime	+= mstate_aggr_state(p, LMS_USER);
417 	tu->tu_stime	+= mstate_aggr_state(p, LMS_SYSTEM);
418 	tu->tu_minflt	+= p->p_ru.minflt;
419 	tu->tu_majflt	+= p->p_ru.majflt;
420 	tu->tu_sndmsg	+= p->p_ru.msgsnd;
421 	tu->tu_rcvmsg	+= p->p_ru.msgrcv;
422 	tu->tu_ioch	+= p->p_ru.ioch;
423 	tu->tu_iblk	+= p->p_ru.inblock;
424 	tu->tu_oblk	+= p->p_ru.oublock;
425 	tu->tu_vcsw	+= p->p_ru.nvcsw;
426 	tu->tu_icsw	+= p->p_ru.nivcsw;
427 	tu->tu_nsig	+= p->p_ru.nsignals;
428 	tu->tu_nswp	+= p->p_ru.nswap;
429 	tu->tu_nscl	+= p->p_ru.sysc;
430 	mutex_exit(&p->p_task->tk_usage_lock);
431 }
432 
433 static void
434 exacct_calculate_task_usage(task_t *tk, task_usage_t *tu, int flag)
435 {
436 	timestruc_t ts;
437 	task_usage_t *tu_buf;
438 
439 	switch (flag) {
440 	case EW_PARTIAL:
441 		/*
442 		 * For partial records we must report the sum of current
443 		 * accounting statistics with previously accumulated
444 		 * statistics.
445 		 */
446 		mutex_enter(&pidlock);
447 		mutex_enter(&tk->tk_usage_lock);
448 
449 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
450 		exacct_snapshot_task_usage(tk, tu);
451 
452 		mutex_exit(&tk->tk_usage_lock);
453 		mutex_exit(&pidlock);
454 		break;
455 	case EW_INTERVAL:
456 		/*
457 		 * We need to allocate spare task_usage_t buffer before
458 		 * grabbing pidlock because we might need it later in
459 		 * exacct_get_interval_task_usage().
460 		 */
461 		tu_buf = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
462 		mutex_enter(&pidlock);
463 		mutex_enter(&tk->tk_usage_lock);
464 
465 		/*
466 		 * For interval records, we deduct the previous microstate
467 		 * accounting data and cpu usage times from previously saved
468 		 * results and update the previous task usage structure.
469 		 */
470 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
471 		exacct_snapshot_task_usage(tk, tu);
472 		exacct_get_interval_task_usage(tk, tu, &tu_buf);
473 
474 		mutex_exit(&tk->tk_usage_lock);
475 		mutex_exit(&pidlock);
476 
477 		if (tu_buf != NULL)
478 			kmem_free(tu_buf, sizeof (task_usage_t));
479 		break;
480 	case EW_FINAL:
481 		/*
482 		 * For final records, we deduct, from the task's current
483 		 * usage, any usage that was inherited with the arrival
484 		 * of a process from a previous task. We then record
485 		 * the task's finish time.
486 		 */
487 		mutex_enter(&tk->tk_usage_lock);
488 		(void) bcopy(tk->tk_usage, tu, sizeof (task_usage_t));
489 		exacct_sub_task_mstate(tu, tk->tk_inherited);
490 		mutex_exit(&tk->tk_usage_lock);
491 
492 		gethrestime(&ts);
493 		tu->tu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
494 		tu->tu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
495 
496 		break;
497 	}
498 }
499 
500 static int
501 exacct_attach_task_item(task_t *tk, task_usage_t *tu, ea_object_t *record,
502     int res)
503 {
504 	int attached = 1;
505 
506 	switch (res) {
507 	case AC_TASK_TASKID:
508 		(void) ea_attach_item(record, &tk->tk_tkid,
509 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_TASKID);
510 		break;
511 	case AC_TASK_PROJID:
512 		(void) ea_attach_item(record, &tk->tk_proj->kpj_id,
513 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_PROJID);
514 		break;
515 	case AC_TASK_CPU: {
516 			timestruc_t ts;
517 			uint64_t ui;
518 
519 			hrt2ts(tu->tu_stime, &ts);
520 			ui = ts.tv_sec;
521 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
522 			    EXT_UINT64 | EXD_TASK_CPU_SYS_SEC);
523 			ui = ts.tv_nsec;
524 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
525 			    EXT_UINT64 | EXD_TASK_CPU_SYS_NSEC);
526 
527 			hrt2ts(tu->tu_utime, &ts);
528 			ui = ts.tv_sec;
529 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
530 			    EXT_UINT64 | EXD_TASK_CPU_USER_SEC);
531 			ui = ts.tv_nsec;
532 			(void) ea_attach_item(record, &ui, sizeof (uint64_t),
533 			    EXT_UINT64 | EXD_TASK_CPU_USER_NSEC);
534 		}
535 		break;
536 	case AC_TASK_TIME:
537 		(void) ea_attach_item(record, &tu->tu_startsec,
538 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_SEC);
539 		(void) ea_attach_item(record, &tu->tu_startnsec,
540 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_START_NSEC);
541 		(void) ea_attach_item(record, &tu->tu_finishsec,
542 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_SEC);
543 		(void) ea_attach_item(record, &tu->tu_finishnsec,
544 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FINISH_NSEC);
545 		break;
546 	case AC_TASK_HOSTNAME:
547 		(void) ea_attach_item(record, tk->tk_zone->zone_nodename,
548 		    strlen(tk->tk_zone->zone_nodename) + 1,
549 		    EXT_STRING | EXD_TASK_HOSTNAME);
550 			break;
551 	case AC_TASK_MICROSTATE:
552 		(void) ea_attach_item(record, &tu->tu_majflt,
553 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MAJOR);
554 		(void) ea_attach_item(record, &tu->tu_minflt,
555 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_FAULTS_MINOR);
556 		(void) ea_attach_item(record, &tu->tu_sndmsg,
557 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_SND);
558 		(void) ea_attach_item(record, &tu->tu_rcvmsg,
559 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_MESSAGES_RCV);
560 		(void) ea_attach_item(record, &tu->tu_iblk,
561 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_IN);
562 		(void) ea_attach_item(record, &tu->tu_oblk,
563 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_BLOCKS_OUT);
564 		(void) ea_attach_item(record, &tu->tu_ioch,
565 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CHARS_RDWR);
566 		(void) ea_attach_item(record, &tu->tu_vcsw,
567 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_VOL);
568 		(void) ea_attach_item(record, &tu->tu_icsw,
569 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_CONTEXT_INV);
570 		(void) ea_attach_item(record, &tu->tu_nsig,
571 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SIGNALS);
572 		(void) ea_attach_item(record, &tu->tu_nswp,
573 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SWAPS);
574 		(void) ea_attach_item(record, &tu->tu_nscl,
575 		    sizeof (uint64_t), EXT_UINT64 | EXD_TASK_SYSCALLS);
576 		break;
577 	case AC_TASK_ANCTASKID:
578 		(void) ea_attach_item(record, &tu->tu_anctaskid,
579 		    sizeof (uint32_t), EXT_UINT32 | EXD_TASK_ANCTASKID);
580 		break;
581 	case AC_TASK_ZONENAME:
582 		(void) ea_attach_item(record, tk->tk_zone->zone_name,
583 		    strlen(tk->tk_zone->zone_name) + 1,
584 		    EXT_STRING | EXD_TASK_ZONENAME);
585 		break;
586 	default:
587 		attached = 0;
588 	}
589 	return (attached);
590 }
591 
592 static ea_object_t *
593 exacct_assemble_task_record(task_t *tk, task_usage_t *tu, ulong_t *mask,
594     ea_catalog_t record_type)
595 {
596 	int res, count;
597 	ea_object_t *record;
598 
599 	/*
600 	 * Assemble usage values into group.
601 	 */
602 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
603 	for (res = 1, count = 0; res <= AC_TASK_MAX_RES; res++)
604 		if (BT_TEST(mask, res))
605 			count += exacct_attach_task_item(tk, tu, record, res);
606 	if (count == 0) {
607 		ea_free_object(record, EUP_ALLOC);
608 		record = NULL;
609 	}
610 	return (record);
611 }
612 
613 /*
614  * int exacct_assemble_task_usage(task_t *, int (*)(void *, size_t, void *,
615  *	size_t, size_t *), void *, size_t, size_t *, int)
616  *
617  * Overview
618  *   exacct_assemble_task_usage() builds the packed exacct buffer for the
619  *   indicated task, executes the given callback function, and free the packed
620  *   buffer.
621  *
622  * Return values
623  *   Returns 0 on success; otherwise the appropriate error code is returned.
624  *
625  * Caller's context
626  *   Suitable for KM_SLEEP allocations.
627  */
628 int
629 exacct_assemble_task_usage(ac_info_t *ac_task, task_t *tk,
630     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
631     void *ubuf, size_t ubufsize, size_t *actual, int flag)
632 {
633 	ulong_t mask[AC_MASK_SZ];
634 	ea_object_t *task_record;
635 	ea_catalog_t record_type;
636 	task_usage_t *tu;
637 	void *buf;
638 	size_t bufsize;
639 	int ret;
640 
641 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL || flag == EW_INTERVAL);
642 
643 	mutex_enter(&ac_task->ac_lock);
644 	if (ac_task->ac_state == AC_OFF) {
645 		mutex_exit(&ac_task->ac_lock);
646 		return (ENOTACTIVE);
647 	}
648 	bt_copy(ac_task->ac_mask, mask, AC_MASK_SZ);
649 	mutex_exit(&ac_task->ac_lock);
650 
651 	switch (flag) {
652 	case EW_FINAL:
653 		record_type = EXD_GROUP_TASK;
654 		break;
655 	case EW_PARTIAL:
656 		record_type = EXD_GROUP_TASK_PARTIAL;
657 		break;
658 	case EW_INTERVAL:
659 		record_type = EXD_GROUP_TASK_INTERVAL;
660 		break;
661 	}
662 
663 	/*
664 	 * Calculate task usage and assemble it into the task record.
665 	 */
666 	tu = kmem_zalloc(sizeof (task_usage_t), KM_SLEEP);
667 	exacct_calculate_task_usage(tk, tu, flag);
668 	task_record = exacct_assemble_task_record(tk, tu, mask, record_type);
669 	if (task_record == NULL) {
670 		/*
671 		 * The current configuration of the accounting system has
672 		 * resulted in records with no data; accordingly, we don't write
673 		 * these, but we return success.
674 		 */
675 		kmem_free(tu, sizeof (task_usage_t));
676 		return (0);
677 	}
678 
679 	/*
680 	 * Pack object into buffer and run callback on it.
681 	 */
682 	bufsize = ea_pack_object(task_record, NULL, 0);
683 	buf = kmem_alloc(bufsize, KM_SLEEP);
684 	(void) ea_pack_object(task_record, buf, bufsize);
685 	ret = callback(ac_task, ubuf, ubufsize, buf, bufsize, actual);
686 
687 	/*
688 	 * Free all previously allocated structures.
689 	 */
690 	kmem_free(buf, bufsize);
691 	ea_free_object(task_record, EUP_ALLOC);
692 	kmem_free(tu, sizeof (task_usage_t));
693 	return (ret);
694 }
695 
696 /*
697  * void exacct_commit_task(void *)
698  *
699  * Overview
700  *   exacct_commit_task() calculates the final usage for a task, updating the
701  *   task usage if task accounting is active, and writing a task record if task
702  *   accounting is active.  exacct_commit_task() is intended for being called
703  *   from a task queue (taskq_t).
704  *
705  * Return values
706  *   None.
707  *
708  * Caller's context
709  *   Suitable for KM_SLEEP allocations.
710  */
711 
712 void
713 exacct_commit_task(void *arg)
714 {
715 	task_t *tk = (task_t *)arg;
716 	size_t size;
717 	zone_t *zone = tk->tk_zone;
718 	struct exacct_globals *acg;
719 
720 	ASSERT(tk != task0p);
721 	ASSERT(tk->tk_memb_list == NULL);
722 
723 	/*
724 	 * Don't do any extra work if the acctctl module isn't loaded.
725 	 */
726 	if (exacct_zone_key != ZONE_KEY_UNINITIALIZED) {
727 		acg = zone_getspecific(exacct_zone_key, zone);
728 		(void) exacct_assemble_task_usage(&acg->ac_task, tk,
729 		    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
730 		if (tk->tk_zone != global_zone) {
731 			acg = zone_getspecific(exacct_zone_key, global_zone);
732 			(void) exacct_assemble_task_usage(&acg->ac_task, tk,
733 			    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
734 		}
735 	}
736 	/*
737 	 * Release associated project and finalize task.
738 	 */
739 	task_end(tk);
740 }
741 
742 static int
743 exacct_attach_proc_item(proc_usage_t *pu, ea_object_t *record, int res)
744 {
745 	int attached = 1;
746 
747 	switch (res) {
748 	case AC_PROC_PID:
749 		(void) ea_attach_item(record, &pu->pu_pid,
750 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PID);
751 		break;
752 	case AC_PROC_UID:
753 		(void) ea_attach_item(record, &pu->pu_ruid,
754 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_UID);
755 		break;
756 	case AC_PROC_FLAG:
757 		(void) ea_attach_item(record, &pu->pu_acflag,
758 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ACCT_FLAGS);
759 		break;
760 	case AC_PROC_GID:
761 		(void) ea_attach_item(record, &pu->pu_rgid,
762 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_GID);
763 		break;
764 	case AC_PROC_PROJID:
765 		(void) ea_attach_item(record, &pu->pu_projid,
766 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_PROJID);
767 		break;
768 	case AC_PROC_TASKID:
769 		(void) ea_attach_item(record, &pu->pu_taskid,
770 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TASKID);
771 		break;
772 	case AC_PROC_CPU:
773 		(void) ea_attach_item(record, &pu->pu_utimesec,
774 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_SEC);
775 		(void) ea_attach_item(record, &pu->pu_utimensec,
776 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_USER_NSEC);
777 		(void) ea_attach_item(record, &pu->pu_stimesec,
778 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_SEC);
779 		(void) ea_attach_item(record, &pu->pu_stimensec,
780 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CPU_SYS_NSEC);
781 		break;
782 	case AC_PROC_TIME:
783 		(void) ea_attach_item(record, &pu->pu_startsec,
784 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_SEC);
785 		(void) ea_attach_item(record, &pu->pu_startnsec,
786 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_START_NSEC);
787 		(void) ea_attach_item(record, &pu->pu_finishsec,
788 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_SEC);
789 		(void) ea_attach_item(record, &pu->pu_finishnsec,
790 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FINISH_NSEC);
791 		break;
792 	case AC_PROC_COMMAND:
793 		(void) ea_attach_item(record, pu->pu_command,
794 		    strlen(pu->pu_command) + 1, EXT_STRING | EXD_PROC_COMMAND);
795 		break;
796 	case AC_PROC_HOSTNAME:
797 		(void) ea_attach_item(record, pu->pu_nodename,
798 		    strlen(pu->pu_nodename) + 1,
799 		    EXT_STRING | EXD_PROC_HOSTNAME);
800 		break;
801 	case AC_PROC_TTY:
802 		(void) ea_attach_item(record, &pu->pu_major,
803 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MAJOR);
804 		(void) ea_attach_item(record, &pu->pu_minor,
805 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_TTY_MINOR);
806 		break;
807 	case AC_PROC_MICROSTATE:
808 		(void) ea_attach_item(record, &pu->pu_majflt,
809 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MAJOR);
810 		(void) ea_attach_item(record, &pu->pu_minflt,
811 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_FAULTS_MINOR);
812 		(void) ea_attach_item(record, &pu->pu_sndmsg,
813 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_SND);
814 		(void) ea_attach_item(record, &pu->pu_rcvmsg,
815 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MESSAGES_RCV);
816 		(void) ea_attach_item(record, &pu->pu_iblk,
817 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_IN);
818 		(void) ea_attach_item(record, &pu->pu_oblk,
819 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_BLOCKS_OUT);
820 		(void) ea_attach_item(record, &pu->pu_ioch,
821 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CHARS_RDWR);
822 		(void) ea_attach_item(record, &pu->pu_vcsw,
823 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_VOL);
824 		(void) ea_attach_item(record, &pu->pu_icsw,
825 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_CONTEXT_INV);
826 		(void) ea_attach_item(record, &pu->pu_nsig,
827 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SIGNALS);
828 		(void) ea_attach_item(record, &pu->pu_nswp,
829 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SWAPS);
830 		(void) ea_attach_item(record, &pu->pu_nscl,
831 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_SYSCALLS);
832 		break;
833 	case AC_PROC_ANCPID:
834 		(void) ea_attach_item(record, &pu->pu_ancpid,
835 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_ANCPID);
836 		break;
837 	case AC_PROC_WAIT_STATUS:
838 		(void) ea_attach_item(record, &pu->pu_wstat,
839 		    sizeof (uint32_t), EXT_UINT32 | EXD_PROC_WAIT_STATUS);
840 		break;
841 	case AC_PROC_ZONENAME:
842 		(void) ea_attach_item(record, pu->pu_zonename,
843 		    strlen(pu->pu_zonename) + 1,
844 		    EXT_STRING | EXD_PROC_ZONENAME);
845 		break;
846 	case AC_PROC_MEM:
847 		(void) ea_attach_item(record, &pu->pu_mem_rss_avg,
848 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_AVG_K);
849 		(void) ea_attach_item(record, &pu->pu_mem_rss_max,
850 		    sizeof (uint64_t), EXT_UINT64 | EXD_PROC_MEM_RSS_MAX_K);
851 		break;
852 	default:
853 		attached = 0;
854 	}
855 	return (attached);
856 }
857 
858 static ea_object_t *
859 exacct_assemble_proc_record(proc_usage_t *pu, ulong_t *mask,
860     ea_catalog_t record_type)
861 {
862 	int res, count;
863 	ea_object_t *record;
864 
865 	/*
866 	 * Assemble usage values into group.
867 	 */
868 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
869 	for (res = 1, count = 0; res <= AC_PROC_MAX_RES; res++)
870 		if (BT_TEST(mask, res))
871 			count += exacct_attach_proc_item(pu, record, res);
872 	if (count == 0) {
873 		ea_free_object(record, EUP_ALLOC);
874 		record = NULL;
875 	}
876 	return (record);
877 }
878 
879 /*
880  * The following two routines assume that process's p_lock is held or
881  * exacct_commit_proc has been called from exit() when all lwps are stopped.
882  */
883 static void
884 exacct_calculate_proc_mstate(proc_t *p, proc_usage_t *pu)
885 {
886 	kthread_t *t;
887 
888 	ASSERT(MUTEX_HELD(&p->p_lock));
889 	if ((t = p->p_tlist) == NULL)
890 		return;
891 
892 	do {
893 		pu->pu_minflt	+= t->t_lwp->lwp_ru.minflt;
894 		pu->pu_majflt	+= t->t_lwp->lwp_ru.majflt;
895 		pu->pu_sndmsg	+= t->t_lwp->lwp_ru.msgsnd;
896 		pu->pu_rcvmsg	+= t->t_lwp->lwp_ru.msgrcv;
897 		pu->pu_ioch	+= t->t_lwp->lwp_ru.ioch;
898 		pu->pu_iblk	+= t->t_lwp->lwp_ru.inblock;
899 		pu->pu_oblk	+= t->t_lwp->lwp_ru.oublock;
900 		pu->pu_vcsw	+= t->t_lwp->lwp_ru.nvcsw;
901 		pu->pu_icsw	+= t->t_lwp->lwp_ru.nivcsw;
902 		pu->pu_nsig	+= t->t_lwp->lwp_ru.nsignals;
903 		pu->pu_nswp	+= t->t_lwp->lwp_ru.nswap;
904 		pu->pu_nscl	+= t->t_lwp->lwp_ru.sysc;
905 	} while ((t = t->t_forw) != p->p_tlist);
906 }
907 
908 static void
909 exacct_copy_proc_mstate(proc_t *p, proc_usage_t *pu)
910 {
911 	pu->pu_minflt	= p->p_ru.minflt;
912 	pu->pu_majflt	= p->p_ru.majflt;
913 	pu->pu_sndmsg	= p->p_ru.msgsnd;
914 	pu->pu_rcvmsg	= p->p_ru.msgrcv;
915 	pu->pu_ioch	= p->p_ru.ioch;
916 	pu->pu_iblk	= p->p_ru.inblock;
917 	pu->pu_oblk	= p->p_ru.oublock;
918 	pu->pu_vcsw	= p->p_ru.nvcsw;
919 	pu->pu_icsw	= p->p_ru.nivcsw;
920 	pu->pu_nsig	= p->p_ru.nsignals;
921 	pu->pu_nswp	= p->p_ru.nswap;
922 	pu->pu_nscl	= p->p_ru.sysc;
923 }
924 
925 void
926 exacct_calculate_proc_usage(proc_t *p, proc_usage_t *pu, ulong_t *mask,
927     int flag, int wstat)
928 {
929 	timestruc_t ts, ts_run;
930 
931 	ASSERT(MUTEX_HELD(&p->p_lock));
932 
933 	/*
934 	 * Convert CPU and execution times to sec/nsec format.
935 	 */
936 	if (BT_TEST(mask, AC_PROC_CPU)) {
937 		hrt2ts(mstate_aggr_state(p, LMS_USER), &ts);
938 		pu->pu_utimesec = (uint64_t)(ulong_t)ts.tv_sec;
939 		pu->pu_utimensec = (uint64_t)(ulong_t)ts.tv_nsec;
940 		hrt2ts(mstate_aggr_state(p, LMS_SYSTEM), &ts);
941 		pu->pu_stimesec = (uint64_t)(ulong_t)ts.tv_sec;
942 		pu->pu_stimensec = (uint64_t)(ulong_t)ts.tv_nsec;
943 	}
944 	if (BT_TEST(mask, AC_PROC_TIME)) {
945 		gethrestime(&ts);
946 		pu->pu_finishsec = (uint64_t)(ulong_t)ts.tv_sec;
947 		pu->pu_finishnsec = (uint64_t)(ulong_t)ts.tv_nsec;
948 		hrt2ts(gethrtime() - p->p_mstart, &ts_run);
949 		ts.tv_sec -= ts_run.tv_sec;
950 		ts.tv_nsec -= ts_run.tv_nsec;
951 		if (ts.tv_nsec < 0) {
952 			ts.tv_sec--;
953 			if ((ts.tv_nsec = ts.tv_nsec + NANOSEC) >= NANOSEC) {
954 				ts.tv_sec++;
955 				ts.tv_nsec -= NANOSEC;
956 			}
957 		}
958 		pu->pu_startsec = (uint64_t)(ulong_t)ts.tv_sec;
959 		pu->pu_startnsec = (uint64_t)(ulong_t)ts.tv_nsec;
960 	}
961 
962 	pu->pu_pid = p->p_pidp->pid_id;
963 	pu->pu_acflag = p->p_user.u_acflag;
964 	pu->pu_projid = p->p_task->tk_proj->kpj_id;
965 	pu->pu_taskid = p->p_task->tk_tkid;
966 	pu->pu_major = getmajor(p->p_sessp->s_dev);
967 	pu->pu_minor = getminor(p->p_sessp->s_dev);
968 	pu->pu_ancpid = p->p_ancpid;
969 	pu->pu_wstat = wstat;
970 	/*
971 	 * Compute average RSS in K.  The denominator is the number of
972 	 * samples:  the number of clock ticks plus the initial value.
973 	 */
974 	pu->pu_mem_rss_avg = (PTOU(p)->u_mem / (p->p_stime + p->p_utime + 1)) *
975 	    (PAGESIZE / 1024);
976 	pu->pu_mem_rss_max = PTOU(p)->u_mem_max * (PAGESIZE / 1024);
977 
978 	mutex_enter(&p->p_crlock);
979 	pu->pu_ruid = crgetruid(p->p_cred);
980 	pu->pu_rgid = crgetrgid(p->p_cred);
981 	mutex_exit(&p->p_crlock);
982 
983 	bcopy(p->p_user.u_comm, pu->pu_command, strlen(p->p_user.u_comm) + 1);
984 	bcopy(p->p_zone->zone_name, pu->pu_zonename,
985 	    strlen(p->p_zone->zone_name) + 1);
986 	bcopy(p->p_zone->zone_nodename, pu->pu_nodename,
987 	    strlen(p->p_zone->zone_nodename) + 1);
988 
989 	/*
990 	 * Calculate microstate accounting data for a process that is still
991 	 * running.  Presently, we explicitly collect all of the LWP usage into
992 	 * the proc usage structure here.
993 	 */
994 	if (flag & EW_PARTIAL)
995 		exacct_calculate_proc_mstate(p, pu);
996 	if (flag & EW_FINAL)
997 		exacct_copy_proc_mstate(p, pu);
998 }
999 
1000 /*
1001  * int exacct_assemble_proc_usage(proc_usage_t *, int (*)(void *, size_t, void
1002  *	*, size_t, size_t *), void *, size_t, size_t *)
1003  *
1004  * Overview
1005  *   Assemble record with miscellaneous accounting information about the process
1006  *   and execute the callback on it. It is the callback's job to set "actual" to
1007  *   the size of record.
1008  *
1009  * Return values
1010  *   The result of the callback function, unless the extended process accounting
1011  *   feature is not active, in which case ENOTACTIVE is returned.
1012  *
1013  * Caller's context
1014  *   Suitable for KM_SLEEP allocations.
1015  */
1016 int
1017 exacct_assemble_proc_usage(ac_info_t *ac_proc, proc_usage_t *pu,
1018     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1019     void *ubuf, size_t ubufsize, size_t *actual, int flag)
1020 {
1021 	ulong_t mask[AC_MASK_SZ];
1022 	ea_object_t *proc_record;
1023 	ea_catalog_t record_type;
1024 	void *buf;
1025 	size_t bufsize;
1026 	int ret;
1027 
1028 	ASSERT(flag == EW_FINAL || flag == EW_PARTIAL);
1029 
1030 	mutex_enter(&ac_proc->ac_lock);
1031 	if (ac_proc->ac_state == AC_OFF) {
1032 		mutex_exit(&ac_proc->ac_lock);
1033 		return (ENOTACTIVE);
1034 	}
1035 	bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1036 	mutex_exit(&ac_proc->ac_lock);
1037 
1038 	switch (flag) {
1039 	case EW_FINAL:
1040 		record_type = EXD_GROUP_PROC;
1041 		break;
1042 	case EW_PARTIAL:
1043 		record_type = EXD_GROUP_PROC_PARTIAL;
1044 		break;
1045 	}
1046 
1047 	proc_record = exacct_assemble_proc_record(pu, mask, record_type);
1048 	if (proc_record == NULL)
1049 		return (0);
1050 
1051 	/*
1052 	 * Pack object into buffer and pass to callback.
1053 	 */
1054 	bufsize = ea_pack_object(proc_record, NULL, 0);
1055 	buf = kmem_alloc(bufsize, KM_SLEEP);
1056 	(void) ea_pack_object(proc_record, buf, bufsize);
1057 
1058 	ret = callback(ac_proc, ubuf, ubufsize, buf, bufsize, actual);
1059 
1060 	/*
1061 	 * Free all previously allocations.
1062 	 */
1063 	kmem_free(buf, bufsize);
1064 	ea_free_object(proc_record, EUP_ALLOC);
1065 	return (ret);
1066 }
1067 
1068 /*
1069  * int exacct_commit_callback(ac_info_t *, void *, size_t, void *, size_t,
1070  * 	size_t *)
1071  *
1072  * Overview
1073  *   exacct_commit_callback() writes the indicated buffer to the indicated
1074  *   extended accounting file.
1075  *
1076  * Return values
1077  *   The result of the write operation is returned.  "actual" is updated to
1078  *   contain the number of bytes actually written.
1079  *
1080  * Caller's context
1081  *   Suitable for a vn_rdwr() operation.
1082  */
1083 /*ARGSUSED*/
1084 int
1085 exacct_commit_callback(ac_info_t *info, void *ubuf, size_t ubufsize,
1086     void *buf, size_t bufsize, size_t *actual)
1087 {
1088 	int error = 0;
1089 
1090 	*actual = 0;
1091 	if ((error = exacct_vn_write(info, buf, bufsize)) == 0)
1092 		*actual = bufsize;
1093 	return (error);
1094 }
1095 
1096 static void
1097 exacct_do_commit_proc(ac_info_t *ac_proc, proc_t *p, int wstat)
1098 {
1099 	size_t size;
1100 	proc_usage_t *pu;
1101 	ulong_t mask[AC_MASK_SZ];
1102 
1103 	mutex_enter(&ac_proc->ac_lock);
1104 	if (ac_proc->ac_state == AC_ON) {
1105 		bt_copy(&ac_proc->ac_mask[0], mask, AC_MASK_SZ);
1106 		mutex_exit(&ac_proc->ac_lock);
1107 	} else {
1108 		mutex_exit(&ac_proc->ac_lock);
1109 		return;
1110 	}
1111 
1112 	mutex_enter(&p->p_lock);
1113 	size = strlen(p->p_user.u_comm) + 1;
1114 	mutex_exit(&p->p_lock);
1115 
1116 	pu = kmem_alloc(sizeof (proc_usage_t), KM_SLEEP);
1117 	pu->pu_command = kmem_alloc(size, KM_SLEEP);
1118 	mutex_enter(&p->p_lock);
1119 	exacct_calculate_proc_usage(p, pu, mask, EW_FINAL, wstat);
1120 	mutex_exit(&p->p_lock);
1121 
1122 	(void) exacct_assemble_proc_usage(ac_proc, pu,
1123 	    exacct_commit_callback, NULL, 0, &size, EW_FINAL);
1124 
1125 	kmem_free(pu->pu_command, strlen(pu->pu_command) + 1);
1126 	kmem_free(pu, sizeof (proc_usage_t));
1127 }
1128 
1129 /*
1130  * void exacct_commit_proc(proc_t *, int)
1131  *
1132  * Overview
1133  *   exacct_commit_proc() calculates the final usage for a process, updating the
1134  *   task usage if task accounting is active, and writing a process record if
1135  *   process accounting is active.  exacct_commit_proc() is intended for being
1136  *   called from proc_exit().
1137  *
1138  * Return values
1139  *   None.
1140  *
1141  * Caller's context
1142  *   Suitable for KM_SLEEP allocations.  p_lock must not be held at entry.
1143  */
1144 void
1145 exacct_commit_proc(proc_t *p, int wstat)
1146 {
1147 	zone_t *zone = p->p_zone;
1148 	struct exacct_globals *acg, *gacg = NULL;
1149 
1150 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1151 		/*
1152 		 * acctctl module not loaded.  Nothing to do.
1153 		 */
1154 		return;
1155 	}
1156 	acg = zone_getspecific(exacct_zone_key, zone);
1157 	exacct_do_commit_proc(&acg->ac_proc, p, wstat);
1158 	if (zone != global_zone) {
1159 		gacg = zone_getspecific(exacct_zone_key, global_zone);
1160 		exacct_do_commit_proc(&gacg->ac_proc, p, wstat);
1161 	}
1162 }
1163 
1164 static int
1165 exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res)
1166 {
1167 	int		attached = 1;
1168 
1169 	switch (res) {
1170 	case AC_NET_NAME:
1171 		(void) ea_attach_item(record, ns->ns_name,
1172 		    strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME);
1173 		break;
1174 	case AC_NET_CURTIME:
1175 		{
1176 			uint64_t	now;
1177 			timestruc_t	ts;
1178 
1179 			gethrestime(&ts);
1180 			now = (uint64_t)(ulong_t)ts.tv_sec;
1181 			(void) ea_attach_item(record,  &now, sizeof (uint64_t),
1182 			    EXT_UINT64 | EXD_NET_STATS_CURTIME);
1183 		}
1184 		break;
1185 	case AC_NET_IBYTES:
1186 		(void) ea_attach_item(record, &ns->ns_ibytes,
1187 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES);
1188 		break;
1189 	case AC_NET_OBYTES:
1190 		(void) ea_attach_item(record, &ns->ns_obytes,
1191 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES);
1192 		break;
1193 	case AC_NET_IPKTS:
1194 		(void) ea_attach_item(record, &ns->ns_ipackets,
1195 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS);
1196 		break;
1197 	case AC_NET_OPKTS:
1198 		(void) ea_attach_item(record, &ns->ns_opackets,
1199 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS);
1200 		break;
1201 	case AC_NET_IERRPKTS:
1202 		(void) ea_attach_item(record, &ns->ns_ierrors,
1203 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS);
1204 		break;
1205 	case AC_NET_OERRPKTS:
1206 		(void) ea_attach_item(record, &ns->ns_oerrors,
1207 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS);
1208 		break;
1209 	default:
1210 		attached = 0;
1211 	}
1212 	return (attached);
1213 }
1214 
1215 static int
1216 exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res)
1217 {
1218 	int attached = 1;
1219 
1220 	switch (res) {
1221 	case AC_NET_NAME:
1222 		(void) ea_attach_item(record, nd->nd_name,
1223 		    strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME);
1224 		break;
1225 	case AC_NET_DEVNAME:
1226 		(void) ea_attach_item(record, nd->nd_devname,
1227 		    strlen(nd->nd_devname) + 1, EXT_STRING |
1228 		    EXD_NET_DESC_DEVNAME);
1229 		break;
1230 	case AC_NET_EHOST:
1231 		(void) ea_attach_item(record, &nd->nd_ehost,
1232 		    sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST);
1233 		break;
1234 	case AC_NET_EDEST:
1235 		(void) ea_attach_item(record, &nd->nd_edest,
1236 		    sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST);
1237 		break;
1238 	case AC_NET_VLAN_TPID:
1239 		(void) ea_attach_item(record, &nd->nd_vlan_tpid,
1240 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID);
1241 		break;
1242 	case AC_NET_VLAN_TCI:
1243 		(void) ea_attach_item(record, &nd->nd_vlan_tci,
1244 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI);
1245 		break;
1246 	case AC_NET_SAP:
1247 		(void) ea_attach_item(record, &nd->nd_sap,
1248 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP);
1249 		break;
1250 	case AC_NET_PRIORITY:
1251 		(void) ea_attach_item(record, &nd->nd_priority,
1252 		    sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY);
1253 		break;
1254 	case AC_NET_BWLIMIT:
1255 		(void) ea_attach_item(record, &nd->nd_bw_limit,
1256 		    sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT);
1257 		break;
1258 	case AC_NET_SADDR:
1259 		if (nd->nd_isv4) {
1260 			(void) ea_attach_item(record, &nd->nd_saddr[3],
1261 			    sizeof (uint32_t), EXT_UINT32 |
1262 			    EXD_NET_DESC_V4SADDR);
1263 		} else {
1264 			(void) ea_attach_item(record, &nd->nd_saddr,
1265 			    sizeof (nd->nd_saddr), EXT_RAW |
1266 			    EXD_NET_DESC_V6SADDR);
1267 		}
1268 		break;
1269 	case AC_NET_DADDR:
1270 		if (nd->nd_isv4) {
1271 			(void) ea_attach_item(record, &nd->nd_daddr[3],
1272 			    sizeof (uint32_t), EXT_UINT32 |
1273 			    EXD_NET_DESC_V4DADDR);
1274 		} else {
1275 			(void) ea_attach_item(record, &nd->nd_daddr,
1276 			    sizeof (nd->nd_daddr), EXT_RAW |
1277 			    EXD_NET_DESC_V6DADDR);
1278 		}
1279 		break;
1280 	case AC_NET_SPORT:
1281 		(void) ea_attach_item(record, &nd->nd_sport,
1282 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT);
1283 		break;
1284 	case AC_NET_DPORT:
1285 		(void) ea_attach_item(record, &nd->nd_dport,
1286 		    sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT);
1287 		break;
1288 	case AC_NET_PROTOCOL:
1289 		(void) ea_attach_item(record, &nd->nd_protocol,
1290 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL);
1291 		break;
1292 	case AC_NET_DSFIELD:
1293 		(void) ea_attach_item(record, &nd->nd_dsfield,
1294 		    sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD);
1295 		break;
1296 	default:
1297 		attached = 0;
1298 	}
1299 	return (attached);
1300 }
1301 
1302 static ea_object_t *
1303 exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type,
1304     int what)
1305 {
1306 	int		res;
1307 	int		count;
1308 	ea_object_t	*record;
1309 
1310 	/*
1311 	 * Assemble usage values into group.
1312 	 */
1313 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1314 	for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++)
1315 		if (BT_TEST(mask, res)) {
1316 			if (what == EX_NET_LNDESC_REC ||
1317 			    what == EX_NET_FLDESC_REC) {
1318 				count += exacct_attach_netdesc_item(
1319 				    (net_desc_t *)ninfo, record, res);
1320 			} else {
1321 				count += exacct_attach_netstat_item(
1322 				    (net_stat_t *)ninfo, record, res);
1323 			}
1324 		}
1325 	if (count == 0) {
1326 		ea_free_object(record, EUP_ALLOC);
1327 		record = NULL;
1328 	}
1329 	return (record);
1330 }
1331 
1332 int
1333 exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo,
1334     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1335     void *ubuf, size_t ubufsize, size_t *actual, int what)
1336 {
1337 	ulong_t		mask[AC_MASK_SZ];
1338 	ea_object_t	*net_desc;
1339 	ea_catalog_t	record_type;
1340 	void		*buf;
1341 	size_t		bufsize;
1342 	int		ret;
1343 
1344 	mutex_enter(&ac_net->ac_lock);
1345 	if (ac_net->ac_state == AC_OFF) {
1346 		mutex_exit(&ac_net->ac_lock);
1347 		return (ENOTACTIVE);
1348 	}
1349 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1350 	mutex_exit(&ac_net->ac_lock);
1351 
1352 	switch (what) {
1353 	case EX_NET_LNDESC_REC:
1354 		record_type = EXD_GROUP_NET_LINK_DESC;
1355 		break;
1356 	case EX_NET_LNSTAT_REC:
1357 		record_type = EXD_GROUP_NET_LINK_STATS;
1358 		break;
1359 	case EX_NET_FLDESC_REC:
1360 		record_type = EXD_GROUP_NET_FLOW_DESC;
1361 		break;
1362 	case EX_NET_FLSTAT_REC:
1363 		record_type = EXD_GROUP_NET_FLOW_STATS;
1364 		break;
1365 	}
1366 
1367 	net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what);
1368 	if (net_desc == NULL)
1369 		return (0);
1370 
1371 	/*
1372 	 * Pack object into buffer and pass to callback.
1373 	 */
1374 	bufsize = ea_pack_object(net_desc, NULL, 0);
1375 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1376 	if (buf == NULL)
1377 		return (ENOMEM);
1378 
1379 	(void) ea_pack_object(net_desc, buf, bufsize);
1380 
1381 	ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual);
1382 
1383 	/*
1384 	 * Free all previously allocations.
1385 	 */
1386 	kmem_free(buf, bufsize);
1387 	ea_free_object(net_desc, EUP_ALLOC);
1388 	return (ret);
1389 }
1390 
1391 int
1392 exacct_commit_netinfo(void *arg, int what)
1393 {
1394 	size_t			size;
1395 	ulong_t			mask[AC_MASK_SZ];
1396 	struct exacct_globals	*acg;
1397 	ac_info_t		*ac_net;
1398 
1399 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1400 		/*
1401 		 * acctctl module not loaded. Nothing to do.
1402 		 */
1403 		return (ENOTACTIVE);
1404 	}
1405 
1406 	/*
1407 	 * Even though each zone nominally has its own flow accounting settings
1408 	 * (ac_flow), these are only maintained by and for the global zone.
1409 	 *
1410 	 * If this were to change in the future, this function should grow a
1411 	 * second zoneid (or zone) argument, and use the corresponding zone's
1412 	 * settings rather than always using those of the global zone.
1413 	 */
1414 	acg = zone_getspecific(exacct_zone_key, global_zone);
1415 	ac_net = &acg->ac_net;
1416 
1417 	mutex_enter(&ac_net->ac_lock);
1418 	if (ac_net->ac_state == AC_OFF) {
1419 		mutex_exit(&ac_net->ac_lock);
1420 		return (ENOTACTIVE);
1421 	}
1422 	bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ);
1423 	mutex_exit(&ac_net->ac_lock);
1424 
1425 	return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback,
1426 	    NULL, 0, &size, what));
1427 }
1428 
1429 static int
1430 exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res)
1431 {
1432 	int attached = 1;
1433 
1434 	switch (res) {
1435 	case AC_FLOW_SADDR:
1436 		if (fu->fu_isv4) {
1437 			(void) ea_attach_item(record, &fu->fu_saddr[3],
1438 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4SADDR);
1439 		} else {
1440 			(void) ea_attach_item(record, &fu->fu_saddr,
1441 			    sizeof (fu->fu_saddr), EXT_RAW |
1442 			    EXD_FLOW_V6SADDR);
1443 		}
1444 		break;
1445 	case AC_FLOW_DADDR:
1446 		if (fu->fu_isv4) {
1447 			(void) ea_attach_item(record, &fu->fu_daddr[3],
1448 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_V4DADDR);
1449 		} else {
1450 			(void) ea_attach_item(record, &fu->fu_daddr,
1451 			    sizeof (fu->fu_daddr), EXT_RAW |
1452 			    EXD_FLOW_V6DADDR);
1453 		}
1454 		break;
1455 	case AC_FLOW_SPORT:
1456 		(void) ea_attach_item(record, &fu->fu_sport,
1457 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_SPORT);
1458 		break;
1459 	case AC_FLOW_DPORT:
1460 		(void) ea_attach_item(record, &fu->fu_dport,
1461 		    sizeof (uint16_t), EXT_UINT16 | EXD_FLOW_DPORT);
1462 		break;
1463 	case AC_FLOW_PROTOCOL:
1464 		(void) ea_attach_item(record, &fu->fu_protocol,
1465 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_PROTOCOL);
1466 		break;
1467 	case AC_FLOW_DSFIELD:
1468 		(void) ea_attach_item(record, &fu->fu_dsfield,
1469 		    sizeof (uint8_t), EXT_UINT8 | EXD_FLOW_DSFIELD);
1470 		break;
1471 	case AC_FLOW_CTIME:
1472 		(void) ea_attach_item(record, &fu->fu_ctime,
1473 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_CTIME);
1474 		break;
1475 	case AC_FLOW_LSEEN:
1476 		(void) ea_attach_item(record, &fu->fu_lseen,
1477 		    sizeof (uint64_t), EXT_UINT64 | EXD_FLOW_LSEEN);
1478 		break;
1479 	case AC_FLOW_NBYTES:
1480 		(void) ea_attach_item(record, &fu->fu_nbytes,
1481 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NBYTES);
1482 		break;
1483 	case AC_FLOW_NPKTS:
1484 		(void) ea_attach_item(record, &fu->fu_npackets,
1485 		    sizeof (uint64_t), EXT_UINT32 | EXD_FLOW_NPKTS);
1486 		break;
1487 	case AC_FLOW_PROJID:
1488 		if (fu->fu_projid >= 0) {
1489 			(void) ea_attach_item(record, &fu->fu_projid,
1490 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_PROJID);
1491 		}
1492 		break;
1493 	case AC_FLOW_UID:
1494 		if (fu->fu_userid >= 0) {
1495 			(void) ea_attach_item(record, &fu->fu_userid,
1496 			    sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID);
1497 		}
1498 		break;
1499 	case AC_FLOW_ANAME:
1500 		(void) ea_attach_item(record, fu->fu_aname,
1501 		    strlen(fu->fu_aname) + 1, EXT_STRING | EXD_FLOW_ANAME);
1502 		break;
1503 	default:
1504 		attached = 0;
1505 	}
1506 	return (attached);
1507 }
1508 
1509 static ea_object_t *
1510 exacct_assemble_flow_record(flow_usage_t *fu, ulong_t *mask,
1511     ea_catalog_t record_type)
1512 {
1513 	int res, count;
1514 	ea_object_t *record;
1515 
1516 	/*
1517 	 * Assemble usage values into group.
1518 	 */
1519 	record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type);
1520 	for (res = 1, count = 0; res <= AC_FLOW_MAX_RES; res++)
1521 		if (BT_TEST(mask, res))
1522 			count += exacct_attach_flow_item(fu, record, res);
1523 	if (count == 0) {
1524 		ea_free_object(record, EUP_ALLOC);
1525 		record = NULL;
1526 	}
1527 	return (record);
1528 }
1529 
1530 int
1531 exacct_assemble_flow_usage(ac_info_t *ac_flow, flow_usage_t *fu,
1532     int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *),
1533     void *ubuf, size_t ubufsize, size_t *actual)
1534 {
1535 	ulong_t mask[AC_MASK_SZ];
1536 	ea_object_t *flow_usage;
1537 	ea_catalog_t record_type;
1538 	void *buf;
1539 	size_t bufsize;
1540 	int ret;
1541 
1542 	mutex_enter(&ac_flow->ac_lock);
1543 	if (ac_flow->ac_state == AC_OFF) {
1544 		mutex_exit(&ac_flow->ac_lock);
1545 		return (ENOTACTIVE);
1546 	}
1547 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1548 	mutex_exit(&ac_flow->ac_lock);
1549 
1550 	record_type = EXD_GROUP_FLOW;
1551 
1552 	flow_usage = exacct_assemble_flow_record(fu, mask, record_type);
1553 	if (flow_usage == NULL) {
1554 		return (0);
1555 	}
1556 
1557 	/*
1558 	 * Pack object into buffer and pass to callback.
1559 	 */
1560 	bufsize = ea_pack_object(flow_usage, NULL, 0);
1561 	buf = kmem_alloc(bufsize, KM_NOSLEEP);
1562 	if (buf == NULL) {
1563 		return (ENOMEM);
1564 	}
1565 
1566 	(void) ea_pack_object(flow_usage, buf, bufsize);
1567 
1568 	ret = callback(ac_flow, ubuf, ubufsize, buf, bufsize, actual);
1569 
1570 	/*
1571 	 * Free all previously allocations.
1572 	 */
1573 	kmem_free(buf, bufsize);
1574 	ea_free_object(flow_usage, EUP_ALLOC);
1575 	return (ret);
1576 }
1577 
1578 void
1579 exacct_commit_flow(void *arg)
1580 {
1581 	flow_usage_t *f = (flow_usage_t *)arg;
1582 	size_t size;
1583 	ulong_t mask[AC_MASK_SZ];
1584 	struct exacct_globals *acg;
1585 	ac_info_t *ac_flow;
1586 
1587 	if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) {
1588 		/*
1589 		 * acctctl module not loaded. Nothing to do.
1590 		 */
1591 		return;
1592 	}
1593 
1594 	/*
1595 	 * Even though each zone nominally has its own flow accounting settings
1596 	 * (ac_flow), these are only maintained by and for the global zone.
1597 	 *
1598 	 * If this were to change in the future, this function should grow a
1599 	 * second zoneid (or zone) argument, and use the corresponding zone's
1600 	 * settings rather than always using those of the global zone.
1601 	 */
1602 	acg = zone_getspecific(exacct_zone_key, global_zone);
1603 	ac_flow = &acg->ac_flow;
1604 
1605 	mutex_enter(&ac_flow->ac_lock);
1606 	if (ac_flow->ac_state == AC_OFF) {
1607 		mutex_exit(&ac_flow->ac_lock);
1608 		return;
1609 	}
1610 	bt_copy(&ac_flow->ac_mask[0], mask, AC_MASK_SZ);
1611 	mutex_exit(&ac_flow->ac_lock);
1612 
1613 	(void) exacct_assemble_flow_usage(ac_flow, f, exacct_commit_callback,
1614 	    NULL, 0, &size);
1615 }
1616 
1617 /*
1618  * int exacct_tag_task(task_t *, void *, size_t, int)
1619  *
1620  * Overview
1621  *   exacct_tag_task() provides the exacct record construction and writing
1622  *   support required by putacct(2) for task entities.
1623  *
1624  * Return values
1625  *   The result of the write operation is returned, unless the extended
1626  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1627  *
1628  * Caller's context
1629  *   Suitable for KM_SLEEP allocations.
1630  */
1631 int
1632 exacct_tag_task(ac_info_t *ac_task, task_t *tk, void *ubuf, size_t ubufsz,
1633     int flags)
1634 {
1635 	int error = 0;
1636 	void *buf;
1637 	size_t bufsize;
1638 	ea_catalog_t cat;
1639 	ea_object_t *tag;
1640 
1641 	mutex_enter(&ac_task->ac_lock);
1642 	if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
1643 		mutex_exit(&ac_task->ac_lock);
1644 		return (ENOTACTIVE);
1645 	}
1646 	mutex_exit(&ac_task->ac_lock);
1647 
1648 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_TASK_TAG);
1649 	(void) ea_attach_item(tag, &tk->tk_tkid, 0,
1650 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1651 	(void) ea_attach_item(tag, tk->tk_zone->zone_nodename, 0,
1652 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1653 	if (flags == EP_RAW)
1654 		cat = EXT_RAW | EXC_DEFAULT | EXD_TASK_TAG;
1655 	else
1656 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_TASK_TAG;
1657 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1658 
1659 	bufsize = ea_pack_object(tag, NULL, 0);
1660 	buf = kmem_alloc(bufsize, KM_SLEEP);
1661 	(void) ea_pack_object(tag, buf, bufsize);
1662 	error = exacct_vn_write(ac_task, buf, bufsize);
1663 	kmem_free(buf, bufsize);
1664 	ea_free_object(tag, EUP_ALLOC);
1665 	return (error);
1666 }
1667 
1668 /*
1669  * exacct_tag_proc(pid_t, taskid_t, void *, size_t, int, char *)
1670  *
1671  * Overview
1672  *   exacct_tag_proc() provides the exacct record construction and writing
1673  *   support required by putacct(2) for processes.
1674  *
1675  * Return values
1676  *   The result of the write operation is returned, unless the extended
1677  *   accounting facility is not active, in which case ENOTACTIVE is returned.
1678  *
1679  * Caller's context
1680  *   Suitable for KM_SLEEP allocations.
1681  */
1682 int
1683 exacct_tag_proc(ac_info_t *ac_proc, pid_t pid, taskid_t tkid, void *ubuf,
1684     size_t ubufsz, int flags, const char *hostname)
1685 {
1686 	int error = 0;
1687 	void *buf;
1688 	size_t bufsize;
1689 	ea_catalog_t cat;
1690 	ea_object_t *tag;
1691 
1692 	mutex_enter(&ac_proc->ac_lock);
1693 	if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
1694 		mutex_exit(&ac_proc->ac_lock);
1695 		return (ENOTACTIVE);
1696 	}
1697 	mutex_exit(&ac_proc->ac_lock);
1698 
1699 	tag = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | EXD_GROUP_PROC_TAG);
1700 	(void) ea_attach_item(tag, &pid, sizeof (uint32_t),
1701 	    EXT_UINT32 | EXC_DEFAULT | EXD_PROC_PID);
1702 	(void) ea_attach_item(tag, &tkid, 0,
1703 	    EXT_UINT32 | EXC_DEFAULT | EXD_TASK_TASKID);
1704 	(void) ea_attach_item(tag, (void *)hostname, 0,
1705 	    EXT_STRING | EXC_DEFAULT | EXD_TASK_HOSTNAME);
1706 	if (flags == EP_RAW)
1707 		cat = EXT_RAW | EXC_DEFAULT | EXD_PROC_TAG;
1708 	else
1709 		cat = EXT_EXACCT_OBJECT | EXC_DEFAULT | EXD_PROC_TAG;
1710 	(void) ea_attach_item(tag, ubuf, ubufsz, cat);
1711 
1712 	bufsize = ea_pack_object(tag, NULL, 0);
1713 	buf = kmem_alloc(bufsize, KM_SLEEP);
1714 	(void) ea_pack_object(tag, buf, bufsize);
1715 	error = exacct_vn_write(ac_proc, buf, bufsize);
1716 	kmem_free(buf, bufsize);
1717 	ea_free_object(tag, EUP_ALLOC);
1718 	return (error);
1719 }
1720 
1721 /*
1722  * void exacct_init(void)
1723  *
1724  * Overview
1725  *   Initialized the extended accounting subsystem.
1726  *
1727  * Return values
1728  *   None.
1729  *
1730  * Caller's context
1731  *   Suitable for KM_SLEEP allocations.
1732  */
1733 void
1734 exacct_init()
1735 {
1736 	exacct_queue = system_taskq;
1737 	exacct_object_cache = kmem_cache_create("exacct_object_cache",
1738 	    sizeof (ea_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1739 }
1740 
1741 /*
1742  * exacct_snapshot_proc_mstate() copies a process's microstate accounting data
1743  * and resource usage counters into a given task_usage_t. It differs from
1744  * exacct_copy_proc_mstate() in that here a) we are copying to a task_usage_t,
1745  * b) p_lock will have been acquired earlier in the call path and c) we
1746  * are here including the process's user and system times.
1747  */
1748 static void
1749 exacct_snapshot_proc_mstate(proc_t *p, task_usage_t *tu)
1750 {
1751 	tu->tu_utime  = mstate_aggr_state(p, LMS_USER);
1752 	tu->tu_stime  = mstate_aggr_state(p, LMS_SYSTEM);
1753 	tu->tu_minflt = p->p_ru.minflt;
1754 	tu->tu_majflt = p->p_ru.majflt;
1755 	tu->tu_sndmsg = p->p_ru.msgsnd;
1756 	tu->tu_rcvmsg = p->p_ru.msgrcv;
1757 	tu->tu_ioch   = p->p_ru.ioch;
1758 	tu->tu_iblk   = p->p_ru.inblock;
1759 	tu->tu_oblk   = p->p_ru.oublock;
1760 	tu->tu_vcsw   = p->p_ru.nvcsw;
1761 	tu->tu_icsw   = p->p_ru.nivcsw;
1762 	tu->tu_nsig   = p->p_ru.nsignals;
1763 	tu->tu_nswp   = p->p_ru.nswap;
1764 	tu->tu_nscl   = p->p_ru.sysc;
1765 }
1766 
1767 /*
1768  * void exacct_move_mstate(proc_t *, task_t *, task_t *)
1769  *
1770  * Overview
1771  *   exacct_move_mstate() is called by task_change() and accounts for
1772  *   a process's resource usage when it is moved from one task to another.
1773  *
1774  *   The process's usage at this point is recorded in the new task so
1775  *   that it can be excluded from the calculation of resources consumed
1776  *   by that task.
1777  *
1778  *   The resource usage inherited by the new task is also added to the
1779  *   aggregate maintained by the old task for processes that have exited.
1780  *
1781  * Return values
1782  *   None.
1783  *
1784  * Caller's context
1785  *   pidlock and p_lock held across exacct_move_mstate().
1786  */
1787 void
1788 exacct_move_mstate(proc_t *p, task_t *oldtk, task_t *newtk)
1789 {
1790 	task_usage_t tu;
1791 
1792 	/* Take a snapshot of this process's mstate and RU counters */
1793 	exacct_snapshot_proc_mstate(p, &tu);
1794 
1795 	/*
1796 	 * Use the snapshot to increment the aggregate usage of the old
1797 	 * task, and the inherited usage of the new one.
1798 	 */
1799 	mutex_enter(&oldtk->tk_usage_lock);
1800 	exacct_add_task_mstate(oldtk->tk_usage, &tu);
1801 	mutex_exit(&oldtk->tk_usage_lock);
1802 	mutex_enter(&newtk->tk_usage_lock);
1803 	exacct_add_task_mstate(newtk->tk_inherited, &tu);
1804 	mutex_exit(&newtk->tk_usage_lock);
1805 }
1806