xref: /dragonfly/sys/sys/buf.h (revision 03be034e)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)buf.h	8.9 (Berkeley) 3/30/95
39  * $FreeBSD: src/sys/sys/buf.h,v 1.88.2.10 2003/01/25 19:02:23 dillon Exp $
40  * $DragonFly: src/sys/sys/buf.h,v 1.12 2005/04/15 19:08:13 dillon Exp $
41  */
42 
43 #ifndef _SYS_BUF_H_
44 #define	_SYS_BUF_H_
45 
46 #ifndef _SYS_QUEUE_H_
47 #include <sys/queue.h>
48 #endif
49 #ifndef _SYS_LOCK_H_
50 #include <sys/lock.h>
51 #endif
52 #ifndef _SYS_DEVICE_H_
53 #include <sys/device.h>
54 #endif
55 
56 #ifndef _SYS_XIO_H_
57 #include <sys/xio.h>
58 #endif
59 #ifndef _SYS_TREE_H_
60 #include <sys/tree.h>
61 #endif
62 
/*
 * Structure tags referenced by the declarations in this header.
 */
struct buf;
struct buf_rb_tree;
struct mount;
struct vnode;
struct xio;

/* Red-black tree of struct buf, linked through the b_rbnode member. */
RB_PROTOTYPE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);

/*
 * List head for struct worklist entries.  Declared here so that
 * <ufs/ffs/softdep.h> does not have to be included.
 */
LIST_HEAD(workhead, worklist);
/*
 * Buffer I/O hook table.
 *
 * The hooks are currently used only by the soft dependency code, so a
 * single global instance (bioops) serves every buffer.  Should another
 * subsystem need hooks of its own, a pointer to a struct bio_ops could
 * be added to each buffer instead.
 */
struct bio_ops {
	void	(*io_start)(struct buf *);
	void	(*io_complete)(struct buf *);
	void	(*io_deallocate)(struct buf *);
	int	(*io_fsync)(struct vnode *);
	int	(*io_sync)(struct mount *);
	void	(*io_movedeps)(struct buf *, struct buf *);
	int	(*io_countdeps)(struct buf *, int);
};

extern struct bio_ops bioops;
90 
/*
 * One link in a buffer's chain of nested b_iodone handlers (reached
 * through b_iodone_chain in struct buf).
 *
 * NOTE(review): going by the field names, a link records the previous
 * flags, the previous completion handler and the previous chain link,
 * plus up to five long/pointer argument pairs -- confirm against the
 * code that builds the chain.
 */
struct iodone_chain {
	long	 ic_prev_flags;
	void	 (*ic_prev_iodone)(struct buf *);
	void	*ic_prev_iodone_chain;
	struct {
		long	 ia_long;
		void	*ia_ptr;
	} ic_args[5];
};
100 
101 /*
102  * The buffer header describes an I/O operation in the kernel.
103  *
104  * NOTES:
105  *	b_bufsize, b_bcount.  b_bufsize is the allocation size of the
106  *	buffer, either DEV_BSIZE or PAGE_SIZE aligned.  b_bcount is the
107  *	originally requested buffer size and can serve as a bounds check
108  *	against EOF.  For most, but not all uses, b_bcount == b_bufsize.
109  *
110  *	b_dirtyoff, b_dirtyend.  Buffers support piecemeal, unaligned
111  *	ranges of dirty data that need to be written to backing store.
112  *	The range is typically clipped at b_bcount ( not b_bufsize ).
113  *
114  *	b_resid.  Number of bytes remaining in I/O.  After an I/O operation
115  *	completes, b_resid is usually 0 indicating 100% success.
116  */
117 struct buf {
118 	LIST_ENTRY(buf) b_hash;		/* Hash chain. */
119 	RB_ENTRY(buf) b_rbnode;		/* Red-Black node in vnode RB tree */
120 	TAILQ_ENTRY(buf) b_freelist;	/* Free list position if not active. */
121 	TAILQ_ENTRY(buf) b_act;		/* Device driver queue when active. *new* */
122 	long	b_flags;		/* B_* flags. */
123 	unsigned short b_qindex;	/* buffer queue index */
124 	unsigned char b_xflags;		/* extra flags */
125 	struct lock b_lock;		/* Buffer lock */
126 	int	b_error;		/* Errno value. */
127 	long	b_bufsize;		/* Allocated buffer size. */
128 	long	b_runningbufspace;	/* when I/O is running, pipelining */
129 	long	b_bcount;		/* Valid bytes in buffer. */
130 	long	b_resid;		/* Remaining I/O. */
131 	dev_t	b_dev;			/* Device associated with buffer. */
132 	caddr_t	b_data;			/* Memory, superblocks, indirect etc. */
133 	caddr_t	b_kvabase;		/* base kva for buffer */
134 	int	b_kvasize;		/* size of kva for buffer */
135 	daddr_t	b_lblkno;		/* Logical block number. */
136 	daddr_t	b_blkno;		/* Underlying physical block number. */
137 	off_t	b_offset;		/* Offset into file */
138 					/* Function to call upon completion. */
139 	void	(*b_iodone) (struct buf *);
140 					/* For nested b_iodone's. */
141 	struct	iodone_chain *b_iodone_chain;
142 	struct	vnode *b_vp;		/* Device vnode. */
143 	int	b_dirtyoff;		/* Offset in buffer of dirty region. */
144 	int	b_dirtyend;		/* Offset of end of dirty region. */
145 	daddr_t	b_pblkno;               /* physical block number */
146 	void	*b_saveaddr;		/* Original b_addr for physio. */
147 	void	*b_driver1;		/* for private use by the driver */
148 	void	*b_caller1;		/* for private use by the caller */
149 	union	pager_info {
150 		void	*pg_spc;
151 		int	pg_reqpage;
152 	} b_pager;
153 	union	cluster_info {
154 		TAILQ_HEAD(cluster_list_head, buf) cluster_head;
155 		TAILQ_ENTRY(buf) cluster_entry;
156 	} b_cluster;
157 	struct	xio b_xio;  	/* page list management for buffer head. */
158 	struct	workhead b_dep;		/* List of filesystem dependencies. */
159 	struct chain_info {		/* buffer chaining */
160 		struct buf *parent;
161 		int count;
162 	} b_chain;
163 };
164 
165 #define b_spc	b_pager.pg_spc
166 
167 /*
168  * These flags are kept in b_flags.
169  *
170  * Notes:
171  *
172  *	B_ASYNC		VOP calls on bp's are usually async whether or not
173  *			B_ASYNC is set, but some subsystems, such as NFS, like
174  *			to know what is best for the caller so they can
175  *			optimize the I/O.
176  *
 *	B_PAGING	Indicates that bp is being used by the paging system
 *			(or by some other pager) and that the bp is not linked into
179  *			the b_vp's clean/dirty linked lists or ref counts.
180  *			Buffer vp reassignments are illegal in this case.
181  *
182  *	B_CACHE		This may only be set if the buffer is entirely valid.
183  *			The situation where B_DELWRI is set and B_CACHE is
184  *			clear MUST be committed to disk by getblk() so
185  *			B_DELWRI can also be cleared.  See the comments for
186  *			getblk() in kern/vfs_bio.c.  If B_CACHE is clear,
187  *			the caller is expected to clear B_ERROR|B_INVAL,
188  *			set B_READ, and initiate an I/O.
189  *
190  *			The 'entire buffer' is defined to be the range from
191  *			0 through b_bcount.
192  *
193  *	B_MALLOC	Request that the buffer be allocated from the malloc
194  *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
195  *
196  *	B_CLUSTEROK	This flag is typically set for B_DELWRI buffers
197  *			by filesystems that allow clustering when the buffer
198  *			is fully dirty and indicates that it may be clustered
199  *			with other adjacent dirty buffers.  Note the clustering
200  *			may not be used with the stage 1 data write under NFS
201  *			but may be used for the commit rpc portion.
202  *
 *	B_VMIO		Indicates that the buffer is tied into a VM object.
204  *			The buffer's data is always PAGE_SIZE aligned even
205  *			if b_bufsize and b_bcount are not.  ( b_bufsize is
206  *			always at least DEV_BSIZE aligned, though ).
207  *
208  *	B_DIRECT	Hint that we should attempt to completely free
209  *			the pages underlying the buffer.   B_DIRECT is
210  *			sticky until the buffer is released and typically
211  *			only has an effect when B_RELBUF is also set.
212  *
213  *	B_NOWDRAIN	This flag should be set when a device (like VN)
214  *			does a turn-around VOP_WRITE from its strategy
215  *			routine.  This flag prevents bwrite() from blocking
216  *			in wdrain, avoiding a deadlock situation.
217  */
218 
/*
 * Bit values for b_flags.
 *
 * PRINT_BUF_FLAGS below carries one label per bit and must be kept in
 * step with this table whenever a flag is added, removed or renamed.
 */
#define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
#define	B_NEEDCOMMIT	0x00000002	/* Append-write in progress. */
#define	B_ASYNC		0x00000004	/* Start I/O, do not wait. */
#define	B_DIRECT	0x00000008	/* direct I/O flag (pls free vmio) */
#define	B_DEFERRED	0x00000010	/* Skipped over for cleaning */
#define	B_CACHE		0x00000020	/* Bread found us in the cache. */
#define	B_CALL		0x00000040	/* Call b_iodone from biodone. */
#define	B_DELWRI	0x00000080	/* Delay I/O until buffer reused. */
#define	B_FREEBUF	0x00000100	/* Instruct driver: free blocks */
#define	B_DONE		0x00000200	/* I/O completed. */
#define	B_EINTR		0x00000400	/* I/O was interrupted */
#define	B_ERROR		0x00000800	/* I/O error occurred. */
#define	B_UNUSED1000	0x00001000
#define	B_INVAL		0x00002000	/* Does not contain valid info. */
#define	B_LOCKED	0x00004000	/* Locked in core (not reusable). */
#define	B_NOCACHE	0x00008000	/* Do not cache block after use. */
#define	B_MALLOC	0x00010000	/* malloced b_data */
#define	B_CLUSTEROK	0x00020000	/* May be clustered with adjacent bufs. */
#define	B_PHYS		0x00040000	/* I/O to user memory. */
#define	B_RAW		0x00080000	/* Set by physio for raw transfers. */
#define	B_READ		0x00100000	/* Read buffer. */
#define	B_DIRTY		0x00200000	/* Needs writing later. */
#define	B_RELBUF	0x00400000	/* Release VMIO buffer. */
#define	B_WANT		0x00800000	/* Used by vm_pager.c */
#define	B_WRITE		0x00000000	/* Write buffer (pseudo flag). */
#define	B_UNUSED1000000	0x01000000
#define	B_XXX		0x02000000	/* Debugging flag. */
#define	B_PAGING	0x04000000	/* volatile paging I/O -- bypass VMIO */
#define	B_ORDERED	0x08000000	/* Must guarantee I/O ordering */
#define B_RAM		0x10000000	/* Read ahead mark (flag) */
#define B_VMIO		0x20000000	/* VMIO flag */
#define B_CLUSTER	0x40000000	/* pagein op, so swap() can count it */
#define B_NOWDRAIN	0x80000000	/* Avoid wdrain deadlock */

/*
 * Bit-name table for printing b_flags with the kernel printf "%b"
 * conversion.  The first byte is the output base (\20 == 16); every
 * following entry is a 1-based bit number (in octal) followed by the
 * label for that bit.
 *
 * Bit 5 (B_DEFERRED) now has a label, and the two unused bits are
 * labelled after their B_UNUSED* names rather than after flags that
 * are no longer defined.
 */
#define PRINT_BUF_FLAGS "\20\40nowdrain\37cluster\36vmio\35ram\34ordered" \
	"\33paging\32xxx\31unused1000000\30want\27relbuf\26dirty" \
	"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
	"\17locked\16inval\15unused1000\14error\13eintr\12done" \
	"\11freebuf\10delwri\7call\6cache\5deferred\4direct\3async" \
	"\2needcommit\1age"
258 
/*
 * Bit values for b_xflags.  b_xflags is an unsigned char, so every
 * value here fits in eight bits.
 */
#define	BX_VNDIRTY		0x01	/* on the vnode dirty list */
#define	BX_VNCLEAN		0x02	/* on the vnode clean list */
#define	BX_BKGRDWRITE		0x04	/* do writes in the background */
#define	BX_BKGRDINPROG		0x08	/* background write in progress */
#define	BX_BKGRDWAIT		0x10	/* background write waiting */
#define	BX_AUTOCHAINDONE	0x20	/* pager I/O chain auto mode */

/* Sentinel: no buffer offset has been calculated yet. */
#define	NOOFFSET	(-1LL)
270 
#if defined(_KERNEL)
/*
 * Buffer locking support.  The inline lock functions themselves are
 * in sys/buf2.h.
 */
#define BUF_WMESG "bufwait"

/* Default buffer lock message. */
extern char *buf_wmesg;

/* Interlock on setting prio and timo. */
extern struct lwkt_token buftimetoken;

#endif /* _KERNEL */
280 
281 struct buf_queue_head {
282 	TAILQ_HEAD(buf_queue, buf) queue;
283 	daddr_t	last_pblkno;
284 	struct	buf *insert_point;
285 	struct	buf *switch_point;
286 };
287 
/*
 * Bookkeeping for one clustered I/O.
 *
 * The structure is stored in the b_saveaddr field of the buffer the
 * I/O is issued on.  When the I/O completes, the cluster callback uses
 * it to parcel the result out to the individual buffers and then frees
 * the structure.
 */
struct cluster_save {
	long		  bs_bcount;	/* saved b_bcount */
	long		  bs_bufsize;	/* saved b_bufsize */
	void		 *bs_saveaddr;	/* saved buffer address */
	int		  bs_nchildren;	/* number of associated buffers */
	struct buf	**bs_children;	/* list of associated buffers */
};
301 
/*
 * Buffer free lists.  QUEUE_* values are indices in the range
 * 0 .. BUFFER_QUEUES - 1.
 */
#define BUFFER_QUEUES	6	/* number of free buffer queues */

#define QUEUE_NONE	0	/* not on any queue */
#define QUEUE_LOCKED	1	/* locked buffers */
#define QUEUE_CLEAN	2	/* buffers without B_DELWRI */
#define QUEUE_DIRTY	3	/* buffers with B_DELWRI */
#define QUEUE_EMPTYKVA	4	/* empty headers that have KVA assigned */
#define QUEUE_EMPTY	5	/* empty headers */
313 
/*
 * clrbuf(bp): zero the first b_bcount bytes of the buffer's data area
 * and reset b_resid to 0.
 *
 * bp is expanded more than once, so it must not have side effects.
 * The body is wrapped in do { } while (0) so the macro expands to a
 * single statement: "if (x) clrbuf(bp); else ..." now parses as
 * intended instead of failing on the stray ';' after a bare block.
 */
#define	clrbuf(bp) do {							\
	bzero((bp)->b_data, (u_int)(bp)->b_bcount);			\
	(bp)->b_resid = 0;						\
} while (0)
321 
/*
 * Flags passed to the low-level bitmap allocation routines (balloc).
 *
 * The sequential heuristic count occupies the B_SEQMASK bits;
 * sequential_heuristic() in kern/vfs_vnops.c limits that count to 127
 * (B_SEQMAX).
 */
#define B_SEQMAX	0x7F
#define B_SEQSHIFT	24		/* Sequential heuristic shift. */
#define B_SEQMASK	(B_SEQMAX << B_SEQSHIFT) /* Sequential heuristic mask. */

#define B_CLRBUF	0x01		/* Cleared invalid areas of buffer. */
#define B_SYNC		0x02		/* Do all allocations synchronously. */
333 
334 #ifdef _KERNEL
/* Sizing and limits. */
extern int	nbuf;			/* number of buffer headers */
extern int	nswbuf;			/* number of swap I/O buffer headers */
extern int	bufpages;		/* memory pages in the buffer pool */
extern int	maxswzone;		/* max KVA for swap structures */
extern int	maxbcache;		/* max KVA for buffer cache */
extern int	buf_maxio;		/* nominal maximum I/O for buffer */
extern int	runningbufspace;

/* Storage. */
extern struct buf *buf;			/* the buffer headers */
extern struct buf *swbuf;		/* swap I/O buffer headers */
extern char	*buffers;		/* the buffer contents */

/* Queues. */
extern TAILQ_HEAD(swqueue, buf) bswlist;
extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
347 
348 struct uio;
349 
350 caddr_t bufhashinit (caddr_t);
351 void	bufinit (void);
352 void	bwillwrite (void);
353 int	buf_dirty_count_severe (void);
354 void	bremfree (struct buf *);
355 int	bread (struct vnode *, daddr_t, int, struct buf **);
356 int	breadn (struct vnode *, daddr_t, int, daddr_t *, int *, int,
357 	    struct buf **);
358 int	bwrite (struct buf *);
359 void	bdwrite (struct buf *);
360 void	bawrite (struct buf *);
361 void	bdirty (struct buf *);
362 void	bundirty (struct buf *);
363 int	bowrite (struct buf *);
364 void	brelse (struct buf *);
365 void	bqrelse (struct buf *);
366 int	vfs_bio_awrite (struct buf *);
367 struct buf *     getpbuf (int *);
368 struct buf *incore (struct vnode *, daddr_t);
369 struct buf *gbincore (struct vnode *, daddr_t);
370 int	inmem (struct vnode *, daddr_t);
371 struct buf *getblk (struct vnode *, daddr_t, int, int, int);
372 struct buf *geteblk (int);
373 int	biowait (struct buf *);
374 void	biodone (struct buf *);
375 
376 void	cluster_callback (struct buf *);
377 int	cluster_read (struct vnode *, u_quad_t, daddr_t, long,
378 	    long, int, struct buf **);
379 int	cluster_wbuild (struct vnode *, long, daddr_t, int);
380 void	cluster_write (struct buf *, u_quad_t, int);
381 int	physio (dev_t dev, struct uio *uio, int ioflag);
382 #define physread physio
383 #define physwrite physio
384 void	vfs_bio_set_validclean (struct buf *, int base, int size);
385 void	vfs_bio_clrbuf (struct buf *);
386 void	vfs_busy_pages (struct buf *, int clear_modify);
387 void	vfs_unbusy_pages (struct buf *);
388 void	vwakeup (struct buf *);
389 int	vmapbuf (struct buf *);
390 void	vunmapbuf (struct buf *);
391 void	relpbuf (struct buf *, int *);
392 void	brelvp (struct buf *);
393 void	bgetvp (struct vnode *, struct buf *);
394 void	pbgetvp (struct vnode *, struct buf *);
395 void	pbrelvp (struct buf *);
396 int	allocbuf (struct buf *bp, int size);
397 void	reassignbuf (struct buf *, struct vnode *);
398 void	pbreassignbuf (struct buf *, struct vnode *);
399 struct	buf *trypbuf (int *);
400 
401 #endif /* _KERNEL */
402 
403 #endif /* !_SYS_BUF_H_ */
404