1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/conf.h>
33 #include <sys/fssnap_if.h>
34 #include <sys/fs/ufs_inode.h>
35 #include <sys/fs/ufs_lockfs.h>
36 #include <sys/fs/ufs_log.h>
37 #include <sys/fs/ufs_trans.h>
38 #include <sys/cmn_err.h>
39 #include <vm/pvn.h>
40 #include <vm/seg_map.h>
41 #include <sys/fdbuffer.h>
42 
/*
 * DEBUGF - conditional debug output.
 *
 * The argument is a complete, parenthesized cmn_err() argument list, e.g.
 *	DEBUGF((CE_CONT, "?fmt %d\n", val));
 * On DEBUG kernels the message is emitted only when evn_ufs_debug is
 * non-zero; on non-DEBUG kernels the macro expands to nothing.
 *
 * The DEBUG variant is wrapped in do { } while (0) so that the macro plus
 * its trailing semicolon forms exactly one statement and can be used safely
 * inside an unbraced if/else.
 */
#ifdef DEBUG
int evn_ufs_debug = 0;
#define	DEBUGF(args)	do { if (evn_ufs_debug) cmn_err args; } while (0)
#else
#define	DEBUGF(args)
#endif
49 
50 /*
51  * ufs_rdwr_data - supports reading or writing data when
52  * no changes are permitted in file size or space allocation.
53  *
54  * Inputs:
55  * fdb - The mandatory fdbuffer supports
56  *	the read or write operation.
57  * flags - defaults (zero value) to synchronous write
58  *	B_READ - indicates read operation
59  *	B_ASYNC - indicates perform operation asynchronously
60  */
61 /*ARGSUSED*/
62 int
63 ufs_rdwr_data(
64 	vnode_t		*vnodep,
65 	u_offset_t	offset,
66 	size_t		len,
67 	fdbuffer_t	*fdbp,
68 	int		flags,
69 	cred_t		*credp)
70 {
71 	struct inode	*ip = VTOI(vnodep);
72 	struct fs	*fs;
73 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
74 	struct buf	*bp;
75 	krw_t		rwtype = RW_READER;
76 	u_offset_t	offset1 = offset;	/* Initial offset */
77 	size_t		iolen;
78 	int		curlen = 0;
79 	int		pplen;
80 	daddr_t		bn;
81 	int		contig = 0;
82 	int		error = 0;
83 	int		nbytes;			/* Number bytes this IO */
84 	int		offsetn;		/* Start point this IO */
85 	int		iswrite = flags & B_WRITE;
86 	int		io_started = 0;		/* No IO started */
87 	struct ulockfs	*ulp;
88 	uint_t		protp = PROT_ALL;
89 
90 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, !iswrite,
91 	    &protp);
92 	if (error) {
93 		if (flags & B_ASYNC) {
94 			fdb_ioerrdone(fdbp, error);
95 		}
96 		return (error);
97 	}
98 	fs = ufsvfsp->vfs_fs;
99 	iolen = len;
100 
101 	DEBUGF((CE_CONT, "?ufs_rdwr: %s vp: %p pages:%p  off %llx len %lx"
102 	    " isize: %llx fdb: %p\n",
103 	    flags & B_READ ? "READ" : "WRITE", (void *)vnodep,
104 	    (void *)vnodep->v_pages, offset1, iolen, ip->i_size, (void *)fdbp));
105 
106 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
107 	rw_enter(&ip->i_contents, rwtype);
108 
109 	ASSERT(offset1 < ip->i_size);
110 
111 	if ((offset1 + iolen) > ip->i_size) {
112 		iolen = ip->i_size - offset1;
113 	}
114 	while (!error && curlen < iolen) {
115 
116 		contig = 0;
117 
118 		if ((error = bmap_read(ip, offset1, &bn, &contig)) != 0) {
119 			break;
120 		}
121 		ASSERT(!(bn == UFS_HOLE && iswrite));
122 		if (bn == UFS_HOLE) {
123 			/*
124 			 * If the above assertion is true,
125 			 * then the following if statement can never be true.
126 			 */
127 			if (iswrite && (rwtype == RW_READER)) {
128 				rwtype = RW_WRITER;
129 				if (!rw_tryupgrade(&ip->i_contents)) {
130 					rw_exit(&ip->i_contents);
131 					rw_enter(&ip->i_contents, rwtype);
132 					continue;
133 				}
134 			}
135 			offsetn = blkoff(fs, offset1);
136 			pplen = P2ROUNDUP(len, PAGESIZE);
137 			nbytes = MIN((pplen - curlen),
138 			    (fs->fs_bsize - offsetn));
139 			ASSERT(nbytes > 0);
140 
141 			/*
142 			 * We may be reading or writing.
143 			 */
144 			DEBUGF((CE_CONT, "?ufs_rdwr_data: hole %llx - %lx\n",
145 			    offset1, (iolen - curlen)));
146 
147 			if (iswrite) {
148 				printf("**WARNING: ignoring hole in write\n");
149 				error = ENOSPC;
150 			} else {
151 				fdb_add_hole(fdbp, offset1 - offset, nbytes);
152 			}
153 			offset1 += nbytes;
154 			curlen += nbytes;
155 			continue;
156 
157 		}
158 		ASSERT(contig > 0);
159 		pplen = P2ROUNDUP(len, PAGESIZE);
160 
161 		contig = MIN(contig, len - curlen);
162 		contig = P2ROUNDUP(contig, DEV_BSIZE);
163 
164 		bp = fdb_iosetup(fdbp, offset1 - offset, contig, vnodep, flags);
165 
166 		bp->b_edev = ip->i_dev;
167 		bp->b_dev = cmpdev(ip->i_dev);
168 		bp->b_blkno = bn;
169 		bp->b_file = ip->i_vnode;
170 		bp->b_offset = (offset_t)offset1;
171 
172 		if (ufsvfsp->vfs_snapshot) {
173 			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
174 		} else {
175 			(void) bdev_strategy(bp);
176 		}
177 		io_started = 1;
178 
179 		offset1 += contig;
180 		curlen += contig;
181 		if (iswrite)
182 			lwp_stat_update(LWP_STAT_OUBLK, 1);
183 		else
184 			lwp_stat_update(LWP_STAT_INBLK, 1);
185 
186 		if ((flags & B_ASYNC) == 0) {
187 			error = biowait(bp);
188 			fdb_iodone(bp);
189 		}
190 
191 		DEBUGF((CE_CONT, "?loop ufs_rdwr_data.. off %llx len %lx\n",
192 		    offset1, (iolen - curlen)));
193 	}
194 
195 	DEBUGF((CE_CONT, "?ufs_rdwr_data: off %llx len %lx pages: %p ------\n",
196 	    offset1, (iolen - curlen), (void *)vnodep->v_pages));
197 
198 	rw_exit(&ip->i_contents);
199 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
200 
201 	if (flags & B_ASYNC) {
202 		/*
203 		 * Show that no more asynchronous IO will be added
204 		 */
205 		fdb_ioerrdone(fdbp, error);
206 	}
207 	if (ulp) {
208 		ufs_lockfs_end(ulp);
209 	}
210 	if (io_started && flags & B_ASYNC) {
211 		return (0);
212 	} else {
213 		return (error);
214 	}
215 }
216 
217 /*
218  * ufs_alloc_data - supports allocating space and reads or writes
219  * that involve changes to file length or space allocation.
220  *
221  * This function is more expensive, because of the UFS log transaction,
222  * so ufs_rdwr_data() should be used when space or file length changes
223  * will not occur.
224  *
225  * Inputs:
226  * fdb - A null pointer instructs this function to only allocate
227  *	space for the specified offset and length.
228  *	An actual fdbuffer instructs this function to perform
229  *	the read or write operation.
230  * flags - defaults (zero value) to synchronous write
231  *	B_READ - indicates read operation
232  *	B_ASYNC - indicates perform operation asynchronously
233  */
234 int
235 ufs_alloc_data(
236 	vnode_t		*vnodep,
237 	u_offset_t	offset,
238 	size_t		*len,
239 	fdbuffer_t	*fdbp,
240 	int		flags,
241 	cred_t		*credp)
242 {
243 	struct inode	*ip = VTOI(vnodep);
244 	size_t		done_len, io_len;
245 	int		contig;
246 	u_offset_t	uoff, io_off;
247 	int		error = 0;		/* No error occured */
248 	int		offsetn;		/* Start point this IO */
249 	int		nbytes;			/* Number bytes in this IO */
250 	daddr_t		bn;
251 	struct fs	*fs;
252 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
253 	int		i_size_changed = 0;
254 	u_offset_t	old_i_size;
255 	struct ulockfs	*ulp;
256 	int		trans_size;
257 	int		issync;			/* UFS Log transaction */
258 						/* synchronous when non-zero */
259 
260 	int		io_started = 0;		/* No IO started */
261 	uint_t		protp = PROT_ALL;
262 
263 	ASSERT((flags & B_WRITE) == 0);
264 
265 	/*
266 	 * Obey the lockfs protocol
267 	 */
268 	error = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, segkmap, 0, &protp);
269 	if (error) {
270 		if ((fdbp != NULL) && (flags & B_ASYNC)) {
271 			fdb_ioerrdone(fdbp, error);
272 		}
273 		return (error);
274 	}
275 	if (ulp) {
276 		/*
277 		 * Try to begin a UFS log transaction
278 		 */
279 		trans_size = TOP_GETPAGE_SIZE(ip);
280 		TRANS_TRY_BEGIN_CSYNC(ufsvfsp, issync, TOP_GETPAGE,
281 		    trans_size, error);
282 		if (error == EWOULDBLOCK) {
283 			ufs_lockfs_end(ulp);
284 			if ((fdbp != NULL) && (flags & B_ASYNC)) {
285 				fdb_ioerrdone(fdbp, EDEADLK);
286 			}
287 			return (EDEADLK);
288 		}
289 	}
290 
291 	uoff = offset;
292 	io_off = offset;
293 	io_len = *len;
294 	done_len = 0;
295 
296 	DEBUGF((CE_CONT, "?ufs_alloc: off %llx len %lx size %llx fdb: %p\n",
297 	    uoff, (io_len - done_len), ip->i_size, (void *)fdbp));
298 
299 	rw_enter(&ip->i_ufsvfs->vfs_dqrwlock, RW_READER);
300 	rw_enter(&ip->i_contents, RW_WRITER);
301 
302 	ASSERT((ip->i_mode & IFMT) == IFREG);
303 
304 	fs = ip->i_fs;
305 
306 	while (error == 0 && done_len < io_len) {
307 		uoff = (u_offset_t)(io_off + done_len);
308 		offsetn = (int)blkoff(fs, uoff);
309 		nbytes = (int)MIN(fs->fs_bsize - offsetn, io_len - done_len);
310 
311 		DEBUGF((CE_CONT, "?ufs_alloc_data: offset: %llx len %x\n",
312 		    uoff, nbytes));
313 
314 		if (uoff + nbytes > ip->i_size) {
315 			/*
316 			 * We are extending the length of the file.
317 			 * bmap is used so that we are sure that
318 			 * if we need to allocate new blocks, that it
319 			 * is done here before we up the file size.
320 			 */
321 			DEBUGF((CE_CONT, "?ufs_alloc_data: grow %llx -> %llx\n",
322 			    ip->i_size, uoff + nbytes));
323 
324 			error = bmap_write(ip, uoff, (offsetn + nbytes),
325 			    BI_ALLOC_ONLY, NULL, credp);
326 			if (ip->i_flag & (ICHG|IUPD))
327 				ip->i_seq++;
328 			if (error) {
329 				DEBUGF((CE_CONT, "?ufs_alloc_data: grow "
330 				    "failed err: %d\n", error));
331 				break;
332 			}
333 			if (fdbp != NULL) {
334 				if (uoff >= ip->i_size) {
335 					/*
336 					 * Desired offset is past end of bytes
337 					 * in file, so we have a hole.
338 					 */
339 					fdb_add_hole(fdbp, uoff - offset,
340 					    nbytes);
341 				} else {
342 					int contig;
343 					buf_t *bp;
344 
345 					error = bmap_read(ip, uoff, &bn,
346 					    &contig);
347 					if (error) {
348 						break;
349 					}
350 
351 					contig = ip->i_size - uoff;
352 					contig = P2ROUNDUP(contig, DEV_BSIZE);
353 
354 					bp = fdb_iosetup(fdbp, uoff - offset,
355 					    contig, vnodep, flags);
356 
357 					bp->b_edev = ip->i_dev;
358 					bp->b_dev = cmpdev(ip->i_dev);
359 					bp->b_blkno = bn;
360 					bp->b_file = ip->i_vnode;
361 					bp->b_offset = (offset_t)uoff;
362 
363 					if (ufsvfsp->vfs_snapshot) {
364 						fssnap_strategy(
365 						    &ufsvfsp->vfs_snapshot, bp);
366 					} else {
367 						(void) bdev_strategy(bp);
368 					}
369 					io_started = 1;
370 
371 					lwp_stat_update(LWP_STAT_OUBLK, 1);
372 
373 					if ((flags & B_ASYNC) == 0) {
374 						error = biowait(bp);
375 						fdb_iodone(bp);
376 						if (error) {
377 							break;
378 						}
379 					}
380 					if (contig > (ip->i_size - uoff)) {
381 						contig -= ip->i_size - uoff;
382 
383 						fdb_add_hole(fdbp,
384 						    ip->i_size - offset,
385 						    contig);
386 					}
387 				}
388 			}
389 
390 			i_size_changed = 1;
391 			old_i_size = ip->i_size;
392 			UFS_SET_ISIZE(uoff + nbytes, ip);
393 			TRANS_INODE(ip->i_ufsvfs, ip);
394 			/*
395 			 * file has grown larger than 2GB. Set flag
396 			 * in superblock to indicate this, if it
397 			 * is not already set.
398 			 */
399 			if ((ip->i_size > MAXOFF32_T) &&
400 			    !(fs->fs_flags & FSLARGEFILES)) {
401 				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
402 				mutex_enter(&ufsvfsp->vfs_lock);
403 				fs->fs_flags |= FSLARGEFILES;
404 				ufs_sbwrite(ufsvfsp);
405 				mutex_exit(&ufsvfsp->vfs_lock);
406 			}
407 		} else {
408 			/*
409 			 * The file length is not being extended.
410 			 */
411 			error = bmap_read(ip, uoff, &bn, &contig);
412 			if (error) {
413 				DEBUGF((CE_CONT, "?ufs_alloc_data: "
414 				    "bmap_read err: %d\n", error));
415 				break;
416 			}
417 
418 			if (bn != UFS_HOLE) {
419 				/*
420 				 * Did not map a hole in the file
421 				 */
422 				int	contig = P2ROUNDUP(nbytes, DEV_BSIZE);
423 				buf_t	*bp;
424 
425 				if (fdbp != NULL) {
426 					bp = fdb_iosetup(fdbp, uoff - offset,
427 					    contig, vnodep, flags);
428 
429 					bp->b_edev = ip->i_dev;
430 					bp->b_dev = cmpdev(ip->i_dev);
431 					bp->b_blkno = bn;
432 					bp->b_file = ip->i_vnode;
433 					bp->b_offset = (offset_t)uoff;
434 
435 					if (ufsvfsp->vfs_snapshot) {
436 						fssnap_strategy(
437 						    &ufsvfsp->vfs_snapshot, bp);
438 					} else {
439 						(void) bdev_strategy(bp);
440 					}
441 					io_started = 1;
442 
443 					lwp_stat_update(LWP_STAT_OUBLK, 1);
444 
445 					if ((flags & B_ASYNC) == 0) {
446 						error = biowait(bp);
447 						fdb_iodone(bp);
448 						if (error) {
449 							break;
450 						}
451 					}
452 				}
453 			} else {
454 				/*
455 				 * We read a hole in the file.
456 				 * We have to allocate blocks for the hole.
457 				 */
458 				error = bmap_write(ip, uoff, (offsetn + nbytes),
459 				    BI_ALLOC_ONLY, NULL, credp);
460 				if (ip->i_flag & (ICHG|IUPD))
461 					ip->i_seq++;
462 				if (error) {
463 					DEBUGF((CE_CONT, "?ufs_alloc_data: fill"
464 					    " hole failed error: %d\n", error));
465 					break;
466 				}
467 				if (fdbp != NULL) {
468 					fdb_add_hole(fdbp, uoff - offset,
469 					    nbytes);
470 				}
471 			}
472 		}
473 		done_len += nbytes;
474 	}
475 
476 	if (error) {
477 		if (i_size_changed) {
478 			/*
479 			 * Allocation of the blocks for the file failed.
480 			 * So truncate the file size back to its original size.
481 			 */
482 			(void) ufs_itrunc(ip, old_i_size, 0, credp);
483 		}
484 	}
485 
486 	DEBUGF((CE_CONT, "?ufs_alloc: uoff %llx len %lx\n",
487 	    uoff, (io_len - done_len)));
488 
489 	if ((offset + *len) < (NDADDR * fs->fs_bsize)) {
490 		*len = (size_t)(roundup(offset + *len, fs->fs_fsize) - offset);
491 	} else {
492 		*len = (size_t)(roundup(offset + *len, fs->fs_bsize) - offset);
493 	}
494 
495 	/*
496 	 * Flush cached pages.
497 	 *
498 	 * XXX - There should be no pages involved, since the I/O was performed
499 	 * through the device strategy routine and the page cache was bypassed.
500 	 * However, testing has demonstrated that this VOP_PUTPAGE is
501 	 * necessary. Without this, data might not always be read back as it
502 	 * was written.
503 	 *
504 	 */
505 	(void) VOP_PUTPAGE(vnodep, 0, 0, B_INVAL, credp);
506 
507 	rw_exit(&ip->i_contents);
508 	rw_exit(&ip->i_ufsvfs->vfs_dqrwlock);
509 
510 	if ((fdbp != NULL) && (flags & B_ASYNC)) {
511 		/*
512 		 * Show that no more asynchronous IO will be added
513 		 */
514 		fdb_ioerrdone(fdbp, error);
515 	}
516 	if (ulp) {
517 		/*
518 		 * End the UFS Log transaction
519 		 */
520 		TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_GETPAGE,
521 		    trans_size);
522 		ufs_lockfs_end(ulp);
523 	}
524 	if (io_started && (flags & B_ASYNC)) {
525 		return (0);
526 	} else {
527 		return (error);
528 	}
529 }
530