/*-
 * Copyright (c) 1996, 2020 Oracle and/or its affiliates.  All rights reserved.
 *
 * See the file LICENSE for license information.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

/*
 * This configuration parameter limits the number of hash buckets which
 * __memp_alloc() searches through while excluding buffers with a 'high'
 * priority.
 */
#if !defined(MPOOL_ALLOC_SEARCH_LIMIT)
#define	MPOOL_ALLOC_SEARCH_LIMIT	500
#endif
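
/*
 * The default above can be overridden when building, since the definition is
 * guarded by the #if !defined() test.  An illustrative (not project-specific)
 * invocation would be:
 *
 *	cc -DMPOOL_ALLOC_SEARCH_LIMIT=1000 ...
 */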

/*
 * __memp_bh_unreachable --
 *
 *	Determine whether this buffer cannot ever be seen again: is the next
 *	newer version visible to the same transaction which sees this one?
 *	If both versions are visible to the same transaction, there is no
 *	reason to keep the older one: it can be purged.
 *
 *	If this buffer has a more recent version, and there is a transaction
 *	with a read_lsn between this buffer's and that more recent version's,
 *	the buffer is visible to at least that transaction, so return FALSE.
 *	Otherwise return TRUE.
 *
 *	txns:	   3/10		       2/10	   2/5 2/1          1/10
 *	vers: 3/15	 2/15  2/14    2/10   2/8	     1/150
 *	      vis	 vis  unreach   vis  unreach	     vis
 *	who  new txns	 3/10	       2/10		    2/5, 2/1
 *	sees
 *
 *	Note: in the above example, the page was allocated after txn 1/10
 *	started. 1/10 would not see any version of the page.
 *
 * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int));
 */
int
__memp_bh_unreachable(env, bhp, snapshots, n_snapshots)
	ENV *env;
	BH *bhp;
	DB_LSN *snapshots;
	int n_snapshots;
{
	BH *newer_bhp;
	DB_LSN b_vlsn, n_vlsn;
	int i, ret;
#ifdef DIAGNOSTIC
	DB_MPOOL *dbmp;
	DB_MSGBUF mb;
	MPOOLFILE *bh_mfp;
#endif

	/*
	 * The buffer can't be purged if it is being used, or is the most recent
	 * version, or the next newer version isn't a copy yet.
	 */
	if (BH_REFCOUNT(bhp) != 0 ||
	    (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL ||
	    newer_bhp->td_off == INVALID_ROFF)
		return (FALSE);

	/*
	 * Find the visibility LSNs for this buffer (b_vlsn) and the next
	 * newer buffer (n_vlsn).  If the newer version hasn't committed yet,
	 * the bhp could still be needed.
	 */
	n_vlsn = *VISIBLE_LSN(env, newer_bhp);
	if (IS_MAX_LSN(n_vlsn))
		return (FALSE);
	if (bhp->td_off == INVALID_ROFF)
		INIT_LSN(b_vlsn);
	else
		b_vlsn = *VISIBLE_LSN(env, bhp);

	ret = TRUE;
	/*
	 * Look for a transaction which is between n_vlsn and b_vlsn, which
	 * would mean that bhp is still reachable.  Stop looking once the
	 * transactions get so small (old) that they precede the buffer's
	 * version; no earlier txn could be between n_vlsn and b_vlsn.
	 */
	for (i = 0;
	     i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0;
	     i++) {
		if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) {
			/*
			 * This txn can see (started after) bhp, but not
			 * newer_bhp (which committed after this txn started).
			 */
			ret = FALSE;
			break;
		}
	}

#ifdef DIAGNOSTIC
	if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) {
		dbmp = env->mp_handle;
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
		DB_MSGBUF_INIT(&mb);
		__db_msgadd(env, &mb,
    "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n",
		    __memp_fns(dbmp, bh_mfp), bhp->pgno,
		    ret ? "purgeable" : "needed",
		    (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags,
		    (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i);
		for (i = 0; i != n_snapshots; i++)
			__db_msgadd(env, &mb, " %lu/%lu",
			    (u_long)snapshots[i].file,
			    (u_long)snapshots[i].offset);
		DB_MSGBUF_FLUSH(env, &mb);
	}
#endif
	return (ret);
}
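
/*
 * A minimal sketch (illustrative only, not compiled into the library) of the
 * reachability test above, restated with plain integer "LSNs" and the same
 * newest-first ordering of the snapshot array that the loop assumes: the
 * older version is still reachable when some snapshot's read point falls in
 * the half-open interval [older, newer).
 *
 *	static int
 *	version_unreachable(long older, long newer, const long *snaps, int n)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n && snaps[i] >= older; i++)
 *			if (snaps[i] < newer)
 *				return (0);
 *		return (1);
 *	}
 */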

/*
 * __memp_alloc --
 *	Allocate some space from a cache region. If the region is full then
 *	reuse one or more cache buffers.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
	DB_MPOOL *dbmp;
	REGINFO *infop;
	MPOOLFILE *mfp;
	size_t len;
	roff_t *offsetp;
	void *retp;
{
	BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
	BH_FROZEN_PAGE *frozen_bhp;
	DB_LSN *snapshots, vlsn;
	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *bh_mfp;
	size_t freed_space;
	u_int32_t buckets, bucket_priority, buffers, cache_reduction;
	u_int32_t dirty_eviction, high_priority, priority, versions;
	u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
	int aggressive, alloc_freeze, b_lock, giveup;
	int h_locked, need_free, n_snapshots, obsolete, ret, write_error;
	u_int8_t *endp;
	void *p;

	env = dbmp->env;
	c_mp = infop->primary;
	dbht = R_ADDR(infop, c_mp->htab);
	hp_end = &dbht[c_mp->htab_buckets];
	hp_saved = NULL;
	snapshots = NULL;
	priority_saved = write_error = 0;
	buckets = buffers = put_counter = total_buckets = versions = 0;
	aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0;

	/*
	 * If we're allocating a buffer, and the one we're discarding is the
	 * same size, we don't want to waste the time to re-integrate it into
	 * the shared memory free list.  If the MPOOLFILE argument isn't
	 * NULL, we'll compare the underlying page sizes of the two buffers
	 * before freeing and re-allocating buffers.
	 */
	if (mfp != NULL) {
		len = SSZA(BH, buf) + mfp->pagesize;
		/* Add space for alignment padding for MVCC diagnostics. */
		MVCC_BHSIZE(mfp, len);
	}

	STAT_INC(env, mpool, nallocs, c_mp->stat.st_alloc, len);

	MPOOL_REGION_LOCK(env, infop);

	/*
	 * First we try to allocate from free memory.  If that fails, scan the
	 * buffer pool to find buffers with low priorities.  We consider small
	 * sets of hash buckets each time to limit the amount of work needing
	 * to be done.  This approximates LRU, but not very well.  We either
	 * find a buffer of the same size to use, or we will free 3 times what
	 * we need in the hopes it will coalesce into a contiguous chunk of the
	 * right size.  In the latter case we branch back here and try again.
	 */
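	/*
	 * As a worked example (illustrative numbers only): with 4KB pages,
	 * len is roughly 4KB plus the buffer header, so unless a same-sized
	 * buffer is found and reused directly, buffers are freed until
	 * freed_space reaches 3 * len, about 12KB, before the allocation
	 * below is retried.
	 */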
alloc:	if ((ret = __env_alloc(infop, len, &p)) == 0) {
		if (mfp != NULL) {
			/*
			 * For MVCC diagnostics, align the pointer so that the
			 * buffer starts on a page boundary.
			 */
			MVCC_BHALIGN(p);
			bhp = (BH *)p;

			if ((ret = __mutex_alloc(env, MTX_MPOOL_BH,
			    DB_MUTEX_SHARED, &bhp->mtx_buf)) != 0) {
				MVCC_BHUNALIGN(bhp);
				__env_alloc_free(infop, bhp);
				goto search;
			}
			atomic_init(&bhp->ref, 0);
			c_mp->pages++;
		}
		MPOOL_REGION_UNLOCK(env, infop);
found:		if (offsetp != NULL)
			*offsetp = R_OFFSET(infop, p);
		*(void **)retp = p;

		/*
		 * Update the search statistics.
		 *
		 * We're not holding the region locked here, so these
		 * statistics can't be trusted.
		 */
#ifdef HAVE_STATISTICS
		total_buckets += buckets;
		if (total_buckets != 0) {
			if (total_buckets > c_mp->stat.st_alloc_max_buckets)
				STAT_SET(env, mpool, alloc_max_buckets,
				    c_mp->stat.st_alloc_max_buckets,
				    total_buckets, infop->id);
			STAT_ADJUST(env, mpool, alloc_buckets,
			    c_mp->stat.st_alloc_buckets,
			    total_buckets, infop->id);
		}
		if (buffers != 0) {
			if (buffers > c_mp->stat.st_alloc_max_pages)
				STAT_SET(env, mpool, alloc_max_pages,
				    c_mp->stat.st_alloc_max_pages,
				    buffers, infop->id);
			STAT_ADJUST(env, mpool, alloc_pages,
			    c_mp->stat.st_alloc_pages, buffers, infop->id);
		}
#endif
		goto done;
	} else if (giveup || c_mp->pages == 0) {
		MPOOL_REGION_UNLOCK(env, infop);

		__db_errx(env, DB_STR("3017",
		    "unable to allocate space from the buffer cache"));
		if (ret == ENOMEM && write_error != 0)
			ret = EIO;
		goto done;
	}

search:
	/*
	 * Anything newer than 1/10th of the buffer pool is ignored during the
	 * first MPOOL_ALLOC_SEARCH_LIMIT buckets' worth of allocation.
	 */
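	/*
	 * Illustrative arithmetic: with 1,000 pages in this cache region and
	 * a current lru_priority of 50,000, cache_reduction is 100 and the
	 * non-aggressive high_priority becomes 49,900, so roughly the 100
	 * most recently used buffers are passed over until the search turns
	 * aggressive.
	 */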
	cache_reduction = c_mp->pages / 10;
	high_priority = aggressive ? MPOOL_LRU_MAX :
	    c_mp->lru_priority - cache_reduction;
	lru_generation = c_mp->lru_generation;

	ret = 0;

	/*
	 * We re-attempt the allocation every time we've freed 3 times what
	 * we need.  Reset our free-space counter.
	 */
	freed_space = 0;
	total_buckets += buckets;
	buckets = 0;

	/*
	 * Walk the hash buckets and find the next two with potentially useful
	 * buffers.  Free the buffer with the lowest priority from the buckets'
	 * chains.
	 */
	for (;;) {
		/* All pages have been freed, make one last try. */
		if (c_mp->pages == 0)
			goto alloc;

		/* Check for wrap around. */
		hp = &dbht[c_mp->last_checked++];
		if (hp >= hp_end) {
			c_mp->last_checked = 0;
			hp = &dbht[c_mp->last_checked++];
		}

		/*
		 * The failure mode is when there are too many buffers we can't
		 * write or there's not enough memory in the system to support
		 * the number of pinned buffers.
		 *
		 * Get aggressive if we've reviewed the entire cache without
		 * freeing the needed space.  (The code resets "aggressive"
		 * when we free any space.)  Aggressive means:
		 *
		 * a: set a flag to attempt to flush high priority buffers as
		 *    well as other buffers.
		 * b: look at a buffer in every hash bucket rather than choose
		 *    the more preferable of two.
		 * c: start to think about giving up.
		 *
		 * If we get here three or more times, sync the mpool to force
		 * out queue extent pages.  While we might not have enough
		 * space for what we want and flushing is expensive, why not?
		 * Then sleep for a second, hopefully someone else will run and
		 * free up some memory.
		 *
		 * Always try to allocate memory too, in case some other thread
		 * returns its memory to the region.
		 *
		 * We don't have any way to know that an allocation can never
		 * succeed.  Fail if no pages are returned to the cache after
		 * we've been trying for a relatively long time.
		 *
		 * !!!
		 * This test ignores pathological cases like no buffers in the
		 * system -- we check for that early on, so it isn't possible.
		 */
		if (buckets++ == c_mp->htab_buckets) {
			if (freed_space > 0)
				goto alloc;
			MPOOL_REGION_UNLOCK(env, infop);

			/* Refresh the list of MVCC reader transactions. */
			if (snapshots != NULL)
				__os_free(env, snapshots);
			if ((ret = __txn_get_readers(
			    env, &snapshots, &n_snapshots)) != 0)
				goto err;

			aggressive++;
			/*
			 * Once aggressive, we consider all buffers.  By setting
			 * this to MPOOL_LRU_MAX, we'll still select a victim
			 * even if all buffers have the highest normal priority.
			 */
			high_priority = MPOOL_LRU_MAX;
			PERFMON4(env, mpool, alloc_wrap,
			    len, infop->id, aggressive, c_mp->put_counter);
			switch (aggressive) {
			case 1:
				break;
			case 2:
				put_counter = c_mp->put_counter;
				break;
			case 3:
			case 4:
			case 5:
			case 6:
				(void)__memp_sync_int(
				    env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);

				__os_yield(env, 1, 0);
				break;
			default:
				aggressive = 1;
				if (put_counter == c_mp->put_counter)
					giveup = 1;
				break;
			}

			MPOOL_REGION_LOCK(env, infop);
			goto alloc;
		}

		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking the hash
		 * bucket as we only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
			continue;

		/* Unlock the region and lock the hash bucket. */
		MPOOL_REGION_UNLOCK(env, infop);
		MUTEX_READLOCK(env, hp->mtx_hash);
		h_locked = 1;
		b_lock = 0;

		/*
		 * Set aggressive to consider all buffers if we have already
		 * searched in too many buckets.
		 */
		if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) {
			aggressive = 1;
			/* Once aggressive, we consider all buffers. */
			high_priority = MPOOL_LRU_MAX;
			if (snapshots == NULL && (ret = __txn_get_readers(
			    env, &snapshots, &n_snapshots)) != 0)
				goto err;
		}

		/*
		 * Find a buffer we can use.
		 * Skip over refcount > 0 buffers; we can't get rid of them.
		 *
		 * Without MVCC we use the lowest-LRU singleton buffer we find
		 * that's better than the result of another hash bucket we've
		 * reviewed.  We do not use a buffer which has a priority
		 * greater than high_priority unless we are being aggressive.
		 *
		 * MVCC requires looking at additional factors: we don't want to
		 * free a still-relevant buffer out of the middle of an MVCC
		 * chain, since that requires freezing, which means lots of
		 * I/O.  So, walk the buffers, looking for an obsolete buffer
		 * at the end of the MVCC chain.  Once a buffer becomes
		 * obsolete, its LRU priority is irrelevant because that
		 * version can never be accessed again.
		 *
		 * If we don't find any obsolete MVCC buffers, we will get
		 * aggressive, and in that case consider the lowest priority
		 * buffer within a chain.
		 */
retry_search:	bhp = NULL;
		bucket_priority = high_priority;
		obsolete = 0;
		if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1],
		    &hp->old_reader) > 0)
			hp->old_reader = snapshots[n_snapshots - 1];
		SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
			/*
			 * First, do the standard LRU check for singletons.
			 * We can use the buffer if it is unreferenced, has a
			 * priority that isn't too high (unless we are
			 * aggressive), and is better than the best candidate
			 * we have found so far in this bucket.
			 */
#ifdef MPOOL_ALLOC_SEARCH_DYN
			if (aggressive == 0 &&
			     ++high_priority >= c_mp->lru_priority)
				aggressive = 1;
#endif

			if (SH_CHAIN_SINGLETON(current_bhp, vc)) {
				if (BH_REFCOUNT(current_bhp) != 0)
					continue;
				buffers++;
				if (bucket_priority > current_bhp->priority) {
					bucket_priority = current_bhp->priority;
					if (bhp != NULL)
						(void)atomic_dec(env,
						    &bhp->ref);
					bhp = current_bhp;
					(void)atomic_inc(env, &bhp->ref);
				}
				continue;
			}

			/*
			 * For MVCC buffers, walk through the chain.  If we are
			 * aggressive, choose the best candidate from within
			 * the chain for freezing.
			 */
			for (mvcc_bhp = oldest_bhp = current_bhp;
			    mvcc_bhp != NULL;
			    oldest_bhp = mvcc_bhp,
			    mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
				DB_ASSERT(env, mvcc_bhp !=
				    SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
#ifdef MPOOL_ALLOC_SEARCH_DYN
				if (aggressive == 0 &&
				     ++high_priority >= c_mp->lru_priority) {
					aggressive = 1;
					if (snapshots == NULL && (ret =
					    __txn_get_readers(env,
					    &snapshots, &n_snapshots)) != 0)
						goto err;
				}
#endif
				if (n_snapshots > 0 &&
				    __memp_bh_unreachable(env,
				    mvcc_bhp, snapshots, n_snapshots)) {
					oldest_bhp = mvcc_bhp;
					goto is_obsolete;
				}
				if (bhp != NULL &&
				    mvcc_bhp->priority >= bhp->priority)
					continue;
				if (BH_REFCOUNT(mvcc_bhp) != 0)
					continue;
				/*
				 * Since taking still-relevant versions requires
				 * freezing, skip over them at low aggression
				 * levels unless we see that a high proportion
				 * of buffers (over 1/4) are MVCC copies.
				 */
				if (aggressive < 2 &&
				    ++versions < (buffers >> 2))
					continue;
				buffers++;
				if (F_ISSET(mvcc_bhp, BH_FROZEN))
					continue;
				/*
				 * Select mvcc_bhp as the current best
				 * candidate, releasing the previous candidate,
				 * if any.
				 */
				if (bhp != NULL)
					(void)atomic_dec(env, &bhp->ref);
				bhp = mvcc_bhp;
				(void)atomic_inc(env, &bhp->ref);
			}

			/*
			 * oldest_bhp is the last buffer on the MVCC chain, and
			 * an obsolete buffer at the end of the MVCC chain gets
			 * used without further search.
			 */
			if (BH_REFCOUNT(oldest_bhp) != 0)
				continue;

			if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
				if (aggressive < 2)
					buffers++;
is_obsolete:
				obsolete = 1;
				if (bhp != NULL)
					(void)atomic_dec(env, &bhp->ref);
				bhp = oldest_bhp;
				(void)atomic_inc(env, &bhp->ref);
				goto this_buffer;
			}
		}

		/*
		 * bhp is either NULL or the best candidate buffer.
		 * We'll use the chosen buffer only if we have compared its
		 * priority against one chosen from another hash bucket.
		 */
		if (bhp == NULL)
			goto next_hb;

		priority = bhp->priority;

		/*
		 * Compare two hash buckets and select the one with the lower
		 * priority, except for MVCC buffers at high aggression levels.
		 * Performance testing shows looking at two improves the
		 * LRU-ness and looking at more only does a little better.
		 */
		if (hp_saved == NULL) {
			/*
			 * At high aggression levels when MVCC is active, stop
			 * looking for a candidate once one has been found.
			 * Freezing takes more time than writing out to a db.
			 */
			if (aggressive > 1 && n_snapshots > 1)
				goto this_buffer;
			hp_saved = hp;
			priority_saved = priority;
			goto next_hb;
		}

		/*
		 * If the buffer we just found is a better choice than our
		 * previous choice, use it.
		 *
		 * If the previous choice was better, pretend we're moving
		 * from this hash bucket to the previous one and re-do the
		 * search.
		 *
		 * We don't worry about simply swapping between two buckets
		 * because that could only happen if a buffer was removed
		 * from the chain, or its priority updated.  If a buffer
		 * is removed from the chain, some other thread has managed
		 * to discard a buffer, so we're moving forward.  Updating
		 * a buffer's priority will make it a high-priority buffer,
		 * so we'll ignore it when we search again, and so we will
		 * eventually zero in on a buffer to use, or we'll decide
		 * there are no buffers we can use.
		 *
		 * If there's only a single hash bucket with buffers, we'll
		 * search the bucket once, choose a buffer, walk the entire
		 * list of buckets and search it again.  In the case of a
		 * system that's busy, it's possible to imagine a case where
		 * we'd loop for a long while.  For that reason, and because
		 * the test is easy, we special case and test for it.
		 */
		if (priority > priority_saved && hp != hp_saved) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			hp_tmp = hp_saved;
			hp_saved = hp;
			hp = hp_tmp;
			priority_saved = priority;
			MUTEX_READLOCK(env, hp->mtx_hash);
			h_locked = 1;
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			(void)atomic_dec(env, &bhp->ref);
			goto retry_search;
		}

		/*
		 * If another thread has called __memp_reset_lru() while we were
		 * looking for this buffer, it is possible that we've picked a
		 * poor choice for a victim.  If so, toss it and start over.
		 */
		if (lru_generation != c_mp->lru_generation) {
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			(void)atomic_dec(env, &bhp->ref);
			MUTEX_UNLOCK(env, hp->mtx_hash);
			MPOOL_REGION_LOCK(env, infop);
			hp_saved = NULL;
			goto search;
		}

this_buffer:	/*
		 * Discard any previously remembered hash bucket; we've got
		 * a winner.
		 */
		hp_saved = NULL;

		/* Drop the hash mutex and lock the buffer exclusively. */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;

		/* Don't bother trying to latch a busy buffer. */
		if (BH_REFCOUNT(bhp) > 1)
			goto next_hb;

		/* We cannot block as the caller is probably holding locks. */
		if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
			if (ret != DB_LOCK_NOTGRANTED)
				goto err;
			ret = 0;
			goto next_hb;
		}
		F_SET(bhp, BH_EXCLUSIVE);
		if (obsolete)
			F_SET(bhp, BH_UNREACHABLE);
		b_lock = 1;

		/* Someone may have grabbed it while we got the lock. */
		if (BH_REFCOUNT(bhp) != 1)
			goto next_hb;

		/* Find the associated MPOOLFILE. */
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/* If the page is dirty, write it. */
		ret = 0;
		dirty_eviction = 0;
		if (F_ISSET(bhp, BH_DIRTY)) {
			DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
			ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
			DB_ASSERT(env, atomic_read(&bhp->ref) > 0);

			/*
			 * If a write fails for any reason, we can't proceed.
			 *
			 * If there's a write error and we're having problems
			 * finding something to allocate, avoid selecting this
			 * buffer again by maximizing its priority.
			 */
			if (ret != 0) {
				if (ret != EPERM && ret != EAGAIN) {
					write_error++;
					__db_errx(env, DB_STR_A("3018",
		"%s: unwritable page %d remaining in the cache after error %d",
					    "%s %d %d"),
					    __memp_fns(dbmp, bh_mfp),
					    bhp->pgno, ret);
				}
				bhp->priority = MPOOL_LRU_REDZONE;

				goto next_hb;
			}

			dirty_eviction = 1;
		}

		/*
		 * Freeze this buffer, if necessary.  That is, if the buffer is
		 * part of an MVCC chain and could be required by a reader.
		 */
		if (SH_CHAIN_HASPREV(bhp, vc) ||
		    (SH_CHAIN_HASNEXT(bhp, vc) && !obsolete)) {
			if (!aggressive ||
			    F_ISSET(bhp, BH_DIRTY | BH_FROZEN))
				goto next_hb;
			ret = __memp_bh_freeze(
			    dbmp, infop, hp, bhp, &alloc_freeze);
			if (ret == EIO)
				write_error++;
			if (ret == EBUSY || ret == EIO ||
			    ret == ENOMEM || ret == ENOSPC) {
				ret = 0;
				goto next_hb;
			} else if (ret != 0) {
				DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
				(void)atomic_dec(env, &bhp->ref);
				DB_ASSERT(env, b_lock);
				F_CLR(bhp, BH_EXCLUSIVE);
				MUTEX_UNLOCK(env, bhp->mtx_buf);
				DB_ASSERT(env, !h_locked);
				goto err;
			}
		}

		MUTEX_LOCK(env, hp->mtx_hash);
		h_locked = 1;

		/*
		 * We released the hash bucket lock while doing I/O, so another
		 * thread may have acquired this buffer and incremented the ref
		 * count or dirtied the buffer or installed a new version after
		 * we wrote it, in which case we can't have it.
		 */
		if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
		    (SH_CHAIN_HASNEXT(bhp, vc) &&
		    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
		    !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) {
			if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
				__db_msg(env,
		    "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx",
				    (u_long)R_OFFSET(infop, bhp), bhp->flags,
				    BH_REFCOUNT(bhp),
			(u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)),
			(u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh)));
			goto next_hb;
		}

		/*
		 * If the buffer is frozen, thaw it and look for another one
		 * we can use.  (Calling __memp_bh_freeze above will not mark
		 * this bhp BH_FROZEN; it creates another frozen one.)
		 */
		if (F_ISSET(bhp, BH_FROZEN)) {
			DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) ||
			    obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn));
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			if (!F_ISSET(bhp, BH_THAWED)) {
				/*
				 * This call releases the hash bucket mutex.
				 * We're going to retry the search, so we need
				 * to re-lock it.
				 */
				if ((ret = __memp_bh_thaw(dbmp,
				    infop, hp, bhp, NULL)) != 0)
					goto done;
				MUTEX_READLOCK(env, hp->mtx_hash);
			} else {
				need_free = atomic_dec(env, &bhp->ref) == 0;
				F_CLR(bhp, BH_EXCLUSIVE);
				MUTEX_UNLOCK(env, bhp->mtx_buf);
				if (need_free) {
					MPOOL_REGION_LOCK(env, infop);
					SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
					    bhp, hq);
					MPOOL_REGION_UNLOCK(env, infop);
				}
			}
			bhp = NULL;
			b_lock = alloc_freeze = 0;
			goto retry_search;
		}

		/* We are certainly freeing this buf; now update statistics. */
		if (dirty_eviction)
			STAT_INC(env, mpool,
			    dirty_eviction, c_mp->stat.st_rw_evict, infop->id);
		else
			STAT_INC(env, mpool,
			    clean_eviction, c_mp->stat.st_ro_evict, infop->id);
		/*
		 * If we need some empty buffer headers for freezing, turn the
		 * buffer we've found into frozen headers and put them on the
		 * free list.  Only reset alloc_freeze if we've actually
		 * allocated some frozen buffer headers.
		 */
		if (alloc_freeze) {
			/* __memp_bhfree(..., 0) unlocks both hp & bhp. */
			h_locked = 0;
			b_lock = 0;
			if ((ret = __memp_bhfree(dbmp,
			     infop, bh_mfp, hp, bhp, 0)) != 0)
				goto err;
			DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID);
			if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
				goto err;

			MVCC_MPROTECT(bhp->buf, bh_mfp->pagesize,
			    PROT_READ | PROT_WRITE | PROT_EXEC);

			MPOOL_REGION_LOCK(env, infop);
			SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
			    (BH_FROZEN_ALLOC *)bhp, links);
			frozen_bhp = (BH_FROZEN_PAGE *)
			    ((BH_FROZEN_ALLOC *)bhp + 1);
			endp = (u_int8_t *)bhp->buf + bh_mfp->pagesize;
			while ((u_int8_t *)(frozen_bhp + 1) < endp) {
				frozen_bhp->header.mtx_buf = MUTEX_INVALID;
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    (BH *)frozen_bhp, hq);
				frozen_bhp++;
			}
			MPOOL_REGION_UNLOCK(env, infop);

			alloc_freeze = 0;
			MUTEX_READLOCK(env, hp->mtx_hash);
			h_locked = 1;
			goto retry_search;
		}

		/*
		 * If the buffer is the size we're looking for, we can simply
		 * reuse it.  Otherwise, free it and keep looking.
		 */
		if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
			/* __memp_bhfree(..., 0) unlocks both hp & bhp. */
			h_locked = 0;
			b_lock = 0;
			if ((ret = __memp_bhfree(dbmp,
			     infop, bh_mfp, hp, bhp, 0)) != 0)
				goto err;
			p = bhp;
			goto found;
		}

		freed_space += sizeof(*bhp) + bh_mfp->pagesize;
		/* __memp_bhfree(..., BH_FREE_FREEMEM) also unlocks hp & bhp. */
		h_locked = 0;
		b_lock = 0;
		if ((ret = __memp_bhfree(dbmp,
		    infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
			goto err;

		/* Reset "aggressive" and "write_error" if we free any space. */
		if (aggressive > 1)
			aggressive = 1;
		write_error = 0;

		/*
		 * Unlock this buffer and re-acquire the region lock.  If
		 * we're reaching here as a result of calling __memp_bhfree,
		 * the buffer lock has already been discarded.
		 */
		if (0) {
next_hb:		if (bhp != NULL) {
				DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
				(void)atomic_dec(env, &bhp->ref);
				if (b_lock) {
					F_CLR(bhp, BH_EXCLUSIVE);
					MUTEX_UNLOCK(env, bhp->mtx_buf);
					b_lock = 0;
				}
			}
			if (h_locked)
				MUTEX_UNLOCK(env, hp->mtx_hash);
			h_locked = 0;
		}
		obsolete = 0;
		MPOOL_REGION_LOCK(env, infop);

		/*
		 * Retry the allocation as soon as we've freed up sufficient
		 * space.  We're likely to have to coalesce memory to satisfy
		 * the request, so don't try until it's likely (possible?)
		 * that we'll succeed.
		 */
		if (freed_space >= 3 * len)
			goto alloc;
	}
err:
	if (h_locked) {
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;
	}
done:
	if (snapshots != NULL)
		__os_free(env, snapshots);
	return (ret);
}
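
/*
 * Typical internal usage (a sketch only; argument values vary by caller):
 * a page-read path allocates a buffer header sized for mfp's page size and
 * receives a pointer within this cache region:
 *
 *	BH *alloc_bhp;
 *
 *	if ((ret = __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
 *		return (ret);
 *
 * When mfp is non-NULL the "len" argument is recomputed from mfp->pagesize,
 * and offsetp may be NULL when the caller does not need the region offset.
 */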

/*
 * __memp_free --
 *	Free some space from a cache region.
 *
 * PUBLIC: void __memp_free __P((REGINFO *, void *));
 */
void
__memp_free(infop, buf)
	REGINFO *infop;
	void *buf;
{
	__env_alloc_free(infop, buf);
}