/*-
 * Copyright (c) 1996, 2020 Oracle and/or its affiliates. All rights reserved.
 *
 * See the file LICENSE for license information.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

/*
 * This configuration parameter limits the number of hash buckets which
 * __memp_alloc() searches through while excluding buffers with a 'high'
 * priority.
 */
#if !defined(MPOOL_ALLOC_SEARCH_LIMIT)
#define	MPOOL_ALLOC_SEARCH_LIMIT	500
#endif
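
/*
 * Illustrative note: since the definition above is guarded by !defined(),
 * the limit can be overridden at build time, e.g. by passing
 * -DMPOOL_ALLOC_SEARCH_LIMIT=1000 to the compiler.
 */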

/*
 * __memp_bh_unreachable --
 *
 * Determine whether this buffer can never be seen again: is the next
 * newer version visible to the same transaction which sees this one?
 * If both versions are visible to the same transaction, there is no
 * reason to keep the older one: it can be purged.
 *
 * If this buffer has a more recent version, and there is a transaction
 * with a read_lsn between this buffer's and that more recent version's,
 * the buffer is visible to at least that transaction, so return FALSE.
 * Otherwise return TRUE.
 *
 *   txns:      3/10            2/10            2/5 2/1        1/10
 *   vers:  3/15    2/15    2/14    2/10    2/8         1/150
 *          vis     vis     unreach vis     unreach     vis
 *   who    new
 *   sees   txns    3/10            2/10                2/5, 2/1
 *
 * Note: in the above example, the page was allocated after txn 1/10
 * started. 1/10 would not see any version of the page.
 *
 * PUBLIC: int __memp_bh_unreachable __P((ENV *, BH *, DB_LSN *, int));
 */
int
__memp_bh_unreachable(env, bhp, snapshots, n_snapshots)
	ENV *env;
	BH *bhp;
	DB_LSN *snapshots;
	int n_snapshots;
{
	BH *newer_bhp;
	DB_LSN b_vlsn, n_vlsn;
	int i, ret;
#ifdef DIAGNOSTIC
	DB_MPOOL *dbmp;
	DB_MSGBUF mb;
	MPOOLFILE *bh_mfp;
#endif

	/*
	 * The buffer can't be purged if it is being used, or is the most
	 * recent version, or the next newer version isn't a copy yet.
	 */
	if (BH_REFCOUNT(bhp) != 0 ||
	    (newer_bhp = SH_CHAIN_NEXT(bhp, vc, __bh)) == NULL ||
	    newer_bhp->td_off == INVALID_ROFF)
		return (FALSE);

	/*
	 * Find the visibility LSNs for this buffer (b_vlsn) and the more
	 * recent, newer buffer (n_vlsn). If the newer version hasn't
	 * committed yet, the bhp could be needed.
	 */
	n_vlsn = *VISIBLE_LSN(env, newer_bhp);
	if (IS_MAX_LSN(n_vlsn))
		return (FALSE);
	if (bhp->td_off == INVALID_ROFF)
		INIT_LSN(b_vlsn);
	else
		b_vlsn = *VISIBLE_LSN(env, bhp);

	ret = TRUE;
	/*
	 * Look for a transaction which is between n_vlsn and b_vlsn - that
	 * would mean bhp is still reachable. Stop looking once the
	 * transactions get so small (old) that they precede the buffer's
	 * version; no earlier txn could be between n_vlsn and b_vlsn.
	 */
	for (i = 0;
	    i < n_snapshots && LOG_COMPARE(&snapshots[i], &b_vlsn) >= 0;
	    i++) {
		if (LOG_COMPARE(&snapshots[i], &n_vlsn) < 0) {
			/*
			 * This txn can see (started after) bhp, but not
			 * newer_bhp (which committed after this txn started).
			 */
			ret = FALSE;
			break;
		}
	}

#ifdef DIAGNOSTIC
	if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC)) {
		dbmp = env->mp_handle;
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
		DB_MSGBUF_INIT(&mb);
		__db_msgadd(env, &mb,
	    "bh_unreachable %s pgno %d %s %lu/%lu %x newer %lu/%lu txn #%d in\n",
		    __memp_fns(dbmp, bh_mfp), bhp->pgno,
		    ret ? "purgeable" : "needed",
		    (u_long)b_vlsn.file, (u_long)b_vlsn.offset, bhp->flags,
		    (u_long)n_vlsn.file, (u_long)n_vlsn.offset, i);
		for (i = 0; i != n_snapshots; i++)
			__db_msgadd(env, &mb, " %lu/%lu",
			    (u_long)snapshots[i].file,
			    (u_long)snapshots[i].offset);
		DB_MSGBUF_FLUSH(env, &mb);
	}
#endif
	return (ret);
}
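
/*
 * An illustrative calling pattern (a sketch of what __memp_alloc() below
 * does, with error handling omitted): the snapshot list comes from
 * __txn_get_readers() and must be freed by the caller.
 *
 *	if ((ret = __txn_get_readers(env, &snapshots, &n_snapshots)) == 0) {
 *		if (__memp_bh_unreachable(env, bhp, snapshots, n_snapshots))
 *			... this version can never be read again ...
 *		__os_free(env, snapshots);
 *	}
 */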

/*
 * __memp_alloc --
 *	Allocate some space from a cache region. If the region is full then
 *	reuse one or more cache buffers.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
	DB_MPOOL *dbmp;
	REGINFO *infop;
	MPOOLFILE *mfp;
	size_t len;
	roff_t *offsetp;
	void *retp;
{
	BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp;
	BH_FROZEN_PAGE *frozen_bhp;
	DB_LSN *snapshots, vlsn;
	DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp;
	ENV *env;
	MPOOL *c_mp;
	MPOOLFILE *bh_mfp;
	size_t freed_space;
	u_int32_t buckets, bucket_priority, buffers, cache_reduction;
	u_int32_t dirty_eviction, high_priority, priority, versions;
	u_int32_t priority_saved, put_counter, lru_generation, total_buckets;
	int aggressive, alloc_freeze, b_lock, giveup;
	int h_locked, need_free, n_snapshots, obsolete, ret, write_error;
	u_int8_t *endp;
	void *p;

	env = dbmp->env;
	c_mp = infop->primary;
	dbht = R_ADDR(infop, c_mp->htab);
	hp_end = &dbht[c_mp->htab_buckets];
	hp_saved = NULL;
	snapshots = NULL;
	priority_saved = write_error = 0;
	buckets = buffers = put_counter = total_buckets = versions = 0;
	aggressive = alloc_freeze = giveup = h_locked = n_snapshots = 0;

	/*
	 * If we're allocating a buffer, and the one we're discarding is the
	 * same size, we don't want to waste the time to re-integrate it into
	 * the shared memory free list. If the DB_MPOOLFILE argument isn't
	 * NULL, we'll compare the underlying page sizes of the two buffers
	 * before freeing and re-allocating buffers.
	 */
	if (mfp != NULL) {
		len = SSZA(BH, buf) + mfp->pagesize;
		/* Add space for alignment padding for MVCC diagnostics. */
		MVCC_BHSIZE(mfp, len);
	}

	STAT_INC(env, mpool, nallocs, c_mp->stat.st_alloc, len);

	MPOOL_REGION_LOCK(env, infop);

	/*
	 * First we try to allocate from free memory. If that fails, scan the
	 * buffer pool to find buffers with low priorities. We consider small
	 * sets of hash buckets each time to limit the amount of work needing
	 * to be done. This approximates LRU, but not very well. We either
	 * find a buffer of the same size to use, or we will free 3 times what
	 * we need in the hopes it will coalesce into a contiguous chunk of the
	 * right size. In the latter case we branch back here and try again.
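	 *
	 * For example, with a 4KB page size the loop below keeps freeing
	 * buffers until roughly 12KB has been released, in the hope that some
	 * of it coalesces into a usable chunk, before retrying the allocation.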
	 */
alloc:	if ((ret = __env_alloc(infop, len, &p)) == 0) {
		if (mfp != NULL) {
			/*
			 * For MVCC diagnostics, align the pointer so that the
			 * buffer starts on a page boundary.
			 */
			MVCC_BHALIGN(p);
			bhp = (BH *)p;

			if ((ret = __mutex_alloc(env, MTX_MPOOL_BH,
			    DB_MUTEX_SHARED, &bhp->mtx_buf)) != 0) {
				MVCC_BHUNALIGN(bhp);
				__env_alloc_free(infop, bhp);
				goto search;
			}
			atomic_init(&bhp->ref, 0);
			c_mp->pages++;
		}
		MPOOL_REGION_UNLOCK(env, infop);
found:		if (offsetp != NULL)
			*offsetp = R_OFFSET(infop, p);
		*(void **)retp = p;

		/*
		 * Update the search statistics.
		 *
		 * We're not holding the region locked here, so these
		 * statistics can't be trusted.
		 */
#ifdef HAVE_STATISTICS
		total_buckets += buckets;
		if (total_buckets != 0) {
			if (total_buckets > c_mp->stat.st_alloc_max_buckets)
				STAT_SET(env, mpool, alloc_max_buckets,
				    c_mp->stat.st_alloc_max_buckets,
				    total_buckets, infop->id);
			STAT_ADJUST(env, mpool, alloc_buckets,
			    c_mp->stat.st_alloc_buckets,
			    total_buckets, infop->id);
		}
		if (buffers != 0) {
			if (buffers > c_mp->stat.st_alloc_max_pages)
				STAT_SET(env, mpool, alloc_max_pages,
				    c_mp->stat.st_alloc_max_pages,
				    buffers, infop->id);
			STAT_ADJUST(env, mpool, alloc_pages,
			    c_mp->stat.st_alloc_pages, buffers, infop->id);
		}
#endif
		goto done;
	} else if (giveup || c_mp->pages == 0) {
		MPOOL_REGION_UNLOCK(env, infop);

		__db_errx(env, DB_STR("3017",
		    "unable to allocate space from the buffer cache"));
		if (ret == ENOMEM && write_error != 0)
			ret = EIO;
		goto done;
	}

search:
	/*
	 * Anything newer than 1/10th of the buffer pool is ignored during the
	 * first MPOOL_ALLOC_SEARCH_LIMIT buckets' worth of allocation.
	 */
	cache_reduction = c_mp->pages / 10;
	high_priority = aggressive ? MPOOL_LRU_MAX :
	    c_mp->lru_priority - cache_reduction;
	lru_generation = c_mp->lru_generation;

	ret = 0;

	/*
	 * We re-attempt the allocation every time we've freed 3 times what
	 * we need. Reset our free-space counter.
	 */
	freed_space = 0;
	total_buckets += buckets;
	buckets = 0;

	/*
	 * Walk the hash buckets and find the next two with potentially useful
	 * buffers. Free the buffer with the lowest priority from the buckets'
	 * chains.
	 */
	for (;;) {
		/* All pages have been freed, make one last try. */
		if (c_mp->pages == 0)
			goto alloc;

		/* Check for wrap around. */
		hp = &dbht[c_mp->last_checked++];
		if (hp >= hp_end) {
			c_mp->last_checked = 0;
			hp = &dbht[c_mp->last_checked++];
		}

		/*
		 * The failure mode is when there are too many buffers we can't
		 * write or there's not enough memory in the system to support
		 * the number of pinned buffers.
		 *
		 * Get aggressive if we've reviewed the entire cache without
		 * freeing the needed space. (The code resets "aggressive"
		 * when we free any space.) Aggressive means:
		 *
		 * a: set a flag to attempt to flush high priority buffers as
		 *    well as other buffers.
		 * b: look at a buffer in every hash bucket rather than choose
		 *    the more preferable of two.
		 * c: start to think about giving up.
		 *
		 * If we get here three or more times, sync the mpool to force
		 * out queue extent pages. While we might not have enough
		 * space for what we want and flushing is expensive, why not?
		 * Then sleep for a second, hopefully someone else will run and
		 * free up some memory.
		 *
		 * Always try to allocate memory too, in case some other thread
		 * returns its memory to the region.
		 *
		 * We have no way to know that an allocation can never succeed.
		 * Fail if no pages are returned to the cache after we've been
		 * trying for a relatively long time.
		 *
		 * !!!
		 * This test ignores pathological cases like no buffers in the
		 * system -- we check for that early on, so it isn't possible.
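		 *
		 * Concretely, the escalation in the switch below is: pass 1
		 * simply retries, pass 2 records the put counter, passes 3
		 * through 6 sync the cache and sleep for a second, and later
		 * passes give up if the unchanged put counter shows that no
		 * buffers were returned to the cache in the meantime.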
		 */
		if (buckets++ == c_mp->htab_buckets) {
			if (freed_space > 0)
				goto alloc;
			MPOOL_REGION_UNLOCK(env, infop);

			/* Refresh the list of mvcc reader transactions. */
			if (snapshots != NULL)
				__os_free(env, snapshots);
			if ((ret = __txn_get_readers(
			    env, &snapshots, &n_snapshots)) != 0)
				goto err;

			aggressive++;
			/*
			 * Once aggressive, we consider all buffers. By setting
			 * this to MPOOL_LRU_MAX, we'll still select a victim
			 * even if all buffers have the highest normal priority.
			 */
			high_priority = MPOOL_LRU_MAX;
			PERFMON4(env, mpool, alloc_wrap,
			    len, infop->id, aggressive, c_mp->put_counter);
			switch (aggressive) {
			case 1:
				break;
			case 2:
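				/*
				 * Remember the put counter; if it is still
				 * unchanged when the default case runs, no
				 * buffers were returned to the cache.
				 */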
				put_counter = c_mp->put_counter;
				break;
			case 3:
			case 4:
			case 5:
			case 6:
				(void)__memp_sync_int(
				    env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);

				__os_yield(env, 1, 0);
				break;
			default:
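				/*
				 * Several aggressive passes have already run;
				 * give up if they freed nothing, as shown by
				 * an unchanged put counter.
				 */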
				aggressive = 1;
				if (put_counter == c_mp->put_counter)
					giveup = 1;
				break;
			}

			MPOOL_REGION_LOCK(env, infop);
			goto alloc;
		}

		/*
		 * Skip empty buckets.
		 *
		 * We can check for empty buckets before locking the hash
		 * bucket as we only care if the pointer is zero or non-zero.
		 */
		if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
			continue;

		/* Unlock the region and lock the hash bucket. */
		MPOOL_REGION_UNLOCK(env, infop);
		MUTEX_READLOCK(env, hp->mtx_hash);
		h_locked = 1;
		b_lock = 0;

		/*
		 * Set aggressive to consider all buffers if we have already
		 * searched in too many buckets.
		 */
		if (buckets > MPOOL_ALLOC_SEARCH_LIMIT && aggressive == 0) {
			aggressive = 1;
			/* Once aggressive, we consider all buffers. */
			high_priority = MPOOL_LRU_MAX;
			if (snapshots == NULL && (ret = __txn_get_readers(
			    env, &snapshots, &n_snapshots)) != 0)
				goto err;
		}

		/*
		 * Find a buffer we can use.
		 * Skip over refcount > 0 buffers; we can't get rid of them.
		 *
		 * Without MVCC we use the lowest-LRU singleton buffer we find
		 * that's better than the result of another hash bucket we've
		 * reviewed. We do not use a buffer which has a priority
		 * greater than high_priority unless we are being aggressive.
		 *
		 * MVCC requires looking at additional factors: we don't want to
		 * free a still-relevant buffer out of the middle of an MVCC
		 * chain, since that requires freezing - lots of I/O. So,
		 * walk the buffers, looking for an obsolete buffer at the
		 * end of the MVCC chain. Once a buffer becomes obsolete, its
		 * LRU priority is irrelevant because that version can never
		 * be accessed again.
		 *
		 * If we don't find any obsolete MVCC buffers, we will get
		 * aggressive, and in that case consider the lowest priority
		 * buffer within a chain.
		 */
retry_search:	bhp = NULL;
		bucket_priority = high_priority;
		obsolete = 0;
		if (n_snapshots > 0 && LOG_COMPARE(&snapshots[n_snapshots - 1],
		    &hp->old_reader) > 0)
			hp->old_reader = snapshots[n_snapshots - 1];
		SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) {
			/*
			 * First, do the standard LRU check for singletons.
			 * We can use the buffer if it is unreferenced, has a
			 * priority that isn't too high (unless we are
			 * aggressive), and is better than the best candidate
			 * we have found so far in this bucket.
			 */
#ifdef MPOOL_ALLOC_SEARCH_DYN
			if (aggressive == 0 &&
			    ++high_priority >= c_mp->lru_priority)
				aggressive = 1;
#endif

			if (SH_CHAIN_SINGLETON(current_bhp, vc)) {
				if (BH_REFCOUNT(current_bhp) != 0)
					continue;
				buffers++;
				if (bucket_priority > current_bhp->priority) {
					bucket_priority = current_bhp->priority;
					if (bhp != NULL)
						(void)atomic_dec(env,
						    &bhp->ref);
					bhp = current_bhp;
					(void)atomic_inc(env, &bhp->ref);
				}
				continue;
			}

			/*
			 * For MVCC buffers, walk through the chain. If we are
			 * aggressive, choose the best candidate from within
			 * the chain for freezing.
			 */
			for (mvcc_bhp = oldest_bhp = current_bhp;
			    mvcc_bhp != NULL;
			    oldest_bhp = mvcc_bhp,
			    mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) {
				DB_ASSERT(env, mvcc_bhp !=
				    SH_CHAIN_PREV(mvcc_bhp, vc, __bh));
#ifdef MPOOL_ALLOC_SEARCH_DYN
				if (aggressive == 0 &&
				    ++high_priority >= c_mp->lru_priority) {
					aggressive = 1;
					if (snapshots == NULL && (ret =
					    __txn_get_readers(env,
					    &snapshots, &n_snapshots)) != 0)
						goto err;
				}
#endif
				if (n_snapshots > 0 &&
				    __memp_bh_unreachable(env,
				    mvcc_bhp, snapshots, n_snapshots)) {
					oldest_bhp = mvcc_bhp;
					goto is_obsolete;
				}
				if (bhp != NULL &&
				    mvcc_bhp->priority >= bhp->priority)
					continue;
				if (BH_REFCOUNT(mvcc_bhp) != 0)
					continue;
				/*
				 * Since taking still-relevant versions
				 * requires freezing, skip over them at low
				 * aggression levels unless we see that a high
				 * proportion of buffers (over 1/4) are MVCC
				 * copies.
				 */
				if (aggressive < 2 &&
				    ++versions < (buffers >> 2))
					continue;
				buffers++;
				if (F_ISSET(mvcc_bhp, BH_FROZEN))
					continue;
				/*
				 * Select mvcc_bhp as the current best
				 * candidate, releasing the previous
				 * candidate, if any.
				 */
				if (bhp != NULL)
					(void)atomic_dec(env, &bhp->ref);
				bhp = mvcc_bhp;
				(void)atomic_inc(env, &bhp->ref);
			}

			/*
			 * oldest_bhp is the last buffer on the MVCC chain, and
			 * an obsolete buffer at the end of the MVCC chain gets
			 * used without further search.
			 */
			if (BH_REFCOUNT(oldest_bhp) != 0)
				continue;

			if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) {
				if (aggressive < 2)
					buffers++;
is_obsolete:
				obsolete = 1;
				if (bhp != NULL)
					(void)atomic_dec(env, &bhp->ref);
				bhp = oldest_bhp;
				(void)atomic_inc(env, &bhp->ref);
				goto this_buffer;
			}
		}

		/*
		 * bhp is either NULL or the best candidate buffer.
		 * We'll use the chosen buffer only if we have compared its
		 * priority against one chosen from another hash bucket.
		 */
		if (bhp == NULL)
			goto next_hb;

		priority = bhp->priority;

		/*
		 * Compare two hash buckets and select the one with the lower
		 * priority, except when MVCC is active at high aggression
		 * levels. Performance testing shows looking at two improves
		 * the LRU-ness and looking at more only does a little better.
		 */
		if (hp_saved == NULL) {
			/*
			 * At high aggression levels when MVCC is active, stop
			 * looking for a candidate once one has been found.
			 * Freezing takes more time than writing out to a db.
			 */
			if (aggressive > 1 && n_snapshots > 1)
				goto this_buffer;
			hp_saved = hp;
			priority_saved = priority;
			goto next_hb;
		}

		/*
		 * If the buffer we just found is a better choice than our
		 * previous choice, use it.
		 *
		 * If the previous choice was better, pretend we're moving
		 * from this hash bucket to the previous one and re-do the
		 * search.
		 *
		 * We don't worry about simply swapping between two buckets
		 * because that could only happen if a buffer was removed
		 * from the chain, or its priority updated. If a buffer
		 * is removed from the chain, some other thread has managed
		 * to discard a buffer, so we're moving forward. Updating
		 * a buffer's priority will make it a high-priority buffer,
		 * so we'll ignore it when we search again, and so we will
		 * eventually zero in on a buffer to use, or we'll decide
		 * there are no buffers we can use.
		 *
		 * If there's only a single hash bucket with buffers, we'll
		 * search the bucket once, choose a buffer, walk the entire
		 * list of buckets and search it again. In the case of a
		 * system that's busy, it's possible to imagine a case where
		 * we'd loop for a long while. For that reason, and because
		 * the test is easy, we special case and test for it.
		 */
		if (priority > priority_saved && hp != hp_saved) {
			MUTEX_UNLOCK(env, hp->mtx_hash);
			hp_tmp = hp_saved;
			hp_saved = hp;
			hp = hp_tmp;
			priority_saved = priority;
			MUTEX_READLOCK(env, hp->mtx_hash);
			h_locked = 1;
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			(void)atomic_dec(env, &bhp->ref);
			goto retry_search;
		}

		/*
		 * If another thread has called __memp_reset_lru() while we
		 * were looking for this buffer, it is possible that we've
		 * picked a poor choice for a victim. If so, toss it and
		 * start over.
		 */
		if (lru_generation != c_mp->lru_generation) {
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			(void)atomic_dec(env, &bhp->ref);
			MUTEX_UNLOCK(env, hp->mtx_hash);
			MPOOL_REGION_LOCK(env, infop);
			hp_saved = NULL;
			goto search;
		}

this_buffer:	/*
		 * Discard any previously remembered hash bucket; we've got
		 * a winner.
		 */
		hp_saved = NULL;

		/* Drop the hash mutex and lock the buffer exclusively. */
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;

		/* Don't bother trying to latch a busy buffer. */
		if (BH_REFCOUNT(bhp) > 1)
			goto next_hb;

		/* We cannot block as the caller is probably holding locks. */
		if ((ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) {
			if (ret != DB_LOCK_NOTGRANTED)
				goto err;
			ret = 0;
			goto next_hb;
		}
		F_SET(bhp, BH_EXCLUSIVE);
		if (obsolete)
			F_SET(bhp, BH_UNREACHABLE);
		b_lock = 1;

		/* Someone may have grabbed it while we got the lock. */
		if (BH_REFCOUNT(bhp) != 1)
			goto next_hb;

		/* Find the associated MPOOLFILE. */
		bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

		/* If the page is dirty, write it. */
		ret = 0;
		dirty_eviction = 0;
		if (F_ISSET(bhp, BH_DIRTY)) {
			DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0);
			ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
			DB_ASSERT(env, atomic_read(&bhp->ref) > 0);

			/*
			 * If a write fails for any reason, we can't proceed.
			 *
			 * If there's a write error and we're having problems
			 * finding something to allocate, avoid selecting this
			 * buffer again by maximizing its priority.
			 */
			if (ret != 0) {
				if (ret != EPERM && ret != EAGAIN) {
					write_error++;
					__db_errx(env, DB_STR_A("3018",
	    "%s: unwritable page %d remaining in the cache after error %d",
					    "%s %d %d"),
					    __memp_fns(dbmp, bh_mfp),
					    bhp->pgno, ret);
				}
				bhp->priority = MPOOL_LRU_REDZONE;

				goto next_hb;
			}

			dirty_eviction = 1;
		}

		/*
		 * Freeze this buffer, if necessary. That is, if the buffer is
		 * part of an MVCC chain and could be required by a reader.
		 */
		if (SH_CHAIN_HASPREV(bhp, vc) ||
		    (SH_CHAIN_HASNEXT(bhp, vc) && !obsolete)) {
			if (!aggressive ||
			    F_ISSET(bhp, BH_DIRTY | BH_FROZEN))
				goto next_hb;
			ret = __memp_bh_freeze(
			    dbmp, infop, hp, bhp, &alloc_freeze);
			if (ret == EIO)
				write_error++;
			if (ret == EBUSY || ret == EIO ||
			    ret == ENOMEM || ret == ENOSPC) {
				ret = 0;
				goto next_hb;
			} else if (ret != 0) {
				DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
				(void)atomic_dec(env, &bhp->ref);
				DB_ASSERT(env, b_lock);
				F_CLR(bhp, BH_EXCLUSIVE);
				MUTEX_UNLOCK(env, bhp->mtx_buf);
				DB_ASSERT(env, !h_locked);
				goto err;
			}
		}

		MUTEX_LOCK(env, hp->mtx_hash);
		h_locked = 1;

		/*
		 * We released the hash bucket lock while doing I/O, so another
		 * thread may have acquired this buffer and incremented the ref
		 * count or dirtied the buffer or installed a new version after
		 * we wrote it, in which case we can't have it.
		 */
		if (BH_REFCOUNT(bhp) != 1 || F_ISSET(bhp, BH_DIRTY) ||
		    (SH_CHAIN_HASNEXT(bhp, vc) &&
		    SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off != bhp->td_off &&
		    !(obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn)))) {
			if (FLD_ISSET(env->dbenv->verbose, DB_VERB_MVCC))
				__db_msg(env,
	    "memp_alloc next_hb past bhp %lx flags %x ref %d %lx/%lx",
				    (u_long)R_OFFSET(infop, bhp), bhp->flags,
				    BH_REFCOUNT(bhp),
				    (u_long)R_OFFSET(infop, SH_CHAIN_NEXTP(bhp, vc, __bh)),
				    (u_long)R_OFFSET(infop, SH_CHAIN_PREVP(bhp, vc, __bh)));
			goto next_hb;
		}

		/*
		 * If the buffer is frozen, thaw it and look for another one
		 * we can use. (Calling __memp_bh_freeze above will not mark
		 * this bhp BH_FROZEN; it creates another frozen one.)
		 */
		if (F_ISSET(bhp, BH_FROZEN)) {
			DB_ASSERT(env, SH_CHAIN_SINGLETON(bhp, vc) ||
			    obsolete || BH_OBSOLETE(bhp, hp->old_reader, vlsn));
			DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
			if (!F_ISSET(bhp, BH_THAWED)) {
				/*
				 * This call releases the hash bucket mutex.
				 * We're going to retry the search, so we need
				 * to re-lock it.
				 */
				if ((ret = __memp_bh_thaw(dbmp,
				    infop, hp, bhp, NULL)) != 0)
					goto done;
				MUTEX_READLOCK(env, hp->mtx_hash);
			} else {
				need_free = atomic_dec(env, &bhp->ref) == 0;
				F_CLR(bhp, BH_EXCLUSIVE);
				MUTEX_UNLOCK(env, bhp->mtx_buf);
				if (need_free) {
					MPOOL_REGION_LOCK(env, infop);
					SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
					    bhp, hq);
					MPOOL_REGION_UNLOCK(env, infop);
				}
			}
			bhp = NULL;
			b_lock = alloc_freeze = 0;
			goto retry_search;
		}

		/* We are certainly freeing this buf; now update the statistics. */
		if (dirty_eviction)
			STAT_INC(env, mpool,
			    dirty_eviction, c_mp->stat.st_rw_evict, infop->id);
		else
			STAT_INC(env, mpool,
			    clean_eviction, c_mp->stat.st_ro_evict, infop->id);
		/*
		 * If we need some empty buffer headers for freezing, turn the
		 * buffer we've found into frozen headers and put them on the
		 * free list. Only reset alloc_freeze if we've actually
		 * allocated some frozen buffer headers.
		 */
		if (alloc_freeze) {
			/* __memp_bhfree(..., 0) unlocks both hp & bhp. */
			h_locked = 0;
			b_lock = 0;
			if ((ret = __memp_bhfree(dbmp,
			    infop, bh_mfp, hp, bhp, 0)) != 0)
				goto err;
			DB_ASSERT(env, bhp->mtx_buf != MUTEX_INVALID);
			if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0)
				goto err;

			MVCC_MPROTECT(bhp->buf, bh_mfp->pagesize,
			    PROT_READ | PROT_WRITE | PROT_EXEC);

			MPOOL_REGION_LOCK(env, infop);
			SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
			    (BH_FROZEN_ALLOC *)bhp, links);
			frozen_bhp = (BH_FROZEN_PAGE *)
			    ((BH_FROZEN_ALLOC *)bhp + 1);
			endp = (u_int8_t *)bhp->buf + bh_mfp->pagesize;
			while ((u_int8_t *)(frozen_bhp + 1) < endp) {
				frozen_bhp->header.mtx_buf = MUTEX_INVALID;
				SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
				    (BH *)frozen_bhp, hq);
				frozen_bhp++;
			}
			MPOOL_REGION_UNLOCK(env, infop);

			alloc_freeze = 0;
			MUTEX_READLOCK(env, hp->mtx_hash);
			h_locked = 1;
			goto retry_search;
		}

		/*
		 * If the buffer is the size we're looking for, we can simply
		 * reuse it. Otherwise, free it and keep looking.
		 */
		if (mfp != NULL && mfp->pagesize == bh_mfp->pagesize) {
			/* __memp_bhfree(..., 0) unlocks both hp & bhp. */
			h_locked = 0;
			b_lock = 0;
			if ((ret = __memp_bhfree(dbmp,
			    infop, bh_mfp, hp, bhp, 0)) != 0)
				goto err;
			p = bhp;
			goto found;
		}

		freed_space += sizeof(*bhp) + bh_mfp->pagesize;
		/* __memp_bhfree(..., BH_FREE_FREEMEM) also unlocks hp & bhp. */
		h_locked = 0;
		b_lock = 0;
		if ((ret = __memp_bhfree(dbmp,
		    infop, bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0)
			goto err;

		/* Reset "aggressive" and "write_error" if we free any space. */
		if (aggressive > 1)
			aggressive = 1;
		write_error = 0;

		/*
		 * Unlock this buffer and re-acquire the region lock. If
		 * we're reaching here as a result of calling memp_bhfree, the
		 * buffer lock has already been discarded.
		 */
		if (0) {
next_hb:		if (bhp != NULL) {
				DB_ASSERT(env, BH_REFCOUNT(bhp) > 0);
				(void)atomic_dec(env, &bhp->ref);
				if (b_lock) {
					F_CLR(bhp, BH_EXCLUSIVE);
					MUTEX_UNLOCK(env, bhp->mtx_buf);
					b_lock = 0;
				}
			}
			if (h_locked)
				MUTEX_UNLOCK(env, hp->mtx_hash);
			h_locked = 0;
		}
		obsolete = 0;
		MPOOL_REGION_LOCK(env, infop);

		/*
		 * Retry the allocation as soon as we've freed up sufficient
		 * space. We're likely to have to coalesce chunks of freed
		 * memory to satisfy the request, so don't try until it's
		 * likely (possible?) that we'll succeed.
		 */
		if (freed_space >= 3 * len)
			goto alloc;
	}
err:
	if (h_locked) {
		MUTEX_UNLOCK(env, hp->mtx_hash);
		h_locked = 0;
	}
done:
	if (snapshots != NULL)
		__os_free(env, snapshots);
	return (ret);
}

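/*
 * An illustrative call, sketching how a buffer-get path might use this
 * function (alloc_bhp here is just a local BH * in the sketch): when an
 * MPOOLFILE is passed, len is recomputed from the page size above, so the
 * caller can pass 0 and receives a buffer header back through retp.
 *
 *	if ((ret = __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
 *		return (ret);
 */
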
/*
 * __memp_free --
 *	Free some space from a cache region.
 *
 * PUBLIC: void __memp_free __P((REGINFO *, void *));
 */
void
__memp_free(infop, buf)
	REGINFO *infop;
	void *buf;
{
	__env_alloc_free(infop, buf);
}
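
/*
 * Illustrative pairing (a sketch, not a call site in this file): memory
 * obtained from the region allocator is handed back through this wrapper
 * rather than calling __env_alloc_free() directly, e.g.
 *
 *	if ((ret = __env_alloc(infop, len, &p)) == 0) {
 *		...
 *		__memp_free(infop, p);
 *	}
 */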