1 /*-
2  * Copyright (c) 1996, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file LICENSE for license information.
5  * $Id$
6  */
7 
8 #include "db_config.h"
9 
10 #include "db_int.h"
11 #include "dbinc/db_page.h"
12 #include "dbinc/btree.h"
13 #include "dbinc/hash.h"
14 #include "dbinc/lock.h"
15 #include "dbinc/txn.h"
16 #include "dbinc/mp.h"
17 
18 static int __ham_copy_data __P((DBC *, PAGE *, DB_COMPACT *, int *));
19 static int __ham_truncate_overflow __P((DBC *, u_int32_t, DB_COMPACT *, int *));
20 
21 /*
22  * __ham_compact_int -- internal HASH compaction routine.
23  *
24  * PUBLIC: int __ham_compact_int __P((DBC *,
25  * PUBLIC:      DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
26  */
27 int
__ham_compact_int(dbc,start,stop,factor,c_data,donep,flags)28 __ham_compact_int(dbc, start, stop, factor, c_data, donep, flags)
29 	DBC *dbc;
30 	DBT *start, *stop;
31 	u_int32_t factor;
32 	DB_COMPACT *c_data;
33 	int *donep;
34 	u_int32_t flags;
35 {
36 	DB *dbp;
37 	DB_MPOOLFILE *mpf;
38 	HASH_CURSOR *hcp;
39 	db_pgno_t origpgno, pgno;
40 	int check_trunc, pgs_done, ret, t_ret;
41 	u_int32_t empty_buckets, i, stop_bucket;
42 
43 	dbp = dbc->dbp;
44 	mpf = dbp->mpf;
45 	hcp = (HASH_CURSOR *)dbc->internal;
46 	pgs_done = 0;
47 	empty_buckets = 0;
48 	check_trunc = c_data->compact_truncate != PGNO_INVALID;
49 	if ((ret = __ham_get_meta(dbc)) != 0)
50 		return (ret);
51 
52 	if (stop != NULL && stop->size != 0)
53 		stop_bucket = *(u_int32_t *)stop->data;
54 	else
55 		stop_bucket = hcp->hdr->max_bucket;
56 
57 	if (start != NULL && start->size != 0)
58 		hcp->bucket = *(u_int32_t *)start->data;
59 	else
60 		hcp->bucket = 0;
61 
62 	for (; hcp->bucket <= stop_bucket && ret == 0; hcp->bucket++) {
63 		/*
64 		 * For each bucket first move records toward the head of
65 		 * the bucket.
66 		 */
67 		hcp->indx = NDX_INVALID;
68 		F_CLR(hcp, H_ISDUP);
69 		hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
70 		pgno = PGNO_INVALID;
71 		ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
72 
73 		/*
74 		 * If the bucket is empty, just note it, otherwise process it.
75 		 * If there are any records there must be some in the head
76 		 * of the bucket.
77 		 */
78 		if (ret == DB_NOTFOUND ) {
79 			empty_buckets++;
80 			c_data->compact_pages_examine++;
81 			DB_ASSERT(dbp->env,
82 			    PREV_PGNO(hcp->page) == PGNO_INVALID &&
83 			    NEXT_PGNO(hcp->page) == PGNO_INVALID);
84 			goto err;
85 		} else if (ret != 0)
86 			break;
87 		c_data->compact_pages_examine++;
88 
89 		if (NEXT_PGNO(hcp->page) != PGNO_INVALID) {
90 			if ((ret =
91 			    __ham_compact_bucket(dbc, c_data, &pgs_done)) != 0)
92 				goto err;
93 			pgno = PGNO_INVALID;
94 			if ((ret = __ham_item(dbc, DB_LOCK_WRITE, &pgno)) != 0)
95 				goto err;
96 		}
97 
98 		/*
99 		 * Loop through the items in this page in the bucket and process
100 		 * overflow records and off page duplicate sets.
101 		 */
102 		while (ret == 0) {
103 			/* Handle off page duplicate trees. */
104 			if (pgno == PGNO_INVALID)
105 				goto no_opd;
106 			if (check_trunc &&
107 			    pgno > c_data->compact_truncate) {
108 				c_data->compact_pages_examine++;
109 				/*
110 				 * Truncate this page if possible.
111 				 * We must update the parent here
112 				 * because the page number is
113 				 * not aligned.
114 				 */
115 				if ((ret = __memp_dirty(mpf, &hcp->page,
116 				    dbc->thread_info,
117 				    dbc->txn, dbc->priority, 0)) != 0)
118 					break;
119 				origpgno = pgno;
120 				if ((ret = __db_truncate_root(dbc, hcp->page,
121 				    H_DATAINDEX(hcp->indx),
122 				    &pgno, 0, &pgs_done)) != 0)
123 					break;
124 				if (pgno != origpgno) {
125 					memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp,
126 					    hcp->page, hcp->indx)),
127 					    &pgno, sizeof(db_pgno_t));
128 					pgs_done++;
129 					c_data->compact_pages--;
130 				}
131 			}
132 			/*
133 			 * Compact the off page duplicate tree.
134 			 */
135 			if ((ret = __bam_compact_opd(dbc,
136 			    pgno, NULL, factor, c_data, &pgs_done)) != 0)
137 				break;
138 
139 no_opd:			if (check_trunc && HPAGE_PTYPE(H_PAIRDATA(
140 			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
141 				/* This is an overflow chain. */
142 				if ((ret = __ham_truncate_overflow(dbc,
143 				    H_DATAINDEX(hcp->indx),
144 				    c_data, &pgs_done)) != 0)
145 					break;
146 			}
147 
148 			/* Check for an overflow key. */
149 			if (check_trunc && HPAGE_PTYPE(H_PAIRKEY(
150 			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
151 				/* This is an overflow chain. */
152 				if ((ret = __ham_truncate_overflow(dbc,
153 				    H_KEYINDEX(hcp->indx),
154 				    c_data, &pgs_done)) != 0)
155 					break;
156 			}
157 
158 			pgno = PGNO_INVALID;
159 			ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
160 		}
161 
162 err:		if (hcp->page != NULL &&
163 		    (t_ret = __memp_fput(mpf, dbc->thread_info,
164 		    hcp->page, dbc->priority)) != 0 && ret == 0)
165 			ret = t_ret;
166 		if (ret == DB_NOTFOUND)
167 			ret = 0;
168 		hcp->page = NULL;
169 		hcp->pgno = pgno = PGNO_INVALID;
170 		/*
171 		 * If we are in an auto-transaction and we updated something
172 		 * return to the caller to commit this transaction to
173 		 * avoid holding locks. Otherwise process the next bucket.
174 		 * We can drop the lock if we did not do anything.
175 		 * We always must commit the txn if we are in MVCC
176 		 * as we have dirtied the hash buckets.
177 		 */
178 		if (ret == 0 &&
179 		    atomic_read(&dbp->mpf->mfp->multiversion) == 0 &&
180 		    (pgs_done == 0 || dbc->txn == NULL))
181 			ret = __LPUT(dbc, hcp->lock);
182 		else if (LF_ISSET(DB_AUTO_COMMIT)) {
183 			if (ret == 0)
184 				hcp->bucket++;
185 			break;
186 		}
187 	}
188 	/*
189 	 * If we saw any empty buckets and we are freeing space we
190 	 * want to contract the table before dropping the metadata
191 	 * page. Wait till we are done with everything else as we
192 	 * need to get an exclusive lock on the metadata page.
193 	 */
194 	if (ret == 0 && empty_buckets != 0 && LF_ISSET(DB_FREE_SPACE)) {
195 		for (i = 0; i < empty_buckets && hcp->hdr->max_bucket > 2; i++)
196 			if ((ret = __ham_contract_table(dbc, c_data)) != 0)
197 				break;
198 	}
199 
200 	if (ret == 0)
201 		ret = __db_retcopy(dbp->env, start, &hcp->bucket,
202 		    sizeof(hcp->bucket), &start->data, &start->ulen);
203 	(void)__ham_release_meta(dbc);
204 	c_data->compact_empty_buckets += empty_buckets;
205 	if (hcp->bucket > stop_bucket)
206 		*donep = 1;
207 	return (ret);
208 }
209 
210 /*
211  * __ham_compact_bucket -- move data to as few pages as possible.
212  *
213  * PUBLIC: int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
214  */
215 int
__ham_compact_bucket(dbc,c_data,pgs_donep)216 __ham_compact_bucket(dbc, c_data, pgs_donep)
217 	DBC *dbc;
218 	DB_COMPACT *c_data;
219 	int *pgs_donep;
220 {
221 	DB *dbp;
222 	DB_MPOOLFILE *mpf;
223 	HASH_CURSOR *hcp;
224 	PAGE *pg;
225 	db_pgno_t pgno;
226 	int check_trunc, ret, t_ret;
227 
228 	hcp = (HASH_CURSOR *)dbc->internal;
229 	dbp = dbc->dbp;
230 	mpf = dbp->mpf;
231 	pg = hcp->page;
232 	check_trunc = c_data->compact_truncate != PGNO_INVALID;
233 	ret = 0;
234 
235 	pgno = hcp->pgno;
236 	do {
237 		if (pg == NULL && (ret = __memp_fget(mpf, &pgno,
238 		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
239 			break;
240 		/* Sort any unsorted pages before adding to the page. */
241 		if (TYPE(pg) == P_HASH_UNSORTED) {
242 			if ((ret = __ham_sort_page_cursor(dbc, pg)) != 0)
243 				break;
244 			(*pgs_donep)++;
245 		}
246 
247 		/* If this is not the head try to move it to a lower page. */
248 		if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID  &&
249 		    PGNO(pg) > c_data->compact_truncate &&
250 		    (ret = __db_exchange_page(dbc, &pg,
251 		    hcp->page, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
252 			break;
253 		if (pgno != PGNO(pg))
254 			(*pgs_donep)++;
255 
256 		if (NEXT_PGNO(pg) == PGNO_INVALID)
257 			break;
258 		if ((ret = __ham_copy_data(dbc, pg, c_data, pgs_donep)) != 0)
259 			break;
260 		pgno = NEXT_PGNO(pg);
261 		if (pg != hcp->page && (ret = __memp_fput(mpf,
262 		     dbc->thread_info, pg, dbc->priority)) != 0)
263 			break;
264 		pg = NULL;
265 	} while (pgno != PGNO_INVALID);
266 
267 	if (pg != NULL && pg != hcp->page &&
268 	    (t_ret = __memp_fput(mpf, dbc->thread_info, pg, dbc->priority)) &&
269 	    ret == 0)
270 		ret = t_ret;
271 	return (ret);
272 }
273 
274 /*
275  * __ham_copy_data -- copy as many records as possible from next page
276  */
277 static int
__ham_copy_data(dbc,pg,c_data,pgs_donep)278 __ham_copy_data(dbc, pg, c_data, pgs_donep)
279 	DBC *dbc;
280 	PAGE *pg;
281 	DB_COMPACT *c_data;
282 	int *pgs_donep;
283 {
284 	DB *dbp;
285 	DBC *newdbc;
286 	DBT data, key;
287 	DB_MPOOLFILE *mpf;
288 	HASH_CURSOR *hcp, *ncp;
289 	PAGE *nextpage;
290 	db_pgno_t origpgno;
291 	int i, nument, records, ret, t_ret;
292 	u_int32_t len;
293 
294 	dbp = dbc->dbp;
295 	mpf = dbp->mpf;
296 	hcp = (HASH_CURSOR *)dbc->internal;
297 	records = 0;
298 
299 	if ((ret = __dbc_dup(dbc, &newdbc, 0)) != 0)
300 		return (ret);
301 	ncp = (HASH_CURSOR *)newdbc->internal;
302 	ncp->hdr = hcp->hdr;
303 
304 	/*
305 	 * Copy data to the front of the bucket. Loop until either we
306 	 * have not replaced the next page or there is no next page.
307 	 * If the next page was not removed then it still has data
308 	 * on it.
309 	 */
310 	origpgno = PGNO_INVALID;
311 	while (origpgno != NEXT_PGNO(pg) &&
312 	    (origpgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
313 
314 		if ((ret = __memp_fget(mpf, &NEXT_PGNO(pg), dbc->thread_info,
315 		    dbc->txn, DB_MPOOL_DIRTY, &nextpage)) != 0)
316 			break;
317 
318 		c_data->compact_pages_examine++;
319 		ncp->page = nextpage;
320 		ncp->pgno = PGNO(nextpage);
321 		ncp->indx = 0;
322 		memset(&key, 0, sizeof(key));
323 		memset(&data, 0, sizeof(data));
324 		nument = NUM_ENT(nextpage);
325 		DB_ASSERT(dbp->env, nument != 0);
326 		for (i = 0; i < nument; i += 2) {
327 			len = LEN_HITEM(dbp, nextpage, dbp->pgsize, 0) +
328 			    LEN_HITEM(dbp, nextpage, dbp->pgsize, 1) +
329 			    2 * sizeof(db_indx_t);
330 			if (P_FREESPACE(dbp, pg) < len)
331 				continue;
332 
333 			if ((ret =
334 			    __ham_copypair(dbc, nextpage, 0, pg, NULL, 1)) != 0)
335 				break;
336 
337 			records++;
338 			if ((ret = __ham_del_pair(newdbc,
339 			    HAM_DEL_IGNORE_OFFPAGE, pg)) != 0)
340 				break;
341 			if (!STD_LOCKING(dbc)) {
342 				if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
343 					return (ret);
344 				++hcp->hdr->nelem;
345 			}
346 		}
347 		/*
348 		 * If we moved all the records then __ham_del_pair will
349 		 * have deleted the nextpage.
350 		 */
351 		if (records >= nument/2) {
352 			c_data->compact_pages_examine++;
353 			c_data->compact_pages_free++;
354 			COMPACT_TRUNCATE(c_data);
355 		}
356 		if (ncp->page != NULL &&
357 		    (t_ret = __memp_fput(mpf, dbc->thread_info,
358 		    ncp->page, dbc->priority)) != 0 && ret == 0)
359 			ret = t_ret;
360 		ncp->page = NULL;
361 		ncp->pgno = PGNO_INVALID;
362 	}
363 
364 	/*
365 	 * If __ham_del_pair freed a page then we needed to dirty the metapage
366 	 * and it could change so we need to copy it back to hcp.
367 	 */
368 	hcp->hdr = ncp->hdr;
369 	ncp->hdr = NULL;
370 	if ((t_ret = __ham_release_meta(newdbc)) != 0 && ret == 0)
371 		ret = t_ret;
372 	if ((t_ret = __dbc_close(newdbc)) != 0 && ret == 0)
373 		ret = t_ret;
374 	if (records != 0)
375 		(*pgs_donep)++;
376 	return (ret);
377 }
378 
379 /*
380  * __ham_truncate_overflow -- try to truncate pages from an overflow chain.
381  */
382 static int
__ham_truncate_overflow(dbc,indx,c_data,pgs_done)383 __ham_truncate_overflow(dbc, indx, c_data, pgs_done)
384 	DBC *dbc;
385 	u_int32_t indx;
386 	DB_COMPACT *c_data;
387 	int *pgs_done;
388 {
389 	DB *dbp;
390 	HASH_CURSOR *hcp;
391 	db_pgno_t origpgno, pgno;
392 	int ret;
393 
394 	hcp = (HASH_CURSOR *)dbc->internal;
395 	dbp = dbc->dbp;
396 	memcpy(&pgno,
397 	    HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)), sizeof(db_pgno_t));
398 	if (pgno > c_data->compact_truncate) {
399 		c_data->compact_pages_examine++;
400 		origpgno = pgno;
401 		if ((ret = __memp_dirty(dbp->mpf, &hcp->page,
402 		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
403 			return (ret);
404 		if ((ret = __db_truncate_root(dbc,
405 		    hcp->page, indx, &pgno, 0, pgs_done)) != 0)
406 			return (ret);
407 		if (pgno != origpgno) {
408 			memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)),
409 			    &pgno, sizeof(db_pgno_t));
410 			(*pgs_done)++;
411 			c_data->compact_pages--;
412 		}
413 	}
414 	if ((ret =
415 	    __db_truncate_overflow(dbc, pgno, NULL, c_data, pgs_done)) != 0)
416 		return (ret);
417 	return (0);
418 }
419 
420 #ifdef HAVE_FTRUNCATE
421 /*
422  * __ham_compact_hash -- compact the hash table.
423  * PUBLIC: int __ham_compact_hash __P((DB *,
424  * PUBLIC:       DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
425  */
426 int
__ham_compact_hash(dbp,ip,txn,c_data)427 __ham_compact_hash(dbp, ip, txn, c_data)
428 	DB *dbp;
429 	DB_THREAD_INFO *ip;
430 	DB_TXN *txn;
431 	DB_COMPACT *c_data;
432 {
433 	DBC *dbc;
434 	DB_LOCK lock;
435 	HASH_CURSOR *hcp;
436 	HMETA *meta;
437 	PAGE *oldpage;
438 	db_pgno_t free_pgno, last_pgno, pgno, start_pgno;
439 	int flags, local_txn, pgs_done, ret, t_ret;
440 	u_int32_t bucket, i, size;
441 
442 	local_txn = IS_DB_AUTO_COMMIT(dbp, txn);
443 	pgs_done = 0;
444 	oldpage = NULL;
445 	dbc = NULL;
446 	LOCK_INIT(lock);
447 
448 	if (local_txn &&
449 	    (ret = __txn_begin(dbp->env, ip, txn, &txn, 0)) != 0)
450 		return (ret);
451 
452 	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
453 		goto err1;
454 	hcp = (HASH_CURSOR *)dbc->internal;
455 
456 	if ((ret = __ham_get_meta(dbc)) != 0 ||
457 	    (ret = __ham_dirty_meta(dbc, 0)) != 0)
458 		goto err1;
459 
460 	meta = hcp->hdr;
461 
462 	LOCK_CHECK_OFF(ip);
463 
464 	/*
465 	 * Find contiguous lower numbered pages for each hash table segment.
466 	 */
467 	for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++) {
468 		if (i == 0) {
469 			bucket = 0;
470 			size = 1;
471 		} else {
472 			bucket = 1 << (i - 1);
473 			size = bucket;
474 		}
475 		start_pgno = meta->spares[i] + bucket;
476 		if ((ret = __db_find_free(dbc, P_HASH,
477 		    size, start_pgno, &free_pgno)) != 0) {
478 			if (ret != DB_NOTFOUND)
479 				break;
480 			ret = 0;
481 			continue;
482 		}
483 		if (DBC_LOGGING(dbc)) {
484 			if ((ret = __ham_changeslot_log(dbp,
485 			    dbc->txn, &LSN(meta),
486 			    0, &LSN(meta), i, start_pgno, free_pgno)) != 0)
487 				break;
488 		} else
489 			LSN_NOT_LOGGED(LSN(meta));
490 		last_pgno = free_pgno + bucket;
491 		/*
492 		 * March through the list swapping pages.  If the page is
493 		 * empty we just need to free it.  If we are just sliding
494 		 * things down don't free the pages that will be reused.
495 		 * Note that __db_exchange_page returns the new page so
496 		 * we must put it.
497 		 */
498 		for (pgno = start_pgno;
499 		    pgno < start_pgno + size; pgno++, free_pgno++) {
500 			if ((ret = __db_lget(dbc,
501 			    LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
502 				goto err;
503 			if ((ret = __memp_fget(dbp->mpf, &pgno,
504 			     dbc->thread_info, dbc->txn,
505 			     DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &oldpage)) != 0)
506 				goto err;
507 			if (NUM_ENT(oldpage) != 0) {
508 				if (pgno < last_pgno)
509 					flags = 0;
510 				else
511 					flags = DB_EXCH_FREE;
512 				if ((ret = __db_exchange_page(dbc, &oldpage,
513 				    NULL, free_pgno, flags, &pgs_done)) != 0)
514 					goto err;
515 			} else if (pgno >= last_pgno) {
516 				if ((ret = __db_free(dbc, oldpage, 0)) != 0)
517 					goto err;
518 				COMPACT_TRUNCATE(c_data);
519 				oldpage = NULL;
520 			}
521 			if (oldpage != NULL && (ret = __memp_fput(dbp->mpf,
522 			    dbc->thread_info, oldpage, dbc->priority)) != 0)
523 				goto err;
524 			ret = 0;
525 			oldpage = NULL;
526 			c_data->compact_pages_examine++;
527 		}
528 		meta->spares[i] = free_pgno - (size + bucket);
529 	}
530 	if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) &&
531 	    PGNO(hcp->hdr) > c_data->compact_truncate)
532 		ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr,
533 		    c_data, &pgs_done);
534 
535 err:	if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf,
536 	    dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0)
537 		ret = t_ret;
538 	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
539 		ret = t_ret;
540 	LOCK_CHECK_ON(ip);
541 err1:	if (dbc != NULL) {
542 		if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
543 			ret = t_ret;
544 		if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
545 			ret = t_ret;
546 	}
547 	if (local_txn && (t_ret = (ret == 0 ?
548 	    __txn_commit(txn, 0) : __txn_abort(txn))) != 0 && ret == 0)
549 		ret = t_ret;
550 
551 	return (ret);
552 }
553 #endif
554