1 /*-
2 * Copyright (c) 1996, 2020 Oracle and/or its affiliates. All rights reserved.
3 *
4 * See the file LICENSE for license information.
5 * $Id$
6 */
7
8 #include "db_config.h"
9
10 #include "db_int.h"
11 #include "dbinc/db_page.h"
12 #include "dbinc/btree.h"
13 #include "dbinc/hash.h"
14 #include "dbinc/lock.h"
15 #include "dbinc/txn.h"
16 #include "dbinc/mp.h"
17
18 static int __ham_copy_data __P((DBC *, PAGE *, DB_COMPACT *, int *));
19 static int __ham_truncate_overflow __P((DBC *, u_int32_t, DB_COMPACT *, int *));
20
21 /*
22 * __ham_compact_int -- internal HASH compaction routine.
23 *
24 * PUBLIC: int __ham_compact_int __P((DBC *,
25 * PUBLIC: DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
26 */
int
__ham_compact_int(dbc, start, stop, factor, c_data, donep, flags)
	DBC *dbc;
	DBT *start, *stop;
	u_int32_t factor;
	DB_COMPACT *c_data;
	int *donep;
	u_int32_t flags;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	db_pgno_t origpgno, pgno;
	int check_trunc, pgs_done, ret, t_ret;
	u_int32_t empty_buckets, i, stop_bucket;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	hcp = (HASH_CURSOR *)dbc->internal;
	pgs_done = 0;
	empty_buckets = 0;
	/* Only try to relocate pages when a truncation point was computed. */
	check_trunc = c_data->compact_truncate != PGNO_INVALID;
	/* Pin the hash metadata page for the duration of this pass. */
	if ((ret = __ham_get_meta(dbc)) != 0)
		return (ret);

	/*
	 * The bucket range comes in through start/stop; an empty DBT
	 * means "from the first bucket" / "through the last bucket".
	 */
	if (stop != NULL && stop->size != 0)
		stop_bucket = *(u_int32_t *)stop->data;
	else
		stop_bucket = hcp->hdr->max_bucket;

	if (start != NULL && start->size != 0)
		hcp->bucket = *(u_int32_t *)start->data;
	else
		hcp->bucket = 0;

	for (; hcp->bucket <= stop_bucket && ret == 0; hcp->bucket++) {
		/*
		 * For each bucket first move records toward the head of
		 * the bucket.
		 */
		hcp->indx = NDX_INVALID;
		F_CLR(hcp, H_ISDUP);
		hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
		pgno = PGNO_INVALID;
		ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);

		/*
		 * If the bucket is empty, just note it, otherwise process it.
		 * If there are any records there must be some in the head
		 * of the bucket.
		 */
		if (ret == DB_NOTFOUND ) {
			empty_buckets++;
			c_data->compact_pages_examine++;
			/* An empty bucket must be a single, unchained page. */
			DB_ASSERT(dbp->env,
			    PREV_PGNO(hcp->page) == PGNO_INVALID &&
			    NEXT_PGNO(hcp->page) == PGNO_INVALID);
			goto err;
		} else if (ret != 0)
			break;
		c_data->compact_pages_examine++;

		/*
		 * Multi-page bucket: pack records onto as few pages as
		 * possible, then reposition the cursor at the bucket head.
		 */
		if (NEXT_PGNO(hcp->page) != PGNO_INVALID) {
			if ((ret =
			    __ham_compact_bucket(dbc, c_data, &pgs_done)) != 0)
				goto err;
			pgno = PGNO_INVALID;
			if ((ret = __ham_item(dbc, DB_LOCK_WRITE, &pgno)) != 0)
				goto err;
		}

		/*
		 * Loop through the items in this page in the bucket and process
		 * overflow records and off page duplicate sets.
		 */
		while (ret == 0) {
			/* Handle off page duplicate trees. */
			if (pgno == PGNO_INVALID)
				goto no_opd;
			if (check_trunc &&
			    pgno > c_data->compact_truncate) {
				c_data->compact_pages_examine++;
				/*
				 * Truncate this page if possible.
				 * We must update the parent here
				 * because the page number is
				 * not aligned.
				 */
				if ((ret = __memp_dirty(mpf, &hcp->page,
				    dbc->thread_info,
				    dbc->txn, dbc->priority, 0)) != 0)
					break;
				origpgno = pgno;
				if ((ret = __db_truncate_root(dbc, hcp->page,
				    H_DATAINDEX(hcp->indx),
				    &pgno, 0, &pgs_done)) != 0)
					break;
				if (pgno != origpgno) {
					/* The root moved; repoint the item. */
					memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp,
					    hcp->page, hcp->indx)),
					    &pgno, sizeof(db_pgno_t));
					pgs_done++;
					c_data->compact_pages--;
				}
			}
			/*
			 * Compact the off page duplicate tree.
			 */
			if ((ret = __bam_compact_opd(dbc,
			    pgno, NULL, factor, c_data, &pgs_done)) != 0)
				break;

no_opd:			if (check_trunc && HPAGE_PTYPE(H_PAIRDATA(
			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
				/* This is an overflow chain. */
				if ((ret = __ham_truncate_overflow(dbc,
				    H_DATAINDEX(hcp->indx),
				    c_data, &pgs_done)) != 0)
					break;
			}

			/* Check for an overflow key. */
			if (check_trunc && HPAGE_PTYPE(H_PAIRKEY(
			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
				/* This is an overflow chain. */
				if ((ret = __ham_truncate_overflow(dbc,
				    H_KEYINDEX(hcp->indx),
				    c_data, &pgs_done)) != 0)
					break;
			}

			pgno = PGNO_INVALID;
			ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
		}

		/* Per-bucket cleanup: release the current page, if any. */
err:		if (hcp->page != NULL &&
		    (t_ret = __memp_fput(mpf, dbc->thread_info,
		    hcp->page, dbc->priority)) != 0 && ret == 0)
			ret = t_ret;
		/* DB_NOTFOUND just means we walked off the end of the bucket. */
		if (ret == DB_NOTFOUND)
			ret = 0;
		hcp->page = NULL;
		hcp->pgno = pgno = PGNO_INVALID;
		/*
		 * If we are in an auto-transaction and we updated something
		 * return to the caller to commit this transaction to
		 * avoid holding locks. Otherwise process the next bucket.
		 * We can drop the lock if we did not do anything.
		 * We always must commit the txn if we are in MVCC
		 * as we have dirtied the hash buckets.
		 */
		if (ret == 0 &&
		    atomic_read(&dbp->mpf->mfp->multiversion) == 0 &&
		    (pgs_done == 0 || dbc->txn == NULL))
			ret = __LPUT(dbc, hcp->lock);
		else if (LF_ISSET(DB_AUTO_COMMIT)) {
			/* Resume from the next bucket on the next call. */
			if (ret == 0)
				hcp->bucket++;
			break;
		}
	}
	/*
	 * If we saw any empty buckets and we are freeing space we
	 * want to contract the table before dropping the metadata
	 * page. Wait till we are done with everything else as we
	 * need to get an exclusive lock on the metadata page.
	 */
	if (ret == 0 && empty_buckets != 0 && LF_ISSET(DB_FREE_SPACE)) {
		for (i = 0; i < empty_buckets && hcp->hdr->max_bucket > 2; i++)
			if ((ret = __ham_contract_table(dbc, c_data)) != 0)
				break;
	}

	/* Hand the resume point (current bucket) back through "start". */
	if (ret == 0)
		ret = __db_retcopy(dbp->env, start, &hcp->bucket,
		    sizeof(hcp->bucket), &start->data, &start->ulen);
	(void)__ham_release_meta(dbc);
	c_data->compact_empty_buckets += empty_buckets;
	if (hcp->bucket > stop_bucket)
		*donep = 1;
	return (ret);
}
209
210 /*
211 * __ham_compact_bucket -- move data to as few pages as possible.
212 *
213 * PUBLIC: int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
214 */
int
__ham_compact_bucket(dbc, c_data, pgs_donep)
	DBC *dbc;
	DB_COMPACT *c_data;
	int *pgs_donep;
{
	DB *dbp;
	DB_MPOOLFILE *mpf;
	HASH_CURSOR *hcp;
	PAGE *pg;
	db_pgno_t pgno;
	int check_trunc, ret, t_ret;

	hcp = (HASH_CURSOR *)dbc->internal;
	dbp = dbc->dbp;
	mpf = dbp->mpf;
	/* Start on the page the cursor already has pinned (the bucket head). */
	pg = hcp->page;
	check_trunc = c_data->compact_truncate != PGNO_INVALID;
	ret = 0;

	pgno = hcp->pgno;
	/* Walk the bucket's page chain, packing records toward the head. */
	do {
		if (pg == NULL && (ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
			break;
		/* Sort any unsorted pages before adding to the page. */
		if (TYPE(pg) == P_HASH_UNSORTED) {
			if ((ret = __ham_sort_page_cursor(dbc, pg)) != 0)
				break;
			(*pgs_donep)++;
		}

		/* If this is not the head try to move it to a lower page. */
		if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID &&
		    PGNO(pg) > c_data->compact_truncate &&
		    (ret = __db_exchange_page(dbc, &pg,
		    hcp->page, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
			break;
		if (pgno != PGNO(pg))
			(*pgs_donep)++;

		if (NEXT_PGNO(pg) == PGNO_INVALID)
			break;
		/* Pull as many records as fit from the next page onto pg. */
		if ((ret = __ham_copy_data(dbc, pg, c_data, pgs_donep)) != 0)
			break;
		pgno = NEXT_PGNO(pg);
		/* Release every page except the head, which the cursor owns. */
		if (pg != hcp->page && (ret = __memp_fput(mpf,
		    dbc->thread_info, pg, dbc->priority)) != 0)
			break;
		pg = NULL;
	} while (pgno != PGNO_INVALID);

	/* Error paths can leave a non-head page pinned; release it here. */
	if (pg != NULL && pg != hcp->page &&
	    (t_ret = __memp_fput(mpf, dbc->thread_info, pg, dbc->priority)) &&
	    ret == 0)
		ret = t_ret;
	return (ret);
}
273
274 /*
275 * __ham_copy_data -- copy as many records as possible from next page
276 */
277 static int
__ham_copy_data(dbc,pg,c_data,pgs_donep)278 __ham_copy_data(dbc, pg, c_data, pgs_donep)
279 DBC *dbc;
280 PAGE *pg;
281 DB_COMPACT *c_data;
282 int *pgs_donep;
283 {
284 DB *dbp;
285 DBC *newdbc;
286 DBT data, key;
287 DB_MPOOLFILE *mpf;
288 HASH_CURSOR *hcp, *ncp;
289 PAGE *nextpage;
290 db_pgno_t origpgno;
291 int i, nument, records, ret, t_ret;
292 u_int32_t len;
293
294 dbp = dbc->dbp;
295 mpf = dbp->mpf;
296 hcp = (HASH_CURSOR *)dbc->internal;
297 records = 0;
298
299 if ((ret = __dbc_dup(dbc, &newdbc, 0)) != 0)
300 return (ret);
301 ncp = (HASH_CURSOR *)newdbc->internal;
302 ncp->hdr = hcp->hdr;
303
304 /*
305 * Copy data to the front of the bucket. Loop until either we
306 * have not replaced the next page or there is no next page.
307 * If the next page was not removed then it still has data
308 * on it.
309 */
310 origpgno = PGNO_INVALID;
311 while (origpgno != NEXT_PGNO(pg) &&
312 (origpgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
313
314 if ((ret = __memp_fget(mpf, &NEXT_PGNO(pg), dbc->thread_info,
315 dbc->txn, DB_MPOOL_DIRTY, &nextpage)) != 0)
316 break;
317
318 c_data->compact_pages_examine++;
319 ncp->page = nextpage;
320 ncp->pgno = PGNO(nextpage);
321 ncp->indx = 0;
322 memset(&key, 0, sizeof(key));
323 memset(&data, 0, sizeof(data));
324 nument = NUM_ENT(nextpage);
325 DB_ASSERT(dbp->env, nument != 0);
326 for (i = 0; i < nument; i += 2) {
327 len = LEN_HITEM(dbp, nextpage, dbp->pgsize, 0) +
328 LEN_HITEM(dbp, nextpage, dbp->pgsize, 1) +
329 2 * sizeof(db_indx_t);
330 if (P_FREESPACE(dbp, pg) < len)
331 continue;
332
333 if ((ret =
334 __ham_copypair(dbc, nextpage, 0, pg, NULL, 1)) != 0)
335 break;
336
337 records++;
338 if ((ret = __ham_del_pair(newdbc,
339 HAM_DEL_IGNORE_OFFPAGE, pg)) != 0)
340 break;
341 if (!STD_LOCKING(dbc)) {
342 if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
343 return (ret);
344 ++hcp->hdr->nelem;
345 }
346 }
347 /*
348 * If we moved all the records then __ham_del_pair will
349 * have deleted the nextpage.
350 */
351 if (records >= nument/2) {
352 c_data->compact_pages_examine++;
353 c_data->compact_pages_free++;
354 COMPACT_TRUNCATE(c_data);
355 }
356 if (ncp->page != NULL &&
357 (t_ret = __memp_fput(mpf, dbc->thread_info,
358 ncp->page, dbc->priority)) != 0 && ret == 0)
359 ret = t_ret;
360 ncp->page = NULL;
361 ncp->pgno = PGNO_INVALID;
362 }
363
364 /*
365 * If __ham_del_pair freed a page then we needed to dirty the metapage
366 * and it could change so we need to copy it back to hcp.
367 */
368 hcp->hdr = ncp->hdr;
369 ncp->hdr = NULL;
370 if ((t_ret = __ham_release_meta(newdbc)) != 0 && ret == 0)
371 ret = t_ret;
372 if ((t_ret = __dbc_close(newdbc)) != 0 && ret == 0)
373 ret = t_ret;
374 if (records != 0)
375 (*pgs_donep)++;
376 return (ret);
377 }
378
379 /*
380 * __ham_truncate_overflow -- try to truncate pages from an overflow chain.
381 */
382 static int
__ham_truncate_overflow(dbc,indx,c_data,pgs_done)383 __ham_truncate_overflow(dbc, indx, c_data, pgs_done)
384 DBC *dbc;
385 u_int32_t indx;
386 DB_COMPACT *c_data;
387 int *pgs_done;
388 {
389 DB *dbp;
390 HASH_CURSOR *hcp;
391 db_pgno_t origpgno, pgno;
392 int ret;
393
394 hcp = (HASH_CURSOR *)dbc->internal;
395 dbp = dbc->dbp;
396 memcpy(&pgno,
397 HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)), sizeof(db_pgno_t));
398 if (pgno > c_data->compact_truncate) {
399 c_data->compact_pages_examine++;
400 origpgno = pgno;
401 if ((ret = __memp_dirty(dbp->mpf, &hcp->page,
402 dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
403 return (ret);
404 if ((ret = __db_truncate_root(dbc,
405 hcp->page, indx, &pgno, 0, pgs_done)) != 0)
406 return (ret);
407 if (pgno != origpgno) {
408 memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)),
409 &pgno, sizeof(db_pgno_t));
410 (*pgs_done)++;
411 c_data->compact_pages--;
412 }
413 }
414 if ((ret =
415 __db_truncate_overflow(dbc, pgno, NULL, c_data, pgs_done)) != 0)
416 return (ret);
417 return (0);
418 }
419
420 #ifdef HAVE_FTRUNCATE
421 /*
422 * __ham_compact_hash -- compact the hash table.
423 * PUBLIC: int __ham_compact_hash __P((DB *,
424 * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
425 */
int
__ham_compact_hash(dbp, ip, txn, c_data)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DB_COMPACT *c_data;
{
	DBC *dbc;
	DB_LOCK lock;
	HASH_CURSOR *hcp;
	HMETA *meta;
	PAGE *oldpage;
	db_pgno_t free_pgno, last_pgno, pgno, start_pgno;
	int flags, local_txn, pgs_done, ret, t_ret;
	u_int32_t bucket, i, size;

	/* Open our own transaction if the caller did not supply one. */
	local_txn = IS_DB_AUTO_COMMIT(dbp, txn);
	pgs_done = 0;
	oldpage = NULL;
	dbc = NULL;
	LOCK_INIT(lock);

	if (local_txn &&
	    (ret = __txn_begin(dbp->env, ip, txn, &txn, 0)) != 0)
		return (ret);

	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
		goto err1;
	hcp = (HASH_CURSOR *)dbc->internal;

	/* This routine rewrites the metadata page; pin it dirty up front. */
	if ((ret = __ham_get_meta(dbc)) != 0 ||
	    (ret = __ham_dirty_meta(dbc, 0)) != 0)
		goto err1;

	meta = hcp->hdr;

	LOCK_CHECK_OFF(ip);

	/*
	 * Find contiguous lower numbered pages for each hash table segment.
	 */
	for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++) {
		/*
		 * Segment 0 holds buckets 0 and 1; every later segment i
		 * holds 2^(i-1) buckets starting at bucket 2^(i-1).
		 */
		if (i == 0) {
			bucket = 0;
			size = 1;
		} else {
			bucket = 1 << (i - 1);
			size = bucket;
		}
		start_pgno = meta->spares[i] + bucket;
		/* Look for a contiguous run of free pages below this segment. */
		if ((ret = __db_find_free(dbc, P_HASH,
		    size, start_pgno, &free_pgno)) != 0) {
			if (ret != DB_NOTFOUND)
				break;
			ret = 0;
			continue;
		}
		/* Log the segment move before touching any page. */
		if (DBC_LOGGING(dbc)) {
			if ((ret = __ham_changeslot_log(dbp,
			    dbc->txn, &LSN(meta),
			    0, &LSN(meta), i, start_pgno, free_pgno)) != 0)
				break;
		} else
			LSN_NOT_LOGGED(LSN(meta));
		last_pgno = free_pgno + bucket;
		/*
		 * March through the list swapping pages.  If the page is
		 * empty we just need to free it.  If we are just sliding
		 * things down don't free the pages that will be reused.
		 * Note that __db_exchange_page returns the new page so
		 * we must put it.
		 */
		for (pgno = start_pgno;
		    pgno < start_pgno + size; pgno++, free_pgno++) {
			/* Lock-couple down the segment's pages. */
			if ((ret = __db_lget(dbc,
			    LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
				goto err;
			if ((ret = __memp_fget(dbp->mpf, &pgno,
			    dbc->thread_info, dbc->txn,
			    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &oldpage)) != 0)
				goto err;
			if (NUM_ENT(oldpage) != 0) {
				/* Pages that will be reused must not be freed. */
				if (pgno < last_pgno)
					flags = 0;
				else
					flags = DB_EXCH_FREE;
				if ((ret = __db_exchange_page(dbc, &oldpage,
				    NULL, free_pgno, flags, &pgs_done)) != 0)
					goto err;
			} else if (pgno >= last_pgno) {
				/* Empty page past the reuse window: free it. */
				if ((ret = __db_free(dbc, oldpage, 0)) != 0)
					goto err;
				COMPACT_TRUNCATE(c_data);
				oldpage = NULL;
			}
			if (oldpage != NULL && (ret = __memp_fput(dbp->mpf,
			    dbc->thread_info, oldpage, dbc->priority)) != 0)
				goto err;
			ret = 0;
			oldpage = NULL;
			c_data->compact_pages_examine++;
		}
		/* Rebase the segment pointer on the relocated run. */
		meta->spares[i] = free_pgno - (size + bucket);
	}
	/* In subdatabases, try to move the metadata page down as well. */
	if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) &&
	    PGNO(hcp->hdr) > c_data->compact_truncate)
		ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr,
		    c_data, &pgs_done);

err:	if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf,
	    dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
		ret = t_ret;
	LOCK_CHECK_ON(ip);
err1:	if (dbc != NULL) {
		if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
			ret = t_ret;
		if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
			ret = t_ret;
	}
	/* Resolve a locally begun transaction: commit on success, else abort. */
	if (local_txn && (t_ret = (ret == 0 ?
	    __txn_commit(txn, 0) : __txn_abort(txn))) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
553 #endif
554