1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 2013 Oracle and/or its affiliates. All rights reserved.
5 *
6 * $Id$
7 */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/db_verify.h"
14 #include "dbinc/btree.h"
15 #ifdef HAVE_HASH
16 #include "dbinc/hash.h"
17 #endif
18 #include "dbinc/lock.h"
19 #include "dbinc/mp.h"
20 #include "dbinc/partition.h"
21 #include "dbinc/txn.h"
22 #ifdef HAVE_PARTITION
23
/* Forward declarations for the routines that are private to this file. */
static int __part_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *,
    const char *, const char *, const char *, u_int32_t));
static int __partc_close __P((DBC *, db_pgno_t, int *));
static int __partc_del __P((DBC*, u_int32_t));
static int __partc_destroy __P((DBC*));
static int __partc_get_pp __P((DBC*, DBT *, DBT *, u_int32_t));
static int __partc_put __P((DBC*, DBT *, DBT *, u_int32_t, db_pgno_t *));
static int __partc_writelock __P((DBC*));
static int __partition_chk_meta __P((DB *,
    DB_THREAD_INFO *, DB_TXN *, u_int32_t));
static int __partition_setup_keys __P((DBC *,
    DB_PARTITION *, DBMETA *, u_int32_t));
static int __part_key_cmp __P((const void *, const void *));
static inline void __part_search __P((DB *,
    DB_PARTITION *, DBT *, u_int32_t *));

/*
 * Format string shared by this file's allocation-failure error reports.
 * It consumes exactly one %d argument: the number of bytes requested.
 */
static char *Alloc_err = DB_STR_A("0644",
    "Partition open failed to allocate %d bytes", "%d");
42
/*
 * GET_PART_CURSOR --
 *	Allocate a cursor on partition "part_id" and copy the flags of the
 *	partitioned cursor "dbc" to it.  The new cursor is created with the
 *	thread info, transaction and locker of "dbc" and is stored in
 *	"new_dbc".
 *
 * The macro is not self-contained.  At the point of use the enclosing
 * function must provide:
 *	part -- the DB_PARTITION whose handles[] array is indexed,
 *	ret  -- an int that receives the return of __db_cursor_int,
 *	err  -- a label that is jumped to if the allocation fails.
 *
 * Flags not passed on to the new cursor:
 *	DBC_PARTITIONED -- the subcursors are not.
 *	DBC_OWN_LID -- the arg dbc owns the lock id.
 *	DBC_WRITECURSOR DBC_WRITER -- CDS locking happens on
 *		the whole DB, not the partition.
 */
#define	GET_PART_CURSOR(dbc, new_dbc, part_id) do {			\
	DB *__part_dbp;							\
	__part_dbp = part->handles[part_id];				\
	if ((ret = __db_cursor_int(__part_dbp,				\
	     (dbc)->thread_info, (dbc)->txn, __part_dbp->type,		\
	     PGNO_INVALID, 0, (dbc)->locker, &new_dbc)) != 0)		\
		goto err;						\
	(new_dbc)->flags = (dbc)->flags &				\
	    ~(DBC_PARTITIONED|DBC_OWN_LID|DBC_WRITECURSOR|DBC_WRITER);	\
} while (0)
61
62 /*
63 * Search for the correct partition.
64 */
__part_search(dbp,part,key,part_idp)65 static inline void __part_search(dbp, part, key, part_idp)
66 DB *dbp;
67 DB_PARTITION *part;
68 DBT *key;
69 u_int32_t *part_idp;
70 {
71 db_indx_t base, indx, limit;
72 int cmp;
73 int (*func) __P((DB *, const DBT *, const DBT *));
74
75 DB_ASSERT(dbp->env, part->nparts != 0);
76 COMPQUIET(cmp, 0);
77 COMPQUIET(indx, 0);
78
79 func = ((BTREE *)dbp->bt_internal)->bt_compare;
80 DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) {
81 DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX);
82 cmp = func(dbp, key, &part->keys[indx]);
83 if (cmp == 0)
84 break;
85 if (cmp > 0)
86 DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX);
87 }
88 if (cmp == 0)
89 *part_idp = indx;
90 else if ((*part_idp = base) != 0)
91 (*part_idp)--;
92 }
93
94 /*
95 * __partition_init --
96 * Initialize the partition structure.
97 * Called when the meta data page is read in during database open or
98 * when partition keys or a callback are set.
99 *
100 * PUBLIC: int __partition_init __P((DB *, u_int32_t));
101 */
102 int
__partition_init(dbp,flags)103 __partition_init(dbp, flags)
104 DB *dbp;
105 u_int32_t flags;
106 {
107 DB_PARTITION *part;
108 int ret;
109
110 if ((part = dbp->p_internal) != NULL) {
111 if ((LF_ISSET(DBMETA_PART_RANGE) &&
112 F_ISSET(part, PART_CALLBACK)) ||
113 (LF_ISSET(DBMETA_PART_CALLBACK) &&
114 F_ISSET(part, PART_RANGE))) {
115 __db_errx(dbp->env, DB_STR("0645",
116 "Cannot specify callback and range keys."));
117 return (EINVAL);
118 }
119 } else if ((ret = __os_calloc(dbp->env, 1, sizeof(*part), &part)) != 0)
120 return (ret);
121
122 if (LF_ISSET(DBMETA_PART_RANGE))
123 F_SET(part, PART_RANGE);
124 if (LF_ISSET(DBMETA_PART_CALLBACK))
125 F_SET(part, PART_CALLBACK);
126 dbp->p_internal = part;
127 /* Set up AM-specific methods that do not require an open. */
128 dbp->db_am_rename = __part_rename;
129 dbp->db_am_remove = __part_remove;
130 return (0);
131 }
132 /*
133 * __partition_set --
134 * Set the partitioning keys or callback function.
135 * This routine must be called prior to creating the database.
136 * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
137 * PUBLIC: u_int32_t (*callback)(DB *, DBT *key)));
138 */
139
140 int
__partition_set(dbp,parts,keys,callback)141 __partition_set(dbp, parts, keys, callback)
142 DB *dbp;
143 u_int32_t parts;
144 DBT *keys;
145 u_int32_t (*callback)(DB *, DBT *key);
146 {
147 DB_PARTITION *part;
148 ENV *env;
149 int ret;
150
151 DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition");
152 env = dbp->dbenv->env;
153
154 if (parts < 2) {
155 __db_errx(env, DB_STR("0646",
156 "Must specify at least 2 partitions."));
157 return (EINVAL);
158 }
159
160 if (keys == NULL && callback == NULL) {
161 __db_errx(env, DB_STR("0647",
162 "Must specify either keys or a callback."));
163 return (EINVAL);
164 }
165 if (keys != NULL && callback != NULL) {
166 bad: __db_errx(env, DB_STR("0648",
167 "May not specify both keys and a callback."));
168 return (EINVAL);
169 }
170
171 if ((ret = __partition_init(dbp,
172 keys != NULL ?
173 DBMETA_PART_RANGE : DBMETA_PART_CALLBACK)) != 0)
174 return (ret);
175 part = dbp->p_internal;
176
177 if ((part->keys != NULL && callback != NULL) ||
178 (part->callback != NULL && keys != NULL))
179 goto bad;
180
181 part->nparts = parts;
182 part->keys = keys;
183 part->callback = callback;
184
185 return (0);
186 }
187
188 /*
189 * __partition_set_dirs --
190 * Set the directories for creating the partition databases.
191 * They must be in the environment.
192 * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
193 */
194 int
__partition_set_dirs(dbp,dirp)195 __partition_set_dirs(dbp, dirp)
196 DB *dbp;
197 const char **dirp;
198 {
199 DB_ENV *dbenv;
200 DB_PARTITION *part;
201 ENV *env;
202 u_int32_t ndirs, slen;
203 int i, ret;
204 const char **dir;
205 char *cp, **part_dirs, **pd;
206
207 DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition_dirs");
208 dbenv = dbp->dbenv;
209 env = dbp->env;
210
211 ndirs = 1;
212 slen = 0;
213 for (dir = dirp; *dir != NULL; dir++) {
214 if (F_ISSET(env, ENV_DBLOCAL))
215 slen += (u_int32_t)strlen(*dir) + 1;
216 ndirs++;
217 }
218
219 slen += sizeof(char *) * ndirs;
220 if ((ret = __os_malloc(env, slen, &part_dirs)) != 0)
221 return (EINVAL);
222 memset(part_dirs, 0, slen);
223
224 cp = (char *) part_dirs + (sizeof(char *) * ndirs);
225 pd = part_dirs;
226 for (dir = dirp; *dir != NULL; dir++, pd++) {
227 if (F_ISSET(env, ENV_DBLOCAL)) {
228 (void)strcpy(cp, *dir);
229 *pd = cp;
230 cp += strlen(*dir) + 1;
231 continue;
232 }
233 for (i = 0; i < dbenv->data_next; i++)
234 if (strcmp(*dir, dbenv->db_data_dir[i]) == 0)
235 break;
236 if (i == dbenv->data_next) {
237 __db_errx(dbp->env, DB_STR_A("0649",
238 "Directory not in environment list %s",
239 "%s"), *dir);
240 __os_free(env, part_dirs);
241 return (EINVAL);
242 }
243 *pd = dbenv->db_data_dir[i];
244 }
245
246 if ((part = dbp->p_internal) == NULL) {
247 if ((ret = __partition_init(dbp, 0)) != 0)
248 return (ret);
249 part = dbp->p_internal;
250 }
251
252 part->dirs = (const char **)part_dirs;
253
254 return (0);
255 }
256
257 /*
258 * __partition_open --
259 * Open/create a partitioned database.
260 * PUBLIC: int __partition_open __P((DB *, DB_THREAD_INFO *,
261 * PUBLIC: DB_TXN *, const char *, DBTYPE, u_int32_t, int, int));
262 */
263 int
__partition_open(dbp,ip,txn,fname,type,flags,mode,do_open)264 __partition_open(dbp, ip, txn, fname, type, flags, mode, do_open)
265 DB *dbp;
266 DB_THREAD_INFO *ip;
267 DB_TXN *txn;
268 const char *fname;
269 DBTYPE type;
270 u_int32_t flags;
271 int mode, do_open;
272 {
273 DB *part_db;
274 DB_PARTITION *part;
275 DBC *dbc;
276 ENV *env;
277 u_int32_t part_id;
278 int ret;
279 char *name, *sp;
280 const char **dirp, *np;
281
282 part = dbp->p_internal;
283 env = dbp->dbenv->env;
284 name = NULL;
285
286 if ((ret = __partition_chk_meta(dbp, ip, txn, flags)) != 0 && do_open)
287 goto err;
288
289 if ((ret = __os_calloc(env,
290 part->nparts, sizeof(*part->handles), &part->handles)) != 0) {
291 __db_errx(env,
292 Alloc_err, part->nparts * sizeof(*part->handles));
293 goto err;
294 }
295
296 DB_ASSERT(env, fname != NULL);
297 if ((ret = __os_malloc(env,
298 strlen(fname) + PART_LEN + 1, &name)) != 0) {
299 __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
300 goto err;
301 }
302
303 sp = name;
304 np = __db_rpath(fname);
305 if (np == NULL)
306 np = fname;
307 else {
308 np++;
309 (void)strncpy(name, fname, (size_t)(np - fname));
310 sp = name + (np - fname);
311 }
312
313 if (F_ISSET(dbp, DB_AM_RECOVER))
314 goto done;
315 dirp = part->dirs;
316 for (part_id = 0; part_id < part->nparts; part_id++) {
317 if ((ret = __db_create_internal(
318 &part->handles[part_id], dbp->env, 0)) != 0)
319 goto err;
320
321 part_db = part->handles[part_id];
322 part_db->flags = F_ISSET(dbp,
323 ~(DB_AM_CREATED | DB_AM_CREATED_MSTR | DB_AM_OPEN_CALLED));
324 F_SET(part_db, DB_AM_PARTDB);
325 part_db->adj_fileid = dbp->adj_fileid;
326 part_db->pgsize = dbp->pgsize;
327 part_db->priority = dbp->priority;
328 part_db->db_append_recno = dbp->db_append_recno;
329 part_db->db_feedback = dbp->db_feedback;
330 part_db->dup_compare = dbp->dup_compare;
331 part_db->app_private = dbp->app_private;
332 part_db->api_internal = dbp->api_internal;
333
334 if (dbp->type == DB_BTREE)
335 __bam_copy_config(dbp, part_db, part->nparts);
336 #ifdef HAVE_HASH
337 if (dbp->type == DB_HASH)
338 __ham_copy_config(dbp, part_db, part->nparts);
339 #endif
340
341 (void)sprintf(sp, PART_NAME, np, part_id);
342 if (do_open) {
343 /*
344 * Cycle through the directory names passed in,
345 * if any.
346 */
347 if (dirp != NULL &&
348 (part_db->dirname = *dirp++) == NULL) {
349 part_db->dirname = *(dirp = part->dirs);
350 dirp++;
351 }
352 if ((ret = __db_open(part_db, ip, txn,
353 name, NULL, type, flags, mode, PGNO_BASE_MD)) != 0)
354 goto err;
355 } else if ((ret = __os_strdup(env, name, &part_db->fname)) != 0)
356 goto err;
357 }
358
359 /* Get rid of the cursor used to open the database its the wrong type */
360 done: while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
361 if ((ret = __dbc_destroy(dbc)) != 0)
362 break;
363
364 if (0) {
365 err: (void)__partition_close(dbp, txn, 0);
366 }
367 if (name != NULL)
368 __os_free(env, name);
369 return (ret);
370 }
371
/*
 * __partition_chk_meta --
 *	Check for a consistent meta data page and parameters when opening a
 *	partitioned database.
 *
 *	The meta data page of the main database is read under a read lock
 *	and compared with the partitioning that was configured on the
 *	handle: database type, range versus callback partitioning, presence
 *	of the callback and the number of partitions.  For a btree the
 *	partition keys are then set up by __partition_setup_keys.
 *
 *	Returns 0 on success, EINVAL (after reporting a message) on any
 *	mismatch, or the error of a failing cursor, lock or page operation.
 */
static int
__partition_chk_meta(dbp, ip, txn, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
{
	DBMETA *meta;
	DB_PARTITION *part;
	DBC *dbc;
	DB_LOCK metalock;
	DB_MPOOLFILE *mpf;
	ENV *env;
	db_pgno_t base_pgno;
	int ret, t_ret;

	dbc = NULL;
	meta = NULL;
	LOCK_INIT(metalock);
	part = dbp->p_internal;
	mpf = dbp->mpf;
	env = dbp->env;
	ret = 0;

	/*
	 * Get a cursor on the main db.  The partition structure is detached
	 * from the handle while the main database is examined; it is put
	 * back just before returning.
	 */
	dbp->p_internal = NULL;
	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
		goto err;

	/* Get the metadata page. */
	base_pgno = PGNO_BASE_MD;
	if ((ret =
	    __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
		goto err;

	/* Only hash and non-recno btree databases may be partitioned. */
	if (meta->magic != DB_HASHMAGIC &&
	    (meta->magic != DB_BTREEMAGIC || F_ISSET(meta, BTM_RECNO))) {
		__db_errx(env, DB_STR("0650",
	    "Partitioning may only specified on BTREE and HASH databases."));
		ret = EINVAL;
		goto err;
	}
	/* The file itself must say that it is partitioned. */
	if (!FLD_ISSET(meta->metaflags,
	    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) {
		__db_errx(env, DB_STR("0651",
		    "Partitioning specified on a non-partitioned database."));
		ret = EINVAL;
		goto err;
	}

	/* The scheme on the handle must agree with the one in the file. */
	if ((F_ISSET(part, PART_RANGE) &&
	    FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) ||
	    (F_ISSET(part, PART_CALLBACK) &&
	    FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))) {
		__db_errx(env, DB_STR("0652",
		    "Incompatible partitioning specified."));
		ret = EINVAL;
		goto err;
	}

	/*
	 * A callback partitioned file needs its callback, except during
	 * recovery or when DB_RDWRMASTER is set.
	 */
	if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK) &&
	    part->callback == NULL && !IS_RECOVERING(env) &&
	    !F_ISSET(dbp, DB_AM_RECOVER) && !LF_ISSET(DB_RDWRMASTER)) {
		__db_errx(env, DB_STR("0653",
		    "Partition callback not specified."));
		ret = EINVAL;
		goto err;
	}

	if (F_ISSET(dbp, DB_AM_RECNUM)) {
		__db_errx(env, DB_STR("0654",
	    "Record numbers are not supported in partitioned databases."));
		ret = EINVAL;
		goto err;
	}

	/*
	 * Reconcile the partition count: take it from the meta data page if
	 * the handle has none, otherwise the two must agree whenever the
	 * page has a count.
	 */
	if (part->nparts == 0) {
		if (LF_ISSET(DB_CREATE) && meta->nparts == 0) {
			__db_errx(env, DB_STR("0655",
			    "Zero paritions specified."));
			ret = EINVAL;
			goto err;
		} else
			part->nparts = meta->nparts;
	} else if (meta->nparts != 0 && part->nparts != meta->nparts) {
		__db_errx(env, DB_STR("0656",
		    "Number of partitions does not match."));
		ret = EINVAL;
		goto err;
	}

	/* Hash needs a callback; btree gets its partition keys set up. */
	if (meta->magic == DB_HASHMAGIC) {
		if (!F_ISSET(part, PART_CALLBACK)) {
			__db_errx(env, DB_STR("0657",
		    "Hash database must specify a partition callback."));
			ret = EINVAL;
		}
	} else if (meta->magic != DB_BTREEMAGIC) {
		__db_errx(env, DB_STR("0658",
		    "Partitioning only supported on BTREE nad HASH."));
		ret = EINVAL;
	} else
		ret = __partition_setup_keys(dbc, part, meta, flags);

	/*
	 * Cleanup runs on success and on failure alike; the first error
	 * seen is the one that is returned.
	 */
err:	/* Put the metadata page back. */
	if (meta != NULL && (t_ret = __memp_fput(mpf,
	    ip, meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	/*
	 * NOTE(review): dbc is still NULL here if __db_cursor failed;
	 * confirm that __LPUT does not dereference it when the lock was
	 * never acquired.
	 */
	if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;

	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;

	/* Reattach the partition structure to the handle. */
	dbp->p_internal = part;
	return (ret);
}
496
/*
 * Support for sorting keys.  Keys must be sorted using the btree
 * compare function so if we call qsort in __partition_setup_keys
 * we use this structure to pass the DBP and compare function.
 * One element is built per key; qsort reorders the elements, not the
 * keys they point to.
 */
struct key_sort {
	DB *dbp;	/* Handle passed as first argument to compare. */
	DBT *key;	/* The key this element stands for. */
	/* Comparison function applied to the keys of two elements. */
	int (*compare) __P((DB *, const DBT *, const DBT *));
};
507
__part_key_cmp(a,b)508 static int __part_key_cmp(a, b)
509 const void *a, *b;
510 {
511 const struct key_sort *ka, *kb;
512
513 ka = a;
514 kb = b;
515 return (ka->compare(ka->dbp, ka->key, kb->key));
516 }
/*
 * __partition_setup_keys --
 *	Get the partition keys into memory, or put them to disk if we
 *	are creating a partitioned database.
 *
 *	dbc is a cursor on the main database, which stores the range keys;
 *	part is the partition structure of that database (detached from the
 *	handle while this function runs), meta its meta data page and flags
 *	the open flags.
 *
 *	For range partitioning the keys are bulk-read into part->data and
 *	part->keys is pointed at an array of DBTs placed behind the bulk
 *	data in the same buffer.  If the caller supplied keys and the main
 *	database already had keys, the two sets must match.
 *
 *	Returns 0 on success, EINVAL (after reporting a message) on an
 *	inconsistency, or the error of a failing get, put or allocation.
 */
static int
__partition_setup_keys(dbc, part, meta, flags)
	DBC *dbc;
	DB_PARTITION *part;
	DBMETA *meta;
	u_int32_t flags;
{
	BTREE *t;
	DB *dbp;
	DBT data, key, *keys, *kp;
	ENV *env;
	u_int32_t ds, i, j;
	u_int8_t *dd;
	struct key_sort *ks;
	int have_keys, ret;
	int (*compare) __P((DB *, const DBT *, const DBT *));
	void *dp;

	COMPQUIET(dd, NULL);
	COMPQUIET(ds, 0);
	memset(&data, 0, sizeof(data));
	memset(&key, 0, sizeof(key));
	ks = NULL;

	dbp = dbc->dbp;
	env = dbp->env;

	/* Need to just read the main database. */
	dbp->p_internal = NULL;
	have_keys = 0;

	/*
	 * First verify that things are what we expect: an empty main
	 * database is fine for callback partitioning, on create, during
	 * recovery or with DB_RDWRMASTER; a populated one must not be
	 * combined with a callback and must start with an empty key.
	 */
	if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) {
		if (ret != DB_NOTFOUND)
			goto err;
		if (F_ISSET(part, PART_CALLBACK)) {
			ret = 0;
			goto done;
		}
		if (!LF_ISSET(DB_CREATE) && !F_ISSET(dbp, DB_AM_RECOVER) &&
		    !LF_ISSET(DB_RDWRMASTER)) {
			__db_errx(env, DB_STR("0659", "No range keys found."));
			ret = EINVAL;
			goto err;
		}
	} else {
		if (F_ISSET(part, PART_CALLBACK)) {
			__db_errx(env, DB_STR("0660",
			    "Keys found and callback set."));
			ret = EINVAL;
			goto err;
		}
		if (key.size != 0) {
			__db_errx(env, DB_STR("0661",
			    "Partition key 0 is not empty."));
			ret = EINVAL;
			goto err;
		}
		have_keys = 1;
	}

	if (LF_ISSET(DB_CREATE) && have_keys == 0) {
		/*
		 * Insert the keys into the master database.  There is one
		 * key less than there are partitions.
		 */
		for (i = 0; i < part->nparts - 1; i++) {
			if ((ret = __db_put(dbp, dbc->thread_info,
			    dbc->txn, &part->keys[i], &data, 0)) != 0)
				goto err;
		}

		/*
		 * Insert the "0" pointer.  All records less than the first
		 * given key go into this partition.  We must use the default
		 * compare to insert this key, otherwise it might not be first.
		 * The configured compare function is restored right after
		 * the put, whether or not it succeeded.
		 */
		t = dbc->dbp->bt_internal;
		compare = t->bt_compare;
		t->bt_compare = __bam_defcmp;
		memset(&key, 0, sizeof(key));
		ret = __db_put(dbp, dbc->thread_info, dbc->txn, &key, &data, 0);
		t->bt_compare = compare;
		if (ret != 0)
			goto err;
	}
done:	if (F_ISSET(part, PART_RANGE)) {
		/*
		 * Allocate one page to hold the keys plus space at the
		 * end of the buffer to put an array of DBTs.  If there
		 * is not enough space __dbc_get will return how much
		 * is needed and we realloc.
		 */
		if ((ret = __os_malloc(env,
		    meta->pagesize + (sizeof(DBT) * part->nparts),
		    &part->data)) != 0) {
			__db_errx(env, Alloc_err, meta->pagesize);
			goto err;
		}
		memset(&key, 0, sizeof(key));
		memset(&data, 0, sizeof(data));
		data.data = part->data;
		data.ulen = meta->pagesize;
		data.flags = DB_DBT_USERMEM;
again:		if ((ret = __dbc_get(dbc, &key, &data,
		    DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) {
			/* Grow to the reported size and read again. */
			if ((ret = __os_realloc(env,
			    data.size + (sizeof(DBT) * part->nparts),
			    &part->data)) != 0)
				goto err;
			data.data = part->data;
			data.ulen = data.size;
			goto again;
		}
		if (ret == 0) {
			/*
			 * They passed in keys, they must match.  Build a
			 * copy of the caller's key list sorted with the
			 * btree compare function so that it can be walked
			 * in step with the keys read from the database.
			 */
			keys = NULL;
			compare = NULL;
			if (have_keys == 1 && (keys = part->keys) != NULL) {
				t = dbc->dbp->bt_internal;
				compare = t->bt_compare;
				if ((ret = __os_malloc(env, (part->nparts - 1)
				    * sizeof(struct key_sort), &ks)) != 0)
					goto err;
				for (j = 0; j < part->nparts - 1; j++) {
					ks[j].dbp = dbc->dbp;
					ks[j].compare = compare;
					ks[j].key = &keys[j];
				}

				qsort(ks, (size_t)part->nparts - 1,
				    sizeof(struct key_sort), __part_key_cmp);
			}
			/*
			 * From here on part->keys is the DBT array that
			 * sits behind the bulk data; each DBT refers to a
			 * key inside the bulk buffer.
			 */
			DB_MULTIPLE_INIT(dp, &data);
			part->keys = (DBT *)
			    ((u_int8_t *)part->data + data.size);
			j = 0;
			for (kp = part->keys;
			    kp < &part->keys[part->nparts]; kp++, j++) {
				DB_MULTIPLE_KEY_NEXT(dp,
				    &data, kp->data, kp->size, dd, ds);
				/* Fewer keys stored than partitions. */
				if (dp == NULL) {
					ret = DB_NOTFOUND;
					break;
				}
				/*
				 * Stored key 0 is the empty key, so stored
				 * key j corresponds to sorted caller key
				 * j - 1.
				 */
				if (keys != NULL && j != 0 &&
				    compare(dbc->dbp, ks[j - 1].key, kp) != 0) {
					/*
					 * A missing key is let through for
					 * a recovery handle; ret is still 0
					 * at this point.
					 */
					if (kp->data == NULL &&
					    F_ISSET(dbp, DB_AM_RECOVER))
						goto err;
					__db_errx(env, DB_STR_A("0662",
					    "Partition key %d does not match",
					    "%d"), j);
					ret = EINVAL;
					goto err;
				}
			}
		}
	}
	/* Missing keys are not an error for a recovery handle. */
	if (ret == DB_NOTFOUND && F_ISSET(dbp, DB_AM_RECOVER))
		ret = 0;

	/* Reattach the partition structure and drop the sort array. */
err:	dbp->p_internal = part;
	if (ks != NULL)
		__os_free(env, ks);
	return (ret);
}
688
689 /*
690 * __partition_get_callback --
691 * Get the partition callback function.
692 * PUBLIC: int __partition_get_callback __P((DB *,
693 * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
694 */
695 int
__partition_get_callback(dbp,parts,callback)696 __partition_get_callback(dbp, parts, callback)
697 DB *dbp;
698 u_int32_t *parts;
699 u_int32_t (**callback)(DB *, DBT *key);
700 {
701 DB_PARTITION *part;
702
703 part = dbp->p_internal;
704 /* Only return populated results if partitioned using callbacks. */
705 if (part != NULL && !F_ISSET(part, PART_CALLBACK))
706 part = NULL;
707 if (parts != NULL)
708 *parts = (part != NULL ? part->nparts : 0);
709 if (callback != NULL)
710 *callback = (part != NULL ? part->callback : NULL);
711
712 return (0);
713 }
714
715 /*
716 * __partition_get_keys --
717 * Get partition keys.
718 * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
719 */
720 int
__partition_get_keys(dbp,parts,keys)721 __partition_get_keys(dbp, parts, keys)
722 DB *dbp;
723 u_int32_t *parts;
724 DBT **keys;
725 {
726 DB_PARTITION *part;
727
728 part = dbp->p_internal;
729 /* Only return populated results if partitioned using ranges. */
730 if (part != NULL && !F_ISSET(part, PART_RANGE))
731 part = NULL;
732 if (parts != NULL)
733 *parts = (part != NULL ? part->nparts : 0);
734 if (keys != NULL)
735 *keys = (part != NULL ? &part->keys[1] : NULL);
736
737 return (0);
738 }
739
740 /*
741 * __partition_get_dirs --
742 * Get partition dirs.
743 * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
744 */
745 int
__partition_get_dirs(dbp,dirpp)746 __partition_get_dirs(dbp, dirpp)
747 DB *dbp;
748 const char ***dirpp;
749 {
750 DB_PARTITION *part;
751 ENV *env;
752 u_int32_t i;
753 int ret;
754
755 env = dbp->env;
756 if ((part = dbp->p_internal) == NULL) {
757 *dirpp = NULL;
758 return (0);
759 }
760 if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
761 *dirpp = part->dirs;
762 return (0);
763 }
764
765 /*
766 * We build a list once when asked. The original directory list,
767 * if any, was discarded at open time.
768 */
769 if ((*dirpp = part->dirs) != NULL)
770 return (0);
771
772 if ((ret = __os_calloc(env,
773 sizeof(char *), part->nparts + 1, (void *) &part->dirs)) != 0)
774 return (ret);
775
776 for (i = 0; i < part->nparts; i++)
777 part->dirs[i] = part->handles[i]->dirname;
778
779 *dirpp = part->dirs;
780 return (0);
781 }
782
783 /*
784 * __partc_init --
785 * Initialize the access private portion of a cursor
786 *
787 * PUBLIC: int __partc_init __P((DBC *));
788 */
789 int
__partc_init(dbc)790 __partc_init(dbc)
791 DBC *dbc;
792 {
793 ENV *env;
794 int ret;
795
796 env = dbc->env;
797
798 /* Allocate/initialize the internal structure. */
799 if (dbc->internal == NULL && (ret =
800 __os_calloc(env, 1, sizeof(PART_CURSOR), &dbc->internal)) != 0)
801 return (ret);
802
803 /* Initialize methods. */
804 dbc->close = dbc->c_close = __dbc_close_pp;
805 dbc->cmp = __dbc_cmp_pp;
806 dbc->count = dbc->c_count = __dbc_count_pp;
807 dbc->del = dbc->c_del = __dbc_del_pp;
808 dbc->dup = dbc->c_dup = __dbc_dup_pp;
809 dbc->get = dbc->c_get = __partc_get_pp;
810 dbc->pget = dbc->c_pget = __dbc_pget_pp;
811 dbc->put = dbc->c_put = __dbc_put_pp;
812 dbc->am_bulk = NULL;
813 dbc->am_close = __partc_close;
814 dbc->am_del = __partc_del;
815 dbc->am_destroy = __partc_destroy;
816 dbc->am_get = NULL;
817 dbc->am_put = __partc_put;
818 dbc->am_writelock = __partc_writelock;
819
820 /* We avoid swapping partition cursors since we swap the sub cursors */
821 F_SET(dbc, DBC_PARTITIONED);
822
823 return (0);
824 }
825 /*
826 * __partc_get_pp --
827 * cursor get opeartion on a partitioned database.
828 */
829 static int
__partc_get_pp(dbc,key,data,flags)830 __partc_get_pp(dbc, key, data, flags)
831 DBC *dbc;
832 DBT *key, *data;
833 u_int32_t flags;
834 {
835 DB *dbp;
836 DB_THREAD_INFO *ip;
837 ENV *env;
838 int ignore_lease, ret;
839
840 dbp = dbc->dbp;
841 env = dbp->env;
842
843 ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
844 LF_CLR(DB_IGNORE_LEASE);
845 if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0)
846 return (ret);
847
848 ENV_ENTER(env, ip);
849
850 DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
851 flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
852
853 ret = __partc_get(dbc, key, data, flags);
854 /*
855 * Check for master leases.
856 */
857 if (ret == 0 &&
858 IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
859 ret = __rep_lease_check(env, 1);
860
861 ENV_LEAVE(env, ip);
862 __dbt_userfree(env, key, NULL, data);
863 return (ret);
864 }
865 /*
866 * __partition_get --
867 * cursor get opeartion on a partitioned database.
868 *
869 * PUBLIC: int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t));
870 */
871 int
__partc_get(dbc,key,data,flags)872 __partc_get(dbc, key, data, flags)
873 DBC *dbc;
874 DBT *key, *data;
875 u_int32_t flags;
876 {
877 DB *dbp;
878 DBC *orig_dbc, *new_dbc;
879 DB_PARTITION *part;
880 PART_CURSOR *cp;
881 u_int32_t multi, part_id;
882 int ret, retry, search;
883
884 dbp = dbc->dbp;
885 cp = (PART_CURSOR*)dbc->internal;
886 orig_dbc = cp->sub_cursor;
887 part = dbp->p_internal;
888
889 new_dbc = NULL;
890 retry = search = 0;
891 part_id = cp->part_id;
892 multi = flags & ~DB_OPFLAGS_MASK;
893
894 switch (flags & DB_OPFLAGS_MASK) {
895 case DB_CURRENT:
896 break;
897 case DB_FIRST:
898 part_id = 0;
899 retry = 1;
900 break;
901 case DB_GET_BOTH:
902 case DB_GET_BOTHC:
903 case DB_GET_BOTH_RANGE:
904 search = 1;
905 break;
906 case DB_SET_RANGE:
907 search = 1;
908 retry = 1;
909 break;
910 case DB_LAST:
911 part_id = part->nparts - 1;
912 retry = 1;
913 break;
914 case DB_NEXT:
915 case DB_NEXT_NODUP:
916 if (orig_dbc == NULL)
917 part_id = 0;
918 else
919 part_id = cp->part_id;
920 retry = 1;
921 break;
922 case DB_NEXT_DUP:
923 break;
924 case DB_PREV:
925 case DB_PREV_NODUP:
926 if (orig_dbc == NULL)
927 part_id = part->nparts - 1;
928 else
929 part_id = cp->part_id;
930 retry = 1;
931 break;
932 case DB_PREV_DUP:
933 break;
934 case DB_SET:
935 search = 1;
936 break;
937 default:
938 return (__db_unknown_flag(dbp->env, "__partc_get", flags));
939 }
940
941 /*
942 * If we need to find the partition to start on, then
943 * do a binary search of the in memory partition table.
944 */
945 if (search == 1 && F_ISSET(part, PART_CALLBACK))
946 part_id = part->callback(dbp, key) % part->nparts;
947 else if (search == 1)
948 __part_search(dbp, part, key, &part_id);
949
950 /* Get a new cursor if necessary */
951 if (orig_dbc == NULL || cp->part_id != part_id) {
952 GET_PART_CURSOR(dbc, new_dbc, part_id);
953 } else
954 new_dbc = orig_dbc;
955
956 while ((ret = __dbc_get(new_dbc,
957 key, data, flags)) == DB_NOTFOUND && retry == 1) {
958 switch (flags & DB_OPFLAGS_MASK) {
959 case DB_FIRST:
960 case DB_NEXT:
961 case DB_NEXT_NODUP:
962 case DB_SET_RANGE:
963 if (++part_id < part->nparts) {
964 flags = DB_FIRST | multi;
965 break;
966 }
967 goto err;
968 case DB_LAST:
969 case DB_PREV:
970 case DB_PREV_NODUP:
971 if (part_id-- > 0) {
972 flags = DB_LAST | multi;
973 break;
974 }
975 goto err;
976 default:
977 goto err;
978 }
979
980 if (new_dbc != orig_dbc && (ret = __dbc_close(new_dbc)) != 0)
981 goto err;
982 GET_PART_CURSOR(dbc, new_dbc, part_id);
983 }
984
985 if (ret != 0)
986 goto err;
987
988 /* Success: swap original and new cursors. */
989 if (new_dbc != orig_dbc) {
990 if (orig_dbc != NULL) {
991 cp->sub_cursor = NULL;
992 if ((ret = __dbc_close(orig_dbc)) != 0)
993 goto err;
994 }
995 cp->sub_cursor = new_dbc;
996 cp->part_id = part_id;
997 }
998
999 return (0);
1000
1001 err: if (new_dbc != NULL && new_dbc != orig_dbc)
1002 (void)__dbc_close(new_dbc);
1003 return (ret);
1004 }
1005
1006 /*
1007 * __partc_put --
1008 * cursor put opeartion on a partitioned cursor.
1009 *
1010 */
1011 static int
__partc_put(dbc,key,data,flags,pgnop)1012 __partc_put(dbc, key, data, flags, pgnop)
1013 DBC *dbc;
1014 DBT *key, *data;
1015 u_int32_t flags;
1016 db_pgno_t *pgnop;
1017 {
1018 DB *dbp;
1019 DB_PARTITION *part;
1020 DBC *new_dbc;
1021 PART_CURSOR *cp;
1022 u_int32_t part_id;
1023 int ret;
1024
1025 dbp = dbc->dbp;
1026 cp = (PART_CURSOR*)dbc->internal;
1027 part_id = cp->part_id;
1028 part = dbp->p_internal;
1029 *pgnop = PGNO_INVALID;
1030
1031 switch (flags) {
1032 case DB_KEYFIRST:
1033 case DB_KEYLAST:
1034 case DB_NODUPDATA:
1035 case DB_NOOVERWRITE:
1036 case DB_OVERWRITE_DUP:
1037 if (F_ISSET(part, PART_CALLBACK)) {
1038 part_id = part->callback(dbp, key) % part->nparts;
1039 break;
1040 }
1041 __part_search(dbp, part, key, &part_id);
1042 break;
1043 default:
1044 break;
1045 }
1046
1047 if ((new_dbc = cp->sub_cursor) == NULL || cp->part_id != part_id) {
1048 if ((ret = __db_cursor_int(part->handles[part_id],
1049 dbc->thread_info, dbc->txn, part->handles[part_id]->type,
1050 PGNO_INVALID, 0, dbc->locker, &new_dbc)) != 0)
1051 goto err;
1052 }
1053
1054 if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
1055 F_SET(new_dbc, DBC_WRITER);
1056 if ((ret = __dbc_put(new_dbc, key, data, flags)) != 0)
1057 goto err;
1058
1059 if (new_dbc != cp->sub_cursor) {
1060 if (cp->sub_cursor != NULL) {
1061 if ((ret = __dbc_close(cp->sub_cursor)) != 0)
1062 goto err;
1063 cp->sub_cursor = NULL;
1064 }
1065 cp->sub_cursor = new_dbc;
1066 cp->part_id = part_id;
1067 }
1068
1069 return (0);
1070
1071 err: if (new_dbc != NULL && cp->sub_cursor != new_dbc)
1072 (void)__dbc_close(new_dbc);
1073 return (ret);
1074 }
1075
1076 /*
1077 * __partc_del
1078 * Delete interface to partitioned cursors.
1079 *
1080 */
1081 static int
__partc_del(dbc,flags)1082 __partc_del(dbc, flags)
1083 DBC *dbc;
1084 u_int32_t flags;
1085 {
1086 PART_CURSOR *cp;
1087 cp = (PART_CURSOR*)dbc->internal;
1088
1089 if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
1090 F_SET(cp->sub_cursor, DBC_WRITER);
1091 return (__dbc_del(cp->sub_cursor, flags));
1092 }
1093
1094 /*
1095 * __partc_writelock
1096 * Writelock interface to partitioned cursors.
1097 *
1098 */
1099 static int
__partc_writelock(dbc)1100 __partc_writelock(dbc)
1101 DBC *dbc;
1102 {
1103 PART_CURSOR *cp;
1104 cp = (PART_CURSOR*)dbc->internal;
1105
1106 return (cp->sub_cursor->am_writelock(cp->sub_cursor));
1107 }
1108
1109 /*
1110 * __partc_close
1111 * Close interface to partitioned cursors.
1112 *
1113 */
1114 static int
__partc_close(dbc,root_pgno,rmroot)1115 __partc_close(dbc, root_pgno, rmroot)
1116 DBC *dbc;
1117 db_pgno_t root_pgno;
1118 int *rmroot;
1119 {
1120 PART_CURSOR *cp;
1121 int ret;
1122
1123 COMPQUIET(root_pgno, 0);
1124 COMPQUIET(rmroot, NULL);
1125
1126 cp = (PART_CURSOR*)dbc->internal;
1127
1128 if (cp->sub_cursor == NULL)
1129 return (0);
1130 ret = __dbc_close(cp->sub_cursor);
1131 cp->sub_cursor = NULL;
1132 return (ret);
1133 }
1134
1135 /*
1136 * __partc_destroy --
1137 * Destroy a single cursor.
1138 */
1139 static int
__partc_destroy(dbc)1140 __partc_destroy(dbc)
1141 DBC *dbc;
1142 {
1143 PART_CURSOR *cp;
1144 ENV *env;
1145
1146 cp = (PART_CURSOR *)dbc->internal;
1147 env = dbc->env;
1148
1149 /* Discard the structure. Don't recurse. */
1150 __os_free(env, cp);
1151
1152 return (0);
1153 }
1154
1155 /*
1156 * __partition_close
1157 * Close a partitioned database.
1158 *
1159 * PUBLIC: int __partition_close __P((DB *, DB_TXN *, u_int32_t));
1160 */
1161 int
__partition_close(dbp,txn,flags)1162 __partition_close(dbp, txn, flags)
1163 DB *dbp;
1164 DB_TXN *txn;
1165 u_int32_t flags;
1166 {
1167 DB **pdbp;
1168 DB_PARTITION *part;
1169 ENV *env;
1170 u_int32_t i;
1171 int ret, t_ret;
1172
1173 if ((part = dbp->p_internal) == NULL)
1174 return (0);
1175
1176 env = dbp->env;
1177 ret = 0;
1178
1179 if ((pdbp = part->handles) != NULL) {
1180 for (i = 0; i < part->nparts; i++, pdbp++)
1181 if (*pdbp != NULL && (t_ret =
1182 __db_close(*pdbp, txn, flags)) != 0 && ret == 0)
1183 ret = t_ret;
1184 __os_free(env, part->handles);
1185 }
1186 if (part->dirs != NULL)
1187 __os_free(env, (char **)part->dirs);
1188 if (part->data != NULL)
1189 __os_free(env, (char **)part->data);
1190 __os_free(env, part);
1191 dbp->p_internal = NULL;
1192
1193 return (ret);
1194 }
1195
1196 /*
1197 * __partition_sync
1198 * Sync a partitioned database.
1199 *
1200 * PUBLIC: int __partition_sync __P((DB *));
1201 */
1202 int
__partition_sync(dbp)1203 __partition_sync(dbp)
1204 DB *dbp;
1205 {
1206 DB **pdbp;
1207 DB_PARTITION *part;
1208 u_int32_t i;
1209 int ret, t_ret;
1210
1211 ret = 0;
1212 part = dbp->p_internal;
1213
1214 if ((pdbp = part->handles) != NULL) {
1215 for (i = 0; i < part->nparts; i++, pdbp++)
1216 if (*pdbp != NULL &&
1217 F_ISSET(*pdbp, DB_AM_OPEN_CALLED) && (t_ret =
1218 __memp_fsync((*pdbp)->mpf)) != 0 && ret == 0)
1219 ret = t_ret;
1220 }
1221 if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
1222 ret = t_ret;
1223
1224 return (ret);
1225 }
1226
1227 /*
1228 * __partition_stat
1229 * Stat a partitioned database.
1230 *
1231 * PUBLIC: int __partition_stat __P((DBC *, void *, u_int32_t));
1232 */
1233 int
__partition_stat(dbc,spp,flags)1234 __partition_stat(dbc, spp, flags)
1235 DBC *dbc;
1236 void *spp;
1237 u_int32_t flags;
1238 {
1239 DB *dbp, **pdbp;
1240 DB_BTREE_STAT *fsp, *bsp;
1241 #ifdef HAVE_HASH
1242 DB_HASH_STAT *hfsp, *hsp;
1243 #endif
1244 DB_PARTITION *part;
1245 DBC *new_dbc;
1246 ENV *env;
1247 u_int32_t i;
1248 int ret;
1249
1250 dbp = dbc->dbp;
1251 part = dbp->p_internal;
1252 env = dbp->env;
1253 fsp = NULL;
1254 #ifdef HAVE_HASH
1255 hfsp = NULL;
1256 #endif
1257
1258 pdbp = part->handles;
1259 for (i = 0; i < part->nparts; i++, pdbp++) {
1260 if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
1261 (*pdbp)->type, PGNO_INVALID,
1262 0, dbc->locker, &new_dbc)) != 0)
1263 goto err;
1264 switch (new_dbc->dbtype) {
1265 case DB_BTREE:
1266 if ((ret = __bam_stat(new_dbc, &bsp, flags)) != 0)
1267 goto err;
1268 if (fsp == NULL) {
1269 fsp = bsp;
1270 *(DB_BTREE_STAT **)spp = fsp;
1271 } else {
1272 fsp->bt_nkeys += bsp->bt_nkeys;
1273 fsp->bt_ndata += bsp->bt_ndata;
1274 fsp->bt_pagecnt += bsp->bt_pagecnt;
1275 if (fsp->bt_levels < bsp->bt_levels)
1276 fsp->bt_levels = bsp->bt_levels;
1277 fsp->bt_int_pg += bsp->bt_int_pg;
1278 fsp->bt_leaf_pg += bsp->bt_leaf_pg;
1279 fsp->bt_dup_pg += bsp->bt_dup_pg;
1280 fsp->bt_over_pg += bsp->bt_over_pg;
1281 fsp->bt_free += bsp->bt_free;
1282 fsp->bt_int_pgfree += bsp->bt_int_pgfree;
1283 fsp->bt_leaf_pgfree += bsp->bt_leaf_pgfree;
1284 fsp->bt_dup_pgfree += bsp->bt_dup_pgfree;
1285 fsp->bt_over_pgfree += bsp->bt_over_pgfree;
1286 __os_ufree(env, bsp);
1287 }
1288 break;
1289 #ifdef HAVE_HASH
1290 case DB_HASH:
1291 if ((ret = __ham_stat(new_dbc, &hsp, flags)) != 0)
1292 goto err;
1293 if (hfsp == NULL) {
1294 hfsp = hsp;
1295 *(DB_HASH_STAT **)spp = hfsp;
1296 } else {
1297 hfsp->hash_nkeys += hsp->hash_nkeys;
1298 hfsp->hash_ndata += hsp->hash_ndata;
1299 hfsp->hash_pagecnt += hsp->hash_pagecnt;
1300 hfsp->hash_ffactor += hsp->hash_ffactor;
1301 hfsp->hash_buckets += hsp->hash_buckets;
1302 hfsp->hash_free += hsp->hash_free;
1303 hfsp->hash_bfree += hsp->hash_bfree;
1304 hfsp->hash_bigpages += hsp->hash_bigpages;
1305 hfsp->hash_big_bfree += hsp->hash_big_bfree;
1306 hfsp->hash_overflows += hsp->hash_overflows;
1307 hfsp->hash_ovfl_free += hsp->hash_ovfl_free;
1308 hfsp->hash_dup += hsp->hash_dup;
1309 hfsp->hash_dup_free += hsp->hash_dup_free;
1310 __os_ufree(env, hsp);
1311 }
1312 break;
1313 #endif
1314 default:
1315 break;
1316 }
1317 if ((ret = __dbc_close(new_dbc)) != 0)
1318 goto err;
1319 }
1320 return (0);
1321
1322 err:
1323 if (fsp != NULL)
1324 __os_ufree(env, fsp);
1325 *(DB_BTREE_STAT **)spp = NULL;
1326 return (ret);
1327 }
1328
1329 /*
1330 * __part_truncate --
1331 * Truncate a database.
1332 *
1333 * PUBLIC: int __part_truncate __P((DBC *, u_int32_t *));
1334 */
1335 int
__part_truncate(dbc,countp)1336 __part_truncate(dbc, countp)
1337 DBC *dbc;
1338 u_int32_t *countp;
1339 {
1340 DB *dbp, **pdbp;
1341 DB_PARTITION *part;
1342 DBC *new_dbc;
1343 u_int32_t count, i;
1344 int ret, t_ret;
1345
1346 dbp = dbc->dbp;
1347 part = dbp->p_internal;
1348 pdbp = part->handles;
1349 ret = 0;
1350
1351 if (countp != NULL)
1352 *countp = 0;
1353 for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
1354 if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
1355 (*pdbp)->type, PGNO_INVALID,
1356 0, dbc->locker, &new_dbc)) != 0)
1357 break;
1358 switch (dbp->type) {
1359 case DB_BTREE:
1360 case DB_RECNO:
1361 ret = __bam_truncate(new_dbc, &count);
1362 break;
1363 case DB_HASH:
1364 #ifdef HAVE_HASH
1365 ret = __ham_truncate(new_dbc, &count);
1366 break;
1367 #endif
1368 case DB_QUEUE:
1369 case DB_UNKNOWN:
1370 default:
1371 ret = __db_unknown_type(dbp->env,
1372 "DB->truncate", dbp->type);
1373 count = 0;
1374 break;
1375 }
1376 if ((t_ret = __dbc_close(new_dbc)) != 0 && ret == 0)
1377 ret = t_ret;
1378 if (countp != NULL)
1379 *countp += count;
1380 }
1381
1382 return (ret);
1383 }
1384 /*
1385 * __part_compact -- compact a partitioned database.
1386 *
1387 * PUBLIC: int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *,
1388 * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
1389 */
1390 int
__part_compact(dbp,ip,txn,start,stop,c_data,flags,end)1391 __part_compact(dbp, ip, txn, start, stop, c_data, flags, end)
1392 DB *dbp;
1393 DB_THREAD_INFO *ip;
1394 DB_TXN *txn;
1395 DBT *start, *stop;
1396 DB_COMPACT *c_data;
1397 u_int32_t flags;
1398 DBT *end;
1399 {
1400 DB **pdbp;
1401 DB_PARTITION *part;
1402 u_int32_t i;
1403 int ret;
1404
1405 part = dbp->p_internal;
1406 pdbp = part->handles;
1407 ret = 0;
1408
1409 for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
1410 switch (dbp->type) {
1411 case DB_HASH:
1412 case DB_BTREE:
1413 case DB_RECNO:
1414 ret = __db_compact_int(*pdbp,
1415 ip, txn, start, stop, c_data, flags, end);
1416 break;
1417
1418 default:
1419 ret = __dbh_am_chk(dbp, DB_OK_BTREE);
1420 break;
1421 }
1422 }
1423 return (ret);
1424 }
1425
1426 /*
1427 * __part_lsn_reset --
1428 * reset the lsns on each partition.
1429 *
1430 * PUBLIC: int __part_lsn_reset __P((DB *, DB_THREAD_INFO *));
1431 */
1432 int
__part_lsn_reset(dbp,ip)1433 __part_lsn_reset(dbp, ip)
1434 DB *dbp;
1435 DB_THREAD_INFO *ip;
1436 {
1437 DB **pdbp;
1438 DB_PARTITION *part;
1439 u_int32_t i;
1440 int ret;
1441
1442 part = dbp->p_internal;
1443 pdbp = part->handles;
1444 ret = 0;
1445
1446 for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++)
1447 ret = __db_lsn_reset((*pdbp)->mpf, ip);
1448
1449 return (ret);
1450 }
1451
1452 /*
1453 * __part_fileid_reset --
1454 * reset the fileid on each partition.
1455 *
1456 * PUBLIC: int __part_fileid_reset
1457 * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
1458 */
1459 int
__part_fileid_reset(env,ip,fname,nparts,encrypted)1460 __part_fileid_reset(env, ip, fname, nparts, encrypted)
1461 ENV *env;
1462 DB_THREAD_INFO *ip;
1463 const char *fname;
1464 u_int32_t nparts;
1465 int encrypted;
1466 {
1467 int ret;
1468 u_int32_t part_id;
1469 char *name, *sp;
1470 const char *np;
1471
1472 if ((ret = __os_malloc(env,
1473 strlen(fname) + PART_LEN + 1, &name)) != 0) {
1474 __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
1475 return (ret);
1476 }
1477
1478 sp = name;
1479 np = __db_rpath(fname);
1480 if (np == NULL)
1481 np = fname;
1482 else {
1483 np++;
1484 (void)strncpy(name, fname, (size_t)(np - fname));
1485 sp = name + (np - fname);
1486 }
1487
1488 for (part_id = 0; ret == 0 && part_id < nparts; part_id++) {
1489 (void)sprintf(sp, PART_NAME, np, part_id);
1490 ret = __env_fileid_reset(env, ip, sp, encrypted);
1491 }
1492
1493 __os_free(env, name);
1494 return (ret);
1495 }
1496
1497 /*
1498 * __part_key_range --
1499 * Return proportion of keys relative to given key.
1500 *
1501 * PUBLIC: int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
1502 */
1503 int
__part_key_range(dbc,dbt,kp,flags)1504 __part_key_range(dbc, dbt, kp, flags)
1505 DBC *dbc;
1506 DBT *dbt;
1507 DB_KEY_RANGE *kp;
1508 u_int32_t flags;
1509 {
1510 BTREE_CURSOR *cp;
1511 DBC *new_dbc;
1512 DB_PARTITION *part;
1513 PAGE *h;
1514 u_int32_t id, part_id;
1515 u_int32_t elems, empty, less_elems, my_elems, greater_elems;
1516 u_int32_t levels, max_levels, my_levels;
1517 db_pgno_t root_pgno;
1518 int ret;
1519 double total_elems;
1520
1521 COMPQUIET(flags, 0);
1522
1523 part = dbc->dbp->p_internal;
1524
1525 /*
1526 * First we find the key range for the partition that contains the
1527 * key. Then we scale based on estimates of the other partitions.
1528 */
1529 if (F_ISSET(part, PART_CALLBACK))
1530 part_id = part->callback(dbc->dbp, dbt) % part->nparts;
1531 else
1532 __part_search(dbc->dbp, part, dbt, &part_id);
1533 GET_PART_CURSOR(dbc, new_dbc, part_id);
1534
1535 if ((ret = __bam_key_range(new_dbc, dbt, kp, flags)) != 0)
1536 goto err;
1537
1538 cp = (BTREE_CURSOR *)new_dbc->internal;
1539
1540 root_pgno = BAM_ROOT_PGNO(new_dbc);
1541 if ((ret = __memp_fget(new_dbc->dbp->mpf, &root_pgno,
1542 new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
1543 goto c_err;
1544
1545 my_elems = NUM_ENT(h);
1546 my_levels = LEVEL(h);
1547 max_levels = my_levels;
1548
1549 if ((ret = __memp_fput(new_dbc->dbp->mpf,
1550 new_dbc->thread_info, h, new_dbc->priority)) != 0)
1551 goto c_err;
1552
1553 if ((ret = __dbc_close(new_dbc)) != 0)
1554 goto err;
1555 /*
1556 * We have the range within one subtree. Now estimate
1557 * what part of the whole range that subtree is. Figure
1558 * out how many levels each part has and how many entries
1559 * in the level below the root.
1560 */
1561 empty = less_elems = greater_elems = 0;
1562 for (id = 0; id < part->nparts; id++) {
1563 if (id == part_id) {
1564 empty = 0;
1565 continue;
1566 }
1567 GET_PART_CURSOR(dbc, new_dbc, id);
1568 cp = (BTREE_CURSOR *)new_dbc->internal;
1569 if ((ret = __memp_fget(new_dbc->dbp->mpf, &cp->root,
1570 new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
1571 goto c_err;
1572
1573 elems = NUM_ENT(h);
1574 levels = LEVEL(h);
1575 if (levels == 1)
1576 elems /= 2;
1577
1578 if ((ret = __memp_fput(new_dbc->dbp->mpf,
1579 new_dbc->thread_info, h, new_dbc->priority)) != 0)
1580 goto c_err;
1581
1582 if ((ret = __dbc_close(new_dbc)) != 0)
1583 goto err;
1584
1585 /* If the tree is empty, ignore it. */
1586 if (elems == 0) {
1587 empty++;
1588 continue;
1589 }
1590
1591 /*
1592 * If a tree has fewer levels than the max just count
1593 * it as a single element in the higher level.
1594 */
1595 if (id < part_id) {
1596 if (levels > max_levels) {
1597 max_levels = levels;
1598 less_elems = id + elems - empty;
1599 } else if (levels < max_levels)
1600 less_elems++;
1601 else
1602 less_elems += elems;
1603 } else {
1604 if (levels > max_levels) {
1605 max_levels = levels;
1606 greater_elems = (id - part_id) + elems - empty;
1607 } else if (levels < max_levels)
1608 greater_elems++;
1609 else
1610 greater_elems += elems;
1611 }
1612
1613 }
1614
1615 if (my_levels < max_levels) {
1616 /*
1617 * The subtree containing the key is not the tallest one.
1618 * Reduce its share by the number of records at the highest
1619 * level. Scale the greater and lesser components up
1620 * by the number of records on either side of this
1621 * subtree.
1622 */
1623 total_elems = 1 + greater_elems + less_elems;
1624 kp->equal /= total_elems;
1625 kp->less /= total_elems;
1626 kp->less += less_elems/total_elems;
1627 kp->greater /= total_elems;
1628 kp->greater += greater_elems/total_elems;
1629 } else if (my_levels == max_levels) {
1630 /*
1631 * The key is in one of the tallest subtrees. We will
1632 * scale the values by the ratio of the records at the
1633 * top of this stubtree to the number of records at the
1634 * highest level.
1635 */
1636 total_elems = greater_elems + less_elems;
1637 if (total_elems != 0) {
1638 /*
1639 * First scale down by the fraction of elements
1640 * in this subtree.
1641 */
1642 total_elems += my_elems;
1643 kp->equal *= my_elems;
1644 kp->equal /= total_elems;
1645 kp->less *= my_elems;
1646 kp->less /= total_elems;
1647 kp->greater *= my_elems;
1648 kp->greater /= total_elems;
1649 /*
1650 * Proportionally add weight from the subtrees to the
1651 * left and right of this one.
1652 */
1653 kp->less += less_elems / total_elems;
1654 kp->greater += greater_elems / total_elems;
1655 }
1656 }
1657
1658 if (0) {
1659 c_err: (void)__dbc_close(new_dbc);
1660 }
1661
1662 err: return (ret);
1663 }
1664
1665 /*
1666 * __part_remove --
1667 * Remove method for a partitioned database.
1668 *
1669 * PUBLIC: int __part_remove __P((DB *, DB_THREAD_INFO *,
1670 * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
1671 */
1672 int
__part_remove(dbp,ip,txn,name,subdb,flags)1673 __part_remove(dbp, ip, txn, name, subdb, flags)
1674 DB *dbp;
1675 DB_THREAD_INFO *ip;
1676 DB_TXN *txn;
1677 const char *name, *subdb;
1678 u_int32_t flags;
1679 {
1680 return (__part_rr(dbp, ip, txn, name, subdb, NULL, flags));
1681 }
1682
1683 /*
1684 * __part_rename --
1685 * Rename method for a partitioned database.
1686 *
1687 * PUBLIC: int __part_rename __P((DB *, DB_THREAD_INFO *,
1688 * PUBLIC: DB_TXN *, const char *, const char *, const char *));
1689 */
1690 int
__part_rename(dbp,ip,txn,name,subdb,newname)1691 __part_rename(dbp, ip, txn, name, subdb, newname)
1692 DB *dbp;
1693 DB_THREAD_INFO *ip;
1694 DB_TXN *txn;
1695 const char *name, *subdb, *newname;
1696 {
1697 return (__part_rr(dbp, ip, txn, name, subdb, newname, 0));
1698 }
1699
1700 /*
1701 * __part_rr --
1702 * Remove/Rename method for a partitioned database.
1703 */
1704 static int
__part_rr(dbp,ip,txn,name,subdb,newname,flags)1705 __part_rr(dbp, ip, txn, name, subdb, newname, flags)
1706 DB *dbp;
1707 DB_THREAD_INFO *ip;
1708 DB_TXN *txn;
1709 const char *name, *subdb, *newname;
1710 u_int32_t flags;
1711 {
1712 DB **pdbp, *ptmpdbp, *tmpdbp;
1713 DB_PARTITION *part;
1714 ENV *env;
1715 u_int32_t i;
1716 int ret, t_ret;
1717 char *np;
1718
1719 env = dbp->env;
1720 ret = 0;
1721
1722 if (subdb != NULL && name != NULL) {
1723 __db_errx(env, DB_STR("0663",
1724 "A partitioned database can not be in a multiple databases file"));
1725 return (EINVAL);
1726 }
1727 ENV_GET_THREAD_INFO(env, ip);
1728
1729 /*
1730 * Since rename no longer opens the database, we have
1731 * to do it here.
1732 */
1733 if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1734 return (ret);
1735
1736 /*
1737 * We need to make sure we don't self-deadlock, so give
1738 * this dbp the same locker as the incoming one.
1739 */
1740 tmpdbp->locker = dbp->locker;
1741 if ((ret = __db_open(tmpdbp, ip, txn, name, NULL, dbp->type,
1742 DB_RDWRMASTER | DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
1743 goto err;
1744
1745 part = tmpdbp->p_internal;
1746 pdbp = part->handles;
1747 COMPQUIET(np, NULL);
1748 if (newname != NULL && (ret = __os_malloc(env,
1749 strlen(newname) + PART_LEN + 1, &np)) != 0) {
1750 __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1);
1751 goto err;
1752 }
1753 for (i = 0; i < part->nparts; i++, pdbp++) {
1754 if ((ret = __db_create_internal(&ptmpdbp, env, 0)) != 0)
1755 break;
1756 ptmpdbp->locker = (*pdbp)->locker;
1757 if (newname == NULL)
1758 ret = __db_remove_int(ptmpdbp,
1759 ip, txn, (*pdbp)->fname, NULL, flags);
1760 else {
1761 DB_ASSERT(env, np != NULL);
1762 (void)sprintf(np, PART_NAME, newname, i);
1763 ret = __db_rename_int(ptmpdbp,
1764 ip, txn, (*pdbp)->fname, NULL, np, flags);
1765 }
1766 ptmpdbp->locker = NULL;
1767 (void)__db_close(ptmpdbp, NULL, DB_NOSYNC);
1768 if (ret != 0)
1769 break;
1770 }
1771
1772 if (newname != NULL)
1773 __os_free(env, np);
1774
1775 if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
1776 err: /*
1777 * Since we copied the locker ID from the dbp, we'd better not
1778 * free it here.
1779 */
1780 tmpdbp->locker = NULL;
1781
1782 /* We need to remove the lock event we associated with this. */
1783 if (txn != NULL)
1784 __txn_remlock(env,
1785 txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID);
1786
1787 if ((t_ret = __db_close(tmpdbp,
1788 txn, DB_NOSYNC)) != 0 && ret == 0)
1789 ret = t_ret;
1790 }
1791 return (ret);
1792 }
1793 #ifdef HAVE_VERIFY
1794 /*
1795 * __part_verify --
1796 * Verify a partitioned database.
1797 *
1798 * PUBLIC: int __part_verify __P((DB *, VRFY_DBINFO *, const char *,
1799 * PUBLIC: void *, int (*)(void *, const void *), u_int32_t));
1800 */
1801 int
__part_verify(dbp,vdp,fname,handle,callback,flags)1802 __part_verify(dbp, vdp, fname, handle, callback, flags)
1803 DB *dbp;
1804 VRFY_DBINFO *vdp;
1805 const char *fname;
1806 void *handle;
1807 int (*callback) __P((void *, const void *));
1808 u_int32_t flags;
1809 {
1810 BINTERNAL *lp, *rp;
1811 DB **pdbp;
1812 DB_PARTITION *part;
1813 DBC *dbc;
1814 DBT *key;
1815 ENV *env;
1816 DB_THREAD_INFO *ip;
1817 u_int32_t i;
1818 int ret, t_ret;
1819
1820 env = dbp->env;
1821 lp = rp = NULL;
1822 dbc = NULL;
1823 ip = vdp->thread_info;
1824
1825 if (dbp->type == DB_BTREE) {
1826 if ((ret = __bam_open(dbp, ip,
1827 NULL, fname, PGNO_BASE_MD, flags)) != 0)
1828 goto err;
1829 }
1830 #ifdef HAVE_HASH
1831 else if ((ret = __ham_open(dbp, ip,
1832 NULL, fname, PGNO_BASE_MD, flags)) != 0)
1833 goto err;
1834 #endif
1835
1836 /*
1837 * Initalize partition db handles and get the names. Set DB_RDWRMASTER
1838 * because we may not have the partition callback, but we can still
1839 * look at the structure of the tree.
1840 */
1841 if ((ret = __partition_open(dbp,
1842 ip, NULL, fname, dbp->type, flags | DB_RDWRMASTER, 0, 0)) != 0)
1843 goto err;
1844 part = dbp->p_internal;
1845
1846 if (LF_ISSET(DB_SALVAGE)) {
1847 /* If we are being aggressive we don't want to dump the keys. */
1848 if (LF_ISSET(DB_AGGRESSIVE))
1849 dbp->p_internal = NULL;
1850 ret = __db_prheader(dbp,
1851 NULL, 0, 0, handle, callback, vdp, PGNO_BASE_MD);
1852 dbp->p_internal = part;
1853 if (ret != 0)
1854 goto err;
1855 }
1856
1857 if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
1858 goto err;
1859
1860 pdbp = part->handles;
1861 for (i = 0; i < part->nparts; i++, pdbp++) {
1862 if (!F_ISSET(part, PART_RANGE) || part->keys == NULL)
1863 goto vrfy;
1864 if (lp != NULL)
1865 __os_free(env, lp);
1866 lp = rp;
1867 rp = NULL;
1868 if (i + 1 < part->nparts) {
1869 key = &part->keys[i + 1];
1870 if ((ret = __os_malloc(env,
1871 BINTERNAL_SIZE(key->size), &rp)) != 0)
1872 goto err;
1873 rp->len = key->size;
1874 memcpy(rp->data, key->data, key->size);
1875 B_TSET(rp->type, B_KEYDATA);
1876 }
1877 vrfy: if ((t_ret = __db_verify(*pdbp, ip, (*pdbp)->fname,
1878 NULL, handle, callback,
1879 lp, rp, flags | DB_VERIFY_PARTITION)) != 0 && ret == 0)
1880 ret = t_ret;
1881 }
1882
1883 err: if (lp != NULL)
1884 __os_free(env, lp);
1885 if (rp != NULL)
1886 __os_free(env, rp);
1887 return (ret);
1888 }
1889 #endif
1890
1891 #ifdef CONFIG_TEST
1892 /*
1893 * __part_testdocopy -- copy all partitions for testing purposes.
1894 *
1895 * PUBLIC: int __part_testdocopy __P((DB *, const char *));
1896 */
1897 int
__part_testdocopy(dbp,name)1898 __part_testdocopy(dbp, name)
1899 DB *dbp;
1900 const char *name;
1901 {
1902 DB **pdbp;
1903 DB_PARTITION *part;
1904 u_int32_t i;
1905 int ret;
1906
1907 if ((ret = __db_testdocopy(dbp->env, name)) != 0)
1908 return (ret);
1909
1910 part = dbp->p_internal;
1911 pdbp = part->handles;
1912 for (i = 0; i < part->nparts; i++, pdbp++)
1913 if ((ret = __db_testdocopy(dbp->env, (*pdbp)->fname)) != 0)
1914 return (ret);
1915
1916 return (0);
1917 }
1918 #endif
1919 #else
1920 /*
1921 * __db_nopartition --
1922 * Error when a Berkeley DB build doesn't include partitioning.
1923 *
1924 * PUBLIC: int __db_no_partition __P((ENV *));
1925 */
1926 int
__db_no_partition(env)1927 __db_no_partition(env)
1928 ENV *env;
1929 {
1930 __db_errx(env, DB_STR("0664",
1931 "library build did not include support for the database partitioning"));
1932 return (DB_OPNOTSUP);
1933 }
1934 /*
1935 * __partition_set --
1936 * Set the partitioning keys or callback function.
1937 * This routine must be called prior to creating the database.
1938 * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
1939 * PUBLIC: u_int32_t (*callback)(DB *, DBT *key)));
1940 */
1941
1942 int
__partition_set(dbp,parts,keys,callback)1943 __partition_set(dbp, parts, keys, callback)
1944 DB *dbp;
1945 u_int32_t parts;
1946 DBT *keys;
1947 u_int32_t (*callback)(DB *, DBT *key);
1948 {
1949 COMPQUIET(parts, 0);
1950 COMPQUIET(keys, NULL);
1951 COMPQUIET(callback, NULL);
1952
1953 return (__db_no_partition(dbp->env));
1954 }
1955
1956 /*
1957 * __partition_get_callback --
1958 * Set the partition callback function. This routine must be called
1959 * prior to opening a partition database that requires a function.
1960 * PUBLIC: int __partition_get_callback __P((DB *,
1961 * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
1962 */
1963 int
__partition_get_callback(dbp,parts,callback)1964 __partition_get_callback(dbp, parts, callback)
1965 DB *dbp;
1966 u_int32_t *parts;
1967 u_int32_t (**callback)(DB *, DBT *key);
1968 {
1969 COMPQUIET(parts, NULL);
1970 COMPQUIET(callback, NULL);
1971
1972 return (__db_no_partition(dbp->env));
1973 }
1974
1975 /*
1976 * __partition_get_dirs --
1977 * Get partition dirs.
1978 * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
1979 */
1980 int
__partition_get_dirs(dbp,dirpp)1981 __partition_get_dirs(dbp, dirpp)
1982 DB *dbp;
1983 const char ***dirpp;
1984 {
1985 COMPQUIET(dirpp, NULL);
1986 return (__db_no_partition(dbp->env));
1987 }
1988
1989 /*
1990 * __partition_get_keys --
1991 * Get partition keys.
1992 * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
1993 */
1994 int
__partition_get_keys(dbp,parts,keys)1995 __partition_get_keys(dbp, parts, keys)
1996 DB *dbp;
1997 u_int32_t *parts;
1998 DBT **keys;
1999 {
2000 COMPQUIET(parts, NULL);
2001 COMPQUIET(keys, NULL);
2002
2003 return (__db_no_partition(dbp->env));
2004 }
2005 /*
2006 * __partition_init --
2007 * Initialize the partition structure.
2008 * Called when the meta data page is read in during database open or
2009 * when partition keys or a callback are set.
2010 *
2011 * PUBLIC: int __partition_init __P((DB *, u_int32_t));
2012 */
2013 int
__partition_init(dbp,flags)2014 __partition_init(dbp, flags)
2015 DB *dbp;
2016 u_int32_t flags;
2017 {
2018 COMPQUIET(flags, 0);
2019
2020 return (__db_no_partition(dbp->env));
2021 }
2022 /*
2023 * __part_fileid_reset --
2024 * reset the fileid on each partition.
2025 *
2026 * PUBLIC: int __part_fileid_reset
2027 * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
2028 */
2029 int
__part_fileid_reset(env,ip,fname,nparts,encrypted)2030 __part_fileid_reset(env, ip, fname, nparts, encrypted)
2031 ENV *env;
2032 DB_THREAD_INFO *ip;
2033 const char *fname;
2034 u_int32_t nparts;
2035 int encrypted;
2036 {
2037 COMPQUIET(ip, NULL);
2038 COMPQUIET(fname, NULL);
2039 COMPQUIET(nparts, 0);
2040 COMPQUIET(encrypted, 0);
2041
2042 return (__db_no_partition(env));
2043 }
2044 /*
2045 * __partition_set_dirs --
2046 * Set the directories for creating the partition databases.
2047 * They must be in the environment.
2048 * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
2049 */
2050 int
__partition_set_dirs(dbp,dirp)2051 __partition_set_dirs(dbp, dirp)
2052 DB *dbp;
2053 const char **dirp;
2054 {
2055 COMPQUIET(dirp, NULL);
2056
2057 return (__db_no_partition(dbp->env));
2058 }
2059 #endif
2060