1 /*
2 * Copyright (c) 1998, 2020 Oracle and/or its affiliates. All rights reserved.
3 *
4 * See the file LICENSE for license information.
5 *
6 * $Id$
7 */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/db_join.h"
14 #include "dbinc/btree.h"
15 #include "dbinc/lock.h"
16
17 static int __db_join_close_pp __P((DBC *));
18 static int __db_join_cmp __P((const void *, const void *));
19 static int __db_join_del __P((DBC *, u_int32_t));
20 static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
21 static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
22 static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
23 static int __db_join_primget __P((DB *, DB_THREAD_INFO *,
24 DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t));
25 static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
26
27 /*
28 * Check to see if the Nth secondary cursor of join cursor jc is pointing
29 * to a sorted duplicate set.
30 */
31 #define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
32
33 /*
34 * This is the duplicate-assisted join functionality. Right now we're
35 * going to write it such that we return one item at a time, although
36 * I think we may need to optimize it to return them all at once.
37 * It should be easier to get it working this way, and I believe that
38 * changing it should be fairly straightforward.
39 *
40 * We optimize the join by sorting cursors from smallest to largest
41 * cardinality. In most cases, this is indeed optimal. However, if
42 * a cursor with large cardinality has very few data in common with the
43 * first cursor, it is possible that the join will be made faster by
44 * putting it earlier in the cursor list. Since we have no way to detect
45 * cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains
46 * the sort order specified by the caller, who may know more about the
47 * structure of the data.
48 *
49 * The first cursor moves sequentially through the duplicate set while
50 * the others search explicitly for the duplicate in question.
51 *
52 */
53
54 /*
55 * __db_join --
56 * This is the interface to the duplicate-assisted join functionality.
57 * In the same way that cursors mark a position in a database, a cursor
58 * can mark a position in a join. While most cursors are created by the
59 * cursor method of a DB, join cursors are created through an explicit
60 * call to DB->join.
61 *
62 * The curslist is an array of existing, initialized cursors and primary
63 * is the DB of the primary file. The data item that joins all the
64 * cursors in the curslist is used as the key into the primary and that
65 * key and data are returned. When no more items are left in the join
66 * set, the c_next operation off the join cursor will return DB_NOTFOUND.
67 *
68 * PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
69 */
70 int
__db_join(primary,curslist,dbcp,flags)71 __db_join(primary, curslist, dbcp, flags)
72 DB *primary;
73 DBC **curslist, **dbcp;
74 u_int32_t flags;
75 {
76 DBC *dbc;
77 ENV *env;
78 JOIN_CURSOR *jc;
79 size_t ncurs, nslots;
80 u_int32_t i;
81 int ret;
82
83 env = primary->env;
84 dbc = NULL;
85 jc = NULL;
86
87 if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
88 goto err;
89
90 if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0)
91 goto err;
92
93 if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0)
94 goto err;
95 jc->j_key.ulen = 256;
96 F_SET(&jc->j_key, DB_DBT_USERMEM);
97
98 F_SET(&jc->j_rdata, DB_DBT_REALLOC);
99
100 for (jc->j_curslist = curslist;
101 *jc->j_curslist != NULL; jc->j_curslist++)
102 ;
103
104 /*
105 * The number of cursor slots we allocate is one greater than
106 * the number of cursors involved in the join, because the
107 * list is NULL-terminated.
108 */
109 ncurs = (size_t)(jc->j_curslist - curslist);
110 nslots = ncurs + 1;
111
112 /*
113 * !!! -- A note on the various lists hanging off jc.
114 *
115 * j_curslist is the initial NULL-terminated list of cursors passed
116 * into __db_join. The original cursors are not modified; pristine
117 * copies are required because, in databases with unsorted dups, we
118 * must reset all of the secondary cursors after the first each
119 * time the first one is incremented, or else we will lose data
120 * which happen to be sorted differently in two different cursors.
121 *
122 * j_workcurs is where we put those copies that we're planning to
123 * work with. They're lazily c_dup'ed from j_curslist as we need
124 * them, and closed when the join cursor is closed or when we need
125 * to reset them to their original values (in which case we just
126 * c_dup afresh).
127 *
128 * j_fdupcurs is an array of cursors which point to the first
129 * duplicate in the duplicate set that contains the data value
130 * we're currently interested in. We need this to make
131 * __db_join_get correctly return duplicate duplicates; i.e., if a
132 * given data value occurs twice in the set belonging to cursor #2,
133 * and thrice in the set belonging to cursor #3, and once in all
134 * the other cursors, successive calls to __db_join_get need to
135 * return that data item six times. To make this happen, each time
136 * cursor N is allowed to advance to a new datum, all cursors M
137 * such that M > N have to be reset to the first duplicate with
138 * that datum, so __db_join_get will return all the dup-dups again.
139 * We could just reset them to the original cursor from j_curslist,
140 * but that would be a bit slower in the unsorted case and a LOT
141 * slower in the sorted one.
142 *
143 * j_exhausted is a list of boolean values which represent
144 * whether or not their corresponding cursors are "exhausted",
145 * i.e. whether the datum under the corresponding cursor has
146 * been found not to exist in any unreturned combinations of
147 * later secondary cursors, in which case they are ready to be
148 * incremented.
149 */
150
151 /* We don't want to free regions whose callocs have failed. */
152 jc->j_curslist = NULL;
153 jc->j_workcurs = NULL;
154 jc->j_fdupcurs = NULL;
155 jc->j_exhausted = NULL;
156
157 if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
158 &jc->j_curslist)) != 0)
159 goto err;
160 if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
161 &jc->j_workcurs)) != 0)
162 goto err;
163 if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
164 &jc->j_fdupcurs)) != 0)
165 goto err;
166 if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t),
167 &jc->j_exhausted)) != 0)
168 goto err;
169 for (i = 0; curslist[i] != NULL; i++) {
170 jc->j_curslist[i] = curslist[i];
171 jc->j_workcurs[i] = NULL;
172 jc->j_fdupcurs[i] = NULL;
173 jc->j_exhausted[i] = 0;
174 }
175 jc->j_ncurs = (u_int32_t)ncurs;
176
177 /*
178 * If DB_JOIN_NOSORT is not set, optimize secondary cursors by
179 * sorting in order of increasing cardinality.
180 */
181 if (!LF_ISSET(DB_JOIN_NOSORT))
182 qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
183
184 /*
185 * We never need to reset the 0th cursor, so there's no
186 * solid reason to use workcurs[0] rather than curslist[0] in
187 * join_get. Nonetheless, it feels cleaner to do it for symmetry,
188 * and this is the most logical place to copy it.
189 *
190 * !!!
191 * There's no need to close the new cursor if we goto err only
192 * because this is the last thing that can fail. Modifier of this
193 * function beware!
194 */
195 if ((ret =
196 __dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0)
197 goto err;
198
199 dbc->close = dbc->c_close = __db_join_close_pp;
200 dbc->del = dbc->c_del = __db_join_del;
201 dbc->get = dbc->c_get = __db_join_get_pp;
202 dbc->put = dbc->c_put = __db_join_put;
203 dbc->internal = (DBC_INTERNAL *)jc;
204 dbc->dbp = primary;
205 jc->j_primary = primary;
206
207 /* Stash the first cursor's transaction here for easy access. */
208 dbc->txn = curslist[0]->txn;
209
210 *dbcp = dbc;
211
212 MUTEX_LOCK(env, primary->mutex);
213 TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
214 MUTEX_UNLOCK(env, primary->mutex);
215
216 return (0);
217
218 err: if (jc != NULL) {
219 if (jc->j_curslist != NULL)
220 __os_free(env, jc->j_curslist);
221 if (jc->j_workcurs != NULL) {
222 if (jc->j_workcurs[0] != NULL)
223 (void)__dbc_close(jc->j_workcurs[0]);
224 __os_free(env, jc->j_workcurs);
225 }
226 if (jc->j_fdupcurs != NULL)
227 __os_free(env, jc->j_fdupcurs);
228 if (jc->j_exhausted != NULL)
229 __os_free(env, jc->j_exhausted);
230 __os_free(env, jc);
231 }
232 if (dbc != NULL)
233 __os_free(env, dbc);
234 return (ret);
235 }
236
237 /*
238 * __db_join_close_pp --
239 * DBC->close pre/post processing for join cursors.
240 */
241 static int
__db_join_close_pp(dbc)242 __db_join_close_pp(dbc)
243 DBC *dbc;
244 {
245 DB *dbp;
246 DB_THREAD_INFO *ip;
247 ENV *env;
248 int handle_check, ret, t_ret;
249
250 dbp = dbc->dbp;
251 env = dbp->env;
252
253 ENV_ENTER(env, ip);
254
255 handle_check = IS_ENV_REPLICATED(env);
256 if (handle_check &&
257 (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
258 handle_check = 0;
259 goto err;
260 }
261
262 ret = __db_join_close(dbc);
263
264 if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
265 ret = t_ret;
266
267 err: ENV_LEAVE(env, ip);
268 return (ret);
269 }
270
271 static int
__db_join_put(dbc,key,data,flags)272 __db_join_put(dbc, key, data, flags)
273 DBC *dbc;
274 DBT *key;
275 DBT *data;
276 u_int32_t flags;
277 {
278 COMPQUIET(dbc, NULL);
279 COMPQUIET(key, NULL);
280 COMPQUIET(data, NULL);
281 COMPQUIET(flags, 0);
282 return (EINVAL);
283 }
284
285 static int
__db_join_del(dbc,flags)286 __db_join_del(dbc, flags)
287 DBC *dbc;
288 u_int32_t flags;
289 {
290 COMPQUIET(dbc, NULL);
291 COMPQUIET(flags, 0);
292 return (EINVAL);
293 }
294
295 /*
296 * __db_join_get_pp --
297 * DBjoin->get pre/post processing.
298 */
299 static int
__db_join_get_pp(dbc,key,data,flags)300 __db_join_get_pp(dbc, key, data, flags)
301 DBC *dbc;
302 DBT *key, *data;
303 u_int32_t flags;
304 {
305 DB *dbp;
306 DB_THREAD_INFO *ip;
307 ENV *env;
308 u_int32_t handle_check, save_flags;
309 int ret, t_ret;
310
311 dbp = dbc->dbp;
312 env = dbp->env;
313
314 /* Save the original flags value. */
315 save_flags = flags;
316
317 if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
318 if (!LOCKING_ON(env))
319 return (__db_fnl(env, "DBC->get"));
320 LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
321 }
322
323 switch (flags) {
324 case 0:
325 case DB_JOIN_ITEM:
326 break;
327 default:
328 return (__db_ferr(env, "DBC->get", 0));
329 }
330
331 /*
332 * A partial get of the key of a join cursor don't make much sense;
333 * the entire key is necessary to query the primary database
334 * and find the datum, and so regardless of the size of the key
335 * it would not be a performance improvement. Since it would require
336 * special handling, we simply disallow it.
337 *
338 * A partial get of the data, however, potentially makes sense (if
339 * all possible data are a predictable large structure, for instance)
340 * and causes us no headaches, so we permit it.
341 */
342 if (F_ISSET(key, DB_DBT_PARTIAL)) {
343 __db_errx(env, DB_STR("0516",
344 "DB_DBT_PARTIAL may not be set on key during join_get"));
345 return (EINVAL);
346 }
347
348 ENV_ENTER(env, ip);
349
350 handle_check = IS_ENV_REPLICATED(env);
351 if (handle_check &&
352 (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
353 handle_check = 0;
354 goto err;
355 }
356
357 /* Restore the original flags value. */
358 flags = save_flags;
359
360 ret = __db_join_get(dbc, key, data, flags);
361
362 if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
363 ret = t_ret;
364
365 err: ENV_LEAVE(env, ip);
366 __dbt_userfree(env, key, NULL, NULL);
367 return (ret);
368 }
369
370 static int
__db_join_get(dbc,key_arg,data_arg,flags)371 __db_join_get(dbc, key_arg, data_arg, flags)
372 DBC *dbc;
373 DBT *key_arg, *data_arg;
374 u_int32_t flags;
375 {
376 DB *dbp;
377 DBC *cp;
378 DBT *key_n, key_n_mem;
379 ENV *env;
380 JOIN_CURSOR *jc;
381 int db_manage_data, ret;
382 u_int32_t i, j, operation, opmods;
383
384 dbp = dbc->dbp;
385 env = dbp->env;
386 jc = (JOIN_CURSOR *)dbc->internal;
387
388 operation = LF_ISSET(DB_OPFLAGS_MASK);
389
390 /* !!!
391 * If the set of flags here changes, check that __db_join_primget
392 * is updated to handle them properly.
393 */
394 opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
395
396 /*
397 * Since we are fetching the key as a datum in the secondary indices,
398 * we must be careful of caller-specified DB_DBT_* memory
399 * management flags. If necessary, use a stack-allocated DBT;
400 * we'll appropriately copy and/or allocate the data later.
401 */
402 if (F_ISSET(key_arg,
403 DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
404 /* We just use the default buffer; no need to go malloc. */
405 key_n = &key_n_mem;
406 memset(key_n, 0, sizeof(DBT));
407 } else {
408 /*
409 * Either DB_DBT_REALLOC or the default buffer will work
410 * fine if we have to reuse it, as we do.
411 */
412 key_n = key_arg;
413 }
414 if (F_ISSET(key_arg, DB_DBT_USERCOPY))
415 key_arg->data = NULL;
416
417 /*
418 * If our last attempt to do a get on the primary key failed,
419 * short-circuit the join and try again with the same key.
420 */
421 if (F_ISSET(jc, JOIN_RETRY))
422 goto samekey;
423 F_CLR(jc, JOIN_RETRY);
424
425 retry: ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n,
426 opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
427
428 if (ret == DB_BUFFER_SMALL) {
429 jc->j_key.ulen <<= 1;
430 if ((ret = __os_realloc(env,
431 jc->j_key.ulen, &jc->j_key.data)) != 0)
432 goto mem_err;
433 goto retry;
434 }
435
436 /*
437 * If ret == DB_NOTFOUND, we're out of elements of the first
438 * secondary cursor. This is how we finally finish the join
439 * if all goes well.
440 */
441 if (ret != 0)
442 goto err;
443
444 /*
445 * If jc->j_exhausted[0] == 1, we've just advanced the first cursor,
446 * and we're going to want to advance all the cursors that point to
447 * the first member of a duplicate duplicate set (j_fdupcurs[1..N]).
448 * Close all the cursors in j_fdupcurs; we'll reopen them the
449 * first time through the upcoming loop.
450 */
451 for (i = 1; i < jc->j_ncurs; i++) {
452 if (jc->j_fdupcurs[i] != NULL &&
453 (ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
454 goto err;
455 jc->j_fdupcurs[i] = NULL;
456 }
457
458 /*
459 * If jc->j_curslist[1] == NULL, we have only one cursor in the join.
460 * Thus, we can safely increment that one cursor on each call
461 * to __db_join_get, and we signal this by setting jc->j_exhausted[0]
462 * right away.
463 *
464 * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
465 * increment it until we know we're ready to.
466 */
467 if (jc->j_curslist[1] == NULL)
468 jc->j_exhausted[0] = 1;
469 else
470 jc->j_exhausted[0] = 0;
471
472 /* We have the first element; now look for it in the other cursors. */
473 for (i = 1; i < jc->j_ncurs; i++) {
474 DB_ASSERT(env, jc->j_curslist[i] != NULL);
475 if (jc->j_workcurs[i] == NULL)
476 /* If this is NULL, we need to dup curslist into it. */
477 if ((ret = __dbc_dup(jc->j_curslist[i],
478 &jc->j_workcurs[i], DB_POSITION)) != 0)
479 goto err;
480
481 retry2: cp = jc->j_workcurs[i];
482
483 if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
484 jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
485 /*
486 * jc->j_workcurs[i] has no more of the datum we're
487 * interested in. Go back one cursor and get
488 * a new dup. We can't just move to a new
489 * element of the outer relation, because that way
490 * we might miss duplicate duplicates in cursor i-1.
491 *
492 * If this takes us back to the first cursor,
493 * -then- we can move to a new element of the outer
494 * relation.
495 */
496 --i;
497 jc->j_exhausted[i] = 1;
498
499 if (i == 0) {
500 for (j = 1; jc->j_workcurs[j] != NULL; j++) {
501 /*
502 * We're moving to a new element of
503 * the first secondary cursor. If
504 * that cursor is sorted, then any
505 * other sorted cursors can be safely
506 * reset to the first duplicate
507 * duplicate in the current set if we
508 * have a pointer to it (we can't just
509 * leave them be, or we'll miss
510 * duplicate duplicates in the outer
511 * relation).
512 *
513 * If the first cursor is unsorted, or
514 * if cursor j is unsorted, we can
515 * make no assumptions about what
516 * we're looking for next or where it
517 * will be, so we reset to the very
518 * beginning (setting workcurs NULL
519 * will achieve this next go-round).
520 *
521 * !!!: This is likely to break
522 * horribly if any two cursors are
523 * both sorted, but have different
524 * specified sort functions. For,
525 * now, we dismiss this as pathology
526 * and let strange things happen--we
527 * can't make rope childproof.
528 */
529 if ((ret = __dbc_close(
530 jc->j_workcurs[j])) != 0)
531 goto err;
532 if (!SORTED_SET(jc, 0) ||
533 !SORTED_SET(jc, j) ||
534 jc->j_fdupcurs[j] == NULL)
535 /*
536 * Unsafe conditions;
537 * reset fully.
538 */
539 jc->j_workcurs[j] = NULL;
540 else
541 /* Partial reset suffices. */
542 if ((__dbc_dup(
543 jc->j_fdupcurs[j],
544 &jc->j_workcurs[j],
545 DB_POSITION)) != 0)
546 goto err;
547 jc->j_exhausted[j] = 0;
548 }
549 goto retry;
550 /* NOTREACHED */
551 }
552
553 /*
554 * We're about to advance the cursor and need to
555 * reset all of the workcurs[j] where j>i, so that
556 * we don't miss any duplicate duplicates.
557 */
558 for (j = i + 1;
559 jc->j_workcurs[j] != NULL;
560 j++) {
561 if ((ret =
562 __dbc_close(jc->j_workcurs[j])) != 0)
563 goto err;
564 jc->j_exhausted[j] = 0;
565 if (jc->j_fdupcurs[j] == NULL)
566 jc->j_workcurs[j] = NULL;
567 else if ((ret = __dbc_dup(jc->j_fdupcurs[j],
568 &jc->j_workcurs[j], DB_POSITION)) != 0)
569 goto err;
570 }
571 goto retry2;
572 /* NOTREACHED */
573 }
574
575 if (ret == DB_BUFFER_SMALL) {
576 jc->j_key.ulen <<= 1;
577 if ((ret = __os_realloc(env, jc->j_key.ulen,
578 &jc->j_key.data)) != 0) {
579 mem_err: __db_errx(env, DB_STR_A("0517",
580 "Allocation failed for join key, len = %lu",
581 "%lu"), (u_long)jc->j_key.ulen);
582 goto err;
583 }
584 goto retry2;
585 }
586
587 if (ret != 0)
588 goto err;
589
590 /*
591 * If we made it this far, we've found a matching
592 * datum in cursor i. Mark the current cursor
593 * unexhausted, so we don't miss any duplicate
594 * duplicates the next go-round--unless this is the
595 * very last cursor, in which case there are none to
596 * miss, and we'll need that exhausted flag to finally
597 * get a DB_NOTFOUND and move on to the next datum in
598 * the outermost cursor.
599 */
600 if (i + 1 != jc->j_ncurs)
601 jc->j_exhausted[i] = 0;
602 else
603 jc->j_exhausted[i] = 1;
604
605 /*
606 * If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are
607 * sorted, then we're here for the first time since advancing
608 * cursor 0, and we have a new datum of interest.
609 * jc->j_workcurs[i] points to the beginning of a set of
610 * duplicate duplicates; store this into jc->j_fdupcurs[i].
611 */
612 if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
613 __dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0)
614 goto err;
615 }
616
617 err: if (ret != 0)
618 return (ret);
619
620 if (0) {
621 samekey: /*
622 * Get the key we tried and failed to return last time;
623 * it should be the current datum of all the secondary cursors.
624 */
625 if ((ret = __dbc_get(jc->j_workcurs[0],
626 &jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
627 return (ret);
628 F_CLR(jc, JOIN_RETRY);
629 }
630
631 /*
632 * ret == 0; we have a key to return.
633 *
634 * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key
635 * back into the dbt we were given for the key; call __db_retcopy.
636 * Otherwise, assert that we do not need to copy anything and proceed.
637 */
638 DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
639 DB_DBT_USERCOPY) || key_n == key_arg);
640
641 if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
642 DB_DBT_USERCOPY)) &&
643 (ret = __db_retcopy(env,
644 key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
645 /*
646 * The retcopy failed, most commonly because we have a user
647 * buffer for the key which is too small. Set things up to
648 * retry next time, and return.
649 */
650 F_SET(jc, JOIN_RETRY);
651 return (ret);
652 }
653
654 /*
655 * If DB_JOIN_ITEM is set, we return it; otherwise we do the lookup
656 * in the primary and then return.
657 */
658 if (operation == DB_JOIN_ITEM)
659 return (0);
660
661 /*
662 * If data_arg->flags == 0--that is, if DB is managing the
663 * data DBT's memory--it's not safe to just pass the DBT
664 * through to the primary get call, since we don't want that
665 * memory to belong to the primary DB handle (and if the primary
666 * is free-threaded, it can't anyway).
667 *
668 * Instead, use memory that is managed by the join cursor, in
669 * jc->j_rdata.
670 */
671 if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC |
672 DB_DBT_USERMEM | DB_DBT_USERCOPY))
673 db_manage_data = 1;
674 else
675 db_manage_data = 0;
676 if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info,
677 jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n,
678 db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
679 if (ret == DB_NOTFOUND) {
680 if (LF_ISSET(DB_READ_UNCOMMITTED) ||
681 (jc->j_curslist[0]->txn != NULL && F_ISSET(
682 jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED)))
683 goto retry;
684 /*
685 * If ret == DB_NOTFOUND, the primary and secondary
686 * are out of sync; every item in each secondary
687 * should correspond to something in the primary,
688 * or we shouldn't have done the join this way.
689 * Wail.
690 */
691 ret = __db_secondary_corrupt(jc->j_primary);
692 } else
693 /*
694 * The get on the primary failed for some other
695 * reason, most commonly because we're using a user
696 * buffer that's not big enough. Flag our failure
697 * so we can return the same key next time.
698 */
699 F_SET(jc, JOIN_RETRY);
700 }
701 if (db_manage_data && ret == 0) {
702 data_arg->data = jc->j_rdata.data;
703 data_arg->size = jc->j_rdata.size;
704 }
705
706 return (ret);
707 }
708
709 /*
710 * __db_join_close --
711 * DBC->close for join cursors.
712 *
713 * PUBLIC: int __db_join_close __P((DBC *));
714 */
715 int
__db_join_close(dbc)716 __db_join_close(dbc)
717 DBC *dbc;
718 {
719 DB *dbp;
720 ENV *env;
721 JOIN_CURSOR *jc;
722 int ret, t_ret;
723 u_int32_t i;
724
725 jc = (JOIN_CURSOR *)dbc->internal;
726 dbp = dbc->dbp;
727 env = dbp->env;
728 ret = t_ret = 0;
729
730 /*
731 * Remove from active list of join cursors. Note that this
732 * must happen before any action that can fail and return, or else
733 * __db_close may loop indefinitely.
734 */
735 MUTEX_LOCK(env, dbp->mutex);
736 TAILQ_REMOVE(&dbp->join_queue, dbc, links);
737 MUTEX_UNLOCK(env, dbp->mutex);
738
739 /*
740 * Close any open scratch cursors. In each case, there may
741 * not be as many outstanding as there are cursors in
742 * curslist, but we want to close whatever's there.
743 *
744 * If any close fails, there's no reason not to close everything else;
745 * we'll just return the error code of the last one to fail. There's
746 * not much the caller can do anyway, since these cursors only exist
747 * hanging off a db-internal data structure that they shouldn't be
748 * mucking with.
749 */
750 for (i = 0; i < jc->j_ncurs; i++) {
751 if (jc->j_workcurs[i] != NULL &&
752 (t_ret = __dbc_close(jc->j_workcurs[i])) != 0)
753 ret = t_ret;
754 if (jc->j_fdupcurs[i] != NULL &&
755 (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
756 ret = t_ret;
757 }
758
759 __os_free(env, jc->j_exhausted);
760 __os_free(env, jc->j_curslist);
761 __os_free(env, jc->j_workcurs);
762 __os_free(env, jc->j_fdupcurs);
763 __os_free(env, jc->j_key.data);
764 if (jc->j_rdata.data != NULL)
765 __os_ufree(env, jc->j_rdata.data);
766 __os_free(env, jc);
767 __os_free(env, dbc);
768
769 return (ret);
770 }
771
772 /*
773 * __db_join_getnext --
774 * This function replaces the DBC_CONTINUE and DBC_KEYSET
775 * functionality inside the various cursor get routines.
776 *
777 * If exhausted == 0, we're not done with the current datum;
778 * return it if it matches "matching", otherwise search
779 * using DB_GET_BOTHC (which is faster than iteratively doing
780 * DB_NEXT_DUP) forward until we find one that does.
781 *
782 * If exhausted == 1, we are done with the current datum, so just
783 * leap forward to searching NEXT_DUPs.
784 *
785 * If no matching datum exists, returns DB_NOTFOUND, else 0.
786 */
787 static int
__db_join_getnext(dbc,key,data,exhausted,opmods)788 __db_join_getnext(dbc, key, data, exhausted, opmods)
789 DBC *dbc;
790 DBT *key, *data;
791 u_int32_t exhausted, opmods;
792 {
793 int ret, cmp;
794 DB *dbp;
795 DBT ldata;
796 int (*func) __P((DB *, const DBT *, const DBT *, size_t *));
797
798 dbp = dbc->dbp;
799 if (dbp->dup_compare == NULL)
800 func = __dbt_defcmp;
801 else
802 (void)dbp->get_dup_compare(dbp, &func);
803
804 switch (exhausted) {
805 case 0:
806 /*
807 * We don't want to step on data->data; use a new
808 * DBT and malloc so we don't step on dbc's rdata memory.
809 */
810 memset(&ldata, 0, sizeof(DBT));
811 F_SET(&ldata, DB_DBT_MALLOC);
812 if ((ret = __dbc_get(dbc,
813 key, &ldata, opmods | DB_CURRENT)) != 0)
814 break;
815 cmp = func(dbp, data, &ldata, NULL);
816 if (cmp == 0) {
817 /*
818 * We have to return the real data value. Copy
819 * it into data, then free the buffer we malloc'ed
820 * above.
821 */
822 if ((ret = __db_retcopy(dbp->env, data, ldata.data,
823 ldata.size, &data->data, &data->size)) != 0)
824 return (ret);
825 __os_ufree(dbp->env, ldata.data);
826 return (0);
827 }
828
829 /*
830 * Didn't match--we want to fall through and search future
831 * dups. We just forget about ldata and free
832 * its buffer--data contains the value we're searching for.
833 */
834 __os_ufree(dbp->env, ldata.data);
835 /* FALLTHROUGH */
836 case 1:
837 ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC);
838 break;
839 default:
840 ret = USR_ERR(dbc->env, EINVAL);
841 break;
842 }
843
844 return (ret);
845 }
846
847 /*
848 * __db_join_cmp --
849 * Comparison function for sorting DBCs in cardinality order.
850 */
851 static int
__db_join_cmp(a,b)852 __db_join_cmp(a, b)
853 const void *a, *b;
854 {
855 DBC *dbca, *dbcb;
856 db_recno_t counta, countb;
857
858 dbca = *((DBC * const *)a);
859 dbcb = *((DBC * const *)b);
860
861 if (__dbc_count(dbca, &counta) != 0 ||
862 __dbc_count(dbcb, &countb) != 0)
863 return (0);
864
865 return ((long)counta - (long)countb);
866 }
867
868 /*
869 * __db_join_primget --
870 * Perform a DB->get in the primary, being careful not to use a new
871 * locker ID if we're doing CDB locking.
872 */
873 static int
__db_join_primget(dbp,ip,txn,locker,key,data,flags)874 __db_join_primget(dbp, ip, txn, locker, key, data, flags)
875 DB *dbp;
876 DB_THREAD_INFO *ip;
877 DB_TXN *txn;
878 DB_LOCKER *locker;
879 DBT *key, *data;
880 u_int32_t flags;
881 {
882 DBC *dbc;
883 u_int32_t rmw;
884 int ret, t_ret;
885
886 if ((ret = __db_cursor_int(dbp, ip,
887 txn, dbp->type, PGNO_INVALID, 0, locker, &dbc)) != 0)
888 return (ret);
889
890 /*
891 * The only allowable flags here are the two flags copied into "opmods"
892 * in __db_join_get, DB_RMW and DB_READ_UNCOMMITTED. The former is an
893 * op on the c_get call, the latter on the cursor call. It's a DB bug
894 * if we allow any other flags down in here.
895 */
896 rmw = LF_ISSET(DB_RMW);
897 if (LF_ISSET(DB_READ_UNCOMMITTED) ||
898 (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
899 F_SET(dbc, DBC_READ_UNCOMMITTED);
900
901 if (LF_ISSET(DB_READ_COMMITTED) ||
902 (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
903 F_SET(dbc, DBC_READ_COMMITTED);
904
905 LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
906 DB_ASSERT(dbp->env, flags == 0);
907
908 F_SET(dbc, DBC_TRANSIENT);
909
910 /*
911 * This shouldn't be necessary, thanks to the fact that join cursors
912 * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's
913 * sake, we mirror what __db_get does.
914 */
915 SET_RET_MEM(dbc, dbp);
916
917 ret = __dbc_get(dbc, key, data, DB_SET | rmw);
918
919 if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
920 ret = t_ret;
921
922 return (ret);
923 }
924
925 /*
926 * __db_secondary_corrupt --
927 * Report primary/secondary inconsistencies.
928 *
929 * PUBLIC: int __db_secondary_corrupt __P((DB *));
930 */
931 int
__db_secondary_corrupt(dbp)932 __db_secondary_corrupt(dbp)
933 DB *dbp;
934 {
935 __db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s",
936 dbp->fname == NULL ? "unnamed" : dbp->fname,
937 dbp->dname == NULL ? "" : "/",
938 dbp->dname == NULL ? "" : dbp->dname);
939 return (DB_SECONDARY_BAD);
940 }
941