1 /*
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1998, 2013 Oracle and/or its affiliates.  All rights reserved.
5  *
6  * $Id$
7  */
8 
9 #include "db_config.h"
10 
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/db_join.h"
14 #include "dbinc/btree.h"
15 #include "dbinc/lock.h"
16 
17 static int __db_join_close_pp __P((DBC *));
18 static int __db_join_cmp __P((const void *, const void *));
19 static int __db_join_del __P((DBC *, u_int32_t));
20 static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
21 static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
22 static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
23 static int __db_join_primget __P((DB *, DB_THREAD_INFO *,
24     DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t));
25 static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
26 
27 /*
28  * Check to see if the Nth secondary cursor of join cursor jc is pointing
29  * to a sorted duplicate set.
30  */
31 #define	SORTED_SET(jc, n)   ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
32 
33 /*
34  * This is the duplicate-assisted join functionality.  Right now we're
35  * going to write it such that we return one item at a time, although
36  * I think we may need to optimize it to return them all at once.
37  * It should be easier to get it working this way, and I believe that
38  * changing it should be fairly straightforward.
39  *
40  * We optimize the join by sorting cursors from smallest to largest
41  * cardinality.  In most cases, this is indeed optimal.  However, if
42  * a cursor with large cardinality has very few data in common with the
43  * first cursor, it is possible that the join will be made faster by
44  * putting it earlier in the cursor list.  Since we have no way to detect
45  * cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains
46  * the sort order specified by the caller, who may know more about the
47  * structure of the data.
48  *
49  * The first cursor moves sequentially through the duplicate set while
50  * the others search explicitly for the duplicate in question.
51  *
52  */
53 
54 /*
55  * __db_join --
56  *	This is the interface to the duplicate-assisted join functionality.
57  * In the same way that cursors mark a position in a database, a cursor
58  * can mark a position in a join.  While most cursors are created by the
59  * cursor method of a DB, join cursors are created through an explicit
60  * call to DB->join.
61  *
62  * The curslist is an array of existing, initialized cursors and primary
63  * is the DB of the primary file.  The data item that joins all the
64  * cursors in the curslist is used as the key into the primary and that
65  * key and data are returned.  When no more items are left in the join
66  * set, the  c_next operation off the join cursor will return DB_NOTFOUND.
67  *
68  * PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
69  */
70 int
__db_join(primary,curslist,dbcp,flags)71 __db_join(primary, curslist, dbcp, flags)
72 	DB *primary;
73 	DBC **curslist, **dbcp;
74 	u_int32_t flags;
75 {
76 	DBC *dbc;
77 	ENV *env;
78 	JOIN_CURSOR *jc;
79 	size_t ncurs, nslots;
80 	u_int32_t i;
81 	int ret;
82 
83 	env = primary->env;
84 	dbc = NULL;
85 	jc = NULL;
86 
87 	if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
88 		goto err;
89 
90 	if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0)
91 		goto err;
92 
93 	if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0)
94 		goto err;
95 	jc->j_key.ulen = 256;
96 	F_SET(&jc->j_key, DB_DBT_USERMEM);
97 
98 	F_SET(&jc->j_rdata, DB_DBT_REALLOC);
99 
100 	for (jc->j_curslist = curslist;
101 	    *jc->j_curslist != NULL; jc->j_curslist++)
102 		;
103 
104 	/*
105 	 * The number of cursor slots we allocate is one greater than
106 	 * the number of cursors involved in the join, because the
107 	 * list is NULL-terminated.
108 	 */
109 	ncurs = (size_t)(jc->j_curslist - curslist);
110 	nslots = ncurs + 1;
111 
112 	/*
113 	 * !!! -- A note on the various lists hanging off jc.
114 	 *
115 	 * j_curslist is the initial NULL-terminated list of cursors passed
116 	 * into __db_join.  The original cursors are not modified; pristine
117 	 * copies are required because, in databases with unsorted dups, we
118 	 * must reset all of the secondary cursors after the first each
119 	 * time the first one is incremented, or else we will lose data
120 	 * which happen to be sorted differently in two different cursors.
121 	 *
122 	 * j_workcurs is where we put those copies that we're planning to
123 	 * work with.  They're lazily c_dup'ed from j_curslist as we need
124 	 * them, and closed when the join cursor is closed or when we need
125 	 * to reset them to their original values (in which case we just
126 	 * c_dup afresh).
127 	 *
128 	 * j_fdupcurs is an array of cursors which point to the first
129 	 * duplicate in the duplicate set that contains the data value
130 	 * we're currently interested in.  We need this to make
131 	 * __db_join_get correctly return duplicate duplicates;  i.e., if a
132 	 * given data value occurs twice in the set belonging to cursor #2,
133 	 * and thrice in the set belonging to cursor #3, and once in all
134 	 * the other cursors, successive calls to __db_join_get need to
135 	 * return that data item six times.  To make this happen, each time
136 	 * cursor N is allowed to advance to a new datum, all cursors M
137 	 * such that M > N have to be reset to the first duplicate with
138 	 * that datum, so __db_join_get will return all the dup-dups again.
139 	 * We could just reset them to the original cursor from j_curslist,
140 	 * but that would be a bit slower in the unsorted case and a LOT
141 	 * slower in the sorted one.
142 	 *
143 	 * j_exhausted is a list of boolean values which represent
144 	 * whether or not their corresponding cursors are "exhausted",
145 	 * i.e. whether the datum under the corresponding cursor has
146 	 * been found not to exist in any unreturned combinations of
147 	 * later secondary cursors, in which case they are ready to be
148 	 * incremented.
149 	 */
150 
151 	/* We don't want to free regions whose callocs have failed. */
152 	jc->j_curslist = NULL;
153 	jc->j_workcurs = NULL;
154 	jc->j_fdupcurs = NULL;
155 	jc->j_exhausted = NULL;
156 
157 	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
158 	    &jc->j_curslist)) != 0)
159 		goto err;
160 	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
161 	    &jc->j_workcurs)) != 0)
162 		goto err;
163 	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
164 	    &jc->j_fdupcurs)) != 0)
165 		goto err;
166 	if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t),
167 	    &jc->j_exhausted)) != 0)
168 		goto err;
169 	for (i = 0; curslist[i] != NULL; i++) {
170 		jc->j_curslist[i] = curslist[i];
171 		jc->j_workcurs[i] = NULL;
172 		jc->j_fdupcurs[i] = NULL;
173 		jc->j_exhausted[i] = 0;
174 	}
175 	jc->j_ncurs = (u_int32_t)ncurs;
176 
177 	/*
178 	 * If DB_JOIN_NOSORT is not set, optimize secondary cursors by
179 	 * sorting in order of increasing cardinality.
180 	 */
181 	if (!LF_ISSET(DB_JOIN_NOSORT))
182 		qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
183 
184 	/*
185 	 * We never need to reset the 0th cursor, so there's no
186 	 * solid reason to use workcurs[0] rather than curslist[0] in
187 	 * join_get.  Nonetheless, it feels cleaner to do it for symmetry,
188 	 * and this is the most logical place to copy it.
189 	 *
190 	 * !!!
191 	 * There's no need to close the new cursor if we goto err only
192 	 * because this is the last thing that can fail.  Modifier of this
193 	 * function beware!
194 	 */
195 	if ((ret =
196 	    __dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0)
197 		goto err;
198 
199 	dbc->close = dbc->c_close = __db_join_close_pp;
200 	dbc->del = dbc->c_del = __db_join_del;
201 	dbc->get = dbc->c_get = __db_join_get_pp;
202 	dbc->put = dbc->c_put = __db_join_put;
203 	dbc->internal = (DBC_INTERNAL *)jc;
204 	dbc->dbp = primary;
205 	jc->j_primary = primary;
206 
207 	/* Stash the first cursor's transaction here for easy access. */
208 	dbc->txn = curslist[0]->txn;
209 
210 	*dbcp = dbc;
211 
212 	MUTEX_LOCK(env, primary->mutex);
213 	TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
214 	MUTEX_UNLOCK(env, primary->mutex);
215 
216 	return (0);
217 
218 err:	if (jc != NULL) {
219 		if (jc->j_curslist != NULL)
220 			__os_free(env, jc->j_curslist);
221 		if (jc->j_workcurs != NULL) {
222 			if (jc->j_workcurs[0] != NULL)
223 				(void)__dbc_close(jc->j_workcurs[0]);
224 			__os_free(env, jc->j_workcurs);
225 		}
226 		if (jc->j_fdupcurs != NULL)
227 			__os_free(env, jc->j_fdupcurs);
228 		if (jc->j_exhausted != NULL)
229 			__os_free(env, jc->j_exhausted);
230 		__os_free(env, jc);
231 	}
232 	if (dbc != NULL)
233 		__os_free(env, dbc);
234 	return (ret);
235 }
236 
237 /*
238  * __db_join_close_pp --
239  *	DBC->close pre/post processing for join cursors.
240  */
241 static int
__db_join_close_pp(dbc)242 __db_join_close_pp(dbc)
243 	DBC *dbc;
244 {
245 	DB *dbp;
246 	DB_THREAD_INFO *ip;
247 	ENV *env;
248 	int handle_check, ret, t_ret;
249 
250 	dbp = dbc->dbp;
251 	env = dbp->env;
252 
253 	ENV_ENTER(env, ip);
254 
255 	handle_check = IS_ENV_REPLICATED(env);
256 	if (handle_check &&
257 	    (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
258 		handle_check = 0;
259 		goto err;
260 	}
261 
262 	ret = __db_join_close(dbc);
263 
264 	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
265 		ret = t_ret;
266 
267 err:	ENV_LEAVE(env, ip);
268 	return (ret);
269 }
270 
271 static int
__db_join_put(dbc,key,data,flags)272 __db_join_put(dbc, key, data, flags)
273 	DBC *dbc;
274 	DBT *key;
275 	DBT *data;
276 	u_int32_t flags;
277 {
278 	COMPQUIET(dbc, NULL);
279 	COMPQUIET(key, NULL);
280 	COMPQUIET(data, NULL);
281 	COMPQUIET(flags, 0);
282 	return (EINVAL);
283 }
284 
285 static int
__db_join_del(dbc,flags)286 __db_join_del(dbc, flags)
287 	DBC *dbc;
288 	u_int32_t flags;
289 {
290 	COMPQUIET(dbc, NULL);
291 	COMPQUIET(flags, 0);
292 	return (EINVAL);
293 }
294 
295 /*
296  * __db_join_get_pp --
297  *	DBjoin->get pre/post processing.
298  */
299 static int
__db_join_get_pp(dbc,key,data,flags)300 __db_join_get_pp(dbc, key, data, flags)
301 	DBC *dbc;
302 	DBT *key, *data;
303 	u_int32_t flags;
304 {
305 	DB *dbp;
306 	DB_THREAD_INFO *ip;
307 	ENV *env;
308 	u_int32_t handle_check, save_flags;
309 	int ret, t_ret;
310 
311 	dbp = dbc->dbp;
312 	env = dbp->env;
313 
314 	/* Save the original flags value. */
315 	save_flags = flags;
316 
317 	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
318 		if (!LOCKING_ON(env))
319 			return (__db_fnl(env, "DBC->get"));
320 		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
321 	}
322 
323 	switch (flags) {
324 	case 0:
325 	case DB_JOIN_ITEM:
326 		break;
327 	default:
328 		return (__db_ferr(env, "DBC->get", 0));
329 	}
330 
331 	/*
332 	 * A partial get of the key of a join cursor don't make much sense;
333 	 * the entire key is necessary to query the primary database
334 	 * and find the datum, and so regardless of the size of the key
335 	 * it would not be a performance improvement.  Since it would require
336 	 * special handling, we simply disallow it.
337 	 *
338 	 * A partial get of the data, however, potentially makes sense (if
339 	 * all possible data are a predictable large structure, for instance)
340 	 * and causes us no headaches, so we permit it.
341 	 */
342 	if (F_ISSET(key, DB_DBT_PARTIAL)) {
343 		__db_errx(env, DB_STR("0516",
344 		    "DB_DBT_PARTIAL may not be set on key during join_get"));
345 		return (EINVAL);
346 	}
347 
348 	ENV_ENTER(env, ip);
349 
350 	handle_check = IS_ENV_REPLICATED(env);
351 	if (handle_check &&
352 	    (ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn))) != 0) {
353 		handle_check = 0;
354 		goto err;
355 	}
356 
357 	/* Restore the original flags value. */
358 	flags = save_flags;
359 
360 	ret = __db_join_get(dbc, key, data, flags);
361 
362 	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
363 		ret = t_ret;
364 
365 err:	ENV_LEAVE(env, ip);
366 	__dbt_userfree(env, key, NULL, NULL);
367 	return (ret);
368 }
369 
370 static int
__db_join_get(dbc,key_arg,data_arg,flags)371 __db_join_get(dbc, key_arg, data_arg, flags)
372 	DBC *dbc;
373 	DBT *key_arg, *data_arg;
374 	u_int32_t flags;
375 {
376 	DB *dbp;
377 	DBC *cp;
378 	DBT *key_n, key_n_mem;
379 	ENV *env;
380 	JOIN_CURSOR *jc;
381 	int db_manage_data, ret;
382 	u_int32_t i, j, operation, opmods;
383 
384 	dbp = dbc->dbp;
385 	env = dbp->env;
386 	jc = (JOIN_CURSOR *)dbc->internal;
387 
388 	operation = LF_ISSET(DB_OPFLAGS_MASK);
389 
390 	/* !!!
391 	 * If the set of flags here changes, check that __db_join_primget
392 	 * is updated to handle them properly.
393 	 */
394 	opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
395 
396 	/*
397 	 * Since we are fetching the key as a datum in the secondary indices,
398 	 * we must be careful of caller-specified DB_DBT_* memory
399 	 * management flags.  If necessary, use a stack-allocated DBT;
400 	 * we'll appropriately copy and/or allocate the data later.
401 	 */
402 	if (F_ISSET(key_arg,
403 	    DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
404 		/* We just use the default buffer;  no need to go malloc. */
405 		key_n = &key_n_mem;
406 		memset(key_n, 0, sizeof(DBT));
407 	} else {
408 		/*
409 		 * Either DB_DBT_REALLOC or the default buffer will work
410 		 * fine if we have to reuse it, as we do.
411 		 */
412 		key_n = key_arg;
413 	}
414 	if (F_ISSET(key_arg, DB_DBT_USERCOPY))
415 		key_arg->data = NULL;
416 
417 	/*
418 	 * If our last attempt to do a get on the primary key failed,
419 	 * short-circuit the join and try again with the same key.
420 	 */
421 	if (F_ISSET(jc, JOIN_RETRY))
422 		goto samekey;
423 	F_CLR(jc, JOIN_RETRY);
424 
425 retry:	ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n,
426 	    opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
427 
428 	if (ret == DB_BUFFER_SMALL) {
429 		jc->j_key.ulen <<= 1;
430 		if ((ret = __os_realloc(env,
431 		    jc->j_key.ulen, &jc->j_key.data)) != 0)
432 			goto mem_err;
433 		goto retry;
434 	}
435 
436 	/*
437 	 * If ret == DB_NOTFOUND, we're out of elements of the first
438 	 * secondary cursor.  This is how we finally finish the join
439 	 * if all goes well.
440 	 */
441 	if (ret != 0)
442 		goto err;
443 
444 	/*
445 	 * If jc->j_exhausted[0] == 1, we've just advanced the first cursor,
446 	 * and we're going to want to advance all the cursors that point to
447 	 * the first member of a duplicate duplicate set (j_fdupcurs[1..N]).
448 	 * Close all the cursors in j_fdupcurs;  we'll reopen them the
449 	 * first time through the upcoming loop.
450 	 */
451 	for (i = 1; i < jc->j_ncurs; i++) {
452 		if (jc->j_fdupcurs[i] != NULL &&
453 		    (ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
454 			goto err;
455 		jc->j_fdupcurs[i] = NULL;
456 	}
457 
458 	/*
459 	 * If jc->j_curslist[1] == NULL, we have only one cursor in the join.
460 	 * Thus, we can safely increment that one cursor on each call
461 	 * to __db_join_get, and we signal this by setting jc->j_exhausted[0]
462 	 * right away.
463 	 *
464 	 * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
465 	 * increment it until we know we're ready to.
466 	 */
467 	if (jc->j_curslist[1] == NULL)
468 		jc->j_exhausted[0] = 1;
469 	else
470 		jc->j_exhausted[0] = 0;
471 
472 	/* We have the first element; now look for it in the other cursors. */
473 	for (i = 1; i < jc->j_ncurs; i++) {
474 		DB_ASSERT(env, jc->j_curslist[i] != NULL);
475 		if (jc->j_workcurs[i] == NULL)
476 			/* If this is NULL, we need to dup curslist into it. */
477 			if ((ret = __dbc_dup(jc->j_curslist[i],
478 			    &jc->j_workcurs[i], DB_POSITION)) != 0)
479 				goto err;
480 
481 retry2:		cp = jc->j_workcurs[i];
482 
483 		if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
484 			    jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
485 			/*
486 			 * jc->j_workcurs[i] has no more of the datum we're
487 			 * interested in.  Go back one cursor and get
488 			 * a new dup.  We can't just move to a new
489 			 * element of the outer relation, because that way
490 			 * we might miss duplicate duplicates in cursor i-1.
491 			 *
492 			 * If this takes us back to the first cursor,
493 			 * -then- we can move to a new element of the outer
494 			 * relation.
495 			 */
496 			--i;
497 			jc->j_exhausted[i] = 1;
498 
499 			if (i == 0) {
500 				for (j = 1; jc->j_workcurs[j] != NULL; j++) {
501 					/*
502 					 * We're moving to a new element of
503 					 * the first secondary cursor.  If
504 					 * that cursor is sorted, then any
505 					 * other sorted cursors can be safely
506 					 * reset to the first duplicate
507 					 * duplicate in the current set if we
508 					 * have a pointer to it (we can't just
509 					 * leave them be, or we'll miss
510 					 * duplicate duplicates in the outer
511 					 * relation).
512 					 *
513 					 * If the first cursor is unsorted, or
514 					 * if cursor j is unsorted, we can
515 					 * make no assumptions about what
516 					 * we're looking for next or where it
517 					 * will be, so we reset to the very
518 					 * beginning (setting workcurs NULL
519 					 * will achieve this next go-round).
520 					 *
521 					 * XXX: This is likely to break
522 					 * horribly if any two cursors are
523 					 * both sorted, but have different
524 					 * specified sort functions.  For,
525 					 * now, we dismiss this as pathology
526 					 * and let strange things happen--we
527 					 * can't make rope childproof.
528 					 */
529 					if ((ret = __dbc_close(
530 					    jc->j_workcurs[j])) != 0)
531 						goto err;
532 					if (!SORTED_SET(jc, 0) ||
533 					    !SORTED_SET(jc, j) ||
534 					    jc->j_fdupcurs[j] == NULL)
535 						/*
536 						 * Unsafe conditions;
537 						 * reset fully.
538 						 */
539 						jc->j_workcurs[j] = NULL;
540 					else
541 						/* Partial reset suffices. */
542 						if ((__dbc_dup(
543 						    jc->j_fdupcurs[j],
544 						    &jc->j_workcurs[j],
545 						    DB_POSITION)) != 0)
546 							goto err;
547 					jc->j_exhausted[j] = 0;
548 				}
549 				goto retry;
550 				/* NOTREACHED */
551 			}
552 
553 			/*
554 			 * We're about to advance the cursor and need to
555 			 * reset all of the workcurs[j] where j>i, so that
556 			 * we don't miss any duplicate duplicates.
557 			 */
558 			for (j = i + 1;
559 			    jc->j_workcurs[j] != NULL;
560 			    j++) {
561 				if ((ret =
562 				    __dbc_close(jc->j_workcurs[j])) != 0)
563 					goto err;
564 				jc->j_exhausted[j] = 0;
565 				if (jc->j_fdupcurs[j] == NULL)
566 					jc->j_workcurs[j] = NULL;
567 				else if ((ret = __dbc_dup(jc->j_fdupcurs[j],
568 				    &jc->j_workcurs[j], DB_POSITION)) != 0)
569 					goto err;
570 			}
571 			goto retry2;
572 			/* NOTREACHED */
573 		}
574 
575 		if (ret == DB_BUFFER_SMALL) {
576 			jc->j_key.ulen <<= 1;
577 			if ((ret = __os_realloc(env, jc->j_key.ulen,
578 			    &jc->j_key.data)) != 0) {
579 mem_err:			__db_errx(env, DB_STR_A("0517",
580 				    "Allocation failed for join key, len = %lu",
581 				    "%lu"), (u_long)jc->j_key.ulen);
582 				goto err;
583 			}
584 			goto retry2;
585 		}
586 
587 		if (ret != 0)
588 			goto err;
589 
590 		/*
591 		 * If we made it this far, we've found a matching
592 		 * datum in cursor i.  Mark the current cursor
593 		 * unexhausted, so we don't miss any duplicate
594 		 * duplicates the next go-round--unless this is the
595 		 * very last cursor, in which case there are none to
596 		 * miss, and we'll need that exhausted flag to finally
597 		 * get a DB_NOTFOUND and move on to the next datum in
598 		 * the outermost cursor.
599 		 */
600 		if (i + 1 != jc->j_ncurs)
601 			jc->j_exhausted[i] = 0;
602 		else
603 			jc->j_exhausted[i] = 1;
604 
605 		/*
606 		 * If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are
607 		 * sorted, then we're here for the first time since advancing
608 		 * cursor 0, and we have a new datum of interest.
609 		 * jc->j_workcurs[i] points to the beginning of a set of
610 		 * duplicate duplicates;  store this into jc->j_fdupcurs[i].
611 		 */
612 		if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
613 		    __dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0)
614 			goto err;
615 	}
616 
617 err:	if (ret != 0)
618 		return (ret);
619 
620 	if (0) {
621 samekey:	/*
622 		 * Get the key we tried and failed to return last time;
623 		 * it should be the current datum of all the secondary cursors.
624 		 */
625 		if ((ret = __dbc_get(jc->j_workcurs[0],
626 		    &jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
627 			return (ret);
628 		F_CLR(jc, JOIN_RETRY);
629 	}
630 
631 	/*
632 	 * ret == 0;  we have a key to return.
633 	 *
634 	 * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key
635 	 * back into the dbt we were given for the key; call __db_retcopy.
636 	 * Otherwise, assert that we do not need to copy anything and proceed.
637 	 */
638 	DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
639 	    DB_DBT_USERCOPY) || key_n == key_arg);
640 
641 	if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
642 	    DB_DBT_USERCOPY)) &&
643 	    (ret = __db_retcopy(env,
644 	    key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
645 		/*
646 		 * The retcopy failed, most commonly because we have a user
647 		 * buffer for the key which is too small. Set things up to
648 		 * retry next time, and return.
649 		 */
650 		F_SET(jc, JOIN_RETRY);
651 		return (ret);
652 	}
653 
654 	/*
655 	 * If DB_JOIN_ITEM is set, we return it; otherwise we do the lookup
656 	 * in the primary and then return.
657 	 */
658 	if (operation == DB_JOIN_ITEM)
659 		return (0);
660 
661 	/*
662 	 * If data_arg->flags == 0--that is, if DB is managing the
663 	 * data DBT's memory--it's not safe to just pass the DBT
664 	 * through to the primary get call, since we don't want that
665 	 * memory to belong to the primary DB handle (and if the primary
666 	 * is free-threaded, it can't anyway).
667 	 *
668 	 * Instead, use memory that is managed by the join cursor, in
669 	 * jc->j_rdata.
670 	 */
671 	if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC |
672 	    DB_DBT_USERMEM | DB_DBT_USERCOPY))
673 		db_manage_data = 1;
674 	else
675 		db_manage_data = 0;
676 	if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info,
677 	    jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n,
678 	    db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
679 		if (ret == DB_NOTFOUND) {
680 			if (LF_ISSET(DB_READ_UNCOMMITTED) ||
681 			    (jc->j_curslist[0]->txn != NULL && F_ISSET(
682 			    jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED)))
683 				goto retry;
684 			/*
685 			 * If ret == DB_NOTFOUND, the primary and secondary
686 			 * are out of sync;  every item in each secondary
687 			 * should correspond to something in the primary,
688 			 * or we shouldn't have done the join this way.
689 			 * Wail.
690 			 */
691 			ret = __db_secondary_corrupt(jc->j_primary);
692 		} else
693 			/*
694 			 * The get on the primary failed for some other
695 			 * reason, most commonly because we're using a user
696 			 * buffer that's not big enough.  Flag our failure
697 			 * so we can return the same key next time.
698 			 */
699 			F_SET(jc, JOIN_RETRY);
700 	}
701 	if (db_manage_data && ret == 0) {
702 		data_arg->data = jc->j_rdata.data;
703 		data_arg->size = jc->j_rdata.size;
704 	}
705 
706 	return (ret);
707 }
708 
709 /*
710  * __db_join_close --
711  *	DBC->close for join cursors.
712  *
713  * PUBLIC: int __db_join_close __P((DBC *));
714  */
715 int
__db_join_close(dbc)716 __db_join_close(dbc)
717 	DBC *dbc;
718 {
719 	DB *dbp;
720 	ENV *env;
721 	JOIN_CURSOR *jc;
722 	int ret, t_ret;
723 	u_int32_t i;
724 
725 	jc = (JOIN_CURSOR *)dbc->internal;
726 	dbp = dbc->dbp;
727 	env = dbp->env;
728 	ret = t_ret = 0;
729 
730 	/*
731 	 * Remove from active list of join cursors.  Note that this
732 	 * must happen before any action that can fail and return, or else
733 	 * __db_close may loop indefinitely.
734 	 */
735 	MUTEX_LOCK(env, dbp->mutex);
736 	TAILQ_REMOVE(&dbp->join_queue, dbc, links);
737 	MUTEX_UNLOCK(env, dbp->mutex);
738 
739 	/*
740 	 * Close any open scratch cursors.  In each case, there may
741 	 * not be as many outstanding as there are cursors in
742 	 * curslist, but we want to close whatever's there.
743 	 *
744 	 * If any close fails, there's no reason not to close everything else;
745 	 * we'll just return the error code of the last one to fail.  There's
746 	 * not much the caller can do anyway, since these cursors only exist
747 	 * hanging off a db-internal data structure that they shouldn't be
748 	 * mucking with.
749 	 */
750 	for (i = 0; i < jc->j_ncurs; i++) {
751 		if (jc->j_workcurs[i] != NULL &&
752 		    (t_ret = __dbc_close(jc->j_workcurs[i])) != 0)
753 			ret = t_ret;
754 		if (jc->j_fdupcurs[i] != NULL &&
755 		    (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
756 			ret = t_ret;
757 	}
758 
759 	__os_free(env, jc->j_exhausted);
760 	__os_free(env, jc->j_curslist);
761 	__os_free(env, jc->j_workcurs);
762 	__os_free(env, jc->j_fdupcurs);
763 	__os_free(env, jc->j_key.data);
764 	if (jc->j_rdata.data != NULL)
765 		__os_ufree(env, jc->j_rdata.data);
766 	__os_free(env, jc);
767 	__os_free(env, dbc);
768 
769 	return (ret);
770 }
771 
772 /*
773  * __db_join_getnext --
774  *	This function replaces the DBC_CONTINUE and DBC_KEYSET
775  *	functionality inside the various cursor get routines.
776  *
777  *	If exhausted == 0, we're not done with the current datum;
778  *	return it if it matches "matching", otherwise search
779  *	using DB_GET_BOTHC (which is faster than iteratively doing
780  *	DB_NEXT_DUP) forward until we find one that does.
781  *
782  *	If exhausted == 1, we are done with the current datum, so just
783  *	leap forward to searching NEXT_DUPs.
784  *
785  *	If no matching datum exists, returns DB_NOTFOUND, else 0.
786  */
787 static int
__db_join_getnext(dbc,key,data,exhausted,opmods)788 __db_join_getnext(dbc, key, data, exhausted, opmods)
789 	DBC *dbc;
790 	DBT *key, *data;
791 	u_int32_t exhausted, opmods;
792 {
793 	int ret, cmp;
794 	DB *dbp;
795 	DBT ldata;
796 	int (*func) __P((DB *, const DBT *, const DBT *));
797 
798 	dbp = dbc->dbp;
799 	func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
800 
801 	switch (exhausted) {
802 	case 0:
803 		/*
804 		 * We don't want to step on data->data;  use a new
805 		 * DBT and malloc so we don't step on dbc's rdata memory.
806 		 */
807 		memset(&ldata, 0, sizeof(DBT));
808 		F_SET(&ldata, DB_DBT_MALLOC);
809 		if ((ret = __dbc_get(dbc,
810 		    key, &ldata, opmods | DB_CURRENT)) != 0)
811 			break;
812 		cmp = func(dbp, data, &ldata);
813 		if (cmp == 0) {
814 			/*
815 			 * We have to return the real data value.  Copy
816 			 * it into data, then free the buffer we malloc'ed
817 			 * above.
818 			 */
819 			if ((ret = __db_retcopy(dbp->env, data, ldata.data,
820 			    ldata.size, &data->data, &data->size)) != 0)
821 				return (ret);
822 			__os_ufree(dbp->env, ldata.data);
823 			return (0);
824 		}
825 
826 		/*
827 		 * Didn't match--we want to fall through and search future
828 		 * dups.  We just forget about ldata and free
829 		 * its buffer--data contains the value we're searching for.
830 		 */
831 		__os_ufree(dbp->env, ldata.data);
832 		/* FALLTHROUGH */
833 	case 1:
834 		ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC);
835 		break;
836 	default:
837 		ret = EINVAL;
838 		break;
839 	}
840 
841 	return (ret);
842 }
843 
844 /*
845  * __db_join_cmp --
846  *	Comparison function for sorting DBCs in cardinality order.
847  */
848 static int
__db_join_cmp(a,b)849 __db_join_cmp(a, b)
850 	const void *a, *b;
851 {
852 	DBC *dbca, *dbcb;
853 	db_recno_t counta, countb;
854 
855 	dbca = *((DBC * const *)a);
856 	dbcb = *((DBC * const *)b);
857 
858 	if (__dbc_count(dbca, &counta) != 0 ||
859 	    __dbc_count(dbcb, &countb) != 0)
860 		return (0);
861 
862 	return ((long)counta - (long)countb);
863 }
864 
865 /*
866  * __db_join_primget --
867  *	Perform a DB->get in the primary, being careful not to use a new
868  * locker ID if we're doing CDB locking.
869  */
870 static int
__db_join_primget(dbp,ip,txn,locker,key,data,flags)871 __db_join_primget(dbp, ip, txn, locker, key, data, flags)
872 	DB *dbp;
873 	DB_THREAD_INFO *ip;
874 	DB_TXN *txn;
875 	DB_LOCKER  *locker;
876 	DBT *key, *data;
877 	u_int32_t flags;
878 {
879 	DBC *dbc;
880 	u_int32_t rmw;
881 	int ret, t_ret;
882 
883 	if ((ret = __db_cursor_int(dbp, ip,
884 	    txn, dbp->type, PGNO_INVALID, 0, locker, &dbc)) != 0)
885 		return (ret);
886 
887 	/*
888 	 * The only allowable flags here are the two flags copied into "opmods"
889 	 * in __db_join_get, DB_RMW and DB_READ_UNCOMMITTED.  The former is an
890 	 * op on the c_get call, the latter on the cursor call.  It's a DB bug
891 	 * if we allow any other flags down in here.
892 	 */
893 	rmw = LF_ISSET(DB_RMW);
894 	if (LF_ISSET(DB_READ_UNCOMMITTED) ||
895 	    (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
896 		F_SET(dbc, DBC_READ_UNCOMMITTED);
897 
898 	if (LF_ISSET(DB_READ_COMMITTED) ||
899 	    (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
900 		F_SET(dbc, DBC_READ_COMMITTED);
901 
902 	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
903 	DB_ASSERT(dbp->env, flags == 0);
904 
905 	F_SET(dbc, DBC_TRANSIENT);
906 
907 	/*
908 	 * This shouldn't be necessary, thanks to the fact that join cursors
909 	 * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's
910 	 * sake, we mirror what __db_get does.
911 	 */
912 	SET_RET_MEM(dbc, dbp);
913 
914 	ret = __dbc_get(dbc, key, data, DB_SET | rmw);
915 
916 	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
917 		ret = t_ret;
918 
919 	return (ret);
920 }
921 
922 /*
923  * __db_secondary_corrupt --
924  *	Report primary/secondary inconsistencies.
925  *
926  * PUBLIC: int __db_secondary_corrupt __P((DB *));
927  */
928 int
__db_secondary_corrupt(dbp)929 __db_secondary_corrupt(dbp)
930 	DB *dbp;
931 {
932 	__db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s",
933 	    dbp->fname == NULL ? "unnamed" : dbp->fname,
934 	    dbp->dname == NULL ? "" : "/",
935 	    dbp->dname == NULL ? "" : dbp->dname);
936 	return (DB_SECONDARY_BAD);
937 }
938