csuite/wt2909_checkpoint_integrity/main.c

/*-
 * Public Domain 2014-2018 MongoDB, Inc.
 * Public Domain 2008-2014 WiredTiger, Inc.
 *
 * This is free and unencumbered software released into the public domain.
 *
 * Anyone is free to copy, modify, publish, use, compile, sell, or
 * distribute this software, either in source code form or as a compiled
 * binary, for any purpose, commercial or non-commercial, and by any
 * means.
 *
 * In jurisdictions that recognize copyright laws, the author or authors
 * of this software dedicate any and all copyright interest in the
 * software to the public domain. We make this dedication for the benefit
 * of the public at large and to the detriment of our heirs and
 * successors. We intend this dedication to be an overt act of
 * relinquishment in perpetuity of all present and future rights to this
 * software under copyright law.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "test_util.h"

#include <sys/resource.h>
#include <sys/wait.h>

/*
 * JIRA ticket reference: WT-2909
 * Test case description:
 *
 * This test attempts to check the integrity of checkpoints by injecting
 * failures (by means of a custom file system) and then trying to recover. To
 * insulate the top level program from various crashes that may occur when
 * injecting failures, the "populate" code runs in another process, and is
 * expected to sometimes fail. Then the top level program runs recovery (with
 * the normal file system) and checks the results. Any failure at the top level
 * indicates a checkpoint integrity problem.
 *
 * Each subtest uses the same kind of schema and data, the only variance is
 * when the faults are injected. At the moment, this test only injects during
 * checkpoints, and only injects write failures. It varies in the number of
 * successful writes that occur before an injected failure (during a checkpoint
 * operation), this can be indicated with "-o N".  When N is not specified, the
 * test attempts to find the optimal range of N for testing. Clearly when N is
 * large, then the checkpoint may be successfully written, and the data
 * represented by the checkpoint will be fully present. When N is small,
 * nothing of interest is written and no data is present. To find the sweet
 * spot where interesting failures occur, the test does a binary search to find
 * the approximate N that divides the "small" and "large" cases. This is not
 * strictly deterministic, a given N may give different results on different
 * runs. But approximate optimal N can be determined, allowing a series of
 * additional tests clustered around this N.
 *
 * The data is stored in two tables, one having indices. Both tables have
 * the same keys and are updated with the same key in a single transaction.
 *
 * Failure mode:
 * If one table is out of step with the other, that is detected as a failure at
 * the top level.  If an index is missing values (or has extra values), that is
 * likewise a failure at the top level. If the tables or the home directory
 * cannot be opened, that is a top level error. The tables must be present
 * as an initial checkpoint is done without any injected fault.
 */

/*
 * This program does not run on Windows.  The non-portable aspects at minimum
 * are fork/exec, the use of environment variables (used by fail_fs), and file
 * name and build locations of dynamically loaded libraries.
 */
#define	BIG_SIZE			(1024 * 10)
#define	BIG_CONTENTS			"<Big String Contents>"
#define	MAX_ARGS			20
#define	MAX_OP_RANGE			1000
#define	STDERR_FILE			"stderr.txt"
#define	STDOUT_FILE			"stdout.txt"
#define	TESTS_PER_OP_VALUE		3
#define	VERBOSE_PRINT			10000

/*
 * Forward declarations for the file-local functions defined below.
 */
static int check_results(TEST_OPTS *, uint64_t *);
static void check_values(WT_CURSOR *, int, int, int, char *);
static int create_big_string(char **);
static void cursor_count_items(WT_CURSOR *, uint64_t *);
static void disable_failures(void);
static void enable_failures(uint64_t, uint64_t);
static void generate_key(uint64_t, int *);
static void generate_value(uint32_t, uint64_t, char *, int *, int *, int *,
    char **);
static void run_check_subtest(TEST_OPTS *, const char *, uint64_t, bool,
    uint64_t *);
static void run_check_subtest_range(TEST_OPTS *, const char *, bool);
static int run_process(TEST_OPTS *, const char *, char *[], int *);
static void subtest_main(int, char *[], bool);
static void subtest_populate(TEST_OPTS *, bool);

/*
 * Defined outside this file; main() uses it after option parsing to skip the
 * arguments that were already consumed.
 */
extern int   __wt_optind;

/*
 * Path of the fail_fs extension library named in the "extensions"
 * configuration built by subtest_main(). The path is relative.
 */
#define	WT_FAIL_FS_LIB	"../../ext/test/fail_fs/.libs/libwiredtiger_fail_fs.so"

/*
 * check_results --
 *	Open the home directory (logging enabled, no fail_fs extension), walk
 *	the "subtest" and "subtest2" tables in lock step and assert that every
 *	row, and every matching index entry, holds the generated data. The
 *	number of rows found in the main table is stored in *foundp. Any
 *	mismatch aborts through the testutil macros; the return value is
 *	always zero.
 */
static int
check_results(TEST_OPTS *opts, uint64_t *foundp)
{
	enum { NINDEX = 3 };
	static const char * const index_uri[NINDEX] = {
		"index:subtest:v0", "index:subtest:v1", "index:subtest:v2"
	};
	WT_CURSOR *index_cur[NINDEX], *primary, *secondary;
	WT_SESSION *session;
	uint64_t expected_total, index_rows, rows;
	uint32_t rndint;
	int expect_key, found_key, n, ret, values[NINDEX];
	char *expect_big, *pattern;

	testutil_check(create_big_string(&pattern));
	expected_total = opts->nrecords;

	testutil_check(wiredtiger_open(
	    opts->home, NULL, "create,log=(enabled)", &opts->conn));
	testutil_check(
	    opts->conn->open_session(opts->conn, NULL, NULL, &session));

	/* One cursor per table, then one per index. */
	testutil_check(session->open_cursor(
	    session, "table:subtest", NULL, NULL, &primary));
	testutil_check(session->open_cursor(
	    session, "table:subtest2", NULL, NULL, &secondary));
	for (n = 0; n < NINDEX; n++) {
		testutil_check(session->open_cursor(
		    session, index_uri[n], NULL, NULL, &index_cur[n]));
	}

	rows = 0;
	for (;;) {
		if ((ret = primary->next(primary)) != 0)
			break;

		/*
		 * The second table must have a row for every row in the
		 * first; it also stores the random number needed to
		 * regenerate the expected values.
		 */
		testutil_check(secondary->next(secondary));
		testutil_check(secondary->get_key(secondary, &found_key));
		testutil_check(secondary->get_value(secondary, &rndint));

		generate_key(rows, &expect_key);
		generate_value(rndint, rows, pattern,
		    &values[0], &values[1], &values[2], &expect_big);
		testutil_assert(expect_key == found_key);

		/* The main table's key and values. */
		testutil_check(primary->get_key(primary, &found_key));
		testutil_assert(expect_key == found_key);
		check_values(
		    primary, values[0], values[1], values[2], expect_big);

		/* Each index must lead to the same values. */
		for (n = 0; n < NINDEX; n++) {
			index_cur[n]->set_key(index_cur[n], values[n]);
			testutil_check(index_cur[n]->search(index_cur[n]));
			check_values(index_cur[n],
			    values[0], values[1], values[2], expect_big);
		}

		++rows;
		if (opts->verbose && rows % VERBOSE_PRINT == 0)
			printf("checked %" PRIu64 "/%" PRIu64 "\n", rows,
			    expected_total);
	}
	/* Report the final count unless the loop just printed it. */
	if (opts->verbose && rows % VERBOSE_PRINT != 0)
		printf("checked %" PRIu64 "/%" PRIu64 "\n", rows,
		    expected_total);

	/*
	 * Populate checkpoints after its first insert, so at least one
	 * row is always expected.
	 */
	testutil_assert(rows > 0);
	testutil_assert(ret == WT_NOTFOUND);

	/* Neither the second table nor any index may hold extra entries. */
	testutil_assert(secondary->next(secondary) == WT_NOTFOUND);
	for (n = 0; n < NINDEX; n++) {
		cursor_count_items(index_cur[n], &index_rows);
		testutil_assert(rows == index_rows);
	}

	testutil_check(opts->conn->close(opts->conn, NULL));
	opts->conn = NULL;

	free(pattern);
	*foundp = rows;
	return (0);
}

/*
 * check_values --
 *	Read the value at the cursor's current position and assert that its
 *	three integers and its string equal the expected ones.
 */
static void
check_values(WT_CURSOR *cursor, int v0, int v1, int v2, char *big)
{
	char *stored_big;
	int stored[3];

	testutil_check(cursor->get_value(
	    cursor, &stored[0], &stored[1], &stored[2], &stored_big));

	testutil_assert(stored[0] == v0);
	testutil_assert(stored[1] == v1);
	testutil_assert(stored[2] == v2);
	testutil_assert(strcmp(stored_big, big) == 0);
}

/*
 * create_big_string --
 *	Allocate the "reference" string: BIG_SIZE characters made of
 *	BIG_CONTENTS repeated end to end, followed by a terminating NUL.
 *	Returns ENOMEM if the allocation fails, otherwise zero with the
 *	buffer stored in *bigp; the caller frees it.
 */
static int
create_big_string(char **bigp)
{
	const char *pattern;
	size_t pattern_len, pos;
	char *buf, *dst, *end;

	buf = malloc(BIG_SIZE + 1);
	if (buf == NULL)
		return (ENOMEM);

	pattern = BIG_CONTENTS;
	pattern_len = strlen(pattern);

	/* Walk the buffer, cycling through the pattern. */
	pos = 0;
	end = buf + BIG_SIZE;
	for (dst = buf; dst < end; dst++) {
		*dst = pattern[pos];
		if (++pos == pattern_len)
			pos = 0;
	}
	*end = '\0';

	*bigp = buf;
	return (0);
}

/*
 * cursor_count_items --
 *	Reset the cursor and step through it to the end, storing the number
 *	of entries visited in *countp. The walk must end with WT_NOTFOUND.
 */
static void
cursor_count_items(WT_CURSOR *cursor, uint64_t *countp)
{
	int ret;

	*countp = 0;
	testutil_check(cursor->reset(cursor));

	for (;;) {
		ret = cursor->next(cursor);
		if (ret != 0)
			break;
		++*countp;
	}
	testutil_assert(ret == WT_NOTFOUND);
}

/*
 * disable_failures --
 *	Turn fault injection off by setting WT_FAIL_FS_ENABLE to "0" in the
 *	environment.
 */
static void
disable_failures(void)
{
	const int overwrite = 1;

	testutil_check(setenv("WT_FAIL_FS_ENABLE", "0", overwrite));
}

/*
 * enable_failures --
 *	Turn fault injection on through the environment: set
 *	WT_FAIL_FS_ENABLE to "1" and publish the write and read allowances
 *	as decimal strings.
 */
static void
enable_failures(uint64_t allow_writes, uint64_t allow_reads)
{
	const int overwrite = 1;
	char reads_str[100], writes_str[100];

	/* Format both counters up front, then export everything. */
	testutil_check(__wt_snprintf(
	    writes_str, sizeof(writes_str), "%" PRIu64, allow_writes));
	testutil_check(__wt_snprintf(
	    reads_str, sizeof(reads_str), "%" PRIu64, allow_reads));

	testutil_check(setenv("WT_FAIL_FS_ENABLE", "1", overwrite));
	testutil_check(setenv("WT_FAIL_FS_WRITE_ALLOW", writes_str, overwrite));
	testutil_check(setenv("WT_FAIL_FS_READ_ALLOW", reads_str, overwrite));
}

/*
 * generate_key --
 *	Produce the key for record number i; both the "subtest" and
 *	"subtest2" tables use it. The key is the record number converted to
 *	int.
 */
static void
generate_key(uint64_t i, int *keyp)
{
	int key;

	key = (int)i;
	*keyp = key;
}

/*
 * generate_value --
 *	Produce the value columns of the "subtest" table for record number i:
 *	three integers that are fixed multiples of i, and a string that is
 *	the tail of the reference string starting at offset
 *	rndint % BIG_SIZE.
 */
static void
generate_value(uint32_t rndint, uint64_t i, char *bigref,
    int *v0p, int *v1p, int *v2p, char **bigp)
{
	uint64_t scaled0, scaled1, scaled2;

	/* The products are computed in 64 bits, then converted to int. */
	scaled0 = i * 7;
	scaled1 = i * 10007;
	scaled2 = i * 100000007;

	*v0p = (int)scaled0;
	*v1p = (int)scaled1;
	*v2p = (int)scaled2;
	*bigp = bigref + (rndint % BIG_SIZE);
}

/*
 * run_check_subtest --
 *	Run the subtest in a child process with the given parameters, then
 *	check the resulting home directory in this process.
 *
 *	opts        test options; home, progname, nrecords and verbose are
 *	            read here
 *	debugger    optional program to run the subtest under, or NULL
 *	nops        operation count passed to the child with "-o"
 *	close_test  selects the "subtest_close" variant over "subtest"
 *	nresultsp   receives the record count reported by check_results()
 */
static void
run_check_subtest(TEST_OPTS *opts, const char *debugger, uint64_t nops,
    bool close_test, uint64_t *nresultsp)
{
	int estatus, narg;
	/* Room for the 20 decimal digits of the largest uint64_t plus NUL. */
	char rarg[21], sarg[21], *subtest_args[MAX_ARGS];

	/* Keep the value printed below defined whatever run_process does. */
	estatus = 0;

	narg = 0;
	if (debugger != NULL) {
		subtest_args[narg++] = (char *)debugger;
		subtest_args[narg++] = (char *)"--";
	}

	subtest_args[narg++] = (char *)opts->progname;
	/* "subtest" must appear before arguments */
	if (close_test)
		subtest_args[narg++] = (char *)"subtest_close";
	else
		subtest_args[narg++] = (char *)"subtest";
	subtest_args[narg++] = (char *)"-h";
	subtest_args[narg++] = opts->home;
	subtest_args[narg++] = (char *)"-v";	/* subtest is always verbose */
	subtest_args[narg++] = (char *)"-p";
	subtest_args[narg++] = (char *)"-o";
	testutil_check(__wt_snprintf(sarg, sizeof(sarg), "%" PRIu64, nops));
	subtest_args[narg++] = sarg;		/* number of operations */
	subtest_args[narg++] = (char *)"-n";
	testutil_check(__wt_snprintf(
	    rarg, sizeof(rarg), "%" PRIu64, opts->nrecords));
	subtest_args[narg++] = rarg;		/* number of records */
	subtest_args[narg++] = NULL;
	testutil_assert(narg <= MAX_ARGS);
	if (opts->verbose)
		printf("running a separate process with %" PRIu64
		    " operations until fail...\n", nops);

	/* Start every run from an empty home directory. */
	testutil_clean_work_dir(opts->home);
	testutil_check(run_process(
	    opts, debugger != NULL ? debugger : opts->progname,
	    subtest_args, &estatus));
	if (opts->verbose)
		printf("process exited %d\n", estatus);

	/*
	 * Verify results in parent process. The child's exit status is only
	 * reported, never checked: the child is expected to fail sometimes.
	 */
	testutil_check(check_results(opts, nresultsp));
}

/*
 * run_check_subtest_range --
 *
 * Run successive tests via binary search that determines the approximate
 * crossover point between when data is recoverable or not. Once that is
 * determined, run the subtest in a range near that crossover point.
 *
 * The theory is that running at the crossover point will tend to trigger
 * "interesting" failures at the borderline when the checkpoint is about to,
 * or has, succeeded.  If any of those failures creates a WT home directory
 * that cannot be recovered, the top level test will fail.
 *
 *	opts        test options, passed through to run_check_subtest()
 *	debugger    optional debugger program, or NULL
 *	close_test  whether the subtest closes its handles after a failure
 */
static void
run_check_subtest_range(TEST_OPTS *opts, const char *debugger, bool close_test)
{
	const uint64_t spread = 10;
	uint64_t cutoff, first, high, low, mid, nops, nresults;
	int i;
	bool above_cutoff, at_or_below_cutoff;

	if (opts->verbose)
		printf("Determining best range of operations until failure, "
		    "with close_test %s.\n",
		    (close_test ? "enabled" : "disabled"));

	/* Baseline: the record count found when one operation is allowed. */
	run_check_subtest(opts, debugger, 1, close_test, &cutoff);

	/*
	 * Binary search for the operation count where the record count
	 * first rises above the baseline.
	 */
	low = 0;
	high = MAX_OP_RANGE;
	mid = (low + high) / 2;
	while (mid != low) {
		run_check_subtest(opts, debugger, mid, close_test,
		    &nresults);
		if (nresults > cutoff)
			high = mid;
		else
			low = mid;
		mid = (low + high) / 2;
	}
	/*
	 * mid is the number of ops that is the crossover point.
	 * Run some tests near that point to try to trigger weird
	 * failures.  If mid is too low or too high, it indicates
	 * there is a fundamental problem with the test.
	 */
	testutil_assert(mid > 1 && mid < MAX_OP_RANGE - 1);
	if (opts->verbose)
		printf("Retesting around %" PRIu64 " operations.\n",
		    mid);

	/*
	 * The crossover may be smaller than the spread; clamp the start of
	 * the window so the unsigned subtraction cannot wrap around (which
	 * would skip the whole loop).
	 */
	first = mid > spread ? mid - spread : 1;

	above_cutoff = false;
	at_or_below_cutoff = false;
	for (nops = first; nops < mid + spread; nops++) {
		for (i = 0; i < TESTS_PER_OP_VALUE; i++) {
			run_check_subtest(opts, debugger, nops,
			    close_test, &nresults);
			if (nresults > cutoff)
				above_cutoff = true;
			else
				at_or_below_cutoff = true;
		}
	}
	/*
	 * Check that it really ran with a crossover point: both kinds of
	 * outcome must have been seen.
	 */
	testutil_assert(above_cutoff);
	testutil_assert(at_or_below_cutoff);
}

/*
 * run_process --
 *	Run a program with arguments, wait until it completes.
 *
 *	opts    test options; only the verbose flag is read
 *	prog    path of the program to execute
 *	argv    NULL-terminated argument vector for the program
 *	status  receives the wait status reported by waitpid()
 *
 *	Returns zero on success, or the errno value if the process could not
 *	be created or waited for.
 */
static int
run_process(TEST_OPTS *opts, const char *prog, char *argv[], int *status)
{
	pid_t pid, waited;
	char **arg;

	if (opts->verbose) {
		printf("running: ");
		for (arg = argv; *arg != NULL; arg++)
			printf("%s ", *arg);
		printf("\n");
	}

	/*
	 * Flush buffered output before forking so the child does not start
	 * with a copy of it.
	 */
	(void)fflush(stdout);
	(void)fflush(stderr);

	if ((pid = fork()) < 0)
		return (errno);
	if (pid == 0) {
		/* Child: execv only returns on failure. */
		(void)execv(prog, argv);
		testutil_die(errno, "%s", prog);
	}

	/* Parent: retry the wait if it is interrupted by a signal. */
	do {
		waited = waitpid(pid, status, 0);
	} while (waited < 0 && errno == EINTR);
	if (waited < 0)
		return (errno);
	return (0);
}

/*
 * subtest_error_handler --
 *     Error callback installed in the subtest's event handler. A panic ends
 *     the process immediately with status 1; every other error is ignored.
 */
static int
subtest_error_handler(WT_EVENT_HANDLER *handler,
    WT_SESSION *session, int error, const char *message)
{
	/* Only the error code is of interest. */
	(void)handler;
	(void)session;
	(void)message;

	if (error != WT_PANIC)
		return (0);

	/* Exit on panic, there's no checking to be done. */
	exit(1);
}

/*
 * Event handler passed to wiredtiger_open() by subtest_main(). Only the
 * error callback is supplied; the remaining callbacks are left NULL.
 */
static WT_EVENT_HANDLER event_handler = {
	subtest_error_handler,	/* Error handler */
	NULL,   /* Message handler */
	NULL,   /* Progress handler */
	NULL    /* Close handler */
};

/*
 * subtest_main --
 *	The main program for the subtest: parse the options, redirect
 *	stdout/stderr into the home directory, open the database with the
 *	fail_fs extension loaded, create the schema and populate it.
 *
 *	argc, argv  arguments, with argv[0] being the subtest keyword
 *	close_test  passed to subtest_populate(); selects whether handles
 *	            are closed after an injected failure
 */
static void
subtest_main(int argc, char *argv[], bool close_test)
{
	struct rlimit rlim;
	TEST_OPTS *opts, _opts;
	WT_SESSION *session;
	char config[1024], filename[1024];

	opts = &_opts;
	memset(opts, 0, sizeof(*opts));
	/* A zeroed limit disables core files when applied below. */
	memset(&rlim, 0, sizeof(rlim));

	/* No core files during fault injection tests. */
	testutil_check(setrlimit(RLIMIT_CORE, &rlim));
	testutil_check(testutil_parse_opts(argc, argv, opts));
	testutil_make_work_dir(opts->home);

	/* Redirect stderr, stdout. */
	testutil_check(__wt_snprintf(
	    filename, sizeof(filename), "%s/%s", opts->home, STDERR_FILE));
	testutil_assert(freopen(filename, "a", stderr) != NULL);
	testutil_check(__wt_snprintf(
	    filename, sizeof(filename), "%s/%s", opts->home, STDOUT_FILE));
	testutil_assert(freopen(filename, "a", stdout) != NULL);

	/*
	 * Load the fail_fs extension early and let it take its settings
	 * from the environment. The two trailing parentheses close the
	 * extension's own configuration and the "extensions=(" list.
	 */
	testutil_check(__wt_snprintf(config, sizeof(config),
	    "create,cache_size=250M,log=(enabled),"
	    "transaction_sync=(enabled,method=none),extensions=("
	    WT_FAIL_FS_LIB
	    "=(early_load,config={environment=true,verbose=true}))"));

	testutil_check(
	    wiredtiger_open(opts->home, &event_handler, config, &opts->conn));
	testutil_check(
	    opts->conn->open_session(opts->conn, NULL, NULL, &session));

	/* The main table, with one index per integer value column. */
	testutil_check(session->create(session, "table:subtest",
	    "key_format=i,value_format=iiiS,"
	    "columns=(id,v0,v1,v2,big)"));

	/* The second table, updated with the same keys as the first. */
	testutil_check(session->create(session, "table:subtest2",
	    "key_format=i,value_format=i"));

	testutil_check(session->create(session, "index:subtest:v0",
	    "columns=(v0)"));
	testutil_check(session->create(session, "index:subtest:v1",
	    "columns=(v1)"));
	testutil_check(session->create(session, "index:subtest:v2",
	    "columns=(v2)"));

	testutil_check(session->close(session, NULL));

	subtest_populate(opts, close_test);

	testutil_cleanup(opts);
}

/*
 * This macro is used as a substitute for testutil_check, except that it is
 * aware of when a failure may be expected due to the effects of the fail_fs.
 * This macro is used only in subtest_populate(), it uses local variables.
 *
 *	expr      expression yielding zero on success or an error code
 *	failmode  when true, WT_RUN_RECOVERY and EIO are treated as expected:
 *	          the caller's local "failed" is set instead of aborting
 *
 * Any other non-zero result is reported on stderr and handed to
 * testutil_check. The body is wrapped in do/while (0) so the macro behaves
 * as a single statement, and both arguments are parenthesized so compound
 * expressions expand as intended.
 */
#define	CHECK(expr, failmode) do {					\
	int _ret;							\
	_ret = (expr);							\
	if (_ret != 0) {						\
		if (!(failmode) ||					\
		    (_ret != WT_RUN_RECOVERY && _ret != EIO)) {		\
			fprintf(stderr, "  BAD RETURN %d for \"%s\"\n", \
			    _ret, #expr);				\
			testutil_check(_ret);				\
		} else							\
			failed = true;					\
	}								\
} while (0)

/*
 * subtest_populate --
 *	Insert records into both tables, one transaction per record. A
 *	checkpoint is taken after the first record; at one hundredth of the
 *	way through, a second checkpoint is attempted with fault injection
 *	enabled, and the loop stops if that checkpoint fails.
 */
static void
subtest_populate(TEST_OPTS *opts, bool close_test)
{
	WT_CURSOR *maincur, *maincur2;
	WT_RAND_STATE rnd;
	WT_SESSION *session;
	uint64_t inject_at, rec, total;
	uint32_t rndint;
	int key, v0, v1, v2;
	char *bigref, *tail;
	bool failed;

	/* CHECK sets this local when an expected failure is seen. */
	failed = false;
	__wt_random_init_seed(NULL, &rnd);
	CHECK(create_big_string(&bigref), false);
	total = opts->nrecords;
	inject_at = total / 100;

	CHECK(opts->conn->open_session(
	    opts->conn, NULL, NULL, &session), false);

	CHECK(session->open_cursor(session, "table:subtest", NULL,
	    NULL, &maincur), false);

	CHECK(session->open_cursor(session, "table:subtest2", NULL,
	    NULL, &maincur2), false);

	for (rec = 0; !failed && rec < total; rec++) {
		rndint = __wt_random(&rnd);
		generate_key(rec, &key);
		generate_value(rndint, rec, bigref, &v0, &v1, &v2, &tail);

		/* Both tables are updated in the same transaction. */
		CHECK(session->begin_transaction(session, NULL), false);
		maincur->set_key(maincur, key);
		maincur->set_value(maincur, v0, v1, v2, tail);
		CHECK(maincur->insert(maincur), false);

		maincur2->set_key(maincur2, key);
		maincur2->set_value(maincur2, rndint);
		CHECK(maincur2->insert(maincur2), false);
		CHECK(session->commit_transaction(session, NULL), false);

		/*
		 * Force an initial checkpoint, that helps to distinguish a
		 * clear failure from just not running long enough.
		 */
		if (rec == 0) {
			CHECK(session->checkpoint(session, NULL), false);
		}

		if (opts->verbose && (rec + 1) % VERBOSE_PRINT == 0)
			printf("  %" PRIu64 "/%" PRIu64 "\n",
			    (rec + 1), total);

		/* Attempt to isolate the failures to checkpointing. */
		if (rec != inject_at)
			continue;

		enable_failures(opts->nops, 1000000);
		/* CHECK should expect failures. */
		CHECK(session->checkpoint(session, NULL), true);
		disable_failures();
		if (failed && opts->verbose)
			printf("checkpoint failed (expected).\n");
	}

	/*
	 * Closing handles after an extreme fail is likely to cause
	 * cascading failures (or crashes), so recommended practice is
	 * to immediately exit. We're interested in testing both with
	 * and without the recommended practice.
	 */
	if (failed) {
		if (close_test)
			fprintf(stderr, "closing after failure.\n");
		else {
			fprintf(stderr, "exit early.\n");
			exit(0);
		}
	}

	free(bigref);
	CHECK(maincur->close(maincur), false);
	CHECK(maincur2->close(maincur2), false);
	CHECK(session->close(session, NULL), false);
}

/*
 * main --
 *	The main program for the test. When invoked with "subtest"
 *	argument, run the subtest. Otherwise, run a separate process
 *	for each needed subtest, and check the results.
 *
 *	Positional arguments after the options:
 *	    subtest         run the populate subtest, exiting early on failure
 *	    subtest_close   run the populate subtest, closing after failure
 *	    gdb             run the child processes under /usr/bin/gdb
 */
int
main(int argc, char *argv[])
{
	TEST_OPTS *opts, _opts;
	uint64_t nresults;
	const char *debugger;

	/* Ignore unless requested */
	if (!testutil_is_flag_set("TESTUTIL_ENABLE_LONG_TESTS"))
		return (EXIT_SUCCESS);

	opts = &_opts;
	memset(opts, 0, sizeof(*opts));
	debugger = NULL;

	testutil_check(testutil_parse_opts(argc, argv, opts));
	/* Skip the options that were just parsed. */
	argc -= __wt_optind;
	argv += __wt_optind;
	if (opts->nrecords == 0)
		opts->nrecords = 50000;

	while (argc > 0) {
		if (strcmp(argv[0], "subtest") == 0) {
			subtest_main(argc, argv, false);
			return (0);
		} else if (strcmp(argv[0], "subtest_close") == 0) {
			subtest_main(argc, argv, true);
			return (0);
		} else if (strcmp(argv[0], "gdb") == 0)
			debugger = "/usr/bin/gdb";
		else
			testutil_assert(false);
		argc--;
		argv++;
	}
	if (opts->verbose) {
		printf("Number of operations until failure: %" PRIu64
		    "  (change with -o N)\n", opts->nops);
		printf("Number of records: %" PRIu64
		    "  (change with -n N)\n", opts->nrecords);
	}
	if (opts->nops == 0) {
		/* No operation count given: search for the crossover. */
		run_check_subtest_range(opts, debugger, false);
		run_check_subtest_range(opts, debugger, true);
	} else {
		/*
		 * A specific operation count was requested. As in the range
		 * case, exercise it both without and with closing handles
		 * after a failure; the fourth argument is the close_test
		 * flag, not a record count.
		 */
		run_check_subtest(opts, debugger, opts->nops,
		    false, &nresults);
		run_check_subtest(opts, debugger, opts->nops,
		    true, &nresults);
	}

	testutil_clean_work_dir(opts->home);
	testutil_cleanup(opts);

	return (0);
}