1# Failover stress test.
2# In this test a different node is killed in a loop for N
3# iterations. The test checks that certain properties
4# are preserved across iterations.
5
6source "../tests/includes/init-tests.tcl"
7
# Bootstrap step: build the cluster every later test in this file runs against.
# NOTE(review): the two arguments are presumably the master and replica counts
# (the stress loop picks victims among 10 instances) - confirm in create_cluster.
test {Create a 5 nodes cluster} {
    create_cluster 5 5
}
11
# Precondition for the stress loop: the cluster must report state "ok"
# before any node is killed.
test {Cluster is up} {
    assert_cluster_state ok
}
15
# Number of kill/restart rounds performed by the loop below.
set iterations 20
# Cluster client handle, created from the address of redis instance #0.
set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]

# Main stress loop.
#
# The counter is tested BEFORE it is decremented, so the body runs exactly
# $iterations times (labelled $iterations down to 1) and is skipped entirely
# when the count is zero or negative. A pre-decrement inside the condition
# would run one round fewer and would never terminate for a count <= 0.
#
# Every round:
#   1. picks a random victim instance and records its role;
#   2. if the victim is a master, locates its slave and waits for it to sync;
#   3. writes 100 keys through the cluster client;
#   4. kills the victim and, for a master, waits for the slave's epoch to grow;
#   5. checks the cluster is ok and writable again;
#   6. restarts the victim, expects it to come back as a slave, and reads
#      back every key written in this round.
for {} {$iterations > 0} {incr iterations -1} {
    # Victim id (presumably 0..9 - confirm randomInt's range) and a fresh
    # random key/value prefix so rounds never collide with each other.
    set tokill [randomInt 10]
    set key [randstring 20 20 alpha]
    set val [randstring 20 20 alpha]
    set role [RI $tokill role]
    if {$role eq {master}} {
        # Look for the instance whose "slaveof" field equals the victim's
        # node id; that instance is the one expected to take over.
        set slave {}
        set myid [dict get [get_myself $tokill] id]
        foreach_redis_id id {
            if {$id == $tokill} continue
            if {[dict get [get_myself $id] slaveof] eq $myid} {
                set slave $id
            }
        }
        if {$slave eq {}} {
            fail "Unable to retrieve slave's ID for master #$tokill"
        }
    }

    puts "--- Iteration $iterations ---"

    if {$role eq {master}} {
        test "Wait for slave of #$tokill to sync" {
            wait_for_condition 1000 50 {
                [string match {*state=online*} [RI $tokill slave0]]
            } else {
                fail "Slave of node #$tokill is not ok"
            }
        }
        # Remember the slave's epoch before the kill; the failover check
        # further down waits for it to become greater than this value.
        set slave_config_epoch [CI $slave cluster_my_epoch]
    }

    test "Cluster is writable before failover" {
        for {set i 0} {$i < 100} {incr i} {
            # catch stores either the reply or the error text in err, so a
            # failed write shows up as a value different from OK.
            catch {$cluster set $key:$i $val:$i} err
            assert {$err eq {OK}}
        }
        # Wait for the write to propagate to the slave if we
        # are going to kill a master.
        if {$role eq {master}} {
            R $tokill wait 1 20000
        }
    }

    test "Terminating node #$tokill" {
        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
        R $tokill config set appendonly no
        kill_instance redis $tokill
    }

    if {$role eq {master}} {
        test "Wait failover by #$slave with old epoch $slave_config_epoch" {
            wait_for_condition 1000 50 {
                [CI $slave cluster_my_epoch] > $slave_config_epoch
            } else {
                fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
            }
        }
    }

    test "Cluster should eventually be up again" {
        assert_cluster_state ok
    }

    test "Cluster is writable again" {
        # Second batch of writes, stored under a ":2" suffix so it can be
        # told apart from the batch written before the kill.
        for {set i 0} {$i < 100} {incr i} {
            catch {$cluster set $key:$i:2 $val:$i:2} err
            assert {$err eq {OK}}
        }
    }

    test "Restarting node #$tokill" {
        restart_instance redis $tokill
    }

    test "Instance #$tokill is now a slave" {
        wait_for_condition 1000 50 {
            [RI $tokill role] eq {slave}
        } else {
            fail "Restarted instance is not a slave"
        }
    }

    test "We can read back the value we set before" {
        # Both batches (before the kill and after the failover) must still
        # be readable through the cluster client.
        for {set i 0} {$i < 100} {incr i} {
            catch {$cluster get $key:$i} err
            assert {$err eq "$val:$i"}
            catch {$cluster get $key:$i:2} err
            assert {$err eq "$val:$i:2"}
        }
    }
}
112
# Final invariant, checked once after the stress loop has finished: on every
# redis instance the cluster_current_epoch value read through CI must be
# greater than or equal to that same instance's cluster_my_epoch.
test "Post condition: current_epoch >= my_epoch everywhere" {
    foreach_redis_id id {
        # Both fields are read from the same instance id.
        assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
    }
}
118