# Failover stress test.
#
# In this test a randomly chosen node is killed and restarted in a loop for
# N iterations. On every iteration the test checks that certain properties
# are preserved:
#   - the cluster accepts writes before the node is killed;
#   - if the killed node was a master, its slave's config epoch increases
#     (taken here as the evidence that a failover happened);
#   - the cluster returns to the "ok" state and accepts writes again;
#   - the restarted node reports the slave role;
#   - every key written during the iteration can be read back.

source "../tests/includes/init-tests.tcl"

test "Create a 5 nodes cluster" {
    create_cluster 5 5
}

test "Cluster is up" {
    assert_cluster_state ok
}

# Number of kill/restart cycles to run.
set iterations 20
set cluster [redis_cluster 127.0.0.1:[get_instance_attrib redis 0 port]]

# Count down from $iterations to 1 so that the body runs exactly $iterations
# times. (Decrementing inside the loop condition before the first pass would
# silently skip one iteration.)
for {} {$iterations > 0} {incr iterations -1} {
    # Pick the victim. NOTE(review): the literal 10 presumably matches the
    # total number of instances created by "create_cluster 5 5" -- confirm
    # if the cluster size is ever changed.
    set tokill [randomInt 10]

    # Fresh random key prefix / value prefix for this iteration.
    set key [randstring 20 20 alpha]
    set val [randstring 20 20 alpha]

    set role [RI $tokill role]
    if {$role eq {master}} {
        # Find the instance whose "slaveof" field is the victim's node ID,
        # so we can later verify that it performed the failover.
        set slave {}
        set myid [dict get [get_myself $tokill] id]
        foreach_redis_id id {
            if {$id == $tokill} continue
            if {[dict get [get_myself $id] slaveof] eq $myid} {
                set slave $id
            }
        }
        if {$slave eq {}} {
            fail "Unable to retrieve slave's ID for master #$tokill"
        }
    }

    puts "--- Iteration $iterations ---"

    if {$role eq {master}} {
        test "Wait for slave of #$tokill to sync" {
            wait_for_condition 1000 50 {
                [string match {*state=online*} [RI $tokill slave0]]
            } else {
                fail "Slave of node #$tokill is not ok"
            }
        }
        # Remember the slave's epoch before the kill; a larger value later
        # is what the "Wait failover" test below looks for.
        set slave_config_epoch [CI $slave cluster_my_epoch]
    }

    test "Cluster is writable before failover" {
        for {set i 0} {$i < 100} {incr i} {
            # On error catch stores the error message in err, so the
            # assertion fails for anything other than an OK reply.
            catch {$cluster set $key:$i $val:$i} err
            assert {$err eq {OK}}
        }
        # Wait for the write to propagate to the slave if we
        # are going to kill a master.
        if {$role eq {master}} {
            R $tokill wait 1 20000
        }
    }

    test "Terminating node #$tokill" {
        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
        R $tokill config set appendonly no
        kill_instance redis $tokill
    }

    if {$role eq {master}} {
        test "Wait failover by #$slave with old epoch $slave_config_epoch" {
            wait_for_condition 1000 50 {
                [CI $slave cluster_my_epoch] > $slave_config_epoch
            } else {
                fail "No failover detected, epoch is still [CI $slave cluster_my_epoch]"
            }
        }
    }

    test "Cluster should eventually be up again" {
        assert_cluster_state ok
    }

    test "Cluster is writable again" {
        # Second batch of keys, distinguished by the ":2" suffix.
        for {set i 0} {$i < 100} {incr i} {
            catch {$cluster set $key:$i:2 $val:$i:2} err
            assert {$err eq {OK}}
        }
    }

    test "Restarting node #$tokill" {
        restart_instance redis $tokill
    }

    test "Instance #$tokill is now a slave" {
        wait_for_condition 1000 50 {
            [RI $tokill role] eq {slave}
        } else {
            fail "Restarted instance is not a slave"
        }
    }

    test "We can read back the value we set before" {
        # Verify both the pre-kill batch and the post-failover batch.
        for {set i 0} {$i < 100} {incr i} {
            catch {$cluster get $key:$i} err
            assert {$err eq "$val:$i"}
            catch {$cluster get $key:$i:2} err
            assert {$err eq "$val:$i:2"}
        }
    }
}

test "Post condition: current_epoch >= my_epoch everywhere" {
    foreach_redis_id id {
        assert {[CI $id cluster_current_epoch] >= [CI $id cluster_my_epoch]}
    }
}