# Resharding consistency test.
# Lists are written across the cluster while slots are being resharded in
# the background. The stored data is then compared with a copy tracked on
# the Tcl side, both right after the workload and after every instance has
# been terminated and restarted.

source "../tests/includes/init-tests.tcl"
source "../../../tests/support/cli.tcl"

test "Create a 5 nodes cluster" {
    create_cluster 5 5
}

test "Cluster is up" {
    assert_cluster_state ok
}

test "Enable AOF in all the instances" {
    # First pass: turn AOF on everywhere.
    foreach_redis_id node_id {
        R $node_id config set appendonly yes
        # "appendfsync no" is used because it is fast while write(2) is
        # still performed before the reply is sent to the client.
        R $node_id config set appendfsync no
    }

    # Second pass: wait until each instance reports AOF as enabled and
    # no AOF rewrite in progress.
    foreach_redis_id node_id {
        wait_for_condition 1000 500 {
            [RI $node_id aof_rewrite_in_progress] == 0 &&
            [RI $node_id aof_enabled] == 1
        } else {
            fail "Failed to enable AOF on instance #$node_id"
        }
    }
}

# Tell whether a process with the given PID is still in execution.
#
# pid - process identifier handed to "ps -p".
#
# Returns 1 when "ps -p $pid" succeeds, 0 when it raises an error.
proc process_is_running {pid} {
    # catch yields non-zero when ps fails, which is expected to happen for
    # a PID that does not exist; negate it so that "running" is the true
    # value.
    set ps_failed [catch {exec ps -p $pid}]
    expr {!$ps_failed}
}

# Our resharding test performs the following actions:
#
# - N commands are sent to the cluster in the course of the test.
# - Every command selects a random key from key:0 to key:MAX-1.
# - The operation RPUSH key <randomvalue> is performed.
# - Tcl remembers into an array all the values pushed to each list.
# - After N/2 commands, the resharding process is started in background.
# - The test continues while the resharding is in progress.
# - At the end of the test, we wait for the resharding process to stop.
# - Finally the keys are checked to see if they contain the value they should.
set numkeys 50000   ;# Keys are named key:<n>, with n drawn by randomInt $numkeys.
set numops 200000   ;# Total number of RPUSH operations issued by the test.
set start_node_port [get_instance_attrib redis 0 port]
set cluster [redis_cluster 127.0.0.1:$start_node_port]
if {$::tls} {
    # setup a non-TLS cluster client to the TLS cluster
    set plaintext_port [get_instance_attrib redis 0 plaintext-port]
    # NOTE(review): the trailing 0 presumably disables TLS for this
    # client; confirm against redis_cluster.
    set cluster_plaintext [redis_cluster 127.0.0.1:$plaintext_port 0]
    puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
} else {
    set cluster_plaintext $cluster
    puts "Testing using non-TLS cluster"
}

# content(key) holds, in push order, every value written to that key.
catch {unset content}
array set content {}

# PID of the background resharding process, or {} when none is tracked.
set tribpid {}

test "Cluster consistency during live resharding" {
    set ele 0
    for {set j 0} {$j < $numops} {incr j} {
        # Every 10000 operations, check whether the background resharding
        # process has exited. Clearing tribpid allows the block below to
        # start another one.
        if {$tribpid ne {} &&
            ($j % 10000) == 0 &&
            ![process_is_running $tribpid]} {
            set tribpid {}
        }

        # Trigger the resharding once we execute half the ops, and again
        # each time the previous resharding process was found terminated.
        if {$j >= $numops/2 && $tribpid eq {}} {
            puts -nonewline "...Starting resharding..."
            flush stdout
            set target [dict get [get_myself [randomInt 5]] id]
            # The pipeline runs in the background ("&"), so exec returns
            # the list of its process IDs; the first one (redis-cli) is
            # the process we track.
            set tribpid [lindex [exec \
                ../../../src/redis-cli --cluster reshard \
                127.0.0.1:[get_instance_attrib redis 0 port] \
                --cluster-from all \
                --cluster-to $target \
                --cluster-slots 100 \
                --cluster-yes \
                {*}[rediscli_tls_config "../../../tests"] \
                | [info nameofexecutable] \
                ../tests/helpers/onlydots.tcl \
                &] 0]
        }

        # Write random data to random list.
        set listid [randomInt $numkeys]
        set key "key:$listid"
        incr ele
        # We write both with Lua scripts and with plain commands.
        # This way we are able to stress Lua -> Redis command invocation
        # as well, that has tests to prevent Lua to write into wrong
        # hash slots.
        # We also use both TLS and plaintext connections.
        if {$listid % 3 == 0} {
            $cluster rpush $key $ele
        } elseif {$listid % 3 == 1} {
            $cluster_plaintext rpush $key $ele
        } else {
            $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
        }
        # Mirror the write in the Tcl-side copy used for verification.
        lappend content($key) $ele

        # Progress indicator, once every 1000 operations.
        if {($j % 1000) == 0} {
            puts -nonewline W; flush stdout
        }
    }

    # Wait for the resharding process to end
    wait_for_condition 1000 500 {
        [process_is_running $tribpid] == 0
    } else {
        fail "Resharding is not terminating after some time."
    }

}

test "Verify $numkeys keys for consistency with logical content" {
    # Check that the Redis Cluster content matches our logical content.
    foreach {key value} [array get content] {
        # Read the list once, so that the failure message reports exactly
        # the value that failed the comparison.
        set actual [$cluster lrange $key 0 -1]
        if {$actual ne $value} {
            fail "Key $key expected to hold '$value' but actual content is $actual"
        }
    }
}

test "Terminate and restart all the instances" {
    foreach_redis_id id {
        # Stop AOF so that an initial AOFRW won't prevent the instance from terminating
        R $id config set appendonly no
        kill_instance redis $id
        restart_instance redis $id
    }
}

test "Cluster should eventually be up again" {
    assert_cluster_state ok
}

test "Verify $numkeys keys after the restart" {
    # Check that the Redis Cluster content matches our logical content.
    foreach {key value} [array get content] {
        # Read the list once, so that the failure message reports exactly
        # the value that failed the comparison.
        set actual [$cluster lrange $key 0 -1]
        if {$actual ne $value} {
            fail "Key $key expected to hold '$value' but actual content is $actual"
        }
    }
}

test "Disable AOF in all the instances" {
    foreach_redis_id id {
        R $id config set appendonly no
    }
}

test "Verify slaves consistency" {
    # Counts every slave whose digest was found equal to its master's.
    set verified_masters 0
    foreach_redis_id id {
        # Only masters drive the outer loop.
        set myrole [lindex [R $id role] 0]
        if {$myrole eq {slave}} continue
        set masterport [get_instance_attrib redis $id port]
        set masterdigest [R $id debug digest]
        foreach_redis_id sid {
            set srole [R $sid role]
            if {[lindex $srole 0] eq {master}} continue
            # Skip slaves whose ROLE reply does not carry this master's
            # port as its third element.
            if {[lindex $srole 2] != $masterport} continue
            wait_for_condition 1000 500 {
                [R $sid debug digest] eq $masterdigest
            } else {
                fail "Master and slave data digest are different"
            }
            incr verified_masters
        }
    }
    assert {$verified_masters >= 5}
}

test "Dump sanitization was skipped for migrations" {
    foreach_redis_id id {
        assert {[RI $id dump_payload_sanitizations] == 0}
    }
}