# Resharding stress test.
# In this test the cluster is resharded in the background while a write
# workload is in progress. The test checks that the data set is still
# consistent after the resharding and after a restart of all the instances.
5
6source "../tests/includes/init-tests.tcl"
7source "../../../tests/support/cli.tcl"
8
# Bootstrap step: build the cluster every later test in this file relies on.
# NOTE(review): the two arguments are presumably the number of masters and
# the number of replicas -- confirm against create_cluster.
test "Create a 5 nodes cluster" {
    create_cluster 5 5
}
12
# Sanity check before any load is generated: the cluster state must be "ok".
test "Cluster is up" {
    assert_cluster_state ok
}
16
# Turn on AOF persistence everywhere, then wait until every instance reports
# that AOF is enabled and that no AOF rewrite is in progress.
test "Enable AOF in all the instances" {
    # Phase 1: push the configuration to every instance before waiting on
    # any of them.
    foreach_redis_id id {
        # "appendfsync no" is chosen because it is fast while still
        # guaranteeing that write(2) is performed before the client
        # receives its reply.
        foreach {aof_opt aof_val} {appendonly yes appendfsync no} {
            R $id config set $aof_opt $aof_val
        }
    }

    # Phase 2: poll each instance (up to 1000 attempts, 500 as the second
    # wait_for_condition argument) until the AOF is fully active.
    foreach_redis_id id {
        wait_for_condition 1000 500 {
            [RI $id aof_rewrite_in_progress] == 0 &&
            [RI $id aof_enabled] == 1
        } else {
            fail "Failed to enable AOF on instance #$id"
        }
    }
}
34
# Tell whether a process with the given PID is still in execution.
# Returns 1 when "ps -p <pid>" succeeds, 0 otherwise.
proc process_is_running {pid} {
    # exec raises a Tcl error when ps fails, which is what ps is expected
    # to do for a PID that does not exist. catch turns that error into a
    # non-zero result, so a non-zero catch means "not running".
    if {[catch {exec ps -p $pid}]} {
        return 0
    }
    return 1
}
43
44# Our resharding test performs the following actions:
45#
46# - N commands are sent to the cluster in the course of the test.
47# - Every command selects a random key from key:0 to key:MAX-1.
48# - The operation RPUSH key <randomvalue> is performed.
49# - Tcl remembers into an array all the values pushed to each list.
50# - After N/2 commands, the resharding process is started in background.
51# - The test continues while the resharding is in progress.
52# - At the end of the test, we wait for the resharding process to stop.
53# - Finally the keys are checked to see if they contain the value they should.
54
# Workload size: keys are picked at random from key:0 .. key:(numkeys-1) and
# numops write operations are performed in total.
set numkeys 50000
set numops 200000

# Cluster client bootstrapped from the port of instance #0.
set start_node_port [get_instance_attrib redis 0 port]
set cluster [redis_cluster 127.0.0.1:$start_node_port]

# A second client handle is used for part of the writes. With TLS enabled it
# is a separate client connected to the plaintext port; otherwise it is just
# an alias of the main client.
if {$::tls} {
    # setup a non-TLS cluster client to the TLS cluster
    set plaintext_port [get_instance_attrib redis 0 plaintext-port]
    set cluster_plaintext [redis_cluster 127.0.0.1:$plaintext_port 0]
    puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
} else {
    set cluster_plaintext $cluster
    puts "Testing using non-TLS cluster"
}

# Logical copy of everything pushed to the cluster (key -> list of values).
# Any previous definition is dropped so we always start from an empty array.
unset -nocomplain content
array set content {}

# PID of the background resharding process, or {} when none is tracked.
set tribpid {}
71
# Run the write workload and, from the half-way point on, keep a resharding
# process running in background. Every value pushed is also recorded in the
# content array so later tests can compare it with what the cluster holds.
test "Cluster consistency during live resharding" {
    set ele 0
    for {set j 0} {$j < $numops} {incr j} {
        # Every 10000 operations check whether the tracked resharding
        # process has exited; if so forget its PID so that the block below
        # is free to launch a new one.
        if {$tribpid ne {} && ($j % 10000) == 0} {
            if {![process_is_running $tribpid]} {
                set tribpid {}
            }
        }

        # Trigger the resharding once half of the ops were executed, and
        # again each time the previous run was found to be terminated.
        if {$j >= $numops/2 && $tribpid eq {}} {
            puts -nonewline "...Starting resharding..."
            flush stdout
            set target [dict get [get_myself [randomInt 5]] id]

            # redis-cli invocation moving 100 slots from all the nodes to
            # the randomly selected target node.
            set reshard_cmd [list \
                ../../../src/redis-cli --cluster reshard \
                127.0.0.1:[get_instance_attrib redis 0 port] \
                --cluster-from all \
                --cluster-to $target \
                --cluster-slots 100 \
                --cluster-yes \
                {*}[rediscli_tls_config "../../../tests"]]

            # The redis-cli output is piped into the onlydots.tcl helper,
            # executed with the same Tcl interpreter running this test.
            set filter_cmd [list \
                [info nameofexecutable] \
                ../tests/helpers/onlydots.tcl]

            # For a background pipeline exec returns the list of PIDs:
            # the first one belongs to redis-cli.
            set tribpid [lindex [exec {*}$reshard_cmd | {*}$filter_cmd &] 0]
        }

        # Write random data to random list.
        set listid [randomInt $numkeys]
        set key "key:$listid"
        incr ele

        # Writes are performed both via plain commands and via Lua scripts,
        # so that Lua -> Redis command invocation (which has checks to
        # prevent Lua from writing into wrong hash slots) is stressed too.
        # Both the TLS and the plaintext connections are used.
        switch -- [expr {$listid % 3}] {
            0 {
                $cluster rpush $key $ele
            }
            1 {
                $cluster_plaintext rpush $key $ele
            }
            default {
                $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
            }
        }
        lappend content($key) $ele

        # Progress indicator, one "W" every 1000 operations.
        if {($j % 1000) == 0} {
            puts -nonewline W
            flush stdout
        }
    }

    # Wait for the resharding process to end
    wait_for_condition 1000 500 {
        [process_is_running $tribpid] == 0
    } else {
        fail "Resharding is not terminating after some time."
    }

}
130
# Compare every list stored in the cluster with the values recorded in the
# content array while the workload was running.
test "Verify $numkeys keys for consistency with logical content" {
    # Check that the Redis Cluster content matches our logical content.
    foreach {key value} [array get content] {
        # Read the list only once: the failure message must show exactly
        # the data that failed the comparison, not the result of a second
        # query that may return something different.
        set actual [$cluster lrange $key 0 -1]
        if {$actual ne $value} {
            fail "Key $key expected to hold '$value' but actual content is $actual"
        }
    }
}
139
# Kill and restart the instances one after the other.
test "Terminate and restart all the instances" {
    foreach_redis_id id {
        # AOF is switched off first, so that an initial AOF rewrite cannot
        # prevent the instance from terminating.
        R $id config set appendonly no

        kill_instance redis $id
        restart_instance redis $id
    }
}
148
# After all the instances were restarted the cluster state must be "ok" again.
test "Cluster should eventually be up again" {
    assert_cluster_state ok
}
152
# Same comparison as the one done right after the resharding, repeated once
# all the instances have been restarted.
test "Verify $numkeys keys after the restart" {
    # Check that the Redis Cluster content matches our logical content.
    foreach {key value} [array get content] {
        # Read the list only once: the failure message must show exactly
        # the data that failed the comparison, not the result of a second
        # query that may return something different.
        set actual [$cluster lrange $key 0 -1]
        if {$actual ne $value} {
            fail "Key $key expected to hold '$value' but actual content is $actual"
        }
    }
}
161
# Set "appendonly no" on every instance.
test "Disable AOF in all the instances" {
    foreach_redis_id id {
        R $id config set appendonly no
    }
}
167
# For every master, wait until each of its replicas reports the same
# DEBUG DIGEST value as the master.
test "Verify slaves consistency" {
    # Despite the name, this is incremented once for every replica whose
    # digest was found to match the digest of its master.
    set verified_masters 0
    foreach_redis_id id {
        set role [R $id role]
        lassign $role myrole myoffset slaves
        # Only masters are used as the reference side of the comparison.
        if {$myrole eq {slave}} continue
        set masterport [get_instance_attrib redis $id port]
        set masterdigest [R $id debug digest]

        # Scan all the instances looking for the replicas of this master:
        # they are identified by the third element of their ROLE reply,
        # which is compared with the master port.
        foreach_redis_id sid {
            set srole [R $sid role]
            if {[lindex $srole 0] ne {master} &&
                [lindex $srole 2] == $masterport} {
                wait_for_condition 1000 500 {
                    [R $sid debug digest] eq $masterdigest
                } else {
                    fail "Master and slave data digest are different"
                }
                incr verified_masters
            }
        }
    }
    # At least 5 master/replica pairs must have been compared.
    assert {$verified_masters >= 5}
}
190
# Check that every instance reports zero dump payload sanitizations in its
# INFO output.
test "Dump sanitization was skipped for migrations" {
    foreach_redis_id id {
        assert {[RI $id dump_payload_sanitizations] == 0}
    }
}
197