1#!/usr/bin/env python3
2# Copyright (c) 2017-2018 The Bitcoin Core developers
3# Distributed under the MIT software license, see the accompanying
4# file COPYING or http://www.opensource.org/licenses/mit-license.php.
5"""Test recovery from a crash during chainstate writing.
6
7- 4 nodes
8  * node0, node1, and node2 will have different dbcrash ratios, and different
9    dbcache sizes
10  * node3 will be a regular node, with no crashing.
11  * The nodes will not connect to each other.
12
13- use default test framework starting chain. initialize starting_tip_height to
14  tip height.
15
16- Main loop:
17  * generate lots of transactions on node3, enough to fill up a block.
18  * uniformly randomly pick a tip height from starting_tip_height to
19    tip_height; with probability 1/(height_difference+4), invalidate this block.
20  * mine enough blocks to overtake tip_height at start of loop.
21  * for each node in [node0,node1,node2]:
22     - for each mined block:
23       * submit block to node
24       * if node crashed on/after submitting:
25         - restart until recovery succeeds
26         - check that utxo matches node3 using gettxoutsetinfo"""
27
28import errno
29import http.client
30import random
31import sys
32import time
33
34from test_framework.messages import COIN, COutPoint, CTransaction, CTxIn, CTxOut, ToHex
35from test_framework.test_framework import BitcoinTestFramework
36from test_framework.util import assert_equal, create_confirmed_utxos, hex_str_to_bytes
37
38HTTP_DISCONNECT_ERRORS = [http.client.CannotSendRequest]
39try:
40    HTTP_DISCONNECT_ERRORS.append(http.client.RemoteDisconnected)
41except AttributeError:
42    pass
43
44class ChainstateWriteCrashTest(BitcoinTestFramework):
45    def set_test_params(self):
46        self.num_nodes = 4
47        self.setup_clean_chain = False
48        # Need a bit of extra time for the nodes to start up for this test
49        self.rpc_timeout = 90
50
51        # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
52        # Set -rpcservertimeout=900 to reduce socket disconnects in this
53        # long-running test
54        self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900", "-dbbatchsize=200000"]
55
56        # Set different crash ratios and cache sizes.  Note that not all of
57        # -dbcache goes to pcoinsTip.
58        self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
59        self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
60        self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args
61
62        # Node3 is a normal node with default args, except will mine full blocks
63        self.node3_args = ["-blockmaxweight=4000000"]
64        self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]
65
66    def skip_test_if_missing_module(self):
67        self.skip_if_no_wallet()
68
69    def setup_network(self):
70        self.add_nodes(self.num_nodes, extra_args=self.extra_args)
71        self.start_nodes()
72        self.import_deterministic_coinbase_privkeys()
73        # Leave them unconnected, we'll use submitblock directly in this test
74
75    def restart_node(self, node_index, expected_tip):
76        """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.
77
78        Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
79        after 60 seconds. Returns the utxo hash of the given node."""
80
81        time_start = time.time()
82        while time.time() - time_start < 120:
83            try:
84                # Any of these RPC calls could throw due to node crash
85                self.start_node(node_index)
86                self.nodes[node_index].waitforblock(expected_tip)
87                utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2']
88                return utxo_hash
89            except:
90                # An exception here should mean the node is about to crash.
91                # If bitcoind exits, then try again.  wait_for_node_exit()
92                # should raise an exception if bitcoind doesn't exit.
93                self.wait_for_node_exit(node_index, timeout=10)
94            self.crashed_on_restart += 1
95            time.sleep(1)
96
97        # If we got here, bitcoind isn't coming back up on restart.  Could be a
98        # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
99        # perhaps we generated a test case that blew up our cache?
100        # TODO: If this happens a lot, we should try to restart without -dbcrashratio
101        # and make sure that recovery happens.
102        raise AssertionError("Unable to successfully restart node %d in allotted time", node_index)
103
104    def submit_block_catch_error(self, node_index, block):
105        """Try submitting a block to the given node.
106
107        Catch any exceptions that indicate the node has crashed.
108        Returns true if the block was submitted successfully; false otherwise."""
109
110        try:
111            self.nodes[node_index].submitblock(block)
112            return True
113        except http.client.BadStatusLine as e:
114            # Prior to 3.5 BadStatusLine('') was raised for a remote disconnect error.
115            if sys.version_info[0] == 3 and sys.version_info[1] < 5 and e.line == "''":
116                self.log.debug("node %d submitblock raised exception: %s", node_index, e)
117                return False
118            else:
119                raise
120        except tuple(HTTP_DISCONNECT_ERRORS) as e:
121            self.log.debug("node %d submitblock raised exception: %s", node_index, e)
122            return False
123        except OSError as e:
124            self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno)
125            if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
126                # The node has likely crashed
127                return False
128            else:
129                # Unexpected exception, raise
130                raise
131
132    def sync_node3blocks(self, block_hashes):
133        """Use submitblock to sync node3's chain with the other nodes
134
135        If submitblock fails, restart the node and get the new utxo hash.
136        If any nodes crash while updating, we'll compare utxo hashes to
137        ensure recovery was successful."""
138
139        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
140
141        # Retrieve all the blocks from node3
142        blocks = []
143        for block_hash in block_hashes:
144            blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])
145
146        # Deliver each block to each other node
147        for i in range(3):
148            nodei_utxo_hash = None
149            self.log.debug("Syncing blocks to node %d", i)
150            for (block_hash, block) in blocks:
151                # Get the block from node3, and submit to node_i
152                self.log.debug("submitting block %s", block_hash)
153                if not self.submit_block_catch_error(i, block):
154                    # TODO: more carefully check that the crash is due to -dbcrashratio
155                    # (change the exit code perhaps, and check that here?)
156                    self.wait_for_node_exit(i, timeout=30)
157                    self.log.debug("Restarting node %d after block hash %s", i, block_hash)
158                    nodei_utxo_hash = self.restart_node(i, block_hash)
159                    assert nodei_utxo_hash is not None
160                    self.restart_counts[i] += 1
161                else:
162                    # Clear it out after successful submitblock calls -- the cached
163                    # utxo hash will no longer be correct
164                    nodei_utxo_hash = None
165
166            # Check that the utxo hash matches node3's utxo set
167            # NOTE: we only check the utxo set if we had to restart the node
168            # after the last block submitted:
169            # - checking the utxo hash causes a cache flush, which we don't
170            # want to do every time; so
171            # - we only update the utxo cache after a node restart, since flushing
172            # the cache is a no-op at that point
173            if nodei_utxo_hash is not None:
174                self.log.debug("Checking txoutsetinfo matches for node %d", i)
175                assert_equal(nodei_utxo_hash, node3_utxo_hash)
176
177    def verify_utxo_hash(self):
178        """Verify that the utxo hash of each node matches node3.
179
180        Restart any nodes that crash while querying."""
181        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
182        self.log.info("Verifying utxo hash matches for all nodes")
183
184        for i in range(3):
185            try:
186                nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2']
187            except OSError:
188                # probably a crash on db flushing
189                nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
190            assert_equal(nodei_utxo_hash, node3_utxo_hash)
191
192    def generate_small_transactions(self, node, count, utxo_list):
193        FEE = 1000  # TODO: replace this with node relay fee based calculation
194        num_transactions = 0
195        random.shuffle(utxo_list)
196        while len(utxo_list) >= 2 and num_transactions < count:
197            tx = CTransaction()
198            input_amount = 0
199            for i in range(2):
200                utxo = utxo_list.pop()
201                tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout'])))
202                input_amount += int(utxo['amount'] * COIN)
203            output_amount = (input_amount - FEE) // 3
204
205            if output_amount <= 0:
206                # Sanity check -- if we chose inputs that are too small, skip
207                continue
208
209            for i in range(3):
210                tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey'])))
211
212            # Sign and send the transaction to get into the mempool
213            tx_signed_hex = node.signrawtransactionwithwallet(ToHex(tx))['hex']
214            node.sendrawtransaction(tx_signed_hex)
215            num_transactions += 1
216
217    def run_test(self):
218        # Track test coverage statistics
219        self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
220        self.crashed_on_restart = 0      # Track count of crashes during recovery
221
222        # Start by creating a lot of utxos on node3
223        initial_height = self.nodes[3].getblockcount()
224        utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000)
225        self.log.info("Prepped %d utxo entries", len(utxo_list))
226
227        # Sync these blocks with the other nodes
228        block_hashes_to_sync = []
229        for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
230            block_hashes_to_sync.append(self.nodes[3].getblockhash(height))
231
232        self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync))
233        # Syncing the blocks could cause nodes to crash, so the test begins here.
234        self.sync_node3blocks(block_hashes_to_sync)
235
236        starting_tip_height = self.nodes[3].getblockcount()
237
238        # Main test loop:
239        # each time through the loop, generate a bunch of transactions,
240        # and then either mine a single new block on the tip, or some-sized reorg.
241        for i in range(40):
242            self.log.info("Iteration %d, generating 2500 transactions %s", i, self.restart_counts)
243            # Generate a bunch of small-ish transactions
244            self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
245            # Pick a random block between current tip, and starting tip
246            current_height = self.nodes[3].getblockcount()
247            random_height = random.randint(starting_tip_height, current_height)
248            self.log.debug("At height %d, considering height %d", current_height, random_height)
249            if random_height > starting_tip_height:
250                # Randomly reorg from this point with some probability (1/4 for
251                # tip, 1/5 for tip-1, ...)
252                if random.random() < 1.0 / (current_height + 4 - random_height):
253                    self.log.debug("Invalidating block at height %d", random_height)
254                    self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))
255
256            # Now generate new blocks until we pass the old tip height
257            self.log.debug("Mining longer tip")
258            block_hashes = []
259            while current_height + 1 > self.nodes[3].getblockcount():
260                block_hashes.extend(self.nodes[3].generate(min(10, current_height + 1 - self.nodes[3].getblockcount())))
261            self.log.debug("Syncing %d new blocks...", len(block_hashes))
262            self.sync_node3blocks(block_hashes)
263            utxo_list = self.nodes[3].listunspent()
264            self.log.debug("Node3 utxo count: %d", len(utxo_list))
265
266        # Check that the utxo hashes agree with node3
267        # Useful side effect: each utxo cache gets flushed here, so that we
268        # won't get crashes on shutdown at the end of the test.
269        self.verify_utxo_hash()
270
271        # Check the test coverage
272        self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart)
273
274        # If no nodes were restarted, we didn't test anything.
275        assert self.restart_counts != [0, 0, 0]
276
277        # Make sure we tested the case of crash-during-recovery.
278        assert self.crashed_on_restart > 0
279
280        # Warn if any of the nodes escaped restart.
281        for i in range(3):
282            if self.restart_counts[i] == 0:
283                self.log.warn("Node %d never crashed during utxo flush!", i)
284
285if __name__ == "__main__":
286    ChainstateWriteCrashTest().main()
287