#!/usr/bin/env python3
# Copyright (c) 2017-2018 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
"""Test recovery from a crash during chainstate writing.

- 4 nodes
  * node0, node1, and node2 will have different dbcrash ratios, and different
    dbcache sizes
  * node3 will be a regular node, with no crashing.
  * The nodes will not connect to each other.

- use default test framework starting chain. initialize starting_tip_height to
  tip height.

- Main loop:
  * generate lots of transactions on node3, enough to fill up a block.
  * uniformly randomly pick a tip height from starting_tip_height to
    tip_height; with probability 1/(height_difference+4), invalidate this block.
  * mine enough blocks to overtake tip_height at start of loop.
  * for each node in [node0,node1,node2]:
     - for each mined block:
        * submit block to node
        * if node crashed on/after submitting:
           - restart until recovery succeeds
           - check that utxo matches node3 using gettxoutsetinfo"""

import errno
import http.client
import random
import sys
import time

from test_framework.messages import COIN, COutPoint, CTransaction, CTxIn, CTxOut, ToHex
from test_framework.test_framework import BitcoinTestFramework
from test_framework.util import assert_equal, create_confirmed_utxos, hex_str_to_bytes

# Exceptions that indicate the HTTP connection to a node dropped mid-request.
# RemoteDisconnected only exists on Python >= 3.5, hence the feature probe.
HTTP_DISCONNECT_ERRORS = [http.client.CannotSendRequest]
try:
    HTTP_DISCONNECT_ERRORS.append(http.client.RemoteDisconnected)
except AttributeError:
    pass

class ChainstateWriteCrashTest(BitcoinTestFramework):
    def set_test_params(self):
        self.num_nodes = 4
        self.setup_clean_chain = False
        # Need a bit of extra time for the nodes to start up for this test
        self.rpc_timeout = 90

        # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
        # Set -rpcservertimeout=900 to reduce socket disconnects in this
        # long-running test
        self.base_args = ["-limitdescendantsize=0", "-maxmempool=0", "-rpcservertimeout=900", "-dbbatchsize=200000"]

        # Set different crash ratios and cache sizes. Note that not all of
        # -dbcache goes to pcoinsTip.
        self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
        self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
        self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args

        # Node3 is a normal node with default args, except will mine full blocks
        self.node3_args = ["-blockmaxweight=4000000"]
        self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]

    def skip_test_if_missing_module(self):
        self.skip_if_no_wallet()

    def setup_network(self):
        self.add_nodes(self.num_nodes, extra_args=self.extra_args)
        self.start_nodes()
        self.import_deterministic_coinbase_privkeys()
        # Leave them unconnected, we'll use submitblock directly in this test

    def restart_node(self, node_index, expected_tip):
        """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.

        Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
        after 120 seconds. Returns the utxo hash of the given node."""

        time_start = time.time()
        while time.time() - time_start < 120:
            try:
                # Any of these RPC calls could throw due to node crash
                self.start_node(node_index)
                self.nodes[node_index].waitforblock(expected_tip)
                utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_2']
                return utxo_hash
            except Exception:
                # An exception here should mean the node is about to crash.
                # If bitcoind exits, then try again. wait_for_node_exit()
                # should raise an exception if bitcoind doesn't exit.
                self.wait_for_node_exit(node_index, timeout=10)
                self.crashed_on_restart += 1
                time.sleep(1)

        # If we got here, bitcoind isn't coming back up on restart. Could be a
        # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
        # perhaps we generated a test case that blew up our cache?
        # TODO: If this happens a lot, we should try to restart without -dbcrashratio
        # and make sure that recovery happens.
        raise AssertionError("Unable to successfully restart node %d in allotted time" % node_index)

    def submit_block_catch_error(self, node_index, block):
        """Try submitting a block to the given node.

        Catch any exceptions that indicate the node has crashed.
        Returns true if the block was submitted successfully; false otherwise."""

        try:
            self.nodes[node_index].submitblock(block)
            return True
        except http.client.BadStatusLine as e:
            # Prior to 3.5 BadStatusLine('') was raised for a remote disconnect error.
            if sys.version_info[0] == 3 and sys.version_info[1] < 5 and e.line == "''":
                self.log.debug("node %d submitblock raised exception: %s", node_index, e)
                return False
            else:
                raise
        except tuple(HTTP_DISCONNECT_ERRORS) as e:
            self.log.debug("node %d submitblock raised exception: %s", node_index, e)
            return False
        except OSError as e:
            self.log.debug("node %d submitblock raised OSError exception: errno=%s", node_index, e.errno)
            if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
                # The node has likely crashed
                return False
            else:
                # Unexpected exception, raise
                raise

    def sync_node3blocks(self, block_hashes):
        """Use submitblock to sync node3's chain with the other nodes

        If submitblock fails, restart the node and get the new utxo hash.
        If any nodes crash while updating, we'll compare utxo hashes to
        ensure recovery was successful."""

        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']

        # Retrieve all the blocks from node3
        blocks = []
        for block_hash in block_hashes:
            blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])

        # Deliver each block to each other node
        for i in range(3):
            nodei_utxo_hash = None
            self.log.debug("Syncing blocks to node %d", i)
            for (block_hash, block) in blocks:
                # Get the block from node3, and submit to node_i
                self.log.debug("submitting block %s", block_hash)
                if not self.submit_block_catch_error(i, block):
                    # TODO: more carefully check that the crash is due to -dbcrashratio
                    # (change the exit code perhaps, and check that here?)
                    self.wait_for_node_exit(i, timeout=30)
                    self.log.debug("Restarting node %d after block hash %s", i, block_hash)
                    nodei_utxo_hash = self.restart_node(i, block_hash)
                    assert nodei_utxo_hash is not None
                    self.restart_counts[i] += 1
                else:
                    # Clear it out after successful submitblock calls -- the cached
                    # utxo hash will no longer be correct
                    nodei_utxo_hash = None

            # Check that the utxo hash matches node3's utxo set
            # NOTE: we only check the utxo set if we had to restart the node
            # after the last block submitted:
            # - checking the utxo hash causes a cache flush, which we don't
            # want to do every time; so
            # - we only update the utxo cache after a node restart, since flushing
            # the cache is a no-op at that point
            if nodei_utxo_hash is not None:
                self.log.debug("Checking txoutsetinfo matches for node %d", i)
                assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def verify_utxo_hash(self):
        """Verify that the utxo hash of each node matches node3.

        Restart any nodes that crash while querying."""
        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_2']
        self.log.info("Verifying utxo hash matches for all nodes")

        for i in range(3):
            try:
                nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_2']
            except OSError:
                # probably a crash on db flushing
                nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
            assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def generate_small_transactions(self, node, count, utxo_list):
        """Generate up to `count` 2-input, 3-output transactions on `node`.

        Pops pairs of utxos off `utxo_list` (shuffled in place) to fund each
        transaction; skips input pairs too small to cover the fee."""
        FEE = 1000  # TODO: replace this with node relay fee based calculation
        num_transactions = 0
        random.shuffle(utxo_list)
        while len(utxo_list) >= 2 and num_transactions < count:
            tx = CTransaction()
            input_amount = 0
            for i in range(2):
                utxo = utxo_list.pop()
                tx.vin.append(CTxIn(COutPoint(int(utxo['txid'], 16), utxo['vout'])))
                input_amount += int(utxo['amount'] * COIN)
            output_amount = (input_amount - FEE) // 3

            if output_amount <= 0:
                # Sanity check -- if we chose inputs that are too small, skip
                continue

            for i in range(3):
                tx.vout.append(CTxOut(output_amount, hex_str_to_bytes(utxo['scriptPubKey'])))

            # Sign and send the transaction to get into the mempool
            tx_signed_hex = node.signrawtransactionwithwallet(ToHex(tx))['hex']
            node.sendrawtransaction(tx_signed_hex)
            num_transactions += 1

    def run_test(self):
        # Track test coverage statistics
        self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
        self.crashed_on_restart = 0  # Track count of crashes during recovery

        # Start by creating a lot of utxos on node3
        initial_height = self.nodes[3].getblockcount()
        utxo_list = create_confirmed_utxos(self.nodes[3].getnetworkinfo()['relayfee'], self.nodes[3], 5000)
        self.log.info("Prepped %d utxo entries", len(utxo_list))

        # Sync these blocks with the other nodes
        block_hashes_to_sync = []
        for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
            block_hashes_to_sync.append(self.nodes[3].getblockhash(height))

        self.log.debug("Syncing %d blocks with other nodes", len(block_hashes_to_sync))
        # Syncing the blocks could cause nodes to crash, so the test begins here.
        self.sync_node3blocks(block_hashes_to_sync)

        starting_tip_height = self.nodes[3].getblockcount()

        # Main test loop:
        # each time through the loop, generate a bunch of transactions,
        # and then either mine a single new block on the tip, or some-sized reorg.
        for i in range(40):
            self.log.info("Iteration %d, generating 2500 transactions %s", i, self.restart_counts)
            # Generate a bunch of small-ish transactions
            self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
            # Pick a random block between current tip, and starting tip
            current_height = self.nodes[3].getblockcount()
            random_height = random.randint(starting_tip_height, current_height)
            self.log.debug("At height %d, considering height %d", current_height, random_height)
            if random_height > starting_tip_height:
                # Randomly reorg from this point with some probability (1/4 for
                # tip, 1/5 for tip-1, ...)
                if random.random() < 1.0 / (current_height + 4 - random_height):
                    self.log.debug("Invalidating block at height %d", random_height)
                    self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))

            # Now generate new blocks until we pass the old tip height
            self.log.debug("Mining longer tip")
            block_hashes = []
            while current_height + 1 > self.nodes[3].getblockcount():
                block_hashes.extend(self.nodes[3].generate(min(10, current_height + 1 - self.nodes[3].getblockcount())))
            self.log.debug("Syncing %d new blocks...", len(block_hashes))
            self.sync_node3blocks(block_hashes)
            utxo_list = self.nodes[3].listunspent()
            self.log.debug("Node3 utxo count: %d", len(utxo_list))

        # Check that the utxo hashes agree with node3
        # Useful side effect: each utxo cache gets flushed here, so that we
        # won't get crashes on shutdown at the end of the test.
        self.verify_utxo_hash()

        # Check the test coverage
        self.log.info("Restarted nodes: %s; crashes on restart: %d", self.restart_counts, self.crashed_on_restart)

        # If no nodes were restarted, we didn't test anything.
        assert self.restart_counts != [0, 0, 0]

        # Make sure we tested the case of crash-during-recovery.
        assert self.crashed_on_restart > 0

        # Warn if any of the nodes escaped restart.
        for i in range(3):
            if self.restart_counts[i] == 0:
                # log.warn is a deprecated alias of log.warning
                self.log.warning("Node %d never crashed during utxo flush!", i)

if __name__ == "__main__":
    ChainstateWriteCrashTest().main()