1 /* 2 * FailureMonitor.h 3 * 4 * This source file is part of the FoundationDB open source project 5 * 6 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors 7 * 8 * Licensed under the Apache License, Version 2.0 (the "License"); 9 * you may not use this file except in compliance with the License. 10 * You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 #ifndef FLOW_FAILUREMONITOR_H 22 #define FLOW_FAILUREMONITOR_H 23 #pragma once 24 25 #include "flow/flow.h" 26 #include "flow/IndexedSet.h" 27 #include "fdbrpc/FlowTransport.h" // Endpoint 28 #include <unordered_map> 29 30 using std::vector; 31 32 /* 33 34 IFailureMonitor is used by load balancing, data distribution and other components 35 to report on which other machines are unresponsive or experiencing other failures. 36 This is vital both to reconfigure the system in response to failures and to prevent 37 actors from waiting forever for replies from remote machines that are no longer 38 available. When waiting for a reply, clients should generally stop waiting and 39 try an alternative server when a failure is reported, rather than relying on timeouts. 40 41 The information tracked for each machine is a FailureStatus, which 42 for the moment is just a boolean but might be richer in the future. 43 44 Get an IFailureMonitor by calling g_network->failureMonitor(); the simulator keeps 45 one for each simulated machine and ASIONetwork keeps one for each process. 46 47 The system attempts to ensure that failures are reported quickly, but may occasionally 48 report a working system as failed temporarily. 
Clients that intend to take very costly
actions as a result of a failure should probably wait a while to see whether the machine
becomes unfailed first.  If possible, use onFailedFor(), which in the future may react to
'permanent' failures immediately.

The information reported through this interface is actually supplied by failureMonitorClient,
which exchanges FailureMonitoringRequest/Reply pairs with the failureDetectionServer actor on
the ClusterController.  This central repository of failure information has the opportunity
to take into account topology and global network conditions in identifying failures.  In
the future it may be augmented with locally available information about failures (e.g.
TCP connection loss in ASIONetwork or unexpectedly long response times for application requests).

Communication failures are tracked at NetworkAddress granularity.  When a request is made to
a missing endpoint on a non-failed machine, this information is reported back to the requesting
machine and tracked at the endpoint level.
63 64 */ 65 66 struct FailureStatus { 67 bool failed; 68 FailureStatusFailureStatus69 FailureStatus() : failed(true) {} FailureStatusFailureStatus70 explicit FailureStatus(bool failed) : failed(failed) {} isFailedFailureStatus71 bool isFailed() { return failed; } isAvailableFailureStatus72 bool isAvailable() { return !failed; } 73 74 bool operator == (FailureStatus const& r) const { return failed == r.failed; } 75 bool operator != (FailureStatus const& r) const { return failed != r.failed; } 76 template <class Ar> serializeFailureStatus77 void serialize(Ar& ar) { 78 serializer(ar, failed); 79 } 80 }; 81 82 class IFailureMonitor { 83 public: 84 // Returns the currently known status for the endpoint 85 virtual FailureStatus getState( Endpoint const& endpoint ) = 0; 86 87 // Returns the currently known status for the address 88 virtual FailureStatus getState( NetworkAddress const& address ) = 0; 89 90 // Only use this function when the endpoint is known to be failed 91 virtual void endpointNotFound( Endpoint const& ) = 0; 92 93 // The next time the known status for the endpoint changes, returns the new status. 94 virtual Future<Void> onStateChanged( Endpoint const& endpoint ) = 0; 95 96 // Returns when onFailed(endpoint) || transport().onDisconnect( endpoint.getPrimaryAddress() ), but more efficiently 97 virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint ) = 0; 98 99 // Returns true if the endpoint is failed but the address of the endpoint is not failed. 100 virtual bool onlyEndpointFailed( Endpoint const& endpoint ) = 0; 101 102 // Returns true if the endpoint will never become available. 103 virtual bool permanentlyFailed( Endpoint const& endpoint ) = 0; 104 105 // Called by FlowTransport when a connection closes and a prior request or reply might be lost 106 virtual void notifyDisconnect( NetworkAddress const& ) = 0; 107 108 // Returns when the known status of endpoint is next equal to status. Returns immediately 109 // if appropriate. 
110 Future<Void> onStateEqual( Endpoint const& endpoint, FailureStatus status ); 111 112 // Returns when the status of the given endpoint is next considered "failed" onFailed(Endpoint const & endpoint)113 Future<Void> onFailed( Endpoint const& endpoint ) { 114 return onStateEqual( endpoint, FailureStatus() ); 115 } 116 failureMonitor()117 static IFailureMonitor& failureMonitor() { return *static_cast<IFailureMonitor*>((void*) g_network->global(INetwork::enFailureMonitor)); } 118 // Returns the failure monitor that the calling machine should use 119 120 // Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration + (elapsedTime*sustainedFailureSlope) 121 Future<Void> onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double sustainedFailureSlope = 0.0 ); 122 }; 123 124 // SimpleFailureMonitor is the sole implementation of IFailureMonitor. It has no 125 // failure detection logic; it just implements the interface and reacts to setStatus() etc. 126 // Initially all addresses are considered failed, but all endpoints of a non-failed address are considered OK. 
127 class SimpleFailureMonitor : public IFailureMonitor { 128 public: SimpleFailureMonitor()129 SimpleFailureMonitor() : endpointKnownFailed() { } 130 void setStatus( NetworkAddress const& address, FailureStatus const& status ); 131 void endpointNotFound( Endpoint const& ); 132 virtual void notifyDisconnect( NetworkAddress const& ); 133 134 virtual Future<Void> onStateChanged( Endpoint const& endpoint ); 135 virtual FailureStatus getState( Endpoint const& endpoint ); 136 virtual FailureStatus getState( NetworkAddress const& address ); 137 virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint ); 138 virtual bool onlyEndpointFailed( Endpoint const& endpoint ); 139 virtual bool permanentlyFailed( Endpoint const& endpoint ); 140 141 void reset(); 142 private: 143 std::unordered_map< NetworkAddress, FailureStatus > addressStatus; 144 YieldedAsyncMap< Endpoint, bool > endpointKnownFailed; 145 146 friend class OnStateChangedActorActor; 147 }; 148 149 #endif 150