1 /*
2  * FailureMonitor.h
3  *
4  * This source file is part of the FoundationDB open source project
5  *
6  * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *     http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 #ifndef FLOW_FAILUREMONITOR_H
22 #define FLOW_FAILUREMONITOR_H
23 #pragma once
24 
25 #include "flow/flow.h"
26 #include "flow/IndexedSet.h"
27 #include "fdbrpc/FlowTransport.h" // Endpoint
28 #include <unordered_map>
29 
30 using std::vector;
31 
32 /*
33 
34 IFailureMonitor is used by load balancing, data distribution and other components
35 to report on which other machines are unresponsive or experiencing other failures.
36 This is vital both to reconfigure the system in response to failures and to prevent
37 actors from waiting forever for replies from remote machines that are no longer
38 available.  When waiting for a reply, clients should generally stop waiting and
39 try an alternative server when a failure is reported, rather than relying on timeouts.
40 
41 The information tracked for each machine is a FailureStatus, which
42 for the moment is just a boolean but might be richer in the future.
43 
44 Get an IFailureMonitor by calling g_network->failureMonitor(); the simulator keeps
45 one for each simulated machine and ASIONetwork keeps one for each process.
46 
47 The system attempts to ensure that failures are reported quickly, but may occasionally
48 report a working system as failed temporarily.  Clients that intend to take very costly
49 actions as a result of a failure should probably wait a while to see if a machine becomes
50 unfailed first.  If possible use onFailedFor() which in the future may react to 'permanent'
51 failures immediately.
52 
53 The information reported through this interface is actually supplied by failureMonitorClient,
54 which exchanges FailureMonitoringRequest/Reply pairs with the failureDetectionServer actor on
55 the ClusterController.  This central repository of failure information has the opportunity
56 to take into account topology and global network conditions in identifying failures.  In
57 the future it may be augmented with locally available information about failures (e.g.
58 TCP connection loss in ASIONetwork or unexpectedly long response times for application requests).
59 
60 Communications failures are tracked at NetworkAddress granularity.  When a request is made to
61 a missing endpoint on a non-failed machine, this information is reported back to the requesting
62 machine and tracked at the endpoint level.
63 
64 */
65 
// The failure state reported for an address or endpoint.  For now this is just a
//   single boolean flag.
struct FailureStatus {
	bool failed;

	// A default-constructed status is "failed"
	FailureStatus() : failed(true) {}
	explicit FailureStatus(bool failed) : failed(failed) {}

	// Read-only queries.  They are const-qualified so that they can be called on a
	//   const FailureStatus or through a const reference.
	bool isFailed() const { return failed; }
	bool isAvailable() const { return !failed; }

	bool operator == (FailureStatus const& r) const { return failed == r.failed; }
	bool operator != (FailureStatus const& r) const { return failed != r.failed; }

	// Passes the `failed` flag to serializer() together with the given archive
	template <class Ar>
	void serialize(Ar& ar) {
		serializer(ar, failed);
	}
};
81 
82 class IFailureMonitor {
83 public:
84 	// Returns the currently known status for the endpoint
85 	virtual FailureStatus getState( Endpoint const& endpoint ) = 0;
86 
87 	// Returns the currently known status for the address
88 	virtual FailureStatus getState( NetworkAddress const& address ) = 0;
89 
90 	// Only use this function when the endpoint is known to be failed
91 	virtual void endpointNotFound( Endpoint const& ) = 0;
92 
93 	// The next time the known status for the endpoint changes, returns the new status.
94 	virtual Future<Void> onStateChanged( Endpoint const& endpoint ) = 0;
95 
96 	// Returns when onFailed(endpoint) || transport().onDisconnect( endpoint.getPrimaryAddress() ), but more efficiently
97 	virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint ) = 0;
98 
99 	// Returns true if the endpoint is failed but the address of the endpoint is not failed.
100 	virtual bool onlyEndpointFailed( Endpoint const& endpoint ) = 0;
101 
102 	// Returns true if the endpoint will never become available.
103 	virtual bool permanentlyFailed( Endpoint const& endpoint ) = 0;
104 
105 	// Called by FlowTransport when a connection closes and a prior request or reply might be lost
106 	virtual void notifyDisconnect( NetworkAddress const& ) = 0;
107 
108 	// Returns when the known status of endpoint is next equal to status.  Returns immediately
109 	//   if appropriate.
110 	Future<Void> onStateEqual( Endpoint const& endpoint, FailureStatus status );
111 
112 	// Returns when the status of the given endpoint is next considered "failed"
onFailed(Endpoint const & endpoint)113 	Future<Void> onFailed( Endpoint const& endpoint ) {
114 		return onStateEqual( endpoint, FailureStatus() );
115 	}
116 
failureMonitor()117 	static IFailureMonitor& failureMonitor() { return *static_cast<IFailureMonitor*>((void*) g_network->global(INetwork::enFailureMonitor)); }
118 	// Returns the failure monitor that the calling machine should use
119 
120 	// Returns when the status of the given endpoint has continuously been "failed" for sustainedFailureDuration + (elapsedTime*sustainedFailureSlope)
121 	Future<Void> onFailedFor( Endpoint const& endpoint, double sustainedFailureDuration, double sustainedFailureSlope = 0.0 );
122 };
123 
124 // SimpleFailureMonitor is the sole implementation of IFailureMonitor.  It has no
125 //   failure detection logic; it just implements the interface and reacts to setStatus() etc.
126 // Initially all addresses are considered failed, but all endpoints of a non-failed address are considered OK.
127 class SimpleFailureMonitor : public IFailureMonitor {
128 public:
SimpleFailureMonitor()129 	SimpleFailureMonitor() : endpointKnownFailed() { }
130 	void setStatus( NetworkAddress const& address, FailureStatus const& status );
131 	void endpointNotFound( Endpoint const& );
132 	virtual void notifyDisconnect( NetworkAddress const& );
133 
134 	virtual Future<Void> onStateChanged( Endpoint const& endpoint );
135 	virtual FailureStatus getState( Endpoint const& endpoint );
136 	virtual FailureStatus getState( NetworkAddress const& address );
137 	virtual Future<Void> onDisconnectOrFailure( Endpoint const& endpoint );
138 	virtual bool onlyEndpointFailed( Endpoint const& endpoint );
139 	virtual bool permanentlyFailed( Endpoint const& endpoint );
140 
141 	void reset();
142 private:
143 	std::unordered_map< NetworkAddress, FailureStatus > addressStatus;
144 	YieldedAsyncMap< Endpoint, bool > endpointKnownFailed;
145 
146 	friend class OnStateChangedActorActor;
147 };
148 
149 #endif
150