1 /*
2 Copyright (c) 2003, 2021, Oracle and/or its affiliates.
3 All rights reserved. Use is subject to license terms.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License, version 2.0,
7 as published by the Free Software Foundation.
8
9 This program is also distributed with certain software (including
10 but not limited to OpenSSL) that is licensed under separate terms,
11 as designated in a particular file or component or in included license
12 documentation. The authors of MySQL hereby grant you an additional
13 permission to link the program and your derivative works with the
14 separately licensed software that they have included with MySQL.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License, version 2.0, for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 */
25
26 #ifndef SCI_Transporter_H
27 #define SCI_Transporter_H
28 #include "Transporter.hpp"
29 #include "SHM_Buffer.hpp"
30
31
32 #include <sisci_api.h>
33 #include <sisci_error.h>
34 #include <sisci_types.h>
35
36 #include <ndb_types.h>
37
/**
 * The SCI Transporter
 *
 * The design goal of the SCI transporter is to deliver high performance
 * data transfers (low latency, high bandwidth) combined with very high
 * availability (failover support).
 * High performance is an inherent feature of SCI, whereas failover
 * support is implemented at the application level.
 * In SCI the programming model is similar to the shared memory paradigm.
 * A process on one node (A) allocates a memory segment and imports the
 * segment into its virtual address space. Another node (B) can connect to
 * the segment and map this segment into its virtual address space.
 * If A writes data to the segment, then B can read it and vice versa, through
 * ordinary loads and stores. This is also called PIO (programmable IO), and
 * is one thing that distinguishes SCI from other interconnects such as
 * Ethernet, Gig-E, Myrinet, and InfiniBand. By using PIO, lower network
 * latency is achieved, compared to the interconnects mentioned above.
 * In order for NDB to utilize SCI, the SCI transporter relies on the
 * SISCI API. The SISCI API provides a high level abstraction to the low
 * level SCI driver called the PCISCI driver.
 * The SISCI API provides functions to set up, export, and import
 * memory segments in a process virtual address space, and also functions to
 * guarantee the correctness of data transfers between nodes.
 *
 * In NDB Cluster, each SCI transporter creates a local segment
 * that is mapped into the virtual address space. After the creation of the
 * local segment, the SCI transporter connects to a segment created by another
 * transporter at a remote node, and then maps the remote segment into its
 * virtual address space. Since NDB Cluster relies on redundancy
 * at the network level, dual SCI adapters are used so that communication can
 * be maintained even if one of the adapter cards fails (or anything on the
 * network this adapter card exists in, e.g. an SCI switch failure).
 *
 */
72
73 /**
74 * class SCITransporter
75 * @brief - main class for the SCI transporter.
76 */
77 class SCI_Transporter : public Transporter {
78 friend class TransporterRegistry;
79 public:
80
81 /**
82 * Init the transporter.
83 * @return true if successful, otherwize false
84 */
85 bool initTransporter();
86
87
88 /**
89 * Creates a sequence for error checking.
90 * @param adapterid the adapter on which to create a new sequence.
91 * @return SCI_ERR_OK if ok, otherwize something else.
92 */
93 sci_error_t createSequence(Uint32 adapterid);
94
95
96 /** Initiate Local Segment: create a memory segment,
97 * prepare a memory segment, map the local segment
98 * into memory space and make segment available.
99 * @return SCI_ERR_OK if ok, otherwize something else.
100 */
101 sci_error_t initLocalSegment();
102
103 /**
104 * Calculate the segment id for the remote segment
105 * @param localNodeId - local id (e.g. 1 = mgm , 2 = ndb.2 etc.)
106 * @param remoteNodeId - remote id (e.g. 1 = mgm , 2 = ndb.2 etc.)
107 * @return a segment id
108 */
109 Uint32 remoteSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
110
111 // Get local segment id (inline)
112 Uint32 hostSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
113
114 /**
115 * closeSCI closes the SCI virtual device
116 */
117 void closeSCI();
118
119
120 /**
121 * Check the status of the remote node,
122 * if it is connected or has disconnected
123 * @return true if connected, otherwize false.
124 */
125 bool checkConnected();
126
127 /**
128 * Check if the segment are properly connected to each other (remotely
129 * and locally).
130 * @return True if the both the local segment is mapped and the
131 * remote segment is mapped. Otherwize false.
132 */
133 bool getConnectionStatus();
134
135 private:
136 SCI_Transporter(TransporterRegistry &t_reg,
137 const char *local_host,
138 const char *remote_host,
139 int port,
140 bool isMgmConnection,
141 Uint32 packetSize,
142 Uint32 bufferSize,
143 Uint32 nAdapters,
144 Uint16 remoteSciNodeId0,
145 Uint16 remoteSciNodeId1,
146 NodeId localNodeID,
147 NodeId remoteNodeID,
148 NodeId serverNodeId,
149 bool checksum,
150 bool signalId,
151 Uint32 reportFreq = 4096);
152
153 /**
154 * Destructor. Disconnects the transporter.
155 */
156 ~SCI_Transporter();
157
158 virtual bool configure_derived(const TransporterConfiguration* conf);
159
160 bool m_mapped;
161 bool m_initLocal;
162 bool m_sciinit;
163 Uint32 m_failCounter;
164 /**
165 * For statistics on transfered packets
166 */
167 //#ifdef DEBUG_TRANSPORTER
168 #if 1
169 Uint32 i1024;
170 Uint32 i2048;
171 Uint32 i2049;
172 Uint32 i10242048;
173 Uint32 i20484096;
174 Uint32 i4096;
175 Uint32 i4097;
176 #endif
177
178 volatile Uint32 * m_localStatusFlag;
179 volatile Uint32 * m_remoteStatusFlag;
180 volatile Uint32 * m_remoteStatusFlag2;
181
182 SHM_Reader * reader;
183 SHM_Writer * writer;
184 SHM_Writer * writer2;
185
186 /**
187 * Statistics
188 */
189 Uint32 m_reportFreq;
190
191 Uint32 m_adapters;
192 Uint32 m_numberOfRemoteNodes;
193
194 Uint16 m_remoteNodes[2];
195
196 typedef struct SciAdapter {
197 sci_desc_t scidesc;
198 Uint32 localSciNodeId;
199 bool linkStatus;
200 } SciAdapter;
201
202 SciAdapter* sciAdapters;
203 Uint32 m_ActiveAdapterId;
204 Uint32 m_StandbyAdapterId;
205
206 typedef struct sourceSegm {
207 sci_local_segment_t localHandle; // Handle to local segment to be mapped
208 struct localHandleMap {
209 sci_map_t map; // Handle to the new mapped segment.
210 // 2 = max adapters in one node
211 } lhm[2];
212
213 volatile void *mappedMemory; // Used when reading
214 } sourceSegm;
215
216 typedef struct targetSegm {
217 struct remoteHandleMap {
218 sci_remote_segment_t remoteHandle; //Handle to local segment to be mapped
219 sci_map_t map; //Handle to the new mapped segment
220 } rhm[2];
221
222 sci_sequence_status_t m_SequenceStatus; // Used for error checking
223 sci_sequence_t sequence;
224 volatile void * mappedMemory; // Used when writing
225 SHM_Writer * writer;
226 } targetSegm;
227
228 sci_sequence_status_t m_SequenceStatus; // Used for error checking
229
230
231 // Shared between all SCI users active=(either prim or second)
232 sci_desc_t activeSCIDescriptor;
233
234 sourceSegm* m_SourceSegm; // Local segment reference
235 targetSegm* m_TargetSegm; // Remote segment reference
236
237 Uint32 m_LocalAdapterId; // Adapter Id
238 Uint16 m_LocalSciNodeId; // The SCI-node Id of this machine (adapter 0)
239 Uint16 m_LocalSciNodeId1; // The SCI-node Id of this machine (adapter 1)
240 Uint16 m_RemoteSciNodeId; // The SCI-node Id of remote machine (adapter 0)
241 Uint16 m_RemoteSciNodeId1; // The SCI-node Id of remote machine (adapter 1)
242
243 Uint32 m_PacketSize; // The size of each data packet
244 Uint32 m_BufferSize; // Mapped SCI buffer size
245
246 /**
247 * doSend. Copies the data from the source (the send buffer) to the
248 * shared mem. segment.
249 * Sequences are used for error checking.
250 * If an error occurs, the transfer is retried.
251 * If the link that we need to swap to is broken, we will disconnect.
252 * @return Returns true if datatransfer ok. If not retriable
253 * then false is returned.
254 */
255 bool doSend();
256
257 /**
258 * @param adapterNo the adapter for which to retrieve the node id.
259 * @return Returns the node id for an adapter.
260 */
261 Uint32 getLocalNodeId(Uint32 adapterNo);
262
hasDataToRead() const263 bool hasDataToRead() const {
264 return reader->empty() == false;
265 }
266
267 /**
268 * Make the local segment unavailable, no new connections will be accepted.
269 * @return Returns true if the segment was successfully disconnected.
270 */
271 bool disconnectLocal();
272
273 /**
274 * Make the local segment unavailable, no new connections will be accepted.
275 * @return Returns true if the segment was successfully disconnected.
276 */
277 bool disconnectRemote();
278
279 void resetToInitialState();
280
getReceivePtr(Uint32 ** ptr,Uint32 ** eod)281 void getReceivePtr(Uint32 ** ptr, Uint32 ** eod){
282 reader->getReadPtr(* ptr, * eod);
283 }
284
updateReceivePtr(Uint32 * ptr)285 void updateReceivePtr(Uint32 *ptr){
286 reader->updateReadPtr(ptr);
287 }
288
289 /**
290 * Corresponds to SHM_Transporter::setupBuffers()
291 * Initiates the start pointer of the buffer and read pointers.
292 * Initiate the localSegment for the SHM reader.
293 */
294 void setupLocalSegment();
295
296 /**
297 * Initiate the remoteSegment for the SHM writer
298 */
299 void setupRemoteSegment();
300
301 /**
302 * Set the connect flag in the remote memory segment (write through)
303 */
304 void setConnected();
305
306 /**
307 * Set the disconnect flag in the remote memory segment (write through)
308 */
309 void setDisconnect();
310
311 /**
312 * Check if there is a link between the adapter and the switch
313 * @param adapterNo the adapter for which to retrieve the link status.
314 * @return Returns true if there is a link between adapter and switch.
315 * Otherwize false is returned and the cables must be checked.
316 */
317 bool getLinkStatus(Uint32 adapterNo);
318
319 /**
320 * failoverShmWriter takes the state of the active writer and inserts into
321 * the standby writer.
322 */
323 void failoverShmWriter();
324
325 bool init_local();
326 bool init_remote();
327
send_limit_reached(int bufsize)328 bool send_limit_reached(int bufsize) { return (bufsize > m_PacketSize); }
send_is_possible(int timeout_millisec) const329 bool send_is_possible(int timeout_millisec) const { return 1; }
330
331 protected:
332
333 /** Perform a connection between segment
334 * This is a client node, trying to connect to a remote segment.
335 * @param timeout, the time the connect thread sleeps before
336 * retrying.
337 * @return Returns true on success, otherwize falser
338 */
339 bool connect_server_impl(NDB_SOCKET_TYPE sockfd);
340 bool connect_client_impl(NDB_SOCKET_TYPE sockfd);
341
342 /**
343 * We will disconnect if:
344 * -# the other node has disconnected from us
345 * -# unrecoverable error in transmission, on both adapters
346 * -# if we are shutdown properly
347 */
348 void disconnectImpl();
349
350 static bool initSCI();
351 };
352
353
354 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
355 * (SCI ids)* a unique identifier for the local segment
356 */
357 inline
358 Uint32
hostSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)359 SCI_Transporter::hostSegmentId(Uint16 SciLocalNodeId,
360 Uint16 SciRemoteNodeId) {
361
362 return (SciLocalNodeId << 16) | SciRemoteNodeId;
363 }
364
365 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
366 * (SCI ids)* a unique identifier for the remote segment
367 */
368 inline
369 Uint32
remoteSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)370 SCI_Transporter::remoteSegmentId(Uint16 SciLocalNodeId,
371 Uint16 SciRemoteNodeId) {
372
373 return (SciRemoteNodeId << 16) | SciLocalNodeId;
374 }
375
376
377 #endif
378