1 /* Copyright (c) 2003-2005, 2007 MySQL AB
2 Use is subject to license terms
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
16
17 #ifndef SCI_Transporter_H
18 #define SCI_Transporter_H
19 #include "Transporter.hpp"
20 #include "SHM_Buffer.hpp"
21
22
23 #include <sisci_api.h>
24 #include <sisci_error.h>
25 #include <sisci_types.h>
26
27 #include <ndb_types.h>
28
29 /**
30 * The SCI Transporter
31 *
32 * The design goal of the SCI transporter is to deliver high performance
33 * data transfers (low latency, high bandwidth) combined with very high
34 * availability (failover support).
 * High performance is an inherent feature of SCI, whereas failover
 * support is implemented at the application level.
37 * In SCI the programming model is similar to the shared memory paradigm.
 * A process on one node (A) allocates a memory segment and imports the
 * segment into its virtual address space. Another node (B) can connect to
 * the segment and map this segment into its virtual address space.
 * If A writes data to the segment, then B can read it and vice versa, through
 * ordinary loads and stores. This is also called PIO (programmed I/O), and
 * is one thing that distinguishes SCI from other interconnects such as
 * Ethernet, Gig-E, Myrinet, and InfiniBand. By using PIO, lower network
 * latency is achieved compared to the interconnects mentioned above.
46 * In order for NDB to utilize SCI, the SCI transporter relies on the
47 * SISCI api. The SISCI api provides a high level abstraction to the low
48 * level SCI driver called PCISCI driver.
 * The SISCI API provides functions to set up, export, and import
 * memory segments in a process's virtual address space, and also functions to
 * guarantee the correctness of data transfers between nodes.
 *
 * In NDB Cluster, each SCI transporter creates a local segment
 * that is mapped into the virtual address space. After the creation of the
 * local segment, the SCI transporter connects to a segment created by another
 * transporter at a remote node, and then maps the remote segment into its
 * virtual address space. Since NDB Cluster relies on redundancy
 * at the network level, dual SCI adapters are used so that communication can
 * be maintained even if one of the adapter cards fails (or anything on the
 * network this adapter card is attached to fails, e.g. an SCI switch).
61 *
62 */
63
64 /**
65 * class SCITransporter
66 * @brief - main class for the SCI transporter.
67 */
68 class SCI_Transporter : public Transporter {
69 friend class TransporterRegistry;
70 public:
71
72 /**
73 * Init the transporter. Allocate sendbuffers and open a SCI virtual device
74 * for each adapter.
75 * @return true if successful, otherwize false
76 */
77 bool initTransporter();
78
79
80 /**
81 * Creates a sequence for error checking.
82 * @param adapterid the adapter on which to create a new sequence.
83 * @return SCI_ERR_OK if ok, otherwize something else.
84 */
85 sci_error_t createSequence(Uint32 adapterid);
86
87
88 /** Initiate Local Segment: create a memory segment,
89 * prepare a memory segment, map the local segment
90 * into memory space and make segment available.
91 * @return SCI_ERR_OK if ok, otherwize something else.
92 */
93 sci_error_t initLocalSegment();
94
95 /**
96 * Calculate the segment id for the remote segment
97 * @param localNodeId - local id (e.g. 1 = mgm , 2 = ndb.2 etc.)
98 * @param remoteNodeId - remote id (e.g. 1 = mgm , 2 = ndb.2 etc.)
99 * @return a segment id
100 */
101 Uint32 remoteSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
102
103 // Get local segment id (inline)
104 Uint32 hostSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
105
106 /**
107 * closeSCI closes the SCI virtual device
108 */
109 void closeSCI();
110
111
112 /**
113 * Check the status of the remote node,
114 * if it is connected or has disconnected
115 * @return true if connected, otherwize false.
116 */
117 bool checkConnected();
118
119 /**
120 * Check if the segment are properly connected to each other (remotely
121 * and locally).
122 * @return True if the both the local segment is mapped and the
123 * remote segment is mapped. Otherwize false.
124 */
125 bool getConnectionStatus();
126
127 virtual Uint32 get_free_buffer() const;
128 private:
129 SCI_Transporter(TransporterRegistry &t_reg,
130 const char *local_host,
131 const char *remote_host,
132 int port,
133 bool isMgmConnection,
134 Uint32 packetSize,
135 Uint32 bufferSize,
136 Uint32 nAdapters,
137 Uint16 remoteSciNodeId0,
138 Uint16 remoteSciNodeId1,
139 NodeId localNodeID,
140 NodeId remoteNodeID,
141 NodeId serverNodeId,
142 bool checksum,
143 bool signalId,
144 Uint32 reportFreq = 4096);
145
146 /**
147 * Destructor. Disconnects the transporter.
148 */
149 ~SCI_Transporter();
150 bool m_mapped;
151 bool m_initLocal;
152 bool m_sciinit;
153 Uint32 m_failCounter;
154 /**
155 * For statistics on transfered packets
156 */
157 //#ifdef DEBUG_TRANSPORTER
158 #if 1
159 Uint32 i1024;
160 Uint32 i2048;
161 Uint32 i2049;
162 Uint32 i10242048;
163 Uint32 i20484096;
164 Uint32 i4096;
165 Uint32 i4097;
166 #endif
167
168 volatile Uint32 * m_localStatusFlag;
169 volatile Uint32 * m_remoteStatusFlag;
170 volatile Uint32 * m_remoteStatusFlag2;
171
172 struct {
173 Uint32 * m_buffer; // The buffer
174 Uint32 m_dataSize; // No of words in buffer
175 Uint32 m_sendBufferSize; // Buffer size
176 Uint32 m_forceSendLimit; // Send when buffer is this full
177 } m_sendBuffer;
178
179 SHM_Reader * reader;
180 SHM_Writer * writer;
181 SHM_Writer * writer2;
182
183 /**
184 * Statistics
185 */
186 Uint32 m_reportFreq;
187
188 Uint32 m_adapters;
189 Uint32 m_numberOfRemoteNodes;
190
191 Uint16 m_remoteNodes[2];
192
193 typedef struct SciAdapter {
194 sci_desc_t scidesc;
195 Uint32 localSciNodeId;
196 bool linkStatus;
197 } SciAdapter;
198
199 SciAdapter* sciAdapters;
200 Uint32 m_ActiveAdapterId;
201 Uint32 m_StandbyAdapterId;
202
203 typedef struct sourceSegm {
204 sci_local_segment_t localHandle; // Handle to local segment to be mapped
205 struct localHandleMap {
206 sci_map_t map; // Handle to the new mapped segment.
207 // 2 = max adapters in one node
208 } lhm[2];
209
210 volatile void *mappedMemory; // Used when reading
211 } sourceSegm;
212
213 typedef struct targetSegm {
214 struct remoteHandleMap {
215 sci_remote_segment_t remoteHandle; //Handle to local segment to be mapped
216 sci_map_t map; //Handle to the new mapped segment
217 } rhm[2];
218
219 sci_sequence_status_t m_SequenceStatus; // Used for error checking
220 sci_sequence_t sequence;
221 volatile void * mappedMemory; // Used when writing
222 SHM_Writer * writer;
223 } targetSegm;
224
225 sci_sequence_status_t m_SequenceStatus; // Used for error checking
226
227
228 // Shared between all SCI users active=(either prim or second)
229 sci_desc_t activeSCIDescriptor;
230
231 sourceSegm* m_SourceSegm; // Local segment reference
232 targetSegm* m_TargetSegm; // Remote segment reference
233
234 Uint32 m_LocalAdapterId; // Adapter Id
235 Uint16 m_LocalSciNodeId; // The SCI-node Id of this machine (adapter 0)
236 Uint16 m_LocalSciNodeId1; // The SCI-node Id of this machine (adapter 1)
237 Uint16 m_RemoteSciNodeId; // The SCI-node Id of remote machine (adapter 0)
238 Uint16 m_RemoteSciNodeId1; // The SCI-node Id of remote machine (adapter 1)
239
240 Uint32 m_PacketSize; // The size of each data packet
241 Uint32 m_BufferSize; // Mapped SCI buffer size
242
243 Uint32 * getWritePtr(Uint32 lenBytes, Uint32 prio);
244 void updateWritePtr(Uint32 lenBytes, Uint32 prio);
245
246 /**
247 * doSend. Copies the data from the source (the send buffer) to the
248 * shared mem. segment.
249 * Sequences are used for error checking.
250 * If an error occurs, the transfer is retried.
251 * If the link that we need to swap to is broken, we will disconnect.
252 * @return Returns true if datatransfer ok. If not retriable
253 * then false is returned.
254 */
255 bool doSend();
256
257 /**
258 * @param adapterNo the adapter for which to retrieve the node id.
259 * @return Returns the node id for an adapter.
260 */
261 Uint32 getLocalNodeId(Uint32 adapterNo);
262
hasDataToRead() const263 bool hasDataToRead() const {
264 return reader->empty() == false;
265 }
266
hasDataToSend() const267 bool hasDataToSend() const {
268 return m_sendBuffer.m_dataSize > 0;
269 }
270
271 /**
272 * Make the local segment unavailable, no new connections will be accepted.
273 * @return Returns true if the segment was successfully disconnected.
274 */
275 bool disconnectLocal();
276
277 /**
278 * Make the local segment unavailable, no new connections will be accepted.
279 * @return Returns true if the segment was successfully disconnected.
280 */
281 bool disconnectRemote();
282
283 void resetToInitialState();
284
285 /**
286 * It is always possible to send data with SCI!
287 * @return True (always)
288 */
289 bool sendIsPossible(struct timeval * timeout);
290
getReceivePtr(Uint32 ** ptr,Uint32 ** eod)291 void getReceivePtr(Uint32 ** ptr, Uint32 ** eod){
292 reader->getReadPtr(* ptr, * eod);
293 }
294
updateReceivePtr(Uint32 * ptr)295 void updateReceivePtr(Uint32 *ptr){
296 reader->updateReadPtr(ptr);
297 }
298
299 /**
300 * Corresponds to SHM_Transporter::setupBuffers()
301 * Initiates the start pointer of the buffer and read pointers.
302 * Initiate the localSegment for the SHM reader.
303 */
304 void setupLocalSegment();
305
306 /**
307 * Initiate the remoteSegment for the SHM writer
308 */
309 void setupRemoteSegment();
310
311 /**
312 * Set the connect flag in the remote memory segment (write through)
313 */
314 void setConnected();
315
316 /**
317 * Set the disconnect flag in the remote memory segment (write through)
318 */
319 void setDisconnect();
320
321 /**
322 * Check if there is a link between the adapter and the switch
323 * @param adapterNo the adapter for which to retrieve the link status.
324 * @return Returns true if there is a link between adapter and switch.
325 * Otherwize false is returned and the cables must be checked.
326 */
327 bool getLinkStatus(Uint32 adapterNo);
328
329 /**
330 * failoverShmWriter takes the state of the active writer and inserts into
331 * the standby writer.
332 */
333 void failoverShmWriter();
334
335 bool init_local();
336 bool init_remote();
337
338 protected:
339
340 /** Perform a connection between segment
341 * This is a client node, trying to connect to a remote segment.
342 * @param timeout, the time the connect thread sleeps before
343 * retrying.
344 * @return Returns true on success, otherwize falser
345 */
346 bool connect_server_impl(NDB_SOCKET_TYPE sockfd);
347 bool connect_client_impl(NDB_SOCKET_TYPE sockfd);
348
349 /**
350 * We will disconnect if:
351 * -# the other node has disconnected from us
352 * -# unrecoverable error in transmission, on both adapters
353 * -# if we are shutdown properly
354 */
355 void disconnectImpl();
356
357 static bool initSCI();
358 };
359
360
361 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
362 * (SCI ids)* a unique identifier for the local segment
363 */
364 inline
365 Uint32
hostSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)366 SCI_Transporter::hostSegmentId(Uint16 SciLocalNodeId,
367 Uint16 SciRemoteNodeId) {
368
369 return (SciLocalNodeId << 16) | SciRemoteNodeId;
370 }
371
372 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
373 * (SCI ids)* a unique identifier for the remote segment
374 */
375 inline
376 Uint32
remoteSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)377 SCI_Transporter::remoteSegmentId(Uint16 SciLocalNodeId,
378 Uint16 SciRemoteNodeId) {
379
380 return (SciRemoteNodeId << 16) | SciLocalNodeId;
381 }
382
383
384 #endif
385