1 /*
2    Copyright (C) 2003-2008 MySQL AB, 2008 Sun Microsystems, Inc.
3     All rights reserved. Use is subject to license terms.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License, version 2.0,
7    as published by the Free Software Foundation.
8 
9    This program is also distributed with certain software (including
10    but not limited to OpenSSL) that is licensed under separate terms,
11    as designated in a particular file or component or in included license
12    documentation.  The authors of MySQL hereby grant you an additional
13    permission to link the program and your derivative works with the
14    separately licensed software that they have included with MySQL.
15 
16    This program is distributed in the hope that it will be useful,
17    but WITHOUT ANY WARRANTY; without even the implied warranty of
18    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19    GNU General Public License, version 2.0, for more details.
20 
21    You should have received a copy of the GNU General Public License
22    along with this program; if not, write to the Free Software
23    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
24 */
25 
26 #ifndef SCI_Transporter_H
27 #define SCI_Transporter_H
28 #include "Transporter.hpp"
29 #include "SHM_Buffer.hpp"
30 
31 
32 #include <sisci_api.h>
33 #include <sisci_error.h>
34 #include <sisci_types.h>
35 
36 #include <ndb_types.h>
37 
38 /**
39  *  The SCI Transporter
40  *
41  *  The design goal of the SCI transporter is to deliver high performance
42  *  data transfers (low latency, high bandwidth) combined with very high
43  *  availability (failover support).
 *  High performance is an inherent feature of SCI, whereas failover
 *  support is implemented at the application level.
46  *  In SCI the programming model is similar to the shared memory paradigm.
 *  A process on one node (A) allocates a memory segment and imports the
 *  segment into its virtual address space. Another node (B) can connect to
49  *  the segment and map this segment into its virtual address space.
50  *  If A writes data to the segment, then B can read it and vice versa, through
 *  ordinary loads and stores. This is also called PIO (programmed I/O), and
 *  is one thing that distinguishes SCI from other interconnects such as
 *  Ethernet, Gig-E, Myrinet, and InfiniBand. By using PIO, lower network
54  *  latency is achieved, compared to the interconnects mentioned above.
55  *  In order for NDB to utilize SCI,  the SCI transporter relies on the
56  *  SISCI api. The SISCI api provides a high level abstraction to the low
57  *  level SCI driver called PCISCI driver.
 *  The SISCI api provides functions to set up, export, and import
 *  memory segments in a process's virtual address space, and also functions
 *  to guarantee the correctness of data transfers between nodes.
61  *
62  *  In NDB Cluster, each SCI transporter creates a local segment
63  *  that is mapped into the virtual address space. After the creation of the
64  *  local segment, the SCI transporter connects to a segment created by another
 *  transporter at a remote node, and then maps the remote segment into its
66  *  virtual address space. However, since NDB Cluster relies on redundancy
67  *  at the network level, by using dual SCI adapters communication can be
68  *  maintained even if one of the adapter cards fails (or anything on the
69  *  network this adapter card exists in e.g. an SCI switch failure).
70  *
71  */
72 
73 /**
74  * class SCITransporter
75  * @brief - main class for the SCI transporter.
76  */
77 class SCI_Transporter : public Transporter {
78   friend class TransporterRegistry;
79 public:
80 
81   /**
82    * Init the transporter.
83    * @return true if successful, otherwize false
84    */
85   bool initTransporter();
86 
87 
88   /**
89    * Creates a sequence for error checking.
90    * @param adapterid the adapter on which to create a new sequence.
91    * @return SCI_ERR_OK if ok, otherwize something else.
92    */
93   sci_error_t createSequence(Uint32 adapterid);
94 
95 
96   /** Initiate Local Segment: create a memory segment,
97    * prepare a memory segment, map the local segment
98    * into  memory space and make segment available.
99    * @return SCI_ERR_OK if ok, otherwize something else.
100    */
101   sci_error_t initLocalSegment();
102 
103   /**
104    * Calculate the segment id for the remote segment
105    * @param localNodeId - local id (e.g. 1 = mgm , 2 = ndb.2 etc.)
106    * @param remoteNodeId - remote id (e.g. 1 = mgm , 2 = ndb.2 etc.)
107    * @return a segment id
108    */
109   Uint32  remoteSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
110 
111   // Get local segment id (inline)
112   Uint32  hostSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
113 
114   /**
115    * closeSCI closes the SCI virtual device
116    */
117   void closeSCI();
118 
119 
120   /**
121    * Check the status of the remote node,
122    * if it is connected or has disconnected
123    * @return true if connected, otherwize false.
124    */
125   bool checkConnected();
126 
127   /**
128    * Check if the segment are properly connected to each other (remotely
129    * and locally).
130    * @return True if the both the local segment is mapped and the
131    * remote segment is mapped. Otherwize false.
132    */
133   bool getConnectionStatus();
134 
135 private:
136   SCI_Transporter(TransporterRegistry &t_reg,
137                   const char *local_host,
138                   const char *remote_host,
139                   int port,
140 		  bool isMgmConnection,
141                   Uint32 packetSize,
142 		  Uint32 bufferSize,
143 		  Uint32 nAdapters,
144 		  Uint16 remoteSciNodeId0,
145 		  Uint16 remoteSciNodeId1,
146 		  NodeId localNodeID,
147 		  NodeId remoteNodeID,
148 		  NodeId serverNodeId,
149 		  bool checksum,
150 		  bool signalId,
151 		  Uint32 reportFreq = 4096);
152 
153    /**
154    * Destructor. Disconnects the transporter.
155    */
156 	~SCI_Transporter();
157 
158   virtual bool configure_derived(const TransporterConfiguration* conf);
159 
160   bool m_mapped;
161   bool m_initLocal;
162   bool m_sciinit;
163   Uint32 m_failCounter;
164   /**
165    * For statistics on transfered packets
166    */
167 //#ifdef DEBUG_TRANSPORTER
168 #if 1
169   Uint32 i1024;
170   Uint32 i2048;
171   Uint32 i2049;
172   Uint32 i10242048;
173   Uint32 i20484096;
174   Uint32 i4096;
175   Uint32 i4097;
176 #endif
177 
178   volatile Uint32 * m_localStatusFlag;
179   volatile Uint32 * m_remoteStatusFlag;
180   volatile Uint32 * m_remoteStatusFlag2;
181 
182   SHM_Reader * reader;
183   SHM_Writer * writer;
184   SHM_Writer * writer2;
185 
186   /**
187    * Statistics
188    */
189   Uint32 m_reportFreq;
190 
191   Uint32 m_adapters;
192   Uint32 m_numberOfRemoteNodes;
193 
194   Uint16 m_remoteNodes[2];
195 
196   typedef struct SciAdapter {
197     sci_desc_t scidesc;
198     Uint32 localSciNodeId;
199     bool linkStatus;
200   } SciAdapter;
201 
202   SciAdapter* sciAdapters;
203   Uint32 m_ActiveAdapterId;
204   Uint32 m_StandbyAdapterId;
205 
206   typedef struct sourceSegm {
207     sci_local_segment_t localHandle; // Handle to local segment to be mapped
208     struct localHandleMap {
209       sci_map_t map;                   // Handle to the new mapped segment.
210                                        // 2 = max adapters in one node
211     } lhm[2];
212 
213     volatile void *mappedMemory; // Used when reading
214   } sourceSegm;
215 
216   typedef struct targetSegm {
217     struct remoteHandleMap {
218       sci_remote_segment_t remoteHandle; //Handle to local segment to be mapped
219       sci_map_t          map;            //Handle to the new mapped segment
220     } rhm[2];
221 
222     sci_sequence_status_t m_SequenceStatus;    // Used for error checking
223     sci_sequence_t sequence;
224     volatile void * mappedMemory;              // Used when writing
225     SHM_Writer * writer;
226   } targetSegm;
227 
228   sci_sequence_status_t m_SequenceStatus;    // Used for error checking
229 
230 
231   // Shared between all SCI users  active=(either prim or second)
232   sci_desc_t     activeSCIDescriptor;
233 
234   sourceSegm*     m_SourceSegm;               // Local segment reference
235   targetSegm*     m_TargetSegm;               // Remote segment reference
236 
237   Uint32 m_LocalAdapterId;    // Adapter Id
238   Uint16 m_LocalSciNodeId;    // The SCI-node Id of this machine (adapter 0)
239   Uint16 m_LocalSciNodeId1;   // The SCI-node Id of this machine (adapter 1)
240   Uint16 m_RemoteSciNodeId;   // The SCI-node Id of remote machine (adapter 0)
241   Uint16 m_RemoteSciNodeId1;  // The SCI-node Id of remote machine (adapter 1)
242 
243   Uint32 m_PacketSize;        // The size of each data packet
244   Uint32 m_BufferSize;        // Mapped SCI buffer size
245 
246   /**
247    * doSend. Copies the data from the source (the send buffer) to the
248    * shared mem. segment.
249    * Sequences are used for error checking.
250    * If an error occurs, the transfer is retried.
251    * If the link that we need to swap to is broken, we will disconnect.
252    * @return Returns true if datatransfer ok. If not retriable
253    * then false is returned.
254    */
255   bool doSend();
256 
257   /**
258    * @param adapterNo  the adapter for which to retrieve the node id.
259    * @return Returns the node id for an adapter.
260    */
261   Uint32 getLocalNodeId(Uint32 adapterNo);
262 
hasDataToRead() const263   bool hasDataToRead() const {
264     return reader->empty() == false;
265   }
266 
267   /**
268    * Make the local segment unavailable, no new connections will be accepted.
269    * @return Returns true if the segment was successfully disconnected.
270    */
271   bool disconnectLocal();
272 
273   /**
274    * Make the local segment unavailable, no new connections will be accepted.
275    * @return Returns true if the segment was successfully disconnected.
276    */
277   bool disconnectRemote();
278 
279   void resetToInitialState();
280 
getReceivePtr(Uint32 ** ptr,Uint32 ** eod)281   void getReceivePtr(Uint32 ** ptr, Uint32 ** eod){
282     reader->getReadPtr(* ptr, * eod);
283   }
284 
updateReceivePtr(Uint32 * ptr)285   void updateReceivePtr(Uint32 *ptr){
286     reader->updateReadPtr(ptr);
287   }
288 
289   /**
290    *   Corresponds to SHM_Transporter::setupBuffers()
291    *   Initiates the start pointer of the buffer and read pointers.
292    *   Initiate the localSegment for the SHM reader.
293    */
294   void setupLocalSegment();
295 
296   /**
297    *  Initiate the remoteSegment for the SHM writer
298    */
299   void setupRemoteSegment();
300 
301   /**
302    * Set the connect flag in the remote memory segment (write through)
303    */
304   void setConnected();
305 
306   /**
307    * Set the disconnect flag in the remote memory segment (write through)
308    */
309   void setDisconnect();
310 
311   /**
312    * Check if there is a link between the adapter and the switch
313    * @param adapterNo  the adapter for which to retrieve the link status.
314    * @return Returns true if there is a link between adapter and switch.
315    * Otherwize false is returned and the cables must be checked.
316    */
317   bool getLinkStatus(Uint32 adapterNo);
318 
319   /**
320    * failoverShmWriter takes the state of the active writer and inserts into
321    * the standby writer.
322    */
323   void failoverShmWriter();
324 
325   bool init_local();
326   bool init_remote();
327 
send_limit_reached(int bufsize)328   bool send_limit_reached(int bufsize) { return (bufsize > m_PacketSize); }
send_is_possible(int timeout_millisec) const329   bool send_is_possible(int timeout_millisec) const { return 1; }
330 
331 protected:
332 
333   /** Perform a connection between segment
334    * This is a client node, trying to connect to a remote segment.
335    * @param timeout, the time the connect thread sleeps before
336    * retrying.
337    * @return Returns true on success, otherwize falser
338    */
339   bool connect_server_impl(NDB_SOCKET_TYPE sockfd);
340   bool connect_client_impl(NDB_SOCKET_TYPE sockfd);
341 
342   /**
343    *  We will disconnect if:
344    *  -# the other node has disconnected from us
345    *  -# unrecoverable error in transmission, on both adapters
346    *  -# if we are shutdown properly
347    */
348   void disconnectImpl();
349 
350   static bool initSCI();
351 };
352 
353 
354 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
355  *  (SCI ids)* a unique identifier for the local segment
356  */
357 inline
358 Uint32
hostSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)359 SCI_Transporter::hostSegmentId(Uint16 SciLocalNodeId,
360 			       Uint16 SciRemoteNodeId) {
361 
362   return (SciLocalNodeId << 16) | SciRemoteNodeId;
363 }
364 
365 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
366  *  (SCI ids)* a unique identifier for the remote segment
367  */
368 inline
369 Uint32
remoteSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)370 SCI_Transporter::remoteSegmentId(Uint16 SciLocalNodeId,
371 				 Uint16 SciRemoteNodeId) {
372 
373   return (SciRemoteNodeId << 16) | SciLocalNodeId;
374 }
375 
376 
377 #endif
378