1 /* Copyright (c) 2003-2005, 2007 MySQL AB
2    Use is subject to license terms
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA */
16 
17 #ifndef SCI_Transporter_H
18 #define SCI_Transporter_H
19 #include "Transporter.hpp"
20 #include "SHM_Buffer.hpp"
21 
22 
23 #include <sisci_api.h>
24 #include <sisci_error.h>
25 #include <sisci_types.h>
26 
27 #include <ndb_types.h>
28 
29 /**
30  *  The SCI Transporter
31  *
32  *  The design goal of the SCI transporter is to deliver high performance
33  *  data transfers (low latency, high bandwidth) combined with very high
34  *  availability (failover support).
 *  High performance is an inherent feature of SCI, whereas failover
 *  support is implemented at the application level.
37  *  In SCI the programming model is similar to the shared memory paradigm.
 *  A process on one node (A) allocates a memory segment and imports the
 *  segment into its virtual address space. Another node (B) can connect to
40  *  the segment and map this segment into its virtual address space.
41  *  If A writes data to the segment, then B can read it and vice versa, through
 *  ordinary loads and stores. This is also called PIO (programmed I/O), and
 *  is one thing that distinguishes SCI from other interconnects such as
 *  Ethernet, Gig-E, Myrinet, and InfiniBand. By using PIO, lower network
45  *  latency is achieved, compared to the interconnects mentioned above.
46  *  In order for NDB to utilize SCI,  the SCI transporter relies on the
47  *  SISCI api. The SISCI api provides a high level abstraction to the low
48  *  level SCI driver called PCISCI driver.
49  *  The SISCI api provides functions to setup, export, and import
50  *  memory segments in a process virtual address space, and also functions to
 *  guarantee the correctness of data transfers between nodes.
52  *
53  *  In NDB Cluster, each SCI transporter creates a local segment
54  *  that is mapped into the virtual address space. After the creation of the
55  *  local segment, the SCI transporter connects to a segment created by another
 *  transporter at a remote node, and then maps the remote segment into its
57  *  virtual address space. However, since NDB Cluster relies on redundancy
58  *  at the network level, by using dual SCI adapters communication can be
59  *  maintained even if one of the adapter cards fails (or anything on the
60  *  network this adapter card exists in e.g. an SCI switch failure).
61  *
62  */
63 
64 /**
65  * class SCITransporter
66  * @brief - main class for the SCI transporter.
67  */
68 class SCI_Transporter : public Transporter {
69   friend class TransporterRegistry;
70 public:
71 
72   /**
73    * Init the transporter. Allocate sendbuffers and open a SCI virtual device
74    * for each adapter.
75    * @return true if successful, otherwize false
76    */
77   bool initTransporter();
78 
79 
80   /**
81    * Creates a sequence for error checking.
82    * @param adapterid the adapter on which to create a new sequence.
83    * @return SCI_ERR_OK if ok, otherwize something else.
84    */
85   sci_error_t createSequence(Uint32 adapterid);
86 
87 
88   /** Initiate Local Segment: create a memory segment,
89    * prepare a memory segment, map the local segment
90    * into  memory space and make segment available.
91    * @return SCI_ERR_OK if ok, otherwize something else.
92    */
93   sci_error_t initLocalSegment();
94 
95   /**
96    * Calculate the segment id for the remote segment
97    * @param localNodeId - local id (e.g. 1 = mgm , 2 = ndb.2 etc.)
98    * @param remoteNodeId - remote id (e.g. 1 = mgm , 2 = ndb.2 etc.)
99    * @return a segment id
100    */
101   Uint32  remoteSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
102 
103   // Get local segment id (inline)
104   Uint32  hostSegmentId(Uint16 localNodeId, Uint16 remoteNodeId);
105 
106   /**
107    * closeSCI closes the SCI virtual device
108    */
109   void closeSCI();
110 
111 
112   /**
113    * Check the status of the remote node,
114    * if it is connected or has disconnected
115    * @return true if connected, otherwize false.
116    */
117   bool checkConnected();
118 
119   /**
120    * Check if the segment are properly connected to each other (remotely
121    * and locally).
122    * @return True if the both the local segment is mapped and the
123    * remote segment is mapped. Otherwize false.
124    */
125   bool getConnectionStatus();
126 
127   virtual Uint32 get_free_buffer() const;
128 private:
129   SCI_Transporter(TransporterRegistry &t_reg,
130                   const char *local_host,
131                   const char *remote_host,
132                   int port,
133 		  bool isMgmConnection,
134                   Uint32 packetSize,
135 		  Uint32 bufferSize,
136 		  Uint32 nAdapters,
137 		  Uint16 remoteSciNodeId0,
138 		  Uint16 remoteSciNodeId1,
139 		  NodeId localNodeID,
140 		  NodeId remoteNodeID,
141 		  NodeId serverNodeId,
142 		  bool checksum,
143 		  bool signalId,
144 		  Uint32 reportFreq = 4096);
145 
146    /**
147    * Destructor. Disconnects the transporter.
148    */
149 	~SCI_Transporter();
150   bool m_mapped;
151   bool m_initLocal;
152   bool m_sciinit;
153   Uint32 m_failCounter;
154   /**
155    * For statistics on transfered packets
156    */
157 //#ifdef DEBUG_TRANSPORTER
158 #if 1
159   Uint32 i1024;
160   Uint32 i2048;
161   Uint32 i2049;
162   Uint32 i10242048;
163   Uint32 i20484096;
164   Uint32 i4096;
165   Uint32 i4097;
166 #endif
167 
168   volatile Uint32 * m_localStatusFlag;
169   volatile Uint32 * m_remoteStatusFlag;
170   volatile Uint32 * m_remoteStatusFlag2;
171 
172   struct {
173     Uint32 * m_buffer;       // The buffer
174     Uint32 m_dataSize;       // No of words in buffer
175     Uint32 m_sendBufferSize; // Buffer size
176     Uint32 m_forceSendLimit; // Send when buffer is this full
177   } m_sendBuffer;
178 
179   SHM_Reader * reader;
180   SHM_Writer * writer;
181   SHM_Writer * writer2;
182 
183   /**
184    * Statistics
185    */
186   Uint32 m_reportFreq;
187 
188   Uint32 m_adapters;
189   Uint32 m_numberOfRemoteNodes;
190 
191   Uint16 m_remoteNodes[2];
192 
193   typedef struct SciAdapter {
194     sci_desc_t scidesc;
195     Uint32 localSciNodeId;
196     bool linkStatus;
197   } SciAdapter;
198 
199   SciAdapter* sciAdapters;
200   Uint32 m_ActiveAdapterId;
201   Uint32 m_StandbyAdapterId;
202 
203   typedef struct sourceSegm {
204     sci_local_segment_t localHandle; // Handle to local segment to be mapped
205     struct localHandleMap {
206       sci_map_t map;                   // Handle to the new mapped segment.
207                                        // 2 = max adapters in one node
208     } lhm[2];
209 
210     volatile void *mappedMemory; // Used when reading
211   } sourceSegm;
212 
213   typedef struct targetSegm {
214     struct remoteHandleMap {
215       sci_remote_segment_t remoteHandle; //Handle to local segment to be mapped
216       sci_map_t          map;            //Handle to the new mapped segment
217     } rhm[2];
218 
219     sci_sequence_status_t m_SequenceStatus;    // Used for error checking
220     sci_sequence_t sequence;
221     volatile void * mappedMemory;              // Used when writing
222     SHM_Writer * writer;
223   } targetSegm;
224 
225   sci_sequence_status_t m_SequenceStatus;    // Used for error checking
226 
227 
228   // Shared between all SCI users  active=(either prim or second)
229   sci_desc_t     activeSCIDescriptor;
230 
231   sourceSegm*     m_SourceSegm;               // Local segment reference
232   targetSegm*     m_TargetSegm;               // Remote segment reference
233 
234   Uint32 m_LocalAdapterId;    // Adapter Id
235   Uint16 m_LocalSciNodeId;    // The SCI-node Id of this machine (adapter 0)
236   Uint16 m_LocalSciNodeId1;   // The SCI-node Id of this machine (adapter 1)
237   Uint16 m_RemoteSciNodeId;   // The SCI-node Id of remote machine (adapter 0)
238   Uint16 m_RemoteSciNodeId1;  // The SCI-node Id of remote machine (adapter 1)
239 
240   Uint32 m_PacketSize;        // The size of each data packet
241   Uint32 m_BufferSize;        // Mapped SCI buffer size
242 
243   Uint32 * getWritePtr(Uint32 lenBytes, Uint32 prio);
244   void updateWritePtr(Uint32 lenBytes, Uint32 prio);
245 
246   /**
247    * doSend. Copies the data from the source (the send buffer) to the
248    * shared mem. segment.
249    * Sequences are used for error checking.
250    * If an error occurs, the transfer is retried.
251    * If the link that we need to swap to is broken, we will disconnect.
252    * @return Returns true if datatransfer ok. If not retriable
253    * then false is returned.
254    */
255   bool doSend();
256 
257   /**
258    * @param adapterNo  the adapter for which to retrieve the node id.
259    * @return Returns the node id for an adapter.
260    */
261   Uint32 getLocalNodeId(Uint32 adapterNo);
262 
hasDataToRead() const263   bool hasDataToRead() const {
264     return reader->empty() == false;
265   }
266 
hasDataToSend() const267   bool hasDataToSend() const {
268     return m_sendBuffer.m_dataSize > 0;
269   }
270 
271   /**
272    * Make the local segment unavailable, no new connections will be accepted.
273    * @return Returns true if the segment was successfully disconnected.
274    */
275   bool disconnectLocal();
276 
277   /**
278    * Make the local segment unavailable, no new connections will be accepted.
279    * @return Returns true if the segment was successfully disconnected.
280    */
281   bool disconnectRemote();
282 
283   void resetToInitialState();
284 
285   /**
286    *  It is always possible to send data with SCI!
287    *  @return True (always)
288    */
289   bool sendIsPossible(struct timeval * timeout);
290 
getReceivePtr(Uint32 ** ptr,Uint32 ** eod)291   void getReceivePtr(Uint32 ** ptr, Uint32 ** eod){
292     reader->getReadPtr(* ptr, * eod);
293   }
294 
updateReceivePtr(Uint32 * ptr)295   void updateReceivePtr(Uint32 *ptr){
296     reader->updateReadPtr(ptr);
297   }
298 
299   /**
300    *   Corresponds to SHM_Transporter::setupBuffers()
301    *   Initiates the start pointer of the buffer and read pointers.
302    *   Initiate the localSegment for the SHM reader.
303    */
304   void setupLocalSegment();
305 
306   /**
307    *  Initiate the remoteSegment for the SHM writer
308    */
309   void setupRemoteSegment();
310 
311   /**
312    * Set the connect flag in the remote memory segment (write through)
313    */
314   void setConnected();
315 
316   /**
317    * Set the disconnect flag in the remote memory segment (write through)
318    */
319   void setDisconnect();
320 
321   /**
322    * Check if there is a link between the adapter and the switch
323    * @param adapterNo  the adapter for which to retrieve the link status.
324    * @return Returns true if there is a link between adapter and switch.
325    * Otherwize false is returned and the cables must be checked.
326    */
327   bool getLinkStatus(Uint32 adapterNo);
328 
329   /**
330    * failoverShmWriter takes the state of the active writer and inserts into
331    * the standby writer.
332    */
333   void failoverShmWriter();
334 
335   bool init_local();
336   bool init_remote();
337 
338 protected:
339 
340   /** Perform a connection between segment
341    * This is a client node, trying to connect to a remote segment.
342    * @param timeout, the time the connect thread sleeps before
343    * retrying.
344    * @return Returns true on success, otherwize falser
345    */
346   bool connect_server_impl(NDB_SOCKET_TYPE sockfd);
347   bool connect_client_impl(NDB_SOCKET_TYPE sockfd);
348 
349   /**
350    *  We will disconnect if:
351    *  -# the other node has disconnected from us
352    *  -# unrecoverable error in transmission, on both adapters
353    *  -# if we are shutdown properly
354    */
355   void disconnectImpl();
356 
357   static bool initSCI();
358 };
359 
360 
361 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
362  *  (SCI ids)* a unique identifier for the local segment
363  */
364 inline
365 Uint32
hostSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)366 SCI_Transporter::hostSegmentId(Uint16 SciLocalNodeId,
367 			       Uint16 SciRemoteNodeId) {
368 
369   return (SciLocalNodeId << 16) | SciRemoteNodeId;
370 }
371 
372 /** The theLocalAdapterId combined with the theRemoteNodeId constructs
373  *  (SCI ids)* a unique identifier for the remote segment
374  */
375 inline
376 Uint32
remoteSegmentId(Uint16 SciLocalNodeId,Uint16 SciRemoteNodeId)377 SCI_Transporter::remoteSegmentId(Uint16 SciLocalNodeId,
378 				 Uint16 SciRemoteNodeId) {
379 
380   return (SciRemoteNodeId << 16) | SciLocalNodeId;
381 }
382 
383 
384 #endif
385