/* Copyright (c) 2008, 2019, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation.  The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */

#include <ndb_global.h>

#define NDBD_MULTITHREADED

#include <VMSignal.hpp>
#include <kernel_types.h>
#include <Prio.hpp>
#include <SignalLoggerManager.hpp>
#include <SimulatedBlock.hpp>
#include <ErrorHandlingMacros.hpp>
#include <GlobalData.hpp>
#include <WatchDog.hpp>
#include <TransporterDefinitions.hpp>
#include <TransporterRegistry.hpp>
#include "FastScheduler.hpp"
#include "mt.hpp"
#include <DebuggerNames.hpp>
#include <signaldata/StopForCrash.hpp>
#include "TransporterCallbackKernel.hpp"
#include <NdbSleep.h>
#include <NdbGetRUsage.h>
#include <portlib/ndb_prefetch.h>
#include <blocks/pgman.hpp>
#include <blocks/thrman.hpp>
#include <Pool.hpp>
#include <NdbSpin.h>

#include "mt-asm.h"
#include "mt-lock.hpp"

#include "ThreadConfig.hpp"
#include <signaldata/StartOrd.hpp>

#include <NdbTick.h>
#include <NdbMutex.h>
#include <NdbCondition.h>
#include <ErrorReporter.hpp>
#include <EventLogger.hpp>

extern EventLogger * g_eventLogger;

#if (defined(VM_TRACE) || defined(ERROR_INSERT))
//#define DEBUG_MULTI_TRP 1
#endif

#ifdef DEBUG_MULTI_TRP
#define DEB_MULTI_TRP(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_MULTI_TRP(arglist) do { } while (0)
#endif

/**
 * Two manual (recompile) error-injections in mt.cpp :
 *
 *     NDB_BAD_SEND : Causes send buffer code to mess with a byte in a send buffer
 *     NDB_LUMPY_SEND : Causes transporters to be given small, oddly aligned and
 *                      sized IOVECs to send, testing the ability of new and existing
 *                      code to handle this.
 *
 *   These are useful for testing the correctness of the new code, and
 *   the resulting behaviour / debugging output.
 */
//#define NDB_BAD_SEND
//#define NDB_LUMPY_SEND

/**
 * Number indicating that the trp has no current sender thread.
 *
 * trp is used as a short form of transporter in quite a few places.
 * Originally there was a one-to-one mapping from node to transporter
 * and vice versa. Now several transporters can be used to connect to
 * one node, and thus we work with transporters and not with nodes in
 * most places used for communication.
 */
#define NO_OWNER_THREAD 0xFFFF

static void dumpJobQueues(void);

inline
SimulatedBlock*
GlobalData::mt_getBlock(BlockNumber blockNo, Uint32 instanceNo)
{
  SimulatedBlock* b = getBlock(blockNo);
  if (b != 0 && instanceNo != 0)
    b = b->getInstance(instanceNo);
  return b;
}

#ifdef __GNUC__
/* Provides a small (but noticeable) speedup in benchmarks. */
#define memcpy __builtin_memcpy
#endif

/* Constants found by benchmarks to be reasonable values. */

/*
 * Max. signals to execute from one job buffer before considering other
 * possible stuff to do.
 */
static const Uint32 MAX_SIGNALS_PER_JB = 75;

/**
 * Max signals written to other thread before calling flush_jbb_write_state
 */
static const Uint32 MAX_SIGNALS_BEFORE_FLUSH_RECEIVER = 2;
static const Uint32 MAX_SIGNALS_BEFORE_FLUSH_OTHER = 20;
static const Uint32 MAX_SIGNALS_BEFORE_WAKEUP = 128;

//#define NDB_MT_LOCK_TO_CPU

/* If this is too small it crashes before first signal. */
#define MAX_INSTANCES_PER_THREAD (16 + 8 * MAX_NDBMT_LQH_THREADS)

static Uint32 glob_num_threads = 0;
static Uint32 glob_num_tc_threads = 1;
static Uint32 first_receiver_thread_no = 0;
static Uint32 max_send_delay = 0;
static Uint32 glob_wakeup_latency = 25;

#define NO_SEND_THREAD (MAX_BLOCK_THREADS + MAX_NDBMT_SEND_THREADS + 1)

/* max signal is 32 words, 7 for signal header and 25 data words */
#define MAX_SIGNAL_SIZE 32
#define MIN_SIGNALS_PER_PAGE (thr_job_buffer::SIZE / MAX_SIGNAL_SIZE) //255

#if defined(HAVE_LINUX_FUTEX) && defined(NDB_HAVE_XCNG)
#define USE_FUTEX
#endif

#ifdef USE_FUTEX
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#define FUTEX_WAIT              0
#define FUTEX_WAKE              1
#define FUTEX_FD                2
#define FUTEX_REQUEUE           3
#define FUTEX_CMP_REQUEUE       4
#define FUTEX_WAKE_OP           5

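/*
 * Thin wrappers around the Linux futex(2) system call: futex_wait() puts
 * the caller to sleep only while *addr still equals 'val' (the kernel
 * re-checks this atomically before sleeping), and futex_wake() wakes at
 * most one thread currently waiting on 'addr'.
 */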
static inline
int
futex_wait(volatile unsigned * addr, int val, const struct timespec * timeout)
{
  return syscall(SYS_futex,
                 addr, FUTEX_WAIT, val, timeout, 0, 0) == 0 ? 0 : errno;
}

static inline
int
futex_wake(volatile unsigned * addr)
{
  return syscall(SYS_futex, addr, FUTEX_WAKE, 1, 0, 0, 0) == 0 ? 0 : errno;
}

struct alignas(NDB_CL) thr_wait
{
  volatile unsigned m_futex_state;
  enum {
    FS_RUNNING = 0,
    FS_SLEEPING = 1
  };
  thr_wait() {
    assert((sizeof(*this) % NDB_CL) == 0); //Maintain any CL-alignment
    xcng(&m_futex_state, FS_RUNNING);
  }
  void init () {}
};

/**
 * Sleep until woken up or timeout occurs.
 *
 * Will call check_callback(check_arg) after proper synchronisation, and only
 * if that returns true will it actually sleep, else it will return
 * immediately. This is needed to avoid races with wakeup.
 *
 * Returns 'true' if it actually did sleep.
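 *
 * A rough usage sketch (the callback name is illustrative only):
 *
 *   static bool still_nothing_to_do(thr_data* thr);
 *   ...
 *   if (yield(&selfptr->m_waiter, nsec, still_nothing_to_do, selfptr))
 *   {
 *     // we actually slept
 *   }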
 */
template<typename T>
static inline
bool
yield(struct thr_wait* wait, const Uint32 nsec,
      bool (*check_callback)(T*), T* check_arg)
{
  volatile unsigned * val = &wait->m_futex_state;
#ifndef NDEBUG
  int old =
#endif
    xcng(val, thr_wait::FS_SLEEPING);
  assert(old == thr_wait::FS_RUNNING);

  /**
   * At this point, we need to re-check the condition that made us decide to
   * sleep, and skip sleeping if it changed.
   *
   * Otherwise, the condition may not have changed, and the thread making the
   * change may have already decided not to wake us, as our state was
   * FS_RUNNING at the time.
   *
   * We also need a memory barrier to ensure this extra check is race-free,
   * but that is already provided by xcng.
   */
  const bool waited = (*check_callback)(check_arg);
  if (waited)
  {
    struct timespec timeout;
    timeout.tv_sec = 0;
    timeout.tv_nsec = nsec;
    futex_wait(val, thr_wait::FS_SLEEPING, &timeout);
    /**
     * Any spurious wakeups are handled by simply running the scheduler code.
     * The check_callback is needed to ensure that we don't miss wakeups, and
     * a spurious wakeup only costs one extra loop in the scheduler, compared
     * to the cost of always scanning through the buffers to check the
     * condition.
     */
  }
  xcng(val, thr_wait::FS_RUNNING);
  return waited;
}

static inline
int
wakeup(struct thr_wait* wait)
{
  volatile unsigned * val = &wait->m_futex_state;
  /**
   * We must ensure that any state updates (new data in buffers...) are
   * visible to the other thread before we can look at the sleep state of
   * that other thread.
   */
  if (xcng(val, thr_wait::FS_RUNNING) == thr_wait::FS_SLEEPING)
  {
    return futex_wake(val);
  }
  return 0;
}

static inline
int
try_wakeup(struct thr_wait* wait)
{
  return wakeup(wait);
}
#else

struct alignas(NDB_CL) thr_wait
{
  NdbMutex *m_mutex;
  NdbCondition *m_cond;
  bool m_need_wakeup;
  thr_wait() : m_mutex(0), m_cond(0), m_need_wakeup(false) {
    assert((sizeof(*this) % NDB_CL) == 0); //Maintain any CL-alignment
  }

  void init() {
    m_mutex = NdbMutex_Create();
    m_cond = NdbCondition_Create();
  }
};

template<typename T>
static inline
bool
yield(struct thr_wait* wait, const Uint32 nsec,
      bool (*check_callback)(T*), T* check_arg)
{
  struct timespec end;
  NdbCondition_ComputeAbsTime(&end, (nsec >= 1000000) ? nsec/1000000 : 1);
  NdbMutex_Lock(wait->m_mutex);

  /**
   * Any spurious wakeups are handled by simply running the scheduler code.
   * The check_callback is needed to ensure that we don't miss wakeups, and
   * a spurious wakeup only costs one extra loop in the scheduler, compared
   * to the cost of always scanning through the buffers to check the
   * condition.
   */
  Uint32 waits = 0;
  if ((*check_callback)(check_arg))
  {
    wait->m_need_wakeup = true;
    waits++;
    if (NdbCondition_WaitTimeoutAbs(wait->m_cond,
                                    wait->m_mutex, &end) == ETIMEDOUT)
    {
      wait->m_need_wakeup = false;
    }
  }
  NdbMutex_Unlock(wait->m_mutex);
  return (waits > 0);
}


static inline
int
try_wakeup(struct thr_wait* wait)
{
  int success = NdbMutex_Trylock(wait->m_mutex);
  if (success != 0)
    return success;

  // We should avoid signaling when not waiting for wakeup
  if (wait->m_need_wakeup)
  {
    wait->m_need_wakeup = false;
    NdbCondition_Signal(wait->m_cond);
  }
  NdbMutex_Unlock(wait->m_mutex);
  return 0;
}

static inline
int
wakeup(struct thr_wait* wait)
{
  NdbMutex_Lock(wait->m_mutex);
  // We should avoid signaling when not waiting for wakeup
  if (wait->m_need_wakeup)
  {
    wait->m_need_wakeup = false;
    NdbCondition_Signal(wait->m_cond);
  }
  NdbMutex_Unlock(wait->m_mutex);
  return 0;
}

#endif

#define JAM_FILE_ID 236


/**
 * thr_safe_pool
 */
template<typename T>
struct alignas(NDB_CL) thr_safe_pool
{
  struct alignas(NDB_CL) thr_safe_pool_lock
  {
    struct thr_spin_lock m_lock;

    T* m_free_list;
    Uint32 m_cnt;
    bool m_used_all_reserved;
  };
  thr_safe_pool_lock m_safe_lock[MAX_NDBMT_SEND_THREADS];
  struct thr_spin_lock m_alloc_lock;
  Uint32 m_allocated;

  thr_safe_pool(const char * name)
  {
    m_allocated = 0;
    for (Uint32 i = 0; i < MAX_NDBMT_SEND_THREADS; i++)
    {
      char buf[100];
      m_safe_lock[i].m_free_list = 0;
      m_safe_lock[i].m_cnt = 0;
      m_safe_lock[i].m_used_all_reserved = false;
      BaseString::snprintf(buf, sizeof(buf), "Global_%s[%u]", name, i);
      register_lock(&m_safe_lock[i].m_lock, buf);
    }
    {
      char buf[100];
      BaseString::snprintf(buf, sizeof(buf), "Global_allocated%s", name);
      register_lock(&m_alloc_lock, buf);
    }
    assert((sizeof(*this) % NDB_CL) == 0); //Maintain any CL-alignment
  }

  T* seize(Ndbd_mem_manager *mm,
           Uint32 rg)
  {
    /* This function is used by job buffer allocation. */
    Uint32 instance_no = 0;
    thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance_no];
    T* ret = 0;
    lock(&lock_ptr->m_lock);
    if (lock_ptr->m_free_list)
    {
      assert(lock_ptr->m_cnt);
      lock_ptr->m_cnt--;
      ret = lock_ptr->m_free_list;
      lock_ptr->m_free_list = ret->m_next;
      unlock(&lock_ptr->m_lock);
    }
    else
    {
      unlock(&lock_ptr->m_lock);
      Uint32 dummy;
      ret = reinterpret_cast<T*>
        (mm->alloc_page(rg, &dummy,
                        Ndbd_mem_manager::NDB_ZONE_LE_32));
      // ToDo: How to deal with failed allocation?!?
      // I think in this case we need to start grabbing buffers kept for signal
      // trace.
      if (ret != NULL)
      {
        lock(&m_alloc_lock);
        m_allocated++;
        unlock(&m_alloc_lock);
      }
    }
    return ret;
  }

#define RG_REQUIRED_PAGES 96
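  /**
   * found_instance() reports whether send-thread instance 'instance' has
   * more than RG_REQUIRED_PAGES pages on its free list. If it does not,
   * the instance with the largest free count seen so far is remembered
   * in max_found / instance_no, so the caller can fall back to the least
   * empty instance.
   */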
  bool found_instance(Uint32 instance,
                      Uint32 & max_found,
                      Uint32 & instance_no)
  {
    thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance];
    Uint32 cnt = lock_ptr->m_cnt;
    if (cnt > RG_REQUIRED_PAGES)
    {
      return true;
    }
    if (cnt > max_found)
    {
      instance_no = instance;
      max_found = cnt;
    }
    return false;
  }

  Uint32 get_least_empty_instance(Uint32 skip_instance)
  {
    /**
     * Read without mutex protection since it is ok to not get a perfect
     * result.
     */
    Uint32 instance_no_found = 0;
    Uint32 cnt_found = 0;
    for (Uint32 i = skip_instance + 1;
                i < globalData.ndbMtSendThreads;
                i++)
    {
      if (found_instance(i,
                         cnt_found,
                         instance_no_found))
        return i;
    }
    for (Uint32 i = 0; i < skip_instance; i++)
    {
      if (found_instance(i,
                         cnt_found,
                         instance_no_found))
        return i;
    }
    return instance_no_found;
  }

  Uint32 seize_list(Ndbd_mem_manager *mm,
                    Uint32 rg,
                    Uint32 requested,
                    T** head,
                    T** tail,
                    Uint32 instance_no,
                    bool first_call)
  {
    /* This function is used by send buffer allocation. */
    assert(instance_no < MAX_NDBMT_SEND_THREADS);
    thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance_no];
    lock(&lock_ptr->m_lock);
    if (unlikely(lock_ptr->m_cnt == 0))
    {
      unlock(&lock_ptr->m_lock);
      if (likely(first_call))
      {
        /**
         * No free pages in this instance. We will use the following order
         * of allocation.
         *
         * Case 1: Either no send thread or only one send thread
         * => Call alloc_page and set use_max_part to true.
         * If this fails we fail the call.
         *
         * Case 2: At least 2 send threads
         * In this case we will first try to allocate from the memory
         * manager. But this first call only retrieves from the reserved
         * part. If we already allocated all from the reserved part we
         * will skip this call.
         * Next we will check which instance is the least empty of the
         * instances. We will try allocating from this instance. The
         * purpose of this is to avoid allocating beyond the reserved
         * part as long as possible.
         * If this call fails as well we will make another call to
         * alloc_page. This time we will also allow allocations beyond
         * the reserved part.
         * If even this fails we will go through the other instances to
         * see if we can get pages from any instance. Only when this
         * fails as well will we return no pages found.
         */
        Uint32 filled_instance_no = 0;
        for (Uint32 step = 0; step < 2; step++)
        {
          Uint32 dummy;
          bool locked = false;
          bool use_max_part = (globalData.ndbMtSendThreads < 2 ||
                               step == 1);
          if (use_max_part || !lock_ptr->m_used_all_reserved)
          {
            T* ret = reinterpret_cast<T*>
              (mm->alloc_page(rg,
                              &dummy,
                              Ndbd_mem_manager::NDB_ZONE_LE_32,
                              locked,
                              use_max_part));
            if (ret != 0)
            {
              ret->m_next = 0;
              * head = * tail = ret;
              if (ret != NULL)
              {
                lock(&m_alloc_lock);
                m_allocated++;
                unlock(&m_alloc_lock);
              }
              return 1;
            }
            /**
             * This will only transition from false to true, so no need
             * to protect it with mutex.
             */
            lock_ptr->m_used_all_reserved = true;
          }
          /**
           * No more memory available from global memory, let's see if we
           * can steal some memory from a neighbour instance.
           *
           * This is the call from the local pool, we want to avoid
           * failing this call since it means we are announcing that we
           * are out of memory. Try all the other instances before we
           * move on to requesting memory from the global pool of memory.
           * We first attempt with the most filled instance, we find this
           * without acquiring any mutex.
           */
          if (globalData.ndbMtSendThreads < 2)
          {
            return 0;
          }
          if (step == 0)
          {
            filled_instance_no = get_least_empty_instance(instance_no);
            Uint32 returned = seize_list(mm,
                                         rg,
                                         requested,
                                         head,
                                         tail,
                                         filled_instance_no,
                                         false);
            if (likely(returned > 0))
            {
              return returned;
            }
          }
          else
          {
            for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
            {
              if (i != instance_no &&
                  i != filled_instance_no)
              {
                Uint32 returned = seize_list(mm,
                                             rg,
                                             requested,
                                             head,
                                             tail,
                                             i,
                                             false);
                if (returned != 0)
                {
                  ndbout_c("seize_list: returns %u from instance %u",
                           returned,
                           i);
                  return returned;
                }
              }
            }
          }
        }
        return 0;
      }
      else
      {
        return 0;
      }
    }
    else
    {
      if (lock_ptr->m_cnt < requested )
        requested = lock_ptr->m_cnt;

      T* first = lock_ptr->m_free_list;
      T* last = first;
      for (Uint32 i = 1; i < requested; i++)
      {
        last = last->m_next;
      }
      lock_ptr->m_cnt -= requested;
      lock_ptr->m_free_list = last->m_next;
      unlock(&lock_ptr->m_lock);
      last->m_next = 0;
      * head = first;
      * tail = last;
      return requested;
    }
  }

  void release(Ndbd_mem_manager *mm,
               Uint32 rg,
               T* t)
  {
    /* This function is used by job buffer release. */
    Uint32 instance_no = 0;
    thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance_no];
    lock(&lock_ptr->m_lock);
    t->m_next = lock_ptr->m_free_list;
    lock_ptr->m_free_list = t;
    lock_ptr->m_cnt++;
    unlock(&lock_ptr->m_lock);
  }

  void release_list(Ndbd_mem_manager *mm,
                    Uint32 rg,
                    T* head,
                    T* tail,
                    Uint32 cnt,
                    Uint32 instance_no)
  {
    /* This function is used by send buffer release. */
    assert(instance_no < MAX_NDBMT_SEND_THREADS);
    Uint32 used_instance_no = instance_no;
    thr_safe_pool_lock *lock_ptr = &m_safe_lock[used_instance_no];
    lock(&lock_ptr->m_lock);
    tail->m_next = lock_ptr->m_free_list;
    lock_ptr->m_free_list = head;
    lock_ptr->m_cnt += cnt;
    unlock(&lock_ptr->m_lock);
  }
};

/**
 * thread_local_pool
 */
template<typename T>
class thread_local_pool
{
public:
  thread_local_pool(thr_safe_pool<T> *global_pool,
                    unsigned max_free, unsigned alloc_size = 1) :
    m_max_free(max_free),
    m_alloc_size(alloc_size),
    m_free(0),
    m_freelist(0),
    m_global_pool(global_pool)
  {
  }

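  /**
   * Take one page from the thread-local free list. If the local list is
   * empty, refill it with up to m_alloc_size pages from the global pool
   * via seize_list() and hand out the first of them.
   */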
  T *seize(Ndbd_mem_manager *mm,
           Uint32 rg,
           Uint32 instance_no)
  {
    T *tmp = m_freelist;
    if (tmp == 0)
    {
      T * tail;
      m_free = m_global_pool->seize_list(mm,
                                         rg,
                                         m_alloc_size,
                                         &tmp,
                                         &tail,
                                         instance_no,
                                         true);
    }
    if (tmp)
    {
      m_freelist = tmp->m_next;
      assert(m_free > 0);
      m_free--;
    }

    validate();
    return tmp;
  }

  /**
   * Release to local pool even if it gets "too" full
   *   (wrt m_max_free)
   */
  void release_local(T *t)
  {
    m_free++;
    t->m_next = m_freelist;
    m_freelist = t;

    validate();
  }

  void validate() const
  {
#ifdef VM_TRACE
    Uint32 cnt = 0;
    T* t = m_freelist;
    while (t)
    {
      cnt++;
      t = t->m_next;
    }
    assert(cnt == m_free);
#endif
  }

  /**
   * Release entries so that m_max_free is honored
   *   (likely used together with release_local)
   */
  void release_global(Ndbd_mem_manager *mm,
                      Uint32 rg,
                      Uint32 instance_no)
  {
    validate();
    unsigned free = m_free;
    Uint32 maxfree = m_max_free;
    assert(maxfree > 0);

    if (unlikely(free > maxfree))
    {
      T* head = m_freelist;
      T* tail = m_freelist;
      unsigned cnt = 1;
      free--;

      while (free > maxfree)
      {
        cnt++;
        free--;
        tail = tail->m_next;
      }

      assert(free == maxfree);

      m_free = free;
      m_freelist = tail->m_next;
      m_global_pool->release_list(mm,
                                  rg,
                                  head,
                                  tail,
                                  cnt,
                                  instance_no);
    }
    validate();
  }

  void release_all(Ndbd_mem_manager *mm,
                   Uint32 rg,
                   Uint32 instance_no)
  {
    validate();
    T* head = m_freelist;
    T* tail = m_freelist;
    if (tail)
    {
      unsigned cnt = 1;
      while (tail->m_next != 0)
      {
        cnt++;
        tail = tail->m_next;
      }
      m_global_pool->release_list(mm,
                                  rg,
                                  head,
                                  tail,
                                  cnt,
                                  instance_no);
      m_free = 0;
      m_freelist = 0;
    }
    validate();
  }

  /**
   * release everything if more than m_max_free
   *   else do nothing
   */
  void release_chunk(Ndbd_mem_manager *mm,
                     Uint32 rg,
                     Uint32 instance_no)
  {
    if (m_free > m_max_free)
    {
      release_all(mm, rg, instance_no);
    }
  }

  /**
   * prealloc up to <em>cnt</em> pages into this pool
   */
  bool fill(Ndbd_mem_manager *mm,
            Uint32 rg,
            Uint32 cnt,
            Uint32 instance_no)
  {
    if (m_free >= cnt)
    {
      return true;
    }

    T *head, *tail;
    Uint32 allocated = m_global_pool->seize_list(mm,
                                                 rg,
                                                 m_alloc_size,
                                                 &head,
                                                 &tail,
                                                 instance_no,
                                                 true);
    if (allocated)
    {
      tail->m_next = m_freelist;
      m_freelist = head;
      m_free += allocated;
      return m_free >= cnt;
    }

    return false;
  }

  void set_pool(thr_safe_pool<T> * pool) { m_global_pool = pool; }

private:
  const unsigned m_max_free;
  const unsigned m_alloc_size;
  unsigned m_free;
  T *m_freelist;
  thr_safe_pool<T> *m_global_pool;
};

/**
 * Signal buffers.
 *
 * Each thread job queue contains a list of these buffers with signals.
 *
 * There is an underlying assumption that the size of this structure is the
 * same as the global memory manager page size.
 */
struct thr_job_buffer // 32k
{
  static const unsigned SIZE = 8190;

  /*
   * Amount of signal data currently in m_data buffer.
   * Read/written by producer, read by consumer.
   */
  Uint32 m_len;
  /*
   * Whether this buffer contained prio A or prio B signals, used when dumping
   * signals from released buffers.
   */
  Uint32 m_prioa;
  union {
    Uint32 m_data[SIZE];

    thr_job_buffer * m_next; // For free-list
  };
};

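/**
 * Number of used slots in a circular FIFO of size 'sz' with read index
 * 'ri' and write index 'wi'. For example ri = 30, wi = 2, sz = 32 gives
 * (32 - 30) + 2 = 4 slots in use.
 */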
static
inline
Uint32
calc_fifo_used(Uint32 ri, Uint32 wi, Uint32 sz)
{
  return (wi >= ri) ? wi - ri : (sz - ri) + wi;
}

/**
 * thr_job_queue is shared between consumer / producer.
 *
 * The hot-spots of the thr_job_queue are the read/write indexes.
 * As they are updated and read frequently they have been placed
 * in their own thr_job_queue_head[] in order to make them fit inside a
 * single/few cache lines and thereby avoid complete L1-cache replacement
 * every time the job_queue is scanned.
 */
struct thr_job_queue_head
{
  unsigned m_read_index;  // Read/written by consumer, read by producer
  unsigned m_write_index; // Read/written by producer, read by consumer

  /**
   * Waiter object: In case the job queue is full, the producing thread
   * will 'yield' on this waiter object until the consumer thread
   * has consumed (at least) a job buffer.
   */
  thr_wait m_waiter;

  Uint32 used() const;
};

struct thr_job_queue
{
  static const unsigned SIZE = 32;

  /**
   * There is a SAFETY limit on free buffers we never allocate,
   * but may allow these to be implicitly used as a last resort
   * when job scheduler is really stuck. ('sleeploop 10')
   */
  static const unsigned SAFETY = 2;

  /**
   * Some more free buffers are RESERVED to be used to avoid
   * or resolve circular wait-locks between threads waiting
   * for buffers to become available.
   */
  static const unsigned RESERVED = 4;

  /**
   * When free buffer count drops below ALMOST_FULL, we
   * are allowed to start using RESERVED buffers to prevent
   * circular wait-locks.
   */
  static const unsigned ALMOST_FULL = RESERVED + 2;
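
  /**
   * With the values above this means: of the SIZE = 32 slots, the last
   * SAFETY = 2 are only used as a last resort, RESERVED = 4 more are kept
   * to break circular waits, and those RESERVED buffers may be taken into
   * use once the free count drops below ALMOST_FULL (= 6).
   */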

  struct thr_job_buffer* m_buffers[SIZE];
};

inline
Uint32
thr_job_queue_head::used() const
{
  return calc_fifo_used(m_read_index, m_write_index, thr_job_queue::SIZE);
}

/*
 * Two structures tightly associated with thr_job_queue.
 *
 * There will generally be exactly one thr_jb_read_state and one
 * thr_jb_write_state associated with each thr_job_queue.
 *
 * The reason they are kept separate is to avoid unnecessary inter-CPU
 * cache line pollution. All fields shared among producer and consumer
 * threads are in thr_job_queue, thr_jb_write_state fields are only
 * accessed by the producer thread(s), and thr_jb_read_state fields are
 * only accessed by the consumer thread.
 *
 * For example, on Intel core 2 quad processors, there is a ~33%
 * penalty for two cores accessing the same 64-byte cacheline.
 */
struct thr_jb_write_state
{
  /*
   * The position to insert the next signal into the queue.
   *
   * m_write_index is the index into thr_job_queue::m_buffers[] of the buffer
   * to insert into, and m_write_pos is the index into thr_job_buffer::m_data[]
   * at which to store the next signal.
   */
  Uint32 m_write_index;
  Uint32 m_write_pos;

  /* Thread-local copy of thr_job_queue::m_buffers[m_write_index]. */
  thr_job_buffer *m_write_buffer;

  /**
    Number of signals inserted since last flush to thr_job_queue.
    This variable stores the number of pending signals not yet flushed
    in the lower 16 bits, and in the upper 16 bits the number of pending
    signals for which the other side has not yet been woken up. To
    simplify the code we implement the bit manipulations in the
    methods below.

    The reason for this optimisation is to minimise use of memory for
    these variables as they are likely to consume CPU cache memory.
    It also speeds up some pending signal checks.
  */
  Uint32 m_pending_signals;

  bool has_any_pending_signals() const
  {
    return m_pending_signals;
  }
  Uint32 get_pending_signals() const
  {
    return (m_pending_signals & 0xFFFF);
  }
  Uint32 get_pending_signals_wakeup() const
  {
    return (m_pending_signals >> 16);
  }
  void clear_pending_signals_and_set_wakeup(Uint32 wakeups)
  {
    m_pending_signals = (wakeups << 16);
  }
  void increment_pending_signals()
  {
    m_pending_signals++;
  }
  void init_pending_signals()
  {
    m_pending_signals = 0;
  }

  /*
   * Is this job buffer open for communication at all?
   * Several threads are not expected to communicate, and thus do
   * not allocate thr_job_buffer for exchange of signals.
   * Don't access any job_buffers without ensuring 'is_open()==true'.
   */
  bool is_open() const
  {
    return (m_write_buffer != NULL);
  }
};

/**
 * Identify type of thread.
 * Based on assumption that threads are allocated in the order:
 *  main, ldm, tc, recv, send
 */
static bool
is_main_thread(unsigned thr_no)
{
  return thr_no < NUM_MAIN_THREADS;
}

static bool
is_ldm_thread(unsigned thr_no)
{
  return thr_no >= NUM_MAIN_THREADS &&
         thr_no <  NUM_MAIN_THREADS+globalData.ndbMtLqhThreads;
}

/**
 * Not all LDM threads are created equal:
 * The first LDM's BACKUP thread acts as a client during BACKUP
 * (See usage of Backup::UserBackupInstanceKey)
 */
static bool
is_first_ldm_thread(unsigned thr_no)
{
  return thr_no == NUM_MAIN_THREADS;
}

static bool
is_tc_thread(unsigned thr_no)
{
  unsigned tc_base = NUM_MAIN_THREADS+globalData.ndbMtLqhThreads;
  return thr_no >= tc_base &&
         thr_no <  tc_base+globalData.ndbMtTcThreads;
}

static bool
is_recv_thread(unsigned thr_no)
{
  unsigned recv_base = NUM_MAIN_THREADS +
                       globalData.ndbMtLqhThreads +
                       globalData.ndbMtTcThreads;
  return thr_no >= recv_base &&
         thr_no <  recv_base+globalData.ndbMtReceiveThreads;
}

/*
 * This structure is also used when dumping signal traces, to dump executed
 * signals from the buffer(s) currently being processed.
 */
struct thr_jb_read_state
{
  /*
   * Index into thr_job_queue::m_buffers[] of the buffer that we are currently
   * executing signals from.
   */
  Uint32 m_read_index;
  /*
   * Index into m_read_buffer->m_data[] of the next signal to execute from the
   * current buffer.
   */
  Uint32 m_read_pos;
  /*
   * Thread local copy of thr_job_queue::m_buffers[m_read_index].
   */
  thr_job_buffer *m_read_buffer;
  /*
   * These are thread-local copies of thr_job_queue::m_write_index and
   * thr_job_buffer::m_len. They are read once at the start of the signal
   * execution loop and used to determine when the end of available signals is
   * reached.
   */
  Uint32 m_read_end;    // End within current thr_job_buffer. (*m_read_buffer)

  Uint32 m_write_index; // Last available thr_job_buffer.

  /*
   * Is this job buffer open for communication at all?
   * Several threads are not expected to communicate, and thus do
   * not allocate thr_job_buffer for exchange of signals.
   * Don't access any job_buffers without ensuring 'is_open()==true'.
   */
  bool is_open() const
  {
    return (m_read_buffer != NULL);
  }

  bool is_empty() const
  {
    assert(m_read_index != m_write_index  ||  m_read_pos <= m_read_end);
    return (m_read_index == m_write_index) && (m_read_pos >= m_read_end);
  }
};

/**
 * time-queue
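 *
 * Holds signals sent with a delay until their delay has expired. The
 * signal data itself is stored in the m_delayed_signals[] pages, while
 * the m_zero_queue, m_short_queue and m_long_queue arrays order the
 * entries by length of the delay; m_cnt[] holds the current number of
 * entries in each of the three queues.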
 */
struct thr_tq
{
  static const unsigned ZQ_SIZE = 256;
  static const unsigned SQ_SIZE = 512;
  static const unsigned LQ_SIZE = 512;
  static const unsigned PAGES = (MAX_SIGNAL_SIZE *
                                (ZQ_SIZE + SQ_SIZE + LQ_SIZE)) / 8192;

  Uint32 * m_delayed_signals[PAGES];
  Uint32 m_next_free;
  Uint32 m_next_timer;
  Uint32 m_current_time;
  Uint32 m_cnt[3];
  Uint32 m_zero_queue[ZQ_SIZE];
  Uint32 m_short_queue[SQ_SIZE];
  Uint32 m_long_queue[LQ_SIZE];
};

/**
 * THR_SEND_BUFFER_ALLOC_SIZE is the number of 32k pages allocated
 * when we allocate pages from the global pool of send buffers to
 * the thread_local_pool (which is local to a thread).
 *
 * We allocate a bunch to decrease contention on the send-buffer-pool mutex.
 */
#define THR_SEND_BUFFER_ALLOC_SIZE 32

/**
 * THR_SEND_BUFFER_PRE_ALLOC is the amount of 32k pages that are
 *   allocated before we start to run signals
 */
#define THR_SEND_BUFFER_PRE_ALLOC 32

/**
 * Number of pages that are allowed to linger in a
 * thread-local send-buffer pool
 */
#define THR_SEND_BUFFER_MAX_FREE \
  (THR_SEND_BUFFER_ALLOC_SIZE + THR_SEND_BUFFER_PRE_ALLOC - 1)
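
/**
 * With the values above, a thread-local send buffer pool may thus keep
 * up to 32 + 32 - 1 = 63 pages (roughly 2MB of 32k pages) before pages
 * are returned to the global pool.
 */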

/*
 * Max number of thread-local job buffers to keep before releasing to
 * global pool.
 */
#define THR_FREE_BUF_MAX 32
/* Minimum number of buffers (to ensure useful trace dumps). */
#define THR_FREE_BUF_MIN 12
/*
 * 1/THR_FREE_BUF_BATCH is the fraction of job buffers to allocate/free
 * at a time from/to global pool.
 */
#define THR_FREE_BUF_BATCH 6
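/**
 * With THR_FREE_BUF_MAX = 32 this gives batches of roughly 32 / 6 = 5
 * job buffers moved between the thread-local fifo and the global pool
 * at a time.
 */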

/**
 * a page with send data
 */
struct thr_send_page
{
  static const Uint32 PGSIZE = 32768;
#if SIZEOF_CHARP == 4
  static const Uint32 HEADER_SIZE = 8;
#else
  static const Uint32 HEADER_SIZE = 12;
#endif

  static Uint32 max_bytes() {
    return PGSIZE - offsetof(thr_send_page, m_data);
  }

  /* Next page */
  thr_send_page* m_next;

  /* Bytes of send data available in this page. */
  Uint16 m_bytes;

  /* Start of unsent data */
  Uint16 m_start;

  /* Data; real size is to the end of one page. */
  char m_data[2];
};

/**
 * a linked list with thr_send_page
 */
struct thr_send_buffer
{
  thr_send_page* m_first_page;
  thr_send_page* m_last_page;
};

/**
 * a ring buffer with linked list of thr_send_page
 */
struct thr_send_queue
{
  unsigned m_write_index;
#if SIZEOF_CHARP == 8
  unsigned m_unused;
  thr_send_page* m_buffers[7];
  static const unsigned SIZE = 7;
#else
  thr_send_page* m_buffers[15];
  static const unsigned SIZE = 15;
#endif
};

struct thr_send_thread_instance;

struct alignas(NDB_CL) thr_data
{
  thr_data() : m_jba_write_lock("jbalock"),
               m_signal_id_counter(0),
               m_send_buffer_pool(0,
                                  THR_SEND_BUFFER_MAX_FREE,
                                  THR_SEND_BUFFER_ALLOC_SIZE)
#if defined(USE_INIT_GLOBAL_VARIABLES)
               ,m_global_variables_ptr_instances(0)
               ,m_global_variables_uint32_ptr_instances(0)
               ,m_global_variables_uint32_instances(0)
               ,m_global_variables_enabled(true)
#endif
  {

    // Check cacheline alignment
    assert((((UintPtr)this) % NDB_CL) == 0);
    assert((((UintPtr)&m_waiter) % NDB_CL) == 0);
    assert((((UintPtr)&m_jba_write_lock) % NDB_CL) == 0);
    assert((((UintPtr)&m_jba) % NDB_CL) == 0);
    assert((((UintPtr)m_in_queue_head) % NDB_CL) == 0);
    assert((((UintPtr)m_in_queue) % NDB_CL) == 0);
  }

  /**
   * We start with the data structures that are shared globally to
   * ensure that they get the proper cache line alignment
   */
  thr_wait m_waiter; /* Cacheline aligned */

  /*
   * Prio A signal incoming queue. This area is used from many threads
   * protected by the spin lock. Thus it is also important to protect
   * surrounding thread-local variables from CPU cache line sharing
   * with this part.
   */
  alignas(NDB_CL) struct thr_spin_lock m_jba_write_lock;
  alignas(NDB_CL) struct thr_job_queue m_jba;
  struct thr_job_queue_head m_jba_head;

  /*
   * These are the thread input queues, where other threads deliver signals
   * into.
   * These cache lines are going to be updated by many different CPUs
   * all the time, whereas the neighbouring variables are thread-local.
   * Avoid false cacheline sharing by requiring an alignment.
   */
  alignas(NDB_CL) struct thr_job_queue_head m_in_queue_head[MAX_BLOCK_THREADS];
  alignas(NDB_CL) struct thr_job_queue m_in_queue[MAX_BLOCK_THREADS];

  /**
   * The remainder of the variables in thr_data are thread-local,
   * meaning that they are always updated by the thread that owns those
   * data structures and thus those variables aren't shared with other
   * CPUs.
   */

  unsigned m_thr_no;

  /**
   * Thread 0 doesn't necessarily handle all threads in a loop.
   * This variable keeps track of which to handle next.
   */
  unsigned m_next_jbb_no;

  /**
   * Spin time of thread after completing all its work (in microseconds).
   * We won't go to sleep until we have spun for sufficient time; the aim
   * is to increase readiness in systems with plenty of CPU resources.
   */
  unsigned m_spintime;
  unsigned m_conf_spintime;

  /**
   * The nosend option on a thread means that it will never assist with
   * sending.
   */
  unsigned m_nosend;

  /**
   * Realtime scheduler activated for this thread. This means this
   * thread will run at a very high priority even beyond the priority
   * of the OS.
   */
  unsigned m_realtime;

  /**
   * Index of thread locally in Configuration.cpp
   */
  unsigned m_thr_index;

  /**
   * max signals to execute per JBB buffer
   */
  unsigned m_max_signals_per_jb;

  /**
   * This state shows how much assistance we are to provide to the
   * send threads in sending. At OVERLOAD we provide no assistance,
   * at MEDIUM we take care of our own generated sends, and
   * at LIGHT we also provide some assistance to other threads.
   */
  OverloadStatus m_overload_status;

  /**
   * This is the wakeup instance that we currently use; if 0 it
   * means that we don't wake any other block thread up to
   * assist in sending. This is a simple way of using idle
   * block threads to act as send threads instead of simply
   * being idle. In particular this is often used for the main
   * thread and the rep thread.
   */
  Uint32 m_wakeup_instance;

  /**
   * This variable keeps track of when we last woke up another thread
   * to assist the send thread. We use other timeout calls for this.
   */
  NDB_TICKS m_last_wakeup_idle_thread;

  /**
   * We also keep track of the node state; it is in overload state
   * if any thread is in OVERLOAD state. In this state we will
   * sleep shorter times and be more active in waking up to
   * assist the send threads.
   */
  OverloadStatus m_node_overload_status;

  /**
   * Extra JBB signal execute quota allowed to be used to
   * drain (almost) full in-buffers. Reserved for usage where
   * we are about to end up in a circular wait-lock between
   * threads where none of them will be able to proceed.
   */
  unsigned m_max_extra_signals;

  /**
   * max signals to execute before recomputing m_max_signals_per_jb
   */
  unsigned m_max_exec_signals;

  /**
   * Flag indicating that we have sent a local Prio A signal. Used to know
   * whether to scan for more prio A signals after executing those signals.
   * This is used to ensure that if we execute at prio A level and send a
   * prio A signal it will be immediately executed (or at least before any
   * prio B signal).
   */
  bool m_sent_local_prioa_signal;

  /* Last read of current ticks */
  NDB_TICKS m_curr_ticks;

  NDB_TICKS m_ticks;
  struct thr_tq m_tq;

  /**
   * If the thread overslept it is interesting to see how much time was
   * actually spent on executing and how much time was idle time. This
   * helps to see whether oversleeping is due to long-running signals or
   * to the OS not scheduling the thread.
   *
   * We keep the real time of the last scan of the time queues to ensure
   * we can report proper things in warning messages.
   */
  NDB_TICKS m_scan_real_ticks;
  struct ndb_rusage m_scan_time_queue_rusage;

  /*
   * In m_next_buffer we keep a free buffer at all times, so that when
   * we hold the lock and find we need a new buffer, we can use this and this
   * way defer allocation to after releasing the lock.
   */
  struct thr_job_buffer* m_next_buffer;

  /*
   * We keep a small number of buffers in a thread-local cyclic FIFO, so that
   * we can avoid going to the global pool in most cases, and so that we have
   * recent buffers available for dumping in trace files.
   */
  struct thr_job_buffer *m_free_fifo[THR_FREE_BUF_MAX];
  /* m_first_free is the index of the entry to return next from seize(). */
  Uint32 m_first_free;
  /* m_first_unused is the first unused entry in m_free_fifo. */
  Uint32 m_first_unused;


  /* Thread-local read state of prio A buffer. */
  struct thr_jb_read_state m_jba_read_state;

  /*
   * There is no m_jba_write_state, as we have multiple writers to the prio A
   * queue, so local state becomes invalid as soon as we release the lock.
   */

  /* These are the write states of m_in_queue[self] in each thread. */
  struct thr_jb_write_state m_write_states[MAX_BLOCK_THREADS];
  /* These are the read states of all of our own m_in_queue[]. */
  struct thr_jb_read_state m_read_states[MAX_BLOCK_THREADS];

  /* Jam buffers for making trace files at crashes. */
  EmulatedJamBuffer m_jam;
  /* Watchdog counter for this thread. */
  Uint32 m_watchdog_counter;
  /* Latest executed signal id assigned in this thread */
  Uint32 m_signal_id_counter;

  struct thr_send_thread_instance *m_send_instance;
  Uint32 m_send_instance_no;

  /* Signal delivery statistics. */
  struct
  {
    Uint64 m_loop_cnt;
    Uint64 m_exec_cnt;
    Uint64 m_wait_cnt;
    Uint64 m_prioa_count;
    Uint64 m_prioa_size;
    Uint64 m_priob_count;
    Uint64 m_priob_size;
  } m_stat;

  struct
  {
    Uint32 m_sleep_longer_spin_time;
    Uint32 m_sleep_shorter_spin_time;
    Uint32 m_num_waits;
    Uint32 m_micros_sleep_times[NUM_SPIN_INTERVALS];
    Uint32 m_spin_interval[NUM_SPIN_INTERVALS];
  } m_spin_stat;

  Uint64 m_micros_send;
  Uint64 m_micros_sleep;
  Uint64 m_buffer_full_micros_sleep;
  Uint64 m_measured_spintime;

  /* Array of trp ids with pending remote send data. */
  TrpId m_pending_send_trps[MAX_NTRANSPORTERS];
  /* Number of trp ids in m_pending_send_trps. */
  Uint32 m_pending_send_count;

  /**
   * Bitmap of trp ids with pending send data.
   * Used to quickly check if a trp id is already in m_pending_send_trps.
   */
  Bitmask<(MAX_NTRANSPORTERS+31)/32> m_pending_send_mask;

  /* pool for send buffers */
  class thread_local_pool<thr_send_page> m_send_buffer_pool;

  /* Send buffers for this thread; these are not touched by any other thread */
  struct thr_send_buffer m_send_buffers[MAX_NTRANSPORTERS];

  /* Block instances (main and worker) handled by this thread. */
  /* Used for sendpacked (send-at-job-buffer-end). */
  Uint32 m_instance_count;
  BlockNumber m_instance_list[MAX_INSTANCES_PER_THREAD];

  SectionSegmentPool::Cache m_sectionPoolCache;

  Uint32 m_cpu;
  my_thread_t m_thr_id;
  NdbThread* m_thread;
  Signal *m_signal;
  Uint32 m_sched_responsiveness;
  Uint32 m_max_signals_before_send;
  Uint32 m_max_signals_before_send_flush;

#ifdef ERROR_INSERT
  bool m_delayed_prepare;
#endif

#if defined (USE_INIT_GLOBAL_VARIABLES)
  Uint32 m_global_variables_ptr_instances;
  Uint32 m_global_variables_uint32_ptr_instances;
  Uint32 m_global_variables_uint32_instances;
  bool m_global_variables_enabled;
  void* m_global_variables_ptrs[1024];
  void* m_global_variables_uint32_ptrs[1024];
  void* m_global_variables_uint32[1024];
#endif
};

struct mt_send_handle  : public TransporterSendBufferHandle
{
  struct thr_data * m_selfptr;
  mt_send_handle(thr_data* ptr) : m_selfptr(ptr) {}
  virtual ~mt_send_handle() {}

  virtual Uint32 *getWritePtr(NodeId nodeId,
                              TrpId trp_id,
                              Uint32 len,
                              Uint32 prio,
                              Uint32 max,
                              SendStatus *error);
  virtual Uint32 updateWritePtr(NodeId nodeId,
                                TrpId trp_id,
                                Uint32 lenBytes,
                                Uint32 prio);
  virtual void getSendBufferLevel(NodeId node_id, SB_LevelType &level);
  virtual bool forceSend(NodeId, TrpId);
};

struct trp_callback : public TransporterCallback
{
  trp_callback() {}

  /* Callback interface. */
  void enable_send_buffer(NodeId, TrpId);
  void disable_send_buffer(NodeId, TrpId);

  void reportSendLen(NodeId nodeId, Uint32 count, Uint64 bytes);
  void lock_transporter(NodeId, TrpId);
  void unlock_transporter(NodeId, TrpId);
  void lock_send_transporter(NodeId, TrpId);
  void unlock_send_transporter(NodeId, TrpId);
  Uint32 get_bytes_to_send_iovec(NodeId nodeId,
                                 TrpId trp_id,
                                 struct iovec *dst,
                                 Uint32 max);
  Uint32 bytes_sent(NodeId, TrpId, Uint32 bytes);
};

static char *g_thr_repository_mem = NULL;
static struct thr_repository *g_thr_repository = NULL;

struct thr_repository
{
  thr_repository() :
      m_section_lock("sectionlock"),
      m_mem_manager_lock("memmanagerlock"),
      m_jb_pool("jobbufferpool"),
      m_sb_pool("sendbufferpool")
  {
    // Verify assumed cacheline alignment
    assert((((UintPtr)this) % NDB_CL) == 0);
    assert((((UintPtr)&m_receive_lock) % NDB_CL) == 0);
    assert((((UintPtr)&m_section_lock) % NDB_CL) == 0);
    assert((((UintPtr)&m_mem_manager_lock) % NDB_CL) == 0);
    assert((((UintPtr)&m_jb_pool) % NDB_CL) == 0);
    assert((((UintPtr)&m_sb_pool) % NDB_CL) == 0);
    assert((((UintPtr)m_thread) % NDB_CL) == 0);
    assert((sizeof(m_receive_lock[0]) % NDB_CL) == 0);
  }

  /**
   * m_receive_lock, m_section_lock, m_mem_manager_lock, m_jb_pool
   * and m_sb_pool are all variables globally shared among the threads
   * and also heavily updated.
   * Requiring alignment avoids false cache line sharing.
   */
  thr_aligned_spin_lock m_receive_lock[MAX_NDBMT_RECEIVE_THREADS];

  alignas(NDB_CL) struct thr_spin_lock m_section_lock;
  alignas(NDB_CL) struct thr_spin_lock m_mem_manager_lock;
  alignas(NDB_CL) struct thr_safe_pool<thr_job_buffer> m_jb_pool;
  alignas(NDB_CL) struct thr_safe_pool<thr_send_page> m_sb_pool;

  /* m_mm and m_thread_count are globally shared and read only variables */
  Ndbd_mem_manager * m_mm;
  unsigned m_thread_count;

  /**
   * Protect m_mm and m_thread_count from CPU cache misses; the first
   * part of m_thread (struct thr_data) consists of globally shared
   * variables, so sharing a cache line with those is not a good idea
   * for these read-only variables.
   */
  alignas(NDB_CL) struct thr_data m_thread[MAX_BLOCK_THREADS];
1614 
1615   /* The buffers that are to be sent */
1616   struct send_buffer
1617   {
1618     /**
1619      * In order to reduce lock contention while
1620      * adding job buffer pages to the send buffers,
1621      * and sending these with the help of the send
1622      * transporters, there are two different
1623      * thr_send_buffer's. Each protected by its own lock:
1624      *
1625      * - m_buffer / m_buffer_lock:
1626      *   Send buffer pages from all threads are linked into
1627      *   the m_buffer when collected by link_thread_send_buffers().
1628      *
1629      * - m_sending / m_send_lock:
1630      *   Before send buffers are given to the send-transporter,
1631      *   they are moved from m_buffer -> m_sending by
1632      *   get_bytes_to_send_iovec(). (Req. both locks.)
1633      *   When transporter has consumed some/all of m_sending
1634      *   buffers, ::bytes_sent() will update m_sending accordingly.
1635      *
1636      * If both locks are required, grab the m_send_lock first.
1637      * Release m_buffer_lock before releasing m_send_lock.
1638      */
1639     struct thr_spin_lock m_buffer_lock; //Protect m_buffer
1640     struct thr_send_buffer m_buffer;
1641 
1642     struct thr_spin_lock m_send_lock;   //Protect m_sending + transporter
1643     struct thr_send_buffer m_sending;
1644 
1645     /* Size of resp. 'm_buffer' and 'm_sending' buffered data */
1646     Uint64 m_buffered_size;             //Protected by m_buffer_lock
1647     Uint64 m_sending_size;              //Protected by m_send_lock
1648 
1649     bool m_enabled;                     //Protected by m_send_lock
1650 
1651     /**
1652      * Flag used to coordinate sending to same remote trp from different
1653      * threads when there are contention on m_send_lock.
1654      *
1655      * If two threads need to send to the same trp at the same time, the
1656      * second thread, rather than wait for the first to finish, will just
1657      * set this flag. The first thread will will then take responsibility
1658      * for sending to this trp when done with its own sending.
1659      */
1660     Uint32 m_force_send;   //Check after release of m_send_lock
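    /**
     * Illustrative sketch (not part of the build): the coordination
     * pattern described above in simplified form, with 'sb' a pointer
     * to this send_buffer. The real handling lives in the send code
     * further down in this file.
     *
     *   if (trylock(&sb->m_send_lock) != 0)
     *   {
     *     sb->m_force_send = 1;   // Lock busy, ask the owner to send for us
     *   }
     *   else
     *   {
     *     sb->m_force_send = 0;
     *     // ...perform the send...
     *     unlock(&sb->m_send_lock);
     *     if (sb->m_force_send)   // Someone requested more while we sent
     *     {
     *       // Try again (or hand over to a send thread)
     *     }
     *   }
     */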
1661 
1662     /**
1663      * Which thread is currently holding the m_send_lock.
1664      * This is the thr_no of the sending thread, which can be either a
1665      * send thread or a block thread. Send threads start their
1666      * thr_no at glob_num_threads, so it is easy to check this
1667      * thr_no to see if it is a block thread or a send thread.
1668      * This variable is used to find the proper place to return
1669      * the send buffer pages after completing the send.
1670      */
1671     Uint32 m_send_thread;  //Protected by m_send_lock
1672 
1673     /**
1674      * Bytes sent in last performSend().
1675      */
1676     Uint32 m_bytes_sent;
1677 
1678     /* read index(es) in thr_send_queue */
1679     Uint32 m_read_index[MAX_BLOCK_THREADS];
1680   } m_send_buffers[MAX_NTRANSPORTERS];
1681 
1682   /* The buffers published by threads */
1683   thr_send_queue m_thread_send_buffers[MAX_NTRANSPORTERS][MAX_BLOCK_THREADS];
1684 
1685   /*
1686    * These are used to synchronize during crash / trace dumps.
1687    *
1688    */
1689   NdbMutex stop_for_crash_mutex;
1690   NdbCondition stop_for_crash_cond;
1691   Uint32 stopped_threads;
1692 };
1693 
1694 /**
1695  *  Class to handle send threads
1696  *  ----------------------------
1697  *  We can have up to 8 send threads.
1698  *
1699  *  This class handles the case where a block thread needs to send,
1700  *  manages the running of the send threads and is also responsible
1701  *  for starting them.
1702  */
1703 #define is_send_thread(thr_no) (thr_no >= glob_num_threads)
1704 
1705 struct thr_send_thread_instance
1706 {
1707   thr_send_thread_instance() :
1708                m_instance_no(0),
1709                m_watchdog_counter(0),
1710                m_thr_index(0),
1711                m_thread(NULL),
1712                m_waiter_struct(),
1713                m_send_buffer_pool(0,
1714                                   THR_SEND_BUFFER_MAX_FREE,
1715                                   THR_SEND_BUFFER_ALLOC_SIZE),
1716                m_exec_time(0),
1717                m_sleep_time(0),
1718                m_user_time_os(0),
1719                m_kernel_time_os(0),
1720                m_elapsed_time_os(0),
1721                m_measured_spintime(0),
1722                m_awake(FALSE),
1723                m_first_trp(0),
1724                m_last_trp(0),
1725                m_next_is_high_prio_trp(false),
1726                m_more_trps(false),
1727                m_num_neighbour_trps(0),
1728                m_neighbour_trp_index(0)
1729   {}
1730 
1731   /**
1732    * Instance number of send thread, this is set at creation of
1733    * send thread and after that not changed, so no need to protect
1734    * it when reading it.
1735    */
1736   Uint32 m_instance_no;
1737 
1738   /**
1739    * This variable is registered in the watchdog; it is set by the
1740    * send thread and reset every now and then by the watchdog thread.
1741    * No special protection is required when setting it.
1742    */
1743   Uint32 m_watchdog_counter;
1744 
1745   /**
1746    * Thread index of send thread in data node, this variable is
1747    * currently not used.
1748    */
1749   Uint32 m_thr_index;
1750   NdbThread *m_thread;
1751 
1752   /**
1753    * Variable controlling send thread sleep and wakefulness; it is
1754    * used in calls to wake up the thread.
1755    */
1756   thr_wait m_waiter_struct;
1757 
1758   class thread_local_pool<thr_send_page> m_send_buffer_pool;
1759 
1760   /**
1761    * The below variables are protected by the send_thread_mutex.
1762    * Each send thread takes care of a subset of the transporters
1763    * in the data node. The function deciding which send thread
1764    * instance is responsible is simply the transporter id modulo the
1765    * number of send thread instances, possibly extended with a simple
1766    * hash function to make it less likely that some regularity in
1767    * node ids creates unnecessary bottlenecks (see the sketch below).
1768    *
1769    * The neighbour list below only contains the neighbour transporters
1770    * that this send thread is responsible for.
1771    */
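  /**
   * Illustrative sketch (not part of the build): a minimal version of
   * the transporter-to-send-thread mapping described above, assuming a
   * plain modulo without any extra hash step. The real assignment is
   * done by assign_trps_to_send_threads() further down in this file.
   *
   *   Uint32 instance_for_trp(TrpId trp_id)
   *   {
   *     return trp_id % globalData.ndbMtSendThreads;
   *   }
   */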
1772 
1773   /**
1774    * Statistical variables that track send thread CPU usage. They are
1775    * reported through getSendPerformanceTimers, which is used by the
1776    * THRMAN block to track CPU usage in send threads and also to
1777    * report data on send threads in ndbinfo tables. The data is
1778    * used in the adaptive send thread control performed by
1779    * THRMAN.
1780    */
1781   Uint64 m_exec_time;
1782   Uint64 m_sleep_time;
1783   Uint64 m_user_time_os;
1784   Uint64 m_kernel_time_os;
1785   Uint64 m_elapsed_time_os;
1786   Uint64 m_measured_spintime;
1787 
1788   /**
1789    * Boolean indicating if send thread is awake or not.
1790    */
1791   Uint32 m_awake;
1792 
1793   /* First trp that has data to be sent */
1794   Uint32 m_first_trp;
1795 
1796   /* Last trp in list of trps with data available for sending */
1797   Uint32 m_last_trp;
1798 
1799   /* Which list should I get trp from next time. */
1800   bool m_next_is_high_prio_trp;
1801 
1802   /* 'true': More trps became available -> Need recheck ::get_trp() */
1803   bool m_more_trps;
1804 
1805 #define MAX_NEIGHBOURS (3 * MAX_NODE_GROUP_TRANSPORTERS)
1806   Uint32 m_num_neighbour_trps;
1807   Uint32 m_neighbour_trp_index;
1808   Uint32 m_neighbour_trps[MAX_NEIGHBOURS];
1809 
1810   /**
1811    * Mutex protecting the linked list of trps awaiting sending
1812    * and also the m_awake variable of the send thread. This
1813    * includes the neighbour transporters listed above.
1814    *
1815    * In addition the statistical variables listed above.
1816    *
1817    * Finally it also protects the data for transporters handled by this
1818    * send thread in the m_trp_state array (the thr_send_trps struct).
1819    */
1820   NdbMutex *send_thread_mutex;
1821 
1822   /**
1823    * Check if a trp possibly has data ready to be sent.
1824    * Upon 'true', the caller should grab send_thread_mutex and
1825    * try get_trp() while holding the lock.
1826    */
1827   bool data_available() const
1828   {
1829     rmb();
1830     return (m_more_trps == TRUE);
1831   }
1832 
1833   bool check_pending_data()
1834   {
1835     return m_more_trps;
1836   }
1837 };
1838 
1839 struct thr_send_trps
1840 {
1841   /**
1842    * 'm_next' implements a list of 'send_trps' with PENDING
1843    * data, not yet assigned to a send thread. 0 means NULL.
1844    */
1845   Uint16 m_next;
1846 
1847   /**
1848    * m_data_available is incremented/decremented by each
1849    * party having data to be sent to this specific trp.
1850    * It works in conjunction with a queue of get'able trps
1851    * (insert_trp(), get_trp()) waiting to be served by
1852    * the send threads, such that:
1853    *
1854    * 1) IDLE-state (m_data_available==0, not in list)
1855    *    There are no data available for sending, and
1856    *    no send threads are assigned to this trp.
1857    *
1858    * 2) PENDING-state (m_data_available>0, in list)
1859    *    There are data available for sending, possibly
1860    *    supplied by multiple parties. No send threads
1861    *    are currently serving this request.
1862    *
1863    * 3) ACTIVE-state (m_data_available==1, not in list)
1864    *    There are data available for sending, possibly
1865    *    supplied by multiple parties, which are currently
1866    *    being served by a send thread. All known
1867    *    data available at the time when we became 'ACTIVE'
1868    *    will be served now ( -> '==1')
1869    *
1870    * 3b ACTIVE-WITH-PENDING-state (m_data_available>1, not in list)
1871    *    Variant of above state, send thread is serving requests,
1872    *    and even more data became available since we started.
1873    *
1874    * Allowed state transitions are:
1875    *
1876    * IDLE     -> PENDING  (alert_send_thread w/ insert_trp)
1877    * PENDING  -> ACTIVE   (get_trp)
1878    * ACTIVE   -> IDLE     (run_send_thread if check_done_trp)
1879    * ACTIVE   -> PENDING  (run_send_thread if 'more')
1880    * ACTIVE   -> ACTIVE-P (alert_send_thread while ACTIVE)
1881    * ACTIVE-P -> PENDING  (run_send_thread while not check_done_trp)
1882    * ACTIVE-P -> ACTIVE-P (alert_send_thread while ACTIVE-P)
1883    *
1884    * A consequence of this is that only a (single) ACTIVE
1885    * send thread will serve send requests to a specific trp.
1886    * Thus, there will be no contention on the m_send_lock
1887    * caused by the send threads.
1888    */
1889   Uint16 m_data_available;
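  /**
   * Illustrative sketch (not part of the build): how the states above
   * can be derived from m_data_available and list membership. 'in_list'
   * is a hypothetical flag; in the real code it corresponds to the trp
   * being reachable via m_first_trp / m_next.
   *
   *   IDLE:            m_data_available == 0 && !in_list
   *   PENDING:         m_data_available >  0 &&  in_list
   *   ACTIVE:          m_data_available == 1 && !in_list
   *   ACTIVE-PENDING:  m_data_available >  1 && !in_list
   */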
1890 
1891   /**
1892    * This variable shows which thread is currently sending to this trp.
1893    * It is reset again immediately after sending is completed.
1894    * It is used to ensure that neighbour trps aren't taken out for
1895    * sending by more than one thread. The neighbour list is simply
1896    * an array of the neighbours and we will send if data is available
1897    * to send AND no one else is sending, which is checked by looking
1898    * at this variable.
1899    */
1900   Uint16 m_thr_no_sender;
1901 
1902   /* Send to this trp has caused a Transporter overload */
1903   Uint16 m_send_overload;
1904 
1905   /**
1906    * This trp is a neighbour in the same node group as ourselves. This
1907    * means that we are likely to communicate with this trp more heavily
1908    * than with other trps. Also, delays in this communication will make
1909    * updates take much longer since updates have to traverse this link and
1910    * the corresponding link back 6 times as part of an updating transaction.
1911    *
1912    * Thus for good performance of updates it is essential to prioritise this
1913    * link a bit.
1914    */
1915   bool m_neighbour_trp;
1916 
1917   /**
1918    * Further sending to this trp should be delayed until
1919    * 'm_micros_delayed' has passed since 'm_inserted_time'.
1920    */
1921   Uint32 m_micros_delayed;
1922   NDB_TICKS m_inserted_time;
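  /**
   * Illustrative sketch (not part of the build): the delay is considered
   * expired once
   *
   *   NdbTick_Elapsed(m_inserted_time, now).microSec() >= m_micros_delayed
   *
   * which is what check_delay_expired() below evaluates, with extra care
   * for timers that appear to have moved backwards.
   */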
1923 
1924   /**
1925    * Counter of how many overload situations we experienced towards this
1926    * trp. We keep track of this to get an idea whether the config setup
1927    * is incorrect somehow; one should consider increasing TCP_SND_BUF_SIZE
1928    * if this counter is incremented often. It is an indication that a
1929    * bigger buffer is needed to handle the bandwidth-delay product of the
1930    * node communication.
1931    */
1932   Uint64 m_overload_counter;
1933 };
1934 
1935 class thr_send_threads
1936 {
1937 public:
1938   /* Create send thread environment */
1939   thr_send_threads();
1940 
1941   /* Destroy send thread environment and ensure threads are stopped */
1942   ~thr_send_threads();
1943 
1944   struct thr_send_thread_instance* get_send_thread_instance_by_num(Uint32);
1945   /**
1946    * A block thread provides assistance to send thread by executing send
1947    * to one of the trps.
1948    */
1949   bool assist_send_thread(Uint32 max_num_trps,
1950                           Uint32 thr_no,
1951                           NDB_TICKS now,
1952                           Uint32 &watchdog_counter,
1953                struct thr_send_thread_instance *send_instance,
1954                class thread_local_pool<thr_send_page>  & send_buffer_pool);
1955 
1956   /* Send thread method to send to a transporter picked by get_trp */
1957   bool handle_send_trp(TrpId id,
1958                        Uint32 & num_trp_sent,
1959                        Uint32 thr_no,
1960                        NDB_TICKS & now,
1961                        Uint32 & watchdog_counter,
1962                        struct thr_send_thread_instance *send_instance);
1963 
1964   /* A block thread has flushed data for a trp and wants it sent */
1965   Uint32 alert_send_thread(TrpId trp_id,
1966                            NDB_TICKS now,
1967                            struct thr_send_thread_instance* send_instance);
1968 
1969   /* Method used to run the send thread */
1970   void run_send_thread(Uint32 instance_no);
1971 
1972   /* Method to assign the base transporter to send threads */
1973   void assign_trps_to_send_threads();
1974 
1975   /* Method to assign the multi transporter to send threads */
1976   void assign_multi_trps_to_send_threads();
1977 
1978   /* Method to assign the block threads to assist send threads */
1979   void assign_threads_to_assist_send_threads();
1980 
1981   /* Method to start the send threads */
1982   void start_send_threads();
1983 
1984   /* Get send buffer pool for send thread */
1985   thread_local_pool<thr_send_page>* get_send_buffer_pool(Uint32 thr_no)
1986   {
1987     return &m_send_threads[thr_no - glob_num_threads].m_send_buffer_pool;
1988   }
1989 
1990   void wake_my_send_thread_if_needed(TrpId *trp_id_array,
1991                                      Uint32 count,
1992                    struct thr_send_thread_instance *my_send_instance);
1993   Uint32 get_send_instance(TrpId trp_id);
1994 private:
1995   struct thr_send_thread_instance* get_send_thread_instance_by_trp(TrpId);
1996 
1997   /* Insert a trp in list of trps that has data available to send */
1998   void insert_trp(TrpId trp_id, struct thr_send_thread_instance*);
1999 
2000   /* Get a trp id in order to send to it */
2001   TrpId get_trp(Uint32 instance_no,
2002                  NDB_TICKS now,
2003                  struct thr_send_thread_instance* send_instance);
2004 
2005   /* Update rusage parameters for send thread. */
2006   void update_rusage(struct thr_send_thread_instance *this_send_thread,
2007                      Uint64 elapsed_time);
2008 
2009   /**
2010    * Set of utility methods to aid in scheduling of send work:
2011    *
2012    * Further sending to trp can be delayed
2013    * until 'now+delay'. Used either to wait for more packets
2014    * to be available for bigger chunks, or to wait for an overload
2015    * situation to clear.
2016    */
2017   void set_max_delay(TrpId trp_id, NDB_TICKS now, Uint32 delay_usec);
2018   void set_overload_delay(TrpId trp_id, NDB_TICKS now, Uint32 delay_usec);
2019   Uint32 check_delay_expired(TrpId trp_id, NDB_TICKS now);
2020 
2021   /* Completed sending data to this trp, check if more work pending. */
2022   bool check_done_trp(TrpId trp_id);
2023 
2024   /* Get a send thread which isn't awake currently */
2025   struct thr_send_thread_instance* get_not_awake_send_thread(
2026                  TrpId trp_id,
2027                  struct thr_send_thread_instance *send_instance);
2028 
2029   /* Try to lock send_buffer for this trp. */
2030   static
2031   int trylock_send_trp(TrpId trp_id);
2032 
2033   /* Perform the actual send to the trp, release send_buffer lock.
2034    * Return 'true' if there are still more to be sent to this trp.
2035    */
2036   static
2037   bool perform_send(TrpId trp_id, Uint32 thr_no, Uint32& bytes_sent);
2038 
2039   /* Have threads been started */
2040   Uint32 m_started_threads;
2041 
2042   OverloadStatus m_node_overload_status;
2043 
2044   /* Is data available and next reference for each trp in cluster */
2045   struct thr_send_trps m_trp_state[MAX_NTRANSPORTERS];
2046 
2047   /**
2048    * Very few compilers (gcc) allow zero-length arrays
2049    */
2050 #if MAX_NDBMT_SEND_THREADS == 0
2051 #define _MAX_SEND_THREADS 1
2052 #else
2053 #define _MAX_SEND_THREADS MAX_NDBMT_SEND_THREADS
2054 #endif
2055 
2056   /* Data and state for the send threads */
2057   Uint32 m_num_trps;
2058   Uint32 m_next_send_thread_instance_by_trp;
2059   struct thr_send_thread_instance m_send_threads[_MAX_SEND_THREADS];
2060   Uint16 m_send_thread_instance_by_trp[MAX_NTRANSPORTERS];
2061 
2062 public:
2063 
2064   void getSendPerformanceTimers(Uint32 send_instance,
2065                                 Uint64 & exec_time,
2066                                 Uint64 & sleep_time,
2067                                 Uint64 & spin_time,
2068                                 Uint64 & user_time_os,
2069                                 Uint64 & kernel_time_os,
2070                                 Uint64 & elapsed_time_os)
2071   {
2072     require(send_instance < globalData.ndbMtSendThreads);
2073     NdbMutex_Lock(m_send_threads[send_instance].send_thread_mutex);
2074     exec_time = m_send_threads[send_instance].m_exec_time;
2075     sleep_time = m_send_threads[send_instance].m_sleep_time;
2076     spin_time = m_send_threads[send_instance].m_measured_spintime;
2077     user_time_os= m_send_threads[send_instance].m_user_time_os;
2078     kernel_time_os = m_send_threads[send_instance].m_kernel_time_os;
2079     elapsed_time_os = m_send_threads[send_instance].m_elapsed_time_os;
2080     NdbMutex_Unlock(m_send_threads[send_instance].send_thread_mutex);
2081   }
2082   void startChangeNeighbourNode()
2083   {
2084     for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2085     {
2086       NdbMutex_Lock(m_send_threads[i].send_thread_mutex);
2087       for (Uint32 j = 0; j < MAX_NEIGHBOURS; j++)
2088       {
2089         m_send_threads[i].m_neighbour_trps[j] = 0;
2090       }
2091       m_send_threads[i].m_num_neighbour_trps = 0;
2092     }
2093     for (Uint32 i = 0; i < MAX_NTRANSPORTERS; i++)
2094     {
2095       m_trp_state[i].m_neighbour_trp = FALSE;
2096     }
2097   }
2098   void setNeighbourNode(NodeId nodeId)
2099   {
2100     NodeId id[MAX_NODE_GROUP_TRANSPORTERS];
2101     Uint32 num_ids;
2102     if (globalData.ndbMtSendThreads == 0)
2103     {
2104       return;
2105     }
2106     globalTransporterRegistry.get_trps_for_node(nodeId,
2107                                                 &id[0],
2108                                                 num_ids,
2109                                                 MAX_NODE_GROUP_TRANSPORTERS);
2110     for (Uint32 index = 0; index < num_ids; index++)
2111     {
2112       Uint32 this_id = id[index];
2113       Uint32 send_instance = get_send_instance(this_id);
2114       m_trp_state[this_id].m_neighbour_trp = TRUE;
2115       for (Uint32 i = 0; i < MAX_NEIGHBOURS; i++)
2116       {
2117         require(m_send_threads[send_instance].m_neighbour_trps[i] != this_id);
2118         if (m_send_threads[send_instance].m_neighbour_trps[i] == 0)
2119         {
2120           DEB_MULTI_TRP(("Neighbour(%u) of node %u is trp %u",
2121                          i,
2122                          nodeId,
2123                          this_id));
2124           assert(m_send_threads[send_instance].m_num_neighbour_trps == i);
2125           m_send_threads[send_instance].m_neighbour_trps[i] = this_id;
2126           m_send_threads[send_instance].m_num_neighbour_trps++;
2127           assert(m_send_threads[send_instance].m_num_neighbour_trps <=
2128                  MAX_NEIGHBOURS);
2129 
2130           break;
2131         }
2132       }
2133     }
2134   }
2135   void endChangeNeighbourNode()
2136   {
2137     /**
2138      * If a transporter was already in the transporter list (we don't think
2139      * this should be possible) it doesn't represent an issue since it will
2140      * simply be handled twice, first from the neighbour list and second
2141      * from the list of transporters.
2142      *
2143      * The opposite behaviour, that a transporter stops being a neighbour
2144      * transporter, should only happen at node failures
2145      * and in that case the transporter should not have any data to send
2146      * and the transporter will be cleared before the node is allowed to
2147      * restart again.
2148      */
2149     for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2150     {
2151       m_send_threads[i].m_neighbour_trp_index = 0;
2152       NdbMutex_Unlock(m_send_threads[i].send_thread_mutex);
2153     }
2154   }
2155   void setNodeOverloadStatus(OverloadStatus new_status)
2156   {
2157     /**
2158      * The read of this variable is unsafe, but has no dire consequences
2159      * if it is briefly inconsistent. We use a memory barrier to at least
2160      * speed up the spreading of the variable to all CPUs.
2161      */
2162     m_node_overload_status = new_status;
2163     mb();
2164   }
2165 };
2166 
2167 
2168 /*
2169  * The single instance of the thr_send_threads class. If this variable
2170  * is non-NULL we're using send threads; otherwise (NULL) there
2171  * are no send threads.
2172  */
2173 static char* g_send_threads_mem = NULL;
2174 static thr_send_threads *g_send_threads = NULL;
2175 
2176 extern "C"
2177 void *
2178 mt_send_thread_main(void *thr_arg)
2179 {
2180   struct thr_send_thread_instance *this_send_thread =
2181     (thr_send_thread_instance*)thr_arg;
2182 
2183   Uint32 instance_no = this_send_thread->m_instance_no;
2184   g_send_threads->run_send_thread(instance_no);
2185   return NULL;
2186 }
2187 
2188 thr_send_threads::thr_send_threads()
2189   : m_started_threads(FALSE),
2190     m_node_overload_status((OverloadStatus)LIGHT_LOAD_CONST)
2191 {
2192   struct thr_repository *rep = g_thr_repository;
2193 
2194   for (Uint32 i = 0; i < NDB_ARRAY_SIZE(m_trp_state); i++)
2195   {
2196     m_trp_state[i].m_next = 0;
2197     m_trp_state[i].m_data_available = 0;
2198     m_trp_state[i].m_thr_no_sender = Uint16(NO_OWNER_THREAD);
2199     m_trp_state[i].m_send_overload = FALSE;
2200     m_trp_state[i].m_micros_delayed = 0;
2201     m_trp_state[i].m_neighbour_trp = FALSE;
2202     m_trp_state[i].m_overload_counter = 0;
2203     NdbTick_Invalidate(&m_trp_state[i].m_inserted_time);
2204   }
2205   for (Uint32 i = 0; i < NDB_ARRAY_SIZE(m_send_threads); i++)
2206   {
2207     m_send_threads[i].m_more_trps = false;
2208     m_send_threads[i].m_first_trp = 0;
2209     m_send_threads[i].m_last_trp = 0;
2210     m_send_threads[i].m_next_is_high_prio_trp = false;
2211     m_send_threads[i].m_num_neighbour_trps = 0;
2212     m_send_threads[i].m_neighbour_trp_index = 0;
2213     for (Uint32 j = 0; j < MAX_NEIGHBOURS; j++)
2214     {
2215       m_send_threads[i].m_neighbour_trps[j] = 0;
2216     }
2217     m_send_threads[i].m_waiter_struct.init();
2218     m_send_threads[i].m_instance_no = i;
2219     m_send_threads[i].m_send_buffer_pool.set_pool(&rep->m_sb_pool);
2220     m_send_threads[i].send_thread_mutex = NdbMutex_Create();
2221   }
2222   memset(&m_send_thread_instance_by_trp[0],
2223          0xFF,
2224          sizeof(m_send_thread_instance_by_trp));
2225   m_next_send_thread_instance_by_trp = 0;
2226   m_num_trps = 0;
2227 }
2228 
2229 thr_send_threads::~thr_send_threads()
2230 {
2231   if (!m_started_threads)
2232     return;
2233 
2234   for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2235   {
2236     void *dummy_return_status;
2237 
2238     /* Ensure thread is woken up to die */
2239     wakeup(&(m_send_threads[i].m_waiter_struct));
2240     NdbThread_WaitFor(m_send_threads[i].m_thread, &dummy_return_status);
2241     globalEmulatorData.theConfiguration->removeThread(
2242       m_send_threads[i].m_thread);
2243     NdbThread_Destroy(&(m_send_threads[i].m_thread));
2244   }
2245 }
2246 
2247 /**
2248  * Base transporters are spread equally among the send threads.
2249  * There is no special connection between a thread and a transporter
2250  * to another node. Thus round-robin scheduling is good enough.
2251  */
2252 void
2253 thr_send_threads::assign_trps_to_send_threads()
2254 {
2255   Uint32 num_trps = globalTransporterRegistry.get_num_trps();
2256   m_num_trps = num_trps;
2257   /* Transporter instance 0 isn't used */
2258   m_send_thread_instance_by_trp[0] = Uint16(~0);
2259   Uint32 send_instance = 0;
2260   for (Uint32 i = 1; i <= num_trps; i++)
2261   {
2262     m_send_thread_instance_by_trp[i] = send_instance;
2263     send_instance++;
2264     if (send_instance == globalData.ndbMtSendThreads)
2265     {
2266       send_instance = 0;
2267     }
2268   }
2269   m_next_send_thread_instance_by_trp = 0;
2270 }
2271 
2272 void
2273 mt_assign_multi_trps_to_send_threads()
2274 {
2275   DEB_MULTI_TRP(("mt_assign_multi_trps_to_send_threads()"));
2276   if (g_send_threads)
2277   {
2278     g_send_threads->assign_multi_trps_to_send_threads();
2279   }
2280 }
2281 
2282 /**
2283  * Multi transporters are assigned to send thread instances to mimic
2284  * the assignment of LDM instances to send thread instances. This
2285  * ensures that if an LDM thread sends a message to another LDM
2286  * thread in the same node group the LDM thread will assist with
2287  * the sending of this message. The LDM thread will send to another
2288  * LDM thread mostly in case it is within the same node group and it
2289  * will then send to the same LDM instance in that node.
2290  *
2291  * Ideally the number of LDM threads should be a multiple of the number
2292  * of send threads to get the best assignment of transporters to send
2293  * threads.
2294  */
2295 void
2296 thr_send_threads::assign_multi_trps_to_send_threads()
2297 {
2298   DEB_MULTI_TRP(("assign_multi_trps_to_send_threads()"));
2299   Uint32 new_num_trps = globalTransporterRegistry.get_num_trps();
2300   Uint32 send_instance = m_next_send_thread_instance_by_trp;
2301   DEB_MULTI_TRP(("assign_multi_trps_to_send_threads(): new_num_trps = %u",
2302                  new_num_trps));
2303   for (Uint32 i = m_num_trps + 1; i <= new_num_trps; i++)
2304   {
2305     m_send_thread_instance_by_trp[i] = send_instance;
2306     send_instance++;
2307     if (send_instance == globalData.ndbMtSendThreads)
2308     {
2309       send_instance = 0;
2310     }
2311   }
2312   m_num_trps = new_num_trps;
2313   m_next_send_thread_instance_by_trp = send_instance;
2314 }
2315 
2316 void
2317 thr_send_threads::assign_threads_to_assist_send_threads()
2318 {
2319   /**
2320    * Assign the block thread (ldm, tc, rep and main) to assist a certain send
2321    * thread instance. This means that assistance will only be provided to a
2322    * subset of the transporters from this block thread. The actual send
2323    * threads can also assist other send threads to avoid having to wake up
2324    * all send threads all the time.
2325    *
2326    * If we have configured the block thread to not provide any send thread
2327    * assistance we will not assign any send thread to it. Similarly, receive
2328    * threads don't provide send thread assistance. If no send threads
2329    * are configured we use the old method of sending without send threads;
2330    * in this case the sending is done by all block threads and there are
2331    * no send threads around at all.
2332    *
2333    * We perform round robin of LDM threads first and then round robin on the
2334    * non-LDM threads. This ensures that the first LDM thread starts at send
2335    * instance 0 to ensure that we support the transporters used for
2336    * communication to the same LDM in the same node group. This is not
2337    * guaranteed for all configurations, but we strive for this configuration
2338    * to ensure that the LDM thread will quickly send its own messages within
2339    * the node group. Messages to other nodes will be picked up by another
2340    * send thread. With only one send thread the LDM threads will support all
2341    * transporters. Multiple send threads is mainly intended for larger
2342    * configurations.
2343    */
2344   THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
2345   struct thr_repository* rep = g_thr_repository;
2346   unsigned int thr_no;
2347   unsigned next_send_instance = 0;
2348   for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
2349   {
2350     thr_data *selfptr = &rep->m_thread[thr_no];
2351     selfptr->m_nosend = conf.do_get_nosend(selfptr->m_instance_list,
2352                                            selfptr->m_instance_count);
2353     if (is_recv_thread(thr_no) || selfptr->m_nosend == 1)
2354     {
2355       selfptr->m_send_instance_no = 0;
2356       selfptr->m_send_instance = NULL;
2357       selfptr->m_nosend = 1;
2358     }
2359     else if (is_ldm_thread(thr_no))
2360     {
2361       selfptr->m_send_instance_no = next_send_instance;
2362       selfptr->m_send_instance =
2363         get_send_thread_instance_by_num(next_send_instance);
2364       next_send_instance++;
2365       if (next_send_instance == globalData.ndbMtSendThreads)
2366       {
2367         next_send_instance = 0;
2368       }
2369     }
2370     else
2371     { /* Non-LDM, non-recv threads are assigned in the second loop below */
2372     }
2373   }
2374   for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
2375   {
2376     thr_data *selfptr = &rep->m_thread[thr_no];
2377     if (is_recv_thread(thr_no) ||
2378         selfptr->m_nosend == 1 ||
2379         is_ldm_thread(thr_no))
2380     {
2381       continue;
2382     }
2383     else
2384     {
2385       selfptr->m_send_instance_no = next_send_instance;
2386       selfptr->m_send_instance =
2387         get_send_thread_instance_by_num(next_send_instance);
2388       next_send_instance++;
2389       if (next_send_instance == globalData.ndbMtSendThreads)
2390       {
2391         next_send_instance = 0;
2392       }
2393     }
2394   }
2395 }
2396 
2397 void
2398 thr_send_threads::start_send_threads()
2399 {
2400   for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2401   {
2402     m_send_threads[i].m_thread =
2403       NdbThread_Create(mt_send_thread_main,
2404                        (void **)&m_send_threads[i],
2405                        1024*1024,
2406                        "send thread", //ToDo add number
2407                        NDB_THREAD_PRIO_MEAN);
2408     m_send_threads[i].m_thr_index =
2409       globalEmulatorData.theConfiguration->addThread(
2410         m_send_threads[i].m_thread,
2411         SendThread);
2412   }
2413   m_started_threads = TRUE;
2414 }
2415 
2416 struct thr_send_thread_instance*
2417 thr_send_threads::get_send_thread_instance_by_num(Uint32 instance_no)
2418 {
2419   return &m_send_threads[instance_no];
2420 }
2421 
2422 Uint32
2423 thr_send_threads::get_send_instance(TrpId trp_id)
2424 {
2425   require(trp_id < MAX_NTRANSPORTERS);
2426   Uint32 send_thread_instance = m_send_thread_instance_by_trp[trp_id];
2427   require(send_thread_instance < globalData.ndbMtSendThreads);
2428   return send_thread_instance;
2429 }
2430 
2431 struct thr_send_thread_instance*
2432 thr_send_threads::get_send_thread_instance_by_trp(TrpId trp_id)
2433 {
2434   require(trp_id < MAX_NTRANSPORTERS);
2435   Uint32 send_thread_instance = m_send_thread_instance_by_trp[trp_id];
2436   require(send_thread_instance < globalData.ndbMtSendThreads);
2437   return &m_send_threads[send_thread_instance];
2438 }
2439 
2440 /**
2441  * Called under mutex protection of send_thread_mutex
2442  */
2443 void
2444 thr_send_threads::insert_trp(TrpId trp_id,
2445                              struct thr_send_thread_instance *send_instance)
2446 {
2447   struct thr_send_trps &trp_state = m_trp_state[trp_id];
2448 
2449   send_instance->m_more_trps = true;
2450   /* Ensure the lock free ::data_available see 'm_more_trps == TRUE' */
2451   wmb();
2452 
2453   if (trp_state.m_neighbour_trp)
2454     return;
2455 
2456   Uint32 first_trp = send_instance->m_first_trp;
2457   struct thr_send_trps &last_trp_state =
2458     m_trp_state[send_instance->m_last_trp];
2459   trp_state.m_next = 0;
2460   send_instance->m_last_trp = trp_id;
2461   assert(trp_state.m_data_available > 0);
2462 
2463   if (first_trp == 0)
2464   {
2465     send_instance->m_first_trp = trp_id;
2466   }
2467   else
2468   {
2469     last_trp_state.m_next = trp_id;
2470   }
2471 }
2472 
2473 /**
2474  * Called under mutex protection of send_thread_mutex
2475  * The time is sampled before grabbing the mutex and can thus be a
2476  * bit older than 'now' as seen by other threads.
2477  */
2478 void
2479 thr_send_threads::set_max_delay(TrpId trp_id, NDB_TICKS now, Uint32 delay_usec)
2480 {
2481   struct thr_send_trps &trp_state = m_trp_state[trp_id];
2482   assert(trp_state.m_data_available > 0);
2483   assert(!trp_state.m_send_overload);
2484 
2485   trp_state.m_micros_delayed = delay_usec;
2486   trp_state.m_inserted_time = now;
2487   trp_state.m_overload_counter++;
2488 }
2489 
2490 /**
2491  * Called under mutex protection of send_thread_mutex
2492  * The time is taken before grabbing the mutex, so this timer
2493  * could be older than 'now' in rare cases.
2494  */
2495 void
2496 thr_send_threads::set_overload_delay(TrpId trp_id,
2497                                      NDB_TICKS now,
2498                                      Uint32 delay_usec)
2499 {
2500   struct thr_send_trps &trp_state = m_trp_state[trp_id];
2501   assert(trp_state.m_data_available > 0);
2502   trp_state.m_send_overload = TRUE;
2503   trp_state.m_micros_delayed = delay_usec;
2504   trp_state.m_inserted_time = now;
2505   trp_state.m_overload_counter++;
2506 }
2507 
2508 /**
2509  * Called under mutex protection of send_thread_mutex
2510  * 'now' can be older than what is set in m_inserted_time since
2511  * 'now' is not taken while holding the mutex; we can take the time,
2512  * be scheduled away for a while and return, and in the meantime
2513  * another thread could insert a new event with a newer insert
2514  * time.
2515  *
2516  * The code below ensures that if this type of event happens we
2517  * treat the timer as expired and use the more recent time
2518  * as 'now'.
2519  */
2520 Uint32
2521 thr_send_threads::check_delay_expired(TrpId trp_id, NDB_TICKS now)
2522 {
2523   struct thr_send_trps &trp_state = m_trp_state[trp_id];
2524   assert(trp_state.m_data_available > 0);
2525   Uint64 micros_delayed = Uint64(trp_state.m_micros_delayed);
2526 
2527   if (micros_delayed == 0)
2528     return 0;
2529 
2530   Uint64 micros_passed;
2531   if (now.getUint64() > trp_state.m_inserted_time.getUint64())
2532   {
2533     micros_passed = NdbTick_Elapsed(trp_state.m_inserted_time,
2534                                     now).microSec();
2535   }
2536   else
2537   {
2538     now = trp_state.m_inserted_time;
2539     micros_passed = micros_delayed;
2540   }
2541   if (micros_passed >= micros_delayed) //Expired
2542   {
2543     trp_state.m_inserted_time = now;
2544     trp_state.m_micros_delayed = 0;
2545     trp_state.m_send_overload = FALSE;
2546     return 0;
2547   }
2548 
2549   // Update and return remaining wait time
2550   Uint64 remaining_micros = micros_delayed - micros_passed;
2551   return Uint32(remaining_micros);
2552 }
2553 
2554 /**
2555  * TODO RONM:
2556  * Add some more NDBINFO tables to make it easier to analyse the
2557  * behaviour of the MaxSendDelay parameter.
2558  */
2559 
2560 static Uint64 mt_get_send_buffer_bytes(NodeId id);
2561 
2562 /**
2563  * MAX_SEND_BUFFER_SIZE_TO_DELAY is a heuristic constant that specifies
2564  * a send buffer size that will always be sent. The size of this is based
2565  * on experience that maximum performance of the send part is achieved at
2566  * around 64 kBytes of send buffer size and that the difference between
2567  * 20 kB and 64 kB is small. Thus we avoid unnecessary delays that
2568  * give no significant performance gain (see the sketch below).
2569  */
2570 static const Uint64 MAX_SEND_BUFFER_SIZE_TO_DELAY = (20 * 1024);
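/**
 * Illustrative sketch (not part of the build): the intended use of the
 * threshold above as a guard against delaying sends that are already
 * large enough. The exact condition is a simplified assumption; the
 * actual decision is made in the send scheduling code below.
 *
 *   if (mt_get_send_buffer_bytes(id) >= MAX_SEND_BUFFER_SIZE_TO_DELAY)
 *   {
 *     // Enough data buffered already, send now rather than waiting
 *     // for more payload to accumulate.
 *   }
 */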
2571 
2572 
2573 /**
2574  * Get a trp having data to be sent (the trp id is returned).
2575  *
2576  * Sending could have been delayed; in such cases the trp
2577  * whose delay expires first will be returned. It is then up to
2578  * the caller to either accept this trp, or reinsert it
2579  * such that it can be returned and retried later.
2580  *
2581  * Called under mutex protection of send_thread_mutex
2582  */
2583 #define DELAYED_PREV_NODE_IS_NEIGHBOUR UINT_MAX32
2584 TrpId
2585 thr_send_threads::get_trp(Uint32 instance_no,
2586                           NDB_TICKS now,
2587                           struct thr_send_thread_instance *send_instance)
2588 {
2589   Uint32 next;
2590   TrpId trp_id;
2591   bool retry = false;
2592   Uint32 prev = 0;
2593   Uint32 delayed_trp = 0;
2594   Uint32 delayed_prev_trp = 0;
2595   Uint32 min_wait_usec = UINT_MAX32;
2596   do
2597   {
2598     if (send_instance->m_next_is_high_prio_trp)
2599     {
2600       Uint32 num_neighbour_trps = send_instance->m_num_neighbour_trps;
2601       Uint32 neighbour_trp_index = send_instance->m_neighbour_trp_index;
2602       for (Uint32 i = 0; i < num_neighbour_trps; i++)
2603       {
2604         trp_id = send_instance->m_neighbour_trps[neighbour_trp_index];
2605         neighbour_trp_index++;
2606         if (neighbour_trp_index == num_neighbour_trps)
2607           neighbour_trp_index = 0;
2608         send_instance->m_neighbour_trp_index = neighbour_trp_index;
2609         if (m_trp_state[trp_id].m_data_available > 0 &&
2610             m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD)
2611         {
2612           const Uint32 send_delay = check_delay_expired(trp_id, now);
2613           if (likely(send_delay == 0))
2614           {
2615             /**
2616              * Found a neighbour trp to return. Handle this and ensure that
2617              * next call to get_trp will start looking for non-neighbour
2618              * trps.
2619              */
2620             send_instance->m_next_is_high_prio_trp = false;
2621             goto found_neighbour;
2622           }
2623 
2624           /**
2625            * Found a neighbour trp with delay, record the delay
2626            * and the trp and set indicator that delayed trp is
2627            * a neighbour.
2628            */
2629           if (send_delay < min_wait_usec)
2630           {
2631             min_wait_usec = send_delay;
2632             delayed_trp = trp_id;
2633             delayed_prev_trp = DELAYED_PREV_NODE_IS_NEIGHBOUR;
2634           }
2635         }
2636       }
2637       if (retry)
2638       {
2639         /**
2640          * We have already searched the non-neighbour trps and we
2641          * have now searched the neighbour trps and found no trps
2642          * ready to start sending to, we might still have a delayed
2643          * trp, this will be checked before exiting.
2644          */
2645         goto found_no_ready_trps;
2646       }
2647 
2648       /**
2649        * We found no ready trps amongst the neighbour trps. We will
2650        * also search the non-neighbours; we do this simply by
2651        * falling through into this part and setting retry to true to
2652        * indicate that we have already searched the neighbour trps.
2653        */
2654       retry = true;
2655     }
2656     else
2657     {
2658       /**
2659        * We might loop one more time and then we need to ensure that
2660        * we don't just come back here. If we report a trp from this
2661        * function this variable will be set again. If we find no trp
2662        * then it really doesn't matter what this variable is set to.
2663        * When trps are available we will always try to be fair and
2664        * return high prio trps as often as non-high prio trps.
2665        */
2666       send_instance->m_next_is_high_prio_trp = true;
2667     }
2668 
2669     trp_id = send_instance->m_first_trp;
2670     if (!trp_id)
2671     {
2672       if (!retry)
2673       {
2674         /**
2675          * We need to check the neighbour trps before we decide that
2676          * there is no trps to send to.
2677          */
2678         retry = true;
2679         continue;
2680       }
2681       /**
2682        * Found no trps ready to be sent to, will still need check of
2683        * delayed trps before exiting.
2684        */
2685       goto found_no_ready_trps;
2686     }
2687 
2688     /**
2689      * Search for a trp ready to be sent to among the non-neighbour trps.
2690      * If none found, remember the one with the smallest delay.
2691      */
2692     prev = 0;
2693     while (trp_id)
2694     {
2695       next = m_trp_state[trp_id].m_next;
2696 
2697       const Uint32 send_delay = check_delay_expired(trp_id, now);
2698       if (likely(send_delay == 0))
2699       {
2700         /**
2701          * We found a non-neighbour trp to return, handle this
2702          * and set the next get_trp to start looking for
2703          * neighbour trps.
2704          */
2705         send_instance->m_next_is_high_prio_trp = true;
2706         goto found_non_neighbour;
2707       }
2708 
2709       /* Find remaining minimum wait: */
2710       if (min_wait_usec > send_delay)
2711       {
2712         min_wait_usec = send_delay;
2713         delayed_trp = trp_id;
2714         delayed_prev_trp = prev;
2715       }
2716 
2717       prev = trp_id;
2718       trp_id = next;
2719     }
2720 
2721     // As 'first_trp != 0', there has to be a 'delayed_trp'
2722     assert(delayed_trp != 0);
2723 
2724     if (!retry)
2725     {
2726       /**
2727        * Before we decide to send to a delayed non-neighbour trp
2728        * we should check if there is a neighbour ready to be sent
2729        * to, or if there is a neighbour with a lower delay that
2730        * can be sent to.
2731        */
2732       retry = true;
2733       continue;
2734     }
2735     /**
2736      * No trps ready to send to, but we only get here when we know
2737      * there is at least a delayed trp, so jump directly to handling
2738      * of returning delayed trps.
2739      */
2740     goto found_delayed_trp;
2741   } while (1);
2742 
2743 found_no_ready_trps:
2744   /**
2745    * We have found no trps ready to be sent to yet, we can still
2746    * have a delayed trp and we don't know from where it comes.
2747    */
2748   if (delayed_trp == 0)
2749   {
2750     /**
2751      * We have found no trps to send to, neither non-delayed nor
2752      * delayed trps. Mark m_more_trps as false to indicate that
2753      * we have no trps to send to for the moment to give the
2754      * send threads a possibility to go to sleep.
2755      */
2756     send_instance->m_more_trps = false;
2757     return 0;
2758   }
2759 
2760   /**
2761    * We have ensured that delayed_trp exists although we have no
2762    * trps ready to be sent to yet. We will fall through to handling
2763    * of finding a delayed trp.
2764    */
2765 
2766 found_delayed_trp:
2767   /**
2768    * We found no trp ready to send to but we did find a delayed trp.
2769    * We don't know if the delayed trp is a neighbour trp or not, we
2770    * check this using delayed_prev_trp which is set to ~0 for
2771    * neighbour trps.
2772    */
2773   assert(delayed_trp != 0);
2774   trp_id = delayed_trp;
2775   if (delayed_prev_trp == DELAYED_PREV_NODE_IS_NEIGHBOUR)
2776   {
2777     /**
2778      * Go to handling of found neighbour as we have decided to return
2779      * this delayed neighbour trp.
2780      */
2781     send_instance->m_next_is_high_prio_trp = false;
2782     goto found_neighbour;
2783   }
2784   else
2785   {
2786     send_instance->m_next_is_high_prio_trp = true;
2787   }
2788 
2789   prev = delayed_prev_trp;
2790   next = m_trp_state[trp_id].m_next;
2791 
2792   /**
2793    * Fall through to found_non_neighbour since we have decided that this
2794    * delayed trp will be returned.
2795    */
2796 
2797 found_non_neighbour:
2798   /**
2799    * We are going to return a non-neighbour trp, either delayed
2800    * or not. We need to remove it from the list of non-neighbour
2801    * trps to send to.
2802    */
2803 
2804   if (likely(trp_id == send_instance->m_first_trp))
2805   {
2806     send_instance->m_first_trp = next;
2807     assert(prev == 0);
2808   }
2809   else
2810   {
2811     assert(prev != 0);
2812     m_trp_state[prev].m_next = next;
2813   }
2814 
2815   if (trp_id == send_instance->m_last_trp)
2816     send_instance->m_last_trp = prev;
2817 
2818   /**
2819    * Fall through for non-neighbour trps to same return handling as
2820    * neighbour trps.
2821    */
2822 
2823 found_neighbour:
2824   /**
2825    * We found a trp to return, we will update the data available,
2826    * we also need to set m_thr_no_sender to indicate which thread
2827    * is owning the right to send to this trp for the moment.
2828    *
2829    * Neighbour trps can go directly here since they are not
2830    * organised in any lists, but we come here also for
2831    * non-neighbour trps.
2832    */
2833   struct thr_send_trps &trp_state = m_trp_state[trp_id];
2834 
2835   assert(trp_state.m_data_available > 0);
2836   assert(trp_state.m_thr_no_sender == NO_OWNER_THREAD);
2837   trp_state.m_next = 0;
2838   trp_state.m_data_available = 1;
2839   return (TrpId)trp_id;
2840 }
2841 
2842 /* Called under mutex protection of send_thread_mutex */
2843 bool
2844 thr_send_threads::check_done_trp(TrpId trp_id)
2845 {
2846   struct thr_send_trps &trp_state = m_trp_state[trp_id];
2847   assert(trp_state.m_data_available > 0);
2848   trp_state.m_data_available--;
2849   return (trp_state.m_data_available == 0);
2850 }
2851 
2852 /* Called under mutex protection of send_thread_mutex */
2853 struct thr_send_thread_instance*
2854 thr_send_threads::get_not_awake_send_thread(TrpId trp_id,
2855                          struct thr_send_thread_instance *send_instance)
2856 {
2857   struct thr_send_thread_instance *used_send_thread;
2858   if (trp_id != 0)
2859   {
2860     Uint32 send_thread = get_send_instance(trp_id);
2861     if (!m_send_threads[send_thread].m_awake)
2862     {
2863       used_send_thread= &m_send_threads[send_thread];
2864       assert(used_send_thread == send_instance);
2865       return used_send_thread;
2866     }
2867   }
2868   if (!send_instance->m_awake)
2869     return send_instance;
2870   return NULL;
2871 }
2872 
2873 /**
2874  * We have assisted our send thread instance, check if it still
2875  * needs to be woken up.
2876  */
2877 void
2878 thr_send_threads::wake_my_send_thread_if_needed(TrpId *trp_id_array,
2879                                                 Uint32 count,
2880                    struct thr_send_thread_instance *my_send_instance)
2881 {
2882   bool mutex_locked = false;
2883   struct thr_send_thread_instance *wake_send_instance = NULL;
2884   for (Uint32 i = 0; i < count; i++)
2885   {
2886     TrpId trp_id = trp_id_array[i];
2887     struct thr_send_thread_instance *send_instance =
2888       get_send_thread_instance_by_trp(trp_id);
2889     if (send_instance != my_send_instance)
2890       continue;
2891     if (!mutex_locked)
2892     {
2893       mutex_locked = true;
2894       NdbMutex_Lock(my_send_instance->send_thread_mutex);
2895     }
2896     struct thr_send_trps& trp_state = m_trp_state[trp_id];
2897     if (trp_state.m_data_available > 0)
2898     {
2899       wake_send_instance = my_send_instance;
2900       break;
2901     }
2902   }
2903   if (mutex_locked)
2904   {
2905     NdbMutex_Unlock(my_send_instance->send_thread_mutex);
2906   }
2907   if (wake_send_instance != NULL)
2908   {
2909     wakeup(&(wake_send_instance->m_waiter_struct));
2910   }
2911 }
2912 
2913 /**
2914  * Insert transporter into send thread instance data structures.
2915  * Wake send thread unless it is the one which we handle ourselves.
2916  * If we handle it ourselves we will check, after assisting the
2917  * send thread, whether it still needs to be woken up. This
2918  * ensures that running with 1 send thread avoids waking up the
2919  * send thread when not required to do so. With many send threads
2920  * we will avoid a small portion of wakeup calls through this
2921  * handling.
2922  *
2923  * If we don't do any send thread assistance the instance is simply
2924  * NULL here and we will wake all required send threads.
2925  */
2926 Uint32
2927 thr_send_threads::alert_send_thread(TrpId trp_id,
2928                                     NDB_TICKS now,
2929                    struct thr_send_thread_instance *my_send_instance)
2930 {
2931   struct thr_send_thread_instance *send_instance =
2932     get_send_thread_instance_by_trp(trp_id);
2933   struct thr_send_trps& trp_state = m_trp_state[trp_id];
2934 
2935   NdbMutex_Lock(send_instance->send_thread_mutex);
2936   trp_state.m_data_available++;  // There is more to send
2937   if (trp_state.m_data_available > 1)
2938   {
2939     /**
2940      * ACTIVE(_P) -> ACTIVE_P
2941      *
2942      * The trp is already flagged that it has data needing to be sent.
2943      * There is no need to wake even more threads up in this case
2944      * since we piggyback on someone else's request.
2945      *
2946      * Waking another thread for sending to this trp would only have
2947      * resulted in contention and blockage on the send_lock.
2948      *
2949      * We are safe that the buffers we have flushed will be read by a send
2950      * thread: They will either be piggybacked when the send thread
2951      * 'get_trp()' for sending, or data will be available when
2952      * send thread 'check_done_trp()', finds that more data has
2953      * become available. In the latter case, the send thread will schedule
2954      * the trp for another round with insert_trp()
2955      */
2956     NdbMutex_Unlock(send_instance->send_thread_mutex);
2957     return 0;
2958   }
2959   assert(!trp_state.m_send_overload);      // Caught above as ACTIVE
2960   assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
2961   insert_trp(trp_id, send_instance);       // IDLE -> PENDING
2962 
2963   /**
2964    * We need to delay sending the data, as set in config.
2965    * This is the first send to this trp, so we start the
2966    * delay timer now.
2967    */
2968   if (max_send_delay > 0)                   // Wait for more payload?
2969   {
2970     set_max_delay(trp_id, now, max_send_delay);
2971   }
2972 
2973   if (send_instance == my_send_instance)
2974   {
2975     NdbMutex_Unlock(send_instance->send_thread_mutex);
2976     return 1;
2977   }
2978 
2979   /*
2980    * Check if the send thread especially responsible for this transporter
2981    * is awake, if not wake it up.
2982    */
2983   struct thr_send_thread_instance *avail_send_thread
2984     = get_not_awake_send_thread(trp_id, send_instance);
2985 
2986   NdbMutex_Unlock(send_instance->send_thread_mutex);
2987 
2988   if (avail_send_thread)
2989   {
2990     /*
2991      * Wake the assigned sleeping send thread, potentially a spurious wakeup,
2992      * but this is not a problem; what is important is that at least one
2993      * send thread is awoken to handle our request. If someone is already
2994      * awake and takes care of our request before we get to wake someone up
2995      * it's not a problem.
2996      */
2997     wakeup(&(avail_send_thread->m_waiter_struct));
2998   }
2999   return 1;
3000 }
3001 
3002 static bool
3003 check_available_send_data(struct thr_send_thread_instance *send_instance)
3004 {
3005   return !send_instance->data_available();
3006 }
3007 
3008 //static
3009 int
3010 thr_send_threads::trylock_send_trp(TrpId trp_id)
3011 {
3012   thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
3013   return trylock(&sb->m_send_lock);
3014 }
3015 
3016 //static
3017 bool
3018 thr_send_threads::perform_send(TrpId trp_id, Uint32 thr_no, Uint32& bytes_sent)
3019 {
3020   thr_repository::send_buffer * sb = g_thr_repository->m_send_buffers+trp_id;
3021 
3022   /**
3023    * Set m_send_thread so that our transporter callback can know which thread
3024    * holds the send lock for this remote trp. This is the thr_no of a block
3025    * thread or the thr_no of a send thread.
3026    */
3027   sb->m_send_thread = thr_no;
3028   const bool more = globalTransporterRegistry.performSend(trp_id);
3029   bytes_sent = sb->m_bytes_sent;
3030   sb->m_send_thread = NO_SEND_THREAD;
3031   unlock(&sb->m_send_lock);
3032   return more;
3033 }
3034 
3035 static void
3036 update_send_sched_config(THRConfigApplier & conf,
3037                          unsigned instance_no,
3038                          bool & real_time)
3039 {
3040   real_time = conf.do_get_realtime_send(instance_no);
3041 }
3042 
3043 static void
3044 yield_rt_break(NdbThread *thread,
3045                enum ThreadTypes type,
3046                bool real_time)
3047 {
3048   Configuration * conf = globalEmulatorData.theConfiguration;
3049   conf->setRealtimeScheduler(thread,
3050                              type,
3051                              FALSE,
3052                              FALSE);
3053   conf->setRealtimeScheduler(thread,
3054                              type,
3055                              real_time,
3056                              FALSE);
3057 }
3058 
3059 static void
3060 check_real_time_break(NDB_TICKS now,
3061                       NDB_TICKS *yield_time,
3062                       NdbThread *thread,
3063                       enum ThreadTypes type)
3064 {
3065   if (unlikely(NdbTick_Compare(now, *yield_time) < 0))
3066   {
3067     /**
3068      * Timer was adjusted backwards, or the monotonic timer implementation
3069      * on this platform is unstable. The best we can do is to restart
3070      * the RT-yield timers from the new current time.
3071      */
3072     *yield_time = now;
3073   }
3074 
3075   const Uint64 micros_passed =
3076     NdbTick_Elapsed(*yield_time, now).microSec();
3077 
3078   if (micros_passed > 50000)
3079   {
3080     /**
3081      * Lower scheduling prio to time-sharing mode to ensure that
3082      * other threads and processes get a chance to be scheduled
3083      * if we run for an extended time.
3084      */
3085     yield_rt_break(thread, type, TRUE);
3086     *yield_time = now;
3087   }
3088 }
3089 
3090 #define NUM_WAITS_TO_CHECK_SPINTIME 6
3091 static void
3092 wait_time_tracking(thr_data *selfptr, Uint64 wait_time_in_us)
3093 {
3094   for (Uint32 i = 0; i < NUM_SPIN_INTERVALS; i++)
3095   {
3096     if (wait_time_in_us <= selfptr->m_spin_stat.m_spin_interval[i])
3097     {
3098       selfptr->m_spin_stat.m_micros_sleep_times[i]++;
3099       selfptr->m_spin_stat.m_num_waits++;
3100       if (unlikely(selfptr->m_spintime == 0 &&
3101             selfptr->m_conf_spintime != 0 &&
3102             selfptr->m_spin_stat.m_num_waits == NUM_WAITS_TO_CHECK_SPINTIME))
3103       {
3104         /**
3105          * React quickly to changes in the environment: if we don't have
3106          * spinning activated and have already seen NUM_WAITS_TO_CHECK_SPINTIME
3107          * wait times, there is a good chance that spinning is a good idea now.
3108          * So invoke a check of whether we should activate spinning now.
3109          */
3110         SimulatedBlock *b = globalData.getBlock(THRMAN, selfptr->m_thr_no + 1);
3111         ((Thrman*)b)->check_spintime(false);
3112       }
3113       return;
3114     }
3115   }
3116   require(false);
3117 }
3118 
3119 static bool check_queues_empty(thr_data *selfptr);
3120 static Uint32 scan_time_queues(struct thr_data* selfptr, NDB_TICKS now);
3121 static bool do_send(struct thr_data* selfptr,
3122                     bool must_send,
3123                     bool assist_send);
3124 /**
3125  * We call this function only after executing no jobs and thus it is
3126  * safe to spin for a short time.
3127  */
3128 static bool
3129 check_yield(thr_data *selfptr,
3130             Uint64 min_spin_timer, //microseconds
3131             Uint32 *spin_time_in_us,
3132             NDB_TICKS start_spin_ticks)
3133 {
3134   NDB_TICKS now;
3135   bool cont_flag = true;
3136   do
3137   {
3138     for (Uint32 i = 0; i < 50; i++)
3139     {
3140       /**
3141        * For around 50 us we only check whether the JBA and JBB
3142        * queues have become non-empty. This happens when another thread or
3143        * the receive thread sends a signal to this thread.
3144        */
3145       NdbSpin();
3146       if (!check_queues_empty(selfptr))
3147       {
3148         /* Found jobs to execute, successful spin */
3149         cont_flag = false;
3150         now = NdbTick_getCurrentTicks();
3151         break;
3152       }
3153       /* Check if we have done enough spinning once per 3 us */
3154       if ((i & 3) == 3)
3155         continue;
3156       now = NdbTick_getCurrentTicks();
3157       Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3158       if (spin_micros > min_spin_timer)
3159       {
3160         /**
3161          * We have spun for the required time, but to no avail, there was no
3162          * work to do, so it is now time to yield and go to sleep.
3163          */
3164         *spin_time_in_us = spin_micros;
3165         selfptr->m_curr_ticks = now;
3166         selfptr->m_spin_stat.m_sleep_longer_spin_time++;
3167         selfptr->m_measured_spintime += spin_micros;
3168         return true;
3169       }
3170     }
3171     if (!cont_flag)
3172       break;
3173     /**
3174      * Every 50 us we also scan time queues to see if any delayed signals
3175      * need to be delivered. After checking if this generates any new
3176      * messages we also check if we have completed spinning for this
3177      * time.
3178      */
3179     const Uint32 lagging_timers = scan_time_queues(selfptr, now);
3180     if (lagging_timers != 0 ||
3181         !check_queues_empty(selfptr))
3182     {
3183       /* Found jobs to execute, successful spin */
3184       cont_flag = false;
3185       break;
3186     }
3187   } while (cont_flag);
3188   /**
3189    * Successful spinning, we will record spinning time. We will also record
3190    * the number of micros that this has saved. This is a static number based
3191    * on experience, using measurements from virtual machines where spinning
3192    * saves the time it would take to go to sleep and wake up again. This is
3193    * roughly 25 microseconds.
3194    *
3195    * This is the positive part of spinning where we gained something through
3196    * spinning.
3197    */
3198   Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3199   selfptr->m_curr_ticks = now;
3200   selfptr->m_measured_spintime += spin_micros;
3201   selfptr->m_spin_stat.m_sleep_shorter_spin_time++;
3202   selfptr->m_micros_sleep += spin_micros;
3203   wait_time_tracking(selfptr, spin_micros);
3204   return false;
3205 }
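/**
 * A usage sketch (under #if 0) of check_yield() as seen from a block thread
 * that found nothing to execute. The surrounding scheduler code is elided;
 * the snippet only shows the calling convention and how the return value is
 * meant to be interpreted.
 */
#if 0
static bool
example_should_sleep(thr_data *selfptr, Uint64 min_spin_timer_us)
{
  if (min_spin_timer_us == 0)
    return true;                       /* spinning disabled, sleep directly */
  Uint32 spin_time_in_us = 0;
  const NDB_TICKS start_spin_ticks = NdbTick_getCurrentTicks();
  /* true: spun the full budget without finding work -> yield and sleep */
  return check_yield(selfptr, min_spin_timer_us,
                     &spin_time_in_us, start_spin_ticks);
}
#endif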
3206 
3207 /**
3208  * We call this function only after executing no jobs and thus it is
3209  * safe to spin for a short time.
3210  */
3211 static bool
3212 check_recv_yield(thr_data *selfptr,
3213                  TransporterReceiveHandle & recvdata,
3214                  Uint64 min_spin_timer, //microseconds
3215                  Uint32 & num_events,
3216                  Uint32 *spin_time_in_us,
3217                  NDB_TICKS start_spin_ticks)
3218 {
3219   NDB_TICKS now;
3220   bool cont_flag = true;
3221   do
3222   {
3223     for (Uint32 i = 0; i < 60; i++)
3224     {
3225       /**
3226        * For around 50 us we only check whether the JBA and JBB
3227        * queues have become non-empty. This happens when another thread or
3228        * the receive thread sends a signal to this thread.
3229        */
3230       NdbSpin();
3231       if ((!check_queues_empty(selfptr)) ||
3232           ((num_events =
3233             globalTransporterRegistry.pollReceive(0, recvdata)) > 0))
3234       {
3235         /* Found jobs to execute, successful spin */
3236         cont_flag = false;
3237         now = NdbTick_getCurrentTicks();
3238         break;
3239       }
3240       /* Check if we have done enough spinning once per 3 us */
3241       if ((i & 3) == 3)
3242         continue;
3243       /* Check if we have done enough spinning */
3244       now = NdbTick_getCurrentTicks();
3245       Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3246       if (spin_micros > min_spin_timer)
3247       {
3248         /**
3249          * We have spun for the required time, but to no avail, there was no
3250          * work to do, so it is now time to yield and go to sleep.
3251          */
3252         selfptr->m_measured_spintime += spin_micros;
3253         selfptr->m_spin_stat.m_sleep_longer_spin_time++;
3254         return true;
3255       }
3256     }
3257     if (!cont_flag)
3258       break;
3259     /**
3260      * Every 50 us we also scan time queues to see if any delayed signals
3261      * need to be delivered. After checking if this generates any new
3262      * messages we also check if we have completed spinning for this
3263      * time.
3264      */
3265     const Uint32 lagging_timers = scan_time_queues(selfptr, now);
3266     if (lagging_timers != 0 ||
3267         !check_queues_empty(selfptr))
3268     {
3269       /* Found jobs to execute, successful spin */
3270       cont_flag = false;
3271       break;
3272     }
3273   } while (cont_flag);
3274   /**
3275    * Successful spinning, we will record spinning time. We will also record
3276    * the number of micros that this has saved. This is a static number based
3277    * on experience, using measurements from virtual machines where spinning
3278    * saves the time it would take to go to sleep and wake up again. This is
3279    * roughly 25 microseconds.
3280    *
3281    * This is the positive part of spinning where we gained something through
3282    * spinning.
3283    */
3284   Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3285   selfptr->m_measured_spintime += spin_micros;
3286   selfptr->m_spin_stat.m_sleep_shorter_spin_time++;
3287   selfptr->m_micros_sleep += spin_micros;
3288   wait_time_tracking(selfptr, spin_micros);
3289   return false;
3290 }
3291 
3292 /**
3293  * We enter this function without holding the send_thread_mutex; it is
3294  * acquired inside and we leave no longer holding the mutex.
3295  */
3296 bool
3297 thr_send_threads::assist_send_thread(Uint32 max_num_trps,
3298                                      Uint32 thr_no,
3299                                      NDB_TICKS now,
3300                                      Uint32 &watchdog_counter,
3301                    struct thr_send_thread_instance *send_instance,
3302                    class thread_local_pool<thr_send_page>  & send_buffer_pool)
3303 {
3304   Uint32 num_trps_sent = 0;
3305   Uint32 loop = 0;
3306   NDB_TICKS spin_ticks_dummy;
3307   TrpId trp_id = 0;
3308 
3309   NdbMutex_Lock(send_instance->send_thread_mutex);
3310 
3311   while (globalData.theRestartFlag != perform_stop &&
3312          loop < max_num_trps &&
3313          (trp_id = get_trp(NO_SEND_THREAD, now, send_instance)) != 0)
3314          // PENDING -> ACTIVE
3315   {
3316     if (!handle_send_trp(trp_id,
3317                          num_trps_sent,
3318                          thr_no,
3319                          now,
3320                          watchdog_counter,
3321                          send_instance))
3322     {
3323       /**
3324        * Neighbour trps are locked through setting
3325        * m_trp_state[id].m_thr_no_sender to thr_no while holding
3326        * the mutex. This flag is set between start of send and end
3327        * of send. In this case there was no send so the flag isn't
3328        * set now, since we insert it back immediately it will simply
3329        * remain unset. We assert on this just in case.
3330        *
3331        * Only transporters waiting for delay to expire were waiting to send,
3332        * we will skip sending in this case and leave it for the send
3333        * thread to handle it. No reason to set pending_send to true since
3334        * there is no hurry to send (through setting id = 0 below).
3335        */
3336       assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3337       insert_trp(trp_id, send_instance);
3338       trp_id = 0;
3339       break;
3340     }
3341 
3342     watchdog_counter = 3;
3343     send_buffer_pool.release_global(g_thr_repository->m_mm,
3344                                     RG_TRANSPORTER_BUFFERS,
3345                                     send_instance->m_instance_no);
3346 
3347     loop++;
3348   }
3349   if (trp_id == 0)
3350   {
3351     NdbMutex_Unlock(send_instance->send_thread_mutex);
3352     return false;
3353   }
3354   /**
3355    * There is more work to do, keep pending_send flag to true such
3356    * that we will quickly work off the queue of send tasks available.
3357    */
3358   bool pending_send = send_instance->check_pending_data();
3359   NdbMutex_Unlock(send_instance->send_thread_mutex);
3360   return pending_send;
3361 }
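/**
 * A sketch (under #if 0) of how a worker thread could lend send assistance
 * through assist_send_thread(): bounded by a small number of transporters,
 * with the caller's send buffer pool passed in so released pages go back to
 * the right place. The function and variable names below are illustrative
 * only and not the actual do_send() code.
 */
#if 0
static bool
example_assist_send(thr_send_threads *sends,
                    Uint32 thr_no,
                    thr_send_thread_instance *send_instance,
                    thread_local_pool<thr_send_page> &send_buffer_pool)
{
  Uint32 watchdog = 0;
  const NDB_TICKS now = NdbTick_getCurrentTicks();
  /* Returns true if more send work is still pending after assisting */
  return sends->assist_send_thread(5,       /* max_num_trps, example value */
                                   thr_no,
                                   now,
                                   watchdog,
                                   send_instance,
                                   send_buffer_pool);
}
#endif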
3362 
3363 /**
3364  * We hold the send_thread_mutex of the send_instance when we
3365  * enter this function.
3366  */
3367 bool
3368 thr_send_threads::handle_send_trp(TrpId trp_id,
3369                                   Uint32 & num_trps_sent,
3370                                   Uint32 thr_no,
3371                                   NDB_TICKS & now,
3372                                   Uint32 & watchdog_counter,
3373                          struct thr_send_thread_instance *send_instance)
3374 {
3375   assert(send_instance == get_send_thread_instance_by_trp(trp_id));
3376   assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3377   if (m_trp_state[trp_id].m_micros_delayed > 0)     // Trp send is delayed
3378   {
3379     /**
3380      * The only transporter ready for send was a transporter that still
3381      * required waiting. We will only send if we have enough data to
3382      * send without delay.
3383      */
3384     if (m_trp_state[trp_id].m_send_overload)        // Pause overloaded trp
3385     {
3386       return false;
3387     }
3388 
3389     if (mt_get_send_buffer_bytes(trp_id) >= MAX_SEND_BUFFER_SIZE_TO_DELAY)
3390       set_max_delay(trp_id, now, 0);              // Large packet -> Send now
3391     else                                          // Sleep, let last awake send
3392     {
3393       if (thr_no >= glob_num_threads)
3394       {
3395         /**
3396          * When encountering max_send_delay from send thread we
3397          * will let the send thread go to sleep for as long as
3398          * this trp has to wait (it is the shortest sleep we
3399          * have). For non-send threads the trp will simply
3400          * be reinserted and someone will pick it up later to handle
3401          * things.
3402          *
3403          * At this point in time there are no transporters ready to
3404          * send, they all are waiting for the delay to expire.
3405          */
3406         send_instance->m_more_trps = false;
3407       }
3408       return false;
3409     }
3410   }
3411 
3412   /**
3413    * Multiple send threads can not 'get' the same
3414    * trp simultaneously. Thus, we do not need
3415    * to keep the global send thread mutex any longer.
3416    * Also avoids worker threads blocking on us in
3417    * ::alert_send_thread
3418    */
3419 #ifdef VM_TRACE
3420   my_thread_yield();
3421 #endif
3422   assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3423   m_trp_state[trp_id].m_thr_no_sender = thr_no;
3424   NdbMutex_Unlock(send_instance->send_thread_mutex);
3425 
3426   watchdog_counter = 6;
3427 
3428   /**
3429    * Need a lock on the send buffers to protect against
3430    * worker thread doing ::forceSend, possibly
3431    * disable_send_buffers() and/or lock_/unlock_transporter().
3432    * To avoid a livelock with ::forceSend() on an overloaded
3433    * systems, we 'try-lock', and reinsert the trp for
3434    * later retry if failed.
3435    *
3436    * To ensure that the combination of more == true &&
3437    * bytes_sent == 0 can be used to signal that the
3438    * transporter is overloaded, we initialise bytes_sent to 1 to avoid
3439    * interpreting a try_lock failure as if it was an overloaded
3440    * transporter. This is a fix for BUG#22393612.
3441    */
3442   bool more = true;
3443   Uint32 bytes_sent = 1;
3444 #ifdef VM_TRACE
3445   my_thread_yield();
3446 #endif
3447   if (likely(trylock_send_trp(trp_id) == 0))
3448   {
3449     more = perform_send(trp_id, thr_no, bytes_sent);
3450     /* We return with no locks or mutexes held */
3451   }
3452 
3453   /**
3454    * Note that we do not yet return any send_buffers to the
3455    * global pool: handle_send_trp() may be called from either
3456    * a send-thread, or a worker-thread doing 'assist send'.
3457    * These have different policies for releasing send_buffers,
3458    * which should be handled by the respective callers.
3459    * (release_chunk() or release_global())
3460    *
3461    * Either own perform_send() processing, or external 'alert'
3462    * could have signaled that there are more sends pending.
3463    * If we had no progress in perform_send, we conclude that
3464    * the trp is overloaded, and take a break from further send
3465    * attempts to that trp. A failure of trylock_send_trp
3466    * will also result in 'overload' being concluded.
3467    * (Quite reasonable as the worker thread is likely forceSend'ing)
3468    */
3469   now = NdbTick_getCurrentTicks();
3470 
3471   NdbMutex_Lock(send_instance->send_thread_mutex);
3472 #ifdef VM_TRACE
3473   my_thread_yield();
3474 #endif
3475   assert(m_trp_state[trp_id].m_thr_no_sender == thr_no);
3476   m_trp_state[trp_id].m_thr_no_sender = NO_OWNER_THREAD;
3477   if (more ||                   // ACTIVE   -> PENDING
3478       !check_done_trp(trp_id))  // ACTIVE-P -> PENDING
3479   {
3480     insert_trp(trp_id, send_instance);
3481 
3482     if (unlikely(more && bytes_sent == 0)) //Trp is overloaded
3483     {
3484       set_overload_delay(trp_id, now, 200);//Delay send-retry by 200 us
3485     }
3486   }                            // ACTIVE   -> IDLE
3487   else
3488   {
3489     num_trps_sent++;
3490   }
3491   return true;
3492 }
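/**
 * A condensed sketch (under #if 0) of the overload convention used above:
 * only the combination "more data queued" and "nothing was sent" is treated
 * as overload, triggering the 200 us retry delay. Illustrative only.
 */
#if 0
static bool
example_trp_is_overloaded(bool more, Uint32 bytes_sent)
{
  /*
   * bytes_sent starts at 1, so it can only be 0 if perform_send() actually
   * ran and managed to send nothing while data was still queued.
   */
  return more && bytes_sent == 0;
}
#endif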
3493 
3494 void
3495 thr_send_threads::update_rusage(
3496   struct thr_send_thread_instance *this_send_thread,
3497   Uint64 elapsed_time)
3498 {
3499   struct ndb_rusage rusage;
3500 
3501   int res = Ndb_GetRUsage(&rusage, false);
3502   if (res != 0)
3503   {
3504     this_send_thread->m_user_time_os = 0;
3505     this_send_thread->m_kernel_time_os = 0;
3506     this_send_thread->m_elapsed_time_os = 0;
3507     return;
3508   }
3509   this_send_thread->m_user_time_os = rusage.ru_utime;
3510   this_send_thread->m_kernel_time_os = rusage.ru_stime;
3511   this_send_thread->m_elapsed_time_os = elapsed_time;
3512 }
3513 
3514 /**
3515  * There are some send scheduling algorithms built into the send thread.
3516  * Mainly implemented as part of ::run_send_thread, thus commented here:
3517  *
3518  * We have the possibility to set a 'send delay' for each trp. This
3519  * is used both for handling send overload where we should wait
3520  * before retrying, and as an aid for collecting smaller packets into
3521  * larger, and thus fewer, packets, thereby decreasing the send overhead
3522  * on a highly loaded system.
3523  *
3524  * A delay due to overload is always waited for. As there are already
3525  * queued up send work in the buffers, sending will be possible
3526  * without the send thread actively busy-retrying. However, delays
3527  * in order to increase the packet size can be ignored.
3528  *
3529  * The basic idea of the latter is the following:
3530  * By introducing a delay we ensure that all block threads have
3531  * gotten a chance to execute messages that will generate data
3532  * to be sent to trps. This is particularly helpful in e.g.
3533  * queries that are scanning a table. Here a SCAN_TABREQ is
3534  * received in a TC and this generates a number of SCAN_FRAGREQ
3535  * signals to each LDM, each of those LDMs will in turn generate
3536  * a number of new signals that are all destined to the same
3537  * trp. So this delay here increases the chance that those
3538  * signals can be sent in the same TCP/IP packet over the wire.
3539  *
3540  * Another use case is applications using the asynchronous API
3541  * and thus sending many PK lookups in parallel that all traverse
3542  * the same trp. These can benefit
3543  * greatly from this extra delay, which increases the packet sizes.
3544  *
3545  * There is also a case when sending many updates that need to
3546  * be sent to the other trp in the same node group. By delaying
3547  * the send of this data we ensure that the receiver thread on
3548  * the other end is getting larger packet sizes and thus we
3549  * improve the throughput of the system in all sorts of ways.
3550  *
3551  * However we also try to ensure that we don't delay signals in
3552  * an idle system where response time is more important than
3553  * the throughput. This is achieved by the fact that we will
3554  * send after looping through the trps ready to send to. In
3555  * an idle system this will be a quick operation. In a loaded
3556  * system this delay can be fairly substantial on the other
3557  * hand.
3558  *
3559  * Finally we attempt to limit the use of more than one send
3560  * thread to cases of very high load. So if there are only
3561  * delayed trp sends remaining, we deduce that the
3562  * system is lightly loaded and we will go to sleep if there
3563  * are other send threads also awake.
3564  */
3565 void
3566 thr_send_threads::run_send_thread(Uint32 instance_no)
3567 {
3568   struct thr_send_thread_instance *this_send_thread =
3569     &m_send_threads[instance_no];
3570   const Uint32 thr_no = glob_num_threads + instance_no;
3571 
3572   {
3573     /**
3574      * Wait for thread object to be visible
3575      */
3576     while(this_send_thread->m_thread == 0)
3577       NdbSleep_MilliSleep(30);
3578   }
3579 
3580   {
3581     /**
3582      * Print out information about starting thread
3583      *   (number, tid, name, the CPU it's locked into (if locked at all))
3584      * Also perform the locking to CPU.
3585      */
3586     BaseString tmp;
3587     bool fail = false;
3588     THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
3589     tmp.appfmt("thr: %u ", thr_no);
3590     int tid = NdbThread_GetTid(this_send_thread->m_thread);
3591     if (tid != -1)
3592     {
3593       tmp.appfmt("tid: %u ", tid);
3594     }
3595     conf.appendInfoSendThread(tmp, instance_no);
3596     int res = conf.do_bind_send(this_send_thread->m_thread,
3597                                 instance_no);
3598     if (res < 0)
3599     {
3600       fail = true;
3601       tmp.appfmt("err: %d ", -res);
3602     }
3603     else if (res > 0)
3604     {
3605       tmp.appfmt("OK ");
3606     }
3607 
3608     unsigned thread_prio;
3609     res = conf.do_thread_prio_send(this_send_thread->m_thread,
3610                                    instance_no,
3611                                    thread_prio);
3612     if (res < 0)
3613     {
3614       fail = true;
3615       res = -res;
3616       tmp.appfmt("Failed to set thread prio to %u, ", thread_prio);
3617       if (res == SET_THREAD_PRIO_NOT_SUPPORTED_ERROR)
3618       {
3619         tmp.appfmt("not supported on this OS");
3620       }
3621       else
3622       {
3623         tmp.appfmt("error: %d", res);
3624       }
3625     }
3626     else if (res > 0)
3627     {
3628       tmp.appfmt("Successfully set thread prio to %u ", thread_prio);
3629     }
3630 
3631     printf("%s\n", tmp.c_str());
3632     fflush(stdout);
3633     if (fail)
3634     {
3635       abort();
3636     }
3637   }
3638 
3639   /**
3640    * register watchdog
3641    */
3642   globalEmulatorData.theWatchDog->
3643     registerWatchedThread(&this_send_thread->m_watchdog_counter, thr_no);
3644 
3645   NdbMutex_Lock(this_send_thread->send_thread_mutex);
3646   this_send_thread->m_awake = FALSE;
3647   NdbMutex_Unlock(this_send_thread->send_thread_mutex);
3648 
3649   NDB_TICKS yield_ticks;
3650   bool real_time = false;
3651 
3652   yield_ticks = NdbTick_getCurrentTicks();
3653   THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
3654   update_send_sched_config(conf, instance_no, real_time);
3655 
3656   TrpId trp_id = 0;
3657   Uint64 micros_sleep = 0;
3658   NDB_TICKS last_now = NdbTick_getCurrentTicks();
3659   NDB_TICKS last_rusage = last_now;
3660   NDB_TICKS first_now = last_now;
3661 
3662   while (globalData.theRestartFlag != perform_stop)
3663   {
3664     this_send_thread->m_watchdog_counter = 19;
3665 
3666     NDB_TICKS now = NdbTick_getCurrentTicks();
3667     Uint64 sleep_time = micros_sleep;
3668     Uint64 exec_time = NdbTick_Elapsed(last_now, now).microSec();
3669     Uint64 time_since_update_rusage =
3670       NdbTick_Elapsed(last_rusage, now).microSec();
3671     /**
3672      * At this moment exec_time is elapsed time since last time
3673      * we were here. Now remove the time we spent sleeping to
3674      * get exec_time, thus exec_time + sleep_time will always
3675      * be elapsed time.
3676      */
3677     exec_time -= sleep_time;
3678     last_now = now;
3679     micros_sleep = 0;
3680     if (time_since_update_rusage > Uint64(50 * 1000))
3681     {
3682       Uint64 elapsed_time = NdbTick_Elapsed(first_now, now).microSec();
3683       last_rusage = last_now;
3684       NdbMutex_Lock(this_send_thread->send_thread_mutex);
3685       update_rusage(this_send_thread, elapsed_time);
3686     }
3687     else
3688     {
3689       NdbMutex_Lock(this_send_thread->send_thread_mutex);
3690     }
3691     this_send_thread->m_exec_time += exec_time;
3692     this_send_thread->m_sleep_time += sleep_time;
3693     this_send_thread->m_awake = TRUE;
3694 
3695     /**
3696      * If we waited for a specific transporter, reinsert it such that
3697      * it can be re-evaluated for send by get_trp().
3698      *
3699      * This happens when handle_send_trp returns false because the
3700      * only transporter ready for send was a transporter that was still
3701      * waiting for its delay to expire and no other condition allowed it
3702      * to be sent.
3703      */
3704     if (trp_id != 0)
3705     {
3706       /**
3707        * The trp was locked during our sleep. We now release the
3708        * lock again such that we can acquire the lock again after
3709        * a short sleep. For non-neighbour trps the insert_trp is
3710        * sufficient. For neighbour trps we need to ensure that
3711        * m_trp_state[trp_id].m_thr_no_sender is set to NO_OWNER_THREAD
3712        * since this is the manner in which the lock on those
3713        * trps is released.
3714        */
3715       assert(m_trp_state[trp_id].m_thr_no_sender == thr_no);
3716       m_trp_state[trp_id].m_thr_no_sender = NO_OWNER_THREAD;
3717       insert_trp(trp_id, this_send_thread);
3718       trp_id = 0;
3719     }
3720     while (globalData.theRestartFlag != perform_stop &&
3721            (trp_id = get_trp(instance_no, now, this_send_thread)) != 0)
3722            // PENDING -> ACTIVE
3723     {
3724       Uint32 num_trps_sent_dummy;
3725       if (!handle_send_trp(trp_id,
3726                            num_trps_sent_dummy,
3727                            thr_no,
3728                            now,
3729                            this_send_thread->m_watchdog_counter,
3730                            this_send_thread))
3731       {
3732         /**
3733          * Neighbour trps are not locked by get_trp and insert_trp.
3734          * They are locked by setting
3735          * m_trp_state[trp_id].m_thr_no_sender to thr_no.
3736          * Here we returned false from handle_send_trp since we were
3737          * not allowed to send to trp at this time. We want to keep
3738          * lock on trp as get_trp does for non-neighbour trps, so
3739          * we set this flag to retain lock even after we release mutex.
3740          * We also use asserts to ensure the state transitions are ok.
3741          *
3742          * The transporter is reinserted into the list of transporters
3743          * ready to transmit above in the code, since trp_id != 0 when we
3744          * return after the sleep.
3745          */
3746         assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3747         m_trp_state[trp_id].m_thr_no_sender = thr_no;
3748         break;
3749       }
3750 
3751       /* Release chunk-wise to decrease pressure on lock */
3752       this_send_thread->m_watchdog_counter = 3;
3753       this_send_thread->m_send_buffer_pool.release_chunk(
3754                                      g_thr_repository->m_mm,
3755                                      RG_TRANSPORTER_BUFFERS,
3756                                      instance_no);
3757 
3758       /**
3759        * We set trp_id = 0 for the very rare case where theRestartFlag is set
3760        * to perform_stop; we should never need this, but add it in just in
3761        * case.
3762        */
3763       trp_id = 0;
3764     } // while (get_trp()...)
3765 
3766     /* No more trps having data to send right now, prepare to sleep */
3767     this_send_thread->m_awake = FALSE;
3768     const Uint32 trp_wait = (trp_id != 0) ?
3769       m_trp_state[trp_id].m_micros_delayed : 0;
3770     NdbMutex_Unlock(this_send_thread->send_thread_mutex);
3771 
3772 
3773     if (real_time)
3774     {
3775       check_real_time_break(now,
3776                             &yield_ticks,
3777                             this_send_thread->m_thread,
3778                             SendThread);
3779     }
3780 
3781 
3782     /**
3783      * The send thread is by definition a throughput-supportive thread.
3784      * Thus in situations when the latency is at risk the sending
3785      * is performed by the block threads, so there is no reason
3786      * to perform any spinning in the send thread; we will ignore
3787      * the spin timer for send threads.
3788      */
3789     {
3790       Uint32 max_wait_nsec;
3791       /**
3792        * We sleep a max time, possibly waiting for a specific trp
3793        * with delayed send (overloaded, or waiting for more payload).
3794        * (Will be alerted to start working when more send work arrives)
3795        */
3796       if (trp_wait == 0)
3797       {
3798         //50ms, has to wake up before the 100ms watchdog alert.
3799         max_wait_nsec = 50*1000*1000;
3800       }
3801       else
3802       {
3803         max_wait_nsec = trp_wait * 1000;
3804       }
3805       NDB_TICKS before = NdbTick_getCurrentTicks();
3806       bool waited = yield(&this_send_thread->m_waiter_struct,
3807                           max_wait_nsec,
3808                           check_available_send_data,
3809                           this_send_thread);
3810       if (waited)
3811       {
3812         NDB_TICKS after = NdbTick_getCurrentTicks();
3813         micros_sleep += NdbTick_Elapsed(before, after).microSec();
3814       }
3815     }
3816   }
3817 
3818   globalEmulatorData.theWatchDog->unregisterWatchedThread(thr_no);
3819 }
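/**
 * A decision sketch (under #if 0) of the two kinds of send delay handled by
 * the send thread: an overload delay is always respected, while a delay used
 * only to accumulate a larger packet is overridden as soon as enough bytes
 * are buffered. This mirrors the checks in handle_send_trp(); the function
 * below is illustrative and not used anywhere.
 */
#if 0
static bool
example_may_send_now(bool send_overload,
                     Uint32 micros_delayed,
                     Uint64 buffered_bytes)
{
  if (micros_delayed == 0)
    return true;                 /* no delay registered for this trp */
  if (send_overload)
    return false;                /* overload delay must always expire */
  /* Packet-accumulation delay: send early once the packet is big enough */
  return buffered_bytes >= MAX_SEND_BUFFER_SIZE_TO_DELAY;
}
#endif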
3820 
3821 #if 0
3822 static
3823 Uint32
3824 fifo_used_pages(struct thr_data* selfptr)
3825 {
3826   return calc_fifo_used(selfptr->m_first_unused,
3827                         selfptr->m_first_free,
3828                         THR_FREE_BUF_MAX);
3829 }
3830 #endif
3831 
3832 ATTRIBUTE_NOINLINE
3833 static
3834 void
3835 job_buffer_full(struct thr_data* selfptr)
3836 {
3837   ndbout_c("job buffer full");
3838   dumpJobQueues();
3839   abort();
3840 }
3841 
3842 ATTRIBUTE_NOINLINE
3843 static
3844 void
3845 out_of_job_buffer(struct thr_data* selfptr)
3846 {
3847   ndbout_c("out of job buffer");
3848   dumpJobQueues();
3849   abort();
3850 }
3851 
3852 static
3853 thr_job_buffer*
3854 seize_buffer(struct thr_repository* rep, int thr_no, bool prioa)
3855 {
3856   thr_job_buffer* jb;
3857   struct thr_data* selfptr = &rep->m_thread[thr_no];
3858   Uint32 first_free = selfptr->m_first_free;
3859   Uint32 first_unused = selfptr->m_first_unused;
3860 
3861   /*
3862    * An empty FIFO is denoted by m_first_free == m_first_unused.
3863    * So we will never have a completely full FIFO array, at least one entry will
3864    * always be unused. But the code is simpler as a result.
3865    */
3866 
3867   /*
3868    * We never allow the fifo to become completely empty, as we want to have
3869    * a good number of signals available for trace files in case of a forced
3870    * shutdown.
3871    */
3872   Uint32 buffers = (first_free > first_unused ?
3873                     first_unused + THR_FREE_BUF_MAX - first_free :
3874                     first_unused - first_free);
3875   if (unlikely(buffers <= THR_FREE_BUF_MIN))
3876   {
3877     /*
3878      * All used, allocate another batch from global pool.
3879      *
3880      * Put the new buffers at the head of the fifo, so as not to needlessly
3881      * push out any existing buffers from the fifo (that would lose useful
3882      * data for signal dumps in trace files).
3883      */
3884     Uint32 cnt = 0;
3885     Uint32 batch = THR_FREE_BUF_MAX / THR_FREE_BUF_BATCH;
3886     assert(batch > 0);
3887     assert(batch + THR_FREE_BUF_MIN < THR_FREE_BUF_MAX);
3888     do {
3889       jb = rep->m_jb_pool.seize(rep->m_mm,
3890                                 RG_JOBBUFFER);
3891       if (unlikely(jb == 0))
3892       {
3893         if (unlikely(cnt == 0))
3894         {
3895           out_of_job_buffer(selfptr);
3896         }
3897         break;
3898       }
3899       jb->m_len = 0;
3900       jb->m_prioa = false;
3901       first_free = (first_free ? first_free : THR_FREE_BUF_MAX) - 1;
3902       selfptr->m_free_fifo[first_free] = jb;
3903       cnt++;
3904     } while (cnt < batch);
3905     selfptr->m_first_free = first_free;
3906   }
3907 
3908   jb= selfptr->m_free_fifo[first_free];
3909   selfptr->m_first_free = (first_free + 1) % THR_FREE_BUF_MAX;
3910   /* Init here rather than in release_buffer() so signal dump will work. */
3911   jb->m_len = 0;
3912   jb->m_prioa = prioa;
3913   return jb;
3914 }
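/**
 * A small worked sketch (under #if 0) of the free-FIFO occupancy arithmetic
 * used at the top of seize_buffer(): the FIFO holds entries from first_free
 * (inclusive) up to first_unused (exclusive), modulo THR_FREE_BUF_MAX, and
 * one slot is always left unused so that empty and full can be told apart.
 * The concrete numbers in the comment assume THR_FREE_BUF_MAX == 32, purely
 * for the example.
 */
#if 0
static Uint32
example_fifo_occupancy(Uint32 first_free, Uint32 first_unused)
{
  /* e.g. first_free = 30, first_unused = 2, THR_FREE_BUF_MAX = 32 -> 4 */
  return (first_free > first_unused)
           ? first_unused + THR_FREE_BUF_MAX - first_free
           : first_unused - first_free;
}
#endif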
3915 
3916 static
3917 void
3918 release_buffer(struct thr_repository* rep, int thr_no, thr_job_buffer* jb)
3919 {
3920   struct thr_data* selfptr = &rep->m_thread[thr_no];
3921   Uint32 first_free = selfptr->m_first_free;
3922   Uint32 first_unused = selfptr->m_first_unused;
3923 
3924   /*
3925    * Pack near-empty signals, to get more info in the signal traces.
3926    *
3927    * This is not currently used, as we only release full job buffers, hence
3928    * the #if 0.
3929    */
3930 #if 0
3931   Uint32 last_free = (first_unused ? first_unused : THR_FREE_BUF_MAX) - 1;
3932   thr_job_buffer *last_jb = selfptr->m_free_fifo[last_free];
3933   Uint32 len1, len2;
3934 
3935   if (!jb->m_prioa &&
3936       first_free != first_unused &&
3937       !last_jb->m_prioa &&
3938       (len2 = jb->m_len) <= (thr_job_buffer::SIZE / 4) &&
3939       (len1 = last_jb->m_len) + len2 <= thr_job_buffer::SIZE)
3940   {
3941     /*
3942      * The buffer being release is fairly empty, and what data it contains fit
3943      * in the previously released buffer.
3944      *
3945      * We want to avoid too many almost-empty buffers in the free fifo, as that
3946      * makes signal traces less useful due to too little data available. So in
3947      * this case we move the data from the buffer to be released into the
3948      * previous buffer, and place the to-be-released buffer at the head of the
3949      * fifo (to be immediately reused).
3950      *
3951      * This is only done for prio B buffers, as we must not merge prio A and B
3952      * data (or dumps would be incorrect), and prio A buffers are in any case
3953      * full when released.
3954      */
3955     memcpy(last_jb->m_data + len1, jb->m_data, len2*sizeof(jb->m_data[0]));
3956     last_jb->m_len = len1 + len2;
3957     jb->m_len = 0;
3958     first_free = (first_free ? first_free : THR_FREE_BUF_MAX) - 1;
3959     selfptr->m_free_fifo[first_free] = jb;
3960     selfptr->m_first_free = first_free;
3961   }
3962   else
3963 #endif
3964   {
3965     /* Just insert at the end of the fifo. */
3966     selfptr->m_free_fifo[first_unused] = jb;
3967     first_unused = (first_unused + 1) % THR_FREE_BUF_MAX;
3968     selfptr->m_first_unused = first_unused;
3969   }
3970 
3971   if (unlikely(first_unused == first_free))
3972   {
3973     /* FIFO full, need to release to global pool. */
3974     Uint32 batch = THR_FREE_BUF_MAX / THR_FREE_BUF_BATCH;
3975     assert(batch > 0);
3976     assert(batch < THR_FREE_BUF_MAX);
3977     do {
3978       rep->m_jb_pool.release(rep->m_mm,
3979                              RG_JOBBUFFER,
3980                              selfptr->m_free_fifo[first_free]);
3981       first_free = (first_free + 1) % THR_FREE_BUF_MAX;
3982       batch--;
3983     } while (batch > 0);
3984     selfptr->m_first_free = first_free;
3985   }
3986 }
3987 
3988 static
3989 inline
3990 Uint32
3991 scan_queue(struct thr_data* selfptr, Uint32 cnt, Uint32 end, Uint32* ptr)
3992 {
3993   Uint32 thr_no = selfptr->m_thr_no;
3994   Uint32 **pages = selfptr->m_tq.m_delayed_signals;
3995   Uint32 free = selfptr->m_tq.m_next_free;
3996   Uint32* save = ptr;
3997   for (Uint32 i = 0; i < cnt; i++, ptr++)
3998   {
3999     Uint32 val = * ptr;
4000     if ((val & 0xFFFF) <= end)
4001     {
4002       Uint32 idx = val >> 16;
4003       Uint32 buf = idx >> 8;
4004       Uint32 pos = MAX_SIGNAL_SIZE * (idx & 0xFF);
4005 
4006       Uint32* page = * (pages + buf);
4007 
4008       const SignalHeader *s = reinterpret_cast<SignalHeader*>(page + pos);
4009       const Uint32 *data = page + pos + (sizeof(*s)>>2);
4010       if (0)
4011 	ndbout_c("found %p val: %d end: %d", s, val & 0xFFFF, end);
4012       /*
4013        * ToDo: Do measurements of the frequency of these prio A timed signals.
4014        *
4015        * If they are frequent, we may want to optimize, as sending one prio A
4016        * signal is somewhat expensive compared to sending one prio B.
4017        */
4018       sendprioa(thr_no, s, data,
4019                 data + s->theLength);
4020       * (page + pos) = free;
4021       free = idx;
4022     }
4023     else if (i > 0)
4024     {
4025       selfptr->m_tq.m_next_free = free;
4026       memmove(save, ptr, 4 * (cnt - i));
4027       return i;
4028     }
4029     else
4030     {
4031       return 0;
4032     }
4033   }
4034   selfptr->m_tq.m_next_free = free;
4035   return cnt;
4036 }
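/**
 * A sketch (under #if 0) of the packed layout of a time-queue entry as
 * consumed by scan_queue(): the low 16 bits hold the alarm time (a wrapping
 * counter capped at 32767), and the high 16 bits hold the slot index, itself
 * split into a page number and a position within the page. The struct below
 * only illustrates the decode; it is not a type used by the implementation.
 */
#if 0
struct example_tq_entry
{
  Uint32 alarm;   /* entry & 0xFFFF : expiry time in the queue's time unit */
  Uint32 page;    /* (entry >> 16) >> 8 : index into m_delayed_signals[]   */
  Uint32 pos;     /* (entry >> 16) & 0xFF : signal slot within that page   */
};

static example_tq_entry
example_decode_tq_entry(Uint32 entry)
{
  example_tq_entry e;
  e.alarm = entry & 0xFFFF;
  e.page  = (entry >> 16) >> 8;
  e.pos   = (entry >> 16) & 0xFF;
  /* The signal header starts at page[pos * MAX_SIGNAL_SIZE] */
  return e;
}
#endif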
4037 
4038 static
4039 void
4040 handle_time_wrap(struct thr_data* selfptr)
4041 {
4042   Uint32 i;
4043   struct thr_tq * tq = &selfptr->m_tq;
4044   Uint32 cnt0 = tq->m_cnt[0];
4045   Uint32 cnt1 = tq->m_cnt[1];
4046   Uint32 tmp0 = scan_queue(selfptr, cnt0, 32767, tq->m_short_queue);
4047   Uint32 tmp1 = scan_queue(selfptr, cnt1, 32767, tq->m_long_queue);
4048   cnt0 -= tmp0;
4049   cnt1 -= tmp1;
4050   tq->m_cnt[0] = cnt0;
4051   tq->m_cnt[1] = cnt1;
4052   for (i = 0; i<cnt0; i++)
4053   {
4054     assert((tq->m_short_queue[i] & 0xFFFF) > 32767);
4055     tq->m_short_queue[i] -= 32767;
4056   }
4057   for (i = 0; i<cnt1; i++)
4058   {
4059     assert((tq->m_long_queue[i] & 0xFFFF) > 32767);
4060     tq->m_long_queue[i] -= 32767;
4061   }
4062 }
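/**
 * A sketch (under #if 0) of the wrap adjustment performed above: the queue
 * time is a 15-bit counter, so when it would pass 32767 every entry that is
 * not yet due keeps its relative distance by subtracting 32767 from its
 * stored alarm, while anything due at or before 32767 has already been
 * flushed out by scan_queue(). Illustrative only.
 */
#if 0
static Uint32
example_wrap_alarm(Uint32 entry)
{
  const Uint32 alarm = entry & 0xFFFF;
  assert(alarm > 32767);          /* entries <= 32767 were already sent */
  return entry - 32767;           /* only the low 16 bits are affected  */
}
#endif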
4063 
4064 /**
4065  * FUNCTION: scan_time_queues(), scan_time_queues_impl(),
4066  *           scan_time_queues_backtick()
4067  *
4068  * scan_time_queues() Implements the part we want to be inlined
4069  * scan_time_queues() implements the part we want to be inlined
4070  * into the scheduler loops, while *_impl() & *_backtick() are
4071  * the more unlikely parts we don't call unless the timer has
4072  * ticked backward or forward more than 1ms since the last 'scan_time'.
4073  * Check if any delayed signals has expired and should be sent now.
4074  * The time_queues will be checked every time we detect a change
4075  * in current time of >= 1ms. If idle we will sleep for max 10ms
4076  * before rechecking the time_queue.
4077  *
4078  * However, some situations need special attention:
4079  * - Even if we prefer monotonic timers, they are not available, or
4080  *   not implemented in our abstraction layer, on all platforms.
4081  *   A non-monotonic timer may leap when adjusted by the user, both
4082  *   forward and backwards.
4083  * - Early implementations of monotonic timers had bugs where time
4084  *   could jump. Similar problems have been reported for several VMs.
4085  * - There might be CPU contention or system swapping where we might
4086  *   sleep for significantly longer than 10ms, causing long forward
4087  *   leaps in perceived time.
4088  *
4089  * In order to adapt to this non-perfect clock behaviour, the
4090  * scheduler has its own 'm_ticks' which is the current time
4091  * as perceived by the scheduler. On entering this function, 'now'
4092  * is the 'real' current time fetched from NdbTick_getCurrentTime().
4093  * 'selfptr->m_ticks' is the previous tick seen by the scheduler,
4094  * and as such is the timestamp which reflects the current time
4095  * as seen by the timer queues.
4096  *
4097  * Normally only a few milliseconds will elapse between each tick,
4098  * as seen by the diff between 'now' and 'selfptr->m_ticks'.
4099  * However, if there are larger leaps in the current time,
4100  * we break this up into several small (20ms) steps
4101  * by gradually increasing the scheduler's 'm_ticks' time. This ensures
4102  * that delayed signals will arrive in correct relative order,
4103  * and repeated signals (pace signals) are received with
4104  * the expected frequency. However, each individual signal may
4105  * be delayed or arrive too fast. Where exact timing is critical,
4106  * these signals should do their own time calculation by reading
4107  * the clock, instead of trusting that the signal is delivered as
4108  * specified by the 'delay' argument.
4109  *
4110  * If there are leaps larger than 1500ms, we try a hybrid
4111  * solution by moving the 'm_ticks' forward, close to the
4112  * actual current time, then continue as above from that
4113  * point in time. A 'time leap Warning' will also be printed
4114  * in the logs.
4115  */
4116 static
4117 Uint32
4118 scan_time_queues_impl(struct thr_data* selfptr,
4119                       Uint32 diff,
4120                       NDB_TICKS now)
4121 {
4122   NDB_TICKS last = selfptr->m_ticks;
4123   Uint32 step = diff;
4124 
4125   if (unlikely(diff > 20))     // Break up into max 20ms steps
4126   {
4127     if (unlikely(diff > 1500)) // Time leaped more than 1500ms
4128     {
4129       /**
4130        * There was a long leap in the time since last checking
4131        * of the time_queues. The clock could have been adjusted, or we
4132        * are CPU starved. Anyway, we can never make up for the lost
4133        * CPU cycles, so we forget about them and start fresh from
4134        * a point in time 1000ms behind our current time.
4135        */
4136       struct ndb_rusage curr_rusage;
4137       Ndb_GetRUsage(&curr_rusage, false);
4138       if ((curr_rusage.ru_utime == 0 &&
4139            curr_rusage.ru_stime == 0) ||
4140           (selfptr->m_scan_time_queue_rusage.ru_utime == 0 &&
4141            selfptr->m_scan_time_queue_rusage.ru_stime == 0))
4142       {
4143         /**
4144          * get_rusage failed for some reason, print old variant of warning
4145          * message.
4146          */
4147         g_eventLogger->warning("thr: %u: Overslept %u ms, expected ~10ms",
4148                                selfptr->m_thr_no, diff);
4149       }
4150       else
4151       {
4152         Uint32 diff_real =
4153           NdbTick_Elapsed(selfptr->m_scan_real_ticks, now).milliSec();
4154         Uint64 exec_time = curr_rusage.ru_utime -
4155                            selfptr->m_scan_time_queue_rusage.ru_utime;
4156         Uint64 sys_time = curr_rusage.ru_stime -
4157                           selfptr->m_scan_time_queue_rusage.ru_stime;
4158         g_eventLogger->warning("thr: %u Overslept %u ms, expected ~10ms"
4159                                ", user time: %llu us, sys_time: %llu us",
4160                                selfptr->m_thr_no,
4161                                diff_real,
4162                                exec_time,
4163                                sys_time);
4164       }
4165       last = NdbTick_AddMilliseconds(last, diff-1000);
4166     }
4167     step = 20;  // Max expire interval handled is 20ms
4168   }
4169 
4170   struct thr_tq * tq = &selfptr->m_tq;
4171   Uint32 curr = tq->m_current_time;
4172   Uint32 cnt0 = tq->m_cnt[0];
4173   Uint32 cnt1 = tq->m_cnt[1];
4174   Uint32 end = (curr + step);
4175   if (end >= 32767)
4176   {
4177     handle_time_wrap(selfptr);
4178     cnt0 = tq->m_cnt[0];
4179     cnt1 = tq->m_cnt[1];
4180     end -= 32767;
4181   }
4182 
4183   Uint32 tmp0 = scan_queue(selfptr, cnt0, end, tq->m_short_queue);
4184   Uint32 tmp1 = scan_queue(selfptr, cnt1, end, tq->m_long_queue);
4185 
4186   tq->m_current_time = end;
4187   tq->m_cnt[0] = cnt0 - tmp0;
4188   tq->m_cnt[1] = cnt1 - tmp1;
4189   selfptr->m_ticks = NdbTick_AddMilliseconds(last, step);
4190   selfptr->m_scan_real_ticks = now;
4191   Ndb_GetRUsage(&selfptr->m_scan_time_queue_rusage, false);
4192   return (diff - step);
4193 }
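/**
 * A sketch (under #if 0) of how a large forward time leap is consumed by
 * repeated calls to the function above: at most 20ms of queue time is
 * expired per call, and a leap larger than 1500ms first snaps the
 * scheduler's base time to 1000ms behind the real clock. The loop below only
 * illustrates the "diff - step" contract of the return value; the real
 * caller, scan_time_queues(), makes one call per scheduler loop iteration.
 */
#if 0
static void
example_consume_time_leap(struct thr_data *selfptr, NDB_TICKS now)
{
  Uint32 lagging =
    (Uint32)NdbTick_Elapsed(selfptr->m_ticks, now).milliSec();
  while (lagging > 0)
  {
    /* Each call expires at most 20ms of queue time and returns what is left */
    lagging = scan_time_queues_impl(selfptr, lagging, now);
  }
}
#endif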
4194 
4195 /**
4196  * Clock has ticked backwards. We try to handle this
4197  * as best we can.
4198  */
4199 static
4200 void
4201 scan_time_queues_backtick(struct thr_data* selfptr, NDB_TICKS now)
4202 {
4203   const NDB_TICKS last = selfptr->m_ticks;
4204   assert(NdbTick_Compare(now, last) < 0);
4205 
4206   const Uint64 backward = NdbTick_Elapsed(now, last).milliSec();
4207 
4208   /**
4209    * Silently ignore sub millisecond backticks.
4210    * Such 'noise' is unfortunately common, even for monotonic timers.
4211    */
4212   if (backward > 0)
4213   {
4214     g_eventLogger->warning("thr: %u Time ticked backwards %llu ms.",
4215 		           selfptr->m_thr_no, backward);
4216 
4217     /* Long backticks should never happen for monotonic timers */
4218     assert(backward < 100 || !NdbTick_IsMonotonic());
4219 
4220     /* Accept new time as current */
4221     selfptr->m_ticks = now;
4222   }
4223 }
4224 
4225 /**
4226  * If someone sends a signal with bounded delay it means that the signal
4227  * should be executed as soon as we come to scan_time_queues,
4228  * independent of the amount of time spent since it was sent. We
4229  * use a special time queue for bounded delay signals to avoid having
4230  * to scan through all short time queue signals in every loop of
4231  * the run job buffers.
4232  */
4233 static inline
4234 void
4235 scan_zero_queue(struct thr_data* selfptr)
4236 {
4237   struct thr_tq * tq = &selfptr->m_tq;
4238   Uint32 cnt = tq->m_cnt[2];
4239   if (cnt)
4240   {
4241     Uint32 num_found = scan_queue(selfptr,
4242                                   cnt,
4243                                   tq->m_current_time,
4244                                   tq->m_zero_queue);
4245     require(num_found == cnt);
4246   }
4247   tq->m_cnt[2] = 0;
4248 }
4249 
4250 static inline
4251 Uint32
4252 scan_time_queues(struct thr_data* selfptr, NDB_TICKS now)
4253 {
4254   scan_zero_queue(selfptr);
4255   const NDB_TICKS last = selfptr->m_ticks;
4256   if (unlikely(NdbTick_Compare(now, last) < 0))
4257   {
4258     scan_time_queues_backtick(selfptr, now);
4259     return 0;
4260   }
4261 
4262   const Uint32 diff = (Uint32)NdbTick_Elapsed(last, now).milliSec();
4263   if (unlikely(diff > 0))
4264   {
4265     return scan_time_queues_impl(selfptr, diff, now);
4266   }
4267   return 0;
4268 }
4269 
4270 static
4271 inline
4272 Uint32*
4273 get_free_slot(struct thr_repository* rep,
4274 	      struct thr_data* selfptr,
4275 	      Uint32* idxptr)
4276 {
4277   struct thr_tq * tq = &selfptr->m_tq;
4278   Uint32 idx = tq->m_next_free;
4279 retry:
4280 
4281   if (idx != RNIL)
4282   {
4283     Uint32 buf = idx >> 8;
4284     Uint32 pos = idx & 0xFF;
4285     Uint32* page = * (tq->m_delayed_signals + buf);
4286     Uint32* ptr = page + (MAX_SIGNAL_SIZE * pos);
4287     tq->m_next_free = * ptr;
4288     * idxptr = idx;
4289     return ptr;
4290   }
4291 
4292   Uint32 thr_no = selfptr->m_thr_no;
4293   for (Uint32 i = 0; i<thr_tq::PAGES; i++)
4294   {
4295     if (tq->m_delayed_signals[i] == 0)
4296     {
4297       struct thr_job_buffer *jb = seize_buffer(rep, thr_no, false);
4298       Uint32 * page = reinterpret_cast<Uint32*>(jb);
4299       tq->m_delayed_signals[i] = page;
4300       /**
4301        * Init page
4302        */
4303       for (Uint32 j = 0; j < MIN_SIGNALS_PER_PAGE; j ++)
4304       {
4305 	page[j * MAX_SIGNAL_SIZE] = (i << 8) + (j + 1);
4306       }
4307       page[MIN_SIGNALS_PER_PAGE*MAX_SIGNAL_SIZE] = RNIL;
4308       idx = (i << 8);
4309       goto retry;
4310     }
4311   }
4312   abort();
4313   return NULL;
4314 }
4315 
4316 void
4317 senddelay(Uint32 thr_no, const SignalHeader* s, Uint32 delay)
4318 {
4319   struct thr_repository* rep = g_thr_repository;
4320   struct thr_data* selfptr = &rep->m_thread[thr_no];
4321   assert(my_thread_equal(selfptr->m_thr_id, my_thread_self()));
4322   unsigned siglen = (sizeof(*s) >> 2) + s->theLength + s->m_noOfSections;
4323 
4324   Uint32 max;
4325   Uint32 * cntptr;
4326   Uint32 * queueptr;
4327 
4328   Uint32 alarm;
4329   Uint32 nexttimer = selfptr->m_tq.m_next_timer;
4330   if (delay == SimulatedBlock::BOUNDED_DELAY)
4331   {
4332     alarm = selfptr->m_tq.m_current_time;
4333     cntptr = selfptr->m_tq.m_cnt + 2;
4334     queueptr = selfptr->m_tq.m_zero_queue;
4335     max = thr_tq::ZQ_SIZE;
4336   }
4337   else
4338   {
4339     alarm = selfptr->m_tq.m_current_time + delay;
4340     if (delay < 100)
4341     {
4342       cntptr = selfptr->m_tq.m_cnt + 0;
4343       queueptr = selfptr->m_tq.m_short_queue;
4344       max = thr_tq::SQ_SIZE;
4345     }
4346     else
4347     {
4348       cntptr = selfptr->m_tq.m_cnt + 1;
4349       queueptr = selfptr->m_tq.m_long_queue;
4350       max = thr_tq::LQ_SIZE;
4351     }
4352   }
4353 
4354   Uint32 idx;
4355   Uint32* ptr = get_free_slot(rep, selfptr, &idx);
4356   memcpy(ptr, s, 4*siglen);
4357 
4358   if (0)
4359     ndbout_c("now: %d alarm: %d send %s from %s to %s delay: %d idx: %x %p",
4360 	     selfptr->m_tq.m_current_time,
4361 	     alarm,
4362 	     getSignalName(s->theVerId_signalNumber),
4363 	     getBlockName(refToBlock(s->theSendersBlockRef)),
4364 	     getBlockName(s->theReceiversBlockNumber),
4365 	     delay,
4366 	     idx, ptr);
4367 
4368   Uint32 i;
4369   Uint32 cnt = *cntptr;
4370   Uint32 newentry = (idx << 16) | (alarm & 0xFFFF);
4371 
4372   * cntptr = cnt + 1;
4373   selfptr->m_tq.m_next_timer = alarm < nexttimer ? alarm : nexttimer;
4374 
4375   if (cnt == 0 || delay == SimulatedBlock::BOUNDED_DELAY)
4376   {
4377     /* First delayed signal needs no order and bounded delay is FIFO */
4378     queueptr[cnt] = newentry;
4379     return;
4380   }
4381   else if (cnt < max)
4382   {
4383     for (i = 0; i<cnt; i++)
4384     {
4385       Uint32 save = queueptr[i];
4386       if ((save & 0xFFFF) > alarm)
4387       {
4388 	memmove(queueptr+i+1, queueptr+i, 4*(cnt - i));
4389 	queueptr[i] = newentry;
4390 	return;
4391       }
4392     }
4393     assert(i == cnt);
4394     queueptr[i] = newentry;
4395     return;
4396   }
4397   else
4398   {
4399     /* Out of entries in time queue, issue proper error */
4400     if (cntptr == (selfptr->m_tq.m_cnt + 0))
4401     {
4402       /* Error in short time queue */
4403       ERROR_SET(ecError, NDBD_EXIT_TIME_QUEUE_SHORT,
4404                 "Too many in Short Time Queue", "mt.cpp" );
4405     }
4406     else if (cntptr == (selfptr->m_tq.m_cnt + 1))
4407     {
4408       /* Error in long time queue */
4409       ERROR_SET(ecError, NDBD_EXIT_TIME_QUEUE_LONG,
4410                 "Too many in Long Time Queue", "mt.cpp" );
4411     }
4412     else
4413     {
4414       /* Error in zero time queue */
4415       ERROR_SET(ecError, NDBD_EXIT_TIME_QUEUE_ZERO,
4416                 "Too many in Zero Time Queue", "mt.cpp" );
4417     }
4418   }
4419 }
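/**
 * A sketch (under #if 0) of the queue selection performed above: bounded
 * delay signals go to the zero queue (pure FIFO), delays below 100ms go to
 * the short queue and longer delays to the long queue, where the latter two
 * are kept sorted on alarm time by the insertion loop. The enum and function
 * below are illustrative only.
 */
#if 0
enum example_tq_class { EX_TQ_ZERO, EX_TQ_SHORT, EX_TQ_LONG };

static example_tq_class
example_classify_delay(Uint32 delay)
{
  if (delay == SimulatedBlock::BOUNDED_DELAY)
    return EX_TQ_ZERO;           /* executed on next scan_zero_queue()  */
  if (delay < 100)
    return EX_TQ_SHORT;          /* thr_tq::SQ_SIZE entries, kept sorted */
  return EX_TQ_LONG;             /* thr_tq::LQ_SIZE entries, kept sorted */
}
#endif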
4420 
4421 /*
4422  * Flush the write state to the job queue, making any new signals available to
4423  * receiving threads.
4424  *
4425  * Two versions:
4426  *    - The general version flush_write_state_other() which may flush to
4427  *      any thread, and possibly signal any waiters.
4428  *    - The special version flush_write_state_self() which should only be used
4429  *      to flush messages to itself.
4430  *
4431  * Call to these functions are encapsulated through flush_write_state
4432  * which decides which of these functions to call.
4433  */
4434 static inline
4435 void
4436 flush_write_state_self(thr_job_queue_head *q_head, thr_jb_write_state *w)
4437 {
4438   /*
4439    * Can simplify the flush_write_state when writing to myself:
4440    * Simply update write references wo/ mutex, memory barrier and signaling
4441    */
4442   w->m_write_buffer->m_len = w->m_write_pos;
4443   q_head->m_write_index = w->m_write_index;
4444   w->init_pending_signals();
4445 }
4446 
4447 static inline
4448 void
4449 flush_write_state_other(thr_data *dstptr,
4450                         thr_job_queue_head *q_head,
4451                         thr_jb_write_state *w,
4452                         bool prioa_flag)
4453 {
4454   Uint32 pending_signals_saved;
4455   /*
4456    * Two write memory barriers here, as assigning m_len may make signal data
4457    * available to other threads, and assigning m_write_index may make new
4458    * buffers available.
4459    *
4460    * We could optimize this by only doing it as needed, and only doing it
4461    * once before setting all m_len, and once before setting all m_write_index.
4462    *
4463    * But wmb() is a no-op anyway in x86 ...
4464    */
4465   wmb();
4466   w->m_write_buffer->m_len = w->m_write_pos;
4467   wmb();
4468   q_head->m_write_index = w->m_write_index;
4469 
4470   pending_signals_saved = w->get_pending_signals_wakeup();
4471   pending_signals_saved += w->get_pending_signals();
4472 
4473   if (pending_signals_saved >= MAX_SIGNALS_BEFORE_WAKEUP &&
4474       (!prioa_flag))
4475   {
4476     w->init_pending_signals();
4477     wakeup(&(dstptr->m_waiter));
4478   }
4479   else
4480   {
4481     w->clear_pending_signals_and_set_wakeup(pending_signals_saved);
4482   }
4483 }
4484 
4485 /**
4486   This function is used when we need to send a signal immediately
4487   due to the flush limit being reached. We don't know whether the
4488   signal is to ourselves in this case, so we act depending on who
4489   the receiver of the signal is.
4490 */
4491 static inline
4492 void
4493 flush_write_state(const thr_data *selfptr,
4494                   thr_data *dstptr,
4495                   thr_job_queue_head *q_head,
4496                   thr_jb_write_state *w,
4497                   bool prioa_flag)
4498 {
4499   if (dstptr == selfptr)
4500   {
4501     flush_write_state_self(q_head, w);
4502   }
4503   else
4504   {
4505     flush_write_state_other(dstptr, q_head, w, prioa_flag);
4506   }
4507 }
4508 
4509 /**
4510   This function is used when we are called from flush_jbb_write_state
4511   where we know that the receiver should wakeup to receive the signals
4512   we're sending.
4513 */
4514 static inline
4515 void
4516 flush_write_state_other_wakeup(thr_data *dstptr,
4517                                thr_job_queue_head *q_head,
4518                                thr_jb_write_state *w)
4519 {
4520   /*
4521    * We already did a memory barrier before the loop calling this
4522    * function to ensure the buffer is properly seen by receiving
4523    * thread.
4524    */
4525   w->m_write_buffer->m_len = w->m_write_pos;
4526   wmb();
4527   q_head->m_write_index = w->m_write_index;
4528 
4529   w->init_pending_signals();
4530   wakeup(&(dstptr->m_waiter));
4531 }
4532 
4533 static
4534 void
4535 flush_jbb_write_state(thr_data *selfptr)
4536 {
4537   Uint32 thr_count = g_thr_repository->m_thread_count;
4538   Uint32 self = selfptr->m_thr_no;
4539 
4540   thr_jb_write_state *w = selfptr->m_write_states + self;
4541   thr_data *thrptr = g_thr_repository->m_thread;
4542 
4543   /**
4544     We start by flushing to ourselves, this requires no extra memory
4545     barriers and ensures that we can proceed in the loop knowing that
4546     we will only send to remote threads.
4547 
4548     After this we will insert a memory barrier before we start updating
4549     the m_len variable that makes other threads see our signals that
4550     we're sending to them. We need the memory barrier to ensure that the
4551     buffers are seen properly updated by the remote thread when they see
4552     the pointer to them.
4553   */
4554   if (w->has_any_pending_signals())
4555   {
4556     flush_write_state_self(selfptr->m_in_queue_head + self, w);
4557   }
4558   wmb();
4559   w = selfptr->m_write_states;
4560   thr_jb_write_state *w_end = selfptr->m_write_states + thr_count;
4561   for (; w < w_end; thrptr++, w++)
4562   {
4563     if (w->has_any_pending_signals())
4564     {
4565       thr_job_queue_head *q_head = thrptr->m_in_queue_head + self;
4566       flush_write_state_other_wakeup(thrptr, q_head, w);
4567     }
4568   }
4569 }
4570 
4571 /**
4572  * Receive thread will unpack 1024 signals (MAX_RECEIVED_SIGNALS)
4573  * from Transporters before running another check_recv_queue
4574  *
4575  * This function returns true if there is not space to unpack
4576  * this amount of signals, else false.
4577  *
4578  * Also used as callback function from yield() to recheck
4579  * 'full' condition before going to sleep.
4580  */
4581 static bool
4582 check_recv_queue(thr_job_queue_head *q_head)
4583 {
4584   const Uint32 minfree = (1024 + MIN_SIGNALS_PER_PAGE - 1)/MIN_SIGNALS_PER_PAGE;
4585   /**
4586    * NOTE: m_read_index is read wo/ lock (and updated by different thread)
4587    *       but since the different thread can only consume
4588    *       signals this means that the value returned from this
4589    *       function is always conservative (i.e. the real state can be better
4590    *       than the returned value if the read-index has moved but we didn't see it)
4591    */
4592   const unsigned ri = q_head->m_read_index;
4593   const unsigned wi = q_head->m_write_index;
4594   const unsigned busy = (wi >= ri) ? wi - ri : (thr_job_queue::SIZE - ri) + wi;
4595   return (1 + minfree + busy >= thr_job_queue::SIZE);
4596 }
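
/**
 * Illustration of the occupancy arithmetic above (a minimal sketch, not
 * compiled; the SIZE of 32 is an assumption for the example, the real
 * value is thr_job_queue::SIZE):
 *
 *   static unsigned busy_slots(unsigned ri, unsigned wi, unsigned size)
 *   {
 *     // Same wrap-around handling as check_recv_queue()
 *     return (wi >= ri) ? (wi - ri) : (size - ri) + wi;
 *   }
 *
 *   // With size = 32, ri = 30 and wi = 2 the writer has wrapped:
 *   // busy_slots(30, 2, 32) == (32 - 30) + 2 == 4 occupied slots.
 *
 * The queue is then treated as 'full' when the occupied slots plus the
 * reserved headroom (1 + minfree) reach SIZE.
 */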
4597 
4598 /**
4599  * Check if any of the receive queues for the threads being served
4600  * by this receive thread are full.
4601  * If full: Return 'thr_data*' for (one of) the thread(s)
4602  *          which we have to wait for (to consume from its queue).
4603  */
4604 static struct thr_data*
4605 get_congested_recv_queue(struct thr_repository* rep, Uint32 recv_thread_id)
4606 {
4607   const unsigned thr_no = first_receiver_thread_no + recv_thread_id;
4608   thr_data *thrptr = rep->m_thread;
4609 
4610   for (unsigned i = 0; i<glob_num_threads; i++, thrptr++)
4611   {
4612     thr_job_queue_head *q_head = thrptr->m_in_queue_head + thr_no;
4613     if (check_recv_queue(q_head))
4614     {
4615       return thrptr;
4616     }
4617   }
4618   return NULL;
4619 }
4620 
4621 /**
4622  * Compute the number of free buffers in the specified queue.
4623  * The SAFETY margin is subtracted from the available
4624  * 'free', which is then returned.
4625  */
4626 static
4627 Uint32
4628 compute_free_buffers_in_queue(const thr_job_queue_head *q_head)
4629 {
4630   /**
4631    * NOTE: m_read_index is read without lock (and updated by a different
4632    *       thread), but since that thread can only consume signals,
4633    *       the value returned from this function is always
4634    *       conservative (i.e. the real state can be better than the
4635    *       returned value, if the read index has moved but we didn't see it).
4636    */
4637   unsigned ri = q_head->m_read_index;
4638   unsigned wi = q_head->m_write_index;
4639   unsigned free = (wi < ri) ? ri - wi : (thr_job_queue::SIZE + ri) - wi;
4640 
4641   assert(free <= thr_job_queue::SIZE);
4642 
4643   if (free <= (1 + thr_job_queue::SAFETY))
4644     return 0;
4645   else
4646     return free - (1 + thr_job_queue::SAFETY);
4647 }
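
/**
 * Worked example of the free-slot calculation above (illustrative only;
 * the numbers for SIZE and SAFETY are assumptions, the real constants are
 * defined in thr_job_queue):
 *
 *   // Assume SIZE = 32, SAFETY = 10, ri = 5, wi = 20  (no wrap-around)
 *   // free     = (SIZE + ri) - wi     = (32 + 5) - 20 = 17
 *   // returned = free - (1 + SAFETY)  = 17 - 11       = 6
 *
 * If 'free' had been 11 or less, 0 would be returned, i.e. the queue is
 * reported as having no usable room even though a few physical slots remain.
 */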
4648 
4649 static
4650 Uint32
4651 compute_min_free_out_buffers(Uint32 thr_no)
4652 {
4653   Uint32 minfree = thr_job_queue::SIZE;
4654   const struct thr_repository* rep = g_thr_repository;
4655   const struct thr_data *thrptr = rep->m_thread;
4656 
4657   for (unsigned i = 0; i<glob_num_threads; i++, thrptr++)
4658   {
4659     const thr_job_queue_head *q_head = thrptr->m_in_queue_head + thr_no;
4660     unsigned free = compute_free_buffers_in_queue(q_head);
4661 
4662     if (free < minfree)
4663       minfree = free;
4664   }
4665   return minfree;
4666 }
4667 
4668 /**
4669  * Compute the max number of signals that thr_no can execute without
4670  *   risking job-buffer-full
4671  *
4672  *  see also update_sched_config
4673  *
4674  *
4675  * 1) compute free slots in the ring-buffer from self to each thread in system
4676  * 2) pick the smallest value
4677  * 3) compute how many signals this corresponds to
4678  * 4) compute how many signals self can execute if all of them were sent to
4679  *    the thread with the fullest ring-buffer (i.e. the worst case)
4680  *
4681  *   Assumption: each signal may send *at most* 4 signals
4682  *     - this assumption is made the same in ndbd and ndbmtd and is
4683  *       mostly followed by block-code, although not in all places :-(
4684  */
4685 static
4686 Uint32
4687 compute_max_signals_to_execute(Uint32 min_free_buffers)
4688 {
4689   return ((min_free_buffers * MIN_SIGNALS_PER_PAGE) + 3) / 4;
4690 }
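
/**
 * Example of the formula above (a sketch; the value used for
 * MIN_SIGNALS_PER_PAGE is an assumption for illustration only):
 *
 *   // With MIN_SIGNALS_PER_PAGE assumed to be 32 and
 *   // min_free_buffers = 10:
 *   //   ((10 * 32) + 3) / 4 = 323 / 4 = 80 signals
 *
 * i.e. the quota is a quarter of the signals that fit in the free pages,
 * rounded up, matching the "each signal may send at most 4 signals"
 * assumption documented above.
 */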
4691 
4692 static
4693 void
4694 dumpJobQueues(void)
4695 {
4696   BaseString tmp;
4697   const struct thr_repository* rep = g_thr_repository;
4698   for (unsigned from = 0; from<glob_num_threads; from++)
4699   {
4700     for (unsigned to = 0; to<glob_num_threads; to++)
4701     {
4702       const thr_data *thrptr = rep->m_thread + to;
4703       const thr_job_queue_head *q_head = thrptr->m_in_queue_head + from;
4704 
4705       const unsigned used = q_head->used();
4706       if (used > 0)
4707       {
4708         tmp.appfmt(" job buffer %d --> %d, used %d",
4709                    from, to, used);
4710         unsigned free = compute_free_buffers_in_queue(q_head);
4711         if (free <= 0)
4712         {
4713           tmp.appfmt(" FULL!");
4714         }
4715         else if (free <= thr_job_queue::RESERVED)
4716         {
4717           tmp.appfmt(" HIGH LOAD (free:%d)", free);
4718         }
4719         tmp.appfmt("\n");
4720       }
4721     }
4722   }
4723   if (!tmp.empty())
4724   {
4725     ndbout_c("Dumping non-empty job queues:\n%s", tmp.c_str());
4726   }
4727 }
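
/**
 * Example of the kind of output dumpJobQueues() may emit, given the
 * appfmt() format strings above (the numbers are hypothetical):
 *
 *   Dumping non-empty job queues:
 *    job buffer 2 --> 5, used 7 HIGH LOAD (free:3)
 */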
4728 
4729 void
4730 trp_callback::reportSendLen(NodeId nodeId, Uint32 count, Uint64 bytes)
4731 {
4732   SignalT<3> signalT;
4733   Signal &signal = * new (&signalT) Signal(0);
4734   memset(&signal.header, 0, sizeof(signal.header));
4735 
4736   if (g_send_threads)
4737   {
4738     /**
4739      * TODO: Implement this also when using send threads!!
4740      * To handle this we need to be able to send from send
4741      * threads since the m_send_thread below can be a send
4742      * thread. One manner to handle is to keep it in send
4743      * thread data structure and have some block thread
4744      * gather the data every now and then.
4745      */
4746     return;
4747   }
4748 
4749 #ifdef RONM_TODO
4750   signal.header.theLength = 3;
4751   signal.header.theSendersSignalId = 0;
4752   signal.header.theSendersBlockRef = numberToRef(0, globalData.ownId);
4753   signal.theData[0] = NDB_LE_SendBytesStatistic;
4754   signal.theData[1] = nodeId;
4755   signal.theData[2] = (Uint32)(bytes/count);
4756   signal.header.theVerId_signalNumber = GSN_EVENT_REP;
4757   signal.header.theReceiversBlockNumber = CMVMI;
4758   sendlocal(g_thr_repository->m_send_buffers[nodeId].m_send_thread,
4759             &signalT.header, signalT.theData, NULL);
4760 #endif
4761 }
4762 
4763 /**
4764  * To lock during connect/disconnect, we take both the send lock for the trp
4765  * (to protect performSend()), and the global receive lock (to protect
4766  * performReceive()). By having two locks, we avoid contention between the
4767  * common send and receive operations.
4768  *
4769  * We can have contention between connect/disconnect of one transporter and
4770  * receive for the others. But the transporter code should try to keep this
4771  * lock only briefly, i.e. only to set state to DISCONNECTING / socket fd to
4772  * NDB_INVALID_SOCKET, not for the actual close() syscall.
4773  */
4774 void
4775 trp_callback::lock_transporter(NodeId node, TrpId trp_id)
4776 {
4777   (void)node;
4778   Uint32 recv_thread_idx = mt_get_recv_thread_idx(trp_id);
4779   struct thr_repository* rep = g_thr_repository;
4780   /**
4781    * Note: take the send lock _first_, so that we will not hold the receive
4782    * lock while blocking on the send lock.
4783    *
4784    * The reverse case, blocking send lock for one transporter while waiting
4785    * for receive lock, is not a problem, as the transporter being blocked is
4786    * in any case disconnecting/connecting at this point in time, and sends are
4787    * non-waiting (so we will not block sending on other transporters).
4788    */
4789   lock(&rep->m_send_buffers[trp_id].m_send_lock);
4790   lock(&rep->m_receive_lock[recv_thread_idx]);
4791 }
4792 
4793 void
4794 trp_callback::unlock_transporter(NodeId node, TrpId trp_id)
4795 {
4796   (void)node;
4797   Uint32 recv_thread_idx = mt_get_recv_thread_idx(trp_id);
4798   struct thr_repository* rep = g_thr_repository;
4799   unlock(&rep->m_receive_lock[recv_thread_idx]);
4800   unlock(&rep->m_send_buffers[trp_id].m_send_lock);
4801 }
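
/**
 * The lock pairing above is symmetric: lock_transporter() takes the send
 * lock and then the receive lock, and unlock_transporter() releases them
 * in the reverse order. A condensed sketch of the resulting critical
 * section on the connect/disconnect path (illustrative only):
 *
 *   // lock_transporter(node, trp_id);    takes send lock, then receive lock
 *   // ... set state to DISCONNECTING / invalidate the socket fd ...
 *   // unlock_transporter(node, trp_id);  releases receive lock, then send lock
 */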
4802 
4803 void
4804 trp_callback::lock_send_transporter(NodeId node, TrpId trp_id)
4805 {
4806   (void)node;
4807   struct thr_repository* rep = g_thr_repository;
4808   lock(&rep->m_send_buffers[trp_id].m_send_lock);
4809 }
4810 
4811 void
4812 trp_callback::unlock_send_transporter(NodeId node, TrpId trp_id)
4813 {
4814   (void)node;
4815   struct thr_repository* rep = g_thr_repository;
4816   unlock(&rep->m_send_buffers[trp_id].m_send_lock);
4817 }
4818 
4819 int
4820 mt_checkDoJob(Uint32 recv_thread_idx)
4821 {
4822   struct thr_repository* rep = g_thr_repository;
4823 
4824   /**
4825    * Return '1' if we are not allowed to receive more signals
4826    * into the job buffers from this 'recv_thread_idx'.
4827    *
4828    * NOTE:
4829    *   We should not loop-wait for buffers to become available
4830    *   here as we currently hold the receiver-lock. Furthermore
4831    *   waiting too long here could cause the receiver thread to be
4832    *   less responsive wrt. moving incoming (TCP) data from the
4833    *   TCPTransporters into the (local) receiveBuffers.
4834    *   The thread could also oversleep on its other tasks, such as
4835    *   handling open/close of connections, and catching
4836    *   its own shutdown events.
4837    */
4838   return (get_congested_recv_queue(rep, recv_thread_idx) != NULL);
4839 }
4840 
4841 /**
4842  * Collect all send-buffer-pages to be delivered to trp
4843  * from each thread. Link them together and append them to
4844  * the single send_buffer list 'sb->m_buffer'.
4845  *
4846  * The 'sb->m_buffer_lock' has to be held prior to calling
4847  * this function.
4848  *
4849  * Return: Number of bytes in the collected send-buffers.
4850  *
4851  * TODO: This is not completely fair,
4852  *       it would be better to get one entry from each thr_send_queue
4853  *       per thread instead (until empty)
4854  */
4855 static
4856 Uint32
4857 link_thread_send_buffers(thr_repository::send_buffer * sb, Uint32 id)
4858 {
4859   Uint32 ri[MAX_BLOCK_THREADS];
4860   Uint32 wi[MAX_BLOCK_THREADS];
4861   thr_send_queue *src = g_thr_repository->m_thread_send_buffers[id];
4862   for (unsigned thr = 0; thr < glob_num_threads; thr++)
4863   {
4864     ri[thr] = sb->m_read_index[thr];
4865     wi[thr] = src[thr].m_write_index;
4866   }
4867 
4868   Uint64 sentinel[thr_send_page::HEADER_SIZE >> 1];
4869   thr_send_page* sentinel_page = new (&sentinel[0]) thr_send_page;
4870   sentinel_page->m_next = 0;
4871 
4872   struct thr_send_buffer tmp;
4873   tmp.m_first_page = sentinel_page;
4874   tmp.m_last_page = sentinel_page;
4875 
4876   Uint32 bytes = 0;
4877 
4878 #ifdef ERROR_INSERT
4879 
4880 #define MIXOLOGY_MIX_MT_SEND 2
4881 
4882   if (unlikely(globalEmulatorData.theConfiguration->getMixologyLevel() &
4883                MIXOLOGY_MIX_MT_SEND))
4884   {
4885     /**
4886      * DEBUGGING only
4887      * Interleave at the page level from all threads with
4888      * pages to send - intended to help expose signal
4889      * order dependency bugs
4890      * TODO : Avoid having a whole separate implementation
4891      * like this.
4892      */
4893     bool more_pages;
4894 
4895     do
4896     {
4897       src = g_thr_repository->m_thread_send_buffers[id];
4898       more_pages = false;
4899       for (unsigned thr = 0; thr < glob_num_threads; thr++, src++)
4900       {
4901         Uint32 r = ri[thr];
4902         Uint32 w = wi[thr];
4903         if (r != w)
4904         {
4905           rmb();
4906           /* Take one page from this thread's send buffer for this trp */
4907           thr_send_page * p = src->m_buffers[r];
4908           assert(p->m_start == 0);
4909           bytes += p->m_bytes;
4910           tmp.m_last_page->m_next = p;
4911           tmp.m_last_page = p;
4912 
4913           /* Take page out of read_index slot list */
4914           thr_send_page * next = p->m_next;
4915           p->m_next = NULL;
4916           src->m_buffers[r] = next;
4917 
4918           if (next == NULL)
4919           {
4920             /**
4921              * Used up read slot, any more slots available to read
4922              * from this thread?
4923              */
4924             r = (r+1) % thr_send_queue::SIZE;
4925             more_pages |= (r != w);
4926 
4927             /* Update global and local per thread read indices */
4928             sb->m_read_index[thr] = r;
4929             ri[thr] = r;
4930           }
4931           else
4932           {
4933             more_pages |= true;
4934           }
4935         }
4936       }
4937     } while (more_pages);
4938   }
4939   else
4940 
4941 #endif
4942 
4943   {
4944     for (unsigned thr = 0; thr < glob_num_threads; thr++, src++)
4945     {
4946       Uint32 r = ri[thr];
4947       Uint32 w = wi[thr];
4948       if (r != w)
4949       {
4950         rmb();
4951         while (r != w)
4952         {
4953           thr_send_page * p = src->m_buffers[r];
4954           assert(p->m_start == 0);
4955           bytes += p->m_bytes;
4956           tmp.m_last_page->m_next = p;
4957           while (p->m_next != 0)
4958           {
4959             p = p->m_next;
4960             assert(p->m_start == 0);
4961             bytes += p->m_bytes;
4962           }
4963           tmp.m_last_page = p;
4964           assert(tmp.m_last_page != 0); /* Impossible */
4965           r = (r + 1) % thr_send_queue::SIZE;
4966         }
4967         sb->m_read_index[thr] = r;
4968       }
4969     }
4970   }
4971   if (bytes > 0)
4972   {
4973     const Uint64 buffered_size = sb->m_buffered_size;
4974     /**
4975      * Append send buffers collected from threads
4976      * to end of existing m_buffers.
4977      */
4978     if (sb->m_buffer.m_first_page)
4979     {
4980       assert(sb->m_buffer.m_first_page != NULL);
4981       assert(sb->m_buffer.m_last_page != NULL);
4982       sb->m_buffer.m_last_page->m_next = tmp.m_first_page->m_next;
4983       sb->m_buffer.m_last_page = tmp.m_last_page;
4984     }
4985     else
4986     {
4987       assert(sb->m_buffer.m_first_page == NULL);
4988       assert(sb->m_buffer.m_last_page == NULL);
4989       sb->m_buffer.m_first_page = tmp.m_first_page->m_next;
4990       sb->m_buffer.m_last_page = tmp.m_last_page;
4991     }
4992     sb->m_buffered_size = buffered_size + bytes;
4993   }
4994   return bytes;
4995 }
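
/**
 * The sentinel page above is a common linked-list idiom: by starting the
 * temporary list with a dummy head built on the stack, the append loop never
 * needs a "first element" special case, and the collected chain is simply
 * tmp.m_first_page->m_next at the end. A minimal standalone sketch of the
 * same idiom (the Node type is hypothetical, not part of this file):
 *
 *   struct Node { Node *next; };
 *   Node sentinel; sentinel.next = NULL;
 *   Node *tail = &sentinel;
 *   // append: tail->next = n; n->next = NULL; tail = n;
 *   Node *collected = sentinel.next;   // real head, may be NULL
 */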
4996 
4997 /**
4998  * Pack thr_send_pages for a particular send 'buffer' and
4999  * release emptied pages (locally) to 'pool'.
5000  *
5001  * We're using a very simple algorithm that packs two neighbouring
5002  * pages into one page if possible; if not possible we simply
5003  * move on. This guarantees that pages will be filled to at least
5004  * a 50% fill level, which should be sufficient for our needs here.
5005  *
5006  * We call pack_sb_pages() when we fail to send all data to one
5007  * specific trp immediately. This ensures that we won't keep
5008  * pages allocated with lots of free space.
5009  *
5010  * We may also call pack_sb_pages() from get_bytes_to_send_iovec()
5011  * if not all send buffers can be filled into the iovec[], thus
5012  * possibly saving extra send roundtrips.
5013  *
5014  * The send threads will use pack_sb_pages()
5015  * from the bytes_sent function, which is a callback from
5016  * the transporter.
5017  *
5018  * Can only be called with relevant lock held on 'buffer'.
5019  * Return remaining unsent bytes in 'buffer'.
5020  */
5021 static
5022 Uint32
5023 pack_sb_pages(thread_local_pool<thr_send_page>* pool,
5024               struct thr_send_buffer* buffer)
5025 {
5026   assert(buffer->m_first_page != NULL);
5027   assert(buffer->m_last_page != NULL);
5028   assert(buffer->m_last_page->m_next == NULL);
5029 
5030   thr_send_page* curr = buffer->m_first_page;
5031   Uint32 curr_free = curr->max_bytes() - (curr->m_bytes + curr->m_start);
5032   Uint32 bytes = curr->m_bytes;
5033   while (curr->m_next != 0)
5034   {
5035     thr_send_page* next = curr->m_next;
5036     bytes += next->m_bytes;
5037     assert(next->m_start == 0); // only first page should have half sent bytes
5038     if (next->m_bytes <= curr_free)
5039     {
5040       /**
5041        * There is free space in the current page and it is sufficient to
5042        * store the entire next-page. Copy from next page to current page
5043        * and update current page and release next page to local pool.
5044        */
5045       thr_send_page * save = next;
5046       memcpy(curr->m_data + (curr->m_bytes + curr->m_start),
5047              next->m_data,
5048              next->m_bytes);
5049 
5050       curr_free -= next->m_bytes;
5051 
5052       curr->m_bytes += next->m_bytes;
5053       curr->m_next = next->m_next;
5054 
5055       pool->release_local(save);
5056 
5057 #ifdef NDB_BAD_SEND
5058       if ((curr->m_bytes % 40) == 24)
5059       {
5060         /* Oops */
5061         curr->m_data[curr->m_start + 21] = 'F';
5062       }
5063 #endif
5064     }
5065     else
5066     {
5067       /* Not enough free space in current, move to next page */
5068       curr = next;
5069       curr_free = curr->max_bytes() - (curr->m_bytes + curr->m_start);
5070     }
5071   }
5072 
5073   buffer->m_last_page = curr;
5074   assert(bytes > 0);
5075   return bytes;
5076 }
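
/**
 * Worked example of the packing step above (the page capacity is an
 * illustrative number only; the real capacity is thr_send_page::max_bytes()):
 *
 *   // Assume a page capacity of 32 KB.
 *   // curr:  m_start = 0, m_bytes =  4 KB  -> curr_free = 28 KB
 *   // next:  m_start = 0, m_bytes = 20 KB
 *   //
 *   // next->m_bytes (20 KB) <= curr_free (28 KB), so the 20 KB are
 *   // memcpy'd to curr->m_data + 4 KB, curr->m_bytes becomes 24 KB,
 *   // curr->m_next is re-linked past 'next', and 'next' is released
 *   // to the thread-local pool.
 *
 * A following page of, say, 12 KB would no longer fit (curr_free is now
 * 8 KB), so the loop simply advances 'curr' to that page instead.
 */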
5077 
5078 static
5079 void
5080 release_list(thread_local_pool<thr_send_page>* pool,
5081              thr_send_page* head, thr_send_page * tail)
5082 {
5083   while (head != tail)
5084   {
5085     thr_send_page * tmp = head;
5086     head = head->m_next;
5087     pool->release_local(tmp);
5088   }
5089   pool->release_local(tail);
5090 }
5091 
5092 /**
5093  * Get buffered pages ready to be sent by the transporter.
5094  * All pages returned from this function will refer to
5095  * pages in the m_sending buffers.
5096  *
5097  * The 'sb->m_send_lock' has to be held prior to calling
5098  * this function.
5099  *
5100  * Any available 'm_buffer's will be appended to the
5101  * 'm_sending' buffers with appropriate locks taken.
5102  *
5103  * If sending to trp is not enabled, the buffered pages
5104  * are released instead of being returned from this method.
5105  */
5106 Uint32
5107 trp_callback::get_bytes_to_send_iovec(NodeId node,
5108                                       TrpId trp_id,
5109                                       struct iovec *dst,
5110                                       Uint32 max)
5111 {
5112   (void)node;
5113   thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers + trp_id;
5114   sb->m_bytes_sent = 0;
5115 
5116   /**
5117    * Collect any available send pages from the thread queues
5118    * and 'm_buffers'. Append them to the end of m_sending buffers
5119    */
5120   {
5121     lock(&sb->m_buffer_lock);
5122     link_thread_send_buffers(sb, trp_id);
5123 
5124     if (sb->m_buffer.m_first_page != NULL)
5125     {
5126       // If first page is not NULL, the last page also can't be NULL
5127       require(sb->m_buffer.m_last_page != NULL);
5128       if (sb->m_sending.m_first_page == NULL)
5129       {
5130         sb->m_sending = sb->m_buffer;
5131       }
5132       else
5133       {
5134         assert(sb->m_sending.m_last_page != NULL);
5135         sb->m_sending.m_last_page->m_next = sb->m_buffer.m_first_page;
5136         sb->m_sending.m_last_page = sb->m_buffer.m_last_page;
5137       }
5138       sb->m_buffer.m_first_page = NULL;
5139       sb->m_buffer.m_last_page  = NULL;
5140 
5141       sb->m_sending_size += sb->m_buffered_size;
5142       sb->m_buffered_size = 0;
5143     }
5144     unlock(&sb->m_buffer_lock);
5145 
5146     if (sb->m_sending.m_first_page == NULL)
5147       return 0;
5148   }
5149 
5150   /**
5151    * If sending to trp is not enabled, discard the send buffers.
5152    */
5153   if (unlikely(!sb->m_enabled))
5154   {
5155     thread_local_pool<thr_send_page> pool(&g_thr_repository->m_sb_pool, 0);
5156     release_list(&pool, sb->m_sending.m_first_page, sb->m_sending.m_last_page);
5157     pool.release_all(g_thr_repository->m_mm,
5158                      RG_TRANSPORTER_BUFFERS,
5159                      g_send_threads == NULL ?
5160                        0 :
5161                        g_send_threads->get_send_instance(trp_id));
5162 
5163     sb->m_sending.m_first_page = NULL;
5164     sb->m_sending.m_last_page = NULL;
5165     sb->m_sending_size = 0;
5166     return 0;
5167   }
5168 
5169   /**
5170    * Process linked-list and put into iovecs
5171    */
5172 fill_iovec:
5173   Uint32 tot = 0;
5174   Uint32 pos = 0;
5175   thr_send_page * p = sb->m_sending.m_first_page;
5176 
5177 #ifdef NDB_LUMPY_SEND
5178   /* Drip feed transporter a few bytes at a time to send */
5179   do
5180   {
5181     Uint32 offset = 0;
5182     while ((offset < p->m_bytes) && (pos < max))
5183     {
5184       /* 0 -+1-> 1 -+6-> (7)3 -+11-> (18)2 -+10-> 0 */
5185       Uint32 lumpSz = 1;
5186       switch (offset % 4)
5187       {
5188       case 0 : lumpSz = 1; break;
5189       case 1 : lumpSz = 6; break;
5190       case 2 : lumpSz = 10; break;
5191       case 3 : lumpSz = 11; break;
5192       }
5193       const Uint32 remain = p->m_bytes - offset;
5194       lumpSz = (remain < lumpSz)?
5195         remain :
5196         lumpSz;
5197 
5198       dst[pos].iov_base = p->m_data + p->m_start + offset;
5199       dst[pos].iov_len = lumpSz;
5200       pos ++;
5201       offset+= lumpSz;
5202     }
5203     if (pos == max)
5204     {
5205       return pos;
5206     }
5207     assert(offset == p->m_bytes);
5208     p = p->m_next;
5209   } while (p != NULL);
5210 
5211   return pos;
5212 #endif
5213 
5214   do {
5215     dst[pos].iov_len = p->m_bytes;
5216     dst[pos].iov_base = p->m_data + p->m_start;
5217     assert(p->m_start + p->m_bytes <= p->max_bytes());
5218     tot += p->m_bytes;
5219     pos++;
5220     p = p->m_next;
5221     if (p == NULL)
5222       return pos;
5223   } while (pos < max);
5224 
5225   /**
5226    * Possibly pack send-buffers to get better utilization:
5227    * If we were unable to fit all send buffers into the iovec[],
5228    * we pack the send buffers now if they have a low fill degree.
5229    * This could save us another OS send for sending the remainder.
5230    */
5231   if (pos == max && max > 1 &&                    // Exhausted iovec[]
5232       tot < (pos * thr_send_page::max_bytes())/4) // < 25% filled
5233   {
5234     const Uint32 thr_no = sb->m_send_thread;
5235     assert(thr_no != NO_SEND_THREAD);
5236 
5237     if (!is_send_thread(thr_no))
5238     {
5239       thr_data * thrptr = &g_thr_repository->m_thread[thr_no];
5240       pack_sb_pages(&thrptr->m_send_buffer_pool, &sb->m_sending);
5241     }
5242     else
5243     {
5244       pack_sb_pages(g_send_threads->get_send_buffer_pool(thr_no), &sb->m_sending);
5245     }
5246 
5247     /**
5248      * Retry filling iovec[]. As 'pack' will ensure at least 50% fill degree,
5249      * we will not do another 'pack' after the retry.
5250      */
5251     goto fill_iovec;
5252   }
5253   return pos;
5254 }
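
/**
 * Example of the repack heuristic above (the numbers are hypothetical;
 * the real page size comes from thr_send_page::max_bytes()):
 *
 *   // Assume max = 16 iovec entries and a page capacity of 32 KB.
 *   // If all 16 entries were filled (pos == max) but they only carried
 *   // tot = 100 KB in total, then
 *   //   tot < (pos * 32 KB) / 4 = 128 KB
 *   // holds, so the pages are packed (to >= 50% fill) and the iovec[]
 *   // is filled again from the start, now covering more payload per entry.
 */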
5255 
5256 static
5257 Uint32
5258 bytes_sent(thread_local_pool<thr_send_page>* pool,
5259            thr_repository::send_buffer* sb, Uint32 bytes)
5260 {
5261   const Uint64 sending_size = sb->m_sending_size;
5262   assert(bytes && bytes <= sending_size);
5263 
5264   sb->m_bytes_sent = bytes;
5265   sb->m_sending_size = sending_size - bytes;
5266 
5267   Uint32 remain = bytes;
5268   thr_send_page * prev = NULL;
5269   thr_send_page * curr = sb->m_sending.m_first_page;
5270 
5271   /* Some, or all, of 'm_sending' was sent; find the endpoint. */
5272   while (remain && remain >= curr->m_bytes)
5273   {
5274     /**
5275      * Calculate new current page such that we can release the
5276      * pages that have been completed and update the state of
5277      * the new current page
5278      */
5279     remain -= curr->m_bytes;
5280     prev = curr;
5281     curr = curr->m_next;
5282   }
5283 
5284   if (remain)
5285   {
5286     /**
5287      * Not all pages was fully sent and we stopped in the middle of
5288      * a page
5289      *
5290      * Update state of new current page and release any pages
5291      * that have already been sent
5292      */
5293     curr->m_start += remain;
5294     assert(curr->m_bytes > remain);
5295     curr->m_bytes -= remain;
5296     if (prev)
5297     {
5298       release_list(pool, sb->m_sending.m_first_page, prev);
5299     }
5300   }
5301   else
5302   {
5303     /**
5304      * We sent a couple of full pages and the sending stopped at a
5305      * page boundary, so we only need to release the sent pages
5306      * and update the new current page.
5307      */
5308     if (prev)
5309     {
5310       release_list(pool, sb->m_sending.m_first_page, prev);
5311 
5312       if (prev == sb->m_sending.m_last_page)
5313       {
5314         /**
5315          * Everything was released; the sent pages are now in the local pool.
5316          */
5317         sb->m_sending.m_first_page = NULL;
5318         sb->m_sending.m_last_page = NULL;
5319         return 0;
5320       }
5321     }
5322     else
5323     {
5324       assert(sb->m_sending.m_first_page != NULL);
5325       pool->release_local(sb->m_sending.m_first_page);
5326     }
5327   }
5328 
5329   sb->m_sending.m_first_page = curr;
5330 
5331   /**
5332    * Since not all bytes were sent...
5333    * spend the time to try to pack the m_sending pages,
5334    * possibly releasing send-buffer pages.
5335    */
5336   return pack_sb_pages(pool, &sb->m_sending);
5337 }
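
/**
 * Worked example of the partial-send bookkeeping above (the page sizes
 * are illustrative only):
 *
 *   // m_sending holds three pages of 10 KB, 10 KB and 10 KB, and the
 *   // transporter reports bytes = 25 KB sent.
 *   //
 *   // The loop releases the first two pages (20 KB) and stops at the
 *   // third with remain = 5 KB. That page gets m_start += 5 KB and
 *   // m_bytes -= 5 KB, so the next send round starts exactly at the
 *   // first unsent byte. Finally pack_sb_pages() reports the 5 KB
 *   // still left in the buffer.
 */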
5338 
5339 /**
5340  * Register the specified amount of 'bytes' as sent, starting
5341  * from the first available byte in the m_sending buffer.
5342  *
5343  * The 'm_send_lock' has to be held prior to calling
5344  * this function.
5345  */
5346 Uint32
5347 trp_callback::bytes_sent(NodeId node, TrpId trp_id, Uint32 bytes)
5348 {
5349   (void)node;
5350   thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
5351   Uint32 thr_no = sb->m_send_thread;
5352   assert(thr_no != NO_SEND_THREAD);
5353   if (!is_send_thread(thr_no))
5354   {
5355     thr_data * thrptr = &g_thr_repository->m_thread[thr_no];
5356     return ::bytes_sent(&thrptr->m_send_buffer_pool,
5357                         sb,
5358                         bytes);
5359   }
5360   else
5361   {
5362     return ::bytes_sent(g_send_threads->get_send_buffer_pool(thr_no),
5363                         sb,
5364                         bytes);
5365   }
5366 }
5367 
5368 void
5369 trp_callback::enable_send_buffer(NodeId node, TrpId trp_id)
5370 {
5371   (void)node;
5372   thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
5373   lock(&sb->m_send_lock);
5374   assert(sb->m_sending_size == 0);
5375   {
5376     /**
5377      * Collect and discard any signals that were buffered while
5378      * the send buffers were disabled.
5379      */
5380     lock(&sb->m_buffer_lock);
5381     link_thread_send_buffers(sb, trp_id);
5382 
5383     if (sb->m_buffer.m_first_page != NULL)
5384     {
5385       thread_local_pool<thr_send_page> pool(&g_thr_repository->m_sb_pool, 0);
5386       release_list(&pool, sb->m_buffer.m_first_page, sb->m_buffer.m_last_page);
5387       pool.release_all(g_thr_repository->m_mm,
5388                        RG_TRANSPORTER_BUFFERS,
5389                        g_send_threads == NULL ?
5390                          0 :
5391                          g_send_threads->get_send_instance(trp_id));
5392       sb->m_buffer.m_first_page = NULL;
5393       sb->m_buffer.m_last_page = NULL;
5394       sb->m_buffered_size = 0;
5395     }
5396     unlock(&sb->m_buffer_lock);
5397   }
5398   assert(sb->m_enabled == false);
5399   sb->m_enabled = true;
5400   unlock(&sb->m_send_lock);
5401 }
5402 
5403 void
5404 trp_callback::disable_send_buffer(NodeId node, TrpId trp_id)
5405 {
5406   (void)node;
5407   thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
5408   lock(&sb->m_send_lock);
5409   sb->m_enabled = false;
5410 
5411   /**
5412    * Discard buffered signals not yet sent:
5413    * Note that other threads may still continue send-buffering into
5414    * their thread local send buffers until they discover that the
5415    * transporter has disconnected. However, these buffered signals will
5416    * either be discarded when collected by ::get_bytes_to_send_iovec(),
5417    * or any leftovers will be discarded by ::enable_send_buffer().
5418    */
5419   if (sb->m_sending.m_first_page != NULL)
5420   {
5421     thread_local_pool<thr_send_page> pool(&g_thr_repository->m_sb_pool, 0);
5422     release_list(&pool, sb->m_sending.m_first_page, sb->m_sending.m_last_page);
5423     pool.release_all(g_thr_repository->m_mm,
5424                      RG_TRANSPORTER_BUFFERS,
5425                      g_send_threads == NULL ?
5426                        0 :
5427                        g_send_threads->get_send_instance(trp_id));
5428     sb->m_sending.m_first_page = NULL;
5429     sb->m_sending.m_last_page = NULL;
5430     sb->m_sending_size = 0;
5431   }
5432 
5433   unlock(&sb->m_send_lock);
5434 }
5435 
5436 static inline
5437 void
5438 register_pending_send(thr_data *selfptr, Uint32 trp_id)
5439 {
5440   /* Mark that this trp has pending send data. */
5441   if (!selfptr->m_pending_send_mask.get(trp_id))
5442   {
5443     selfptr->m_pending_send_mask.set(trp_id, 1);
5444     Uint32 i = selfptr->m_pending_send_count;
5445     selfptr->m_pending_send_trps[i] = trp_id;
5446     selfptr->m_pending_send_count = i + 1;
5447   }
5448 }
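
/**
 * The mask/array pair above acts as a small "set with insertion order":
 * the bitmask gives O(1) duplicate detection while the array preserves the
 * order in which trps became pending, which do_flush()/do_send() later
 * iterate over. A minimal standalone sketch of the same idiom (the types
 * and the MAX_TRPS bound are hypothetical, not the real Bitmask class):
 *
 *   struct PendingSet {
 *     enum { MAX_TRPS = 256 };        // illustrative bound only
 *     bool   member[MAX_TRPS];        // the "mask": O(1) membership test
 *     Uint32 order[MAX_TRPS];         // trps in insertion order
 *     Uint32 count;
 *     void add(Uint32 id) {
 *       if (!member[id]) { member[id] = true; order[count++] = id; }
 *     }
 *   };
 */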
5449 
5450 /**
5451   Pack send buffers to make memory available to other threads. The signals
5452   sent often use one page per signal, which means that most pages are
5453   largely empty. In some situations this means that we can run out of send
5454   buffers and still have massive amounts of free space.
5455 
5456   We call this from the main loop in the block threads when we fail to
5457   allocate enough send buffers. In addition we call the node local
5458   pack_sb_pages() in several places - see the header comment for that function.
5459 */
5460 static
5461 void
5462 try_pack_send_buffers(thr_data* selfptr)
5463 {
5464   thr_repository* rep = g_thr_repository;
5465   thread_local_pool<thr_send_page>* pool = &selfptr->m_send_buffer_pool;
5466 
5467   for (Uint32 i = 1; i < NDB_ARRAY_SIZE(selfptr->m_send_buffers); i++)
5468   {
5469     if (globalTransporterRegistry.get_transporter(i))
5470     {
5471       thr_repository::send_buffer* sb = rep->m_send_buffers+i;
5472       if (trylock(&sb->m_buffer_lock) != 0)
5473       {
5474         continue; // Continue with next if busy
5475       }
5476 
5477       link_thread_send_buffers(sb, i);
5478       if (sb->m_buffer.m_first_page != NULL)
5479       {
5480         pack_sb_pages(pool, &sb->m_buffer);
5481       }
5482       unlock(&sb->m_buffer_lock);
5483     }
5484   }
5485   /* Release surplus buffers from local pool to global pool */
5486   pool->release_global(g_thr_repository->m_mm,
5487                        RG_TRANSPORTER_BUFFERS,
5488                        selfptr->m_send_instance_no);
5489 }
5490 
5491 
5492 /**
5493  * publish thread-locally prepared send-buffer
5494  */
5495 static
5496 void
5497 flush_send_buffer(thr_data* selfptr, Uint32 trp_id)
5498 {
5499   Uint32 thr_no = selfptr->m_thr_no;
5500   thr_send_buffer * src = selfptr->m_send_buffers + trp_id;
5501   thr_repository* rep = g_thr_repository;
5502 
5503   if (src->m_first_page == 0)
5504   {
5505     return;
5506   }
5507   assert(src->m_last_page != 0);
5508 
5509   thr_send_queue * dst = rep->m_thread_send_buffers[trp_id]+thr_no;
5510   thr_repository::send_buffer* sb = rep->m_send_buffers+trp_id;
5511 
5512   Uint32 wi = dst->m_write_index;
5513   Uint32 next = (wi + 1) % thr_send_queue::SIZE;
5514   Uint32 ri = sb->m_read_index[thr_no];
5515 
5516   /**
5517    * If the thread local ring buffer of send-buffers is full:
5518    * Empty it by transferring the buffers to the global send_buffer list.
5519    */
5520   if (unlikely(next == ri))
5521   {
5522     lock(&sb->m_buffer_lock);
5523     link_thread_send_buffers(sb, trp_id);
5524     unlock(&sb->m_buffer_lock);
5525   }
5526 
5527   dst->m_buffers[wi] = src->m_first_page;
5528   wmb();
5529   dst->m_write_index = next;
5530 
5531   src->m_first_page = 0;
5532   src->m_last_page = 0;
5533 }
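
/**
 * Example of the single-producer ring-buffer hand-off above (the SIZE
 * value is illustrative; the real one is thr_send_queue::SIZE):
 *
 *   // Assume SIZE = 8, wi = 6, ri = 7.
 *   // next = (6 + 1) % 8 = 7 == ri, i.e. the per-thread queue towards
 *   // this trp is full, so the pages already published there are first
 *   // moved to the global sb->m_buffer under m_buffer_lock.
 *   // Only then is dst->m_buffers[6] set and, after the wmb(), the new
 *   // write index 7 made visible to the consumer.
 */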
5534 
5535 /**
5536  * This is used in case the send buffer gets full, to force an emergency send,
5537  * hopefully freeing up some buffer space for the next signal.
5538  */
5539 bool
5540 mt_send_handle::forceSend(NodeId node, TrpId trp_id)
5541 {
5542   (void)node;
5543   struct thr_repository *rep = g_thr_repository;
5544   struct thr_data *selfptr = m_selfptr;
5545   struct thr_repository::send_buffer * sb = rep->m_send_buffers + trp_id;
5546 
5547   {
5548     /**
5549      * NOTE: we don't need a memory barrier after clearing
5550      *       m_force_send here as we unconditionally lock m_send_lock
5551      *       hence there is no way that our data can be "unsent"
5552      */
5553     sb->m_force_send = 0;
5554 
5555     lock(&sb->m_send_lock);
5556     sb->m_send_thread = selfptr->m_thr_no;
5557     bool more = globalTransporterRegistry.performSend(trp_id, false);
5558     sb->m_send_thread = NO_SEND_THREAD;
5559     unlock(&sb->m_send_lock);
5560 
5561     /**
5562      * release buffers prior to maybe looping on sb->m_force_send
5563      */
5564     selfptr->m_send_buffer_pool.release_global(rep->m_mm,
5565                                                RG_TRANSPORTER_BUFFERS,
5566                                                selfptr->m_send_instance_no);
5567     /**
5568      * We need a memory barrier here to prevent race between clearing lock
5569      *   and reading of m_force_send.
5570      *   CPU can reorder the load to before the clear of the lock
5571      */
5572     mb();
5573     if (unlikely(sb->m_force_send) || more)
5574     {
5575       register_pending_send(selfptr, trp_id);
5576     }
5577   }
5578 
5579   return true;
5580 }
5581 
5582 /**
5583  * try sending data
5584  */
5585 static
5586 void
5587 try_send(thr_data * selfptr, Uint32 trp_id)
5588 {
5589   struct thr_repository *rep = g_thr_repository;
5590   struct thr_repository::send_buffer * sb = rep->m_send_buffers + trp_id;
5591 
5592   if (trylock(&sb->m_send_lock) == 0)
5593   {
5594     /**
5595      * Now clear the flag, and start sending all data available to this trp.
5596      *
5597      * Put a memory barrier here, so that if another thread tries to grab
5598      * the send lock but fails due to us holding it here, we either
5599      * 1) Will see m_force_send[id] set to 1 at the end of the loop, or
5600      * 2) We clear here the flag just set by the other thread, but then we
5601      * will (thanks to mb()) be able to see and send all of the data already
5602      * in the first send iteration.
5603      */
5604     sb->m_force_send = 0;
5605     mb();
5606 
5607     sb->m_send_thread = selfptr->m_thr_no;
5608     globalTransporterRegistry.performSend(trp_id);
5609     sb->m_send_thread = NO_SEND_THREAD;
5610     unlock(&sb->m_send_lock);
5611 
5612     /**
5613      * release buffers prior to maybe looping on sb->m_force_send
5614      */
5615     selfptr->m_send_buffer_pool.release_global(rep->m_mm,
5616                                                RG_TRANSPORTER_BUFFERS,
5617                                                selfptr->m_send_instance_no);
5618 
5619     /**
5620      * We need a memory barrier here to prevent race between clearing lock
5621      *   and reading of m_force_send.
5622      *   CPU can reorder the load to before the clear of the lock
5623      */
5624     mb();
5625     if (unlikely(sb->m_force_send))
5626     {
5627       register_pending_send(selfptr, trp_id);
5628     }
5629   }
5630 }
5631 
5632 /**
5633  * Flush send buffers and append them to the destination trp's send queue.
5634  *
5635  * Flushed buffer contents are piggybacked when another thread
5636  * does do_send() to the same destination trp. This makes it possible to have
5637  * more data included in each message, and thereby reduces the total
5638  * number of messages handled by the OS, which really impacts performance!
5639  */
5640 static
5641 void
5642 do_flush(struct thr_data* selfptr)
5643 {
5644   Uint32 i;
5645   Uint32 count = selfptr->m_pending_send_count;
5646   NodeId *trps = selfptr->m_pending_send_trps;
5647 
5648   for (i = 0; i < count; i++)
5649   {
5650     flush_send_buffer(selfptr, trps[i]);
5651   }
5652 }
5653 
5654 /**
5655  * Use the THRMAN block to send the WAKEUP_THREAD_ORD signal
5656  * to the block thread that we want to wake up.
5657  */
5658 #define MICROS_BETWEEN_WAKEUP_IDLE_THREAD 100
5659 static
5660 inline
5661 void
5662 send_wakeup_thread_ord(struct thr_data* selfptr,
5663                        NDB_TICKS now)
5664 {
5665   if (selfptr->m_wakeup_instance > 0)
5666   {
5667     Uint64 since_last =
5668       NdbTick_Elapsed(selfptr->m_last_wakeup_idle_thread, now).microSec();
5669     if (since_last > MICROS_BETWEEN_WAKEUP_IDLE_THREAD)
5670     {
5671       selfptr->m_signal->theData[0] = selfptr->m_wakeup_instance;
5672       SimulatedBlock *b = globalData.getBlock(THRMAN, selfptr->m_thr_no+1);
5673       b->executeFunction_async(GSN_SEND_WAKEUP_THREAD_ORD, selfptr->m_signal);
5674       selfptr->m_last_wakeup_idle_thread = now;
5675     }
5676   }
5677 }
5678 
5679 /**
5680  * Send any pending data to remote trps.
5681  *
5682  * If MUST_SEND is false, will only try to lock the send lock, but if it would
5683  * block, that trp is skipped, to be tried again next time round.
5684  *
5685  * If MUST_SEND is true, we still only try to lock, but if it would block,
5686  * we will force the thread holding the lock, to do the sending on our behalf.
5687  *
5688  * The list of pending trps to send to is thread-local, but the per-trp send
5689  * buffer is shared by all threads. Thus we might skip a trp for which
5690  * another thread has pending send data, and we might send pending data also
5691  * for another thread without clearing the trp from the pending list of that
5692  * other thread (but we will never lose signals due to this).
5693  *
5694  * Return the number of trps which still have pending data to be sent.
5695  * These will be retried again in the next round. 'Pending' is
5696  * returned as a negative number if nothing was sent in this round.
5697  *
5698  * (Likely due to receivers consuming too slowly, and receive and send buffers
5699  *  already being filled up)
5700  *
5701  * Sending data to other trps is a task that we perform using an algorithm
5702  * that depends on the state of block threads. The block threads can be in
5703  * 3 different states:
5704  *
5705  * LIGHT_LOAD:
5706  * -----------
5707  * In this state we will send to all trps we generate data for. In addition,
5708  * if we are going to sleep we will also send to one trp, and we will stay
5709  * awake until there are no more trps to send to. However, between each send
5710  * we will also ensure that we execute any signals destined for us.
5711  *
5712  * LIGHT_LOAD threads can also be provided to other threads as wakeup targets.
5713  * This means that these threads will be woken up regularly under load to
5714  * assist with sending.
5715  *
5716  * MEDIUM_LOAD:
5717  * ------------
5718  * At this load level we will also assist send threads before going to sleep
5719  * and continue so until we have work ourselves to do or until there are no
5720  * more trps to send to. We will additionally send part of our own data.
5721  * We will also wake up a send thread during send to ensure that sends are
5722  * performed ASAP.
5723  *
5724  * OVERLOAD:
5725  * ---------
5726  * At this level we will simply inform the send threads about the trps we
5727  * sent some data to; the actual sending will be handled by send threads
5728  * and other block threads assisting the send threads.
5729  *
5730  * In addition if any thread is at overload level we will sleep for a shorter
5731  * time.
5732  *
5733  * The decision about which idle threads to wake up, which overload level to
5734  * use and when to sleep for shorter time is all taken by the local THRMAN
5735  * block. Some decisions are also taken by the THRMAN instance in the main
5736  * thread.
5737  *
5738  * Send threads are woken up in a round robin fashion, each time they are
5739  * awoken they will continue executing until no more work is around.
5740  */
5741 static
5742 bool
5743 do_send(struct thr_data* selfptr, bool must_send, bool assist_send)
5744 {
5745   Uint32 count = selfptr->m_pending_send_count;
5746   NodeId *trps = selfptr->m_pending_send_trps;
5747 
5748   const NDB_TICKS now = NdbTick_getCurrentTicks();
5749   selfptr->m_curr_ticks = now;
5750   bool pending_send = false;
5751   selfptr->m_watchdog_counter = 6;
5752 
5753   if (count == 0)
5754   {
5755     if (must_send && assist_send && g_send_threads &&
5756         selfptr->m_overload_status <= (OverloadStatus)MEDIUM_LOAD_CONST &&
5757         (selfptr->m_nosend == 0))
5758     {
5759       /**
5760        * For some overload states we will here provide some
5761        * send assistance even though we had nothing to send
5762        * ourselves. We will however not need to offload any
5763        * sends ourselves.
5764        *
5765        * The idea is that when we get here the thread is usually not so
5766        * active with other things, as it has nothing to send. It must
5767        * send, which means that it is preparing to go to sleep, and
5768        * we have excluded the receive threads through assist_send.
5769        *
5770        * We will avoid this extra send when we are in overload mode since
5771        * it is likely that we will find work to do before going to sleep
5772        * anyways. In all other modes it makes sense to spend some time
5773        * sending before going to sleep. In particular TC threads will be
5774        * doing major send assistance here.
5775        *
5776        * In case there is more work to do and our thread is mostly idle,
5777        * we will soon enough be back here and assist the send thread
5778        * again. We make this happen by setting pending_send flag in
5779        * return from this mode. We come back here after checking that
5780        * we have no signals to process, so at most we will delay the
5781        * signal execution here by the time it takes to send to one
5782        * trp.
5783        *
5784        * The receive threads won't assist the send thread to ensure
5785        * that we can respond to incoming messages ASAP. We want to
5786        * optimise for response time here since this is needed to
5787        * ensure that the block threads have sufficient work to do.
5788        *
5789        * If we come here and have had nothing to send, then we're able to
5790        * do some more sending if there are pending sends still in the send queue.
5791        * So we return pending_send != 0 in this case to ensure that this
5792        * thread doesn't go to sleep, but rather come back here to assist the
5793        * send thread a bit more. We'll continue spinning here until we get
5794        * some work to do or until the send queue is empty.
5795        */
5796       Uint32 num_trps_to_send_to = 1;
5797       pending_send = g_send_threads->assist_send_thread(
5798                                          num_trps_to_send_to,
5799                                          selfptr->m_thr_no,
5800                                          now,
5801                                          selfptr->m_watchdog_counter,
5802                                          selfptr->m_send_instance,
5803                                          selfptr->m_send_buffer_pool);
5804       NDB_TICKS after = NdbTick_getCurrentTicks();
5805       selfptr->m_micros_send += NdbTick_Elapsed(now, after).microSec();
5806     }
5807     return pending_send; // send-buffers empty
5808   }
5809 
5810   /* Clear the pending list. */
5811   selfptr->m_pending_send_mask.clear();
5812   selfptr->m_pending_send_count = 0;
5813   selfptr->m_watchdog_counter = 6;
5814   for (Uint32 i = 0; i < count; i++)
5815   {
5816     /**
5817      * Make the data available for sending immediately so that
5818      * any other trp sending will grab this data without having
5819      * to wait for us to handle the other trps.
5820      */
5821     Uint32 id = trps[i];
5822     flush_send_buffer(selfptr, id);
5823   }
5824   selfptr->m_watchdog_counter = 6;
5825   if (g_send_threads)
5826   {
5827     /**
5828      * Each send thread is only responsible for a subset of the transporters
5829      * to send to and we will only assist a subset of the transporters
5830      * for sending. This means that it is very hard to predict whether send
5831      * thread needs to be woken up. This means that we will awake the send
5832      * threads required for sending, even if no send assistance was really
5833      * required. This will create some extra load on the send threads, but
5834      * will make NDB data nodes more scalable to handle extremely high loads.
5835      *
5836      * When we are in an overloaded state, we move the trps to send to
5837      * into the send thread global lists. Since we already woken up the
5838      * send threads to handle sends we do no more in overloaded state.
5839      *
5840      * We don't record any send time here since it would be
5841      * an unnecessary extra load, we only grab a mutex and
5842      * ensure that someone else takes over our send work.
5843      *
5844      * When the user has set nosend=1 on this thread we will
5845      * never assist with the sending.
5846      */
5847     if (selfptr->m_overload_status == (OverloadStatus)OVERLOAD_CONST ||
5848         selfptr->m_nosend != 0)
5849     {
5850       for (Uint32 i = 0; i < count; i++)
5851       {
5852         g_send_threads->alert_send_thread(trps[i], now, NULL);
5853       }
5854     }
5855     else
5856     {
5857       /**
5858        * While we are in a light load state we will always try to
5859        * send to as many trps as we inserted ourselves. In this case
5860        * we don't need to wake any send threads. If the trps still need
5861        * sending to after we're done we will ensure that a send thread
5862        * is woken up. assist_send_thread will ensure that send threads
5863        * are woken up if needed.
5864        *
5865        * At medium load levels we keep track of how many trps we have
5866        * wanted to send to and ensure that we at least do a part of that
5867        * work if need be. However we try as much as possible to avoid
5868        * sending at medium load at this point since we still have more
5869        * work to do. So we offload the sending to other threads and
5870        * wait with providing send assistance until we're out of work
5871        * or we have accumulated sufficiently to provide a bit of
5872        * assistance to the send threads.
5873        *
5874        * At medium load we set num_trps_inserted to 0 since we
5875        * have already woken up a send thread and thus there is no
5876        * need to wake up another thread in assist_send_thread, so we
5877        * indicate that we call this function only to assist and need
5878        * no wakeup service.
5879        *
5880        * We will check here also if we should wake an idle thread to
5881        * do some send assistance. We check so that we don't perform
5882        * this wakeup function too often.
5883        */
5884 
5885       Uint32 num_trps_inserted = 0;
5886       for (Uint32 i = 0; i < count; i++)
5887       {
5888         num_trps_inserted += g_send_threads->alert_send_thread(trps[i],
5889                                                                now,
5890                                              selfptr->m_send_instance);
5891       }
5892       Uint32 num_trps_to_send_to = num_trps_inserted;
5893       if (selfptr->m_overload_status != (OverloadStatus)MEDIUM_LOAD_CONST)
5894       {
5895         num_trps_to_send_to++;
5896       }
5897       send_wakeup_thread_ord(selfptr, now);
5898       if (num_trps_to_send_to > 0)
5899       {
5900         pending_send = g_send_threads->assist_send_thread(
5901                                            num_trps_to_send_to,
5902                                            selfptr->m_thr_no,
5903                                            now,
5904                                            selfptr->m_watchdog_counter,
5905                                            selfptr->m_send_instance,
5906                                            selfptr->m_send_buffer_pool);
5907       }
5908       NDB_TICKS after = NdbTick_getCurrentTicks();
5909       selfptr->m_micros_send += NdbTick_Elapsed(now, after).microSec();
5910       g_send_threads->wake_my_send_thread_if_needed(&trps[0],
5911                                     count,
5912                                     selfptr->m_send_instance);
5913     }
5914     return pending_send;
5915   }
5916 
5917   /**
5918    * We're not using send threads, we keep this code around for now
5919    * to ensure that we can support the same behaviour also in newer
5920    * versions for a while. Eventually this code will be deprecated.
5921    */
5922   Uint32 made_progress = 0;
5923   struct thr_repository* rep = g_thr_repository;
5924 
5925   for (Uint32 i = 0; i < count; i++)
5926   {
5927     Uint32 id = trps[i];
5928     thr_repository::send_buffer * sb = rep->m_send_buffers + id;
5929 
5930     selfptr->m_watchdog_counter = 6;
5931 
5932     /**
5933      * If we must send now, set the force_send flag.
5934      *
5935      * This will ensure that if we do not get the send lock, the thread
5936      * holding the lock will try sending again for us when it has released
5937      * the lock.
5938      *
5939      * The lock/unlock pair works as a memory barrier to ensure that the
5940      * flag update is flushed to the other thread.
5941      */
5942     if (must_send)
5943     {
5944       sb->m_force_send = 1;
5945     }
5946 
5947     if (trylock(&sb->m_send_lock) != 0)
5948     {
5949       if (!must_send)
5950       {
5951         /**
5952          * Not doing this trp now, re-add to pending list.
5953          *
5954          * As we only add from the start of an empty list, we are safe from
5955          * overwriting the list while we are iterating over it.
5956          */
5957         register_pending_send(selfptr, id);
5958       }
5959       else
5960       {
5961         /* Other thread will send for us as we set m_force_send. */
5962       }
5963     }
5964     else  //Got send_lock
5965     {
5966       /**
5967        * Now clear the flag, and start sending all data available to this trp.
5968        *
5969        * Put a memory barrier here, so that if another thread tries to grab
5970        * the send lock but fails due to us holding it here, we either
5971        * 1) Will see m_force_send[id] set to 1 at the end of the loop, or
5972        * 2) We clear here the flag just set by the other thread, but then we
5973        * will (thanks to mb()) be able to see and send all of the data already
5974        * in the first send iteration.
5975        */
5976       sb->m_force_send = 0;
5977       mb();
5978 
5979       /**
5980        * Set m_send_thread so that our transporter callback can know which
5981        * thread holds the send lock for this remote trp.
5982        */
5983       sb->m_send_thread = selfptr->m_thr_no;
5984       const bool more = globalTransporterRegistry.performSend(id);
5985       made_progress += sb->m_bytes_sent;
5986       sb->m_send_thread = NO_SEND_THREAD;
5987       unlock(&sb->m_send_lock);
5988 
5989       if (more)   //Didn't complete all my send work
5990       {
5991         register_pending_send(selfptr, id);
5992       }
5993       else
5994       {
5995         /**
5996          * We need a memory barrier here to prevent race between clearing lock
5997          *   and reading of m_force_send.
5998          *   CPU can reorder the load to before the clear of the lock
5999          */
6000         mb();
6001         if (sb->m_force_send) //Other thread forced us to do more send
6002         {
6003           made_progress++;    //Avoid false 'no progress' handling
6004           register_pending_send(selfptr, id);
6005         }
6006       }
6007     }
6008   } //for all trps
6009 
6010   selfptr->m_send_buffer_pool.release_global(rep->m_mm,
6011                                              RG_TRANSPORTER_BUFFERS,
6012                                              selfptr->m_send_instance_no);
6013 
6014   return (made_progress)         // Had some progress?
6015      ?  (selfptr->m_pending_send_count > 0)   // More do_send is required
6016     : false;                     // All busy, or didn't find any work (-> -0)
6017 }
6018 
6019 #ifdef ERROR_INSERT
6020 void
6021 mt_set_delayed_prepare(Uint32 self)
6022 {
6023   thr_repository *rep = g_thr_repository;
6024   struct thr_data *selfptr = &rep->m_thread[self];
6025 
6026   selfptr->m_delayed_prepare = true;
6027 }
6028 #endif
6029 
6030 
6031 /**
6032  * These are the implementations of the TransporterSendBufferHandle methods
6033  * in ndbmtd.
6034  */
6035 Uint32 *
6036 mt_send_handle::getWritePtr(NodeId nodeId,
6037                             TrpId trp_id,
6038                             Uint32 len,
6039                             Uint32 prio,
6040                             Uint32 max,
6041                             SendStatus *error)
6042 {
6043   (void)nodeId;
6044 #ifdef ERROR_INSERT
6045   if (m_selfptr->m_delayed_prepare)
6046   {
6047     g_eventLogger->info("MT thread %u delaying in prepare",
6048                         m_selfptr->m_thr_no);
6049     NdbSleep_MilliSleep(500);
6050     g_eventLogger->info("MT thread %u finished delay, clearing",
6051                         m_selfptr->m_thr_no);
6052     m_selfptr->m_delayed_prepare = false;
6053   }
6054 #endif
6055 
6056   struct thr_send_buffer * b = m_selfptr->m_send_buffers+trp_id;
6057   thr_send_page * p = b->m_last_page;
6058   if (likely(p != NULL))
6059   {
6060     assert(p->m_start == 0); //Nothing sent until flushed
6061 
6062     if (likely(p->m_bytes + len <= thr_send_page::max_bytes()))
6063     {
6064       return (Uint32*)(p->m_data + p->m_bytes);
6065     }
6066     // TODO: maybe don't always flush on page-boundary ???
6067     flush_send_buffer(m_selfptr, trp_id);
6068     if (!g_send_threads)
6069       try_send(m_selfptr, trp_id);
6070   }
6071   if(unlikely(len > thr_send_page::max_bytes()))
6072   {
6073     *error = SEND_MESSAGE_TOO_BIG;
6074     return 0;
6075   }
6076 
6077   bool first = true;
6078   while (first)
6079   {
6080     if (likely((p = m_selfptr->m_send_buffer_pool.seize(g_thr_repository->m_mm,
6081                                       RG_TRANSPORTER_BUFFERS,
6082                                       m_selfptr->m_send_instance_no)) != 0))
6083     {
6084       p->m_bytes = 0;
6085       p->m_start = 0;
6086       p->m_next = 0;
6087       b->m_first_page = b->m_last_page = p;
6088       return (Uint32*)p->m_data;
6089     }
6090     try_pack_send_buffers(m_selfptr);
6091     first = false;
6092   }
6093   *error = SEND_BUFFER_FULL;
6094   return 0;
6095 }
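
/**
 * Sketch of how the getWritePtr()/updateWritePtr() pair above is intended
 * to be used by the transporter layer (illustrative only; error handling
 * and the real call sites are simplified, and 'handle', 'signal_data' and
 * 'len' are stand-in names):
 *
 *   SendStatus error = SEND_OK;
 *   Uint32 *ptr = handle->getWritePtr(nodeId, trp_id, len, prio,
 *                                     max, &error);
 *   if (ptr != NULL)
 *   {
 *     memcpy(ptr, signal_data, len);                  // fill reserved space
 *     handle->updateWritePtr(nodeId, trp_id, len, prio);
 *   }
 *   // else: inspect 'error' (SEND_BUFFER_FULL / SEND_MESSAGE_TOO_BIG)
 */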
6096 
6097 /**
6098  * Acquire total send buffer size without locking and without gathering
6099  *
6100  * OJA: The usability of this function is rather questionable.
6101  *      m_buffered_size and m_sending_size are updated by
6102  *      link_thread_send_buffers(), get_bytes_to_send_iovec() and
6103  *      bytes_sent() - All part of performSend(). Thus, it is
6104  *      valid *after* a send.
6105  *
6106  *      However, checking it *before* a send in order to
6107  *      determine if the payload is still too small doesn't
6108  *      really provide correct information about the current state.
6109  *      Most likely '0' will be returned if the previous send succeeded.
6110  *
6111  *      A better alternative could be to add a 'min_send' argument
6112  *      to perform_send(), and skip sending if not '>='.
6113  *      (After real size is recalculated)
6114  */
6115 static Uint64
6116 mt_get_send_buffer_bytes(TrpId trp_id)
6117 {
6118   thr_repository *rep = g_thr_repository;
6119   thr_repository::send_buffer *sb = &rep->m_send_buffers[trp_id];
6120   const Uint64 total_send_buffer_size =
6121     sb->m_buffered_size + sb->m_sending_size;
6122   return total_send_buffer_size;
6123 }
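
/**
 * Illustrative sketch only (kept out of the build with #if 0): one possible
 * shape of the 'min_send' alternative suggested in the comment above. The
 * function name perform_send_min() and its caller protocol are hypothetical;
 * the sketch merely shows the idea of skipping a send while the buffered
 * payload is still below a threshold.
 */
#if 0
static bool
perform_send_min(TrpId trp_id, Uint64 min_send)
{
  /* Skip sending while too little payload has accumulated so far. */
  if (mt_get_send_buffer_bytes(trp_id) < min_send)
    return false;              // Caller retries after more data is buffered
  /* ... recalculate the real size and perform the actual send here ... */
  return true;
}
#endif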
6124 
6125 void
6126 mt_getSendBufferLevel(Uint32 self, NodeId id, SB_LevelType &level)
6127 {
6128   Resource_limit rl;
6129   const Uint32 page_size = thr_send_page::PGSIZE;
6130   thr_repository *rep = g_thr_repository;
6131   thr_repository::send_buffer *sb = &rep->m_send_buffers[id];
6132   const Uint64 current_trp_send_buffer_size =
6133     sb->m_buffered_size + sb->m_sending_size;
6134 
6135   /* Memory barrier to get a fresher value for rl.m_curr */
6136   mb();
6137   rep->m_mm->get_resource_limit_nolock(RG_TRANSPORTER_BUFFERS, rl);
6138   Uint64 current_send_buffer_size = rl.m_min * page_size;
6139   Uint64 current_used_send_buffer_size = rl.m_curr * page_size;
6140   Uint64 current_percentage =
6141     (100 * current_used_send_buffer_size) / current_send_buffer_size;
6142 
6143   if (current_percentage >= 90)
6144   {
6145     const Uint32 avail_shared = rep->m_mm->get_free_shared_nolock();
6146     if (rl.m_min + avail_shared > rl.m_max)
6147     {
6148       current_send_buffer_size = rl.m_max * page_size;
6149     }
6150     else
6151     {
6152       current_send_buffer_size = (rl.m_min + avail_shared) * page_size;
6153     }
6154   }
6155   calculate_send_buffer_level(current_trp_send_buffer_size,
6156                               current_send_buffer_size,
6157                               current_used_send_buffer_size,
6158                               glob_num_threads,
6159                               level);
6160   return;
6161 }
6162 
6163 void
6164 mt_send_handle::getSendBufferLevel(NodeId id, SB_LevelType &level)
6165 {
6166   (void)id;
6167   (void)level;
6168   return;
6169 }
6170 
6171 Uint32
6172 mt_send_handle::updateWritePtr(NodeId nodeId,
6173                                TrpId trp_id,
6174                                Uint32 lenBytes,
6175                                Uint32 prio)
6176 {
6177   (void)nodeId;
6178   struct thr_send_buffer * b = m_selfptr->m_send_buffers+trp_id;
6179   thr_send_page * p = b->m_last_page;
6180   p->m_bytes += lenBytes;
6181   return p->m_bytes;
6182 }
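
/**
 * Illustrative sketch only (kept out of the build with #if 0) of how a
 * caller is expected to pair getWritePtr() and updateWritePtr(): reserve
 * space, copy the signal words, then commit the number of bytes written.
 * The helper name example_append_signal() and its parameters are made up
 * for illustration and are not part of the transporter interface.
 */
#if 0
static void
example_append_signal(mt_send_handle &handle, NodeId node, TrpId trp,
                      const Uint32 *words, Uint32 len_bytes)
{
  SendStatus error = SEND_OK;
  Uint32 *dst = handle.getWritePtr(node, trp, len_bytes, /* prio */ 1,
                                   /* max */ len_bytes, &error);
  if (dst == NULL)
    return;     // error is SEND_MESSAGE_TOO_BIG or SEND_BUFFER_FULL
  memcpy(dst, words, len_bytes);
  handle.updateWritePtr(node, trp, len_bytes, /* prio */ 1);
}
#endif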
6183 
6184 /*
6185  * Insert a signal in a job queue.
6186  *
6187  * The signal is not visible to consumers yet after return from this function,
6188  * only recorded in the thr_jb_write_state. It is necessary to first call
6189  * flush_write_state() for this.
6190  *
6191  * The new_buffer is a job buffer to use if the current one gets full. If used,
6192  * we return true, indicating that the caller should allocate a new one for
6193  * the next call. (This is done to allow inserting under the lock, while
6194  * doing the allocation outside the lock.)
6195  */
6196 static inline
6197 bool
6198 insert_signal(thr_job_queue *q, thr_job_queue_head *h,
6199               thr_jb_write_state *w, Uint32 prioa,
6200               const SignalHeader* sh, const Uint32 *data,
6201               const Uint32 secPtr[3], thr_job_buffer *new_buffer)
6202 {
6203   Uint32 write_pos = w->m_write_pos;
6204   Uint32 datalen = sh->theLength;
6205   assert(w->is_open());
6206   assert(w->m_write_buffer == q->m_buffers[w->m_write_index]);
6207   memcpy(w->m_write_buffer->m_data + write_pos, sh, sizeof(*sh));
6208   write_pos += (sizeof(*sh) >> 2);
6209   memcpy(w->m_write_buffer->m_data + write_pos, data, 4*datalen);
6210   write_pos += datalen;
6211   const Uint32 *p= secPtr;
6212   for (Uint32 i = 0; i < sh->m_noOfSections; i++)
6213     w->m_write_buffer->m_data[write_pos++] = *p++;
6214   w->increment_pending_signals();
6215 
6216 #if SIZEOF_CHARP == 8
6217   /* Align to 8-byte boundary, to ensure aligned copies. */
6218   write_pos= (write_pos+1) & ~((Uint32)1);
6219 #endif
6220 
6221   /*
6222    * We make sure that there is always room for at least one signal in the
6223    * current buffer in the queue, so one insert is always possible without
6224    * adding a new buffer.
6225    */
6226   if (likely(write_pos + MAX_SIGNAL_SIZE <= thr_job_buffer::SIZE))
6227   {
6228     w->m_write_pos = write_pos;
6229     return false;
6230   }
6231   else
6232   {
6233     /*
6234      * Need a write memory barrier here, as this might make signal data visible
6235      * to other threads.
6236      *
6237      * ToDo: We actually only need the wmb() here if we already make this
6238      * buffer visible to the other thread. So we might optimize it a bit. But
6239      * wmb() is a no-op on x86 anyway...
6240      */
6241     wmb();
6242     w->m_write_buffer->m_len = write_pos;
6243     Uint32 write_index = (w->m_write_index + 1) % thr_job_queue::SIZE;
6244 
6245     /**
6246      * Full job buffer is fatal.
6247      *
6248      * ToDo: should we wait for it to become non-full? There is no guarantee
6249      * that this will actually happen...
6250      *
6251      * Or alternatively, ndbrequire() ?
6252      */
6253     if (unlikely(write_index == h->m_read_index))
6254     {
6255       job_buffer_full(0);
6256     }
6257     new_buffer->m_len = 0;
6258     new_buffer->m_prioa = prioa;
6259     q->m_buffers[write_index] = new_buffer;
6260     w->m_write_index = write_index;
6261     w->m_write_pos = 0;
6262     w->m_write_buffer = new_buffer;
6263     return true;                // Buffer new_buffer used
6264   }
6265 
6266   return false;                 // Buffer new_buffer not used
6267 }
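
/**
 * Illustrative sketch only (kept out of the build with #if 0) of the caller
 * protocol described above: insert under the queue lock, but allocate the
 * replacement buffer outside of it. The lock variable and the exact seize
 * call are simplified placeholders for illustration.
 */
#if 0
thr_job_buffer *spare = seize_buffer(rep, self, /* prioa */ false);
lock(&queue_lock);
const bool used = insert_signal(q, h, w, /* prioa */ 0, sh, data, secPtr, spare);
unlock(&queue_lock);
if (used)
{
  /* new_buffer was consumed; allocate a fresh spare outside the lock. */
  spare = seize_buffer(rep, self, /* prioa */ false);
}
#endif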
6268 
6269 static
6270 void
6271 read_jbb_state(thr_data *selfptr, Uint32 count)
6272 {
6273   thr_jb_read_state *r = selfptr->m_read_states;
6274   const thr_job_queue *q = selfptr->m_in_queue;
6275   const thr_job_queue_head *h = selfptr->m_in_queue_head;
6276   for (Uint32 i = 0; i < count; i++,r++)
6277   {
6278     if (r->is_open())
6279     {
6280       Uint32 read_index = r->m_read_index;
6281 
6282       /**
6283        * Optimization: Only reload when possibly empty.
6284        * Avoid cache reload of the shared thr_job_queue_head.
6285        * Read the head directly to avoid an unnecessary cache
6286        * load of first cache line of m_in_queue entry.
6287        */
6288       if (r->m_write_index == read_index)
6289       {
6290         r->m_write_index = h[i].m_write_index;
6291         read_barrier_depends();
6292         r->m_read_end = q[i].m_buffers[read_index]->m_len;
6293       }
6294     }
6295   }
6296 }
6297 
6298 static
6299 bool
6300 read_jba_state(thr_data *selfptr)
6301 {
6302   thr_jb_read_state *r = &(selfptr->m_jba_read_state);
6303   r->m_write_index = selfptr->m_jba_head.m_write_index;
6304   read_barrier_depends();
6305   r->m_read_end = selfptr->m_jba.m_buffers[r->m_read_index]->m_len;
6306   return r->is_empty();
6307 }
6308 
6309 static
6310 inline
6311 bool
6312 check_for_input_from_ndbfs(struct thr_data* thr_ptr, Signal* signal)
6313 {
6314   /**
6315    * This manner of checking for input from NDBFS file threads misuses
6316    * the SEND_PACKED signal. For ndbmtd this is intended to be
6317    * replaced by using signals directly from NDBFS file threads to
6318    * the issuer of the file request. This is WL#8890.
6319    */
6320   Uint32 i;
6321   for (i = 0; i < thr_ptr->m_instance_count; i++)
6322   {
6323     BlockReference block = thr_ptr->m_instance_list[i];
6324     Uint32 main = blockToMain(block);
6325     if (main == NDBFS)
6326     {
6327       Uint32 instance = blockToInstance(block);
6328       SimulatedBlock* b = globalData.getBlock(main, instance);
6329       b->executeFunction_async(GSN_SEND_PACKED, signal);
6330       if (signal->theData[0] == 1)
6331         return true;
6332       return false;
6333     }
6334   }
6335   return false;
6336 }
6337 
6338 /* Check all job queues, return true only if all are empty. */
6339 static bool
6340 check_queues_empty(thr_data *selfptr)
6341 {
6342   Uint32 thr_count = g_thr_repository->m_thread_count;
6343   if (selfptr->m_thr_no == 0)
6344   {
6345     if (check_for_input_from_ndbfs(selfptr, selfptr->m_signal))
6346       return false;
6347   }
6348   bool empty = read_jba_state(selfptr);
6349   if (!empty)
6350     return false;
6351 
6352   read_jbb_state(selfptr, thr_count);
6353   const thr_jb_read_state *r = selfptr->m_read_states;
6354   for (Uint32 i = 0; i < thr_count; i++,r++)
6355   {
6356     if (!r->is_empty())
6357       return false;
6358   }
6359   return true;
6360 }
6361 
6362 static
6363 inline
6364 void
6365 sendpacked(struct thr_data* thr_ptr, Signal* signal)
6366 {
6367   Uint32 i;
6368   signal->header.m_noOfSections = 0; /* valgrind */
6369   thr_ptr->m_watchdog_counter = 15;
6370   for (i = 0; i < thr_ptr->m_instance_count; i++)
6371   {
6372     BlockReference block = thr_ptr->m_instance_list[i];
6373     Uint32 main = blockToMain(block);
6374     if (main == DBLQH || main == DBTC || main == DBTUP || main == NDBFS)
6375     {
6376       Uint32 instance = blockToInstance(block);
6377       SimulatedBlock* b = globalData.getBlock(main, instance);
6378       // wl4391_todo remove useless assert
6379       assert(b != 0 && b->getThreadId() == thr_ptr->m_thr_no);
6380       /* b->send_at_job_buffer_end(); */
6381       b->executeFunction_async(GSN_SEND_PACKED, signal);
6382     }
6383   }
6384 }
6385 
6386 /**
6387  * We check whether it is time to call do_send or do_flush. These are
6388  * central decisions to the data node scheduler in a multithreaded data
6389  * node. If we wait for too long to make this decision it will severely
6390  * impact our response times since messages will be waiting in the send
6391  * buffer without being sent for up to several milliseconds.
6392  *
6393  * Since we call this function now after executing jobs from one thread,
6394  * we will never call this function with more than 75 signals executed.
6395  * The decision to send/flush is determined by config parameters that
6396  * control the responsiveness of MySQL Cluster. Setting it to be highly
6397  * responsive means that we will send very often at the expense of
6398  * throughput. Setting it for high throughput means that we will send
6399  * seldom, gaining throughput at the expense of response time.
6400  *
6401  * It is possible to change this variable through a DUMP command, so it
6402  * can be adjusted as the environment changes.
6403  */
6404 static
6405 void handle_scheduling_decisions(thr_data *selfptr,
6406                                  Signal *signal,
6407                                  Uint32 & send_sum,
6408                                  Uint32 & flush_sum,
6409                                  bool & pending_send)
6410 {
6411   if (send_sum >= selfptr->m_max_signals_before_send)
6412   {
6413     /* Try to send, but skip for now in case of lock contention. */
6414     sendpacked(selfptr, signal);
6415     selfptr->m_watchdog_counter = 6;
6416     flush_jbb_write_state(selfptr);
6417     pending_send = do_send(selfptr, FALSE, FALSE);
6418     selfptr->m_watchdog_counter = 20;
6419     send_sum = 0;
6420     flush_sum = 0;
6421   }
6422   else if (flush_sum >= selfptr->m_max_signals_before_send_flush)
6423   {
6424     /* Send buffers append to send queues to dst. trps. */
6425     sendpacked(selfptr, signal);
6426     selfptr->m_watchdog_counter = 6;
6427     flush_jbb_write_state(selfptr);
6428     do_flush(selfptr);
6429     selfptr->m_watchdog_counter = 20;
6430     flush_sum = 0;
6431   }
6432 }
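
/**
 * Worked example (illustrative numbers): with SchedulerResponsiveness = 5,
 * calculate_max_signals_parameters() below sets
 * m_max_signals_before_send = 300 and m_max_signals_before_send_flush = 110.
 * If each batch executed from a job buffer is the maximum of 75 signals,
 * flush_sum reaches 150 >= 110 after the second batch (-> do_flush), and
 * send_sum reaches 300 >= 300 after the fourth batch (-> do_send, which
 * resets both counters).
 */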
6433 
6434 #if defined(USE_INIT_GLOBAL_VARIABLES)
6435   void mt_clear_global_variables(thr_data*);
6436 #endif
6437 /*
6438  * Execute at most MAX_SIGNALS signals from one job queue, updating local read
6439  * state as appropriate.
6440  *
6441  * Returns number of signals actually executed.
6442  */
6443 static
6444 Uint32
6445 execute_signals(thr_data *selfptr,
6446                 thr_job_queue *q, thr_job_queue_head *h,
6447                 thr_jb_read_state *r,
6448                 Signal *sig, Uint32 max_signals)
6449 {
6450   Uint32 num_signals;
6451   Uint32 extra_signals = 0;
6452   Uint32 read_index = r->m_read_index;
6453   Uint32 write_index = r->m_write_index;
6454   Uint32 read_pos = r->m_read_pos;
6455   Uint32 read_end = r->m_read_end;
6456   Uint32 *watchDogCounter = &selfptr->m_watchdog_counter;
6457 
6458   if (read_index == write_index && read_pos >= read_end)
6459     return 0;          // empty read_state
6460 
6461   thr_job_buffer *read_buffer = r->m_read_buffer;
6462 
6463   for (num_signals = 0; num_signals < max_signals; num_signals++)
6464   {
6465     *watchDogCounter = 12;
6466     while (read_pos >= read_end)
6467     {
6468       if (read_index == write_index)
6469       {
6470         /* No more available now. */
6471         selfptr->m_stat.m_exec_cnt += num_signals;
6472         return num_signals;
6473       }
6474       else
6475       {
6476         /* Move to next buffer. */
6477         read_index = (read_index + 1) % thr_job_queue::SIZE;
6478         release_buffer(g_thr_repository, selfptr->m_thr_no, read_buffer);
6479         read_buffer = q->m_buffers[read_index];
6480         read_pos = 0;
6481         read_end = read_buffer->m_len;
6482         /* Update thread-local read state. */
6483         r->m_read_index = h->m_read_index = read_index;
6484         r->m_read_buffer = read_buffer;
6485         r->m_read_pos = read_pos;
6486         r->m_read_end = read_end;
6487         /* Wakeup threads waiting for job buffers to become free */
6488         wakeup(&h->m_waiter);
6489       }
6490     }
6491 
6492     /*
6493      * These prefetches were found, using OProfile, to reduce cache misses.
6494      * (Though on Intel Core 2, they do not give much speedup, as apparently
6495      * the hardware prefetcher is already doing a fairly good job).
6496      */
6497     NDB_PREFETCH_READ (read_buffer->m_data + read_pos + 16);
6498     NDB_PREFETCH_WRITE ((Uint32 *)&sig->header + 16);
6499 
6500 #ifdef VM_TRACE
6501     /* Find reading / propagation of junk */
6502     sig->garbage_register();
6503 #endif
6504     /* Now execute the signal. */
6505     SignalHeader* s =
6506       reinterpret_cast<SignalHeader*>(read_buffer->m_data + read_pos);
6507     Uint32 seccnt = s->m_noOfSections;
6508     Uint32 siglen = (sizeof(*s)>>2) + s->theLength;
6509     if(siglen>16)
6510     {
6511       NDB_PREFETCH_READ (read_buffer->m_data + read_pos + 32);
6512     }
6513     Uint32 bno = blockToMain(s->theReceiversBlockNumber);
6514     Uint32 ino = blockToInstance(s->theReceiversBlockNumber);
6515     SimulatedBlock* block = globalData.mt_getBlock(bno, ino);
6516     assert(block != 0);
6517 
6518     Uint32 gsn = s->theVerId_signalNumber;
6519     *watchDogCounter = 1 +
6520       (bno << 8) +
6521       (gsn << 20);
6522 
6523     /* Must update original buffer so signal dump will see it. */
6524     s->theSignalId = selfptr->m_signal_id_counter++;
6525     memcpy(&sig->header, s, 4*siglen);
6526     for(Uint32 i = 0; i < seccnt; i++)
6527     {
6528       sig->m_sectionPtrI[i] = read_buffer->m_data[read_pos + siglen + i];
6529     }
6530 
6531     read_pos += siglen + seccnt;
6532 #if SIZEOF_CHARP == 8
6533     /* Handle 8-byte alignment. */
6534     read_pos = (read_pos + 1) & ~((Uint32)1);
6535 #endif
6536 
6537     /* Update just before execute so signal dump can know how far we are. */
6538     r->m_read_pos = read_pos;
6539 
6540 #ifdef VM_TRACE
6541     if (globalData.testOn)
6542     { //wl4391_todo segments
6543       SegmentedSectionPtr ptr[3];
6544       ptr[0].i = sig->m_sectionPtrI[0];
6545       ptr[1].i = sig->m_sectionPtrI[1];
6546       ptr[2].i = sig->m_sectionPtrI[2];
6547       ::getSections(seccnt, ptr);
6548       globalSignalLoggers.executeSignal(*s,
6549                                         0,
6550                                         &sig->theData[0],
6551                                         globalData.ownId,
6552                                         ptr, seccnt);
6553     }
6554 #endif
6555 
6556     /**
6557      * In 7.4 we introduced the ability for scans in LDM threads to scan
6558      * several rows in the same signal execution without issuing a
6559      * CONTINUEB signal. This means that we effectively changed the
6560      * real-time characteristics of the scheduler. This change ensures
6561      * that we behave the same way as in 7.3 and earlier with respect to
6562      * how many signals are executed. So the m_extra_signals variable can
6563      * be used in the future for other cases where we combine several
6564      * signal executions into one signal and thus ensure that we don't
6565      * change the scheduler algorithms.
6566      *
6567      * This variable is incremented every time we decide to execute more
6568      * signals without real-time breaks in scans in DBLQH.
6569      */
6570     block->jamBuffer()->markEndOfSigExec();
6571     sig->m_extra_signals = 0;
6572 #if defined(USE_INIT_GLOBAL_VARIABLES)
6573     mt_clear_global_variables(selfptr);
6574 #endif
6575     block->executeFunction_async(gsn, sig);
6576     extra_signals += sig->m_extra_signals;
6577   }
6578   /**
6579    * Only count signals causing real-time break and not the one used to
6580    * balance the scheduler.
6581    */
6582   selfptr->m_stat.m_exec_cnt += num_signals;
6583 
6584   return num_signals + extra_signals;
6585 }
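
/**
 * Illustrative layout of a single signal in a job buffer, as written by
 * insert_signal() and consumed above (one word = 32 bits):
 *
 *   [ SignalHeader  : sizeof(SignalHeader) / 4 words ]
 *   [ signal data   : theLength words                ]
 *   [ section ptrIs : m_noOfSections words           ]
 *
 * On 64-bit platforms the read/write position is then rounded up to an
 * even word so that the next signal starts 8-byte aligned.
 */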
6586 
6587 static
6588 Uint32
6589 run_job_buffers(thr_data *selfptr,
6590                 Signal *sig,
6591                 Uint32 & send_sum,
6592                 Uint32 & flush_sum,
6593                 bool & pending_send)
6594 {
6595   Uint32 thr_count = g_thr_repository->m_thread_count;
6596   Uint32 signal_count = 0;
6597   Uint32 signal_count_since_last_zero_time_queue = 0;
6598   Uint32 perjb = selfptr->m_max_signals_per_jb;
6599 
6600   read_jbb_state(selfptr, thr_count);
6601   /*
6602    * A load memory barrier to ensure that we see any prio A signal sent later
6603    * than loaded prio B signals.
6604    */
6605   rmb();
6606 
6607   /**
6608    * For the main thread we can stop at any job buffer, so we proceed from
6609  * where we stopped, treating the different job buffers as equal in importance.
6610    *
6611    * For all other threads m_next_jbb_no should always be 0 when we reach here.
6612    */
6613   Uint32 first_jbb_no = selfptr->m_next_jbb_no;
6614   thr_job_queue *queue = selfptr->m_in_queue + first_jbb_no;
6615   thr_job_queue_head *head = selfptr->m_in_queue_head + first_jbb_no;
6616   thr_jb_read_state *read_state = selfptr->m_read_states + first_jbb_no;
6617   selfptr->m_watchdog_counter = 13;
6618   for (Uint32 jbb_no = first_jbb_no;
6619        jbb_no < thr_count;
6620        jbb_no++,queue++,read_state++,head++)
6621   {
6622     /* Read the prio A state often, to avoid starvation of prio A. */
6623     while (!read_jba_state(selfptr))
6624     {
6625       selfptr->m_sent_local_prioa_signal = false;
6626       static Uint32 max_prioA = thr_job_queue::SIZE * thr_job_buffer::SIZE;
6627       Uint32 num_signals = execute_signals(selfptr,
6628                                            &(selfptr->m_jba),
6629                                            &(selfptr->m_jba_head),
6630                                            &(selfptr->m_jba_read_state), sig,
6631                                            max_prioA);
6632       signal_count += num_signals;
6633       send_sum += num_signals;
6634       flush_sum += num_signals;
6635       if (!selfptr->m_sent_local_prioa_signal)
6636       {
6637         /**
6638          * Break out of loop if there was no prio A signals generated
6639          * from the local execution.
6640          */
6641         break;
6642       }
6643     }
6644 
6645     /**
6646      * Contended queues get an extra execute quota:
6647      *
6648      * If we didn't get a max 'perjb' quota, our out buffers
6649      * are about to fill up. This thread is thus effectively
6650      * slowed down in order to let other threads consume from
6651      * our out buffers. Eventually, when 'perjb==0', we will
6652      * have to wait/sleep for buffers to become available.
6653      *
6654      * This can bring us into a circular wait-lock, where
6655      * threads are stalled due to full out buffers. The same
6656      * thread may also have full in buffers, thus blocking other
6657      * threads from progressing. In the worst case no thread is
6658      * able to progress at all, and the entire scheduler will
6659      * then be stuck.
6660      *
6661      * We try to avoid this situation by reserving some
6662      * 'm_max_extra_signals' which are only used to consume
6663      * from 'almost full' in-buffers. We will then reduce the
6664      * risk of ending up in the above wait-lock.
6665      *
6666      * Exclude receiver threads, as there can't be a
6667      * circular wait between recv-thread and workers.
6668      */
6669     Uint32 extra = 0;
6670 
6671     if (perjb < MAX_SIGNALS_PER_JB)  //Job buffer contention
6672     {
6673       const Uint32 free = compute_free_buffers_in_queue(head);
6674       if (free <= thr_job_queue::ALMOST_FULL)
6675       {
6676         if (selfptr->m_max_extra_signals > MAX_SIGNALS_PER_JB - perjb)
6677         {
6678           extra = MAX_SIGNALS_PER_JB - perjb;
6679         }
6680         else
6681         {
6682           extra = selfptr->m_max_extra_signals;
6683           selfptr->m_max_exec_signals = 0; //Force recalc
6684         }
6685         selfptr->m_max_extra_signals -= extra;
6686       }
6687     }
6688 
6689 #ifdef ERROR_INSERT
6690 
6691 #define MIXOLOGY_MIX_MT_JBB 1
6692 
6693     if (unlikely(globalEmulatorData.theConfiguration->getMixologyLevel() &
6694                  MIXOLOGY_MIX_MT_JBB))
6695     {
6696       /**
6697        * Let's maximise interleaving to find inter-thread
6698        * signal order dependency bugs
6699        */
6700       perjb = 1;
6701       extra = 0;
6702     }
6703 #endif
6704 
6705     /* Now execute prio B signals from one thread. */
6706     Uint32 num_signals = execute_signals(selfptr, queue, head, read_state,
6707                                          sig, perjb+extra);
6708 
6709     if (num_signals > 0)
6710     {
6711       signal_count += num_signals;
6712       send_sum += num_signals;
6713       flush_sum += num_signals;
6714       handle_scheduling_decisions(selfptr,
6715                                   sig,
6716                                   send_sum,
6717                                   flush_sum,
6718                                   pending_send);
6719 
6720       if (signal_count - signal_count_since_last_zero_time_queue >
6721           (MAX_SIGNALS_EXECUTED_BEFORE_ZERO_TIME_QUEUE_SCAN -
6722            MAX_SIGNALS_PER_JB))
6723       {
6724         /**
6725          * Each execution of execute_signals can at most execute 75 signals
6726          * from one job buffer. We want to ensure that we execute no more than
6727          * 100 signals before we arrive here to get the signals from the
6728          * zero time queue. This implements the bounded delay signal
6729          * concept which is required for rate controlled activities.
6730          *
6731          * We scan the zero time queue if more than 25 signals were executed.
6732          * This means that at most 100 signals will be executed before we arrive
6733          * here again to check the bounded delay signals.
6734          */
6735         signal_count_since_last_zero_time_queue = signal_count;
6736         selfptr->m_watchdog_counter = 14;
6737         scan_zero_queue(selfptr);
6738         selfptr->m_watchdog_counter = 13;
6739       }
6740       if (selfptr->m_thr_no == 0)
6741       {
6742         /**
6743          * Execution in main thread can sometimes be a bit more lengthy,
6744          * so we ensure that we don't miss out on heartbeats and other
6745          * important things by returning to checking scan_time_queues
6746          * more often.
6747          */
6748         jbb_no++;
6749         if (jbb_no >= thr_count)
6750         {
6751           jbb_no = 0;
6752         }
6753         selfptr->m_next_jbb_no = jbb_no;
6754         return signal_count;
6755       }
6756     }
6757   }
6758   selfptr->m_next_jbb_no = 0;
6759   return signal_count;
6760 }
6761 
6762 struct thr_map_entry {
6763   enum { NULL_THR_NO = 0xFF };
6764   Uint8 thr_no;
6765   thr_map_entry() : thr_no(NULL_THR_NO) {}
6766 };
6767 
6768 static struct thr_map_entry thr_map[NO_OF_BLOCKS][NDBMT_MAX_BLOCK_INSTANCES];
6769 static Uint32 block_instance_count[NO_OF_BLOCKS];
6770 
6771 static inline Uint32
6772 block2ThreadId(Uint32 block, Uint32 instance)
6773 {
6774   assert(block >= MIN_BLOCK_NO && block <= MAX_BLOCK_NO);
6775   Uint32 index = block - MIN_BLOCK_NO;
6776   assert(instance < NDB_ARRAY_SIZE(thr_map[index]));
6777   const thr_map_entry& entry = thr_map[index][instance];
6778   assert(entry.thr_no < glob_num_threads);
6779   return entry.thr_no;
6780 }
6781 
6782 void
6783 add_thr_map(Uint32 main, Uint32 instance, Uint32 thr_no)
6784 {
6785   assert(main == blockToMain(main));
6786   Uint32 index = main - MIN_BLOCK_NO;
6787   assert(index < NO_OF_BLOCKS);
6788   assert(instance < NDB_ARRAY_SIZE(thr_map[index]));
6789 
6790   SimulatedBlock* b = globalData.getBlock(main, instance);
6791   require(b != 0);
6792 
6793   /* Block number including instance. */
6794   Uint32 block = numberToBlock(main, instance);
6795 
6796   require(thr_no < glob_num_threads);
6797   struct thr_repository* rep = g_thr_repository;
6798   struct thr_data* thr_ptr = &rep->m_thread[thr_no];
6799 
6800   /* Add to list. */
6801   {
6802     Uint32 i;
6803     for (i = 0; i < thr_ptr->m_instance_count; i++)
6804       require(thr_ptr->m_instance_list[i] != block);
6805   }
6806   require(thr_ptr->m_instance_count < MAX_INSTANCES_PER_THREAD);
6807   thr_ptr->m_instance_list[thr_ptr->m_instance_count++] = block;
6808 
6809   SimulatedBlock::ThreadContext ctx;
6810   ctx.threadId = thr_no;
6811   ctx.jamBuffer = &thr_ptr->m_jam;
6812   ctx.watchDogCounter = &thr_ptr->m_watchdog_counter;
6813   ctx.sectionPoolCache = &thr_ptr->m_sectionPoolCache;
6814   ctx.pHighResTimer = &thr_ptr->m_curr_ticks;
6815   b->assignToThread(ctx);
6816 
6817   /* Create entry mapping block to thread. */
6818   thr_map_entry& entry = thr_map[index][instance];
6819   require(entry.thr_no == thr_map_entry::NULL_THR_NO);
6820   entry.thr_no = thr_no;
6821 }
6822 
6823 /* Static assignment of main instances (before first signal). */
6824 void
6825 mt_init_thr_map()
6826 {
6827   /* Keep mt-classic assignments in MT LQH. */
6828   const Uint32 thr_GLOBAL = 0;
6829   const Uint32 thr_LOCAL = 1;
6830 
6831   add_thr_map(BACKUP, 0, thr_LOCAL);
6832   add_thr_map(DBTC, 0, thr_GLOBAL);
6833   add_thr_map(DBDIH, 0, thr_GLOBAL);
6834   add_thr_map(DBLQH, 0, thr_LOCAL);
6835   add_thr_map(DBACC, 0, thr_LOCAL);
6836   add_thr_map(DBTUP, 0, thr_LOCAL);
6837   add_thr_map(DBDICT, 0, thr_GLOBAL);
6838   add_thr_map(NDBCNTR, 0, thr_GLOBAL);
6839   add_thr_map(QMGR, 0, thr_GLOBAL);
6840   add_thr_map(NDBFS, 0, thr_GLOBAL);
6841   add_thr_map(CMVMI, 0, thr_GLOBAL);
6842   add_thr_map(TRIX, 0, thr_GLOBAL);
6843   add_thr_map(DBUTIL, 0, thr_GLOBAL);
6844   add_thr_map(SUMA, 0, thr_LOCAL);
6845   add_thr_map(DBTUX, 0, thr_LOCAL);
6846   add_thr_map(TSMAN, 0, thr_LOCAL);
6847   add_thr_map(LGMAN, 0, thr_LOCAL);
6848   add_thr_map(PGMAN, 0, thr_LOCAL);
6849   add_thr_map(RESTORE, 0, thr_LOCAL);
6850   add_thr_map(DBINFO, 0, thr_LOCAL);
6851   add_thr_map(DBSPJ, 0, thr_GLOBAL);
6852   add_thr_map(THRMAN, 0, thr_GLOBAL);
6853   add_thr_map(TRPMAN, 0, thr_GLOBAL);
6854 }
6855 
6856 Uint32
6857 mt_get_instance_count(Uint32 block)
6858 {
6859   switch(block){
6860   case DBLQH:
6861   case DBACC:
6862   case DBTUP:
6863   case DBTUX:
6864   case BACKUP:
6865   case RESTORE:
6866     return globalData.ndbMtLqhWorkers;
6867     break;
6868   case PGMAN:
6869     return globalData.ndbMtLqhWorkers + 1;
6870     break;
6871   case DBTC:
6872   case DBSPJ:
6873     return globalData.ndbMtTcThreads;
6874     break;
6875   case TRPMAN:
6876     return globalData.ndbMtReceiveThreads;
6877   case THRMAN:
6878     return glob_num_threads;
6879   default:
6880     require(false);
6881   }
6882   return 0;
6883 }
6884 
6885 void
6886 mt_add_thr_map(Uint32 block, Uint32 instance)
6887 {
6888   Uint32 num_lqh_threads = globalData.ndbMtLqhThreads;
6889   Uint32 num_tc_threads = globalData.ndbMtTcThreads;
6890 
6891   require(instance != 0);
6892   Uint32 thr_no = NUM_MAIN_THREADS;
6893   switch(block){
6894   case DBLQH:
6895   case DBACC:
6896   case DBTUP:
6897   case DBTUX:
6898   case BACKUP:
6899   case RESTORE:
6900     thr_no += (instance - 1) % num_lqh_threads;
6901     break;
6902   case PGMAN:
6903     if (instance == num_lqh_threads + 1)
6904     {
6905       // Put the extra PGMAN together with its Proxy
6906       thr_no = block2ThreadId(block, 0);
6907     }
6908     else
6909     {
6910       thr_no += (instance - 1) % num_lqh_threads;
6911     }
6912     break;
6913   case DBTC:
6914   case DBSPJ:
6915     thr_no += num_lqh_threads + (instance - 1);
6916     break;
6917   case THRMAN:
6918     thr_no = instance - 1;
6919     break;
6920   case TRPMAN:
6921     thr_no += num_lqh_threads + num_tc_threads + (instance - 1);
6922     break;
6923   default:
6924     require(false);
6925   }
6926 
6927   add_thr_map(block, instance, thr_no);
6928 }
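
/**
 * Worked example of the mapping above (illustrative configuration only):
 * assume ndbMtLqhThreads = 4 and ndbMtTcThreads = 2. Then:
 *
 *   DBLQH  instance 3 -> thr_no = NUM_MAIN_THREADS + (3 - 1) % 4     = NUM_MAIN_THREADS + 2
 *   DBTC   instance 1 -> thr_no = NUM_MAIN_THREADS + 4 + (1 - 1)     = NUM_MAIN_THREADS + 4
 *   TRPMAN instance 1 -> thr_no = NUM_MAIN_THREADS + 4 + 2 + (1 - 1) = NUM_MAIN_THREADS + 6
 *   THRMAN instance 1 -> thr_no = 1 - 1 = 0 (one THRMAN instance per thread)
 */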
6929 
6930 /**
6931  * create the duplicate entries needed so that
6932  *   the sender doesn't need to know how many instances there
6933  *   actually are in this node...
6934  *
6935  * if only 1 instance...then duplicate that for all slots
6936  * else assume instance 0 is proxy...and duplicate workers (modulo)
6937  *
6938  * NOTE: extra pgman worker is instance 5
6939  */
6940 void
6941 mt_finalize_thr_map()
6942 {
6943   for (Uint32 b = 0; b < NO_OF_BLOCKS; b++)
6944   {
6945     Uint32 bno = b + MIN_BLOCK_NO;
6946     Uint32 cnt = 0;
6947     while (cnt < NDB_ARRAY_SIZE(thr_map[b]) &&
6948            thr_map[b][cnt].thr_no != thr_map_entry::NULL_THR_NO)
6949     {
6950       cnt++;
6951     }
6952     block_instance_count[b] = cnt;
6953     if (cnt != NDB_ARRAY_SIZE(thr_map[b]))
6954     {
6955       SimulatedBlock * main = globalData.getBlock(bno, 0);
6956       for (Uint32 i = cnt; i < NDB_ARRAY_SIZE(thr_map[b]); i++)
6957       {
6958         Uint32 dup = (cnt == 1) ? 0 : 1 + ((i - 1) % (cnt - 1));
6959         if (thr_map[b][i].thr_no == thr_map_entry::NULL_THR_NO)
6960         {
6961           thr_map[b][i] = thr_map[b][dup];
6962           main->addInstance(globalData.getBlock(bno, dup), i);
6963         }
6964         else
6965         {
6966           /**
6967            * extra pgman instance
6968            */
6969           require(bno == PGMAN);
6970           require(false);
6971         }
6972       }
6973     }
6974   }
6975 }
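
/**
 * Worked example of the duplication above (illustrative): for a block with
 * a proxy (instance 0) plus 4 workers, cnt = 5 and the remaining slots are
 * filled with dup = 1 + ((i - 1) % (cnt - 1)):
 *
 *   i = 5 -> dup = 1,  i = 6 -> dup = 2,  i = 7 -> dup = 3,
 *   i = 8 -> dup = 4,  i = 9 -> dup = 1,  ...
 *
 * so a sender may address any instance slot and still reach a real worker.
 */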
6976 
6977 static
6978 void
6979 calculate_max_signals_parameters(thr_data *selfptr)
6980 {
6981   switch (selfptr->m_sched_responsiveness)
6982   {
6983     case 0:
6984       selfptr->m_max_signals_before_send = 1000;
6985       selfptr->m_max_signals_before_send_flush = 340;
6986       break;
6987     case 1:
6988       selfptr->m_max_signals_before_send = 800;
6989       selfptr->m_max_signals_before_send_flush = 270;
6990       break;
6991     case 2:
6992       selfptr->m_max_signals_before_send = 600;
6993       selfptr->m_max_signals_before_send_flush = 200;
6994       break;
6995     case 3:
6996       selfptr->m_max_signals_before_send = 450;
6997       selfptr->m_max_signals_before_send_flush = 155;
6998       break;
6999     case 4:
7000       selfptr->m_max_signals_before_send = 350;
7001       selfptr->m_max_signals_before_send_flush = 130;
7002       break;
7003     case 5:
7004       selfptr->m_max_signals_before_send = 300;
7005       selfptr->m_max_signals_before_send_flush = 110;
7006       break;
7007     case 6:
7008       selfptr->m_max_signals_before_send = 250;
7009       selfptr->m_max_signals_before_send_flush = 90;
7010       break;
7011     case 7:
7012       selfptr->m_max_signals_before_send = 200;
7013       selfptr->m_max_signals_before_send_flush = 70;
7014       break;
7015     case 8:
7016       selfptr->m_max_signals_before_send = 170;
7017       selfptr->m_max_signals_before_send_flush = 50;
7018       break;
7019     case 9:
7020       selfptr->m_max_signals_before_send = 135;
7021       selfptr->m_max_signals_before_send_flush = 30;
7022       break;
7023     case 10:
7024       selfptr->m_max_signals_before_send = 70;
7025       selfptr->m_max_signals_before_send_flush = 10;
7026       break;
7027     default:
7028       assert(FALSE);
7029   }
7030   return;
7031 }
7032 
7033 static void
7034 init_thread(thr_data *selfptr)
7035 {
7036   selfptr->m_waiter.init();
7037   selfptr->m_jam.theEmulatedJamIndex = 0;
7038 
7039   selfptr->m_overload_status = (OverloadStatus)LIGHT_LOAD_CONST;
7040   selfptr->m_node_overload_status = (OverloadStatus)LIGHT_LOAD_CONST;
7041   selfptr->m_wakeup_instance = 0;
7042   selfptr->m_last_wakeup_idle_thread = NdbTick_getCurrentTicks();
7043   selfptr->m_micros_send = 0;
7044   selfptr->m_micros_sleep = 0;
7045   selfptr->m_buffer_full_micros_sleep = 0;
7046   selfptr->m_measured_spintime = 0;
7047 
7048   NDB_THREAD_TLS_JAM = &selfptr->m_jam;
7049   NDB_THREAD_TLS_THREAD= selfptr;
7050 
7051   unsigned thr_no = selfptr->m_thr_no;
7052   globalEmulatorData.theWatchDog->
7053     registerWatchedThread(&selfptr->m_watchdog_counter, thr_no);
7054   {
7055     while(selfptr->m_thread == 0)
7056       NdbSleep_MilliSleep(30);
7057   }
7058 
7059   THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
7060   BaseString tmp;
7061   tmp.appfmt("thr: %u ", thr_no);
7062 
7063   bool fail = false;
7064   int tid = NdbThread_GetTid(selfptr->m_thread);
7065   if (tid != -1)
7066   {
7067     tmp.appfmt("tid: %u ", tid);
7068   }
7069 
7070   conf.appendInfo(tmp,
7071                   selfptr->m_instance_list,
7072                   selfptr->m_instance_count);
7073   int res = conf.do_bind(selfptr->m_thread,
7074                          selfptr->m_instance_list,
7075                          selfptr->m_instance_count);
7076   if (res < 0)
7077   {
7078     fail = true;
7079     tmp.appfmt("err: %d ", -res);
7080   }
7081   else if (res > 0)
7082   {
7083     tmp.appfmt("OK ");
7084   }
7085 
7086   unsigned thread_prio;
7087   res = conf.do_thread_prio(selfptr->m_thread,
7088                             selfptr->m_instance_list,
7089                             selfptr->m_instance_count,
7090                             thread_prio);
7091   if (res < 0)
7092   {
7093     fail = true;
7094     res = -res;
7095     tmp.appfmt("Failed to set thread prio to %u, ", thread_prio);
7096     if (res == SET_THREAD_PRIO_NOT_SUPPORTED_ERROR)
7097     {
7098       tmp.appfmt("not supported on this OS");
7099     }
7100     else
7101     {
7102       tmp.appfmt("error: %d", res);
7103     }
7104   }
7105   else if (res > 0)
7106   {
7107     tmp.appfmt("Successfully set thread prio to %u ", thread_prio);
7108   }
7109 
7110   selfptr->m_realtime = conf.do_get_realtime(selfptr->m_instance_list,
7111                                              selfptr->m_instance_count);
7112   selfptr->m_conf_spintime = conf.do_get_spintime(selfptr->m_instance_list,
7113                                                   selfptr->m_instance_count);
7114 
7115   /* spintime always 0 on platforms not supporting spin */
7116   if (!NdbSpin_is_supported())
7117   {
7118     selfptr->m_conf_spintime = 0;
7119   }
7120   selfptr->m_spintime = 0;
7121   memset(&selfptr->m_spin_stat, 0, sizeof(selfptr->m_spin_stat));
7122   selfptr->m_spin_stat.m_spin_interval[NUM_SPIN_INTERVALS - 1] = 0xFFFFFFFF;
7123 
7124   selfptr->m_sched_responsiveness =
7125     globalEmulatorData.theConfiguration->schedulerResponsiveness();
7126   calculate_max_signals_parameters(selfptr);
7127 
7128   selfptr->m_thr_id = my_thread_self();
7129 
7130   for (Uint32 i = 0; i < selfptr->m_instance_count; i++)
7131   {
7132     BlockReference block = selfptr->m_instance_list[i];
7133     Uint32 main = blockToMain(block);
7134     Uint32 instance = blockToInstance(block);
7135     tmp.appfmt("%s(%u) ", getBlockName(main), instance);
7136   }
7137   /* Report parameters used by thread to node log */
7138   tmp.appfmt("realtime=%u, spintime=%u, max_signals_before_send=%u"
7139              ", max_signals_before_send_flush=%u",
7140              selfptr->m_realtime,
7141              selfptr->m_conf_spintime,
7142              selfptr->m_max_signals_before_send,
7143              selfptr->m_max_signals_before_send_flush);
7144 
7145   printf("%s\n", tmp.c_str());
7146   fflush(stdout);
7147   if (fail)
7148   {
7149 #ifndef HAVE_MAC_OS_X_THREAD_INFO
7150     abort();
7151 #endif
7152   }
7153 }
7154 
7155 /**
7156  * Align signal buffer for better cache performance.
7157  * Also skew it a little for each thread to avoid cache pollution.
7158  */
7159 #define SIGBUF_SIZE (sizeof(Signal) + 63 + 256 * MAX_BLOCK_THREADS)
7160 static Signal *
7161 aligned_signal(unsigned char signal_buf[SIGBUF_SIZE], unsigned thr_no)
7162 {
7163   UintPtr sigtmp= (UintPtr)signal_buf;
7164   sigtmp= (sigtmp+63) & (~(UintPtr)63);
7165   sigtmp+= thr_no*256;
7166   return (Signal *)sigtmp;
7167 }
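
/**
 * Worked example of the alignment above (illustrative): the buffer start is
 * rounded up to the next 64-byte boundary, then offset by thr_no * 256
 * bytes. For thr_no = 3 the Signal object therefore starts 768 bytes
 * (12 cache lines) past the aligned base, so the Signal objects of
 * different threads do not compete for the same cache sets.
 */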
7168 
7169 /*
7170  * We only do receive in receiver thread(s), no other threads do receive.
7171  *
7172  * As part of the receive loop, we also periodically call update_connections()
7173  * (this way we are similar to single-threaded ndbd).
7174  *
7175  * The TRPMAN block (and no other blocks) run in the same thread as this
7176  * receive loop; this way we avoid races between update_connections() and
7177  * TRPMAN calls into the transporters.
7178  */
7179 
7180 /**
7181  * Array of pointers to TransporterReceiveHandleKernel
7182  *   these are not used "in traffic"
7183  */
7184 static TransporterReceiveHandleKernel *
7185   g_trp_receive_handle_ptr[MAX_NDBMT_RECEIVE_THREADS];
7186 
7187 /**
7188  * Array for mapping trps to receiver threads and function to access it.
7189  */
7190 static Uint32 g_trp_to_recv_thr_map[MAX_NTRANSPORTERS];
7191 
7192 /**
7193  * We use this method both to initialise the realtime variable
7194  * and also for updating it. Currently there is no method to
7195  * update it, but it's likely that we will soon invent one and
7196  * thus the code is prepared for this case.
7197  */
7198 static void
7199 update_rt_config(struct thr_data *selfptr,
7200                  bool & real_time,
7201                  enum ThreadTypes type)
7202 {
7203   bool old_real_time = real_time;
7204   real_time = selfptr->m_realtime;
7205   if (old_real_time == true && real_time == false)
7206   {
7207     yield_rt_break(selfptr->m_thread,
7208                    type,
7209                    false);
7210   }
7211 }
7212 
7213 /**
7214  * We use this method both to initialise the spintime variable
7215  * and also for updating it. Currently there is no method to
7216  * update it, but it's likely that we will soon invent one and
7217  * thus the code is prepared for this case.
7218  */
7219 static void
7220 update_spin_config(struct thr_data *selfptr,
7221                    Uint64 & min_spin_timer)
7222 {
7223   min_spin_timer = selfptr->m_spintime;
7224 }
7225 
7226 extern "C"
7227 void *
7228 mt_receiver_thread_main(void *thr_arg)
7229 {
7230   unsigned char signal_buf[SIGBUF_SIZE];
7231   Signal *signal;
7232   struct thr_repository* rep = g_thr_repository;
7233   struct thr_data* selfptr = (struct thr_data *)thr_arg;
7234   unsigned thr_no = selfptr->m_thr_no;
7235   Uint32& watchDogCounter = selfptr->m_watchdog_counter;
7236   const Uint32 recv_thread_idx = thr_no - first_receiver_thread_no;
7237   bool has_received = false;
7238   int cnt = 0;
7239   bool real_time = false;
7240   Uint64 min_spin_timer;
7241   NDB_TICKS yield_ticks;
7242   NDB_TICKS before;
7243 
7244   init_thread(selfptr);
7245   signal = aligned_signal(signal_buf, thr_no);
7246   update_rt_config(selfptr, real_time, ReceiveThread);
7247   update_spin_config(selfptr, min_spin_timer);
7248 
7249   /**
7250    * Object that keeps track of our pollReceive-state
7251    */
7252   TransporterReceiveHandleKernel recvdata(thr_no, recv_thread_idx);
7253   recvdata.assign_trps(g_trp_to_recv_thr_map);
7254   globalTransporterRegistry.init(recvdata);
7255 
7256   /**
7257    * Save pointer to this for management/error-insert
7258    */
7259   g_trp_receive_handle_ptr[recv_thread_idx] = &recvdata;
7260 
7261   NDB_TICKS now = NdbTick_getCurrentTicks();
7262   before = now;
7263   selfptr->m_curr_ticks = now;
7264   selfptr->m_signal = signal;
7265   selfptr->m_ticks = selfptr->m_scan_real_ticks = yield_ticks = now;
7266   Ndb_GetRUsage(&selfptr->m_scan_time_queue_rusage, false);
7267 
7268   while (globalData.theRestartFlag != perform_stop)
7269   {
7270     if (cnt == 0)
7271     {
7272       watchDogCounter = 5;
7273       update_spin_config(selfptr, min_spin_timer);
7274       Uint32 max_spintime = 0;
7275       /**
7276        * The settings for spinning on the transporter are only aimed at
7277        * the NDB API part. We have an elaborate scheme for handling
7278        * spinning in ndbmtd, so we shut down any spinning inside
7279        * the transporter here. The principle is to only spin in one
7280        * location and spinning in recv thread overrides any spinning
7281        * desired on transporter level.
7282        */
7283       max_spintime = 0;
7284       globalTransporterRegistry.update_connections(recvdata,
7285                                                    max_spintime);
7286     }
7287     cnt = (cnt + 1) & 15;
7288 
7289     watchDogCounter = 2;
7290 
7291     now = NdbTick_getCurrentTicks();
7292     selfptr->m_curr_ticks = now;
7293     const Uint32 lagging_timers = scan_time_queues(selfptr, now);
7294     Uint32 dummy1 = 0;
7295     Uint32 dummy2 = 0;
7296     bool dummy3 = false;
7297 
7298     Uint32 sum = run_job_buffers(selfptr, signal, dummy1, dummy2, dummy3);
7299 
7300     if (sum || has_received)
7301     {
7302       sendpacked(selfptr, signal);
7303       watchDogCounter = 6;
7304       flush_jbb_write_state(selfptr);
7305     }
7306 
7307     const bool pending_send = do_send(selfptr, TRUE, FALSE);
7308 
7309     watchDogCounter = 7;
7310 
7311     if (real_time)
7312     {
7313       check_real_time_break(now,
7314                             &yield_ticks,
7315                             selfptr->m_thread,
7316                             ReceiveThread);
7317     }
7318 
7319     /**
7320      * Only allow to sleep in pollReceive when:
7321      * 1) We are not lagging behind in handling timer events.
7322      * 2) No more pending sends, or no send progress.
7323      * 3) There is no 'min_spin' configured, or min_spin has elapsed.
7324      * We will not check spin timer until we have checked the
7325      * transporters at least one loop and discovered no data. We also
7326      * ensure that we have not executed any signals before we start
7327      * the actual spin timer.
7328      */
7329     Uint32 delay = 0;
7330     Uint32 num_events = 0;
7331     Uint32 spin_micros = 0;
7332     update_spin_config(selfptr, min_spin_timer);
7333     before = NdbTick_getCurrentTicks();
7334 
7335     if (lagging_timers == 0 &&       // 1)
7336         pending_send  == false &&    // 2)
7337         (min_spin_timer == 0 ||      // 3)
7338          (sum == 0 &&
7339           !has_received &&
7340           check_recv_yield(selfptr,
7341                            recvdata,
7342                            min_spin_timer,
7343                            num_events,
7344                            &spin_micros,
7345                            before))))
7346     {
7347       delay = 10; // 10 ms
7348     }
7349 
7350     has_received = false;
7351     if (num_events == 0)
7352     {
7353       /* Need to call pollReceive if not already done in check_recv_yield */
7354       num_events = globalTransporterRegistry.pollReceive(delay, recvdata);
7355     }
7356     if (delay > 0)
7357     {
7358       NDB_TICKS after = NdbTick_getCurrentTicks();
7359       Uint64 micros_sleep = NdbTick_Elapsed(before, after).microSec();
7360       selfptr->m_micros_sleep += micros_sleep;
7361       wait_time_tracking(selfptr, micros_sleep);
7362     }
7363     if (num_events)
7364     {
7365       watchDogCounter = 8;
7366       lock(&rep->m_receive_lock[recv_thread_idx]);
7367       const bool buffersFull =
7368         (globalTransporterRegistry.performReceive(recvdata,
7369                                                   recv_thread_idx) != 0);
7370       unlock(&rep->m_receive_lock[recv_thread_idx]);
7371       has_received = true;
7372 
7373       if (buffersFull)       /* Receive queues(s) are full */
7374       {
7375         thr_data* waitthr = get_congested_recv_queue(rep, recv_thread_idx);
7376         if (waitthr != NULL) /* Will wait for buffers to be freed */
7377         {
7378           /**
7379            * Wait for thread 'waitthr' to consume some of the
7380            * pending signals in m_in_queue previously received
7381            * from this receive thread, 'thr_no'.
7382            * Will recheck queue status with 'check_recv_queue' after latch
7383            * has been set, and *before* going to sleep.
7384            */
7385           const Uint32 nano_wait = 1000*1000;    /* -> 1 ms */
7386           thr_job_queue_head *wait_queue = waitthr->m_in_queue_head + thr_no;
7387           NDB_TICKS before = NdbTick_getCurrentTicks();
7388           const bool waited = yield(&wait_queue->m_waiter,
7389                                     nano_wait,
7390                                     check_recv_queue,
7391                                     wait_queue);
7392           if (waited)
7393           {
7394             NDB_TICKS after = NdbTick_getCurrentTicks();
7395             selfptr->m_buffer_full_micros_sleep +=
7396               NdbTick_Elapsed(before, after).microSec();
7397           }
7398         }
7399       }
7400     }
7401     selfptr->m_stat.m_loop_cnt++;
7402   }
7403 
7404   globalEmulatorData.theWatchDog->unregisterWatchedThread(thr_no);
7405   return NULL;                  // Return value not currently used
7406 }
7407 
7408 /**
7409  * Callback function used by yield() to recheck
7410  * 'job queue full' condition before going to sleep.
7411  *
7412  * Check if the specified 'thr_job_queue_head' (arg)
7413  * is still full, return true if so.
7414  */
7415 static bool
7416 check_congested_job_queue(thr_job_queue_head *waitfor)
7417 {
7418   return (compute_free_buffers_in_queue(waitfor) <= thr_job_queue::RESERVED);
7419 }
7420 
7421 /**
7422  * Check if any out-queues of selfptr are full.
7423  * If full: Return 'Thr_data*' for (one of) the thread(s)
7424  *          which we have to wait for. (to consume from queue)
7425  */
7426 static struct thr_data*
7427 get_congested_job_queue(const thr_data *selfptr)
7428 {
7429   const Uint32 thr_no = selfptr->m_thr_no;
7430   struct thr_repository* rep = g_thr_repository;
7431   struct thr_data *thrptr = rep->m_thread;
7432   struct thr_data *waitfor = NULL;
7433 
7434   for (unsigned i = 0; i<glob_num_threads; i++, thrptr++)
7435   {
7436     thr_job_queue_head *q_head = thrptr->m_in_queue_head + thr_no;
7437 
7438     if (compute_free_buffers_in_queue(q_head) <= thr_job_queue::RESERVED)
7439     {
7440       if (thrptr != selfptr)  // Don't wait on myself (yet)
7441         return thrptr;
7442       else
7443         waitfor = thrptr;
7444     }
7445   }
7446   return waitfor;             // Possibly 'thrptr == selfptr'
7447 }
7448 
7449 /**
7450  * has_full_in_queues()
7451  *
7452  * Avoid circular waits between block-threads:
7453  * A thread is not allowed to sleep due to full
7454  * 'out' job-buffers if there are other threads
7455  * already having full 'in' job buffers sent to
7456  * this thread.
7457  *
7458  * run_job_buffers() has reserved a 'm_max_extra_signals'
7459  * quota which will be used to drain these 'full_in_queues',
7460  * so we should allow it to do so.
7461  *
7462  * Returns 'true' if any in-queues to this thread are full
7463  */
7464 static
7465 bool
7466 has_full_in_queues(struct thr_data* selfptr)
7467 {
7468   thr_job_queue_head *head = selfptr->m_in_queue_head;
7469 
7470   for (Uint32 thr_no = 0; thr_no < glob_num_threads; thr_no++, head++)
7471   {
7472     if (compute_free_buffers_in_queue(head) <= thr_job_queue::RESERVED)
7473     {
7474       return true;
7475     }
7476   }
7477   return false;
7478 }
7479 
7480 /**
7481  * update_sched_config
7482  *
7483  *   In order to prevent "job-buffer-full", i.e.
7484  *     that one thread (T1) produces so many signals to another thread (T2)
7485  *     that the ring-buffer from T1 to T2 gets full,
7486  *     the main loop has 2 "config" variables
7487  *   - m_max_exec_signals
7488  *     This is the *total* no of signals T1 can execute before calling
7489  *     this method again
7490  *   - m_max_signals_per_jb
7491  *     This is the max no of signals T1 can execute from each other thread
7492  *     in system
7493  *
7494  *   Assumption: each signal may send *at most* 4 signals
7495  *     - this assumption is made the same in ndbd and ndbmtd and is
7496  *       mostly followed by block-code, although not in all places :-(
7497  *
7498  *   This function returns true if it slept
7499  *     (i.e. it concluded that it could not execute *any* signals without
7500  *      risking job-buffer-full)
7501  */
7502 static
7503 bool
7504 update_sched_config(struct thr_data* selfptr,
7505                     bool pending_send,
7506                     Uint32 & send_sum,
7507                     Uint32 & flush_sum)
7508 {
7509   Uint32 sleeploop = 0;
7510   Uint32 thr_no = selfptr->m_thr_no;
7511   selfptr->m_watchdog_counter = 16;
7512 loop:
7513   Uint32 minfree = compute_min_free_out_buffers(thr_no);
7514   Uint32 reserved = (minfree > thr_job_queue::RESERVED)
7515                    ? thr_job_queue::RESERVED
7516                    : minfree;
7517 
7518   Uint32 avail = compute_max_signals_to_execute(minfree - reserved);
7519   Uint32 perjb = (avail + g_thr_repository->m_thread_count - 1) /
7520                   g_thr_repository->m_thread_count;
7521 
7522   if (selfptr->m_thr_no == 0)
7523   {
7524     /**
7525      * The main thread has some signals that execute for a bit longer than
7526      * other threads. We thus only allow the main thread to execute at most
7527      * 5 signals per round of signal execution. We handle this here and
7528      * also only handle signals from one queue at a time with the main
7529      * thread.
7530      *
7531      * LCP_FRAG_REP is one such signal that can execute now for about
7532      * 1 millisecond, so 5 signals can become 5 milliseconds which should
7533      * be fairly safe to ensure we always come back for the 10ms TIME_SIGNAL
7534      * that is handled by the main thread.
7535      */
7536     perjb = MAX(perjb, 5);
7537   }
7538   if (perjb > MAX_SIGNALS_PER_JB)
7539     perjb = MAX_SIGNALS_PER_JB;
7540 
7541   selfptr->m_max_exec_signals = avail;
7542   selfptr->m_max_signals_per_jb = perjb;
7543   selfptr->m_max_extra_signals = compute_max_signals_to_execute(reserved);
7544 
7545   if (unlikely(perjb == 0))
7546   {
7547     if (sleeploop == 10)
7548     {
7549       /**
7550        * we've slept for 10ms...try running anyway
7551        */
7552       selfptr->m_max_signals_per_jb = 1;
7553       ndbout_c("thr_no:%u - sleeploop 10!! "
7554                "(Worker thread blocked (>= 10ms) by slow consumer threads)",
7555                selfptr->m_thr_no);
7556       return true;
7557     }
7558 
7559     struct thr_data* waitthr = get_congested_job_queue(selfptr);
7560     if (waitthr == NULL)                 // Waiters resolved
7561     {
7562       goto loop;
7563     }
7564     else if (has_full_in_queues(selfptr) &&
7565              selfptr->m_max_extra_signals > 0)
7566     {
7567       /* 'extra_signals' used to drain 'full_in_queues'. */
7568       return sleeploop > 0;
7569     }
7570 
7571     if (pending_send)
7572     {
7573       /* About to sleep, _must_ send now. */
7574       pending_send = do_send(selfptr, TRUE, TRUE);
7575       send_sum = 0;
7576       flush_sum = 0;
7577     }
7578 
7579     /**
7580      * Wait for thread 'waitthr' to consume some of the
7581      * pending signals in m_in_queue[].
7582      * Will recheck queue status with 'check_recv_queue'
7583      * after latch has been set, and *before* going to sleep.
7584      */
7585     const Uint32 nano_wait = 1000*1000;    /* -> 1 ms */
7586     thr_job_queue_head *wait_queue = waitthr->m_in_queue_head + thr_no;
7587 
7588     NDB_TICKS before = NdbTick_getCurrentTicks();
7589     const bool waited = yield(&wait_queue->m_waiter,
7590                               nano_wait,
7591                               check_congested_job_queue,
7592                               wait_queue);
7593     if (waited)
7594     {
7595       NDB_TICKS after = NdbTick_getCurrentTicks();
7596       selfptr->m_buffer_full_micros_sleep +=
7597         NdbTick_Elapsed(before, after).microSec();
7598       sleeploop++;
7599     }
7600     goto loop;
7601   }
7602 
7603   return sleeploop > 0;
7604 }
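
/**
 * Worked example of the quota calculation above (illustrative numbers):
 * with 25 block threads and compute_max_signals_to_execute() returning
 * avail = 300, we get perjb = (300 + 25 - 1) / 25 = 12. Thus at most 12
 * signals may be executed from each other thread's job buffer (capped at
 * MAX_SIGNALS_PER_JB), and at most 300 in total (m_max_exec_signals),
 * before this function must be called again.
 */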
7605 
7606 extern "C"
7607 void *
7608 mt_job_thread_main(void *thr_arg)
7609 {
7610   unsigned char signal_buf[SIGBUF_SIZE];
7611   Signal *signal;
7612 
7613   struct thr_data* selfptr = (struct thr_data *)thr_arg;
7614   init_thread(selfptr);
7615   Uint32& watchDogCounter = selfptr->m_watchdog_counter;
7616 
7617   unsigned thr_no = selfptr->m_thr_no;
7618   signal = aligned_signal(signal_buf, thr_no);
7619 
7620   /* Avoid false watchdog alarms caused by race condition. */
7621   watchDogCounter = 21;
7622 
7623   bool pending_send = false;
7624   Uint32 send_sum = 0;
7625   Uint32 flush_sum = 0;
7626   Uint32 loops = 0;
7627   Uint32 maxloops = 10;/* Loops before reading clock, fuzzy adapted to 1ms freq. */
7628   Uint32 waits = 0;
7629 
7630   NDB_TICKS yield_ticks;
7631 
7632   Uint64 min_spin_timer;
7633   bool real_time = false;
7634 
7635   update_rt_config(selfptr, real_time, BlockThread);
7636   update_spin_config(selfptr, min_spin_timer);
7637 
7638   NDB_TICKS now = NdbTick_getCurrentTicks();
7639   selfptr->m_ticks = yield_ticks = now;
7640   selfptr->m_scan_real_ticks = now;
7641   selfptr->m_signal = signal;
7642   selfptr->m_curr_ticks = now;
7643   Ndb_GetRUsage(&selfptr->m_scan_time_queue_rusage, false);
7644 
7645   while (globalData.theRestartFlag != perform_stop)
7646   {
7647     loops++;
7648 
7649     /**
7650      * Prefill our thread-local send buffers
7651      *   up to THR_SEND_BUFFER_PRE_ALLOC (1MB),
7652      *
7653      * and if this doesn't work, pack send buffers before starting to execute signals.
7654      */
7655     watchDogCounter = 11;
7656     if (!selfptr->m_send_buffer_pool.fill(g_thr_repository->m_mm,
7657                                           RG_TRANSPORTER_BUFFERS,
7658                                           THR_SEND_BUFFER_PRE_ALLOC,
7659                                           selfptr->m_send_instance_no))
7660     {
7661       try_pack_send_buffers(selfptr);
7662     }
7663 
7664     watchDogCounter = 2;
7665     const Uint32 lagging_timers = scan_time_queues(selfptr, now);
7666 
7667     Uint32 sum = run_job_buffers(selfptr,
7668                                  signal,
7669                                  send_sum,
7670                                  flush_sum,
7671                                  pending_send);
7672 
7673 
7674     if (sum)
7675     {
7676       /**
7677        * It is imperative that we flush signals within our node after
7678      * each round of execution. This makes sure that the receiver
7679      * threads are woken up to do their work, which often means that
7680      * they will send some signals back to us (e.g. the commit
7681      * protocol for updates). Quite often we continue executing one
7682      * more loop, and while doing so the other threads can return
7683      * new signals to us; thus we avoid going back and forth to
7684      * sleep too often, which otherwise would happen.
7685        *
7686        * Many of the optimisations of having TC and LDM colocated
7687        * for transactions would go away unless we use this principle.
7688        *
7689        * No need to flush however if no signals have been executed since
7690        * last flush.
7691        *
7692        * No need to check for send packed signals if we didn't send
7693        * any signals, packed signals are sent as a result of an
7694        * executed signal.
7695        */
7696       sendpacked(selfptr, signal);
7697       watchDogCounter = 6;
7698       if (flush_sum > 0)
7699       {
7700         flush_jbb_write_state(selfptr);
7701         do_flush(selfptr);
7702         flush_sum = 0;
7703       }
7704     }
7705     /**
7706      * Scheduler is not allowed to yield until its internal
7707      * time has caught up on real time.
7708      */
7709     else if (lagging_timers == 0)
7710     {
7711       /* No signals processed, prepare to sleep to wait for more */
7712       if (send_sum > 0 || pending_send == true)
7713       {
7714         /* About to sleep, _must_ send now. */
7715         flush_jbb_write_state(selfptr);
7716         pending_send = do_send(selfptr, TRUE, TRUE);
7717         send_sum = 0;
7718         flush_sum = 0;
7719       }
7720 
7721       /**
7722        * No more incoming signals to process yet, and we have
7723        * either completed all pending sends, or had no progress
7724        * due to full transporters in last do_send(). Wait for
7725        * more signals, use a shorter timeout if pending_send.
7726        */
7727       if (pending_send == false) /* Nothing pending, or no progress made */
7728       {
7729         /**
7730          * When min_spin_timer > 0 it means we are spinning. If we executed
7731          * jobs this time there is no reason to check the spin timer, and since
7732          * we executed at least one signal we are by definition not yet
7733          * spinning. Thus we can immediately move to the next loop.
7734          * Spinning is performed for a while when sum == 0 AND
7735          * min_spin_timer > 0. In this case we need to go into check_yield
7736          * and initialise spin timer (on first round) and check spin timer
7737          * on subsequent loops.
7738          */
7739         Uint32 spin_time_in_us = 0;
7740         update_spin_config(selfptr, min_spin_timer);
7741         NDB_TICKS before = NdbTick_getCurrentTicks();
7742         bool has_spun = (min_spin_timer != 0);
7743         if (min_spin_timer == 0 ||
7744             check_yield(selfptr,
7745                         min_spin_timer,
7746                         &spin_time_in_us,
7747                         before))
7748         {
7749           /**
7750            * Sleep, either a short nap if send failed due to send overload,
7751            * or a longer sleep if there are no more work waiting.
7752            */
7753           Uint32 maxwait_in_us =
7754             (selfptr->m_node_overload_status >=
7755              (OverloadStatus)MEDIUM_LOAD_CONST) ?
7756             1 * 1000 :
7757             10 * 1000;
7758           if (maxwait_in_us < spin_time_in_us)
7759           {
7760             maxwait_in_us = 0;
7761           }
7762           else
7763           {
7764             maxwait_in_us -= spin_time_in_us;
7765           }
7766           selfptr->m_watchdog_counter = 18;
7767           const Uint32 used_maxwait_in_ns = maxwait_in_us * 1000;
7768           bool waited = yield(&selfptr->m_waiter,
7769                               used_maxwait_in_ns,
7770                               check_queues_empty,
7771                               selfptr);
7772           if (waited)
7773           {
7774             waits++;
7775             /* Update current time after sleeping */
7776             now = NdbTick_getCurrentTicks();
7777             selfptr->m_curr_ticks = now;
7778             yield_ticks = now;
7779             Uint64 micros_sleep = NdbTick_Elapsed(before, now).microSec();
7780             selfptr->m_micros_sleep += micros_sleep;
7781             wait_time_tracking(selfptr, micros_sleep);
7782             selfptr->m_stat.m_wait_cnt += waits;
7783             selfptr->m_stat.m_loop_cnt += loops;
7784             if (selfptr->m_overload_status <=
7785                 (OverloadStatus)MEDIUM_LOAD_CONST)
7786             {
7787               /**
7788                * To ensure that we at least check for trps to send to
7789                * before we yield we set pending_send to true. We will
7790                * quickly discover if nothing is pending.
7791                */
7792               pending_send = true;
7793             }
7794             waits = loops = 0;
7795             if (selfptr->m_thr_no == 0)
7796             {
7797               /**
7798                * NDBFS is using thread 0, here we need to call SEND_PACKED
7799                * to scan the memory channel for messages from NDBFS threads.
7800                * We want to do this here to avoid an extra loop in scheduler
7801                * before we discover those messages from NDBFS.
7802                */
7803               selfptr->m_watchdog_counter = 17;
7804               check_for_input_from_ndbfs(selfptr, signal);
7805             }
7806           }
7807           else if (has_spun)
7808           {
7809             selfptr->m_micros_sleep += spin_time_in_us;
7810             wait_time_tracking(selfptr, spin_time_in_us);
7811           }
7812         }
7813       }
7814     }
7815 
7816     /**
7817      * Check if we executed enough signals,
7818      *   and if so recompute how many signals to execute
7819      */
7820     now = NdbTick_getCurrentTicks();
7821     if (sum >= selfptr->m_max_exec_signals)
7822     {
7823       if (update_sched_config(selfptr,
7824                               send_sum + Uint32(pending_send),
7825                               send_sum,
7826                               flush_sum))
7827       {
7828         /* Update current time after sleeping */
7829         selfptr->m_curr_ticks = now;
7830         selfptr->m_stat.m_wait_cnt += waits;
7831         selfptr->m_stat.m_loop_cnt += loops;
7832         waits = loops = 0;
7833         update_rt_config(selfptr, real_time, BlockThread);
7834         calculate_max_signals_parameters(selfptr);
7835       }
7836     }
7837     else
7838     {
7839       selfptr->m_max_exec_signals -= sum;
7840     }
7841 
7842     /**
7843      * Adaptive reading freq. of system time every time 1ms
7844      * is likely to have passed
7845      */
7846     now = NdbTick_getCurrentTicks();
7847     selfptr->m_curr_ticks = now;
7848     if (loops > maxloops)
7849     {
7850       if (real_time)
7851       {
7852         check_real_time_break(now,
7853                               &yield_ticks,
7854                               selfptr->m_thread,
7855                               BlockThread);
7856       }
7857       const Uint64 diff = NdbTick_Elapsed(selfptr->m_ticks, now).milliSec();
7858 
7859       /* Adjust 'maxloops' to achieve a clock reading frequency of 1ms */
7860       if (diff < 1)
7861         maxloops += ((maxloops/10) + 1); /* <1ms passed: read clock less frequently */
7862       else if (diff > 1 && maxloops > 1)
7863         maxloops -= ((maxloops/10) + 1); /* Overslept: need more frequent reads */
7864 
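      /**
       * Worked example of the adjustment above (pure arithmetic, for
       * illustration only): with maxloops = 10, an elapsed diff of 0 ms grows
       * maxloops to 10 + (10/10 + 1) = 12, while a diff above 1 ms shrinks
       * it to 10 - (10/10 + 1) = 8.
       */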
7865       selfptr->m_stat.m_wait_cnt += waits;
7866       selfptr->m_stat.m_loop_cnt += loops;
7867       waits = loops = 0;
7868     }
7869   }
7870 
7871   globalEmulatorData.theWatchDog->unregisterWatchedThread(thr_no);
7872   return NULL;                  // Return value not currently used
7873 }
7874 
7875 /**
7876  * Get number of pending signals at B-level in our own thread. Used
7877  * to make some decisions in rate-critical parts of the data node.
7878  */
7879 Uint32
7880 mt_getSignalsInJBB(Uint32 self)
7881 {
7882   Uint32 pending_signals = 0;
7883   struct thr_repository* rep = g_thr_repository;
7884   struct thr_data *selfptr = &rep->m_thread[self];
7885   for (Uint32 thr_no = 0; thr_no < glob_num_threads; thr_no++)
7886   {
7887     thr_jb_write_state *w = selfptr->m_write_states + thr_no;
7888     pending_signals += w->get_pending_signals();
7889   }
7890   return pending_signals;
7891 }
7892 
7893 NDB_TICKS
7894 mt_getHighResTimer(Uint32 self)
7895 {
7896   struct thr_repository* rep = g_thr_repository;
7897   struct thr_data *selfptr = &rep->m_thread[self];
7898   return selfptr->m_curr_ticks;
7899 }
7900 
7901 void
7902 mt_setNoSend(Uint32 self)
7903 {
7904   struct thr_repository* rep = g_thr_repository;
7905   struct thr_data *selfptr = &rep->m_thread[self];
7906   selfptr->m_nosend = 1;
7907 }
7908 
7909 void
7910 mt_startChangeNeighbourNode()
7911 {
7912   if (g_send_threads)
7913   {
7914     g_send_threads->startChangeNeighbourNode();
7915   }
7916 }
7917 
7918 void
7919 mt_setNeighbourNode(NodeId node)
7920 {
7921   if (g_send_threads)
7922   {
7923     g_send_threads->setNeighbourNode(node);
7924   }
7925 }
7926 
7927 void
7928 mt_endChangeNeighbourNode()
7929 {
7930   if (g_send_threads)
7931   {
7932     g_send_threads->endChangeNeighbourNode();
7933   }
7934 }
7935 
7936 void
7937 mt_setOverloadStatus(Uint32 self,
7938                      OverloadStatus new_status)
7939 {
7940   struct thr_repository* rep = g_thr_repository;
7941   struct thr_data *selfptr = &rep->m_thread[self];
7942   selfptr->m_overload_status = new_status;
7943 }
7944 
7945 void
7946 mt_setWakeupThread(Uint32 self,
7947                    Uint32 wakeup_instance)
7948 {
7949   struct thr_repository* rep = g_thr_repository;
7950   struct thr_data *selfptr = &rep->m_thread[self];
7951   selfptr->m_wakeup_instance = wakeup_instance;
7952 }
7953 
7954 void
7955 mt_setNodeOverloadStatus(Uint32 self,
7956                          OverloadStatus new_status)
7957 {
7958   struct thr_repository* rep = g_thr_repository;
7959   struct thr_data *selfptr = &rep->m_thread[self];
7960   selfptr->m_node_overload_status = new_status;
7961 }
7962 
7963 void
7964 mt_setSendNodeOverloadStatus(OverloadStatus new_status)
7965 {
7966   if (g_send_threads)
7967   {
7968     g_send_threads->setNodeOverloadStatus(new_status);
7969   }
7970 }
7971 
7972 void
7973 mt_setSpintime(Uint32 self, Uint32 new_spintime)
7974 {
7975   struct thr_repository* rep = g_thr_repository;
7976   struct thr_data *selfptr = &rep->m_thread[self];
7977   /* spintime always 0 on platforms not supporting spin */
7978   if (!NdbSpin_is_supported())
7979   {
7980     new_spintime = 0;
7981   }
7982   selfptr->m_spintime = new_spintime;
7983 }
7984 
7985 Uint32
7986 mt_getConfiguredSpintime(Uint32 self)
7987 {
7988   struct thr_repository* rep = g_thr_repository;
7989   struct thr_data *selfptr = &rep->m_thread[self];
7990 
7991   return selfptr->m_conf_spintime;
7992 }
7993 
7994 Uint32
7995 mt_getWakeupLatency(void)
7996 {
7997   return glob_wakeup_latency;
7998 }
7999 
8000 void
8001 mt_setWakeupLatency(Uint32 latency)
8002 {
8003   /**
8004    * Add 2 microseconds for the time it takes to go to sleep (+2) AND
8005    * round up to the next multiple of 5 microseconds (+4).
8006    * Rounding up is an attempt to decrease variance by selecting the
8007    * latency more coarsely.
8008    *
8009    */
8010   latency = (latency + 4 + 2) / 5;
8011   latency *= 5;
8012   glob_wakeup_latency = latency;
8013 }
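/**
 * Illustrative example of the rounding above (pure arithmetic): a measured
 * latency of 13 microseconds becomes ((13 + 4 + 2) / 5) * 5 = 15, and a
 * latency of 20 microseconds becomes ((20 + 4 + 2) / 5) * 5 = 25.
 */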
8014 
8015 void
8016 mt_flush_send_buffers(Uint32 self)
8017 {
8018   struct thr_repository* rep = g_thr_repository;
8019   struct thr_data *selfptr = &rep->m_thread[self];
8020   do_flush(selfptr);
8021 }
8022 
8023 void
8024 mt_set_watchdog_counter(Uint32 self)
8025 {
8026   struct thr_repository* rep = g_thr_repository;
8027   struct thr_data *selfptr = &rep->m_thread[self];
8028   selfptr->m_watchdog_counter = 12;
8029 }
8030 
8031 void
8032 mt_getPerformanceTimers(Uint32 self,
8033                         Uint64 & micros_sleep,
8034                         Uint64 & spin_time,
8035                         Uint64 & buffer_full_micros_sleep,
8036                         Uint64 & micros_send)
8037 {
8038   struct thr_repository* rep = g_thr_repository;
8039   struct thr_data *selfptr = &rep->m_thread[self];
8040 
8041   /**
8042    * Internally in mt.cpp sleep time now includes spin time. However,
8043    * to ensure backwards compatibility we report them separately to
8044    * any block users of this information.
8045    */
8046   micros_sleep = selfptr->m_micros_sleep;
8047   spin_time = selfptr->m_measured_spintime;
8048   if (micros_sleep >= spin_time)
8049   {
8050     micros_sleep -= spin_time;
8051   }
8052   else
8053   {
8054     micros_sleep = 0;
8055   }
8056   buffer_full_micros_sleep = selfptr->m_buffer_full_micros_sleep;
8057   micros_send = selfptr->m_micros_send;
8058 }
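/**
 * Example of the backwards compatible reporting above (illustrative numbers):
 * with m_micros_sleep = 1000 and m_measured_spintime = 300, callers see
 * micros_sleep = 700 and spin_time = 300, so their sum still equals the
 * internally accumulated sleep time.
 */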
8059 
8060 const char *
8061 mt_getThreadDescription(Uint32 self)
8062 {
8063   if (is_main_thread(self))
8064   {
8065     if (self == 0)
8066       return "main thread, schema and distribution handling";
8067     else if (self == 1)
8068       return "rep thread, asynch replication and proxy block handling";
8069     require(false);
8070   }
8071   else if (is_ldm_thread(self))
8072   {
8073     return "ldm thread, handling a set of data partitions";
8074   }
8075   else if (is_tc_thread(self))
8076   {
8077     return "tc thread, transaction handling, unique index and pushdown join"
8078            " handling";
8079   }
8080   else if (is_recv_thread(self))
8081   {
8082     return "receive thread, performing receive and polling for new receives";
8083   }
8084   else
8085   {
8086     require(false);
8087   }
8088   return NULL;
8089 }
8090 
8091 const char *
8092 mt_getThreadName(Uint32 self)
8093 {
8094   if (is_main_thread(self))
8095   {
8096     if (self == 0)
8097       return "main";
8098     else if (self == 1)
8099       return "rep";
8100     require(false);
8101   }
8102   else if (is_ldm_thread(self))
8103   {
8104     return "ldm";
8105   }
8106   else if (is_tc_thread(self))
8107   {
8108     return "tc";
8109   }
8110   else if (is_recv_thread(self))
8111   {
8112     return "recv";
8113   }
8114   else
8115   {
8116     require(false);
8117   }
8118   return NULL;
8119 }
8120 
8121 void
8122 mt_getSendPerformanceTimers(Uint32 send_instance,
8123                             Uint64 & exec_time,
8124                             Uint64 & sleep_time,
8125                             Uint64 & spin_time,
8126                             Uint64 & user_time_os,
8127                             Uint64 & kernel_time_os,
8128                             Uint64 & elapsed_time_os)
8129 {
8130   assert(g_send_threads != NULL);
8131   if (g_send_threads != NULL)
8132   {
8133     g_send_threads->getSendPerformanceTimers(send_instance,
8134                                              exec_time,
8135                                              sleep_time,
8136                                              spin_time,
8137                                              user_time_os,
8138                                              kernel_time_os,
8139                                              elapsed_time_os);
8140   }
8141 }
8142 
8143 Uint32
8144 mt_getNumSendThreads()
8145 {
8146   return globalData.ndbMtSendThreads;
8147 }
8148 
8149 Uint32
8150 mt_getNumThreads()
8151 {
8152   return glob_num_threads;
8153 }
8154 
8155 void
8156 sendlocal(Uint32 self, const SignalHeader *s, const Uint32 *data,
8157           const Uint32 secPtr[3])
8158 {
8159   Uint32 block = blockToMain(s->theReceiversBlockNumber);
8160   Uint32 instance = blockToInstance(s->theReceiversBlockNumber);
8161 
8162   /*
8163    * Max number of signals to put into the job buffer before flushing
8164    * the buffer to the other thread.
8165    * This parameter was found to be reasonable by benchmarking.
8166    */
8167   Uint32 MAX_SIGNALS_BEFORE_FLUSH = (self >= first_receiver_thread_no) ?
8168     MAX_SIGNALS_BEFORE_FLUSH_RECEIVER :
8169     MAX_SIGNALS_BEFORE_FLUSH_OTHER;
8170 
8171   Uint32 dst = block2ThreadId(block, instance);
8172   struct thr_repository* rep = g_thr_repository;
8173   struct thr_data *selfptr = &rep->m_thread[self];
8174   assert(my_thread_equal(selfptr->m_thr_id, my_thread_self()));
8175   struct thr_data *dstptr = &rep->m_thread[dst];
8176 
8177   selfptr->m_stat.m_priob_count++;
8178   Uint32 siglen = (sizeof(*s) >> 2) + s->theLength + s->m_noOfSections;
8179   selfptr->m_stat.m_priob_size += siglen;
8180 
8181   assert(s->theLength + s->m_noOfSections <= 25);
8182   thr_job_queue *q = dstptr->m_in_queue + self;
8183   thr_job_queue_head *h = dstptr->m_in_queue_head + self;
8184   thr_jb_write_state *w = selfptr->m_write_states + dst;
8185   if (insert_signal(q, h, w, false, s, data, secPtr, selfptr->m_next_buffer))
8186   {
8187     selfptr->m_next_buffer = seize_buffer(rep, self, false);
8188   }
8189   if (w->get_pending_signals() >= MAX_SIGNALS_BEFORE_FLUSH)
8190   {
8191     flush_write_state(selfptr, dstptr, h, w, false);
8192   }
8193 }
8194 
8195 void
8196 sendprioa(Uint32 self, const SignalHeader *s, const uint32 *data,
8197           const Uint32 secPtr[3])
8198 {
8199   Uint32 block = blockToMain(s->theReceiversBlockNumber);
8200   Uint32 instance = blockToInstance(s->theReceiversBlockNumber);
8201 
8202   Uint32 dst = block2ThreadId(block, instance);
8203   struct thr_repository* rep = g_thr_repository;
8204   struct thr_data *selfptr = &rep->m_thread[self];
8205   assert(s->theVerId_signalNumber == GSN_START_ORD ||
8206          my_thread_equal(selfptr->m_thr_id, my_thread_self()));
8207   struct thr_data *dstptr = &rep->m_thread[dst];
8208 
8209   selfptr->m_stat.m_prioa_count++;
8210   Uint32 siglen = (sizeof(*s) >> 2) + s->theLength + s->m_noOfSections;
8211   selfptr->m_stat.m_prioa_size += siglen;
8212 
8213   thr_job_queue *q = &(dstptr->m_jba);
8214   thr_job_queue_head *h = &(dstptr->m_jba_head);
8215   thr_jb_write_state w;
8216 
8217   if (selfptr == dstptr)
8218   {
8219     /**
8220      * Indicate that we sent Prio A signal to ourself.
8221      */
8222     selfptr->m_sent_local_prioa_signal = true;
8223   }
8224 
8225   w.init_pending_signals();
8226   lock(&dstptr->m_jba_write_lock);
8227 
8228   Uint32 index = h->m_write_index;
8229   w.m_write_index = index;
8230   thr_job_buffer *buffer = q->m_buffers[index];
8231   w.m_write_buffer = buffer;
8232   w.m_write_pos = buffer->m_len;
8233   bool buf_used = insert_signal(q, h, &w, true, s, data, secPtr,
8234                                 selfptr->m_next_buffer);
8235   flush_write_state(selfptr, dstptr, h, &w, true);
8236 
8237   unlock(&dstptr->m_jba_write_lock);
8238   if (w.has_any_pending_signals())
8239   {
8240     wakeup(&(dstptr->m_waiter));
8241   }
8242   if (buf_used)
8243     selfptr->m_next_buffer = seize_buffer(rep, self, true);
8244 }
8245 
8246 /**
8247  * Send a signal to a remote node.
8248  *
8249  * (The signal is only queued here, and actually sent later in do_send()).
8250  */
8251 SendStatus
8252 mt_send_remote(Uint32 self, const SignalHeader *sh, Uint8 prio,
8253                const Uint32 * data, NodeId nodeId,
8254                const LinearSectionPtr ptr[3])
8255 {
8256   thr_repository *rep = g_thr_repository;
8257   struct thr_data *selfptr = &rep->m_thread[self];
8258   SendStatus ss;
8259 
8260   mt_send_handle handle(selfptr);
8261   /* prepareSend() is lock-free, as we have per-thread send buffers. */
8262   TrpId trp_id = 0;
8263   ss = globalTransporterRegistry.prepareSend(&handle,
8264                                              sh,
8265                                              prio,
8266                                              data,
8267                                              nodeId,
8268                                              trp_id,
8269                                              ptr);
8270   if (likely(ss == SEND_OK))
8271   {
8272     register_pending_send(selfptr, trp_id);
8273   }
8274   return ss;
8275 }
8276 
8277 SendStatus
8278 mt_send_remote(Uint32 self, const SignalHeader *sh, Uint8 prio,
8279                const Uint32 *data, NodeId nodeId,
8280                class SectionSegmentPool *thePool,
8281                const SegmentedSectionPtr ptr[3])
8282 {
8283   thr_repository *rep = g_thr_repository;
8284   struct thr_data *selfptr = &rep->m_thread[self];
8285   SendStatus ss;
8286 
8287   mt_send_handle handle(selfptr);
8288   TrpId trp_id = 0;
8289   ss = globalTransporterRegistry.prepareSend(&handle,
8290                                              sh,
8291                                              prio,
8292                                              data,
8293                                              nodeId,
8294                                              trp_id,
8295                                              *thePool, ptr);
8296   if (likely(ss == SEND_OK))
8297   {
8298     register_pending_send(selfptr, trp_id);
8299   }
8300   return ss;
8301 }
8302 
8303 /*
8304  * This function sends a prio A STOP_FOR_CRASH signal to a thread.
8305  *
8306  * It works when called from any other thread, not just from job processing
8307  * threads. But note that this signal will be the last signal to be executed by
8308  * the other thread, as it will exit immediately.
8309  */
8310 static
8311 void
8312 sendprioa_STOP_FOR_CRASH(const struct thr_data *selfptr, Uint32 dst)
8313 {
8314   SignalT<StopForCrash::SignalLength> signalT;
8315   struct thr_repository* rep = g_thr_repository;
8316   /* As this signal will be the last one executed by the other thread, it does
8317      not matter which buffer we use in case the current buffer is filled up by
8318      the STOP_FOR_CRASH signal; the data in it will never be read.
8319   */
8320   static Uint32 MAX_WAIT = 3000;
8321   static thr_job_buffer dummy_buffer;
8322 
8323   /**
8324    * Pick any instance running in this thread
8325    */
8326   struct thr_data *dstptr = &rep->m_thread[dst];
8327   Uint32 bno = dstptr->m_instance_list[0];
8328 
8329   memset(&signalT.header, 0, sizeof(SignalHeader));
8330   signalT.header.theVerId_signalNumber   = GSN_STOP_FOR_CRASH;
8331   signalT.header.theReceiversBlockNumber = bno;
8332   signalT.header.theSendersBlockRef      = 0;
8333   signalT.header.theTrace                = 0;
8334   signalT.header.theSendersSignalId      = 0;
8335   signalT.header.theSignalId             = 0;
8336   signalT.header.theLength               = StopForCrash::SignalLength;
8337   StopForCrash * stopForCrash = CAST_PTR(StopForCrash, &signalT.theData[0]);
8338   stopForCrash->flags = 0;
8339 
8340   thr_job_queue *q = &(dstptr->m_jba);
8341   thr_job_queue_head *h = &(dstptr->m_jba_head);
8342   thr_jb_write_state w;
8343 
8344   /**
8345    * Ensure that a crash while holding m_jba_write_lock won't block
8346    * dump process forever.
8347    */
8348   Uint64 loop_count = 0;
8349   const NDB_TICKS start_try_lock = NdbTick_getCurrentTicks();
8350   while (trylock(&dstptr->m_jba_write_lock) != 0)
8351   {
8352     if (++loop_count >= 10000)
8353     {
8354       const NDB_TICKS now = NdbTick_getCurrentTicks();
8355       if (NdbTick_Elapsed(start_try_lock, now).milliSec() > MAX_WAIT)
8356       {
8357         return;
8358       }
8359       NdbSleep_MilliSleep(1);
8360       loop_count = 0;
8361     }
8362   }
8363 
8364   w.init_pending_signals();
8365   Uint32 index = h->m_write_index;
8366   w.m_write_index = index;
8367   thr_job_buffer *buffer = q->m_buffers[index];
8368   w.m_write_buffer = buffer;
8369   w.m_write_pos = buffer->m_len;
8370   insert_signal(q, h, &w, true, &signalT.header, signalT.theData, NULL,
8371                 &dummy_buffer);
8372   flush_write_state(selfptr, dstptr, h, &w, true);
8373 
8374   unlock(&dstptr->m_jba_write_lock);
8375   if (w.has_any_pending_signals())
8376   {
8377     loop_count = 0;
8378     /**
8379      * Ensure that a crash while holding wakeup lock won't block
8380      * dump process forever. We will wait at most 3 seconds.
8381      */
8382     const NDB_TICKS start_try_wakeup = NdbTick_getCurrentTicks();
8383     while (try_wakeup(&(dstptr->m_waiter)) != 0)
8384     {
8385       if (++loop_count >= 10000)
8386       {
8387         const NDB_TICKS now = NdbTick_getCurrentTicks();
8388         if (NdbTick_Elapsed(start_try_wakeup, now).milliSec() > MAX_WAIT)
8389         {
8390           return;
8391         }
8392         NdbSleep_MilliSleep(1);
8393         loop_count = 0;
8394       }
8395     }
8396   }
8397 }
8398 
8399 /**
8400  * Implements the rules for which threads are allowed to have
8401  * communication with each other.
8402  * Also see compute_jb_pages() which has similar logic.
8403  */
8404 static bool
8405 may_communicate(unsigned from, unsigned to)
8406 {
8407   if (is_main_thread(from))
8408   {
8409     // Main threads communicate with all other threads
8410     return true;
8411   }
8412   else if (is_ldm_thread(from))
8413   {
8414     // First LDM is special as it may act as internal client
8415     // during backup, and thus communicate with other LDMs:
8416     if (is_first_ldm_thread(from) && is_ldm_thread(to))
8417       return true;
8418 
8419     // All LDM threads can communicate with main threads, TC threads,
8420     // the first LDM thread (the BACKUP client above), and themselves
8421     return is_main_thread(to) ||
8422            is_tc_thread(to)   ||
8423            is_first_ldm_thread(to) ||
8424            (to == from);
8425   }
8426   else if (is_tc_thread(from))
8427   {
8428     // TC threads can communicate with SPJ-, LQH-, main threads and themselves
8429     return is_main_thread(to) ||
8430            is_ldm_thread(to)  ||
8431            is_tc_thread(to);      // Cover both SPJs and itself
8432   }
8433   else
8434   {
8435     assert(is_recv_thread(from));
8436     // Receive threads communicate with all, except other receivers
8437     return !is_recv_thread(to);
8438   }
8439 }
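/**
 * Summary of the rules above, derived directly from the checks in
 * may_communicate() (for illustration only):
 *   main      -> any thread                              : allowed
 *   ldm       -> main, tc, first ldm, itself             : allowed
 *   first ldm -> any ldm (acting as BACKUP client)       : allowed
 *   tc        -> main, ldm, tc (covers SPJ and itself)   : allowed
 *   recv      -> any thread except other receive threads : allowed
 */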
8440 
8441 /**
8442  * init functions
8443  */
8444 static
8445 void
8446 queue_init(struct thr_tq* tq)
8447 {
8448   tq->m_next_timer = 0;
8449   tq->m_current_time = 0;
8450   tq->m_next_free = RNIL;
8451   tq->m_cnt[0] = tq->m_cnt[1] = tq->m_cnt[2] = 0;
8452   bzero(tq->m_delayed_signals, sizeof(tq->m_delayed_signals));
8453 }
8454 
8455 static
8456 void
8457 thr_init(struct thr_repository* rep, struct thr_data *selfptr, unsigned int cnt,
8458          unsigned thr_no)
8459 {
8460   Uint32 i;
8461 
8462   selfptr->m_thr_no = thr_no;
8463   selfptr->m_next_jbb_no = 0;
8464   selfptr->m_max_signals_per_jb = MAX_SIGNALS_PER_JB;
8465   selfptr->m_max_exec_signals = 0;
8466   selfptr->m_max_extra_signals = 0;
8467   selfptr->m_first_free = 0;
8468   selfptr->m_first_unused = 0;
8469   selfptr->m_send_instance_no = 0;
8470   selfptr->m_send_instance = NULL;
8471   selfptr->m_nosend = 1;
8472 
8473   {
8474     char buf[100];
8475     BaseString::snprintf(buf, sizeof(buf), "jbalock thr: %u", thr_no);
8476     register_lock(&selfptr->m_jba_write_lock, buf);
8477   }
8478   selfptr->m_jba_head.m_read_index = 0;
8479   selfptr->m_jba_head.m_write_index = 0;
8480   thr_job_buffer *buffer = seize_buffer(rep, thr_no, true);
8481   selfptr->m_jba.m_buffers[0] = buffer;
8482   selfptr->m_jba_read_state.m_read_index = 0;
8483   selfptr->m_jba_read_state.m_read_buffer = buffer;
8484   selfptr->m_jba_read_state.m_read_pos = 0;
8485   selfptr->m_jba_read_state.m_read_end = 0;
8486   selfptr->m_jba_read_state.m_write_index = 0;
8487   selfptr->m_next_buffer = seize_buffer(rep, thr_no, false);
8488   selfptr->m_send_buffer_pool.set_pool(&rep->m_sb_pool);
8489 
8490   for (i = 0; i<cnt; i++)
8491   {
8492     selfptr->m_in_queue_head[i].m_waiter.init();
8493     selfptr->m_in_queue_head[i].m_read_index = 0;
8494     selfptr->m_in_queue_head[i].m_write_index = 0;
8495     buffer = may_communicate(i,thr_no)
8496               ? seize_buffer(rep, thr_no, false) : NULL;
8497     selfptr->m_in_queue[i].m_buffers[0] = buffer;
8498     selfptr->m_read_states[i].m_read_index = 0;
8499     selfptr->m_read_states[i].m_read_buffer = buffer;
8500     selfptr->m_read_states[i].m_read_pos = 0;
8501     selfptr->m_read_states[i].m_read_end = 0;
8502     selfptr->m_read_states[i].m_write_index = 0;
8503   }
8504   queue_init(&selfptr->m_tq);
8505 
8506   bzero(&selfptr->m_stat, sizeof(selfptr->m_stat));
8507 
8508   selfptr->m_pending_send_count = 0;
8509   selfptr->m_pending_send_mask.clear();
8510 
8511   selfptr->m_instance_count = 0;
8512   for (i = 0; i < MAX_INSTANCES_PER_THREAD; i++)
8513     selfptr->m_instance_list[i] = 0;
8514 
8515   bzero(&selfptr->m_send_buffers, sizeof(selfptr->m_send_buffers));
8516 
8517   selfptr->m_thread = 0;
8518   selfptr->m_cpu = NO_LOCK_CPU;
8519 #ifdef ERROR_INSERT
8520   selfptr->m_delayed_prepare = false;
8521 #endif
8522 }
8523 
8524 /* Have to do this after init of all m_in_queues is done. */
8525 static
8526 void
8527 thr_init2(struct thr_repository* rep, struct thr_data *selfptr,
8528           unsigned int cnt, unsigned thr_no)
8529 {
8530   for (Uint32 i = 0; i<cnt; i++)
8531   {
8532     selfptr->m_write_states[i].m_write_index = 0;
8533     selfptr->m_write_states[i].m_write_pos = 0;
8534     selfptr->m_write_states[i].m_write_buffer =
8535       rep->m_thread[i].m_in_queue[thr_no].m_buffers[0];
8536     selfptr->m_write_states[i].init_pending_signals();
8537   }
8538 }
8539 
8540 static
8541 void
8542 receive_lock_init(Uint32 recv_thread_id, thr_repository *rep)
8543 {
8544   char buf[100];
8545   BaseString::snprintf(buf, sizeof(buf), "receive lock thread id %d",
8546                        recv_thread_id);
8547   register_lock(&rep->m_receive_lock[recv_thread_id], buf);
8548 }
8549 
8550 static
8551 void
8552 send_buffer_init(Uint32 id, thr_repository::send_buffer * sb)
8553 {
8554   char buf[100];
8555   BaseString::snprintf(buf, sizeof(buf), "send lock trp %d", id);
8556   register_lock(&sb->m_send_lock, buf);
8557   BaseString::snprintf(buf, sizeof(buf), "send_buffer lock trp %d", id);
8558   register_lock(&sb->m_buffer_lock, buf);
8559   sb->m_buffered_size = 0;
8560   sb->m_sending_size = 0;
8561   sb->m_force_send = 0;
8562   sb->m_bytes_sent = 0;
8563   sb->m_send_thread = NO_SEND_THREAD;
8564   sb->m_enabled = false;
8565   bzero(&sb->m_buffer, sizeof(sb->m_buffer));
8566   bzero(&sb->m_sending, sizeof(sb->m_sending));
8567   bzero(sb->m_read_index, sizeof(sb->m_read_index));
8568 }
8569 
8570 static
8571 void
8572 rep_init(struct thr_repository* rep, unsigned int cnt, Ndbd_mem_manager *mm)
8573 {
8574   rep->m_mm = mm;
8575 
8576   rep->m_thread_count = cnt;
8577   for (unsigned int i = 0; i<cnt; i++)
8578   {
8579     thr_init(rep, &rep->m_thread[i], cnt, i);
8580   }
8581   for (unsigned int i = 0; i<cnt; i++)
8582   {
8583     thr_init2(rep, &rep->m_thread[i], cnt, i);
8584   }
8585 
8586   rep->stopped_threads = 0;
8587   NdbMutex_Init(&rep->stop_for_crash_mutex);
8588   NdbCondition_Init(&rep->stop_for_crash_cond);
8589 
8590   for (Uint32 i = 0; i < NDB_ARRAY_SIZE(rep->m_receive_lock); i++)
8591   {
8592     receive_lock_init(i, rep);
8593   }
8594   for (int i = 0 ; i < MAX_NTRANSPORTERS; i++)
8595   {
8596     send_buffer_init(i, rep->m_send_buffers+i);
8597   }
8598 
8599   bzero(rep->m_thread_send_buffers, sizeof(rep->m_thread_send_buffers));
8600 }
8601 
8602 
8603 /**
8604  * Thread Config
8605  */
8606 
8607 static Uint32
8608 get_total_number_of_block_threads(void)
8609 {
8610   return (NUM_MAIN_THREADS +
8611           globalData.ndbMtLqhThreads +
8612           globalData.ndbMtTcThreads +
8613           globalData.ndbMtReceiveThreads);
8614 }
8615 
8616 static Uint32
8617 get_num_trps()
8618 {
8619   Uint32 count = 0;
8620   for (Uint32 id = 1; id < MAX_NTRANSPORTERS; id++)
8621   {
8622     if (globalTransporterRegistry.get_transporter(id))
8623     {
8624       count++;
8625     }
8626   }
8627   return count;
8628 }
8629 
8630 /**
8631  * This function returns the amount of extra send buffer pages
8632  * that we should allocate in addition to the amount allocated
8633  * for each trp send buffer.
8634  */
8635 #define MIN_SEND_BUFFER_GENERAL (512) //16M
8636 #define MIN_SEND_BUFFER_PER_NODE (8) //256k
8637 #define MIN_SEND_BUFFER_PER_THREAD (64) //2M
8638 
8639 Uint32
8640 mt_get_extra_send_buffer_pages(Uint32 curr_num_pages,
8641                                Uint32 extra_mem_pages)
8642 {
8643   Uint32 loc_num_threads = get_total_number_of_block_threads();
8644   Uint32 num_trps = get_num_trps();
8645 
8646   Uint32 extra_pages = extra_mem_pages;
8647 
8648   /**
8649    * Add 2M for each thread since we allocate 1M at a time and also
8650    * ensure that there is a minimum of 1M of send buffer in each
8651    * thread. Thus we can easily have 2M of send buffer per thread just
8652    * to keep the contention around the send buffer page spinlock small.
8653    * We add this memory independently of the configuration settings,
8654    * since the user cannot be expected to account for it and since we
8655    * could change this behaviour at any time.
8657    */
8658   extra_pages += loc_num_threads * THR_SEND_BUFFER_MAX_FREE;
8659 
8660   if (extra_mem_pages == 0)
8661   {
8662     /**
8663      * The user has set extra send buffer memory to 0 and left it to us
8664      * to decide how much extra memory is needed.
8665      *
8666      * We'll make sure that we have at least a minimum of 16M +
8667      * 2M per thread + 256k per trp. If we already have this based on
8668      * curr_num_pages and our local additions we don't add anything
8669      * more; if we don't come up to this level we add enough to reach
8670      * this minimum level.
8671      */
8672     Uint32 min_pages = MIN_SEND_BUFFER_GENERAL +
8673       (MIN_SEND_BUFFER_PER_NODE * num_trps) +
8674       (MIN_SEND_BUFFER_PER_THREAD * loc_num_threads);
8675 
8676     if ((curr_num_pages + extra_pages) < min_pages)
8677     {
8678       extra_pages = min_pages - curr_num_pages;
8679     }
8680   }
8681   return extra_pages;
8682 }
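/**
 * Illustrative example with hypothetical numbers: with 20 block threads,
 * 10 transporters and extra_mem_pages == 0, the minimum becomes
 * 512 + (8 * 10) + (64 * 20) = 1872 pages. If curr_num_pages plus the
 * per-thread addition above is below that, extra_pages is raised so that
 * the total reaches 1872 pages.
 */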
8683 
8684 Uint32
8685 compute_jb_pages(struct EmulatorData * ed)
8686 {
8687   Uint32 cnt = get_total_number_of_block_threads();
8688   Uint32 num_receive_threads = globalData.ndbMtReceiveThreads;
8689   Uint32 num_lqh_threads = globalData.ndbMtLqhThreads;
8690   Uint32 num_tc_threads = globalData.ndbMtTcThreads;
8691   Uint32 num_main_threads = NUM_MAIN_THREADS;
8692 
8693   /**
8694    * Number of pages each thread needs to communicate with another
8695    * thread.
8696    */
8697   Uint32 job_queue_pages_per_thread = thr_job_queue::SIZE;
8698 
8699   /**
8700    * In 'perthread' we calculate number of pages required by
8701    * all 'block threads' (excludes 'send-threads'). 'perthread'
8702    * usage is independent of whether this thread will communicate
8703    * with other 'block threads' or not.
8704    */
8705   Uint32 perthread = 0;
8706 
8707   /**
8708    * Each thread has its own job_queue for 'prio A' signals
8709    */
8710   perthread += job_queue_pages_per_thread;
8711 
8712   /**
8713    * Each thread keeps an available free page in 'm_next_buffer'
8714    * in case it is required by insert_signal() into JBA or JBB.
8715    */
8716   perthread += 1;
8717 
8718   /**
8719    * Each thread keeps time-queued signals in 'struct thr_tq';
8720    * thr_tq::PAGES are used to store these.
8721    */
8722   perthread += thr_tq::PAGES;
8723 
8724   /**
8725    * Each thread has its own 'm_free_fifo[THR_FREE_BUF_MAX]' cache.
8726    * As it is filled to MAX *before* a page is allocated (which consumes
8727    * a page), it will never cache more than MAX-1 pages. Pages are also
8728    * returned to the global allocator as soon as MAX is reached.
8729    */
8730   perthread += THR_FREE_BUF_MAX-1;
8731 
8732   /**
8733    * Start by calculating the basic number of pages required for
8734    * our 'cnt' block threads.
8735    * (no inter-thread communication assumed so far)
8736    */
8737   Uint32 tot = cnt * perthread;
8738 
8739   /**
8740    * We then start adding pages required for inter-thread communications:
8741    *
8742    * Receiver threads will be able to communicate with all other
8743    * threads except other receive threads.
8744    */
8745   tot += num_receive_threads *
8746          (cnt - num_receive_threads) *
8747          job_queue_pages_per_thread;
8748 
8749   /**
8750    * LQH threads can communicate with TC threads and main threads.
8751    * Cannot communicate with receive threads and other LQH threads,
8752    * but it can communicate with itself.
8753    */
8754   tot += num_lqh_threads *
8755          (num_tc_threads + num_main_threads + 1) *
8756          job_queue_pages_per_thread;
8757 
8758   /**
8759    * First LDM thread is special as it will act as client
8760    * during backup. It will send to, and receive from (2x)
8761    * the 'num_lqh_threads - 1' other LQH threads.
8762    */
8763   tot += 2 * (num_lqh_threads-1) *
8764          job_queue_pages_per_thread;
8765 
8766   /**
8767    * TC threads can communicate with SPJ-, LQH- and main threads.
8768    * Cannot communicate with receive threads and other TC threads,
8769    * but as SPJ is located together with TC, it is counted as it
8770    * communicate with all TC threads.
8771    */
8772   tot += num_tc_threads *
8773          (num_lqh_threads + num_main_threads + num_tc_threads) *
8774          job_queue_pages_per_thread;
8775 
8776   /**
8777    * Main threads can communicate with all other threads
8778    */
8779   tot += num_main_threads *
8780          cnt *
8781          job_queue_pages_per_thread;
8782 
8783   return tot;
8784 }
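/**
 * Worked example of the formula above (hypothetical configuration, for
 * illustration only): with 2 main, 4 LQH, 2 TC and 1 receive thread
 * (cnt = 9), the inter-thread terms are
 *   receive:   1 * (9 - 1)     =  8
 *   LQH:       4 * (2 + 2 + 1) = 20
 *   first LDM: 2 * (4 - 1)     =  6
 *   TC:        2 * (4 + 2 + 2) = 16
 *   main:      2 * 9           = 18
 * i.e. 68 job queues, giving a total of
 * 9 * perthread + 68 * job_queue_pages_per_thread pages.
 */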
8785 
8786 ThreadConfig::ThreadConfig()
8787 {
8788   /**
8789    * We take great care within struct thr_repository to optimize
8790    * cache line placement of the different members. This all
8791    * depends on the base address of thr_repository itself being
8792    * cache line aligned.
8793    *
8794    * So we allocate a char[] sufficiently large to hold the
8795    * thr_repository object, with added bytes for placing
8796    * g_thr_repository at a CL-aligned offset within it.
8797    */
8798   g_thr_repository_mem = new char[sizeof(thr_repository)+NDB_CL];
8799   const int alligned_offs = NDB_CL_PADSZ((UintPtr)g_thr_repository_mem);
8800   char* cache_alligned_mem = &g_thr_repository_mem[alligned_offs];
8801   require((((UintPtr)cache_alligned_mem) % NDB_CL) == 0);
8802   g_thr_repository = new(cache_alligned_mem) thr_repository();
8803 }
8804 
8805 ThreadConfig::~ThreadConfig()
8806 {
8807   g_thr_repository->~thr_repository();
8808   g_thr_repository = NULL;
8809   delete[] g_thr_repository_mem;
8810   g_thr_repository_mem = NULL;
8811 }
8812 
8813 /*
8814  * We must do the init here rather than in the constructor, since at
8815  * constructor time the global memory manager is not available.
8816  */
8817 void
8818 ThreadConfig::init()
8819 {
8820   Uint32 num_lqh_threads = globalData.ndbMtLqhThreads;
8821   Uint32 num_tc_threads = globalData.ndbMtTcThreads;
8822   Uint32 num_recv_threads = globalData.ndbMtReceiveThreads;
8823   first_receiver_thread_no =
8824     NUM_MAIN_THREADS + num_tc_threads + num_lqh_threads;
8825   glob_num_threads = first_receiver_thread_no + num_recv_threads;
8826   require(glob_num_threads <= MAX_BLOCK_THREADS);
8827 
8828   glob_num_tc_threads = num_tc_threads;
8829   if (glob_num_tc_threads == 0)
8830     glob_num_tc_threads = 1;
8831 
8832   ndbout << "NDBMT: number of block threads=" << glob_num_threads << endl;
8833 
8834   ::rep_init(g_thr_repository, glob_num_threads,
8835              globalEmulatorData.m_mem_manager);
8836 }
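/**
 * Example (hypothetical configuration, assuming NUM_MAIN_THREADS == 2 as
 * suggested by mt_getThreadName() above): with 4 LQH, 2 TC and 1 receive
 * thread, first_receiver_thread_no = 2 + 2 + 4 = 8 and glob_num_threads = 9,
 * i.e. thread numbers 0..7 are block threads and thread 8 is the receive
 * thread run by the main program thread.
 */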
8837 
8838 /**
8839  * return receiver thread handling a particular trp
8840  *   the returned number is indexed from 0 and upwards to #receiver threads
8841  *   (or MAX_NTRANSPORTERS if there is no transporter)
8842  */
8843 Uint32
8844 mt_get_recv_thread_idx(TrpId trp_id)
8845 {
8846   assert(trp_id < NDB_ARRAY_SIZE(g_trp_to_recv_thr_map));
8847   return g_trp_to_recv_thr_map[trp_id];
8848 }
8849 
8850 static
8851 void
8852 assign_receiver_threads(void)
8853 {
8854   Uint32 num_recv_threads = globalData.ndbMtReceiveThreads;
8855   Uint32 recv_thread_idx = 0;
8856   Uint32 recv_thread_idx_shm = 0;
8857   for (Uint32 trp_id = 1; trp_id < MAX_NTRANSPORTERS; trp_id++)
8858   {
8859     Transporter *trp =
8860       globalTransporterRegistry.get_transporter(trp_id);
8861 
8862     /**
8863      * Ensure that shared memory transporters are well distributed
8864      * over all receive threads, so distribute those independently of
8865      * the rest of the transporters.
8866      */
8867     if (trp)
8868     {
8869       if (globalTransporterRegistry.is_shm_transporter(trp_id))
8870       {
8871         g_trp_to_recv_thr_map[trp_id] = recv_thread_idx_shm;
8872         globalTransporterRegistry.set_recv_thread_idx(trp,recv_thread_idx_shm);
8873         DEB_MULTI_TRP(("SHM trp %u uses recv_thread_idx: %u",
8874                        trp_id, recv_thread_idx_shm));
8875         recv_thread_idx_shm++;
8876         if (recv_thread_idx_shm == num_recv_threads)
8877           recv_thread_idx_shm = 0;
8878       }
8879       else
8880       {
8881         g_trp_to_recv_thr_map[trp_id] = recv_thread_idx;
8882         DEB_MULTI_TRP(("TCP trp %u uses recv_thread_idx: %u",
8883                        trp_id, recv_thread_idx));
8884         globalTransporterRegistry.set_recv_thread_idx(trp,recv_thread_idx);
8885         recv_thread_idx++;
8886         if (recv_thread_idx == num_recv_threads)
8887           recv_thread_idx = 0;
8888       }
8889     }
8890     else
8891     {
8892       /* Flag for no transporter */
8893       g_trp_to_recv_thr_map[trp_id] = MAX_NTRANSPORTERS;
8894     }
8895   }
8896   return;
8897 }
8898 
8899 void
8900 mt_assign_recv_thread_new_trp(Uint32 trp_id)
8901 {
8902   if (g_trp_to_recv_thr_map[trp_id] != MAX_NTRANSPORTERS)
8903   {
8904     /* Already assigned in the past, keep assignment */
8905     return;
8906   }
8907   Uint32 num_recv_threads = globalData.ndbMtReceiveThreads;
8908   Uint32 next_recv_thread_tcp = 0;
8909   Uint32 next_recv_thread_shm = 0;
8910   for (Uint32 id = 1; id < MAX_NTRANSPORTERS; id++)
8911   {
8912     if (id == trp_id)
8913       continue;
8914     Transporter *trp =
8915       globalTransporterRegistry.get_transporter(id);
8916     if (trp)
8917     {
8918       if (globalTransporterRegistry.is_shm_transporter(id))
8919       {
8920         next_recv_thread_shm = g_trp_to_recv_thr_map[id];
8921       }
8922       else
8923       {
8924         next_recv_thread_tcp = g_trp_to_recv_thr_map[id];
8925       }
8926     }
8927   }
8928   Transporter *trp =
8929     globalTransporterRegistry.get_transporter(trp_id);
8930   require(trp);
8931   Uint32 choosen_recv_thread;
8932   if (globalTransporterRegistry.is_shm_transporter(trp_id))
8933   {
8934     next_recv_thread_shm++;
8935     if (next_recv_thread_shm == num_recv_threads)
8936       next_recv_thread_shm = 0;
8937     g_trp_to_recv_thr_map[trp_id] = next_recv_thread_shm;
8938     choosen_recv_thread = next_recv_thread_shm;
8939     globalTransporterRegistry.set_recv_thread_idx(trp, next_recv_thread_shm);
8940     DEB_MULTI_TRP(("SHM multi trp %u uses recv_thread_idx: %u",
8941                    trp_id, next_recv_thread_shm));
8942   }
8943   else
8944   {
8945     next_recv_thread_tcp++;
8946     if (next_recv_thread_tcp == num_recv_threads)
8947       next_recv_thread_tcp = 0;
8948     g_trp_to_recv_thr_map[trp_id] = next_recv_thread_tcp;
8949     choosen_recv_thread = next_recv_thread_tcp;
8950     globalTransporterRegistry.set_recv_thread_idx(trp, next_recv_thread_tcp);
8951     DEB_MULTI_TRP(("TCP multi trp %u uses recv_thread_idx: %u",
8952                    trp_id, next_recv_thread_tcp));
8953   }
8954   TransporterReceiveHandleKernel *recvdata =
8955     g_trp_receive_handle_ptr[choosen_recv_thread];
8956   recvdata->m_transporters.set(trp_id);
8957 }
8958 
8959 bool
8960 mt_epoll_add_trp(Uint32 self, NodeId node_id, TrpId trp_id)
8961 {
8962   (void)node_id;
8963   struct thr_repository* rep = g_thr_repository;
8964   struct thr_data *selfptr = &rep->m_thread[self];
8965   Uint32 thr_no = selfptr->m_thr_no;
8966   require(thr_no >= first_receiver_thread_no);
8967   Uint32 recv_thread_idx = thr_no - first_receiver_thread_no;
8968   TransporterReceiveHandleKernel *recvdata =
8969     g_trp_receive_handle_ptr[recv_thread_idx];
8970   if (recv_thread_idx != g_trp_to_recv_thr_map[trp_id])
8971   {
8972     return false;
8973   }
8974   Transporter *t = globalTransporterRegistry.get_transporter(trp_id);
8975   lock(&rep->m_send_buffers[trp_id].m_send_lock);
8976   lock(&rep->m_receive_lock[recv_thread_idx]);
8977   require(recvdata->epoll_add(t));
8978   unlock(&rep->m_receive_lock[recv_thread_idx]);
8979   unlock(&rep->m_send_buffers[trp_id].m_send_lock);
8980   return true;
8981 }
8982 
8983 bool
8984 mt_is_recv_thread_for_new_trp(Uint32 self,
8985                               NodeId node_id,
8986                               TrpId trp_id)
8987 {
8988   (void)node_id;
8989   struct thr_repository* rep = g_thr_repository;
8990   struct thr_data *selfptr = &rep->m_thread[self];
8991   Uint32 thr_no = selfptr->m_thr_no;
8992   require(thr_no >= first_receiver_thread_no);
8993   Uint32 recv_thread_idx = thr_no - first_receiver_thread_no;
8994   if (recv_thread_idx != g_trp_to_recv_thr_map[trp_id])
8995   {
8996     return false;
8997   }
8998   return true;
8999 }
9000 
9001 void
9002 ThreadConfig::ipControlLoop(NdbThread* pThis)
9003 {
9004   unsigned int thr_no;
9005   struct thr_repository* rep = g_thr_repository;
9006 
9007   rep->m_thread[first_receiver_thread_no].m_thr_index =
9008     globalEmulatorData.theConfiguration->addThread(pThis, ReceiveThread);
9009 
9010   max_send_delay = globalEmulatorData.theConfiguration->maxSendDelay();
9011 
9012   /**
9013    * Set the configured time we will spend in spinloop before coming
9014    * back to check conditions.
9015    */
9016   Uint32 spin_nanos = globalEmulatorData.theConfiguration->spinTimePerCall();
9017   NdbSpin_Change(Uint64(spin_nanos));
9018   g_eventLogger->info("Number of spin loops is %llu to pause %llu nanoseconds",
9019                       NdbSpin_get_num_spin_loops(),
9020                       NdbSpin_get_current_spin_nanos());
9021 
9022   if (globalData.ndbMtSendThreads)
9023   {
9024     /**
9025      * The new operator does not ensure alignment for overaligned data
9026      * types. As for g_thr_repository, we overallocate memory and construct
9027      * the thr_send_threads object at an aligned address within it.
9028      */
9029     g_send_threads_mem = new char[sizeof(thr_send_threads) + NDB_CL];
9030     const int aligned_offs = NDB_CL_PADSZ((UintPtr)g_send_threads_mem);
9031     char* cache_aligned_mem = &g_send_threads_mem[aligned_offs];
9032     require((((UintPtr)cache_aligned_mem) % NDB_CL) == 0);
9033     g_send_threads = new (cache_aligned_mem) thr_send_threads();
9034   }
9035 
9036   /**
9037    * assign trps to receiver threads
9038    */
9039   assign_receiver_threads();
9040 
9041   /* Start the send thread(s) */
9042   if (g_send_threads)
9043   {
9044     /**
9045      * assign trps to send threads
9046      */
9047     g_send_threads->assign_trps_to_send_threads();
9048     g_send_threads->assign_threads_to_assist_send_threads();
9049 
9050     g_send_threads->start_send_threads();
9051   }
9052 
9053   /*
9054    * Start threads for all execution threads, except for the receiver
9055    * thread, which runs in the main thread.
9056    */
9057   for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
9058   {
9059     NDB_TICKS now = NdbTick_getCurrentTicks();
9060     rep->m_thread[thr_no].m_ticks = now;
9061     rep->m_thread[thr_no].m_scan_real_ticks = now;
9062 
9063     if (thr_no == first_receiver_thread_no)
9064       continue;                 // Will run in the main thread.
9065 
9066     /*
9067      * NdbThread_Create() takes void **, but that is cast to void * when
9068      * passed to the thread function, which is kind of strange ...
9069      */
9070     if (thr_no < first_receiver_thread_no)
9071     {
9072       /* Start block threads */
9073       struct NdbThread *thread_ptr =
9074         NdbThread_Create(mt_job_thread_main,
9075                          (void **)(rep->m_thread + thr_no),
9076                          1024*1024,
9077                          "execute thread", //ToDo add number
9078                          NDB_THREAD_PRIO_MEAN);
9079       require(thread_ptr != NULL);
9080       rep->m_thread[thr_no].m_thr_index =
9081         globalEmulatorData.theConfiguration->addThread(thread_ptr,
9082                                                        BlockThread);
9083       rep->m_thread[thr_no].m_thread = thread_ptr;
9084     }
9085     else
9086     {
9087       /* Start a receiver thread, also block thread for TRPMAN */
9088       struct NdbThread *thread_ptr =
9089         NdbThread_Create(mt_receiver_thread_main,
9090                          (void **)(&rep->m_thread[thr_no]),
9091                          1024*1024,
9092                          "receive thread", //ToDo add number
9093                          NDB_THREAD_PRIO_MEAN);
9094       require(thread_ptr != NULL);
9095       globalEmulatorData.theConfiguration->addThread(thread_ptr,
9096                                                      ReceiveThread);
9097       rep->m_thread[thr_no].m_thread = thread_ptr;
9098     }
9099   }
9100 
9101   /* Now run the main loop for first receiver thread directly. */
9102   rep->m_thread[first_receiver_thread_no].m_thread = pThis;
9103   mt_receiver_thread_main(&(rep->m_thread[first_receiver_thread_no]));
9104 
9105   /* Wait for all threads to shutdown. */
9106   for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
9107   {
9108     if (thr_no == first_receiver_thread_no)
9109       continue;
9110     void *dummy_return_status;
9111     NdbThread_WaitFor(rep->m_thread[thr_no].m_thread,
9112                       &dummy_return_status);
9113     globalEmulatorData.theConfiguration->removeThread(
9114       rep->m_thread[thr_no].m_thread);
9115     NdbThread_Destroy(&(rep->m_thread[thr_no].m_thread));
9116   }
9117 
9118   /* Delete send threads, includes waiting for threads to shutdown */
9119   if (g_send_threads)
9120   {
9121     g_send_threads->~thr_send_threads();
9122     g_send_threads = NULL;
9123     delete[] g_send_threads_mem;
9124     g_send_threads_mem = NULL;
9125   }
9126   globalEmulatorData.theConfiguration->removeThread(pThis);
9127 }
9128 
9129 int
9130 ThreadConfig::doStart(NodeState::StartLevel startLevel)
9131 {
9132   SignalT<3> signalT;
9133   memset(&signalT.header, 0, sizeof(SignalHeader));
9134 
9135   signalT.header.theVerId_signalNumber   = GSN_START_ORD;
9136   signalT.header.theReceiversBlockNumber = CMVMI;
9137   signalT.header.theSendersBlockRef      = 0;
9138   signalT.header.theTrace                = 0;
9139   signalT.header.theSignalId             = 0;
9140   signalT.header.theLength               = StartOrd::SignalLength;
9141 
9142   StartOrd * startOrd = CAST_PTR(StartOrd, &signalT.theData[0]);
9143   startOrd->restartInfo = 0;
9144 
9145   sendprioa(block2ThreadId(CMVMI, 0), &signalT.header, signalT.theData, 0);
9146   return 0;
9147 }
9148 
9149 Uint32
9150 FastScheduler::traceDumpGetNumThreads()
9151 {
9152   /* The last thread is only for receiver -> no trace file. */
9153   return glob_num_threads;
9154 }
9155 
9156 bool
9157 FastScheduler::traceDumpGetJam(Uint32 thr_no,
9158                                const JamEvent * & thrdTheEmulatedJam,
9159                                Uint32 & thrdTheEmulatedJamIndex)
9160 {
9161   if (thr_no >= glob_num_threads)
9162     return false;
9163 
9164 #ifdef NO_EMULATED_JAM
9165   thrdTheEmulatedJam = NULL;
9166   thrdTheEmulatedJamIndex = 0;
9167 #else
9168   const EmulatedJamBuffer *jamBuffer =
9169     &g_thr_repository->m_thread[thr_no].m_jam;
9170   thrdTheEmulatedJam = jamBuffer->theEmulatedJam;
9171   thrdTheEmulatedJamIndex = jamBuffer->theEmulatedJamIndex;
9172 #endif
9173   return true;
9174 }
9175 
9176 void
9177 FastScheduler::traceDumpPrepare(NdbShutdownType& nst)
9178 {
9179   /*
9180    * We are about to generate trace files for all threads.
9181    *
9182    * We want to stop all threads processing before we dump, as otherwise the
9183    * signal buffers could change while dumping, leading to inconsistent
9184    * results.
9185    *
9186    * To stop threads, we send the GSN_STOP_FOR_CRASH signal as prio A to each
9187    * thread. We then wait for threads to signal they are done (but not forever,
9188    * so as to not have one hanging thread prevent the generation of trace
9189    * dumps). We also must be careful not to send to ourself if the crash is
9190    * being processed by one of the threads processing signals.
9191    *
9192    * We do not stop the transporter thread, as it cannot receive signals (but
9193    * because it does not receive signals it does not really influence dumps in
9194    * any case).
9195    */
9196   const thr_data *selfptr = NDB_THREAD_TLS_THREAD;
9197   /* The selfptr might be NULL, or pointer to thread that crashed. */
9198 
9199   Uint32 waitFor_count = 0;
9200   NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
9201   g_thr_repository->stopped_threads = 0;
9202   NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9203 
9204   for (Uint32 thr_no = 0; thr_no < glob_num_threads; thr_no++)
9205   {
9206     if (selfptr != NULL && selfptr->m_thr_no == thr_no)
9207     {
9208       /* This is own thread; we have already stopped processing. */
9209       continue;
9210     }
9211 
9212     sendprioa_STOP_FOR_CRASH(selfptr, thr_no);
9213 
9214     waitFor_count++;
9215   }
9216 
9217   static const Uint32 max_wait_seconds = 2;
9218   const NDB_TICKS start = NdbTick_getCurrentTicks();
9219   NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
9220   while (g_thr_repository->stopped_threads < waitFor_count)
9221   {
9222     NdbCondition_WaitTimeout(&g_thr_repository->stop_for_crash_cond,
9223                              &g_thr_repository->stop_for_crash_mutex,
9224                              10);
9225     const NDB_TICKS now = NdbTick_getCurrentTicks();
9226     if (NdbTick_Elapsed(start,now).seconds() > max_wait_seconds)
9227       break;                    // Give up
9228   }
9229   if (g_thr_repository->stopped_threads < waitFor_count)
9230   {
9231     if (nst != NST_ErrorInsert)
9232     {
9233       nst = NST_Watchdog; // Make this abort fast
9234     }
9235     ndbout_c("Warning: %d thread(s) did not stop before starting crash dump.",
9236              waitFor_count - g_thr_repository->stopped_threads);
9237   }
9238   NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9239 
9240   /* Now we are ready (or as ready as can be) for doing crash dump. */
9241 }
9242 
9243 /**
9244  * In ndbmtd we could have a case where we actually have multiple threads
9245  * crashing at the same time. This causes several threads to start processing
9246  * the crash handling in parallel and eventually lead to a deadlock since
9247  * the crash handling thread waits for other threads to stop before completing
9248  * the crash handling.
9249  *
9250  * To avoid this we use this function that only is useful in ndbmtd where
9251  * we check if the crash handling has already started. We protect this
9252  * check using the stop_for_crash-mutex. This function is called twice,
9253  * first to write an entry in the error log and second to specify that the
9254  * error log write is completed.
9255  *
9256  * We proceed only from the first call if the crash handling hasn't started
9257  * or if the crash is not caused by an error insert. If it is caused by an
9258  * error insert it is a normal situation with multiple crashes, so we won't
9259  * clutter the error log with multiple entries in this case. If it is a real
9260  * crash and we have more than one thread crashing, then this is vital
9261  * information to write in the error log, we do however not want more than
9262  * one set of trace files.
9263  *
9264  * To ensure that writes of the error log happens for one thread at a time we
9265  * protect it with the stop_for_crash-mutex. We hold this mutex between the
9266  * first and second call of this function from the error reporter thread.
9267  *
9268  * We proceed from the first call only if we are the first thread that
9269  * reported an error. To handle this properly we start by acquiring the
9270  * mutex, then we write the error log, when we come back we set the
9271  * crash_started flag and release the mutex to enable other threads to
9272  * write into the error log, but still stopping them from proceeding to
9273  * write another set of trace files.
9274  *
9275  * We will not come back from this function the second time unless we are
9276  * the first crashing thread.
9277  */
9278 
9279 static bool crash_started = false;
9280 
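/**
 * Rough sketch of the expected call sequence from the error reporter
 * thread (the actual caller lives outside this file):
 *
 *   prepare_to_crash(true, error_insert_crash);   // takes stop_for_crash_mutex
 *   ... write the error log entry ...
 *   prepare_to_crash(false, error_insert_crash);  // sets crash_started,
 *                                                 // releases the mutex
 *
 * Only the first crashing thread returns from the second call; any other
 * crashing thread ends up in mt_execSTOP_FOR_CRASH() and never returns.
 */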
void
ErrorReporter::prepare_to_crash(bool first_phase, bool error_insert_crash)
{
  if (first_phase)
  {
    NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
    if (crash_started && error_insert_crash)
    {
      /**
       * Some other thread has already started the crash handling.
       * We call the below method, which we will never return from.
       * We need not write multiple entries in the error log for
       * error insert crashes since it is a normal event.
       */
      NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
      mt_execSTOP_FOR_CRASH();
    }
    /**
     * Proceed to write the error log before returning to this method
     * again with first_phase set to false.
     */
  }
  else if (crash_started)
  {
    (void)error_insert_crash;
    /**
     * No need to proceed since somebody already started handling the crash.
     * We proceed by calling mt_execSTOP_FOR_CRASH to stop this thread
     * in a manner that is similar to if we received the signal
     * STOP_FOR_CRASH.
     */
    NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
    mt_execSTOP_FOR_CRASH();
  }
  else
  {
    /**
     * No crash had started previously; we will take care of it. Before
     * handling it we will mark the crash handling as started.
     */
    crash_started = true;
    NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
  }
}

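/**
 * Executed by a block thread when it receives STOP_FOR_CRASH, and called
 * directly from prepare_to_crash() for additional crashing threads.
 * Increments stopped_threads, wakes the waiter in traceDumpPrepare(),
 * deregisters this thread from the watchdog and then exits the thread.
 */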
void mt_execSTOP_FOR_CRASH()
{
  const thr_data *selfptr = NDB_THREAD_TLS_THREAD;
  require(selfptr != NULL);

  NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
  g_thr_repository->stopped_threads++;
  NdbCondition_Signal(&g_thr_repository->stop_for_crash_cond);
  NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);

  /* ToDo: is this correct? */
  globalEmulatorData.theWatchDog->unregisterWatchedThread(selfptr->m_thr_no);

  my_thread_exit(NULL);
}

void
FastScheduler::dumpSignalMemory(Uint32 thr_no, FILE* out)
{
  thr_data *selfptr = NDB_THREAD_TLS_THREAD;
  const thr_repository *rep = g_thr_repository;
  /*
   * The selfptr might be NULL, or a pointer to the thread that is doing the
   * crash jump.
   * If non-null, we should update the watchdog counter while dumping.
   */
  Uint32 *watchDogCounter;
  if (selfptr)
    watchDogCounter = &selfptr->m_watchdog_counter;
  else
    watchDogCounter = NULL;

  /*
   * We want to dump the signal buffers from last executed to first executed.
   * So we first need to find the correct sequence to output signals in, stored
   * in this array.
   *
   * We will check any buffers in the cyclic m_free_fifo. In addition,
   * we also need to scan the already executed part of the current
   * buffer in m_jba.
   *
   * Due to partial execution of prio A buffers, we will use signal ids to know
   * where to interleave prio A signals into the stream of prio B signals
   * read. So we will keep a pointer to a prio A buffer around; and while
   * scanning prio B buffers we will interleave prio A buffers from that buffer
   * when the signal id fits the sequence.
   *
   * This also means that we may have to discard the earliest part of available
   * prio A signal data due to too little prio B data present, or vice versa.
   */
  static const Uint32 MAX_SIGNALS_TO_DUMP = 4096;
  struct {
    const SignalHeader *ptr;
    bool prioa;
  } signalSequence[MAX_SIGNALS_TO_DUMP];
  Uint32 seq_start = 0;
  Uint32 seq_end = 0;

  const struct thr_data *thr_ptr = &rep->m_thread[thr_no];
  if (watchDogCounter)
    *watchDogCounter = 4;

  /*
   * ToDo: Might do some sanity check to avoid crashing on a not yet
   * initialised thread.
   */

  /* Scan all available buffers with already executed signals. */

  /*
   * Keep track of all available buffers, so that we can pick out signals in
   * the same order they were executed (order obtained from signal id).
   *
   * We may need to keep track of THR_FREE_BUF_MAX buffers for fully executed
   * (and freed) buffers, plus MAX_BLOCK_THREADS buffers for currently active
   * prio B buffers, plus one active prio A buffer.
   */
  struct {
    const thr_job_buffer *m_jb;
    Uint32 m_pos;
    Uint32 m_max;
  } jbs[THR_FREE_BUF_MAX + MAX_BLOCK_THREADS + 1];

  Uint32 num_jbs = 0;

  /* Load released buffers. */
  Uint32 idx = thr_ptr->m_first_free;
  while (idx != thr_ptr->m_first_unused)
  {
    const thr_job_buffer *q = thr_ptr->m_free_fifo[idx];
    if (q->m_len > 0)
    {
      jbs[num_jbs].m_jb = q;
      jbs[num_jbs].m_pos = 0;
      jbs[num_jbs].m_max = q->m_len;
      num_jbs++;
    }
    idx = (idx + 1) % THR_FREE_BUF_MAX;
  }
  /* Load any active prio B buffers. */
  for (Uint32 thr_no = 0; thr_no < rep->m_thread_count; thr_no++)
  {
    const thr_job_queue *q = thr_ptr->m_in_queue + thr_no;
    const thr_jb_read_state *r = thr_ptr->m_read_states + thr_no;
    Uint32 read_pos = r->m_read_pos;
    if (r->is_open() && read_pos > 0)
    {
      jbs[num_jbs].m_jb = q->m_buffers[r->m_read_index];
      jbs[num_jbs].m_pos = 0;
      jbs[num_jbs].m_max = read_pos;
      num_jbs++;
    }
  }
  /* Load any active prio A buffer. */
  const thr_jb_read_state *r = &thr_ptr->m_jba_read_state;
  Uint32 read_pos = r->m_read_pos;
  if (read_pos > 0)
  {
    jbs[num_jbs].m_jb = thr_ptr->m_jba.m_buffers[r->m_read_index];
    jbs[num_jbs].m_pos = 0;
    jbs[num_jbs].m_max = read_pos;
    num_jbs++;
  }

  /* Use the next signal id as the smallest (oldest).
   *
   * Subtracting the smallest from two signal ids makes them comparable
   * using standard Uint32 comparison, where the biggest value is the
   * newest. For example,
   *   (m_signal_id_counter - smallest_signal_id) == UINT32_MAX
   */
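  /*
   * Worked example (purely illustrative): with m_signal_id_counter == 2 a
   * buffer may still hold signals with ids 0xFFFFFFFE, 0xFFFFFFFF, 0, 1, 2.
   * With smallest_signal_id == 3 the adjusted values become
   * 0xFFFFFFFB .. 0xFFFFFFFF, which sort in execution order despite the
   * 32-bit wrap-around of the raw signal ids.
   */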
  const Uint32 smallest_signal_id = thr_ptr->m_signal_id_counter + 1;

  /* Now pick out one signal at a time, in signal id order. */
  while (num_jbs > 0)
  {
    if (watchDogCounter)
      *watchDogCounter = 4;

    /* Search out the smallest signal id remaining. */
    Uint32 idx_min = 0;
    const Uint32 *p = jbs[idx_min].m_jb->m_data + jbs[idx_min].m_pos;
    const SignalHeader *s_min = reinterpret_cast<const SignalHeader*>(p);
    Uint32 sid_min_adjusted = s_min->theSignalId - smallest_signal_id;

    for (Uint32 i = 1; i < num_jbs; i++)
    {
      p = jbs[i].m_jb->m_data + jbs[i].m_pos;
      const SignalHeader *s = reinterpret_cast<const SignalHeader*>(p);
      const Uint32 sid_adjusted = s->theSignalId - smallest_signal_id;
      if (sid_adjusted < sid_min_adjusted)
      {
        idx_min = i;
        s_min = s;
        sid_min_adjusted = sid_adjusted;
      }
    }

    /* We found the next signal, now put it in the ordered cyclic buffer. */
    signalSequence[seq_end].ptr = s_min;
    signalSequence[seq_end].prioa = jbs[idx_min].m_jb->m_prioa;
    Uint32 siglen =
      (sizeof(SignalHeader)>>2) + s_min->m_noOfSections + s_min->theLength;
#if SIZEOF_CHARP == 8
    /* Align to 8-byte boundary, to ensure aligned copies. */
    siglen= (siglen+1) & ~((Uint32)1);
#endif
    jbs[idx_min].m_pos += siglen;
    if (jbs[idx_min].m_pos >= jbs[idx_min].m_max)
    {
      /* We are done with this job buffer. */
      num_jbs--;
      jbs[idx_min] = jbs[num_jbs];
    }
    seq_end = (seq_end + 1) % MAX_SIGNALS_TO_DUMP;
    /* Drop old signals if too many are available in the history. */
    if (seq_end == seq_start)
      seq_start = (seq_start + 1) % MAX_SIGNALS_TO_DUMP;
  }

  /* Now, having built the correct signal sequence, we can dump them all. */
  fprintf(out, "\n");
  bool first_one = true;
  bool out_of_signals = false;
  Uint32 lastSignalId = 0;
  while (seq_end != seq_start)
  {
    if (watchDogCounter)
      *watchDogCounter = 4;

    if (seq_end == 0)
      seq_end = MAX_SIGNALS_TO_DUMP;
    seq_end--;
    SignalT<25> signal;
    const SignalHeader *s = signalSequence[seq_end].ptr;
    unsigned siglen = (sizeof(*s)>>2) + s->theLength;
    if (siglen > MAX_SIGNAL_SIZE)
      siglen = MAX_SIGNAL_SIZE;              // Sanity check
    memcpy(&signal.header, s, 4*siglen);
    // instance number in trace file is confusing if not MT LQH
    if (globalData.ndbMtLqhWorkers == 0)
      signal.header.theReceiversBlockNumber &= NDBMT_BLOCK_MASK;

    const Uint32 *posptr = reinterpret_cast<const Uint32 *>(s);
    signal.m_sectionPtrI[0] = posptr[siglen + 0];
    signal.m_sectionPtrI[1] = posptr[siglen + 1];
    signal.m_sectionPtrI[2] = posptr[siglen + 2];
    bool prioa = signalSequence[seq_end].prioa;

    /* Make sure to display clearly when there is a gap in the dump. */
    if (!first_one && !out_of_signals && (s->theSignalId + 1) != lastSignalId)
    {
      out_of_signals = true;
      fprintf(out, "\n\n\nNo more prio %s signals, rest of dump will be "
              "incomplete.\n\n\n\n", prioa ? "B" : "A");
    }
    first_one = false;
    lastSignalId = s->theSignalId;

    fprintf(out, "--------------- Signal ----------------\n");
    Uint32 prio = (prioa ? JBA : JBB);
    SignalLoggerManager::printSignalHeader(out,
                                           signal.header,
                                           prio,
                                           globalData.ownId,
                                           true);
    SignalLoggerManager::printSignalData  (out,
                                           signal.header,
                                           &signal.theData[0]);
  }
  fflush(out);
}

int
FastScheduler::traceDumpGetCurrentThread()
{
  const thr_data *selfptr = NDB_THREAD_TLS_THREAD;

  /* The selfptr might be NULL, or a pointer to the thread that crashed. */
  if (selfptr == 0)
  {
    return -1;
  }
  else
  {
    return (int)selfptr->m_thr_no;
  }
}

void
mt_section_lock()
{
  lock(&(g_thr_repository->m_section_lock));
}

void
mt_section_unlock()
{
  unlock(&(g_thr_repository->m_section_lock));
}

void
mt_mem_manager_init()
{
}

void
mt_mem_manager_lock()
{
  lock(&(g_thr_repository->m_mem_manager_lock));
}

void
mt_mem_manager_unlock()
{
  unlock(&(g_thr_repository->m_mem_manager_lock));
}

Vector<mt_lock_stat> g_locks;
template class Vector<mt_lock_stat>;

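/**
 * Remember a human readable name for the lock at address ptr, used for the
 * lock statistics kept in g_locks. If the lock is already registered its
 * name is simply replaced.
 */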
static
void
register_lock(const void * ptr, const char * name)
{
  if (name == 0)
    return;

  mt_lock_stat* arr = g_locks.getBase();
  for (size_t i = 0; i<g_locks.size(); i++)
  {
    if (arr[i].m_ptr == ptr)
    {
      if (arr[i].m_name)
      {
        free(arr[i].m_name);
      }
      arr[i].m_name = strdup(name);
      return;
    }
  }

  mt_lock_stat ln;
  ln.m_ptr = ptr;
  ln.m_name = strdup(name);
  ln.m_contended_count = 0;
  ln.m_spin_count = 0;
  g_locks.push_back(ln);
}

#if defined(NDB_HAVE_XCNG) && defined(NDB_USE_SPINLOCK)
static
mt_lock_stat *
lookup_lock(const void * ptr)
{
  mt_lock_stat* arr = g_locks.getBase();
  for (size_t i = 0; i<g_locks.size(); i++)
  {
    if (arr[i].m_ptr == ptr)
      return arr + i;
  }

  return 0;
}
#endif

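/**
 * Build the set of block threads that host a non-proxy instance of any of
 * the blocks in the 0-terminated list. Returns the number of threads added
 * to the mask.
 */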
Uint32
mt_get_threads_for_blocks_no_proxy(const Uint32 blocks[],
                                   BlockThreadBitmask& mask)
{
  Uint32 cnt = 0;
  for (Uint32 i = 0; blocks[i] != 0; i++)
  {
    Uint32 block = blocks[i];
    /**
     * Find each thread that has an instance of the block.
     */
    assert(block == blockToMain(block));
    const Uint32 index = block - MIN_BLOCK_NO;
    const Uint32 instance_count = block_instance_count[index];
    require(instance_count <= NDB_ARRAY_SIZE(thr_map[index]));
    // If more than one instance, avoid proxy instance 0
    const Uint32 first_instance = (instance_count > 1) ? 1 : 0;
    for (Uint32 instance = first_instance;
         instance < instance_count;
         instance++)
    {
      Uint32 thr_no = thr_map[index][instance].thr_no;
      require(thr_no != thr_map_entry::NULL_THR_NO);

      if (mask.get(thr_no))
        continue;

      mask.set(thr_no);
      cnt++;
    }
  }
  require(mask.count() == cnt);
  return cnt;
}

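/**
 * Build the set of block threads that thread my_thr_no may send signals to,
 * always including my_thr_no itself. Returns the number of threads in the
 * mask.
 */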
Uint32
mt_get_addressable_threads(const Uint32 my_thr_no, BlockThreadBitmask& mask)
{
  const Uint32 thr_cnt = get_total_number_of_block_threads();
  Uint32 cnt = 0;
  for (Uint32 thr_no = 0; thr_no < thr_cnt; thr_no++)
  {
    if (may_communicate(my_thr_no, thr_no))
    {
      mask.set(thr_no);
      cnt++;
    }
  }
  if (!mask.get(my_thr_no))
  {
    mask.set(my_thr_no);
    cnt++;
  }
  require(mask.count() == cnt);
  return cnt;
}

void
mt_wakeup(class SimulatedBlock* block)
{
  Uint32 thr_no = block->getThreadId();
  struct thr_data *thrptr = &g_thr_repository->m_thread[thr_no];
  wakeup(&thrptr->m_waiter);
}

#ifdef VM_TRACE
void
mt_assert_own_thread(SimulatedBlock* block)
{
  Uint32 thr_no = block->getThreadId();
  struct thr_data *thrptr = &g_thr_repository->m_thread[thr_no];

  if (unlikely(my_thread_equal(thrptr->m_thr_id, my_thread_self()) == 0))
  {
    fprintf(stderr, "mt_assert_own_thread() - assertion-failure\n");
    fflush(stderr);
    abort();
  }
}
#endif


Uint32
mt_get_blocklist(SimulatedBlock * block, Uint32 arr[], Uint32 len)
{
  Uint32 thr_no = block->getThreadId();
  struct thr_data *thr_ptr = &g_thr_repository->m_thread[thr_no];

  for (Uint32 i = 0; i < thr_ptr->m_instance_count; i++)
  {
    arr[i] = thr_ptr->m_instance_list[i];
  }

  return thr_ptr->m_instance_count;
}

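/**
 * Accessors for the per-thread adaptive spin statistics of the thread
 * owning the given block. mt_set_spin_stat() clears the collected counters
 * and installs a new set of spin intervals.
 */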
void
mt_get_spin_stat(class SimulatedBlock *block, ndb_spin_stat *dst)
{
  Uint32 thr_no = block->getThreadId();
  struct thr_data *selfptr = &g_thr_repository->m_thread[thr_no];
  dst->m_sleep_longer_spin_time = selfptr->m_spin_stat.m_sleep_longer_spin_time;
  dst->m_sleep_shorter_spin_time =
    selfptr->m_spin_stat.m_sleep_shorter_spin_time;
  dst->m_num_waits = selfptr->m_spin_stat.m_num_waits;
  for (Uint32 i = 0; i < NUM_SPIN_INTERVALS; i++)
  {
    dst->m_micros_sleep_times[i] =
      selfptr->m_spin_stat.m_micros_sleep_times[i];
    dst->m_spin_interval[i] = selfptr->m_spin_stat.m_spin_interval[i];
  }
}

void mt_set_spin_stat(class SimulatedBlock *block, ndb_spin_stat *src)
{
  Uint32 thr_no = block->getThreadId();
  struct thr_data *selfptr = &g_thr_repository->m_thread[thr_no];
  memset(&selfptr->m_spin_stat, 0, sizeof(selfptr->m_spin_stat));
  for (Uint32 i = 0; i < NUM_SPIN_INTERVALS; i++)
  {
    selfptr->m_spin_stat.m_spin_interval[i] = src->m_spin_interval[i];
  }
}

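/**
 * Fill in an ndb_thr_stat snapshot for the thread owning the given block:
 * thread name, OS thread id, loop/exec/wait counters and the counts of
 * locally sent prio A/B signals.
 */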
void
mt_get_thr_stat(class SimulatedBlock * block, ndb_thr_stat* dst)
{
  bzero(dst, sizeof(* dst));
  Uint32 thr_no = block->getThreadId();
  struct thr_data *selfptr = &g_thr_repository->m_thread[thr_no];

  THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
  dst->thr_no = thr_no;
  dst->name = conf.getName(selfptr->m_instance_list, selfptr->m_instance_count);
  dst->os_tid = NdbThread_GetTid(selfptr->m_thread);
  dst->loop_cnt = selfptr->m_stat.m_loop_cnt;
  dst->exec_cnt = selfptr->m_stat.m_exec_cnt;
  dst->wait_cnt = selfptr->m_stat.m_wait_cnt;
  dst->local_sent_prioa = selfptr->m_stat.m_prioa_count;
  dst->local_sent_priob = selfptr->m_stat.m_priob_count;
}

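/**
 * Map a receive thread instance to its TransporterReceiveHandle. Instances
 * are 1-based here since instance 0 is the proxy, hence the "- 1" when
 * indexing.
 */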
TransporterReceiveHandle *
mt_get_trp_receive_handle(unsigned instance)
{
  assert(instance > 0 && instance <= MAX_NDBMT_RECEIVE_THREADS);
  if (instance > 0 && instance <= MAX_NDBMT_RECEIVE_THREADS)
  {
    return g_trp_receive_handle_ptr[instance - 1 /* proxy */];
  }
  return 0;
}

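/**
 * The helpers below register per-thread lists of block "global" variables.
 * mt_clear_global_variables() resets them (Ptr<> to RNIL/NULL, plain Uint32
 * values to ~0), presumably so that stale values from a previous use cannot
 * be relied upon unnoticed; the enable/disable functions toggle whether the
 * clearing is performed for a given thread.
 */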
#if defined(USE_INIT_GLOBAL_VARIABLES)
void
mt_clear_global_variables(thr_data *selfptr)
{
  if (selfptr->m_global_variables_enabled)
  {
    for (Uint32 i = 0; i < selfptr->m_global_variables_ptr_instances; i++)
    {
      Ptr<void> *tmp = (Ptr<void>*)selfptr->m_global_variables_ptrs[i];
      tmp->i = RNIL;
      tmp->p = 0;
    }
    for (Uint32 i = 0; i < selfptr->m_global_variables_uint32_ptr_instances; i++)
    {
      void **tmp = (void**)selfptr->m_global_variables_uint32_ptrs[i];
      (*tmp) = 0;
    }
    for (Uint32 i = 0; i < selfptr->m_global_variables_uint32_instances; i++)
    {
      Uint32 *tmp = (Uint32*)selfptr->m_global_variables_uint32[i];
      (*tmp) = Uint32(~0);
    }
  }
}

void
mt_enable_global_variables(Uint32 self)
{
  struct thr_repository* rep = g_thr_repository;
  struct thr_data *selfptr = &rep->m_thread[self];
  selfptr->m_global_variables_enabled = true;
}

void
mt_disable_global_variables(Uint32 self)
{
  struct thr_repository* rep = g_thr_repository;
  struct thr_data *selfptr = &rep->m_thread[self];
  selfptr->m_global_variables_enabled = false;
}

void
mt_init_global_variables_ptr_instances(Uint32 self,
                                       void ** tmp,
                                       size_t cnt)
{
  struct thr_repository* rep = g_thr_repository;
  struct thr_data *selfptr = &rep->m_thread[self];
  for (size_t i = 0; i < cnt; i++)
  {
    Uint32 inx = selfptr->m_global_variables_ptr_instances;
    selfptr->m_global_variables_ptrs[inx] = tmp[i];
    selfptr->m_global_variables_ptr_instances = inx + 1;
  }
}

void
mt_init_global_variables_uint32_ptr_instances(Uint32 self,
                                              void **tmp,
                                              size_t cnt)
{
  struct thr_repository* rep = g_thr_repository;
  struct thr_data *selfptr = &rep->m_thread[self];
  for (size_t i = 0; i < cnt; i++)
  {
    Uint32 inx = selfptr->m_global_variables_uint32_ptr_instances;
    selfptr->m_global_variables_uint32_ptrs[inx] = tmp[i];
    selfptr->m_global_variables_uint32_ptr_instances = inx + 1;
  }
}

void
mt_init_global_variables_uint32_instances(Uint32 self,
                                          void **tmp,
                                          size_t cnt)
{
  struct thr_repository* rep = g_thr_repository;
  struct thr_data *selfptr = &rep->m_thread[self];
  for (size_t i = 0; i < cnt; i++)
  {
    Uint32 inx = selfptr->m_global_variables_uint32_instances;
    selfptr->m_global_variables_uint32[inx] = tmp[i];
    selfptr->m_global_variables_uint32_instances = inx + 1;
  }
}
#endif

/**
 * Global data
 */
static struct trp_callback g_trp_callback;

TransporterRegistry globalTransporterRegistry(&g_trp_callback, NULL);