1 /* Copyright (c) 2008, 2019, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 #include <ndb_global.h>
24
25 #define NDBD_MULTITHREADED
26
27 #include <VMSignal.hpp>
28 #include <kernel_types.h>
29 #include <Prio.hpp>
30 #include <SignalLoggerManager.hpp>
31 #include <SimulatedBlock.hpp>
32 #include <ErrorHandlingMacros.hpp>
33 #include <GlobalData.hpp>
34 #include <WatchDog.hpp>
35 #include <TransporterDefinitions.hpp>
36 #include <TransporterRegistry.hpp>
37 #include "FastScheduler.hpp"
38 #include "mt.hpp"
39 #include <DebuggerNames.hpp>
40 #include <signaldata/StopForCrash.hpp>
41 #include "TransporterCallbackKernel.hpp"
42 #include <NdbSleep.h>
43 #include <NdbGetRUsage.h>
44 #include <portlib/ndb_prefetch.h>
45 #include <blocks/pgman.hpp>
46 #include <blocks/thrman.hpp>
47 #include <Pool.hpp>
48 #include <NdbSpin.h>
49
50 #include "mt-asm.h"
51 #include "mt-lock.hpp"
52
53 #include "ThreadConfig.hpp"
54 #include <signaldata/StartOrd.hpp>
55
56 #include <NdbTick.h>
57 #include <NdbMutex.h>
58 #include <NdbCondition.h>
59 #include <ErrorReporter.hpp>
60 #include <EventLogger.hpp>
61
62 extern EventLogger * g_eventLogger;
63
64 #if (defined(VM_TRACE) || defined(ERROR_INSERT))
65 //#define DEBUG_MULTI_TRP 1
66 #endif
67
68 #ifdef DEBUG_MULTI_TRP
69 #define DEB_MULTI_TRP(arglist) do { g_eventLogger->info arglist ; } while (0)
70 #else
71 #define DEB_MULTI_TRP(arglist) do { } while (0)
72 #endif
73
74 /**
75  * Two new manual (recompile) error-injections in mt.cpp:
76 *
77 * NDB_BAD_SEND : Causes send buffer code to mess with a byte in a send buffer
78 * NDB_LUMPY_SEND : Causes transporters to be given small, oddly aligned and
79 * sized IOVECs to send, testing ability of new and existing
80 * code to handle this.
81 *
82 * These are useful for testing the correctness of the new code, and
83 * the resulting behaviour / debugging output.
84 */
85 //#define NDB_BAD_SEND
86 //#define NDB_LUMPY_SEND
87
88 /**
89 * Number indicating that the trp has no current sender thread.
90 *
91  * trp is used as a short form of transporter in quite a few places.
92 * Originally there was a one to one mapping from node to transporter
93 * and vice versa. Now there can be several transporters used to
94 * connect to one node and thus we work with transporters and not with
95 * nodes in most places used for communication.
96 */
97 #define NO_OWNER_THREAD 0xFFFF
98
99 static void dumpJobQueues(void);
100
101 inline
102 SimulatedBlock*
103 GlobalData::mt_getBlock(BlockNumber blockNo, Uint32 instanceNo)
104 {
105 SimulatedBlock* b = getBlock(blockNo);
106 if (b != 0 && instanceNo != 0)
107 b = b->getInstance(instanceNo);
108 return b;
109 }
110
111 #ifdef __GNUC__
112 /* Provides a small (but noticeable) speedup in benchmarks. */
113 #define memcpy __builtin_memcpy
114 #endif
115
116 /* Constants found by benchmarks to be reasonable values. */
117
118 /*
119 * Max. signals to execute from one job buffer before considering other
120 * possible stuff to do.
121 */
122 static const Uint32 MAX_SIGNALS_PER_JB = 75;
123
124 /**
125 * Max signals written to other thread before calling flush_jbb_write_state
126 */
127 static const Uint32 MAX_SIGNALS_BEFORE_FLUSH_RECEIVER = 2;
128 static const Uint32 MAX_SIGNALS_BEFORE_FLUSH_OTHER = 20;
129 static const Uint32 MAX_SIGNALS_BEFORE_WAKEUP = 128;
130
131 //#define NDB_MT_LOCK_TO_CPU
132
133 /* If this is too small it crashes before first signal. */
134 #define MAX_INSTANCES_PER_THREAD (16 + 8 * MAX_NDBMT_LQH_THREADS)
135
136 static Uint32 glob_num_threads = 0;
137 static Uint32 glob_num_tc_threads = 1;
138 static Uint32 first_receiver_thread_no = 0;
139 static Uint32 max_send_delay = 0;
140 static Uint32 glob_wakeup_latency = 25;
141
142 #define NO_SEND_THREAD (MAX_BLOCK_THREADS + MAX_NDBMT_SEND_THREADS + 1)
143
144 /* max signal is 32 words, 7 for signal header and 25 datawords */
145 #define MAX_SIGNAL_SIZE 32
146 #define MIN_SIGNALS_PER_PAGE (thr_job_buffer::SIZE / MAX_SIGNAL_SIZE) //255
147
148 #if defined(HAVE_LINUX_FUTEX) && defined(NDB_HAVE_XCNG)
149 #define USE_FUTEX
150 #endif
151
152 #ifdef USE_FUTEX
153 #ifndef _GNU_SOURCE
154 #define _GNU_SOURCE
155 #endif
156 #include <unistd.h>
157 #include <sys/syscall.h>
158 #include <sys/types.h>
159
160 #define FUTEX_WAIT 0
161 #define FUTEX_WAKE 1
162 #define FUTEX_FD 2
163 #define FUTEX_REQUEUE 3
164 #define FUTEX_CMP_REQUEUE 4
165 #define FUTEX_WAKE_OP 5
166
167 static inline
168 int
169 futex_wait(volatile unsigned * addr, int val, const struct timespec * timeout)
170 {
171 return syscall(SYS_futex,
172 addr, FUTEX_WAIT, val, timeout, 0, 0) == 0 ? 0 : errno;
173 }
174
175 static inline
176 int
177 futex_wake(volatile unsigned * addr)
178 {
179 return syscall(SYS_futex, addr, FUTEX_WAKE, 1, 0, 0, 0) == 0 ? 0 : errno;
180 }
181
182 struct alignas(NDB_CL) thr_wait
183 {
184 volatile unsigned m_futex_state;
185 enum {
186 FS_RUNNING = 0,
187 FS_SLEEPING = 1
188 };
189 thr_wait() {
190 assert((sizeof(*this) % NDB_CL) == 0); //Maintain any CL-alignment
191 xcng(&m_futex_state, FS_RUNNING);
192 }
193 void init () {}
194 };
195
196 /**
197 * Sleep until woken up or timeout occurs.
198 *
199 * Will call check_callback(check_arg) after proper synchronisation, and only
200 * if that returns true will it actually sleep, else it will return
201 * immediately. This is needed to avoid races with wakeup.
202 *
203 * Returns 'true' if it actually did sleep.
204 */
205 template<typename T>
206 static inline
207 bool
208 yield(struct thr_wait* wait, const Uint32 nsec,
209 bool (*check_callback)(T*), T* check_arg)
210 {
211 volatile unsigned * val = &wait->m_futex_state;
212 #ifndef NDEBUG
213 int old =
214 #endif
215 xcng(val, thr_wait::FS_SLEEPING);
216 assert(old == thr_wait::FS_RUNNING);
217
218 /**
219 * At this point, we need to re-check the condition that made us decide to
220 * sleep, and skip sleeping if it changed.
221 *
222 * Otherwise, the condition may already have changed, and the thread making
223 * the change may have decided not to wake us, as our state was FS_RUNNING
224 * at the time.
225 *
226 * Also need a memory barrier to ensure this extra check is race-free,
227 * but that is already provided by xcng.
228 */
229 const bool waited = (*check_callback)(check_arg);
230 if (waited)
231 {
232 struct timespec timeout;
233 timeout.tv_sec = 0;
234 timeout.tv_nsec = nsec;
235 futex_wait(val, thr_wait::FS_SLEEPING, &timeout);
236 /**
237 * Any spurious wakeups are handled by simply running the scheduler code.
238 * The check_callback is needed to ensure that we don't miss wakeups. A
239 * spurious wakeup merely causes one extra loop in the scheduler, which is
240 * cheap compared to always scanning the buffers to re-check the condition.
241 */
242 }
243 xcng(val, thr_wait::FS_RUNNING);
244 return waited;
245 }
246
247 static inline
248 int
249 wakeup(struct thr_wait* wait)
250 {
251 volatile unsigned * val = &wait->m_futex_state;
252 /**
253 * We must ensure that any state update (new data in buffers...) are visible
254 * to the other thread before we can look at the sleep state of that other
255 * thread.
256 */
257 if (xcng(val, thr_wait::FS_RUNNING) == thr_wait::FS_SLEEPING)
258 {
259 return futex_wake(val);
260 }
261 return 0;
262 }
263
264 static inline
265 int
266 try_wakeup(struct thr_wait* wait)
267 {
268 return wakeup(wait);
269 }
270 #else
271
272 struct alignas(NDB_CL) thr_wait
273 {
274 NdbMutex *m_mutex;
275 NdbCondition *m_cond;
276 bool m_need_wakeup;
277 thr_wait() : m_mutex(0), m_cond(0), m_need_wakeup(false) {
278 assert((sizeof(*this) % NDB_CL) == 0); //Maintain any CL-alignment
279 }
280
281 void init() {
282 m_mutex = NdbMutex_Create();
283 m_cond = NdbCondition_Create();
284 }
285 };
286
287 template<typename T>
288 static inline
289 bool
290 yield(struct thr_wait* wait, const Uint32 nsec,
291 bool (*check_callback)(T*), T* check_arg)
292 {
293 struct timespec end;
294 NdbCondition_ComputeAbsTime(&end, (nsec >= 1000000) ? nsec/1000000 : 1);
295 NdbMutex_Lock(wait->m_mutex);
296
297 /**
298 * Any spurious wakeups are handled by simply running the scheduler code.
299 * The check_callback is needed to ensure that we don't miss wakeups. A
300 * spurious wakeup merely causes one extra loop in the scheduler, which is
301 * cheap compared to always scanning the buffers to re-check the condition.
302 */
303 Uint32 waits = 0;
304 if ((*check_callback)(check_arg))
305 {
306 wait->m_need_wakeup = true;
307 waits++;
308 if (NdbCondition_WaitTimeoutAbs(wait->m_cond,
309 wait->m_mutex, &end) == ETIMEDOUT)
310 {
311 wait->m_need_wakeup = false;
312 }
313 }
314 NdbMutex_Unlock(wait->m_mutex);
315 return (waits > 0);
316 }
317
318
319 static inline
320 int
321 try_wakeup(struct thr_wait* wait)
322 {
323 int success = NdbMutex_Trylock(wait->m_mutex);
324 if (success != 0)
325 return success;
326
327 // We should avoid signaling when not waiting for wakeup
328 if (wait->m_need_wakeup)
329 {
330 wait->m_need_wakeup = false;
331 NdbCondition_Signal(wait->m_cond);
332 }
333 NdbMutex_Unlock(wait->m_mutex);
334 return 0;
335 }
336
337 static inline
338 int
339 wakeup(struct thr_wait* wait)
340 {
341 NdbMutex_Lock(wait->m_mutex);
342 // We should avoid signaling when not waiting for wakeup
343 if (wait->m_need_wakeup)
344 {
345 wait->m_need_wakeup = false;
346 NdbCondition_Signal(wait->m_cond);
347 }
348 NdbMutex_Unlock(wait->m_mutex);
349 return 0;
350 }
351
352 #endif
353
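/*
 * Usage sketch for the wait primitives above (the callback shown is
 * hypothetical, not taken verbatim from this file):
 *
 *   static bool nothing_to_do(thr_data *thr) { return ...; }
 *
 *   // Consumer side: only sleeps if nothing_to_do() still holds after the
 *   // sleep state has been published, so a concurrent wakeup() is not lost.
 *   yield(&selfptr->m_waiter, 1000 * 1000, nothing_to_do, selfptr);
 *
 *   // Producer side: publish data first, then wake the consumer.
 *   // wakeup() only performs the futex/condition wake if the target
 *   // thread had announced that it was going to sleep.
 *   wakeup(&dstptr->m_waiter);
 */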
354 #define JAM_FILE_ID 236
355
356
357 /**
358 * thr_safe_pool
359 */
360 template<typename T>
361 struct alignas(NDB_CL) thr_safe_pool
362 {
363 struct alignas(NDB_CL) thr_safe_pool_lock
364 {
365 struct thr_spin_lock m_lock;
366
367 T* m_free_list;
368 Uint32 m_cnt;
369 bool m_used_all_reserved;
370 };
371 thr_safe_pool_lock m_safe_lock[MAX_NDBMT_SEND_THREADS];
372 struct thr_spin_lock m_alloc_lock;
373 Uint32 m_allocated;
374
375 thr_safe_pool(const char * name)
376 {
377 m_allocated = 0;
378 for (Uint32 i = 0; i < MAX_NDBMT_SEND_THREADS; i++)
379 {
380 char buf[100];
381 m_safe_lock[i].m_free_list = 0;
382 m_safe_lock[i].m_cnt = 0;
383 m_safe_lock[i].m_used_all_reserved = false;
384 BaseString::snprintf(buf, sizeof(buf), "Global_%s[%u]", name, i);
385 register_lock(&m_safe_lock[i].m_lock, buf);
386 }
387 {
388 char buf[100];
389 BaseString::snprintf(buf, sizeof(buf), "Global_allocated%s", name);
390 register_lock(&m_alloc_lock, buf);
391 }
392 assert((sizeof(*this) % NDB_CL) == 0); //Maintain any CL-alignment
393 }
394
395 T* seize(Ndbd_mem_manager *mm,
396 Uint32 rg)
397 {
398 /* This function is used by job buffer allocation. */
399 Uint32 instance_no = 0;
400 thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance_no];
401 T* ret = 0;
402 lock(&lock_ptr->m_lock);
403 if (lock_ptr->m_free_list)
404 {
405 assert(lock_ptr->m_cnt);
406 lock_ptr->m_cnt--;
407 ret = lock_ptr->m_free_list;
408 lock_ptr->m_free_list = ret->m_next;
409 unlock(&lock_ptr->m_lock);
410 }
411 else
412 {
413 unlock(&lock_ptr->m_lock);
414 Uint32 dummy;
415 ret = reinterpret_cast<T*>
416 (mm->alloc_page(rg, &dummy,
417 Ndbd_mem_manager::NDB_ZONE_LE_32));
418 // ToDo: How to deal with failed allocation?!?
419 // I think in this case we need to start grabbing buffers kept for signal
420 // trace.
421 if (ret != NULL)
422 {
423 lock(&m_alloc_lock);
424 m_allocated++;
425 unlock(&m_alloc_lock);
426 }
427 }
428 return ret;
429 }
430
431 #define RG_REQUIRED_PAGES 96
432 bool found_instance(Uint32 instance,
433 Uint32 & max_found,
434 Uint32 & instance_no)
435 {
436 thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance];
437 Uint32 cnt = lock_ptr->m_cnt;
438 if (cnt > RG_REQUIRED_PAGES)
439 {
440 return true;
441 }
442 if (cnt > max_found)
443 {
444 instance_no = instance;
445 max_found = cnt;
446 }
447 return false;
448 }
449
450 Uint32 get_least_empty_instance(Uint32 skip_instance)
451 {
452 /**
453 * Read without mutex protection since it is ok to not get a perfect
454 * result.
455 */
456 Uint32 instance_no_found = 0;
457 Uint32 cnt_found = 0;
458 for (Uint32 i = skip_instance + 1;
459 i < globalData.ndbMtSendThreads;
460 i++)
461 {
462 if (found_instance(i,
463 cnt_found,
464 instance_no_found))
465 return i;
466 }
467 for (Uint32 i = 0; i < skip_instance; i++)
468 {
469 if (found_instance(i,
470 cnt_found,
471 instance_no_found))
472 return i;
473 }
474 return instance_no_found;
475 }
476
477 Uint32 seize_list(Ndbd_mem_manager *mm,
478 Uint32 rg,
479 Uint32 requested,
480 T** head,
481 T** tail,
482 Uint32 instance_no,
483 bool first_call)
484 {
485 /* This function is used by send buffer allocation. */
486 assert(instance_no < MAX_NDBMT_SEND_THREADS);
487 thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance_no];
488 lock(&lock_ptr->m_lock);
489 if (unlikely(lock_ptr->m_cnt == 0))
490 {
491 unlock(&lock_ptr->m_lock);
492 if (likely(first_call))
493 {
494 /**
495 * No free pages in this instance. We will use the following order
496 * of allocation.
497 *
498 * Case 1: Either no send thread or only one send thread
499 * => Call alloc_page and set use_max_part to true.
500 * If this fails we fail the call.
501 *
502 * Case 2: At least 2 send threads
503 * In this case we will first try to allocate from the memory
504 * manager. But this first call only retrieves from the reserved
505 * part. If we already allocated all from the reserved part we
506 * will skip this call.
507 * Next we will check which instance is the least empty of the
508 * instances. We will try allocating from this instance. The
509 * purpose of this is to avoid allocating beyond the reserved
510 * part as long as possible.
511 * If this call fails as well we will make another call to
512 * alloc_page. This time we will also allow allocations beyond
513 * the reserved part.
514 * If even this fails we will go through the other instances to
515 * see if we can get pages from any instance. Only when this
516 * fails as well will we return no pages found.
517 */
518 Uint32 filled_instance_no = 0;
519 for (Uint32 step = 0; step < 2; step++)
520 {
521 Uint32 dummy;
522 bool locked = false;
523 bool use_max_part = (globalData.ndbMtSendThreads < 2 ||
524 step == 1);
525 if (use_max_part || !lock_ptr->m_used_all_reserved)
526 {
527 T* ret = reinterpret_cast<T*>
528 (mm->alloc_page(rg,
529 &dummy,
530 Ndbd_mem_manager::NDB_ZONE_LE_32,
531 locked,
532 use_max_part));
533 if (ret != 0)
534 {
535 ret->m_next = 0;
536 * head = * tail = ret;
537 if (ret != NULL)
538 {
539 lock(&m_alloc_lock);
540 m_allocated++;
541 unlock(&m_alloc_lock);
542 }
543 return 1;
544 }
545 /**
546 * This will only transition from false to true, so no need
547 * to protect it with mutex.
548 */
549 lock_ptr->m_used_all_reserved = true;
550 }
551 /**
552 * No more memory available from global memory, let's see if we
553 * can steal some memory from a neighbour instance.
554 *
555 * This is the call from the local pool, we want to avoid
556 * failing this call since it means we are announcing that we
557 * are out of memory. Try all the other instances before we
558 * move on to requesting memory from the global pool of memory.
559 * We first attempt with the most filled instance, we find this
560 * without acquiring any mutex.
561 */
562 if (globalData.ndbMtSendThreads < 2)
563 {
564 return 0;
565 }
566 if (step == 0)
567 {
568 filled_instance_no = get_least_empty_instance(instance_no);
569 Uint32 returned = seize_list(mm,
570 rg,
571 requested,
572 head,
573 tail,
574 filled_instance_no,
575 false);
576 if (likely(returned > 0))
577 {
578 return returned;
579 }
580 }
581 else
582 {
583 for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
584 {
585 if (i != instance_no &&
586 i != filled_instance_no)
587 {
588 Uint32 returned = seize_list(mm,
589 rg,
590 requested,
591 head,
592 tail,
593 i,
594 false);
595 if (returned != 0)
596 {
597 ndbout_c("seize_list: returns %u from instance %u",
598 returned,
599 i);
600 return returned;
601 }
602 }
603 }
604 }
605 }
606 return 0;
607 }
608 else
609 {
610 return 0;
611 }
612 }
613 else
614 {
615 if (lock_ptr->m_cnt < requested )
616 requested = lock_ptr->m_cnt;
617
618 T* first = lock_ptr->m_free_list;
619 T* last = first;
620 for (Uint32 i = 1; i < requested; i++)
621 {
622 last = last->m_next;
623 }
624 lock_ptr->m_cnt -= requested;
625 lock_ptr->m_free_list = last->m_next;
626 unlock(&lock_ptr->m_lock);
627 last->m_next = 0;
628 * head = first;
629 * tail = last;
630 return requested;
631 }
632 }
633
634 void release(Ndbd_mem_manager *mm,
635 Uint32 rg,
636 T* t)
637 {
638 /* This function is used by job buffer release. */
639 Uint32 instance_no = 0;
640 thr_safe_pool_lock *lock_ptr = &m_safe_lock[instance_no];
641 lock(&lock_ptr->m_lock);
642 t->m_next = lock_ptr->m_free_list;
643 lock_ptr->m_free_list = t;
644 lock_ptr->m_cnt++;
645 unlock(&lock_ptr->m_lock);
646 }
647
648 void release_list(Ndbd_mem_manager *mm,
649 Uint32 rg,
650 T* head,
651 T* tail,
652 Uint32 cnt,
653 Uint32 instance_no)
654 {
655 /* This function is used by send buffer release. */
656 assert(instance_no < MAX_NDBMT_SEND_THREADS);
657 Uint32 used_instance_no = instance_no;
658 thr_safe_pool_lock *lock_ptr = &m_safe_lock[used_instance_no];
659 lock(&lock_ptr->m_lock);
660 tail->m_next = lock_ptr->m_free_list;
661 lock_ptr->m_free_list = head;
662 lock_ptr->m_cnt += cnt;
663 unlock(&lock_ptr->m_lock);
664 }
665 };
666
667 /**
668 * thread_local_pool
669 */
670 template<typename T>
671 class thread_local_pool
672 {
673 public:
674 thread_local_pool(thr_safe_pool<T> *global_pool,
675 unsigned max_free, unsigned alloc_size = 1) :
676 m_max_free(max_free),
677 m_alloc_size(alloc_size),
678 m_free(0),
679 m_freelist(0),
680 m_global_pool(global_pool)
681 {
682 }
683
684 T *seize(Ndbd_mem_manager *mm,
685 Uint32 rg,
686 Uint32 instance_no)
687 {
688 T *tmp = m_freelist;
689 if (tmp == 0)
690 {
691 T * tail;
692 m_free = m_global_pool->seize_list(mm,
693 rg,
694 m_alloc_size,
695 &tmp,
696 &tail,
697 instance_no,
698 true);
699 }
700 if (tmp)
701 {
702 m_freelist = tmp->m_next;
703 assert(m_free > 0);
704 m_free--;
705 }
706
707 validate();
708 return tmp;
709 }
710
711 /**
712 * Release to local pool even if it gets "too" full
713 * (wrt m_max_free)
714 */
715 void release_local(T *t)
716 {
717 m_free++;
718 t->m_next = m_freelist;
719 m_freelist = t;
720
721 validate();
722 }
723
724 void validate() const
725 {
726 #ifdef VM_TRACE
727 Uint32 cnt = 0;
728 T* t = m_freelist;
729 while (t)
730 {
731 cnt++;
732 t = t->m_next;
733 }
734 assert(cnt == m_free);
735 #endif
736 }
737
738 /**
739 * Release entries so that m_max_free is honored
740 * (likely used together with release_local)
741 */
742 void release_global(Ndbd_mem_manager *mm,
743 Uint32 rg,
744 Uint32 instance_no)
745 {
746 validate();
747 unsigned free = m_free;
748 Uint32 maxfree = m_max_free;
749 assert(maxfree > 0);
750
751 if (unlikely(free > maxfree))
752 {
753 T* head = m_freelist;
754 T* tail = m_freelist;
755 unsigned cnt = 1;
756 free--;
757
758 while (free > maxfree)
759 {
760 cnt++;
761 free--;
762 tail = tail->m_next;
763 }
764
765 assert(free == maxfree);
766
767 m_free = free;
768 m_freelist = tail->m_next;
769 m_global_pool->release_list(mm,
770 rg,
771 head,
772 tail,
773 cnt,
774 instance_no);
775 }
776 validate();
777 }
778
779 void release_all(Ndbd_mem_manager *mm,
780 Uint32 rg,
781 Uint32 instance_no)
782 {
783 validate();
784 T* head = m_freelist;
785 T* tail = m_freelist;
786 if (tail)
787 {
788 unsigned cnt = 1;
789 while (tail->m_next != 0)
790 {
791 cnt++;
792 tail = tail->m_next;
793 }
794 m_global_pool->release_list(mm,
795 rg,
796 head,
797 tail,
798 cnt,
799 instance_no);
800 m_free = 0;
801 m_freelist = 0;
802 }
803 validate();
804 }
805
806 /**
807 * release everything if more than m_max_free
808 * else do nothing
809 */
810 void release_chunk(Ndbd_mem_manager *mm,
811 Uint32 rg,
812 Uint32 instance_no)
813 {
814 if (m_free > m_max_free)
815 {
816 release_all(mm, rg, instance_no);
817 }
818 }
819
820 /**
821 * prealloc up to <em>cnt</em> pages into this pool
822 */
823 bool fill(Ndbd_mem_manager *mm,
824 Uint32 rg,
825 Uint32 cnt,
826 Uint32 instance_no)
827 {
828 if (m_free >= cnt)
829 {
830 return true;
831 }
832
833 T *head, *tail;
834 Uint32 allocated = m_global_pool->seize_list(mm,
835 rg,
836 m_alloc_size,
837 &head,
838 &tail,
839 instance_no,
840 true);
841 if (allocated)
842 {
843 tail->m_next = m_freelist;
844 m_freelist = head;
845 m_free += allocated;
846 return m_free >= cnt;
847 }
848
849 return false;
850 }
851
852 void set_pool(thr_safe_pool<T> * pool) { m_global_pool = pool; }
853
854 private:
855 const unsigned m_max_free;
856 const unsigned m_alloc_size;
857 unsigned m_free;
858 T *m_freelist;
859 thr_safe_pool<T> *m_global_pool;
860 };
861
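/*
 * Typical usage pattern for thread_local_pool (a sketch under the
 * assumption of a send-page pool; the constants and the global repository
 * referenced here are defined further below in this file):
 *
 *   thread_local_pool<thr_send_page> pool(&g_thr_repository->m_sb_pool,
 *                                         THR_SEND_BUFFER_MAX_FREE,
 *                                         THR_SEND_BUFFER_ALLOC_SIZE);
 *   thr_send_page *p = pool.seize(mm, rg, instance_no); // local free list,
 *                                                       // else global pool
 *   ...
 *   pool.release_local(p);                    // may overfill the local list
 *   pool.release_global(mm, rg, instance_no); // trim back to m_max_free
 */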
862 /**
863 * Signal buffers.
864 *
865 * Each thread job queue contains a list of these buffers with signals.
866 *
867 * There is an underlying assumption that the size of this structure is the
868 * same as the global memory manager page size.
869 */
870 struct thr_job_buffer // 32k
871 {
872 static const unsigned SIZE = 8190;
873
874 /*
875 * Amount of signal data currently in m_data buffer.
876 * Read/written by producer, read by consumer.
877 */
878 Uint32 m_len;
879 /*
880 * Whether this buffer contained prio A or prio B signals, used when dumping
881 * signals from released buffers.
882 */
883 Uint32 m_prioa;
884 union {
885 Uint32 m_data[SIZE];
886
887 thr_job_buffer * m_next; // For free-list
888 };
889 };
890
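/**
 * Number of occupied slots in a circular FIFO of size 'sz', given read
 * index 'ri' and write index 'wi' (descriptive note; both indexes are
 * assumed to wrap at 'sz'). E.g. ri=30, wi=2, sz=32 -> (32 - 30) + 2 = 4.
 */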
891 static
892 inline
893 Uint32
894 calc_fifo_used(Uint32 ri, Uint32 wi, Uint32 sz)
895 {
896 return (wi >= ri) ? wi - ri : (sz - ri) + wi;
897 }
898
899 /**
900 * thr_job_queue is shared between consumer / producer.
901 *
902 * The hot-spot of the thr_job_queue are the read/write indexes.
903 * As they are updated and read frequently they have been placed
904 * in its own thr_job_queue_head[] in order to make them fit inside a
905 * single/few cache lines and thereby avoid complete L1-cache replacement
906 * every time the job_queue is scanned.
907 */
908 struct thr_job_queue_head
909 {
910 unsigned m_read_index; // Read/written by consumer, read by producer
911 unsigned m_write_index; // Read/written by producer, read by consumer
912
913 /**
914 * Waiter object: In case job queue is full, the producer thread
915 * will 'yield' on this waiter object until the consumer thread
916 * has consumed (at least) a job buffer.
917 */
918 thr_wait m_waiter;
919
920 Uint32 used() const;
921 };
922
923 struct thr_job_queue
924 {
925 static const unsigned SIZE = 32;
926
927 /**
928 * There is a SAFETY limit on free buffers we never allocate,
929 * but may allow these to be implicitly used as a last resort
930 * when job scheduler is really stuck. ('sleeploop 10')
931 */
932 static const unsigned SAFETY = 2;
933
934 /**
935 * Some more free buffers are RESERVED to be used to avoid
936 * or resolve circular wait-locks between threads waiting
937 * for buffers to become available.
938 */
939 static const unsigned RESERVED = 4;
940
941 /**
942 * When free buffer count drops below ALMOST_FULL, we
943 * are allowed to start using RESERVED buffers to prevent
944 * circular wait-locks.
945 */
946 static const unsigned ALMOST_FULL = RESERVED + 2;
947
948 struct thr_job_buffer* m_buffers[SIZE];
949 };
950
951 inline
952 Uint32
953 thr_job_queue_head::used() const
954 {
955 return calc_fifo_used(m_read_index, m_write_index, thr_job_queue::SIZE);
956 }
957
958 /*
959 * Two structures tightly associated with thr_job_queue.
960 *
961 * There will generally be exactly one thr_jb_read_state and one
962 * thr_jb_write_state associated with each thr_job_queue.
963 *
964 * The reason they are kept separate is to avoid unnecessary inter-CPU
965 * cache line pollution. All fields shared among producer and consumer
966 * threads are in thr_job_queue, thr_jb_write_state fields are only
967 * accessed by the producer thread(s), and thr_jb_read_state fields are
968 * only accessed by the consumer thread.
969 *
970 * For example, on Intel core 2 quad processors, there is a ~33%
971 * penalty for two cores accessing the same 64-byte cacheline.
972 */
973 struct thr_jb_write_state
974 {
975 /*
976 * The position to insert the next signal into the queue.
977 *
978 * m_write_index is the index into thr_job_queue::m_buffers[] of the buffer
979 * to insert into, and m_write_pos is the index into thr_job_buffer::m_data[]
980 * at which to store the next signal.
981 */
982 Uint32 m_write_index;
983 Uint32 m_write_pos;
984
985 /* Thread-local copy of thr_job_queue::m_buffers[m_write_index]. */
986 thr_job_buffer *m_write_buffer;
987
988 /**
989 Number of signals inserted since last flush to thr_job_queue.
990 This variable stores the number of pending signals not yet flushed
991 in the lower 16 bits and the number of pending signals before a
992 wakeup is called of the other side in the upper 16 bits. To
993 simplify the code we implement the bit manipulations in the
994 methods below.
995
996 The reason for this optimisation is to minimise use of memory for
997 these variables as they are likely to consume CPU cache memory.
998 It also speeds up some pending signal checks.
999 */
1000 Uint32 m_pending_signals;
1001
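/*
 * Illustrative example of the packing described above: with
 * m_pending_signals == 0x0003000A, get_pending_signals() returns 10
 * (signals not yet flushed) and get_pending_signals_wakeup() returns 3
 * (signals pending before a wakeup of the other side).
 */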
1002 bool has_any_pending_signals() const
1003 {
1004 return m_pending_signals;
1005 }
1006 Uint32 get_pending_signals() const
1007 {
1008 return (m_pending_signals & 0xFFFF);
1009 }
1010 Uint32 get_pending_signals_wakeup() const
1011 {
1012 return (m_pending_signals >> 16);
1013 }
1014 void clear_pending_signals_and_set_wakeup(Uint32 wakeups)
1015 {
1016 m_pending_signals = (wakeups << 16);
1017 }
1018 void increment_pending_signals()
1019 {
1020 m_pending_signals++;
1021 }
1022 void init_pending_signals()
1023 {
1024 m_pending_signals = 0;
1025 }
1026
1027 /*
1028 * Is this job buffer open for communication at all?
1029 * Several thread pairs are not expected to communicate, and thus do
1030 * not allocate a thr_job_buffer for exchange of signals.
1031 * Don't access any job_buffers without ensuring 'is_open()==true'.
1032 */
1033 bool is_open() const
1034 {
1035 return (m_write_buffer != NULL);
1036 }
1037 };
1038
1039 /**
1040 * Identify type of thread.
1041 * Based on assumption that threads are allocated in the order:
1042 * main, ldm, tc, recv, send
1043 */
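/*
 * Illustrative numbering under that assumption (hypothetical config with
 * 4 LDM, 2 TC and 1 receive thread):
 *   thr_no 0 .. NUM_MAIN_THREADS-1  -> main/rep threads
 *   next 4                          -> LDM threads
 *   next 2                          -> TC threads
 *   next 1                          -> receive thread
 * Send threads are not block threads; they are numbered from
 * glob_num_threads and upwards (see is_send_thread() further below).
 */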
1044 static bool
1045 is_main_thread(unsigned thr_no)
1046 {
1047 return thr_no < NUM_MAIN_THREADS;
1048 }
1049
1050 static bool
1051 is_ldm_thread(unsigned thr_no)
1052 {
1053 return thr_no >= NUM_MAIN_THREADS &&
1054 thr_no < NUM_MAIN_THREADS+globalData.ndbMtLqhThreads;
1055 }
1056
1057 /**
1058 * Not all LDM threads are created equal:
1059 * The first LDM's BACKUP-thread acts as client during BACKUP
1060 * (See usage of Backup::UserBackupInstanceKey)
1061 */
1062 static bool
1063 is_first_ldm_thread(unsigned thr_no)
1064 {
1065 return thr_no == NUM_MAIN_THREADS;
1066 }
1067
1068 static bool
1069 is_tc_thread(unsigned thr_no)
1070 {
1071 unsigned tc_base = NUM_MAIN_THREADS+globalData.ndbMtLqhThreads;
1072 return thr_no >= tc_base &&
1073 thr_no < tc_base+globalData.ndbMtTcThreads;
1074 }
1075
1076 static bool
1077 is_recv_thread(unsigned thr_no)
1078 {
1079 unsigned recv_base = NUM_MAIN_THREADS +
1080 globalData.ndbMtLqhThreads +
1081 globalData.ndbMtTcThreads;
1082 return thr_no >= recv_base &&
1083 thr_no < recv_base+globalData.ndbMtReceiveThreads;
1084 }
1085
1086 /*
1087 * This structure is also used when dumping signal traces, to dump executed
1088 * signals from the buffer(s) currently being processed.
1089 */
1090 struct thr_jb_read_state
1091 {
1092 /*
1093 * Index into thr_job_queue::m_buffers[] of the buffer that we are currently
1094 * executing signals from.
1095 */
1096 Uint32 m_read_index;
1097 /*
1098 * Index into m_read_buffer->m_data[] of the next signal to execute from the
1099 * current buffer.
1100 */
1101 Uint32 m_read_pos;
1102 /*
1103 * Thread local copy of thr_job_queue::m_buffers[m_read_index].
1104 */
1105 thr_job_buffer *m_read_buffer;
1106 /*
1107 * These are thread-local copies of thr_job_queue::m_write_index and
1108 * thr_job_buffer::m_len. They are read once at the start of the signal
1109 * execution loop and used to determine when the end of available signals is
1110 * reached.
1111 */
1112 Uint32 m_read_end; // End within current thr_job_buffer. (*m_read_buffer)
1113
1114 Uint32 m_write_index; // Last available thr_job_buffer.
1115
1116 /*
1117 * Is this job buffer open for communication at all?
1118 * Several thread pairs are not expected to communicate, and thus do
1119 * not allocate a thr_job_buffer for exchange of signals.
1120 * Don't access any job_buffers without ensuring 'is_open()==true'.
1121 */
1122 bool is_open() const
1123 {
1124 return (m_read_buffer != NULL);
1125 }
1126
1127 bool is_empty() const
1128 {
1129 assert(m_read_index != m_write_index || m_read_pos <= m_read_end);
1130 return (m_read_index == m_write_index) && (m_read_pos >= m_read_end);
1131 }
1132 };
1133
1134 /**
1135 * time-queue
1136 */
1137 struct thr_tq
1138 {
1139 static const unsigned ZQ_SIZE = 256;
1140 static const unsigned SQ_SIZE = 512;
1141 static const unsigned LQ_SIZE = 512;
1142 static const unsigned PAGES = (MAX_SIGNAL_SIZE *
1143 (ZQ_SIZE + SQ_SIZE + LQ_SIZE)) / 8192;
1144
1145 Uint32 * m_delayed_signals[PAGES];
1146 Uint32 m_next_free;
1147 Uint32 m_next_timer;
1148 Uint32 m_current_time;
1149 Uint32 m_cnt[3];
1150 Uint32 m_zero_queue[ZQ_SIZE];
1151 Uint32 m_short_queue[SQ_SIZE];
1152 Uint32 m_long_queue[LQ_SIZE];
1153 };
1154
1155 /**
1156 * THR_SEND_BUFFER_ALLOC_SIZE is the amount of 32k pages allocated
1157 * when we allocate pages from the global pool of send buffers to
1158 * the thread_local_pool (which is local to a thread).
1159 *
1160 * We allocate a bunch to decrease contention on send-buffer-pool-mutex
1161 */
1162 #define THR_SEND_BUFFER_ALLOC_SIZE 32
1163
1164 /**
1165 * THR_SEND_BUFFER_PRE_ALLOC is the amount of 32k pages that are
1166 * allocated before we start to run signals
1167 */
1168 #define THR_SEND_BUFFER_PRE_ALLOC 32
1169
1170 /**
1171 * Amount of pages that is allowed to linger in a
1172 * thread-local send-buffer pool
1173 */
1174 #define THR_SEND_BUFFER_MAX_FREE \
1175 (THR_SEND_BUFFER_ALLOC_SIZE + THR_SEND_BUFFER_PRE_ALLOC - 1)
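/*
 * With the values above this evaluates to 32 + 32 - 1 = 63 pages of 32k
 * each, i.e. roughly 2MB may linger in a thread-local send-buffer pool.
 */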
1176
1177 /*
1178 * Max number of thread-local job buffers to keep before releasing to
1179 * global pool.
1180 */
1181 #define THR_FREE_BUF_MAX 32
1182 /* Minimum number of buffers (to ensure useful trace dumps). */
1183 #define THR_FREE_BUF_MIN 12
1184 /*
1185 * 1/THR_FREE_BUF_BATCH is the fraction of job buffers to allocate/free
1186 * at a time from/to global pool.
1187 */
1188 #define THR_FREE_BUF_BATCH 6
1189
1190 /**
1191 * a page with send data
1192 */
1193 struct thr_send_page
1194 {
1195 static const Uint32 PGSIZE = 32768;
1196 #if SIZEOF_CHARP == 4
1197 static const Uint32 HEADER_SIZE = 8;
1198 #else
1199 static const Uint32 HEADER_SIZE = 12;
1200 #endif
1201
1202 static Uint32 max_bytes() {
1203 return PGSIZE - offsetof(thr_send_page, m_data);
1204 }
1205
1206 /* Next page */
1207 thr_send_page* m_next;
1208
1209 /* Bytes of send data available in this page. */
1210 Uint16 m_bytes;
1211
1212 /* Start of unsent data */
1213 Uint16 m_start;
1214
1215 /* Data; real size is to the end of one page. */
1216 char m_data[2];
1217 };
1218
1219 /**
1220 * a linked list with thr_send_page
1221 */
1222 struct thr_send_buffer
1223 {
1224 thr_send_page* m_first_page;
1225 thr_send_page* m_last_page;
1226 };
1227
1228 /**
1229 * a ring buffer with linked list of thr_send_page
1230 */
1231 struct thr_send_queue
1232 {
1233 unsigned m_write_index;
1234 #if SIZEOF_CHARP == 8
1235 unsigned m_unused;
1236 thr_send_page* m_buffers[7];
1237 static const unsigned SIZE = 7;
1238 #else
1239 thr_send_page* m_buffers[15];
1240 static const unsigned SIZE = 15;
1241 #endif
1242 };
1243
1244 struct thr_send_thread_instance;
1245
1246 struct alignas(NDB_CL) thr_data
1247 {
1248 thr_data() : m_jba_write_lock("jbalock"),
1249 m_signal_id_counter(0),
1250 m_send_buffer_pool(0,
1251 THR_SEND_BUFFER_MAX_FREE,
1252 THR_SEND_BUFFER_ALLOC_SIZE)
1253 #if defined(USE_INIT_GLOBAL_VARIABLES)
1254 ,m_global_variables_ptr_instances(0)
1255 ,m_global_variables_uint32_ptr_instances(0)
1256 ,m_global_variables_uint32_instances(0)
1257 ,m_global_variables_enabled(true)
1258 #endif
1259 {
1260
1261 // Check cacheline alignment
1262 assert((((UintPtr)this) % NDB_CL) == 0);
1263 assert((((UintPtr)&m_waiter) % NDB_CL) == 0);
1264 assert((((UintPtr)&m_jba_write_lock) % NDB_CL) == 0);
1265 assert((((UintPtr)&m_jba) % NDB_CL) == 0);
1266 assert((((UintPtr)m_in_queue_head) % NDB_CL) == 0);
1267 assert((((UintPtr)m_in_queue) % NDB_CL) == 0);
1268 }
1269
1270 /**
1271 * We start with the data structures that are shared globally to
1272 * ensure that they get the proper cache line alignment
1273 */
1274 thr_wait m_waiter; /* Cacheline aligned*/
1275
1276 /*
1277 * Prio A signal incoming queue. This area is used from many threads
1278 * protected by the spin lock. Thus it is also important to protect
1279 * surrounding thread-local variables from CPU cache line sharing
1280 * with this part.
1281 */
1282 alignas(NDB_CL) struct thr_spin_lock m_jba_write_lock;
1283 alignas(NDB_CL) struct thr_job_queue m_jba;
1284 struct thr_job_queue_head m_jba_head;
1285
1286 /*
1287 * These are the thread input queues, where other threads deliver signals
1288 * into.
1289 * These cache lines are going to be updated by many different CPU's
1290 * all the time whereas other neighbour variables are thread-local variables.
1291 * Avoid false cacheline sharing by requiring an alignment.
1292 */
1293 alignas(NDB_CL) struct thr_job_queue_head m_in_queue_head[MAX_BLOCK_THREADS];
1294 alignas(NDB_CL) struct thr_job_queue m_in_queue[MAX_BLOCK_THREADS];
1295
1296 /**
1297 * The remainder of the variables in thr_data are thread-local,
1298 * meaning that they are always updated by the thread that owns those
1299 * data structures and thus those variables aren't shared with other
1300 * CPUs.
1301 */
1302
1303 unsigned m_thr_no;
1304
1305 /**
1306 * Thread 0 doesn't necessarily handle all threads in a loop.
1307 * This variable keeps track of which to handle next.
1308 */
1309 unsigned m_next_jbb_no;
1310
1311 /**
1312 * Spin time of thread after completing all its work (in microseconds).
1313 * We won't go to sleep until we have spun for sufficient time; the aim
1314 * is to increase readiness in systems with plenty of CPU resources.
1315 */
1316 unsigned m_spintime;
1317 unsigned m_conf_spintime;
1318
1319 /**
1320 * nosend option on a thread means that it will never assist with sending.
1321 */
1322 unsigned m_nosend;
1323
1324 /**
1325 * Realtime scheduler activated for this thread. This means this
1326 * thread will run at a very high priority even beyond the priority
1327 * of the OS.
1328 */
1329 unsigned m_realtime;
1330
1331 /**
1332 * Index of thread locally in Configuration.cpp
1333 */
1334 unsigned m_thr_index;
1335
1336 /**
1337 * max signals to execute per JBB buffer
1338 */
1339 unsigned m_max_signals_per_jb;
1340
1341 /**
1342 * This state shows how much assistance we are to provide to the
1343 * send threads in sending. At OVERLOAD we provide no assistance,
1344 * at MEDIUM we take care of our own generated sends, and
1345 * at LIGHT we provide some assistance to other threads.
1346 */
1347 OverloadStatus m_overload_status;
1348
1349 /**
1350 * This is the wakeup instance that we currently use, if 0 it
1351 * means that we don't wake any other block thread up to
1352 * assist in sending. This is a simple way of using idle
1353 * block threads to act as send threads instead of simply
1354 * being idle. In particular this is often used for the main
1355 * thread and the rep thread.
1356 */
1357 Uint32 m_wakeup_instance;
1358
1359 /**
1360 * This variable keeps track of when we last woke up another thread
1361 * to assist the send thread. We use other timeout calls for this.
1362 */
1363 NDB_TICKS m_last_wakeup_idle_thread;
1364
1365 /**
1366 * We also keep track of node state, this is in overload state
1367 * if any thread is in OVERLOAD state. In this state we will
1368 * sleep shorter times and be more active in waking up to
1369 * assist the send threads.
1370 */
1371 OverloadStatus m_node_overload_status;
1372
1373 /**
1374 * Extra JBB signal execute quota allowed to be used to
1375 * drain (almost) full in-buffers. Reserved for usage where
1376 * we are about to end up in a circular wait-lock between
1377 * threads where none of them will be able to proceed.
1378 */
1379 unsigned m_max_extra_signals;
1380
1381 /**
1382 * max signals to execute before recomputing m_max_signals_per_jb
1383 */
1384 unsigned m_max_exec_signals;
1385
1386 /**
1387 * Flag indicating that we have sent a local Prio A signal. Used to know
1388 * whether to scan for more prio A signals after executing those signals.
1389 * This is used to ensure that if we execute at prio A level and send a
1390 * prio A signal it will be immediately executed (or at least before any
1391 * prio B signal).
1392 */
1393 bool m_sent_local_prioa_signal;
1394
1395 /* Last read of current ticks */
1396 NDB_TICKS m_curr_ticks;
1397
1398 NDB_TICKS m_ticks;
1399 struct thr_tq m_tq;
1400
1401 /**
1402 * If the thread overslept it is interesting to see how much time was actually
1403 * spent executing and how much was idle time. This will help to
1404 * see whether oversleeping is due to long-running signals or the OS not scheduling the
1405 * thread.
1406 *
1407 * We keep the real time last we made scan of time queues to ensure we can
1408 * report proper things in warning messages.
1409 */
1410 NDB_TICKS m_scan_real_ticks;
1411 struct ndb_rusage m_scan_time_queue_rusage;
1412
1413 /*
1414 * In m_next_buffer we keep a free buffer at all times, so that when
1415 * we hold the lock and find we need a new buffer, we can use this one and
1416 * thereby defer allocation until after releasing the lock.
1417 */
1418 struct thr_job_buffer* m_next_buffer;
1419
1420 /*
1421 * We keep a small number of buffers in a thread-local cyclic FIFO, so that
1422 * we can avoid going to the global pool in most cases, and so that we have
1423 * recent buffers available for dumping in trace files.
1424 */
1425 struct thr_job_buffer *m_free_fifo[THR_FREE_BUF_MAX];
1426 /* m_first_free is the index of the entry to return next from seize(). */
1427 Uint32 m_first_free;
1428 /* m_first_unused is the first unused entry in m_free_fifo. */
1429 Uint32 m_first_unused;
1430
1431
1432 /* Thread-local read state of prio A buffer. */
1433 struct thr_jb_read_state m_jba_read_state;
1434
1435 /*
1436 * There is no m_jba_write_state, as we have multiple writers to the prio A
1437 * queue, so local state becomes invalid as soon as we release the lock.
1438 */
1439
1440 /* These are the write states of m_in_queue[self] in each thread. */
1441 struct thr_jb_write_state m_write_states[MAX_BLOCK_THREADS];
1442 /* These are the read states of all of our own m_in_queue[]. */
1443 struct thr_jb_read_state m_read_states[MAX_BLOCK_THREADS];
1444
1445 /* Jam buffers for making trace files at crashes. */
1446 EmulatedJamBuffer m_jam;
1447 /* Watchdog counter for this thread. */
1448 Uint32 m_watchdog_counter;
1449 /* Latest executed signal id assigned in this thread */
1450 Uint32 m_signal_id_counter;
1451
1452 struct thr_send_thread_instance *m_send_instance;
1453 Uint32 m_send_instance_no;
1454
1455 /* Signal delivery statistics. */
1456 struct
1457 {
1458 Uint64 m_loop_cnt;
1459 Uint64 m_exec_cnt;
1460 Uint64 m_wait_cnt;
1461 Uint64 m_prioa_count;
1462 Uint64 m_prioa_size;
1463 Uint64 m_priob_count;
1464 Uint64 m_priob_size;
1465 } m_stat;
1466
1467 struct
1468 {
1469 Uint32 m_sleep_longer_spin_time;
1470 Uint32 m_sleep_shorter_spin_time;
1471 Uint32 m_num_waits;
1472 Uint32 m_micros_sleep_times[NUM_SPIN_INTERVALS];
1473 Uint32 m_spin_interval[NUM_SPIN_INTERVALS];
1474 } m_spin_stat;
1475
1476 Uint64 m_micros_send;
1477 Uint64 m_micros_sleep;
1478 Uint64 m_buffer_full_micros_sleep;
1479 Uint64 m_measured_spintime;
1480
1481 /* Array of trp ids with pending remote send data. */
1482 TrpId m_pending_send_trps[MAX_NTRANSPORTERS];
1483 /* Number of trp ids in m_pending_send_trps. */
1484 Uint32 m_pending_send_count;
1485
1486 /**
1487 * Bitmap of pending ids with send data.
1488 * Used to quickly check if a trp id is already in m_pending_send_trps.
1489 */
1490 Bitmask<(MAX_NTRANSPORTERS+31)/32> m_pending_send_mask;
1491
1492 /* pool for send buffers */
1493 class thread_local_pool<thr_send_page> m_send_buffer_pool;
1494
1495 /* Send buffer for this thread, these are not touched by any other thread */
1496 struct thr_send_buffer m_send_buffers[MAX_NTRANSPORTERS];
1497
1498 /* Block instances (main and worker) handled by this thread. */
1499 /* Used for sendpacked (send-at-job-buffer-end). */
1500 Uint32 m_instance_count;
1501 BlockNumber m_instance_list[MAX_INSTANCES_PER_THREAD];
1502
1503 SectionSegmentPool::Cache m_sectionPoolCache;
1504
1505 Uint32 m_cpu;
1506 my_thread_t m_thr_id;
1507 NdbThread* m_thread;
1508 Signal *m_signal;
1509 Uint32 m_sched_responsiveness;
1510 Uint32 m_max_signals_before_send;
1511 Uint32 m_max_signals_before_send_flush;
1512
1513 #ifdef ERROR_INSERT
1514 bool m_delayed_prepare;
1515 #endif
1516
1517 #if defined (USE_INIT_GLOBAL_VARIABLES)
1518 Uint32 m_global_variables_ptr_instances;
1519 Uint32 m_global_variables_uint32_ptr_instances;
1520 Uint32 m_global_variables_uint32_instances;
1521 bool m_global_variables_enabled;
1522 void* m_global_variables_ptrs[1024];
1523 void* m_global_variables_uint32_ptrs[1024];
1524 void* m_global_variables_uint32[1024];
1525 #endif
1526 };
1527
1528 struct mt_send_handle : public TransporterSendBufferHandle
1529 {
1530 struct thr_data * m_selfptr;
1531 mt_send_handle(thr_data* ptr) : m_selfptr(ptr) {}
1532 virtual ~mt_send_handle() {}
1533
1534 virtual Uint32 *getWritePtr(NodeId nodeId,
1535 TrpId trp_id,
1536 Uint32 len,
1537 Uint32 prio,
1538 Uint32 max,
1539 SendStatus *error);
1540 virtual Uint32 updateWritePtr(NodeId nodeId,
1541 TrpId trp_id,
1542 Uint32 lenBytes,
1543 Uint32 prio);
1544 virtual void getSendBufferLevel(NodeId node_id, SB_LevelType &level);
1545 virtual bool forceSend(NodeId, TrpId);
1546 };
1547
1548 struct trp_callback : public TransporterCallback
1549 {
1550 trp_callback() {}
1551
1552 /* Callback interface. */
1553 void enable_send_buffer(NodeId, TrpId);
1554 void disable_send_buffer(NodeId, TrpId);
1555
1556 void reportSendLen(NodeId nodeId, Uint32 count, Uint64 bytes);
1557 void lock_transporter(NodeId, TrpId);
1558 void unlock_transporter(NodeId, TrpId);
1559 void lock_send_transporter(NodeId, TrpId);
1560 void unlock_send_transporter(NodeId, TrpId);
1561 Uint32 get_bytes_to_send_iovec(NodeId nodeId,
1562 TrpId trp_id,
1563 struct iovec *dst,
1564 Uint32 max);
1565 Uint32 bytes_sent(NodeId, TrpId, Uint32 bytes);
1566 };
1567
1568 static char *g_thr_repository_mem = NULL;
1569 static struct thr_repository *g_thr_repository = NULL;
1570
1571 struct thr_repository
1572 {
1573 thr_repository() :
1574 m_section_lock("sectionlock"),
1575 m_mem_manager_lock("memmanagerlock"),
1576 m_jb_pool("jobbufferpool"),
1577 m_sb_pool("sendbufferpool")
1578 {
1579 // Verify assumed cacheline alignment
1580 assert((((UintPtr)this) % NDB_CL) == 0);
1581 assert((((UintPtr)&m_receive_lock) % NDB_CL) == 0);
1582 assert((((UintPtr)&m_section_lock) % NDB_CL) == 0);
1583 assert((((UintPtr)&m_mem_manager_lock) % NDB_CL) == 0);
1584 assert((((UintPtr)&m_jb_pool) % NDB_CL) == 0);
1585 assert((((UintPtr)&m_sb_pool) % NDB_CL) == 0);
1586 assert((((UintPtr)m_thread) % NDB_CL) == 0);
1587 assert((sizeof(m_receive_lock[0]) % NDB_CL) == 0);
1588 }
1589
1590 /**
1591 * m_receive_lock, m_section_lock, m_mem_manager_lock, m_jb_pool
1592 * and m_sb_pool are all variables globally shared among the threads
1593 * and also heavily updated.
1594 * Requiring alignments avoid false cache line sharing.
1595 */
1596 thr_aligned_spin_lock m_receive_lock[MAX_NDBMT_RECEIVE_THREADS];
1597
1598 alignas(NDB_CL) struct thr_spin_lock m_section_lock;
1599 alignas(NDB_CL) struct thr_spin_lock m_mem_manager_lock;
1600 alignas(NDB_CL) struct thr_safe_pool<thr_job_buffer> m_jb_pool;
1601 alignas(NDB_CL) struct thr_safe_pool<thr_send_page> m_sb_pool;
1602
1603 /* m_mm and m_thread_count are globally shared and read only variables */
1604 Ndbd_mem_manager * m_mm;
1605 unsigned m_thread_count;
1606
1607 /**
1608 * Protect m_mm and m_thread_count from CPU cache misses; the first
1609 * part of m_thread (struct thr_data) consists of globally shared variables,
1610 * so sharing a cache line between them and these read-only variables
1611 * isn't a good idea.
1612 */
1613 alignas(NDB_CL) struct thr_data m_thread[MAX_BLOCK_THREADS];
1614
1615 /* The buffers that are to be sent */
1616 struct send_buffer
1617 {
1618 /**
1619 * In order to reduce lock contention while
1620 * adding job buffer pages to the send buffers,
1621 * and sending these with the help of the send
1622 * transporters, there are two different
1623 * thr_send_buffer's. Each protected by its own lock:
1624 *
1625 * - m_buffer / m_buffer_lock:
1626 * Send buffer pages from all threads are linked into
1627 * the m_buffer when collected by link_thread_send_buffers().
1628 *
1629 * - m_sending / m_send_lock:
1630 * Before send buffers are given to the send-transporter,
1631 * they are moved from m_buffer -> m_sending by
1632 * get_bytes_to_send_iovec(). (Req. both locks.)
1633 * When transporter has consumed some/all of m_sending
1634 * buffers, ::bytes_sent() will update m_sending accordingly.
1635 *
1636 * If both locks are required, grab the m_send_lock first.
1637 * Release m_buffer_lock before releasing m_send_lock.
1638 */
1639 struct thr_spin_lock m_buffer_lock; //Protect m_buffer
1640 struct thr_send_buffer m_buffer;
1641
1642 struct thr_spin_lock m_send_lock; //Protect m_sending + transporter
1643 struct thr_send_buffer m_sending;
1644
1645 /* Size of resp. 'm_buffer' and 'm_sending' buffered data */
1646 Uint64 m_buffered_size; //Protected by m_buffer_lock
1647 Uint64 m_sending_size; //Protected by m_send_lock
1648
1649 bool m_enabled; //Protected by m_send_lock
1650
1651 /**
1652 * Flag used to coordinate sending to same remote trp from different
1653 * threads when there are contention on m_send_lock.
1654 *
1655 * If two threads need to send to the same trp at the same time, the
1656 * second thread, rather than wait for the first to finish, will just
1657 * set this flag. The first thread will then take responsibility
1658 * for sending to this trp when done with its own sending.
1659 */
1660 Uint32 m_force_send; //Check after release of m_send_lock
1661
1662 /**
1663 * Which thread is currently holding the m_send_lock
1664 * This is the thr_no of the thread sending, this can be both a
1665 * send thread and a block thread. Send threads start their
1666 * thr_no at glob_num_threads. So it is easy to check this
1667 * thr_no to see if it is a block thread or a send thread.
1668 * This variable is used to find the proper place to return
1669 * the send buffer pages after completing the send.
1670 */
1671 Uint32 m_send_thread; //Protected by m_send_lock
1672
1673 /**
1674 * Bytes sent in last performSend().
1675 */
1676 Uint32 m_bytes_sent;
1677
1678 /* read index(es) in thr_send_queue */
1679 Uint32 m_read_index[MAX_BLOCK_THREADS];
1680 } m_send_buffers[MAX_NTRANSPORTERS];
1681
1682 /* The buffers published by threads */
1683 thr_send_queue m_thread_send_buffers[MAX_NTRANSPORTERS][MAX_BLOCK_THREADS];
1684
1685 /*
1686 * These are used to synchronize during crash / trace dumps.
1687 *
1688 */
1689 NdbMutex stop_for_crash_mutex;
1690 NdbCondition stop_for_crash_cond;
1691 Uint32 stopped_threads;
1692 };
1693
1694 /**
1695 * Class to handle send threads
1696 * ----------------------------
1697 * We can have up to 8 send threads.
1698 *
1699 * This class will handle when a block thread needs to send, it will
1700 * handle the running of the send thread and will also start the
1701 * send thread.
1702 */
1703 #define is_send_thread(thr_no) (thr_no >= glob_num_threads)
1704
1705 struct thr_send_thread_instance
1706 {
1707 thr_send_thread_instance() :
1708 m_instance_no(0),
1709 m_watchdog_counter(0),
1710 m_thr_index(0),
1711 m_thread(NULL),
1712 m_waiter_struct(),
1713 m_send_buffer_pool(0,
1714 THR_SEND_BUFFER_MAX_FREE,
1715 THR_SEND_BUFFER_ALLOC_SIZE),
1716 m_exec_time(0),
1717 m_sleep_time(0),
1718 m_user_time_os(0),
1719 m_kernel_time_os(0),
1720 m_elapsed_time_os(0),
1721 m_measured_spintime(0),
1722 m_awake(FALSE),
1723 m_first_trp(0),
1724 m_last_trp(0),
1725 m_next_is_high_prio_trp(false),
1726 m_more_trps(false),
1727 m_num_neighbour_trps(0),
1728 m_neighbour_trp_index(0)
1729 {}
1730
1731 /**
1732 * Instance number of send thread, this is set at creation of
1733 * send thread and after that not changed, so no need to protect
1734 * it when reading it.
1735 */
1736 Uint32 m_instance_no;
1737
1738 /**
1739 * This variable is registered in the watchdog, it is set by the
1740 * send thread and reset every now and then by watchdog thread.
1741 * No special protection is required in setting it.
1742 */
1743 Uint32 m_watchdog_counter;
1744
1745 /**
1746 * Thread index of send thread in data node, this variable is
1747 * currently not used.
1748 */
1749 Uint32 m_thr_index;
1750 NdbThread *m_thread;
1751
1752 /**
1753 * Variable controlling send thread sleep and wakefulness; it is
1754 * used in calls to wake up the thread.
1755 */
1756 thr_wait m_waiter_struct;
1757
1758 class thread_local_pool<thr_send_page> m_send_buffer_pool;
1759
1760 /**
1761 * The below variables are protected by the send_thread_mutex.
1762 * Each send thread is taking care of a subset of the transporters
1763 * in the data node. The function to decide which send thread
1764 * instance is responsible is simply the transporter id modulo the
1765 * number of send thread instances, possibly extended with a simple
1766 * hash function to make it less likely that some simple regularity
1767 * in node ids creates unnecessary bottlenecks.
1768 *
1769 * Each send thread only has neighbour transporters it is responsible
1770 * for in the list below.
1771 */
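/*
 * Illustrative mapping (the modulo scheme described above; an additional
 * hash step may apply): with 2 send thread instances, transporter ids
 * 2, 4, 6, ... map to instance 0 and ids 1, 3, 5, ... to instance 1.
 */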
1772
1773 /**
1774 * Statistical variables that track send thread CPU usage that is
1775 * reported in call getSendPerformanceTimers that is used by
1776 * THRMAN block to track CPU usage in send threads and is also
1777 * used by THRMAN to report data on send threads in ndbinfo
1778 * tables. The data is used in adaptive send thread control by
1779 * THRMAN.
1780 */
1781 Uint64 m_exec_time;
1782 Uint64 m_sleep_time;
1783 Uint64 m_user_time_os;
1784 Uint64 m_kernel_time_os;
1785 Uint64 m_elapsed_time_os;
1786 Uint64 m_measured_spintime;
1787
1788 /**
1789 * Boolean indicating if send thread is awake or not.
1790 */
1791 Uint32 m_awake;
1792
1793 /* First trp that has data to be sent */
1794 Uint32 m_first_trp;
1795
1796 /* Last trp in list of trps with data available for sending */
1797 Uint32 m_last_trp;
1798
1799 /* Which list should I get trp from next time. */
1800 bool m_next_is_high_prio_trp;
1801
1802 /* 'true': More trps became available -> Need recheck ::get_trp() */
1803 bool m_more_trps;
1804
1805 #define MAX_NEIGHBOURS (3 * MAX_NODE_GROUP_TRANSPORTERS)
1806 Uint32 m_num_neighbour_trps;
1807 Uint32 m_neighbour_trp_index;
1808 Uint32 m_neighbour_trps[MAX_NEIGHBOURS];
1809
1810 /**
1811 * Mutex protecting the linked list of trps awaiting sending
1812 * and also the m_awake variable of the send thread. This
1813 * includes the neighbour transporters listed above.
1814 *
1815 * In addition the statistical variables listed above.
1816 *
1817 * Finally it also protects the data for transporters handled by this
1818 * send thread in the m_trp_state array (the thr_send_trps struct).
1819 */
1820 NdbMutex *send_thread_mutex;
1821
1822 /**
1823 * Check if a trp possibly is having data ready to be sent.
1824 * Upon 'true', callee should grab send_thread_mutex and
1825 * try to get_trp() while holding lock.
1826 */
1827 bool data_available() const
1828 {
1829 rmb();
1830 return (m_more_trps == TRUE);
1831 }
1832
1833 bool check_pending_data()
1834 {
1835 return m_more_trps;
1836 }
1837 };
1838
1839 struct thr_send_trps
1840 {
1841 /**
1842 * 'm_next' implements a list of 'send_trps' with PENDING
1843 * data, not yet assigned to a send thread. 0 means NULL.
1844 */
1845 Uint16 m_next;
1846
1847 /**
1848 * m_data_available is incremented/decremented by each
1849 * party having data to be sent to this specific trp.
1850 * It works in conjunction with a queue of get'able trps
1851 * (insert_trp(), get_trp()) waiting to be served by
1852 * the send threads, such that:
1853 *
1854 * 1) IDLE-state (m_data_available==0, not in list)
1855 * There are no data available for sending, and
1856 * no send threads are assigned to this trp.
1857 *
1858 * 2) PENDING-state (m_data_available>0, in list)
1859 * There are data available for sending, possibly
1860 * supplied by multiple parties. No send threads
1861 * are currently serving this request.
1862 *
1863 * 3) ACTIVE-state (m_data_available==1, not in list)
1864 * There are data available for sending, possibly
1865 * supplied by multiple parties, which are currently
1866 * being served by a send thread. All known
1867 * data available at the time when we became 'ACTIVE'
1868 * will be served now ( -> '==1')
1869 *
1870 * 3b ACTIVE-WITH-PENDING-state (m_data_available>1, not in list)
1871 * Variant of above state, send thread is serving requests,
1872 * and even more data became available since we started.
1873 *
1874 * Allowed state transitions are:
1875 *
1876 * IDLE -> PENDING (alert_send_thread w/ insert_trp)
1877 * PENDING -> ACTIVE (get_trp)
1878 * ACTIVE -> IDLE (run_send_thread if check_done_trp)
1879 * ACTIVE -> PENDING (run_send_thread if 'more')
1880 * ACTIVE -> ACTIVE-P (alert_send_thread while ACTIVE)
1881 * ACTIVE-P -> PENDING (run_send_thread while not check_done_trp)
1882 * ACTIVE-P -> ACTIVE-P (alert_send_thread while ACTIVE-P)
1883 *
1884    * A consequence of this is that only a (single) ACTIVE
1885    * send thread will serve send requests to a specific trp.
1886 * Thus, there will be no contention on the m_send_lock
1887 * caused by the send threads.
1888 */
1889 Uint16 m_data_available;
1890
1891 /**
1892    * This variable records which thread is currently sending to this trp.
1893    * It is reset again immediately after sending is completed.
1894    * It is used to ensure that neighbour trps aren't taken out for
1895    * sending by more than one thread. The neighbour list is simply
1896    * an array of the neighbours and we will send if data is available
1897    * to send AND no one else is sending, which is checked by looking at
1898    * this variable.
1899 */
1900 Uint16 m_thr_no_sender;
1901
1902 /* Send to this trp has caused a Transporter overload */
1903 Uint16 m_send_overload;
1904
1905 /**
1906    * This is a neighbour trp in the same node group as ourselves. This means
1907    * that we are likely to communicate with this trp more heavily than
1908    * with other trps. Also, delays in this communication will make updates
1909    * take much longer since an update has to traverse this link and the
1910    * corresponding link back 6 times as part of an updating transaction.
1911 *
1912 * Thus for good performance of updates it is essential to prioritise this
1913 * link a bit.
1914 */
1915 bool m_neighbour_trp;
1916
1917 /**
1918 * Further sending to this trp should be delayed until
1919 * 'm_micros_delayed' has passed since 'm_inserted_time'.
1920 */
1921 Uint32 m_micros_delayed;
1922 NDB_TICKS m_inserted_time;
1923
1924 /**
1925 * Counter of how many overload situations we experienced towards this
1926 * trp. We keep track of this to get an idea if the config setup is
1927    * incorrect somehow; one should consider increasing TCP_SND_BUF_SIZE
1928    * if this counter is incremented often. It is an indication that a
1929    * bigger buffer is needed to handle the bandwidth-delay product of the
1930 * node communication.
1931 */
1932 Uint64 m_overload_counter;
1933 };
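/**
 * Condensed, illustrative summary (not used by the code) of the
 * m_data_available state machine documented above:
 *
 *   // IDLE:     m_data_available == 0, not in list
 *   // PENDING:  m_data_available >  0, in list      (alert_send_thread)
 *   // ACTIVE:   m_data_available == 1, not in list  (get_trp)
 *   // ACTIVE-P: m_data_available >  1, not in list  (alert while ACTIVE)
 *   //
 *   // get_trp():           PENDING -> ACTIVE   (sets m_data_available = 1)
 *   // alert_send_thread(): IDLE    -> PENDING  (increment + insert_trp)
 *   //                      ACTIVE  -> ACTIVE-P (increment only)
 *   // check_done_trp():    decrement; 0 means ACTIVE -> IDLE, otherwise
 *   //                      the caller reinserts: ACTIVE(-P) -> PENDING
 */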
1934
1935 class thr_send_threads
1936 {
1937 public:
1938 /* Create send thread environment */
1939 thr_send_threads();
1940
1941 /* Destroy send thread environment and ensure threads are stopped */
1942 ~thr_send_threads();
1943
1944 struct thr_send_thread_instance* get_send_thread_instance_by_num(Uint32);
1945 /**
1946 * A block thread provides assistance to send thread by executing send
1947 * to one of the trps.
1948 */
1949 bool assist_send_thread(Uint32 max_num_trps,
1950 Uint32 thr_no,
1951 NDB_TICKS now,
1952 Uint32 &watchdog_counter,
1953 struct thr_send_thread_instance *send_instance,
1954 class thread_local_pool<thr_send_page> & send_buffer_pool);
1955
1956 /* Send thread method to send to a transporter picked by get_trp */
1957 bool handle_send_trp(TrpId id,
1958 Uint32 & num_trp_sent,
1959 Uint32 thr_no,
1960 NDB_TICKS & now,
1961 Uint32 & watchdog_counter,
1962 struct thr_send_thread_instance *send_instance);
1963
1964 /* A block thread has flushed data for a trp and wants it sent */
1965 Uint32 alert_send_thread(TrpId trp_id,
1966 NDB_TICKS now,
1967 struct thr_send_thread_instance* send_instance);
1968
1969 /* Method used to run the send thread */
1970 void run_send_thread(Uint32 instance_no);
1971
1972 /* Method to assign the base transporter to send threads */
1973 void assign_trps_to_send_threads();
1974
1975 /* Method to assign the multi transporter to send threads */
1976 void assign_multi_trps_to_send_threads();
1977
1978 /* Method to assign the block threads to assist send threads */
1979 void assign_threads_to_assist_send_threads();
1980
1981 /* Method to start the send threads */
1982 void start_send_threads();
1983
1984 /* Get send buffer pool for send thread */
1985  thread_local_pool<thr_send_page>* get_send_buffer_pool(Uint32 thr_no)
1986 {
1987 return &m_send_threads[thr_no - glob_num_threads].m_send_buffer_pool;
1988 }
1989
1990 void wake_my_send_thread_if_needed(TrpId *trp_id_array,
1991 Uint32 count,
1992 struct thr_send_thread_instance *my_send_instance);
1993 Uint32 get_send_instance(TrpId trp_id);
1994 private:
1995 struct thr_send_thread_instance* get_send_thread_instance_by_trp(TrpId);
1996
1997 /* Insert a trp in list of trps that has data available to send */
1998 void insert_trp(TrpId trp_id, struct thr_send_thread_instance*);
1999
2000 /* Get a trp id in order to send to it */
2001 TrpId get_trp(Uint32 instance_no,
2002 NDB_TICKS now,
2003 struct thr_send_thread_instance* send_instance);
2004
2005 /* Update rusage parameters for send thread. */
2006 void update_rusage(struct thr_send_thread_instance *this_send_thread,
2007 Uint64 elapsed_time);
2008
2009 /**
2010 * Set of utility methods to aid in scheduling of send work:
2011 *
2012   *  Further sending to a trp can be delayed
2013 * until 'now+delay'. Used either to wait for more packets
2014 * to be available for bigger chunks, or to wait for an overload
2015 * situation to clear.
2016 */
2017 void set_max_delay(TrpId trp_id, NDB_TICKS now, Uint32 delay_usec);
2018 void set_overload_delay(TrpId trp_id, NDB_TICKS now, Uint32 delay_usec);
2019 Uint32 check_delay_expired(TrpId trp_id, NDB_TICKS now);
2020
2021 /* Completed sending data to this trp, check if more work pending. */
2022 bool check_done_trp(TrpId trp_id);
2023
2024 /* Get a send thread which isn't awake currently */
2025 struct thr_send_thread_instance* get_not_awake_send_thread(
2026 TrpId trp_id,
2027 struct thr_send_thread_instance *send_instance);
2028
2029 /* Try to lock send_buffer for this trp. */
2030 static
2031 int trylock_send_trp(TrpId trp_id);
2032
2033 /* Perform the actual send to the trp, release send_buffer lock.
2034 * Return 'true' if there are still more to be sent to this trp.
2035 */
2036 static
2037 bool perform_send(TrpId trp_id, Uint32 thr_no, Uint32& bytes_sent);
2038
2039 /* Have threads been started */
2040 Uint32 m_started_threads;
2041
2042 OverloadStatus m_node_overload_status;
2043
2044 /* Is data available and next reference for each trp in cluster */
2045 struct thr_send_trps m_trp_state[MAX_NTRANSPORTERS];
2046
2047 /**
2048   * Very few compilers (gcc) allow zero-length arrays
2049 */
2050 #if MAX_NDBMT_SEND_THREADS == 0
2051 #define _MAX_SEND_THREADS 1
2052 #else
2053 #define _MAX_SEND_THREADS MAX_NDBMT_SEND_THREADS
2054 #endif
2055
2056 /* Data and state for the send threads */
2057 Uint32 m_num_trps;
2058 Uint32 m_next_send_thread_instance_by_trp;
2059 struct thr_send_thread_instance m_send_threads[_MAX_SEND_THREADS];
2060 Uint16 m_send_thread_instance_by_trp[MAX_NTRANSPORTERS];
2061
2062 public:
2063
2064   void getSendPerformanceTimers(Uint32 send_instance,
2065 Uint64 & exec_time,
2066 Uint64 & sleep_time,
2067 Uint64 & spin_time,
2068 Uint64 & user_time_os,
2069 Uint64 & kernel_time_os,
2070 Uint64 & elapsed_time_os)
2071 {
2072 require(send_instance < globalData.ndbMtSendThreads);
2073 NdbMutex_Lock(m_send_threads[send_instance].send_thread_mutex);
2074 exec_time = m_send_threads[send_instance].m_exec_time;
2075 sleep_time = m_send_threads[send_instance].m_sleep_time;
2076 spin_time = m_send_threads[send_instance].m_measured_spintime;
2077 user_time_os= m_send_threads[send_instance].m_user_time_os;
2078 kernel_time_os = m_send_threads[send_instance].m_kernel_time_os;
2079 elapsed_time_os = m_send_threads[send_instance].m_elapsed_time_os;
2080 NdbMutex_Unlock(m_send_threads[send_instance].send_thread_mutex);
2081 }
2082   void startChangeNeighbourNode()
2083 {
2084 for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2085 {
2086 NdbMutex_Lock(m_send_threads[i].send_thread_mutex);
2087 for (Uint32 j = 0; j < MAX_NEIGHBOURS; j++)
2088 {
2089 m_send_threads[i].m_neighbour_trps[j] = 0;
2090 }
2091 m_send_threads[i].m_num_neighbour_trps = 0;
2092 }
2093 for (Uint32 i = 0; i < MAX_NTRANSPORTERS; i++)
2094 {
2095 m_trp_state[i].m_neighbour_trp = FALSE;
2096 }
2097 }
2098   void setNeighbourNode(NodeId nodeId)
2099 {
2100 NodeId id[MAX_NODE_GROUP_TRANSPORTERS];
2101 Uint32 num_ids;
2102 if (globalData.ndbMtSendThreads == 0)
2103 {
2104 return;
2105 }
2106 globalTransporterRegistry.get_trps_for_node(nodeId,
2107 &id[0],
2108 num_ids,
2109 MAX_NODE_GROUP_TRANSPORTERS);
2110 for (Uint32 index = 0; index < num_ids; index++)
2111 {
2112 Uint32 this_id = id[index];
2113 Uint32 send_instance = get_send_instance(this_id);
2114 m_trp_state[this_id].m_neighbour_trp = TRUE;
2115 for (Uint32 i = 0; i < MAX_NEIGHBOURS; i++)
2116 {
2117 require(m_send_threads[send_instance].m_neighbour_trps[i] != this_id);
2118 if (m_send_threads[send_instance].m_neighbour_trps[i] == 0)
2119 {
2120 DEB_MULTI_TRP(("Neighbour(%u) of node %u is trp %u",
2121 i,
2122 nodeId,
2123 this_id));
2124 assert(m_send_threads[send_instance].m_num_neighbour_trps == i);
2125 m_send_threads[send_instance].m_neighbour_trps[i] = this_id;
2126 m_send_threads[send_instance].m_num_neighbour_trps++;
2127 assert(m_send_threads[send_instance].m_num_neighbour_trps <=
2128 MAX_NEIGHBOURS);
2129
2130 break;
2131 }
2132 }
2133 }
2134 }
2135   void endChangeNeighbourNode()
2136 {
2137 /**
2138     * If a transporter was in the transporter list before (we don't think
2139     * this should be possible) it doesn't represent an issue, since it will
2140     * simply be handled twice: first from the neighbour list and second from
2141     * the list of transporters.
2142     *
2143     * The opposite case, where a transporter goes from being a neighbour to
2144     * no longer being a neighbour transporter, should only happen on node
2145     * failures. In that case the transporter should not have any data to
2146     * send and the transporter will be cleared before the node is allowed to
2147     * restart again.
2148 */
2149 for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2150 {
2151 m_send_threads[i].m_neighbour_trp_index = 0;
2152 NdbMutex_Unlock(m_send_threads[i].send_thread_mutex);
2153 }
2154 }
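  /**
   * Sketch (illustrative, assumed caller pattern) of how the three methods
   * above work together: startChangeNeighbourNode() takes every
   * send_thread_mutex and clears the neighbour data, one or more
   * setNeighbourNode() calls repopulate it, and endChangeNeighbourNode()
   * resets the round robin index and releases the mutexes again.
   *
   *   g_send_threads->startChangeNeighbourNode();
   *   for (NodeId nodeId : nodes_in_my_node_group)  // hypothetical list
   *   {
   *     g_send_threads->setNeighbourNode(nodeId);
   *   }
   *   g_send_threads->endChangeNeighbourNode();
   */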
2155   void setNodeOverloadStatus(OverloadStatus new_status)
2156 {
2157 /**
2158 * The read of this variable is unsafe, but has no dire consequences
2159     * if it is briefly inconsistent. We use a memory barrier to at least
2160 * speed up the spreading of the variable to all CPUs.
2161 */
2162 m_node_overload_status = new_status;
2163 mb();
2164 }
2165 };
2166
2167
2168 /*
2169  * The single instance of the thr_send_threads class. If this variable
2170  * is non-NULL we're using send threads; if it is NULL there
2171  * are no send threads.
2172 */
2173 static char* g_send_threads_mem = NULL;
2174 static thr_send_threads *g_send_threads = NULL;
2175
2176 extern "C"
2177 void *
2178 mt_send_thread_main(void *thr_arg)
2179 {
2180 struct thr_send_thread_instance *this_send_thread =
2181 (thr_send_thread_instance*)thr_arg;
2182
2183 Uint32 instance_no = this_send_thread->m_instance_no;
2184 g_send_threads->run_send_thread(instance_no);
2185 return NULL;
2186 }
2187
2188 thr_send_threads::thr_send_threads()
2189 : m_started_threads(FALSE),
2190 m_node_overload_status((OverloadStatus)LIGHT_LOAD_CONST)
2191 {
2192 struct thr_repository *rep = g_thr_repository;
2193
2194 for (Uint32 i = 0; i < NDB_ARRAY_SIZE(m_trp_state); i++)
2195 {
2196 m_trp_state[i].m_next = 0;
2197 m_trp_state[i].m_data_available = 0;
2198 m_trp_state[i].m_thr_no_sender = Uint16(NO_OWNER_THREAD);
2199 m_trp_state[i].m_send_overload = FALSE;
2200 m_trp_state[i].m_micros_delayed = 0;
2201 m_trp_state[i].m_neighbour_trp = FALSE;
2202 m_trp_state[i].m_overload_counter = 0;
2203 NdbTick_Invalidate(&m_trp_state[i].m_inserted_time);
2204 }
2205 for (Uint32 i = 0; i < NDB_ARRAY_SIZE(m_send_threads); i++)
2206 {
2207 m_send_threads[i].m_more_trps = false;
2208 m_send_threads[i].m_first_trp = 0;
2209 m_send_threads[i].m_last_trp = 0;
2210 m_send_threads[i].m_next_is_high_prio_trp = false;
2211 m_send_threads[i].m_num_neighbour_trps = 0;
2212 m_send_threads[i].m_neighbour_trp_index = 0;
2213 for (Uint32 j = 0; j < MAX_NEIGHBOURS; j++)
2214 {
2215 m_send_threads[i].m_neighbour_trps[j] = 0;
2216 }
2217 m_send_threads[i].m_waiter_struct.init();
2218 m_send_threads[i].m_instance_no = i;
2219 m_send_threads[i].m_send_buffer_pool.set_pool(&rep->m_sb_pool);
2220 m_send_threads[i].send_thread_mutex = NdbMutex_Create();
2221 }
2222 memset(&m_send_thread_instance_by_trp[0],
2223 0xFF,
2224 sizeof(m_send_thread_instance_by_trp));
2225 m_next_send_thread_instance_by_trp = 0;
2226 m_num_trps = 0;
2227 }
2228
2229 thr_send_threads::~thr_send_threads()
2230 {
2231 if (!m_started_threads)
2232 return;
2233
2234 for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2235 {
2236 void *dummy_return_status;
2237
2238 /* Ensure thread is woken up to die */
2239 wakeup(&(m_send_threads[i].m_waiter_struct));
2240 NdbThread_WaitFor(m_send_threads[i].m_thread, &dummy_return_status);
2241 globalEmulatorData.theConfiguration->removeThread(
2242 m_send_threads[i].m_thread);
2243 NdbThread_Destroy(&(m_send_threads[i].m_thread));
2244 }
2245 }
2246
2247 /**
2248 * Base transporters are spread equally among the send threads.
2249 * There is no special connection between a thread and a transporter
2250 * to another node. Thus round-robin scheduling is good enough.
2251 */
2252 void
2253 thr_send_threads::assign_trps_to_send_threads()
2254 {
2255 Uint32 num_trps = globalTransporterRegistry.get_num_trps();
2256 m_num_trps = num_trps;
2257 /* Transporter instance 0 isn't used */
2258 m_send_thread_instance_by_trp[0] = Uint16(~0);
2259 Uint32 send_instance = 0;
2260 for (Uint32 i = 1; i <= num_trps; i++)
2261 {
2262 m_send_thread_instance_by_trp[i] = send_instance;
2263 send_instance++;
2264 if (send_instance == globalData.ndbMtSendThreads)
2265 {
2266 send_instance = 0;
2267 }
2268 }
2269 m_next_send_thread_instance_by_trp = 0;
2270 }
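/**
 * Worked example (illustrative only) of the round robin above: with
 * num_trps = 5 and ndbMtSendThreads = 2 the mapping becomes
 *
 *   trp_id:        1  2  3  4  5
 *   send instance: 0  1  0  1  0
 *
 * i.e. m_send_thread_instance_by_trp[] starts { ~0, 0, 1, 0, 1, 0, ... }.
 */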
2271
2272 void
2273 mt_assign_multi_trps_to_send_threads()
2274 {
2275 DEB_MULTI_TRP(("mt_assign_multi_trps_to_send_threads()"));
2276 if (g_send_threads)
2277 {
2278 g_send_threads->assign_multi_trps_to_send_threads();
2279 }
2280 }
2281
2282 /**
2283 * Multi transporters are assigned to send thread instances to mimic
2284 * the assignment of LDM instances to send thread instances. This
2285 * ensures that if an LDM thread sends a message to another LDM
2286 * thread in the same node group the LDM thread will assist with
2287 * the sending of this message. The LDM thread will send to another
2288 * LDM thread mostly in case it is within the same node group and it
2289 * will then send to the same LDM instance in that node.
2290 *
2291 * Ideally the number of LDM threads should be a multiple of the number
2292 * of send threads to get the best assignment of transporters to send
2293 * threads.
2294 */
2295 void
2296 thr_send_threads::assign_multi_trps_to_send_threads()
2297 {
2298 DEB_MULTI_TRP(("assign_multi_trps_to_send_threads()"));
2299 Uint32 new_num_trps = globalTransporterRegistry.get_num_trps();
2300 Uint32 send_instance = m_next_send_thread_instance_by_trp;
2301 DEB_MULTI_TRP(("assign_multi_trps_to_send_threads(): new_num_trps = %u",
2302 new_num_trps));
2303 for (Uint32 i = m_num_trps + 1; i <= new_num_trps; i++)
2304 {
2305 m_send_thread_instance_by_trp[i] = send_instance;
2306 send_instance++;
2307 if (send_instance == globalData.ndbMtSendThreads)
2308 {
2309 send_instance = 0;
2310 }
2311 }
2312 m_num_trps = new_num_trps;
2313 m_next_send_thread_instance_by_trp = send_instance;
2314 }
2315
2316 void
2317 thr_send_threads::assign_threads_to_assist_send_threads()
2318 {
2319 /**
2320 * Assign the block thread (ldm, tc, rep and main) to assist a certain send
2321 * thread instance. This means that assistance will only be provided to a
2322 * subset of the transporters from this block thread. The actual send
2323 * threads can also assist other send threads to avoid having to wake up
2324 * all send threads all the time.
2325 *
2326    * If we have configured the block thread to not provide any send thread
2327    * assistance we will not assign any send thread to it. Similarly, receive
2328    * threads don't provide send thread assistance. If no send threads
2329    * are around we use the old method of sending without send threads;
2330    * in this case the sending is done by all block threads and there are
2331    * no send threads around at all.
2332 *
2333 * We perform round robin of LDM threads first and then round robin on the
2334 * non-LDM threads. This ensures that the first LDM thread starts at send
2335 * instance 0 to ensure that we support the transporters used for
2336 * communication to the same LDM in the same node group. This is not
2337 * guaranteed for all configurations, but we strive for this configuration
2338 * to ensure that the LDM thread will quickly send its own messages within
2339 * the node group. Messages to other nodes will be picked up by another
2340 * send thread. With only one send thread the LDM threads will support all
2341 * transporters. Multiple send threads is mainly intended for larger
2342 * configurations.
2343 */
2344 THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
2345 struct thr_repository* rep = g_thr_repository;
2346 unsigned int thr_no;
2347 unsigned next_send_instance = 0;
2348 for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
2349 {
2350 thr_data *selfptr = &rep->m_thread[thr_no];
2351 selfptr->m_nosend = conf.do_get_nosend(selfptr->m_instance_list,
2352 selfptr->m_instance_count);
2353 if (is_recv_thread(thr_no) || selfptr->m_nosend == 1)
2354 {
2355 selfptr->m_send_instance_no = 0;
2356 selfptr->m_send_instance = NULL;
2357 selfptr->m_nosend = 1;
2358 }
2359 else if (is_ldm_thread(thr_no))
2360 {
2361 selfptr->m_send_instance_no = next_send_instance;
2362 selfptr->m_send_instance =
2363 get_send_thread_instance_by_num(next_send_instance);
2364 next_send_instance++;
2365 if (next_send_instance == globalData.ndbMtSendThreads)
2366 {
2367 next_send_instance = 0;
2368 }
2369 }
2370 else
2371 {
2372 }
2373 }
2374 for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
2375 {
2376 thr_data *selfptr = &rep->m_thread[thr_no];
2377 if (is_recv_thread(thr_no) ||
2378 selfptr->m_nosend == 1 ||
2379 is_ldm_thread(thr_no))
2380 {
2381 continue;
2382 }
2383 else
2384 {
2385 selfptr->m_send_instance_no = next_send_instance;
2386 selfptr->m_send_instance =
2387 get_send_thread_instance_by_num(next_send_instance);
2388 next_send_instance++;
2389 if (next_send_instance == globalData.ndbMtSendThreads)
2390 {
2391 next_send_instance = 0;
2392 }
2393 }
2394 }
2395 }
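/**
 * Worked example (illustrative only) of the assignment above: with 4 LDM
 * threads, one tc, one main and one rep thread, and ndbMtSendThreads = 2,
 * the LDM threads get send instances 0, 1, 0, 1 in the first pass, and
 * the remaining non-recv, non-nosend threads continue the same round
 * robin (0, 1, 0) in the second pass. Receive threads and threads
 * configured with nosend get no send instance at all.
 */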
2396
2397 void
2398 thr_send_threads::start_send_threads()
2399 {
2400 for (Uint32 i = 0; i < globalData.ndbMtSendThreads; i++)
2401 {
2402 m_send_threads[i].m_thread =
2403 NdbThread_Create(mt_send_thread_main,
2404 (void **)&m_send_threads[i],
2405 1024*1024,
2406 "send thread", //ToDo add number
2407 NDB_THREAD_PRIO_MEAN);
2408 m_send_threads[i].m_thr_index =
2409 globalEmulatorData.theConfiguration->addThread(
2410 m_send_threads[i].m_thread,
2411 SendThread);
2412 }
2413 m_started_threads = TRUE;
2414 }
2415
2416 struct thr_send_thread_instance*
2417 thr_send_threads::get_send_thread_instance_by_num(Uint32 instance_no)
2418 {
2419 return &m_send_threads[instance_no];
2420 }
2421
2422 Uint32
2423 thr_send_threads::get_send_instance(TrpId trp_id)
2424 {
2425 require(trp_id < MAX_NTRANSPORTERS);
2426 Uint32 send_thread_instance = m_send_thread_instance_by_trp[trp_id];
2427 require(send_thread_instance < globalData.ndbMtSendThreads);
2428 return send_thread_instance;
2429 }
2430
2431 struct thr_send_thread_instance*
2432 thr_send_threads::get_send_thread_instance_by_trp(TrpId trp_id)
2433 {
2434 require(trp_id < MAX_NTRANSPORTERS);
2435 Uint32 send_thread_instance = m_send_thread_instance_by_trp[trp_id];
2436 require(send_thread_instance < globalData.ndbMtSendThreads);
2437 return &m_send_threads[send_thread_instance];
2438 }
2439
2440 /**
2441 * Called under mutex protection of send_thread_mutex
2442 */
2443 void
2444 thr_send_threads::insert_trp(TrpId trp_id,
2445 struct thr_send_thread_instance *send_instance)
2446 {
2447 struct thr_send_trps &trp_state = m_trp_state[trp_id];
2448
2449 send_instance->m_more_trps = true;
2450 /* Ensure the lock free ::data_available see 'm_more_trps == TRUE' */
2451 wmb();
2452
2453 if (trp_state.m_neighbour_trp)
2454 return;
2455
2456 Uint32 first_trp = send_instance->m_first_trp;
2457 struct thr_send_trps &last_trp_state =
2458 m_trp_state[send_instance->m_last_trp];
2459 trp_state.m_next = 0;
2460 send_instance->m_last_trp = trp_id;
2461 assert(trp_state.m_data_available > 0);
2462
2463 if (first_trp == 0)
2464 {
2465 send_instance->m_first_trp = trp_id;
2466 }
2467 else
2468 {
2469 last_trp_state.m_next = trp_id;
2470 }
2471 }
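/**
 * Illustrative trace (not part of the build) of the intrusive FIFO
 * maintained by insert_trp()/get_trp(), assuming two non-neighbour trps
 * 7 and 9 are inserted into an empty send instance 'si':
 *
 *   // initially:         si->m_first_trp = 0, si->m_last_trp = 0
 *   insert_trp(7, si);    // m_first_trp = 7, m_last_trp = 7
 *   insert_trp(9, si);    // m_first_trp = 7, m_last_trp = 9,
 *                         // m_trp_state[7].m_next = 9
 *   // get_trp() unlinks from m_first_trp, so trp 7 is served first.
 */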
2472
2473 /**
2474 * Called under mutex protection of send_thread_mutex
2475  * The timestamp 'now' is taken before grabbing the mutex and can thus
2476  * be a bit older than timestamps taken by other threads.
2477 */
2478 void
2479 thr_send_threads::set_max_delay(TrpId trp_id, NDB_TICKS now, Uint32 delay_usec)
2480 {
2481 struct thr_send_trps &trp_state = m_trp_state[trp_id];
2482 assert(trp_state.m_data_available > 0);
2483 assert(!trp_state.m_send_overload);
2484
2485 trp_state.m_micros_delayed = delay_usec;
2486 trp_state.m_inserted_time = now;
2487 trp_state.m_overload_counter++;
2488 }
2489
2490 /**
2491 * Called under mutex protection of send_thread_mutex
2492  * The time is taken before grabbing the mutex, so this timestamp
2493  * could in rare cases be older than the current time.
2494 */
2495 void
2496 thr_send_threads::set_overload_delay(TrpId trp_id,
2497 NDB_TICKS now,
2498 Uint32 delay_usec)
2499 {
2500 struct thr_send_trps &trp_state = m_trp_state[trp_id];
2501 assert(trp_state.m_data_available > 0);
2502 trp_state.m_send_overload = TRUE;
2503 trp_state.m_micros_delayed = delay_usec;
2504 trp_state.m_inserted_time = now;
2505 trp_state.m_overload_counter++;
2506 }
2507
2508 /**
2509 * Called under mutex protection of send_thread_mutex
2510  * The 'now' value can be older than what is set in m_inserted_time,
2511  * since 'now' is not taken while holding the mutex: we can take the
2512  * time, be scheduled away for a while and return, and in the meantime
2513  * another thread could insert a new event with a newer insert
2514  * time.
2515  *
2516  * The code below ensures that if this happens we treat the timer
2517  * as expired and use the more recent time
2518  * as 'now'.
2519 */
2520 Uint32
2521 thr_send_threads::check_delay_expired(TrpId trp_id, NDB_TICKS now)
2522 {
2523 struct thr_send_trps &trp_state = m_trp_state[trp_id];
2524 assert(trp_state.m_data_available > 0);
2525 Uint64 micros_delayed = Uint64(trp_state.m_micros_delayed);
2526
2527 if (micros_delayed == 0)
2528 return 0;
2529
2530 Uint64 micros_passed;
2531 if (now.getUint64() > trp_state.m_inserted_time.getUint64())
2532 {
2533 micros_passed = NdbTick_Elapsed(trp_state.m_inserted_time,
2534 now).microSec();
2535 }
2536 else
2537 {
2538 now = trp_state.m_inserted_time;
2539 micros_passed = micros_delayed;
2540 }
2541 if (micros_passed >= micros_delayed) //Expired
2542 {
2543 trp_state.m_inserted_time = now;
2544 trp_state.m_micros_delayed = 0;
2545 trp_state.m_send_overload = FALSE;
2546 return 0;
2547 }
2548
2549 // Update and return remaining wait time
2550 Uint64 remaining_micros = micros_delayed - micros_passed;
2551 return Uint32(remaining_micros);
2552 }
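/**
 * Worked example (illustrative numbers): if set_max_delay() stored
 * m_micros_delayed = 200 at m_inserted_time = T, then
 * check_delay_expired(trp, T + 150 us) returns 50 (the remaining wait),
 * whereas check_delay_expired(trp, T + 250 us) clears the delay state
 * (m_micros_delayed = 0, m_send_overload = FALSE) and returns 0.
 */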
2553
2554 /**
2555 * TODO RONM:
2556 * Add some more NDBINFO table to make it easier to analyse the behaviour
2557 * of the workings of the MaxSendDelay parameter.
2558 */
2559
2560 static Uint64 mt_get_send_buffer_bytes(NodeId id);
2561
2562 /**
2563  * MAX_SEND_BUFFER_SIZE_TO_DELAY is a heuristic constant that specifies
2564  * a send buffer size that will always be sent. The size of this is based
2565  * on experience that maximum performance of the send part is achieved at
2566  * around 64 kBytes of send buffer size and that the difference between
2567  * 20 kB and 64 kB is small. Sending at once above this size thus avoids
2568  * unnecessary delays that would bring no significant performance gain.
2569 */
2570 static const Uint64 MAX_SEND_BUFFER_SIZE_TO_DELAY = (20 * 1024);
2571
2572
2573 /**
2574  * Get a trp (returned) having data to be sent.
2575  *
2576  * Sending could have been delayed; in such cases the trp
2577  * whose delay expires first will be returned. It is then up to
2578  * the caller to either accept this trp, or reinsert it
2579  * such that it can be returned and retried later.
2580 *
2581 * Called under mutex protection of send_thread_mutex
2582 */
2583 #define DELAYED_PREV_NODE_IS_NEIGHBOUR UINT_MAX32
2584 TrpId
2585 thr_send_threads::get_trp(Uint32 instance_no,
2586 NDB_TICKS now,
2587 struct thr_send_thread_instance *send_instance)
2588 {
2589 Uint32 next;
2590 TrpId trp_id;
2591 bool retry = false;
2592 Uint32 prev = 0;
2593 Uint32 delayed_trp = 0;
2594 Uint32 delayed_prev_trp = 0;
2595 Uint32 min_wait_usec = UINT_MAX32;
2596 do
2597 {
2598 if (send_instance->m_next_is_high_prio_trp)
2599 {
2600 Uint32 num_neighbour_trps = send_instance->m_num_neighbour_trps;
2601 Uint32 neighbour_trp_index = send_instance->m_neighbour_trp_index;
2602 for (Uint32 i = 0; i < num_neighbour_trps; i++)
2603 {
2604 trp_id = send_instance->m_neighbour_trps[neighbour_trp_index];
2605 neighbour_trp_index++;
2606 if (neighbour_trp_index == num_neighbour_trps)
2607 neighbour_trp_index = 0;
2608 send_instance->m_neighbour_trp_index = neighbour_trp_index;
2609 if (m_trp_state[trp_id].m_data_available > 0 &&
2610 m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD)
2611 {
2612 const Uint32 send_delay = check_delay_expired(trp_id, now);
2613 if (likely(send_delay == 0))
2614 {
2615 /**
2616 * Found a neighbour trp to return. Handle this and ensure that
2617 * next call to get_trp will start looking for non-neighbour
2618 * trps.
2619 */
2620 send_instance->m_next_is_high_prio_trp = false;
2621 goto found_neighbour;
2622 }
2623
2624 /**
2625 * Found a neighbour trp with delay, record the delay
2626 * and the trp and set indicator that delayed trp is
2627 * a neighbour.
2628 */
2629 if (send_delay < min_wait_usec)
2630 {
2631 min_wait_usec = send_delay;
2632 delayed_trp = trp_id;
2633 delayed_prev_trp = DELAYED_PREV_NODE_IS_NEIGHBOUR;
2634 }
2635 }
2636 }
2637 if (retry)
2638 {
2639 /**
2640 * We have already searched the non-neighbour trps and we
2641 * have now searched the neighbour trps and found no trps
2642 * ready to start sending to, we might still have a delayed
2643 * trp, this will be checked before exiting.
2644 */
2645 goto found_no_ready_trps;
2646 }
2647
2648 /**
2649 * We found no ready trps amongst the neighbour trps, we will
2650 * also search the non-neighbours, we will do this simply by
2651 * falling through into this part and setting retry to true to
2652 * indicate that we already searched the neighbour trps.
2653 */
2654 retry = true;
2655 }
2656 else
2657 {
2658 /**
2659 * We might loop one more time and then we need to ensure that
2660 * we don't just come back here. If we report a trp from this
2661 * function this variable will be set again. If we find no trp
2662 * then it really doesn't matter what this variable is set to.
2663 * When trps are available we will always try to be fair and
2664 * return high prio trps as often as non-high prio trps.
2665 */
2666 send_instance->m_next_is_high_prio_trp = true;
2667 }
2668
2669 trp_id = send_instance->m_first_trp;
2670 if (!trp_id)
2671 {
2672 if (!retry)
2673 {
2674 /**
2675 * We need to check the neighbour trps before we decide that
2676 * there is no trps to send to.
2677 */
2678 retry = true;
2679 continue;
2680 }
2681 /**
2682 * Found no trps ready to be sent to, will still need check of
2683 * delayed trps before exiting.
2684 */
2685 goto found_no_ready_trps;
2686 }
2687
2688 /**
2689 * Search for a trp ready to be sent to among the non-neighbour trps.
2690 * If none found, remember the one with the smallest delay.
2691 */
2692 prev = 0;
2693 while (trp_id)
2694 {
2695 next = m_trp_state[trp_id].m_next;
2696
2697 const Uint32 send_delay = check_delay_expired(trp_id, now);
2698 if (likely(send_delay == 0))
2699 {
2700 /**
2701 * We found a non-neighbour trp to return, handle this
2702 * and set the next get_trp to start looking for
2703 * neighbour trps.
2704 */
2705 send_instance->m_next_is_high_prio_trp = true;
2706 goto found_non_neighbour;
2707 }
2708
2709 /* Find remaining minimum wait: */
2710 if (min_wait_usec > send_delay)
2711 {
2712 min_wait_usec = send_delay;
2713 delayed_trp = trp_id;
2714 delayed_prev_trp = prev;
2715 }
2716
2717 prev = trp_id;
2718 trp_id = next;
2719 }
2720
2721 // As 'first_trp != 0', there has to be a 'delayed_trp'
2722 assert(delayed_trp != 0);
2723
2724 if (!retry)
2725 {
2726 /**
2727 * Before we decide to send to a delayed non-neighbour trp
2728 * we should check if there is a neighbour ready to be sent
2729 * to, or if there is a neighbour with a lower delay that
2730 * can be sent to.
2731 */
2732 retry = true;
2733 continue;
2734 }
2735 /**
2736 * No trps ready to send to, but we only get here when we know
2737 * there is at least a delayed trp, so jump directly to handling
2738 * of returning delayed trps.
2739 */
2740 goto found_delayed_trp;
2741 } while (1);
2742
2743 found_no_ready_trps:
2744 /**
2745 * We have found no trps ready to be sent to yet, we can still
2746 * have a delayed trp and we don't know from where it comes.
2747 */
2748 if (delayed_trp == 0)
2749 {
2750 /**
2751 * We have found no trps to send to, neither non-delayed nor
2752 * delayed trps. Mark m_more_trps as false to indicate that
2753 * we have no trps to send to for the moment to give the
2754 * send threads a possibility to go to sleep.
2755 */
2756 send_instance->m_more_trps = false;
2757 return 0;
2758 }
2759
2760 /**
2761 * We have ensured that delayed_trp exists although we have no
2762 * trps ready to be sent to yet. We will fall through to handling
2763 * of finding a delayed trp.
2764 */
2765
2766 found_delayed_trp:
2767 /**
2768 * We found no trp ready to send to but we did find a delayed trp.
2769 * We don't know if the delayed trp is a neighbour trp or not, we
2770 * check this using delayed_prev_trp which is set to ~0 for
2771 * neighbour trps.
2772 */
2773 assert(delayed_trp != 0);
2774 trp_id = delayed_trp;
2775 if (delayed_prev_trp == DELAYED_PREV_NODE_IS_NEIGHBOUR)
2776 {
2777 /**
2778 * Go to handling of found neighbour as we have decided to return
2779 * this delayed neighbour trp.
2780 */
2781 send_instance->m_next_is_high_prio_trp = false;
2782 goto found_neighbour;
2783 }
2784 else
2785 {
2786 send_instance->m_next_is_high_prio_trp = true;
2787 }
2788
2789 prev = delayed_prev_trp;
2790 next = m_trp_state[trp_id].m_next;
2791
2792 /**
2793 * Fall through to found_non_neighbour since we have decided that this
2794 * delayed trp will be returned.
2795 */
2796
2797 found_non_neighbour:
2798 /**
2799 * We are going to return a non-neighbour trp, either delayed
2800 * or not. We need to remove it from the list of non-neighbour
2801 * trps to send to.
2802 */
2803
2804 if (likely(trp_id == send_instance->m_first_trp))
2805 {
2806 send_instance->m_first_trp = next;
2807 assert(prev == 0);
2808 }
2809 else
2810 {
2811 assert(prev != 0);
2812 m_trp_state[prev].m_next = next;
2813 }
2814
2815 if (trp_id == send_instance->m_last_trp)
2816 send_instance->m_last_trp = prev;
2817
2818 /**
2819 * Fall through for non-neighbour trps to same return handling as
2820 * neighbour trps.
2821 */
2822
2823 found_neighbour:
2824 /**
2825 * We found a trp to return, we will update the data available,
2826 * we also need to set m_thr_no_sender to indicate which thread
2827 * is owning the right to send to this trp for the moment.
2828 *
2829 * Neighbour trps can go directly here since they are not
2830 * organised in any lists, but we come here also for
2831 * non-neighbour trps.
2832 */
2833 struct thr_send_trps &trp_state = m_trp_state[trp_id];
2834
2835 assert(trp_state.m_data_available > 0);
2836 assert(trp_state.m_thr_no_sender == NO_OWNER_THREAD);
2837 trp_state.m_next = 0;
2838 trp_state.m_data_available = 1;
2839 return (TrpId)trp_id;
2840 }
2841
2842 /* Called under mutex protection of send_thread_mutex */
2843 bool
2844 thr_send_threads::check_done_trp(TrpId trp_id)
2845 {
2846 struct thr_send_trps &trp_state = m_trp_state[trp_id];
2847 assert(trp_state.m_data_available > 0);
2848 trp_state.m_data_available--;
2849 return (trp_state.m_data_available == 0);
2850 }
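/**
 * Illustrative note: check_done_trp() implements the exit from the
 * ACTIVE(-P) states in the m_data_available state machine. With
 * m_data_available == 1 (plain ACTIVE) it drops to 0 and the trp becomes
 * IDLE; with m_data_available > 1 (ACTIVE-P) the caller gets 'false'
 * back and reinserts the trp as PENDING, as in handle_send_trp() further
 * below:
 *
 *   if (more || !check_done_trp(trp_id))
 *     insert_trp(trp_id, send_instance);   // ACTIVE(-P) -> PENDING
 */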
2851
2852 /* Called under mutex protection of send_thread_mutex */
2853 struct thr_send_thread_instance*
2854 thr_send_threads::get_not_awake_send_thread(TrpId trp_id,
2855 struct thr_send_thread_instance *send_instance)
2856 {
2857 struct thr_send_thread_instance *used_send_thread;
2858 if (trp_id != 0)
2859 {
2860 Uint32 send_thread = get_send_instance(trp_id);
2861 if (!m_send_threads[send_thread].m_awake)
2862 {
2863 used_send_thread= &m_send_threads[send_thread];
2864 assert(used_send_thread == send_instance);
2865 return used_send_thread;
2866 }
2867 }
2868 if (!send_instance->m_awake)
2869 return send_instance;
2870 return NULL;
2871 }
2872
2873 /**
2874 * We have assisted our send thread instance, check if it still
2875  * needs to be woken up.
2876 */
2877 void
2878 thr_send_threads::wake_my_send_thread_if_needed(TrpId *trp_id_array,
2879 Uint32 count,
2880 struct thr_send_thread_instance *my_send_instance)
2881 {
2882 bool mutex_locked = false;
2883 struct thr_send_thread_instance *wake_send_instance = NULL;
2884 for (Uint32 i = 0; i < count; i++)
2885 {
2886 TrpId trp_id = trp_id_array[i];
2887 struct thr_send_thread_instance *send_instance =
2888 get_send_thread_instance_by_trp(trp_id);
2889 if (send_instance != my_send_instance)
2890 continue;
2891 if (!mutex_locked)
2892 {
2893 mutex_locked = true;
2894 NdbMutex_Lock(my_send_instance->send_thread_mutex);
2895 }
2896 struct thr_send_trps& trp_state = m_trp_state[trp_id];
2897 if (trp_state.m_data_available > 0)
2898 {
2899 wake_send_instance = my_send_instance;
2900 break;
2901 }
2902 }
2903 if (mutex_locked)
2904 {
2905 NdbMutex_Unlock(my_send_instance->send_thread_mutex);
2906 }
2907 if (wake_send_instance != NULL)
2908 {
2909 wakeup(&(wake_send_instance->m_waiter_struct));
2910 }
2911 }
2912
2913 /**
2914 * Insert transporter into send thread instance data structures.
2915 * Wake send thread unless it is the one which we handle ourselves.
2916 * If we handle it ourselves we will check after assisting the
2917 * send thread if the thread is still required to wake up. This
2918 * ensures that running with 1 send thread will avoid waking up
2919 * send thread when not required to do so. With many send threads
2920 * we will avoid a small portion of wakeup calls through this
2921 * handling.
2922 *
2923 * If we don't do any send thread assistance the instance is simply
2924 * NULL here and we will wake all required send threads.
2925 */
2926 Uint32
2927 thr_send_threads::alert_send_thread(TrpId trp_id,
2928 NDB_TICKS now,
2929 struct thr_send_thread_instance *my_send_instance)
2930 {
2931 struct thr_send_thread_instance *send_instance =
2932 get_send_thread_instance_by_trp(trp_id);
2933 struct thr_send_trps& trp_state = m_trp_state[trp_id];
2934
2935 NdbMutex_Lock(send_instance->send_thread_mutex);
2936 trp_state.m_data_available++; // There is more to send
2937 if (trp_state.m_data_available > 1)
2938 {
2939 /**
2940 * ACTIVE(_P) -> ACTIVE_P
2941 *
2942 * The trp is already flagged that it has data needing to be sent.
2943 * There is no need to wake even more threads up in this case
2944 * since we piggyback on someone else's request.
2945 *
2946      * Waking another thread for sending to this trp would only
2947      * have resulted in contention and blocking on the send_lock.
2948 *
2949 * We are safe that the buffers we have flushed will be read by a send
2950 * thread: They will either be piggybacked when the send thread
2951 * 'get_trp()' for sending, or data will be available when
2952      * send thread 'check_done_trp()' finds that more data has
2953      * become available. In the latter case, the send thread will schedule
2954 * the trp for another round with insert_trp()
2955 */
2956 NdbMutex_Unlock(send_instance->send_thread_mutex);
2957 return 0;
2958 }
2959 assert(!trp_state.m_send_overload); // Caught above as ACTIVE
2960 assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
2961 insert_trp(trp_id, send_instance); // IDLE -> PENDING
2962
2963 /**
2964 * We need to delay sending the data, as set in config.
2965 * This is the first send to this trp, so we start the
2966 * delay timer now.
2967 */
2968 if (max_send_delay > 0) // Wait for more payload?
2969 {
2970 set_max_delay(trp_id, now, max_send_delay);
2971 }
2972
2973 if (send_instance == my_send_instance)
2974 {
2975 NdbMutex_Unlock(send_instance->send_thread_mutex);
2976 return 1;
2977 }
2978
2979 /*
2980 * Check if the send thread especially responsible for this transporter
2981 * is awake, if not wake it up.
2982 */
2983 struct thr_send_thread_instance *avail_send_thread
2984 = get_not_awake_send_thread(trp_id, send_instance);
2985
2986 NdbMutex_Unlock(send_instance->send_thread_mutex);
2987
2988 if (avail_send_thread)
2989 {
2990 /*
2991 * Wake the assigned sleeping send thread, potentially a spurious wakeup,
2992 * but this is not a problem, important is to ensure that at least one
2993 * send thread is awoken to handle our request. If someone is already
2994 * awake and takes care of our request before we get to wake someone up
2995 * it's not a problem.
2996 */
2997 wakeup(&(avail_send_thread->m_waiter_struct));
2998 }
2999 return 1;
3000 }
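/**
 * Sketch (illustrative, simplified from the real flush path) of how a
 * block thread is expected to use alert_send_thread() after flushing
 * send buffers for a trp. 'my_send_instance' is NULL for threads doing
 * no send assistance, in which case any needed send thread is woken up
 * directly by alert_send_thread():
 *
 *   NDB_TICKS now = NdbTick_getCurrentTicks();
 *   g_send_threads->alert_send_thread(trp_id, now, my_send_instance);
 *   if (my_send_instance != NULL)
 *   {
 *     // after any assist_send_thread() work:
 *     g_send_threads->wake_my_send_thread_if_needed(&trp_id, 1,
 *                                                   my_send_instance);
 *   }
 */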
3001
3002 static bool
3003 check_available_send_data(struct thr_send_thread_instance *send_instance)
3004 {
3005 return !send_instance->data_available();
3006 }
3007
3008 //static
3009 int
3010 thr_send_threads::trylock_send_trp(TrpId trp_id)
3011 {
3012 thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
3013 return trylock(&sb->m_send_lock);
3014 }
3015
3016 //static
3017 bool
3018 thr_send_threads::perform_send(TrpId trp_id, Uint32 thr_no, Uint32& bytes_sent)
3019 {
3020 thr_repository::send_buffer * sb = g_thr_repository->m_send_buffers+trp_id;
3021
3022 /**
3023 * Set m_send_thread so that our transporter callback can know which thread
3024 * holds the send lock for this remote trp. This is the thr_no of a block
3025 * thread or the thr_no of a send thread.
3026 */
3027 sb->m_send_thread = thr_no;
3028 const bool more = globalTransporterRegistry.performSend(trp_id);
3029 bytes_sent = sb->m_bytes_sent;
3030 sb->m_send_thread = NO_SEND_THREAD;
3031 unlock(&sb->m_send_lock);
3032 return more;
3033 }
3034
3035 static void
3036 update_send_sched_config(THRConfigApplier & conf,
3037 unsigned instance_no,
3038 bool & real_time)
3039 {
3040 real_time = conf.do_get_realtime_send(instance_no);
3041 }
3042
3043 static void
3044 yield_rt_break(NdbThread *thread,
3045 enum ThreadTypes type,
3046 bool real_time)
3047 {
3048 Configuration * conf = globalEmulatorData.theConfiguration;
3049 conf->setRealtimeScheduler(thread,
3050 type,
3051 FALSE,
3052 FALSE);
3053 conf->setRealtimeScheduler(thread,
3054 type,
3055 real_time,
3056 FALSE);
3057 }
3058
3059 static void
3060 check_real_time_break(NDB_TICKS now,
3061 NDB_TICKS *yield_time,
3062 NdbThread *thread,
3063 enum ThreadTypes type)
3064 {
3065 if (unlikely(NdbTick_Compare(now, *yield_time) < 0))
3066 {
3067 /**
3068 * Timer was adjusted backwards, or the monotonic timer implementation
3069 * on this platform is unstable. Best we can do is to restart
3070 * RT-yield timers from new current time.
3071 */
3072 *yield_time = now;
3073 }
3074
3075 const Uint64 micros_passed =
3076 NdbTick_Elapsed(*yield_time, now).microSec();
3077
3078 if (micros_passed > 50000)
3079 {
3080 /**
3081 * Lower scheduling prio to time-sharing mode to ensure that
3082 * other threads and processes gets a chance to be scheduled
3083 * if we run for an extended time.
3084 */
3085 yield_rt_break(thread, type, TRUE);
3086 *yield_time = now;
3087 }
3088 }
3089
3090 #define NUM_WAITS_TO_CHECK_SPINTIME 6
3091 static void
3092 wait_time_tracking(thr_data *selfptr, Uint64 wait_time_in_us)
3093 {
3094 for (Uint32 i = 0; i < NUM_SPIN_INTERVALS; i++)
3095 {
3096 if (wait_time_in_us <= selfptr->m_spin_stat.m_spin_interval[i])
3097 {
3098 selfptr->m_spin_stat.m_micros_sleep_times[i]++;
3099 selfptr->m_spin_stat.m_num_waits++;
3100 if (unlikely(selfptr->m_spintime == 0 &&
3101 selfptr->m_conf_spintime != 0 &&
3102 selfptr->m_spin_stat.m_num_waits == NUM_WAITS_TO_CHECK_SPINTIME))
3103 {
3104 /**
3105        * React quickly to changes in the environment: if we don't have
3106        * spinning activated and have already seen NUM_WAITS_TO_CHECK_SPINTIME
3107        * wait times, there is a good chance that spinning is a good idea now.
3108        * So invoke a check of whether we should activate spinning now.
3109 */
3110 SimulatedBlock *b = globalData.getBlock(THRMAN, selfptr->m_thr_no + 1);
3111 ((Thrman*)b)->check_spintime(false);
3112 }
3113 return;
3114 }
3115 }
3116 require(false);
3117 }
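/**
 * Illustrative example of the bucketing above (the interval boundaries
 * are assumed values, the real ones live in m_spin_interval[]): with
 * boundaries { 10, 25, 50, 100, ... } us, a wait of 18 us increments
 * m_micros_sleep_times[1] and m_num_waits, since 25 is the first
 * boundary that is >= 18. A wait longer than the last boundary would
 * hit the require(false) at the end of wait_time_tracking() above, so
 * the last interval is expected to be an upper bound for all observed
 * waits.
 */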
3118
3119 static bool check_queues_empty(thr_data *selfptr);
3120 static Uint32 scan_time_queues(struct thr_data* selfptr, NDB_TICKS now);
3121 static bool do_send(struct thr_data* selfptr,
3122 bool must_send,
3123 bool assist_send);
3124 /**
3125 * We call this function only after executing no jobs and thus it is
3126 * safe to spin for a short time.
3127 */
3128 static bool
3129 check_yield(thr_data *selfptr,
3130 Uint64 min_spin_timer, //microseconds
3131 Uint32 *spin_time_in_us,
3132 NDB_TICKS start_spin_ticks)
3133 {
3134 NDB_TICKS now;
3135 bool cont_flag = true;
3136 do
3137 {
3138 for (Uint32 i = 0; i < 50; i++)
3139 {
3140 /**
3141       * For around 50 us we only check whether the JBA and JBB
3142       * queues have become non-empty. This happens when another thread or
3143       * the receive thread sends a signal to this thread.
3144 */
3145 NdbSpin();
3146 if (!check_queues_empty(selfptr))
3147 {
3148 /* Found jobs to execute, successful spin */
3149 cont_flag = false;
3150 now = NdbTick_getCurrentTicks();
3151 break;
3152 }
3153 /* Check if we have done enough spinning once per 3 us */
3154 if ((i & 3) == 3)
3155 continue;
3156 now = NdbTick_getCurrentTicks();
3157 Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3158 if (spin_micros > min_spin_timer)
3159 {
3160 /**
3161 * We have spun for the required time, but to no avail, there was no
3162 * work to do, so it is now time to yield and go to sleep.
3163 */
3164 *spin_time_in_us = spin_micros;
3165 selfptr->m_curr_ticks = now;
3166 selfptr->m_spin_stat.m_sleep_longer_spin_time++;
3167 selfptr->m_measured_spintime += spin_micros;
3168 return true;
3169 }
3170 }
3171 if (!cont_flag)
3172 break;
3173 /**
3174 * Every 50 us we also scan time queues to see if any delayed signals
3175 * need to be delivered. After checking if this generates any new
3176 * messages we also check if we have completed spinning for this
3177 * time.
3178 */
3179 const Uint32 lagging_timers = scan_time_queues(selfptr, now);
3180 if (lagging_timers != 0 ||
3181 !check_queues_empty(selfptr))
3182 {
3183 /* Found jobs to execute, successful spin */
3184 cont_flag = false;
3185 break;
3186 }
3187 } while (cont_flag);
3188 /**
3189 * Successful spinning, we will record spinning time. We will also record
3190 * the number of micros that this has saved. This is a static number based
3191 * on experience. We use measurements from virtual machines where we gain
3192 * the time it would take to go to sleep and wakeup again. This is roughly
3193 * 25 microseconds.
3194 *
3195 * This is the positive part of spinning where we gained something through
3196 * spinning.
3197 */
3198 Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3199 selfptr->m_curr_ticks = now;
3200 selfptr->m_measured_spintime += spin_micros;
3201 selfptr->m_spin_stat.m_sleep_shorter_spin_time++;
3202 selfptr->m_micros_sleep += spin_micros;
3203 wait_time_tracking(selfptr, spin_micros);
3204 return false;
3205 }
3206
3207 /**
3208 * We call this function only after executing no jobs and thus it is
3209 * safe to spin for a short time.
3210 */
3211 static bool
3212 check_recv_yield(thr_data *selfptr,
3213 TransporterReceiveHandle & recvdata,
3214 Uint64 min_spin_timer, //microseconds
3215 Uint32 & num_events,
3216 Uint32 *spin_time_in_us,
3217 NDB_TICKS start_spin_ticks)
3218 {
3219 NDB_TICKS now;
3220 bool cont_flag = true;
3221 do
3222 {
3223 for (Uint32 i = 0; i < 60; i++)
3224 {
3225 /**
3226       * For around 50 us we only check whether the JBA and JBB
3227       * queues have become non-empty. This happens when another thread or
3228       * the receive thread sends a signal to this thread.
3229 */
3230 NdbSpin();
3231 if ((!check_queues_empty(selfptr)) ||
3232 ((num_events =
3233 globalTransporterRegistry.pollReceive(0, recvdata)) > 0))
3234 {
3235 /* Found jobs to execute, successful spin */
3236 cont_flag = false;
3237 now = NdbTick_getCurrentTicks();
3238 break;
3239 }
3240 /* Check if we have done enough spinning once per 3 us */
3241 if ((i & 3) == 3)
3242 continue;
3243 /* Check if we have done enough spinning */
3244 now = NdbTick_getCurrentTicks();
3245 Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3246 if (spin_micros > min_spin_timer)
3247 {
3248 /**
3249 * We have spun for the required time, but to no avail, there was no
3250 * work to do, so it is now time to yield and go to sleep.
3251 */
3252 selfptr->m_measured_spintime += spin_micros;
3253 selfptr->m_spin_stat.m_sleep_longer_spin_time++;
3254 return true;
3255 }
3256 }
3257 if (!cont_flag)
3258 break;
3259 /**
3260 * Every 50 us we also scan time queues to see if any delayed signals
3261 * need to be delivered. After checking if this generates any new
3262 * messages we also check if we have completed spinning for this
3263 * time.
3264 */
3265 const Uint32 lagging_timers = scan_time_queues(selfptr, now);
3266 if (lagging_timers != 0 ||
3267 !check_queues_empty(selfptr))
3268 {
3269 /* Found jobs to execute, successful spin */
3270 cont_flag = false;
3271 break;
3272 }
3273 } while (cont_flag);
3274 /**
3275 * Successful spinning, we will record spinning time. We will also record
3276 * the number of micros that this has saved. This is a static number based
3277 * on experience. We use measurements from virtual machines where we gain
3278 * the time it would take to go to sleep and wakeup again. This is roughly
3279 * 25 microseconds.
3280 *
3281 * This is the positive part of spinning where we gained something through
3282 * spinning.
3283 */
3284 Uint64 spin_micros = NdbTick_Elapsed(start_spin_ticks, now).microSec();
3285 selfptr->m_measured_spintime += spin_micros;
3286 selfptr->m_spin_stat.m_sleep_shorter_spin_time++;
3287 selfptr->m_micros_sleep += spin_micros;
3288 wait_time_tracking(selfptr, spin_micros);
3289 return false;
3290 }
3291
3292 /**
3293  * We enter this function without holding the send_thread_mutex; it is
3294  * taken inside and we leave no longer holding the mutex.
3295 */
3296 bool
3297 thr_send_threads::assist_send_thread(Uint32 max_num_trps,
3298 Uint32 thr_no,
3299 NDB_TICKS now,
3300 Uint32 &watchdog_counter,
3301 struct thr_send_thread_instance *send_instance,
3302 class thread_local_pool<thr_send_page> & send_buffer_pool)
3303 {
3304 Uint32 num_trps_sent = 0;
3305 Uint32 loop = 0;
3306 NDB_TICKS spin_ticks_dummy;
3307 TrpId trp_id = 0;
3308
3309 NdbMutex_Lock(send_instance->send_thread_mutex);
3310
3311 while (globalData.theRestartFlag != perform_stop &&
3312 loop < max_num_trps &&
3313 (trp_id = get_trp(NO_SEND_THREAD, now, send_instance)) != 0)
3314 // PENDING -> ACTIVE
3315 {
3316 if (!handle_send_trp(trp_id,
3317 num_trps_sent,
3318 thr_no,
3319 now,
3320 watchdog_counter,
3321 send_instance))
3322 {
3323 /**
3324 * Neighbour trps are locked through setting
3325 * m_trp_state[id].m_thr_no_sender to thr_no while holding
3326 * the mutex. This flag is set between start of send and end
3327       * of send. In this case there was no send so the flag isn't
3328       * set now; since we insert the trp back immediately it will simply
3329       * remain unset. We assert on this just in case.
3330 *
3331       * Only transporters waiting for their delay to expire were waiting to
3332       * send; we skip sending in this case and leave it for the send
3333       * thread to handle. There is no reason to set pending_send to true since
3334       * there is no hurry to send (hence setting trp_id = 0 below).
3335 */
3336 assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3337 insert_trp(trp_id, send_instance);
3338 trp_id = 0;
3339 break;
3340 }
3341
3342 watchdog_counter = 3;
3343 send_buffer_pool.release_global(g_thr_repository->m_mm,
3344 RG_TRANSPORTER_BUFFERS,
3345 send_instance->m_instance_no);
3346
3347 loop++;
3348 }
3349 if (trp_id == 0)
3350 {
3351 NdbMutex_Unlock(send_instance->send_thread_mutex);
3352 return false;
3353 }
3354 /**
3355 * There is more work to do, keep pending_send flag to true such
3356 * that we will quickly work off the queue of send tasks available.
3357 */
3358 bool pending_send = send_instance->check_pending_data();
3359 NdbMutex_Unlock(send_instance->send_thread_mutex);
3360 return pending_send;
3361 }
3362
3363 /**
3364 * We hold the send_thread_mutex of the send_instance when we
3365 * enter this function.
3366 */
3367 bool
3368 thr_send_threads::handle_send_trp(TrpId trp_id,
3369 Uint32 & num_trps_sent,
3370 Uint32 thr_no,
3371 NDB_TICKS & now,
3372 Uint32 & watchdog_counter,
3373 struct thr_send_thread_instance *send_instance)
3374 {
3375 assert(send_instance == get_send_thread_instance_by_trp(trp_id));
3376 assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3377 if (m_trp_state[trp_id].m_micros_delayed > 0) // Trp send is delayed
3378 {
3379 /**
3380 * The only transporter ready for send was a transporter that still
3381 * required waiting. We will only send if we have enough data to
3382 * send without delay.
3383 */
3384 if (m_trp_state[trp_id].m_send_overload) // Pause overloaded trp
3385 {
3386 return false;
3387 }
3388
3389 if (mt_get_send_buffer_bytes(trp_id) >= MAX_SEND_BUFFER_SIZE_TO_DELAY)
3390 set_max_delay(trp_id, now, 0); // Large packet -> Send now
3391 else // Sleep, let last awake send
3392 {
3393 if (thr_no >= glob_num_threads)
3394 {
3395 /**
3396 * When encountering max_send_delay from send thread we
3397 * will let the send thread go to sleep for as long as
3398 * this trp has to wait (it is the shortest sleep we
3399 * we have. For non-send threads the trp will simply
3400 * be reinserted and someone will pick up later to handle
3401 * things.
3402 *
3403 * At this point in time there are no transporters ready to
3404 * send, they all are waiting for the delay to expire.
3405 */
3406 send_instance->m_more_trps = false;
3407 }
3408 return false;
3409 }
3410 }
3411
3412 /**
3413    * Multiple send threads cannot 'get' the same
3414    * trp simultaneously. Thus, we do not need
3415    * to keep the global send thread mutex any longer.
3416    * This also avoids worker threads blocking on us in
3417    * ::alert_send_thread.
3418 */
3419 #ifdef VM_TRACE
3420 my_thread_yield();
3421 #endif
3422 assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3423 m_trp_state[trp_id].m_thr_no_sender = thr_no;
3424 NdbMutex_Unlock(send_instance->send_thread_mutex);
3425
3426 watchdog_counter = 6;
3427
3428 /**
3429 * Need a lock on the send buffers to protect against
3430 * worker thread doing ::forceSend, possibly
3431 * disable_send_buffers() and/or lock_/unlock_transporter().
3432 * To avoid a livelock with ::forceSend() on an overloaded
3433 * systems, we 'try-lock', and reinsert the trp for
3434 * later retry if failed.
3435 *
3436 * To ensure that the combination of more == true &&
3437 * bytes_sent == 0 can be used to signal that the
3438 * transporter is overloaded, we initialise bytes_sent to 1 to avoid
3439 * interpreting a try_lock failure as if it was an overloaded
3440 * transporter. This is a fix for BUG#22393612.
3441 */
3442 bool more = true;
3443 Uint32 bytes_sent = 1;
3444 #ifdef VM_TRACE
3445 my_thread_yield();
3446 #endif
3447 if (likely(trylock_send_trp(trp_id) == 0))
3448 {
3449 more = perform_send(trp_id, thr_no, bytes_sent);
3450 /* We return with no locks or mutexes held */
3451 }
3452
3453 /**
3454 * Note that we do not yet return any send_buffers to the
3455 * global pool: handle_send_trp() may be called from either
3456 * a send-thread, or a worker-thread doing 'assist send'.
3457    * These have different policies for releasing send_buffers,
3458 * which should be handled by the respective callers.
3459 * (release_chunk() or release_global())
3460 *
3461 * Either own perform_send() processing, or external 'alert'
3462 * could have signaled that there are more sends pending.
3463    * If we made no progress in perform_send, we conclude that the
3464    * trp is overloaded, and take a break from further send
3465    * attempts to that trp. A failure of trylock_send_trp
3466    * will also result in 'overload' being concluded.
3467 * (Quite reasonable as the worker thread is likely forceSend'ing)
3468 */
3469 now = NdbTick_getCurrentTicks();
3470
3471 NdbMutex_Lock(send_instance->send_thread_mutex);
3472 #ifdef VM_TRACE
3473 my_thread_yield();
3474 #endif
3475 assert(m_trp_state[trp_id].m_thr_no_sender == thr_no);
3476 m_trp_state[trp_id].m_thr_no_sender = NO_OWNER_THREAD;
3477 if (more || // ACTIVE -> PENDING
3478 !check_done_trp(trp_id)) // ACTIVE-P -> PENDING
3479 {
3480 insert_trp(trp_id, send_instance);
3481
3482 if (unlikely(more && bytes_sent == 0)) //Trp is overloaded
3483 {
3484 set_overload_delay(trp_id, now, 200);//Delay send-retry by 200 us
3485 }
3486 } // ACTIVE -> IDLE
3487 else
3488 {
3489 num_trps_sent++;
3490 }
3491 return true;
3492 }
3493
3494 void
3495 thr_send_threads::update_rusage(
3496 struct thr_send_thread_instance *this_send_thread,
3497 Uint64 elapsed_time)
3498 {
3499 struct ndb_rusage rusage;
3500
3501 int res = Ndb_GetRUsage(&rusage, false);
3502 if (res != 0)
3503 {
3504 this_send_thread->m_user_time_os = 0;
3505 this_send_thread->m_kernel_time_os = 0;
3506 this_send_thread->m_elapsed_time_os = 0;
3507 return;
3508 }
3509 this_send_thread->m_user_time_os = rusage.ru_utime;
3510 this_send_thread->m_kernel_time_os = rusage.ru_stime;
3511 this_send_thread->m_elapsed_time_os = elapsed_time;
3512 }
3513
3514 /**
3515  * There are some send scheduling algorithms built into the send thread.
3516 * Mainly implemented as part of ::run_send_thread, thus commented here:
3517 *
3518 * We have the possibility to set a 'send delay' for each trp. This
3519 * is used both for handling send overload where we should wait
3520 * before retrying, and as an aid for collecting smaller packets into
3521  * larger, and thus fewer, packets, thereby decreasing the send
3522  * overhead on a highly loaded system.
3523  *
3524  * A delay due to overload is always waited for. As there is already
3525  * queued-up send work in the buffers, sending will be possible
3526  * without the send thread actively busy-retrying. However, delays
3527  * intended to increase the packet size can be ignored.
3528  *
3529  * The basic idea of the latter is the following:
3530 * By introducing a delay we ensure that all block threads have
3531 * gotten a chance to execute messages that will generate data
3532 * to be sent to trps. This is particularly helpful in e.g.
3533 * queries that are scanning a table. Here a SCAN_TABREQ is
3534 * received in a TC and this generates a number of SCAN_FRAGREQ
3535 * signals to each LDM, each of those LDMs will in turn generate
3536 * a number of new signals that are all destined to the same
3537 * trp. So this delay here increases the chance that those
3538 * signals can be sent in the same TCP/IP packet over the wire.
3539 *
3540 * Another use case is applications using the asynchronous API
3541 * and thus sending many PK lookups that traverse a trp in
3542 * parallel from the same destination trp. These can benefit
3543 * greatly from this extra delay increasing the packet sizes.
3544 *
3545 * There is also a case when sending many updates that need to
3546 * be sent to the other trp in the same node group. By delaying
3547 * the send of this data we ensure that the receiver thread on
3548 * the other end is getting larger packet sizes and thus we
3549 * improve the throughput of the system in all sorts of ways.
3550 *
3551 * However we also try to ensure that we don't delay signals in
3552 * an idle system where response time is more important than
3553 * the throughput. This is achieved by the fact that we will
3554 * send after looping through the trps ready to send to. In
3555 * an idle system this will be a quick operation. In a loaded
3556 * system this delay can be fairly substantial on the other
3557 * hand.
3558 *
3559 * Finally we attempt to limit the use of more than one send
3560 * thread to cases of very high load. So if there are only
3561 * delayed trp sends remaining, we deduce that the
3562 * system is lightly loaded and we will go to sleep if there
3563 * are other send threads also awake.
3564 */
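/**
 * A minimal sketch of the kind of delay decision described above
 * (hypothetical helper for illustration only; the real logic lives in
 * get_trp(), set_overload_delay() and related code below):
 *
 *   static Uint32 choose_send_delay(bool overloaded,
 *                                   Uint64 queued_bytes,
 *                                   bool other_send_threads_awake)
 *   {
 *     if (overloaded)
 *       return 200;   // wait out the overload (microseconds)
 *     if (queued_bytes < 16 * 1024 && other_send_threads_awake)
 *       return 40;    // small payload: let block threads add more data
 *     return 0;       // send immediately
 *   }
 */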
3565 void
3566 thr_send_threads::run_send_thread(Uint32 instance_no)
3567 {
3568 struct thr_send_thread_instance *this_send_thread =
3569 &m_send_threads[instance_no];
3570 const Uint32 thr_no = glob_num_threads + instance_no;
3571
3572 {
3573 /**
3574 * Wait for thread object to be visible
3575 */
3576 while(this_send_thread->m_thread == 0)
3577 NdbSleep_MilliSleep(30);
3578 }
3579
3580 {
3581 /**
3582 * Print out information about starting thread
3583 * (number, tid, name, the CPU it's locked into (if locked at all))
3584 * Also perform the CPU locking.
3585 */
3586 BaseString tmp;
3587 bool fail = false;
3588 THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
3589 tmp.appfmt("thr: %u ", thr_no);
3590 int tid = NdbThread_GetTid(this_send_thread->m_thread);
3591 if (tid != -1)
3592 {
3593 tmp.appfmt("tid: %u ", tid);
3594 }
3595 conf.appendInfoSendThread(tmp, instance_no);
3596 int res = conf.do_bind_send(this_send_thread->m_thread,
3597 instance_no);
3598 if (res < 0)
3599 {
3600 fail = true;
3601 tmp.appfmt("err: %d ", -res);
3602 }
3603 else if (res > 0)
3604 {
3605 tmp.appfmt("OK ");
3606 }
3607
3608 unsigned thread_prio;
3609 res = conf.do_thread_prio_send(this_send_thread->m_thread,
3610 instance_no,
3611 thread_prio);
3612 if (res < 0)
3613 {
3614 fail = true;
3615 res = -res;
3616 tmp.appfmt("Failed to set thread prio to %u, ", thread_prio);
3617 if (res == SET_THREAD_PRIO_NOT_SUPPORTED_ERROR)
3618 {
3619 tmp.appfmt("not supported on this OS");
3620 }
3621 else
3622 {
3623 tmp.appfmt("error: %d", res);
3624 }
3625 }
3626 else if (res > 0)
3627 {
3628 tmp.appfmt("Successfully set thread prio to %u ", thread_prio);
3629 }
3630
3631 printf("%s\n", tmp.c_str());
3632 fflush(stdout);
3633 if (fail)
3634 {
3635 abort();
3636 }
3637 }
3638
3639 /**
3640 * register watchdog
3641 */
3642 globalEmulatorData.theWatchDog->
3643 registerWatchedThread(&this_send_thread->m_watchdog_counter, thr_no);
3644
3645 NdbMutex_Lock(this_send_thread->send_thread_mutex);
3646 this_send_thread->m_awake = FALSE;
3647 NdbMutex_Unlock(this_send_thread->send_thread_mutex);
3648
3649 NDB_TICKS yield_ticks;
3650 bool real_time = false;
3651
3652 yield_ticks = NdbTick_getCurrentTicks();
3653 THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
3654 update_send_sched_config(conf, instance_no, real_time);
3655
3656 TrpId trp_id = 0;
3657 Uint64 micros_sleep = 0;
3658 NDB_TICKS last_now = NdbTick_getCurrentTicks();
3659 NDB_TICKS last_rusage = last_now;
3660 NDB_TICKS first_now = last_now;
3661
3662 while (globalData.theRestartFlag != perform_stop)
3663 {
3664 this_send_thread->m_watchdog_counter = 19;
3665
3666 NDB_TICKS now = NdbTick_getCurrentTicks();
3667 Uint64 sleep_time = micros_sleep;
3668 Uint64 exec_time = NdbTick_Elapsed(last_now, now).microSec();
3669 Uint64 time_since_update_rusage =
3670 NdbTick_Elapsed(last_rusage, now).microSec();
3671 /**
3672 * At this moment exec_time is the elapsed time since we were
3673 * last here. Now subtract the time we spent sleeping, so that
3674 * exec_time + sleep_time always adds up to the total
3675 * elapsed time.
3676 */
3677 exec_time -= sleep_time;
3678 last_now = now;
3679 micros_sleep = 0;
3680 if (time_since_update_rusage > Uint64(50 * 1000))
3681 {
3682 Uint64 elapsed_time = NdbTick_Elapsed(first_now, now).microSec();
3683 last_rusage = last_now;
3684 NdbMutex_Lock(this_send_thread->send_thread_mutex);
3685 update_rusage(this_send_thread, elapsed_time);
3686 }
3687 else
3688 {
3689 NdbMutex_Lock(this_send_thread->send_thread_mutex);
3690 }
3691 this_send_thread->m_exec_time += exec_time;
3692 this_send_thread->m_sleep_time += sleep_time;
3693 this_send_thread->m_awake = TRUE;
3694
3695 /**
3696 * If we waited for a specific transporter, reinsert it such that
3697 * it can be re-evaluated for send by get_trp().
3698 *
3699 * This happens when handle_send_trp returns false because the
3700 * only transporter ready for execution was a transporter that was
3701 * still waiting for its delay to expire, and no other condition
3702 * allowed it to be sent.
3703 */
3704 if (trp_id != 0)
3705 {
3706 /**
3707 * The trp was locked during our sleep. We now release the
3708 * lock such that we can acquire it again after
3709 * a short sleep. For non-neighbour trps the insert_trp is
3710 * sufficient. For neighbour trps we need to ensure that
3711 * m_trp_state[trp_id].m_thr_no_sender is set to NO_OWNER_THREAD
3712 * since this is how the lock on those trps
3713 * is released.
3714 */
3715 assert(m_trp_state[trp_id].m_thr_no_sender == thr_no);
3716 m_trp_state[trp_id].m_thr_no_sender = NO_OWNER_THREAD;
3717 insert_trp(trp_id, this_send_thread);
3718 trp_id = 0;
3719 }
3720 while (globalData.theRestartFlag != perform_stop &&
3721 (trp_id = get_trp(instance_no, now, this_send_thread)) != 0)
3722 // PENDING -> ACTIVE
3723 {
3724 Uint32 num_trps_sent_dummy;
3725 if (!handle_send_trp(trp_id,
3726 num_trps_sent_dummy,
3727 thr_no,
3728 now,
3729 this_send_thread->m_watchdog_counter,
3730 this_send_thread))
3731 {
3732 /**
3733 * Neighbour trps are not locked by get_trp and insert_trp.
3734 * They are locked by setting
3735 * m_trp_state[trp_id].m_thr_no_sender to thr_no.
3736 * Here handle_send_trp returned false since we were
3737 * not allowed to send to the trp at this time. We want to keep
3738 * the lock on the trp as get_trp does for non-neighbour trps, so
3739 * we set this flag to retain the lock even after we release the mutex.
3740 * We also use asserts to ensure the state transitions are ok.
3741 *
3742 * The transporter is reinserted into the list of transporters
3743 * ready to transmit above in the code, since trp_id != 0 when we
3744 * return after the sleep.
3745 */
3746 assert(m_trp_state[trp_id].m_thr_no_sender == NO_OWNER_THREAD);
3747 m_trp_state[trp_id].m_thr_no_sender = thr_no;
3748 break;
3749 }
3750
3751 /* Release chunk-wise to decrease pressure on lock */
3752 this_send_thread->m_watchdog_counter = 3;
3753 this_send_thread->m_send_buffer_pool.release_chunk(
3754 g_thr_repository->m_mm,
3755 RG_TRANSPORTER_BUFFERS,
3756 instance_no);
3757
3758 /**
3759 * We set trp_id = 0 for the very rare case where theRestartFlag is set
3760 * to perform_stop. We should never need this, but add it just in
3761 * case.
3762 */
3763 trp_id = 0;
3764 } // while (get_trp()...)
3765
3766 /* No more trps having data to send right now, prepare to sleep */
3767 this_send_thread->m_awake = FALSE;
3768 const Uint32 trp_wait = (trp_id != 0) ?
3769 m_trp_state[trp_id].m_micros_delayed : 0;
3770 NdbMutex_Unlock(this_send_thread->send_thread_mutex);
3771
3772
3773 if (real_time)
3774 {
3775 check_real_time_break(now,
3776 &yield_ticks,
3777 this_send_thread->m_thread,
3778 SendThread);
3779 }
3780
3781
3782 /**
3783 * The send thread is by definition a throughput-supporting thread.
3784 * In situations where latency is at risk, the sending
3785 * is instead performed by the block threads. Thus there is no reason
3786 * to perform any spinning in the send thread, and we ignore the
3787 * spin timer for send threads.
3788 */
3789 {
3790 Uint32 max_wait_nsec;
3791 /**
3792 * We sleep a max time, possibly waiting for a specific trp
3793 * with delayed send (overloaded, or waiting for more payload).
3794 * (Will be alerted to start working when more send work arrives)
3795 */
3796 if (trp_wait == 0)
3797 {
3798 // 50ms, has to wake up before the 100ms watchdog alert.
3799 max_wait_nsec = 50*1000*1000;
3800 }
3801 else
3802 {
3803 max_wait_nsec = trp_wait * 1000;
3804 }
3805 NDB_TICKS before = NdbTick_getCurrentTicks();
3806 bool waited = yield(&this_send_thread->m_waiter_struct,
3807 max_wait_nsec,
3808 check_available_send_data,
3809 this_send_thread);
3810 if (waited)
3811 {
3812 NDB_TICKS after = NdbTick_getCurrentTicks();
3813 micros_sleep += NdbTick_Elapsed(before, after).microSec();
3814 }
3815 }
3816 }
3817
3818 globalEmulatorData.theWatchDog->unregisterWatchedThread(thr_no);
3819 }
3820
3821 #if 0
3822 static
3823 Uint32
3824 fifo_used_pages(struct thr_data* selfptr)
3825 {
3826 return calc_fifo_used(selfptr->m_first_unused,
3827 selfptr->m_first_free,
3828 THR_FREE_BUF_MAX);
3829 }
3830 #endif
3831
3832 ATTRIBUTE_NOINLINE
3833 static
3834 void
3835 job_buffer_full(struct thr_data* selfptr)
3836 {
3837 ndbout_c("job buffer full");
3838 dumpJobQueues();
3839 abort();
3840 }
3841
3842 ATTRIBUTE_NOINLINE
3843 static
3844 void
3845 out_of_job_buffer(struct thr_data* selfptr)
3846 {
3847 ndbout_c("out of job buffer");
3848 dumpJobQueues();
3849 abort();
3850 }
3851
3852 static
3853 thr_job_buffer*
3854 seize_buffer(struct thr_repository* rep, int thr_no, bool prioa)
3855 {
3856 thr_job_buffer* jb;
3857 struct thr_data* selfptr = &rep->m_thread[thr_no];
3858 Uint32 first_free = selfptr->m_first_free;
3859 Uint32 first_unused = selfptr->m_first_unused;
3860
3861 /*
3862 * An empty FIFO is denoted by m_first_free == m_first_unused.
3863 * So we will never have a completely full FIFO array; at least one entry will
3864 * always be unused. But the code is simpler as a result.
3865 */
3866
3867 /*
3868 * We never allow the fifo to become completely empty, as we want to have
3869 * a good number of signals available for trace files in case of a forced
3870 * shutdown.
3871 */
3872 Uint32 buffers = (first_free > first_unused ?
3873 first_unused + THR_FREE_BUF_MAX - first_free :
3874 first_unused - first_free);
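  /*
   * Example of the ring arithmetic above (THR_FREE_BUF_MAX = 32 used purely
   * as an illustrative value): first_free = 30, first_unused = 2 gives
   * buffers = 2 + 32 - 30 = 4; first_free = 3, first_unused = 10 gives
   * buffers = 10 - 3 = 7.
   */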
3875 if (unlikely(buffers <= THR_FREE_BUF_MIN))
3876 {
3877 /*
3878 * All used, allocate another batch from global pool.
3879 *
3880 * Put the new buffers at the head of the fifo, so as not to needlessly
3881 * push out any existing buffers from the fifo (that would lose useful
3882 * data for signal dumps in trace files).
3883 */
3884 Uint32 cnt = 0;
3885 Uint32 batch = THR_FREE_BUF_MAX / THR_FREE_BUF_BATCH;
3886 assert(batch > 0);
3887 assert(batch + THR_FREE_BUF_MIN < THR_FREE_BUF_MAX);
3888 do {
3889 jb = rep->m_jb_pool.seize(rep->m_mm,
3890 RG_JOBBUFFER);
3891 if (unlikely(jb == 0))
3892 {
3893 if (unlikely(cnt == 0))
3894 {
3895 out_of_job_buffer(selfptr);
3896 }
3897 break;
3898 }
3899 jb->m_len = 0;
3900 jb->m_prioa = false;
3901 first_free = (first_free ? first_free : THR_FREE_BUF_MAX) - 1;
3902 selfptr->m_free_fifo[first_free] = jb;
3903 batch--;
3904 } while (cnt < batch);
3905 selfptr->m_first_free = first_free;
3906 }
3907
3908 jb= selfptr->m_free_fifo[first_free];
3909 selfptr->m_first_free = (first_free + 1) % THR_FREE_BUF_MAX;
3910 /* Init here rather than in release_buffer() so signal dump will work. */
3911 jb->m_len = 0;
3912 jb->m_prioa = prioa;
3913 return jb;
3914 }
3915
3916 static
3917 void
3918 release_buffer(struct thr_repository* rep, int thr_no, thr_job_buffer* jb)
3919 {
3920 struct thr_data* selfptr = &rep->m_thread[thr_no];
3921 Uint32 first_free = selfptr->m_first_free;
3922 Uint32 first_unused = selfptr->m_first_unused;
3923
3924 /*
3925 * Pack near-empty signals, to get more info in the signal traces.
3926 *
3927 * This is not currently used, as we only release full job buffers, hence
3928 * the #if 0.
3929 */
3930 #if 0
3931 Uint32 last_free = (first_unused ? first_unused : THR_FREE_BUF_MAX) - 1;
3932 thr_job_buffer *last_jb = selfptr->m_free_fifo[last_free];
3933 Uint32 len1, len2;
3934
3935 if (!jb->m_prioa &&
3936 first_free != first_unused &&
3937 !last_jb->m_prioa &&
3938 (len2 = jb->m_len) <= (thr_job_buffer::SIZE / 4) &&
3939 (len1 = last_jb->m_len) + len2 <= thr_job_buffer::SIZE)
3940 {
3941 /*
3942 * The buffer being released is fairly empty, and what data it contains fits
3943 * in the previously released buffer.
3944 *
3945 * We want to avoid too many almost-empty buffers in the free fifo, as that
3946 * makes signal traces less useful due to too little data available. So in
3947 * this case we move the data from the buffer to be released into the
3948 * previous buffer, and place the to-be-released buffer at the head of the
3949 * fifo (to be immediately reused).
3950 *
3951 * This is only done for prio B buffers, as we must not merge prio A and B
3952 * data (or dumps would be incorrect), and prio A buffers are in any case
3953 * full when released.
3954 */
3955 memcpy(last_jb->m_data + len1, jb->m_data, len2*sizeof(jb->m_data[0]));
3956 last_jb->m_len = len1 + len2;
3957 jb->m_len = 0;
3958 first_free = (first_free ? first_free : THR_FREE_BUF_MAX) - 1;
3959 selfptr->m_free_fifo[first_free] = jb;
3960 selfptr->m_first_free = first_free;
3961 }
3962 else
3963 #endif
3964 {
3965 /* Just insert at the end of the fifo. */
3966 selfptr->m_free_fifo[first_unused] = jb;
3967 first_unused = (first_unused + 1) % THR_FREE_BUF_MAX;
3968 selfptr->m_first_unused = first_unused;
3969 }
3970
3971 if (unlikely(first_unused == first_free))
3972 {
3973 /* FIFO full, need to release to global pool. */
3974 Uint32 batch = THR_FREE_BUF_MAX / THR_FREE_BUF_BATCH;
3975 assert(batch > 0);
3976 assert(batch < THR_FREE_BUF_MAX);
3977 do {
3978 rep->m_jb_pool.release(rep->m_mm,
3979 RG_JOBBUFFER,
3980 selfptr->m_free_fifo[first_free]);
3981 first_free = (first_free + 1) % THR_FREE_BUF_MAX;
3982 batch--;
3983 } while (batch > 0);
3984 selfptr->m_first_free = first_free;
3985 }
3986 }
3987
3988 static
3989 inline
3990 Uint32
3991 scan_queue(struct thr_data* selfptr, Uint32 cnt, Uint32 end, Uint32* ptr)
3992 {
3993 Uint32 thr_no = selfptr->m_thr_no;
3994 Uint32 **pages = selfptr->m_tq.m_delayed_signals;
3995 Uint32 free = selfptr->m_tq.m_next_free;
3996 Uint32* save = ptr;
3997 for (Uint32 i = 0; i < cnt; i++, ptr++)
3998 {
3999 Uint32 val = * ptr;
4000 if ((val & 0xFFFF) <= end)
4001 {
4002 Uint32 idx = val >> 16;
4003 Uint32 buf = idx >> 8;
4004 Uint32 pos = MAX_SIGNAL_SIZE * (idx & 0xFF);
4005
4006 Uint32* page = * (pages + buf);
4007
4008 const SignalHeader *s = reinterpret_cast<SignalHeader*>(page + pos);
4009 const Uint32 *data = page + pos + (sizeof(*s)>>2);
4010 if (0)
4011 ndbout_c("found %p val: %d end: %d", s, val & 0xFFFF, end);
4012 /*
4013 * ToDo: Do measurements of the frequency of these prio A timed signals.
4014 *
4015 * If they are frequent, we may want to optimize, as sending one prio A
4016 * signal is somewhat expensive compared to sending one prio B.
4017 */
4018 sendprioa(thr_no, s, data,
4019 data + s->theLength);
4020 * (page + pos) = free;
4021 free = idx;
4022 }
4023 else if (i > 0)
4024 {
4025 selfptr->m_tq.m_next_free = free;
4026 memmove(save, ptr, 4 * (cnt - i));
4027 return i;
4028 }
4029 else
4030 {
4031 return 0;
4032 }
4033 }
4034 selfptr->m_tq.m_next_free = free;
4035 return cnt;
4036 }
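/*
 * The packed time-queue entry decoded above is laid out as
 * (idx << 16) | (alarm & 0xFFFF), where idx in turn is
 * (buffer_number << 8) | slot_in_buffer (see senddelay() below).
 * As an illustration, the entry 0x0123007B refers to slot 0x23 in
 * delayed-signal page 0x01, expiring at time-queue tick 0x007B (123).
 */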
4037
4038 static
4039 void
4040 handle_time_wrap(struct thr_data* selfptr)
4041 {
4042 Uint32 i;
4043 struct thr_tq * tq = &selfptr->m_tq;
4044 Uint32 cnt0 = tq->m_cnt[0];
4045 Uint32 cnt1 = tq->m_cnt[1];
4046 Uint32 tmp0 = scan_queue(selfptr, cnt0, 32767, tq->m_short_queue);
4047 Uint32 tmp1 = scan_queue(selfptr, cnt1, 32767, tq->m_long_queue);
4048 cnt0 -= tmp0;
4049 cnt1 -= tmp1;
4050 tq->m_cnt[0] = cnt0;
4051 tq->m_cnt[1] = cnt1;
4052 for (i = 0; i<cnt0; i++)
4053 {
4054 assert((tq->m_short_queue[i] & 0xFFFF) > 32767);
4055 tq->m_short_queue[i] -= 32767;
4056 }
4057 for (i = 0; i<cnt1; i++)
4058 {
4059 assert((tq->m_long_queue[i] & 0xFFFF) > 32767);
4060 tq->m_long_queue[i] -= 32767;
4061 }
4062 }
4063
4064 /**
4065 * FUNCTION: scan_time_queues(), scan_time_queues_impl(),
4066 * scan_time_queues_backtick()
4067 *
4068 * scan_time_queues() implements the part we want to be inlined
4069 * into the scheduler loops, while *_impl() & *_backtick() are
4070 * the more unlikely parts we don't call unless the timer has
4071 * ticked backward, or forward more than 1ms, since the last scan.
4072 *
4073 * Check if any delayed signals have expired and should be sent now.
4074 * The time_queues will be checked every time we detect a change
4075 * in current time of >= 1ms. If idle we will sleep for at most 10ms
4076 * before rechecking the time_queue.
4077 *
4078 * However, some situations need special attention:
4079 * - Even if we prefer monotonic timers, they are not available, or
4080 *   implemented in our abstraction layer, for all platforms.
4081 *   A non-monotonic timer may leap when adjusted by the user, both
4082 *   forward and backwards.
4083 * - Early implementations of monotonic timers had bugs where time
4084 *   could jump. Similar problems have been reported for several VMs.
4085 * - There might be CPU contention or system swapping where we might
4086 *   sleep for significantly longer than 10ms, causing long forward
4087 *   leaps in perceived time.
4088 *
4089 * In order to adapt to this non-perfect clock behaviour, the
4090 * scheduler has its own 'm_ticks' which is the current time
4091 * as perceived by the scheduler. On entering this function, 'now'
4092 * is the 'real' current time fetched from NdbTick_getCurrentTime().
4093 * 'selfptr->m_ticks' is the previous tick seen by the scheduler,
4094 * and as such is the timestamp which reflects the current time
4095 * as seen by the timer queues.
4096 *
4097 * Normally only a few milliseconds will elapse between ticks,
4098 * as seen by the diff between 'now' and 'selfptr->m_ticks'.
4099 * However, if there are larger leaps in the current time,
4100 * we break this up into several small (20ms) steps
4101 * by gradually increasing the scheduler's 'm_ticks' time. This ensures
4102 * that delayed signals will arrive in correct relative order,
4103 * and repeated signals (pace signals) are received with
4104 * the expected frequency. However, each individual signal may
4105 * be delayed or arrive too fast. Where exact timing is critical,
4106 * these signals should do their own time calculation by reading
4107 * the clock, instead of trusting that the signal is delivered as
4108 * specified by the 'delay' argument.
4109 *
4110 * If there is a leap larger than 1500ms, we try a hybrid
4111 * solution by moving 'm_ticks' forward, close to the
4112 * actual current time, then continue as above from that
4113 * point in time. A 'time leap Warning' will also be printed
4114 * in the logs.
4115 */
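/**
 * As an illustration of the stepping described above (numbers are examples
 * only): if the scheduler wakes up 250ms late, the first call to
 * scan_time_queues_impl() advances m_ticks by 20ms and returns the
 * remaining lag (diff - step = 230), the next pass returns 210, and so on
 * until the backlog is consumed. A 2500ms leap first moves m_ticks forward
 * by diff - 1000 = 1500ms (and logs a warning), after which the same 20ms
 * stepping catches up the remaining ~1000ms.
 */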
4116 static
4117 Uint32
4118 scan_time_queues_impl(struct thr_data* selfptr,
4119 Uint32 diff,
4120 NDB_TICKS now)
4121 {
4122 NDB_TICKS last = selfptr->m_ticks;
4123 Uint32 step = diff;
4124
4125 if (unlikely(diff > 20)) // Break up into max 20ms steps
4126 {
4127 if (unlikely(diff > 1500)) // Time leaped more than 1500ms
4128 {
4129 /**
4130 * There was a long leap in the time since last checking
4131 * of the time_queues. The clock could have been adjusted, or we
4132 * are CPU starved. Anyway, we can never make up for the lost
4133 * CPU cycles, so we forget about them and start fresh from
4134 * a point in time 1000ms behind our current time.
4135 */
4136 struct ndb_rusage curr_rusage;
4137 Ndb_GetRUsage(&curr_rusage, false);
4138 if ((curr_rusage.ru_utime == 0 &&
4139 curr_rusage.ru_stime == 0) ||
4140 (selfptr->m_scan_time_queue_rusage.ru_utime == 0 &&
4141 selfptr->m_scan_time_queue_rusage.ru_stime == 0))
4142 {
4143 /**
4144 * get_rusage failed for some reason; print the old variant of the
4145 * warning message.
4146 */
4147 g_eventLogger->warning("thr: %u: Overslept %u ms, expected ~10ms",
4148 selfptr->m_thr_no, diff);
4149 }
4150 else
4151 {
4152 Uint32 diff_real =
4153 NdbTick_Elapsed(selfptr->m_scan_real_ticks, now).milliSec();
4154 Uint64 exec_time = curr_rusage.ru_utime -
4155 selfptr->m_scan_time_queue_rusage.ru_utime;
4156 Uint64 sys_time = curr_rusage.ru_stime -
4157 selfptr->m_scan_time_queue_rusage.ru_stime;
4158 g_eventLogger->warning("thr: %u Overslept %u ms, expected ~10ms"
4159 ", user time: %llu us, sys_time: %llu us",
4160 selfptr->m_thr_no,
4161 diff_real,
4162 exec_time,
4163 sys_time);
4164 }
4165 last = NdbTick_AddMilliseconds(last, diff-1000);
4166 }
4167 step = 20; // Max expire interval handled is 20ms
4168 }
4169
4170 struct thr_tq * tq = &selfptr->m_tq;
4171 Uint32 curr = tq->m_current_time;
4172 Uint32 cnt0 = tq->m_cnt[0];
4173 Uint32 cnt1 = tq->m_cnt[1];
4174 Uint32 end = (curr + step);
4175 if (end >= 32767)
4176 {
4177 handle_time_wrap(selfptr);
4178 cnt0 = tq->m_cnt[0];
4179 cnt1 = tq->m_cnt[1];
4180 end -= 32767;
4181 }
4182
4183 Uint32 tmp0 = scan_queue(selfptr, cnt0, end, tq->m_short_queue);
4184 Uint32 tmp1 = scan_queue(selfptr, cnt1, end, tq->m_long_queue);
4185
4186 tq->m_current_time = end;
4187 tq->m_cnt[0] = cnt0 - tmp0;
4188 tq->m_cnt[1] = cnt1 - tmp1;
4189 selfptr->m_ticks = NdbTick_AddMilliseconds(last, step);
4190 selfptr->m_scan_real_ticks = now;
4191 Ndb_GetRUsage(&selfptr->m_scan_time_queue_rusage, false);
4192 return (diff - step);
4193 }
4194
4195 /**
4196 * Clock has ticked backwards. We try to handle this
4197 * as best we can.
4198 */
4199 static
4200 void
4201 scan_time_queues_backtick(struct thr_data* selfptr, NDB_TICKS now)
4202 {
4203 const NDB_TICKS last = selfptr->m_ticks;
4204 assert(NdbTick_Compare(now, last) < 0);
4205
4206 const Uint64 backward = NdbTick_Elapsed(now, last).milliSec();
4207
4208 /**
4209 * Silently ignore sub millisecond backticks.
4210 * Such 'noise' is unfortunately common, even for monotonic timers.
4211 */
4212 if (backward > 0)
4213 {
4214 g_eventLogger->warning("thr: %u Time ticked backwards %llu ms.",
4215 selfptr->m_thr_no, backward);
4216
4217 /* Long backticks should never happen for monotonic timers */
4218 assert(backward < 100 || !NdbTick_IsMonotonic());
4219
4220 /* Accept new time as current */
4221 selfptr->m_ticks = now;
4222 }
4223 }
4224
4225 /**
4226 * If someone sends a signal with a bounded delay it means that the signal
4227 * should be executed as soon as we come to scan_time_queues,
4228 * independent of the amount of time spent since it was sent. We
4229 * use a special time queue for bounded delay signals to avoid having
4230 * to scan through all short time queue signals in every loop of
4231 * the run job buffers.
4232 */
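/**
 * For illustration, a block would typically schedule such a signal with
 * something like (sketch only; see SimulatedBlock for the actual API):
 *
 *   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
 *                       SimulatedBlock::BOUNDED_DELAY, 1);
 *
 * which ends up in senddelay() below with delay == BOUNDED_DELAY and is
 * queued on m_zero_queue rather than on the short/long time queues.
 */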
4233 static inline
4234 void
4235 scan_zero_queue(struct thr_data* selfptr)
4236 {
4237 struct thr_tq * tq = &selfptr->m_tq;
4238 Uint32 cnt = tq->m_cnt[2];
4239 if (cnt)
4240 {
4241 Uint32 num_found = scan_queue(selfptr,
4242 cnt,
4243 tq->m_current_time,
4244 tq->m_zero_queue);
4245 require(num_found == cnt);
4246 }
4247 tq->m_cnt[2] = 0;
4248 }
4249
4250 static inline
4251 Uint32
4252 scan_time_queues(struct thr_data* selfptr, NDB_TICKS now)
4253 {
4254 scan_zero_queue(selfptr);
4255 const NDB_TICKS last = selfptr->m_ticks;
4256 if (unlikely(NdbTick_Compare(now, last) < 0))
4257 {
4258 scan_time_queues_backtick(selfptr, now);
4259 return 0;
4260 }
4261
4262 const Uint32 diff = (Uint32)NdbTick_Elapsed(last, now).milliSec();
4263 if (unlikely(diff > 0))
4264 {
4265 return scan_time_queues_impl(selfptr, diff, now);
4266 }
4267 return 0;
4268 }
4269
4270 static
4271 inline
4272 Uint32*
4273 get_free_slot(struct thr_repository* rep,
4274 struct thr_data* selfptr,
4275 Uint32* idxptr)
4276 {
4277 struct thr_tq * tq = &selfptr->m_tq;
4278 Uint32 idx = tq->m_next_free;
4279 retry:
4280
4281 if (idx != RNIL)
4282 {
4283 Uint32 buf = idx >> 8;
4284 Uint32 pos = idx & 0xFF;
4285 Uint32* page = * (tq->m_delayed_signals + buf);
4286 Uint32* ptr = page + (MAX_SIGNAL_SIZE * pos);
4287 tq->m_next_free = * ptr;
4288 * idxptr = idx;
4289 return ptr;
4290 }
4291
4292 Uint32 thr_no = selfptr->m_thr_no;
4293 for (Uint32 i = 0; i<thr_tq::PAGES; i++)
4294 {
4295 if (tq->m_delayed_signals[i] == 0)
4296 {
4297 struct thr_job_buffer *jb = seize_buffer(rep, thr_no, false);
4298 Uint32 * page = reinterpret_cast<Uint32*>(jb);
4299 tq->m_delayed_signals[i] = page;
4300 /**
4301 * Init page
4302 */
4303 for (Uint32 j = 0; j < MIN_SIGNALS_PER_PAGE; j ++)
4304 {
4305 page[j * MAX_SIGNAL_SIZE] = (i << 8) + (j + 1);
4306 }
4307 page[MIN_SIGNALS_PER_PAGE*MAX_SIGNAL_SIZE] = RNIL;
4308 idx = (i << 8);
4309 goto retry;
4310 }
4311 }
4312 abort();
4313 return NULL;
4314 }
4315
4316 void
4317 senddelay(Uint32 thr_no, const SignalHeader* s, Uint32 delay)
4318 {
4319 struct thr_repository* rep = g_thr_repository;
4320 struct thr_data* selfptr = &rep->m_thread[thr_no];
4321 assert(my_thread_equal(selfptr->m_thr_id, my_thread_self()));
4322 unsigned siglen = (sizeof(*s) >> 2) + s->theLength + s->m_noOfSections;
4323
4324 Uint32 max;
4325 Uint32 * cntptr;
4326 Uint32 * queueptr;
4327
4328 Uint32 alarm;
4329 Uint32 nexttimer = selfptr->m_tq.m_next_timer;
4330 if (delay == SimulatedBlock::BOUNDED_DELAY)
4331 {
4332 alarm = selfptr->m_tq.m_current_time;
4333 cntptr = selfptr->m_tq.m_cnt + 2;
4334 queueptr = selfptr->m_tq.m_zero_queue;
4335 max = thr_tq::ZQ_SIZE;
4336 }
4337 else
4338 {
4339 alarm = selfptr->m_tq.m_current_time + delay;
4340 if (delay < 100)
4341 {
4342 cntptr = selfptr->m_tq.m_cnt + 0;
4343 queueptr = selfptr->m_tq.m_short_queue;
4344 max = thr_tq::SQ_SIZE;
4345 }
4346 else
4347 {
4348 cntptr = selfptr->m_tq.m_cnt + 1;
4349 queueptr = selfptr->m_tq.m_long_queue;
4350 max = thr_tq::LQ_SIZE;
4351 }
4352 }
4353
4354 Uint32 idx;
4355 Uint32* ptr = get_free_slot(rep, selfptr, &idx);
4356 memcpy(ptr, s, 4*siglen);
4357
4358 if (0)
4359 ndbout_c("now: %d alarm: %d send %s from %s to %s delay: %d idx: %x %p",
4360 selfptr->m_tq.m_current_time,
4361 alarm,
4362 getSignalName(s->theVerId_signalNumber),
4363 getBlockName(refToBlock(s->theSendersBlockRef)),
4364 getBlockName(s->theReceiversBlockNumber),
4365 delay,
4366 idx, ptr);
4367
4368 Uint32 i;
4369 Uint32 cnt = *cntptr;
4370 Uint32 newentry = (idx << 16) | (alarm & 0xFFFF);
4371
4372 * cntptr = cnt + 1;
4373 selfptr->m_tq.m_next_timer = alarm < nexttimer ? alarm : nexttimer;
4374
4375 if (cnt == 0 || delay == SimulatedBlock::BOUNDED_DELAY)
4376 {
4377 /* First delayed signal needs no order and bounded delay is FIFO */
4378 queueptr[cnt] = newentry;
4379 return;
4380 }
4381 else if (cnt < max)
4382 {
4383 for (i = 0; i<cnt; i++)
4384 {
4385 Uint32 save = queueptr[i];
4386 if ((save & 0xFFFF) > alarm)
4387 {
4388 memmove(queueptr+i+1, queueptr+i, 4*(cnt - i));
4389 queueptr[i] = newentry;
4390 return;
4391 }
4392 }
4393 assert(i == cnt);
4394 queueptr[i] = newentry;
4395 return;
4396 }
4397 else
4398 {
4399 /* Out of entries in time queue, issue proper error */
4400 if (cntptr == (selfptr->m_tq.m_cnt + 0))
4401 {
4402 /* Error in short time queue */
4403 ERROR_SET(ecError, NDBD_EXIT_TIME_QUEUE_SHORT,
4404 "Too many in Short Time Queue", "mt.cpp" );
4405 }
4406 else if (cntptr == (selfptr->m_tq.m_cnt + 1))
4407 {
4408 /* Error in long time queue */
4409 ERROR_SET(ecError, NDBD_EXIT_TIME_QUEUE_LONG,
4410 "Too many in Long Time Queue", "mt.cpp" );
4411 }
4412 else
4413 {
4414 /* Error in zero time queue */
4415 ERROR_SET(ecError, NDBD_EXIT_TIME_QUEUE_ZERO,
4416 "Too many in Zero Time Queue", "mt.cpp" );
4417 }
4418 }
4419 }
4420
4421 /*
4422 * Flush the write state to the job queue, making any new signals available to
4423 * receiving threads.
4424 *
4425 * Two versions:
4426 * - The general version flush_write_state_other() which may flush to
4427 * any thread, and possibly signal any waiters.
4428 * - The special version flush_write_state_self() which should only be used
4429 * to flush messages to itself.
4430 *
4431 * Calls to these functions are encapsulated through flush_write_state,
4432 * which decides which of these functions to call.
4433 */
4434 static inline
4435 void
4436 flush_write_state_self(thr_job_queue_head *q_head, thr_jb_write_state *w)
4437 {
4438 /*
4439 * Can simplify the flush_write_state when writing to myself:
4440 * Simply update write references wo/ mutex, memory barrier and signaling
4441 */
4442 w->m_write_buffer->m_len = w->m_write_pos;
4443 q_head->m_write_index = w->m_write_index;
4444 w->init_pending_signals();
4445 }
4446
4447 static inline
4448 void
4449 flush_write_state_other(thr_data *dstptr,
4450 thr_job_queue_head *q_head,
4451 thr_jb_write_state *w,
4452 bool prioa_flag)
4453 {
4454 Uint32 pending_signals_saved;
4455 /*
4456 * Two write memory barriers here, as assigning m_len may make signal data
4457 * available to other threads, and assigning m_write_index may make new
4458 * buffers available.
4459 *
4460 * We could optimize this by only doing it as needed, and only doing it
4461 * once before setting all m_len, and once before setting all m_write_index.
4462 *
4463 * But wmb() is a no-op anyway on x86 ...
4464 */
4465 wmb();
4466 w->m_write_buffer->m_len = w->m_write_pos;
4467 wmb();
4468 q_head->m_write_index = w->m_write_index;
4469
4470 pending_signals_saved = w->get_pending_signals_wakeup();
4471 pending_signals_saved += w->get_pending_signals();
4472
4473 if (pending_signals_saved >= MAX_SIGNALS_BEFORE_WAKEUP &&
4474 (!prioa_flag))
4475 {
4476 w->init_pending_signals();
4477 wakeup(&(dstptr->m_waiter));
4478 }
4479 else
4480 {
4481 w->clear_pending_signals_and_set_wakeup(pending_signals_saved);
4482 }
4483 }
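/*
 * In other words, wakeups towards the destination thread are batched here:
 * wakeup() is only issued once at least MAX_SIGNALS_BEFORE_WAKEUP signals
 * (counting both pending and previously deferred ones) have been flushed
 * on prio B; prio A flushes never trigger the wakeup in this function.
 */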
4484
4485 /**
4486 This function is used when we need to send a signal immediately
4487 due to the flush limit being reached. We don't know whether the
4488 signal is to ourselves in this case, so we act depending on who
4489 the receiver of the signal is.
4490 */
4491 static inline
4492 void
4493 flush_write_state(const thr_data *selfptr,
4494 thr_data *dstptr,
4495 thr_job_queue_head *q_head,
4496 thr_jb_write_state *w,
4497 bool prioa_flag)
4498 {
4499 if (dstptr == selfptr)
4500 {
4501 flush_write_state_self(q_head, w);
4502 }
4503 else
4504 {
4505 flush_write_state_other(dstptr, q_head, w, prioa_flag);
4506 }
4507 }
4508
4509 /**
4510 This function is used when we are called from flush_jbb_write_state
4511 where we know that the receiver should wake up to receive the signals
4512 we're sending.
4513 */
4514 static inline
4515 void
4516 flush_write_state_other_wakeup(thr_data *dstptr,
4517 thr_job_queue_head *q_head,
4518 thr_jb_write_state *w)
4519 {
4520 /*
4521 * We already did a memory barrier before the loop calling this
4522 * function to ensure the buffer is properly seen by receiving
4523 * thread.
4524 */
4525 w->m_write_buffer->m_len = w->m_write_pos;
4526 wmb();
4527 q_head->m_write_index = w->m_write_index;
4528
4529 w->init_pending_signals();
4530 wakeup(&(dstptr->m_waiter));
4531 }
4532
4533 static
4534 void
4535 flush_jbb_write_state(thr_data *selfptr)
4536 {
4537 Uint32 thr_count = g_thr_repository->m_thread_count;
4538 Uint32 self = selfptr->m_thr_no;
4539
4540 thr_jb_write_state *w = selfptr->m_write_states + self;
4541 thr_data *thrptr = g_thr_repository->m_thread;
4542
4543 /**
4544 We start by flushing to ourselves; this requires no extra memory
4545 barriers and ensures that we can proceed in the loop knowing that
4546 we will only send to remote threads.
4547
4548 After this we insert a memory barrier before we start updating
4549 the m_len variable that makes other threads see the signals
4550 we're sending to them. We need the memory barrier to ensure that the
4551 buffers are seen as properly updated by the remote thread when it sees
4552 the pointer to them.
4553 */
4554 if (w->has_any_pending_signals())
4555 {
4556 flush_write_state_self(selfptr->m_in_queue_head + self, w);
4557 }
4558 wmb();
4559 w = selfptr->m_write_states;
4560 thr_jb_write_state *w_end = selfptr->m_write_states + thr_count;
4561 for (; w < w_end; thrptr++, w++)
4562 {
4563 if (w->has_any_pending_signals())
4564 {
4565 thr_job_queue_head *q_head = thrptr->m_in_queue_head + self;
4566 flush_write_state_other_wakeup(thrptr, q_head, w);
4567 }
4568 }
4569 }
4570
4571 /**
4572 * Receive thread will unpack 1024 signals (MAX_RECEIVED_SIGNALS)
4573 * from Transporters before running another check_recv_queue
4574 *
4575 * This function returns true if there is not enough space to unpack
4576 * this number of signals, else false.
4577 *
4578 * Also used as callback function from yield() to recheck
4579 * 'full' condition before going to sleep.
4580 */
4581 static bool
4582 check_recv_queue(thr_job_queue_head *q_head)
4583 {
4584 const Uint32 minfree = (1024 + MIN_SIGNALS_PER_PAGE - 1)/MIN_SIGNALS_PER_PAGE;
4585 /**
4586 * NOTE: m_read_index is read wo/ lock (and updated by different thread)
4587 * but since the different thread can only consume
4588 * signals this means that the value returned from this
4589 * function is always conservative (i.e. the actual state can be better than
4590 * the returned value, if the read-index has moved but we didn't see it)
4591 */
4592 const unsigned ri = q_head->m_read_index;
4593 const unsigned wi = q_head->m_write_index;
4594 const unsigned busy = (wi >= ri) ? wi - ri : (thr_job_queue::SIZE - ri) + wi;
4595 return (1 + minfree + busy >= thr_job_queue::SIZE);
4596 }
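/*
 * Worked example (MIN_SIGNALS_PER_PAGE = 128 is an illustrative value
 * only): 'minfree' becomes (1024 + 127) / 128 = 8 pages, so the function
 * reports 'not enough space' as soon as at most 8 + 1 slots remain unused
 * in the job-buffer ring.
 */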
4597
4598 /**
4599 * Check if any of the receive queues for the threads being served
4600 * by this receive thread are full.
4601 * If full: return the 'thr_data*' for (one of) the thread(s)
4602 * which we have to wait for (to consume from its queue).
4603 */
4604 static struct thr_data*
4605 get_congested_recv_queue(struct thr_repository* rep, Uint32 recv_thread_id)
4606 {
4607 const unsigned thr_no = first_receiver_thread_no + recv_thread_id;
4608 thr_data *thrptr = rep->m_thread;
4609
4610 for (unsigned i = 0; i<glob_num_threads; i++, thrptr++)
4611 {
4612 thr_job_queue_head *q_head = thrptr->m_in_queue_head + thr_no;
4613 if (check_recv_queue(q_head))
4614 {
4615 return thrptr;
4616 }
4617 }
4618 return NULL;
4619 }
4620
4621 /**
4622 * Compute the number of free buffers in the specified queue.
4623 * The SAFETY margin is subtracted from the available
4624 * 'free', which is returned.
4625 */
4626 static
4627 Uint32
4628 compute_free_buffers_in_queue(const thr_job_queue_head *q_head)
4629 {
4630 /**
4631 * NOTE: m_read_index is read wo/ lock (and updated by different thread)
4632 * but since the different thread can only consume
4633 * signals this means that the value returned from this
4634 * function is always conservative (i.e. the actual state can be better than
4635 * the returned value, if the read-index has moved but we didn't see it)
4636 */
4637 unsigned ri = q_head->m_read_index;
4638 unsigned wi = q_head->m_write_index;
4639 unsigned free = (wi < ri) ? ri - wi : (thr_job_queue::SIZE + ri) - wi;
4640
4641 assert(free <= thr_job_queue::SIZE);
4642
4643 if (free <= (1 + thr_job_queue::SAFETY))
4644 return 0;
4645 else
4646 return free - (1 + thr_job_queue::SAFETY);
4647 }
4648
4649 static
4650 Uint32
4651 compute_min_free_out_buffers(Uint32 thr_no)
4652 {
4653 Uint32 minfree = thr_job_queue::SIZE;
4654 const struct thr_repository* rep = g_thr_repository;
4655 const struct thr_data *thrptr = rep->m_thread;
4656
4657 for (unsigned i = 0; i<glob_num_threads; i++, thrptr++)
4658 {
4659 const thr_job_queue_head *q_head = thrptr->m_in_queue_head + thr_no;
4660 unsigned free = compute_free_buffers_in_queue(q_head);
4661
4662 if (free < minfree)
4663 minfree = free;
4664 }
4665 return minfree;
4666 }
4667
4668 /**
4669 * Compute max signals that thr_no can execute wo/ risking
4670 * job-buffer-full
4671 *
4672 * see-also update_sched_config
4673 *
4674 *
4675 * 1) compute free-slots in ring-buffer from self to each thread in system
4676 * 2) pick smallest value
4677 * 3) compute how many signals this corresponds to
4678 * 4) compute how many signals self can execute if all of them were sent to
4679 *    the thread with the fullest ring-buffer (i.e. the worst case)
4680 *
4681 * Assumption: each signal may send *at most* 4 signals
4682 *  - this assumption is made the same in ndbd and ndbmtd and is
4683 *    mostly followed by block-code, although not in all places :-(
4684 */
4685 static
4686 Uint32
4687 compute_max_signals_to_execute(Uint32 min_free_buffers)
4688 {
4689 return ((min_free_buffers * MIN_SIGNALS_PER_PAGE) + 3) / 4;
4690 }
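/*
 * Worked example (MIN_SIGNALS_PER_PAGE = 128 is an illustrative value
 * only): with min_free_buffers = 4 the quota becomes
 * ((4 * 128) + 3) / 4 = 128 signals, i.e. even if every executed signal
 * sends the assumed maximum of 4 signals to the most congested thread,
 * they still fit in the free pages (128 * 4 = 512 = 4 * 128).
 */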
4691
4692 static
4693 void
4694 dumpJobQueues(void)
4695 {
4696 BaseString tmp;
4697 const struct thr_repository* rep = g_thr_repository;
4698 for (unsigned from = 0; from<glob_num_threads; from++)
4699 {
4700 for (unsigned to = 0; to<glob_num_threads; to++)
4701 {
4702 const thr_data *thrptr = rep->m_thread + to;
4703 const thr_job_queue_head *q_head = thrptr->m_in_queue_head + from;
4704
4705 const unsigned used = q_head->used();
4706 if (used > 0)
4707 {
4708 tmp.appfmt(" job buffer %d --> %d, used %d",
4709 from, to, used);
4710 unsigned free = compute_free_buffers_in_queue(q_head);
4711 if (free <= 0)
4712 {
4713 tmp.appfmt(" FULL!");
4714 }
4715 else if (free <= thr_job_queue::RESERVED)
4716 {
4717 tmp.appfmt(" HIGH LOAD (free:%d)", free);
4718 }
4719 tmp.appfmt("\n");
4720 }
4721 }
4722 }
4723 if (!tmp.empty())
4724 {
4725 ndbout_c("Dumping non-empty job queues:\n%s", tmp.c_str());
4726 }
4727 }
4728
4729 void
4730 trp_callback::reportSendLen(NodeId nodeId, Uint32 count, Uint64 bytes)
4731 {
4732 SignalT<3> signalT;
4733 Signal &signal = * new (&signalT) Signal(0);
4734 memset(&signal.header, 0, sizeof(signal.header));
4735
4736 if (g_send_threads)
4737 {
4738 /**
4739 * TODO: Implement this also when using send threads!!
4740 * To handle this we need to be able to send from send
4741 * threads since the m_send_thread below can be a send
4742 * thread. One way to handle this is to keep it in the send
4743 * thread data structure and have some block thread
4744 * gather the data every now and then.
4745 */
4746 return;
4747 }
4748
4749 #ifdef RONM_TODO
4750 signal.header.theLength = 3;
4751 signal.header.theSendersSignalId = 0;
4752 signal.header.theSendersBlockRef = numberToRef(0, globalData.ownId);
4753 signal.theData[0] = NDB_LE_SendBytesStatistic;
4754 signal.theData[1] = nodeId;
4755 signal.theData[2] = (Uint32)(bytes/count);
4756 signal.header.theVerId_signalNumber = GSN_EVENT_REP;
4757 signal.header.theReceiversBlockNumber = CMVMI;
4758 sendlocal(g_thr_repository->m_send_buffers[nodeId].m_send_thread,
4759 &signalT.header, signalT.theData, NULL);
4760 #endif
4761 }
4762
4763 /**
4764 * To lock during connect/disconnect, we take both the send lock for the trp
4765 * (to protect performSend()), and the global receive lock (to protect
4766 * performReceive()). By having two locks, we avoid contention between the
4767 * common send and receive operations.
4768 *
4769 * We can have contention between connect/disconnect of one transporter and
4770 * receive for the others. But the transporter code should try to keep this
4771 * lock only briefly, i.e. only to set the state to DISCONNECTING / socket fd to
4772 * NDB_INVALID_SOCKET, not for the actual close() syscall.
4773 */
4774 void
4775 trp_callback::lock_transporter(NodeId node, TrpId trp_id)
4776 {
4777 (void)node;
4778 Uint32 recv_thread_idx = mt_get_recv_thread_idx(trp_id);
4779 struct thr_repository* rep = g_thr_repository;
4780 /**
4781 * Note: take the send lock _first_, so that we will not hold the receive
4782 * lock while blocking on the send lock.
4783 *
4784 * The reverse case, blocking send lock for one transporter while waiting
4785 * for receive lock, is not a problem, as the transporter being blocked is
4786 * in any case disconnecting/connecting at this point in time, and sends are
4787 * non-waiting (so we will not block sending on other transporters).
4788 */
4789 lock(&rep->m_send_buffers[trp_id].m_send_lock);
4790 lock(&rep->m_receive_lock[recv_thread_idx]);
4791 }
4792
4793 void
4794 trp_callback::unlock_transporter(NodeId node, TrpId trp_id)
4795 {
4796 (void)node;
4797 Uint32 recv_thread_idx = mt_get_recv_thread_idx(trp_id);
4798 struct thr_repository* rep = g_thr_repository;
4799 unlock(&rep->m_receive_lock[recv_thread_idx]);
4800 unlock(&rep->m_send_buffers[trp_id].m_send_lock);
4801 }
4802
4803 void
4804 trp_callback::lock_send_transporter(NodeId node, TrpId trp_id)
4805 {
4806 (void)node;
4807 struct thr_repository* rep = g_thr_repository;
4808 lock(&rep->m_send_buffers[trp_id].m_send_lock);
4809 }
4810
4811 void
4812 trp_callback::unlock_send_transporter(NodeId node, TrpId trp_id)
4813 {
4814 (void)node;
4815 struct thr_repository* rep = g_thr_repository;
4816 unlock(&rep->m_send_buffers[trp_id].m_send_lock);
4817 }
4818
4819 int
4820 mt_checkDoJob(Uint32 recv_thread_idx)
4821 {
4822 struct thr_repository* rep = g_thr_repository;
4823
4824 /**
4825 * Return '1' if we are not allowed to receive more signals
4826 * into the job buffers from this 'recv_thread_idx'.
4827 *
4828 * NOTE:
4829 * We should not loop-wait for buffers to become available
4830 * here as we currently hold the receiver-lock. Furthermore
4831 * waiting too long here could cause the receiver thread to be
4832 * less responsive wrt. moving incoming (TCP) data from the
4833 * TCPTransporters into the (local) receiveBuffers.
4834 *   The thread could also oversleep on its other tasks such as
4835 *   handling open/close of connections, and catching
4836 *   its own shutdown events.
4837 */
4838 return (get_congested_recv_queue(rep, recv_thread_idx) != NULL);
4839 }
4840
4841 /**
4842 * Collect all send-buffer-pages to be delivered to trp
4843 * from each thread. Link them together and append them to
4844 * the single send_buffer list 'sb->m_buffer'.
4845 *
4846 * The 'sb->m_buffer_lock' has to be held prior to calling
4847 * this function.
4848 *
4849 * Return: Number of bytes in the collected send-buffers.
4850 *
4851 * TODO: This is not completely fair,
4852 * it would be better to get one entry from each thr_send_queue
4853 * per thread instead (until empty)
4854 */
4855 static
4856 Uint32
4857 link_thread_send_buffers(thr_repository::send_buffer * sb, Uint32 id)
4858 {
4859 Uint32 ri[MAX_BLOCK_THREADS];
4860 Uint32 wi[MAX_BLOCK_THREADS];
4861 thr_send_queue *src = g_thr_repository->m_thread_send_buffers[id];
4862 for (unsigned thr = 0; thr < glob_num_threads; thr++)
4863 {
4864 ri[thr] = sb->m_read_index[thr];
4865 wi[thr] = src[thr].m_write_index;
4866 }
4867
4868 Uint64 sentinel[thr_send_page::HEADER_SIZE >> 1];
4869 thr_send_page* sentinel_page = new (&sentinel[0]) thr_send_page;
4870 sentinel_page->m_next = 0;
4871
4872 struct thr_send_buffer tmp;
4873 tmp.m_first_page = sentinel_page;
4874 tmp.m_last_page = sentinel_page;
4875
4876 Uint32 bytes = 0;
4877
4878 #ifdef ERROR_INSERT
4879
4880 #define MIXOLOGY_MIX_MT_SEND 2
4881
4882 if (unlikely(globalEmulatorData.theConfiguration->getMixologyLevel() &
4883 MIXOLOGY_MIX_MT_SEND))
4884 {
4885 /**
4886 * DEBUGGING only
4887 * Interleave at the page level from all threads with
4888 * pages to send - intended to help expose signal
4889 * order dependency bugs
4890 * TODO : Avoid having a whole separate implementation
4891 * like this.
4892 */
4893 bool more_pages;
4894
4895 do
4896 {
4897 src = g_thr_repository->m_thread_send_buffers[id];
4898 more_pages = false;
4899 for (unsigned thr = 0; thr < glob_num_threads; thr++, src++)
4900 {
4901 Uint32 r = ri[thr];
4902 Uint32 w = wi[thr];
4903 if (r != w)
4904 {
4905 rmb();
4906 /* Take one page from this thread's send buffer for this trp */
4907 thr_send_page * p = src->m_buffers[r];
4908 assert(p->m_start == 0);
4909 bytes += p->m_bytes;
4910 tmp.m_last_page->m_next = p;
4911 tmp.m_last_page = p;
4912
4913 /* Take page out of read_index slot list */
4914 thr_send_page * next = p->m_next;
4915 p->m_next = NULL;
4916 src->m_buffers[r] = next;
4917
4918 if (next == NULL)
4919 {
4920 /**
4921 * Used up read slot, any more slots available to read
4922 * from this thread?
4923 */
4924 r = (r+1) % thr_send_queue::SIZE;
4925 more_pages |= (r != w);
4926
4927 /* Update global and local per thread read indices */
4928 sb->m_read_index[thr] = r;
4929 ri[thr] = r;
4930 }
4931 else
4932 {
4933 more_pages |= true;
4934 }
4935 }
4936 }
4937 } while (more_pages);
4938 }
4939 else
4940
4941 #endif
4942
4943 {
4944 for (unsigned thr = 0; thr < glob_num_threads; thr++, src++)
4945 {
4946 Uint32 r = ri[thr];
4947 Uint32 w = wi[thr];
4948 if (r != w)
4949 {
4950 rmb();
4951 while (r != w)
4952 {
4953 thr_send_page * p = src->m_buffers[r];
4954 assert(p->m_start == 0);
4955 bytes += p->m_bytes;
4956 tmp.m_last_page->m_next = p;
4957 while (p->m_next != 0)
4958 {
4959 p = p->m_next;
4960 assert(p->m_start == 0);
4961 bytes += p->m_bytes;
4962 }
4963 tmp.m_last_page = p;
4964 assert(tmp.m_last_page != 0); /* Impossible */
4965 r = (r + 1) % thr_send_queue::SIZE;
4966 }
4967 sb->m_read_index[thr] = r;
4968 }
4969 }
4970 }
4971 if (bytes > 0)
4972 {
4973 const Uint64 buffered_size = sb->m_buffered_size;
4974 /**
4975 * Append send buffers collected from threads
4976 * to end of existing m_buffers.
4977 */
4978 if (sb->m_buffer.m_first_page)
4979 {
4980 assert(sb->m_buffer.m_first_page != NULL);
4981 assert(sb->m_buffer.m_last_page != NULL);
4982 sb->m_buffer.m_last_page->m_next = tmp.m_first_page->m_next;
4983 sb->m_buffer.m_last_page = tmp.m_last_page;
4984 }
4985 else
4986 {
4987 assert(sb->m_buffer.m_first_page == NULL);
4988 assert(sb->m_buffer.m_last_page == NULL);
4989 sb->m_buffer.m_first_page = tmp.m_first_page->m_next;
4990 sb->m_buffer.m_last_page = tmp.m_last_page;
4991 }
4992 sb->m_buffered_size = buffered_size + bytes;
4993 }
4994 return bytes;
4995 }
4996
4997 /**
4998 * pack thr_send_pages for a particular send-buffer <em>db</em>
4999 * release pages (local) to <em>pool</em>
5000 *
5001 * We're using a very simple algorithm that packs two neighbouring
5002 * pages into one page if possible; if not possible we simply
5003 * move on. This guarantees that pages will be filled to at least
5004 * a 50% fill level, which should be sufficient for our needs here.
5005 *
5006 * We call pack_sb_pages() when we fail to send all data to one
5007 * specific trp immediately. This ensures that we won't keep
5008 * pages allocated with lots of free space.
5009 *
5010 * We may also call pack_sb_pages() from get_bytes_to_send_iovec()
5011 * if not all send buffers can be filled into the iovec[], thus
5012 * possibly saving extra send roundtrips.
5013 *
5014 * The send threads will use the pack_sb_pages()
5015 * from the bytes_sent function which is a callback from
5016 * the transporter.
5017 *
5018 * Can only be called with relevant lock held on 'buffer'.
5019 * Return remaining unsent bytes in 'buffer'.
5020 */
5021 static
5022 Uint32
5023 pack_sb_pages(thread_local_pool<thr_send_page>* pool,
5024 struct thr_send_buffer* buffer)
5025 {
5026 assert(buffer->m_first_page != NULL);
5027 assert(buffer->m_last_page != NULL);
5028 assert(buffer->m_last_page->m_next == NULL);
5029
5030 thr_send_page* curr = buffer->m_first_page;
5031 Uint32 curr_free = curr->max_bytes() - (curr->m_bytes + curr->m_start);
5032 Uint32 bytes = curr->m_bytes;
5033 while (curr->m_next != 0)
5034 {
5035 thr_send_page* next = curr->m_next;
5036 bytes += next->m_bytes;
5037 assert(next->m_start == 0); // only first page should have half sent bytes
5038 if (next->m_bytes <= curr_free)
5039 {
5040 /**
5041 * There is free space in the current page and it is sufficient to
5042 * store the entire next-page. Copy from next page to current page
5043 * and update current page and release next page to local pool.
5044 */
5045 thr_send_page * save = next;
5046 memcpy(curr->m_data + (curr->m_bytes + curr->m_start),
5047 next->m_data,
5048 next->m_bytes);
5049
5050 curr_free -= next->m_bytes;
5051
5052 curr->m_bytes += next->m_bytes;
5053 curr->m_next = next->m_next;
5054
5055 pool->release_local(save);
5056
5057 #ifdef NDB_BAD_SEND
5058 if ((curr->m_bytes % 40) == 24)
5059 {
5060 /* Oops */
5061 curr->m_data[curr->m_start + 21] = 'F';
5062 }
5063 #endif
5064 }
5065 else
5066 {
5067 /* Not enough free space in current, move to next page */
5068 curr = next;
5069 curr_free = curr->max_bytes() - (curr->m_bytes + curr->m_start);
5070 }
5071 }
5072
5073 buffer->m_last_page = curr;
5074 assert(bytes > 0);
5075 return bytes;
5076 }
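/*
 * Worked example (a page capacity of 32KB is illustrative only): a chain of
 * pages holding 10KB, 8KB and 20KB is packed into 18KB + 20KB; the 8KB page
 * is copied into the free tail of the first page and released to the local
 * pool, while the 20KB page does not fit and is left as-is.
 */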
5077
5078 static
5079 void
5080 release_list(thread_local_pool<thr_send_page>* pool,
5081 thr_send_page* head, thr_send_page * tail)
5082 {
5083 while (head != tail)
5084 {
5085 thr_send_page * tmp = head;
5086 head = head->m_next;
5087 pool->release_local(tmp);
5088 }
5089 pool->release_local(tail);
5090 }
5091
5092 /**
5093 * Get buffered pages ready to be sent by the transporter.
5094 * All pages returned from this function will refer to
5095 * pages in the m_sending buffers
5096 *
5097 * The 'sb->m_send_lock' has to be held prior to calling
5098 * this function.
5099 *
5100 * Any available 'm_buffer's will be appended to the
5101 * 'm_sending' buffers with appropriate locks taken.
5102 *
5103 * If sending to trp is not enabled, the buffered pages
5104 * are released instead of being returned from this method.
5105 */
5106 Uint32
5107 trp_callback::get_bytes_to_send_iovec(NodeId node,
5108 TrpId trp_id,
5109 struct iovec *dst,
5110 Uint32 max)
5111 {
5112 (void)node;
5113 thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers + trp_id;
5114 sb->m_bytes_sent = 0;
5115
5116 /**
5117 * Collect any available send pages from the thread queues
5118 * and 'm_buffers'. Append them to the end of m_sending buffers
5119 */
5120 {
5121 lock(&sb->m_buffer_lock);
5122 link_thread_send_buffers(sb, trp_id);
5123
5124 if (sb->m_buffer.m_first_page != NULL)
5125 {
5126 // If first page is not NULL, the last page also can't be NULL
5127 require(sb->m_buffer.m_last_page != NULL);
5128 if (sb->m_sending.m_first_page == NULL)
5129 {
5130 sb->m_sending = sb->m_buffer;
5131 }
5132 else
5133 {
5134 assert(sb->m_sending.m_last_page != NULL);
5135 sb->m_sending.m_last_page->m_next = sb->m_buffer.m_first_page;
5136 sb->m_sending.m_last_page = sb->m_buffer.m_last_page;
5137 }
5138 sb->m_buffer.m_first_page = NULL;
5139 sb->m_buffer.m_last_page = NULL;
5140
5141 sb->m_sending_size += sb->m_buffered_size;
5142 sb->m_buffered_size = 0;
5143 }
5144 unlock(&sb->m_buffer_lock);
5145
5146 if (sb->m_sending.m_first_page == NULL)
5147 return 0;
5148 }
5149
5150 /**
5151 * If sending to trp is not enabled; discard the send buffers.
5152 */
5153 if (unlikely(!sb->m_enabled))
5154 {
5155 thread_local_pool<thr_send_page> pool(&g_thr_repository->m_sb_pool, 0);
5156 release_list(&pool, sb->m_sending.m_first_page, sb->m_sending.m_last_page);
5157 pool.release_all(g_thr_repository->m_mm,
5158 RG_TRANSPORTER_BUFFERS,
5159 g_send_threads == NULL ?
5160 0 :
5161 g_send_threads->get_send_instance(trp_id));
5162
5163 sb->m_sending.m_first_page = NULL;
5164 sb->m_sending.m_last_page = NULL;
5165 sb->m_sending_size = 0;
5166 return 0;
5167 }
5168
5169 /**
5170 * Process linked-list and put into iovecs
5171 */
5172 fill_iovec:
5173 Uint32 tot = 0;
5174 Uint32 pos = 0;
5175 thr_send_page * p = sb->m_sending.m_first_page;
5176
5177 #ifdef NDB_LUMPY_SEND
5178 /* Drip feed transporter a few bytes at a time to send */
5179 do
5180 {
5181 Uint32 offset = 0;
5182 while ((offset < p->m_bytes) && (pos < max))
5183 {
5184 /* 0 -+1-> 1 -+6-> (7)3 -+11-> (18)2 -+10-> 0 */
5185 Uint32 lumpSz = 1;
5186 switch (offset % 4)
5187 {
5188 case 0 : lumpSz = 1; break;
5189 case 1 : lumpSz = 6; break;
5190 case 2 : lumpSz = 10; break;
5191 case 3 : lumpSz = 11; break;
5192 }
5193 const Uint32 remain = p->m_bytes - offset;
5194 lumpSz = (remain < lumpSz)?
5195 remain :
5196 lumpSz;
5197
5198 dst[pos].iov_base = p->m_data + p->m_start + offset;
5199 dst[pos].iov_len = lumpSz;
5200 pos ++;
5201 offset+= lumpSz;
5202 }
5203 if (pos == max)
5204 {
5205 return pos;
5206 }
5207 assert(offset == p->m_bytes);
5208 p = p->m_next;
5209 } while (p != NULL);
5210
5211 return pos;
5212 #endif
5213
5214 do {
5215 dst[pos].iov_len = p->m_bytes;
5216 dst[pos].iov_base = p->m_data + p->m_start;
5217 assert(p->m_start + p->m_bytes <= p->max_bytes());
5218 tot += p->m_bytes;
5219 pos++;
5220 p = p->m_next;
5221 if (p == NULL)
5222 return pos;
5223 } while (pos < max);
5224
5225 /**
5226 * Possibly pack send-buffers to get better utilization:
5227 * If we were unable to fill all sendbuffers into iovec[],
5228 * we pack the sendbuffers now if they have a low fill degree.
5229 * This could save us another OS send for sending the remainder.
5230 */
5231 if (pos == max && max > 1 && // Exhausted iovec[]
5232 tot < (pos * thr_send_page::max_bytes())/4) // < 25% filled
5233 {
5234 const Uint32 thr_no = sb->m_send_thread;
5235 assert(thr_no != NO_SEND_THREAD);
5236
5237 if (!is_send_thread(thr_no))
5238 {
5239 thr_data * thrptr = &g_thr_repository->m_thread[thr_no];
5240 pack_sb_pages(&thrptr->m_send_buffer_pool, &sb->m_sending);
5241 }
5242 else
5243 {
5244 pack_sb_pages(g_send_threads->get_send_buffer_pool(thr_no), &sb->m_sending);
5245 }
5246
5247 /**
5248 * Retry filling iovec[]. As 'pack' will ensure at least 50% fill degree,
5249 * we will not do another 'pack' after the retry.
5250 */
5251 goto fill_iovec;
5252 }
5253 return pos;
5254 }
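
/**
 * The fill-degree test at the tail of get_bytes_to_send_iovec() can be read
 * as a small predicate. A minimal sketch of that heuristic follows; the
 * helper name and the 'page_capacity' parameter stand in for
 * thr_send_page::max_bytes() and are illustrative only (not compiled):
 */
#if 0
static inline bool
worth_packing_sketch(Uint32 pos, Uint32 max, Uint32 tot, Uint32 page_capacity)
{
  /* True when iovec[] was exhausted while the collected pages were on
   * average less than 25% full, i.e. packing may let the remainder fit
   * into this send instead of requiring another OS-send. */
  return (pos == max) && (max > 1) &&
         (tot < (pos * page_capacity) / 4);
}
#endif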
5255
5256 static
5257 Uint32
5258 bytes_sent(thread_local_pool<thr_send_page>* pool,
5259 thr_repository::send_buffer* sb, Uint32 bytes)
5260 {
5261 const Uint64 sending_size = sb->m_sending_size;
5262 assert(bytes && bytes <= sending_size);
5263
5264 sb->m_bytes_sent = bytes;
5265 sb->m_sending_size = sending_size - bytes;
5266
5267 Uint32 remain = bytes;
5268 thr_send_page * prev = NULL;
5269 thr_send_page * curr = sb->m_sending.m_first_page;
5270
5271   /* Some, or all, of 'm_sending' was sent; find the endpoint. */
5272 while (remain && remain >= curr->m_bytes)
5273 {
5274 /**
5275 * Calculate new current page such that we can release the
5276 * pages that have been completed and update the state of
5277 * the new current page
5278 */
5279 remain -= curr->m_bytes;
5280 prev = curr;
5281 curr = curr->m_next;
5282 }
5283
5284 if (remain)
5285 {
5286 /**
5287      * Not all pages were fully sent and we stopped in the middle of
5288 * a page
5289 *
5290 * Update state of new current page and release any pages
5291 * that have already been sent
5292 */
5293 curr->m_start += remain;
5294 assert(curr->m_bytes > remain);
5295 curr->m_bytes -= remain;
5296 if (prev)
5297 {
5298 release_list(pool, sb->m_sending.m_first_page, prev);
5299 }
5300 }
5301 else
5302 {
5303 /**
5304 * We sent a couple of full pages and the sending stopped at a
5305 * page boundary, so we only need to release the sent pages
5306 * and update the new current page.
5307 */
5308 if (prev)
5309 {
5310 release_list(pool, sb->m_sending.m_first_page, prev);
5311
5312 if (prev == sb->m_sending.m_last_page)
5313 {
5314 /**
5315            * Everything was released; just clear the sending list.
5316 */
5317 sb->m_sending.m_first_page = NULL;
5318 sb->m_sending.m_last_page = NULL;
5319 return 0;
5320 }
5321 }
5322 else
5323 {
5324 assert(sb->m_sending.m_first_page != NULL);
5325 pool->release_local(sb->m_sending.m_first_page);
5326 }
5327 }
5328
5329 sb->m_sending.m_first_page = curr;
5330
5331 /**
5332 * Since not all bytes were sent...
5333 * spend the time to try to pack the m_sending pages
5334 * possibly releasing send-buffer
5335 */
5336 return pack_sb_pages(pool, &sb->m_sending);
5337 }
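
/**
 * A worked example of the bookkeeping above, with made-up page sizes:
 * suppose m_sending holds three pages of 100, 200 and 300 bytes and the OS
 * reported 250 bytes sent. The loop releases the fully sent 100-byte page
 * (remain becomes 150), stops at the 200-byte page, and the partial branch
 * advances that page:
 *
 *   curr->m_start += 150;    // skip the bytes already sent
 *   curr->m_bytes  = 50;     // 200 - 150 still unsent
 *
 * m_sending_size drops by 250 and the 300-byte page is left untouched.
 */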
5338
5339 /**
5340 * Register the specified amount of 'bytes' as sent, starting
5341 * from the first avail byte in the m_sending buffer.
5342 *
5343 * The 'm_send_lock' has to be held prior to calling
5344 * this function.
5345 */
5346 Uint32
5347 trp_callback::bytes_sent(NodeId node, TrpId trp_id, Uint32 bytes)
5348 {
5349 (void)node;
5350 thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
5351 Uint32 thr_no = sb->m_send_thread;
5352 assert(thr_no != NO_SEND_THREAD);
5353 if (!is_send_thread(thr_no))
5354 {
5355 thr_data * thrptr = &g_thr_repository->m_thread[thr_no];
5356 return ::bytes_sent(&thrptr->m_send_buffer_pool,
5357 sb,
5358 bytes);
5359 }
5360 else
5361 {
5362 return ::bytes_sent(g_send_threads->get_send_buffer_pool(thr_no),
5363 sb,
5364 bytes);
5365 }
5366 }
5367
5368 void
5369 trp_callback::enable_send_buffer(NodeId node, TrpId trp_id)
5370 {
5371 (void)node;
5372 thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
5373 lock(&sb->m_send_lock);
5374 assert(sb->m_sending_size == 0);
5375 {
5376 /**
5377      * Collect and discard any signals that were buffered while
5378      * the send buffers were disabled.
5379 */
5380 lock(&sb->m_buffer_lock);
5381 link_thread_send_buffers(sb, trp_id);
5382
5383 if (sb->m_buffer.m_first_page != NULL)
5384 {
5385 thread_local_pool<thr_send_page> pool(&g_thr_repository->m_sb_pool, 0);
5386 release_list(&pool, sb->m_buffer.m_first_page, sb->m_buffer.m_last_page);
5387 pool.release_all(g_thr_repository->m_mm,
5388 RG_TRANSPORTER_BUFFERS,
5389 g_send_threads == NULL ?
5390 0 :
5391 g_send_threads->get_send_instance(trp_id));
5392 sb->m_buffer.m_first_page = NULL;
5393 sb->m_buffer.m_last_page = NULL;
5394 sb->m_buffered_size = 0;
5395 }
5396 unlock(&sb->m_buffer_lock);
5397 }
5398 assert(sb->m_enabled == false);
5399 sb->m_enabled = true;
5400 unlock(&sb->m_send_lock);
5401 }
5402
5403 void
5404 trp_callback::disable_send_buffer(NodeId node, TrpId trp_id)
5405 {
5406 (void)node;
5407 thr_repository::send_buffer *sb = g_thr_repository->m_send_buffers+trp_id;
5408 lock(&sb->m_send_lock);
5409 sb->m_enabled = false;
5410
5411 /**
5412 * Discard buffered signals not yet sent:
5413 * Note that other threads may still continue send-buffering into
5414 * their thread local send buffers until they discover that the
5415    * transporter has disconnected. However, these buffered signals will
5416    * either be discarded when collected by ::get_bytes_to_send_iovec(),
5417    * or any leftovers will be discarded by ::enable_send_buffer().
5418 */
5419 if (sb->m_sending.m_first_page != NULL)
5420 {
5421 thread_local_pool<thr_send_page> pool(&g_thr_repository->m_sb_pool, 0);
5422 release_list(&pool, sb->m_sending.m_first_page, sb->m_sending.m_last_page);
5423 pool.release_all(g_thr_repository->m_mm,
5424 RG_TRANSPORTER_BUFFERS,
5425 g_send_threads == NULL ?
5426 0 :
5427 g_send_threads->get_send_instance(trp_id));
5428 sb->m_sending.m_first_page = NULL;
5429 sb->m_sending.m_last_page = NULL;
5430 sb->m_sending_size = 0;
5431 }
5432
5433 unlock(&sb->m_send_lock);
5434 }
5435
5436 static inline
5437 void
5438 register_pending_send(thr_data *selfptr, Uint32 trp_id)
5439 {
5440 /* Mark that this trp has pending send data. */
5441 if (!selfptr->m_pending_send_mask.get(trp_id))
5442 {
5443 selfptr->m_pending_send_mask.set(trp_id, 1);
5444 Uint32 i = selfptr->m_pending_send_count;
5445 selfptr->m_pending_send_trps[i] = trp_id;
5446 selfptr->m_pending_send_count = i + 1;
5447 }
5448 }
5449
5450 /**
5451  Pack send buffers to make memory available to other threads. Each signal
5452  sent often occupies a page of its own, which means that most pages are
5453  poorly packed. In some situations this means that we can run out of send
5454  buffers while still having massive amounts of free space inside the pages.
5455
5456 We call this from the main loop in the block threads when we fail to
5457 allocate enough send buffers. In addition we call the node local
5458 pack_sb_pages() several places - See header-comment for that function.
5459 */
5460 static
5461 void
5462 try_pack_send_buffers(thr_data* selfptr)
5463 {
5464 thr_repository* rep = g_thr_repository;
5465 thread_local_pool<thr_send_page>* pool = &selfptr->m_send_buffer_pool;
5466
5467 for (Uint32 i = 1; i < NDB_ARRAY_SIZE(selfptr->m_send_buffers); i++)
5468 {
5469 if (globalTransporterRegistry.get_transporter(i))
5470 {
5471 thr_repository::send_buffer* sb = rep->m_send_buffers+i;
5472 if (trylock(&sb->m_buffer_lock) != 0)
5473 {
5474 continue; // Continue with next if busy
5475 }
5476
5477 link_thread_send_buffers(sb, i);
5478 if (sb->m_buffer.m_first_page != NULL)
5479 {
5480 pack_sb_pages(pool, &sb->m_buffer);
5481 }
5482 unlock(&sb->m_buffer_lock);
5483 }
5484 }
5485 /* Release surplus buffers from local pool to global pool */
5486 pool->release_global(g_thr_repository->m_mm,
5487 RG_TRANSPORTER_BUFFERS,
5488 selfptr->m_send_instance_no);
5489 }
5490
5491
5492 /**
5493 * publish thread-locally prepared send-buffer
5494 */
5495 static
5496 void
5497 flush_send_buffer(thr_data* selfptr, Uint32 trp_id)
5498 {
5499 Uint32 thr_no = selfptr->m_thr_no;
5500 thr_send_buffer * src = selfptr->m_send_buffers + trp_id;
5501 thr_repository* rep = g_thr_repository;
5502
5503 if (src->m_first_page == 0)
5504 {
5505 return;
5506 }
5507 assert(src->m_last_page != 0);
5508
5509 thr_send_queue * dst = rep->m_thread_send_buffers[trp_id]+thr_no;
5510 thr_repository::send_buffer* sb = rep->m_send_buffers+trp_id;
5511
5512 Uint32 wi = dst->m_write_index;
5513 Uint32 next = (wi + 1) % thr_send_queue::SIZE;
5514 Uint32 ri = sb->m_read_index[thr_no];
5515
5516 /**
5517    * If the thread-local ring buffer of send-buffers is full:
5518    * Empty it by transferring the buffers to the global send_buffer list.
5519 */
5520 if (unlikely(next == ri))
5521 {
5522 lock(&sb->m_buffer_lock);
5523 link_thread_send_buffers(sb, trp_id);
5524 unlock(&sb->m_buffer_lock);
5525 }
5526
5527 dst->m_buffers[wi] = src->m_first_page;
5528 wmb();
5529 dst->m_write_index = next;
5530
5531 src->m_first_page = 0;
5532 src->m_last_page = 0;
5533 }
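
/**
 * The publish step above follows the usual single-producer ring pattern:
 * fill the slot, issue a write barrier, then advance the write index, so a
 * consumer that observes the new index also observes the slot contents.
 * A generic sketch of that ordering; 'ring_sketch' and its members are
 * illustrative names, not the actual thr_send_queue layout (not compiled):
 */
#if 0
struct ring_sketch
{
  static const Uint32 SIZE = 8;
  void *slot[SIZE];
  volatile Uint32 write_index;
};

static inline void
publish_sketch(ring_sketch *q, Uint32 wi, void *payload)
{
  q->slot[wi] = payload;                          // 1) fill the slot
  wmb();                                          // 2) make it visible first
  q->write_index = (wi + 1) % ring_sketch::SIZE;  // 3) then advance the index
}
#endif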
5534
5535 /**
5536 * This is used in case send buffer gets full, to force an emergency send,
5537 * hopefully freeing up some buffer space for the next signal.
5538 */
5539 bool
5540 mt_send_handle::forceSend(NodeId node, TrpId trp_id)
5541 {
5542 (void)node;
5543 struct thr_repository *rep = g_thr_repository;
5544 struct thr_data *selfptr = m_selfptr;
5545 struct thr_repository::send_buffer * sb = rep->m_send_buffers + trp_id;
5546
5547 {
5548 /**
5549 * NOTE: we don't need a memory barrier after clearing
5550 * m_force_send here as we unconditionally lock m_send_lock
5551 * hence there is no way that our data can be "unsent"
5552 */
5553 sb->m_force_send = 0;
5554
5555 lock(&sb->m_send_lock);
5556 sb->m_send_thread = selfptr->m_thr_no;
5557 bool more = globalTransporterRegistry.performSend(trp_id, false);
5558 sb->m_send_thread = NO_SEND_THREAD;
5559 unlock(&sb->m_send_lock);
5560
5561 /**
5562 * release buffers prior to maybe looping on sb->m_force_send
5563 */
5564 selfptr->m_send_buffer_pool.release_global(rep->m_mm,
5565 RG_TRANSPORTER_BUFFERS,
5566 selfptr->m_send_instance_no);
5567 /**
5568 * We need a memory barrier here to prevent race between clearing lock
5569 * and reading of m_force_send.
5570 * CPU can reorder the load to before the clear of the lock
5571 */
5572 mb();
5573 if (unlikely(sb->m_force_send) || more)
5574 {
5575 register_pending_send(selfptr, trp_id);
5576 }
5577 }
5578
5579 return true;
5580 }
5581
5582 /**
5583 * try sending data
5584 */
5585 static
5586 void
5587 try_send(thr_data * selfptr, Uint32 trp_id)
5588 {
5589 struct thr_repository *rep = g_thr_repository;
5590 struct thr_repository::send_buffer * sb = rep->m_send_buffers + trp_id;
5591
5592 if (trylock(&sb->m_send_lock) == 0)
5593 {
5594 /**
5595 * Now clear the flag, and start sending all data available to this trp.
5596 *
5597 * Put a memory barrier here, so that if another thread tries to grab
5598 * the send lock but fails due to us holding it here, we either
5599 * 1) Will see m_force_send[id] set to 1 at the end of the loop, or
5600 * 2) We clear here the flag just set by the other thread, but then we
5601 * will (thanks to mb()) be able to see and send all of the data already
5602 * in the first send iteration.
5603 */
5604 sb->m_force_send = 0;
5605 mb();
5606
5607 sb->m_send_thread = selfptr->m_thr_no;
5608 globalTransporterRegistry.performSend(trp_id);
5609 sb->m_send_thread = NO_SEND_THREAD;
5610 unlock(&sb->m_send_lock);
5611
5612 /**
5613 * release buffers prior to maybe looping on sb->m_force_send
5614 */
5615 selfptr->m_send_buffer_pool.release_global(rep->m_mm,
5616 RG_TRANSPORTER_BUFFERS,
5617 selfptr->m_send_instance_no);
5618
5619 /**
5620 * We need a memory barrier here to prevent race between clearing lock
5621 * and reading of m_force_send.
5622 * CPU can reorder the load to before the clear of the lock
5623 */
5624 mb();
5625 if (unlikely(sb->m_force_send))
5626 {
5627 register_pending_send(selfptr, trp_id);
5628 }
5629 }
5630 }
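
/**
 * Both forceSend() and try_send() rely on the same handshake around
 * m_force_send. Roughly, as an interleaving sketch:
 *
 *   Thread A (lost the trylock)        Thread B (holds m_send_lock)
 *   ---------------------------        ----------------------------
 *   sb->m_force_send = 1                sb->m_force_send = 0
 *   (lock/unlock acts as barrier)       mb()
 *                                       performSend(trp_id)
 *                                       unlock(&sb->m_send_lock)
 *                                       mb()
 *                                       if (sb->m_force_send)
 *                                         register_pending_send(...)
 *
 * Either B sees A's flag after the final mb() and schedules a resend, or B
 * cleared the flag A had just set, in which case the mb() after clearing it
 * guarantees that B also sees the data A had already buffered and sends it
 * in the same round.
 */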
5631
5632 /**
5633  * Flush send buffers and append them to the destination trps' send queues.
5634  *
5635  * Flushed buffer contents are piggybacked when another thread calls
5636  * do_send() to the same destination trp. This makes it possible to include
5637  * more data in each message, and thereby reduces the total number of
5638  * messages handled by the OS, which really impacts performance!
5639 */
5640 static
5641 void
5642 do_flush(struct thr_data* selfptr)
5643 {
5644 Uint32 i;
5645 Uint32 count = selfptr->m_pending_send_count;
5646 NodeId *trps = selfptr->m_pending_send_trps;
5647
5648 for (i = 0; i < count; i++)
5649 {
5650 flush_send_buffer(selfptr, trps[i]);
5651 }
5652 }
5653
5654 /**
5655 * Use the THRMAN block to send the WAKEUP_THREAD_ORD signal
5656 * to the block thread that we want to wakeup.
5657 */
5658 #define MICROS_BETWEEN_WAKEUP_IDLE_THREAD 100
5659 static
5660 inline
5661 void
5662 send_wakeup_thread_ord(struct thr_data* selfptr,
5663 NDB_TICKS now)
5664 {
5665 if (selfptr->m_wakeup_instance > 0)
5666 {
5667 Uint64 since_last =
5668 NdbTick_Elapsed(selfptr->m_last_wakeup_idle_thread, now).microSec();
5669 if (since_last > MICROS_BETWEEN_WAKEUP_IDLE_THREAD)
5670 {
5671 selfptr->m_signal->theData[0] = selfptr->m_wakeup_instance;
5672 SimulatedBlock *b = globalData.getBlock(THRMAN, selfptr->m_thr_no+1);
5673 b->executeFunction_async(GSN_SEND_WAKEUP_THREAD_ORD, selfptr->m_signal);
5674 selfptr->m_last_wakeup_idle_thread = now;
5675 }
5676 }
5677 }
5678
5679 /**
5680 * Send any pending data to remote trps.
5681 *
5682 * If MUST_SEND is false, will only try to lock the send lock, but if it would
5683 * block, that trp is skipped, to be tried again next time round.
5684 *
5685 * If MUST_SEND is true, we still only try to lock, but if it would block,
5686 * we will force the thread holding the lock, to do the sending on our behalf.
5687 *
5688 * The list of pending trps to send to is thread-local, but the per-trp send
5689 * buffer is shared by all threads. Thus we might skip a trp for which
5690 * another thread has pending send data, and we might send pending data also
5691 * for another thread without clearing the trp from the pending list of that
5692  * other thread (but we will never lose signals due to this).
5693 *
5694 * Return number of trps which still has pending data to be sent.
5695 * These will be retried again in the next round. 'Pending' is
5696 * returned as a negative number if nothing was sent in this round.
5697 *
5698 * (Likely due to receivers consuming too slow, and receive and send buffers
5699 * already being filled up)
5700 *
5701 * Sending data to other trps is a task that we perform using an algorithm
5702 * that depends on the state of block threads. The block threads can be in
5703 * 3 different states:
5704 *
5705 * LIGHT_LOAD:
5706 * -----------
5707  * In this state we will send to all trps we generate data for. In addition,
5708  * if we are about to go to sleep, we will send to one more trp and stay
5709  * awake until there are no more trps to send to. However, between each send
5710  * we will also ensure that we execute any signals destined for us.
5711 *
5712 * LIGHT_LOAD threads can also be provided to other threads as wakeup targets.
5713 * This means that these threads will be woken up regularly under load to
5714 * assist with sending.
5715 *
5716 * MEDIUM_LOAD:
5717 * ------------
5718 * At this load level we will also assist send threads before going to sleep
5719  * and continue doing so until we have work of our own to do or until there
5720  * are no more trps to send to. We will additionally send part of our own data.
5721 * We will also wake up a send thread during send to ensure that sends are
5722 * performed ASAP.
5723 *
5724 * OVERLOAD:
5725 * ---------
5726 * At this level we will simply inform the send threads about the trps we
5727 * sent some data to, the actual sending will be handled by send threads
5728 * and other block threads assisting the send threads.
5729 *
5730 * In addition if any thread is at overload level we will sleep for a shorter
5731 * time.
5732 *
5733 * The decision about which idle threads to wake up, which overload level to
5734  * use and when to sleep for a shorter time are all taken by the local THRMAN
5735  * block. Some decisions are also taken by the THRMAN instance in the main
5736 * thread.
5737 *
5738 * Send threads are woken up in a round robin fashion, each time they are
5739 * awoken they will continue executing until no more work is around.
5740 */
5741 static
5742 bool
5743 do_send(struct thr_data* selfptr, bool must_send, bool assist_send)
5744 {
5745 Uint32 count = selfptr->m_pending_send_count;
5746 NodeId *trps = selfptr->m_pending_send_trps;
5747
5748 const NDB_TICKS now = NdbTick_getCurrentTicks();
5749 selfptr->m_curr_ticks = now;
5750 bool pending_send = false;
5751 selfptr->m_watchdog_counter = 6;
5752
5753 if (count == 0)
5754 {
5755 if (must_send && assist_send && g_send_threads &&
5756 selfptr->m_overload_status <= (OverloadStatus)MEDIUM_LOAD_CONST &&
5757 (selfptr->m_nosend == 0))
5758 {
5759 /**
5760 * For some overload states we will here provide some
5761 * send assistance even though we had nothing to send
5762 * ourselves. We will however not need to offload any
5763 * sends ourselves.
5764 *
5765       * The idea is that when we get here the thread is usually not very
5766       * busy with other things: it has nothing to send itself, and must_send
5767       * is set, which means that it is preparing to go to sleep, and
5768       * we have excluded the receive threads through assist_send.
5769 *
5770 * We will avoid this extra send when we are in overload mode since
5771 * it is likely that we will find work to do before going to sleep
5772 * anyways. In all other modes it makes sense to spend some time
5773 * sending before going to sleep. In particular TC threads will be
5774 * doing major send assistance here.
5775 *
5776 * In case there is more work to do and our thread is mostly idle,
5777 * we will soon enough be back here and assist the send thread
5778 * again. We make this happen by setting pending_send flag in
5779 * return from this mode. We come back here after checking that
5780 * we have no signals to process, so at most we will delay the
5781 * signal execution here by the time it takes to send to one
5782 * trp.
5783 *
5784 * The receive threads won't assist the send thread to ensure
5785 * that we can respond to incoming messages ASAP. We want to
5786 * to optimise for response time here since this is needed to
5787 * ensure that the block threads have sufficient work to do.
5788 *
5789 * If we come here and have had nothing to send, then we're able to
5790       * do some more sending if there are still pending sends in the send
5791       * queue. So we return pending_send != 0 in this case to ensure that this
5792       * thread doesn't go to sleep, but rather comes back here to assist the
5793 * send thread a bit more. We'll continue spinning here until we get
5794 * some work to do or until the send queue is empty.
5795 */
5796 Uint32 num_trps_to_send_to = 1;
5797 pending_send = g_send_threads->assist_send_thread(
5798 num_trps_to_send_to,
5799 selfptr->m_thr_no,
5800 now,
5801 selfptr->m_watchdog_counter,
5802 selfptr->m_send_instance,
5803 selfptr->m_send_buffer_pool);
5804 NDB_TICKS after = NdbTick_getCurrentTicks();
5805 selfptr->m_micros_send += NdbTick_Elapsed(now, after).microSec();
5806 }
5807 return pending_send; // send-buffers empty
5808 }
5809
5810 /* Clear the pending list. */
5811 selfptr->m_pending_send_mask.clear();
5812 selfptr->m_pending_send_count = 0;
5813 selfptr->m_watchdog_counter = 6;
5814 for (Uint32 i = 0; i < count; i++)
5815 {
5816 /**
5817 * Make the data available for sending immediately so that
5818 * any other trp sending will grab this data without having
5819       * to wait for us to handle the other trps.
5820 */
5821 Uint32 id = trps[i];
5822 flush_send_buffer(selfptr, id);
5823 }
5824 selfptr->m_watchdog_counter = 6;
5825 if (g_send_threads)
5826 {
5827 /**
5828 * Each send thread is only responsible for a subset of the transporters
5829 * to send to and we will only assist a subset of the transporters
5830     * for sending. This makes it very hard to predict whether a send
5831     * thread needs to be woken up. Therefore we wake up the send
5832     * threads required for sending, even if no send assistance was really
5833 * required. This will create some extra load on the send threads, but
5834 * will make NDB data nodes more scalable to handle extremely high loads.
5835 *
5836 * When we are in an overloaded state, we move the trps to send to
5837     * into the send thread global lists. Since we have already woken up the
5838     * send threads to handle sends, we do nothing more in the overloaded state.
5839 *
5840 * We don't record any send time here since it would be
5841 * an unnecessary extra load, we only grab a mutex and
5842 * ensure that someone else takes over our send work.
5843 *
5844     * When the user has set nosend=1 on this thread we will
5845 * never assist with the sending.
5846 */
5847 if (selfptr->m_overload_status == (OverloadStatus)OVERLOAD_CONST ||
5848 selfptr->m_nosend != 0)
5849 {
5850 for (Uint32 i = 0; i < count; i++)
5851 {
5852 g_send_threads->alert_send_thread(trps[i], now, NULL);
5853 }
5854 }
5855 else
5856 {
5857 /**
5858       * While we are in a light load state we will always try to
5859       * send to as many trps as we inserted ourselves. In this case
5860 * we don't need to wake any send threads. If the trps still need
5861 * sending to after we're done we will ensure that a send thread
5862 * is woken up. assist_send_thread will ensure that send threads
5863 * are woken up if needed.
5864 *
5865       * At medium load levels we keep track of how many trps we have
5866 * wanted to send to and ensure that we at least do a part of that
5867 * work if need be. However we try as much as possible to avoid
5868 * sending at medium load at this point since we still have more
5869 * work to do. So we offload the sending to other threads and
5870 * wait with providing send assistance until we're out of work
5871 * or we have accumulated sufficiently to provide a bit of
5872 * assistance to the send threads.
5873 *
5874 * At medium load we set num_trps_inserted to 0 since we
5875 * have already woken up a send thread and thus there is no
5876 * need to wake up another thread in assist_send_thread, so we
5877 * indicate that we call this function only to assist and need
5878 * no wakeup service.
5879 *
5880 * We will check here also if we should wake an idle thread to
5881 * do some send assistance. We check so that we don't perform
5882 * this wakeup function too often.
5883 */
5884
5885 Uint32 num_trps_inserted = 0;
5886 for (Uint32 i = 0; i < count; i++)
5887 {
5888 num_trps_inserted += g_send_threads->alert_send_thread(trps[i],
5889 now,
5890 selfptr->m_send_instance);
5891 }
5892 Uint32 num_trps_to_send_to = num_trps_inserted;
5893 if (selfptr->m_overload_status != (OverloadStatus)MEDIUM_LOAD_CONST)
5894 {
5895 num_trps_to_send_to++;
5896 }
5897 send_wakeup_thread_ord(selfptr, now);
5898 if (num_trps_to_send_to > 0)
5899 {
5900 pending_send = g_send_threads->assist_send_thread(
5901 num_trps_to_send_to,
5902 selfptr->m_thr_no,
5903 now,
5904 selfptr->m_watchdog_counter,
5905 selfptr->m_send_instance,
5906 selfptr->m_send_buffer_pool);
5907 }
5908 NDB_TICKS after = NdbTick_getCurrentTicks();
5909 selfptr->m_micros_send += NdbTick_Elapsed(now, after).microSec();
5910 g_send_threads->wake_my_send_thread_if_needed(&trps[0],
5911 count,
5912 selfptr->m_send_instance);
5913 }
5914 return pending_send;
5915 }
5916
5917 /**
5918    * We're not using send threads; we keep this code around for now
5919 * to ensure that we can support the same behaviour also in newer
5920 * versions for a while. Eventually this code will be deprecated.
5921 */
5922 Uint32 made_progress = 0;
5923 struct thr_repository* rep = g_thr_repository;
5924
5925 for (Uint32 i = 0; i < count; i++)
5926 {
5927 Uint32 id = trps[i];
5928 thr_repository::send_buffer * sb = rep->m_send_buffers + id;
5929
5930 selfptr->m_watchdog_counter = 6;
5931
5932 /**
5933 * If we must send now, set the force_send flag.
5934 *
5935 * This will ensure that if we do not get the send lock, the thread
5936 * holding the lock will try sending again for us when it has released
5937 * the lock.
5938 *
5939 * The lock/unlock pair works as a memory barrier to ensure that the
5940 * flag update is flushed to the other thread.
5941 */
5942 if (must_send)
5943 {
5944 sb->m_force_send = 1;
5945 }
5946
5947 if (trylock(&sb->m_send_lock) != 0)
5948 {
5949 if (!must_send)
5950 {
5951 /**
5952 * Not doing this trp now, re-add to pending list.
5953 *
5954 * As we only add from the start of an empty list, we are safe from
5955 * overwriting the list while we are iterating over it.
5956 */
5957 register_pending_send(selfptr, id);
5958 }
5959 else
5960 {
5961 /* Other thread will send for us as we set m_force_send. */
5962 }
5963 }
5964 else //Got send_lock
5965 {
5966 /**
5967 * Now clear the flag, and start sending all data available to this trp.
5968 *
5969 * Put a memory barrier here, so that if another thread tries to grab
5970 * the send lock but fails due to us holding it here, we either
5971 * 1) Will see m_force_send[id] set to 1 at the end of the loop, or
5972 * 2) We clear here the flag just set by the other thread, but then we
5973 * will (thanks to mb()) be able to see and send all of the data already
5974 * in the first send iteration.
5975 */
5976 sb->m_force_send = 0;
5977 mb();
5978
5979 /**
5980 * Set m_send_thread so that our transporter callback can know which
5981 * thread holds the send lock for this remote trp.
5982 */
5983 sb->m_send_thread = selfptr->m_thr_no;
5984 const bool more = globalTransporterRegistry.performSend(id);
5985 made_progress += sb->m_bytes_sent;
5986 sb->m_send_thread = NO_SEND_THREAD;
5987 unlock(&sb->m_send_lock);
5988
5989 if (more) //Didn't complete all my send work
5990 {
5991 register_pending_send(selfptr, id);
5992 }
5993 else
5994 {
5995 /**
5996 * We need a memory barrier here to prevent race between clearing lock
5997 * and reading of m_force_send.
5998 * CPU can reorder the load to before the clear of the lock
5999 */
6000 mb();
6001 if (sb->m_force_send) //Other thread forced us to do more send
6002 {
6003 made_progress++; //Avoid false 'no progress' handling
6004 register_pending_send(selfptr, id);
6005 }
6006 }
6007 }
6008 } //for all trps
6009
6010 selfptr->m_send_buffer_pool.release_global(rep->m_mm,
6011 RG_TRANSPORTER_BUFFERS,
6012 selfptr->m_send_instance_no);
6013
6014 return (made_progress) // Had some progress?
6015 ? (selfptr->m_pending_send_count > 0) // More do_send is required
6016 : false; // All busy, or didn't find any work (-> -0)
6017 }
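
/**
 * A rough summary of the send-thread branch of do_send() above:
 *
 *   overload status          action taken for the flushed trps
 *   -----------------------  --------------------------------------------
 *   OVERLOAD, or nosend=1    alert_send_thread() per trp, no assistance
 *   MEDIUM_LOAD              alert per trp, possibly wake an idle thread,
 *                            then assist with the trps we inserted
 *   LIGHT_LOAD               as MEDIUM_LOAD, but assist with one extra trp
 */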
6018
6019 #ifdef ERROR_INSERT
6020 void
6021 mt_set_delayed_prepare(Uint32 self)
6022 {
6023 thr_repository *rep = g_thr_repository;
6024 struct thr_data *selfptr = &rep->m_thread[self];
6025
6026 selfptr->m_delayed_prepare = true;
6027 }
6028 #endif
6029
6030
6031 /**
6032 * These are the implementations of the TransporterSendBufferHandle methods
6033 * in ndbmtd.
6034 */
6035 Uint32 *
6036 mt_send_handle::getWritePtr(NodeId nodeId,
6037 TrpId trp_id,
6038 Uint32 len,
6039 Uint32 prio,
6040 Uint32 max,
6041 SendStatus *error)
6042 {
6043 (void)nodeId;
6044 #ifdef ERROR_INSERT
6045 if (m_selfptr->m_delayed_prepare)
6046 {
6047 g_eventLogger->info("MT thread %u delaying in prepare",
6048 m_selfptr->m_thr_no);
6049 NdbSleep_MilliSleep(500);
6050 g_eventLogger->info("MT thread %u finished delay, clearing",
6051 m_selfptr->m_thr_no);
6052 m_selfptr->m_delayed_prepare = false;
6053 }
6054 #endif
6055
6056 struct thr_send_buffer * b = m_selfptr->m_send_buffers+trp_id;
6057 thr_send_page * p = b->m_last_page;
6058 if (likely(p != NULL))
6059 {
6060 assert(p->m_start == 0); //Nothing sent until flushed
6061
6062 if (likely(p->m_bytes + len <= thr_send_page::max_bytes()))
6063 {
6064 return (Uint32*)(p->m_data + p->m_bytes);
6065 }
6066     // TODO: maybe don't always flush on page-boundary ???
6067 flush_send_buffer(m_selfptr, trp_id);
6068 if (!g_send_threads)
6069 try_send(m_selfptr, trp_id);
6070 }
6071 if(unlikely(len > thr_send_page::max_bytes()))
6072 {
6073 *error = SEND_MESSAGE_TOO_BIG;
6074 return 0;
6075 }
6076
6077 bool first = true;
6078 while (first)
6079 {
6080 if (likely((p = m_selfptr->m_send_buffer_pool.seize(g_thr_repository->m_mm,
6081 RG_TRANSPORTER_BUFFERS,
6082 m_selfptr->m_send_instance_no)) != 0))
6083 {
6084 p->m_bytes = 0;
6085 p->m_start = 0;
6086 p->m_next = 0;
6087 b->m_first_page = b->m_last_page = p;
6088 return (Uint32*)p->m_data;
6089 }
6090 try_pack_send_buffers(m_selfptr);
6091 first = false;
6092 }
6093 *error = SEND_BUFFER_FULL;
6094 return 0;
6095 }
6096
6097 /**
6098 * Acquire total send buffer size without locking and without gathering
6099 *
6100 * OJA: The usability of this function is rather questionable.
6101  *      m_buffered_size and m_sending_size are updated by
6102 * link_thread_send_buffers(), get_bytes_to_send_iovec() and
6103 * bytes_sent() - All part of performSend(). Thus, it is
6104 * valid *after* a send.
6105 *
6106 * However, checking it *before* a send in order to
6107  *      determine whether the payload is still too small doesn't
6108  *      really provide correct information about the current state.
6109  *      Most likely 0 will be returned if the previous send succeeded.
6110 *
6111 * A better alternative could be to add a 'min_send' argument
6112 * to perform_send(), and skip sending if not '>='.
6113 * (After real size is recalculated)
6114 */
6115 static Uint64
6116 mt_get_send_buffer_bytes(TrpId trp_id)
6117 {
6118 thr_repository *rep = g_thr_repository;
6119 thr_repository::send_buffer *sb = &rep->m_send_buffers[trp_id];
6120 const Uint64 total_send_buffer_size =
6121 sb->m_buffered_size + sb->m_sending_size;
6122 return total_send_buffer_size;
6123 }
6124
6125 void
6126 mt_getSendBufferLevel(Uint32 self, NodeId id, SB_LevelType &level)
6127 {
6128 Resource_limit rl;
6129 const Uint32 page_size = thr_send_page::PGSIZE;
6130 thr_repository *rep = g_thr_repository;
6131 thr_repository::send_buffer *sb = &rep->m_send_buffers[id];
6132 const Uint64 current_trp_send_buffer_size =
6133 sb->m_buffered_size + sb->m_sending_size;
6134
6135 /* Memory barrier to get a fresher value for rl.m_curr */
6136 mb();
6137 rep->m_mm->get_resource_limit_nolock(RG_TRANSPORTER_BUFFERS, rl);
6138 Uint64 current_send_buffer_size = rl.m_min * page_size;
6139 Uint64 current_used_send_buffer_size = rl.m_curr * page_size;
6140 Uint64 current_percentage =
6141 (100 * current_used_send_buffer_size) / current_send_buffer_size;
6142
6143 if (current_percentage >= 90)
6144 {
6145 const Uint32 avail_shared = rep->m_mm->get_free_shared_nolock();
6146 if (rl.m_min + avail_shared > rl.m_max)
6147 {
6148 current_send_buffer_size = rl.m_max * page_size;
6149 }
6150 else
6151 {
6152 current_send_buffer_size = (rl.m_min + avail_shared) * page_size;
6153 }
6154 }
6155 calculate_send_buffer_level(current_trp_send_buffer_size,
6156 current_send_buffer_size,
6157 current_used_send_buffer_size,
6158 glob_num_threads,
6159 level);
6160 return;
6161 }
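
/**
 * A worked example of the sizing above (the page counts are made up): with
 * rl.m_min = 1000 pages and rl.m_curr = 950 pages the usage is 95%, so the
 * shared pool is consulted. If 200 shared pages are free and rl.m_max is
 * 1100, then rl.m_min + avail_shared = 1200 exceeds rl.m_max, and the level
 * is computed against a total of 1100 pages rather than 1000.
 */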
6162
6163 void
6164 mt_send_handle::getSendBufferLevel(NodeId id, SB_LevelType &level)
6165 {
6166 (void)id;
6167 (void)level;
6168 return;
6169 }
6170
6171 Uint32
6172 mt_send_handle::updateWritePtr(NodeId nodeId,
6173 TrpId trp_id,
6174 Uint32 lenBytes,
6175 Uint32 prio)
6176 {
6177 (void)nodeId;
6178 struct thr_send_buffer * b = m_selfptr->m_send_buffers+trp_id;
6179 thr_send_page * p = b->m_last_page;
6180 p->m_bytes += lenBytes;
6181 return p->m_bytes;
6182 }
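
/**
 * getWritePtr() and updateWritePtr() form a two-step protocol for the
 * transporter layer: reserve space, copy the signal bytes, then commit the
 * length. A minimal usage sketch follows; the helper name, the word count
 * and the prio/max arguments are illustrative only (not compiled):
 */
#if 0
static bool
append_to_send_buffer_sketch(mt_send_handle *handle,
                             NodeId node, TrpId trp_id,
                             const Uint32 *src, Uint32 words)
{
  SendStatus error = SEND_OK;
  Uint32 *dst = handle->getWritePtr(node, trp_id,
                                    words * 4,   /* length in bytes */
                                    1,           /* prio */
                                    0,           /* max */
                                    &error);
  if (dst == NULL)
    return false;                /* SEND_BUFFER_FULL or SEND_MESSAGE_TOO_BIG */
  memcpy(dst, src, words * 4);   /* fill the reserved area */
  handle->updateWritePtr(node, trp_id, words * 4, 1);  /* commit the bytes */
  return true;
}
#endif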
6183
6184 /*
6185 * Insert a signal in a job queue.
6186 *
6187 * The signal is not visible to consumers yet after return from this function,
6188 * only recorded in the thr_jb_write_state. It is necessary to first call
6189 * flush_write_state() for this.
6190 *
6191 * The new_buffer is a job buffer to use if the current one gets full. If used,
6192 * we return true, indicating that the caller should allocate a new one for
6193 * the next call. (This is done to allow to insert under lock, but do the
6194 * allocation outside the lock).
6195 */
6196 static inline
6197 bool
6198 insert_signal(thr_job_queue *q, thr_job_queue_head *h,
6199 thr_jb_write_state *w, Uint32 prioa,
6200 const SignalHeader* sh, const Uint32 *data,
6201 const Uint32 secPtr[3], thr_job_buffer *new_buffer)
6202 {
6203 Uint32 write_pos = w->m_write_pos;
6204 Uint32 datalen = sh->theLength;
6205 assert(w->is_open());
6206 assert(w->m_write_buffer == q->m_buffers[w->m_write_index]);
6207 memcpy(w->m_write_buffer->m_data + write_pos, sh, sizeof(*sh));
6208 write_pos += (sizeof(*sh) >> 2);
6209 memcpy(w->m_write_buffer->m_data + write_pos, data, 4*datalen);
6210 write_pos += datalen;
6211 const Uint32 *p= secPtr;
6212 for (Uint32 i = 0; i < sh->m_noOfSections; i++)
6213 w->m_write_buffer->m_data[write_pos++] = *p++;
6214 w->increment_pending_signals();
6215
6216 #if SIZEOF_CHARP == 8
6217 /* Align to 8-byte boundary, to ensure aligned copies. */
6218 write_pos= (write_pos+1) & ~((Uint32)1);
6219 #endif
6220
6221 /*
6222 * We make sure that there is always room for at least one signal in the
6223 * current buffer in the queue, so one insert is always possible without
6224 * adding a new buffer.
6225 */
6226 if (likely(write_pos + MAX_SIGNAL_SIZE <= thr_job_buffer::SIZE))
6227 {
6228 w->m_write_pos = write_pos;
6229 return false;
6230 }
6231 else
6232 {
6233 /*
6234 * Need a write memory barrier here, as this might make signal data visible
6235 * to other threads.
6236 *
6237 * ToDo: We actually only need the wmb() here if we already make this
6238 * buffer visible to the other thread. So we might optimize it a bit. But
6239 * wmb() is a no-op on x86 anyway...
6240 */
6241 wmb();
6242 w->m_write_buffer->m_len = write_pos;
6243 Uint32 write_index = (w->m_write_index + 1) % thr_job_queue::SIZE;
6244
6245 /**
6246 * Full job buffer is fatal.
6247 *
6248 * ToDo: should we wait for it to become non-full? There is no guarantee
6249 * that this will actually happen...
6250 *
6251 * Or alternatively, ndbrequire() ?
6252 */
6253 if (unlikely(write_index == h->m_read_index))
6254 {
6255 job_buffer_full(0);
6256 }
6257 new_buffer->m_len = 0;
6258 new_buffer->m_prioa = prioa;
6259 q->m_buffers[write_index] = new_buffer;
6260 w->m_write_index = write_index;
6261 w->m_write_pos = 0;
6262 w->m_write_buffer = new_buffer;
6263 return true; // Buffer new_buffer used
6264 }
6265
6266 return false; // Buffer new_buffer not used
6267 }
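
/**
 * The buffer layout produced above, as a worked example: a signal with
 * theLength == 8 data words and m_noOfSections == 1 occupies
 *
 *   (sizeof(SignalHeader) >> 2) header words
 *   + 8                         payload words
 *   + 1                         section i-value
 *
 * after which write_pos is rounded up to an even word on 64-bit builds
 * ((write_pos + 1) & ~1) so that the next copy stays 8-byte aligned. Since
 * MAX_SIGNAL_SIZE words are always kept free at the tail of the buffer, one
 * more insert is guaranteed to fit before a new thr_job_buffer is needed.
 */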
6268
6269 static
6270 void
6271 read_jbb_state(thr_data *selfptr, Uint32 count)
6272 {
6273 thr_jb_read_state *r = selfptr->m_read_states;
6274 const thr_job_queue *q = selfptr->m_in_queue;
6275 const thr_job_queue_head *h = selfptr->m_in_queue_head;
6276 for (Uint32 i = 0; i < count; i++,r++)
6277 {
6278 if (r->is_open())
6279 {
6280 Uint32 read_index = r->m_read_index;
6281
6282 /**
6283 * Optimization: Only reload when possibly empty.
6284 * Avoid cache reload of shared thr_job_queue_head
6285 * Read head directly to avoid unnecessary cache
6286 * load of first cache line of m_in_queue entry.
6287 */
6288 if (r->m_write_index == read_index)
6289 {
6290 r->m_write_index = h[i].m_write_index;
6291 read_barrier_depends();
6292 r->m_read_end = q[i].m_buffers[read_index]->m_len;
6293 }
6294 }
6295 }
6296 }
6297
6298 static
6299 bool
6300 read_jba_state(thr_data *selfptr)
6301 {
6302 thr_jb_read_state *r = &(selfptr->m_jba_read_state);
6303 r->m_write_index = selfptr->m_jba_head.m_write_index;
6304 read_barrier_depends();
6305 r->m_read_end = selfptr->m_jba.m_buffers[r->m_read_index]->m_len;
6306 return r->is_empty();
6307 }
6308
6309 static
6310 inline
6311 bool
6312 check_for_input_from_ndbfs(struct thr_data* thr_ptr, Signal* signal)
6313 {
6314 /**
6315 * The manner to check for input from NDBFS file threads misuses
6316 * the SEND_PACKED signal. For ndbmtd this is intended to be
6317 * replaced by using signals directly from NDBFS file threads to
6318 * the issuer of the file request. This is WL#8890.
6319 */
6320 Uint32 i;
6321 for (i = 0; i < thr_ptr->m_instance_count; i++)
6322 {
6323 BlockReference block = thr_ptr->m_instance_list[i];
6324 Uint32 main = blockToMain(block);
6325 if (main == NDBFS)
6326 {
6327 Uint32 instance = blockToInstance(block);
6328 SimulatedBlock* b = globalData.getBlock(main, instance);
6329 b->executeFunction_async(GSN_SEND_PACKED, signal);
6330 if (signal->theData[0] == 1)
6331 return true;
6332 return false;
6333 }
6334 }
6335 return false;
6336 }
6337
6338 /* Check all job queues, return true only if all are empty. */
6339 static bool
6340 check_queues_empty(thr_data *selfptr)
6341 {
6342 Uint32 thr_count = g_thr_repository->m_thread_count;
6343 if (selfptr->m_thr_no == 0)
6344 {
6345 if (check_for_input_from_ndbfs(selfptr, selfptr->m_signal))
6346 return false;
6347 }
6348 bool empty = read_jba_state(selfptr);
6349 if (!empty)
6350 return false;
6351
6352 read_jbb_state(selfptr, thr_count);
6353 const thr_jb_read_state *r = selfptr->m_read_states;
6354 for (Uint32 i = 0; i < thr_count; i++,r++)
6355 {
6356 if (!r->is_empty())
6357 return false;
6358 }
6359 return true;
6360 }
6361
6362 static
6363 inline
6364 void
6365 sendpacked(struct thr_data* thr_ptr, Signal* signal)
6366 {
6367 Uint32 i;
6368 signal->header.m_noOfSections = 0; /* valgrind */
6369 thr_ptr->m_watchdog_counter = 15;
6370 for (i = 0; i < thr_ptr->m_instance_count; i++)
6371 {
6372 BlockReference block = thr_ptr->m_instance_list[i];
6373 Uint32 main = blockToMain(block);
6374 if (main == DBLQH || main == DBTC || main == DBTUP || main == NDBFS)
6375 {
6376 Uint32 instance = blockToInstance(block);
6377 SimulatedBlock* b = globalData.getBlock(main, instance);
6378 // wl4391_todo remove useless assert
6379 assert(b != 0 && b->getThreadId() == thr_ptr->m_thr_no);
6380 /* b->send_at_job_buffer_end(); */
6381 b->executeFunction_async(GSN_SEND_PACKED, signal);
6382 }
6383 }
6384 }
6385
6386 /**
6387 * We check whether it is time to call do_send or do_flush. These are
6388 * central decisions to the data node scheduler in a multithreaded data
6389 * node. If we wait for too long to make this decision it will severely
6390 * impact our response times since messages will be waiting in the send
6391 * buffer without being sent for up to several milliseconds.
6392 *
6393 * Since we call this function now after executing jobs from one thread,
6394 * we will never call this function with more than 75 signals executed.
6395 * The decision to send/flush is determined by config parameters that
6396 * control the responsiveness of MySQL Cluster. Setting it to a be highly
6397 * responsive means that we will send very often at the expense of
6398 * throughput. Setting it to a high throughput means that we will send
6399 * seldom at the expense of response time to gain higher throughput.
6400 *
6401 * It is possible to change this variable through a DUMP command and can
6402 * thus be changed as the environment changes.
6403 */
6404 static
6405 void handle_scheduling_decisions(thr_data *selfptr,
6406 Signal *signal,
6407 Uint32 & send_sum,
6408 Uint32 & flush_sum,
6409 bool & pending_send)
6410 {
6411 if (send_sum >= selfptr->m_max_signals_before_send)
6412 {
6413 /* Try to send, but skip for now in case of lock contention. */
6414 sendpacked(selfptr, signal);
6415 selfptr->m_watchdog_counter = 6;
6416 flush_jbb_write_state(selfptr);
6417 pending_send = do_send(selfptr, FALSE, FALSE);
6418 selfptr->m_watchdog_counter = 20;
6419 send_sum = 0;
6420 flush_sum = 0;
6421 }
6422 else if (flush_sum >= selfptr->m_max_signals_before_send_flush)
6423 {
6424 /* Send buffers append to send queues to dst. trps. */
6425 sendpacked(selfptr, signal);
6426 selfptr->m_watchdog_counter = 6;
6427 flush_jbb_write_state(selfptr);
6428 do_flush(selfptr);
6429 selfptr->m_watchdog_counter = 20;
6430 flush_sum = 0;
6431 }
6432 }
6433
6434 #if defined(USE_INIT_GLOBAL_VARIABLES)
6435 void mt_clear_global_variables(thr_data*);
6436 #endif
6437 /*
6438 * Execute at most MAX_SIGNALS signals from one job queue, updating local read
6439 * state as appropriate.
6440 *
6441 * Returns number of signals actually executed.
6442 */
6443 static
6444 Uint32
6445 execute_signals(thr_data *selfptr,
6446 thr_job_queue *q, thr_job_queue_head *h,
6447 thr_jb_read_state *r,
6448 Signal *sig, Uint32 max_signals)
6449 {
6450 Uint32 num_signals;
6451 Uint32 extra_signals = 0;
6452 Uint32 read_index = r->m_read_index;
6453 Uint32 write_index = r->m_write_index;
6454 Uint32 read_pos = r->m_read_pos;
6455 Uint32 read_end = r->m_read_end;
6456 Uint32 *watchDogCounter = &selfptr->m_watchdog_counter;
6457
6458 if (read_index == write_index && read_pos >= read_end)
6459 return 0; // empty read_state
6460
6461 thr_job_buffer *read_buffer = r->m_read_buffer;
6462
6463 for (num_signals = 0; num_signals < max_signals; num_signals++)
6464 {
6465 *watchDogCounter = 12;
6466 while (read_pos >= read_end)
6467 {
6468 if (read_index == write_index)
6469 {
6470 /* No more available now. */
6471 selfptr->m_stat.m_exec_cnt += num_signals;
6472 return num_signals;
6473 }
6474 else
6475 {
6476 /* Move to next buffer. */
6477 read_index = (read_index + 1) % thr_job_queue::SIZE;
6478 release_buffer(g_thr_repository, selfptr->m_thr_no, read_buffer);
6479 read_buffer = q->m_buffers[read_index];
6480 read_pos = 0;
6481 read_end = read_buffer->m_len;
6482 /* Update thread-local read state. */
6483 r->m_read_index = h->m_read_index = read_index;
6484 r->m_read_buffer = read_buffer;
6485 r->m_read_pos = read_pos;
6486 r->m_read_end = read_end;
6487 /* Wakeup threads waiting for job buffers to become free */
6488 wakeup(&h->m_waiter);
6489 }
6490 }
6491
6492 /*
6493      * These prefetches were found using OProfile to reduce cache misses.
6494 * (Though on Intel Core 2, they do not give much speedup, as apparently
6495 * the hardware prefetcher is already doing a fairly good job).
6496 */
6497 NDB_PREFETCH_READ (read_buffer->m_data + read_pos + 16);
6498 NDB_PREFETCH_WRITE ((Uint32 *)&sig->header + 16);
6499
6500 #ifdef VM_TRACE
6501 /* Find reading / propagation of junk */
6502 sig->garbage_register();
6503 #endif
6504 /* Now execute the signal. */
6505 SignalHeader* s =
6506 reinterpret_cast<SignalHeader*>(read_buffer->m_data + read_pos);
6507 Uint32 seccnt = s->m_noOfSections;
6508 Uint32 siglen = (sizeof(*s)>>2) + s->theLength;
6509 if(siglen>16)
6510 {
6511 NDB_PREFETCH_READ (read_buffer->m_data + read_pos + 32);
6512 }
6513 Uint32 bno = blockToMain(s->theReceiversBlockNumber);
6514 Uint32 ino = blockToInstance(s->theReceiversBlockNumber);
6515 SimulatedBlock* block = globalData.mt_getBlock(bno, ino);
6516 assert(block != 0);
6517
6518 Uint32 gsn = s->theVerId_signalNumber;
6519 *watchDogCounter = 1 +
6520 (bno << 8) +
6521 (gsn << 20);
6522
6523 /* Must update original buffer so signal dump will see it. */
6524 s->theSignalId = selfptr->m_signal_id_counter++;
6525 memcpy(&sig->header, s, 4*siglen);
6526 for(Uint32 i = 0; i < seccnt; i++)
6527 {
6528 sig->m_sectionPtrI[i] = read_buffer->m_data[read_pos + siglen + i];
6529 }
6530
6531 read_pos += siglen + seccnt;
6532 #if SIZEOF_CHARP == 8
6533 /* Handle 8-byte alignment. */
6534 read_pos = (read_pos + 1) & ~((Uint32)1);
6535 #endif
6536
6537 /* Update just before execute so signal dump can know how far we are. */
6538 r->m_read_pos = read_pos;
6539
6540 #ifdef VM_TRACE
6541 if (globalData.testOn)
6542 { //wl4391_todo segments
6543 SegmentedSectionPtr ptr[3];
6544 ptr[0].i = sig->m_sectionPtrI[0];
6545 ptr[1].i = sig->m_sectionPtrI[1];
6546 ptr[2].i = sig->m_sectionPtrI[2];
6547 ::getSections(seccnt, ptr);
6548 globalSignalLoggers.executeSignal(*s,
6549 0,
6550 &sig->theData[0],
6551 globalData.ownId,
6552 ptr, seccnt);
6553 }
6554 #endif
6555
6556 /**
6557 * In 7.4 we introduced the ability for scans in LDM threads to scan
6558 * several rows in the same signal execution without issuing a
6559 * CONTINUEB signal. This means that we effectively changed the
6560 * real-time characteristics of the scheduler. This change ensures
6561 * that we behave the same way as in 7.3 and earlier with respect to
6562 * how many signals are executed. So the m_extra_signals variable can
6563 * be used in the future for other cases where we combine several
6564 * signal executions into one signal and thus ensure that we don't
6565 * change the scheduler algorithms.
6566 *
6567 * This variable is incremented every time we decide to execute more
6568 * signals without real-time breaks in scans in DBLQH.
6569 */
6570 block->jamBuffer()->markEndOfSigExec();
6571 sig->m_extra_signals = 0;
6572 #if defined(USE_INIT_GLOBAL_VARIABLES)
6573 mt_clear_global_variables(selfptr);
6574 #endif
6575 block->executeFunction_async(gsn, sig);
6576 extra_signals += sig->m_extra_signals;
6577 }
6578 /**
6579 * Only count signals causing real-time break and not the one used to
6580 * balance the scheduler.
6581 */
6582 selfptr->m_stat.m_exec_cnt += num_signals;
6583
6584 return num_signals + extra_signals;
6585 }
6586
6587 static
6588 Uint32
6589 run_job_buffers(thr_data *selfptr,
6590 Signal *sig,
6591 Uint32 & send_sum,
6592 Uint32 & flush_sum,
6593 bool & pending_send)
6594 {
6595 Uint32 thr_count = g_thr_repository->m_thread_count;
6596 Uint32 signal_count = 0;
6597 Uint32 signal_count_since_last_zero_time_queue = 0;
6598 Uint32 perjb = selfptr->m_max_signals_per_jb;
6599
6600 read_jbb_state(selfptr, thr_count);
6601 /*
6602 * A load memory barrier to ensure that we see any prio A signal sent later
6603 * than loaded prio B signals.
6604 */
6605 rmb();
6606
6607 /**
6608 * For the main thread we can stop at any job buffer, so we proceed from
6609    * where we stopped so that all job buffers are treated with equal importance.
6610 *
6611 * For all other threads m_next_jbb_no should always be 0 when we reach here.
6612 */
6613 Uint32 first_jbb_no = selfptr->m_next_jbb_no;
6614 thr_job_queue *queue = selfptr->m_in_queue + first_jbb_no;
6615 thr_job_queue_head *head = selfptr->m_in_queue_head + first_jbb_no;
6616 thr_jb_read_state *read_state = selfptr->m_read_states + first_jbb_no;
6617 selfptr->m_watchdog_counter = 13;
6618 for (Uint32 jbb_no = first_jbb_no;
6619 jbb_no < thr_count;
6620 jbb_no++,queue++,read_state++,head++)
6621 {
6622 /* Read the prio A state often, to avoid starvation of prio A. */
6623 while (!read_jba_state(selfptr))
6624 {
6625 selfptr->m_sent_local_prioa_signal = false;
6626 static Uint32 max_prioA = thr_job_queue::SIZE * thr_job_buffer::SIZE;
6627 Uint32 num_signals = execute_signals(selfptr,
6628 &(selfptr->m_jba),
6629 &(selfptr->m_jba_head),
6630 &(selfptr->m_jba_read_state), sig,
6631 max_prioA);
6632 signal_count += num_signals;
6633 send_sum += num_signals;
6634 flush_sum += num_signals;
6635 if (!selfptr->m_sent_local_prioa_signal)
6636 {
6637 /**
6638 * Break out of loop if there was no prio A signals generated
6639 * from the local execution.
6640 */
6641 break;
6642 }
6643 }
6644
6645 /**
6646 * Contended queues get an extra execute quota:
6647 *
6648 * If we didn't get a max 'perjb' quota, our out buffers
6649 * are about to fill up. This thread is thus effectively
6650 * slowed down in order to let other threads consume from
6651 * our out buffers. Eventually, when 'perjb==0', we will
6652 * have to wait/sleep for buffers to become available.
6653 *
6654      * This can bring us into a circular wait-lock, where
6655 * threads are stalled due to full out buffers. The same
6656 * thread may also have full in buffers, thus blocking other
6657 * threads from progressing. This could bring us into a
6658 * circular wait-lock, where no threads are able to progress.
6659 * The entire scheduler will then be stuck.
6660 *
6661 * We try to avoid this situation by reserving some
6662 * 'm_max_extra_signals' which are only used to consume
6663 * from 'almost full' in-buffers. We will then reduce the
6664 * risk of ending up in the above wait-lock.
6665 *
6666 * Exclude receiver threads, as there can't be a
6667 * circular wait between recv-thread and workers.
6668 */
6669 Uint32 extra = 0;
6670
6671 if (perjb < MAX_SIGNALS_PER_JB) //Job buffer contention
6672 {
6673 const Uint32 free = compute_free_buffers_in_queue(head);
6674 if (free <= thr_job_queue::ALMOST_FULL)
6675 {
6676 if (selfptr->m_max_extra_signals > MAX_SIGNALS_PER_JB - perjb)
6677 {
6678 extra = MAX_SIGNALS_PER_JB - perjb;
6679 }
6680 else
6681 {
6682 extra = selfptr->m_max_extra_signals;
6683 selfptr->m_max_exec_signals = 0; //Force recalc
6684 }
6685 selfptr->m_max_extra_signals -= extra;
6686 }
6687 }
6688
6689 #ifdef ERROR_INSERT
6690
6691 #define MIXOLOGY_MIX_MT_JBB 1
6692
6693 if (unlikely(globalEmulatorData.theConfiguration->getMixologyLevel() &
6694 MIXOLOGY_MIX_MT_JBB))
6695 {
6696 /**
6697 * Let's maximise interleaving to find inter-thread
6698 * signal order dependency bugs
6699 */
6700 perjb = 1;
6701 extra = 0;
6702 }
6703 #endif
6704
6705 /* Now execute prio B signals from one thread. */
6706 Uint32 num_signals = execute_signals(selfptr, queue, head, read_state,
6707 sig, perjb+extra);
6708
6709 if (num_signals > 0)
6710 {
6711 signal_count += num_signals;
6712 send_sum += num_signals;
6713 flush_sum += num_signals;
6714 handle_scheduling_decisions(selfptr,
6715 sig,
6716 send_sum,
6717 flush_sum,
6718 pending_send);
6719
6720 if (signal_count - signal_count_since_last_zero_time_queue >
6721 (MAX_SIGNALS_EXECUTED_BEFORE_ZERO_TIME_QUEUE_SCAN -
6722 MAX_SIGNALS_PER_JB))
6723 {
6724 /**
6725 * Each execution of execute_signals can at most execute 75 signals
6726 * from one job buffer. We want to ensure that we execute no more than
6727 * 100 signals before we arrive here to get the signals from the
6728 * zero time queue. This implements the bounded delay signal
6729 * concept which is required for rate controlled activities.
6730 *
6731 * We scan the zero time queue if more than 25 signals were executed.
6732 * This means that at most 100 signals will be executed before we arrive
6733 * here again to check the bounded delay signals.
6734 */
6735 signal_count_since_last_zero_time_queue = signal_count;
6736 selfptr->m_watchdog_counter = 14;
6737 scan_zero_queue(selfptr);
6738 selfptr->m_watchdog_counter = 13;
6739 }
6740 if (selfptr->m_thr_no == 0)
6741 {
6742 /**
6743 * Execution in main thread can sometimes be a bit more lengthy,
6744 * so we ensure that we don't miss out on heartbeats and other
6745 * important things by returning to checking scan_time_queues
6746 * more often.
6747 */
6748 jbb_no++;
6749 if (jbb_no >= thr_count)
6750 {
6751 jbb_no = 0;
6752 }
6753 selfptr->m_next_jbb_no = jbb_no;
6754 return signal_count;
6755 }
6756 }
6757 }
6758 selfptr->m_next_jbb_no = 0;
6759 return signal_count;
6760 }
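
/**
 * The zero time queue check above, as arithmetic (using the figures quoted
 * in the loop comment): with at most 75 signals executed per job buffer and
 * a budget of 100 signals between zero time queue scans, the test
 *
 *   signal_count - signal_count_since_last_zero_time_queue > (100 - 75)
 *
 * fires once more than 25 signals have run since the last scan, so even a
 * worst-case burst of 75 signals from a single job buffer keeps the total
 * below 100 before scan_zero_queue() runs again.
 */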
6761
6762 struct thr_map_entry {
6763 enum { NULL_THR_NO = 0xFF };
6764 Uint8 thr_no;
6765   thr_map_entry() : thr_no(NULL_THR_NO) {}
6766 };
6767
6768 static struct thr_map_entry thr_map[NO_OF_BLOCKS][NDBMT_MAX_BLOCK_INSTANCES];
6769 static Uint32 block_instance_count[NO_OF_BLOCKS];
6770
6771 static inline Uint32
6772 block2ThreadId(Uint32 block, Uint32 instance)
6773 {
6774 assert(block >= MIN_BLOCK_NO && block <= MAX_BLOCK_NO);
6775 Uint32 index = block - MIN_BLOCK_NO;
6776 assert(instance < NDB_ARRAY_SIZE(thr_map[index]));
6777 const thr_map_entry& entry = thr_map[index][instance];
6778 assert(entry.thr_no < glob_num_threads);
6779 return entry.thr_no;
6780 }
6781
6782 void
6783 add_thr_map(Uint32 main, Uint32 instance, Uint32 thr_no)
6784 {
6785 assert(main == blockToMain(main));
6786 Uint32 index = main - MIN_BLOCK_NO;
6787 assert(index < NO_OF_BLOCKS);
6788 assert(instance < NDB_ARRAY_SIZE(thr_map[index]));
6789
6790 SimulatedBlock* b = globalData.getBlock(main, instance);
6791 require(b != 0);
6792
6793 /* Block number including instance. */
6794 Uint32 block = numberToBlock(main, instance);
6795
6796 require(thr_no < glob_num_threads);
6797 struct thr_repository* rep = g_thr_repository;
6798 struct thr_data* thr_ptr = &rep->m_thread[thr_no];
6799
6800 /* Add to list. */
6801 {
6802 Uint32 i;
6803 for (i = 0; i < thr_ptr->m_instance_count; i++)
6804 require(thr_ptr->m_instance_list[i] != block);
6805 }
6806 require(thr_ptr->m_instance_count < MAX_INSTANCES_PER_THREAD);
6807 thr_ptr->m_instance_list[thr_ptr->m_instance_count++] = block;
6808
6809 SimulatedBlock::ThreadContext ctx;
6810 ctx.threadId = thr_no;
6811 ctx.jamBuffer = &thr_ptr->m_jam;
6812 ctx.watchDogCounter = &thr_ptr->m_watchdog_counter;
6813 ctx.sectionPoolCache = &thr_ptr->m_sectionPoolCache;
6814 ctx.pHighResTimer = &thr_ptr->m_curr_ticks;
6815 b->assignToThread(ctx);
6816
6817 /* Create entry mapping block to thread. */
6818 thr_map_entry& entry = thr_map[index][instance];
6819 require(entry.thr_no == thr_map_entry::NULL_THR_NO);
6820 entry.thr_no = thr_no;
6821 }
6822
6823 /* Static assignment of main instances (before first signal). */
6824 void
6825 mt_init_thr_map()
6826 {
6827 /* Keep mt-classic assignments in MT LQH. */
6828 const Uint32 thr_GLOBAL = 0;
6829 const Uint32 thr_LOCAL = 1;
6830
6831 add_thr_map(BACKUP, 0, thr_LOCAL);
6832 add_thr_map(DBTC, 0, thr_GLOBAL);
6833 add_thr_map(DBDIH, 0, thr_GLOBAL);
6834 add_thr_map(DBLQH, 0, thr_LOCAL);
6835 add_thr_map(DBACC, 0, thr_LOCAL);
6836 add_thr_map(DBTUP, 0, thr_LOCAL);
6837 add_thr_map(DBDICT, 0, thr_GLOBAL);
6838 add_thr_map(NDBCNTR, 0, thr_GLOBAL);
6839 add_thr_map(QMGR, 0, thr_GLOBAL);
6840 add_thr_map(NDBFS, 0, thr_GLOBAL);
6841 add_thr_map(CMVMI, 0, thr_GLOBAL);
6842 add_thr_map(TRIX, 0, thr_GLOBAL);
6843 add_thr_map(DBUTIL, 0, thr_GLOBAL);
6844 add_thr_map(SUMA, 0, thr_LOCAL);
6845 add_thr_map(DBTUX, 0, thr_LOCAL);
6846 add_thr_map(TSMAN, 0, thr_LOCAL);
6847 add_thr_map(LGMAN, 0, thr_LOCAL);
6848 add_thr_map(PGMAN, 0, thr_LOCAL);
6849 add_thr_map(RESTORE, 0, thr_LOCAL);
6850 add_thr_map(DBINFO, 0, thr_LOCAL);
6851 add_thr_map(DBSPJ, 0, thr_GLOBAL);
6852 add_thr_map(THRMAN, 0, thr_GLOBAL);
6853 add_thr_map(TRPMAN, 0, thr_GLOBAL);
6854 }
6855
6856 Uint32
6857 mt_get_instance_count(Uint32 block)
6858 {
6859 switch(block){
6860 case DBLQH:
6861 case DBACC:
6862 case DBTUP:
6863 case DBTUX:
6864 case BACKUP:
6865 case RESTORE:
6866 return globalData.ndbMtLqhWorkers;
6867 break;
6868 case PGMAN:
6869 return globalData.ndbMtLqhWorkers + 1;
6870 break;
6871 case DBTC:
6872 case DBSPJ:
6873 return globalData.ndbMtTcThreads;
6874 break;
6875 case TRPMAN:
6876 return globalData.ndbMtReceiveThreads;
6877 case THRMAN:
6878 return glob_num_threads;
6879 default:
6880 require(false);
6881 }
6882 return 0;
6883 }
6884
6885 void
6886 mt_add_thr_map(Uint32 block, Uint32 instance)
6887 {
6888 Uint32 num_lqh_threads = globalData.ndbMtLqhThreads;
6889 Uint32 num_tc_threads = globalData.ndbMtTcThreads;
6890
6891 require(instance != 0);
6892 Uint32 thr_no = NUM_MAIN_THREADS;
6893 switch(block){
6894 case DBLQH:
6895 case DBACC:
6896 case DBTUP:
6897 case DBTUX:
6898 case BACKUP:
6899 case RESTORE:
6900 thr_no += (instance - 1) % num_lqh_threads;
6901 break;
6902 case PGMAN:
6903 if (instance == num_lqh_threads + 1)
6904 {
6905       // Put the extra PGMAN together with its Proxy
6906 thr_no = block2ThreadId(block, 0);
6907 }
6908 else
6909 {
6910 thr_no += (instance - 1) % num_lqh_threads;
6911 }
6912 break;
6913 case DBTC:
6914 case DBSPJ:
6915 thr_no += num_lqh_threads + (instance - 1);
6916 break;
6917 case THRMAN:
6918 thr_no = instance - 1;
6919 break;
6920 case TRPMAN:
6921 thr_no += num_lqh_threads + num_tc_threads + (instance - 1);
6922 break;
6923 default:
6924 require(false);
6925 }
6926
6927 add_thr_map(block, instance, thr_no);
6928 }
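/**
 * Worked example of the mapping above, assuming NUM_MAIN_THREADS == 2
 * (main + rep, see mt_getThreadName()) and a hypothetical configuration
 * with 4 LDM threads and 2 TC threads:
 *
 *   DBLQH  instance 3 -> thr_no = 2 + (3 - 1) % 4     = 4
 *   DBTC   instance 2 -> thr_no = 2 + 4 + (2 - 1)     = 7
 *   TRPMAN instance 1 -> thr_no = 2 + 4 + 2 + (1 - 1) = 8
 *   THRMAN instance 1 -> thr_no = 1 - 1               = 0 (one per thread)
 */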
6929
6930 /**
6931  * Create the duplicate entries needed so that a sender doesn't
6932  * need to know how many instances there actually are in this node.
6933  *
6934  * If there is only 1 instance, duplicate it for all slots.
6935  * Otherwise assume instance 0 is the proxy and duplicate the
6936  * workers (modulo the worker count).
6937  *
6938  * NOTE: the extra PGMAN worker is instance num_lqh_threads + 1
6939  */
6940 void
6941 mt_finalize_thr_map()
6942 {
6943 for (Uint32 b = 0; b < NO_OF_BLOCKS; b++)
6944 {
6945 Uint32 bno = b + MIN_BLOCK_NO;
6946 Uint32 cnt = 0;
6947 while (cnt < NDB_ARRAY_SIZE(thr_map[b]) &&
6948 thr_map[b][cnt].thr_no != thr_map_entry::NULL_THR_NO)
6949 {
6950 cnt++;
6951 }
6952 block_instance_count[b] = cnt;
6953 if (cnt != NDB_ARRAY_SIZE(thr_map[b]))
6954 {
6955 SimulatedBlock * main = globalData.getBlock(bno, 0);
6956 for (Uint32 i = cnt; i < NDB_ARRAY_SIZE(thr_map[b]); i++)
6957 {
6958 Uint32 dup = (cnt == 1) ? 0 : 1 + ((i - 1) % (cnt - 1));
6959 if (thr_map[b][i].thr_no == thr_map_entry::NULL_THR_NO)
6960 {
6961 thr_map[b][i] = thr_map[b][dup];
6962 main->addInstance(globalData.getBlock(bno, dup), i);
6963 }
6964 else
6965 {
6966 /**
6967 * extra pgman instance
6968 */
6969 require(bno == PGMAN);
6970 require(false);
6971 }
6972 }
6973 }
6974 }
6975 }
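/**
 * Worked example of the duplication rule above (hypothetical numbers):
 * assume a block with cnt = 5 real entries, i.e. proxy (0) plus workers
 * 1..4, and more slots than that in thr_map[b]. For a slot i >= cnt the
 * duplicate chosen is dup = 1 + ((i - 1) % (cnt - 1)), so slot 5 maps to
 * worker 1, slot 6 to worker 2, slot 7 to worker 3, slot 8 to worker 4,
 * slot 9 to worker 1 again, and so on. With cnt == 1 every remaining
 * slot simply maps to instance 0.
 */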
6976
6977 static
6978 void
6979 calculate_max_signals_parameters(thr_data *selfptr)
6980 {
6981 switch (selfptr->m_sched_responsiveness)
6982 {
6983 case 0:
6984 selfptr->m_max_signals_before_send = 1000;
6985 selfptr->m_max_signals_before_send_flush = 340;
6986 break;
6987 case 1:
6988 selfptr->m_max_signals_before_send = 800;
6989 selfptr->m_max_signals_before_send_flush = 270;
6990 break;
6991 case 2:
6992 selfptr->m_max_signals_before_send = 600;
6993 selfptr->m_max_signals_before_send_flush = 200;
6994 break;
6995 case 3:
6996 selfptr->m_max_signals_before_send = 450;
6997 selfptr->m_max_signals_before_send_flush = 155;
6998 break;
6999 case 4:
7000 selfptr->m_max_signals_before_send = 350;
7001 selfptr->m_max_signals_before_send_flush = 130;
7002 break;
7003 case 5:
7004 selfptr->m_max_signals_before_send = 300;
7005 selfptr->m_max_signals_before_send_flush = 110;
7006 break;
7007 case 6:
7008 selfptr->m_max_signals_before_send = 250;
7009 selfptr->m_max_signals_before_send_flush = 90;
7010 break;
7011 case 7:
7012 selfptr->m_max_signals_before_send = 200;
7013 selfptr->m_max_signals_before_send_flush = 70;
7014 break;
7015 case 8:
7016 selfptr->m_max_signals_before_send = 170;
7017 selfptr->m_max_signals_before_send_flush = 50;
7018 break;
7019 case 9:
7020 selfptr->m_max_signals_before_send = 135;
7021 selfptr->m_max_signals_before_send_flush = 30;
7022 break;
7023 case 10:
7024 selfptr->m_max_signals_before_send = 70;
7025 selfptr->m_max_signals_before_send_flush = 10;
7026 break;
7027 default:
7028 assert(FALSE);
7029 }
7030 return;
7031 }
7032
7033 static void
7034 init_thread(thr_data *selfptr)
7035 {
7036 selfptr->m_waiter.init();
7037 selfptr->m_jam.theEmulatedJamIndex = 0;
7038
7039 selfptr->m_overload_status = (OverloadStatus)LIGHT_LOAD_CONST;
7040 selfptr->m_node_overload_status = (OverloadStatus)LIGHT_LOAD_CONST;
7041 selfptr->m_wakeup_instance = 0;
7042 selfptr->m_last_wakeup_idle_thread = NdbTick_getCurrentTicks();
7043 selfptr->m_micros_send = 0;
7044 selfptr->m_micros_sleep = 0;
7045 selfptr->m_buffer_full_micros_sleep = 0;
7046 selfptr->m_measured_spintime = 0;
7047
7048 NDB_THREAD_TLS_JAM = &selfptr->m_jam;
7049 NDB_THREAD_TLS_THREAD= selfptr;
7050
7051 unsigned thr_no = selfptr->m_thr_no;
7052 globalEmulatorData.theWatchDog->
7053 registerWatchedThread(&selfptr->m_watchdog_counter, thr_no);
7054 {
7055 while(selfptr->m_thread == 0)
7056 NdbSleep_MilliSleep(30);
7057 }
7058
7059 THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
7060 BaseString tmp;
7061 tmp.appfmt("thr: %u ", thr_no);
7062
7063 bool fail = false;
7064 int tid = NdbThread_GetTid(selfptr->m_thread);
7065 if (tid != -1)
7066 {
7067 tmp.appfmt("tid: %u ", tid);
7068 }
7069
7070 conf.appendInfo(tmp,
7071 selfptr->m_instance_list,
7072 selfptr->m_instance_count);
7073 int res = conf.do_bind(selfptr->m_thread,
7074 selfptr->m_instance_list,
7075 selfptr->m_instance_count);
7076 if (res < 0)
7077 {
7078 fail = true;
7079 tmp.appfmt("err: %d ", -res);
7080 }
7081 else if (res > 0)
7082 {
7083 tmp.appfmt("OK ");
7084 }
7085
7086 unsigned thread_prio;
7087 res = conf.do_thread_prio(selfptr->m_thread,
7088 selfptr->m_instance_list,
7089 selfptr->m_instance_count,
7090 thread_prio);
7091 if (res < 0)
7092 {
7093 fail = true;
7094 res = -res;
7095 tmp.appfmt("Failed to set thread prio to %u, ", thread_prio);
7096 if (res == SET_THREAD_PRIO_NOT_SUPPORTED_ERROR)
7097 {
7098 tmp.appfmt("not supported on this OS");
7099 }
7100 else
7101 {
7102 tmp.appfmt("error: %d", res);
7103 }
7104 }
7105 else if (res > 0)
7106 {
7107 tmp.appfmt("Successfully set thread prio to %u ", thread_prio);
7108 }
7109
7110 selfptr->m_realtime = conf.do_get_realtime(selfptr->m_instance_list,
7111 selfptr->m_instance_count);
7112 selfptr->m_conf_spintime = conf.do_get_spintime(selfptr->m_instance_list,
7113 selfptr->m_instance_count);
7114
7115 /* spintime always 0 on platforms not supporting spin */
7116 if (!NdbSpin_is_supported())
7117 {
7118 selfptr->m_conf_spintime = 0;
7119 }
7120 selfptr->m_spintime = 0;
7121 memset(&selfptr->m_spin_stat, 0, sizeof(selfptr->m_spin_stat));
7122 selfptr->m_spin_stat.m_spin_interval[NUM_SPIN_INTERVALS - 1] = 0xFFFFFFFF;
7123
7124 selfptr->m_sched_responsiveness =
7125 globalEmulatorData.theConfiguration->schedulerResponsiveness();
7126 calculate_max_signals_parameters(selfptr);
7127
7128 selfptr->m_thr_id = my_thread_self();
7129
7130 for (Uint32 i = 0; i < selfptr->m_instance_count; i++)
7131 {
7132 BlockReference block = selfptr->m_instance_list[i];
7133 Uint32 main = blockToMain(block);
7134 Uint32 instance = blockToInstance(block);
7135 tmp.appfmt("%s(%u) ", getBlockName(main), instance);
7136 }
7137 /* Report parameters used by thread to node log */
7138 tmp.appfmt("realtime=%u, spintime=%u, max_signals_before_send=%u"
7139 ", max_signals_before_send_flush=%u",
7140 selfptr->m_realtime,
7141 selfptr->m_conf_spintime,
7142 selfptr->m_max_signals_before_send,
7143 selfptr->m_max_signals_before_send_flush);
7144
7145 printf("%s\n", tmp.c_str());
7146 fflush(stdout);
7147 if (fail)
7148 {
7149 #ifndef HAVE_MAC_OS_X_THREAD_INFO
7150 abort();
7151 #endif
7152 }
7153 }
7154
7155 /**
7156 * Align signal buffer for better cache performance.
7157  * Also skew it a little for each thread to avoid cache pollution.
7158 */
7159 #define SIGBUF_SIZE (sizeof(Signal) + 63 + 256 * MAX_BLOCK_THREADS)
7160 static Signal *
7161 aligned_signal(unsigned char signal_buf[SIGBUF_SIZE], unsigned thr_no)
7162 {
7163 UintPtr sigtmp= (UintPtr)signal_buf;
7164 sigtmp= (sigtmp+63) & (~(UintPtr)63);
7165 sigtmp+= thr_no*256;
7166 return (Signal *)sigtmp;
7167 }
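/**
 * Worked example of the arithmetic above (hypothetical addresses):
 * if signal_buf starts at 0x1003, (0x1003 + 63) & ~63 rounds it up to
 * the next 64-byte boundary, 0x1040. Thread 3 then adds 3 * 256 = 768
 * bytes of skew, giving 0x1340, so each thread's Signal object lands on
 * a different set of cache lines within its buffer, which is the skew
 * the comment above refers to.
 */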
7168
7169 /*
7170 * We only do receive in receiver thread(s), no other threads do receive.
7171 *
7172 * As part of the receive loop, we also periodically call update_connections()
7173 * (this way we are similar to single-threaded ndbd).
7174 *
7175  * The TRPMAN block (and no other blocks) runs in the same thread as this
7176 * receive loop; this way we avoid races between update_connections() and
7177 * TRPMAN calls into the transporters.
7178 */
7179
7180 /**
7181 * Array of pointers to TransporterReceiveHandleKernel
7182 * these are not used "in traffic"
7183 */
7184 static TransporterReceiveHandleKernel *
7185 g_trp_receive_handle_ptr[MAX_NDBMT_RECEIVE_THREADS];
7186
7187 /**
7188 * Array for mapping trps to receiver threads and function to access it.
7189 */
7190 static Uint32 g_trp_to_recv_thr_map[MAX_NTRANSPORTERS];
7191
7192 /**
7193  * We use this method both to initialise the realtime variable
7194  * and to update it. Currently there is no mechanism that updates
7195  * it at runtime, but it is likely that we will soon add one and
7196  * thus the code is prepared for this case.
7197 */
7198 static void
7199 update_rt_config(struct thr_data *selfptr,
7200 bool & real_time,
7201 enum ThreadTypes type)
7202 {
7203 bool old_real_time = real_time;
7204 real_time = selfptr->m_realtime;
7205 if (old_real_time == true && real_time == false)
7206 {
7207 yield_rt_break(selfptr->m_thread,
7208 type,
7209 false);
7210 }
7211 }
7212
7213 /**
7214  * We use this method both to initialise the spintime variable
7215  * and to update it. Currently there is no mechanism that updates
7216  * it at runtime, but it is likely that we will soon add one and
7217  * thus the code is prepared for this case.
7218 */
7219 static void
7220 update_spin_config(struct thr_data *selfptr,
7221 Uint64 & min_spin_timer)
7222 {
7223 min_spin_timer = selfptr->m_spintime;
7224 }
7225
7226 extern "C"
7227 void *
7228 mt_receiver_thread_main(void *thr_arg)
7229 {
7230 unsigned char signal_buf[SIGBUF_SIZE];
7231 Signal *signal;
7232 struct thr_repository* rep = g_thr_repository;
7233 struct thr_data* selfptr = (struct thr_data *)thr_arg;
7234 unsigned thr_no = selfptr->m_thr_no;
7235 Uint32& watchDogCounter = selfptr->m_watchdog_counter;
7236 const Uint32 recv_thread_idx = thr_no - first_receiver_thread_no;
7237 bool has_received = false;
7238 int cnt = 0;
7239 bool real_time = false;
7240 Uint64 min_spin_timer;
7241 NDB_TICKS yield_ticks;
7242 NDB_TICKS before;
7243
7244 init_thread(selfptr);
7245 signal = aligned_signal(signal_buf, thr_no);
7246 update_rt_config(selfptr, real_time, ReceiveThread);
7247 update_spin_config(selfptr, min_spin_timer);
7248
7249 /**
7250 * Object that keeps track of our pollReceive-state
7251 */
7252 TransporterReceiveHandleKernel recvdata(thr_no, recv_thread_idx);
7253 recvdata.assign_trps(g_trp_to_recv_thr_map);
7254 globalTransporterRegistry.init(recvdata);
7255
7256 /**
7257 * Save pointer to this for management/error-insert
7258 */
7259 g_trp_receive_handle_ptr[recv_thread_idx] = &recvdata;
7260
7261 NDB_TICKS now = NdbTick_getCurrentTicks();
7262 before = now;
7263 selfptr->m_curr_ticks = now;
7264 selfptr->m_signal = signal;
7265 selfptr->m_ticks = selfptr->m_scan_real_ticks = yield_ticks = now;
7266 Ndb_GetRUsage(&selfptr->m_scan_time_queue_rusage, false);
7267
7268 while (globalData.theRestartFlag != perform_stop)
7269 {
7270 if (cnt == 0)
7271 {
7272 watchDogCounter = 5;
7273 update_spin_config(selfptr, min_spin_timer);
7274 Uint32 max_spintime = 0;
7275 /**
7276        * The spin settings on the transporter level are only aimed at
7277        * the NDB API side. We have an elaborate scheme for handling
7278        * spinning in ndbmtd, so we shut down any spinning inside
7279        * the transporter here. The principle is to only spin in one
7280        * location, and spinning in the recv thread overrides any spinning
7281        * desired on the transporter level.
7282 */
7283 max_spintime = 0;
7284 globalTransporterRegistry.update_connections(recvdata,
7285 max_spintime);
7286 }
7287 cnt = (cnt + 1) & 15;
7288
7289 watchDogCounter = 2;
7290
7291 now = NdbTick_getCurrentTicks();
7292 selfptr->m_curr_ticks = now;
7293 const Uint32 lagging_timers = scan_time_queues(selfptr, now);
7294 Uint32 dummy1 = 0;
7295 Uint32 dummy2 = 0;
7296 bool dummy3 = false;
7297
7298 Uint32 sum = run_job_buffers(selfptr, signal, dummy1, dummy2, dummy3);
7299
7300 if (sum || has_received)
7301 {
7302 sendpacked(selfptr, signal);
7303 watchDogCounter = 6;
7304 flush_jbb_write_state(selfptr);
7305 }
7306
7307 const bool pending_send = do_send(selfptr, TRUE, FALSE);
7308
7309 watchDogCounter = 7;
7310
7311 if (real_time)
7312 {
7313 check_real_time_break(now,
7314 &yield_ticks,
7315 selfptr->m_thread,
7316 ReceiveThread);
7317 }
7318
7319 /**
7320      * Only allow sleeping in pollReceive when:
7321      * 1) We are not lagging behind in handling timer events.
7322      * 2) No more pending sends, or no send progress.
7323      * 3) There is no 'min_spin' configured, or min_spin has elapsed.
7324      * We will not check the spin timer until we have checked the
7325      * transporters for at least one loop and discovered no data. We also
7326      * ensure that we have not executed any signals before we start
7327      * the actual spin timer.
7328 */
7329 Uint32 delay = 0;
7330 Uint32 num_events = 0;
7331 Uint32 spin_micros = 0;
7332 update_spin_config(selfptr, min_spin_timer);
7333 before = NdbTick_getCurrentTicks();
7334
7335 if (lagging_timers == 0 && // 1)
7336 pending_send == false && // 2)
7337 (min_spin_timer == 0 || // 3)
7338 (sum == 0 &&
7339 !has_received &&
7340 check_recv_yield(selfptr,
7341 recvdata,
7342 min_spin_timer,
7343 num_events,
7344 &spin_micros,
7345 before))))
7346 {
7347 delay = 10; // 10 ms
7348 }
7349
7350 has_received = false;
7351 if (num_events == 0)
7352 {
7353 /* Need to call pollReceive if not already done in check_recv_yield */
7354 num_events = globalTransporterRegistry.pollReceive(delay, recvdata);
7355 }
7356 if (delay > 0)
7357 {
7358 NDB_TICKS after = NdbTick_getCurrentTicks();
7359 Uint64 micros_sleep = NdbTick_Elapsed(before, after).microSec();
7360 selfptr->m_micros_sleep += micros_sleep;
7361 wait_time_tracking(selfptr, micros_sleep);
7362 }
7363 if (num_events)
7364 {
7365 watchDogCounter = 8;
7366 lock(&rep->m_receive_lock[recv_thread_idx]);
7367 const bool buffersFull =
7368 (globalTransporterRegistry.performReceive(recvdata,
7369 recv_thread_idx) != 0);
7370 unlock(&rep->m_receive_lock[recv_thread_idx]);
7371 has_received = true;
7372
7373       if (buffersFull)       /* Receive queue(s) are full */
7374 {
7375 thr_data* waitthr = get_congested_recv_queue(rep, recv_thread_idx);
7376 if (waitthr != NULL) /* Will wait for buffers to be freed */
7377 {
7378 /**
7379 * Wait for thread 'waitthr' to consume some of the
7380 * pending signals in m_in_queue previously received
7381 * from this receive thread, 'thr_no'.
7382 * Will recheck queue status with 'check_recv_queue' after latch
7383 * has been set, and *before* going to sleep.
7384 */
7385 const Uint32 nano_wait = 1000*1000; /* -> 1 ms */
7386 thr_job_queue_head *wait_queue = waitthr->m_in_queue_head + thr_no;
7387 NDB_TICKS before = NdbTick_getCurrentTicks();
7388 const bool waited = yield(&wait_queue->m_waiter,
7389 nano_wait,
7390 check_recv_queue,
7391 wait_queue);
7392 if (waited)
7393 {
7394 NDB_TICKS after = NdbTick_getCurrentTicks();
7395 selfptr->m_buffer_full_micros_sleep +=
7396 NdbTick_Elapsed(before, after).microSec();
7397 }
7398 }
7399 }
7400 }
7401 selfptr->m_stat.m_loop_cnt++;
7402 }
7403
7404 globalEmulatorData.theWatchDog->unregisterWatchedThread(thr_no);
7405 return NULL; // Return value not currently used
7406 }
7407
7408 /**
7409 * Callback function used by yield() to recheck
7410 * 'job queue full' condition before going to sleep.
7411 *
7412 * Check if the specified 'thr_job_queue_head' (arg)
7413 * is still full, return true if so.
7414 */
7415 static bool
7416 check_congested_job_queue(thr_job_queue_head *waitfor)
7417 {
7418 return (compute_free_buffers_in_queue(waitfor) <= thr_job_queue::RESERVED);
7419 }
7420
7421 /**
7422  * Check if any out-queues of selfptr are full.
7423  * If so: return the 'thr_data*' for (one of) the thread(s)
7424  * which we have to wait for (to consume from the queue).
7425 */
7426 static struct thr_data*
7427 get_congested_job_queue(const thr_data *selfptr)
7428 {
7429 const Uint32 thr_no = selfptr->m_thr_no;
7430 struct thr_repository* rep = g_thr_repository;
7431 struct thr_data *thrptr = rep->m_thread;
7432 struct thr_data *waitfor = NULL;
7433
7434 for (unsigned i = 0; i<glob_num_threads; i++, thrptr++)
7435 {
7436 thr_job_queue_head *q_head = thrptr->m_in_queue_head + thr_no;
7437
7438 if (compute_free_buffers_in_queue(q_head) <= thr_job_queue::RESERVED)
7439 {
7440 if (thrptr != selfptr) // Don't wait on myself (yet)
7441 return thrptr;
7442 else
7443 waitfor = thrptr;
7444 }
7445 }
7446   return waitfor;             // Possibly 'waitfor == selfptr'
7447 }
7448
7449 /**
7450 * has_full_in_queues()
7451 *
7452 * Avoid circular waits between block-threads:
7453 * A thread is not allowed to sleep due to full
7454 * 'out' job-buffers if there are other threads
7455 * already having full 'in' job buffers sent to
7456 * this thread.
7457 *
7458  * run_job_buffers() has reserved a 'm_max_extra_signals'
7459  * quota which will be used to drain these 'full_in_queues',
7460  * so we should let it run instead of going to sleep.
7461 *
7462 * Returns 'true' if any in-queues to this thread are full
7463 */
7464 static
7465 bool
7466 has_full_in_queues(struct thr_data* selfptr)
7467 {
7468 thr_job_queue_head *head = selfptr->m_in_queue_head;
7469
7470 for (Uint32 thr_no = 0; thr_no < glob_num_threads; thr_no++, head++)
7471 {
7472 if (compute_free_buffers_in_queue(head) <= thr_job_queue::RESERVED)
7473 {
7474 return true;
7475 }
7476 }
7477 return false;
7478 }
7479
7480 /**
7481 * update_sched_config
7482 *
7483  * In order to prevent "job-buffer-full", i.e.
7484  *   that one thread (T1) produces so many signals to another thread (T2)
7485  *   that the ring-buffer from T1 to T2 gets full,
7486  *   the main loop has 2 "config" variables
7487  *   - m_max_exec_signals
7488  *     This is the *total* no of signals T1 can execute before calling
7489  *     this method again
7490  *   - m_max_signals_per_jb
7491  *     This is the max no of signals T1 can execute from each other thread
7492  *     in the system
7493  *
7494  * Assumption: each signal may send *at most* 4 signals
7495  *   - this assumption is the same in ndbd and ndbmtd and is
7496  *     mostly followed by block-code, although not in all places :-(
7497  *
7498  * This function returns true if it slept
7499  *   (i.e. it concluded that it could not execute *any* signals without
7500  *    risking job-buffer-full)
7501 */
7502 static
7503 bool
7504 update_sched_config(struct thr_data* selfptr,
7505 bool pending_send,
7506 Uint32 & send_sum,
7507 Uint32 & flush_sum)
7508 {
7509 Uint32 sleeploop = 0;
7510 Uint32 thr_no = selfptr->m_thr_no;
7511 selfptr->m_watchdog_counter = 16;
7512 loop:
7513 Uint32 minfree = compute_min_free_out_buffers(thr_no);
7514 Uint32 reserved = (minfree > thr_job_queue::RESERVED)
7515 ? thr_job_queue::RESERVED
7516 : minfree;
7517
7518 Uint32 avail = compute_max_signals_to_execute(minfree - reserved);
7519 Uint32 perjb = (avail + g_thr_repository->m_thread_count - 1) /
7520 g_thr_repository->m_thread_count;
7521
7522 if (selfptr->m_thr_no == 0)
7523 {
7524 /**
7525      * The main thread has some signals that execute for a bit longer than
7526      * signals in other threads. Thus we only allow the main thread to
7527      * execute at most 5 signals per round of signal execution. We handle
7528      * this here and also only handle signals from one queue at a time
7529      * with the main thread.
7530      *
7531      * LCP_FRAG_REP is one such signal that can currently execute for about
7532      * 1 millisecond, so 5 signals can become 5 milliseconds, which should
7533      * be fairly safe to ensure we always come back for the 10ms TIME_SIGNAL
7534      * that is handled by the main thread.
7535 */
7536 perjb = MAX(perjb, 5);
7537 }
7538 if (perjb > MAX_SIGNALS_PER_JB)
7539 perjb = MAX_SIGNALS_PER_JB;
7540
7541 selfptr->m_max_exec_signals = avail;
7542 selfptr->m_max_signals_per_jb = perjb;
7543 selfptr->m_max_extra_signals = compute_max_signals_to_execute(reserved);
7544
7545 if (unlikely(perjb == 0))
7546 {
7547 if (sleeploop == 10)
7548 {
7549 /**
7550 * we've slept for 10ms...try running anyway
7551 */
7552 selfptr->m_max_signals_per_jb = 1;
7553 ndbout_c("thr_no:%u - sleeploop 10!! "
7554 "(Worker thread blocked (>= 10ms) by slow consumer threads)",
7555 selfptr->m_thr_no);
7556 return true;
7557 }
7558
7559 struct thr_data* waitthr = get_congested_job_queue(selfptr);
7560 if (waitthr == NULL) // Waiters resolved
7561 {
7562 goto loop;
7563 }
7564 else if (has_full_in_queues(selfptr) &&
7565 selfptr->m_max_extra_signals > 0)
7566 {
7567 /* 'extra_signals' used to drain 'full_in_queues'. */
7568 return sleeploop > 0;
7569 }
7570
7571 if (pending_send)
7572 {
7573 /* About to sleep, _must_ send now. */
7574 pending_send = do_send(selfptr, TRUE, TRUE);
7575 send_sum = 0;
7576 flush_sum = 0;
7577 }
7578
7579 /**
7580 * Wait for thread 'waitthr' to consume some of the
7581 * pending signals in m_in_queue[].
7582      * Will recheck queue status with 'check_congested_job_queue'
7583 * after latch has been set, and *before* going to sleep.
7584 */
7585 const Uint32 nano_wait = 1000*1000; /* -> 1 ms */
7586 thr_job_queue_head *wait_queue = waitthr->m_in_queue_head + thr_no;
7587
7588 NDB_TICKS before = NdbTick_getCurrentTicks();
7589 const bool waited = yield(&wait_queue->m_waiter,
7590 nano_wait,
7591 check_congested_job_queue,
7592 wait_queue);
7593 if (waited)
7594 {
7595 NDB_TICKS after = NdbTick_getCurrentTicks();
7596 selfptr->m_buffer_full_micros_sleep +=
7597 NdbTick_Elapsed(before, after).microSec();
7598 sleeploop++;
7599 }
7600 goto loop;
7601 }
7602
7603 return sleeploop > 0;
7604 }
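/**
 * Worked example of the quota calculation above (hypothetical numbers):
 * if compute_max_signals_to_execute() yields avail = 100 and the node
 * runs m_thread_count = 9 block threads, then
 *   perjb = (100 + 9 - 1) / 9 = 12
 * signals may be executed from each other thread's job buffer, capped
 * at MAX_SIGNALS_PER_JB. Only when perjb ends up as 0 does the thread
 * consider sleeping, subject to the congestion checks above.
 */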
7605
7606 extern "C"
7607 void *
7608 mt_job_thread_main(void *thr_arg)
7609 {
7610 unsigned char signal_buf[SIGBUF_SIZE];
7611 Signal *signal;
7612
7613 struct thr_data* selfptr = (struct thr_data *)thr_arg;
7614 init_thread(selfptr);
7615 Uint32& watchDogCounter = selfptr->m_watchdog_counter;
7616
7617 unsigned thr_no = selfptr->m_thr_no;
7618 signal = aligned_signal(signal_buf, thr_no);
7619
7620 /* Avoid false watchdog alarms caused by race condition. */
7621 watchDogCounter = 21;
7622
7623 bool pending_send = false;
7624 Uint32 send_sum = 0;
7625 Uint32 flush_sum = 0;
7626 Uint32 loops = 0;
7627   Uint32 maxloops = 10; /* Loops before reading clock, adaptively tuned towards a 1ms frequency. */
7628 Uint32 waits = 0;
7629
7630 NDB_TICKS yield_ticks;
7631
7632 Uint64 min_spin_timer;
7633 bool real_time = false;
7634
7635 update_rt_config(selfptr, real_time, BlockThread);
7636 update_spin_config(selfptr, min_spin_timer);
7637
7638 NDB_TICKS now = NdbTick_getCurrentTicks();
7639 selfptr->m_ticks = yield_ticks = now;
7640 selfptr->m_scan_real_ticks = now;
7641 selfptr->m_signal = signal;
7642 selfptr->m_curr_ticks = now;
7643 Ndb_GetRUsage(&selfptr->m_scan_time_queue_rusage, false);
7644
7645 while (globalData.theRestartFlag != perform_stop)
7646 {
7647 loops++;
7648
7649 /**
7650      * Prefill our thread-local send buffers
7651      * up to THR_SEND_BUFFER_PRE_ALLOC (1Mb),
7652      *
7653      * and if this doesn't work, pack buffers before starting to execute signals.
7654 */
7655 watchDogCounter = 11;
7656 if (!selfptr->m_send_buffer_pool.fill(g_thr_repository->m_mm,
7657 RG_TRANSPORTER_BUFFERS,
7658 THR_SEND_BUFFER_PRE_ALLOC,
7659 selfptr->m_send_instance_no))
7660 {
7661 try_pack_send_buffers(selfptr);
7662 }
7663
7664 watchDogCounter = 2;
7665 const Uint32 lagging_timers = scan_time_queues(selfptr, now);
7666
7667 Uint32 sum = run_job_buffers(selfptr,
7668 signal,
7669 send_sum,
7670 flush_sum,
7671 pending_send);
7672
7673
7674 if (sum)
7675 {
7676 /**
7677 * It is imperative that we flush signals within our node after
7678        * each round of execution. This makes sure that the receiver
7679        * threads are woken up to do their work, which often means that
7680        * they will send some signals back to us (e.g. the commit
7681        * protocol for updates). Quite often we continue executing one
7682        * more loop, and while doing so the other threads can return
7683        * new signals to us; thus we avoid going back and forth to
7684        * sleep too often, which otherwise would happen.
7685 *
7686 * Many of the optimisations of having TC and LDM colocated
7687 * for transactions would go away unless we use this principle.
7688 *
7689 * No need to flush however if no signals have been executed since
7690 * last flush.
7691 *
7692 * No need to check for send packed signals if we didn't send
7693 * any signals, packed signals are sent as a result of an
7694 * executed signal.
7695 */
7696 sendpacked(selfptr, signal);
7697 watchDogCounter = 6;
7698 if (flush_sum > 0)
7699 {
7700 flush_jbb_write_state(selfptr);
7701 do_flush(selfptr);
7702 flush_sum = 0;
7703 }
7704 }
7705 /**
7706 * Scheduler is not allowed to yield until its internal
7707 * time has caught up on real time.
7708 */
7709 else if (lagging_timers == 0)
7710 {
7711 /* No signals processed, prepare to sleep to wait for more */
7712 if (send_sum > 0 || pending_send == true)
7713 {
7714 /* About to sleep, _must_ send now. */
7715 flush_jbb_write_state(selfptr);
7716 pending_send = do_send(selfptr, TRUE, TRUE);
7717 send_sum = 0;
7718 flush_sum = 0;
7719 }
7720
7721 /**
7722 * No more incoming signals to process yet, and we have
7723 * either completed all pending sends, or had no progress
7724 * due to full transporters in last do_send(). Wait for
7725 * more signals, use a shorter timeout if pending_send.
7726 */
7727 if (pending_send == false) /* Nothing pending, or no progress made */
7728 {
7729 /**
7730          * When min_spin_timer > 0 it means we are spinning. If we executed
7731          * jobs this time there is no reason to check the spin timer, and since
7732          * we executed at least one signal we are by definition not yet
7733 * spinning. Thus we can immediately move to the next loop.
7734 * Spinning is performed for a while when sum == 0 AND
7735 * min_spin_timer > 0. In this case we need to go into check_yield
7736 * and initialise spin timer (on first round) and check spin timer
7737 * on subsequent loops.
7738 */
7739 Uint32 spin_time_in_us = 0;
7740 update_spin_config(selfptr, min_spin_timer);
7741 NDB_TICKS before = NdbTick_getCurrentTicks();
7742 bool has_spun = (min_spin_timer != 0);
7743 if (min_spin_timer == 0 ||
7744 check_yield(selfptr,
7745 min_spin_timer,
7746 &spin_time_in_us,
7747 before))
7748 {
7749 /**
7750 * Sleep, either a short nap if send failed due to send overload,
7751            * or a longer sleep if there is no more work waiting.
7752 */
7753 Uint32 maxwait_in_us =
7754 (selfptr->m_node_overload_status >=
7755 (OverloadStatus)MEDIUM_LOAD_CONST) ?
7756 1 * 1000 :
7757 10 * 1000;
7758 if (maxwait_in_us < spin_time_in_us)
7759 {
7760 maxwait_in_us = 0;
7761 }
7762 else
7763 {
7764 maxwait_in_us -= spin_time_in_us;
7765 }
7766 selfptr->m_watchdog_counter = 18;
7767 const Uint32 used_maxwait_in_ns = maxwait_in_us * 1000;
7768 bool waited = yield(&selfptr->m_waiter,
7769 used_maxwait_in_ns,
7770 check_queues_empty,
7771 selfptr);
7772 if (waited)
7773 {
7774 waits++;
7775 /* Update current time after sleeping */
7776 now = NdbTick_getCurrentTicks();
7777 selfptr->m_curr_ticks = now;
7778 yield_ticks = now;
7779 Uint64 micros_sleep = NdbTick_Elapsed(before, now).microSec();
7780 selfptr->m_micros_sleep += micros_sleep;
7781 wait_time_tracking(selfptr, micros_sleep);
7782 selfptr->m_stat.m_wait_cnt += waits;
7783 selfptr->m_stat.m_loop_cnt += loops;
7784 if (selfptr->m_overload_status <=
7785 (OverloadStatus)MEDIUM_LOAD_CONST)
7786 {
7787 /**
7788 * To ensure that we at least check for trps to send to
7789 * before we yield we set pending_send to true. We will
7790 * quickly discover if nothing is pending.
7791 */
7792 pending_send = true;
7793 }
7794 waits = loops = 0;
7795 if (selfptr->m_thr_no == 0)
7796 {
7797 /**
7798 * NDBFS is using thread 0, here we need to call SEND_PACKED
7799 * to scan the memory channel for messages from NDBFS threads.
7800 * We want to do this here to avoid an extra loop in scheduler
7801 * before we discover those messages from NDBFS.
7802 */
7803 selfptr->m_watchdog_counter = 17;
7804 check_for_input_from_ndbfs(selfptr, signal);
7805 }
7806 }
7807 else if (has_spun)
7808 {
7809 selfptr->m_micros_sleep += spin_time_in_us;
7810 wait_time_tracking(selfptr, spin_time_in_us);
7811 }
7812 }
7813 }
7814 }
7815
7816 /**
7817 * Check if we executed enough signals,
7818 * and if so recompute how many signals to execute
7819 */
7820 now = NdbTick_getCurrentTicks();
7821 if (sum >= selfptr->m_max_exec_signals)
7822 {
7823 if (update_sched_config(selfptr,
7824 send_sum + Uint32(pending_send),
7825 send_sum,
7826 flush_sum))
7827 {
7828 /* Update current time after sleeping */
7829 selfptr->m_curr_ticks = now;
7830 selfptr->m_stat.m_wait_cnt += waits;
7831 selfptr->m_stat.m_loop_cnt += loops;
7832 waits = loops = 0;
7833 update_rt_config(selfptr, real_time, BlockThread);
7834 calculate_max_signals_parameters(selfptr);
7835 }
7836 }
7837 else
7838 {
7839 selfptr->m_max_exec_signals -= sum;
7840 }
7841
7842 /**
7843      * Adaptively read the system time each time 1ms
7844      * is likely to have passed.
7845 */
7846 now = NdbTick_getCurrentTicks();
7847 selfptr->m_curr_ticks = now;
7848 if (loops > maxloops)
7849 {
7850 if (real_time)
7851 {
7852 check_real_time_break(now,
7853 &yield_ticks,
7854 selfptr->m_thread,
7855 BlockThread);
7856 }
7857 const Uint64 diff = NdbTick_Elapsed(selfptr->m_ticks, now).milliSec();
7858
7859       /* Adjust 'maxloops' to achieve a clock reading frequency of 1ms */
7860 if (diff < 1)
7861         maxloops += ((maxloops/10) + 1); /* Clock hardly advanced: read less frequently */
7862 else if (diff > 1 && maxloops > 1)
7863         maxloops -= ((maxloops/10) + 1); /* Overslept: need more frequent reads */
7864
7865 selfptr->m_stat.m_wait_cnt += waits;
7866 selfptr->m_stat.m_loop_cnt += loops;
7867 waits = loops = 0;
7868 }
7869 }
7870
7871 globalEmulatorData.theWatchDog->unregisterWatchedThread(thr_no);
7872 return NULL; // Return value not currently used
7873 }
7874
7875 /**
7876 * Get number of pending signals at B-level in our own thread. Used
7877 * to make some decisions in rate-critical parts of the data node.
7878 */
7879 Uint32
7880 mt_getSignalsInJBB(Uint32 self)
7881 {
7882 Uint32 pending_signals = 0;
7883 struct thr_repository* rep = g_thr_repository;
7884 struct thr_data *selfptr = &rep->m_thread[self];
7885 for (Uint32 thr_no = 0; thr_no < glob_num_threads; thr_no++)
7886 {
7887 thr_jb_write_state *w = selfptr->m_write_states + thr_no;
7888 pending_signals += w->get_pending_signals();
7889 }
7890 return pending_signals;
7891 }
7892
7893 NDB_TICKS
7894 mt_getHighResTimer(Uint32 self)
7895 {
7896 struct thr_repository* rep = g_thr_repository;
7897 struct thr_data *selfptr = &rep->m_thread[self];
7898 return selfptr->m_curr_ticks;
7899 }
7900
7901 void
7902 mt_setNoSend(Uint32 self)
7903 {
7904 struct thr_repository* rep = g_thr_repository;
7905 struct thr_data *selfptr = &rep->m_thread[self];
7906 selfptr->m_nosend = 1;
7907 }
7908
7909 void
7910 mt_startChangeNeighbourNode()
7911 {
7912 if (g_send_threads)
7913 {
7914 g_send_threads->startChangeNeighbourNode();
7915 }
7916 }
7917
7918 void
7919 mt_setNeighbourNode(NodeId node)
7920 {
7921 if (g_send_threads)
7922 {
7923 g_send_threads->setNeighbourNode(node);
7924 }
7925 }
7926
7927 void
7928 mt_endChangeNeighbourNode()
7929 {
7930 if (g_send_threads)
7931 {
7932 g_send_threads->endChangeNeighbourNode();
7933 }
7934 }
7935
7936 void
7937 mt_setOverloadStatus(Uint32 self,
7938 OverloadStatus new_status)
7939 {
7940 struct thr_repository* rep = g_thr_repository;
7941 struct thr_data *selfptr = &rep->m_thread[self];
7942 selfptr->m_overload_status = new_status;
7943 }
7944
7945 void
7946 mt_setWakeupThread(Uint32 self,
7947 Uint32 wakeup_instance)
7948 {
7949 struct thr_repository* rep = g_thr_repository;
7950 struct thr_data *selfptr = &rep->m_thread[self];
7951 selfptr->m_wakeup_instance = wakeup_instance;
7952 }
7953
7954 void
7955 mt_setNodeOverloadStatus(Uint32 self,
7956 OverloadStatus new_status)
7957 {
7958 struct thr_repository* rep = g_thr_repository;
7959 struct thr_data *selfptr = &rep->m_thread[self];
7960 selfptr->m_node_overload_status = new_status;
7961 }
7962
7963 void
7964 mt_setSendNodeOverloadStatus(OverloadStatus new_status)
7965 {
7966 if (g_send_threads)
7967 {
7968 g_send_threads->setNodeOverloadStatus(new_status);
7969 }
7970 }
7971
7972 void
7973 mt_setSpintime(Uint32 self, Uint32 new_spintime)
7974 {
7975 struct thr_repository* rep = g_thr_repository;
7976 struct thr_data *selfptr = &rep->m_thread[self];
7977 /* spintime always 0 on platforms not supporting spin */
7978 if (!NdbSpin_is_supported())
7979 {
7980 new_spintime = 0;
7981 }
7982 selfptr->m_spintime = new_spintime;
7983 }
7984
7985 Uint32
7986 mt_getConfiguredSpintime(Uint32 self)
7987 {
7988 struct thr_repository* rep = g_thr_repository;
7989 struct thr_data *selfptr = &rep->m_thread[self];
7990
7991 return selfptr->m_conf_spintime;
7992 }
7993
7994 Uint32
7995 mt_getWakeupLatency(void)
7996 {
7997 return glob_wakeup_latency;
7998 }
7999
8000 void
8001 mt_setWakeupLatency(Uint32 latency)
8002 {
8003 /**
8004    * Round up to the next 5 microseconds (+4) AND
8005    * add 2 microseconds for the time it takes to actually go to sleep (+2).
8006 * Rounding up is an attempt to decrease variance by selecting the
8007 * latency more coarsely.
8008 *
8009 */
8010 latency = (latency + 4 + 2) / 5;
8011 latency *= 5;
8012 glob_wakeup_latency = latency;
8013 }
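/**
 * Worked example of the rounding above: a measured latency of 23
 * microseconds becomes (23 + 4 + 2) / 5 = 5, then 5 * 5 = 25, so
 * glob_wakeup_latency is set to 25 microseconds. A measurement of 25
 * gives (25 + 6) / 5 = 6 -> 30, i.e. the value always lands on a
 * multiple of 5 at or above the measurement plus the 2 microsecond
 * sleep cost.
 */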
8014
8015 void
8016 mt_flush_send_buffers(Uint32 self)
8017 {
8018 struct thr_repository* rep = g_thr_repository;
8019 struct thr_data *selfptr = &rep->m_thread[self];
8020 do_flush(selfptr);
8021 }
8022
8023 void
8024 mt_set_watchdog_counter(Uint32 self)
8025 {
8026 struct thr_repository* rep = g_thr_repository;
8027 struct thr_data *selfptr = &rep->m_thread[self];
8028 selfptr->m_watchdog_counter = 12;
8029 }
8030
8031 void
8032 mt_getPerformanceTimers(Uint32 self,
8033 Uint64 & micros_sleep,
8034 Uint64 & spin_time,
8035 Uint64 & buffer_full_micros_sleep,
8036 Uint64 & micros_send)
8037 {
8038 struct thr_repository* rep = g_thr_repository;
8039 struct thr_data *selfptr = &rep->m_thread[self];
8040
8041 /**
8042    * Internally in mt.cpp sleep time now includes spin time. However,
8043    * to ensure backwards compatibility we report them separately to
8044    * any block users of this information.
8045 */
8046 micros_sleep = selfptr->m_micros_sleep;
8047 spin_time = selfptr->m_measured_spintime;
8048 if (micros_sleep >= spin_time)
8049 {
8050 micros_sleep -= spin_time;
8051 }
8052 else
8053 {
8054 micros_sleep = 0;
8055 }
8056 buffer_full_micros_sleep = selfptr->m_buffer_full_micros_sleep;
8057 micros_send = selfptr->m_micros_send;
8058 }
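/**
 * Reading example: if a thread has m_micros_sleep = 1500 (which, as
 * noted above, internally includes spinning) and m_measured_spintime =
 * 400, this call reports micros_sleep = 1100 and spin_time = 400, so
 * callers still see the two numbers as disjoint, as they were before
 * spin time was folded into sleep time.
 */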
8059
8060 const char *
8061 mt_getThreadDescription(Uint32 self)
8062 {
8063 if (is_main_thread(self))
8064 {
8065 if (self == 0)
8066 return "main thread, schema and distribution handling";
8067 else if (self == 1)
8068 return "rep thread, asynch replication and proxy block handling";
8069 require(false);
8070 }
8071 else if (is_ldm_thread(self))
8072 {
8073 return "ldm thread, handling a set of data partitions";
8074 }
8075 else if (is_tc_thread(self))
8076 {
8077 return "tc thread, transaction handling, unique index and pushdown join"
8078 " handling";
8079 }
8080 else if (is_recv_thread(self))
8081 {
8082     return "receive thread, performing receive and polling for new receives";
8083 }
8084 else
8085 {
8086 require(false);
8087 }
8088 return NULL;
8089 }
8090
8091 const char *
8092 mt_getThreadName(Uint32 self)
8093 {
8094 if (is_main_thread(self))
8095 {
8096 if (self == 0)
8097 return "main";
8098 else if (self == 1)
8099 return "rep";
8100 require(false);
8101 }
8102 else if (is_ldm_thread(self))
8103 {
8104 return "ldm";
8105 }
8106 else if (is_tc_thread(self))
8107 {
8108 return "tc";
8109 }
8110 else if (is_recv_thread(self))
8111 {
8112 return "recv";
8113 }
8114 else
8115 {
8116 require(false);
8117 }
8118 return NULL;
8119 }
8120
8121 void
8122 mt_getSendPerformanceTimers(Uint32 send_instance,
8123 Uint64 & exec_time,
8124 Uint64 & sleep_time,
8125 Uint64 & spin_time,
8126 Uint64 & user_time_os,
8127 Uint64 & kernel_time_os,
8128 Uint64 & elapsed_time_os)
8129 {
8130 assert(g_send_threads != NULL);
8131 if (g_send_threads != NULL)
8132 {
8133 g_send_threads->getSendPerformanceTimers(send_instance,
8134 exec_time,
8135 sleep_time,
8136 spin_time,
8137 user_time_os,
8138 kernel_time_os,
8139 elapsed_time_os);
8140 }
8141 }
8142
8143 Uint32
8144 mt_getNumSendThreads()
8145 {
8146 return globalData.ndbMtSendThreads;
8147 }
8148
8149 Uint32
8150 mt_getNumThreads()
8151 {
8152 return glob_num_threads;
8153 }
8154
8155 void
8156 sendlocal(Uint32 self, const SignalHeader *s, const Uint32 *data,
8157 const Uint32 secPtr[3])
8158 {
8159 Uint32 block = blockToMain(s->theReceiversBlockNumber);
8160 Uint32 instance = blockToInstance(s->theReceiversBlockNumber);
8161
8162 /*
8163 * Max number of signals to put into job buffer before flushing the buffer
8164 * to the other thread.
8165    * This parameter was found to be reasonable by benchmarking.
8166 */
8167 Uint32 MAX_SIGNALS_BEFORE_FLUSH = (self >= first_receiver_thread_no) ?
8168 MAX_SIGNALS_BEFORE_FLUSH_RECEIVER :
8169 MAX_SIGNALS_BEFORE_FLUSH_OTHER;
8170
8171 Uint32 dst = block2ThreadId(block, instance);
8172 struct thr_repository* rep = g_thr_repository;
8173 struct thr_data *selfptr = &rep->m_thread[self];
8174 assert(my_thread_equal(selfptr->m_thr_id, my_thread_self()));
8175 struct thr_data *dstptr = &rep->m_thread[dst];
8176
8177 selfptr->m_stat.m_priob_count++;
8178 Uint32 siglen = (sizeof(*s) >> 2) + s->theLength + s->m_noOfSections;
8179 selfptr->m_stat.m_priob_size += siglen;
8180
8181 assert(s->theLength + s->m_noOfSections <= 25);
8182 thr_job_queue *q = dstptr->m_in_queue + self;
8183 thr_job_queue_head *h = dstptr->m_in_queue_head + self;
8184 thr_jb_write_state *w = selfptr->m_write_states + dst;
8185 if (insert_signal(q, h, w, false, s, data, secPtr, selfptr->m_next_buffer))
8186 {
8187 selfptr->m_next_buffer = seize_buffer(rep, self, false);
8188 }
8189 if (w->get_pending_signals() >= MAX_SIGNALS_BEFORE_FLUSH)
8190 {
8191 flush_write_state(selfptr, dstptr, h, w, false);
8192 }
8193 }
8194
8195 void
8196 sendprioa(Uint32 self, const SignalHeader *s, const uint32 *data,
8197 const Uint32 secPtr[3])
8198 {
8199 Uint32 block = blockToMain(s->theReceiversBlockNumber);
8200 Uint32 instance = blockToInstance(s->theReceiversBlockNumber);
8201
8202 Uint32 dst = block2ThreadId(block, instance);
8203 struct thr_repository* rep = g_thr_repository;
8204 struct thr_data *selfptr = &rep->m_thread[self];
8205 assert(s->theVerId_signalNumber == GSN_START_ORD ||
8206 my_thread_equal(selfptr->m_thr_id, my_thread_self()));
8207 struct thr_data *dstptr = &rep->m_thread[dst];
8208
8209 selfptr->m_stat.m_prioa_count++;
8210 Uint32 siglen = (sizeof(*s) >> 2) + s->theLength + s->m_noOfSections;
8211 selfptr->m_stat.m_prioa_size += siglen;
8212
8213 thr_job_queue *q = &(dstptr->m_jba);
8214 thr_job_queue_head *h = &(dstptr->m_jba_head);
8215 thr_jb_write_state w;
8216
8217 if (selfptr == dstptr)
8218 {
8219 /**
8220 * Indicate that we sent Prio A signal to ourself.
8221 */
8222 selfptr->m_sent_local_prioa_signal = true;
8223 }
8224
8225 w.init_pending_signals();
8226 lock(&dstptr->m_jba_write_lock);
8227
8228 Uint32 index = h->m_write_index;
8229 w.m_write_index = index;
8230 thr_job_buffer *buffer = q->m_buffers[index];
8231 w.m_write_buffer = buffer;
8232 w.m_write_pos = buffer->m_len;
8233 bool buf_used = insert_signal(q, h, &w, true, s, data, secPtr,
8234 selfptr->m_next_buffer);
8235 flush_write_state(selfptr, dstptr, h, &w, true);
8236
8237 unlock(&dstptr->m_jba_write_lock);
8238 if (w.has_any_pending_signals())
8239 {
8240 wakeup(&(dstptr->m_waiter));
8241 }
8242 if (buf_used)
8243 selfptr->m_next_buffer = seize_buffer(rep, self, true);
8244 }
8245
8246 /**
8247 * Send a signal to a remote node.
8248 *
8249 * (The signal is only queued here, and actually sent later in do_send()).
8250 */
8251 SendStatus
8252 mt_send_remote(Uint32 self, const SignalHeader *sh, Uint8 prio,
8253 const Uint32 * data, NodeId nodeId,
8254 const LinearSectionPtr ptr[3])
8255 {
8256 thr_repository *rep = g_thr_repository;
8257 struct thr_data *selfptr = &rep->m_thread[self];
8258 SendStatus ss;
8259
8260 mt_send_handle handle(selfptr);
8261 /* prepareSend() is lock-free, as we have per-thread send buffers. */
8262 TrpId trp_id = 0;
8263 ss = globalTransporterRegistry.prepareSend(&handle,
8264 sh,
8265 prio,
8266 data,
8267 nodeId,
8268 trp_id,
8269 ptr);
8270 if (likely(ss == SEND_OK))
8271 {
8272 register_pending_send(selfptr, trp_id);
8273 }
8274 return ss;
8275 }
8276
8277 SendStatus
8278 mt_send_remote(Uint32 self, const SignalHeader *sh, Uint8 prio,
8279 const Uint32 *data, NodeId nodeId,
8280 class SectionSegmentPool *thePool,
8281 const SegmentedSectionPtr ptr[3])
8282 {
8283 thr_repository *rep = g_thr_repository;
8284 struct thr_data *selfptr = &rep->m_thread[self];
8285 SendStatus ss;
8286
8287 mt_send_handle handle(selfptr);
8288 TrpId trp_id = 0;
8289 ss = globalTransporterRegistry.prepareSend(&handle,
8290 sh,
8291 prio,
8292 data,
8293 nodeId,
8294 trp_id,
8295 *thePool, ptr);
8296 if (likely(ss == SEND_OK))
8297 {
8298 register_pending_send(selfptr, trp_id);
8299 }
8300 return ss;
8301 }
8302
8303 /*
8304  * This function sends a prio A STOP_FOR_CRASH signal to a thread.
8305 *
8306 * It works when called from any other thread, not just from job processing
8307 * threads. But note that this signal will be the last signal to be executed by
8308 * the other thread, as it will exit immediately.
8309 */
8310 static
8311 void
8312 sendprioa_STOP_FOR_CRASH(const struct thr_data *selfptr, Uint32 dst)
8313 {
8314 SignalT<StopForCrash::SignalLength> signalT;
8315 struct thr_repository* rep = g_thr_repository;
8316 /* As this signal will be the last one executed by the other thread, it does
8317 not matter which buffer we use in case the current buffer is filled up by
8318 the STOP_FOR_CRASH signal; the data in it will never be read.
8319 */
8320 static Uint32 MAX_WAIT = 3000;
8321 static thr_job_buffer dummy_buffer;
8322
8323 /**
8324 * Pick any instance running in this thread
8325 */
8326 struct thr_data *dstptr = &rep->m_thread[dst];
8327 Uint32 bno = dstptr->m_instance_list[0];
8328
8329 memset(&signalT.header, 0, sizeof(SignalHeader));
8330 signalT.header.theVerId_signalNumber = GSN_STOP_FOR_CRASH;
8331 signalT.header.theReceiversBlockNumber = bno;
8332 signalT.header.theSendersBlockRef = 0;
8333 signalT.header.theTrace = 0;
8334 signalT.header.theSendersSignalId = 0;
8335 signalT.header.theSignalId = 0;
8336 signalT.header.theLength = StopForCrash::SignalLength;
8337 StopForCrash * stopForCrash = CAST_PTR(StopForCrash, &signalT.theData[0]);
8338 stopForCrash->flags = 0;
8339
8340 thr_job_queue *q = &(dstptr->m_jba);
8341 thr_job_queue_head *h = &(dstptr->m_jba_head);
8342 thr_jb_write_state w;
8343
8344 /**
8345 * Ensure that a crash while holding m_jba_write_lock won't block
8346 * dump process forever.
8347 */
8348 Uint64 loop_count = 0;
8349 const NDB_TICKS start_try_lock = NdbTick_getCurrentTicks();
8350 while (trylock(&dstptr->m_jba_write_lock) != 0)
8351 {
8352 if (++loop_count >= 10000)
8353 {
8354 const NDB_TICKS now = NdbTick_getCurrentTicks();
8355 if (NdbTick_Elapsed(start_try_lock, now).milliSec() > MAX_WAIT)
8356 {
8357 return;
8358 }
8359 NdbSleep_MilliSleep(1);
8360 loop_count = 0;
8361 }
8362 }
8363
8364 w.init_pending_signals();
8365 Uint32 index = h->m_write_index;
8366 w.m_write_index = index;
8367 thr_job_buffer *buffer = q->m_buffers[index];
8368 w.m_write_buffer = buffer;
8369 w.m_write_pos = buffer->m_len;
8370 insert_signal(q, h, &w, true, &signalT.header, signalT.theData, NULL,
8371 &dummy_buffer);
8372 flush_write_state(selfptr, dstptr, h, &w, true);
8373
8374 unlock(&dstptr->m_jba_write_lock);
8375 if (w.has_any_pending_signals())
8376 {
8377 loop_count = 0;
8378 /**
8379 * Ensure that a crash while holding wakeup lock won't block
8380 * dump process forever. We will wait at most 3 seconds.
8381 */
8382 const NDB_TICKS start_try_wakeup = NdbTick_getCurrentTicks();
8383 while (try_wakeup(&(dstptr->m_waiter)) != 0)
8384 {
8385 if (++loop_count >= 10000)
8386 {
8387 const NDB_TICKS now = NdbTick_getCurrentTicks();
8388 if (NdbTick_Elapsed(start_try_wakeup, now).milliSec() > MAX_WAIT)
8389 {
8390 return;
8391 }
8392 NdbSleep_MilliSleep(1);
8393 loop_count = 0;
8394 }
8395 }
8396 }
8397 }
8398
8399 /**
8400 * Implements the rules for which threads are allowed to have
8401 * communication with each other.
8402 * Also see compute_jb_pages() which has similar logic.
8403 */
8404 static bool
8405 may_communicate(unsigned from, unsigned to)
8406 {
8407 if (is_main_thread(from))
8408 {
8409     // Main threads communicate with all other threads
8410 return true;
8411 }
8412 else if (is_ldm_thread(from))
8413 {
8414 // First LDM is special as it may act as internal client
8415 // during backup, and thus communicate with other LDMs:
8416 if (is_first_ldm_thread(from) && is_ldm_thread(to))
8417 return true;
8418
8419     // All LDM threads can communicate with TC threads, main threads,
8420     // themselves, and the BACKUP client (above)
8421 return is_main_thread(to) ||
8422 is_tc_thread(to) ||
8423 is_first_ldm_thread(to) ||
8424 (to == from);
8425 }
8426 else if (is_tc_thread(from))
8427 {
8428     // TC threads can communicate with SPJ-, LQH-, main threads and themselves
8429 return is_main_thread(to) ||
8430 is_ldm_thread(to) ||
8431 is_tc_thread(to); // Cover both SPJs and itself
8432 }
8433 else
8434 {
8435 assert(is_recv_thread(from));
8436     // Receive threads communicate with all, except other receivers
8437 return !is_recv_thread(to);
8438 }
8439 }
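/**
 * The rules above can be summarised in a small matrix
 * (rows = sender, columns = receiver):
 *
 *            main   ldm    tc     recv
 *   main     yes    yes    yes    yes
 *   ldm      yes    self*  yes    no
 *   tc       yes    yes    yes    no
 *   recv     yes    yes    yes    no
 *
 *   *) plus the first LDM thread, which talks to all LDM threads
 *      (and all LDM threads talk to it) for the backup case.
 */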
8440
8441 /**
8442 * init functions
8443 */
8444 static
8445 void
8446 queue_init(struct thr_tq* tq)
8447 {
8448 tq->m_next_timer = 0;
8449 tq->m_current_time = 0;
8450 tq->m_next_free = RNIL;
8451 tq->m_cnt[0] = tq->m_cnt[1] = tq->m_cnt[2] = 0;
8452 bzero(tq->m_delayed_signals, sizeof(tq->m_delayed_signals));
8453 }
8454
8455 static
8456 void
8457 thr_init(struct thr_repository* rep, struct thr_data *selfptr, unsigned int cnt,
8458 unsigned thr_no)
8459 {
8460 Uint32 i;
8461
8462 selfptr->m_thr_no = thr_no;
8463 selfptr->m_next_jbb_no = 0;
8464 selfptr->m_max_signals_per_jb = MAX_SIGNALS_PER_JB;
8465 selfptr->m_max_exec_signals = 0;
8466 selfptr->m_max_extra_signals = 0;
8467 selfptr->m_first_free = 0;
8468 selfptr->m_first_unused = 0;
8469 selfptr->m_send_instance_no = 0;
8470 selfptr->m_send_instance = NULL;
8471 selfptr->m_nosend = 1;
8472
8473 {
8474 char buf[100];
8475 BaseString::snprintf(buf, sizeof(buf), "jbalock thr: %u", thr_no);
8476 register_lock(&selfptr->m_jba_write_lock, buf);
8477 }
8478 selfptr->m_jba_head.m_read_index = 0;
8479 selfptr->m_jba_head.m_write_index = 0;
8480 thr_job_buffer *buffer = seize_buffer(rep, thr_no, true);
8481 selfptr->m_jba.m_buffers[0] = buffer;
8482 selfptr->m_jba_read_state.m_read_index = 0;
8483 selfptr->m_jba_read_state.m_read_buffer = buffer;
8484 selfptr->m_jba_read_state.m_read_pos = 0;
8485 selfptr->m_jba_read_state.m_read_end = 0;
8486 selfptr->m_jba_read_state.m_write_index = 0;
8487 selfptr->m_next_buffer = seize_buffer(rep, thr_no, false);
8488 selfptr->m_send_buffer_pool.set_pool(&rep->m_sb_pool);
8489
8490 for (i = 0; i<cnt; i++)
8491 {
8492 selfptr->m_in_queue_head[i].m_waiter.init();
8493 selfptr->m_in_queue_head[i].m_read_index = 0;
8494 selfptr->m_in_queue_head[i].m_write_index = 0;
8495 buffer = may_communicate(i,thr_no)
8496 ? seize_buffer(rep, thr_no, false) : NULL;
8497 selfptr->m_in_queue[i].m_buffers[0] = buffer;
8498 selfptr->m_read_states[i].m_read_index = 0;
8499 selfptr->m_read_states[i].m_read_buffer = buffer;
8500 selfptr->m_read_states[i].m_read_pos = 0;
8501 selfptr->m_read_states[i].m_read_end = 0;
8502 selfptr->m_read_states[i].m_write_index = 0;
8503 }
8504 queue_init(&selfptr->m_tq);
8505
8506 bzero(&selfptr->m_stat, sizeof(selfptr->m_stat));
8507
8508 selfptr->m_pending_send_count = 0;
8509 selfptr->m_pending_send_mask.clear();
8510
8511 selfptr->m_instance_count = 0;
8512 for (i = 0; i < MAX_INSTANCES_PER_THREAD; i++)
8513 selfptr->m_instance_list[i] = 0;
8514
8515 bzero(&selfptr->m_send_buffers, sizeof(selfptr->m_send_buffers));
8516
8517 selfptr->m_thread = 0;
8518 selfptr->m_cpu = NO_LOCK_CPU;
8519 #ifdef ERROR_INSERT
8520 selfptr->m_delayed_prepare = false;
8521 #endif
8522 }
8523
8524 /* Have to do this after init of all m_in_queues is done. */
8525 static
8526 void
8527 thr_init2(struct thr_repository* rep, struct thr_data *selfptr,
8528 unsigned int cnt, unsigned thr_no)
8529 {
8530 for (Uint32 i = 0; i<cnt; i++)
8531 {
8532 selfptr->m_write_states[i].m_write_index = 0;
8533 selfptr->m_write_states[i].m_write_pos = 0;
8534 selfptr->m_write_states[i].m_write_buffer =
8535 rep->m_thread[i].m_in_queue[thr_no].m_buffers[0];
8536 selfptr->m_write_states[i].init_pending_signals();
8537 }
8538 }
8539
8540 static
8541 void
8542 receive_lock_init(Uint32 recv_thread_id, thr_repository *rep)
8543 {
8544 char buf[100];
8545 BaseString::snprintf(buf, sizeof(buf), "receive lock thread id %d",
8546 recv_thread_id);
8547 register_lock(&rep->m_receive_lock[recv_thread_id], buf);
8548 }
8549
8550 static
8551 void
8552 send_buffer_init(Uint32 id, thr_repository::send_buffer * sb)
8553 {
8554 char buf[100];
8555 BaseString::snprintf(buf, sizeof(buf), "send lock trp %d", id);
8556 register_lock(&sb->m_send_lock, buf);
8557 BaseString::snprintf(buf, sizeof(buf), "send_buffer lock trp %d", id);
8558 register_lock(&sb->m_buffer_lock, buf);
8559 sb->m_buffered_size = 0;
8560 sb->m_sending_size = 0;
8561 sb->m_force_send = 0;
8562 sb->m_bytes_sent = 0;
8563 sb->m_send_thread = NO_SEND_THREAD;
8564 sb->m_enabled = false;
8565 bzero(&sb->m_buffer, sizeof(sb->m_buffer));
8566 bzero(&sb->m_sending, sizeof(sb->m_sending));
8567 bzero(sb->m_read_index, sizeof(sb->m_read_index));
8568 }
8569
8570 static
8571 void
8572 rep_init(struct thr_repository* rep, unsigned int cnt, Ndbd_mem_manager *mm)
8573 {
8574 rep->m_mm = mm;
8575
8576 rep->m_thread_count = cnt;
8577 for (unsigned int i = 0; i<cnt; i++)
8578 {
8579 thr_init(rep, &rep->m_thread[i], cnt, i);
8580 }
8581 for (unsigned int i = 0; i<cnt; i++)
8582 {
8583 thr_init2(rep, &rep->m_thread[i], cnt, i);
8584 }
8585
8586 rep->stopped_threads = 0;
8587 NdbMutex_Init(&rep->stop_for_crash_mutex);
8588 NdbCondition_Init(&rep->stop_for_crash_cond);
8589
8590 for (Uint32 i = 0; i < NDB_ARRAY_SIZE(rep->m_receive_lock); i++)
8591 {
8592 receive_lock_init(i, rep);
8593 }
8594 for (int i = 0 ; i < MAX_NTRANSPORTERS; i++)
8595 {
8596 send_buffer_init(i, rep->m_send_buffers+i);
8597 }
8598
8599 bzero(rep->m_thread_send_buffers, sizeof(rep->m_thread_send_buffers));
8600 }
8601
8602
8603 /**
8604 * Thread Config
8605 */
8606
8607 static Uint32
8608 get_total_number_of_block_threads(void)
8609 {
8610 return (NUM_MAIN_THREADS +
8611 globalData.ndbMtLqhThreads +
8612 globalData.ndbMtTcThreads +
8613 globalData.ndbMtReceiveThreads);
8614 }
8615
8616 static Uint32
8617 get_num_trps()
8618 {
8619 Uint32 count = 0;
8620 for (Uint32 id = 1; id < MAX_NTRANSPORTERS; id++)
8621 {
8622 if (globalTransporterRegistry.get_transporter(id))
8623 {
8624 count++;
8625 }
8626 }
8627 return count;
8628 }
8629
8630 /**
8631 * This function returns the amount of extra send buffer pages
8632 * that we should allocate in addition to the amount allocated
8633 * for each trp send buffer.
8634 */
8635 #define MIN_SEND_BUFFER_GENERAL (512) //16M
8636 #define MIN_SEND_BUFFER_PER_NODE (8) //256k
8637 #define MIN_SEND_BUFFER_PER_THREAD (64) //2M
8638
8639 Uint32
8640 mt_get_extra_send_buffer_pages(Uint32 curr_num_pages,
8641 Uint32 extra_mem_pages)
8642 {
8643 Uint32 loc_num_threads = get_total_number_of_block_threads();
8644 Uint32 num_trps = get_num_trps();
8645
8646 Uint32 extra_pages = extra_mem_pages;
8647
8648 /**
8649    * Add 2M for each thread since we allocate 1M at a time
8650    * and we also ensure that there is a minimum
8651    * of 1M of send buffer in each thread. Thus we can easily have
8652    * 2M of send buffer just to keep the contention around the
8653    * send buffer page spinlock small. This memory is added independently
8654    * of the configuration settings since the user cannot be
8655    * expected to account for it, and also since we could change this
8656    * behaviour at any time.
8657 */
8658 extra_pages += loc_num_threads * THR_SEND_BUFFER_MAX_FREE;
8659
8660 if (extra_mem_pages == 0)
8661 {
8662 /**
8663      * The user has set extra send buffer memory to 0 and left it to us
8664      * to decide on our own how much extra memory is needed.
8665      *
8666      * We'll make sure that we have at least a minimum of 16M +
8667      * 2M per thread + 256k per trp. If we already reach this level
8668      * with curr_num_pages and our local additions we don't add
8669      * anything more; if we don't come up to this level we add enough
8670      * to reach this minimum level.
8671 */
8672 Uint32 min_pages = MIN_SEND_BUFFER_GENERAL +
8673 (MIN_SEND_BUFFER_PER_NODE * num_trps) +
8674 (MIN_SEND_BUFFER_PER_THREAD * loc_num_threads);
8675
8676 if ((curr_num_pages + extra_pages) < min_pages)
8677 {
8678 extra_pages = min_pages - curr_num_pages;
8679 }
8680 }
8681 return extra_pages;
8682 }
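/**
 * Worked example (hypothetical numbers): with 8 block threads and 3
 * transporters, and assuming THR_SEND_BUFFER_MAX_FREE corresponds to
 * the 2M per thread mentioned above, the minimum becomes
 *   MIN_SEND_BUFFER_GENERAL + 3 * MIN_SEND_BUFFER_PER_NODE +
 *   8 * MIN_SEND_BUFFER_PER_THREAD = 512 + 24 + 512 = 1048 pages.
 * If extra_mem_pages == 0 and curr_num_pages plus the per-thread
 * addition falls short of 1048 pages, extra_pages is raised so that
 * the total reaches that minimum.
 */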
8683
8684 Uint32
8685 compute_jb_pages(struct EmulatorData * ed)
8686 {
8687 Uint32 cnt = get_total_number_of_block_threads();
8688 Uint32 num_receive_threads = globalData.ndbMtReceiveThreads;
8689 Uint32 num_lqh_threads = globalData.ndbMtLqhThreads;
8690 Uint32 num_tc_threads = globalData.ndbMtTcThreads;
8691 Uint32 num_main_threads = NUM_MAIN_THREADS;
8692
8693 /**
8694 * Number of pages each thread needs to communicate with another
8695 * thread.
8696 */
8697 Uint32 job_queue_pages_per_thread = thr_job_queue::SIZE;
8698
8699 /**
8700    * In 'perthread' we calculate the number of pages required by
8701    * each 'block thread' (excludes 'send-threads'). 'perthread'
8702    * usage is independent of whether this thread will communicate
8703    * with other 'block threads' or not.
8704 */
8705 Uint32 perthread = 0;
8706
8707 /**
8708    * Each thread has its own job_queue for 'prio A' signals
8709 */
8710 perthread += job_queue_pages_per_thread;
8711
8712 /**
8713    * Each thread keeps an available free page in 'm_next_buffer'
8714 * in case it is required by insert_signal() into JBA or JBB.
8715 */
8716 perthread += 1;
8717
8718 /**
8719    * Each thread keeps time-queued signals in 'struct thr_tq';
8720 * thr_tq::PAGES are used to store these.
8721 */
8722 perthread += thr_tq::PAGES;
8723
8724 /**
8725 * Each thread has its own 'm_free_fifo[THR_FREE_BUF_MAX]' cache.
8726    * As it is filled to MAX *before* a page is allocated (which consumes a page),
8727    * it will never cache more than MAX-1 pages. Pages are also returned to the
8728    * global allocator as soon as MAX is reached.
8729 */
8730 perthread += THR_FREE_BUF_MAX-1;
8731
8732 /**
8733 * Start by calculating the basic number of pages required for
8734 * our 'cnt' block threads.
8735 * (no inter-thread communication assumed so far)
8736 */
8737 Uint32 tot = cnt * perthread;
8738
8739 /**
8740 * We then start adding pages required for inter-thread communications:
8741 *
8742 * Receiver threads will be able to communicate with all other
8743 * threads except other receive threads.
8744 */
8745 tot += num_receive_threads *
8746 (cnt - num_receive_threads) *
8747 job_queue_pages_per_thread;
8748
8749 /**
8750 * LQH threads can communicate with TC threads and main threads.
8751  * They cannot communicate with receive threads or other LQH threads,
8752  * but each can communicate with itself.
8753 */
8754 tot += num_lqh_threads *
8755 (num_tc_threads + num_main_threads + 1) *
8756 job_queue_pages_per_thread;
8757
8758 /**
8759 * First LDM thread is special as it will act as client
8760 * during backup. It will send to, and receive from (2x)
8761 * the 'num_lqh_threads - 1' other LQH threads.
8762 */
8763 tot += 2 * (num_lqh_threads-1) *
8764 job_queue_pages_per_thread;
8765
8766 /**
8767 * TC threads can communicate with SPJ-, LQH- and main threads.
8768  * They cannot communicate with receive threads or other TC threads,
8769  * but as SPJ is located together with TC, it is counted as if it
8770  * communicates with all TC threads.
8771 */
8772 tot += num_tc_threads *
8773 (num_lqh_threads + num_main_threads + num_tc_threads) *
8774 job_queue_pages_per_thread;
8775
8776 /**
8777 * Main threads can communicate with all other threads
8778 */
8779 tot += num_main_threads *
8780 cnt *
8781 job_queue_pages_per_thread;
8782
8783 return tot;
8784 }
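
/**
 * Editorial note: an illustrative breakdown of the inter-thread terms
 * above, assuming (hypothetically) 2 main threads, 2 TC threads,
 * 4 LQH threads and 1 receive thread (cnt == 9), with
 * p == job_queue_pages_per_thread:
 *
 *   receive threads : 1 * (9 - 1)     * p =  8p
 *   LQH threads     : 4 * (2 + 2 + 1) * p = 20p
 *   first LDM extra : 2 * (4 - 1)     * p =  6p
 *   TC threads      : 2 * (4 + 2 + 2) * p = 16p
 *   main threads    : 2 * 9           * p = 18p
 *
 * giving 9 * perthread + 68p pages in total.
 */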
8785
8786 ThreadConfig::ThreadConfig()
8787 {
8788 /**
8789 * We take great care within struct thr_repository to optimize
8790    * cache line placement of the different members. This all
8791    * depends on the base address of thr_repository itself
8792    * being cache line aligned.
8793    *
8794    * So we allocate a char[] sufficiently large to hold the
8795    * thr_repository object, with added bytes for placing
8796    * g_thr_repository at a cache-line-aligned offset within it.
8797 */
8798 g_thr_repository_mem = new char[sizeof(thr_repository)+NDB_CL];
8799   const int aligned_offs = NDB_CL_PADSZ((UintPtr)g_thr_repository_mem);
8800   char* cache_aligned_mem = &g_thr_repository_mem[aligned_offs];
8801   require((((UintPtr)cache_aligned_mem) % NDB_CL) == 0);
8802   g_thr_repository = new(cache_aligned_mem) thr_repository();
8803 }
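
/**
 * Editorial note: a sketch of the alignment trick above, assuming
 * NDB_CL == 64 and that NDB_CL_PADSZ(p) returns the padding needed to
 * round p up to the next cache line boundary:
 *
 *   new char[] returns        0x7f0010  (not 64-byte aligned)
 *   NDB_CL_PADSZ(0x7f0010) == 0x30
 *   &mem[0x30]             == 0x7f0040  (64-byte aligned)
 *
 * The placement-new then constructs thr_repository at that aligned
 * address; the destructor must consequently be invoked explicitly and
 * the raw char[] freed with delete[], as done in ~ThreadConfig().
 */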
8804
8805 ThreadConfig::~ThreadConfig()
8806 {
8807 g_thr_repository->~thr_repository();
8808 g_thr_repository = NULL;
8809 delete[] g_thr_repository_mem;
8810 g_thr_repository_mem = NULL;
8811 }
8812
8813 /*
8814 * We must do the init here rather than in the constructor, since at
8815 * constructor time the global memory manager is not available.
8816 */
8817 void
8818 ThreadConfig::init()
8819 {
8820 Uint32 num_lqh_threads = globalData.ndbMtLqhThreads;
8821 Uint32 num_tc_threads = globalData.ndbMtTcThreads;
8822 Uint32 num_recv_threads = globalData.ndbMtReceiveThreads;
8823 first_receiver_thread_no =
8824 NUM_MAIN_THREADS + num_tc_threads + num_lqh_threads;
8825 glob_num_threads = first_receiver_thread_no + num_recv_threads;
8826 require(glob_num_threads <= MAX_BLOCK_THREADS);
8827
8828 glob_num_tc_threads = num_tc_threads;
8829 if (glob_num_tc_threads == 0)
8830 glob_num_tc_threads = 1;
8831
8832 ndbout << "NDBMT: number of block threads=" << glob_num_threads << endl;
8833
8834 ::rep_init(g_thr_repository, glob_num_threads,
8835 globalEmulatorData.m_mem_manager);
8836 }
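
/**
 * Editorial note: with a hypothetical configuration of 2 main threads,
 * 2 TC threads, 4 LQH threads and 1 receive thread, the 8 block threads
 * occupy thread numbers 0..7, first_receiver_thread_no == 2 + 2 + 4 == 8,
 * and glob_num_threads == 9 (the single receive thread is thread 8).
 */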
8837
8838 /**
8839  * Return the receiver thread handling a particular trp.
8840  * The returned number is indexed from 0 and upwards to #receiver threads
8841  * (or MAX_NTRANSPORTERS if the trp has no transporter).
8842 */
8843 Uint32
8844 mt_get_recv_thread_idx(TrpId trp_id)
8845 {
8846 assert(trp_id < NDB_ARRAY_SIZE(g_trp_to_recv_thr_map));
8847 return g_trp_to_recv_thr_map[trp_id];
8848 }
8849
8850 static
8851 void
8852 assign_receiver_threads(void)
8853 {
8854 Uint32 num_recv_threads = globalData.ndbMtReceiveThreads;
8855 Uint32 recv_thread_idx = 0;
8856 Uint32 recv_thread_idx_shm = 0;
8857 for (Uint32 trp_id = 1; trp_id < MAX_NTRANSPORTERS; trp_id++)
8858 {
8859 Transporter *trp =
8860 globalTransporterRegistry.get_transporter(trp_id);
8861
8862 /**
8863 * Ensure that shared memory transporters are well distributed
8864      * over all receive threads, so distribute those independently of
8865      * the rest of the transporters.
8866 */
8867 if (trp)
8868 {
8869 if (globalTransporterRegistry.is_shm_transporter(trp_id))
8870 {
8871 g_trp_to_recv_thr_map[trp_id] = recv_thread_idx_shm;
8872 globalTransporterRegistry.set_recv_thread_idx(trp,recv_thread_idx_shm);
8873 DEB_MULTI_TRP(("SHM trp %u uses recv_thread_idx: %u",
8874 trp_id, recv_thread_idx_shm));
8875 recv_thread_idx_shm++;
8876 if (recv_thread_idx_shm == num_recv_threads)
8877 recv_thread_idx_shm = 0;
8878 }
8879 else
8880 {
8881 g_trp_to_recv_thr_map[trp_id] = recv_thread_idx;
8882 DEB_MULTI_TRP(("TCP trp %u uses recv_thread_idx: %u",
8883 trp_id, recv_thread_idx));
8884 globalTransporterRegistry.set_recv_thread_idx(trp,recv_thread_idx);
8885 recv_thread_idx++;
8886 if (recv_thread_idx == num_recv_threads)
8887 recv_thread_idx = 0;
8888 }
8889 }
8890 else
8891 {
8892 /* Flag for no transporter */
8893 g_trp_to_recv_thr_map[trp_id] = MAX_NTRANSPORTERS;
8894 }
8895 }
8896 return;
8897 }
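
/**
 * Editorial note: an illustrative assignment with 2 receive threads and
 * (hypothetically) transporters 2, 4 and 5 over TCP plus transporter 3
 * over shared memory:
 *
 *   trp 2 (TCP) -> recv thread 0
 *   trp 3 (SHM) -> recv thread 0   (SHM uses its own round-robin counter)
 *   trp 4 (TCP) -> recv thread 1
 *   trp 5 (TCP) -> recv thread 0
 *
 * All unused trp ids are mapped to MAX_NTRANSPORTERS as the
 * "no transporter" marker.
 */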
8898
8899 void
8900 mt_assign_recv_thread_new_trp(Uint32 trp_id)
8901 {
8902 if (g_trp_to_recv_thr_map[trp_id] != MAX_NTRANSPORTERS)
8903 {
8904 /* Already assigned in the past, keep assignment */
8905 return;
8906 }
8907 Uint32 num_recv_threads = globalData.ndbMtReceiveThreads;
8908 Uint32 next_recv_thread_tcp = 0;
8909 Uint32 next_recv_thread_shm = 0;
8910 for (Uint32 id = 1; id < MAX_NTRANSPORTERS; id++)
8911 {
8912 if (id == trp_id)
8913 continue;
8914 Transporter *trp =
8915 globalTransporterRegistry.get_transporter(id);
8916 if (trp)
8917 {
8918 if (globalTransporterRegistry.is_shm_transporter(id))
8919 {
8920 next_recv_thread_shm = g_trp_to_recv_thr_map[id];
8921 }
8922 else
8923 {
8924 next_recv_thread_tcp = g_trp_to_recv_thr_map[id];
8925 }
8926 }
8927 }
8928 Transporter *trp =
8929 globalTransporterRegistry.get_transporter(trp_id);
8930 require(trp);
8931 Uint32 choosen_recv_thread;
8932 if (globalTransporterRegistry.is_shm_transporter(trp_id))
8933 {
8934 next_recv_thread_shm++;
8935 if (next_recv_thread_shm == num_recv_threads)
8936 next_recv_thread_shm = 0;
8937 g_trp_to_recv_thr_map[trp_id] = next_recv_thread_shm;
8938 choosen_recv_thread = next_recv_thread_shm;
8939 globalTransporterRegistry.set_recv_thread_idx(trp, next_recv_thread_shm);
8940 DEB_MULTI_TRP(("SHM multi trp %u uses recv_thread_idx: %u",
8941 trp_id, next_recv_thread_shm));
8942 }
8943 else
8944 {
8945 next_recv_thread_tcp++;
8946 if (next_recv_thread_tcp == num_recv_threads)
8947 next_recv_thread_tcp = 0;
8948 g_trp_to_recv_thr_map[trp_id] = next_recv_thread_tcp;
8949 choosen_recv_thread = next_recv_thread_tcp;
8950 globalTransporterRegistry.set_recv_thread_idx(trp, next_recv_thread_tcp);
8951 DEB_MULTI_TRP(("TCP multi trp %u uses recv_thread_idx: %u",
8952 trp_id, next_recv_thread_tcp));
8953 }
8954 TransporterReceiveHandleKernel *recvdata =
8955 g_trp_receive_handle_ptr[choosen_recv_thread];
8956 recvdata->m_transporters.set(trp_id);
8957 }
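
/**
 * Editorial note: the function above scans the already assigned
 * transporters, remembers the last seen assignment per transporter type,
 * and places the new trp on the "next" receive thread in round-robin
 * order. For example, with 2 receive threads, if the scan finds an
 * existing TCP trp mapped to receive thread 1, a new TCP trp is mapped
 * to receive thread 0 ((1 + 1) % 2).
 */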
8958
8959 bool
8960 mt_epoll_add_trp(Uint32 self, NodeId node_id, TrpId trp_id)
8961 {
8962 (void)node_id;
8963 struct thr_repository* rep = g_thr_repository;
8964 struct thr_data *selfptr = &rep->m_thread[self];
8965 Uint32 thr_no = selfptr->m_thr_no;
8966 require(thr_no >= first_receiver_thread_no);
8967 Uint32 recv_thread_idx = thr_no - first_receiver_thread_no;
8968 TransporterReceiveHandleKernel *recvdata =
8969 g_trp_receive_handle_ptr[recv_thread_idx];
8970 if (recv_thread_idx != g_trp_to_recv_thr_map[trp_id])
8971 {
8972 return false;
8973 }
8974 Transporter *t = globalTransporterRegistry.get_transporter(trp_id);
8975 lock(&rep->m_send_buffers[trp_id].m_send_lock);
8976 lock(&rep->m_receive_lock[recv_thread_idx]);
8977 require(recvdata->epoll_add(t));
8978 unlock(&rep->m_receive_lock[recv_thread_idx]);
8979 unlock(&rep->m_send_buffers[trp_id].m_send_lock);
8980 return true;
8981 }
8982
8983 bool
8984 mt_is_recv_thread_for_new_trp(Uint32 self,
8985 NodeId node_id,
8986 TrpId trp_id)
8987 {
8988 (void)node_id;
8989 struct thr_repository* rep = g_thr_repository;
8990 struct thr_data *selfptr = &rep->m_thread[self];
8991 Uint32 thr_no = selfptr->m_thr_no;
8992 require(thr_no >= first_receiver_thread_no);
8993 Uint32 recv_thread_idx = thr_no - first_receiver_thread_no;
8994 if (recv_thread_idx != g_trp_to_recv_thr_map[trp_id])
8995 {
8996 return false;
8997 }
8998 return true;
8999 }
9000
9001 void
9002 ThreadConfig::ipControlLoop(NdbThread* pThis)
9003 {
9004 unsigned int thr_no;
9005 struct thr_repository* rep = g_thr_repository;
9006
9007 rep->m_thread[first_receiver_thread_no].m_thr_index =
9008 globalEmulatorData.theConfiguration->addThread(pThis, ReceiveThread);
9009
9010 max_send_delay = globalEmulatorData.theConfiguration->maxSendDelay();
9011
9012 /**
9013 * Set the configured time we will spend in spinloop before coming
9014 * back to check conditions.
9015 */
9016 Uint32 spin_nanos = globalEmulatorData.theConfiguration->spinTimePerCall();
9017 NdbSpin_Change(Uint64(spin_nanos));
9018 g_eventLogger->info("Number of spin loops is %llu to pause %llu nanoseconds",
9019 NdbSpin_get_num_spin_loops(),
9020 NdbSpin_get_current_spin_nanos());
9021
9022 if (globalData.ndbMtSendThreads)
9023 {
9024 /**
9025      * The new operator does not ensure alignment for overaligned data types.
9026      * As for g_thr_repository, overallocate memory and construct the
9027      * thr_send_threads object at an aligned address within it.
9028 */
9029 g_send_threads_mem = new char[sizeof(thr_send_threads) + NDB_CL];
9030 const int aligned_offs = NDB_CL_PADSZ((UintPtr)g_send_threads_mem);
9031 char* cache_aligned_mem = &g_send_threads_mem[aligned_offs];
9032 require((((UintPtr)cache_aligned_mem) % NDB_CL) == 0);
9033 g_send_threads = new (cache_aligned_mem) thr_send_threads();
9034 }
9035
9036 /**
9037 * assign trps to receiver threads
9038 */
9039 assign_receiver_threads();
9040
9041 /* Start the send thread(s) */
9042 if (g_send_threads)
9043 {
9044 /**
9045 * assign trps to send threads
9046 */
9047 g_send_threads->assign_trps_to_send_threads();
9048 g_send_threads->assign_threads_to_assist_send_threads();
9049
9050 g_send_threads->start_send_threads();
9051 }
9052
9053 /*
9054 * Start threads for all execution threads, except for the receiver
9055 * thread, which runs in the main thread.
9056 */
9057 for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
9058 {
9059 NDB_TICKS now = NdbTick_getCurrentTicks();
9060 rep->m_thread[thr_no].m_ticks = now;
9061 rep->m_thread[thr_no].m_scan_real_ticks = now;
9062
9063 if (thr_no == first_receiver_thread_no)
9064 continue; // Will run in the main thread.
9065
9066 /*
9067 * The NdbThread_Create() takes void **, but that is cast to void * when
9068    * passed to the thread function, which is kind of strange ...
9069 */
9070 if (thr_no < first_receiver_thread_no)
9071 {
9072 /* Start block threads */
9073 struct NdbThread *thread_ptr =
9074 NdbThread_Create(mt_job_thread_main,
9075 (void **)(rep->m_thread + thr_no),
9076 1024*1024,
9077 "execute thread", //ToDo add number
9078 NDB_THREAD_PRIO_MEAN);
9079 require(thread_ptr != NULL);
9080 rep->m_thread[thr_no].m_thr_index =
9081 globalEmulatorData.theConfiguration->addThread(thread_ptr,
9082 BlockThread);
9083 rep->m_thread[thr_no].m_thread = thread_ptr;
9084 }
9085 else
9086 {
9087 /* Start a receiver thread, also block thread for TRPMAN */
9088 struct NdbThread *thread_ptr =
9089 NdbThread_Create(mt_receiver_thread_main,
9090 (void **)(&rep->m_thread[thr_no]),
9091 1024*1024,
9092 "receive thread", //ToDo add number
9093 NDB_THREAD_PRIO_MEAN);
9094 require(thread_ptr != NULL);
9095 globalEmulatorData.theConfiguration->addThread(thread_ptr,
9096 ReceiveThread);
9097 rep->m_thread[thr_no].m_thread = thread_ptr;
9098 }
9099 }
9100
9101 /* Now run the main loop for first receiver thread directly. */
9102 rep->m_thread[first_receiver_thread_no].m_thread = pThis;
9103 mt_receiver_thread_main(&(rep->m_thread[first_receiver_thread_no]));
9104
9105 /* Wait for all threads to shutdown. */
9106 for (thr_no = 0; thr_no < glob_num_threads; thr_no++)
9107 {
9108 if (thr_no == first_receiver_thread_no)
9109 continue;
9110 void *dummy_return_status;
9111 NdbThread_WaitFor(rep->m_thread[thr_no].m_thread,
9112 &dummy_return_status);
9113 globalEmulatorData.theConfiguration->removeThread(
9114 rep->m_thread[thr_no].m_thread);
9115 NdbThread_Destroy(&(rep->m_thread[thr_no].m_thread));
9116 }
9117
9118 /* Delete send threads, includes waiting for threads to shutdown */
9119 if (g_send_threads)
9120 {
9121 g_send_threads->~thr_send_threads();
9122 g_send_threads = NULL;
9123 delete[] g_send_threads_mem;
9124 g_send_threads_mem = NULL;
9125 }
9126 globalEmulatorData.theConfiguration->removeThread(pThis);
9127 }
9128
9129 int
9130 ThreadConfig::doStart(NodeState::StartLevel startLevel)
9131 {
9132 SignalT<3> signalT;
9133 memset(&signalT.header, 0, sizeof(SignalHeader));
9134
9135 signalT.header.theVerId_signalNumber = GSN_START_ORD;
9136 signalT.header.theReceiversBlockNumber = CMVMI;
9137 signalT.header.theSendersBlockRef = 0;
9138 signalT.header.theTrace = 0;
9139 signalT.header.theSignalId = 0;
9140 signalT.header.theLength = StartOrd::SignalLength;
9141
9142 StartOrd * startOrd = CAST_PTR(StartOrd, &signalT.theData[0]);
9143 startOrd->restartInfo = 0;
9144
9145 sendprioa(block2ThreadId(CMVMI, 0), &signalT.header, signalT.theData, 0);
9146 return 0;
9147 }
9148
9149 Uint32
9150 FastScheduler::traceDumpGetNumThreads()
9151 {
9152 /* The last thread is only for receiver -> no trace file. */
9153 return glob_num_threads;
9154 }
9155
9156 bool
9157 FastScheduler::traceDumpGetJam(Uint32 thr_no,
9158 const JamEvent * & thrdTheEmulatedJam,
9159 Uint32 & thrdTheEmulatedJamIndex)
9160 {
9161 if (thr_no >= glob_num_threads)
9162 return false;
9163
9164 #ifdef NO_EMULATED_JAM
9165 thrdTheEmulatedJam = NULL;
9166 thrdTheEmulatedJamIndex = 0;
9167 #else
9168 const EmulatedJamBuffer *jamBuffer =
9169 &g_thr_repository->m_thread[thr_no].m_jam;
9170 thrdTheEmulatedJam = jamBuffer->theEmulatedJam;
9171 thrdTheEmulatedJamIndex = jamBuffer->theEmulatedJamIndex;
9172 #endif
9173 return true;
9174 }
9175
9176 void
9177 FastScheduler::traceDumpPrepare(NdbShutdownType& nst)
9178 {
9179 /*
9180 * We are about to generate trace files for all threads.
9181 *
9182 * We want to stop all threads processing before we dump, as otherwise the
9183 * signal buffers could change while dumping, leading to inconsistent
9184 * results.
9185 *
9186 * To stop threads, we send the GSN_STOP_FOR_CRASH signal as prio A to each
9187 * thread. We then wait for threads to signal they are done (but not forever,
9188 * so as to not have one hanging thread prevent the generation of trace
9189 * dumps). We also must be careful not to send to ourself if the crash is
9190 * being processed by one of the threads processing signals.
9191 *
9192 * We do not stop the transporter thread, as it cannot receive signals (but
9193 * because it does not receive signals it does not really influence dumps in
9194 * any case).
9195 */
9196 const thr_data *selfptr = NDB_THREAD_TLS_THREAD;
9197 /* The selfptr might be NULL, or pointer to thread that crashed. */
9198
9199 Uint32 waitFor_count = 0;
9200 NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
9201 g_thr_repository->stopped_threads = 0;
9202 NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9203
9204 for (Uint32 thr_no = 0; thr_no < glob_num_threads; thr_no++)
9205 {
9206 if (selfptr != NULL && selfptr->m_thr_no == thr_no)
9207 {
9208       /* This is our own thread; we have already stopped processing. */
9209 continue;
9210 }
9211
9212 sendprioa_STOP_FOR_CRASH(selfptr, thr_no);
9213
9214 waitFor_count++;
9215 }
9216
9217 static const Uint32 max_wait_seconds = 2;
9218 const NDB_TICKS start = NdbTick_getCurrentTicks();
9219 NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
9220 while (g_thr_repository->stopped_threads < waitFor_count)
9221 {
9222 NdbCondition_WaitTimeout(&g_thr_repository->stop_for_crash_cond,
9223 &g_thr_repository->stop_for_crash_mutex,
9224 10);
9225 const NDB_TICKS now = NdbTick_getCurrentTicks();
9226 if (NdbTick_Elapsed(start,now).seconds() > max_wait_seconds)
9227 break; // Give up
9228 }
9229 if (g_thr_repository->stopped_threads < waitFor_count)
9230 {
9231 if (nst != NST_ErrorInsert)
9232 {
9233 nst = NST_Watchdog; // Make this abort fast
9234 }
9235 ndbout_c("Warning: %d thread(s) did not stop before starting crash dump.",
9236 waitFor_count - g_thr_repository->stopped_threads);
9237 }
9238 NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9239
9240 /* Now we are ready (or as ready as can be) for doing crash dump. */
9241 }
9242
9243 /**
9244 * In ndbmtd we could have a case where we actually have multiple threads
9245 * crashing at the same time. This causes several threads to start processing
9246  * the crash handling in parallel, which eventually leads to a deadlock since
9247 * the crash handling thread waits for other threads to stop before completing
9248 * the crash handling.
9249 *
9250  * To avoid this we use this function, which is only useful in ndbmtd, where
9251 * we check if the crash handling has already started. We protect this
9252 * check using the stop_for_crash-mutex. This function is called twice,
9253 * first to write an entry in the error log and second to specify that the
9254 * error log write is completed.
9255 *
9256 * We proceed only from the first call if the crash handling hasn't started
9257 * or if the crash is not caused by an error insert. If it is caused by an
9258 * error insert it is a normal situation with multiple crashes, so we won't
9259 * clutter the error log with multiple entries in this case. If it is a real
9260 * crash and we have more than one thread crashing, then this is vital
9261 * information to write in the error log, we do however not want more than
9262 * one set of trace files.
9263 *
9264 * To ensure that writes of the error log happens for one thread at a time we
9265 * protect it with the stop_for_crash-mutex. We hold this mutex between the
9266 * first and second call of this function from the error reporter thread.
9267 *
9268 * We proceed from the first call only if we are the first thread that
9269 * reported an error. To handle this properly we start by acquiring the
9270 * mutex, then we write the error log, when we come back we set the
9271 * crash_started flag and release the mutex to enable other threads to
9272 * write into the error log, but still stopping them from proceeding to
9273 * write another set of trace files.
9274 *
9275 * We will not come back from this function the second time unless we are
9276 * the first crashing thread.
9277 */
9278
9279 static bool crash_started = false;
9280
9281 void
9282 ErrorReporter::prepare_to_crash(bool first_phase, bool error_insert_crash)
9283 {
9284 if (first_phase)
9285 {
9286 NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
9287 if (crash_started && error_insert_crash)
9288 {
9289 /**
9290 * Some other thread has already started the crash handling.
9291 * We call the below method which we will never return from.
9292 * We need not write multiple entries in error log for
9293 * error insert crashes since it is a normal event.
9294 */
9295 NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9296 mt_execSTOP_FOR_CRASH();
9297 }
9298 /**
9299      * Proceed to write the error log before returning to this method
9300      * again with first_phase set to false.
9301 */
9302 }
9303 else if (crash_started)
9304 {
9305 (void)error_insert_crash;
9306 /**
9307 * No need to proceed since somebody already started handling the crash.
9308 * We proceed by calling mt_execSTOP_FOR_CRASH to stop this thread
9309 * in a manner that is similar to if we received the signal
9310 * STOP_FOR_CRASH.
9311 */
9312 NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9313 mt_execSTOP_FOR_CRASH();
9314 }
9315 else
9316 {
9317 /**
9318 * No crash had started previously, we will take care of it. Before
9319 * handling it we will mark the crash handling as started.
9320 */
9321 crash_started = true;
9322 NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9323 }
9324 }
9325
9326 void mt_execSTOP_FOR_CRASH()
9327 {
9328 const thr_data *selfptr = NDB_THREAD_TLS_THREAD;
9329 require(selfptr != NULL);
9330
9331 NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex);
9332 g_thr_repository->stopped_threads++;
9333 NdbCondition_Signal(&g_thr_repository->stop_for_crash_cond);
9334 NdbMutex_Unlock(&g_thr_repository->stop_for_crash_mutex);
9335
9336 /* ToDo: is this correct? */
9337 globalEmulatorData.theWatchDog->unregisterWatchedThread(selfptr->m_thr_no);
9338
9339 my_thread_exit(NULL);
9340 }
9341
9342 void
9343 FastScheduler::dumpSignalMemory(Uint32 thr_no, FILE* out)
9344 {
9345 thr_data *selfptr = NDB_THREAD_TLS_THREAD;
9346 const thr_repository *rep = g_thr_repository;
9347 /*
9348 * The selfptr might be NULL, or pointer to thread that is doing the crash
9349 * jump.
9350 * If non-null, we should update the watchdog counter while dumping.
9351 */
9352 Uint32 *watchDogCounter;
9353 if (selfptr)
9354 watchDogCounter = &selfptr->m_watchdog_counter;
9355 else
9356 watchDogCounter = NULL;
9357
9358 /*
9359 * We want to dump the signal buffers from last executed to first executed.
9360 * So we first need to find the correct sequence to output signals in, stored
9361    * in this array.
9362 *
9363 * We will check any buffers in the cyclic m_free_fifo. In addition,
9364 * we also need to scan the already executed part of the current
9365 * buffer in m_jba.
9366 *
9367 * Due to partial execution of prio A buffers, we will use signal ids to know
9368 * where to interleave prio A signals into the stream of prio B signals
9369 * read. So we will keep a pointer to a prio A buffer around; and while
9370 * scanning prio B buffers we will interleave prio A buffers from that buffer
9371 * when the signal id fits the sequence.
9372 *
9373 * This also means that we may have to discard the earliest part of available
9374 * prio A signal data due to too little prio B data present, or vice versa.
9375 */
9376 static const Uint32 MAX_SIGNALS_TO_DUMP = 4096;
9377 struct {
9378 const SignalHeader *ptr;
9379 bool prioa;
9380 } signalSequence[MAX_SIGNALS_TO_DUMP];
9381 Uint32 seq_start = 0;
9382 Uint32 seq_end = 0;
9383
9384 const struct thr_data *thr_ptr = &rep->m_thread[thr_no];
9385 if (watchDogCounter)
9386 *watchDogCounter = 4;
9387
9388 /*
9389 * ToDo: Might do some sanity check to avoid crashing on not yet initialised
9390 * thread.
9391 */
9392
9393 /* Scan all available buffers with already executed signals. */
9394
9395 /*
9396 * Keep track of all available buffers, so that we can pick out signals in
9397 * the same order they were executed (order obtained from signal id).
9398 *
9399 * We may need to keep track of THR_FREE_BUF_MAX buffers for fully executed
9400 * (and freed) buffers, plus MAX_BLOCK_THREADS buffers for currently active
9401 * prio B buffers, plus one active prio A buffer.
9402 */
9403 struct {
9404 const thr_job_buffer *m_jb;
9405 Uint32 m_pos;
9406 Uint32 m_max;
9407 } jbs[THR_FREE_BUF_MAX + MAX_BLOCK_THREADS + 1];
9408
9409 Uint32 num_jbs = 0;
9410
9411 /* Load released buffers. */
9412 Uint32 idx = thr_ptr->m_first_free;
9413 while (idx != thr_ptr->m_first_unused)
9414 {
9415 const thr_job_buffer *q = thr_ptr->m_free_fifo[idx];
9416 if (q->m_len > 0)
9417 {
9418 jbs[num_jbs].m_jb = q;
9419 jbs[num_jbs].m_pos = 0;
9420 jbs[num_jbs].m_max = q->m_len;
9421 num_jbs++;
9422 }
9423 idx = (idx + 1) % THR_FREE_BUF_MAX;
9424 }
9425 /* Load any active prio B buffers. */
9426 for (Uint32 thr_no = 0; thr_no < rep->m_thread_count; thr_no++)
9427 {
9428 const thr_job_queue *q = thr_ptr->m_in_queue + thr_no;
9429 const thr_jb_read_state *r = thr_ptr->m_read_states + thr_no;
9430 Uint32 read_pos = r->m_read_pos;
9431 if (r->is_open() && read_pos > 0)
9432 {
9433 jbs[num_jbs].m_jb = q->m_buffers[r->m_read_index];
9434 jbs[num_jbs].m_pos = 0;
9435 jbs[num_jbs].m_max = read_pos;
9436 num_jbs++;
9437 }
9438 }
9439 /* Load any active prio A buffer. */
9440 const thr_jb_read_state *r = &thr_ptr->m_jba_read_state;
9441 Uint32 read_pos = r->m_read_pos;
9442 if (read_pos > 0)
9443 {
9444 jbs[num_jbs].m_jb = thr_ptr->m_jba.m_buffers[r->m_read_index];
9445 jbs[num_jbs].m_pos = 0;
9446 jbs[num_jbs].m_max = read_pos;
9447 num_jbs++;
9448 }
9449
9450 /* Use the next signal id as the smallest (oldest).
9451 *
9452    * Subtracting the smallest signal id from two signal ids makes
9453    * them comparable using standard comparison of Uint32,
9454    * where the biggest value is the newest.
9455 * For example,
9456 * (m_signal_id_counter - smallest_signal_id) == UINT32_MAX
9457 */
9458 const Uint32 smallest_signal_id = thr_ptr->m_signal_id_counter + 1;
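
  /*
   * Editorial note: a small worked example of the wrap-around safe
   * comparison. If m_signal_id_counter has wrapped to 5, then
   * smallest_signal_id == 6 and:
   *
   *   signal id 0xFFFFFFFE -> 0xFFFFFFFE - 6 == 0xFFFFFFF8  (older)
   *   signal id 3          -> 3 - 6          == 0xFFFFFFFD  (newer)
   *   signal id 5          -> 5 - 6          == 0xFFFFFFFF  (newest)
   *
   * so ids that wrapped past zero still sort after the pre-wrap ids.
   */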
9459
9460 /* Now pick out one signal at a time, in signal id order. */
9461 while (num_jbs > 0)
9462 {
9463 if (watchDogCounter)
9464 *watchDogCounter = 4;
9465
9466 /* Search out the smallest signal id remaining. */
9467 Uint32 idx_min = 0;
9468 const Uint32 *p = jbs[idx_min].m_jb->m_data + jbs[idx_min].m_pos;
9469 const SignalHeader *s_min = reinterpret_cast<const SignalHeader*>(p);
9470 Uint32 sid_min_adjusted = s_min->theSignalId - smallest_signal_id;
9471
9472 for (Uint32 i = 1; i < num_jbs; i++)
9473 {
9474 p = jbs[i].m_jb->m_data + jbs[i].m_pos;
9475 const SignalHeader *s = reinterpret_cast<const SignalHeader*>(p);
9476 const Uint32 sid_adjusted = s->theSignalId - smallest_signal_id;
9477 if (sid_adjusted < sid_min_adjusted)
9478 {
9479 idx_min = i;
9480 s_min = s;
9481 sid_min_adjusted = sid_adjusted;
9482 }
9483 }
9484
9485 /* We found the next signal, now put it in the ordered cyclic buffer. */
9486 signalSequence[seq_end].ptr = s_min;
9487 signalSequence[seq_end].prioa = jbs[idx_min].m_jb->m_prioa;
9488 Uint32 siglen =
9489 (sizeof(SignalHeader)>>2) + s_min->m_noOfSections + s_min->theLength;
9490 #if SIZEOF_CHARP == 8
9491 /* Align to 8-byte boundary, to ensure aligned copies. */
9492 siglen= (siglen+1) & ~((Uint32)1);
9493 #endif
9494 jbs[idx_min].m_pos += siglen;
9495 if (jbs[idx_min].m_pos >= jbs[idx_min].m_max)
9496 {
9497 /* We are done with this job buffer. */
9498 num_jbs--;
9499 jbs[idx_min] = jbs[num_jbs];
9500 }
9501 seq_end = (seq_end + 1) % MAX_SIGNALS_TO_DUMP;
9502 /* Drop old signals if too many available in history. */
9503 if (seq_end == seq_start)
9504 seq_start = (seq_start + 1) % MAX_SIGNALS_TO_DUMP;
9505 }
9506
9507   /* Now, having built the correct signal sequence, we can dump them all. */
9508 fprintf(out, "\n");
9509 bool first_one = true;
9510 bool out_of_signals = false;
9511 Uint32 lastSignalId = 0;
9512 while (seq_end != seq_start)
9513 {
9514 if (watchDogCounter)
9515 *watchDogCounter = 4;
9516
9517 if (seq_end == 0)
9518 seq_end = MAX_SIGNALS_TO_DUMP;
9519 seq_end--;
9520 SignalT<25> signal;
9521 const SignalHeader *s = signalSequence[seq_end].ptr;
9522 unsigned siglen = (sizeof(*s)>>2) + s->theLength;
9523 if (siglen > MAX_SIGNAL_SIZE)
9524 siglen = MAX_SIGNAL_SIZE; // Sanity check
9525 memcpy(&signal.header, s, 4*siglen);
9526 // instance number in trace file is confusing if not MT LQH
9527 if (globalData.ndbMtLqhWorkers == 0)
9528 signal.header.theReceiversBlockNumber &= NDBMT_BLOCK_MASK;
9529
9530 const Uint32 *posptr = reinterpret_cast<const Uint32 *>(s);
9531 signal.m_sectionPtrI[0] = posptr[siglen + 0];
9532 signal.m_sectionPtrI[1] = posptr[siglen + 1];
9533 signal.m_sectionPtrI[2] = posptr[siglen + 2];
9534 bool prioa = signalSequence[seq_end].prioa;
9535
9536 /* Make sure to display clearly when there is a gap in the dump. */
9537 if (!first_one && !out_of_signals && (s->theSignalId + 1) != lastSignalId)
9538 {
9539 out_of_signals = true;
9540 fprintf(out, "\n\n\nNo more prio %s signals, rest of dump will be "
9541 "incomplete.\n\n\n\n", prioa ? "B" : "A");
9542 }
9543 first_one = false;
9544 lastSignalId = s->theSignalId;
9545
9546 fprintf(out, "--------------- Signal ----------------\n");
9547 Uint32 prio = (prioa ? JBA : JBB);
9548 SignalLoggerManager::printSignalHeader(out,
9549 signal.header,
9550 prio,
9551 globalData.ownId,
9552 true);
9553 SignalLoggerManager::printSignalData (out,
9554 signal.header,
9555 &signal.theData[0]);
9556 }
9557 fflush(out);
9558 }
9559
9560 int
9561 FastScheduler::traceDumpGetCurrentThread()
9562 {
9563 const thr_data *selfptr = NDB_THREAD_TLS_THREAD;
9564
9565 /* The selfptr might be NULL, or pointer to thread that crashed. */
9566 if (selfptr == 0)
9567 {
9568 return -1;
9569 }
9570 else
9571 {
9572 return (int)selfptr->m_thr_no;
9573 }
9574 }
9575
9576 void
9577 mt_section_lock()
9578 {
9579 lock(&(g_thr_repository->m_section_lock));
9580 }
9581
9582 void
9583 mt_section_unlock()
9584 {
9585 unlock(&(g_thr_repository->m_section_lock));
9586 }
9587
9588 void
9589 mt_mem_manager_init()
9590 {
9591 }
9592
9593 void
9594 mt_mem_manager_lock()
9595 {
9596 lock(&(g_thr_repository->m_mem_manager_lock));
9597 }
9598
9599 void
9600 mt_mem_manager_unlock()
9601 {
9602 unlock(&(g_thr_repository->m_mem_manager_lock));
9603 }
9604
9605 Vector<mt_lock_stat> g_locks;
9606 template class Vector<mt_lock_stat>;
9607
9608 static
9609 void
9610 register_lock(const void * ptr, const char * name)
9611 {
9612 if (name == 0)
9613 return;
9614
9615 mt_lock_stat* arr = g_locks.getBase();
9616 for (size_t i = 0; i<g_locks.size(); i++)
9617 {
9618 if (arr[i].m_ptr == ptr)
9619 {
9620 if (arr[i].m_name)
9621 {
9622 free(arr[i].m_name);
9623 }
9624 arr[i].m_name = strdup(name);
9625 return;
9626 }
9627 }
9628
9629 mt_lock_stat ln;
9630 ln.m_ptr = ptr;
9631 ln.m_name = strdup(name);
9632 ln.m_contended_count = 0;
9633 ln.m_spin_count = 0;
9634 g_locks.push_back(ln);
9635 }
9636
9637 #if defined(NDB_HAVE_XCNG) && defined(NDB_USE_SPINLOCK)
9638 static
9639 mt_lock_stat *
9640 lookup_lock(const void * ptr)
9641 {
9642 mt_lock_stat* arr = g_locks.getBase();
9643 for (size_t i = 0; i<g_locks.size(); i++)
9644 {
9645 if (arr[i].m_ptr == ptr)
9646 return arr + i;
9647 }
9648
9649 return 0;
9650 }
9651 #endif
9652
9653 Uint32
9654 mt_get_threads_for_blocks_no_proxy(const Uint32 blocks[],
9655 BlockThreadBitmask& mask)
9656 {
9657 Uint32 cnt = 0;
9658 for (Uint32 i = 0; blocks[i] != 0; i++)
9659 {
9660 Uint32 block = blocks[i];
9661 /**
9662 * Find each thread that has instance of block
9663 */
9664 assert(block == blockToMain(block));
9665 const Uint32 index = block - MIN_BLOCK_NO;
9666 const Uint32 instance_count = block_instance_count[index];
9667 require(instance_count <= NDB_ARRAY_SIZE(thr_map[index]));
9668 // If more than one instance, avoid proxy instance 0
9669 const Uint32 first_instance = (instance_count > 1) ? 1 : 0;
9670 for (Uint32 instance = first_instance;
9671 instance < instance_count;
9672 instance++)
9673 {
9674 Uint32 thr_no = thr_map[index][instance].thr_no;
9675 require(thr_no != thr_map_entry::NULL_THR_NO);
9676
9677 if (mask.get(thr_no))
9678 continue;
9679
9680 mask.set(thr_no);
9681 cnt++;
9682 }
9683 }
9684 require(mask.count() == cnt);
9685 return cnt;
9686 }
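
/**
 * Editorial note: as an illustration (the block name and thread mapping
 * are hypothetical), if blocks[] contains DBLQH and DBLQH has 5 instances
 * (proxy instance 0 plus workers 1..4) mapped to threads 4..7, the loop
 * skips the proxy, sets bits 4..7 in the mask and returns 4.
 */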
9687
9688 Uint32
9689 mt_get_addressable_threads(const Uint32 my_thr_no, BlockThreadBitmask& mask)
9690 {
9691 const Uint32 thr_cnt = get_total_number_of_block_threads();
9692 Uint32 cnt = 0;
9693 for (Uint32 thr_no = 0; thr_no < thr_cnt; thr_no++)
9694 {
9695 if (may_communicate(my_thr_no, thr_no))
9696 {
9697 mask.set(thr_no);
9698 cnt++;
9699 }
9700 }
9701 if (!mask.get(my_thr_no))
9702 {
9703 mask.set(my_thr_no);
9704 cnt++;
9705 }
9706 require(mask.count() == cnt);
9707 return cnt;
9708 }
9709
9710 void
9711 mt_wakeup(class SimulatedBlock* block)
9712 {
9713 Uint32 thr_no = block->getThreadId();
9714 struct thr_data *thrptr = &g_thr_repository->m_thread[thr_no];
9715 wakeup(&thrptr->m_waiter);
9716 }
9717
9718 #ifdef VM_TRACE
9719 void
9720 mt_assert_own_thread(SimulatedBlock* block)
9721 {
9722 Uint32 thr_no = block->getThreadId();
9723 struct thr_data *thrptr = &g_thr_repository->m_thread[thr_no];
9724
9725 if (unlikely(my_thread_equal(thrptr->m_thr_id, my_thread_self()) == 0))
9726 {
9727 fprintf(stderr, "mt_assert_own_thread() - assertion-failure\n");
9728 fflush(stderr);
9729 abort();
9730 }
9731 }
9732 #endif
9733
9734
9735 Uint32
9736 mt_get_blocklist(SimulatedBlock * block, Uint32 arr[], Uint32 len)
9737 {
9738 Uint32 thr_no = block->getThreadId();
9739 struct thr_data *thr_ptr = &g_thr_repository->m_thread[thr_no];
9740
9741 for (Uint32 i = 0; i < thr_ptr->m_instance_count; i++)
9742 {
9743 arr[i] = thr_ptr->m_instance_list[i];
9744 }
9745
9746 return thr_ptr->m_instance_count;
9747 }
9748
9749 void
9750 mt_get_spin_stat(class SimulatedBlock *block, ndb_spin_stat *dst)
9751 {
9752 Uint32 thr_no = block->getThreadId();
9753 struct thr_data *selfptr = &g_thr_repository->m_thread[thr_no];
9754 dst->m_sleep_longer_spin_time = selfptr->m_spin_stat.m_sleep_longer_spin_time;
9755 dst->m_sleep_shorter_spin_time =
9756 selfptr->m_spin_stat.m_sleep_shorter_spin_time;
9757 dst->m_num_waits = selfptr->m_spin_stat.m_num_waits;
9758 for (Uint32 i = 0; i < NUM_SPIN_INTERVALS; i++)
9759 {
9760 dst->m_micros_sleep_times[i] =
9761 selfptr->m_spin_stat.m_micros_sleep_times[i];
9762 dst->m_spin_interval[i] = selfptr->m_spin_stat.m_spin_interval[i];
9763 }
9764 }
9765
9766 void mt_set_spin_stat(class SimulatedBlock *block, ndb_spin_stat *src)
9767 {
9768 Uint32 thr_no = block->getThreadId();
9769 struct thr_data *selfptr = &g_thr_repository->m_thread[thr_no];
9770 memset(&selfptr->m_spin_stat, 0, sizeof(selfptr->m_spin_stat));
9771 for (Uint32 i = 0; i < NUM_SPIN_INTERVALS; i++)
9772 {
9773 selfptr->m_spin_stat.m_spin_interval[i] = src->m_spin_interval[i];
9774 }
9775 }
9776
9777 void
9778 mt_get_thr_stat(class SimulatedBlock * block, ndb_thr_stat* dst)
9779 {
9780 bzero(dst, sizeof(* dst));
9781 Uint32 thr_no = block->getThreadId();
9782 struct thr_data *selfptr = &g_thr_repository->m_thread[thr_no];
9783
9784 THRConfigApplier & conf = globalEmulatorData.theConfiguration->m_thr_config;
9785 dst->thr_no = thr_no;
9786 dst->name = conf.getName(selfptr->m_instance_list, selfptr->m_instance_count);
9787 dst->os_tid = NdbThread_GetTid(selfptr->m_thread);
9788 dst->loop_cnt = selfptr->m_stat.m_loop_cnt;
9789 dst->exec_cnt = selfptr->m_stat.m_exec_cnt;
9790 dst->wait_cnt = selfptr->m_stat.m_wait_cnt;
9791 dst->local_sent_prioa = selfptr->m_stat.m_prioa_count;
9792 dst->local_sent_priob = selfptr->m_stat.m_priob_count;
9793 }
9794
9795 TransporterReceiveHandle *
9796 mt_get_trp_receive_handle(unsigned instance)
9797 {
9798 assert(instance > 0 && instance <= MAX_NDBMT_RECEIVE_THREADS);
9799 if (instance > 0 && instance <= MAX_NDBMT_RECEIVE_THREADS)
9800 {
9801 return g_trp_receive_handle_ptr[instance - 1 /* proxy */];
9802 }
9803 return 0;
9804 }
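
/**
 * Editorial note: the "- 1" above maps 1-based worker instance numbers
 * (instance 0 being reserved for the proxy, as the inline comment hints)
 * onto the 0-based g_trp_receive_handle_ptr array, so instance 1 uses
 * handle 0, instance 2 uses handle 1, and so on.
 */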
9805
9806 #if defined(USE_INIT_GLOBAL_VARIABLES)
9807 void
9808 mt_clear_global_variables(thr_data *selfptr)
9809 {
9810 if (selfptr->m_global_variables_enabled)
9811 {
9812 for (Uint32 i = 0; i < selfptr->m_global_variables_ptr_instances; i++)
9813 {
9814 Ptr<void> *tmp = (Ptr<void>*)selfptr->m_global_variables_ptrs[i];
9815 tmp->i = RNIL;
9816 tmp->p = 0;
9817 }
9818 for (Uint32 i = 0; i < selfptr->m_global_variables_uint32_ptr_instances; i++)
9819 {
9820 void **tmp = (void**)selfptr->m_global_variables_uint32_ptrs[i];
9821 (*tmp) = 0;
9822 }
9823 for (Uint32 i = 0; i < selfptr->m_global_variables_uint32_instances; i++)
9824 {
9825 Uint32 *tmp = (Uint32*)selfptr->m_global_variables_uint32[i];
9826 (*tmp) = Uint32(~0);
9827 }
9828 }
9829 }
9830
9831 void
9832 mt_enable_global_variables(Uint32 self)
9833 {
9834 struct thr_repository* rep = g_thr_repository;
9835 struct thr_data *selfptr = &rep->m_thread[self];
9836 selfptr->m_global_variables_enabled = true;
9837 }
9838
9839 void
9840 mt_disable_global_variables(Uint32 self)
9841 {
9842 struct thr_repository* rep = g_thr_repository;
9843 struct thr_data *selfptr = &rep->m_thread[self];
9844 selfptr->m_global_variables_enabled = false;
9845 }
9846
9847 void
9848 mt_init_global_variables_ptr_instances(Uint32 self,
9849 void ** tmp,
9850 size_t cnt)
9851 {
9852 struct thr_repository* rep = g_thr_repository;
9853 struct thr_data *selfptr = &rep->m_thread[self];
9854 for (size_t i = 0; i < cnt; i++)
9855 {
9856 Uint32 inx = selfptr->m_global_variables_ptr_instances;
9857 selfptr->m_global_variables_ptrs[inx] = tmp[i];
9858 selfptr->m_global_variables_ptr_instances = inx + 1;
9859 }
9860 }
9861
9862 void
9863 mt_init_global_variables_uint32_ptr_instances(Uint32 self,
9864 void **tmp,
9865 size_t cnt)
9866 {
9867 struct thr_repository* rep = g_thr_repository;
9868 struct thr_data *selfptr = &rep->m_thread[self];
9869 for (size_t i = 0; i < cnt; i++)
9870 {
9871 Uint32 inx = selfptr->m_global_variables_uint32_ptr_instances;
9872 selfptr->m_global_variables_uint32_ptrs[inx] = tmp[i];
9873 selfptr->m_global_variables_uint32_ptr_instances = inx + 1;
9874 }
9875 }
9876
9877 void
9878 mt_init_global_variables_uint32_instances(Uint32 self,
9879 void **tmp,
9880 size_t cnt)
9881 {
9882 struct thr_repository* rep = g_thr_repository;
9883 struct thr_data *selfptr = &rep->m_thread[self];
9884 for (size_t i = 0; i < cnt; i++)
9885 {
9886 Uint32 inx = selfptr->m_global_variables_uint32_instances;
9887 selfptr->m_global_variables_uint32[inx] = tmp[i];
9888 selfptr->m_global_variables_uint32_instances = inx + 1;
9889 }
9890 }
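
/**
 * Editorial note: a hedged usage sketch of the registration functions
 * above (the registering block and its member names are hypothetical).
 * A block thread would typically register the addresses of its cached
 * Ptr<>/Uint32 globals once during setup, e.g.
 *
 *   void *ptrs[] = { &m_cached_operation_ptr };
 *   mt_init_global_variables_ptr_instances(self, ptrs, 1);
 *
 * after which mt_clear_global_variables() resets every registered
 * variable (Ptr to {RNIL, 0}, Uint32 to ~0), provided
 * mt_enable_global_variables() has been called for the thread.
 */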
9891 #endif
9892
9893 /**
9894 * Global data
9895 */
9896 static struct trp_callback g_trp_callback;
9897
9898 TransporterRegistry globalTransporterRegistry(&g_trp_callback, NULL);
9899