1 /*
2  * ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for details.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef __STDC_FORMAT_MACROS
14 #define __STDC_FORMAT_MACROS
15 #endif
16 
17 #include <algorithm>
18 #include <atomic>
19 #include <cassert>
20 #include <cstdlib>
21 #include <cstring>
22 #include <inttypes.h>
23 #include <iostream>
24 #include <list>
25 #include <mutex>
26 #include <sstream>
27 #include <stack>
28 #include <string>
29 #include <unordered_map>
30 #include <vector>
31 
32 #if (defined __APPLE__ && defined __MACH__)
33 #include <dlfcn.h>
34 #endif
35 
36 #include "omp-tools.h"
37 #include <sys/resource.h>
38 
// Nonzero while we assume the process runs under TSan; the RunningOnValgrind
// fallbacks below clear it. NOTE(review): never set nonzero in this chunk —
// presumably initialized during tool startup outside this view.
static int runOnTsan;
// NOTE(review): presumably the ompt_set_result of registering the reduction
// callback (set in the registration table below this chunk); compared against
// ompt_set_always to decide whether legacy write-suppression is needed.
static int hasReductionCallback;
41 
/// Flags controlling the Archer tool, parsed from the ARCHER_OPTIONS
/// environment variable (space-separated "key=value" pairs).
class ArcherFlags {
public:
#if (LLVM_VERSION) >= 40
  int flush_shadow;
#endif
  int print_max_rss;
  int verbose;
  int enabled;

  /// Parse \p env (contents of ARCHER_OPTIONS, may be null).
  /// Unknown tokens are reported on stderr; known keys overwrite defaults.
  ArcherFlags(const char *env)
      :
#if (LLVM_VERSION) >= 40
        flush_shadow(0),
#endif
        print_max_rss(0), verbose(0), enabled(1) {
    if (!env)
      return;
    std::vector<std::string> tokens;
    std::string token;
    std::istringstream iss(env);
    while (std::getline(iss, token, ' '))
      tokens.push_back(token);

    for (const auto &tok : tokens) {
#if (LLVM_VERSION) >= 40
      if (sscanf(tok.c_str(), "flush_shadow=%d", &flush_shadow))
        continue;
#endif
      if (sscanf(tok.c_str(), "print_max_rss=%d", &print_max_rss))
        continue;
      if (sscanf(tok.c_str(), "verbose=%d", &verbose))
        continue;
      if (sscanf(tok.c_str(), "enable=%d", &enabled))
        continue;
      // Bug fix: report the offending token itself. The old code printed
      // 'token', the stale (always empty) leftover of the getline loop.
      std::cerr << "Illegal values for ARCHER_OPTIONS variable: " << tok
                << std::endl;
    }
  }
};
83 
/// Minimal view of TSAN_OPTIONS: Archer only inspects
/// ignore_noninstrumented_modules (to print a warning elsewhere).
class TsanFlags {
public:
  int ignore_noninstrumented_modules;

  /// Tokenize \p env on any of space/comma/colon/newline/tab/CR and scan
  /// each token for the one flag of interest.
  TsanFlags(const char *env) : ignore_noninstrumented_modules(0) {
    if (!env)
      return;
    const std::string str(env);
    const char *separators = " ,:\n\t\r";
    std::size_t pos = 0;
    while (pos < str.size()) {
      std::size_t sep = str.find_first_of(separators, pos);
      if (sep == std::string::npos)
        sep = str.size();
      const std::string token = str.substr(pos, sep - pos);
      // we are interested in ignore_noninstrumented_modules to print a
      // warning
      sscanf(token.c_str(), "ignore_noninstrumented_modules=%d",
             &ignore_noninstrumented_modules);
      pos = (sep < str.size()) ? sep + 1 : str.size();
    }
  }
};
117 
#if (LLVM_VERSION) >= 40
extern "C" {
// Weak hook into the OpenMP runtime; parallel_end only flushes TSan shadow
// memory when this returns 0 and flush_shadow is set (semantics of the
// status value are defined by the runtime, outside this file).
int __attribute__((weak)) __archer_get_omp_status();
// Weak no-op fallback used when TSan's strong definition is not linked in.
void __attribute__((weak)) __tsan_flush_memory() {}
}
#endif
// Flags parsed from ARCHER_OPTIONS. NOTE(review): initialized outside this
// chunk — presumably during tool initialization; verify before use.
ArcherFlags *archer_flags;
125 
// The following definitions are pasted from "llvm/Support/Compiler.h" to
// allow the code to be compiled with other compilers like gcc:
129 
#ifndef TsanHappensBefore
// Thread Sanitizer is a tool that finds races in code.
// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
// tsan detects these exact functions by name.
extern "C" {
#if (defined __APPLE__ && defined __MACH__)
// On macOS the annotation entry points cannot be declared as weak externals,
// so each wrapper resolves the real TSan annotation via dlsym on every call.
// NOTE(review): the dlsym result is not null-checked; if the symbol is absent
// (process not instrumented by TSan) the call dereferences null — confirm
// Archer is only ever loaded under TSan on this platform.
static void AnnotateHappensAfter(const char *file, int line,
                                 const volatile void *cv) {
  void (*fptr)(const char *, int, const volatile void *);

  fptr = (void (*)(const char *, int, const volatile void *))dlsym(
      RTLD_DEFAULT, "AnnotateHappensAfter");
  (*fptr)(file, line, cv);
}
static void AnnotateHappensBefore(const char *file, int line,
                                  const volatile void *cv) {
  void (*fptr)(const char *, int, const volatile void *);

  fptr = (void (*)(const char *, int, const volatile void *))dlsym(
      RTLD_DEFAULT, "AnnotateHappensBefore");
  (*fptr)(file, line, cv);
}
static void AnnotateIgnoreWritesBegin(const char *file, int line) {
  void (*fptr)(const char *, int);

  fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT,
                                            "AnnotateIgnoreWritesBegin");
  (*fptr)(file, line);
}
static void AnnotateIgnoreWritesEnd(const char *file, int line) {
  void (*fptr)(const char *, int);

  fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT,
                                            "AnnotateIgnoreWritesEnd");
  (*fptr)(file, line);
}
static void AnnotateNewMemory(const char *file, int line,
                              const volatile void *cv, size_t size) {
  void (*fptr)(const char *, int, const volatile void *, size_t);

  fptr = (void (*)(const char *, int, const volatile void *, size_t))dlsym(
      RTLD_DEFAULT, "AnnotateNewMemory");
  (*fptr)(file, line, cv, size);
}
// NOTE(review): clears runOnTsan when a *foreign* RunningOnValgrind symbol is
// visible, which looks inverted relative to the non-Apple fallback below
// (there the weak fallback itself clears it) — confirm the intended polarity.
static int RunningOnValgrind() {
  int (*fptr)();

  fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind");
  if (fptr && fptr != RunningOnValgrind)
    runOnTsan = 0;
  return 0;
}
#else
// On other platforms the annotations are weak symbols: when running under
// TSan the sanitizer runtime provides the strong definitions; otherwise
// these empty fallbacks turn every annotation into a no-op.
void __attribute__((weak))
AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateIgnoreWritesBegin(const char *file, int line) {}
void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) {
}
void __attribute__((weak))
AnnotateNewMemory(const char *file, int line, const volatile void *cv,
                  size_t size) {}
// Weak fallback: only reached when TSan's strong definition is absent,
// i.e. the process does not run under TSan.
int __attribute__((weak)) RunningOnValgrind() {
  runOnTsan = 0;
  return 0;
}
void __attribute__((weak)) __tsan_func_entry(const void *call_pc) {}
void __attribute__((weak)) __tsan_func_exit(void) {}
#endif
}

// This marker is used to define a happens-before arc. The race detector will
// infer an arc from the begin to the end when they share the same pointer
// argument.
#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)

// This marker defines the destination of a happens-before arc.
#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)

// Ignore any races on writes between here and the next TsanIgnoreWritesEnd.
#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)

// Resume checking for racy writes.
#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)

// We don't really delete the clock for now
#define TsanDeleteClock(cv)

// newMemory
#define TsanNewMemory(addr, size)                                              \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
// "Free" is deliberately modeled as fresh memory: it maps to the same
// annotation as TsanNewMemory.
#define TsanFreeMemory(addr, size)                                             \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#endif

// Function entry/exit
#define TsanFuncEntry(pc) __tsan_func_entry(pc)
#define TsanFuncExit() __tsan_func_exit()
230 
/// Required OMPT inquiry functions.
static ompt_get_parallel_info_t ompt_get_parallel_info;
static ompt_get_thread_data_t ompt_get_thread_data;

// A "clock" is just a 64-bit slot; only its *address* is meaningful to the
// TSan happens-before/after annotations.
typedef uint64_t ompt_tsan_clockid;
236 
/// Hand out sequential ids starting at 0, safely from any thread.
static uint64_t my_next_id() {
  static std::atomic<uint64_t> ID{0};
  return ID.fetch_add(1);
}
242 
// Data structure to provide a threadsafe pool of reusable objects.
// DataPool<Type of objects, Size of blockalloc>
template <typename T, int N> struct DataPool {
  std::mutex DPMutex;          // guards DataPointer/memory/total
  std::stack<T *> DataPointer; // currently available objects
  std::list<void *> memory;    // raw blocks owned by this pool
  int total;                   // number of objects ever carved out

  /// Allocate one block of N objects and push them on the free stack.
  /// Caller must hold DPMutex.
  void newDatas() {
    // Each object is prefixed with a pointer to 'this' so that retData()
    // can return memory to the originating pool without knowing its source.
    //
    // To reduce lock contention we use thread-local DataPools, but Data
    // objects may move to other threads; only a cross-thread returnData()
    // pays for the lock of a foreign pool.
    struct pooldata {
      DataPool<T, N> *dp;
      T data;
    };
    // Allocate uninitialized memory on purpose: constructors must not run
    // here (objects are constructed later via the per-class operator new).
    pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N);
    if (datas == nullptr) {
      // Fail loudly instead of writing through a null pointer below.
      std::cerr << "Archer: DataPool allocation failed" << std::endl;
      std::abort();
    }
    memory.push_back(datas);
    for (int i = 0; i < N; i++) {
      datas[i].dp = this;
      DataPointer.push(&(datas[i].data));
    }
    total += N;
  }

  /// Pop one object (allocating a new block if the pool is empty).
  T *getData() {
    // lock_guard instead of manual lock()/unlock(): exception-safe.
    std::lock_guard<std::mutex> guard(DPMutex);
    if (DataPointer.empty())
      newDatas();
    T *ret = DataPointer.top();
    DataPointer.pop();
    return ret;
  }

  /// Return one object to this pool.
  void returnData(T *data) {
    std::lock_guard<std::mutex> guard(DPMutex);
    DataPointer.push(data);
  }

  /// Pop n objects at once into datas[0..n-1].
  void getDatas(int n, T **datas) {
    std::lock_guard<std::mutex> guard(DPMutex);
    for (int i = 0; i < n; i++) {
      if (DataPointer.empty())
        newDatas();
      datas[i] = DataPointer.top();
      DataPointer.pop();
    }
  }

  /// Return n objects at once.
  void returnDatas(int n, T **datas) {
    std::lock_guard<std::mutex> guard(DPMutex);
    for (int i = 0; i < n; i++) {
      DataPointer.push(datas[i]);
    }
  }

  DataPool() : DPMutex(), DataPointer(), total(0) {}

  ~DataPool() {
    // We assume all objects were returned when the owning thread finishes;
    // only the raw blocks are released here — no destructors run for pooled
    // objects.
    for (auto i : memory)
      if (i)
        free(i);
  }
};
325 
326 // This function takes care to return the data to the originating DataPool
327 // A pointer to the originating DataPool is stored just before the actual data.
retData(void * data)328 template <typename T, int N> static void retData(void *data) {
329   ((DataPool<T, N> **)data)[-1]->returnData((T *)data);
330 }
331 
struct ParallelData;
// Thread-local pool of ParallelData objects (created in thread_begin).
__thread DataPool<ParallelData, 4> *pdp;

/// Data structure to store additional information for parallel regions.
struct ParallelData {

  // Parallel fork is just another barrier, use Barrier[1]

  /// Two addresses for relationships with barriers.
  ompt_tsan_clockid Barrier[2];

  /// Code address of the parallel region; reported via TsanFuncEntry when
  /// an implicit task begins.
  const void *codePtr;

  /// Clock used for the fork->implicit-task happens-before edge.
  void *GetParallelPtr() { return &(Barrier[1]); }

  /// Clock for the barrier with the given index (0 or 1).
  void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); }

  ParallelData(const void *codeptr) : codePtr(codeptr) {}
  ~ParallelData() {
    TsanDeleteClock(&(Barrier[0]));
    TsanDeleteClock(&(Barrier[1]));
  }
  // overload new/delete to use DataPool for memory management.
  void *operator new(size_t size) { return pdp->getData(); }
  void operator delete(void *p, size_t) { retData<ParallelData, 4>(p); }
};
358 
ToParallelData(ompt_data_t * parallel_data)359 static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
360   return reinterpret_cast<ParallelData *>(parallel_data->ptr);
361 }
362 
struct Taskgroup;
// Thread-local pool of Taskgroup objects (created in thread_begin).
__thread DataPool<Taskgroup, 4> *tgp;

/// Data structure to support stacking of taskgroups and allow synchronization.
struct Taskgroup {
  /// Its address is used for relationships of the taskgroup's task set.
  ompt_tsan_clockid Ptr;

  /// Reference to the parent taskgroup (nullptr for the outermost one).
  Taskgroup *Parent;

  Taskgroup(Taskgroup *Parent) : Parent(Parent) {}
  ~Taskgroup() { TsanDeleteClock(&Ptr); }

  void *GetPtr() { return &Ptr; }
  // overload new/delete to use DataPool for memory management.
  void *operator new(size_t size) { return tgp->getData(); }
  void operator delete(void *p, size_t) { retData<Taskgroup, 4>(p); }
};
382 
383 struct TaskData;
384 __thread DataPool<TaskData, 4> *tdp;
385 
386 /// Data structure to store additional information for tasks.
387 struct TaskData {
388   /// Its address is used for relationships of this task.
389   ompt_tsan_clockid Task;
390 
391   /// Child tasks use its address to declare a relationship to a taskwait in
392   /// this task.
393   ompt_tsan_clockid Taskwait;
394 
395   /// Whether this task is currently executing a barrier.
396   bool InBarrier;
397 
398   /// Whether this task is an included task.
399   bool Included;
400 
401   /// Index of which barrier to use next.
402   char BarrierIndex;
403 
404   /// Count how often this structure has been put into child tasks + 1.
405   std::atomic_int RefCount;
406 
407   /// Reference to the parent that created this task.
408   TaskData *Parent;
409 
410   /// Reference to the implicit task in the stack above this task.
411   TaskData *ImplicitTask;
412 
413   /// Reference to the team of this task.
414   ParallelData *Team;
415 
416   /// Reference to the current taskgroup that this task either belongs to or
417   /// that it just created.
418   Taskgroup *TaskGroup;
419 
420   /// Dependency information for this task.
421   ompt_dependence_t *Dependencies;
422 
423   /// Number of dependency entries.
424   unsigned DependencyCount;
425 
426   void *PrivateData;
427   size_t PrivateDataSize;
428 
429   int execution;
430   int freed;
431 
TaskDataTaskData432   TaskData(TaskData *Parent)
433       : InBarrier(false), Included(false), BarrierIndex(0), RefCount(1),
434         Parent(Parent), ImplicitTask(nullptr), Team(Parent->Team),
435         TaskGroup(nullptr), DependencyCount(0), execution(0), freed(0) {
436     if (Parent != nullptr) {
437       Parent->RefCount++;
438       // Copy over pointer to taskgroup. This task may set up its own stack
439       // but for now belongs to its parent's taskgroup.
440       TaskGroup = Parent->TaskGroup;
441     }
442   }
443 
TaskDataTaskData444   TaskData(ParallelData *Team = nullptr)
445       : InBarrier(false), Included(false), BarrierIndex(0), RefCount(1),
446         Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr),
447         DependencyCount(0), execution(1), freed(0) {}
448 
~TaskDataTaskData449   ~TaskData() {
450     TsanDeleteClock(&Task);
451     TsanDeleteClock(&Taskwait);
452   }
453 
GetTaskPtrTaskData454   void *GetTaskPtr() { return &Task; }
455 
GetTaskwaitPtrTaskData456   void *GetTaskwaitPtr() { return &Taskwait; }
457   // overload new/delete to use DataPool for memory management.
operator newTaskData458   void *operator new(size_t size) { return tdp->getData(); }
operator deleteTaskData459   void operator delete(void *p, size_t) { retData<TaskData, 4>(p); }
460 };
461 
ToTaskData(ompt_data_t * task_data)462 static inline TaskData *ToTaskData(ompt_data_t *task_data) {
463   return reinterpret_cast<TaskData *>(task_data->ptr);
464 }
465 
/// Derive a distinct clock address for "in" dependencies from the variable
/// address used for "out"/"inout" dependencies.
// FIXME: This will give false negatives when a second variable lays directly
//        behind a variable that only has a width of 1 byte.
//        Another approach would be to "negate" the address or to flip the
//        first bit...
static inline void *ToInAddr(void *OutAddr) {
  char *base = static_cast<char *>(OutAddr);
  return base + 1;
}
473 
/// Store a mutex for each wait_id to resolve race condition with callbacks.
std::unordered_map<ompt_wait_id_t, std::mutex> Locks;
/// Protects creation/lookup of entries in Locks.
std::mutex LocksMutex;

/// Thread-begin callback: create the thread-local object pools and assign
/// this thread a unique id.
static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
                                   ompt_data_t *thread_data) {
  pdp = new DataPool<ParallelData, 4>;
  // NOTE(review): sizeof(pdp) is the size of the *pointer*, not the pool
  // object — possibly sizeof(*pdp) was intended; confirm against upstream.
  TsanNewMemory(pdp, sizeof(pdp));
  tgp = new DataPool<Taskgroup, 4>;
  TsanNewMemory(tgp, sizeof(tgp));
  tdp = new DataPool<TaskData, 4>;
  TsanNewMemory(tdp, sizeof(tdp));
  thread_data->value = my_next_id();
}
488 
/// Thread-end callback: tear down the thread-local pools. Pooled objects
/// are assumed returned by now; ~DataPool frees only the raw blocks.
static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
  delete pdp;
  delete tgp;
  delete tdp;
}
494 
495 /// OMPT event callbacks for handling parallel regions.
496 
ompt_tsan_parallel_begin(ompt_data_t * parent_task_data,const ompt_frame_t * parent_task_frame,ompt_data_t * parallel_data,uint32_t requested_team_size,int flag,const void * codeptr_ra)497 static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data,
498                                      const ompt_frame_t *parent_task_frame,
499                                      ompt_data_t *parallel_data,
500                                      uint32_t requested_team_size, int flag,
501                                      const void *codeptr_ra) {
502   ParallelData *Data = new ParallelData(codeptr_ra);
503   parallel_data->ptr = Data;
504 
505   TsanHappensBefore(Data->GetParallelPtr());
506 }
507 
/// Parallel-end callback: synchronize with the join and free the region
/// bookkeeping.
static void ompt_tsan_parallel_end(ompt_data_t *parallel_data,
                                   ompt_data_t *task_data, int flag,
                                   const void *codeptr_ra) {
  ParallelData *Data = ToParallelData(parallel_data);
  // Either barrier index may have been the last one used (the index
  // alternates in sync_region), so sync with both.
  TsanHappensAfter(Data->GetBarrierPtr(0));
  TsanHappensAfter(Data->GetBarrierPtr(1));

  delete Data;

#if (LLVM_VERSION >= 40)
  // If the runtime provides the status hook, optionally flush TSan's shadow
  // memory between parallel regions (controlled by flush_shadow).
  if (&__archer_get_omp_status) {
    if (__archer_get_omp_status() == 0 && archer_flags->flush_shadow)
      __tsan_flush_memory();
  }
#endif
}
524 
/// Implicit-task callback: allocate/free per-task data for the implicit
/// tasks of a parallel region (and for the initial task).
static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint,
                                    ompt_data_t *parallel_data,
                                    ompt_data_t *task_data,
                                    unsigned int team_size,
                                    unsigned int thread_num, int type) {
  switch (endpoint) {
  case ompt_scope_begin:
    if (type & ompt_task_initial) {
      // The initial task sees no parallel_begin; create region data here.
      parallel_data->ptr = new ParallelData(nullptr);
    }
    task_data->ptr = new TaskData(ToParallelData(parallel_data));
    // Synchronize with the fork published in parallel_begin.
    TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
    TsanFuncEntry(ToParallelData(parallel_data)->codePtr);
    break;
  case ompt_scope_end:
    TaskData *Data = ToTaskData(task_data);
    assert(Data->freed == 0 && "Implicit task end should only be called once!");
    Data->freed = 1;
    assert(Data->RefCount == 1 &&
           "All tasks should have finished at the implicit barrier!");
    delete Data;
    TsanFuncExit();
    break;
  }
}
550 
/// Sync-region callback: models barriers, taskwait and taskgroup as
/// happens-before/happens-after pairs on per-construct clock addresses.
static void ompt_tsan_sync_region(ompt_sync_region_t kind,
                                  ompt_scope_endpoint_t endpoint,
                                  ompt_data_t *parallel_data,
                                  ompt_data_t *task_data,
                                  const void *codeptr_ra) {
  TaskData *Data = ToTaskData(task_data);
  switch (endpoint) {
  case ompt_scope_begin:
    TsanFuncEntry(codeptr_ra);
    switch (kind) {
    case ompt_sync_region_barrier_implementation:
    case ompt_sync_region_barrier_implicit:
    case ompt_sync_region_barrier_explicit:
    case ompt_sync_region_barrier: {
      char BarrierIndex = Data->BarrierIndex;
      // Everything this thread did so far happens before whatever follows
      // the barrier for the rest of the team.
      TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex));

      if (hasReductionCallback < ompt_set_always) {
        // We ignore writes inside the barrier. These would either occur during
        // 1. reductions performed by the runtime which are guaranteed to be
        // race-free.
        // 2. execution of another task.
        // For the latter case we will re-enable tracking in task_switch.
        Data->InBarrier = true;
        TsanIgnoreWritesBegin();
      }

      break;
    }

    case ompt_sync_region_taskwait:
      break;

    case ompt_sync_region_taskgroup:
      // Push a new taskgroup on this task's taskgroup stack.
      Data->TaskGroup = new Taskgroup(Data->TaskGroup);
      break;

    default:
      break;
    }
    break;
  case ompt_scope_end:
    TsanFuncExit();
    switch (kind) {
    case ompt_sync_region_barrier_implementation:
    case ompt_sync_region_barrier_implicit:
    case ompt_sync_region_barrier_explicit:
    case ompt_sync_region_barrier: {
      if (hasReductionCallback < ompt_set_always) {
        // We want to track writes after the barrier again.
        Data->InBarrier = false;
        TsanIgnoreWritesEnd();
      }

      char BarrierIndex = Data->BarrierIndex;
      // Barrier will end after it has been entered by all threads.
      if (parallel_data)
        TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex));

      // It is not guaranteed that all threads have exited this barrier before
      // we enter the next one. So we will use a different address.
      // We are however guaranteed that this current barrier is finished
      // by the time we exit the next one. So we can then reuse the first
      // address.
      Data->BarrierIndex = (BarrierIndex + 1) % 2;
      break;
    }

    case ompt_sync_region_taskwait: {
      // execution > 1 means child tasks were created since this task
      // started (task_create increments the parent's counter).
      if (Data->execution > 1)
        TsanHappensAfter(Data->GetTaskwaitPtr());
      break;
    }

    case ompt_sync_region_taskgroup: {
      assert(Data->TaskGroup != nullptr &&
             "Should have at least one taskgroup!");

      TsanHappensAfter(Data->TaskGroup->GetPtr());

      // Delete this allocated taskgroup, all descendent task are finished by
      // now.
      Taskgroup *Parent = Data->TaskGroup->Parent;
      delete Data->TaskGroup;
      Data->TaskGroup = Parent;
      break;
    }

    default:
      break;
    }
    break;
  }
}
645 
ompt_tsan_reduction(ompt_sync_region_t kind,ompt_scope_endpoint_t endpoint,ompt_data_t * parallel_data,ompt_data_t * task_data,const void * codeptr_ra)646 static void ompt_tsan_reduction(ompt_sync_region_t kind,
647                                 ompt_scope_endpoint_t endpoint,
648                                 ompt_data_t *parallel_data,
649                                 ompt_data_t *task_data,
650                                 const void *codeptr_ra) {
651   switch (endpoint) {
652   case ompt_scope_begin:
653     switch (kind) {
654     case ompt_sync_region_reduction:
655       TsanIgnoreWritesBegin();
656       break;
657     default:
658       break;
659     }
660     break;
661   case ompt_scope_end:
662     switch (kind) {
663     case ompt_sync_region_reduction:
664       TsanIgnoreWritesEnd();
665       break;
666     default:
667       break;
668     }
669     break;
670   }
671 }
672 
/// OMPT event callbacks for handling tasks.

/// Task-create callback: allocate TaskData for the new task and publish the
/// creation point as a happens-before edge for deferred tasks.
static void ompt_tsan_task_create(
    ompt_data_t *parent_task_data,    /* id of parent task            */
    const ompt_frame_t *parent_frame, /* frame data for parent task   */
    ompt_data_t *new_task_data,       /* id of created task           */
    int type, int has_dependences,
    const void *codeptr_ra) /* pointer to outlined function */
{
  TaskData *Data;
  assert(new_task_data->ptr == NULL &&
         "Task data should be initialized to NULL");
  if (type & ompt_task_initial) {
    // The initial task sees no parallel_begin; create region data here.
    ompt_data_t *parallel_data;
    int team_size = 1;
    ompt_get_parallel_info(0, &parallel_data, &team_size);
    ParallelData *PData = new ParallelData(nullptr);
    parallel_data->ptr = PData;

    Data = new TaskData(PData);
    new_task_data->ptr = Data;
  } else if (type & ompt_task_undeferred) {
    Data = new TaskData(ToTaskData(parent_task_data));
    new_task_data->ptr = Data;
    // Undeferred tasks run sequentially in the parent; marked so that
    // task_schedule skips barrier/taskwait synchronization for them.
    Data->Included = true;
  } else if (type & ompt_task_explicit || type & ompt_task_target) {
    Data = new TaskData(ToTaskData(parent_task_data));
    new_task_data->ptr = Data;

    // Use the newly created address. We cannot use a single address from the
    // parent because that would declare wrong relationships with other
    // sibling tasks that may be created before this task is started!
    TsanHappensBefore(Data->GetTaskPtr());
    ToTaskData(parent_task_data)->execution++;
  }
}
709 
__ompt_tsan_release_task(TaskData * task)710 static void __ompt_tsan_release_task(TaskData *task) {
711   while (task != nullptr && --task->RefCount == 0) {
712     TaskData *Parent = task->Parent;
713     if (task->DependencyCount > 0) {
714       delete[] task->Dependencies;
715     }
716     delete task;
717     task = Parent;
718   }
719 }
720 
/// Task-schedule callback: completes/suspends the previous task and starts
/// (or resumes) the next one, wiring up all task-related synchronization.
static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
                                    ompt_task_status_t prior_task_status,
                                    ompt_data_t *second_task_data) {

  //
  //  The necessary action depends on prior_task_status:
  //
  //    ompt_task_early_fulfill = 5,
  //     -> ignored
  //
  //    ompt_task_late_fulfill  = 6,
  //     -> first completed, first freed, second ignored
  //
  //    ompt_task_complete      = 1,
  //    ompt_task_cancel        = 3,
  //     -> first completed, first freed, second starts
  //
  //    ompt_task_detach        = 4,
  //    ompt_task_yield         = 2,
  //    ompt_task_switch        = 7
  //     -> first suspended, second starts
  //

  if (prior_task_status == ompt_task_early_fulfill)
    return;

  TaskData *FromTask = ToTaskData(first_task_data);

  // Legacy handling for missing reduction callback
  if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) {
    // We want to ignore writes in the runtime code during barriers,
    // but not when executing tasks with user code!
    TsanIgnoreWritesEnd();
  }

  // The late fulfill happens after the detached task finished execution
  if (prior_task_status == ompt_task_late_fulfill)
    TsanHappensAfter(FromTask->GetTaskPtr());

  // task completed execution
  if (prior_task_status == ompt_task_complete ||
      prior_task_status == ompt_task_cancel ||
      prior_task_status == ompt_task_late_fulfill) {
    // Included tasks are executed sequentially, no need to track
    // synchronization
    if (!FromTask->Included) {
      // Task will finish before a barrier in the surrounding parallel region
      // ...
      ParallelData *PData = FromTask->Team;
      TsanHappensBefore(
          PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex));

      // ... and before an eventual taskwait by the parent thread.
      TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr());

      if (FromTask->TaskGroup != nullptr) {
        // This task is part of a taskgroup, so it will finish before the
        // corresponding taskgroup_end.
        TsanHappensBefore(FromTask->TaskGroup->GetPtr());
      }
    }

    // release dependencies
    for (unsigned i = 0; i < FromTask->DependencyCount; i++) {
      ompt_dependence_t *Dependency = &FromTask->Dependencies[i];

      // in dependencies block following inout and out dependencies!
      TsanHappensBefore(ToInAddr(Dependency->variable.ptr));
      if (Dependency->dependence_type == ompt_dependence_type_out ||
          Dependency->dependence_type == ompt_dependence_type_inout) {
        TsanHappensBefore(Dependency->variable.ptr);
      }
    }
    // free the previously running task
    __ompt_tsan_release_task(FromTask);
  }

  // For late fulfill of detached task, there is no task to schedule to
  if (prior_task_status == ompt_task_late_fulfill) {
    return;
  }

  TaskData *ToTask = ToTaskData(second_task_data);
  // Legacy handling for missing reduction callback
  if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
    // We re-enter runtime code which currently performs a barrier.
    TsanIgnoreWritesBegin();
  }

  // task suspended
  if (prior_task_status == ompt_task_switch ||
      prior_task_status == ompt_task_yield ||
      prior_task_status == ompt_task_detach) {
    // Task may be resumed at a later point in time.
    TsanHappensBefore(FromTask->GetTaskPtr());
    ToTask->ImplicitTask = FromTask->ImplicitTask;
    assert(ToTask->ImplicitTask != NULL &&
           "A task belongs to a team and has an implicit task on the stack");
  }

  // Handle dependencies on first execution of the task
  if (ToTask->execution == 0) {
    ToTask->execution++;
    for (unsigned i = 0; i < ToTask->DependencyCount; i++) {
      ompt_dependence_t *Dependency = &ToTask->Dependencies[i];

      TsanHappensAfter(Dependency->variable.ptr);
      // in and inout dependencies are also blocked by prior in dependencies!
      if (Dependency->dependence_type == ompt_dependence_type_out ||
          Dependency->dependence_type == ompt_dependence_type_inout) {
        TsanHappensAfter(ToInAddr(Dependency->variable.ptr));
      }
    }
  }
  // 1. Task will begin execution after it has been created.
  // 2. Task will resume after it has been switched away.
  TsanHappensAfter(ToTask->GetTaskPtr());
}
839 
ompt_tsan_dependences(ompt_data_t * task_data,const ompt_dependence_t * deps,int ndeps)840 static void ompt_tsan_dependences(ompt_data_t *task_data,
841                                   const ompt_dependence_t *deps, int ndeps) {
842   if (ndeps > 0) {
843     // Copy the data to use it in task_switch and task_end.
844     TaskData *Data = ToTaskData(task_data);
845     Data->Dependencies = new ompt_dependence_t[ndeps];
846     std::memcpy(Data->Dependencies, deps, sizeof(ompt_dependence_t) * ndeps);
847     Data->DependencyCount = ndeps;
848 
849     // This callback is executed before this task is first started.
850     TsanHappensBefore(Data->GetTaskPtr());
851   }
852 }
853 
854 /// OMPT event callbacks for handling locking.
ompt_tsan_mutex_acquired(ompt_mutex_t kind,ompt_wait_id_t wait_id,const void * codeptr_ra)855 static void ompt_tsan_mutex_acquired(ompt_mutex_t kind, ompt_wait_id_t wait_id,
856                                      const void *codeptr_ra) {
857 
858   // Acquire our own lock to make sure that
859   // 1. the previous release has finished.
860   // 2. the next acquire doesn't start before we have finished our release.
861   LocksMutex.lock();
862   std::mutex &Lock = Locks[wait_id];
863   LocksMutex.unlock();
864 
865   Lock.lock();
866   TsanHappensAfter(&Lock);
867 }
868 
ompt_tsan_mutex_released(ompt_mutex_t kind,ompt_wait_id_t wait_id,const void * codeptr_ra)869 static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
870                                      const void *codeptr_ra) {
871   LocksMutex.lock();
872   std::mutex &Lock = Locks[wait_id];
873   LocksMutex.unlock();
874   TsanHappensBefore(&Lock);
875 
876   Lock.unlock();
877 }
878 
// callback , signature , variable to store result , required support level
//
// Registers ompt_tsan_<event> as the handler for ompt_callback_<event>.
// `type` selects the callback signature (ompt_callback_<type>_t), `result`
// receives the status returned by ompt_set_callback, and a warning is
// printed when the runtime's support for the callback is below `level`.
#define SET_OPTIONAL_CALLBACK_T(event, type, result, level)                    \
  do {                                                                         \
    ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event;                \
    result = ompt_set_callback(ompt_callback_##event,                          \
                               (ompt_callback_t)tsan_##event);                 \
    if (result < level)                                                        \
      printf("Registered callback '" #event "' is not supported at " #level    \
             " (%i)\n",                                                        \
             result);                                                          \
  } while (0)

// Mandatory registration: requires ompt_set_always support; the registration
// status is checked but not propagated to the caller.
#define SET_CALLBACK_T(event, type)                                            \
  do {                                                                         \
    int res;                                                                   \
    SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always);                \
  } while (0)

// Shorthand for the common case where the event name and the callback
// signature type name coincide.
#define SET_CALLBACK(event) SET_CALLBACK_T(event, event)
898 
ompt_tsan_initialize(ompt_function_lookup_t lookup,int device_num,ompt_data_t * tool_data)899 static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
900                                 ompt_data_t *tool_data) {
901   const char *options = getenv("TSAN_OPTIONS");
902   TsanFlags tsan_flags(options);
903 
904   ompt_set_callback_t ompt_set_callback =
905       (ompt_set_callback_t)lookup("ompt_set_callback");
906   if (ompt_set_callback == NULL) {
907     std::cerr << "Could not set callback, exiting..." << std::endl;
908     std::exit(1);
909   }
910   ompt_get_parallel_info =
911       (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
912   ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data");
913 
914   if (ompt_get_parallel_info == NULL) {
915     fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', "
916                     "exiting...\n");
917     exit(1);
918   }
919 
920   SET_CALLBACK(thread_begin);
921   SET_CALLBACK(thread_end);
922   SET_CALLBACK(parallel_begin);
923   SET_CALLBACK(implicit_task);
924   SET_CALLBACK(sync_region);
925   SET_CALLBACK(parallel_end);
926 
927   SET_CALLBACK(task_create);
928   SET_CALLBACK(task_schedule);
929   SET_CALLBACK(dependences);
930 
931   SET_CALLBACK_T(mutex_acquired, mutex);
932   SET_CALLBACK_T(mutex_released, mutex);
933   SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback,
934                           ompt_set_never);
935 
936   if (!tsan_flags.ignore_noninstrumented_modules)
937     fprintf(stderr,
938             "Warning: please export "
939             "TSAN_OPTIONS='ignore_noninstrumented_modules=1' "
940             "to avoid false positive reports from the OpenMP runtime!\n");
941   return 1; // success
942 }
943 
ompt_tsan_finalize(ompt_data_t * tool_data)944 static void ompt_tsan_finalize(ompt_data_t *tool_data) {
945   if (archer_flags->print_max_rss) {
946     struct rusage end;
947     getrusage(RUSAGE_SELF, &end);
948     printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
949   }
950 
951   if (archer_flags)
952     delete archer_flags;
953 }
954 
955 extern "C" ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version,const char * runtime_version)956 ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
957   const char *options = getenv("ARCHER_OPTIONS");
958   archer_flags = new ArcherFlags(options);
959   if (!archer_flags->enabled) {
960     if (archer_flags->verbose)
961       std::cout << "Archer disabled, stopping operation" << std::endl;
962     delete archer_flags;
963     return NULL;
964   }
965 
966   static ompt_start_tool_result_t ompt_start_tool_result = {
967       &ompt_tsan_initialize, &ompt_tsan_finalize, {0}};
968   runOnTsan = 1;
969   RunningOnValgrind();
970   if (!runOnTsan) // if we are not running on TSAN, give a different tool the
971                   // chance to be loaded
972   {
973     if (archer_flags->verbose)
974       std::cout << "Archer detected OpenMP application without TSan "
975                    "stopping operation"
976                 << std::endl;
977     delete archer_flags;
978     return NULL;
979   }
980 
981   if (archer_flags->verbose)
982     std::cout << "Archer detected OpenMP application with TSan, supplying "
983                  "OpenMP synchronization semantics"
984               << std::endl;
985   return &ompt_start_tool_result;
986 }
987