/*
 * ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for details.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <inttypes.h>
#include <iostream>
#include <list>
#include <mutex>
#include <sstream>
#include <string>
#include <sys/resource.h>
#include <unistd.h>
#include <unordered_map>
#include <vector>

#if (defined __APPLE__ && defined __MACH__)
#include <dlfcn.h>
#endif

#include "omp-tools.h"

// Define an attribute that indicates that the fall-through from the previous
// case label is intentional and should not be diagnosed by the compiler.
//   Code from libcxx/include/__config
// Use a function-like macro to imply that it must be followed by a semicolon.
#if __cplusplus > 201402L && __has_cpp_attribute(fallthrough)
#define KMP_FALLTHROUGH() [[fallthrough]]
#elif __has_cpp_attribute(clang::fallthrough)
#define KMP_FALLTHROUGH() [[clang::fallthrough]]
#elif __has_attribute(fallthrough) || __GNUC__ >= 7
#define KMP_FALLTHROUGH() __attribute__((__fallthrough__))
#else
#define KMP_FALLTHROUGH() ((void)0)
#endif
static int runOnTsan;
static int hasReductionCallback;

class ArcherFlags {
public:
#if (LLVM_VERSION) >= 40
  int flush_shadow{0};
#endif
  int print_max_rss{0};
  int verbose{0};
  int enabled{1};
  int report_data_leak{0};
  int ignore_serial{0};

  ArcherFlags(const char *env) {
    if (env) {
      std::vector<std::string> tokens;
      std::string token;
      std::string str(env);
      std::istringstream iss(str);
      while (std::getline(iss, token, ' '))
        tokens.push_back(token);

      for (std::vector<std::string>::iterator it = tokens.begin();
           it != tokens.end(); ++it) {
#if (LLVM_VERSION) >= 40
        if (sscanf(it->c_str(), "flush_shadow=%d", &flush_shadow))
          continue;
#endif
        if (sscanf(it->c_str(), "print_max_rss=%d", &print_max_rss))
          continue;
        if (sscanf(it->c_str(), "verbose=%d", &verbose))
          continue;
        if (sscanf(it->c_str(), "report_data_leak=%d", &report_data_leak))
          continue;
        if (sscanf(it->c_str(), "enable=%d", &enabled))
          continue;
        if (sscanf(it->c_str(), "ignore_serial=%d", &ignore_serial))
          continue;
        // Report the offending token, not the last token parsed.
        std::cerr << "Illegal value for ARCHER_OPTIONS variable: " << *it
                  << std::endl;
      }
    }
  }
};
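// Example (illustrative): the flags above are parsed from a space-separated
// environment string, e.g.
//   ARCHER_OPTIONS="verbose=1 print_max_rss=1 ignore_serial=1" ./app
// Unrecognized tokens are reported to stderr.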
class TsanFlags {
public:
  int ignore_noninstrumented_modules;

  TsanFlags(const char *env) : ignore_noninstrumented_modules(0) {
    if (env) {
      std::vector<std::string> tokens;
      std::string str(env);
      auto end = str.end();
      auto it = str.begin();
      auto is_sep = [](char c) {
        return c == ' ' || c == ',' || c == ':' || c == '\n' || c == '\t' ||
               c == '\r';
      };
      while (it != end) {
        auto next_it = std::find_if(it, end, is_sep);
        tokens.emplace_back(it, next_it);
        it = next_it;
        if (it != end) {
          ++it;
        }
      }

      for (const auto &token : tokens) {
        // We are only interested in ignore_noninstrumented_modules,
        // to print a warning if it is unset.
        if (sscanf(token.c_str(), "ignore_noninstrumented_modules=%d",
                   &ignore_noninstrumented_modules))
          continue;
      }
    }
  }
};
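// Example (illustrative): the tokenizer above splits on spaces, commas,
// colons, and whitespace, so
//   TSAN_OPTIONS="ignore_noninstrumented_modules=1"
// and
//   TSAN_OPTIONS="ignore_noninstrumented_modules=1,halt_on_error=1"
// both yield the ignore_noninstrumented_modules value checked at startup.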
#if (LLVM_VERSION) >= 40
extern "C" {
int __attribute__((weak)) __archer_get_omp_status();
void __attribute__((weak)) __tsan_flush_memory() {}
}
#endif
ArcherFlags *archer_flags;

#ifndef TsanHappensBefore
// Thread Sanitizer is a tool that finds races in code.
// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
// TSan detects these exact functions by name.
extern "C" {
#if (defined __APPLE__ && defined __MACH__)
static void (*AnnotateHappensAfter)(const char *, int, const volatile void *);
static void (*AnnotateHappensBefore)(const char *, int, const volatile void *);
static void (*AnnotateIgnoreWritesBegin)(const char *, int);
static void (*AnnotateIgnoreWritesEnd)(const char *, int);
static void (*AnnotateNewMemory)(const char *, int, const volatile void *,
                                 size_t);
static void (*__tsan_func_entry)(const void *);
static void (*__tsan_func_exit)(void);

static int RunningOnValgrind() {
  int (*fptr)();

  fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind");
  // If we find a RunningOnValgrind symbol other than this function, we assume
  // that the annotation functions are present in this execution and leave
  // runOnTsan=1; otherwise we set runOnTsan=0.
  if (!fptr || fptr == RunningOnValgrind)
    runOnTsan = 0;
  return 0;
}
#else
void __attribute__((weak))
AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateIgnoreWritesBegin(const char *file, int line) {}
void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) {
}
void __attribute__((weak))
AnnotateNewMemory(const char *file, int line, const volatile void *cv,
                  size_t size) {}
int __attribute__((weak)) RunningOnValgrind() {
  runOnTsan = 0;
  return 0;
}
void __attribute__((weak)) __tsan_func_entry(const void *call_pc) {}
void __attribute__((weak)) __tsan_func_exit(void) {}
#endif
}

// This marker is used to define a happens-before arc. The race detector will
// infer an arc from the begin to the end when they share the same pointer
// argument.
#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)

// This marker defines the destination of a happens-before arc.
#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)

// Ignore any races on writes between here and the next TsanIgnoreWritesEnd.
#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)

// Resume checking for racy writes.
#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)

// We don't really delete the clock for now.
#define TsanDeleteClock(cv)

// Annotate (re)initialized memory. TsanFreeMemory intentionally maps to the
// same annotation: re-annotating the range as new memory resets TSan's
// shadow state for it.
#define TsanNewMemory(addr, size)                                              \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#define TsanFreeMemory(addr, size)                                             \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#endif

// Function entry/exit
#define TsanFuncEntry(pc) __tsan_func_entry(pc)
#define TsanFuncExit() __tsan_func_exit()
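// Illustrative pairing (not specific to this file): to order a write in task A
// before a read in task B, A calls TsanHappensBefore(&clk) after the write and
// B calls TsanHappensAfter(&clk) before the read, where &clk is the same
// address. All annotations below follow this begin/end pattern.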
/// Required OMPT inquiry functions.
static ompt_get_parallel_info_t ompt_get_parallel_info;
static ompt_get_thread_data_t ompt_get_thread_data;

typedef char ompt_tsan_clockid;

static uint64_t my_next_id() {
  static uint64_t ID = 0;
  uint64_t ret = __sync_fetch_and_add(&ID, 1);
  return ret;
}

static int pagesize{0};

// Data structure to provide a threadsafe pool of reusable objects.
// DataPool<Type of objects>
template <typename T> struct DataPool final {
  static __thread DataPool<T> *ThreadDataPool;
  std::mutex DPMutex{};

  // store unused objects
  std::vector<T *> DataPointer{};
  std::vector<T *> RemoteDataPointer{};

  // store all allocated memory to finally release
  std::list<void *> memory;

  // count remotely returned data (RemoteDataPointer.size())
  std::atomic<int> remote{0};

  // total number of data objects allocated in the pool
  int total{0};
#ifdef DEBUG_DATA
  int remoteReturn{0};
  int localReturn{0};

  int getRemote() { return remoteReturn + remote; }
  int getLocal() { return localReturn; }
#endif
  int getTotal() { return total; }
  int getMissing() {
    return total - DataPointer.size() - RemoteDataPointer.size();
  }

  // fill the pool by allocating a page of memory
  void newDatas() {
    if (remote > 0) {
      const std::lock_guard<std::mutex> lock(DPMutex);
      // DataPointer is empty, so just swap the vectors
      DataPointer.swap(RemoteDataPointer);
      remote = 0;
      return;
    }
    // calculate the size of an object including padding to cacheline size
    size_t elemSize = sizeof(T);
    size_t paddedSize = (((elemSize - 1) / 64) + 1) * 64;
    // number of padded elements to allocate
    int ndatas = pagesize / paddedSize;
    char *datas = (char *)malloc(ndatas * paddedSize);
    memory.push_back(datas);
    for (int i = 0; i < ndatas; i++) {
      DataPointer.push_back(new (datas + i * paddedSize) T(this));
    }
    total += ndatas;
  }
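  // Worked example of the padding math above (illustrative numbers): with
  // sizeof(T) == 24, paddedSize == ((23 / 64) + 1) * 64 == 64, so each object
  // occupies a full 64-byte cacheline; with pagesize == 4096, one call
  // allocates ndatas == 64 objects.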
  // get data from the pool
  T *getData() {
    T *ret;
    if (DataPointer.empty())
      newDatas();
    ret = DataPointer.back();
    DataPointer.pop_back();
    return ret;
  }

  // accesses to the thread-local datapool don't need locks
  void returnOwnData(T *data) {
    DataPointer.emplace_back(data);
#ifdef DEBUG_DATA
    localReturn++;
#endif
  }

  // return data to a remote datapool, under lock
  void returnData(T *data) {
    const std::lock_guard<std::mutex> lock(DPMutex);
    RemoteDataPointer.emplace_back(data);
    remote++;
#ifdef DEBUG_DATA
    remoteReturn++;
#endif
  }

  ~DataPool() {
    // We assume all memory is returned by the time the thread finishes and
    // the destructor is called.
    if (archer_flags->report_data_leak && getMissing() != 0) {
      printf("ERROR: While freeing DataPool (%s) we are missing %i data "
             "objects.\n",
             __PRETTY_FUNCTION__, getMissing());
      exit(-3);
    }
    for (auto i : DataPointer)
      if (i)
        i->~T();
    for (auto i : RemoteDataPointer)
      if (i)
        i->~T();
    for (auto i : memory)
      if (i)
        free(i);
  }
};

template <typename T> struct DataPoolEntry {
  DataPool<T> *owner;

  static T *New() { return DataPool<T>::ThreadDataPool->getData(); }

  void Delete() {
    static_cast<T *>(this)->Reset();
    if (owner == DataPool<T>::ThreadDataPool)
      owner->returnOwnData(static_cast<T *>(this));
    else
      owner->returnData(static_cast<T *>(this));
  }

  DataPoolEntry(DataPool<T> *dp) : owner(dp) {}
};
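// Illustrative lifecycle (assumed typical use): a type `struct Foo final :
// DataPoolEntry<Foo>` obtains recycled instances via Foo::New() from the
// creating thread's pool and releases them via Delete(). If the releasing
// thread is not the allocating one, Delete() routes the object through the
// locked RemoteDataPointer path, as the concrete types below do.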
struct DependencyData;
typedef DataPool<DependencyData> DependencyDataPool;
template <>
__thread DependencyDataPool *DependencyDataPool::ThreadDataPool = nullptr;

/// Data structure to store additional information for task dependency.
struct DependencyData final : DataPoolEntry<DependencyData> {
  ompt_tsan_clockid in;
  ompt_tsan_clockid out;
  ompt_tsan_clockid inoutset;
  void *GetInPtr() { return &in; }
  void *GetOutPtr() { return &out; }
  void *GetInoutsetPtr() { return &inoutset; }

  void Reset() {}

  static DependencyData *New() { return DataPoolEntry<DependencyData>::New(); }

  DependencyData(DataPool<DependencyData> *dp)
      : DataPoolEntry<DependencyData>(dp) {}
};

struct TaskDependency {
  void *inPtr;
  void *outPtr;
  void *inoutsetPtr;
  ompt_dependence_type_t type;
  TaskDependency(DependencyData *depData, ompt_dependence_type_t type)
      : inPtr(depData->GetInPtr()), outPtr(depData->GetOutPtr()),
        inoutsetPtr(depData->GetInoutsetPtr()), type(type) {}
  void AnnotateBegin() {
    if (type == ompt_dependence_type_out ||
        type == ompt_dependence_type_inout ||
        type == ompt_dependence_type_mutexinoutset) {
      TsanHappensAfter(inPtr);
      TsanHappensAfter(outPtr);
      TsanHappensAfter(inoutsetPtr);
    } else if (type == ompt_dependence_type_in) {
      TsanHappensAfter(outPtr);
      TsanHappensAfter(inoutsetPtr);
    } else if (type == ompt_dependence_type_inoutset) {
      TsanHappensAfter(inPtr);
      TsanHappensAfter(outPtr);
    }
  }
  void AnnotateEnd() {
    if (type == ompt_dependence_type_out ||
        type == ompt_dependence_type_inout ||
        type == ompt_dependence_type_mutexinoutset) {
      TsanHappensBefore(outPtr);
    } else if (type == ompt_dependence_type_in) {
      TsanHappensBefore(inPtr);
    } else if (type == ompt_dependence_type_inoutset) {
      TsanHappensBefore(inoutsetPtr);
    }
  }
};
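// Illustrative mapping (assumed source program):
//   #pragma omp task depend(out: x)  // task T1
//   #pragma omp task depend(in: x)   // task T2
// On completion, T1 runs AnnotateEnd() -> TsanHappensBefore(outPtr); on first
// start, T2 runs AnnotateBegin() -> TsanHappensAfter(outPtr) on the same
// DependencyData for x, so TSan orders T1's writes before T2's reads.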
struct ParallelData;
typedef DataPool<ParallelData> ParallelDataPool;
template <>
__thread ParallelDataPool *ParallelDataPool::ThreadDataPool = nullptr;

/// Data structure to store additional information for parallel regions.
struct ParallelData final : DataPoolEntry<ParallelData> {

  // Parallel fork is just another barrier, use Barrier[1]

  /// Two addresses for relationships with barriers.
  ompt_tsan_clockid Barrier[2];

  const void *codePtr;

  void *GetParallelPtr() { return &(Barrier[1]); }

  void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); }

  ParallelData *Init(const void *codeptr) {
    codePtr = codeptr;
    return this;
  }

  void Reset() {}

  static ParallelData *New(const void *codeptr) {
    return DataPoolEntry<ParallelData>::New()->Init(codeptr);
  }

  ParallelData(DataPool<ParallelData> *dp) : DataPoolEntry<ParallelData>(dp) {}
};

static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
  return reinterpret_cast<ParallelData *>(parallel_data->ptr);
}

struct Taskgroup;
typedef DataPool<Taskgroup> TaskgroupPool;
template <> __thread TaskgroupPool *TaskgroupPool::ThreadDataPool = nullptr;

/// Data structure to support stacking of taskgroups and allow synchronization.
struct Taskgroup final : DataPoolEntry<Taskgroup> {
  /// Its address is used for relationships of the taskgroup's task set.
  ompt_tsan_clockid Ptr;

  /// Reference to the parent taskgroup.
  Taskgroup *Parent;

  void *GetPtr() { return &Ptr; }

  Taskgroup *Init(Taskgroup *parent) {
    Parent = parent;
    return this;
  }

  void Reset() {}

  static Taskgroup *New(Taskgroup *Parent) {
    return DataPoolEntry<Taskgroup>::New()->Init(Parent);
  }

  Taskgroup(DataPool<Taskgroup> *dp) : DataPoolEntry<Taskgroup>(dp) {}
};

struct TaskData;
typedef DataPool<TaskData> TaskDataPool;
template <> __thread TaskDataPool *TaskDataPool::ThreadDataPool = nullptr;

/// Data structure to store additional information for tasks.
struct TaskData final : DataPoolEntry<TaskData> {
  /// Its address is used for relationships of this task.
  ompt_tsan_clockid Task{0};

  /// Child tasks use its address to declare a relationship to a taskwait in
  /// this task.
  ompt_tsan_clockid Taskwait{0};

  /// Whether this task is currently executing a barrier.
  bool InBarrier{false};

  /// The task type as reported by the OMPT interface (ompt_task_flag_t bits).
  int TaskType{0};

  /// Count of the task's execution phases.
  int execution{0};

  /// Index of which barrier to use next.
  char BarrierIndex{0};

  /// Count how often this structure has been put into child tasks + 1.
  std::atomic_int RefCount{1};

  /// Reference to the parent that created this task.
  TaskData *Parent{nullptr};

  /// Reference to the implicit task in the stack above this task.
  TaskData *ImplicitTask{nullptr};

  /// Reference to the team of this task.
  ParallelData *Team{nullptr};

  /// Reference to the current taskgroup that this task either belongs to or
  /// that it just created.
  Taskgroup *TaskGroup{nullptr};

  /// Dependency information for this task.
  TaskDependency *Dependencies{nullptr};

  /// Number of dependency entries.
  unsigned DependencyCount{0};

  // The dependency map stores DependencyData objects representing
  // the dependency variables used by the sibling tasks created from
  // this task.
  // We expect a rare need for the dependency map, so allocate it on demand.
  std::unordered_map<void *, DependencyData *> *DependencyMap{nullptr};

#ifdef DEBUG
  int freed{0};
#endif

  bool isIncluded() { return TaskType & ompt_task_undeferred; }
  bool isUntied() { return TaskType & ompt_task_untied; }
  bool isFinal() { return TaskType & ompt_task_final; }
  bool isMergable() { return TaskType & ompt_task_mergeable; }
  bool isMerged() { return TaskType & ompt_task_merged; }

  bool isExplicit() { return TaskType & ompt_task_explicit; }
  bool isImplicit() { return TaskType & ompt_task_implicit; }
  bool isInitial() { return TaskType & ompt_task_initial; }
  bool isTarget() { return TaskType & ompt_task_target; }

  void *GetTaskPtr() { return &Task; }

  void *GetTaskwaitPtr() { return &Taskwait; }

  TaskData *Init(TaskData *parent, int taskType) {
    TaskType = taskType;
    Parent = parent;
    // Only dereference the parent after checking it for null.
    if (Parent != nullptr) {
      Team = Parent->Team;
      Parent->RefCount++;
      // Copy over pointer to taskgroup. This task may set up its own stack
      // but for now belongs to its parent's taskgroup.
      TaskGroup = Parent->TaskGroup;
    }
    return this;
  }

  TaskData *Init(ParallelData *team, int taskType) {
    TaskType = taskType;
    execution = 1;
    ImplicitTask = this;
    Team = team;
    return this;
  }

  void Reset() {
    InBarrier = false;
    TaskType = 0;
    execution = 0;
    BarrierIndex = 0;
    RefCount = 1;
    Parent = nullptr;
    ImplicitTask = nullptr;
    Team = nullptr;
    TaskGroup = nullptr;
    if (DependencyMap) {
      for (auto i : *DependencyMap)
        i.second->Delete();
      delete DependencyMap;
    }
    DependencyMap = nullptr;
    if (Dependencies)
      free(Dependencies);
    Dependencies = nullptr;
    DependencyCount = 0;
#ifdef DEBUG
    freed = 0;
#endif
  }

  static TaskData *New(TaskData *parent, int taskType) {
    return DataPoolEntry<TaskData>::New()->Init(parent, taskType);
  }

  static TaskData *New(ParallelData *team, int taskType) {
    return DataPoolEntry<TaskData>::New()->Init(team, taskType);
  }

  TaskData(DataPool<TaskData> *dp) : DataPoolEntry<TaskData>(dp) {}
};

static inline TaskData *ToTaskData(ompt_data_t *task_data) {
  return reinterpret_cast<TaskData *>(task_data->ptr);
}
/// Store a mutex for each wait_id to resolve race condition with callbacks.
std::unordered_map<ompt_wait_id_t, std::mutex> Locks;
std::mutex LocksMutex;

static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
                                   ompt_data_t *thread_data) {
  ParallelDataPool::ThreadDataPool = new ParallelDataPool;
  TsanNewMemory(ParallelDataPool::ThreadDataPool,
                sizeof(ParallelDataPool::ThreadDataPool));
  TaskgroupPool::ThreadDataPool = new TaskgroupPool;
  TsanNewMemory(TaskgroupPool::ThreadDataPool,
                sizeof(TaskgroupPool::ThreadDataPool));
  TaskDataPool::ThreadDataPool = new TaskDataPool;
  TsanNewMemory(TaskDataPool::ThreadDataPool,
                sizeof(TaskDataPool::ThreadDataPool));
  DependencyDataPool::ThreadDataPool = new DependencyDataPool;
  TsanNewMemory(DependencyDataPool::ThreadDataPool,
                sizeof(DependencyDataPool::ThreadDataPool));
  thread_data->value = my_next_id();
}

static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
  TsanIgnoreWritesBegin();
  delete ParallelDataPool::ThreadDataPool;
  delete TaskgroupPool::ThreadDataPool;
  delete TaskDataPool::ThreadDataPool;
  delete DependencyDataPool::ThreadDataPool;
  TsanIgnoreWritesEnd();
}

/// OMPT event callbacks for handling parallel regions.

static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data,
                                     const ompt_frame_t *parent_task_frame,
                                     ompt_data_t *parallel_data,
                                     uint32_t requested_team_size, int flag,
                                     const void *codeptr_ra) {
  ParallelData *Data = ParallelData::New(codeptr_ra);
  parallel_data->ptr = Data;

  TsanHappensBefore(Data->GetParallelPtr());
  if (archer_flags->ignore_serial && ToTaskData(parent_task_data)->isInitial())
    TsanIgnoreWritesEnd();
}

static void ompt_tsan_parallel_end(ompt_data_t *parallel_data,
                                   ompt_data_t *task_data, int flag,
                                   const void *codeptr_ra) {
  if (archer_flags->ignore_serial && ToTaskData(task_data)->isInitial())
    TsanIgnoreWritesBegin();
  ParallelData *Data = ToParallelData(parallel_data);
  TsanHappensAfter(Data->GetBarrierPtr(0));
  TsanHappensAfter(Data->GetBarrierPtr(1));

  Data->Delete();

#if (LLVM_VERSION >= 40)
  // Only call the weak symbol if it was resolved at link time.
  if (&__archer_get_omp_status) {
    if (__archer_get_omp_status() == 0 && archer_flags->flush_shadow)
      __tsan_flush_memory();
  }
#endif
}

static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint,
                                    ompt_data_t *parallel_data,
                                    ompt_data_t *task_data,
                                    unsigned int team_size,
                                    unsigned int thread_num, int type) {
  switch (endpoint) {
  case ompt_scope_begin:
    if (type & ompt_task_initial) {
      parallel_data->ptr = ParallelData::New(nullptr);
    }
    task_data->ptr = TaskData::New(ToParallelData(parallel_data), type);
    TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
    TsanFuncEntry(ToParallelData(parallel_data)->codePtr);
    break;
  case ompt_scope_end: {
    TaskData *Data = ToTaskData(task_data);
#ifdef DEBUG
    assert(Data->freed == 0 && "Implicit task end should only be called once!");
    Data->freed = 1;
#endif
    assert(Data->RefCount == 1 &&
           "All tasks should have finished at the implicit barrier!");
    Data->Delete();
    if (type & ompt_task_initial) {
      ToParallelData(parallel_data)->Delete();
    }
    TsanFuncExit();
    break;
  }
  case ompt_scope_beginend:
    // Should not occur according to OpenMP 5.1
    // Tested in OMPT tests
    break;
  }
}

static void ompt_tsan_sync_region(ompt_sync_region_t kind,
                                  ompt_scope_endpoint_t endpoint,
                                  ompt_data_t *parallel_data,
                                  ompt_data_t *task_data,
                                  const void *codeptr_ra) {
  TaskData *Data = ToTaskData(task_data);
  switch (endpoint) {
  case ompt_scope_begin:
  case ompt_scope_beginend:
    TsanFuncEntry(codeptr_ra);
    switch (kind) {
    case ompt_sync_region_barrier_implementation:
    case ompt_sync_region_barrier_implicit:
    case ompt_sync_region_barrier_explicit:
    case ompt_sync_region_barrier_implicit_parallel:
    case ompt_sync_region_barrier_implicit_workshare:
    case ompt_sync_region_barrier_teams:
    case ompt_sync_region_barrier: {
      char BarrierIndex = Data->BarrierIndex;
      TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex));

      if (hasReductionCallback < ompt_set_always) {
        // We ignore writes inside the barrier. These would either occur during
        // 1. reductions performed by the runtime, which are guaranteed to be
        //    race-free, or
        // 2. execution of another task.
        // For the latter case we will re-enable tracking in task_switch.
        Data->InBarrier = true;
        TsanIgnoreWritesBegin();
      }

      break;
    }

    case ompt_sync_region_taskwait:
      break;

    case ompt_sync_region_taskgroup:
      Data->TaskGroup = Taskgroup::New(Data->TaskGroup);
      break;

    case ompt_sync_region_reduction:
      // should never be reached
      break;
    }
    if (endpoint == ompt_scope_begin)
      break;
    KMP_FALLTHROUGH();
  case ompt_scope_end:
    TsanFuncExit();
    switch (kind) {
    case ompt_sync_region_barrier_implementation:
    case ompt_sync_region_barrier_implicit:
    case ompt_sync_region_barrier_explicit:
    case ompt_sync_region_barrier_implicit_parallel:
    case ompt_sync_region_barrier_implicit_workshare:
    case ompt_sync_region_barrier_teams:
    case ompt_sync_region_barrier: {
      if (hasReductionCallback < ompt_set_always) {
        // We want to track writes after the barrier again.
        Data->InBarrier = false;
        TsanIgnoreWritesEnd();
      }

      char BarrierIndex = Data->BarrierIndex;
      // Barrier will end after it has been entered by all threads.
      if (parallel_data)
        TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex));

      // It is not guaranteed that all threads have exited this barrier before
      // we enter the next one. So we will use a different address.
      // We are however guaranteed that this current barrier is finished
      // by the time we exit the next one. So we can then reuse the first
      // address.
      Data->BarrierIndex = (BarrierIndex + 1) % 2;
      break;
    }

    case ompt_sync_region_taskwait: {
      if (Data->execution > 1)
        TsanHappensAfter(Data->GetTaskwaitPtr());
      break;
    }

    case ompt_sync_region_taskgroup: {
      assert(Data->TaskGroup != nullptr &&
             "Should have at least one taskgroup!");

      TsanHappensAfter(Data->TaskGroup->GetPtr());

      // Delete this allocated taskgroup; all descendant tasks are finished by
      // now.
      Taskgroup *Parent = Data->TaskGroup->Parent;
      Data->TaskGroup->Delete();
      Data->TaskGroup = Parent;
      break;
    }

    case ompt_sync_region_reduction:
      // Should not occur according to OpenMP 5.1
      // Tested in OMPT tests
      break;
    }
    break;
  }
}

static void ompt_tsan_reduction(ompt_sync_region_t kind,
                                ompt_scope_endpoint_t endpoint,
                                ompt_data_t *parallel_data,
                                ompt_data_t *task_data,
                                const void *codeptr_ra) {
  switch (endpoint) {
  case ompt_scope_begin:
    switch (kind) {
    case ompt_sync_region_reduction:
      TsanIgnoreWritesBegin();
      break;
    default:
      break;
    }
    break;
  case ompt_scope_end:
    switch (kind) {
    case ompt_sync_region_reduction:
      TsanIgnoreWritesEnd();
      break;
    default:
      break;
    }
    break;
  case ompt_scope_beginend:
    // Should not occur according to OpenMP 5.1
    // Tested in OMPT tests
    // Would have no implications for data race detection
    break;
  }
}

/// OMPT event callbacks for handling tasks.

static void ompt_tsan_task_create(
    ompt_data_t *parent_task_data,    /* id of parent task            */
    const ompt_frame_t *parent_frame, /* frame data for parent task   */
    ompt_data_t *new_task_data,       /* id of created task           */
    int type, int has_dependences,
    const void *codeptr_ra) /* pointer to outlined function */
{
  TaskData *Data;
  assert(new_task_data->ptr == NULL &&
         "Task data should be initialized to NULL");
  if (type & ompt_task_initial) {
    ompt_data_t *parallel_data;
    int team_size = 1;
    ompt_get_parallel_info(0, &parallel_data, &team_size);
    ParallelData *PData = ParallelData::New(nullptr);
    parallel_data->ptr = PData;

    Data = TaskData::New(PData, type);
    new_task_data->ptr = Data;
  } else if (type & ompt_task_undeferred) {
    Data = TaskData::New(ToTaskData(parent_task_data), type);
    new_task_data->ptr = Data;
  } else if (type & ompt_task_explicit || type & ompt_task_target) {
    Data = TaskData::New(ToTaskData(parent_task_data), type);
    new_task_data->ptr = Data;

    // Use the newly created address. We cannot use a single address from the
    // parent because that would declare wrong relationships with other
    // sibling tasks that may be created before this task is started!
    TsanHappensBefore(Data->GetTaskPtr());
    ToTaskData(parent_task_data)->execution++;
  }
}

static void freeTask(TaskData *task) {
  while (task != nullptr && --task->RefCount == 0) {
    TaskData *Parent = task->Parent;
    task->Delete();
    task = Parent;
  }
}
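// Illustrative scenario (assumed): a parent task finishes while one child is
// still pending, so the parent's RefCount drops from 2 to 1 and it stays
// alive. When the child later completes, freeTask(child) drops the child's
// count to 0, deletes it, then walks up and deletes the parent as well.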
static void releaseDependencies(TaskData *task) {
  for (unsigned i = 0; i < task->DependencyCount; i++) {
    task->Dependencies[i].AnnotateEnd();
  }
}

static void acquireDependencies(TaskData *task) {
  for (unsigned i = 0; i < task->DependencyCount; i++) {
    task->Dependencies[i].AnnotateBegin();
  }
}

static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
                                    ompt_task_status_t prior_task_status,
                                    ompt_data_t *second_task_data) {

  //
  //  The necessary action depends on prior_task_status:
  //
  //    ompt_task_early_fulfill = 5,
  //     -> ignored
  //
  //    ompt_task_late_fulfill  = 6,
  //     -> first completed, first freed, second ignored
  //
  //    ompt_task_complete      = 1,
  //    ompt_task_cancel        = 3,
  //     -> first completed, first freed, second starts
  //
  //    ompt_task_detach        = 4,
  //    ompt_task_yield         = 2,
  //    ompt_task_switch        = 7
  //     -> first suspended, second starts
  //

  if (prior_task_status == ompt_task_early_fulfill)
    return;

  TaskData *FromTask = ToTaskData(first_task_data);

  // Legacy handling for missing reduction callback
  if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) {
    // We want to ignore writes in the runtime code during barriers,
    // but not when executing tasks with user code!
    TsanIgnoreWritesEnd();
  }

  // The late fulfill happens after the detached task finished execution
  if (prior_task_status == ompt_task_late_fulfill)
    TsanHappensAfter(FromTask->GetTaskPtr());

  // The task completed execution
  if (prior_task_status == ompt_task_complete ||
      prior_task_status == ompt_task_cancel ||
      prior_task_status == ompt_task_late_fulfill) {
    // Included tasks are executed sequentially, no need to track
    // synchronization
    if (!FromTask->isIncluded()) {
      // Task will finish before a barrier in the surrounding parallel region
      // ...
      ParallelData *PData = FromTask->Team;
      TsanHappensBefore(
          PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex));

      // ... and before an eventual taskwait by the parent thread.
      TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr());

      if (FromTask->TaskGroup != nullptr) {
        // This task is part of a taskgroup, so it will finish before the
        // corresponding taskgroup_end.
        TsanHappensBefore(FromTask->TaskGroup->GetPtr());
      }
    }

    // release dependencies
    releaseDependencies(FromTask);
    // free the previously running task
    freeTask(FromTask);
  }

  // For a late fulfill of a detached task, there is no task to schedule to
  if (prior_task_status == ompt_task_late_fulfill) {
    return;
  }

  TaskData *ToTask = ToTaskData(second_task_data);
  // Legacy handling for missing reduction callback
  if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
    // We re-enter runtime code which currently performs a barrier.
    TsanIgnoreWritesBegin();
  }

  // The task was suspended
  if (prior_task_status == ompt_task_switch ||
      prior_task_status == ompt_task_yield ||
      prior_task_status == ompt_task_detach) {
    // Task may be resumed at a later point in time.
    TsanHappensBefore(FromTask->GetTaskPtr());
    ToTask->ImplicitTask = FromTask->ImplicitTask;
    assert(ToTask->ImplicitTask != NULL &&
           "A task belongs to a team and has an implicit task on the stack");
  }

  // Handle dependencies on first execution of the task
  if (ToTask->execution == 0) {
    ToTask->execution++;
    acquireDependencies(ToTask);
  }
  // 1. Task will begin execution after it has been created.
  // 2. Task will resume after it has been switched away.
  TsanHappensAfter(ToTask->GetTaskPtr());
}

static void ompt_tsan_dependences(ompt_data_t *task_data,
                                  const ompt_dependence_t *deps, int ndeps) {
  if (ndeps > 0) {
    // Copy the data to use it in task_switch and task_end.
    TaskData *Data = ToTaskData(task_data);
    if (!Data->Parent->DependencyMap)
      Data->Parent->DependencyMap =
          new std::unordered_map<void *, DependencyData *>();
    Data->Dependencies =
        (TaskDependency *)malloc(sizeof(TaskDependency) * ndeps);
    Data->DependencyCount = ndeps;
    for (int i = 0; i < ndeps; i++) {
      auto ret = Data->Parent->DependencyMap->insert(
          std::make_pair(deps[i].variable.ptr, nullptr));
      if (ret.second) {
        ret.first->second = DependencyData::New();
      }
      new ((void *)(Data->Dependencies + i))
          TaskDependency(ret.first->second, deps[i].dependence_type);
    }

    // This callback is executed before this task is first started.
    TsanHappensBefore(Data->GetTaskPtr());
  }
}

/// OMPT event callbacks for handling locking.
static void ompt_tsan_mutex_acquired(ompt_mutex_t kind, ompt_wait_id_t wait_id,
                                     const void *codeptr_ra) {

  // Acquire our own lock to make sure that
  // 1. the previous release has finished.
  // 2. the next acquire doesn't start before we have finished our release.
  LocksMutex.lock();
  std::mutex &Lock = Locks[wait_id];
  LocksMutex.unlock();

  Lock.lock();
  TsanHappensAfter(&Lock);
}

static void ompt_tsan_mutex_released(ompt_mutex_t kind, ompt_wait_id_t wait_id,
                                     const void *codeptr_ra) {
  LocksMutex.lock();
  std::mutex &Lock = Locks[wait_id];
  LocksMutex.unlock();
  TsanHappensBefore(&Lock);

  Lock.unlock();
}
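// Illustrative mapping (assumed): an omp_set_lock()/omp_unset_lock() pair in
// user code arrives here with the lock's wait_id. The shadow std::mutex kept
// in Locks serializes the release and the next acquisition, so the TSan
// annotations on &Lock link the releasing thread to the acquiring one.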
// callback, signature, variable to store result, required support level
#define SET_OPTIONAL_CALLBACK_T(event, type, result, level)                    \
  do {                                                                         \
    ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event;                \
    result = ompt_set_callback(ompt_callback_##event,                          \
                               (ompt_callback_t)tsan_##event);                 \
    if (result < level)                                                        \
      printf("Registered callback '" #event "' is not supported at " #level    \
             " (%i)\n",                                                        \
             result);                                                          \
  } while (0)

#define SET_CALLBACK_T(event, type)                                            \
  do {                                                                         \
    int res;                                                                   \
    SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always);                \
  } while (0)

#define SET_CALLBACK(event) SET_CALLBACK_T(event, event)
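// For illustration, SET_CALLBACK(parallel_begin) expands roughly to:
//   ompt_callback_parallel_begin_t tsan_parallel_begin =
//       &ompt_tsan_parallel_begin;
//   res = ompt_set_callback(ompt_callback_parallel_begin,
//                           (ompt_callback_t)tsan_parallel_begin);
// with a warning printed if the runtime reports less than ompt_set_always.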
static int ompt_tsan_initialize(ompt_function_lookup_t lookup, int device_num,
                                ompt_data_t *tool_data) {
  const char *options = getenv("TSAN_OPTIONS");
  TsanFlags tsan_flags(options);

  ompt_set_callback_t ompt_set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  if (ompt_set_callback == NULL) {
    std::cerr << "Could not set callback, exiting..." << std::endl;
    std::exit(1);
  }
  ompt_get_parallel_info =
      (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
  ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data");

  if (ompt_get_parallel_info == NULL) {
    fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', "
                    "exiting...\n");
    exit(1);
  }

#if (defined __APPLE__ && defined __MACH__)
#define findTsanFunction(f, fSig)                                              \
  do {                                                                         \
    if (NULL == (f = fSig dlsym(RTLD_DEFAULT, #f)))                            \
      printf("Unable to find TSan function " #f ".\n");                        \
  } while (0)

  findTsanFunction(AnnotateHappensAfter,
                   (void (*)(const char *, int, const volatile void *)));
  findTsanFunction(AnnotateHappensBefore,
                   (void (*)(const char *, int, const volatile void *)));
  findTsanFunction(AnnotateIgnoreWritesBegin, (void (*)(const char *, int)));
  findTsanFunction(AnnotateIgnoreWritesEnd, (void (*)(const char *, int)));
  findTsanFunction(
      AnnotateNewMemory,
      (void (*)(const char *, int, const volatile void *, size_t)));
  findTsanFunction(__tsan_func_entry, (void (*)(const void *)));
  findTsanFunction(__tsan_func_exit, (void (*)(void)));
#endif

  SET_CALLBACK(thread_begin);
  SET_CALLBACK(thread_end);
  SET_CALLBACK(parallel_begin);
  SET_CALLBACK(implicit_task);
  SET_CALLBACK(sync_region);
  SET_CALLBACK(parallel_end);

  SET_CALLBACK(task_create);
  SET_CALLBACK(task_schedule);
  SET_CALLBACK(dependences);

  SET_CALLBACK_T(mutex_acquired, mutex);
  SET_CALLBACK_T(mutex_released, mutex);
  SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback,
                          ompt_set_never);

  if (!tsan_flags.ignore_noninstrumented_modules)
    fprintf(stderr,
            "Warning: please export "
            "TSAN_OPTIONS='ignore_noninstrumented_modules=1' "
            "to avoid false positive reports from the OpenMP runtime!\n");
  if (archer_flags->ignore_serial)
    TsanIgnoreWritesBegin();

  return 1; // success
}

static void ompt_tsan_finalize(ompt_data_t *tool_data) {
  if (archer_flags->ignore_serial)
    TsanIgnoreWritesEnd();
  if (archer_flags->print_max_rss) {
    struct rusage end;
    getrusage(RUSAGE_SELF, &end);
    printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
  }

  if (archer_flags)
    delete archer_flags;
}

extern "C" ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
  const char *options = getenv("ARCHER_OPTIONS");
  archer_flags = new ArcherFlags(options);
  if (!archer_flags->enabled) {
    if (archer_flags->verbose)
      std::cout << "Archer disabled, stopping operation" << std::endl;
    delete archer_flags;
    return NULL;
  }

  pagesize = getpagesize();

  static ompt_start_tool_result_t ompt_start_tool_result = {
      &ompt_tsan_initialize, &ompt_tsan_finalize, {0}};

  // The OMPT start-up code uses dlopen with RTLD_LAZY. Therefore, we cannot
  // rely on dlopen to fail if TSan is missing, but would get a runtime error
  // on the first TSan call. We use RunningOnValgrind to detect whether an
  // implementation of the annotation interface is available in this
  // execution, and otherwise disable the tool (by returning NULL).

  runOnTsan = 1;
  RunningOnValgrind();
  if (!runOnTsan) // if we are not running on TSan, give a different tool the
                  // chance to be loaded
  {
    if (archer_flags->verbose)
      std::cout << "Archer detected OpenMP application without TSan; "
                   "stopping operation"
                << std::endl;
    delete archer_flags;
    return NULL;
  }

  if (archer_flags->verbose)
    std::cout << "Archer detected OpenMP application with TSan, supplying "
                 "OpenMP synchronization semantics"
              << std::endl;
  return &ompt_start_tool_result;
}