1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *   http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied.  See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 /*!
21  * \file profiler.h
22  * \brief implements profiler
23  */
24 #ifndef MXNET_PROFILER_PROFILER_H_
25 #define MXNET_PROFILER_PROFILER_H_
26 
27 #include <dmlc/concurrentqueue.h>
28 #include <dmlc/thread_group.h>
29 #include <vector>
30 #include <string>
31 #include <cstdint>
32 #include <mutex>
33 #include <memory>
34 #include <array>
35 #include "./vtune.h"
36 #include "./aggregate_stats.h"
37 #include "./nvtx.h"
38 #include "../common/utils.h"
39 
40 
41 namespace mxnet {
42 namespace profiler {
43 
44 
45 
46 /*!
47  * \brief Constant-sized character array class with simple string API to avoid allocations
48  * \tparam string_size Maximum size of the string (including zero-terminator)
49  */
50 template<size_t string_size>
51 struct static_string {
static_stringstatic_string52   inline static_string() { string_[0] = '\0'; }
static_stringstatic_string53   inline explicit static_string(const char *s) { set(s); }
c_strstatic_string54   inline const char *c_str() const { return &string_[0]; }
setstatic_string55   inline void set(const char *s) {
56     strncpy(&string_[0], s, string_size - 1);
57     string_[string_size - 1] = '\0';
58   }
appendstatic_string59   inline void append(const char *s) {
60     const size_t l = strlen(&string_[0]);
61     if (l < string_size - 1) {
62       strncpy(&string_[0] + l, s, string_size - l - 1);
63       string_[string_size - 1] = '\0';
64     }
65   }
66  private:
67   /*! \brief The actual character array */
68   std::array<char, string_size> string_;
69 };
70 
71 using profile_stat_string = static_string<128>;
72 
73 /*!
74  * \brief Base profile statistic structure
75  */
76 struct ProfileStat {
77   /*!
78    * \brief Event type as used for chrome://tracing support
79    * \note Tracing formats: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview NOLINT(*)
80    */
81   enum EventType {
82     kDurationBegin = 'B',
83     kDurationEnd = 'E',
84     kComplete = 'X',
85     kInstant = 'i',
86     kCounter = 'C',
87     kAsyncNestableStart = 'b',
88     kAsyncNestableInstant = 'n',
89     kAsyncNestableEnd = 'e',
90     kFlowStart = 's',
91     kFlowStep = 't',
92     kFlowEnd = 'f',
93     kSample = 'P',
94     kObjectCreated = 'N',
95     kObjectSnapshot = 'O',
96     kObjectDestroyed = 'D',
97     kMetadata = 'M',
98     kMemoryDumpGlobal = 'V',
99     kMemoryDumpProcess = 'v',
100     kMark = 'R',
101     kClockSync = 'c',
102     kContextEnter = '(',
103     kContextLeave = ')'
104   };
105 
106   struct SubEvent {
107     /*! \brief whether this sub-event object is enabled */
108     bool enabled_ = false;
109     /*! \brief Type of the sub-event */
110     EventType event_type_;
111     /*! \brief Timestamp of sub-event */
112     uint64_t timestamp_;
113   };
114 
115   /*! \brief operation name */
116   profile_stat_string name_;
117 
118   /*! \brief operation categories (comma-delimited) */
119   profile_stat_string categories_;
120 
121   /*! \brief whether to add this stat to AggregateStats */
122   bool enable_aggregate_ = true;
123 
124   /* !\brief Process id */
125   size_t process_id_ = common::current_process_id();
126 
127   /*! \brief id of thread which operation run on.
128    *
129    * */
130   std::thread::id thread_id_ = std::this_thread::get_id();  // Not yet seen a
131                                                             // case where this isn't valid
132 
133   /*! \brief Sub-events (ie begin, end, etc.) */
134   SubEvent items_[3];  // Don't use vector in order to avoid memory allocation
135 
136   /*!
137    * \brief Get current tick count in microseconds
138    * \return Current arbitrary tick count in microseconds
139    */
NowInMicrosecProfileStat140   static inline uint64_t NowInMicrosec() {
141 #if defined(_MSC_VER) && _MSC_VER <= 1800
142     LARGE_INTEGER frequency, counter;
143     QueryPerformanceFrequency(&frequency);
144     QueryPerformanceCounter(&counter);
145     return counter.QuadPart * 1000000 / frequency.QuadPart;
146 #else
147     return std::chrono::duration_cast<std::chrono::microseconds>(
148       std::chrono::high_resolution_clock::now().time_since_epoch()).count();
149 #endif
150   }
151 
152   /*!
153    * \brief Print event statistics in json format to the supplied output stream
154    * \param os Output stream to write the data
155    * \note Emits all sub-even statistics
156    */
EmitEventsProfileStat157   void EmitEvents(std::ostream *os) {
158     size_t count = 0;
159     for (size_t i = 0; i < sizeof(items_) / sizeof(items_[0]); ++i) {
160       if (items_[i].enabled_) {
161         if (count) {
162           *os << ",\n";
163         }
164         EmitSubEvent(os, i);
165         ++count;
166       }
167     }
168   }
169 
170   /*!
171    * \brief Virtual destructor
172    */
~ProfileStatProfileStat173   virtual ~ProfileStat() {}
174 
175   /*!
176    * \brief Save aggregate data for this stat
177    * \param data Stat data
178    */
SaveAggregateProfileStat179   virtual void SaveAggregate(AggregateStats::StatData *data) const {
180     if (data) {
181       data->type_ = AggregateStats::StatData::kOther;
182     }
183   }
184 
185  protected:
186   /*!
187    * \brief Override to emit extra items within the json event data block. Append with a comma ",".
188    * \param os Output stream to write data to
189    * \param idx Sub-even index (index into items_) to write
190    */
EmitExtraProfileStat191   virtual void EmitExtra(std::ostream *os, size_t idx) {}
192 
193   /*!
194    * \brief Emit sub-event statistics
195    * \param os Output stream
196    * \param idx Sub-even index (index into items_) to write
197    */
EmitSubEventProfileStat198   void EmitSubEvent(std::ostream *os, size_t idx) {
199     const SubEvent &ev = items_[idx];
200     if (ev.enabled_) {
201       *os << "    {\n"
202           << "        \"name\": \"" << name_.c_str() << "\",\n"
203           << "        \"cat\": " << "\"" << categories_.c_str() << "\",\n"
204           << "        \"ph\": \"" << static_cast<char>(ev.event_type_) << "\",\n"
205           << "        \"ts\": " << ev.timestamp_ << ",\n";
206       EmitExtra(os, idx);
207       *os << "        \"pid\": " << process_id_ << ",\n"
208           << "        \"tid\": " << std::hash<std::thread::id>{}(thread_id_) << "\n"
209           << "    }\n";
210     }
211   }
212 };
213 
214 /*!
215  * \brief Device statistics
216  */
217 struct DeviceStats {
218   using TQueue = dmlc::moodycamel::ConcurrentQueue<ProfileStat *>;
219   /*!
220    * \brief Destructor, clean up allocated objects
221    */
~DeviceStatsDeviceStats222   ~DeviceStats() {
223     std::shared_ptr<TQueue> es = opr_exec_stats_;
224     if (es) {
225       ProfileStat *stat = nullptr;
226       while (es->try_dequeue(stat)) {
227         delete stat;
228       }
229     }
230   }
231 
232   /*! \brief device name */
233   std::string dev_name_;
234   /*! \brief operation execution statistics on this device */
235   std::shared_ptr<TQueue> opr_exec_stats_ = std::make_shared<TQueue>();
236 };
237 
238 /*!
239  *  _____              __  _  _
240  * |  __ \            / _|(_)| |
241  * | |__) |_ __  ___ | |_  _ | | ___  _ __
242  * |  ___/| '__|/ _ \|  _|| || |/ _ \| '__|
243  * | |    | |  | (_) | |  | || |  __/| |
244  * |_|    |_|   \___/|_|  |_||_|\___||_|
245  *
246  * \brief profiler that records the operation execution information
247  *        and saves the profile statistics.
248  * \note Profiler class doesn't know anything about VTune
249  */
250 class Profiler {
251  public:
252   enum ProfilerMode {
253       kSymbolic = 1,
254       kImperative = 2,
255       kAPI = 4,
256       kMemory = 8
257   };
258   enum ProfilerState {
259       kNotRunning = 0,
260       kRunning = 1
261   };
262 
263   /*! \brief set state of profiler */
264   void SetState(ProfilerState state);
265   /*! \return state of profiler */
GetState()266   inline ProfilerState GetState() const {
267     return this->state_;
268   }
269   /*!
270    * \brief set profiler configuration
271    * \param mode flags, one or more of 'ProfilerMode'
272    * \param output_filename profile output file name
273    * \param continuous_dump true if profile information should be periodically dumped
274    * \param dump_period Period (in seconds) of profile info dumping
275    */
276   void SetConfig(int mode, std::string output_filename,
277                  bool continuous_dump,
278                  float dump_period,
279                  bool aggregate_stats);
280 
281   /*! \return mode of profiler */
GetMode()282   inline int GetMode() const {
283     return this->mode_;
284   }
285 
IsProfiling(const ProfilerMode pm)286   inline bool IsProfiling(const ProfilerMode pm) const {
287     return GetState() == kRunning && (GetMode() & pm) == pm;
288   }
289 
290   /*! \return whether the profiler is enabled to output */
IsEnableOutput()291   inline bool IsEnableOutput() const {
292     return this->enable_output_;
293   }
294   /*!
295    * \brief dump the profile file
296    * \param perform_cleanup Close off the json trace structures (ie last pass)
297    */
298   void DumpProfile(bool perform_cleanup = true);
299 
300   /*! \return the profiler init time, time unit is microsecond (10^-6) s */
GetInitTime()301   uint64_t MSHADOW_CINLINE GetInitTime() const {
302     return init_time_;
303   }
304   /*!
305    * \brief add one operation execution record in corresponding device statistics
306    * \tparam SetExtraInfoFunction
307    * \param dev_type
308    * \param dev_id
309    * \param set_extra_info_function
310    * \note Because when this function exits, the object is written to the profile queue,
311    *       and at that point, could be consumed and/or destroyed at any moment,
312    *       any preprocessing on the object is to be done in the set_extra_info_function
313    *       callback.  Another option is to use the CreateProfileStat()/AddProfileStat() pair,
314    *       adding it only after
315    */
316   template<typename StatType, typename SetExtraInfoFunction, typename ...Args>
AddNewProfileStat(SetExtraInfoFunction set_extra_info_function,Args...args)317   void AddNewProfileStat(SetExtraInfoFunction set_extra_info_function, Args... args) {
318     if (!paused_) {
319       std::unique_ptr<StatType> stat = CreateProfileStat<StatType>(args...);
320       set_extra_info_function(stat.get());
321       AddProfileStat(&stat);
322     }
323   }
324 
325   /*!
326    * \brief Return aggregate statistic accumulator
327    * \return shared pointer to the 'ProfileStats' aggregate statistic accumulator
328    */
GetAggregateStats()329   std::shared_ptr<AggregateStats> GetAggregateStats() const {
330     return aggregate_stats_;
331   }
332 
333   /*!
334    * \brief Get a pointer to the Profiler singleton
335    * \return Profiler singleton
336    * \param sp Profiler shared pointer, only use for singleton ownership
337    */
338   static Profiler* Get(std::shared_ptr<Profiler> *sp = nullptr);
339 
340   /*!
341    * \brief Set whether statistic collection is to be paused
342    * \param paused true if statistic collection is to be paused, otherwise
343    * resume statistic collection
344    * \note Pause/Resume is not recursive
345    */
set_paused(bool paused)346   void set_paused(bool paused) { paused_ = paused; }
347 
348   /*!
349    * \brief Get the calculated device count (numb er of devices to track in profile data).
350    * \return Device count
351    * \note Number of CPU's + Number of GPU's + One for CPU-Pinned
352    */
DeviceCount()353   size_t DeviceCount() const { return cpu_num_ + gpu_num_ + 2; }
354 
355   /*!
356    * \brief Compute device index given device type and id
357    * \param dev_type Device type
358    * \param dev_id Device ID
359    * \return Device index for indexing into device-specific data
360    */
361   size_t DeviceIndex(mxnet::Context::DeviceType dev_type, int32_t dev_id);
362 
363   /*!
364    * \brief Device name
365    * \param dev_type Device type
366    * \param dev_id Device ID
367    * \return Character pointer to device name
368    */
369   const char *DeviceName(mxnet::Context::DeviceType dev_type, int32_t dev_id);
370 
371 
372   /*!
373    * \brief Device name
374    * \param dev_type Device type
375    * \param dev_id Device ID
376    * \return Character pointer to device name
377    */
378   const char *DeviceName(const size_t index);
379 
380   /*!
381    * \brief Whether aggregate stats are being collected
382    * \return true if aggregate stats are being collected
383    */
AggregateEnabled()384   inline bool AggregateEnabled() const {
385     return aggregate_stats_.get() != nullptr;
386   }
387 
388   /*!
389    * \brief Whether aggregate stats are currently being recorded
390    * \return true if aggregate stats are currently being recorded
391    */
AggregateRunning()392   inline bool AggregateRunning() const {
393     return GetState() == kRunning && AggregateEnabled();
394   }
395 
396  public:
397   /*!
398    * \brief Constructor
399    */
400   Profiler();
401 
402   /*!
403    * \brief Destructor
404    */
405   virtual ~Profiler();
406 
407  private:
408   /*!
409    * \brief Create a new profile statistic object
410    * \tparam StatType The type of the profile statistic object
411    * \tparam Args Argument types to pass to the new object's constructor
412    * \param args Arguments to pass to the new object's constructor
413    * \return A unique_ptr to the new statistic object
414    */
415   template<typename StatType, typename ...Args>
416   static std::unique_ptr<typename std::enable_if<std::is_base_of<ProfileStat, StatType>::value,
CreateProfileStat(Args...args)417     StatType>::type> CreateProfileStat(Args... args) {
418     return std::unique_ptr<StatType>(new StatType(args...));
419   }
420 
421   /*!
422    * \brief Add a general profile statistic object
423    * \tparam StatType Type of the statistic object
424    * \param stat The statistic object
425    */
426   template<typename StatType>
AddProfileStat(std::unique_ptr<StatType> * stat)427   inline void AddProfileStat(std::unique_ptr<StatType> *stat) {
428     general_stats_.opr_exec_stats_->enqueue(stat->release());
429   }
430 
431   /*! \brief generate device information following chrome profile file format */
432   void EmitPid(std::ostream *os, const std::string& name, size_t pid);
433 
434   /*!
435    * \brief Set continuous asynchronous profile dump
436    * \param continuous_dump Whether to continuously dump profile information
437    * \param delay_in_seconds Delay between asynchronous dumps
438    */
439   void SetContinuousProfileDump(bool continuous_dump, float delay_in_seconds);
440 
441   /*! \brief internal mutex of the profiler */
442   std::recursive_mutex m_;
443   /*! \brief indicate whether the profiler is running */
444   ProfilerState state_;
445   /*! \brief once running, enable profiler to output */
446   volatile bool enable_output_;
447   /*! \brief indicate what operator the profiler will record */
448   int mode_ = kSymbolic | kAPI | kMemory;
449   /*! \brief filename to output profile file */
450   std::string filename_ = "profile.json";
451   /*! \brief profile statistics consist of multiple device statistics */
452   std::unique_ptr<DeviceStats[]> profile_stat;
453   /*! \brief Stats not associated directly with a device */
454   DeviceStats  general_stats_;
455   /*! \brief Map category -> pid */
456   std::unordered_map<std::string, size_t> category_to_pid_;
457   /*! \brief cpu number on the machine */
458   unsigned int cpu_num_;
459   /*! \brief gpu number on the machine */
460   unsigned int gpu_num_;
461   /*! \brief the profiler init time */
462   uint64_t init_time_;
463   /*! \brief Continuously dump profile info */
464   volatile bool continuous_dump_ = false;
465   /*! \brief Number of non-meta profiling record emitted */
466   volatile uint64_t num_records_emitted_ = 0;
467   /*! \brief Number of times profile was dumped */
468   volatile uint64_t profile_dump_count_;
469   /*! \brief Whether profiling is paused */
470   volatile bool paused_ = false;
471   /*! \brief Maintain in-memory aggregate stats for print output.
472    *  \warning This has a negative performance impact */
473   std::shared_ptr<AggregateStats> aggregate_stats_ = nullptr;
474   /*! \brief Asynchronous operation thread lifecycle control object */
475   std::shared_ptr<dmlc::ThreadGroup> thread_group_ = std::make_shared<dmlc::ThreadGroup>();
476   /* !\brief pids */
477   std::unordered_set<uint32_t> process_ids_;
478 };
479 
480 #ifdef MXNET_USE_VTUNE
481 #define VTUNE_ONLY_CODE(...) __VA_ARGS__  /* This is undefined at the bottom of this file */
482 #else
483 #define VTUNE_ONLY_CODE(...) /* */        /* This is undefined at the bottom of this file */
484 #endif
485 
486 #ifdef MXNET_USE_NVTX
487 #define NVTX_ONLY_CODE(...) __VA_ARGS__  /* This is undefined at the bottom of this file */
488 #else
489 #define NVTX_ONLY_CODE(...) /* */        /* This is undefined at the bottom of this file */
490 #endif
491 
492 /**
493  *  _____              __  _  _  _                ____  _     _            _
494  * |  __ \            / _|(_)| |(_)              / __ \| |   (_)          | |
495  * | |__) |_ __  ___ | |_  _ | | _ _ __   __ _  | |  | | |__  _  ___   ___| |_  ___
496  * |  ___/| '__|/ _ \|  _|| || || | '_ \ / _` | | |  | | '_ \| |/ _ \ / __| __|/ __|
497  * | |    | |  | (_) | |  | || || | | | | (_| | | |__| | |_) | |  __/| (__| |_ \__ \
498  * |_|    |_|   \___/|_|  |_||_||_|_| |_|\__, |  \____/|_.__/| |\___| \___|\__||___/
499  *                                        __/ |             _/ |
500  *                                       |___/             |__/
501  */
502 
503 enum ProfileObjectType {
504   kDomain,
505   kCounter,
506   kTask,
507   kEvent,
508   kFrame
509 };
510 
511 class ProfileObject {
512  public:
513   /*!
514    * \brief Virtual destructor for child classes
515    */
~ProfileObject()516   virtual ~ProfileObject() {}
517   /*!
518    * \brief Return profiling object object type (i.e. kTask, kEvent, ...)
519    * \return Profiling object type
520    */
521   virtual ProfileObjectType type() const = 0;
522 };
523 
524 /*!
525  * \brief Tuning domain. Used in VTune to separate similar tasks/counters/etc. (for example).
526  *        For chrome tracing, generally maps to category.
527  */
528 struct ProfileDomain : public ProfileObject {
529   /*!:(
530    * \brief Constructor
531    * \param name Name of the domain
532    */
ProfileDomainProfileDomain533   explicit ProfileDomain(const char *name) noexcept
534     : name_(name) {
535     CHECK_NOTNULL(name);
536     CHECK_NE(name[0], '\0');
537     VTUNE_ONLY_CODE(vtune_domain_.reset(new vtune::VTuneDomain(name)));
538   }
539   /*!
540    * \brief Get domain name
541    * \return Domain name
542    */
nameProfileDomain543   const char *name() const { return name_.c_str(); }
typeProfileDomain544   ProfileObjectType type() const override { return kDomain; }
545   VTUNE_ONLY_CODE(inline vtune::VTuneDomain *dom() { return vtune_domain_.get(); });
546  private:
547   /*! \brief Name of the domain */
548   profile_stat_string name_;
549   /*! \brief VTune domain object */
550   VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneDomain> vtune_domain_);
551 };
552 
553 /*!
554  * \brief Counter object and statistic item
555  */
556 struct ProfileCounter : public ProfileObject {
557   /*!
558    * \brief Constructor
559    * \param name Counter name
560    * \param domain Counter domain
561    */
ProfileCounterProfileCounter562   ProfileCounter(const char *name, ProfileDomain *domain) noexcept
563     : name_(name)
564       , domain_(domain)
565       , value_(0) {
566     CHECK_NOTNULL(domain);
567     VTUNE_ONLY_CODE(vtune_.reset(new vtune::VTuneCounter(name, domain->dom())));
568   }
~ProfileCounterProfileCounter569   ~ProfileCounter() {}
570   /*! \brief operator: ++object */
571   inline uint64_t operator ++() {
572     return IncrementValue(1);
573   }
574   /*! \brief operator: object++ */
575   inline uint64_t operator ++(int) {
576     const uint64_t old = value_;
577     IncrementValue(1);
578     return old;
579   }
580   /*! \brief operator: --object */
581   inline uint64_t operator --() {
582     CHECK_GT(value_, 0);
583     return DecrementValue(1);
584   }
585   /*! \brief operator: object-- */
586   inline uint64_t operator --(int) {
587     CHECK_GT(value_, 0);
588     const uint64_t old = value_;
589     DecrementValue(1);
590     return old;
591   }
592   /*! \brief operator: object += v */
593   inline uint64_t operator +=(int64_t v) {
594     if (v >= 0) {
595       return IncrementValue(static_cast<uint64_t>(v));
596     } else {
597       v = -v;
598       return DecrementValue(static_cast<uint64_t>(v));
599     }
600   }
601   /*! \brief operator: object -= v */
602   inline uint64_t operator -=(int64_t v) {
603     CHECK_GE(value_, v);
604     if (v >= 0) {
605       return DecrementValue(static_cast<uint64_t>(v));
606     } else {
607       v = -v;
608       return IncrementValue(static_cast<uint64_t>(v));
609     }
610   }
611 
612   inline bool operator >=(int64_t v) {
613       CHECK_GE(v, 0);
614       return value_ >= static_cast<uint64_t>(v);
615   }
616 
617   /*! \brief operator: object = v */
618   inline ProfileCounter& operator = (uint64_t v) {
619     SetValue(v);
620     return *this;
621   }
622 
typeProfileCounter623   ProfileObjectType type() const override { return kCounter; }
624 
625  protected:
626   /*!
627    * \brief Count statistic object
628    */
629   struct ProfileCounterStat : public ProfileStat {
630     uint64_t value_;
ProfileCounterStatProfileCounter::ProfileCounterStat631     explicit ProfileCounterStat(const char *name, uint64_t value) : value_(value) {
632       items_[0].enabled_ = true;
633       items_[0].event_type_ = kCounter;
634       items_->timestamp_ = NowInMicrosec();
635       name_.set(name);
636     }
637 
638     /*!
639      * \brief Emit counter value as extra data for this statistic type
640      * \param os Output stream to write data to
641      * \param idx Sub-even index (index into items_) to write
642      */
EmitExtraProfileCounter::ProfileCounterStat643     void EmitExtra(std::ostream *os, size_t idx) override {
644       ProfileStat::EmitExtra(os, idx);
645       *os << "        \"args\": { \"" << name_.c_str() << "\": " << value_ << " },\n";
646     }
647 
648     /*!
649      * \brief Save aggregate data for this stat
650      * \param data Stat data
651      */
SaveAggregateProfileCounter::ProfileCounterStat652     void SaveAggregate(AggregateStats::StatData *data) const override {
653       if (data) {
654         data->type_ = AggregateStats::StatData::kCounter;
655         ++data->total_count_;
656         data->total_aggregate_ = value_;
657         if (value_ > data->max_aggregate_) {
658           data->max_aggregate_ = value_;
659         }
660         if (value_ < data->min_aggregate_) {
661           data->min_aggregate_ = value_;
662         }
663       }
664     }
665   };
666 
667  private:
668   /*!
669    * \brief Send this object's statistical datapoint to the profiler
670    */
SendStatProfileCounter671   inline void SendStat(uint64_t value) {
672     Profiler::Get()->AddNewProfileStat<ProfileCounterStat>([this](ProfileCounterStat *stat) {
673                                                              stat->categories_.set(domain_->name());
674                                                            },
675                                                            name_.c_str(),
676                                                            value);
677   }
678 
679   /*!
680    * \brief Set counter value
681    * \param val Value to set the counter
682    */
SetValueProfileCounter683   inline void SetValue(uint64_t val) {
684     VTUNE_ONLY_CODE(*vtune_ = val);
685     value_ = val;
686     SendStat(val);
687   }
688 
689   /*!
690    * \brief Adjust counter value by amount given
691    * \param value_change Value to change the counter by
692    */
IncrementValueProfileCounter693   inline uint64_t IncrementValue(uint64_t value_change) {
694     VTUNE_ONLY_CODE(*vtune_ += value_change);
695     const uint64_t v = (value_ += value_change);
696     SendStat(v);
697     return v;
698   }
699 
DecrementValueProfileCounter700   inline uint64_t DecrementValue(uint64_t value_change) {
701     VTUNE_ONLY_CODE(*vtune_ -= value_change);
702     const uint64_t v = (value_ -= value_change);
703     SendStat(v);
704     return v;
705   }
706 
707   /*! \brief Name of the counter */
708   profile_stat_string name_;
709   /*! \brief Domain of the counter */
710   ProfileDomain *domain_;
711   /*! \brief Value of the counter */
712   std::atomic<uint64_t>  value_;
713   /*! \brief VTune counter object */
714   VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneCounter> vtune_);
715 };
716 
717 class ProfileDuration : public ProfileObject {
718  public:
719   virtual void start() = 0;
720   virtual void stop() = 0;
721 
722  protected:
723   /*!
724    * \brief Basic duration statistic (start time, stop time)
725    */
726   struct DurationStat : public ProfileStat {
727     enum DurationStatIndex {
728       kStart, kStop
729     };
730     /*!
731      * \brief Constructor
732      * \param begin_event Event type for start point (default is kDurationBegin)
733      * \param end_event Event type for stop point (default is kDurationEnd)
734      */
735     DurationStat(ProfileStat::EventType begin_event = ProfileStat::kDurationBegin,
736                  ProfileStat::EventType end_event = ProfileStat::kDurationEnd) {
737       items_[kStart].enabled_ = items_[kStop].enabled_ = true;
738       items_[kStart].event_type_ = begin_event;
739       items_[kStop].event_type_ = end_event;
740     }
741 
742     /*!
743      * \brief Save aggregate data for this stat
744      * \param data Stat data
745      */
SaveAggregateDurationStat746     void SaveAggregate(AggregateStats::StatData *data) const override {
747       if (data) {
748         data->type_ = AggregateStats::StatData::kDuration;
749         ++data->total_count_;
750         CHECK_GE(items_[kStop].timestamp_, items_[kStart].timestamp_);
751         const uint64_t duration = items_[kStop].timestamp_ - items_[kStart].timestamp_;
752         data->total_aggregate_ += duration;
753         if (duration > data->max_aggregate_) {
754           data->max_aggregate_ = duration;
755         }
756         if (duration < data->min_aggregate_) {
757           data->min_aggregate_ = duration;
758         }
759       }
760     }
761   };
762 };
763 
764 /*!
765  * \brief Task - Thread-granular nestable time block
766  */
767 struct ProfileTask : public ProfileDuration {
768   /*!
769    * \brief Constructor
770    * \param name Name of the task
771    * \param domain Domain of the task
772    */
ProfileTaskProfileTask773   ProfileTask(const char *name, ProfileDomain *domain)
774     : name_(name)
775       , domain_(domain) {
776     CHECK_NOTNULL(domain);
777     categories_.set(domain_->name());
778     categories_.append(",task");
779     VTUNE_ONLY_CODE(vtune_task_.reset(new vtune::VTuneTask(name, domain->dom())));
780     NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
781   }
782 
783   /*!
784    * \brief Set the domain
785    */
setDomainProfileTask786   void setDomain(ProfileDomain* domain) {
787     domain_ = domain;
788   }
789 
790   /*!
791    * \brief Start the profiling scope
792    */
startProfileTask793   void start() override {
794     start_time_ = ProfileStat::NowInMicrosec();
795     VTUNE_ONLY_CODE(vtune_task_->start());
796     NVTX_ONLY_CODE(nvtx_duration_->start());
797   }
798 
799   /*!
800    * \brief Stop the profiling scope
801    */
stopProfileTask802   void stop() override {
803     VTUNE_ONLY_CODE(vtune_task_->stop());
804     NVTX_ONLY_CODE(nvtx_duration_->stop());
805     SendStat();
806   }
807 
typeProfileTask808   ProfileObjectType type() const override { return kTask; }
809 
810   /*!
811    * \brief Whether to add stat to AggregateStats
812    */
813   void enableAggregateStats(bool enabled = true) {
814     enable_aggregate_ = enabled;
815   }
816 
817  protected:
818   /*!
819    * \brief Task statistic object
820    */
821   struct ProfileTaskStat : public DurationStat {
ProfileTaskStatProfileTask::ProfileTaskStat822     explicit ProfileTaskStat(const char *name, uint64_t start_time, uint64_t stop_time)
823       : DurationStat(ProfileStat::kAsyncNestableStart, ProfileStat::kAsyncNestableEnd) {
824       name_.set(name);
825       items_[0].timestamp_ = start_time;
826       items_[1].timestamp_ = stop_time;
827     }
EmitExtraProfileTask::ProfileTaskStat828     void EmitExtra(std::ostream *os, size_t idx) override {
829       DurationStat::EmitExtra(os, idx);
830       *os << "        \"id\": " << std::hash<std::thread::id>{}(thread_id_) << ",\n";
831     }
832   };
833 
834  private:
835   /*!
836    * \brief Send this object's statistical datapoint to the profiler
837    */
SendStatProfileTask838   inline void SendStat() {
839     Profiler::Get()->AddNewProfileStat<ProfileTaskStat>([this](ProfileTaskStat *stat) {
840       stat->categories_.set(domain_->name());
841       stat->enable_aggregate_ = enable_aggregate_;
842     }, name_.c_str(), start_time_, ProfileStat::NowInMicrosec());
843   }
844   /*! \brief Task name */
845   const profile_stat_string  name_;
846   /*! \brief Task categories */
847   profile_stat_string categories_;
848   /*! \brief domain */
849   ProfileDomain *domain_;
850   /*! \brief VTune task object */
851   VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneTask> vtune_task_);
852   /*! \brief NVTX duration object */
853   NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_);
854   /*! \brief whether to add this stat to AggregateStats */
855   bool enable_aggregate_ = true;
856 
857  protected:
858   /*! \brief Task's start tick */
859   uint64_t start_time_;
860 };
861 
862 /*!
863  * \brief Event - Thread-granular time block
864  */
865 struct ProfileEvent  : public ProfileDuration {
866   /*!
867    * \brief Constructor
868    * \param name Name of the event
869    */
ProfileEventProfileEvent870   explicit inline ProfileEvent(const char *name)
871     : name_(name)
872       , categories_("event") {
873     VTUNE_ONLY_CODE(vtune_event_ = vtune::VTuneEvent::registry_.get(name));
874     NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
875   }
876 
877   /*!
878    * \brief Start the profiling scope
879    */
startProfileEvent880   void start() override {
881     start_time_ = ProfileStat::NowInMicrosec();
882     VTUNE_ONLY_CODE(vtune_event_->start());
883     NVTX_ONLY_CODE(nvtx_duration_->start());
884   }
885 
886   /*!
887    * \brief Stop the profiling scope
888    */
stopProfileEvent889   void stop() override {
890     VTUNE_ONLY_CODE(vtune_event_->stop());
891     SendStat();
892   }
893 
894   /*!
895    * \brief Set catagories (used for chrome tracing)
896    * \param categories Comma-delimited categories
897    */
SetCategoriesProfileEvent898   void SetCategories(const char *categories) {
899     categories_.set(categories);
900   }
901 
typeProfileEvent902   ProfileObjectType type() const override { return kEvent; }
903 
904  protected:
905   /*!
906    * \brief Event statistic object
907    */
908   struct ProfileEventStat : public DurationStat {
ProfileEventStatProfileEvent::ProfileEventStat909     explicit ProfileEventStat(const char *name, uint64_t start_time, uint64_t stop_time)
910       : DurationStat(ProfileStat::kDurationBegin, ProfileStat::kDurationEnd) {
911       name_.set(name);
912       items_[0].timestamp_ = start_time;
913       items_[1].timestamp_ = stop_time;
914     }
915   };
916 
917  private:
918   /*!
919    * \brief Send this object's statistical datapoint to the profiler
920    */
SendStatProfileEvent921   virtual void SendStat() {
922     Profiler::Get()->AddNewProfileStat<ProfileEventStat>([this](ProfileEventStat *stat) {
923       stat->categories_.set(categories_.c_str());
924     }, name_.c_str(), start_time_, ProfileStat::NowInMicrosec());
925   }
926   /*! \brief Event name */
927   const profile_stat_string  name_;
928   /*! \brief Event categories (comma-delimited) */
929   profile_stat_string categories_;
930   /*! \brief VTune event object */
931   VTUNE_ONLY_CODE(vtune::VTuneEvent *vtune_event_);
932   /*! \brief NVTX duration object */
933   NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_;);
934 
935  protected:
936   /*! \brief Start time of the event */
937   uint64_t start_time_;
938 };
939 
940 /*!
941  * \brief Frame - Process-granular time block
942  */
943 struct ProfileFrame : public ProfileDuration {
944   /*!
945    * \brief Constructor
946    * \param name Name of the frame
947    * \param domain Domain of the frame
948    */
ProfileFrameProfileFrame949   ProfileFrame(const char *name, ProfileDomain *domain)
950     : name_(name)
951       , domain_(domain) {
952     CHECK_NOTNULL(domain);
953     categories_.set(domain_->name());
954     categories_.append(",frame");
955     NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
956     VTUNE_ONLY_CODE(vtune_frame_.reset(new vtune::VTuneFrame(domain->dom())));
957   }
958 
959   /*!
960    * \brief Start the profiling scope
961    */
startProfileFrame962   void start() override {
963     start_time_ = ProfileStat::NowInMicrosec();
964     VTUNE_ONLY_CODE(vtune_frame_->start());
965     NVTX_ONLY_CODE(nvtx_duration_->start());
966   }
967 
968   /*!
969    * \brief Stop the profiling scope
970    */
stopProfileFrame971   void stop() override {
972     VTUNE_ONLY_CODE(vtune_frame_->stop());
973     SendStat();
974   }
975 
typeProfileFrame976   ProfileObjectType type() const override { return kFrame; }
977 
978  protected:
979   /*!
980    * \brief Frame statistic object
981    */
982   struct ProfileFrameStat : public DurationStat {
ProfileFrameStatProfileFrame::ProfileFrameStat983     explicit ProfileFrameStat(const char *name, uint64_t start_time, uint64_t stop_time)
984       : DurationStat(ProfileStat::kContextEnter, ProfileStat::kContextLeave) {
985       name_.set(name);
986       items_[0].timestamp_ = start_time;
987       items_[1].timestamp_ = stop_time;
988     }
989   };
990 
991  private:
992   /*!
993    * \brief Send this object's statistical datapoint to the profiler
994    */
SendStatProfileFrame995   inline void SendStat() {
996     Profiler::Get()->AddNewProfileStat<ProfileFrameStat>([this](ProfileFrameStat *stat) {
997       stat->categories_.set(categories_.c_str());
998     }, name_.c_str(), start_time_, ProfileStat::NowInMicrosec());
999   }
1000   /*! \brief Frame name */
1001   const profile_stat_string  name_;
1002   /*! \brief Frame categories (comma-delimited) */
1003   profile_stat_string categories_;
1004   /*! \brief Pointer to the domain */
1005   ProfileDomain *domain_;
1006   /*! \brief VTune Frame object */
1007   VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneFrame> vtune_frame_);
1008   /*! \brief NVTX duration object */
1009   NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_);
1010 
1011  protected:
1012   /*! \brief Frame start time */
1013   uint64_t start_time_;
1014 };
1015 
1016 /*!
1017  * \brief Marker - Mark an instance in time
1018  */
1019 struct ProfileMarker {
1020   enum MarkerScope {  // Should equal VTune values
1021     kUnknown, kGlobal, kProcess, kThread, kTask, kMarker
1022   };
1023 
1024   /*!
1025    * \brief Constructor
1026    * \param name Name of the instant marker
1027    * \param domain Domain of the instant marker
1028    * \param scope Scope of the instant marker
1029    * \param nestable true if the instant marker is nestable
1030    */
1031   ProfileMarker(const char *name,
1032                        ProfileDomain *domain,
1033                        const MarkerScope scope,
1034                        bool nestable = true)
name_ProfileMarker1035     : name_(name)
1036       , domain_(domain)
1037       , scope_(scope)
1038       , nestable_(nestable) {
1039     categories_.set(domain_->name());
1040     categories_.append(",instant_marker");
1041     VTUNE_ONLY_CODE(vtune_instant_marker_.reset(
1042       new vtune::VTuneInstantMarker(name, domain->dom(), static_cast<__itt_scope>(scope))));
1043   }
1044 
1045   /*!
1046    * \brief Signal a marker at this instant
1047    */
markProfileMarker1048   void mark() {
1049     VTUNE_ONLY_CODE(vtune_instant_marker_->signal());
1050     SendStat();
1051   }
1052 
1053  protected:
1054   /*!
1055    * \brief Instant-marker statistic object
1056    */
1057   struct ProfileMarkerStat : public ProfileStat {
ProfileMarkerStatProfileMarker::ProfileMarkerStat1058     explicit ProfileMarkerStat(const char *name, const char scope_char, bool nestable)
1059     : scope_char_(scope_char) {
1060       items_[0].enabled_ = true;
1061       items_[0].event_type_ = nestable ? kAsyncNestableInstant : kInstant;
1062       items_->timestamp_ = NowInMicrosec();
1063       name_.set(name);
1064     }
EmitExtraProfileMarker::ProfileMarkerStat1065     virtual void EmitExtra(std::ostream *os, size_t idx) {
1066       ProfileStat::EmitExtra(os, idx);
1067       *os << "        \"s\": \"" << scope_char_ << "\",\n";
1068     }
1069     const char scope_char_;
1070   };
1071 
1072  private:
1073   /*!
1074    * \brief Send this object's statistical datapoint to the profiler
1075    */
SendStatProfileMarker1076   virtual void SendStat() {
1077     Profiler::Get()->AddNewProfileStat<ProfileMarkerStat>([this](ProfileMarkerStat *stat) {
1078       stat->categories_.set(categories_.c_str());
1079     }, name_.c_str(), vtune_scope_to_chrome_scope(scope_), nestable_);
1080   }
1081 
vtune_scope_to_chrome_scopeProfileMarker1082   static char vtune_scope_to_chrome_scope(const MarkerScope scope) {
1083     switch (scope) {
1084       case kThread:
1085         return 't';
1086       case kGlobal:
1087         return 'g';
1088       case kProcess:
1089       case kUnknown:
1090       case kTask:
1091       case kMarker:
1092       default:
1093         return 'p';
1094     }
1095   }
1096 
1097   /*! \brief Name of the instant marker */
1098   const profile_stat_string name_;
1099   /*! \brief Categories of the instant marker (comma-delimited) */
1100   profile_stat_string categories_;
1101   /*! \brief Pointer to the domain of this instant marker */
1102   ProfileDomain *domain_;
1103   /*! \brief VTune scope */
1104   const MarkerScope scope_;
1105   /*! \brief Whether this marker is nestabe */
1106   const bool nestable_;
1107   /*! \brief VTune instant marker object */
1108   VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneInstantMarker> vtune_instant_marker_);
1109 };
1110 
1111 static ProfileDomain custom_op_domain("Custom Operator");
1112 
1113 /*!
1114  * \brief Operator profiler object. Logs as both an independent event and a task in
1115  * the operator domain
1116  */
1117 struct ProfileOperator : public ProfileEvent {
1118   /*!
1119    * \brief Operator attributes
1120    */
1121   struct Attributes {
1122     std::vector<mxnet::TShape> inputs_;
1123     std::vector<mxnet::TShape> outputs_;
1124     std::unordered_map<std::string, std::string> attr_;
to_stringProfileOperator::Attributes1125     std::string to_string() const {
1126       std::stringstream ss;
1127       if (!inputs_.empty()) {
1128         ss << "in: [";
1129         for (size_t i = 0, n = inputs_.size(); i < n; ++i) {
1130           if (i) {
1131             ss << ",";
1132           }
1133           ss << inputs_[i];
1134         }
1135         ss << "]";
1136       }
1137       if (!outputs_.empty()) {
1138         ss << "out: [";
1139         for (size_t i = 0, n = outputs_.size(); i < n; ++i) {
1140           if (i) {
1141             ss << ",";
1142           }
1143           ss << outputs_[i];
1144         }
1145         ss << "]";
1146       }
1147       if (!attr_.empty()) {
1148         for (const auto &tt : attr_) {
1149           ss << " (" << tt.first << "=" << tt.second << ")";
1150         }
1151       }
1152       return ss.str();
1153     }
1154   };
1155 
1156   /*!
1157    * \brief Constructor
1158    * \param name Name of the operator
1159    */
ProfileOperatorProfileOperator1160   explicit inline ProfileOperator(const char *name, Attributes *attributes)
1161     : ProfileEvent(name)
1162       , as_task_(name, &domain_)
1163       , name_(name)
1164       , attributes_(attributes)
1165       , profiling_(!IsDeprecatedOperator(name)) {
1166     if (IsSubOperatorOfCustom(name)) {
1167       as_task_.setDomain(&custom_op_domain);
1168       SetCategories(custom_op_domain.name());
1169     } else {
1170       SetCategories(domain_.name());
1171     }
1172     // make as_task_ not to add stat to AggregateStats; otherwise we will add twice
1173     as_task_.enableAggregateStats(false);
1174   }
1175   /*!
1176    * \brief Start the profiling scope
1177    * \param dev_type Device type that the profiling will occur on
1178    * \param dev_id Device id associated with this opr
1179    */
startForDeviceProfileOperator1180   void startForDevice(mxnet::Context::DeviceType dev_type, uint32_t dev_id) {
1181     dev_type_ = dev_type;
1182     dev_id_ = dev_id;
1183     if (profiling_) {
1184       ProfileEvent::start();
1185       as_task_.start();
1186     }
1187   }
1188   /*!
1189    * \brief Stop the profiling scope
1190    */
stopProfileOperator1191   void stop() override {
1192     if (profiling_) {
1193       as_task_.stop();
1194       ProfileEvent::stop();
1195     }
1196   }
1197 
1198   /*!
1199    * \brief Operation execution statistics
1200    */
1201   struct OprExecStat : public DurationStat {
1202     /*!
1203      * \brief Constructor
1204      * \param name Name of the operator
1205      * \param dev_type Device type (i.e. CPU: 1, GPU: 2, CPUPinned: 3)
1206      * \param dev_id Device ID (ie GPU number)
1207      * \param start_time Time when operator starts
1208      * \param stop_time Time when operator completes
1209      */
OprExecStatProfileOperator::OprExecStat1210     inline OprExecStat(const char *name, mxnet::Context::DeviceType dev_type, uint32_t dev_id,
1211                        uint64_t start_time, uint64_t stop_time,
1212                        const Attributes *attributes)
1213       : DurationStat(ProfileStat::kDurationBegin, ProfileStat::kDurationEnd)
1214         , dev_type_(dev_type)
1215         , dev_id_(dev_id) {
1216       name_.set(name);
1217       if (attributes) {
1218         name_.append(attributes->to_string().c_str());
1219       }
1220       if (IsSubOperatorOfCustom(name)) {
1221         categories_.set(custom_op_domain.name());
1222       } else {
1223         categories_.set("operator");
1224       }
1225       items_[kStart].timestamp_ = start_time;
1226       items_[kStop].timestamp_ = stop_time;
1227     }
1228     /*! \brief device type: CPU: 1, GPU: 2, CPUPinned: 3 */
1229     mxnet::Context::DeviceType dev_type_;
1230     /*! \brief device id */
1231     uint32_t dev_id_;
1232   };
1233 
1234  private:
1235   /*!
1236    * \brief Send this object's statistical datapoint to the profiler
1237    */
SendStatProfileOperator1238   void SendStat() override {
1239     Profiler::Get()->AddNewProfileStat<OprExecStat>(
1240       [](OprExecStat *stat) {}, name_.c_str(), dev_type_, dev_id_,
1241       start_time_, ProfileStat::NowInMicrosec(),
1242       attributes_.get());
1243   }
1244   /*!
1245    * \brief Check if this operator is no longer profiled
1246    * Notice that this operator may still be used for e.g synchronization
1247    */
IsDeprecatedOperatorProfileOperator1248   inline static bool IsDeprecatedOperator(const char* name) {
1249     return strcmp(name, "CustomOperatorWait") == 0 ||
1250            strcmp(name, "Custom") == 0 || strcmp(name, "_backward_Custom") == 0;
1251   }
1252   /*!
1253    * \brief Check if this operator a sub-operator of a custom operator
1254    */
IsSubOperatorOfCustomProfileOperator1255   inline static bool IsSubOperatorOfCustom(const char* name) {
1256     return strstr(name, "::");
1257   }
1258   /*! \brief Also log the operator as a task in the operator domain */
1259   ProfileTask as_task_;
1260   /* !\brief Operator name */
1261   profile_stat_string name_;
1262   /*! \brief device type: CPU: 1, GPU: 2, CPUPinned: 3 */
1263   Context::DeviceType dev_type_;
1264   /*! \brief device id */
1265   uint32_t dev_id_;
1266   /*! \brief Operator domain */
1267   static ProfileDomain domain_;
1268   /*! \brief Optional operator attributes */
1269   std::unique_ptr<Attributes> attributes_;
1270   /*! \brief Whether to profile or not */
1271   const bool profiling_;
1272 };
1273 
1274 /*
1275  * Profiler inline functions
1276  */
DeviceName(mxnet::Context::DeviceType dev_type,int32_t dev_id)1277 inline const char *Profiler::DeviceName(mxnet::Context::DeviceType dev_type, int32_t dev_id) {
1278   return profile_stat[DeviceIndex(dev_type, dev_id)].dev_name_.c_str();
1279 }
1280 
DeviceName(const size_t index)1281 inline const char *Profiler::DeviceName(const size_t index) {
1282   return profile_stat[index].dev_name_.c_str();
1283 }
1284 
DeviceIndex(mxnet::Context::DeviceType dev_type,int32_t dev_id)1285 inline size_t Profiler::DeviceIndex(mxnet::Context::DeviceType dev_type, int32_t dev_id) {
1286   switch (dev_type) {
1287     case Context::kCPU:
1288       return dev_id;
1289     case Context::kGPU:
1290       return cpu_num_ + dev_id;
1291     case Context::kCPUPinned:
1292       return cpu_num_ + gpu_num_;
1293     case Context::kCPUShared:
1294       return cpu_num_ + gpu_num_ + 1;
1295     default:
1296       LOG(FATAL) << "Unknown dev_type: " << dev_type;
1297       return 0;
1298   }
1299 }
1300 
1301 /*!
1302  * \brief Explicit 'Profiler::AddProfileStat' override for 'OprExecStat'
1303  * \param opr_stat Unique pointer to the operator statistic
1304  */
1305 template<>
1306 inline void Profiler::AddProfileStat<ProfileOperator::OprExecStat>(
1307   std::unique_ptr<ProfileOperator::OprExecStat> *opr_stat) {
1308   const size_t idx = DeviceIndex((*opr_stat)->dev_type_, (*opr_stat)->dev_id_);
1309   CHECK_LT(idx, DeviceCount());
1310   DeviceStats& dev_stat = profile_stat[idx];
1311   dev_stat.opr_exec_stats_->enqueue((*opr_stat).release());
1312 }
1313 
1314 #undef VTUNE_ONLY_CODE  // This macro not meant to be used outside of this file
1315 
1316 }  // namespace profiler
1317 }  // namespace mxnet
1318 #endif  // MXNET_PROFILER_PROFILER_H_
1319