1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 /*!
21 * \file profiler.h
22 * \brief implements profiler
23 */
24 #ifndef MXNET_PROFILER_PROFILER_H_
25 #define MXNET_PROFILER_PROFILER_H_
26
27 #include <dmlc/concurrentqueue.h>
28 #include <dmlc/thread_group.h>
29 #include <vector>
30 #include <string>
31 #include <cstdint>
32 #include <mutex>
33 #include <memory>
34 #include <array>
35 #include "./vtune.h"
36 #include "./aggregate_stats.h"
37 #include "./nvtx.h"
38 #include "../common/utils.h"
39
40
41 namespace mxnet {
42 namespace profiler {
43
44
45
46 /*!
47 * \brief Constant-sized character array class with simple string API to avoid allocations
48 * \tparam string_size Maximum size of the string (including zero-terminator)
49 */
50 template<size_t string_size>
51 struct static_string {
static_stringstatic_string52 inline static_string() { string_[0] = '\0'; }
static_stringstatic_string53 inline explicit static_string(const char *s) { set(s); }
c_strstatic_string54 inline const char *c_str() const { return &string_[0]; }
setstatic_string55 inline void set(const char *s) {
56 strncpy(&string_[0], s, string_size - 1);
57 string_[string_size - 1] = '\0';
58 }
appendstatic_string59 inline void append(const char *s) {
60 const size_t l = strlen(&string_[0]);
61 if (l < string_size - 1) {
62 strncpy(&string_[0] + l, s, string_size - l - 1);
63 string_[string_size - 1] = '\0';
64 }
65 }
66 private:
67 /*! \brief The actual character array */
68 std::array<char, string_size> string_;
69 };
70
71 using profile_stat_string = static_string<128>;
72
73 /*!
74 * \brief Base profile statistic structure
75 */
76 struct ProfileStat {
77 /*!
78 * \brief Event type as used for chrome://tracing support
79 * \note Tracing formats: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview NOLINT(*)
80 */
81 enum EventType {
82 kDurationBegin = 'B',
83 kDurationEnd = 'E',
84 kComplete = 'X',
85 kInstant = 'i',
86 kCounter = 'C',
87 kAsyncNestableStart = 'b',
88 kAsyncNestableInstant = 'n',
89 kAsyncNestableEnd = 'e',
90 kFlowStart = 's',
91 kFlowStep = 't',
92 kFlowEnd = 'f',
93 kSample = 'P',
94 kObjectCreated = 'N',
95 kObjectSnapshot = 'O',
96 kObjectDestroyed = 'D',
97 kMetadata = 'M',
98 kMemoryDumpGlobal = 'V',
99 kMemoryDumpProcess = 'v',
100 kMark = 'R',
101 kClockSync = 'c',
102 kContextEnter = '(',
103 kContextLeave = ')'
104 };
105
106 struct SubEvent {
107 /*! \brief whether this sub-event object is enabled */
108 bool enabled_ = false;
109 /*! \brief Type of the sub-event */
110 EventType event_type_;
111 /*! \brief Timestamp of sub-event */
112 uint64_t timestamp_;
113 };
114
115 /*! \brief operation name */
116 profile_stat_string name_;
117
118 /*! \brief operation categories (comma-delimited) */
119 profile_stat_string categories_;
120
121 /*! \brief whether to add this stat to AggregateStats */
122 bool enable_aggregate_ = true;
123
124 /* !\brief Process id */
125 size_t process_id_ = common::current_process_id();
126
127 /*! \brief id of thread which operation run on.
128 *
129 * */
130 std::thread::id thread_id_ = std::this_thread::get_id(); // Not yet seen a
131 // case where this isn't valid
132
133 /*! \brief Sub-events (ie begin, end, etc.) */
134 SubEvent items_[3]; // Don't use vector in order to avoid memory allocation
135
136 /*!
137 * \brief Get current tick count in microseconds
138 * \return Current arbitrary tick count in microseconds
139 */
NowInMicrosecProfileStat140 static inline uint64_t NowInMicrosec() {
141 #if defined(_MSC_VER) && _MSC_VER <= 1800
142 LARGE_INTEGER frequency, counter;
143 QueryPerformanceFrequency(&frequency);
144 QueryPerformanceCounter(&counter);
145 return counter.QuadPart * 1000000 / frequency.QuadPart;
146 #else
147 return std::chrono::duration_cast<std::chrono::microseconds>(
148 std::chrono::high_resolution_clock::now().time_since_epoch()).count();
149 #endif
150 }
151
152 /*!
153 * \brief Print event statistics in json format to the supplied output stream
154 * \param os Output stream to write the data
155 * \note Emits all sub-even statistics
156 */
EmitEventsProfileStat157 void EmitEvents(std::ostream *os) {
158 size_t count = 0;
159 for (size_t i = 0; i < sizeof(items_) / sizeof(items_[0]); ++i) {
160 if (items_[i].enabled_) {
161 if (count) {
162 *os << ",\n";
163 }
164 EmitSubEvent(os, i);
165 ++count;
166 }
167 }
168 }
169
170 /*!
171 * \brief Virtual destructor
172 */
~ProfileStatProfileStat173 virtual ~ProfileStat() {}
174
175 /*!
176 * \brief Save aggregate data for this stat
177 * \param data Stat data
178 */
SaveAggregateProfileStat179 virtual void SaveAggregate(AggregateStats::StatData *data) const {
180 if (data) {
181 data->type_ = AggregateStats::StatData::kOther;
182 }
183 }
184
185 protected:
186 /*!
187 * \brief Override to emit extra items within the json event data block. Append with a comma ",".
188 * \param os Output stream to write data to
189 * \param idx Sub-even index (index into items_) to write
190 */
EmitExtraProfileStat191 virtual void EmitExtra(std::ostream *os, size_t idx) {}
192
193 /*!
194 * \brief Emit sub-event statistics
195 * \param os Output stream
196 * \param idx Sub-even index (index into items_) to write
197 */
EmitSubEventProfileStat198 void EmitSubEvent(std::ostream *os, size_t idx) {
199 const SubEvent &ev = items_[idx];
200 if (ev.enabled_) {
201 *os << " {\n"
202 << " \"name\": \"" << name_.c_str() << "\",\n"
203 << " \"cat\": " << "\"" << categories_.c_str() << "\",\n"
204 << " \"ph\": \"" << static_cast<char>(ev.event_type_) << "\",\n"
205 << " \"ts\": " << ev.timestamp_ << ",\n";
206 EmitExtra(os, idx);
207 *os << " \"pid\": " << process_id_ << ",\n"
208 << " \"tid\": " << std::hash<std::thread::id>{}(thread_id_) << "\n"
209 << " }\n";
210 }
211 }
212 };
213
214 /*!
215 * \brief Device statistics
216 */
217 struct DeviceStats {
218 using TQueue = dmlc::moodycamel::ConcurrentQueue<ProfileStat *>;
219 /*!
220 * \brief Destructor, clean up allocated objects
221 */
~DeviceStatsDeviceStats222 ~DeviceStats() {
223 std::shared_ptr<TQueue> es = opr_exec_stats_;
224 if (es) {
225 ProfileStat *stat = nullptr;
226 while (es->try_dequeue(stat)) {
227 delete stat;
228 }
229 }
230 }
231
232 /*! \brief device name */
233 std::string dev_name_;
234 /*! \brief operation execution statistics on this device */
235 std::shared_ptr<TQueue> opr_exec_stats_ = std::make_shared<TQueue>();
236 };
237
238 /*!
239 * _____ __ _ _
240 * | __ \ / _|(_)| |
241 * | |__) |_ __ ___ | |_ _ | | ___ _ __
242 * | ___/| '__|/ _ \| _|| || |/ _ \| '__|
243 * | | | | | (_) | | | || | __/| |
244 * |_| |_| \___/|_| |_||_|\___||_|
245 *
246 * \brief profiler that records the operation execution information
247 * and saves the profile statistics.
248 * \note Profiler class doesn't know anything about VTune
249 */
250 class Profiler {
251 public:
252 enum ProfilerMode {
253 kSymbolic = 1,
254 kImperative = 2,
255 kAPI = 4,
256 kMemory = 8
257 };
258 enum ProfilerState {
259 kNotRunning = 0,
260 kRunning = 1
261 };
262
263 /*! \brief set state of profiler */
264 void SetState(ProfilerState state);
265 /*! \return state of profiler */
GetState()266 inline ProfilerState GetState() const {
267 return this->state_;
268 }
269 /*!
270 * \brief set profiler configuration
271 * \param mode flags, one or more of 'ProfilerMode'
272 * \param output_filename profile output file name
273 * \param continuous_dump true if profile information should be periodically dumped
274 * \param dump_period Period (in seconds) of profile info dumping
275 */
276 void SetConfig(int mode, std::string output_filename,
277 bool continuous_dump,
278 float dump_period,
279 bool aggregate_stats);
280
281 /*! \return mode of profiler */
GetMode()282 inline int GetMode() const {
283 return this->mode_;
284 }
285
IsProfiling(const ProfilerMode pm)286 inline bool IsProfiling(const ProfilerMode pm) const {
287 return GetState() == kRunning && (GetMode() & pm) == pm;
288 }
289
290 /*! \return whether the profiler is enabled to output */
IsEnableOutput()291 inline bool IsEnableOutput() const {
292 return this->enable_output_;
293 }
294 /*!
295 * \brief dump the profile file
296 * \param perform_cleanup Close off the json trace structures (ie last pass)
297 */
298 void DumpProfile(bool perform_cleanup = true);
299
300 /*! \return the profiler init time, time unit is microsecond (10^-6) s */
GetInitTime()301 uint64_t MSHADOW_CINLINE GetInitTime() const {
302 return init_time_;
303 }
304 /*!
305 * \brief add one operation execution record in corresponding device statistics
306 * \tparam SetExtraInfoFunction
307 * \param dev_type
308 * \param dev_id
309 * \param set_extra_info_function
310 * \note Because when this function exits, the object is written to the profile queue,
311 * and at that point, could be consumed and/or destroyed at any moment,
312 * any preprocessing on the object is to be done in the set_extra_info_function
313 * callback. Another option is to use the CreateProfileStat()/AddProfileStat() pair,
314 * adding it only after
315 */
316 template<typename StatType, typename SetExtraInfoFunction, typename ...Args>
AddNewProfileStat(SetExtraInfoFunction set_extra_info_function,Args...args)317 void AddNewProfileStat(SetExtraInfoFunction set_extra_info_function, Args... args) {
318 if (!paused_) {
319 std::unique_ptr<StatType> stat = CreateProfileStat<StatType>(args...);
320 set_extra_info_function(stat.get());
321 AddProfileStat(&stat);
322 }
323 }
324
325 /*!
326 * \brief Return aggregate statistic accumulator
327 * \return shared pointer to the 'ProfileStats' aggregate statistic accumulator
328 */
GetAggregateStats()329 std::shared_ptr<AggregateStats> GetAggregateStats() const {
330 return aggregate_stats_;
331 }
332
333 /*!
334 * \brief Get a pointer to the Profiler singleton
335 * \return Profiler singleton
336 * \param sp Profiler shared pointer, only use for singleton ownership
337 */
338 static Profiler* Get(std::shared_ptr<Profiler> *sp = nullptr);
339
340 /*!
341 * \brief Set whether statistic collection is to be paused
342 * \param paused true if statistic collection is to be paused, otherwise
343 * resume statistic collection
344 * \note Pause/Resume is not recursive
345 */
set_paused(bool paused)346 void set_paused(bool paused) { paused_ = paused; }
347
348 /*!
349 * \brief Get the calculated device count (numb er of devices to track in profile data).
350 * \return Device count
351 * \note Number of CPU's + Number of GPU's + One for CPU-Pinned
352 */
DeviceCount()353 size_t DeviceCount() const { return cpu_num_ + gpu_num_ + 2; }
354
355 /*!
356 * \brief Compute device index given device type and id
357 * \param dev_type Device type
358 * \param dev_id Device ID
359 * \return Device index for indexing into device-specific data
360 */
361 size_t DeviceIndex(mxnet::Context::DeviceType dev_type, int32_t dev_id);
362
363 /*!
364 * \brief Device name
365 * \param dev_type Device type
366 * \param dev_id Device ID
367 * \return Character pointer to device name
368 */
369 const char *DeviceName(mxnet::Context::DeviceType dev_type, int32_t dev_id);
370
371
372 /*!
373 * \brief Device name
374 * \param dev_type Device type
375 * \param dev_id Device ID
376 * \return Character pointer to device name
377 */
378 const char *DeviceName(const size_t index);
379
380 /*!
381 * \brief Whether aggregate stats are being collected
382 * \return true if aggregate stats are being collected
383 */
AggregateEnabled()384 inline bool AggregateEnabled() const {
385 return aggregate_stats_.get() != nullptr;
386 }
387
388 /*!
389 * \brief Whether aggregate stats are currently being recorded
390 * \return true if aggregate stats are currently being recorded
391 */
AggregateRunning()392 inline bool AggregateRunning() const {
393 return GetState() == kRunning && AggregateEnabled();
394 }
395
396 public:
397 /*!
398 * \brief Constructor
399 */
400 Profiler();
401
402 /*!
403 * \brief Destructor
404 */
405 virtual ~Profiler();
406
407 private:
408 /*!
409 * \brief Create a new profile statistic object
410 * \tparam StatType The type of the profile statistic object
411 * \tparam Args Argument types to pass to the new object's constructor
412 * \param args Arguments to pass to the new object's constructor
413 * \return A unique_ptr to the new statistic object
414 */
415 template<typename StatType, typename ...Args>
416 static std::unique_ptr<typename std::enable_if<std::is_base_of<ProfileStat, StatType>::value,
CreateProfileStat(Args...args)417 StatType>::type> CreateProfileStat(Args... args) {
418 return std::unique_ptr<StatType>(new StatType(args...));
419 }
420
421 /*!
422 * \brief Add a general profile statistic object
423 * \tparam StatType Type of the statistic object
424 * \param stat The statistic object
425 */
426 template<typename StatType>
AddProfileStat(std::unique_ptr<StatType> * stat)427 inline void AddProfileStat(std::unique_ptr<StatType> *stat) {
428 general_stats_.opr_exec_stats_->enqueue(stat->release());
429 }
430
431 /*! \brief generate device information following chrome profile file format */
432 void EmitPid(std::ostream *os, const std::string& name, size_t pid);
433
434 /*!
435 * \brief Set continuous asynchronous profile dump
436 * \param continuous_dump Whether to continuously dump profile information
437 * \param delay_in_seconds Delay between asynchronous dumps
438 */
439 void SetContinuousProfileDump(bool continuous_dump, float delay_in_seconds);
440
441 /*! \brief internal mutex of the profiler */
442 std::recursive_mutex m_;
443 /*! \brief indicate whether the profiler is running */
444 ProfilerState state_;
445 /*! \brief once running, enable profiler to output */
446 volatile bool enable_output_;
447 /*! \brief indicate what operator the profiler will record */
448 int mode_ = kSymbolic | kAPI | kMemory;
449 /*! \brief filename to output profile file */
450 std::string filename_ = "profile.json";
451 /*! \brief profile statistics consist of multiple device statistics */
452 std::unique_ptr<DeviceStats[]> profile_stat;
453 /*! \brief Stats not associated directly with a device */
454 DeviceStats general_stats_;
455 /*! \brief Map category -> pid */
456 std::unordered_map<std::string, size_t> category_to_pid_;
457 /*! \brief cpu number on the machine */
458 unsigned int cpu_num_;
459 /*! \brief gpu number on the machine */
460 unsigned int gpu_num_;
461 /*! \brief the profiler init time */
462 uint64_t init_time_;
463 /*! \brief Continuously dump profile info */
464 volatile bool continuous_dump_ = false;
465 /*! \brief Number of non-meta profiling record emitted */
466 volatile uint64_t num_records_emitted_ = 0;
467 /*! \brief Number of times profile was dumped */
468 volatile uint64_t profile_dump_count_;
469 /*! \brief Whether profiling is paused */
470 volatile bool paused_ = false;
471 /*! \brief Maintain in-memory aggregate stats for print output.
472 * \warning This has a negative performance impact */
473 std::shared_ptr<AggregateStats> aggregate_stats_ = nullptr;
474 /*! \brief Asynchronous operation thread lifecycle control object */
475 std::shared_ptr<dmlc::ThreadGroup> thread_group_ = std::make_shared<dmlc::ThreadGroup>();
476 /* !\brief pids */
477 std::unordered_set<uint32_t> process_ids_;
478 };
479
480 #ifdef MXNET_USE_VTUNE
481 #define VTUNE_ONLY_CODE(...) __VA_ARGS__ /* This is undefined at the bottom of this file */
482 #else
483 #define VTUNE_ONLY_CODE(...) /* */ /* This is undefined at the bottom of this file */
484 #endif
485
486 #ifdef MXNET_USE_NVTX
487 #define NVTX_ONLY_CODE(...) __VA_ARGS__ /* This is undefined at the bottom of this file */
488 #else
489 #define NVTX_ONLY_CODE(...) /* */ /* This is undefined at the bottom of this file */
490 #endif
491
492 /**
493 * _____ __ _ _ _ ____ _ _ _
494 * | __ \ / _|(_)| |(_) / __ \| | (_) | |
495 * | |__) |_ __ ___ | |_ _ | | _ _ __ __ _ | | | | |__ _ ___ ___| |_ ___
496 * | ___/| '__|/ _ \| _|| || || | '_ \ / _` | | | | | '_ \| |/ _ \ / __| __|/ __|
497 * | | | | | (_) | | | || || | | | | (_| | | |__| | |_) | | __/| (__| |_ \__ \
498 * |_| |_| \___/|_| |_||_||_|_| |_|\__, | \____/|_.__/| |\___| \___|\__||___/
499 * __/ | _/ |
500 * |___/ |__/
501 */
502
503 enum ProfileObjectType {
504 kDomain,
505 kCounter,
506 kTask,
507 kEvent,
508 kFrame
509 };
510
511 class ProfileObject {
512 public:
513 /*!
514 * \brief Virtual destructor for child classes
515 */
~ProfileObject()516 virtual ~ProfileObject() {}
517 /*!
518 * \brief Return profiling object object type (i.e. kTask, kEvent, ...)
519 * \return Profiling object type
520 */
521 virtual ProfileObjectType type() const = 0;
522 };
523
524 /*!
525 * \brief Tuning domain. Used in VTune to separate similar tasks/counters/etc. (for example).
526 * For chrome tracing, generally maps to category.
527 */
528 struct ProfileDomain : public ProfileObject {
529 /*!:(
530 * \brief Constructor
531 * \param name Name of the domain
532 */
ProfileDomainProfileDomain533 explicit ProfileDomain(const char *name) noexcept
534 : name_(name) {
535 CHECK_NOTNULL(name);
536 CHECK_NE(name[0], '\0');
537 VTUNE_ONLY_CODE(vtune_domain_.reset(new vtune::VTuneDomain(name)));
538 }
539 /*!
540 * \brief Get domain name
541 * \return Domain name
542 */
nameProfileDomain543 const char *name() const { return name_.c_str(); }
typeProfileDomain544 ProfileObjectType type() const override { return kDomain; }
545 VTUNE_ONLY_CODE(inline vtune::VTuneDomain *dom() { return vtune_domain_.get(); });
546 private:
547 /*! \brief Name of the domain */
548 profile_stat_string name_;
549 /*! \brief VTune domain object */
550 VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneDomain> vtune_domain_);
551 };
552
553 /*!
554 * \brief Counter object and statistic item
555 */
556 struct ProfileCounter : public ProfileObject {
557 /*!
558 * \brief Constructor
559 * \param name Counter name
560 * \param domain Counter domain
561 */
ProfileCounterProfileCounter562 ProfileCounter(const char *name, ProfileDomain *domain) noexcept
563 : name_(name)
564 , domain_(domain)
565 , value_(0) {
566 CHECK_NOTNULL(domain);
567 VTUNE_ONLY_CODE(vtune_.reset(new vtune::VTuneCounter(name, domain->dom())));
568 }
~ProfileCounterProfileCounter569 ~ProfileCounter() {}
570 /*! \brief operator: ++object */
571 inline uint64_t operator ++() {
572 return IncrementValue(1);
573 }
574 /*! \brief operator: object++ */
575 inline uint64_t operator ++(int) {
576 const uint64_t old = value_;
577 IncrementValue(1);
578 return old;
579 }
580 /*! \brief operator: --object */
581 inline uint64_t operator --() {
582 CHECK_GT(value_, 0);
583 return DecrementValue(1);
584 }
585 /*! \brief operator: object-- */
586 inline uint64_t operator --(int) {
587 CHECK_GT(value_, 0);
588 const uint64_t old = value_;
589 DecrementValue(1);
590 return old;
591 }
592 /*! \brief operator: object += v */
593 inline uint64_t operator +=(int64_t v) {
594 if (v >= 0) {
595 return IncrementValue(static_cast<uint64_t>(v));
596 } else {
597 v = -v;
598 return DecrementValue(static_cast<uint64_t>(v));
599 }
600 }
601 /*! \brief operator: object -= v */
602 inline uint64_t operator -=(int64_t v) {
603 CHECK_GE(value_, v);
604 if (v >= 0) {
605 return DecrementValue(static_cast<uint64_t>(v));
606 } else {
607 v = -v;
608 return IncrementValue(static_cast<uint64_t>(v));
609 }
610 }
611
612 inline bool operator >=(int64_t v) {
613 CHECK_GE(v, 0);
614 return value_ >= static_cast<uint64_t>(v);
615 }
616
617 /*! \brief operator: object = v */
618 inline ProfileCounter& operator = (uint64_t v) {
619 SetValue(v);
620 return *this;
621 }
622
typeProfileCounter623 ProfileObjectType type() const override { return kCounter; }
624
625 protected:
626 /*!
627 * \brief Count statistic object
628 */
629 struct ProfileCounterStat : public ProfileStat {
630 uint64_t value_;
ProfileCounterStatProfileCounter::ProfileCounterStat631 explicit ProfileCounterStat(const char *name, uint64_t value) : value_(value) {
632 items_[0].enabled_ = true;
633 items_[0].event_type_ = kCounter;
634 items_->timestamp_ = NowInMicrosec();
635 name_.set(name);
636 }
637
638 /*!
639 * \brief Emit counter value as extra data for this statistic type
640 * \param os Output stream to write data to
641 * \param idx Sub-even index (index into items_) to write
642 */
EmitExtraProfileCounter::ProfileCounterStat643 void EmitExtra(std::ostream *os, size_t idx) override {
644 ProfileStat::EmitExtra(os, idx);
645 *os << " \"args\": { \"" << name_.c_str() << "\": " << value_ << " },\n";
646 }
647
648 /*!
649 * \brief Save aggregate data for this stat
650 * \param data Stat data
651 */
SaveAggregateProfileCounter::ProfileCounterStat652 void SaveAggregate(AggregateStats::StatData *data) const override {
653 if (data) {
654 data->type_ = AggregateStats::StatData::kCounter;
655 ++data->total_count_;
656 data->total_aggregate_ = value_;
657 if (value_ > data->max_aggregate_) {
658 data->max_aggregate_ = value_;
659 }
660 if (value_ < data->min_aggregate_) {
661 data->min_aggregate_ = value_;
662 }
663 }
664 }
665 };
666
667 private:
668 /*!
669 * \brief Send this object's statistical datapoint to the profiler
670 */
SendStatProfileCounter671 inline void SendStat(uint64_t value) {
672 Profiler::Get()->AddNewProfileStat<ProfileCounterStat>([this](ProfileCounterStat *stat) {
673 stat->categories_.set(domain_->name());
674 },
675 name_.c_str(),
676 value);
677 }
678
679 /*!
680 * \brief Set counter value
681 * \param val Value to set the counter
682 */
SetValueProfileCounter683 inline void SetValue(uint64_t val) {
684 VTUNE_ONLY_CODE(*vtune_ = val);
685 value_ = val;
686 SendStat(val);
687 }
688
689 /*!
690 * \brief Adjust counter value by amount given
691 * \param value_change Value to change the counter by
692 */
IncrementValueProfileCounter693 inline uint64_t IncrementValue(uint64_t value_change) {
694 VTUNE_ONLY_CODE(*vtune_ += value_change);
695 const uint64_t v = (value_ += value_change);
696 SendStat(v);
697 return v;
698 }
699
DecrementValueProfileCounter700 inline uint64_t DecrementValue(uint64_t value_change) {
701 VTUNE_ONLY_CODE(*vtune_ -= value_change);
702 const uint64_t v = (value_ -= value_change);
703 SendStat(v);
704 return v;
705 }
706
707 /*! \brief Name of the counter */
708 profile_stat_string name_;
709 /*! \brief Domain of the counter */
710 ProfileDomain *domain_;
711 /*! \brief Value of the counter */
712 std::atomic<uint64_t> value_;
713 /*! \brief VTune counter object */
714 VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneCounter> vtune_);
715 };
716
717 class ProfileDuration : public ProfileObject {
718 public:
719 virtual void start() = 0;
720 virtual void stop() = 0;
721
722 protected:
723 /*!
724 * \brief Basic duration statistic (start time, stop time)
725 */
726 struct DurationStat : public ProfileStat {
727 enum DurationStatIndex {
728 kStart, kStop
729 };
730 /*!
731 * \brief Constructor
732 * \param begin_event Event type for start point (default is kDurationBegin)
733 * \param end_event Event type for stop point (default is kDurationEnd)
734 */
735 DurationStat(ProfileStat::EventType begin_event = ProfileStat::kDurationBegin,
736 ProfileStat::EventType end_event = ProfileStat::kDurationEnd) {
737 items_[kStart].enabled_ = items_[kStop].enabled_ = true;
738 items_[kStart].event_type_ = begin_event;
739 items_[kStop].event_type_ = end_event;
740 }
741
742 /*!
743 * \brief Save aggregate data for this stat
744 * \param data Stat data
745 */
SaveAggregateDurationStat746 void SaveAggregate(AggregateStats::StatData *data) const override {
747 if (data) {
748 data->type_ = AggregateStats::StatData::kDuration;
749 ++data->total_count_;
750 CHECK_GE(items_[kStop].timestamp_, items_[kStart].timestamp_);
751 const uint64_t duration = items_[kStop].timestamp_ - items_[kStart].timestamp_;
752 data->total_aggregate_ += duration;
753 if (duration > data->max_aggregate_) {
754 data->max_aggregate_ = duration;
755 }
756 if (duration < data->min_aggregate_) {
757 data->min_aggregate_ = duration;
758 }
759 }
760 }
761 };
762 };
763
764 /*!
765 * \brief Task - Thread-granular nestable time block
766 */
767 struct ProfileTask : public ProfileDuration {
768 /*!
769 * \brief Constructor
770 * \param name Name of the task
771 * \param domain Domain of the task
772 */
ProfileTaskProfileTask773 ProfileTask(const char *name, ProfileDomain *domain)
774 : name_(name)
775 , domain_(domain) {
776 CHECK_NOTNULL(domain);
777 categories_.set(domain_->name());
778 categories_.append(",task");
779 VTUNE_ONLY_CODE(vtune_task_.reset(new vtune::VTuneTask(name, domain->dom())));
780 NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
781 }
782
783 /*!
784 * \brief Set the domain
785 */
setDomainProfileTask786 void setDomain(ProfileDomain* domain) {
787 domain_ = domain;
788 }
789
790 /*!
791 * \brief Start the profiling scope
792 */
startProfileTask793 void start() override {
794 start_time_ = ProfileStat::NowInMicrosec();
795 VTUNE_ONLY_CODE(vtune_task_->start());
796 NVTX_ONLY_CODE(nvtx_duration_->start());
797 }
798
799 /*!
800 * \brief Stop the profiling scope
801 */
stopProfileTask802 void stop() override {
803 VTUNE_ONLY_CODE(vtune_task_->stop());
804 NVTX_ONLY_CODE(nvtx_duration_->stop());
805 SendStat();
806 }
807
typeProfileTask808 ProfileObjectType type() const override { return kTask; }
809
810 /*!
811 * \brief Whether to add stat to AggregateStats
812 */
813 void enableAggregateStats(bool enabled = true) {
814 enable_aggregate_ = enabled;
815 }
816
817 protected:
818 /*!
819 * \brief Task statistic object
820 */
821 struct ProfileTaskStat : public DurationStat {
ProfileTaskStatProfileTask::ProfileTaskStat822 explicit ProfileTaskStat(const char *name, uint64_t start_time, uint64_t stop_time)
823 : DurationStat(ProfileStat::kAsyncNestableStart, ProfileStat::kAsyncNestableEnd) {
824 name_.set(name);
825 items_[0].timestamp_ = start_time;
826 items_[1].timestamp_ = stop_time;
827 }
EmitExtraProfileTask::ProfileTaskStat828 void EmitExtra(std::ostream *os, size_t idx) override {
829 DurationStat::EmitExtra(os, idx);
830 *os << " \"id\": " << std::hash<std::thread::id>{}(thread_id_) << ",\n";
831 }
832 };
833
834 private:
835 /*!
836 * \brief Send this object's statistical datapoint to the profiler
837 */
SendStatProfileTask838 inline void SendStat() {
839 Profiler::Get()->AddNewProfileStat<ProfileTaskStat>([this](ProfileTaskStat *stat) {
840 stat->categories_.set(domain_->name());
841 stat->enable_aggregate_ = enable_aggregate_;
842 }, name_.c_str(), start_time_, ProfileStat::NowInMicrosec());
843 }
844 /*! \brief Task name */
845 const profile_stat_string name_;
846 /*! \brief Task categories */
847 profile_stat_string categories_;
848 /*! \brief domain */
849 ProfileDomain *domain_;
850 /*! \brief VTune task object */
851 VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneTask> vtune_task_);
852 /*! \brief NVTX duration object */
853 NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_);
854 /*! \brief whether to add this stat to AggregateStats */
855 bool enable_aggregate_ = true;
856
857 protected:
858 /*! \brief Task's start tick */
859 uint64_t start_time_;
860 };
861
862 /*!
863 * \brief Event - Thread-granular time block
864 */
865 struct ProfileEvent : public ProfileDuration {
866 /*!
867 * \brief Constructor
868 * \param name Name of the event
869 */
ProfileEventProfileEvent870 explicit inline ProfileEvent(const char *name)
871 : name_(name)
872 , categories_("event") {
873 VTUNE_ONLY_CODE(vtune_event_ = vtune::VTuneEvent::registry_.get(name));
874 NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
875 }
876
877 /*!
878 * \brief Start the profiling scope
879 */
startProfileEvent880 void start() override {
881 start_time_ = ProfileStat::NowInMicrosec();
882 VTUNE_ONLY_CODE(vtune_event_->start());
883 NVTX_ONLY_CODE(nvtx_duration_->start());
884 }
885
886 /*!
887 * \brief Stop the profiling scope
888 */
stopProfileEvent889 void stop() override {
890 VTUNE_ONLY_CODE(vtune_event_->stop());
891 SendStat();
892 }
893
894 /*!
895 * \brief Set catagories (used for chrome tracing)
896 * \param categories Comma-delimited categories
897 */
SetCategoriesProfileEvent898 void SetCategories(const char *categories) {
899 categories_.set(categories);
900 }
901
typeProfileEvent902 ProfileObjectType type() const override { return kEvent; }
903
904 protected:
905 /*!
906 * \brief Event statistic object
907 */
908 struct ProfileEventStat : public DurationStat {
ProfileEventStatProfileEvent::ProfileEventStat909 explicit ProfileEventStat(const char *name, uint64_t start_time, uint64_t stop_time)
910 : DurationStat(ProfileStat::kDurationBegin, ProfileStat::kDurationEnd) {
911 name_.set(name);
912 items_[0].timestamp_ = start_time;
913 items_[1].timestamp_ = stop_time;
914 }
915 };
916
917 private:
918 /*!
919 * \brief Send this object's statistical datapoint to the profiler
920 */
SendStatProfileEvent921 virtual void SendStat() {
922 Profiler::Get()->AddNewProfileStat<ProfileEventStat>([this](ProfileEventStat *stat) {
923 stat->categories_.set(categories_.c_str());
924 }, name_.c_str(), start_time_, ProfileStat::NowInMicrosec());
925 }
926 /*! \brief Event name */
927 const profile_stat_string name_;
928 /*! \brief Event categories (comma-delimited) */
929 profile_stat_string categories_;
930 /*! \brief VTune event object */
931 VTUNE_ONLY_CODE(vtune::VTuneEvent *vtune_event_);
932 /*! \brief NVTX duration object */
933 NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_;);
934
935 protected:
936 /*! \brief Start time of the event */
937 uint64_t start_time_;
938 };
939
940 /*!
941 * \brief Frame - Process-granular time block
942 */
943 struct ProfileFrame : public ProfileDuration {
944 /*!
945 * \brief Constructor
946 * \param name Name of the frame
947 * \param domain Domain of the frame
948 */
ProfileFrameProfileFrame949 ProfileFrame(const char *name, ProfileDomain *domain)
950 : name_(name)
951 , domain_(domain) {
952 CHECK_NOTNULL(domain);
953 categories_.set(domain_->name());
954 categories_.append(",frame");
955 NVTX_ONLY_CODE(nvtx_duration_.reset(new nvtx::NVTXDuration(name)));
956 VTUNE_ONLY_CODE(vtune_frame_.reset(new vtune::VTuneFrame(domain->dom())));
957 }
958
959 /*!
960 * \brief Start the profiling scope
961 */
startProfileFrame962 void start() override {
963 start_time_ = ProfileStat::NowInMicrosec();
964 VTUNE_ONLY_CODE(vtune_frame_->start());
965 NVTX_ONLY_CODE(nvtx_duration_->start());
966 }
967
968 /*!
969 * \brief Stop the profiling scope
970 */
stopProfileFrame971 void stop() override {
972 VTUNE_ONLY_CODE(vtune_frame_->stop());
973 SendStat();
974 }
975
typeProfileFrame976 ProfileObjectType type() const override { return kFrame; }
977
978 protected:
979 /*!
980 * \brief Frame statistic object
981 */
982 struct ProfileFrameStat : public DurationStat {
ProfileFrameStatProfileFrame::ProfileFrameStat983 explicit ProfileFrameStat(const char *name, uint64_t start_time, uint64_t stop_time)
984 : DurationStat(ProfileStat::kContextEnter, ProfileStat::kContextLeave) {
985 name_.set(name);
986 items_[0].timestamp_ = start_time;
987 items_[1].timestamp_ = stop_time;
988 }
989 };
990
991 private:
992 /*!
993 * \brief Send this object's statistical datapoint to the profiler
994 */
SendStatProfileFrame995 inline void SendStat() {
996 Profiler::Get()->AddNewProfileStat<ProfileFrameStat>([this](ProfileFrameStat *stat) {
997 stat->categories_.set(categories_.c_str());
998 }, name_.c_str(), start_time_, ProfileStat::NowInMicrosec());
999 }
1000 /*! \brief Frame name */
1001 const profile_stat_string name_;
1002 /*! \brief Frame categories (comma-delimited) */
1003 profile_stat_string categories_;
1004 /*! \brief Pointer to the domain */
1005 ProfileDomain *domain_;
1006 /*! \brief VTune Frame object */
1007 VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneFrame> vtune_frame_);
1008 /*! \brief NVTX duration object */
1009 NVTX_ONLY_CODE(std::unique_ptr<nvtx::NVTXDuration> nvtx_duration_);
1010
1011 protected:
1012 /*! \brief Frame start time */
1013 uint64_t start_time_;
1014 };
1015
1016 /*!
1017 * \brief Marker - Mark an instance in time
1018 */
1019 struct ProfileMarker {
1020 enum MarkerScope { // Should equal VTune values
1021 kUnknown, kGlobal, kProcess, kThread, kTask, kMarker
1022 };
1023
1024 /*!
1025 * \brief Constructor
1026 * \param name Name of the instant marker
1027 * \param domain Domain of the instant marker
1028 * \param scope Scope of the instant marker
1029 * \param nestable true if the instant marker is nestable
1030 */
1031 ProfileMarker(const char *name,
1032 ProfileDomain *domain,
1033 const MarkerScope scope,
1034 bool nestable = true)
name_ProfileMarker1035 : name_(name)
1036 , domain_(domain)
1037 , scope_(scope)
1038 , nestable_(nestable) {
1039 categories_.set(domain_->name());
1040 categories_.append(",instant_marker");
1041 VTUNE_ONLY_CODE(vtune_instant_marker_.reset(
1042 new vtune::VTuneInstantMarker(name, domain->dom(), static_cast<__itt_scope>(scope))));
1043 }
1044
1045 /*!
1046 * \brief Signal a marker at this instant
1047 */
markProfileMarker1048 void mark() {
1049 VTUNE_ONLY_CODE(vtune_instant_marker_->signal());
1050 SendStat();
1051 }
1052
1053 protected:
1054 /*!
1055 * \brief Instant-marker statistic object
1056 */
1057 struct ProfileMarkerStat : public ProfileStat {
ProfileMarkerStatProfileMarker::ProfileMarkerStat1058 explicit ProfileMarkerStat(const char *name, const char scope_char, bool nestable)
1059 : scope_char_(scope_char) {
1060 items_[0].enabled_ = true;
1061 items_[0].event_type_ = nestable ? kAsyncNestableInstant : kInstant;
1062 items_->timestamp_ = NowInMicrosec();
1063 name_.set(name);
1064 }
EmitExtraProfileMarker::ProfileMarkerStat1065 virtual void EmitExtra(std::ostream *os, size_t idx) {
1066 ProfileStat::EmitExtra(os, idx);
1067 *os << " \"s\": \"" << scope_char_ << "\",\n";
1068 }
1069 const char scope_char_;
1070 };
1071
1072 private:
1073 /*!
1074 * \brief Send this object's statistical datapoint to the profiler
1075 */
SendStatProfileMarker1076 virtual void SendStat() {
1077 Profiler::Get()->AddNewProfileStat<ProfileMarkerStat>([this](ProfileMarkerStat *stat) {
1078 stat->categories_.set(categories_.c_str());
1079 }, name_.c_str(), vtune_scope_to_chrome_scope(scope_), nestable_);
1080 }
1081
vtune_scope_to_chrome_scopeProfileMarker1082 static char vtune_scope_to_chrome_scope(const MarkerScope scope) {
1083 switch (scope) {
1084 case kThread:
1085 return 't';
1086 case kGlobal:
1087 return 'g';
1088 case kProcess:
1089 case kUnknown:
1090 case kTask:
1091 case kMarker:
1092 default:
1093 return 'p';
1094 }
1095 }
1096
1097 /*! \brief Name of the instant marker */
1098 const profile_stat_string name_;
1099 /*! \brief Categories of the instant marker (comma-delimited) */
1100 profile_stat_string categories_;
1101 /*! \brief Pointer to the domain of this instant marker */
1102 ProfileDomain *domain_;
1103 /*! \brief VTune scope */
1104 const MarkerScope scope_;
1105 /*! \brief Whether this marker is nestabe */
1106 const bool nestable_;
1107 /*! \brief VTune instant marker object */
1108 VTUNE_ONLY_CODE(std::unique_ptr<vtune::VTuneInstantMarker> vtune_instant_marker_);
1109 };
1110
1111 static ProfileDomain custom_op_domain("Custom Operator");
1112
1113 /*!
1114 * \brief Operator profiler object. Logs as both an independent event and a task in
1115 * the operator domain
1116 */
1117 struct ProfileOperator : public ProfileEvent {
1118 /*!
1119 * \brief Operator attributes
1120 */
1121 struct Attributes {
1122 std::vector<mxnet::TShape> inputs_;
1123 std::vector<mxnet::TShape> outputs_;
1124 std::unordered_map<std::string, std::string> attr_;
to_stringProfileOperator::Attributes1125 std::string to_string() const {
1126 std::stringstream ss;
1127 if (!inputs_.empty()) {
1128 ss << "in: [";
1129 for (size_t i = 0, n = inputs_.size(); i < n; ++i) {
1130 if (i) {
1131 ss << ",";
1132 }
1133 ss << inputs_[i];
1134 }
1135 ss << "]";
1136 }
1137 if (!outputs_.empty()) {
1138 ss << "out: [";
1139 for (size_t i = 0, n = outputs_.size(); i < n; ++i) {
1140 if (i) {
1141 ss << ",";
1142 }
1143 ss << outputs_[i];
1144 }
1145 ss << "]";
1146 }
1147 if (!attr_.empty()) {
1148 for (const auto &tt : attr_) {
1149 ss << " (" << tt.first << "=" << tt.second << ")";
1150 }
1151 }
1152 return ss.str();
1153 }
1154 };
1155
1156 /*!
1157 * \brief Constructor
1158 * \param name Name of the operator
1159 */
ProfileOperatorProfileOperator1160 explicit inline ProfileOperator(const char *name, Attributes *attributes)
1161 : ProfileEvent(name)
1162 , as_task_(name, &domain_)
1163 , name_(name)
1164 , attributes_(attributes)
1165 , profiling_(!IsDeprecatedOperator(name)) {
1166 if (IsSubOperatorOfCustom(name)) {
1167 as_task_.setDomain(&custom_op_domain);
1168 SetCategories(custom_op_domain.name());
1169 } else {
1170 SetCategories(domain_.name());
1171 }
1172 // make as_task_ not to add stat to AggregateStats; otherwise we will add twice
1173 as_task_.enableAggregateStats(false);
1174 }
1175 /*!
1176 * \brief Start the profiling scope
1177 * \param dev_type Device type that the profiling will occur on
1178 * \param dev_id Device id associated with this opr
1179 */
startForDeviceProfileOperator1180 void startForDevice(mxnet::Context::DeviceType dev_type, uint32_t dev_id) {
1181 dev_type_ = dev_type;
1182 dev_id_ = dev_id;
1183 if (profiling_) {
1184 ProfileEvent::start();
1185 as_task_.start();
1186 }
1187 }
1188 /*!
1189 * \brief Stop the profiling scope
1190 */
stopProfileOperator1191 void stop() override {
1192 if (profiling_) {
1193 as_task_.stop();
1194 ProfileEvent::stop();
1195 }
1196 }
1197
1198 /*!
1199 * \brief Operation execution statistics
1200 */
1201 struct OprExecStat : public DurationStat {
1202 /*!
1203 * \brief Constructor
1204 * \param name Name of the operator
1205 * \param dev_type Device type (i.e. CPU: 1, GPU: 2, CPUPinned: 3)
1206 * \param dev_id Device ID (ie GPU number)
1207 * \param start_time Time when operator starts
1208 * \param stop_time Time when operator completes
1209 */
OprExecStatProfileOperator::OprExecStat1210 inline OprExecStat(const char *name, mxnet::Context::DeviceType dev_type, uint32_t dev_id,
1211 uint64_t start_time, uint64_t stop_time,
1212 const Attributes *attributes)
1213 : DurationStat(ProfileStat::kDurationBegin, ProfileStat::kDurationEnd)
1214 , dev_type_(dev_type)
1215 , dev_id_(dev_id) {
1216 name_.set(name);
1217 if (attributes) {
1218 name_.append(attributes->to_string().c_str());
1219 }
1220 if (IsSubOperatorOfCustom(name)) {
1221 categories_.set(custom_op_domain.name());
1222 } else {
1223 categories_.set("operator");
1224 }
1225 items_[kStart].timestamp_ = start_time;
1226 items_[kStop].timestamp_ = stop_time;
1227 }
1228 /*! \brief device type: CPU: 1, GPU: 2, CPUPinned: 3 */
1229 mxnet::Context::DeviceType dev_type_;
1230 /*! \brief device id */
1231 uint32_t dev_id_;
1232 };
1233
1234 private:
1235 /*!
1236 * \brief Send this object's statistical datapoint to the profiler
1237 */
SendStatProfileOperator1238 void SendStat() override {
1239 Profiler::Get()->AddNewProfileStat<OprExecStat>(
1240 [](OprExecStat *stat) {}, name_.c_str(), dev_type_, dev_id_,
1241 start_time_, ProfileStat::NowInMicrosec(),
1242 attributes_.get());
1243 }
1244 /*!
1245 * \brief Check if this operator is no longer profiled
1246 * Notice that this operator may still be used for e.g synchronization
1247 */
IsDeprecatedOperatorProfileOperator1248 inline static bool IsDeprecatedOperator(const char* name) {
1249 return strcmp(name, "CustomOperatorWait") == 0 ||
1250 strcmp(name, "Custom") == 0 || strcmp(name, "_backward_Custom") == 0;
1251 }
1252 /*!
1253 * \brief Check if this operator a sub-operator of a custom operator
1254 */
IsSubOperatorOfCustomProfileOperator1255 inline static bool IsSubOperatorOfCustom(const char* name) {
1256 return strstr(name, "::");
1257 }
1258 /*! \brief Also log the operator as a task in the operator domain */
1259 ProfileTask as_task_;
1260 /* !\brief Operator name */
1261 profile_stat_string name_;
1262 /*! \brief device type: CPU: 1, GPU: 2, CPUPinned: 3 */
1263 Context::DeviceType dev_type_;
1264 /*! \brief device id */
1265 uint32_t dev_id_;
1266 /*! \brief Operator domain */
1267 static ProfileDomain domain_;
1268 /*! \brief Optional operator attributes */
1269 std::unique_ptr<Attributes> attributes_;
1270 /*! \brief Whether to profile or not */
1271 const bool profiling_;
1272 };
1273
1274 /*
1275 * Profiler inline functions
1276 */
DeviceName(mxnet::Context::DeviceType dev_type,int32_t dev_id)1277 inline const char *Profiler::DeviceName(mxnet::Context::DeviceType dev_type, int32_t dev_id) {
1278 return profile_stat[DeviceIndex(dev_type, dev_id)].dev_name_.c_str();
1279 }
1280
DeviceName(const size_t index)1281 inline const char *Profiler::DeviceName(const size_t index) {
1282 return profile_stat[index].dev_name_.c_str();
1283 }
1284
DeviceIndex(mxnet::Context::DeviceType dev_type,int32_t dev_id)1285 inline size_t Profiler::DeviceIndex(mxnet::Context::DeviceType dev_type, int32_t dev_id) {
1286 switch (dev_type) {
1287 case Context::kCPU:
1288 return dev_id;
1289 case Context::kGPU:
1290 return cpu_num_ + dev_id;
1291 case Context::kCPUPinned:
1292 return cpu_num_ + gpu_num_;
1293 case Context::kCPUShared:
1294 return cpu_num_ + gpu_num_ + 1;
1295 default:
1296 LOG(FATAL) << "Unknown dev_type: " << dev_type;
1297 return 0;
1298 }
1299 }
1300
1301 /*!
1302 * \brief Explicit 'Profiler::AddProfileStat' override for 'OprExecStat'
1303 * \param opr_stat Unique pointer to the operator statistic
1304 */
1305 template<>
1306 inline void Profiler::AddProfileStat<ProfileOperator::OprExecStat>(
1307 std::unique_ptr<ProfileOperator::OprExecStat> *opr_stat) {
1308 const size_t idx = DeviceIndex((*opr_stat)->dev_type_, (*opr_stat)->dev_id_);
1309 CHECK_LT(idx, DeviceCount());
1310 DeviceStats& dev_stat = profile_stat[idx];
1311 dev_stat.opr_exec_stats_->enqueue((*opr_stat).release());
1312 }
1313
1314 #undef VTUNE_ONLY_CODE // This macro not meant to be used outside of this file
1315
1316 } // namespace profiler
1317 } // namespace mxnet
1318 #endif // MXNET_PROFILER_PROFILER_H_
1319