/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * vim: set ts=8 sts=4 et sw=4 tw=99:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef js_ProfilingStack_h
#define js_ProfilingStack_h

#include <algorithm>
#include <stdint.h>

#include "jstypes.h"

#include "js/TypeDecls.h"
#include "js/Utility.h"

class JSTracer;
class PseudoStack;

// This file defines the classes PseudoStack and ProfileEntry.
// The PseudoStack manages an array of ProfileEntries.
// Usage:
//
//  PseudoStack* pseudoStack = ...;
//
//  // For CPP stack frames:
//  pseudoStack->pushCppFrame(...);
//  // Execute some code. When finished, pop the entry:
//  pseudoStack->pop();
//
//  // For JS stack frames:
//  pseudoStack->pushJSFrame(...);
//  // Execute some code. When finished, pop the entry:
//  pseudoStack->pop();
//
//
// Concurrency considerations
//
// A thread's pseudo stack (and the entries inside it) is only modified by
// that thread. However, the pseudo stack can be *read* by a different thread,
// the sampler thread: Whenever the profiler wants to sample a given thread A,
// the following happens:
//  (1) Thread A is suspended.
//  (2) The sampler thread (thread S) reads the PseudoStack of thread A,
//      including all ProfileEntries that are currently in that stack
//      (pseudoStack->entries[0..pseudoStack->stackSize()]).
//  (3) Thread A is resumed.
//
// Thread suspension is achieved using platform-specific APIs; refer to each
// platform's Sampler::SuspendAndSampleAndResumeThread implementation in
// platform-*.cpp for details.
//
// When the thread is suspended, the values in pseudoStack->stackPointer and in
// the entry range pseudoStack->entries[0..pseudoStack->stackPointer] need to
// be in a consistent state, so that thread S does not read partially-
// constructed profile entries. More specifically, we have two requirements:
//  (1) When adding a new entry at the top of the stack, its ProfileEntry data
//      needs to be put in place *before* the stackPointer is incremented, and
//      the compiler + CPU need to know that this order matters.
//  (2) When popping an entry from the stack and then preparing the
//      ProfileEntry data for the next frame that is about to be pushed, the
//      decrement of the stackPointer in pop() needs to happen *before* the
//      ProfileEntry for the new frame is being populated, and the compiler +
//      CPU need to know that this order matters.
//
// We can express the relevance of these orderings in multiple ways.
// Option A is to make stackPointer an atomic with SequentiallyConsistent
// memory ordering. This would ensure that no writes in thread A would be
// reordered across any writes to stackPointer, which satisfies requirements
// (1) and (2) at the same time. Option A is the simplest.
// Option B is to use ReleaseAcquire memory ordering both for writes to
// stackPointer *and* for writes to ProfileEntry fields. Release-stores ensure
// that all writes that happened *before this write in program order* are not
// reordered to happen after this write. ReleaseAcquire ordering places no
// requirements on the ordering of writes that happen *after* this write in
// program order.
// Using release-stores for writes to stackPointer expresses requirement (1),
// and using release-stores for writes to the ProfileEntry fields expresses
// requirement (2).
81 // 82 // Option B is more complicated than option A, but has much better performance 83 // on x86/64: In a microbenchmark run on a Macbook Pro from 2017, switching 84 // from option A to option B reduced the overhead of pushing+popping a 85 // ProfileEntry by 10 nanoseconds. 86 // On x86/64, release-stores require no explicit hardware barriers or lock 87 // instructions. 88 // On ARM/64, option B may be slower than option A, because the compiler will 89 // generate hardware barriers for every single release-store instead of just 90 // for the writes to stackPointer. However, the actual performance impact of 91 // this has not yet been measured on ARM, so we're currently using option B 92 // everywhere. This is something that we may want to change in the future once 93 // we've done measurements. 94 95 namespace js { 96 97 // A call stack can be specified to the JS engine such that all JS entry/exits 98 // to functions push/pop an entry to/from the specified stack. 99 // 100 // For more detailed information, see vm/GeckoProfiler.h. 101 // 102 class ProfileEntry { 103 // A ProfileEntry represents either a C++ profile entry or a JS one. 104 105 // WARNING WARNING WARNING 106 // 107 // All the fields below are Atomic<...,ReleaseAcquire>. This is needed so 108 // that writes to these fields are release-writes, which ensures that 109 // earlier writes in this thread don't get reordered after the writes to 110 // these fields. In particular, the decrement of the stack pointer in 111 // PseudoStack::pop() is a write that *must* happen before the values in 112 // this ProfileEntry are changed. Otherwise, the sampler thread might see 113 // an inconsistent state where the stack pointer still points to a 114 // ProfileEntry which has already been popped off the stack and whose 115 // fields have now been partially repopulated with new values. 116 // See the "Concurrency considerations" paragraph at the top of this file 117 // for more details. 
118 119 // Descriptive label for this entry. Must be a static string! Can be an 120 // empty string, but not a null pointer. 121 mozilla::Atomic<const char*, mozilla::ReleaseAcquire> label_; 122 123 // An additional descriptive string of this entry which is combined with 124 // |label_| in profiler output. Need not be (and usually isn't) static. Can 125 // be null. 126 mozilla::Atomic<const char*, mozilla::ReleaseAcquire> dynamicString_; 127 128 // Stack pointer for non-JS entries, the script pointer otherwise. 129 mozilla::Atomic<void*, mozilla::ReleaseAcquire> spOrScript; 130 131 // Line number for non-JS entries, the bytecode offset otherwise. 132 mozilla::Atomic<int32_t, mozilla::ReleaseAcquire> lineOrPcOffset; 133 134 // Bits 0...1 hold the Kind. Bits 2...3 are unused. Bits 4...12 hold the 135 // Category. 136 mozilla::Atomic<uint32_t, mozilla::ReleaseAcquire> kindAndCategory_; 137 138 static int32_t pcToOffset(JSScript* aScript, jsbytecode* aPc); 139 140 public: 141 enum class Kind : uint32_t { 142 // A normal C++ frame. 143 CPP_NORMAL = 0, 144 145 // A special C++ frame indicating the start of a run of JS pseudostack 146 // entries. CPP_MARKER_FOR_JS frames are ignored, except for the sp 147 // field. 148 CPP_MARKER_FOR_JS = 1, 149 150 // A normal JS frame. 151 JS_NORMAL = 2, 152 153 // An interpreter JS frame that has OSR-ed into baseline. JS_NORMAL 154 // frames can be converted to JS_OSR and back. JS_OSR frames are 155 // ignored. 
156 JS_OSR = 3, 157 158 KIND_MASK = 0x3, 159 }; 160 161 // Keep these in sync with devtools/client/performance/modules/categories.js 162 enum class Category : uint32_t { 163 OTHER = 1u << 4, 164 CSS = 1u << 5, 165 JS = 1u << 6, 166 GC = 1u << 7, 167 CC = 1u << 8, 168 NETWORK = 1u << 9, 169 GRAPHICS = 1u << 10, 170 STORAGE = 1u << 11, 171 EVENTS = 1u << 12, 172 173 FIRST = OTHER, 174 LAST = EVENTS, 175 176 CATEGORY_MASK = ~uint32_t(Kind::KIND_MASK), 177 }; 178 179 static_assert((uint32_t(Category::FIRST) & uint32_t(Kind::KIND_MASK)) == 0, 180 "Category overlaps with Kind"); 181 isCpp()182 bool isCpp() const { 183 Kind k = kind(); 184 return k == Kind::CPP_NORMAL || k == Kind::CPP_MARKER_FOR_JS; 185 } 186 isJs()187 bool isJs() const { 188 Kind k = kind(); 189 return k == Kind::JS_NORMAL || k == Kind::JS_OSR; 190 } 191 setLabel(const char * aLabel)192 void setLabel(const char* aLabel) { label_ = aLabel; } label()193 const char* label() const { return label_; } 194 dynamicString()195 const char* dynamicString() const { return dynamicString_; } 196 initCppFrame(const char * aLabel,const char * aDynamicString,void * sp,uint32_t aLine,Kind aKind,Category aCategory)197 void initCppFrame(const char* aLabel, const char* aDynamicString, void* sp, 198 uint32_t aLine, Kind aKind, Category aCategory) { 199 label_ = aLabel; 200 dynamicString_ = aDynamicString; 201 spOrScript = sp; 202 lineOrPcOffset = static_cast<int32_t>(aLine); 203 kindAndCategory_ = uint32_t(aKind) | uint32_t(aCategory); 204 MOZ_ASSERT(isCpp()); 205 } 206 initJsFrame(const char * aLabel,const char * aDynamicString,JSScript * aScript,jsbytecode * aPc)207 void initJsFrame(const char* aLabel, const char* aDynamicString, 208 JSScript* aScript, jsbytecode* aPc) { 209 label_ = aLabel; 210 dynamicString_ = aDynamicString; 211 spOrScript = aScript; 212 lineOrPcOffset = pcToOffset(aScript, aPc); 213 kindAndCategory_ = uint32_t(Kind::JS_NORMAL) | uint32_t(Category::JS); 214 MOZ_ASSERT(isJs()); 215 } 216 setKind(Kind 
aKind)217 void setKind(Kind aKind) { 218 kindAndCategory_ = uint32_t(aKind) | uint32_t(category()); 219 } 220 kind()221 Kind kind() const { 222 return Kind(kindAndCategory_ & uint32_t(Kind::KIND_MASK)); 223 } 224 category()225 Category category() const { 226 return Category(kindAndCategory_ & uint32_t(Category::CATEGORY_MASK)); 227 } 228 stackAddress()229 void* stackAddress() const { 230 MOZ_ASSERT(!isJs()); 231 return spOrScript; 232 } 233 234 JS_PUBLIC_API JSScript* script() const; 235 line()236 uint32_t line() const { 237 MOZ_ASSERT(!isJs()); 238 return static_cast<uint32_t>(lineOrPcOffset); 239 } 240 241 // Note that the pointer returned might be invalid. rawScript()242 JSScript* rawScript() const { 243 MOZ_ASSERT(isJs()); 244 void* script = spOrScript; 245 return static_cast<JSScript*>(script); 246 } 247 248 // We can't know the layout of JSScript, so look in vm/GeckoProfiler.cpp. 249 JS_FRIEND_API jsbytecode* pc() const; 250 void setPC(jsbytecode* pc); 251 252 void trace(JSTracer* trc); 253 254 // The offset of a pc into a script's code can actually be 0, so to 255 // signify a nullptr pc, use a -1 index. This is checked against in 256 // pc() and setPC() to set/get the right pc. 257 static const int32_t NullPCOffset = -1; 258 }; 259 260 JS_FRIEND_API void SetContextProfilingStack(JSContext* cx, 261 PseudoStack* pseudoStack); 262 263 // GetContextProfilingStack also exists, but it's defined in RootingAPI.h. 264 265 JS_FRIEND_API void EnableContextProfilingStack(JSContext* cx, bool enabled); 266 267 JS_FRIEND_API void RegisterContextProfilingEventMarker(JSContext* cx, 268 void (*fn)(const char*)); 269 270 } // namespace js 271 272 // Each thread has its own PseudoStack. That thread modifies the PseudoStack, 273 // pushing and popping elements as necessary. 274 // 275 // The PseudoStack is also read periodically by the profiler's sampler thread. 276 // This happens only when the thread that owns the PseudoStack is suspended. 
So 277 // there are no genuine parallel accesses. 278 // 279 // However, it is possible for pushing/popping to be interrupted by a periodic 280 // sample. Because of this, we need pushing/popping to be effectively atomic. 281 // 282 // - When pushing a new entry, we increment the stack pointer -- making the new 283 // entry visible to the sampler thread -- only after the new entry has been 284 // fully written. The stack pointer is Atomic<uint32_t,ReleaseAcquire>, so 285 // the increment is a release-store, which ensures that this store is not 286 // reordered before the writes of the entry. 287 // 288 // - When popping an old entry, the only operation is the decrementing of the 289 // stack pointer, which is obviously atomic. 290 // 291 class PseudoStack final { 292 public: PseudoStack()293 PseudoStack() : stackPointer(0) {} 294 ~PseudoStack()295 ~PseudoStack() { 296 // The label macros keep a reference to the PseudoStack to avoid a TLS 297 // access. If these are somehow not all cleared we will get a 298 // use-after-free so better to crash now. 299 MOZ_RELEASE_ASSERT(stackPointer == 0); 300 } 301 pushCppFrame(const char * label,const char * dynamicString,void * sp,uint32_t line,js::ProfileEntry::Kind kind,js::ProfileEntry::Category category)302 void pushCppFrame(const char* label, const char* dynamicString, void* sp, 303 uint32_t line, js::ProfileEntry::Kind kind, 304 js::ProfileEntry::Category category) { 305 if (stackPointer < MaxEntries) { 306 entries[stackPointer].initCppFrame(label, dynamicString, sp, line, kind, 307 category); 308 } 309 310 // This must happen at the end! The compiler will not reorder this 311 // update because stackPointer is Atomic<..., ReleaseAcquire>, so any 312 // the writes above will not be reordered below the stackPointer store. 
313 // Do the read and the write as two separate statements, in order to 314 // make it clear that we don't need an atomic increment, which would be 315 // more expensive on x86 than the separate operations done here. 316 // This thread is the only one that ever changes the value of 317 // stackPointer. 318 uint32_t oldStackPointer = stackPointer; 319 stackPointer = oldStackPointer + 1; 320 } 321 pushJsFrame(const char * label,const char * dynamicString,JSScript * script,jsbytecode * pc)322 void pushJsFrame(const char* label, const char* dynamicString, 323 JSScript* script, jsbytecode* pc) { 324 if (stackPointer < MaxEntries) { 325 entries[stackPointer].initJsFrame(label, dynamicString, script, pc); 326 } 327 328 // This must happen at the end! The compiler will not reorder this 329 // update because stackPointer is Atomic<..., ReleaseAcquire>, which 330 // makes this assignment a release-store, so the writes above will not 331 // be reordered to occur after the stackPointer store. 332 // Do the read and the write as two separate statements, in order to 333 // make it clear that we don't need an atomic increment, which would be 334 // more expensive on x86 than the separate operations done here. 335 // This thread is the only one that ever changes the value of 336 // stackPointer. 337 uint32_t oldStackPointer = stackPointer; 338 stackPointer = oldStackPointer + 1; 339 } 340 pop()341 void pop() { 342 MOZ_ASSERT(stackPointer > 0); 343 // Do the read and the write as two separate statements, in order to 344 // make it clear that we don't need an atomic decrement, which would be 345 // more expensive on x86 than the separate operations done here. 346 // This thread is the only one that ever changes the value of 347 // stackPointer. 
348 uint32_t oldStackPointer = stackPointer; 349 stackPointer = oldStackPointer - 1; 350 } 351 stackSize()352 uint32_t stackSize() const { 353 return std::min(uint32_t(stackPointer), uint32_t(MaxEntries)); 354 } 355 356 private: 357 // No copying. 358 PseudoStack(const PseudoStack&) = delete; 359 void operator=(const PseudoStack&) = delete; 360 361 public: 362 static const uint32_t MaxEntries = 1024; 363 364 // The stack entries. 365 js::ProfileEntry entries[MaxEntries]; 366 367 // This may exceed MaxEntries, so instead use the stackSize() method to 368 // determine the number of valid samples in entries. When this is less 369 // than MaxEntries, it refers to the first free entry past the top of the 370 // in-use stack (i.e. entries[stackPointer - 1] is the top stack entry). 371 // 372 // WARNING WARNING WARNING 373 // 374 // This is an atomic variable that uses ReleaseAcquire memory ordering. 375 // See the "Concurrency considerations" paragraph at the top of this file 376 // for more details. 
377 mozilla::Atomic<uint32_t, mozilla::ReleaseAcquire> stackPointer; 378 }; 379 380 namespace js { 381 382 class AutoGeckoProfilerEntry; 383 class GeckoProfilerEntryMarker; 384 class GeckoProfilerBaselineOSRMarker; 385 386 class GeckoProfilerThread { 387 friend class AutoGeckoProfilerEntry; 388 friend class GeckoProfilerEntryMarker; 389 friend class GeckoProfilerBaselineOSRMarker; 390 391 PseudoStack* pseudoStack_; 392 393 public: 394 GeckoProfilerThread(); 395 stackPointer()396 uint32_t stackPointer() { 397 MOZ_ASSERT(installed()); 398 return pseudoStack_->stackPointer; 399 } stack()400 ProfileEntry* stack() { return pseudoStack_->entries; } getPseudoStack()401 PseudoStack* getPseudoStack() { return pseudoStack_; } 402 403 /* management of whether instrumentation is on or off */ installed()404 bool installed() { return pseudoStack_ != nullptr; } 405 406 void setProfilingStack(PseudoStack* pseudoStack); 407 void trace(JSTracer* trc); 408 409 /* 410 * Functions which are the actual instrumentation to track run information 411 * 412 * - enter: a function has started to execute 413 * - updatePC: updates the pc information about where a function 414 * is currently executing 415 * - exit: this function has ceased execution, and no further 416 * entries/exits will be made 417 */ 418 bool enter(JSContext* cx, JSScript* script, JSFunction* maybeFun); 419 void exit(JSScript* script, JSFunction* maybeFun); 420 inline void updatePC(JSContext* cx, JSScript* script, jsbytecode* pc); 421 }; 422 423 } // namespace js 424 425 #endif /* js_ProfilingStack_h */ 426