1 /*
2     Title:      Profiling
3     Author:     Dave Matthews, Cambridge University Computer Laboratory
4 
5     Copyright (c) 2000-7
6         Cambridge University Technical Services Limited
7     Further development copyright (c) David C.J. Matthews 2011, 2015, 2020
8 
9     This library is free software; you can redistribute it and/or
10     modify it under the terms of the GNU Lesser General Public
11     License version 2.1 as published by the Free Software Foundation.
12 
13     This library is distributed in the hope that it will be useful,
14     but WITHOUT ANY WARRANTY; without even the implied warranty of
15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16     Lesser General Public License for more details.
17 
18     You should have received a copy of the GNU Lesser General Public
19     License along with this library; if not, write to the Free Software
20     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
21 
22 */
23 
24 #ifdef HAVE_CONFIG_H
25 #include "config.h"
26 #elif defined(_WIN32)
27 #include "winconfig.h"
28 #else
29 #error "No configuration file"
30 #endif
31 
32 #ifdef HAVE_STDLIB_H
33 #include <stdlib.h>
34 #endif
35 
36 #ifdef HAVE_MALLOC_H
37 #include <malloc.h>
38 #endif
39 
40 #ifdef HAVE_ASSERT_H
41 #include <assert.h>
42 #define ASSERT(x) assert(x)
43 #else
44 #define ASSERT(x) 0
45 #endif
46 
47 #include "globals.h"
48 #include "arb.h"
49 #include "processes.h"
50 #include "polystring.h"
51 #include "profiling.h"
52 #include "save_vec.h"
53 #include "rts_module.h"
54 #include "memmgr.h"
55 #include "scanaddrs.h"
56 #include "locking.h"
57 #include "run_time.h"
58 #include "sys.h"
59 #include "rtsentry.h"
60 
61 extern "C" {
62     POLYEXTERNALSYMBOL POLYUNSIGNED PolyProfiling(FirstArgument threadId, PolyWord mode);
63 }
64 
65 static long mainThreadCounts[MTP_MAXENTRY];
66 static const char* const mainThreadText[MTP_MAXENTRY] =
67 {
68     "UNKNOWN",
69     "GARBAGE COLLECTION (sharing phase)",
70     "GARBAGE COLLECTION (mark phase)",
71     "GARBAGE COLLECTION (copy phase)",
72     "GARBAGE COLLECTION (update phase)",
73     "GARBAGE COLLECTION (minor collection)",
74     "Common data sharing",
75     "Exporting",
76     "Saving state",
77     "Loading saved state",
78     "Profiling",
79     "Setting signal handler",
80     "Cygwin spawn",
81     "Storing module",
82     "Loading module"
83 };
84 
85 // Entries for store profiling
86 enum _extraStore {
87     EST_CODE = 0,
88     EST_STRING,
89     EST_BYTE,
90     EST_WORD,
91     EST_MUTABLE,
92     EST_MUTABLEBYTE,
93     EST_MAX_ENTRY
94 };
95 
96 static POLYUNSIGNED extraStoreCounts[EST_MAX_ENTRY];
97 static const char * const extraStoreText[EST_MAX_ENTRY] =
98 {
99     "Function code",
100     "Strings",
101     "Byte data (long precision ints etc)",
102     "Unidentified word data",
103     "Unidentified mutable data",
104     "Mutable byte data (profiling counts)"
105 };
106 
107 // Poly strings for "standard" counts.  These are generated from the C strings
108 // above the first time profiling is activated.
109 static PolyWord psRTSString[MTP_MAXENTRY], psExtraStrings[EST_MAX_ENTRY], psGCTotal;
110 
111 ProfileMode profileMode;
112 // If we are just profiling a single thread, this is the thread data.
113 static TaskData *singleThreadProfile = 0;
114 
115 // The queue is processed every 400ms and an entry can be
116 // added every ms of CPU time by each thread.
117 #define PCQUEUESIZE 4000
118 
119 static long queuePtr = 0;
120 static POLYCODEPTR pcQueue[PCQUEUESIZE];
121 
122 // Increment, returning the original value.
incrAtomically(long & p)123 static int incrAtomically(long & p)
124 {
125 #if (defined(HAVE_SYNC_FETCH))
126     return __sync_fetch_and_add(&p, 1);
127 #elif (defined(_WIN32))
128     long newValue = InterlockedIncrement(&p);
129     return newValue - 1;
130 #else
131     return p++;
132 #endif
133 }
134 
135 // Decrement and return new value.
decrAtomically(long & p)136 static int decrAtomically(long & p)
137 {
138 #if (defined(HAVE_SYNC_FETCH))
139      return __sync_sub_and_fetch(&p, 1);
140 #elif (defined(_WIN32))
141     return InterlockedDecrement(&p);
142 #else
143     return --p;
144 #endif
145 }
146 
147 typedef struct _PROFENTRY
148 {
149     POLYUNSIGNED count;
150     PolyWord functionName;
151     struct _PROFENTRY *nextEntry;
152 } PROFENTRY, *PPROFENTRY;
153 
154 class ProfileRequest: public MainThreadRequest
155 {
156 public:
ProfileRequest(unsigned prof,TaskData * pTask)157     ProfileRequest(unsigned prof, TaskData *pTask):
158         MainThreadRequest(MTP_PROFILING), mode(prof), pCallingThread(pTask), pTab(0), errorMessage(0) {}
159     ~ProfileRequest();
160     virtual void Perform();
161     Handle extractAsList(TaskData *taskData);
162 
163 private:
164     void getResults(void);
165     void getProfileResults(PolyWord *bottom, PolyWord *top);
166     PPROFENTRY newProfileEntry(void);
167 
168 private:
169     unsigned mode;
170     TaskData *pCallingThread;
171     PPROFENTRY pTab;
172 
173 public:
174     const char *errorMessage;
175 };
176 
~ProfileRequest()177 ProfileRequest::~ProfileRequest()
178 {
179     PPROFENTRY p = pTab;
180     while (p != 0)
181     {
182         PPROFENTRY toFree = p;
183         p = p->nextEntry;
184         free(toFree);
185     }
186 }
187 
188 // Lock to serialise updates of counts. Only used during update.
189 // Not required when we print the counts since there's only one thread
190 // running then.
191 static PLock countLock;
192 
193 // Get the profile object associated with a piece of code.  Returns null if
194 // there isn't one, in particular if this is in the old format.
getProfileObjectForCode(PolyObject * code)195 static PolyObject *getProfileObjectForCode(PolyObject *code)
196 {
197     ASSERT(code->IsCodeObject());
198     PolyWord *consts;
199     POLYUNSIGNED constCount;
200     code->GetConstSegmentForCode(consts, constCount);
201     if (constCount < 2 || consts[1].AsUnsigned() == 0 || ! consts[1].IsDataPtr()) return 0;
202     PolyObject *profObject = consts[1].AsObjPtr();
203     if (profObject->IsMutable() && profObject->IsByteObject() && profObject->Length() == 1)
204         return profObject;
205     else return 0;
206 }
207 
208 // Adds incr to the profile count for the function pointed at by
209 // pc or by one of its callers.
addSynchronousCount(POLYCODEPTR fpc,POLYUNSIGNED incr)210 void addSynchronousCount(POLYCODEPTR fpc, POLYUNSIGNED incr)
211 {
212     // Check that the pc value is within the heap.  It could be
213     // in the assembly code.
214     PolyObject *codeObj = gMem.FindCodeObject(fpc);
215     if (codeObj)
216     {
217         PolyObject *profObject = getProfileObjectForCode(codeObj);
218         PLocker locker(&countLock);
219         if (profObject)
220             profObject->Set(0, PolyWord::FromUnsigned(profObject->Get(0).AsUnsigned() + incr));
221         return;
222     }
223     // Didn't find it.
224     {
225         PLocker locker(&countLock);
226         incrAtomically(mainThreadCounts[MTP_USER_CODE]);
227     }
228 }
229 
230 
231 // newProfileEntry - Make a new entry in the list
newProfileEntry(void)232 PPROFENTRY ProfileRequest::newProfileEntry(void)
233 {
234     PPROFENTRY newEntry = (PPROFENTRY)malloc(sizeof(PROFENTRY));
235     if (newEntry == 0) { errorMessage = "Insufficient memory"; return 0; }
236     newEntry->nextEntry = pTab;
237     pTab = newEntry;
238     return newEntry;
239 }
240 
241 // We don't use ScanAddress here because we're only interested in the
242 // objects themselves not the addresses in them.
243 // We have to build the list of results in C memory rather than directly in
244 // ML memory because we can't allocate in ML memory in the root thread.
getProfileResults(PolyWord * bottom,PolyWord * top)245 void ProfileRequest::getProfileResults(PolyWord *bottom, PolyWord *top)
246 {
247     PolyWord *ptr = bottom;
248 
249     while (ptr < top)
250     {
251         ptr++; // Skip the length word
252         PolyObject *obj = (PolyObject*)ptr;
253         if (obj->ContainsForwardingPtr())
254         {
255             // This used to be necessary when code objects were held in the
256             // general heap.  Now that we only ever scan code and permanent
257             // areas it's probably not needed.
258             while (obj->ContainsForwardingPtr())
259                 obj = obj->GetForwardingPtr();
260             ASSERT(obj->ContainsNormalLengthWord());
261             ptr += obj->Length();
262         }
263         else
264         {
265             ASSERT(obj->ContainsNormalLengthWord());
266 
267             if (obj->IsCodeObject())
268             {
269                 PolyWord *firstConstant = obj->ConstPtrForCode();
270                 PolyWord name = firstConstant[0];
271                 PolyObject *profCount = getProfileObjectForCode(obj);
272                 if (profCount)
273                 {
274                     POLYUNSIGNED count = profCount->Get(0).AsUnsigned();
275 
276                     if (count != 0)
277                     {
278                         if (name != TAGGED(0))
279                         {
280                             PPROFENTRY pEnt = newProfileEntry();
281                             if (pEnt == 0) return;
282                             pEnt->count = count;
283                             pEnt->functionName = name;
284                         }
285 
286                         profCount->Set(0, PolyWord::FromUnsigned(0));
287                     }
288                 }
289             } /* code object */
290             ptr += obj->Length();
291         } /* else */
292     } /* while */
293 }
294 
getResults(void)295 void ProfileRequest::getResults(void)
296 // Print profiling information and reset profile counts.
297 {
298     for (std::vector<PermanentMemSpace*>::iterator i = gMem.pSpaces.begin(); i < gMem.pSpaces.end(); i++)
299     {
300         MemSpace *space = *i;
301         // Permanent areas are filled with objects from the bottom.
302         getProfileResults(space->bottom, space->top); // Bottom to top
303     }
304     for (std::vector<CodeSpace *>::iterator i = gMem.cSpaces.begin(); i < gMem.cSpaces.end(); i++)
305     {
306         CodeSpace *space = *i;
307         getProfileResults(space->bottom, space->top);
308     }
309 
310     {
311         POLYUNSIGNED gc_count =
312             mainThreadCounts[MTP_GCPHASESHARING]+
313             mainThreadCounts[MTP_GCPHASEMARK]+
314             mainThreadCounts[MTP_GCPHASECOMPACT] +
315             mainThreadCounts[MTP_GCPHASEUPDATE] +
316             mainThreadCounts[MTP_GCQUICK];
317         if (gc_count)
318         {
319             PPROFENTRY pEnt = newProfileEntry();
320             if (pEnt == 0) return; // Report insufficient memory?
321             pEnt->count = gc_count;
322             pEnt->functionName = psGCTotal;
323         }
324     }
325 
326     for (unsigned k = 0; k < MTP_MAXENTRY; k++)
327     {
328         if (mainThreadCounts[k])
329         {
330             PPROFENTRY pEnt = newProfileEntry();
331             if (pEnt == 0) return; // Report insufficient memory?
332             pEnt->count = mainThreadCounts[k];
333             pEnt->functionName = psRTSString[k];
334             mainThreadCounts[k] = 0;
335         }
336     }
337 
338     for (unsigned l = 0; l < EST_MAX_ENTRY; l++)
339     {
340         if (extraStoreCounts[l])
341         {
342             PPROFENTRY pEnt = newProfileEntry();
343             if (pEnt == 0) return; // Report insufficient memory?
344             pEnt->count = extraStoreCounts[l];
345             pEnt->functionName = psExtraStrings[l];
346             extraStoreCounts[l] = 0;
347         }
348     }
349 }
350 
351 // Extract the accumulated results as an ML list of pairs of the count and the string.
extractAsList(TaskData * taskData)352 Handle ProfileRequest::extractAsList(TaskData *taskData)
353 {
354     Handle saved = taskData->saveVec.mark();
355     Handle list = taskData->saveVec.push(ListNull);
356 
357     for (PPROFENTRY p = pTab; p != 0; p = p->nextEntry)
358     {
359         Handle pair = alloc_and_save(taskData, 2);
360         Handle countValue = Make_arbitrary_precision(taskData, p->count);
361         pair->WordP()->Set(0, countValue->Word());
362         pair->WordP()->Set(1, p->functionName);
363         Handle next  = alloc_and_save(taskData, sizeof(ML_Cons_Cell) / sizeof(PolyWord));
364         DEREFLISTHANDLE(next)->h = pair->Word();
365         DEREFLISTHANDLE(next)->t =list->Word();
366 
367         taskData->saveVec.reset(saved);
368         list = taskData->saveVec.push(next->Word());
369     }
370 
371     return list;
372 }
373 
374 // We have had an asynchronous interrupt and found a potential PC but
375 // we're in a signal handler.
incrementCountAsynch(POLYCODEPTR pc)376 void incrementCountAsynch(POLYCODEPTR pc)
377 {
378     int q = incrAtomically(queuePtr);
379     if (q < PCQUEUESIZE) pcQueue[q] = pc;
380 }
381 
382 // Called by the main thread to process the queue of PC values
processProfileQueue()383 void processProfileQueue()
384 {
385     if (queuePtr == 0) return;
386     while (1)
387     {
388         int q = queuePtr;
389         if (q >= PCQUEUESIZE)
390             incrAtomically(mainThreadCounts[MTP_USER_CODE]);
391         else addSynchronousCount(pcQueue[q], 1);
392         if (decrAtomically(queuePtr) == 0) break;
393     }
394 }
395 
396 // Handle a SIGVTALRM or the simulated equivalent in Windows.  This may be called
397 // at any time so we have to be careful.  In particular in Linux this may be
398 // executed by a thread while holding a mutex so we must not do anything, such
399 // calling malloc, that could require locking.
handleProfileTrap(TaskData * taskData,SIGNALCONTEXT * context)400 void handleProfileTrap(TaskData *taskData, SIGNALCONTEXT *context)
401 {
402     if (singleThreadProfile != 0 && singleThreadProfile != taskData)
403         return;
404 
405     /* If we are in the garbage-collector add the count to "gc_count"
406         otherwise try to find out where we are. */
407     if (mainThreadPhase == MTP_USER_CODE)
408     {
409         if (taskData == 0 || ! taskData->AddTimeProfileCount(context))
410             incrAtomically(mainThreadCounts[MTP_USER_CODE]);
411         // On Mac OS X all virtual timer interrupts seem to be directed to the root thread
412         // so all the counts will be "unknown".
413     }
414     else incrAtomically(mainThreadCounts[mainThreadPhase]);
415 }
416 
417 // Called from the GC when allocation profiling is on.
AddObjectProfile(PolyObject * obj)418 void AddObjectProfile(PolyObject *obj)
419 {
420     ASSERT(obj->ContainsNormalLengthWord());
421     POLYUNSIGNED length = obj->Length();
422 
423     if (obj->IsWordObject() && OBJ_HAS_PROFILE(obj->LengthWord()))
424     {
425         // It has a profile pointer.  The last word should point to the
426         // closure or code of the allocating function.  Add the size of this to the count.
427         ASSERT(length != 0);
428         PolyWord profWord = obj->Get(length-1);
429         ASSERT(profWord.IsDataPtr());
430         PolyObject *profObject = profWord.AsObjPtr();
431         ASSERT(profObject->IsMutable() && profObject->IsByteObject() && profObject->Length() == 1);
432         profObject->Set(0, PolyWord::FromUnsigned(profObject->Get(0).AsUnsigned() + length + 1));
433     }
434     // If it doesn't have a profile pointer add it to the appropriate count.
435     else if (obj->IsMutable())
436     {
437         if (obj->IsByteObject())
438             extraStoreCounts[EST_MUTABLEBYTE] += length+1;
439         else extraStoreCounts[EST_MUTABLE] += length+1;
440     }
441     else if (obj->IsCodeObject())
442         extraStoreCounts[EST_CODE] += length+1;
443     else if (obj->IsClosureObject())
444     {
445         ASSERT(0);
446     }
447     else if (obj->IsByteObject())
448     {
449         // Try to separate strings from other byte data.  This is only
450         // approximate.
451         if (OBJ_IS_NEGATIVE(obj->LengthWord()))
452             extraStoreCounts[EST_BYTE] += length+1;
453         else
454         {
455             PolyStringObject *possString = (PolyStringObject*)obj;
456             POLYUNSIGNED bytes = length * sizeof(PolyWord);
457             // If the length of the string as given in the first word is sufficient
458             // to fit in the exact number of words then it's probably a string.
459             if (length >= 2 &&
460                 possString->length <= bytes - sizeof(POLYUNSIGNED) &&
461                 possString->length > bytes - 2 * sizeof(POLYUNSIGNED))
462                     extraStoreCounts[EST_STRING] += length+1;
463             else
464             {
465                 extraStoreCounts[EST_BYTE] += length+1;
466             }
467         }
468     }
469     else extraStoreCounts[EST_WORD] += length+1;
470 }
471 
472 // Called from ML to control profiling.
profilerc(TaskData * taskData,Handle mode_handle)473 static Handle profilerc(TaskData *taskData, Handle mode_handle)
474 /* Profiler - generates statistical profiles of the code.
475    The parameter is an integer which determines the value to be profiled.
476    When profiler is called it always resets the profiling and prints out any
477    values which have been accumulated.
478    If the parameter is 0 this is all it does,
479    if the parameter is 1 then it produces time profiling,
480    if the parameter is 2 it produces store profiling.
481    3 - arbitrary precision emulation traps. */
482 {
483     unsigned mode = get_C_unsigned(taskData, mode_handle->Word());
484     {
485         // Create any strings we need.  We only need to do this once but
486         // it must be done by a non-root thread since it needs a taskData object.
487         // Don't bother locking.  At worst we'll create some garbage.
488         for (unsigned k = 0; k < MTP_MAXENTRY; k++)
489         {
490             if (psRTSString[k] == TAGGED(0))
491                 psRTSString[k] = C_string_to_Poly(taskData, mainThreadText[k]);
492         }
493         for (unsigned k = 0; k < EST_MAX_ENTRY; k++)
494         {
495             if (psExtraStrings[k] == TAGGED(0))
496                 psExtraStrings[k] = C_string_to_Poly(taskData, extraStoreText[k]);
497         }
498         if (psGCTotal == TAGGED(0))
499             psGCTotal = C_string_to_Poly(taskData, "GARBAGE COLLECTION (total)");
500     }
501     // All these actions are performed by the root thread.  Only profile
502     // printing needs to be performed with all the threads stopped but it's
503     // simpler to serialise all requests.
504     ProfileRequest request(mode, taskData);
505     processes->MakeRootRequest(taskData, &request);
506     if (request.errorMessage != 0) raise_exception_string(taskData, EXC_Fail, request.errorMessage);
507     return request.extractAsList(taskData);
508 }
509 
PolyProfiling(FirstArgument threadId,PolyWord mode)510 POLYUNSIGNED PolyProfiling(FirstArgument threadId, PolyWord mode)
511 {
512     TaskData *taskData = TaskData::FindTaskForId(threadId);
513     ASSERT(taskData != 0);
514     taskData->PreRTSCall();
515     Handle reset = taskData->saveVec.mark();
516     Handle pushedMode = taskData->saveVec.push(mode);
517     Handle result = 0;
518 
519     try {
520         result = profilerc(taskData, pushedMode);
521     } catch (...) { } // If an ML exception is raised
522 
523     taskData->saveVec.reset(reset);
524     taskData->PostRTSCall();
525     if (result == 0) return TAGGED(0).AsUnsigned();
526     else return result->Word().AsUnsigned();
527 }
528 
529 // This is called from the root thread when all the ML threads have been paused.
Perform()530 void ProfileRequest::Perform()
531 {
532     if (mode != kProfileOff && profileMode != kProfileOff)
533     {
534         // Profiling must be stopped first.
535         errorMessage = "Profiling is currently active";
536         return;
537     }
538 
539     singleThreadProfile = 0; // Unless kProfileTimeThread is given this should be 0
540 
541     switch (mode)
542     {
543     case kProfileOff:
544         // Turn off old profiling mechanism and print out accumulated results
545         profileMode = kProfileOff;
546         processes->StopProfiling();
547         getResults();
548         // Remove all the bitmaps to free up memory
549         gMem.RemoveProfilingBitmaps();
550         break;
551 
552     case kProfileTimeThread:
553         singleThreadProfile = pCallingThread;
554         // And drop through to kProfileTime
555 
556     case kProfileTime:
557         profileMode = kProfileTime;
558         processes->StartProfiling();
559         break;
560 
561     case kProfileStoreAllocation:
562         profileMode = kProfileStoreAllocation;
563         break;
564 
565     case kProfileEmulation:
566         profileMode = kProfileEmulation;
567         break;
568 
569     case kProfileLiveData:
570         profileMode = kProfileLiveData;
571         break;
572 
573     case kProfileLiveMutables:
574         profileMode = kProfileLiveMutables;
575         break;
576 
577     case kProfileMutexContention:
578         profileMode = kProfileMutexContention;
579         break;
580 
581     default: /* do nothing */
582         break;
583     }
584 
585 }
586 
587 struct _entrypts profilingEPT[] =
588 {
589     // Profiling
590     { "PolyProfiling",                  (polyRTSFunction)&PolyProfiling},
591 
592     { NULL, NULL} // End of list.
593 };
594 
595 
596 class Profiling: public RtsModule
597 {
598 public:
599     virtual void Init(void);
600     virtual void GarbageCollect(ScanAddress *process);
601 };
602 
603 // Declare this.  It will be automatically added to the table.
604 static Profiling profileModule;
605 
Init(void)606 void Profiling::Init(void)
607 {
608     // Reset profiling counts.
609     profileMode = kProfileOff;
610     for (unsigned k = 0; k < MTP_MAXENTRY; k++) mainThreadCounts[k] = 0;
611 }
612 
GarbageCollect(ScanAddress * process)613 void Profiling::GarbageCollect(ScanAddress *process)
614 {
615     // Process any strings in the table.
616     for (unsigned k = 0; k < MTP_MAXENTRY; k++)
617         process->ScanRuntimeWord(&psRTSString[k]);
618     for (unsigned k = 0; k < EST_MAX_ENTRY; k++)
619         process->ScanRuntimeWord(&psExtraStrings[k]);
620     process->ScanRuntimeWord(&psGCTotal);
621 }
622