1 /*===- InstrProfilingValue.c - Support library for PGO instrumentation ----===*\
2 |*
3 |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 |* See https://llvm.org/LICENSE.txt for license information.
5 |* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 |*
7 \*===----------------------------------------------------------------------===*/
8 
9 #include <assert.h>
10 #include <limits.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include "InstrProfiling.h"
16 #include "InstrProfilingInternal.h"
17 #include "InstrProfilingUtil.h"
18 
19 #define INSTR_PROF_VALUE_PROF_DATA
20 #define INSTR_PROF_COMMON_API_IMPL
21 #define INSTR_PROF_VALUE_PROF_MEMOP_API
22 #include "profile/InstrProfData.inc"
23 
24 static int hasStaticCounters = 1;
25 static int OutOfNodesWarnings = 0;
26 static int hasNonDefaultValsPerSite = 0;
27 #define INSTR_PROF_MAX_VP_WARNS 10
28 #define INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE 24
29 #define INSTR_PROF_VNODE_POOL_SIZE 1024
30 
31 #ifndef _MSC_VER
32 /* A shared static pool in addition to the vnodes statically
33  * allocated by the compiler.  */
34 COMPILER_RT_VISIBILITY ValueProfNode
35     lprofValueProfNodes[INSTR_PROF_VNODE_POOL_SIZE] COMPILER_RT_SECTION(
36        COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME);
37 #endif
38 
39 COMPILER_RT_VISIBILITY uint32_t VPMaxNumValsPerSite =
40     INSTR_PROF_DEFAULT_NUM_VAL_PER_SITE;
41 
42 COMPILER_RT_VISIBILITY void lprofSetupValueProfiler(void) {
43   const char *Str = 0;
44   Str = getenv("LLVM_VP_MAX_NUM_VALS_PER_SITE");
45   if (Str && Str[0]) {
46     VPMaxNumValsPerSite = atoi(Str);
47     hasNonDefaultValsPerSite = 1;
48   }
49   if (VPMaxNumValsPerSite > INSTR_PROF_MAX_NUM_VAL_PER_SITE)
50     VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
51 }
52 
53 COMPILER_RT_VISIBILITY void lprofSetMaxValsPerSite(uint32_t MaxVals) {
54   VPMaxNumValsPerSite = MaxVals;
55   hasNonDefaultValsPerSite = 1;
56 }
57 
58 /* This method is only used in value profiler mock testing.  */
59 COMPILER_RT_VISIBILITY void
60 __llvm_profile_set_num_value_sites(__llvm_profile_data *Data,
61                                    uint32_t ValueKind, uint16_t NumValueSites) {
62 #ifdef __GNUC__
63 #pragma GCC diagnostic push
64 #pragma GCC diagnostic ignored "-Wcast-qual"
65 #endif
66   *((uint16_t *)&Data->NumValueSites[ValueKind]) = NumValueSites;
67 #ifdef __GNUC__
68 #pragma GCC diagnostic pop
69 #endif
70 }
71 
72 /* This method is only used in value profiler mock testing.  */
73 COMPILER_RT_VISIBILITY const __llvm_profile_data *
74 __llvm_profile_iterate_data(const __llvm_profile_data *Data) {
75   return Data + 1;
76 }
77 
78 /* This method is only used in value profiler mock testing.  */
79 COMPILER_RT_VISIBILITY void *
80 __llvm_get_function_addr(const __llvm_profile_data *Data) {
81   return Data->FunctionPointer;
82 }
83 
84 /* Allocate an array that holds the pointers to the linked lists of
85  * value profile counter nodes. The number of element of the array
86  * is the total number of value profile sites instrumented. Returns
87  * 0 if allocation fails.
88  */
89 
90 static int allocateValueProfileCounters(__llvm_profile_data *Data) {
91   uint64_t NumVSites = 0;
92   uint32_t VKI;
93 
94   /* This function will never be called when value site array is allocated
95      statically at compile time.  */
96   hasStaticCounters = 0;
97   /* When dynamic allocation is enabled, allow tracking the max number of
98    * values allowd.  */
99   if (!hasNonDefaultValsPerSite)
100     VPMaxNumValsPerSite = INSTR_PROF_MAX_NUM_VAL_PER_SITE;
101 
102   for (VKI = IPVK_First; VKI <= IPVK_Last; ++VKI)
103     NumVSites += Data->NumValueSites[VKI];
104 
105   // If NumVSites = 0, calloc is allowed to return a non-null pointer.
106   assert(NumVSites > 0 && "NumVSites can't be zero");
107   ValueProfNode **Mem =
108       (ValueProfNode **)calloc(NumVSites, sizeof(ValueProfNode *));
109   if (!Mem)
110     return 0;
111   if (!COMPILER_RT_BOOL_CMPXCHG(&Data->Values, 0, Mem)) {
112     free(Mem);
113     return 0;
114   }
115   return 1;
116 }
117 
118 static ValueProfNode *allocateOneNode(void) {
119   ValueProfNode *Node;
120 
121   if (!hasStaticCounters)
122     return (ValueProfNode *)calloc(1, sizeof(ValueProfNode));
123 
124   /* Early check to avoid value wrapping around.  */
125   if (CurrentVNode + 1 > EndVNode) {
126     if (OutOfNodesWarnings++ < INSTR_PROF_MAX_VP_WARNS) {
127       PROF_WARN("Unable to track new values: %s. "
128                 " Consider using option -mllvm -vp-counters-per-site=<n> to "
129                 "allocate more"
130                 " value profile counters at compile time. \n",
131                 "Running out of static counters");
132     }
133     return 0;
134   }
135   Node = COMPILER_RT_PTR_FETCH_ADD(ValueProfNode, CurrentVNode, 1);
136   /* Due to section padding, EndVNode point to a byte which is one pass
137    * an incomplete VNode, so we need to skip the last incomplete node. */
138   if (Node + 1 > EndVNode)
139     return 0;
140 
141   return Node;
142 }
143 
144 static COMPILER_RT_ALWAYS_INLINE void
145 instrumentTargetValueImpl(uint64_t TargetValue, void *Data,
146                           uint32_t CounterIndex, uint64_t CountValue) {
147   __llvm_profile_data *PData = (__llvm_profile_data *)Data;
148   if (!PData)
149     return;
150   if (!CountValue)
151     return;
152   if (!PData->Values) {
153     if (!allocateValueProfileCounters(PData))
154       return;
155   }
156 
157   ValueProfNode **ValueCounters = (ValueProfNode **)PData->Values;
158   ValueProfNode *PrevVNode = NULL;
159   ValueProfNode *MinCountVNode = NULL;
160   ValueProfNode *CurVNode = ValueCounters[CounterIndex];
161   uint64_t MinCount = UINT64_MAX;
162 
163   uint8_t VDataCount = 0;
164   while (CurVNode) {
165     if (TargetValue == CurVNode->Value) {
166       CurVNode->Count += CountValue;
167       return;
168     }
169     if (CurVNode->Count < MinCount) {
170       MinCount = CurVNode->Count;
171       MinCountVNode = CurVNode;
172     }
173     PrevVNode = CurVNode;
174     CurVNode = CurVNode->Next;
175     ++VDataCount;
176   }
177 
178   if (VDataCount >= VPMaxNumValsPerSite) {
179     /* Bump down the min count node's count. If it reaches 0,
180      * evict it. This eviction/replacement policy makes hot
181      * targets more sticky while cold targets less so. In other
182      * words, it makes it less likely for the hot targets to be
183      * prematurally evicted during warmup/establishment period,
184      * when their counts are still low. In a special case when
185      * the number of values tracked is reduced to only one, this
186      * policy will guarantee that the dominating target with >50%
187      * total count will survive in the end. Note that this scheme
188      * allows the runtime to track the min count node in an adaptive
189      * manner. It can correct previous mistakes and eventually
190      * lock on a cold target that is alread in stable state.
191      *
192      * In very rare cases,  this replacement scheme may still lead
193      * to target loss. For instance, out of \c N value slots, \c N-1
194      * slots are occupied by luke warm targets during the warmup
195      * period and the remaining one slot is competed by two or more
196      * very hot targets. If those hot targets occur in an interleaved
197      * way, none of them will survive (gain enough weight to throw out
198      * other established entries) due to the ping-pong effect.
199      * To handle this situation, user can choose to increase the max
200      * number of tracked values per value site. Alternatively, a more
201      * expensive eviction mechanism can be implemented. It requires
202      * the runtime to track the total number of evictions per-site.
203      * When the total number of evictions reaches certain threshold,
204      * the runtime can wipe out more than one lowest count entries
205      * to give space for hot targets.
206      */
207     if (MinCountVNode->Count <= CountValue) {
208       CurVNode = MinCountVNode;
209       CurVNode->Value = TargetValue;
210       CurVNode->Count = CountValue;
211     } else
212       MinCountVNode->Count -= CountValue;
213 
214     return;
215   }
216 
217   CurVNode = allocateOneNode();
218   if (!CurVNode)
219     return;
220   CurVNode->Value = TargetValue;
221   CurVNode->Count += CountValue;
222 
223   uint32_t Success = 0;
224   if (!ValueCounters[CounterIndex])
225     Success =
226         COMPILER_RT_BOOL_CMPXCHG(&ValueCounters[CounterIndex], 0, CurVNode);
227   else if (PrevVNode && !PrevVNode->Next)
228     Success = COMPILER_RT_BOOL_CMPXCHG(&(PrevVNode->Next), 0, CurVNode);
229 
230   if (!Success && !hasStaticCounters) {
231     free(CurVNode);
232     return;
233   }
234 }
235 
236 COMPILER_RT_VISIBILITY void
237 __llvm_profile_instrument_target(uint64_t TargetValue, void *Data,
238                                  uint32_t CounterIndex) {
239   instrumentTargetValueImpl(TargetValue, Data, CounterIndex, 1);
240 }
241 COMPILER_RT_VISIBILITY void
242 __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data,
243                                        uint32_t CounterIndex,
244                                        uint64_t CountValue) {
245   instrumentTargetValueImpl(TargetValue, Data, CounterIndex, CountValue);
246 }
247 
248 /*
249  * The target values are partitioned into multiple ranges. The range spec is
250  * defined in InstrProfData.inc.
251  */
252 COMPILER_RT_VISIBILITY void
253 __llvm_profile_instrument_memop(uint64_t TargetValue, void *Data,
254                                 uint32_t CounterIndex) {
255   // Map the target value to the representative value of its range.
256   uint64_t RepValue = InstrProfGetRangeRepValue(TargetValue);
257   __llvm_profile_instrument_target(RepValue, Data, CounterIndex);
258 }
259 
260 /*
261  * A wrapper struct that represents value profile runtime data.
262  * Like InstrProfRecord class which is used by profiling host tools,
263  * ValueProfRuntimeRecord also implements the abstract interfaces defined in
264  * ValueProfRecordClosure so that the runtime data can be serialized using
265  * shared C implementation.
266  */
267 typedef struct ValueProfRuntimeRecord {
268   const __llvm_profile_data *Data;
269   ValueProfNode **NodesKind[IPVK_Last + 1];
270   uint8_t **SiteCountArray;
271 } ValueProfRuntimeRecord;
272 
273 /* ValueProfRecordClosure Interface implementation. */
274 
275 static uint32_t getNumValueSitesRT(const void *R, uint32_t VK) {
276   return ((const ValueProfRuntimeRecord *)R)->Data->NumValueSites[VK];
277 }
278 
279 static uint32_t getNumValueDataRT(const void *R, uint32_t VK) {
280   uint32_t S = 0, I;
281   const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
282   if (Record->SiteCountArray[VK] == INSTR_PROF_NULLPTR)
283     return 0;
284   for (I = 0; I < Record->Data->NumValueSites[VK]; I++)
285     S += Record->SiteCountArray[VK][I];
286   return S;
287 }
288 
289 static uint32_t getNumValueDataForSiteRT(const void *R, uint32_t VK,
290                                          uint32_t S) {
291   const ValueProfRuntimeRecord *Record = (const ValueProfRuntimeRecord *)R;
292   return Record->SiteCountArray[VK][S];
293 }
294 
295 static ValueProfRuntimeRecord RTRecord;
296 static ValueProfRecordClosure RTRecordClosure = {
297     &RTRecord,          INSTR_PROF_NULLPTR, /* GetNumValueKinds */
298     getNumValueSitesRT, getNumValueDataRT,  getNumValueDataForSiteRT,
299     INSTR_PROF_NULLPTR, /* RemapValueData */
300     INSTR_PROF_NULLPTR, /* GetValueForSite, */
301     INSTR_PROF_NULLPTR  /* AllocValueProfData */
302 };
303 
304 static uint32_t
305 initializeValueProfRuntimeRecord(const __llvm_profile_data *Data,
306                                  uint8_t *SiteCountArray[]) {
307   unsigned I, J, S = 0, NumValueKinds = 0;
308   ValueProfNode **Nodes = (ValueProfNode **)Data->Values;
309   RTRecord.Data = Data;
310   RTRecord.SiteCountArray = SiteCountArray;
311   for (I = 0; I <= IPVK_Last; I++) {
312     uint16_t N = Data->NumValueSites[I];
313     if (!N)
314       continue;
315 
316     NumValueKinds++;
317 
318     RTRecord.NodesKind[I] = Nodes ? &Nodes[S] : INSTR_PROF_NULLPTR;
319     for (J = 0; J < N; J++) {
320       /* Compute value count for each site. */
321       uint32_t C = 0;
322       ValueProfNode *Site =
323           Nodes ? RTRecord.NodesKind[I][J] : INSTR_PROF_NULLPTR;
324       while (Site) {
325         C++;
326         Site = Site->Next;
327       }
328       if (C > UCHAR_MAX)
329         C = UCHAR_MAX;
330       RTRecord.SiteCountArray[I][J] = C;
331     }
332     S += N;
333   }
334   return NumValueKinds;
335 }
336 
337 static ValueProfNode *getNextNValueData(uint32_t VK, uint32_t Site,
338                                         InstrProfValueData *Dst,
339                                         ValueProfNode *StartNode, uint32_t N) {
340   unsigned I;
341   ValueProfNode *VNode = StartNode ? StartNode : RTRecord.NodesKind[VK][Site];
342   for (I = 0; I < N; I++) {
343     Dst[I].Value = VNode->Value;
344     Dst[I].Count = VNode->Count;
345     VNode = VNode->Next;
346   }
347   return VNode;
348 }
349 
350 static uint32_t getValueProfDataSizeWrapper(void) {
351   return getValueProfDataSize(&RTRecordClosure);
352 }
353 
354 static uint32_t getNumValueDataForSiteWrapper(uint32_t VK, uint32_t S) {
355   return getNumValueDataForSiteRT(&RTRecord, VK, S);
356 }
357 
358 static VPDataReaderType TheVPDataReader = {
359     initializeValueProfRuntimeRecord, getValueProfRecordHeaderSize,
360     getFirstValueProfRecord,          getNumValueDataForSiteWrapper,
361     getValueProfDataSizeWrapper,      getNextNValueData};
362 
363 COMPILER_RT_VISIBILITY VPDataReaderType *lprofGetVPDataReader(void) {
364   return &TheVPDataReader;
365 }
366