1 /* cilk-abi-vla.cpp                  -*-C++-*-
2  *
3  *************************************************************************
4  *
5  *  @copyright
6  *  Copyright (C) 2013, Intel Corporation
7  *  All rights reserved.
8  *
9  *  @copyright
10  *  Redistribution and use in source and binary forms, with or without
11  *  modification, are permitted provided that the following conditions
12  *  are met:
13  *
14  *    * Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *    * Redistributions in binary form must reproduce the above copyright
17  *      notice, this list of conditions and the following disclaimer in
18  *      the documentation and/or other materials provided with the
19  *      distribution.
20  *    * Neither the name of Intel Corporation nor the names of its
21  *      contributors may be used to endorse or promote products derived
22  *      from this software without specific prior written permission.
23  *
24  *  @copyright
25  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26  *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27  *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28  *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29  *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30  *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31  *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
32  *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
33  *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
35  *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  *  POSSIBILITY OF SUCH DAMAGE.
37  *
38  **************************************************************************/
39 
40 /*
41  * Implementation of Variable Length Array (VLA) ABI.
42  *
 * __cilkrts_stack_alloc() and __cilkrts_stack_free() must be compiled
44  * such that ebp/rbp is used for the stack frames.  This is done by having
45  * each of them use alloca, which forces the special frame types needed on
46  * each of the ABIs.  Additionally, for some forms of stack frame, special
47  * care must be taken because the alloca space may not be at the bottom of the
48  * stack frame of the caller.  For Intel64 windows, and for some options
49  * with other ABIs, a preallocated parameter block may exist on the stack
50  * at a lower address than the alloca.  If this is the case, the parameter
51  * distance_from_sp_to_alloca_area will be non-zero, and will indicate how
52  * much pre-allocated parameter space resides in the caller's stack frame
53  * between the alloca area, and the bottom of the stack when the call to
54  * the cilkrts is made.  As such, when non-zero it also includes any space
55  * used for passing the cilkrts_stack_alloc or cilkrts_stack_free parameters.
56  */
57 
58 #include <assert.h>
59 #include <stdlib.h>
60 #include <stdint.h>
61 
62 // Getting a definition for alloca appears to be a pain in the butt.  Here's
63 // a variant on what's recommended in the autoconf doc
64 #if defined _MSC_VER
65 # include <malloc.h>
66 # define alloca _alloca
67 #elif defined HAVE_ALLOCA_H
68 # include <alloca.h>
69 #elif defined __GNUC__
70 # define alloca __builtin_alloca
71 #elif defined _AIX
72 # define alloca __alloca
73 #else
74 # include <stddef.h>
75 # ifdef  __cplusplus
76 extern "C"
77 # endif
78 void *alloca (size_t);
79 #endif
80 
81 #ifdef _WIN32
82 # define INLINE static __inline
83 # pragma warning(disable:1025)  // Don't whine about zero extending result of unary operation
84 #else
85 # define INLINE static inline
86 #endif
87 
88 
89 #include "internal/abi.h"
90 #include "cilk-abi-vla-internal.h"
91 
92 #if defined(__x86_64) || defined(_M_X64)
/* Set the stack pointer (rsp) to val.  Used after the frame has been
 * copied so that the eventual return sequence unwinds from the moved
 * location. */
INLINE void setsp(void *val)
{
    __asm__("movq %0, %%rsp" : : "r"(val): "rsp");
}
/* Return the current value of the stack pointer (rsp).
 * NOTE(review): the caller's own frame (including its locals) lies above
 * the value read here; the VLA code below compensates with
 * distance_from_sp_to_alloca_area / param_size arithmetic. */
INLINE char* getsp(void)
{
    void *res;

    __asm__("movq %%rsp, %0" : "=r"(res): : "rsp");
    return res;
}
/* Return the current value of the frame pointer (rbp).  The VLA ABI
 * requires callers of these entry points to use rbp-based frames (forced
 * by their use of alloca), so rbp is meaningful here. */
INLINE char* getbp(void)
{
    void *res;

    __asm__("movq %%rbp, %0" : "=r"(res): : "rbp");
    return res;
}
/* Copy cpy_bytes of stack frame from src down to dst (dst < src), then
 * install new_ebp as the frame pointer.  Used by __cilkrts_stack_alloc to
 * slide the current frame down after growing the stack for a VLA. */
INLINE void copy_frame_down_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // In this version, dst is guaranteed to be lower address than src,
    // therefore copying upwards from src into dst is safe in case
    // there is overlap. The number of bytes is also guaranteed to be
    // a multiple of 8, and the copy is done in 64 bit word chunks for
    // best efficiency.
    __asm__(
        "movq %0, %%rdi;"      // destination
        "movq %1, %%rsi;"      // source
        "movq %2, %%rcx;"
        "shrq $3, %%rcx;"      // byte count -> 64-bit word count
        "rep movsq;"           // ascending copy (DF clear)
        "movq %3, %%rbp" :     // switch to the relocated frame pointer
        :
        "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
        "rsi", "rdi", "rcx", "rbp", "memory");
}
copy_frame_up_and_move_bp(char * dst,char * src,size_t cpy_bytes,char * new_ebp)134 INLINE void copy_frame_up_and_move_bp(
135     char *dst,
136     char *src,
137     size_t cpy_bytes,
138     char *new_ebp
139 )
140 {
141     // In this version, dst is guaranteed to be higher address than src,
142     // therefore copying downwards from src into dst is safe in case
143     // there is overlap. The number of bytes is also guaranteed to be
144     // a multiple of 8, and the copy is done in 64 bit word chunks for
145     // best efficiency.
146     dst += cpy_bytes - 8;
147     src += cpy_bytes - 8;
148     __asm__(
149         "movq %0, %%rdi;"
150         "movq %1, %%rsi;"
151         "movq %2, %%rcx;"
152         "shrq $3, %%rcx;"
153         "std; rep movsq; cld;"
154         "movl %3, %%rbp;" :
155         :
156         "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
157         "rsi", "rdi", "rcx", "rbp", "memory");
158 }
159 #else
/* Set the stack pointer (esp) to val (IA-32 variant). */
INLINE void setsp(void *val)
{
    __asm__("movl %0, %%esp" : : "r"(val): "esp");
}
/* Return the current value of the stack pointer (esp) (IA-32 variant). */
INLINE char* getsp(void)
{
    void *res;

    __asm__("movl %%esp, %0" : "=r"(res): : "esp");
    return res;
}
/* Return the current value of the frame pointer (ebp) (IA-32 variant). */
INLINE char* getbp(void)
{
    void *res;

    __asm__("movl %%ebp, %0" : "=r"(res): : "ebp");
    return res;
}
/* Copy cpy_bytes of stack frame from src down to dst (dst < src), then
 * install new_ebp as the frame pointer (IA-32 variant). */
INLINE void copy_frame_down_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // In this version, dst is guaranteed to be lower address than src,
    // therefore copying upwards from src into dst is safe in case
    // there is overlap. The number of bytes is also guaranteed to be
    // a multiple of 4, and the copy is done in 32 bit word chunks for
    // best efficiency.
    __asm__(
        "movl %0, %%edi;"      // destination
        "movl %1, %%esi;"      // source
        "movl %2, %%ecx;"
        "shrl $2, %%ecx;"      // byte count -> 32-bit word count
        "rep movsd;"           // ascending copy (DF clear)
        "movl %3, %%ebp" :     // switch to the relocated frame pointer
        :
        "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
        "esi", "edi", "ecx", "ebp", "memory");
}
/* Copy cpy_bytes of stack frame from src up to dst (dst > src), then
 * install new_ebp as the frame pointer (IA-32 variant). */
INLINE void copy_frame_up_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // In this version, dst is guaranteed to be higher address than src,
    // therefore copying downwards from src into dst is safe in case
    // there is overlap. The number of bytes is also guaranteed to be
    // a multiple of 4, and the copy is done in 32 bit word chunks for
    // best efficiency.
    dst += cpy_bytes - 4;     // point at the last word of each region;
    src += cpy_bytes - 4;     // the copy runs with DF set (descending)
    __asm__(
        "movl %0, %%edi;"
        "movl %1, %%esi;"
        "movl %2, %%ecx;"
        "shrl $2, %%ecx;"      // byte count -> 32-bit word count
        "std; rep movsd; cld;" // descending copy; DF restored before exit
        "movl %3, %%ebp" :     // switch to the relocated frame pointer
        // "=D"(dst), "=S"(src), "=C"(cpy_bytes) :
        :
        "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
        "esi", "edi", "ecx", "ebp", "memory");
}
227 #endif
228 
229 
230 #define c_cilk_ptr_from_heap  0xc2f2f00d
231 #define c_cilk_ptr_from_stack 0xc3f30d0f
232 
/* Allocate `size` bytes of VLA storage on behalf of the caller.
 *
 * If the frame is stolen/unsynched, the space comes from the heap and is
 * tagged so __cilkrts_stack_free can tell.  Otherwise the caller's stack
 * frame is physically slid down by full_size bytes (copied with
 * copy_frame_down_and_move_bp) so that the space opens up at the bottom
 * of the caller's frame, where the caller's alloca area lives.
 *
 * Returns a pointer to the allocated space (aligned to `align`), or NULL
 * when not built with the Intel compiler (frame-size computation relies
 * on icc-specific behavior -- see #else at the bottom).
 */
CILK_ABI(__cilkrts_void_ptr)
__cilkrts_stack_alloc(
    __cilkrts_stack_frame *sf,
    size_t size,
    size_t distance_from_sp_to_alloca_area,
    uint32_t align,     // align is always >= minimum stack alignment and
                        // >= ptr_size as well, and must be a power of 2.
    uint32_t needs_tag  // non-zero if the pointer being returned needs to
                        // be tagged
)
{
#ifdef __INTEL_COMPILER
    // full_size will be a multiple of align, and contains
    // enough extra space to allocate a marker.
    size_t full_size = (size + align - 1) & ~(align - 1);

    if (needs_tag) {
        // Reserve one extra alignment unit so the 32-bit tag can live
        // immediately below the returned pointer.
        full_size += align;
    }

    char *t;
    if (sf->worker != 0 &&
        ((sf->flags & CILK_FRAME_UNSYNCHED) != 0)) {
        // Frame is unsynched: the original stack may not be ours anymore,
        // so the VLA must come from the heap.  Tag it so stack_free knows.
        t = vla_internal_heap_alloc(sf, full_size, align);
        if (needs_tag) {
            t += align;
            ((uint32_t*)t)[-1] = c_cilk_ptr_from_heap;
        }
        return (void *)t;
    }

    // stack is still synced, allocate full_size from esp,
    // and record in the 32 bits immediately below the allocated space
    // that this space was allocated on the stack.
    char *old_ebp = getbp();
    char *old_esp = getsp();

    // make top_ptr point to base of first parameter.
    // NOTE(review): _AddressOfReturnAddress() is a compiler intrinsic;
    // the slot above the return address is the first stack parameter.
    char *top_ptr = ((char *)(_AddressOfReturnAddress()) +
                    sizeof(char *));
    size_t param_size = 0;

#if defined(__x86_64)
    // For Intel64 linux & MACH ABI, all the parameters were passed in
    // register, so top of the stack frame above the return address
    // is just the size of the return address plus
    // distance_from_sp_to_alloca_area on the chance that the alloca
    // area isn't at the very bottom of the calling functions stack.
#elif defined(__MACH__)
    // For ia32 MACH, parameter size is always a multiple of 16
    // bytes to keep the stack 16 byte aligned.  So we need to round
    // number of parameters up to multiple of 4.
    param_size = 8 * sizeof(char *);
#else
    // For both windows Intel64 ABI, and the IA32 windows and
    // linux ABIs, space is reserved on the stack for all these
    // parameters.  param_size is 5 * size of a stack slot.
    param_size = 5 * sizeof(char *);
#endif

    // now make top_ptr point above the params, or if
    // distance_from_sp_to_alloca_area is not zero, make
    // it point above that area.  When non-zero,
    // distance_from_sp_to_alloca area is expected to contain
    // the parameter space, so we only add one or the other,
    // not both.
    top_ptr += (distance_from_sp_to_alloca_area != 0) ?
                   distance_from_sp_to_alloca_area : param_size;

    // t needs to end up at current value of top_ptr less full_size and less
    // distance_from_sp_to_alloca_area and
    // then rounded down to the alignment needed.  Then we have to bump
    // esp down by current frame_size, so that when all is done with respect
    // to executing the return sequence, the final value of esp will be the
    // same value as t.
    t = (top_ptr - full_size) - distance_from_sp_to_alloca_area;
    intptr_t temp = (intptr_t)t;
    temp &= ~((intptr_t)(align - 1));   // round down to `align` boundary
    t = (char *)temp;

    // ok, the value of t is set where we need it.  Now set esp
    // to the value of t less the current frame size.
    // So now when we do regular return esp should be left such
    // that it has moved down by full_size.
    size_t cur_fm_size = (top_ptr - old_esp);
    char *new_esp = t - cur_fm_size;
    char *new_ebp = old_ebp - (old_esp - new_esp);

    // extend the stack down by at least the difference between where
    // I want it to be and where it currently is.  This should take care
    // of touching any pages necessary.
    char *foo = alloca(old_esp - new_esp);
    // alloca may hand back a pointer above new_esp; keep the lower of the
    // two so esp covers the whole region before the frame copy below.
    setsp(foo < new_esp ? foo : new_esp);

    // Now set esp exactly where I want it.
    // setsp(new_esp);

    copy_frame_down_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp);

    if (needs_tag) {
        t += align;
        ((uint32_t*)t)[-1] = c_cilk_ptr_from_stack;
    }

    return t;
#else // Not __INTEL_COMPILER
    // Not supported unless we can figure out how to get the size of the frame
    return NULL;
#endif
}
344 
// This frees the space allocated for a variable length array.
//
// Heap-tagged pointers are returned to the heap allocator.  Stack
// pointers cause the caller's frame to be slid back up by full_size
// (copy_frame_up_and_move_bp) when still synced on the allocating frame,
// or, when unsynched, the runtime's saved esp is adjusted instead so the
// eventual sync cuts the stack back correctly.
CILK_ABI(void)
__cilkrts_stack_free(
    __cilkrts_stack_frame *sf,
    void *p,
    size_t size,
    size_t distance_from_sp_to_alloca_area,
    uint32_t align, // same requirements as for align in allocation,
                    // and must match alignment that was passed when
                    // doing the allocation
    uint32_t known_from_stack  // non-zero if this is known to be allocated
                               // on the stack, and therefore has no tag
)
{
#ifdef __INTEL_COMPILER
    uint32_t *t = (uint32_t*)p;

    // full_size will be a multiple of align, and contains
    // enough extra space to allocate a marker if one was needed.
    size_t full_size = (size + align - 1) & ~(align - 1);
    if (known_from_stack == 0) {
        // if the compiler hasn't told the run-time that this is
        // known to be on the stack, then this pointer must have been
        // tagged such that the run-time can tell.
        assert(t[-1] == c_cilk_ptr_from_stack ||
               t[-1] == c_cilk_ptr_from_heap);

        known_from_stack = t[-1] == c_cilk_ptr_from_stack;
        full_size += align;    // accounts for extra space for marker
        // step back over the tag word to the true allocation base
        t = (uint32_t *)(((char *)t) - align);
    }

    if (known_from_stack) {
        // alloca useage forces an ebp/rbp based stack frame even though
        // 0 and unused.
        char *foo = alloca(0);
        if (sf->worker == 0 || (sf->flags & CILK_FRAME_UNSYNCHED) == 0) {
            // p was allocated from current stack frame and we
            // are synced on current stack frame.  Return the
            // amount of the stack that needs to be freed.
            char *old_ebp = getbp();
            char *old_esp = getsp();

            // make top_ptr point to base of first parameter.
            char *top_ptr = ((char *)(_AddressOfReturnAddress()) +
                            sizeof(char *));
            size_t param_size = 0;

#if defined(__x86_64)
            // For Intel64 linux & MACH ABI, all the parameters were passed in
            // register, so top of the stack frame above the return address
            // is just the size of the return address plus
            // distance_from_sp_to_alloca_area on the chance that the alloca
            // area isn't at the very bottom of the calling functions stack.
#elif defined(__MACH__)
            // For ia32 MACH, parameter size is always a multiple of 16
            // bytes to keep the stack 16 byte aligned.  So we need to round
            // number of parameters up to multiple of 4.
            param_size = 8 * sizeof(char *);
#else
            // For both windows Intel64 ABI, and the IA32 windows and
            // linux ABIs, space is reserved on the stack for all these
            // parameters.  param_size is 6 * size of a stack slot
            // (this function takes 6 parameters; stack_alloc takes 5).
            param_size = 6 * sizeof(char *);
#endif

            // now make top_ptr point above the params, or if
            // distance_from_sp_to_alloca_area is not zero, make
            // it point above that area.  When non-zero,
            // distance_from_sp_to_alloca area is expected to contain
            // the parameter space, so we only add one or the other,
            // not both.
            top_ptr += (distance_from_sp_to_alloca_area != 0) ?
                           distance_from_sp_to_alloca_area : param_size;

            size_t cur_fm_size = (top_ptr - old_esp);
            // slide the frame (and both stack registers) up by full_size
            char *new_esp = old_esp + full_size;
            char *new_ebp = old_ebp + full_size;

            copy_frame_up_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp);
            setsp(new_esp);
        }
        else {
            // p was allocated on stack frame, but that is
            // no longer the current stack frame.  Need to adjust the
            // saved esp that is somewhere in the cilk runtime so that
            // on sync, esp will be cut back correctly.
            vla_free_from_original_stack(sf, full_size);
        }
    }
    else {
        vla_internal_heap_free(t, full_size);
    }
#else // Not __INTEL_COMPILER
    // Not supported unless we can figure out how to get the size of the frame
#endif
}
442