1 /* cilk-abi-vla.cpp -*-C++-*-
2 *
3 *************************************************************************
4 *
5 * @copyright
6 * Copyright (C) 2013, Intel Corporation
7 * All rights reserved.
8 *
9 * @copyright
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in
18 * the documentation and/or other materials provided with the
19 * distribution.
20 * * Neither the name of Intel Corporation nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
23 *
24 * @copyright
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
32 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
35 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 *
38 **************************************************************************/
39
40 /*
41 * Implementation of Variable Length Array (VLA) ABI.
42 *
 * __cilkrts_stack_alloc() and __cilkrts_stack_free() must be compiled
44 * such that ebp/rbp is used for the stack frames. This is done by having
45 * each of them use alloca, which forces the special frame types needed on
46 * each of the ABIs. Additionally, for some forms of stack frame, special
47 * care must be taken because the alloca space may not be at the bottom of the
48 * stack frame of the caller. For Intel64 windows, and for some options
49 * with other ABIs, a preallocated parameter block may exist on the stack
50 * at a lower address than the alloca. If this is the case, the parameter
51 * distance_from_sp_to_alloca_area will be non-zero, and will indicate how
52 * much pre-allocated parameter space resides in the caller's stack frame
53 * between the alloca area, and the bottom of the stack when the call to
54 * the cilkrts is made. As such, when non-zero it also includes any space
55 * used for passing the cilkrts_stack_alloc or cilkrts_stack_free parameters.
56 */
57
58 #include <assert.h>
59 #include <stdlib.h>
60 #include <stdint.h>
61
62 // Getting a definition for alloca appears to be a pain in the butt. Here's
63 // a variant on what's recommended in the autoconf doc
64 #if defined _MSC_VER
65 # include <malloc.h>
66 # define alloca _alloca
67 #elif defined HAVE_ALLOCA_H
68 # include <alloca.h>
69 #elif defined __GNUC__
70 # define alloca __builtin_alloca
71 #elif defined _AIX
72 # define alloca __alloca
73 #else
74 # include <stddef.h>
75 # ifdef __cplusplus
76 extern "C"
77 # endif
78 void *alloca (size_t);
79 #endif
80
81 #ifdef _WIN32
82 # define INLINE static __inline
83 # pragma warning(disable:1025) // Don't whine about zero extending result of unary operation
84 #else
85 # define INLINE static inline
86 #endif
87
88
89 #include "internal/abi.h"
90 #include "cilk-abi-vla-internal.h"
91
92 #if defined(__x86_64) || defined(_M_X64)
// Force the hardware stack pointer (rsp) to val.  Callers are responsible
// for ensuring everything below the new rsp is dead.
INLINE void setsp(void *val)
{
    __asm__("movq %0, %%rsp" : : "r"(val): "rsp");
}
getsp(void)97 INLINE char* getsp(void)
98 {
99 void *res;
100
101 __asm__("movq %%rsp, %0" : "=r"(res): : "rsp");
102 return res;
103 }
getbp(void)104 INLINE char* getbp(void)
105 {
106 void *res;
107
108 __asm__("movq %%rbp, %0" : "=r"(res): : "rbp");
109 return res;
110 }
// Copy cpy_bytes bytes from src down to dst (dst below src), then load
// the frame pointer (rbp) with new_ebp.  Used when sliding the live part
// of the current stack frame downwards to open up VLA space.
INLINE void copy_frame_down_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // In this version, dst is guaranteed to be lower address than src,
    // therefore copying upwards from src into dst is safe in case
    // there is overlap.  The number of bytes is also guaranteed to be
    // a multiple of 8, and the copy is done in 64 bit word chunks for
    // best efficiency (shrq $3 converts the byte count to a word count
    // for rep movsq).
    __asm__(
        "movq %0, %%rdi;"
        "movq %1, %%rsi;"
        "movq %2, %%rcx;"
        "shrq $3, %%rcx;"
        "rep movsq;"
        "movq %3, %%rbp" :
        :
        "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
        "rsi", "rdi", "rcx", "rbp", "memory");
}
copy_frame_up_and_move_bp(char * dst,char * src,size_t cpy_bytes,char * new_ebp)134 INLINE void copy_frame_up_and_move_bp(
135 char *dst,
136 char *src,
137 size_t cpy_bytes,
138 char *new_ebp
139 )
140 {
141 // In this version, dst is guaranteed to be higher address than src,
142 // therefore copying downwards from src into dst is safe in case
143 // there is overlap. The number of bytes is also guaranteed to be
144 // a multiple of 8, and the copy is done in 64 bit word chunks for
145 // best efficiency.
146 dst += cpy_bytes - 8;
147 src += cpy_bytes - 8;
148 __asm__(
149 "movq %0, %%rdi;"
150 "movq %1, %%rsi;"
151 "movq %2, %%rcx;"
152 "shrq $3, %%rcx;"
153 "std; rep movsq; cld;"
154 "movl %3, %%rbp;" :
155 :
156 "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
157 "rsi", "rdi", "rcx", "rbp", "memory");
158 }
159 #else
// Force the hardware stack pointer (esp) to val.  Callers are responsible
// for ensuring everything below the new esp is dead.
INLINE void setsp(void *val)
{
    __asm__("movl %0, %%esp" : : "r"(val): "esp");
}
getsp(void)164 INLINE char* getsp(void)
165 {
166 void *res;
167
168 __asm__("movl %%esp, %0" : "=r"(res): : "esp");
169 return res;
170 }
getbp(void)171 INLINE char* getbp(void)
172 {
173 void *res;
174
175 __asm__("movl %%ebp, %0" : "=r"(res): : "ebp");
176 return res;
177 }
// Copy cpy_bytes bytes from src down to dst (dst below src), then load
// the frame pointer (ebp) with new_ebp.  Used when sliding the live part
// of the current stack frame downwards to open up VLA space.
INLINE void copy_frame_down_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // In this version, dst is guaranteed to be lower address than src,
    // therefore copying upwards from src into dst is safe in case
    // there is overlap.  The number of bytes is also guaranteed to be
    // a multiple of 4, and the copy is done in 32 bit word chunks for
    // best efficiency (shrl $2 converts the byte count to a word count
    // for rep movsd).
    __asm__(
        "movl %0, %%edi;"
        "movl %1, %%esi;"
        "movl %2, %%ecx;"
        "shrl $2, %%ecx;"
        "rep movsd;"
        "movl %3, %%ebp" :
        :
        "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
        "esi", "edi", "ecx", "ebp", "memory");
}
// Copy cpy_bytes bytes from src up to dst (dst above src), then load the
// frame pointer (ebp) with new_ebp.  Used when sliding the live part of
// the current stack frame back upwards as VLA space is freed.
INLINE void copy_frame_up_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // In this version, dst is guaranteed to be higher address than src,
    // therefore copying downwards from src into dst is safe in case
    // there is overlap.  The number of bytes is also guaranteed to be
    // a multiple of 4, and the copy is done in 32 bit word chunks for
    // best efficiency.
    //
    // Point at the last word of each region; "std" makes "rep movsd"
    // walk downwards, and "cld" restores the ABI-required direction flag.
    dst += cpy_bytes - 4;
    src += cpy_bytes - 4;
    __asm__(
        "movl %0, %%edi;"
        "movl %1, %%esi;"
        "movl %2, %%ecx;"
        "shrl $2, %%ecx;"
        "std; rep movsd; cld;"
        "movl %3, %%ebp" :
        :
        "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) :
        "esi", "edi", "ecx", "ebp", "memory");
}
227 #endif
228
229
// Magic cookies written into the 32 bits immediately below a tagged VLA
// pointer, recording whether __cilkrts_stack_alloc took the memory from
// the heap or carved it out of the stack, so that __cilkrts_stack_free
// can release it the matching way.
#define c_cilk_ptr_from_heap 0xc2f2f00d
#define c_cilk_ptr_from_stack 0xc3f30d0f
232
/*
 * Allocate 'size' bytes for a variable length array on behalf of the
 * compiler.
 *
 * If the current frame is synched, the space is carved out of the stack:
 * the live portion of the current frame is slid down by the (aligned)
 * allocation size via copy_frame_down_and_move_bp, and esp/ebp are
 * adjusted to match.  If the frame is unsynched, the space comes from
 * the heap via vla_internal_heap_alloc instead.  When needs_tag is
 * non-zero, a 32 bit marker is written immediately below the returned
 * pointer recording which source was used, so __cilkrts_stack_free can
 * dispose of it correctly.
 *
 * Returns a pointer to the allocated, suitably aligned space.  When not
 * built with the Intel compiler this returns NULL: the frame-size
 * computation below depends on _AddressOfReturnAddress and the frame
 * layout described at the top of this file.
 */
CILK_ABI(__cilkrts_void_ptr)
__cilkrts_stack_alloc(
    __cilkrts_stack_frame *sf,
    size_t size,
    size_t distance_from_sp_to_alloca_area,
    uint32_t align,    // align is always >= minimum stack alignment and
                       // >= ptr_size as well, and must be a power of 2.
    uint32_t needs_tag // non-zero if the pointer being returned needs to
                       // be tagged
)
{
#ifdef __INTEL_COMPILER
    // full_size will be a multiple of align, and contains
    // enough extra space to allocate a marker.
    size_t full_size = (size + align - 1) & ~(align - 1);

    if (needs_tag) {
        full_size += align;
    }

    char *t;
    if (sf->worker != 0 &&
        ((sf->flags & CILK_FRAME_UNSYNCHED) != 0)) {
        // Frame is unsynched, so the stack cannot safely be extended;
        // take the space from the heap and tag it as heap-allocated.
        t = vla_internal_heap_alloc(sf, full_size, align);
        if (needs_tag) {
            t += align;
            ((uint32_t*)t)[-1] = c_cilk_ptr_from_heap;
        }
        return (void *)t;
    }

    // stack is still synced, allocate full_size from esp,
    // and record in the 32 bits immediately below the space
    // allocated that this space was allocated on the stack.
    char *old_ebp = getbp();
    char *old_esp = getsp();

    // make top_ptr point to base of first parameter.
    char *top_ptr = ((char *)(_AddressOfReturnAddress()) +
                     sizeof(char *));
    size_t param_size = 0;

#if defined(__x86_64)
    // For Intel64 linux & MACH ABI, all the parameters were passed in
    // register, so top of the stack frame above the return address
    // is just the size of the return address plus
    // distance_from_sp_to_alloca_area on the chance that the alloca
    // area isn't at the very bottom of the calling functions stack.
#elif defined(__MACH__)
    // For ia32 MACH, parameter size is always a multiple of 16
    // bytes to keep the stack 16 byte aligned. So we need to round
    // number of parameters up to multiple of 4.
    param_size = 8 * sizeof(char *);
#else
    // For both windows Intel64 ABI, and the IA32 windows and
    // linux ABIs, space is reserved on the stack for all these
    // parameters. param_size is 5 * size of a stack slot.
    param_size = 5 * sizeof(char *);
#endif

    // now make top_ptr point above the params, or if
    // distance_from_sp_to_alloca_area is not zero, make
    // it point above that area. When non-zero,
    // distance_from_sp_to_alloca area is expected to contain
    // the parameter space, so we only add one or the other,
    // not both.
    top_ptr += (distance_from_sp_to_alloca_area != 0) ?
               distance_from_sp_to_alloca_area : param_size;

    // t needs to end up at current value of top_ptr less full_size and less
    // distance_from_sp_to_alloca_area and
    // then rounded down to the alignment needed. Then we have to bump
    // esp down by current frame_size, so that when all is done with respect
    // to executing the return sequence, the final value of esp will be the
    // same value as t.
    t = (top_ptr - full_size) - distance_from_sp_to_alloca_area;
    intptr_t temp = (intptr_t)t;
    temp &= ~((intptr_t)(align - 1));
    t = (char *)temp;

    // ok, the value of t is set where we need it. Now set esp
    // to the value of t less the current frame size.
    // So now when we do regular return esp should be left such
    // that it has moved down by full_size.
    size_t cur_fm_size = (top_ptr - old_esp);
    char *new_esp = t - cur_fm_size;
    char *new_ebp = old_ebp - (old_esp - new_esp);

    // extend the stack down by at least the difference between where
    // I want it to be and where it currently is. This should take care
    // of touching any pages necessary.
    char *foo = alloca(old_esp - new_esp);
    setsp(foo < new_esp ? foo : new_esp);

    // Now set esp exactly where I want it.
    // setsp(new_esp);

    copy_frame_down_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp);

    if (needs_tag) {
        // Reserve align bytes below the returned pointer and write the
        // "came from the stack" cookie into the last 32 bits of them.
        t += align;
        ((uint32_t*)t)[-1] = c_cilk_ptr_from_stack;
    }

    return t;
#else // Not __INTEL_COMPILER
    // Not supported unless we can figure out how to get the size of the frame
    return NULL;
#endif
}
344
// This frees the space allocated for a variable length array.
//
// p must be a pointer previously returned by __cilkrts_stack_alloc with
// matching size/align/distance arguments.  If the allocation was tagged,
// the 32 bit cookie below p says whether it came from the heap or the
// stack; heap allocations go back through vla_internal_heap_free, stack
// allocations are released either by sliding the current frame back up
// (synched case) or by adjusting the runtime's saved esp (unsynched case).
CILK_ABI(void)
__cilkrts_stack_free(
    __cilkrts_stack_frame *sf,
    void *p,
    size_t size,
    size_t distance_from_sp_to_alloca_area,
    uint32_t align,           // same requirements as for align in allocation,
                              // and must match alignment that was passed when
                              // doing the allocation
    uint32_t known_from_stack // non-zero if this is known to be allocated
                              // on the stack, and therefore has no tag
)
{
#ifdef __INTEL_COMPILER
    uint32_t *t = (uint32_t*)p;

    // full_size will be a multiple of align, and contains
    // enough extra space to allocate a marker if one was needed.
    size_t full_size = (size + align - 1) & ~(align - 1);
    if (known_from_stack == 0) {
        // if the compiler hasn't told the run-time that this is
        // known to be on the stack, then this pointer must have been
        // tagged such that the run-time can tell.
        assert(t[-1] == c_cilk_ptr_from_stack ||
               t[-1] == c_cilk_ptr_from_heap);

        known_from_stack = t[-1] == c_cilk_ptr_from_stack;
        full_size += align; // accounts for extra space for marker
        t = (uint32_t *)(((char *)t) - align);
    }

    if (known_from_stack) {
        // alloca usage forces an ebp/rbp based stack frame even though
        // 0 and unused.
        char *foo = alloca(0);
        if (sf->worker == 0 || (sf->flags & CILK_FRAME_UNSYNCHED) == 0) {
            // p was allocated from current stack frame and we
            // are synced on current stack frame. Return the
            // amount of the stack that needs to be freed.
            char *old_ebp = getbp();
            char *old_esp = getsp();

            // make top_ptr point to base of first parameter.
            char *top_ptr = ((char *)(_AddressOfReturnAddress()) +
                             sizeof(char *));
            size_t param_size = 0;

#if defined(__x86_64)
            // For Intel64 linux & MACH ABI, all the parameters were passed in
            // register, so top of the stack frame above the return address
            // is just the size of the return address plus
            // distance_from_sp_to_alloca_area on the chance that the alloca
            // area isn't at the very bottom of the calling functions stack.
#elif defined(__MACH__)
            // For ia32 MACH, parameter size is always a multiple of 16
            // bytes to keep the stack 16 byte aligned. So we need to round
            // number of parameters up to multiple of 4.
            param_size = 8 * sizeof(char *);
#else
            // For both windows Intel64 ABI, and the IA32 windows and
            // linux ABIs, space is reserved on the stack for all these
            // parameters. param_size is 6 * size of a stack slot, since
            // this function takes six parameters (one more than
            // __cilkrts_stack_alloc, which uses 5).
            param_size = 6 * sizeof(char *);
#endif

            // now make top_ptr point above the params, or if
            // distance_from_sp_to_alloca_area is not zero, make
            // it point above that area. When non-zero,
            // distance_from_sp_to_alloca area is expected to contain
            // the parameter space, so we only add one or the other,
            // not both.
            top_ptr += (distance_from_sp_to_alloca_area != 0) ?
                       distance_from_sp_to_alloca_area : param_size;

            // Slide the live frame back up by full_size, undoing the
            // downward slide done at allocation time, and cut esp back.
            size_t cur_fm_size = (top_ptr - old_esp);
            char *new_esp = old_esp + full_size;
            char *new_ebp = old_ebp + full_size;

            copy_frame_up_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp);
            setsp(new_esp);
        }
        else {
            // p was allocated on stack frame, but that is
            // no longer the current stack frame. Need to adjust the
            // saved esp that is somewhere in the cilk runtime so that
            // on sync, esp will be cut back correctly.
            vla_free_from_original_stack(sf, full_size);
        }
    }
    else {
        vla_internal_heap_free(t, full_size);
    }
#else // Not __INTEL_COMPILER
    // Not supported unless we can figure out how to get the size of the frame
#endif
}
442