1 #include <iostream>
2 
3 #include "CSE.h"
4 #include "CodeGen_Internal.h"
5 #include "CodeGen_Posix.h"
6 #include "Debug.h"
7 #include "IR.h"
8 #include "IROperator.h"
9 #include "IRPrinter.h"
10 #include "LLVM_Headers.h"
11 #include "Simplify.h"
12 
13 namespace Halide {
14 namespace Internal {
15 
16 using std::string;
17 using std::vector;
18 
19 using namespace llvm;
20 
CodeGen_Posix(Target t)21 CodeGen_Posix::CodeGen_Posix(Target t)
22     : CodeGen_LLVM(t) {
23 }
24 
codegen_allocation_size(const std::string & name,Type type,const std::vector<Expr> & extents,const Expr & condition)25 Value *CodeGen_Posix::codegen_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents, const Expr &condition) {
26     // Compute size from list of extents checking for overflow.
27 
28     Expr overflow = make_zero(UInt(64));
29     Expr total_size = make_const(UInt(64), type.lanes() * type.bytes());
30 
31     // We'll multiply all the extents into the 64-bit value
32     // total_size. We'll also track (total_size >> 32) as a 64-bit
33     // value to check for overflow as we go. The loop invariant will
34     // be that either the overflow Expr is non-zero, or total_size_hi
35     // only occupies the bottom 32-bits. Overflow could be more simply
36     // checked for using division, but that's slower at runtime. This
37     // method generates much better assembly.
38     Expr total_size_hi = make_zero(UInt(64));
39 
40     Expr low_mask = make_const(UInt(64), (uint64_t)(0xffffffff));
41     for (size_t i = 0; i < extents.size(); i++) {
42         Expr next_extent = cast(UInt(32), max(0, extents[i]));
43 
44         // Update total_size >> 32. This math can't overflow due to
45         // the loop invariant:
46         total_size_hi *= next_extent;
47         // Deal with carry from the low bits. Still can't overflow.
48         total_size_hi += ((total_size & low_mask) * next_extent) >> 32;
49 
50         // Update total_size. This may overflow.
51         total_size *= next_extent;
52 
53         // We can check for overflow by asserting that total_size_hi
54         // is still a 32-bit number.
55         overflow = overflow | (total_size_hi >> 32);
56     }
57 
58     Expr max_size = make_const(UInt(64), target.maximum_buffer_size());
59     Expr size_check = (overflow == 0) && (total_size <= max_size);
60 
61     if (!is_one(condition)) {
62         size_check = simplify(size_check || !condition);
63     }
64 
65     // For constant-sized allocations this check should simplify away.
66     size_check = common_subexpression_elimination(simplify(size_check));
67     if (!is_one(size_check)) {
68         create_assertion(codegen(size_check || !condition),
69                          Call::make(Int(32), "halide_error_buffer_allocation_too_large",
70                                     {name, total_size, max_size}, Call::Extern));
71     }
72 
73     total_size = simplify(total_size);
74     return codegen(total_size);
75 }
76 
allocation_padding(Type type) const77 int CodeGen_Posix::allocation_padding(Type type) const {
78     // We potentially load one scalar value past the end of the
79     // buffer, so pad the allocation with an extra instance of the
80     // scalar type.
81     return type.bytes();
82 }
83 
create_allocation(const std::string & name,Type type,MemoryType memory_type,const std::vector<Expr> & extents,const Expr & condition,const Expr & new_expr,std::string free_function)84 CodeGen_Posix::Allocation CodeGen_Posix::create_allocation(const std::string &name, Type type, MemoryType memory_type,
85                                                            const std::vector<Expr> &extents, const Expr &condition,
86                                                            const Expr &new_expr, std::string free_function) {
87     Value *llvm_size = nullptr;
88     int64_t stack_bytes = 0;
89     int32_t constant_bytes = Allocate::constant_allocation_size(extents, name);
90     if (constant_bytes > 0) {
91         constant_bytes *= type.bytes();
92         stack_bytes = constant_bytes;
93 
94         if (stack_bytes > target.maximum_buffer_size()) {
95             const string str_max_size = target.has_large_buffers() ? "2^63 - 1" : "2^31 - 1";
96             user_error << "Total size for allocation " << name << " is constant but exceeds " << str_max_size << ".";
97         } else if (memory_type == MemoryType::Heap ||
98                    (memory_type != MemoryType::Register &&
99                     !can_allocation_fit_on_stack(stack_bytes))) {
100             // We should put the allocation on the heap if it's
101             // explicitly placed on the heap, or if it's not
102             // explicitly placed in registers and it's large. Large
103             // stack allocations become pseudostack allocations
104             // instead.
105             stack_bytes = 0;
106             llvm_size = codegen(Expr(constant_bytes));
107         }
108     } else {
109         // Should have been caught in bound_small_allocations
110         internal_assert(memory_type != MemoryType::Register);
111         llvm_size = codegen_allocation_size(name, type, extents, condition);
112     }
113 
114     // Only allocate memory if the condition is true, otherwise 0.
115     Value *llvm_condition = codegen(condition);
116     if (llvm_size != nullptr) {
117         // Add the requested padding to the allocation size. If the
118         // allocation is on the stack, we can just read past the top
119         // of the stack, so we only need this for heap allocations.
120         Value *padding = ConstantInt::get(llvm_size->getType(), allocation_padding(type));
121         llvm_size = builder->CreateAdd(llvm_size, padding);
122         llvm_size = builder->CreateSelect(llvm_condition,
123                                           llvm_size,
124                                           ConstantInt::get(llvm_size->getType(), 0));
125     }
126 
127     Allocation allocation;
128     allocation.constant_bytes = constant_bytes;
129     allocation.stack_bytes = new_expr.defined() ? 0 : stack_bytes;
130     allocation.type = type;
131     allocation.name = name;
132 
133     if (!new_expr.defined() && extents.empty()) {
134         // If it's a scalar allocation, don't try anything clever. We
135         // want llvm to be able to promote it to a register.
136         allocation.ptr = create_alloca_at_entry(llvm_type_of(type), 1, false, name);
137         allocation.stack_bytes = stack_bytes;
138         cur_stack_alloc_total += allocation.stack_bytes;
139         debug(4) << "cur_stack_alloc_total += " << allocation.stack_bytes << " -> " << cur_stack_alloc_total << " for " << name << "\n";
140     } else if (!new_expr.defined() && stack_bytes != 0) {
141 
142         // Try to find a free stack allocation we can use.
143         vector<Allocation>::iterator it = free_stack_allocs.end();
144         for (it = free_stack_allocs.begin(); it != free_stack_allocs.end(); ++it) {
145             if (it->pseudostack_slot) {
146                 // Don't merge with dynamic stack allocations
147                 continue;
148             }
149             AllocaInst *alloca_inst = dyn_cast<AllocaInst>(it->ptr);
150             llvm::Function *allocated_in = alloca_inst ? alloca_inst->getParent()->getParent() : nullptr;
151             llvm::Function *current_func = builder->GetInsertBlock()->getParent();
152 
153             if (allocated_in == current_func &&
154                 it->type == type &&
155                 it->stack_bytes >= stack_bytes) {
156                 break;
157             }
158         }
159         if (it != free_stack_allocs.end()) {
160             debug(4) << "Reusing freed stack allocation of " << it->stack_bytes
161                      << " bytes for allocation " << name
162                      << " of " << stack_bytes << " bytes.\n";
163             // Use a free alloc we found.
164             allocation.ptr = it->ptr;
165             allocation.stack_bytes = it->stack_bytes;
166             allocation.name = it->name;
167 
168             // This allocation isn't free anymore.
169             free_stack_allocs.erase(it);
170         } else {
171             debug(4) << "Allocating " << stack_bytes << " bytes on the stack for " << name << "\n";
172             // We used to do the alloca locally and save and restore the
173             // stack pointer, but this makes llvm generate streams of
174             // spill/reloads.
175             int64_t stack_size = (stack_bytes + type.bytes() - 1) / type.bytes();
176             // Handles are stored as uint64s
177             llvm::Type *t =
178                 llvm_type_of(type.is_handle() ? UInt(64, type.lanes()) : type);
179             allocation.ptr = create_alloca_at_entry(t, stack_size, false, name);
180             allocation.stack_bytes = stack_bytes;
181         }
182         cur_stack_alloc_total += allocation.stack_bytes;
183         debug(4) << "cur_stack_alloc_total += " << allocation.stack_bytes << " -> " << cur_stack_alloc_total << " for " << name << "\n";
184     } else if (memory_type == MemoryType::Stack && !new_expr.defined()) {
185         // Try to find a free pseudostack allocation we can use.
186         vector<Allocation>::iterator it = free_stack_allocs.end();
187         for (it = free_stack_allocs.begin(); it != free_stack_allocs.end(); ++it) {
188             if (!it->pseudostack_slot) {
189                 // Don't merge with static stack allocations
190                 continue;
191             }
192             AllocaInst *alloca_inst = dyn_cast<AllocaInst>(it->pseudostack_slot);
193             llvm::Function *allocated_in = alloca_inst ? alloca_inst->getParent()->getParent() : nullptr;
194             llvm::Function *current_func = builder->GetInsertBlock()->getParent();
195             if (it->type == type &&
196                 allocated_in == current_func) {
197                 break;
198             }
199         }
200         Value *slot = nullptr;
201         if (it != free_stack_allocs.end()) {
202             debug(4) << "Reusing freed pseudostack allocation from " << it->name
203                      << " for " << name << "\n";
204             slot = it->pseudostack_slot;
205             allocation.name = it->name;
206             allocation.destructor = it->destructor;
207             // We've already created a destructor stack entry for this
208             // pseudostack allocation, but we may not have actually
209             // registered the destructor if we're reusing an
210             // allocation that occurs conditionally. TODO: Why not
211             // just register the destructor at entry?
212 
213             builder->CreateStore(builder->CreatePointerCast(slot, i8_t->getPointerTo()), allocation.destructor);
214             free_stack_allocs.erase(it);
215         } else {
216             // Stack allocation with a dynamic size
217             slot = create_alloca_at_entry(pseudostack_slot_t_type, 1, true, name + ".pseudostack_slot");
218             llvm::Function *free_fn = module->getFunction("pseudostack_free");
219             allocation.destructor = register_destructor(free_fn, slot, Always);
220         }
221 
222         // Even if we're reusing a stack slot, we need to call
223         // pseudostack_alloc to potentially reallocate.
224         llvm::Function *malloc_fn = module->getFunction("pseudostack_alloc");
225         internal_assert(malloc_fn) << "Could not find pseudostack_alloc in module\n";
226         malloc_fn->setReturnDoesNotAlias();
227 
228         llvm::Function::arg_iterator arg_iter = malloc_fn->arg_begin();
229         ++arg_iter;  // skip the user context *
230         slot = builder->CreatePointerCast(slot, arg_iter->getType());
231         ++arg_iter;  // skip the pointer to the stack slot
232         llvm_size = builder->CreateIntCast(llvm_size, arg_iter->getType(), false);
233         Value *args[3] = {get_user_context(), slot, llvm_size};
234         Value *call = builder->CreateCall(malloc_fn, args);
235 
236         // Fix the type to avoid pointless bitcasts later
237         allocation.ptr = builder->CreatePointerCast(call, llvm_type_of(type)->getPointerTo());
238         allocation.pseudostack_slot = slot;
239     } else {
240         if (new_expr.defined()) {
241             allocation.ptr = codegen(new_expr);
242         } else {
243             // call malloc
244             llvm::Function *malloc_fn = module->getFunction("halide_malloc");
245             internal_assert(malloc_fn) << "Could not find halide_malloc in module\n";
246             malloc_fn->setReturnDoesNotAlias();
247 
248             llvm::Function::arg_iterator arg_iter = malloc_fn->arg_begin();
249             ++arg_iter;  // skip the user context *
250             llvm_size = builder->CreateIntCast(llvm_size, arg_iter->getType(), false);
251 
252             debug(4) << "Creating call to halide_malloc for allocation " << name
253                      << " of size " << type.bytes();
254             for (Expr e : extents) {
255                 debug(4) << " x " << e;
256             }
257             debug(4) << "\n";
258             Value *args[2] = {get_user_context(), llvm_size};
259 
260             Value *call = builder->CreateCall(malloc_fn, args);
261 
262             // Fix the type to avoid pointless bitcasts later
263             call = builder->CreatePointerCast(call, llvm_type_of(type)->getPointerTo());
264 
265             allocation.ptr = call;
266         }
267 
268         // Assert that the allocation worked.
269         Value *check = builder->CreateIsNotNull(allocation.ptr);
270         if (llvm_size) {
271             Value *zero_size = builder->CreateIsNull(llvm_size);
272             check = builder->CreateOr(check, zero_size);
273         }
274         if (!is_one(condition)) {
275             // If the condition is false, it's OK for the new_expr to be null.
276             Value *condition_is_false = builder->CreateIsNull(llvm_condition);
277             check = builder->CreateOr(check, condition_is_false);
278         }
279 
280         create_assertion(check, Call::make(Int(32), "halide_error_out_of_memory",
281                                            std::vector<Expr>(), Call::Extern));
282 
283         // Register a destructor for this allocation.
284         if (free_function.empty()) {
285             free_function = "halide_free";
286         }
287         llvm::Function *free_fn = module->getFunction(free_function);
288         internal_assert(free_fn) << "Could not find " << free_function << " in module.\n";
289         allocation.destructor = register_destructor(free_fn, allocation.ptr, OnError);
290         allocation.destructor_function = free_fn;
291     }
292 
293     // Push the allocation base pointer onto the symbol table
294     debug(3) << "Pushing allocation called " << name << " onto the symbol table\n";
295 
296     allocations.push(name, allocation);
297 
298     return allocation;
299 }
300 
free_allocation(const std::string & name)301 void CodeGen_Posix::free_allocation(const std::string &name) {
302     Allocation alloc = allocations.get(name);
303 
304     if (alloc.stack_bytes) {
305         // Remember this allocation so it can be re-used by a later allocation.
306         free_stack_allocs.push_back(alloc);
307         cur_stack_alloc_total -= alloc.stack_bytes;
308         debug(4) << "cur_stack_alloc_total -= " << alloc.stack_bytes << " -> " << cur_stack_alloc_total << " for " << name << "\n";
309     } else if (alloc.pseudostack_slot) {
310         // Don't call the destructor yet - the lifetime persists until function exit.
311         free_stack_allocs.push_back(alloc);
312     } else if (alloc.destructor_function) {
313         internal_assert(alloc.destructor);
314         trigger_destructor(alloc.destructor_function, alloc.destructor);
315     }
316 
317     allocations.pop(name);
318     sym_pop(name);
319 }
320 
get_allocation_name(const std::string & n)321 string CodeGen_Posix::get_allocation_name(const std::string &n) {
322     if (allocations.contains(n)) {
323         return allocations.get(n).name;
324     } else {
325         return n;
326     }
327 }
328 
visit(const Allocate * alloc)329 void CodeGen_Posix::visit(const Allocate *alloc) {
330     if (sym_exists(alloc->name)) {
331         user_error << "Can't have two different buffers with the same name: "
332                    << alloc->name << "\n";
333     }
334 
335     Allocation allocation = create_allocation(alloc->name, alloc->type, alloc->memory_type,
336                                               alloc->extents, alloc->condition,
337                                               alloc->new_expr, alloc->free_function);
338     sym_push(alloc->name, allocation.ptr);
339 
340     codegen(alloc->body);
341 
342     // If there was no early free, free it now.
343     if (allocations.contains(alloc->name)) {
344         free_allocation(alloc->name);
345     }
346 }
347 
visit(const Free * stmt)348 void CodeGen_Posix::visit(const Free *stmt) {
349     free_allocation(stmt->name);
350 }
351 
352 }  // namespace Internal
353 }  // namespace Halide
354