1 /*
2  * Copyright 2012      Ecole Normale Superieure
3  *
4  * Use of this software is governed by the MIT license
5  *
6  * Written by Sven Verdoolaege,
7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8  */
9 
10 #include <isl/aff.h>
11 #include <isl/ast.h>
12 
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
18 #include "util.h"
19 
print_cuda_macros(__isl_take isl_printer * p)20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
21 {
22 	const char *macros =
23 		"#define cudaCheckReturn(ret) \\\n"
24 		"  do { \\\n"
25 		"    cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 		"    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 		"      fprintf(stderr, \"CUDA error: %s\\n\", "
28 		"cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 		"      fflush(stderr); \\\n"
30 		"    } \\\n"
31 		"    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 		"  } while(0)\n"
33 		"#define cudaCheckKernel() \\\n"
34 		"  do { \\\n"
35 		"    cudaCheckReturn(cudaGetLastError()); \\\n"
36 		"  } while(0)\n\n";
37 
38 	p = isl_printer_print_str(p, macros);
39 	return p;
40 }
41 
42 /* Print a declaration for the device array corresponding to "array" on "p".
43  */
declare_device_array(__isl_take isl_printer * p,struct gpu_array_info * array)44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
45 	struct gpu_array_info *array)
46 {
47 	int i;
48 
49 	p = isl_printer_start_line(p);
50 	p = isl_printer_print_str(p, array->type);
51 	p = isl_printer_print_str(p, " ");
52 	if (!array->linearize && array->n_index > 1)
53 		p = isl_printer_print_str(p, "(");
54 	p = isl_printer_print_str(p, "*dev_");
55 	p = isl_printer_print_str(p, array->name);
56 	if (!array->linearize && array->n_index > 1) {
57 		p = isl_printer_print_str(p, ")");
58 		for (i = 1; i < array->n_index; i++) {
59 			isl_ast_expr *bound;
60 			bound = isl_ast_expr_get_op_arg(array->bound_expr,
61 							1 + i);
62 			p = isl_printer_print_str(p, "[");
63 			p = isl_printer_print_ast_expr(p, bound);
64 			p = isl_printer_print_str(p, "]");
65 			isl_ast_expr_free(bound);
66 		}
67 	}
68 	p = isl_printer_print_str(p, ";");
69 	p = isl_printer_end_line(p);
70 
71 	return p;
72 }
73 
declare_device_arrays(__isl_take isl_printer * p,struct gpu_prog * prog)74 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
75 	struct gpu_prog *prog)
76 {
77 	int i;
78 
79 	for (i = 0; i < prog->n_array; ++i) {
80 		if (!gpu_array_requires_device_allocation(&prog->array[i]))
81 			continue;
82 
83 		p = declare_device_array(p, &prog->array[i]);
84 	}
85 	p = isl_printer_start_line(p);
86 	p = isl_printer_end_line(p);
87 	return p;
88 }
89 
allocate_device_arrays(__isl_take isl_printer * p,struct gpu_prog * prog)90 static __isl_give isl_printer *allocate_device_arrays(
91 	__isl_take isl_printer *p, struct gpu_prog *prog)
92 {
93 	int i;
94 
95 	for (i = 0; i < prog->n_array; ++i) {
96 		struct gpu_array_info *array = &prog->array[i];
97 
98 		if (!gpu_array_requires_device_allocation(&prog->array[i]))
99 			continue;
100 		p = ppcg_ast_expr_print_macros(array->bound_expr, p);
101 		p = isl_printer_start_line(p);
102 		p = isl_printer_print_str(p,
103 			"cudaCheckReturn(cudaMalloc((void **) &dev_");
104 		p = isl_printer_print_str(p, prog->array[i].name);
105 		p = isl_printer_print_str(p, ", ");
106 		p = gpu_array_info_print_size(p, &prog->array[i]);
107 		p = isl_printer_print_str(p, "));");
108 		p = isl_printer_end_line(p);
109 	}
110 	p = isl_printer_start_line(p);
111 	p = isl_printer_end_line(p);
112 	return p;
113 }
114 
free_device_arrays(__isl_take isl_printer * p,struct gpu_prog * prog)115 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
116 	struct gpu_prog *prog)
117 {
118 	int i;
119 
120 	for (i = 0; i < prog->n_array; ++i) {
121 		if (!gpu_array_requires_device_allocation(&prog->array[i]))
122 			continue;
123 		p = isl_printer_start_line(p);
124 		p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
125 		p = isl_printer_print_str(p, prog->array[i].name);
126 		p = isl_printer_print_str(p, "));");
127 		p = isl_printer_end_line(p);
128 	}
129 
130 	return p;
131 }
132 
133 /* Print code to "p" for copying "array" from the host to the device
134  * in its entirety.  The bounds on the extent of "array" have
135  * been precomputed in extract_array_info and are used in
136  * gpu_array_info_print_size.
137  */
copy_array_to_device(__isl_take isl_printer * p,struct gpu_array_info * array)138 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
139 	struct gpu_array_info *array)
140 {
141 	p = isl_printer_start_line(p);
142 	p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
143 	p = isl_printer_print_str(p, array->name);
144 	p = isl_printer_print_str(p, ", ");
145 
146 	if (gpu_array_is_scalar(array))
147 		p = isl_printer_print_str(p, "&");
148 	p = isl_printer_print_str(p, array->name);
149 	p = isl_printer_print_str(p, ", ");
150 
151 	p = gpu_array_info_print_size(p, array);
152 	p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
153 	p = isl_printer_end_line(p);
154 
155 	return p;
156 }
157 
158 /* Print code to "p" for copying "array" back from the device to the host
159  * in its entirety.  The bounds on the extent of "array" have
160  * been precomputed in extract_array_info and are used in
161  * gpu_array_info_print_size.
162  */
copy_array_from_device(__isl_take isl_printer * p,struct gpu_array_info * array)163 static __isl_give isl_printer *copy_array_from_device(
164 	__isl_take isl_printer *p, struct gpu_array_info *array)
165 {
166 	p = isl_printer_start_line(p);
167 	p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
168 	if (gpu_array_is_scalar(array))
169 		p = isl_printer_print_str(p, "&");
170 	p = isl_printer_print_str(p, array->name);
171 	p = isl_printer_print_str(p, ", dev_");
172 	p = isl_printer_print_str(p, array->name);
173 	p = isl_printer_print_str(p, ", ");
174 	p = gpu_array_info_print_size(p, array);
175 	p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
176 	p = isl_printer_end_line(p);
177 
178 	return p;
179 }
180 
print_reverse_list(__isl_take isl_printer * p,int len,int * list)181 static __isl_give isl_printer* print_reverse_list(__isl_take isl_printer *p, int len, int *list)
182 {
183 	int i;
184 
185 	if (len == 0)
186 		return p;
187 
188 	p = isl_printer_print_str(p, "(");
189 	for (i = 0; i < len; ++i) {
190 		if (i)
191 			p = isl_printer_print_str(p, ", ");
192 		p = isl_printer_print_int(p, list[len - 1 - i]);
193 	}
194 	return isl_printer_print_str(p, ")");
195 }
196 
197 /* Print the effective grid size as a list of the sizes in each
198  * dimension, from innermost to outermost.
199  */
print_grid_size(__isl_take isl_printer * p,struct ppcg_kernel * kernel)200 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
201 	struct ppcg_kernel *kernel)
202 {
203 	int i;
204 	int dim;
205 
206 	dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
207 	if (dim == 0)
208 		return p;
209 
210 	p = isl_printer_print_str(p, "(");
211 	for (i = dim - 1; i >= 0; --i) {
212 		isl_ast_expr *bound;
213 
214 		bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i);
215 		p = isl_printer_print_ast_expr(p, bound);
216 		isl_ast_expr_free(bound);
217 
218 		if (i > 0)
219 			p = isl_printer_print_str(p, ", ");
220 	}
221 
222 	p = isl_printer_print_str(p, ")");
223 
224 	return p;
225 }
226 
227 /* Print the grid definition.
228  */
print_grid(__isl_take isl_printer * p,struct ppcg_kernel * kernel)229 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
230 	struct ppcg_kernel *kernel)
231 {
232 	p = isl_printer_start_line(p);
233 	p = isl_printer_print_str(p, "dim3 k");
234 	p = isl_printer_print_int(p, kernel->id);
235 	p = isl_printer_print_str(p, "_dimGrid");
236 	p = print_grid_size(p, kernel);
237 	p = isl_printer_print_str(p, ";");
238 	p = isl_printer_end_line(p);
239 
240 	return p;
241 }
242 
243 /* Print the arguments to a kernel declaration or call.  If "types" is set,
244  * then print a declaration (including the types of the arguments).
245  *
246  * The arguments are printed in the following order
247  * - the arrays accessed by the kernel
248  * - the parameters
249  * - the host loop iterators
250  */
print_kernel_arguments(__isl_take isl_printer * p,struct gpu_prog * prog,struct ppcg_kernel * kernel,int types)251 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
252 	struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
253 {
254 	int i, n;
255 	int first = 1;
256 	unsigned nparam;
257 	isl_space *space;
258 	const char *type;
259 
260 	for (i = 0; i < prog->n_array; ++i) {
261 		int required;
262 
263 		required = ppcg_kernel_requires_array_argument(kernel, i);
264 		if (required < 0)
265 			return isl_printer_free(p);
266 		if (!required)
267 			continue;
268 
269 		if (!first)
270 			p = isl_printer_print_str(p, ", ");
271 
272 		if (types)
273 			p = gpu_array_info_print_declaration_argument(p,
274 				&prog->array[i], NULL);
275 		else
276 			p = gpu_array_info_print_call_argument(p,
277 				&prog->array[i]);
278 
279 		first = 0;
280 	}
281 
282 	space = isl_union_set_get_space(kernel->arrays);
283 	nparam = isl_space_dim(space, isl_dim_param);
284 	for (i = 0; i < nparam; ++i) {
285 		const char *name;
286 
287 		name = isl_space_get_dim_name(space, isl_dim_param, i);
288 
289 		if (!first)
290 			p = isl_printer_print_str(p, ", ");
291 		if (types)
292 			p = isl_printer_print_str(p, "int ");
293 		p = isl_printer_print_str(p, name);
294 
295 		first = 0;
296 	}
297 	isl_space_free(space);
298 
299 	n = isl_space_dim(kernel->space, isl_dim_set);
300 	type = isl_options_get_ast_iterator_type(prog->ctx);
301 	for (i = 0; i < n; ++i) {
302 		const char *name;
303 
304 		if (!first)
305 			p = isl_printer_print_str(p, ", ");
306 		name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
307 		if (types) {
308 			p = isl_printer_print_str(p, type);
309 			p = isl_printer_print_str(p, " ");
310 		}
311 		p = isl_printer_print_str(p, name);
312 
313 		first = 0;
314 	}
315 
316 	return p;
317 }
318 
319 /* Print the header of the given kernel.
320  */
print_kernel_header(__isl_take isl_printer * p,struct gpu_prog * prog,struct ppcg_kernel * kernel)321 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
322 	struct gpu_prog *prog, struct ppcg_kernel *kernel)
323 {
324 	p = isl_printer_start_line(p);
325 	p = isl_printer_print_str(p, "__global__ void kernel");
326 	p = isl_printer_print_int(p, kernel->id);
327 	p = isl_printer_print_str(p, "(");
328 	p = print_kernel_arguments(p, prog, kernel, 1);
329 	p = isl_printer_print_str(p, ")");
330 
331 	return p;
332 }
333 
334 /* Print the header of the given kernel to both gen->cuda.kernel_h
335  * and gen->cuda.kernel_c.
336  */
print_kernel_headers(struct gpu_prog * prog,struct ppcg_kernel * kernel,struct cuda_info * cuda)337 static void print_kernel_headers(struct gpu_prog *prog,
338 	struct ppcg_kernel *kernel, struct cuda_info *cuda)
339 {
340 	isl_printer *p;
341 
342 	p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
343 	p = isl_printer_set_output_format(p, ISL_FORMAT_C);
344 	p = print_kernel_header(p, prog, kernel);
345 	p = isl_printer_print_str(p, ";");
346 	p = isl_printer_end_line(p);
347 	isl_printer_free(p);
348 
349 	p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
350 	p = isl_printer_set_output_format(p, ISL_FORMAT_C);
351 	p = print_kernel_header(p, prog, kernel);
352 	p = isl_printer_end_line(p);
353 	isl_printer_free(p);
354 }
355 
print_indent(FILE * dst,int indent)356 static void print_indent(FILE *dst, int indent)
357 {
358 	fprintf(dst, "%*s", indent, "");
359 }
360 
361 /* Print a list of iterators of type "type" with names "ids" to "out".
362  * Each iterator is assigned one of the cuda identifiers in cuda_dims.
363  * In particular, the last iterator is assigned the x identifier
364  * (the first in the list of cuda identifiers).
365  */
print_iterators(FILE * out,const char * type,__isl_keep isl_id_list * ids,const char * cuda_dims[])366 static void print_iterators(FILE *out, const char *type,
367 	__isl_keep isl_id_list *ids, const char *cuda_dims[])
368 {
369 	int i, n;
370 
371 	n = isl_id_list_n_id(ids);
372 	if (n <= 0)
373 		return;
374 	print_indent(out, 4);
375 	fprintf(out, "%s ", type);
376 	for (i = 0; i < n; ++i) {
377 		isl_id *id;
378 
379 		if (i)
380 			fprintf(out, ", ");
381 		id = isl_id_list_get_id(ids, i);
382 		fprintf(out, "%s = %s", isl_id_get_name(id),
383 			cuda_dims[n - 1 - i]);
384 		isl_id_free(id);
385 	}
386 	fprintf(out, ";\n");
387 }
388 
print_kernel_iterators(FILE * out,struct ppcg_kernel * kernel)389 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
390 {
391 	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
392 	const char *type;
393 	const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
394 	const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
395 					"threadIdx.z" };
396 
397 	type = isl_options_get_ast_iterator_type(ctx);
398 
399 	print_iterators(out, type, kernel->block_ids, block_dims);
400 	print_iterators(out, type, kernel->thread_ids, thread_dims);
401 }
402 
print_kernel_var(__isl_take isl_printer * p,struct ppcg_kernel_var * var)403 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
404 	struct ppcg_kernel_var *var)
405 {
406 	int j;
407 
408 	p = isl_printer_start_line(p);
409 	if (var->type == ppcg_access_shared)
410 		p = isl_printer_print_str(p, "__shared__ ");
411 	p = isl_printer_print_str(p, var->array->type);
412 	p = isl_printer_print_str(p, " ");
413 	p = isl_printer_print_str(p,  var->name);
414 	for (j = 0; j < var->array->n_index; ++j) {
415 		isl_val *v;
416 
417 		p = isl_printer_print_str(p, "[");
418 		v = isl_vec_get_element_val(var->size, j);
419 		p = isl_printer_print_val(p, v);
420 		isl_val_free(v);
421 		p = isl_printer_print_str(p, "]");
422 	}
423 	p = isl_printer_print_str(p, ";");
424 	p = isl_printer_end_line(p);
425 
426 	return p;
427 }
428 
print_kernel_vars(__isl_take isl_printer * p,struct ppcg_kernel * kernel)429 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
430 	struct ppcg_kernel *kernel)
431 {
432 	int i;
433 
434 	for (i = 0; i < kernel->n_var; ++i)
435 		p = print_kernel_var(p, &kernel->var[i]);
436 
437 	return p;
438 }
439 
440 /* Print a sync statement.
441  */
print_sync(__isl_take isl_printer * p,struct ppcg_kernel_stmt * stmt)442 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
443 	struct ppcg_kernel_stmt *stmt)
444 {
445 	p = isl_printer_start_line(p);
446 	p = isl_printer_print_str(p, "__syncthreads();");
447 	p = isl_printer_end_line(p);
448 
449 	return p;
450 }
451 
452 /* This function is called for each user statement in the AST,
453  * i.e., for each kernel body statement, copy statement or sync statement.
454  */
print_kernel_stmt(__isl_take isl_printer * p,__isl_take isl_ast_print_options * print_options,__isl_keep isl_ast_node * node,void * user)455 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
456 	__isl_take isl_ast_print_options *print_options,
457 	__isl_keep isl_ast_node *node, void *user)
458 {
459 	isl_id *id;
460 	struct ppcg_kernel_stmt *stmt;
461 
462 	id = isl_ast_node_get_annotation(node);
463 	stmt = isl_id_get_user(id);
464 	isl_id_free(id);
465 
466 	isl_ast_print_options_free(print_options);
467 
468 	switch (stmt->type) {
469 	case ppcg_kernel_copy:
470 		return ppcg_kernel_print_copy(p, stmt);
471 	case ppcg_kernel_sync:
472 		return print_sync(p, stmt);
473 	case ppcg_kernel_domain:
474 		return ppcg_kernel_print_domain(p, stmt);
475 	}
476 
477 	return p;
478 }
479 
print_kernel(struct gpu_prog * prog,struct ppcg_kernel * kernel,struct cuda_info * cuda)480 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
481 	struct cuda_info *cuda)
482 {
483 	isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
484 	isl_ast_print_options *print_options;
485 	isl_printer *p;
486 
487 	print_kernel_headers(prog, kernel, cuda);
488 	fprintf(cuda->kernel_c, "{\n");
489 	print_kernel_iterators(cuda->kernel_c, kernel);
490 
491 	p = isl_printer_to_file(ctx, cuda->kernel_c);
492 	p = isl_printer_set_output_format(p, ISL_FORMAT_C);
493 	p = isl_printer_indent(p, 4);
494 
495 	p = print_kernel_vars(p, kernel);
496 	p = isl_printer_end_line(p);
497 	p = ppcg_set_macro_names(p);
498 	p = gpu_print_macros(p, kernel->tree);
499 
500 	print_options = isl_ast_print_options_alloc(ctx);
501 	print_options = isl_ast_print_options_set_print_user(print_options,
502 							&print_kernel_stmt, NULL);
503 	p = isl_ast_node_print(kernel->tree, p, print_options);
504 	isl_printer_free(p);
505 
506 	fprintf(cuda->kernel_c, "}\n");
507 }
508 
509 /* Print code for initializing the device for execution of the transformed
510  * code.  This includes declaring locally defined variables as well as
511  * declaring and allocating the required copies of arrays on the device.
512  */
init_device(__isl_take isl_printer * p,struct gpu_prog * prog)513 static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
514 	struct gpu_prog *prog)
515 {
516 	p = print_cuda_macros(p);
517 
518 	p = gpu_print_local_declarations(p, prog);
519 	p = declare_device_arrays(p, prog);
520 	p = allocate_device_arrays(p, prog);
521 
522 	return p;
523 }
524 
525 /* Print code for clearing the device after execution of the transformed code.
526  * In particular, free the memory that was allocated on the device.
527  */
clear_device(__isl_take isl_printer * p,struct gpu_prog * prog)528 static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
529 	struct gpu_prog *prog)
530 {
531 	p = free_device_arrays(p, prog);
532 
533 	return p;
534 }
535 
536 /* Print a statement for copying an array to or from the device,
537  * or for initializing or clearing the device.
538  * The statement identifier of a copying node is called
539  * "to_device_<array name>" or "from_device_<array name>" and
540  * its user pointer points to the gpu_array_info of the array
541  * that needs to be copied.
542  * The node for initializing the device is called "init_device".
543  * The node for clearing the device is called "clear_device".
544  *
545  * Extract the array (if any) from the identifier and call
546  * init_device, clear_device, copy_array_to_device or copy_array_from_device.
547  */
print_device_node(__isl_take isl_printer * p,__isl_keep isl_ast_node * node,struct gpu_prog * prog)548 static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
549 	__isl_keep isl_ast_node *node, struct gpu_prog *prog)
550 {
551 	isl_ast_expr *expr, *arg;
552 	isl_id *id;
553 	const char *name;
554 	struct gpu_array_info *array;
555 
556 	expr = isl_ast_node_user_get_expr(node);
557 	arg = isl_ast_expr_get_op_arg(expr, 0);
558 	id = isl_ast_expr_get_id(arg);
559 	name = isl_id_get_name(id);
560 	array = isl_id_get_user(id);
561 	isl_id_free(id);
562 	isl_ast_expr_free(arg);
563 	isl_ast_expr_free(expr);
564 
565 	if (!name)
566 		return isl_printer_free(p);
567 	if (!strcmp(name, "init_device"))
568 		return init_device(p, prog);
569 	if (!strcmp(name, "clear_device"))
570 		return clear_device(p, prog);
571 	if (!array)
572 		return isl_printer_free(p);
573 
574 	if (!prefixcmp(name, "to_device"))
575 		return copy_array_to_device(p, array);
576 	else
577 		return copy_array_from_device(p, array);
578 }
579 
580 struct print_host_user_data {
581 	struct cuda_info *cuda;
582 	struct gpu_prog *prog;
583 };
584 
585 /* Print the user statement of the host code to "p".
586  *
587  * The host code may contain original user statements, kernel launches,
588  * statements that copy data to/from the device and statements
589  * the initialize or clear the device.
590  * The original user statements and the kernel launches have
591  * an associated annotation, while the other statements do not.
592  * The latter are handled by print_device_node.
593  * The annotation on the user statements is called "user".
594  *
595  * In case of a kernel launch, print a block of statements that
596  * defines the grid and the block and then launches the kernel.
597  */
print_host_user(__isl_take isl_printer * p,__isl_take isl_ast_print_options * print_options,__isl_keep isl_ast_node * node,void * user)598 __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
599 	__isl_take isl_ast_print_options *print_options,
600 	__isl_keep isl_ast_node *node, void *user)
601 {
602 	isl_id *id;
603 	int is_user;
604 	struct ppcg_kernel *kernel;
605 	struct ppcg_kernel_stmt *stmt;
606 	struct print_host_user_data *data;
607 
608 	isl_ast_print_options_free(print_options);
609 
610 	data = (struct print_host_user_data *) user;
611 
612 	id = isl_ast_node_get_annotation(node);
613 	if (!id)
614 		return print_device_node(p, node, data->prog);
615 
616 	is_user = !strcmp(isl_id_get_name(id), "user");
617 	kernel = is_user ? NULL : isl_id_get_user(id);
618 	stmt = is_user ? isl_id_get_user(id) : NULL;
619 	isl_id_free(id);
620 
621 	if (is_user)
622 		return ppcg_kernel_print_domain(p, stmt);
623 
624 	p = ppcg_start_block(p);
625 
626 	p = isl_printer_start_line(p);
627 	p = isl_printer_print_str(p, "dim3 k");
628 	p = isl_printer_print_int(p, kernel->id);
629 	p = isl_printer_print_str(p, "_dimBlock");
630 	p = print_reverse_list(p, kernel->n_block, kernel->block_dim);
631 	p = isl_printer_print_str(p, ";");
632 	p = isl_printer_end_line(p);
633 
634 	p = print_grid(p, kernel);
635 
636 	p = isl_printer_start_line(p);
637 	p = isl_printer_print_str(p, "kernel");
638 	p = isl_printer_print_int(p, kernel->id);
639 	p = isl_printer_print_str(p, " <<<k");
640 	p = isl_printer_print_int(p, kernel->id);
641 	p = isl_printer_print_str(p, "_dimGrid, k");
642 	p = isl_printer_print_int(p, kernel->id);
643 	p = isl_printer_print_str(p, "_dimBlock>>> (");
644 	p = print_kernel_arguments(p, data->prog, kernel, 0);
645 	p = isl_printer_print_str(p, ");");
646 	p = isl_printer_end_line(p);
647 
648 	p = isl_printer_start_line(p);
649 	p = isl_printer_print_str(p, "cudaCheckKernel();");
650 	p = isl_printer_end_line(p);
651 
652 	p = ppcg_end_block(p);
653 
654 	p = isl_printer_start_line(p);
655 	p = isl_printer_end_line(p);
656 
657 #if 0
658 	print_kernel(data->prog, kernel, data->cuda);
659 #endif
660 
661 	return p;
662 }
663 
print_host_code(__isl_take isl_printer * p,struct gpu_prog * prog,__isl_keep isl_ast_node * tree,struct cuda_info * cuda)664 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
665 	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
666 	struct cuda_info *cuda)
667 {
668 	isl_ast_print_options *print_options;
669 	isl_ctx *ctx = isl_ast_node_get_ctx(tree);
670 	struct print_host_user_data data = { cuda, prog };
671 
672 	print_options = isl_ast_print_options_alloc(ctx);
673 	print_options = isl_ast_print_options_set_print_user(print_options,
674 						&print_host_user, &data);
675 
676 	p = gpu_print_macros(p, tree);
677 	p = isl_ast_node_print(tree, p, print_options);
678 
679 	return p;
680 }
681 
682 /* Given a gpu_prog "prog" and the corresponding transformed AST
683  * "tree", print the entire CUDA code to "p".
684  * "types" collects the types for which a definition has already
685  * been printed.
686  */
print_cuda(__isl_take isl_printer * p,struct gpu_prog * prog,__isl_keep isl_ast_node * tree,struct gpu_types * types,void * user)687 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
688 	struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
689 	struct gpu_types *types, void *user)
690 {
691 	struct cuda_info *cuda = user;
692 	isl_printer *kernel;
693 
694 	kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
695 	kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
696 	kernel = gpu_print_types(kernel, types, prog);
697 	isl_printer_free(kernel);
698 
699 	if (!kernel)
700 		return isl_printer_free(p);
701 
702 	p = print_host_code(p, prog, tree, cuda);
703 
704 	return p;
705 }
706 
707 /* Transform the code in the file called "input" by replacing
708  * all scops by corresponding CUDA code.
709  * The names of the output files are derived from "input".
710  *
711  * We let generate_gpu do all the hard work and then let it call
712  * us back for printing the AST in print_cuda.
713  *
714  * To prepare for this printing, we first open the output files
715  * and we close them after generate_gpu has finished.
716  */
generate_cuda(isl_ctx * ctx,struct ppcg_options * options,const char * input)717 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
718 	const char *input)
719 {
720 	struct cuda_info cuda;
721 	int r;
722 
723 	cuda_open_files(&cuda, input);
724 
725 	r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
726 
727 	cuda_close_files(&cuda);
728 
729 	return r;
730 }
731