1 /* Target code for NVPTX.
2 Copyright (C) 2014-2018 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "gomp-constants.h"
63 #include "dumpfile.h"
64 #include "internal-fn.h"
65 #include "gimple-iterator.h"
66 #include "stringpool.h"
67 #include "attribs.h"
68 #include "tree-vrp.h"
69 #include "tree-ssa-operands.h"
70 #include "tree-ssanames.h"
71 #include "gimplify.h"
72 #include "tree-phinodes.h"
73 #include "cfgloop.h"
74 #include "fold-const.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 #define WORKAROUND_PTXJIT_BUG 1
81 #define WORKAROUND_PTXJIT_BUG_2 1
82
83 /* The various PTX memory areas an object might reside in. */
84 enum nvptx_data_area
85 {
86 DATA_AREA_GENERIC,
87 DATA_AREA_GLOBAL,
88 DATA_AREA_SHARED,
89 DATA_AREA_LOCAL,
90 DATA_AREA_CONST,
91 DATA_AREA_PARAM,
92 DATA_AREA_MAX
93 };
94
95 /* We record the data area in the target symbol flags. */
96 #define SYMBOL_DATA_AREA(SYM) \
97 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
98 & 7)
99 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
100 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
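
/* For example, after
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
   the area is held in the three machine-dependent bits of
   SYMBOL_REF_FLAGS, SYMBOL_DATA_AREA (sym) reads back
   DATA_AREA_SHARED, and section_for_sym will later map it to the
   ".shared" prefix.  */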
101
102 /* Record the function decls we've written, and the libfuncs and function
103 decls corresponding to them. */
104 static std::stringstream func_decls;
105
106 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
107 {
108 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
109 static bool equal (rtx a, rtx b) { return a == b; }
110 };
111
112 static GTY((cache))
113 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
114
115 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
116 {
117 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
118 static bool equal (tree a, tree b) { return a == b; }
119 };
120
121 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
122 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
123
124 /* Buffer needed to broadcast across workers. This is used for both
125 worker-neutering and worker broadcasting. It is shared by all
126 functions emitted. The buffer is placed in shared memory. It'd be
127 nice if PTX supported common blocks, because then this could be
128 shared across TUs (taking the largest size). */
129 static unsigned worker_bcast_size;
130 static unsigned worker_bcast_align;
131 static GTY(()) rtx worker_bcast_sym;
132
133 /* Buffer needed for worker reductions. This has to be distinct from
134 the worker broadcast array, as both may be live concurrently. */
135 static unsigned worker_red_size;
136 static unsigned worker_red_align;
137 static GTY(()) rtx worker_red_sym;
138
139 /* Global lock variable, needed for 128bit worker & gang reductions. */
140 static GTY(()) tree global_lock_var;
141
142 /* True if any function references __nvptx_stacks. */
143 static bool need_softstack_decl;
144
145 /* True if any function references __nvptx_uni. */
146 static bool need_unisimt_decl;
147
148 /* Allocate a new, cleared machine_function structure. */
149
150 static struct machine_function *
151 nvptx_init_machine_status (void)
152 {
153 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
154 p->return_mode = VOIDmode;
155 return p;
156 }
157
158 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
159 and -fopenacc is also enabled. */
160
161 static void
162 diagnose_openacc_conflict (bool optval, const char *optname)
163 {
164 if (flag_openacc && optval)
165 error ("option %s is not supported together with -fopenacc", optname);
166 }
167
168 /* Implement TARGET_OPTION_OVERRIDE. */
169
170 static void
171 nvptx_option_override (void)
172 {
173 init_machine_status = nvptx_init_machine_status;
174
175 /* Set toplevel_reorder, unless explicitly disabled. We need
176 reordering so that we emit necessary assembler decls of
177 undeclared variables. */
178 if (!global_options_set.x_flag_toplevel_reorder)
179 flag_toplevel_reorder = 1;
180
181 debug_nonbind_markers_p = 0;
182
183 /* Set flag_no_common, unless explicitly disabled. We fake common
184 using .weak, and that's not entirely accurate, so avoid it
185 unless forced. */
186 if (!global_options_set.x_flag_no_common)
187 flag_no_common = 1;
188
189 /* The patch area requires nops, which we don't have. */
190 if (function_entry_patch_area_size > 0)
191 sorry ("not generating patch area, nops not supported");
192
193 /* Assumes that it will see only hard registers. */
194 flag_var_tracking = 0;
195
196 if (nvptx_optimize < 0)
197 nvptx_optimize = optimize > 0;
198
199 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
200 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
201 declared_libfuncs_htab
202 = hash_table<declared_libfunc_hasher>::create_ggc (17);
203
204 worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
205 SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
206 worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
207
208 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
209 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
210 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
211
212 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
213 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
214 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
215
216 if (TARGET_GOMP)
217 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
218 }
219
220 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
221 deal with ptx idiosyncrasies. */
222
223 const char *
224 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
225 {
226 switch (mode)
227 {
228 case E_BLKmode:
229 return ".b8";
230 case E_BImode:
231 return ".pred";
232 case E_QImode:
233 if (promote)
234 return ".u32";
235 else
236 return ".u8";
237 case E_HImode:
238 return ".u16";
239 case E_SImode:
240 return ".u32";
241 case E_DImode:
242 return ".u64";
243
244 case E_SFmode:
245 return ".f32";
246 case E_DFmode:
247 return ".f64";
248
249 case E_V2SImode:
250 return ".v2.u32";
251 case E_V2DImode:
252 return ".v2.u64";
253
254 default:
255 gcc_unreachable ();
256 }
257 }
258
259 /* Encode the PTX data area that DECL (which might not actually be a
260 _DECL) should reside in. */
261
262 static void
263 nvptx_encode_section_info (tree decl, rtx rtl, int first)
264 {
265 default_encode_section_info (decl, rtl, first);
266 if (first && MEM_P (rtl))
267 {
268 nvptx_data_area area = DATA_AREA_GENERIC;
269
270 if (TREE_CONSTANT (decl))
271 area = DATA_AREA_CONST;
272 else if (TREE_CODE (decl) == VAR_DECL)
273 {
274 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
275 {
276 area = DATA_AREA_SHARED;
277 if (DECL_INITIAL (decl))
278 error ("static initialization of variable %q+D in %<.shared%>"
279 " memory is not supported", decl);
280 }
281 else
282 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
283 }
284
285 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
286 }
287 }
288
289 /* Return the PTX name of the data area in which SYM should be
290 placed. The symbol must have already been processed by
291 nvptx_encode_section_info, or equivalent. */
292
293 static const char *
294 section_for_sym (rtx sym)
295 {
296 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
297 /* Same order as nvptx_data_area enum. */
298 static char const *const areas[] =
299 {"", ".global", ".shared", ".local", ".const", ".param"};
300
301 return areas[area];
302 }
303
304 /* Similarly for a decl. */
305
306 static const char *
307 section_for_decl (const_tree decl)
308 {
309 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
310 }
311
312 /* Check NAME for special function names and redirect them by returning a
313 replacement. This applies to malloc, free and realloc, for which we
314 want to use libgcc wrappers, and call, which triggers a bug in
315 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
316 not active in an offload compiler -- the names are all set by the
317 host-side compiler. */
318
319 static const char *
320 nvptx_name_replacement (const char *name)
321 {
322 if (strcmp (name, "call") == 0)
323 return "__nvptx_call";
324 if (strcmp (name, "malloc") == 0)
325 return "__nvptx_malloc";
326 if (strcmp (name, "free") == 0)
327 return "__nvptx_free";
328 if (strcmp (name, "realloc") == 0)
329 return "__nvptx_realloc";
330 return name;
331 }
332
333 /* If MODE should be treated as two registers of an inner mode, return
334 that inner mode. Otherwise return VOIDmode. */
335
336 static machine_mode
337 maybe_split_mode (machine_mode mode)
338 {
339 if (COMPLEX_MODE_P (mode))
340 return GET_MODE_INNER (mode);
341
342 if (mode == TImode)
343 return DImode;
344
345 return VOIDmode;
346 }
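
/* For example, TImode is handled as a pair of DImode registers, and a
   complex mode such as DCmode as a pair of its DFmode components,
   while SImode is not split and yields VOIDmode here.  */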
347
348 /* Return true if mode should be treated as two registers. */
349
350 static bool
351 split_mode_p (machine_mode mode)
352 {
353 return maybe_split_mode (mode) != VOIDmode;
354 }
355
356 /* Output a register, subreg, or register pair (with optional
357 enclosing braces). */
358
359 static void
360 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
361 int subreg_offset = -1)
362 {
363 if (inner_mode == VOIDmode)
364 {
365 if (HARD_REGISTER_NUM_P (regno))
366 fprintf (file, "%s", reg_names[regno]);
367 else
368 fprintf (file, "%%r%d", regno);
369 }
370 else if (subreg_offset >= 0)
371 {
372 output_reg (file, regno, VOIDmode);
373 fprintf (file, "$%d", subreg_offset);
374 }
375 else
376 {
377 if (subreg_offset == -1)
378 fprintf (file, "{");
379 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
380 fprintf (file, ",");
381 output_reg (file, regno, inner_mode, 0);
382 if (subreg_offset == -1)
383 fprintf (file, "}");
384 }
385 }
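
/* As an illustration, for a TImode pseudo 23 (split into DImode halves)
     output_reg (file, 23, DImode);
   prints the register pair "{%r23$8,%r23$0}", whereas
     output_reg (file, 23, VOIDmode);
   prints just "%r23".  */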
386
387 /* Emit forking instructions for MASK. */
388
389 static void
390 nvptx_emit_forking (unsigned mask, bool is_call)
391 {
392 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
393 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
394 if (mask)
395 {
396 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
397
398 /* Emit fork at all levels. This helps form SESE regions, as
399 it creates a block with a single successor before entering a
400 partitioned region. That is a good candidate for the end of
401 an SESE region. */
402 emit_insn (gen_nvptx_fork (op));
403 emit_insn (gen_nvptx_forked (op));
404 }
405 }
406
407 /* Emit joining instructions for MASK. */
408
409 static void
410 nvptx_emit_joining (unsigned mask, bool is_call)
411 {
412 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
413 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
414 if (mask)
415 {
416 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
417
418 /* Emit joining for all non-call pars to ensure there's a single
419 predecessor for the block the join insn ends up in. This is
420 needed for skipping entire loops. */
421 emit_insn (gen_nvptx_joining (op));
422 emit_insn (gen_nvptx_join (op));
423 }
424 }
425
426
427 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
428 returned in memory. Integer and floating types supported by the
429 machine are passed in registers, everything else is passed in
430 memory. Complex types are split. */
431
432 static bool
433 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
434 {
435 if (type)
436 {
437 if (AGGREGATE_TYPE_P (type))
438 return true;
439 if (TREE_CODE (type) == VECTOR_TYPE)
440 return true;
441 }
442
443 if (!for_return && COMPLEX_MODE_P (mode))
444 /* Complex types are passed as two underlying args. */
445 mode = GET_MODE_INNER (mode);
446
447 if (GET_MODE_CLASS (mode) != MODE_INT
448 && GET_MODE_CLASS (mode) != MODE_FLOAT)
449 return true;
450
451 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
452 return true;
453
454 return false;
455 }
456
457 /* A non-memory argument of mode MODE is being passed; determine the mode it
458 should be promoted to. This is also used for determining return
459 type promotion. */
460
461 static machine_mode
462 promote_arg (machine_mode mode, bool prototyped)
463 {
464 if (!prototyped && mode == SFmode)
465 /* K&R float promotion for unprototyped functions. */
466 mode = DFmode;
467 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
468 mode = SImode;
469
470 return mode;
471 }
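
/* So, for example, a QImode or HImode scalar is widened to SImode, an
   SFmode argument to an unprototyped callee is passed as DFmode, and
   SImode or wider modes are left unchanged.  */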
472
473 /* A non-memory return type of MODE is being returned. Determine the
474 mode it should be promoted to. */
475
476 static machine_mode
477 promote_return (machine_mode mode)
478 {
479 return promote_arg (mode, true);
480 }
481
482 /* Implement TARGET_FUNCTION_ARG. */
483
484 static rtx
485 nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
486 const_tree, bool named)
487 {
488 if (mode == VOIDmode || !named)
489 return NULL_RTX;
490
491 return gen_reg_rtx (mode);
492 }
493
494 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
495
496 static rtx
497 nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
498 const_tree, bool named)
499 {
500 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
501
502 if (mode == VOIDmode || !named)
503 return NULL_RTX;
504
505 /* No need to deal with split modes here, the only case that can
506 happen is complex modes and those are dealt with by
507 TARGET_SPLIT_COMPLEX_ARG. */
508 return gen_rtx_UNSPEC (mode,
509 gen_rtvec (1, GEN_INT (cum->count)),
510 UNSPEC_ARG_REG);
511 }
512
513 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
514
515 static void
516 nvptx_function_arg_advance (cumulative_args_t cum_v,
517 machine_mode ARG_UNUSED (mode),
518 const_tree ARG_UNUSED (type),
519 bool ARG_UNUSED (named))
520 {
521 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
522
523 cum->count++;
524 }
525
526 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
527
528 For nvptx this is only used for variadic args. The type has already
529 been promoted and/or converted to an invisible reference. */
530
531 static unsigned
532 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
533 {
534 return GET_MODE_ALIGNMENT (mode);
535 }
536
537 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
538
539 For nvptx, we know how to handle functions declared as stdarg: by
540 passing an extra pointer to the unnamed arguments. However, the
541 Fortran frontend can produce a different situation, where a
542 function pointer is declared with no arguments, but the actual
543 function and calls to it take more arguments. In that case, we
544 want to ensure the call matches the definition of the function. */
545
546 static bool
547 nvptx_strict_argument_naming (cumulative_args_t cum_v)
548 {
549 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
550
551 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
552 }
553
554 /* Implement TARGET_LIBCALL_VALUE. */
555
556 static rtx
557 nvptx_libcall_value (machine_mode mode, const_rtx)
558 {
559 if (!cfun || !cfun->machine->doing_call)
560 /* Pretend to return in a hard reg for early uses before pseudos can be
561 generated. */
562 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
563
564 return gen_reg_rtx (mode);
565 }
566
567 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
568 where function FUNC returns or receives a value of data type TYPE. */
569
570 static rtx
571 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
572 bool outgoing)
573 {
574 machine_mode mode = promote_return (TYPE_MODE (type));
575
576 if (outgoing)
577 {
578 gcc_assert (cfun);
579 cfun->machine->return_mode = mode;
580 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
581 }
582
583 return nvptx_libcall_value (mode, NULL_RTX);
584 }
585
586 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
587
588 static bool
589 nvptx_function_value_regno_p (const unsigned int regno)
590 {
591 return regno == NVPTX_RETURN_REGNUM;
592 }
593
594 /* Types with a mode other than those supported by the machine are passed by
595 reference in memory. */
596
597 static bool
598 nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
599 machine_mode mode, const_tree type,
600 bool ARG_UNUSED (named))
601 {
602 return pass_in_memory (mode, type, false);
603 }
604
605 /* Implement TARGET_RETURN_IN_MEMORY. */
606
607 static bool
608 nvptx_return_in_memory (const_tree type, const_tree)
609 {
610 return pass_in_memory (TYPE_MODE (type), type, true);
611 }
612
613 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
614
615 static machine_mode
616 nvptx_promote_function_mode (const_tree type, machine_mode mode,
617 int *ARG_UNUSED (punsignedp),
618 const_tree funtype, int for_return)
619 {
620 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
621 }
622
623 /* Helper for write_arg_type. Emit a single PTX argument of MODE, either
624 in a prototype, or as a copy in a function prologue. ARGNO is the
625 index of this argument in the PTX function. FOR_REG is negative
626 if we're emitting the PTX prototype. It is zero if we're copying
627 to an argument register and it is greater than zero if we're
628 copying to a specific hard register. */
629
630 static int
631 write_arg_mode (std::stringstream &s, int for_reg, int argno,
632 machine_mode mode)
633 {
634 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
635
636 if (for_reg < 0)
637 {
638 /* Writing PTX prototype. */
639 s << (argno ? ", " : " (");
640 s << ".param" << ptx_type << " %in_ar" << argno;
641 }
642 else
643 {
644 s << "\t.reg" << ptx_type << " ";
645 if (for_reg)
646 s << reg_names[for_reg];
647 else
648 s << "%ar" << argno;
649 s << ";\n";
650 if (argno >= 0)
651 {
652 s << "\tld.param" << ptx_type << " ";
653 if (for_reg)
654 s << reg_names[for_reg];
655 else
656 s << "%ar" << argno;
657 s << ", [%in_ar" << argno << "];\n";
658 }
659 }
660 return argno + 1;
661 }
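
/* As an illustration, an SImode argument number 0 produces
     " (.param.u32 %in_ar0"
   when writing a prototype (FOR_REG < 0), and
     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];
   when copying to an argument register in a prologue (FOR_REG == 0).  */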
662
663 /* Process function parameter TYPE to emit one or more PTX
664 arguments. S, FOR_REG and ARGNO are as for write_arg_mode. PROTOTYPED
665 is true if this is a prototyped function rather than an old-style
666 C declaration. Returns the next argument number to use.
667
668 The promotion behavior here must match the regular GCC function
669 parameter marshalling machinery. */
670
671 static int
672 write_arg_type (std::stringstream &s, int for_reg, int argno,
673 tree type, bool prototyped)
674 {
675 machine_mode mode = TYPE_MODE (type);
676
677 if (mode == VOIDmode)
678 return argno;
679
680 if (pass_in_memory (mode, type, false))
681 mode = Pmode;
682 else
683 {
684 bool split = TREE_CODE (type) == COMPLEX_TYPE;
685
686 if (split)
687 {
688 /* Complex types are sent as two separate args. */
689 type = TREE_TYPE (type);
690 mode = TYPE_MODE (type);
691 prototyped = true;
692 }
693
694 mode = promote_arg (mode, prototyped);
695 if (split)
696 argno = write_arg_mode (s, for_reg, argno, mode);
697 }
698
699 return write_arg_mode (s, for_reg, argno, mode);
700 }
701
702 /* Emit a PTX return as a prototype or function prologue declaration
703 for MODE. */
704
705 static void
706 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
707 {
708 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
709 const char *pfx = "\t.reg";
710 const char *sfx = ";\n";
711
712 if (for_proto)
713 pfx = "(.param", sfx = "_out) ";
714
715 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
716 }
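
/* For instance, an SImode return value produces
     "(.param.u32 %value_out) "
   in a prototype and
     .reg.u32 %value;
   in a prologue, assuming the return register's printable name is
   "%value".  */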
717
718 /* Process a function return TYPE to emit a PTX return as a prototype
719 or function prologue declaration. Returns true if return is via an
720 additional pointer parameter. The promotion behavior here must
721 match the regular GCC function return marshalling. */
722
723 static bool
724 write_return_type (std::stringstream &s, bool for_proto, tree type)
725 {
726 machine_mode mode = TYPE_MODE (type);
727
728 if (mode == VOIDmode)
729 return false;
730
731 bool return_in_mem = pass_in_memory (mode, type, true);
732
733 if (return_in_mem)
734 {
735 if (for_proto)
736 return return_in_mem;
737
738 /* Named return values can cause us to return a pointer as well
739 as expect an argument for the return location. This is
740 optimization-level specific, so no caller can make use of
741 this data, but more importantly for us, we must ensure it
742 doesn't change the PTX prototype. */
743 mode = (machine_mode) cfun->machine->return_mode;
744
745 if (mode == VOIDmode)
746 return return_in_mem;
747
748 /* Clear return_mode to inhibit copy of retval to non-existent
749 retval parameter. */
750 cfun->machine->return_mode = VOIDmode;
751 }
752 else
753 mode = promote_return (mode);
754
755 write_return_mode (s, for_proto, mode);
756
757 return return_in_mem;
758 }
759
760 /* Look for attributes in ATTRS that would indicate we must write a function
761 as a .entry kernel rather than a .func. Return true if one is found. */
762
763 static bool
764 write_as_kernel (tree attrs)
765 {
766 return (lookup_attribute ("kernel", attrs) != NULL_TREE
767 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
768 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
769 /* For OpenMP target regions, the corresponding kernel entry is emitted from
770 write_omp_entry as a separate function. */
771 }
772
773 /* Emit a linker marker for a function decl or defn. */
774
775 static void
776 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
777 const char *name)
778 {
779 s << "\n// BEGIN";
780 if (globalize)
781 s << " GLOBAL";
782 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
783 s << name << "\n";
784 }
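
/* The emitted markers look like
     // BEGIN GLOBAL FUNCTION DECL: foo
     // BEGIN GLOBAL FUNCTION DEF: foo
   with " GLOBAL" omitted for file-local symbols; these are the helper
   comments the linker-side tooling scans for.  */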
785
786 /* Emit a linker marker for a variable decl or defn. */
787
788 static void
789 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
790 {
791 fprintf (file, "\n// BEGIN%s VAR %s: ",
792 globalize ? " GLOBAL" : "",
793 is_defn ? "DEF" : "DECL");
794 assemble_name_raw (file, name);
795 fputs ("\n", file);
796 }
797
798 /* Write a .func or .kernel declaration or definition along with
799 a helper comment for use by ld. S is the stream to write to, DECL
800 the decl for the function with name NAME. For definitions, emit
801 a declaration too. */
802
803 static const char *
804 write_fn_proto (std::stringstream &s, bool is_defn,
805 const char *name, const_tree decl)
806 {
807 if (is_defn)
808 /* Emit a declaration. The PTX assembler gets upset without it. */
809 name = write_fn_proto (s, false, name, decl);
810 else
811 {
812 /* Avoid repeating the name replacement. */
813 name = nvptx_name_replacement (name);
814 if (name[0] == '*')
815 name++;
816 }
817
818 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
819
820 /* PTX declaration. */
821 if (DECL_EXTERNAL (decl))
822 s << ".extern ";
823 else if (TREE_PUBLIC (decl))
824 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
825 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
826
827 tree fntype = TREE_TYPE (decl);
828 tree result_type = TREE_TYPE (fntype);
829
830 /* atomic_compare_exchange_$n builtins have an exceptional calling
831 convention. */
832 int not_atomic_weak_arg = -1;
833 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
834 switch (DECL_FUNCTION_CODE (decl))
835 {
836 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
837 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
838 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
839 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
840 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
841 /* These atomics skip the 'weak' parm in an actual library
842 call. We must skip it in the prototype too. */
843 not_atomic_weak_arg = 3;
844 break;
845
846 default:
847 break;
848 }
849
850 /* Declare the result. */
851 bool return_in_mem = write_return_type (s, true, result_type);
852
853 s << name;
854
855 int argno = 0;
856
857 /* Emit argument list. */
858 if (return_in_mem)
859 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
860
861 /* We get:
862 NULL in TYPE_ARG_TYPES, for old-style functions
863 NULL in DECL_ARGUMENTS, for builtin functions without another
864 declaration.
865 So we have to pick the best one we have. */
866 tree args = TYPE_ARG_TYPES (fntype);
867 bool prototyped = true;
868 if (!args)
869 {
870 args = DECL_ARGUMENTS (decl);
871 prototyped = false;
872 }
873
874 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
875 {
876 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
877
878 if (not_atomic_weak_arg)
879 argno = write_arg_type (s, -1, argno, type, prototyped);
880 else
881 gcc_assert (type == boolean_type_node);
882 }
883
884 if (stdarg_p (fntype))
885 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
886
887 if (DECL_STATIC_CHAIN (decl))
888 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
889
890 if (!argno && strcmp (name, "main") == 0)
891 {
892 argno = write_arg_type (s, -1, argno, integer_type_node, true);
893 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
894 }
895
896 if (argno)
897 s << ")";
898
899 s << (is_defn ? "\n" : ";\n");
900
901 return name;
902 }
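
/* As a hand-written example, a public definition of "int foo (int)"
   is preceded by a declaration along the lines of
     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);
   assuming the return register prints as "%value"; the definition
   itself repeats the header but ends in a newline rather than ";".  */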
903
904 /* Construct a function declaration from a call insn. This can be
905 necessary for two reasons - either we have an indirect call which
906 requires a .callprototype declaration, or we have a libcall
907 generated by emit_library_call for which no decl exists. */
908
909 static void
910 write_fn_proto_from_insn (std::stringstream &s, const char *name,
911 rtx result, rtx pat)
912 {
913 if (!name)
914 {
915 s << "\t.callprototype ";
916 name = "_";
917 }
918 else
919 {
920 name = nvptx_name_replacement (name);
921 write_fn_marker (s, false, true, name);
922 s << "\t.extern .func ";
923 }
924
925 if (result != NULL_RTX)
926 write_return_mode (s, true, GET_MODE (result));
927
928 s << name;
929
930 int arg_end = XVECLEN (pat, 0);
931 for (int i = 1; i < arg_end; i++)
932 {
933 /* We don't have to deal with mode splitting & promotion here,
934 as that was already done when generating the call
935 sequence. */
936 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
937
938 write_arg_mode (s, -1, i - 1, mode);
939 }
940 if (arg_end != 1)
941 s << ")";
942 s << ";\n";
943 }
944
945 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
946 table and write a ptx prototype. These are emitted at the end of
947 compilation. */
948
949 static void
950 nvptx_record_fndecl (tree decl)
951 {
952 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
953 if (*slot == NULL)
954 {
955 *slot = decl;
956 const char *name = get_fnname_from_decl (decl);
957 write_fn_proto (func_decls, false, name, decl);
958 }
959 }
960
961 /* Record a libcall or unprototyped external function. CALLEE is the
962 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
963 declaration for it. */
964
965 static void
966 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
967 {
968 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
969 if (*slot == NULL)
970 {
971 *slot = callee;
972
973 const char *name = XSTR (callee, 0);
974 write_fn_proto_from_insn (func_decls, name, retval, pat);
975 }
976 }
977
978 /* DECL is an external FUNCTION_DECL that we're referencing. If it
979 is prototyped, record it now. Otherwise record it as needed at end
980 of compilation, when we might have more information about it. */
981
982 void
983 nvptx_record_needed_fndecl (tree decl)
984 {
985 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
986 {
987 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
988 if (*slot == NULL)
989 *slot = decl;
990 }
991 else
992 nvptx_record_fndecl (decl);
993 }
994
995 /* SYM is a SYMBOL_REF. If it refers to an external function, record
996 it as needed. */
997
998 static void
999 nvptx_maybe_record_fnsym (rtx sym)
1000 {
1001 tree decl = SYMBOL_REF_DECL (sym);
1002
1003 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1004 nvptx_record_needed_fndecl (decl);
1005 }
1006
1007 /* Emit a local array to hold some part of a conventional stack frame
1008 and initialize REGNO to point to it. If the size is zero, it'll
1009 never be valid to dereference, so we can simply initialize to
1010 zero. */
1011
1012 static void
1013 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1014 {
1015 if (size)
1016 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1017 align, reg_names[regno], size);
1018 fprintf (file, "\t.reg.u%d %s;\n",
1019 POINTER_SIZE, reg_names[regno]);
1020 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1021 : "\tmov.u%d %s, 0;\n"),
1022 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1023 }
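
/* For a 16-byte frame on a 64-bit target this emits, assuming the
   frame pointer's printable name is "%frame":
     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;
   whereas a zero-sized frame gets only the .reg declaration plus a
   "mov.u64 %frame, 0;" initialization.  */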
1024
1025 /* Emit soft stack frame setup sequence. */
1026
1027 static void
1028 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1029 {
1030 /* Maintain 64-bit stack alignment. */
1031 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1032 size = ROUND_UP (size, keep_align);
1033 int bits = POINTER_SIZE;
1034 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1035 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1036 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1037 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1038 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1039 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1040 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1041 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1042 fprintf (file, "\t{\n");
1043 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1044 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1045 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1046 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1047 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1048 bits == 64 ? ".wide" : ".lo", bits / 8);
1049 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1050
1051 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1052 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1053
1054 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1055 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1056 bits, reg_sspprev, reg_sspslot);
1057
1058 /* Initialize %frame = %sspprev - size. */
1059 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1060 bits, reg_frame, reg_sspprev, size);
1061
1062 /* Apply alignment, if larger than 64. */
1063 if (alignment > keep_align)
1064 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1065 bits, reg_frame, reg_frame, -alignment);
1066
1067 size = crtl->outgoing_args_size;
1068 gcc_assert (size % keep_align == 0);
1069
1070 /* Initialize %stack. */
1071 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1072 bits, reg_stack, reg_frame, size);
1073
1074 if (!crtl->is_leaf)
1075 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1076 bits, reg_sspslot, reg_stack);
1077 fprintf (file, "\t}\n");
1078 cfun->machine->has_softstack = true;
1079 need_softstack_decl = true;
1080 }
1081
1082 /* Emit code to initialize the REGNO predicate register to indicate
1083 whether we are not lane zero on the NAME axis. */
1084
1085 static void
1086 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1087 {
1088 fprintf (file, "\t{\n");
1089 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1090 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1091 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1092 fprintf (file, "\t}\n");
1093 }
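
/* For the "y" axis and predicate register 42 this prints
     {
       .reg.u32 %y;
       mov.u32 %y, %tid.y;
       setp.ne.u32 %r42, %y, 0;
     }
   so %r42 is set for every thread whose %tid.y is not zero.  */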
1094
1095 /* Emit code to initialize predicate and master lane index registers for
1096 -muniform-simt code generation variant. */
1097
1098 static void
1099 nvptx_init_unisimt_predicate (FILE *file)
1100 {
1101 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1102 int loc = REGNO (cfun->machine->unisimt_location);
1103 int bits = POINTER_SIZE;
1104 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1105 fprintf (file, "\t{\n");
1106 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1107 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1108 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1109 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1110 bits == 64 ? ".wide" : ".lo");
1111 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1112 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1113 if (cfun->machine->unisimt_predicate)
1114 {
1115 int master = REGNO (cfun->machine->unisimt_master);
1116 int pred = REGNO (cfun->machine->unisimt_predicate);
1117 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1118 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1119 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1120 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1121 /* Compute predicate as 'tid.x == master'. */
1122 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1123 }
1124 fprintf (file, "\t}\n");
1125 need_unisimt_decl = true;
1126 }
1127
1128 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1129
1130 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1131 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1132 {
1133 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1134 __nvptx_uni[tid.y] = 0;
1135 gomp_nvptx_main (ORIG, arg);
1136 }
1137 ORIG itself should not be emitted as a PTX .entry function. */
1138
1139 static void
1140 write_omp_entry (FILE *file, const char *name, const char *orig)
1141 {
1142 static bool gomp_nvptx_main_declared;
1143 if (!gomp_nvptx_main_declared)
1144 {
1145 gomp_nvptx_main_declared = true;
1146 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1147 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1148 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1149 }
1150 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1151 #define NTID_Y "%ntid.y"
1152 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1153 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1154 {\n\
1155 .reg.u32 %r<3>;\n\
1156 .reg.u" PS " %R<4>;\n\
1157 mov.u32 %r0, %tid.y;\n\
1158 mov.u32 %r1, " NTID_Y ";\n\
1159 mov.u32 %r2, %ctaid.x;\n\
1160 cvt.u" PS ".u32 %R1, %r0;\n\
1161 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1162 mov.u" PS " %R0, __nvptx_stacks;\n\
1163 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1164 ld.param.u" PS " %R2, [%stack];\n\
1165 ld.param.u" PS " %R3, [%sz];\n\
1166 add.u" PS " %R2, %R2, %R3;\n\
1167 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1168 st.shared.u" PS " [%R0], %R2;\n\
1169 mov.u" PS " %R0, __nvptx_uni;\n\
1170 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1171 mov.u32 %r0, 0;\n\
1172 st.shared.u32 [%R0], %r0;\n\
1173 mov.u" PS " %R0, \0;\n\
1174 ld.param.u" PS " %R1, [%arg];\n\
1175 {\n\
1176 .param.u" PS " %P<2>;\n\
1177 st.param.u" PS " [%P0], %R0;\n\
1178 st.param.u" PS " [%P1], %R1;\n\
1179 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1180 }\n\
1181 ret.uni;\n\
1182 }\n"
1183 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1184 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1185 #undef ENTRY_TEMPLATE
1186 #undef NTID_Y
1187 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1188 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1189 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1190 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1191 need_softstack_decl = need_unisimt_decl = true;
1192 }
1193
1194 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1195 function, including local var decls and copies from the arguments to
1196 local regs. */
1197
1198 void
1199 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1200 {
1201 tree fntype = TREE_TYPE (decl);
1202 tree result_type = TREE_TYPE (fntype);
1203 int argno = 0;
1204
1205 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1206 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1207 {
1208 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1209 sprintf (buf, "%s$impl", name);
1210 write_omp_entry (file, name, buf);
1211 name = buf;
1212 }
1213 /* We construct the initial part of the function into a string
1214 stream, in order to share the prototype writing code. */
1215 std::stringstream s;
1216 write_fn_proto (s, true, name, decl);
1217 s << "{\n";
1218
1219 bool return_in_mem = write_return_type (s, false, result_type);
1220 if (return_in_mem)
1221 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1222
1223 /* Declare and initialize incoming arguments. */
1224 tree args = TYPE_ARG_TYPES (fntype);
1225 bool prototyped = true;
1226 if (!args)
1227 {
1228 args = DECL_ARGUMENTS (decl);
1229 prototyped = false;
1230 }
1231
1232 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1233 {
1234 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1235
1236 argno = write_arg_type (s, 0, argno, type, prototyped);
1237 }
1238
1239 if (stdarg_p (fntype))
1240 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1241 true);
1242
1243 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1244 write_arg_type (s, STATIC_CHAIN_REGNUM,
1245 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1246 true);
1247
1248 fprintf (file, "%s", s.str().c_str());
1249
1250 /* Usually 'crtl->is_leaf' is computed during register allocator
1251 initialization (which is not done on NVPTX) or for pressure-sensitive
1252 optimizations. Initialize it here, except if already set. */
1253 if (!crtl->is_leaf)
1254 crtl->is_leaf = leaf_function_p ();
1255
1256 HOST_WIDE_INT sz = get_frame_size ();
1257 bool need_frameptr = sz || cfun->machine->has_chain;
1258 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1259 if (!TARGET_SOFT_STACK)
1260 {
1261 /* Declare a local var for outgoing varargs. */
1262 if (cfun->machine->has_varadic)
1263 init_frame (file, STACK_POINTER_REGNUM,
1264 UNITS_PER_WORD, crtl->outgoing_args_size);
1265
1266 /* Declare a local variable for the frame. Force its size to be
1267 DImode-compatible. */
1268 if (need_frameptr)
1269 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1270 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1271 }
1272 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1273 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1274 init_softstack_frame (file, alignment, sz);
1275
1276 if (cfun->machine->has_simtreg)
1277 {
1278 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1279 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1280 align = MAX (align, GET_MODE_SIZE (DImode));
1281 if (!crtl->is_leaf || cfun->calls_alloca)
1282 simtsz = HOST_WIDE_INT_M1U;
1283 if (simtsz == HOST_WIDE_INT_M1U)
1284 simtsz = nvptx_softstack_size;
1285 if (cfun->machine->has_softstack)
1286 simtsz += POINTER_SIZE / 8;
1287 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1288 if (align > GET_MODE_SIZE (DImode))
1289 simtsz += align - GET_MODE_SIZE (DImode);
1290 if (simtsz)
1291 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1292 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1293 }
1294 /* Declare the pseudos we have as ptx registers. */
1295 int maxregs = max_reg_num ();
1296 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1297 {
1298 if (regno_reg_rtx[i] != const0_rtx)
1299 {
1300 machine_mode mode = PSEUDO_REGNO_MODE (i);
1301 machine_mode split = maybe_split_mode (mode);
1302
1303 if (split_mode_p (mode))
1304 mode = split;
1305 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1306 output_reg (file, i, split, -2);
1307 fprintf (file, ";\n");
1308 }
1309 }
1310
1311 /* Emit axis predicates. */
1312 if (cfun->machine->axis_predicate[0])
1313 nvptx_init_axis_predicate (file,
1314 REGNO (cfun->machine->axis_predicate[0]), "y");
1315 if (cfun->machine->axis_predicate[1])
1316 nvptx_init_axis_predicate (file,
1317 REGNO (cfun->machine->axis_predicate[1]), "x");
1318 if (cfun->machine->unisimt_predicate
1319 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1320 nvptx_init_unisimt_predicate (file);
1321 }
1322
1323 /* Output code for switching uniform-simt state. ENTERING indicates whether
1324 we are entering or leaving a non-uniform execution region. */
1325
1326 static void
1327 nvptx_output_unisimt_switch (FILE *file, bool entering)
1328 {
1329 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1330 return;
1331 fprintf (file, "\t{\n");
1332 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1333 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1334 if (!crtl->is_leaf)
1335 {
1336 int loc = REGNO (cfun->machine->unisimt_location);
1337 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1338 }
1339 if (cfun->machine->unisimt_predicate)
1340 {
1341 int master = REGNO (cfun->machine->unisimt_master);
1342 int pred = REGNO (cfun->machine->unisimt_predicate);
1343 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1344 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1345 master, entering ? "%ustmp2" : "0");
1346 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1347 }
1348 fprintf (file, "\t}\n");
1349 }
1350
1351 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1352 ENTERING indicates whether we are entering or leaving non-uniform execution.
1353 PTR is the register pointing to allocated storage, it is assigned to on
1354 entering and used to restore state on leaving. SIZE and ALIGN are used only
1355 on entering. */
1356
1357 static void
1358 nvptx_output_softstack_switch (FILE *file, bool entering,
1359 rtx ptr, rtx size, rtx align)
1360 {
1361 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1362 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1363 return;
1364 int bits = POINTER_SIZE, regno = REGNO (ptr);
1365 fprintf (file, "\t{\n");
1366 if (entering)
1367 {
1368 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1369 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1370 cfun->machine->simt_stack_size);
1371 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1372 if (CONST_INT_P (size))
1373 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1374 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1375 else
1376 output_reg (file, REGNO (size), VOIDmode);
1377 fputs (";\n", file);
1378 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1379 fprintf (file,
1380 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1381 bits, regno, regno, UINTVAL (align));
1382 }
1383 if (cfun->machine->has_softstack)
1384 {
1385 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1386 if (entering)
1387 {
1388 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1389 bits, regno, bits / 8, reg_stack);
1390 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1391 bits, reg_stack, regno, bits / 8);
1392 }
1393 else
1394 {
1395 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1396 bits, reg_stack, regno, bits / 8);
1397 }
1398 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1399 }
1400 fprintf (file, "\t}\n");
1401 }
1402
1403 /* Output code to enter non-uniform execution region. DEST is a register
1404 to hold a per-lane allocation given by SIZE and ALIGN. */
1405
1406 const char *
1407 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1408 {
1409 nvptx_output_unisimt_switch (asm_out_file, true);
1410 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1411 return "";
1412 }
1413
1414 /* Output code to leave non-uniform execution region. SRC is the register
1415 holding per-lane storage previously allocated by omp_simt_enter insn. */
1416
1417 const char *
1418 nvptx_output_simt_exit (rtx src)
1419 {
1420 nvptx_output_unisimt_switch (asm_out_file, false);
1421 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1422 return "";
1423 }
1424
1425 /* Output instruction that sets soft stack pointer in shared memory to the
1426 value in register given by SRC_REGNO. */
1427
1428 const char *
1429 nvptx_output_set_softstack (unsigned src_regno)
1430 {
1431 if (cfun->machine->has_softstack && !crtl->is_leaf)
1432 {
1433 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1434 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1435 output_reg (asm_out_file, src_regno, VOIDmode);
1436 fprintf (asm_out_file, ";\n");
1437 }
1438 return "";
1439 }
1440 /* Output a return instruction. Also copy the return value to its outgoing
1441 location. */
1442
1443 const char *
1444 nvptx_output_return (void)
1445 {
1446 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1447
1448 if (mode != VOIDmode)
1449 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1450 nvptx_ptx_type_from_mode (mode, false),
1451 reg_names[NVPTX_RETURN_REGNUM],
1452 reg_names[NVPTX_RETURN_REGNUM]);
1453
1454 return "ret;";
1455 }
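
/* For a function returning an SImode value this prints, assuming the
   return register's printable name is "%value":
     st.param.u32 [%value_out], %value;
     ret;
   while a VOIDmode (void) return emits just "ret;".  */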
1456
1457 /* Terminate a function by writing a closing brace to FILE. */
1458
1459 void
1460 nvptx_function_end (FILE *file)
1461 {
1462 fprintf (file, "}\n");
1463 }
1464
1465 /* Decide whether we can make a sibling call to a function. For ptx, we
1466 can't. */
1467
1468 static bool
1469 nvptx_function_ok_for_sibcall (tree, tree)
1470 {
1471 return false;
1472 }
1473
1474 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1475
1476 static rtx
1477 nvptx_get_drap_rtx (void)
1478 {
1479 if (TARGET_SOFT_STACK && stack_realign_drap)
1480 return arg_pointer_rtx;
1481 return NULL_RTX;
1482 }
1483
1484 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1485 argument to the next call. */
1486
1487 static void
1488 nvptx_call_args (rtx arg, tree fntype)
1489 {
1490 if (!cfun->machine->doing_call)
1491 {
1492 cfun->machine->doing_call = true;
1493 cfun->machine->is_varadic = false;
1494 cfun->machine->num_args = 0;
1495
1496 if (fntype && stdarg_p (fntype))
1497 {
1498 cfun->machine->is_varadic = true;
1499 cfun->machine->has_varadic = true;
1500 cfun->machine->num_args++;
1501 }
1502 }
1503
1504 if (REG_P (arg) && arg != pc_rtx)
1505 {
1506 cfun->machine->num_args++;
1507 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1508 cfun->machine->call_args);
1509 }
1510 }
1511
1512 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1513 information we recorded. */
1514
1515 static void
1516 nvptx_end_call_args (void)
1517 {
1518 cfun->machine->doing_call = false;
1519 free_EXPR_LIST_list (&cfun->machine->call_args);
1520 }
1521
1522 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1523 track of whether calls involving static chains or varargs were seen
1524 in the current function.
1525 For libcalls, maintain a hash table of decls we have seen, and
1526 record a function decl for later when encountering a new one. */
1527
1528 void
1529 nvptx_expand_call (rtx retval, rtx address)
1530 {
1531 rtx callee = XEXP (address, 0);
1532 rtx varargs = NULL_RTX;
1533 unsigned parallel = 0;
1534
1535 if (!call_insn_operand (callee, Pmode))
1536 {
1537 callee = force_reg (Pmode, callee);
1538 address = change_address (address, QImode, callee);
1539 }
1540
1541 if (GET_CODE (callee) == SYMBOL_REF)
1542 {
1543 tree decl = SYMBOL_REF_DECL (callee);
1544 if (decl != NULL_TREE)
1545 {
1546 if (DECL_STATIC_CHAIN (decl))
1547 cfun->machine->has_chain = true;
1548
1549 tree attr = oacc_get_fn_attrib (decl);
1550 if (attr)
1551 {
1552 tree dims = TREE_VALUE (attr);
1553
1554 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1555 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1556 {
1557 if (TREE_PURPOSE (dims)
1558 && !integer_zerop (TREE_PURPOSE (dims)))
1559 break;
1560 /* Not on this axis. */
1561 parallel ^= GOMP_DIM_MASK (ix);
1562 dims = TREE_CHAIN (dims);
1563 }
1564 }
1565 }
1566 }
1567
1568 unsigned nargs = cfun->machine->num_args;
1569 if (cfun->machine->is_varadic)
1570 {
1571 varargs = gen_reg_rtx (Pmode);
1572 emit_move_insn (varargs, stack_pointer_rtx);
1573 }
1574
1575 rtvec vec = rtvec_alloc (nargs + 1);
1576 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1577 int vec_pos = 0;
1578
1579 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1580 rtx tmp_retval = retval;
1581 if (retval)
1582 {
1583 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1584 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1585 call = gen_rtx_SET (tmp_retval, call);
1586 }
1587 XVECEXP (pat, 0, vec_pos++) = call;
1588
1589 /* Construct the call insn, including a USE for each argument pseudo
1590 register. These will be used when printing the insn. */
1591 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1592 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1593
1594 if (varargs)
1595 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1596
1597 gcc_assert (vec_pos == XVECLEN (pat, 0));
1598
1599 nvptx_emit_forking (parallel, true);
1600 emit_call_insn (pat);
1601 nvptx_emit_joining (parallel, true);
1602
1603 if (tmp_retval != retval)
1604 emit_move_insn (retval, tmp_retval);
1605 }
1606
1607 /* Emit a comparison COMPARE, and return the new test to be used in the
1608 jump. */
1609
1610 rtx
1611 nvptx_expand_compare (rtx compare)
1612 {
1613 rtx pred = gen_reg_rtx (BImode);
1614 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1615 XEXP (compare, 0), XEXP (compare, 1));
1616 emit_insn (gen_rtx_SET (pred, cmp));
1617 return gen_rtx_NE (BImode, pred, const0_rtx);
1618 }
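
/* For example, a comparison (lt a b) is rewritten as
     pred = (lt:BI a b)
   with the jump then testing (ne:BI pred 0).  */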
1619
1620 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1621
1622 void
1623 nvptx_expand_oacc_fork (unsigned mode)
1624 {
1625 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1626 }
1627
1628 void
1629 nvptx_expand_oacc_join (unsigned mode)
1630 {
1631 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1632 }
1633
1634 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1635 objects. */
1636
1637 static rtx
1638 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1639 {
1640 rtx res;
1641
1642 switch (GET_MODE (src))
1643 {
1644 case E_DImode:
1645 res = gen_unpackdisi2 (dst0, dst1, src);
1646 break;
1647 case E_DFmode:
1648 res = gen_unpackdfsi2 (dst0, dst1, src);
1649 break;
1650 default: gcc_unreachable ();
1651 }
1652 return res;
1653 }
1654
1655 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1656 object. */
1657
1658 static rtx
1659 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1660 {
1661 rtx res;
1662
1663 switch (GET_MODE (dst))
1664 {
1665 case E_DImode:
1666 res = gen_packsidi2 (dst, src0, src1);
1667 break;
1668 case E_DFmode:
1669 res = gen_packsidf2 (dst, src0, src1);
1670 break;
1671 default: gcc_unreachable ();
1672 }
1673 return res;
1674 }
1675
1676 /* Generate an instruction or sequence to shuffle SRC into DST across
1677 the lanes of a single warp, selecting lanes according to IDX and KIND. */
1678
1679 rtx
1680 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1681 {
1682 rtx res;
1683
1684 switch (GET_MODE (dst))
1685 {
1686 case E_SImode:
1687 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1688 break;
1689 case E_SFmode:
1690 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1691 break;
1692 case E_DImode:
1693 case E_DFmode:
1694 {
1695 rtx tmp0 = gen_reg_rtx (SImode);
1696 rtx tmp1 = gen_reg_rtx (SImode);
1697
1698 start_sequence ();
1699 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1700 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1701 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1702 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1703 res = get_insns ();
1704 end_sequence ();
1705 }
1706 break;
1707 case E_BImode:
1708 {
1709 rtx tmp = gen_reg_rtx (SImode);
1710
1711 start_sequence ();
1712 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1713 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1714 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1715 res = get_insns ();
1716 end_sequence ();
1717 }
1718 break;
1719 case E_QImode:
1720 case E_HImode:
1721 {
1722 rtx tmp = gen_reg_rtx (SImode);
1723
1724 start_sequence ();
1725 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1726 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1727 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1728 tmp)));
1729 res = get_insns ();
1730 end_sequence ();
1731 }
1732 break;
1733
1734 default:
1735 gcc_unreachable ();
1736 }
1737 return res;
1738 }
1739
1740 /* Generate an instruction or sequence to broadcast register REG
1741 across the vectors of a single warp. */
1742
1743 static rtx
nvptx_gen_vcast(rtx reg)1744 nvptx_gen_vcast (rtx reg)
1745 {
1746 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1747 }
1748
1749 /* Structure used when generating a worker-level spill or fill. */
1750
1751 struct wcast_data_t
1752 {
1753 rtx base; /* Register holding base addr of buffer. */
1754 rtx ptr; /* Iteration var, if needed. */
1755 unsigned offset; /* Offset into worker buffer. */
1756 };
1757
1758 /* Direction of the spill/fill and looping setup/teardown indicator. */
1759
1760 enum propagate_mask
1761 {
1762 PM_read = 1 << 0,
1763 PM_write = 1 << 1,
1764 PM_loop_begin = 1 << 2,
1765 PM_loop_end = 1 << 3,
1766
1767 PM_read_write = PM_read | PM_write
1768 };
1769
1770 /* Generate instruction(s) to spill or fill register REG to/from the
1771 worker broadcast array. PM indicates what is to be done, REP
1772 how many loop iterations will be executed (0 for not a loop). */
1773
1774 static rtx
nvptx_gen_wcast(rtx reg,propagate_mask pm,unsigned rep,wcast_data_t * data)1775 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1776 {
1777 rtx res;
1778 machine_mode mode = GET_MODE (reg);
1779
1780 switch (mode)
1781 {
1782 case E_BImode:
1783 {
1784 rtx tmp = gen_reg_rtx (SImode);
1785
1786 start_sequence ();
1787 if (pm & PM_read)
1788 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1789 emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1790 if (pm & PM_write)
1791 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1792 res = get_insns ();
1793 end_sequence ();
1794 }
1795 break;
1796
1797 default:
1798 {
1799 rtx addr = data->ptr;
1800
1801 if (!addr)
1802 {
1803 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1804
1805 if (align > worker_bcast_align)
1806 worker_bcast_align = align;
1807 data->offset = (data->offset + align - 1) & ~(align - 1);
1808 addr = data->base;
1809 if (data->offset)
1810 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1811 }
1812
1813 addr = gen_rtx_MEM (mode, addr);
1814 if (pm == PM_read)
1815 res = gen_rtx_SET (addr, reg);
1816 else if (pm == PM_write)
1817 res = gen_rtx_SET (reg, addr);
1818 else
1819 gcc_unreachable ();
1820
1821 if (data->ptr)
1822 {
1823 /* We're using a ptr, increment it. */
1824 start_sequence ();
1825
1826 emit_insn (res);
1827 emit_insn (gen_adddi3 (data->ptr, data->ptr,
1828 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1829 res = get_insns ();
1830 end_sequence ();
1831 }
1832 else
1833 rep = 1;
1834 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1835 }
1836 break;
1837 }
1838 return res;
1839 }
1840
1841 /* Returns true if X is a valid address for use in a memory reference. */
1842
1843 static bool
nvptx_legitimate_address_p(machine_mode,rtx x,bool)1844 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1845 {
1846 enum rtx_code code = GET_CODE (x);
1847
1848 switch (code)
1849 {
1850 case REG:
1851 return true;
1852
1853 case PLUS:
1854 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1855 return true;
1856 return false;
1857
1858 case CONST:
1859 case SYMBOL_REF:
1860 case LABEL_REF:
1861 return true;
1862
1863 default:
1864 return false;
1865 }
1866 }
1867
1868 /* Machinery to output constant initializers. When beginning an
1869 initializer, we decide on a fragment size (which is visible in ptx
1870 in the type used), and then all initializer data is buffered until
1871 a fragment is filled and ready to be written out. */
1872
1873 static struct
1874 {
1875 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
1876 unsigned HOST_WIDE_INT val; /* Current fragment value. */
1877 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1878 out. */
1879 unsigned size; /* Fragment size to accumulate. */
1880 unsigned offset; /* Offset within current fragment. */
1881 bool started; /* Whether we've output any initializer. */
1882 } init_frag;
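/* A worked example (illustrative only): a 12-byte object with a chosen
   fragment size of 4 is emitted as three .u32 fragments.
   nvptx_assemble_value below buffers bytes into init_frag.val until
   init_frag.offset reaches init_frag.size, at which point
   output_init_frag prints the accumulated value, so the initializer
   appears incrementally as " = { v0, v1, v2 }" while
   init_frag.remaining counts down from 3 to 0.  */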
1883
1884 /* The current fragment is full, write it out. SYM may provide a
1885 symbolic reference we should output, in which case the fragment
1886 value is the addend. */
1887
1888 static void
output_init_frag(rtx sym)1889 output_init_frag (rtx sym)
1890 {
1891 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1892 unsigned HOST_WIDE_INT val = init_frag.val;
1893
1894 init_frag.started = true;
1895 init_frag.val = 0;
1896 init_frag.offset = 0;
1897 init_frag.remaining--;
1898
1899 if (sym)
1900 {
1901 bool function = (SYMBOL_REF_DECL (sym)
1902 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1903 if (!function)
1904 fprintf (asm_out_file, "generic(");
1905 output_address (VOIDmode, sym);
1906 if (!function)
1907 fprintf (asm_out_file, ")");
1908 if (val)
1909 fprintf (asm_out_file, " + ");
1910 }
1911
1912 if (!sym || val)
1913 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1914 }
1915
1916 /* Add value VAL of size SIZE to the data we're emitting, and keep
1917 writing out chunks as they fill up. */
1918
1919 static void
nvptx_assemble_value(unsigned HOST_WIDE_INT val,unsigned size)1920 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1921 {
1922 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1923
1924 for (unsigned part = 0; size; size -= part)
1925 {
1926 val >>= part * BITS_PER_UNIT;
1927 part = init_frag.size - init_frag.offset;
1928 if (part > size)
1929 part = size;
1930
1931 unsigned HOST_WIDE_INT partial
1932 = val << (init_frag.offset * BITS_PER_UNIT);
1933 init_frag.val |= partial & init_frag.mask;
1934 init_frag.offset += part;
1935
1936 if (init_frag.offset == init_frag.size)
1937 output_init_frag (NULL);
1938 }
1939 }
1940
1941 /* Target hook for assembling integer object X of size SIZE. */
1942
1943 static bool
nvptx_assemble_integer(rtx x,unsigned int size,int ARG_UNUSED (aligned_p))1944 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1945 {
1946 HOST_WIDE_INT val = 0;
1947
1948 switch (GET_CODE (x))
1949 {
1950 default:
1951 /* Let the generic machinery figure it out, usually for a
1952 CONST_WIDE_INT. */
1953 return false;
1954
1955 case CONST_INT:
1956 nvptx_assemble_value (INTVAL (x), size);
1957 break;
1958
1959 case CONST:
1960 x = XEXP (x, 0);
1961 gcc_assert (GET_CODE (x) == PLUS);
1962 val = INTVAL (XEXP (x, 1));
1963 x = XEXP (x, 0);
1964 gcc_assert (GET_CODE (x) == SYMBOL_REF);
1965 /* FALLTHROUGH */
1966
1967 case SYMBOL_REF:
1968 gcc_assert (size == init_frag.size);
1969 if (init_frag.offset)
1970 sorry ("cannot emit unaligned pointers in ptx assembly");
1971
1972 nvptx_maybe_record_fnsym (x);
1973 init_frag.val = val;
1974 output_init_frag (x);
1975 break;
1976 }
1977
1978 return true;
1979 }
1980
1981 /* Output SIZE zero bytes. We ignore the FILE argument since the
1982 functions we're calling to perform the output just use
1983 asm_out_file. */
1984
1985 void
nvptx_output_skip(FILE *,unsigned HOST_WIDE_INT size)1986 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1987 {
1988 /* Finish the current fragment, if it's started. */
1989 if (init_frag.offset)
1990 {
1991 unsigned part = init_frag.size - init_frag.offset;
1992 if (part > size)
1993 part = (unsigned) size;
1994 size -= part;
1995 nvptx_assemble_value (0, part);
1996 }
1997
1998 /* If this skip doesn't terminate the initializer, write as many
1999 remaining pieces as possible directly. */
2000 if (size < init_frag.remaining * init_frag.size)
2001 {
2002 while (size >= init_frag.size)
2003 {
2004 size -= init_frag.size;
2005 output_init_frag (NULL_RTX);
2006 }
2007 if (size)
2008 nvptx_assemble_value (0, size);
2009 }
2010 }
2011
2012 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2013 ignore the FILE arg. */
2014
2015 void
nvptx_output_ascii(FILE *,const char * str,unsigned HOST_WIDE_INT size)2016 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2017 {
2018 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2019 nvptx_assemble_value (str[i], 1);
2020 }
2021
2022 /* Return true if TYPE is a record type whose last field is an array
2023 without a specified dimension (i.e. a flexible array member). */
2024
2025 static bool
flexible_array_member_type_p(const_tree type)2026 flexible_array_member_type_p (const_tree type)
2027 {
2028 if (TREE_CODE (type) != RECORD_TYPE)
2029 return false;
2030
2031 const_tree last_field = NULL_TREE;
2032 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2033 last_field = f;
2034
2035 if (!last_field)
2036 return false;
2037
2038 const_tree last_field_type = TREE_TYPE (last_field);
2039 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2040 return false;
2041
2042 return (! TYPE_DOMAIN (last_field_type)
2043 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2044 }
2045
2046 /* Emit a PTX variable decl and prepare for emission of its
2047 initializer. NAME is the symbol name and SECTION the PTX data
2048 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2049 The caller has already emitted any indentation and linkage
2050 specifier. It is responsible for any initializer, terminating ;
2051 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2052 this is the opposite way round from the way PTX wants them! */
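/* For example (a sketch of the emitted form, with "foo" a placeholder
   name): a 16-byte, 8-byte-aligned global of DImode elements comes out
   roughly as
     .global .align 8 .u64 foo[2]
   after which the caller appends any initializer, the terminating ';'
   and the newline.  */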
2053
2054 static void
2055 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2056 const_tree type, HOST_WIDE_INT size, unsigned align,
2057 bool undefined = false)
2058 {
2059 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2060 && (TYPE_DOMAIN (type) == NULL_TREE);
2061
2062 if (undefined && flexible_array_member_type_p (type))
2063 {
2064 size = 0;
2065 atype = true;
2066 }
2067
2068 while (TREE_CODE (type) == ARRAY_TYPE)
2069 type = TREE_TYPE (type);
2070
2071 if (TREE_CODE (type) == VECTOR_TYPE
2072 || TREE_CODE (type) == COMPLEX_TYPE)
2073 /* Neither vector nor complex types can contain the other. */
2074 type = TREE_TYPE (type);
2075
2076 unsigned elt_size = int_size_in_bytes (type);
2077
2078 /* Largest mode we're prepared to accept. For BLKmode types we
2079 don't know if it'll contain pointer constants, so have to choose
2080 pointer size, otherwise we can choose DImode. */
2081 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2082
2083 elt_size |= GET_MODE_SIZE (elt_mode);
2084 elt_size &= -elt_size; /* Extract LSB set. */
2085
2086 init_frag.size = elt_size;
2087 /* Avoid undefined shift behavior by using '2'. */
2088 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2089 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2090 init_frag.val = 0;
2091 init_frag.offset = 0;
2092 init_frag.started = false;
2093 /* Size might not be a multiple of elt size, if there's an
2094 initialized trailing struct array with smaller type than
2095 elt_size. */
2096 init_frag.remaining = (size + elt_size - 1) / elt_size;
2097
2098 fprintf (file, "%s .align %d .u%d ",
2099 section, align / BITS_PER_UNIT,
2100 elt_size * BITS_PER_UNIT);
2101 assemble_name (file, name);
2102
2103 if (size)
2104 /* We make everything an array, to simplify any initialization
2105 emission. */
2106 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2107 else if (atype)
2108 fprintf (file, "[]");
2109 }
2110
2111 /* Called when the initializer for a decl has been completely output through
2112 combinations of the three functions above. */
2113
2114 static void
nvptx_assemble_decl_end(void)2115 nvptx_assemble_decl_end (void)
2116 {
2117 if (init_frag.offset)
2118 /* This can happen with a packed struct with trailing array member. */
2119 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2120 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2121 }
2122
2123 /* Output an uninitialized common or file-scope variable. */
2124
2125 void
nvptx_output_aligned_decl(FILE * file,const char * name,const_tree decl,HOST_WIDE_INT size,unsigned align)2126 nvptx_output_aligned_decl (FILE *file, const char *name,
2127 const_tree decl, HOST_WIDE_INT size, unsigned align)
2128 {
2129 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2130
2131 /* If this is public, it is common. The nearest thing we have to
2132 common is weak. */
2133 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2134
2135 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2136 TREE_TYPE (decl), size, align);
2137 nvptx_assemble_decl_end ();
2138 }
2139
2140 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2141 writing a constant variable EXP with NAME and SIZE and its
2142 initializer to FILE. */
2143
2144 static void
nvptx_asm_declare_constant_name(FILE * file,const char * name,const_tree exp,HOST_WIDE_INT obj_size)2145 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2146 const_tree exp, HOST_WIDE_INT obj_size)
2147 {
2148 write_var_marker (file, true, false, name);
2149
2150 fprintf (file, "\t");
2151
2152 tree type = TREE_TYPE (exp);
2153 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2154 TYPE_ALIGN (type));
2155 }
2156
2157 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2158 a variable DECL with NAME to FILE. */
2159
2160 void
nvptx_declare_object_name(FILE * file,const char * name,const_tree decl)2161 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2162 {
2163 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2164
2165 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2166 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2167
2168 tree type = TREE_TYPE (decl);
2169 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2170 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2171 type, obj_size, DECL_ALIGN (decl));
2172 }
2173
2174 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2175
2176 static void
nvptx_globalize_label(FILE *,const char *)2177 nvptx_globalize_label (FILE *, const char *)
2178 {
2179 }
2180
2181 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2182 declaration only for variable DECL with NAME to FILE. */
2183
2184 static void
nvptx_assemble_undefined_decl(FILE * file,const char * name,const_tree decl)2185 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2186 {
2187 /* The middle end can place constant pool decls into the varpool as
2188 undefined. Until that is fixed, catch the problem here. */
2189 if (DECL_IN_CONSTANT_POOL (decl))
2190 return;
2191
2192 /* We support weak definitions, and hence have the right
2193 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2194 if (DECL_WEAK (decl))
2195 error_at (DECL_SOURCE_LOCATION (decl),
2196 "PTX does not support weak declarations"
2197 " (only weak definitions)");
2198 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2199
2200 fprintf (file, "\t.extern ");
2201 tree size = DECL_SIZE_UNIT (decl);
2202 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2203 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2204 DECL_ALIGN (decl), true);
2205 nvptx_assemble_decl_end ();
2206 }
2207
2208 /* Output a pattern for a move instruction. */
2209
2210 const char *
nvptx_output_mov_insn(rtx dst,rtx src)2211 nvptx_output_mov_insn (rtx dst, rtx src)
2212 {
2213 machine_mode dst_mode = GET_MODE (dst);
2214 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2215 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2216 machine_mode src_inner = (GET_CODE (src) == SUBREG
2217 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2218
2219 rtx sym = src;
2220 if (GET_CODE (sym) == CONST)
2221 sym = XEXP (XEXP (sym, 0), 0);
2222 if (SYMBOL_REF_P (sym))
2223 {
2224 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2225 return "%.\tcvta%D1%t0\t%0, %1;";
2226 nvptx_maybe_record_fnsym (sym);
2227 }
2228
2229 if (src_inner == dst_inner)
2230 return "%.\tmov%t0\t%0, %1;";
2231
2232 if (CONSTANT_P (src))
2233 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2234 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2235 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2236
2237 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2238 {
2239 if (GET_MODE_BITSIZE (dst_mode) == 128
2240 && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2241 {
2242 /* mov.b128 is not supported. */
2243 if (dst_inner == V2DImode && src_inner == TImode)
2244 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2245 else if (dst_inner == TImode && src_inner == V2DImode)
2246 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2247
2248 gcc_unreachable ();
2249 }
2250 return "%.\tmov.b%T0\t%0, %1;";
2251 }
2252
2253 return "%.\tcvt%t0%t1\t%0, %1;";
2254 }
2255
2256 static void nvptx_print_operand (FILE *, rtx, int);
2257
2258 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2259 involves writing .param declarations and in/out copies into them. For
2260 indirect calls, also write the .callprototype. */
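/* A rough sketch of the shape of the output (register numbers and the
   callee name are placeholders, not verbatim compiler output): a call
   with one SImode argument and an SImode result looks something like
     {
       .param.u32 %value_in;
       .param.u32 %out_arg1;
       st.param.u32 [%out_arg1], %r23;
       call (%value_in), foo, (%out_arg1);
       ld.param.u32 %r24, [%value_in];
     }
   For an indirect call a ".callprototype" label is emitted first and
   passed as the final operand of the call.  */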
2261
2262 const char *
nvptx_output_call_insn(rtx_insn * insn,rtx result,rtx callee)2263 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2264 {
2265 char buf[16];
2266 static int labelno;
2267 bool needs_tgt = register_operand (callee, Pmode);
2268 rtx pat = PATTERN (insn);
2269 if (GET_CODE (pat) == COND_EXEC)
2270 pat = COND_EXEC_CODE (pat);
2271 int arg_end = XVECLEN (pat, 0);
2272 tree decl = NULL_TREE;
2273
2274 fprintf (asm_out_file, "\t{\n");
2275 if (result != NULL)
2276 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2277 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2278 reg_names[NVPTX_RETURN_REGNUM]);
2279
2280 /* Ensure we have a ptx declaration in the output if necessary. */
2281 if (GET_CODE (callee) == SYMBOL_REF)
2282 {
2283 decl = SYMBOL_REF_DECL (callee);
2284 if (!decl
2285 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2286 nvptx_record_libfunc (callee, result, pat);
2287 else if (DECL_EXTERNAL (decl))
2288 nvptx_record_fndecl (decl);
2289 }
2290
2291 if (needs_tgt)
2292 {
2293 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2294 labelno++;
2295 ASM_OUTPUT_LABEL (asm_out_file, buf);
2296 std::stringstream s;
2297 write_fn_proto_from_insn (s, NULL, result, pat);
2298 fputs (s.str().c_str(), asm_out_file);
2299 }
2300
2301 for (int argno = 1; argno < arg_end; argno++)
2302 {
2303 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2304 machine_mode mode = GET_MODE (t);
2305 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2306
2307 /* Mode splitting has already been done. */
2308 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2309 "\t\tst.param%s [%%out_arg%d], ",
2310 ptx_type, argno, ptx_type, argno);
2311 output_reg (asm_out_file, REGNO (t), VOIDmode);
2312 fprintf (asm_out_file, ";\n");
2313 }
2314
2315 /* The '.' stands for the call's predicate, if any. */
2316 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2317 fprintf (asm_out_file, "\t\tcall ");
2318 if (result != NULL_RTX)
2319 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2320
2321 if (decl)
2322 {
2323 const char *name = get_fnname_from_decl (decl);
2324 name = nvptx_name_replacement (name);
2325 assemble_name (asm_out_file, name);
2326 }
2327 else
2328 output_address (VOIDmode, callee);
2329
2330 const char *open = "(";
2331 for (int argno = 1; argno < arg_end; argno++)
2332 {
2333 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2334 open = "";
2335 }
2336 if (decl && DECL_STATIC_CHAIN (decl))
2337 {
2338 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2339 open = "";
2340 }
2341 if (!open[0])
2342 fprintf (asm_out_file, ")");
2343
2344 if (needs_tgt)
2345 {
2346 fprintf (asm_out_file, ", ");
2347 assemble_name (asm_out_file, buf);
2348 }
2349 fprintf (asm_out_file, ";\n");
2350
2351 if (find_reg_note (insn, REG_NORETURN, NULL))
2352 {
2353 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2354 the flow control barrier they imply. It can seg fault if it
2355 encounters what looks like an unexitable loop. Emit a trailing
2356 trap and exit, which it does grok. */
2357 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2358 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2359 }
2360
2361 if (result)
2362 {
2363 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2364
2365 if (!rval[0])
2366 /* We must escape the '%' that starts RETURN_REGNUM. */
2367 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2368 reg_names[NVPTX_RETURN_REGNUM]);
2369 return rval;
2370 }
2371
2372 return "}";
2373 }
2374
2375 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2376
2377 static bool
nvptx_print_operand_punct_valid_p(unsigned char c)2378 nvptx_print_operand_punct_valid_p (unsigned char c)
2379 {
2380 return c == '.' || c == '#';
2381 }
2382
2383 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2384
2385 static void
nvptx_print_address_operand(FILE * file,rtx x,machine_mode)2386 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2387 {
2388 rtx off;
2389 if (GET_CODE (x) == CONST)
2390 x = XEXP (x, 0);
2391 switch (GET_CODE (x))
2392 {
2393 case PLUS:
2394 off = XEXP (x, 1);
2395 output_address (VOIDmode, XEXP (x, 0));
2396 fprintf (file, "+");
2397 output_address (VOIDmode, off);
2398 break;
2399
2400 case SYMBOL_REF:
2401 case LABEL_REF:
2402 output_addr_const (file, x);
2403 break;
2404
2405 default:
2406 gcc_assert (GET_CODE (x) != MEM);
2407 nvptx_print_operand (file, x, 0);
2408 break;
2409 }
2410 }
2411
2412 /* Write assembly language output for the address ADDR to FILE. */
2413
2414 static void
nvptx_print_operand_address(FILE * file,machine_mode mode,rtx addr)2415 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2416 {
2417 nvptx_print_address_operand (file, addr, mode);
2418 }
2419
2420 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2421
2422 Meaning of CODE:
2423 . -- print the predicate for the instruction or an empty string for an
2424 unconditional one.
2425 # -- print a rounding mode for the instruction
2426
2427 A -- print the data area of a MEM operand (looks at its address)
2428 c -- print an opcode suffix for a comparison operator, including a type code
2429 D -- print the data area of an address operand
2430 S -- print a shuffle kind specified by CONST_INT
2431 t -- print a type opcode suffix, promoting QImode to 32 bits
2432 T -- print a type size in bits
2433 u -- print a type opcode suffix without promotions. */
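/* For instance (illustrative): in a template such as
   "%.\tmov%t0\t%0, %1;", '%.' prints a "@<pred>" (or "@!<pred>") guard
   when the insn is predicated and nothing otherwise, '%t0' prints a
   type suffix such as ".u32" derived from operand 0's mode, and plain
   '%0'/'%1' print the operands themselves.  */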
2434
2435 static void
nvptx_print_operand(FILE * file,rtx x,int code)2436 nvptx_print_operand (FILE *file, rtx x, int code)
2437 {
2438 if (code == '.')
2439 {
2440 x = current_insn_predicate;
2441 if (x)
2442 {
2443 fputs ("@", file);
2444 if (GET_CODE (x) == EQ)
2445 fputs ("!", file);
2446 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2447 }
2448 return;
2449 }
2450 else if (code == '#')
2451 {
2452 fputs (".rn", file);
2453 return;
2454 }
2455
2456 enum rtx_code x_code = GET_CODE (x);
2457 machine_mode mode = GET_MODE (x);
2458
2459 switch (code)
2460 {
2461 case 'A':
2462 x = XEXP (x, 0);
2463 /* FALLTHROUGH. */
2464
2465 case 'D':
2466 if (GET_CODE (x) == CONST)
2467 x = XEXP (x, 0);
2468 if (GET_CODE (x) == PLUS)
2469 x = XEXP (x, 0);
2470
2471 if (GET_CODE (x) == SYMBOL_REF)
2472 fputs (section_for_sym (x), file);
2473 break;
2474
2475 case 't':
2476 case 'u':
2477 if (x_code == SUBREG)
2478 {
2479 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2480 if (VECTOR_MODE_P (inner_mode)
2481 && (GET_MODE_SIZE (mode)
2482 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2483 mode = GET_MODE_INNER (inner_mode);
2484 else if (split_mode_p (inner_mode))
2485 mode = maybe_split_mode (inner_mode);
2486 else
2487 mode = inner_mode;
2488 }
2489 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2490 break;
2491
2492 case 'H':
2493 case 'L':
2494 {
2495 rtx inner_x = SUBREG_REG (x);
2496 machine_mode inner_mode = GET_MODE (inner_x);
2497 machine_mode split = maybe_split_mode (inner_mode);
2498
2499 output_reg (file, REGNO (inner_x), split,
2500 (code == 'H'
2501 ? GET_MODE_SIZE (inner_mode) / 2
2502 : 0));
2503 }
2504 break;
2505
2506 case 'S':
2507 {
2508 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2509 /* Same order as nvptx_shuffle_kind. */
2510 static const char *const kinds[] =
2511 {".up", ".down", ".bfly", ".idx"};
2512 fputs (kinds[kind], file);
2513 }
2514 break;
2515
2516 case 'T':
2517 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2518 break;
2519
2520 case 'j':
2521 fprintf (file, "@");
2522 goto common;
2523
2524 case 'J':
2525 fprintf (file, "@!");
2526 goto common;
2527
2528 case 'c':
2529 mode = GET_MODE (XEXP (x, 0));
2530 switch (x_code)
2531 {
2532 case EQ:
2533 fputs (".eq", file);
2534 break;
2535 case NE:
2536 if (FLOAT_MODE_P (mode))
2537 fputs (".neu", file);
2538 else
2539 fputs (".ne", file);
2540 break;
2541 case LE:
2542 case LEU:
2543 fputs (".le", file);
2544 break;
2545 case GE:
2546 case GEU:
2547 fputs (".ge", file);
2548 break;
2549 case LT:
2550 case LTU:
2551 fputs (".lt", file);
2552 break;
2553 case GT:
2554 case GTU:
2555 fputs (".gt", file);
2556 break;
2557 case LTGT:
2558 fputs (".ne", file);
2559 break;
2560 case UNEQ:
2561 fputs (".equ", file);
2562 break;
2563 case UNLE:
2564 fputs (".leu", file);
2565 break;
2566 case UNGE:
2567 fputs (".geu", file);
2568 break;
2569 case UNLT:
2570 fputs (".ltu", file);
2571 break;
2572 case UNGT:
2573 fputs (".gtu", file);
2574 break;
2575 case UNORDERED:
2576 fputs (".nan", file);
2577 break;
2578 case ORDERED:
2579 fputs (".num", file);
2580 break;
2581 default:
2582 gcc_unreachable ();
2583 }
2584 if (FLOAT_MODE_P (mode)
2585 || x_code == EQ || x_code == NE
2586 || x_code == GEU || x_code == GTU
2587 || x_code == LEU || x_code == LTU)
2588 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2589 else
2590 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2591 break;
2592 default:
2593 common:
2594 switch (x_code)
2595 {
2596 case SUBREG:
2597 {
2598 rtx inner_x = SUBREG_REG (x);
2599 machine_mode inner_mode = GET_MODE (inner_x);
2600 machine_mode split = maybe_split_mode (inner_mode);
2601
2602 if (VECTOR_MODE_P (inner_mode)
2603 && (GET_MODE_SIZE (mode)
2604 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2605 {
2606 output_reg (file, REGNO (inner_x), VOIDmode);
2607 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2608 }
2609 else if (split_mode_p (inner_mode)
2610 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2611 output_reg (file, REGNO (inner_x), split);
2612 else
2613 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2614 }
2615 break;
2616
2617 case REG:
2618 output_reg (file, REGNO (x), maybe_split_mode (mode));
2619 break;
2620
2621 case MEM:
2622 fputc ('[', file);
2623 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2624 fputc (']', file);
2625 break;
2626
2627 case CONST_INT:
2628 output_addr_const (file, x);
2629 break;
2630
2631 case CONST:
2632 case SYMBOL_REF:
2633 case LABEL_REF:
2634 /* We could use output_addr_const, but that can print things like
2635 "x-8", which breaks ptxas. Need to ensure it is output as
2636 "x+-8". */
2637 nvptx_print_address_operand (file, x, VOIDmode);
2638 break;
2639
2640 case CONST_DOUBLE:
2641 long vals[2];
2642 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2643 vals[0] &= 0xffffffff;
2644 vals[1] &= 0xffffffff;
2645 if (mode == SFmode)
2646 fprintf (file, "0f%08lx", vals[0]);
2647 else
2648 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2649 break;
2650
2651 case CONST_VECTOR:
2652 {
2653 unsigned n = CONST_VECTOR_NUNITS (x);
2654 fprintf (file, "{ ");
2655 for (unsigned i = 0; i < n; ++i)
2656 {
2657 if (i != 0)
2658 fprintf (file, ", ");
2659
2660 rtx elem = CONST_VECTOR_ELT (x, i);
2661 output_addr_const (file, elem);
2662 }
2663 fprintf (file, " }");
2664 }
2665 break;
2666
2667 default:
2668 output_addr_const (file, x);
2669 }
2670 }
2671 }
2672
2673 /* Record replacement regs used to deal with subreg operands. */
2674 struct reg_replace
2675 {
2676 rtx replacement[MAX_RECOG_OPERANDS];
2677 machine_mode mode;
2678 int n_allocated;
2679 int n_in_use;
2680 };
2681
2682 /* Allocate or reuse a replacement in R and return the rtx. */
2683
2684 static rtx
get_replacement(struct reg_replace * r)2685 get_replacement (struct reg_replace *r)
2686 {
2687 if (r->n_allocated == r->n_in_use)
2688 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2689 return r->replacement[r->n_in_use++];
2690 }
2691
2692 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2693 the presence of subregs would break the rules for most instructions.
2694 Replace them with a suitable new register of the right size, plus
2695 conversion copyin/copyout instructions. */
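/* A sketch of the rewrite (register numbers made up): a paradoxical
   input operand such as (subreg:SI (reg:QI 23) 0) is replaced by a
   fresh SImode register, with
     (set (reg:SI 99) (zero_extend:SI (reg:QI 23)))
   emitted before the insn; output operands get the mirror-image
   truncating copy back into the original register emitted after it.  */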
2696
2697 static void
nvptx_reorg_subreg(void)2698 nvptx_reorg_subreg (void)
2699 {
2700 struct reg_replace qiregs, hiregs, siregs, diregs;
2701 rtx_insn *insn, *next;
2702
2703 qiregs.n_allocated = 0;
2704 hiregs.n_allocated = 0;
2705 siregs.n_allocated = 0;
2706 diregs.n_allocated = 0;
2707 qiregs.mode = QImode;
2708 hiregs.mode = HImode;
2709 siregs.mode = SImode;
2710 diregs.mode = DImode;
2711
2712 for (insn = get_insns (); insn; insn = next)
2713 {
2714 next = NEXT_INSN (insn);
2715 if (!NONDEBUG_INSN_P (insn)
2716 || asm_noperands (PATTERN (insn)) >= 0
2717 || GET_CODE (PATTERN (insn)) == USE
2718 || GET_CODE (PATTERN (insn)) == CLOBBER)
2719 continue;
2720
2721 qiregs.n_in_use = 0;
2722 hiregs.n_in_use = 0;
2723 siregs.n_in_use = 0;
2724 diregs.n_in_use = 0;
2725 extract_insn (insn);
2726 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2727
2728 for (int i = 0; i < recog_data.n_operands; i++)
2729 {
2730 rtx op = recog_data.operand[i];
2731 if (GET_CODE (op) != SUBREG)
2732 continue;
2733
2734 rtx inner = SUBREG_REG (op);
2735
2736 machine_mode outer_mode = GET_MODE (op);
2737 machine_mode inner_mode = GET_MODE (inner);
2738 gcc_assert (s_ok);
2739 if (s_ok
2740 && (GET_MODE_PRECISION (inner_mode)
2741 >= GET_MODE_PRECISION (outer_mode)))
2742 continue;
2743 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2744 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2745 : outer_mode == HImode ? &hiregs
2746 : outer_mode == SImode ? &siregs
2747 : &diregs);
2748 rtx new_reg = get_replacement (r);
2749
2750 if (recog_data.operand_type[i] != OP_OUT)
2751 {
2752 enum rtx_code code;
2753 if (GET_MODE_PRECISION (inner_mode)
2754 < GET_MODE_PRECISION (outer_mode))
2755 code = ZERO_EXTEND;
2756 else
2757 code = TRUNCATE;
2758
2759 rtx pat = gen_rtx_SET (new_reg,
2760 gen_rtx_fmt_e (code, outer_mode, inner));
2761 emit_insn_before (pat, insn);
2762 }
2763
2764 if (recog_data.operand_type[i] != OP_IN)
2765 {
2766 enum rtx_code code;
2767 if (GET_MODE_PRECISION (inner_mode)
2768 < GET_MODE_PRECISION (outer_mode))
2769 code = TRUNCATE;
2770 else
2771 code = ZERO_EXTEND;
2772
2773 rtx pat = gen_rtx_SET (inner,
2774 gen_rtx_fmt_e (code, inner_mode, new_reg));
2775 emit_insn_after (pat, insn);
2776 }
2777 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2778 }
2779 }
2780 }
2781
2782 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2783 first use. */
2784
2785 static rtx
nvptx_get_unisimt_master()2786 nvptx_get_unisimt_master ()
2787 {
2788 rtx &master = cfun->machine->unisimt_master;
2789 return master ? master : master = gen_reg_rtx (SImode);
2790 }
2791
2792 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2793
2794 static rtx
nvptx_get_unisimt_predicate()2795 nvptx_get_unisimt_predicate ()
2796 {
2797 rtx &pred = cfun->machine->unisimt_predicate;
2798 return pred ? pred : pred = gen_reg_rtx (BImode);
2799 }
2800
2801 /* Return true if given call insn references one of the functions provided by
2802 the CUDA runtime: malloc, free, vprintf. */
2803
2804 static bool
nvptx_call_insn_is_syscall_p(rtx_insn * insn)2805 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2806 {
2807 rtx pat = PATTERN (insn);
2808 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2809 pat = XVECEXP (pat, 0, 0);
2810 if (GET_CODE (pat) == SET)
2811 pat = SET_SRC (pat);
2812 gcc_checking_assert (GET_CODE (pat) == CALL
2813 && GET_CODE (XEXP (pat, 0)) == MEM);
2814 rtx addr = XEXP (XEXP (pat, 0), 0);
2815 if (GET_CODE (addr) != SYMBOL_REF)
2816 return false;
2817 const char *name = XSTR (addr, 0);
2818 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2819 references with forced assembler name refer to PTX syscalls. For vprintf,
2820 accept both normal and forced-assembler-name references. */
2821 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2822 || !strcmp (name, "*malloc")
2823 || !strcmp (name, "*free"));
2824 }
2825
2826 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2827 propagate its value from lane MASTER to current lane. */
2828
2829 static void
nvptx_unisimt_handle_set(rtx set,rtx_insn * insn,rtx master)2830 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2831 {
2832 rtx reg;
2833 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2834 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2835 }
2836
2837 /* Adjust code for uniform-simt code generation variant by making atomics and
2838 "syscalls" conditionally executed, and inserting shuffle-based propagation
2839 for registers being set. */
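/* Roughly (an illustrative sketch, not dump output): an atomic or
   "syscall" pattern PAT becomes
     (cond_exec (ne (reg:BI unisimt_pred) (const_int 0)) PAT)
   so only the master lane executes it, and every register set inside
   PAT is followed by a SHUFFLE_IDX from the master lane so that all
   lanes in the warp observe the same value.  */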
2840
2841 static void
nvptx_reorg_uniform_simt()2842 nvptx_reorg_uniform_simt ()
2843 {
2844 rtx_insn *insn, *next;
2845
2846 for (insn = get_insns (); insn; insn = next)
2847 {
2848 next = NEXT_INSN (insn);
2849 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2850 && !(NONJUMP_INSN_P (insn)
2851 && GET_CODE (PATTERN (insn)) == PARALLEL
2852 && get_attr_atomic (insn)))
2853 continue;
2854 rtx pat = PATTERN (insn);
2855 rtx master = nvptx_get_unisimt_master ();
2856 for (int i = 0; i < XVECLEN (pat, 0); i++)
2857 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2858 rtx pred = nvptx_get_unisimt_predicate ();
2859 pred = gen_rtx_NE (BImode, pred, const0_rtx);
2860 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2861 validate_change (insn, &PATTERN (insn), pat, false);
2862 }
2863 }
2864
2865 /* Loop structure of the function. The entire function is described as
2866 a NULL loop. */
2867
2868 struct parallel
2869 {
2870 /* Parent parallel. */
2871 parallel *parent;
2872
2873 /* Next sibling parallel. */
2874 parallel *next;
2875
2876 /* First child parallel. */
2877 parallel *inner;
2878
2879 /* Partitioning mask of the parallel. */
2880 unsigned mask;
2881
2882 /* Partitioning used within inner parallels. */
2883 unsigned inner_mask;
2884
2885 /* Location of parallel forked and join. The forked is the first
2886 block in the parallel and the join is the first block after
2887 the partition. */
2888 basic_block forked_block;
2889 basic_block join_block;
2890
2891 rtx_insn *forked_insn;
2892 rtx_insn *join_insn;
2893
2894 rtx_insn *fork_insn;
2895 rtx_insn *joining_insn;
2896
2897 /* Basic blocks in this parallel, but not in child parallels. The
2898 FORKED and JOINING blocks are in the partition. The FORK and JOIN
2899 blocks are not. */
2900 auto_vec<basic_block> blocks;
2901
2902 public:
2903 parallel (parallel *parent, unsigned mode);
2904 ~parallel ();
2905 };
2906
2907 /* Constructor links the new parallel into its parent's chain of
2908 children. */
2909
parallel(parallel * parent_,unsigned mask_)2910 parallel::parallel (parallel *parent_, unsigned mask_)
2911 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2912 {
2913 forked_block = join_block = 0;
2914 forked_insn = join_insn = 0;
2915 fork_insn = joining_insn = 0;
2916
2917 if (parent)
2918 {
2919 next = parent->inner;
2920 parent->inner = this;
2921 }
2922 }
2923
~parallel()2924 parallel::~parallel ()
2925 {
2926 delete inner;
2927 delete next;
2928 }
2929
2930 /* Map of basic blocks to insns */
2931 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2932
2933 /* A tuple of an insn of interest and the BB in which it resides. */
2934 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2935 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2936
2937 /* Split basic blocks such that each forked and join unspec is at
2938 the start of its basic block. Thus afterwards each block will
2939 have a single partitioning mode. We also do the same for return
2940 insns, as they are executed by every thread. Populate MAP with
2941 head and tail blocks, mapping each such block to its head or tail
2942 insn. We also clear the BB visited flag, which is used when
2943 finding partitions. */
2944
2945 static void
nvptx_split_blocks(bb_insn_map_t * map)2946 nvptx_split_blocks (bb_insn_map_t *map)
2947 {
2948 insn_bb_vec_t worklist;
2949 basic_block block;
2950 rtx_insn *insn;
2951
2952 /* Locate all the reorg instructions of interest. */
2953 FOR_ALL_BB_FN (block, cfun)
2954 {
2955 bool seen_insn = false;
2956
2957 /* Clear visited flag, for use by parallel locator */
2958 block->flags &= ~BB_VISITED;
2959
2960 FOR_BB_INSNS (block, insn)
2961 {
2962 if (!INSN_P (insn))
2963 continue;
2964 switch (recog_memoized (insn))
2965 {
2966 default:
2967 seen_insn = true;
2968 continue;
2969 case CODE_FOR_nvptx_forked:
2970 case CODE_FOR_nvptx_join:
2971 break;
2972
2973 case CODE_FOR_return:
2974 /* We also need to split just before return insns, as
2975 that insn needs executing by all threads, but the
2976 block it is in probably does not. */
2977 break;
2978 }
2979
2980 if (seen_insn)
2981 /* We've found an instruction that must be at the start of
2982 a block, but isn't. Add it to the worklist. */
2983 worklist.safe_push (insn_bb_t (insn, block));
2984 else
2985 /* It was already the first instruction. Just add it to
2986 the map. */
2987 map->get_or_insert (block) = insn;
2988 seen_insn = true;
2989 }
2990 }
2991
2992 /* Split blocks on the worklist. */
2993 unsigned ix;
2994 insn_bb_t *elt;
2995 basic_block remap = 0;
2996 for (ix = 0; worklist.iterate (ix, &elt); ix++)
2997 {
2998 if (remap != elt->second)
2999 {
3000 block = elt->second;
3001 remap = block;
3002 }
3003
3004 /* Split block before insn. The insn is in the new block */
3005 edge e = split_block (block, PREV_INSN (elt->first));
3006
3007 block = e->dest;
3008 map->get_or_insert (block) = elt->first;
3009 }
3010 }
3011
3012 /* BLOCK is a basic block containing a head or tail instruction.
3013 Locate the associated prehead or pretail instruction, which must be
3014 in the single predecessor block. */
3015
3016 static rtx_insn *
nvptx_discover_pre(basic_block block,int expected)3017 nvptx_discover_pre (basic_block block, int expected)
3018 {
3019 gcc_assert (block->preds->length () == 1);
3020 basic_block pre_block = (*block->preds)[0]->src;
3021 rtx_insn *pre_insn;
3022
3023 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3024 pre_insn = PREV_INSN (pre_insn))
3025 gcc_assert (pre_insn != BB_HEAD (pre_block));
3026
3027 gcc_assert (recog_memoized (pre_insn) == expected);
3028 return pre_insn;
3029 }
3030
3031 /* Dump this parallel and all its inner parallels. */
3032
3033 static void
nvptx_dump_pars(parallel * par,unsigned depth)3034 nvptx_dump_pars (parallel *par, unsigned depth)
3035 {
3036 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3037 depth, par->mask,
3038 par->forked_block ? par->forked_block->index : -1,
3039 par->join_block ? par->join_block->index : -1);
3040
3041 fprintf (dump_file, " blocks:");
3042
3043 basic_block block;
3044 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3045 fprintf (dump_file, " %d", block->index);
3046 fprintf (dump_file, "\n");
3047 if (par->inner)
3048 nvptx_dump_pars (par->inner, depth + 1);
3049
3050 if (par->next)
3051 nvptx_dump_pars (par->next, depth);
3052 }
3053
3054 /* If BLOCK contains a fork/join marker, process it to create or
3055 terminate a loop structure. Add this block to the current loop,
3056 and then walk successor blocks. */
3057
3058 static parallel *
nvptx_find_par(bb_insn_map_t * map,parallel * par,basic_block block)3059 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3060 {
3061 if (block->flags & BB_VISITED)
3062 return par;
3063 block->flags |= BB_VISITED;
3064
3065 if (rtx_insn **endp = map->get (block))
3066 {
3067 rtx_insn *end = *endp;
3068
3069 /* This is a block head or tail, or return instruction. */
3070 switch (recog_memoized (end))
3071 {
3072 case CODE_FOR_return:
3073 /* Return instructions are in their own block, and we
3074 don't need to do anything more. */
3075 return par;
3076
3077 case CODE_FOR_nvptx_forked:
3078 /* Loop head, create a new inner loop and add it into
3079 our parent's child list. */
3080 {
3081 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3082
3083 gcc_assert (mask);
3084 par = new parallel (par, mask);
3085 par->forked_block = block;
3086 par->forked_insn = end;
3087 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3088 par->fork_insn
3089 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3090 }
3091 break;
3092
3093 case CODE_FOR_nvptx_join:
3094 /* A loop tail. Finish the current loop and return to
3095 parent. */
3096 {
3097 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3098
3099 gcc_assert (par->mask == mask);
3100 par->join_block = block;
3101 par->join_insn = end;
3102 if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3103 par->joining_insn
3104 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3105 par = par->parent;
3106 }
3107 break;
3108
3109 default:
3110 gcc_unreachable ();
3111 }
3112 }
3113
3114 if (par)
3115 /* Add this block onto the current loop's list of blocks. */
3116 par->blocks.safe_push (block);
3117 else
3118 /* This must be the entry block. Create a NULL parallel. */
3119 par = new parallel (0, 0);
3120
3121 /* Walk successor blocks. */
3122 edge e;
3123 edge_iterator ei;
3124
3125 FOR_EACH_EDGE (e, ei, block->succs)
3126 nvptx_find_par (map, par, e->dest);
3127
3128 return par;
3129 }
3130
3131 /* DFS walk the CFG looking for fork & join markers. Construct
3132 loop structures as we go. MAP is a mapping of basic blocks
3133 to head & tail markers, discovered when splitting blocks. This
3134 speeds up the discovery. We rely on the BB visited flag having
3135 been cleared when splitting blocks. */
3136
3137 static parallel *
nvptx_discover_pars(bb_insn_map_t * map)3138 nvptx_discover_pars (bb_insn_map_t *map)
3139 {
3140 basic_block block;
3141
3142 /* Mark exit blocks as visited. */
3143 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3144 block->flags |= BB_VISITED;
3145
3146 /* And entry block as not. */
3147 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3148 block->flags &= ~BB_VISITED;
3149
3150 parallel *par = nvptx_find_par (map, 0, block);
3151
3152 if (dump_file)
3153 {
3154 fprintf (dump_file, "\nLoops\n");
3155 nvptx_dump_pars (par, 0);
3156 fprintf (dump_file, "\n");
3157 }
3158
3159 return par;
3160 }
3161
3162 /* Analyse a group of BBs within a partitioned region and create N
3163 Single-Entry-Single-Exit regions. Some of those regions will be
3164 trivial ones consisting of a single BB. The blocks of a
3165 partitioned region might form a set of disjoint graphs -- because
3166 the region encloses a differently partitioned sub-region.
3167
3168 We use the linear time algorithm described in 'Finding Regions Fast:
3169 Single Entry Single Exit and Control Regions in Linear Time',
3170 Johnson, Pearson & Pingali. That algorithm deals with complete
3171 CFGs, where a back edge is inserted from END to START, and thus the
3172 problem becomes one of finding equivalent loops.
3173
3174 In this case we have a partial CFG. We complete it by redirecting
3175 any incoming edge to the graph to be from an arbitrary external BB,
3176 and similarly redirecting any outgoing edge to be to that BB.
3177 Thus we end up with a closed graph.
3178
3179 The algorithm works by building a spanning tree of an undirected
3180 graph and keeping track of back edges from nodes further from the
3181 root in the tree to nodes nearer to the root in the tree. In the
3182 description below, the root is up and the tree grows downwards.
3183
3184 We avoid having to deal with degenerate back-edges to the same
3185 block, by splitting each BB into 3 -- one for input edges, one for
3186 the node itself and one for the output edges. Such back edges are
3187 referred to as 'Brackets'. Cycle equivalent nodes will have the
3188 same set of brackets.
3189
3190 Determining bracket equivalency is done by maintaining a list of
3191 brackets in such a manner that the list length and final bracket
3192 uniquely identify the set.
3193
3194 We use coloring to mark all BBs with cycle equivalency with the
3195 same color. This is the output of the 'Finding Regions Fast'
3196 algorithm. Notice it doesn't actually find the set of nodes within
3197 a particular region, just unordered sets of nodes that are the
3198 entries and exits of SESE regions.
3199
3200 After determining cycle equivalency, we need to find the minimal
3201 set of SESE regions. Do this with a DFS coloring walk of the
3202 complete graph. We're either 'looking' or 'coloring'. When
3203 looking, and we're in the subgraph, we start coloring with the color of
3204 the current node, and remember that node as the start of the
3205 current color's SESE region. Every time we go to a new node, we
3206 decrement the count of nodes with that color. If it reaches zero,
3207 we remember that node as the end of the current color's SESE region
3208 and return to 'looking'. Otherwise we color the node the current
3209 color.
3210
3211 This way we end up with coloring the inside of non-trivial SESE
3212 regions with the color of that region. */
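/* A small illustration (not derived from an actual dump): for a diamond
   CFG A -> {B, C} -> D, with the fake back edge from exit to entry
   added, A and D end up with identical bracket sets and therefore the
   same color, while B and C each get their own, so the minimal
   covering is the single non-trivial SESE region (A, D).  */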
3213
3214 /* A pair of BBs. We use this to represent SESE regions. */
3215 typedef std::pair<basic_block, basic_block> bb_pair_t;
3216 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3217
3218 /* A node in the undirected CFG. The discriminator SECOND indicates just
3219 above or just below the BB indicated by FIRST. */
3220 typedef std::pair<basic_block, int> pseudo_node_t;
3221
3222 /* A bracket indicates an edge towards the root of the spanning tree of the
3223 undirected graph. Each bracket has a color, determined
3224 from the current set of brackets. */
3225 struct bracket
3226 {
3227 pseudo_node_t back; /* Back target */
3228
3229 /* Current color and size of set. */
3230 unsigned color;
3231 unsigned size;
3232
bracketbracket3233 bracket (pseudo_node_t back_)
3234 : back (back_), color (~0u), size (~0u)
3235 {
3236 }
3237
get_colorbracket3238 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3239 {
3240 if (length != size)
3241 {
3242 size = length;
3243 color = color_counts.length ();
3244 color_counts.quick_push (0);
3245 }
3246 color_counts[color]++;
3247 return color;
3248 }
3249 };
3250
3251 typedef auto_vec<bracket> bracket_vec_t;
3252
3253 /* Basic block info for finding SESE regions. */
3254
3255 struct bb_sese
3256 {
3257 int node; /* Node number in spanning tree. */
3258 int parent; /* Parent node number. */
3259
3260 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3261 edges arrive at pseudo-node Ai and the outgoing edges leave at
3262 pseudo-node Ao. We have to remember which way we arrived at a
3263 particular node when generating the spanning tree. dir > 0 means
3264 we arrived at Ai, dir < 0 means we arrived at Ao. */
3265 int dir;
3266
3267 /* Lowest numbered pseudo-node reached via a backedge from this
3268 node, or any descendant. */
3269 pseudo_node_t high;
3270
3271 int color; /* Cycle-equivalence color */
3272
3273 /* Stack of brackets for this node. */
3274 bracket_vec_t brackets;
3275
bb_sesebb_sese3276 bb_sese (unsigned node_, unsigned p, int dir_)
3277 :node (node_), parent (p), dir (dir_)
3278 {
3279 }
3280 ~bb_sese ();
3281
3282 /* Push a bracket ending at BACK. */
pushbb_sese3283 void push (const pseudo_node_t &back)
3284 {
3285 if (dump_file)
3286 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3287 back.first ? back.first->index : 0, back.second);
3288 brackets.safe_push (bracket (back));
3289 }
3290
3291 void append (bb_sese *child);
3292 void remove (const pseudo_node_t &);
3293
3294 /* Set node's color. */
set_colorbb_sese3295 void set_color (auto_vec<unsigned> &color_counts)
3296 {
3297 color = brackets.last ().get_color (color_counts, brackets.length ());
3298 }
3299 };
3300
~bb_sese()3301 bb_sese::~bb_sese ()
3302 {
3303 }
3304
3305 /* Destructively append CHILD's brackets. */
3306
3307 void
append(bb_sese * child)3308 bb_sese::append (bb_sese *child)
3309 {
3310 if (int len = child->brackets.length ())
3311 {
3312 int ix;
3313
3314 if (dump_file)
3315 {
3316 for (ix = 0; ix < len; ix++)
3317 {
3318 const pseudo_node_t &pseudo = child->brackets[ix].back;
3319 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3320 child->node, pseudo.first ? pseudo.first->index : 0,
3321 pseudo.second);
3322 }
3323 }
3324 if (!brackets.length ())
3325 std::swap (brackets, child->brackets);
3326 else
3327 {
3328 brackets.reserve (len);
3329 for (ix = 0; ix < len; ix++)
3330 brackets.quick_push (child->brackets[ix]);
3331 }
3332 }
3333 }
3334
3335 /* Remove brackets that terminate at PSEUDO. */
3336
3337 void
remove(const pseudo_node_t & pseudo)3338 bb_sese::remove (const pseudo_node_t &pseudo)
3339 {
3340 unsigned removed = 0;
3341 int len = brackets.length ();
3342
3343 for (int ix = 0; ix < len; ix++)
3344 {
3345 if (brackets[ix].back == pseudo)
3346 {
3347 if (dump_file)
3348 fprintf (dump_file, "Removing backedge %d:%+d\n",
3349 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3350 removed++;
3351 }
3352 else if (removed)
3353 brackets[ix-removed] = brackets[ix];
3354 }
3355 while (removed--)
3356 brackets.pop ();
3357 }
3358
3359 /* Accessors for BB's aux pointer. */
3360 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3361 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3362
3363 /* DFS walk creating SESE data structures. Only cover nodes with
3364 BB_VISITED set. Append discovered blocks to LIST. We number in
3365 increments of 3 so that the above and below pseudo nodes can be
3366 implicitly numbered too. */
3367
3368 static int
nvptx_sese_number(int n,int p,int dir,basic_block b,auto_vec<basic_block> * list)3369 nvptx_sese_number (int n, int p, int dir, basic_block b,
3370 auto_vec<basic_block> *list)
3371 {
3372 if (BB_GET_SESE (b))
3373 return n;
3374
3375 if (dump_file)
3376 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3377 b->index, n, p, dir);
3378
3379 BB_SET_SESE (b, new bb_sese (n, p, dir));
3380 p = n;
3381
3382 n += 3;
3383 list->quick_push (b);
3384
3385 /* First walk the nodes on the 'other side' of this node, then walk
3386 the nodes on the same side. */
3387 for (unsigned ix = 2; ix; ix--)
3388 {
3389 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3390 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3391 : offsetof (edge_def, src));
3392 edge e;
3393 edge_iterator (ei);
3394
3395 FOR_EACH_EDGE (e, ei, edges)
3396 {
3397 basic_block target = *(basic_block *)((char *)e + offset);
3398
3399 if (target->flags & BB_VISITED)
3400 n = nvptx_sese_number (n, p, dir, target, list);
3401 }
3402 dir = -dir;
3403 }
3404 return n;
3405 }
3406
3407 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3408 EDGES are the outgoing edges and OFFSET is the offset to the src
3409 or dst block on the edges. */
3410
3411 static void
nvptx_sese_pseudo(basic_block me,bb_sese * sese,int depth,int dir,vec<edge,va_gc> * edges,size_t offset)3412 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3413 vec<edge, va_gc> *edges, size_t offset)
3414 {
3415 edge e;
3416 edge_iterator (ei);
3417 int hi_back = depth;
3418 pseudo_node_t node_back (0, depth);
3419 int hi_child = depth;
3420 pseudo_node_t node_child (0, depth);
3421 basic_block child = NULL;
3422 unsigned num_children = 0;
3423 int usd = -dir * sese->dir;
3424
3425 if (dump_file)
3426 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3427 me->index, sese->node, dir);
3428
3429 if (dir < 0)
3430 {
3431 /* This is the above pseudo-child. It has the BB itself as an
3432 additional child node. */
3433 node_child = sese->high;
3434 hi_child = node_child.second;
3435 if (node_child.first)
3436 hi_child += BB_GET_SESE (node_child.first)->node;
3437 num_children++;
3438 }
3439
3440 /* Examine each edge.
3441 - if it is a child (a) append its bracket list and (b) record
3442 whether it is the child with the highest reaching bracket.
3443 - if it is an edge to ancestor, record whether it's the highest
3444 reaching backlink. */
3445 FOR_EACH_EDGE (e, ei, edges)
3446 {
3447 basic_block target = *(basic_block *)((char *)e + offset);
3448
3449 if (bb_sese *t_sese = BB_GET_SESE (target))
3450 {
3451 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3452 {
3453 /* Child node. Append its bracket list. */
3454 num_children++;
3455 sese->append (t_sese);
3456
3457 /* Compare its hi value. */
3458 int t_hi = t_sese->high.second;
3459
3460 if (basic_block child_hi_block = t_sese->high.first)
3461 t_hi += BB_GET_SESE (child_hi_block)->node;
3462
3463 if (hi_child > t_hi)
3464 {
3465 hi_child = t_hi;
3466 node_child = t_sese->high;
3467 child = target;
3468 }
3469 }
3470 else if (t_sese->node < sese->node + dir
3471 && !(dir < 0 && sese->parent == t_sese->node))
3472 {
3473 /* Non-parental ancestor node -- a backlink. */
3474 int d = usd * t_sese->dir;
3475 int back = t_sese->node + d;
3476
3477 if (hi_back > back)
3478 {
3479 hi_back = back;
3480 node_back = pseudo_node_t (target, d);
3481 }
3482 }
3483 }
3484 else
3485 { /* Fallen off graph, backlink to entry node. */
3486 hi_back = 0;
3487 node_back = pseudo_node_t (0, 0);
3488 }
3489 }
3490
3491 /* Remove any brackets that terminate at this pseudo node. */
3492 sese->remove (pseudo_node_t (me, dir));
3493
3494 /* Now push any backlinks from this pseudo node. */
3495 FOR_EACH_EDGE (e, ei, edges)
3496 {
3497 basic_block target = *(basic_block *)((char *)e + offset);
3498 if (bb_sese *t_sese = BB_GET_SESE (target))
3499 {
3500 if (t_sese->node < sese->node + dir
3501 && !(dir < 0 && sese->parent == t_sese->node))
3502 /* Non-parental ancestor node - backedge from me. */
3503 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3504 }
3505 else
3506 {
3507 /* back edge to entry node */
3508 sese->push (pseudo_node_t (0, 0));
3509 }
3510 }
3511
3512 /* If this node leads directly or indirectly to a no-return region of
3513 the graph, then fake a backedge to entry node. */
3514 if (!sese->brackets.length () || !edges || !edges->length ())
3515 {
3516 hi_back = 0;
3517 node_back = pseudo_node_t (0, 0);
3518 sese->push (node_back);
3519 }
3520
3521 /* Record the highest reaching backedge from us or a descendant. */
3522 sese->high = hi_back < hi_child ? node_back : node_child;
3523
3524 if (num_children > 1)
3525 {
3526 /* There is more than one child -- this is a Y shaped piece of
3527 spanning tree. We have to insert a fake backedge from this
3528 node to the highest ancestor reached by not-the-highest
3529 reaching child. Note that there may be multiple children
3530 with backedges to the same highest node. That's ok and we
3531 insert the edge to that highest node. */
3532 hi_child = depth;
3533 if (dir < 0 && child)
3534 {
3535 node_child = sese->high;
3536 hi_child = node_child.second;
3537 if (node_child.first)
3538 hi_child += BB_GET_SESE (node_child.first)->node;
3539 }
3540
3541 FOR_EACH_EDGE (e, ei, edges)
3542 {
3543 basic_block target = *(basic_block *)((char *)e + offset);
3544
3545 if (target == child)
3546 /* Ignore the highest child. */
3547 continue;
3548
3549 bb_sese *t_sese = BB_GET_SESE (target);
3550 if (!t_sese)
3551 continue;
3552 if (t_sese->parent != sese->node)
3553 /* Not a child. */
3554 continue;
3555
3556 /* Compare its hi value. */
3557 int t_hi = t_sese->high.second;
3558
3559 if (basic_block child_hi_block = t_sese->high.first)
3560 t_hi += BB_GET_SESE (child_hi_block)->node;
3561
3562 if (hi_child > t_hi)
3563 {
3564 hi_child = t_hi;
3565 node_child = t_sese->high;
3566 }
3567 }
3568
3569 sese->push (node_child);
3570 }
3571 }
3572
3573
3574 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3575 proceed to successors. Set SESE entry and exit nodes of
3576 REGIONS. */
3577
3578 static void
3579 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3580 basic_block block, int coloring)
3581 {
3582 bb_sese *sese = BB_GET_SESE (block);
3583
3584 if (block->flags & BB_VISITED)
3585 {
3586 /* If we've already encountered this block, either we must not
3587 be coloring, or it must have been colored the current color. */
3588 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3589 return;
3590 }
3591
3592 block->flags |= BB_VISITED;
3593
3594 if (sese)
3595 {
3596 if (coloring < 0)
3597 {
3598 /* Start coloring a region. */
3599 regions[sese->color].first = block;
3600 coloring = sese->color;
3601 }
3602
3603 if (!--color_counts[sese->color] && sese->color == coloring)
3604 {
3605 /* Found final block of SESE region. */
3606 regions[sese->color].second = block;
3607 coloring = -1;
3608 }
3609 else
3610 /* Color the node, so we can assert on revisiting the node
3611 that the graph is indeed SESE. */
3612 sese->color = coloring;
3613 }
3614 else
3615 /* Fallen off the subgraph, we cannot be coloring. */
3616 gcc_assert (coloring < 0);
3617
3618 /* Walk each successor block. */
3619 if (block->succs && block->succs->length ())
3620 {
3621 edge e;
3622 edge_iterator ei;
3623
3624 FOR_EACH_EDGE (e, ei, block->succs)
3625 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3626 }
3627 else
3628 gcc_assert (coloring < 0);
3629 }
3630
3631 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3632 end up with NULL entries in it. */
3633
3634 static void
3635 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3636 {
3637 basic_block block;
3638 int ix;
3639
3640 /* First clear each BB of the whole function. */
3641 FOR_ALL_BB_FN (block, cfun)
3642 {
3643 block->flags &= ~BB_VISITED;
3644 BB_SET_SESE (block, 0);
3645 }
3646
3647 /* Mark blocks in the function that are in this graph. */
3648 for (ix = 0; blocks.iterate (ix, &block); ix++)
3649 block->flags |= BB_VISITED;
3650
3651 /* Counts of nodes assigned to each color. There cannot be more
3652 colors than blocks (and hopefully there will be fewer). */
3653 auto_vec<unsigned> color_counts;
3654 color_counts.reserve (blocks.length ());
3655
3656 /* Worklist of nodes in the spanning tree. Again, there cannot be
3657 more nodes in the tree than blocks (there will be fewer if the
3658 CFG of blocks is disjoint). */
3659 auto_vec<basic_block> spanlist;
3660 spanlist.reserve (blocks.length ());
3661
3662 /* Make sure every block has its cycle class determined. */
3663 for (ix = 0; blocks.iterate (ix, &block); ix++)
3664 {
3665 if (BB_GET_SESE (block))
3666 /* We already met this block in an earlier graph solve. */
3667 continue;
3668
3669 if (dump_file)
3670 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3671
3672       /* Number the nodes reachable from block in initial DFS order.  */
3673 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3674
3675 /* Now walk in reverse DFS order to find cycle equivalents. */
3676 while (spanlist.length ())
3677 {
3678 block = spanlist.pop ();
3679 bb_sese *sese = BB_GET_SESE (block);
3680
3681 /* Do the pseudo node below. */
3682 nvptx_sese_pseudo (block, sese, depth, +1,
3683 sese->dir > 0 ? block->succs : block->preds,
3684 (sese->dir > 0 ? offsetof (edge_def, dest)
3685 : offsetof (edge_def, src)));
3686 sese->set_color (color_counts);
3687 /* Do the pseudo node above. */
3688 nvptx_sese_pseudo (block, sese, depth, -1,
3689 sese->dir < 0 ? block->succs : block->preds,
3690 (sese->dir < 0 ? offsetof (edge_def, dest)
3691 : offsetof (edge_def, src)));
3692 }
3693 if (dump_file)
3694 fprintf (dump_file, "\n");
3695 }
3696
3697 if (dump_file)
3698 {
3699 unsigned count;
3700 const char *comma = "";
3701
3702 fprintf (dump_file, "Found %d cycle equivalents\n",
3703 color_counts.length ());
3704 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3705 {
3706 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3707
3708 comma = "";
3709 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3710 if (BB_GET_SESE (block)->color == ix)
3711 {
3712 block->flags |= BB_VISITED;
3713 fprintf (dump_file, "%s%d", comma, block->index);
3714 comma=",";
3715 }
3716 fprintf (dump_file, "}");
3717 comma = ", ";
3718 }
3719 fprintf (dump_file, "\n");
3720 }
3721
3722 /* Now we've colored every block in the subgraph. We now need to
3723 determine the minimal set of SESE regions that cover that
3724 subgraph. Do this with a DFS walk of the complete function.
3725 During the walk we're either 'looking' or 'coloring'. When we
3726 reach the last node of a particular color, we stop coloring and
3727 return to looking. */
3728
3729 /* There cannot be more SESE regions than colors. */
3730 regions.reserve (color_counts.length ());
3731 for (ix = color_counts.length (); ix--;)
3732 regions.quick_push (bb_pair_t (0, 0));
3733
3734 for (ix = 0; blocks.iterate (ix, &block); ix++)
3735 block->flags &= ~BB_VISITED;
3736
3737 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3738
3739 if (dump_file)
3740 {
3741 const char *comma = "";
3742 int len = regions.length ();
3743
3744 fprintf (dump_file, "SESE regions:");
3745 for (ix = 0; ix != len; ix++)
3746 {
3747 basic_block from = regions[ix].first;
3748 basic_block to = regions[ix].second;
3749
3750 if (from)
3751 {
3752 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3753 if (to != from)
3754 fprintf (dump_file, "->%d", to->index);
3755
3756 int color = BB_GET_SESE (from)->color;
3757
3758 /* Print the blocks within the region (excluding ends). */
3759 FOR_EACH_BB_FN (block, cfun)
3760 {
3761 bb_sese *sese = BB_GET_SESE (block);
3762
3763 if (sese && sese->color == color
3764 && block != from && block != to)
3765 fprintf (dump_file, ".%d", block->index);
3766 }
3767 fprintf (dump_file, "}");
3768 }
3769 comma = ",";
3770 }
3771 fprintf (dump_file, "\n\n");
3772 }
3773
3774 for (ix = 0; blocks.iterate (ix, &block); ix++)
3775 delete BB_GET_SESE (block);
3776 }
3777
3778 #undef BB_SET_SESE
3779 #undef BB_GET_SESE
3780
3781 /* Propagate live state at the start of a partitioned region. IS_CALL
3782 indicates whether the propagation is for a (partitioned) call
3783 instruction. BLOCK provides the live register information, and
3784 might not contain INSN. Propagation is inserted just after INSN. RW
3785 indicates whether we are reading and/or writing state. This
3786    separation is needed for worker-level propagation where we
3787    essentially do a spill & fill.  FN is the underlying worker
3788    function to generate the propagation instructions for a single
3789    register.  DATA is user data.
3790
3791 Returns true if we didn't emit any instructions.
3792
3793 We propagate the live register set for non-calls and the entire
3794 frame for calls and non-calls. We could do better by (a)
3795 propagating just the live set that is used within the partitioned
3796 regions and (b) only propagating stack entries that are used. The
3797 latter might be quite hard to determine. */
3798
3799 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3800
3801 static bool
3802 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3803 propagate_mask rw, propagator_fn fn, void *data)
3804 {
3805 bitmap live = DF_LIVE_IN (block);
3806 bitmap_iterator iterator;
3807 unsigned ix;
3808 bool empty = true;
3809
3810 /* Copy the frame array. */
3811 HOST_WIDE_INT fs = get_frame_size ();
3812 if (fs)
3813 {
3814 rtx tmp = gen_reg_rtx (DImode);
3815 rtx idx = NULL_RTX;
3816 rtx ptr = gen_reg_rtx (Pmode);
3817 rtx pred = NULL_RTX;
3818 rtx_code_label *label = NULL;
3819
3820 empty = false;
3821 /* The frame size might not be DImode compatible, but the frame
3822 array's declaration will be. So it's ok to round up here. */
3823 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3824 /* Detect single iteration loop. */
3825 if (fs == 1)
3826 fs = 0;
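	    /* For example, a 20-byte frame rounds up to fs == 3 and gets a
	       three-iteration copy loop, while an 8-byte frame gives fs == 1,
	       which is zeroed here so the loop collapses to a single copy.  */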
3827
3828 start_sequence ();
3829 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3830 if (fs)
3831 {
3832 idx = gen_reg_rtx (SImode);
3833 pred = gen_reg_rtx (BImode);
3834 label = gen_label_rtx ();
3835
3836 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3837 /* Allow worker function to initialize anything needed. */
3838 rtx init = fn (tmp, PM_loop_begin, fs, data);
3839 if (init)
3840 emit_insn (init);
3841 emit_label (label);
3842 LABEL_NUSES (label)++;
3843 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3844 }
3845 if (rw & PM_read)
3846 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3847 emit_insn (fn (tmp, rw, fs, data));
3848 if (rw & PM_write)
3849 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3850 if (fs)
3851 {
3852 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3853 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3854 emit_insn (gen_br_true_uni (pred, label));
3855 rtx fini = fn (tmp, PM_loop_end, fs, data);
3856 if (fini)
3857 emit_insn (fini);
3858 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3859 }
3860 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3861 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3862 rtx cpy = get_insns ();
3863 end_sequence ();
3864 insn = emit_insn_after (cpy, insn);
3865 }
3866
3867 if (!is_call)
3868 /* Copy live registers. */
3869 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3870 {
3871 rtx reg = regno_reg_rtx[ix];
3872
3873 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3874 {
3875 rtx bcast = fn (reg, rw, 0, data);
3876
3877 insn = emit_insn_after (bcast, insn);
3878 empty = false;
3879 }
3880 }
3881 return empty;
3882 }
3883
3884 /* Worker for nvptx_vpropagate. */
3885
3886 static rtx
3887 vprop_gen (rtx reg, propagate_mask pm,
3888 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3889 {
3890 if (!(pm & PM_read_write))
3891 return 0;
3892
3893 return nvptx_gen_vcast (reg);
3894 }
3895
3896 /* Propagate state that is live at start of BLOCK across the vectors
3897 of a single warp. Propagation is inserted just after INSN.
3898 IS_CALL and return as for nvptx_propagate. */
3899
3900 static bool
3901 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
3902 {
3903 return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
3904 }
3905
3906 /* Worker for nvptx_wpropagate. */
3907
3908 static rtx
3909 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3910 {
3911 wcast_data_t *data = (wcast_data_t *)data_;
3912
3913 if (pm & PM_loop_begin)
3914 {
3915 /* Starting a loop, initialize pointer. */
3916 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3917
3918 if (align > worker_bcast_align)
3919 worker_bcast_align = align;
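      /* Round the broadcast-buffer offset up to this register's natural
	 alignment.  */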
3920 data->offset = (data->offset + align - 1) & ~(align - 1);
3921
3922 data->ptr = gen_reg_rtx (Pmode);
3923
3924 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3925 }
3926 else if (pm & PM_loop_end)
3927 {
3928 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3929 data->ptr = NULL_RTX;
3930 return clobber;
3931 }
3932 else
3933 return nvptx_gen_wcast (reg, pm, rep, data);
3934 }
3935
3936 /* Spill or fill the state that is live at the start of BLOCK.  PRE_P
3937 indicates if this is just before partitioned mode (do spill), or
3938 just after it starts (do fill). Sequence is inserted just after
3939 INSN. IS_CALL and return as for nvptx_propagate. */
3940
3941 static bool
3942 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
3943 {
3944 wcast_data_t data;
3945
3946 data.base = gen_reg_rtx (Pmode);
3947 data.offset = 0;
3948 data.ptr = NULL_RTX;
3949
3950 bool empty = nvptx_propagate (is_call, block, insn,
3951 pre_p ? PM_read : PM_write, wprop_gen, &data);
3952 gcc_assert (empty == !data.offset);
3953 if (data.offset)
3954 {
3955 /* Stuff was emitted, initialize the base pointer now. */
3956 rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3957 emit_insn_after (init, insn);
3958
3959 if (worker_bcast_size < data.offset)
3960 worker_bcast_size = data.offset;
3961 }
3962 return empty;
3963 }
3964
3965 /* Emit a worker-level synchronization barrier. We use different
3966 markers for before and after synchronizations. */
3967
3968 static rtx
3969 nvptx_wsync (bool after)
3970 {
3971 return gen_nvptx_barsync (GEN_INT (after));
3972 }
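/* A note on the intent (assuming the nvptx_barsync pattern expands to a PTX
   bar.sync instruction): the same barrier is emitted either way; AFTER only
   marks whether the barrier precedes or follows the partitioned region it
   protects.  */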
3973
3974 #if WORKAROUND_PTXJIT_BUG
3975 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3976 real insns. */
3977
3978 static rtx_insn *
3979 bb_first_real_insn (basic_block bb)
3980 {
3981 rtx_insn *insn;
3982
3983   /* Find the first real insn of BB.  */
3984 FOR_BB_INSNS (bb, insn)
3985 if (INSN_P (insn))
3986 return insn;
3987
3988 return 0;
3989 }
3990 #endif
3991
3992 /* Single neutering according to MASK. FROM is the incoming block and
3993 TO is the outgoing block. These may be the same block. Insert at
3994 start of FROM:
3995
3996 if (tid.<axis>) goto end.
3997
3998 and insert before ending branch of TO (if there is such an insn):
3999
4000 end:
4001 <possibly-broadcast-cond>
4002 <branch>
4003
4004    We currently only use different FROM and TO when skipping an entire
4005 loop. We could do more if we detected superblocks. */
4006
4007 static void
4008 nvptx_single (unsigned mask, basic_block from, basic_block to)
4009 {
4010 rtx_insn *head = BB_HEAD (from);
4011 rtx_insn *tail = BB_END (to);
4012 unsigned skip_mask = mask;
4013
4014 while (true)
4015 {
4016 /* Find first insn of from block. */
4017 while (head != BB_END (from)
4018 && (!INSN_P (head)
4019 || recog_memoized (head) == CODE_FOR_nvptx_barsync))
4020 head = NEXT_INSN (head);
4021
4022 if (from == to)
4023 break;
4024
4025 if (!(JUMP_P (head) && single_succ_p (from)))
4026 break;
4027
4028 basic_block jump_target = single_succ (from);
4029 if (!single_pred_p (jump_target))
4030 break;
4031
4032 from = jump_target;
4033 head = BB_HEAD (from);
4034 }
4035
4036   /* Find the last insn of the TO block.  */
4037 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4038 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4039 tail = PREV_INSN (tail);
4040
4041 /* Detect if tail is a branch. */
4042 rtx tail_branch = NULL_RTX;
4043 rtx cond_branch = NULL_RTX;
4044 if (tail && INSN_P (tail))
4045 {
4046 tail_branch = PATTERN (tail);
4047 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4048 tail_branch = NULL_RTX;
4049 else
4050 {
4051 cond_branch = SET_SRC (tail_branch);
4052 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4053 cond_branch = NULL_RTX;
4054 }
4055 }
4056
4057 if (tail == head)
4058 {
4059 /* If this is empty, do nothing. */
4060 if (!head || !INSN_P (head))
4061 return;
4062
4063 /* If this is a dummy insn, do nothing. */
4064 switch (recog_memoized (head))
4065 {
4066 default:
4067 break;
4068 case CODE_FOR_nvptx_barsync:
4069 case CODE_FOR_nvptx_fork:
4070 case CODE_FOR_nvptx_forked:
4071 case CODE_FOR_nvptx_joining:
4072 case CODE_FOR_nvptx_join:
4073 return;
4074 }
4075
4076 if (cond_branch)
4077 {
4078 /* If we're only doing vector single, there's no need to
4079 emit skip code because we'll not insert anything. */
4080 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4081 skip_mask = 0;
4082 }
4083 else if (tail_branch)
4084 /* Block with only unconditional branch. Nothing to do. */
4085 return;
4086 }
4087
4088 /* Insert the vector test inside the worker test. */
4089 unsigned mode;
4090 rtx_insn *before = tail;
4091 rtx_insn *neuter_start = NULL;
4092 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4093 if (GOMP_DIM_MASK (mode) & skip_mask)
4094 {
4095 rtx_code_label *label = gen_label_rtx ();
4096 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4097
4098 if (!pred)
4099 {
4100 pred = gen_reg_rtx (BImode);
4101 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4102 }
4103
4104 rtx br;
4105 if (mode == GOMP_DIM_VECTOR)
4106 br = gen_br_true (pred, label);
4107 else
4108 br = gen_br_true_uni (pred, label);
4109 if (neuter_start)
4110 neuter_start = emit_insn_after (br, neuter_start);
4111 else
4112 neuter_start = emit_insn_before (br, head);
4113
4114 LABEL_NUSES (label)++;
4115 if (tail_branch)
4116 before = emit_label_before (label, before);
4117 else
4118 {
4119 rtx_insn *label_insn = emit_label_after (label, tail);
4120 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4121 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4122 emit_insn_after (gen_exit (), label_insn);
4123 }
4124 }
4125
4126 /* Now deal with propagating the branch condition. */
4127 if (cond_branch)
4128 {
4129 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4130
4131 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4132 {
4133 /* Vector mode only, do a shuffle. */
4134 #if WORKAROUND_PTXJIT_BUG
4135 /* The branch condition %rcond is propagated like this:
4136
4137 {
4138 .reg .u32 %x;
4139 mov.u32 %x,%tid.x;
4140 setp.ne.u32 %rnotvzero,%x,0;
4141 }
4142
4143 @%rnotvzero bra Lskip;
4144 setp.<op>.<type> %rcond,op1,op2;
4145 Lskip:
4146 selp.u32 %rcondu32,1,0,%rcond;
4147 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4148 setp.ne.u32 %rcond,%rcondu32,0;
4149
4150 There seems to be a bug in the ptx JIT compiler (observed at driver
4151 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4152 unless %rcond is initialized to something before 'bra Lskip'. The
4153 bug is not observed with ptxas from cuda 8.0.61.
4154
4155 It is true that the code is non-trivial: at Lskip, %rcond is
4156 uninitialized in threads 1-31, and after the selp the same holds
4157 for %rcondu32. But shfl propagates the defined value in thread 0
4158 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4159 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4160
4161 There is nothing in the PTX spec to suggest that this is wrong, or
4162 to explain why the extra initialization is needed. So, we classify
4163 it as a JIT bug, and the extra initialization as workaround:
4164
4165 {
4166 .reg .u32 %x;
4167 mov.u32 %x,%tid.x;
4168 setp.ne.u32 %rnotvzero,%x,0;
4169 }
4170
4171 +.reg .pred %rcond2;
4172 +setp.eq.u32 %rcond2, 1, 0;
4173
4174 @%rnotvzero bra Lskip;
4175 setp.<op>.<type> %rcond,op1,op2;
4176 +mov.pred %rcond2, %rcond;
4177 Lskip:
4178 +mov.pred %rcond, %rcond2;
4179 selp.u32 %rcondu32,1,0,%rcond;
4180 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4181 setp.ne.u32 %rcond,%rcondu32,0;
4182 */
4183 rtx_insn *label = PREV_INSN (tail);
4184 gcc_assert (label && LABEL_P (label));
4185 rtx tmp = gen_reg_rtx (BImode);
4186 emit_insn_before (gen_movbi (tmp, const0_rtx),
4187 bb_first_real_insn (from));
4188 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4189 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4190 #endif
4191 emit_insn_before (nvptx_gen_vcast (pvar), tail);
4192 }
4193 else
4194 {
4195 /* Includes worker mode, do spill & fill. By construction
4196 we should never have worker mode only. */
4197 wcast_data_t data;
4198
4199 data.base = worker_bcast_sym;
4200 data.ptr = 0;
4201
4202 if (worker_bcast_size < GET_MODE_SIZE (SImode))
4203 worker_bcast_size = GET_MODE_SIZE (SImode);
4204
4205 data.offset = 0;
4206 emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4207 before);
4208 /* Barrier so other workers can see the write. */
4209 emit_insn_before (nvptx_wsync (false), tail);
4210 data.offset = 0;
4211 emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4212 /* This barrier is needed to avoid worker zero clobbering
4213 the broadcast buffer before all the other workers have
4214 had a chance to read this instance of it. */
4215 emit_insn_before (nvptx_wsync (true), tail);
4216 }
4217
4218 extract_insn (tail);
4219 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4220 UNSPEC_BR_UNIFIED);
4221 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4222 }
4223 }
4224
4225 /* PAR is a parallel that is being skipped in its entirety according to
4226 MASK. Treat this as skipping a superblock starting at forked
4227 and ending at joining. */
4228
4229 static void
4230 nvptx_skip_par (unsigned mask, parallel *par)
4231 {
4232 basic_block tail = par->join_block;
4233 gcc_assert (tail->preds->length () == 1);
4234
4235 basic_block pre_tail = (*tail->preds)[0]->src;
4236 gcc_assert (pre_tail->succs->length () == 1);
4237
4238 nvptx_single (mask, par->forked_block, pre_tail);
4239 }
4240
4241 /* If PAR has a single inner parallel and PAR itself only contains
4242 empty entry and exit blocks, swallow the inner PAR. */
4243
4244 static void
4245 nvptx_optimize_inner (parallel *par)
4246 {
4247 parallel *inner = par->inner;
4248
4249 /* We mustn't be the outer dummy par. */
4250 if (!par->mask)
4251 return;
4252
4253 /* We must have a single inner par. */
4254 if (!inner || inner->next)
4255 return;
4256
4257 /* We must only contain 2 blocks ourselves -- the head and tail of
4258 the inner par. */
4259 if (par->blocks.length () != 2)
4260 return;
4261
4262 /* We must be disjoint partitioning. As we only have vector and
4263 worker partitioning, this is sufficient to guarantee the pars
4264 have adjacent partitioning. */
4265 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4266 /* This indicates malformed code generation. */
4267 return;
4268
4269 /* The outer forked insn should be immediately followed by the inner
4270 fork insn. */
4271 rtx_insn *forked = par->forked_insn;
4272 rtx_insn *fork = BB_END (par->forked_block);
4273
4274 if (NEXT_INSN (forked) != fork)
4275 return;
4276 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4277
4278 /* The outer joining insn must immediately follow the inner join
4279 insn. */
4280 rtx_insn *joining = par->joining_insn;
4281 rtx_insn *join = inner->join_insn;
4282 if (NEXT_INSN (join) != joining)
4283 return;
4284
4285 /* Preconditions met. Swallow the inner par. */
4286 if (dump_file)
4287 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4288 inner->mask, inner->forked_block->index,
4289 inner->join_block->index,
4290 par->mask, par->forked_block->index, par->join_block->index);
4291
4292 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4293
4294 par->blocks.reserve (inner->blocks.length ());
4295 while (inner->blocks.length ())
4296 par->blocks.quick_push (inner->blocks.pop ());
4297
4298 par->inner = inner->inner;
4299 inner->inner = NULL;
4300
4301 delete inner;
4302 }
4303
4304 /* Process the parallel PAR and all its contained
4305 parallels. We do everything but the neutering. Return mask of
4306 partitioned modes used within this parallel. */
4307
4308 static unsigned
4309 nvptx_process_pars (parallel *par)
4310 {
4311 if (nvptx_optimize)
4312 nvptx_optimize_inner (par);
4313
4314 unsigned inner_mask = par->mask;
4315
4316 /* Do the inner parallels first. */
4317 if (par->inner)
4318 {
4319 par->inner_mask = nvptx_process_pars (par->inner);
4320 inner_mask |= par->inner_mask;
4321 }
4322
4323 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4324
4325 if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4326 {
4327 nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
4328 bool empty = nvptx_wpropagate (true, is_call,
4329 par->forked_block, par->fork_insn);
4330
4331 if (!empty || !is_call)
4332 {
4333 /* Insert begin and end synchronizations. */
4334 emit_insn_before (nvptx_wsync (false), par->forked_insn);
4335 emit_insn_before (nvptx_wsync (true), par->join_insn);
4336 }
4337 }
4338 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4339 nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
4340
4341 /* Now do siblings. */
4342 if (par->next)
4343 inner_mask |= nvptx_process_pars (par->next);
4344 return inner_mask;
4345 }
4346
4347 /* Neuter the parallel described by PAR. We recurse in depth-first
4348 order. MODES are the partitioning of the execution and OUTER is
4349 the partitioning of the parallels we are contained in. */
4350
4351 static void
4352 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4353 {
4354 unsigned me = (par->mask
4355 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4356 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4357 unsigned skip_mask = 0, neuter_mask = 0;
4358
4359 if (par->inner)
4360 nvptx_neuter_pars (par->inner, modes, outer | me);
4361
4362 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4363 {
4364 if ((outer | me) & GOMP_DIM_MASK (mode))
4365 {} /* Mode is partitioned: no neutering. */
4366 else if (!(modes & GOMP_DIM_MASK (mode)))
4367 {} /* Mode is not used: nothing to do. */
4368 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4369 || !par->forked_insn)
4370 	  /* Partitioned in inner parallels, or we're not partitioned
4371 at all: neuter individual blocks. */
4372 neuter_mask |= GOMP_DIM_MASK (mode);
4373 else if (!par->parent || !par->parent->forked_insn
4374 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4375 	  /* Parent isn't a parallel, or it already contains this
4376 	     partitioning: skip the parallel at this level.  */
4377 skip_mask |= GOMP_DIM_MASK (mode);
4378 else
4379 {} /* Parent will skip this parallel itself. */
4380 }
4381
4382 if (neuter_mask)
4383 {
4384 int ix, len;
4385
4386 if (nvptx_optimize)
4387 {
4388 /* Neuter whole SESE regions. */
4389 bb_pair_vec_t regions;
4390
4391 nvptx_find_sese (par->blocks, regions);
4392 len = regions.length ();
4393 for (ix = 0; ix != len; ix++)
4394 {
4395 basic_block from = regions[ix].first;
4396 basic_block to = regions[ix].second;
4397
4398 if (from)
4399 nvptx_single (neuter_mask, from, to);
4400 else
4401 gcc_assert (!to);
4402 }
4403 }
4404 else
4405 {
4406 /* Neuter each BB individually. */
4407 len = par->blocks.length ();
4408 for (ix = 0; ix != len; ix++)
4409 {
4410 basic_block block = par->blocks[ix];
4411
4412 nvptx_single (neuter_mask, block, block);
4413 }
4414 }
4415 }
4416
4417 if (skip_mask)
4418 nvptx_skip_par (skip_mask, par);
4419
4420 if (par->next)
4421 nvptx_neuter_pars (par->next, modes, outer);
4422 }
4423
4424 #if WORKAROUND_PTXJIT_BUG_2
4425 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4426 is needed in the nvptx target because the branches generated for
4427    partitioning are NONJUMP_INSN_P, not JUMP_P.  */
4428
4429 static rtx
4430 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4431 {
4432 rtx pat;
4433 if ((strict && !JUMP_P (insn))
4434 || (!strict && !INSN_P (insn)))
4435 return NULL_RTX;
4436 pat = PATTERN (insn);
4437
4438 /* The set is allowed to appear either as the insn pattern or
4439 the first set in a PARALLEL. */
4440 if (GET_CODE (pat) == PARALLEL)
4441 pat = XVECEXP (pat, 0, 0);
4442 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4443 return pat;
4444
4445 return NULL_RTX;
4446 }
4447
4448 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4449
4450 static rtx
4451 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4452 {
4453 rtx x = nvptx_pc_set (insn, strict);
4454
4455 if (!x)
4456 return NULL_RTX;
4457 x = SET_SRC (x);
4458 if (GET_CODE (x) == LABEL_REF)
4459 return x;
4460 if (GET_CODE (x) != IF_THEN_ELSE)
4461 return NULL_RTX;
4462 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4463 return XEXP (x, 1);
4464 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4465 return XEXP (x, 2);
4466 return NULL_RTX;
4467 }
4468
4469 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4470    insn in between the branch and the label.  This works around a JIT bug
4471 observed at driver version 384.111, at -O0 for sm_50. */
4472
4473 static void
4474 prevent_branch_around_nothing (void)
4475 {
4476 rtx_insn *seen_label = NULL;
4477 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4478 {
4479 if (INSN_P (insn) && condjump_p (insn))
4480 {
4481 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4482 continue;
4483 }
4484
4485 if (seen_label == NULL)
4486 continue;
4487
4488 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4489 continue;
4490
4491 if (INSN_P (insn))
4492 switch (recog_memoized (insn))
4493 {
4494 case CODE_FOR_nvptx_fork:
4495 case CODE_FOR_nvptx_forked:
4496 case CODE_FOR_nvptx_joining:
4497 case CODE_FOR_nvptx_join:
4498 continue;
4499 default:
4500 seen_label = NULL;
4501 continue;
4502 }
4503
4504 if (LABEL_P (insn) && insn == seen_label)
4505 emit_insn_before (gen_fake_nop (), insn);
4506
4507 seen_label = NULL;
4508 }
4509 }
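/* Schematically (an assumed illustration, not taken from real output): when
   the only thing between a conditional branch and its target label is the
   label itself, e.g.

	@%p0	bra	$L5;
     $L5:

   a dummy insn from gen_fake_nop is inserted before $L5 so that the branch
   no longer jumps around nothing.  */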
4510 #endif
4511
4512 /* PTX-specific reorganization
4513 - Split blocks at fork and join instructions
4514 - Compute live registers
4515 - Mark now-unused registers, so function begin doesn't declare
4516 unused registers.
4517 - Insert state propagation when entering partitioned mode
4518 - Insert neutering instructions when in single mode
4519 - Replace subregs with suitable sequences.
4520 */
4521
4522 static void
4523 nvptx_reorg (void)
4524 {
4525 /* We are freeing block_for_insn in the toplev to keep compatibility
4526 with old MDEP_REORGS that are not CFG based. Recompute it now. */
4527 compute_bb_for_insn ();
4528
4529 thread_prologue_and_epilogue_insns ();
4530
4531 /* Split blocks and record interesting unspecs. */
4532 bb_insn_map_t bb_insn_map;
4533
4534 nvptx_split_blocks (&bb_insn_map);
4535
4536 /* Compute live regs */
4537 df_clear_flags (DF_LR_RUN_DCE);
4538 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4539 df_live_add_problem ();
4540 df_live_set_all_dirty ();
4541 df_analyze ();
4542 regstat_init_n_sets_and_refs ();
4543
4544 if (dump_file)
4545 df_dump (dump_file);
4546
4547 /* Mark unused regs as unused. */
4548 int max_regs = max_reg_num ();
4549 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4550 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4551 regno_reg_rtx[i] = const0_rtx;
4552
4553 /* Determine launch dimensions of the function. If it is not an
4554 offloaded function (i.e. this is a regular compiler), the
4555 function has no neutering. */
4556 tree attr = oacc_get_fn_attrib (current_function_decl);
4557 if (attr)
4558 {
4559 /* If we determined this mask before RTL expansion, we could
4560 elide emission of some levels of forks and joins. */
4561 unsigned mask = 0;
4562 tree dims = TREE_VALUE (attr);
4563 unsigned ix;
4564
4565 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4566 {
4567 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4568 tree allowed = TREE_PURPOSE (dims);
4569
4570 if (size != 1 && !(allowed && integer_zerop (allowed)))
4571 mask |= GOMP_DIM_MASK (ix);
4572 }
4573 /* If there is worker neutering, there must be vector
4574 neutering. Otherwise the hardware will fail. */
4575 gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4576 || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4577
4578 /* Discover & process partitioned regions. */
4579 parallel *pars = nvptx_discover_pars (&bb_insn_map);
4580 nvptx_process_pars (pars);
4581 nvptx_neuter_pars (pars, mask, 0);
4582 delete pars;
4583 }
4584
4585 /* Replace subregs. */
4586 nvptx_reorg_subreg ();
4587
4588 if (TARGET_UNIFORM_SIMT)
4589 nvptx_reorg_uniform_simt ();
4590
4591 #if WORKAROUND_PTXJIT_BUG_2
4592 prevent_branch_around_nothing ();
4593 #endif
4594
4595 regstat_free_n_sets_and_refs ();
4596
4597 df_finish_pass (true);
4598 }
4599
4600 /* Handle a "kernel" attribute; arguments as in
4601 struct attribute_spec.handler. */
4602
4603 static tree
4604 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4605 int ARG_UNUSED (flags), bool *no_add_attrs)
4606 {
4607 tree decl = *node;
4608
4609 if (TREE_CODE (decl) != FUNCTION_DECL)
4610 {
4611 error ("%qE attribute only applies to functions", name);
4612 *no_add_attrs = true;
4613 }
4614 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4615 {
4616 error ("%qE attribute requires a void return type", name);
4617 *no_add_attrs = true;
4618 }
4619
4620 return NULL_TREE;
4621 }
4622
4623 /* Handle a "shared" attribute; arguments as in
4624 struct attribute_spec.handler. */
4625
4626 static tree
4627 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4628 int ARG_UNUSED (flags), bool *no_add_attrs)
4629 {
4630 tree decl = *node;
4631
4632 if (TREE_CODE (decl) != VAR_DECL)
4633 {
4634 error ("%qE attribute only applies to variables", name);
4635 *no_add_attrs = true;
4636 }
4637 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4638 {
4639 error ("%qE attribute not allowed with auto storage class", name);
4640 *no_add_attrs = true;
4641 }
4642
4643 return NULL_TREE;
4644 }
4645
4646 /* Table of valid machine attributes. */
4647 static const struct attribute_spec nvptx_attribute_table[] =
4648 {
4649 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4650 affects_type_identity, handler, exclude } */
4651 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
4652 NULL },
4653 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
4654 NULL },
4655 { NULL, 0, 0, false, false, false, false, NULL, NULL }
4656 };
4657
4658 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
4659
4660 static HOST_WIDE_INT
4661 nvptx_vector_alignment (const_tree type)
4662 {
4663 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4664
4665 return MIN (align, BIGGEST_ALIGNMENT);
4666 }
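/* The alignment here is derived from the vector type's total size, so e.g. a
   256-bit vector type that would otherwise request 256-bit alignment is
   limited to BIGGEST_ALIGNMENT.  */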
4667
4668 /* Indicate that INSN cannot be duplicated. */
4669
4670 static bool
4671 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4672 {
4673 switch (recog_memoized (insn))
4674 {
4675 case CODE_FOR_nvptx_shufflesi:
4676 case CODE_FOR_nvptx_shufflesf:
4677 case CODE_FOR_nvptx_barsync:
4678 case CODE_FOR_nvptx_fork:
4679 case CODE_FOR_nvptx_forked:
4680 case CODE_FOR_nvptx_joining:
4681 case CODE_FOR_nvptx_join:
4682 return true;
4683 default:
4684 return false;
4685 }
4686 }
4687
4688 /* Section anchors do not work.  Initialization for flag_section_anchors
4689 probes the existence of the anchoring target hooks and prevents
4690 anchoring if they don't exist. However, we may be being used with
4691 a host-side compiler that does support anchoring, and hence see
4692 the anchor flag set (as it's not recalculated). So provide an
4693 implementation denying anchoring. */
4694
4695 static bool
4696 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4697 {
4698 return false;
4699 }
4700
4701 /* Record a symbol for mkoffload to enter into the mapping table. */
4702
4703 static void
4704 nvptx_record_offload_symbol (tree decl)
4705 {
4706 switch (TREE_CODE (decl))
4707 {
4708 case VAR_DECL:
4709 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4710 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4711 break;
4712
4713 case FUNCTION_DECL:
4714 {
4715 tree attr = oacc_get_fn_attrib (decl);
4716 /* OpenMP offloading does not set this attribute. */
4717 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4718
4719 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4720 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4721
4722 for (; dims; dims = TREE_CHAIN (dims))
4723 {
4724 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4725
4726 gcc_assert (!TREE_PURPOSE (dims));
4727 fprintf (asm_out_file, ", %#x", size);
4728 }
4729
4730 fprintf (asm_out_file, "\n");
4731 }
4732 break;
4733
4734 default:
4735 gcc_unreachable ();
4736 }
4737 }
4738
4739 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
4740 at the start of a file. */
4741
4742 static void
4743 nvptx_file_start (void)
4744 {
4745 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4746 fputs ("\t.version\t3.1\n", asm_out_file);
4747 fputs ("\t.target\tsm_30\n", asm_out_file);
4748 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4749 fputs ("// END PREAMBLE\n", asm_out_file);
4750 }
4751
4752 /* Emit a declaration for a worker-level buffer in .shared memory. */
4753
4754 static void
4755 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4756 {
4757 const char *name = XSTR (sym, 0);
4758
4759 write_var_marker (file, true, false, name);
4760 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4761 align, name, size);
4762 }
4763
4764 /* Write out the function declarations we've collected and declare storage
4765 for the broadcast buffer. */
4766
4767 static void
4768 nvptx_file_end (void)
4769 {
4770 hash_table<tree_hasher>::iterator iter;
4771 tree decl;
4772 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4773 nvptx_record_fndecl (decl);
4774 fputs (func_decls.str().c_str(), asm_out_file);
4775
4776 if (worker_bcast_size)
4777 write_worker_buffer (asm_out_file, worker_bcast_sym,
4778 worker_bcast_align, worker_bcast_size);
4779
4780 if (worker_red_size)
4781 write_worker_buffer (asm_out_file, worker_red_sym,
4782 worker_red_align, worker_red_size);
4783
4784 if (need_softstack_decl)
4785 {
4786 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4787 /* 32 is the maximum number of warps in a block. Even though it's an
4788 external declaration, emit the array size explicitly; otherwise, it
4789 may fail at PTX JIT time if the definition is later in link order. */
4790 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4791 POINTER_SIZE);
4792 }
4793 if (need_unisimt_decl)
4794 {
4795 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4796 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4797 }
4798 }
4799
4800 /* Expander for the shuffle builtins. */
4801
4802 static rtx
4803 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4804 {
4805 if (ignore)
4806 return target;
4807
4808 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4809 NULL_RTX, mode, EXPAND_NORMAL);
4810 if (!REG_P (src))
4811 src = copy_to_mode_reg (mode, src);
4812
4813 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4814 NULL_RTX, SImode, EXPAND_NORMAL);
4815 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4816 NULL_RTX, SImode, EXPAND_NORMAL);
4817
4818 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4819 idx = copy_to_mode_reg (SImode, idx);
4820
4821 rtx pat = nvptx_gen_shuffle (target, src, idx,
4822 (nvptx_shuffle_kind) INTVAL (op));
4823 if (pat)
4824 emit_insn (pat);
4825
4826 return target;
4827 }
4828
4829 /* Worker reduction address expander. */
4830
4831 static rtx
4832 nvptx_expand_worker_addr (tree exp, rtx target,
4833 machine_mode ARG_UNUSED (mode), int ignore)
4834 {
4835 if (ignore)
4836 return target;
4837
4838 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4839 if (align > worker_red_align)
4840 worker_red_align = align;
4841
4842 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4843 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4844 if (size + offset > worker_red_size)
4845 worker_red_size = size + offset;
4846
4847 rtx addr = worker_red_sym;
4848 if (offset)
4849 {
4850 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4851 addr = gen_rtx_CONST (Pmode, addr);
4852 }
4853
4854 emit_move_insn (target, addr);
4855
4856 return target;
4857 }
4858
4859 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
4860 not require taking the address of any object, other than the memory
4861 cell being operated on. */
4862
4863 static rtx
4864 nvptx_expand_cmp_swap (tree exp, rtx target,
4865 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4866 {
4867 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4868
4869 if (!target)
4870 target = gen_reg_rtx (mode);
4871
4872 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4873 NULL_RTX, Pmode, EXPAND_NORMAL);
4874 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4875 NULL_RTX, mode, EXPAND_NORMAL);
4876 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4877 NULL_RTX, mode, EXPAND_NORMAL);
4878 rtx pat;
4879
4880 mem = gen_rtx_MEM (mode, mem);
4881 if (!REG_P (cmp))
4882 cmp = copy_to_mode_reg (mode, cmp);
4883 if (!REG_P (src))
4884 src = copy_to_mode_reg (mode, src);
4885
4886 if (mode == SImode)
4887 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4888 else
4889 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4890
4891 emit_insn (pat);
4892
4893 return target;
4894 }
4895
4896
4897 /* Codes for all the NVPTX builtins. */
4898 enum nvptx_builtins
4899 {
4900 NVPTX_BUILTIN_SHUFFLE,
4901 NVPTX_BUILTIN_SHUFFLELL,
4902 NVPTX_BUILTIN_WORKER_ADDR,
4903 NVPTX_BUILTIN_CMP_SWAP,
4904 NVPTX_BUILTIN_CMP_SWAPLL,
4905 NVPTX_BUILTIN_MAX
4906 };
4907
4908 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4909
4910 /* Return the NVPTX builtin for CODE. */
4911
4912 static tree
4913 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4914 {
4915 if (code >= NVPTX_BUILTIN_MAX)
4916 return error_mark_node;
4917
4918 return nvptx_builtin_decls[code];
4919 }
4920
4921 /* Set up all builtin functions for this target. */
4922
4923 static void
4924 nvptx_init_builtins (void)
4925 {
4926 #define DEF(ID, NAME, T) \
4927 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
4928 = add_builtin_function ("__builtin_nvptx_" NAME, \
4929 build_function_type_list T, \
4930 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4931 #define ST sizetype
4932 #define UINT unsigned_type_node
4933 #define LLUINT long_long_unsigned_type_node
4934 #define PTRVOID ptr_type_node
4935
4936 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4937 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4938 DEF (WORKER_ADDR, "worker_addr",
4939 (PTRVOID, ST, UINT, UINT, NULL_TREE));
4940 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4941 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4942
4943 #undef DEF
4944 #undef ST
4945 #undef UINT
4946 #undef LLUINT
4947 #undef PTRVOID
4948 }
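/* For reference, the builtins defined above have these C-level signatures
   (the first type in each DEF list is the return type):

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
						   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (sizetype, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *,
						    unsigned long long,
						    unsigned long long);  */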
4949
4950 /* Expand an expression EXP that calls a built-in function,
4951 with result going to TARGET if that's convenient
4952 (and in mode MODE if that's convenient).
4953 SUBTARGET may be used as the target for computing one of EXP's operands.
4954 IGNORE is nonzero if the value is to be ignored. */
4955
4956 static rtx
4957 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
4958 machine_mode mode, int ignore)
4959 {
4960 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
4961 switch (DECL_FUNCTION_CODE (fndecl))
4962 {
4963 case NVPTX_BUILTIN_SHUFFLE:
4964 case NVPTX_BUILTIN_SHUFFLELL:
4965 return nvptx_expand_shuffle (exp, target, mode, ignore);
4966
4967 case NVPTX_BUILTIN_WORKER_ADDR:
4968 return nvptx_expand_worker_addr (exp, target, mode, ignore);
4969
4970 case NVPTX_BUILTIN_CMP_SWAP:
4971 case NVPTX_BUILTIN_CMP_SWAPLL:
4972 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
4973
4974 default: gcc_unreachable ();
4975 }
4976 }
4977
4978 /* Define dimension sizes for known hardware. */
4979 #define PTX_VECTOR_LENGTH 32
4980 #define PTX_WORKER_LENGTH 32
4981 #define PTX_GANG_DEFAULT 0 /* Defer to runtime. */
4982
4983 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
4984
4985 static int
4986 nvptx_simt_vf ()
4987 {
4988 return PTX_VECTOR_LENGTH;
4989 }
4990
4991 /* Validate compute dimensions of an OpenACC offload or routine, fill
4992 in non-unity defaults. FN_LEVEL indicates the level at which a
4993 routine might spawn a loop. It is negative for non-routines. If
4994 DECL is null, we are validating the default dimensions. */
4995
4996 static bool
4997 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
4998 {
4999 bool changed = false;
5000
5001 /* The vector size must be 32, unless this is a SEQ routine. */
5002 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
5003 && dims[GOMP_DIM_VECTOR] >= 0
5004 && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
5005 {
5006 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
5007 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5008 dims[GOMP_DIM_VECTOR]
5009 ? G_("using vector_length (%d), ignoring %d")
5010 : G_("using vector_length (%d), ignoring runtime setting"),
5011 PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
5012 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5013 changed = true;
5014 }
5015
5016   /* Check that num_workers is not too large.  */
5017 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5018 {
5019 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5020 "using num_workers (%d), ignoring %d",
5021 PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
5022 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5023 changed = true;
5024 }
5025
5026 if (!decl)
5027 {
5028 dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
5029 if (dims[GOMP_DIM_WORKER] < 0)
5030 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5031 if (dims[GOMP_DIM_GANG] < 0)
5032 dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
5033 changed = true;
5034 }
5035
5036 return changed;
5037 }
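/* Concretely: an explicit vector_length other than 32 is forced to
   PTX_VECTOR_LENGTH (with a warning for non-routine code), num_workers above
   32 is warned about and clamped to PTX_WORKER_LENGTH, and the default
   dimensions (DECL is null) become vector 32, worker 32 and gang deferred to
   the runtime.  */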
5038
5039 /* Return maximum dimension size, or zero for unbounded. */
5040
5041 static int
5042 nvptx_dim_limit (int axis)
5043 {
5044 switch (axis)
5045 {
5046 case GOMP_DIM_WORKER:
5047 return PTX_WORKER_LENGTH;
5048
5049 case GOMP_DIM_VECTOR:
5050 return PTX_VECTOR_LENGTH;
5051
5052 default:
5053 break;
5054 }
5055 return 0;
5056 }
5057
5058 /* Determine whether fork & joins are needed. */
5059
5060 static bool
5061 nvptx_goacc_fork_join (gcall *call, const int dims[],
5062 bool ARG_UNUSED (is_fork))
5063 {
5064 tree arg = gimple_call_arg (call, 2);
5065 unsigned axis = TREE_INT_CST_LOW (arg);
5066
5067 /* We only care about worker and vector partitioning. */
5068 if (axis < GOMP_DIM_WORKER)
5069 return false;
5070
5071 /* If the size is 1, there's no partitioning. */
5072 if (dims[axis] == 1)
5073 return false;
5074
5075 return true;
5076 }
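/* In other words: gang-level fork/join markers are never needed here, and
   worker or vector markers are only kept when that axis is actually
   partitioned (its dimension is not 1).  */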
5077
5078 /* Generate a PTX builtin function call that returns the address in
5079 the worker reduction buffer at OFFSET. TYPE is the type of the
5080 data at that location. */
5081
5082 static tree
5083 nvptx_get_worker_red_addr (tree type, tree offset)
5084 {
5085 machine_mode mode = TYPE_MODE (type);
5086 tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
5087 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5088 tree align = build_int_cst (unsigned_type_node,
5089 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5090 tree call = build_call_expr (fndecl, 3, offset, size, align);
5091
5092 return fold_convert (build_pointer_type (type), call);
5093 }
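/* The call built above is effectively
     (TYPE *) __builtin_nvptx_worker_addr (OFFSET, size, align)
   with size and align taken from TYPE's mode, i.e. a pointer into the shared
   worker reduction buffer that nvptx_file_end later declares.  */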
5094
5095 /* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
5096 will cast the variable if necessary. */
5097
5098 static void
5099 nvptx_generate_vector_shuffle (location_t loc,
5100 tree dest_var, tree var, unsigned shift,
5101 gimple_seq *seq)
5102 {
5103 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5104 tree_code code = NOP_EXPR;
5105 tree arg_type = unsigned_type_node;
5106 tree var_type = TREE_TYPE (var);
5107 tree dest_type = var_type;
5108
5109 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5110 var_type = TREE_TYPE (var_type);
5111
5112 if (TREE_CODE (var_type) == REAL_TYPE)
5113 code = VIEW_CONVERT_EXPR;
5114
5115 if (TYPE_SIZE (var_type)
5116 == TYPE_SIZE (long_long_unsigned_type_node))
5117 {
5118 fn = NVPTX_BUILTIN_SHUFFLELL;
5119 arg_type = long_long_unsigned_type_node;
5120 }
5121
5122 tree call = nvptx_builtin_decl (fn, true);
5123 tree bits = build_int_cst (unsigned_type_node, shift);
5124 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5125 tree expr;
5126
5127 if (var_type != dest_type)
5128 {
5129 /* Do real and imaginary parts separately. */
5130 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5131 real = fold_build1 (code, arg_type, real);
5132 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5133 real = fold_build1 (code, var_type, real);
5134
5135 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5136 imag = fold_build1 (code, arg_type, imag);
5137 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5138 imag = fold_build1 (code, var_type, imag);
5139
5140 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5141 }
5142 else
5143 {
5144 expr = fold_build1 (code, arg_type, var);
5145 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5146 expr = fold_build1 (code, dest_type, expr);
5147 }
5148
5149 gimplify_assign (dest_var, expr, seq);
5150 }
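/* For example, a 64-bit double goes through a single shufflell call via
   VIEW_CONVERT_EXPR, whereas a complex double is shuffled as two separate
   64-bit parts (real and imaginary) and then recombined.  */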
5151
5152 /* Lazily generate the global lock var decl and return its address. */
5153
5154 static tree
5155 nvptx_global_lock_addr ()
5156 {
5157 tree v = global_lock_var;
5158
5159 if (!v)
5160 {
5161 tree name = get_identifier ("__reduction_lock");
5162 tree type = build_qualified_type (unsigned_type_node,
5163 TYPE_QUAL_VOLATILE);
5164 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5165 global_lock_var = v;
5166 DECL_ARTIFICIAL (v) = 1;
5167 DECL_EXTERNAL (v) = 1;
5168 TREE_STATIC (v) = 1;
5169 TREE_PUBLIC (v) = 1;
5170 TREE_USED (v) = 1;
5171 mark_addressable (v);
5172 mark_decl_referenced (v);
5173 }
5174
5175 return build_fold_addr_expr (v);
5176 }
5177
5178 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5179    GSI.  We use a lockless scheme for nearly all cases, which looks
5180 like:
5181 actual = initval(OP);
5182 do {
5183 guess = actual;
5184 write = guess OP myval;
5185 actual = cmp&swap (ptr, guess, write)
5186 } while (actual bit-different-to guess);
5187 return write;
5188
5189 This relies on a cmp&swap instruction, which is available for 32-
5190 and 64-bit types. Larger types must use a locking scheme. */
5191
5192 static tree
5193 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5194 tree ptr, tree var, tree_code op)
5195 {
5196 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5197 tree_code code = NOP_EXPR;
5198 tree arg_type = unsigned_type_node;
5199 tree var_type = TREE_TYPE (var);
5200
5201 if (TREE_CODE (var_type) == COMPLEX_TYPE
5202 || TREE_CODE (var_type) == REAL_TYPE)
5203 code = VIEW_CONVERT_EXPR;
5204
5205 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5206 {
5207 arg_type = long_long_unsigned_type_node;
5208 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5209 }
5210
5211 tree swap_fn = nvptx_builtin_decl (fn, true);
5212
5213 gimple_seq init_seq = NULL;
5214 tree init_var = make_ssa_name (arg_type);
5215 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5216 init_expr = fold_build1 (code, arg_type, init_expr);
5217 gimplify_assign (init_var, init_expr, &init_seq);
5218 gimple *init_end = gimple_seq_last (init_seq);
5219
5220 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5221
5222 /* Split the block just after the init stmts. */
5223 basic_block pre_bb = gsi_bb (*gsi);
5224 edge pre_edge = split_block (pre_bb, init_end);
5225 basic_block loop_bb = pre_edge->dest;
5226 pre_bb = pre_edge->src;
5227 /* Reset the iterator. */
5228 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5229
5230 tree expect_var = make_ssa_name (arg_type);
5231 tree actual_var = make_ssa_name (arg_type);
5232 tree write_var = make_ssa_name (arg_type);
5233
5234 /* Build and insert the reduction calculation. */
5235 gimple_seq red_seq = NULL;
5236 tree write_expr = fold_build1 (code, var_type, expect_var);
5237 write_expr = fold_build2 (op, var_type, write_expr, var);
5238 write_expr = fold_build1 (code, arg_type, write_expr);
5239 gimplify_assign (write_var, write_expr, &red_seq);
5240
5241 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5242
5243 /* Build & insert the cmp&swap sequence. */
5244 gimple_seq latch_seq = NULL;
5245 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5246 ptr, expect_var, write_var);
5247 gimplify_assign (actual_var, swap_expr, &latch_seq);
5248
5249 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5250 NULL_TREE, NULL_TREE);
5251 gimple_seq_add_stmt (&latch_seq, cond);
5252
5253 gimple *latch_end = gimple_seq_last (latch_seq);
5254 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5255
5256 /* Split the block just after the latch stmts. */
5257 edge post_edge = split_block (loop_bb, latch_end);
5258 basic_block post_bb = post_edge->dest;
5259 loop_bb = post_edge->src;
5260 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5261
5262 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5263 post_edge->probability = profile_probability::even ();
5264 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5265 loop_edge->probability = profile_probability::even ();
5266 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5267 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5268
5269 gphi *phi = create_phi_node (expect_var, loop_bb);
5270 add_phi_arg (phi, init_var, pre_edge, loc);
5271 add_phi_arg (phi, actual_var, loop_edge, loc);
5272
5273 loop *loop = alloc_loop ();
5274 loop->header = loop_bb;
5275 loop->latch = loop_bb;
5276 add_loop (loop, loop_bb->loop_father);
5277
5278 return fold_build1 (code, var_type, write_var);
5279 }
5280
5281 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5282 GSI. This is necessary for types larger than 64 bits, where there
5283 is no cmp&swap instruction to implement a lockless scheme. We use
5284 a lock variable in global memory.
5285
5286 while (cmp&swap (&lock_var, 0, 1))
5287 continue;
5288 T accum = *ptr;
5289 accum = accum OP var;
5290 *ptr = accum;
5291 cmp&swap (&lock_var, 1, 0);
5292 return accum;
5293
5294 A lock in global memory is necessary to force execution engine
5295 descheduling and avoid resource starvation that can occur if the
5296 lock is in .shared memory. */
5297
5298 static tree
5299 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
5300 tree ptr, tree var, tree_code op)
5301 {
5302 tree var_type = TREE_TYPE (var);
5303 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
5304 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
5305 tree uns_locked = build_int_cst (unsigned_type_node, 1);
5306
5307 /* Split the block just before the gsi. Insert a gimple nop to make
5308 this easier. */
5309 gimple *nop = gimple_build_nop ();
5310 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
5311 basic_block entry_bb = gsi_bb (*gsi);
5312 edge entry_edge = split_block (entry_bb, nop);
5313 basic_block lock_bb = entry_edge->dest;
5314 /* Reset the iterator. */
5315 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5316
5317 /* Build and insert the locking sequence. */
5318 gimple_seq lock_seq = NULL;
5319 tree lock_var = make_ssa_name (unsigned_type_node);
5320 tree lock_expr = nvptx_global_lock_addr ();
5321 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
5322 uns_unlocked, uns_locked);
5323 gimplify_assign (lock_var, lock_expr, &lock_seq);
5324 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
5325 NULL_TREE, NULL_TREE);
5326 gimple_seq_add_stmt (&lock_seq, cond);
5327 gimple *lock_end = gimple_seq_last (lock_seq);
5328 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
5329
5330 /* Split the block just after the lock sequence. */
5331 edge locked_edge = split_block (lock_bb, lock_end);
5332 basic_block update_bb = locked_edge->dest;
5333 lock_bb = locked_edge->src;
5334 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5335
5336 /* Create the lock loop ... */
5337 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5338 locked_edge->probability = profile_probability::even ();
5339 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
5340 loop_edge->probability = profile_probability::even ();
5341 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
5342 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
5343
5344 /* ... and the loop structure. */
5345 loop *lock_loop = alloc_loop ();
5346 lock_loop->header = lock_bb;
5347 lock_loop->latch = lock_bb;
5348 lock_loop->nb_iterations_estimate = 1;
5349 lock_loop->any_estimate = true;
5350 add_loop (lock_loop, entry_bb->loop_father);
5351
5352 /* Build and insert the reduction calculation. */
5353 gimple_seq red_seq = NULL;
5354 tree acc_in = make_ssa_name (var_type);
5355 tree ref_in = build_simple_mem_ref (ptr);
5356 TREE_THIS_VOLATILE (ref_in) = 1;
5357 gimplify_assign (acc_in, ref_in, &red_seq);
5358
5359 tree acc_out = make_ssa_name (var_type);
5360 tree update_expr = fold_build2 (op, var_type, ref_in, var);
5361 gimplify_assign (acc_out, update_expr, &red_seq);
5362
5363 tree ref_out = build_simple_mem_ref (ptr);
5364 TREE_THIS_VOLATILE (ref_out) = 1;
5365 gimplify_assign (ref_out, acc_out, &red_seq);
5366
5367 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5368
5369 /* Build & insert the unlock sequence. */
5370 gimple_seq unlock_seq = NULL;
5371 tree unlock_expr = nvptx_global_lock_addr ();
5372 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
5373 uns_locked, uns_unlocked);
5374 gimplify_and_add (unlock_expr, &unlock_seq);
5375 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
5376
5377 return acc_out;
5378 }
5379
5380 /* Emit a sequence to update a reduction accumulator at *PTR with the
5381 value held in VAR using operator OP. Return the updated value.
5382
5383 TODO: optimize for atomic ops and independent complex ops. */
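/* For example, a 32-bit 'float' or 64-bit 'double' reduction matches the
   size of unsigned or long long unsigned and takes the cmp&swap based
   nvptx_lockless_update path, whereas a 128-bit '_Complex double' falls
   back to the lock-based nvptx_lockfull_update path. */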
5384
5385 static tree
5386 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
5387 tree ptr, tree var, tree_code op)
5388 {
5389 tree type = TREE_TYPE (var);
5390 tree size = TYPE_SIZE (type);
5391
5392 if (size == TYPE_SIZE (unsigned_type_node)
5393 || size == TYPE_SIZE (long_long_unsigned_type_node))
5394 return nvptx_lockless_update (loc, gsi, ptr, var, op);
5395 else
5396 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
5397 }
5398
5399 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
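/* Loosely, for a worker-level reduction the internal call is replaced by
   something like (a sketch in terms of the locals used below):

     var = *ref_to_res;                 (only if a receiver object exists)
     ptr = nvptx_get_worker_red_addr (type, offset);
     *(volatile T *)ptr = var;          (publish to the worker buffer)
     lhs = var;

   At gang level the incoming value is simply passed through to LHS. */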
5400
5401 static void
5402 nvptx_goacc_reduction_setup (gcall *call)
5403 {
5404 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5405 tree lhs = gimple_call_lhs (call);
5406 tree var = gimple_call_arg (call, 2);
5407 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5408 gimple_seq seq = NULL;
5409
5410 push_gimplify_context (true);
5411
5412 if (level != GOMP_DIM_GANG)
5413 {
5414 /* Copy the receiver object. */
5415 tree ref_to_res = gimple_call_arg (call, 1);
5416
5417 if (!integer_zerop (ref_to_res))
5418 var = build_simple_mem_ref (ref_to_res);
5419 }
5420
5421 if (level == GOMP_DIM_WORKER)
5422 {
5423 /* Store incoming value to worker reduction buffer. */
5424 tree offset = gimple_call_arg (call, 5);
5425 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5426 tree ptr = make_ssa_name (TREE_TYPE (call));
5427
5428 gimplify_assign (ptr, call, &seq);
5429 tree ref = build_simple_mem_ref (ptr);
5430 TREE_THIS_VOLATILE (ref) = 1;
5431 gimplify_assign (ref, var, &seq);
5432 }
5433
5434 if (lhs)
5435 gimplify_assign (lhs, var, &seq);
5436
5437 pop_gimplify_context (NULL);
5438 gsi_replace_with_seq (&gsi, seq, true);
5439 }
5440
5441 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
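/* For the vector level this builds, approximately:

     tid = GOACC_DIM_POS (GOMP_DIM_VECTOR);
     if (tid != 0)
       lhs = INIT_VAL (OP);     (neutral element of the reduction)
     else
       lhs = var;               (lane zero keeps the incoming value)

   realized as a CFG diamond with a PHI node merging the two values.
   Gang and worker levels reduce to a single assignment of LHS. */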
5442
5443 static void
5444 nvptx_goacc_reduction_init (gcall *call)
5445 {
5446 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5447 tree lhs = gimple_call_lhs (call);
5448 tree var = gimple_call_arg (call, 2);
5449 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5450 enum tree_code rcode
5451 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5452 tree init = omp_reduction_init_op (gimple_location (call), rcode,
5453 TREE_TYPE (var));
5454 gimple_seq seq = NULL;
5455
5456 push_gimplify_context (true);
5457
5458 if (level == GOMP_DIM_VECTOR)
5459 {
5460 /* Initialize the non-zero vector lanes (tid != 0) to INIT_VAL (OP). */
5461 tree tid = make_ssa_name (integer_type_node);
5462 tree dim_vector = gimple_call_arg (call, 3);
5463 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
5464 dim_vector);
5465 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
5466 NULL_TREE, NULL_TREE);
5467
5468 gimple_call_set_lhs (tid_call, tid);
5469 gimple_seq_add_stmt (&seq, tid_call);
5470 gimple_seq_add_stmt (&seq, cond_stmt);
5471
5472 /* Split the block just after the call. */
5473 edge init_edge = split_block (gsi_bb (gsi), call);
5474 basic_block init_bb = init_edge->dest;
5475 basic_block call_bb = init_edge->src;
5476
5477 /* Fixup flags from call_bb to init_bb. */
5478 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
5479 init_edge->probability = profile_probability::even ();
5480
5481 /* Set the initialization stmts. */
5482 gimple_seq init_seq = NULL;
5483 tree init_var = make_ssa_name (TREE_TYPE (var));
5484 gimplify_assign (init_var, init, &init_seq);
5485 gsi = gsi_start_bb (init_bb);
5486 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
5487
5488 /* Split block just after the init stmt. */
5489 gsi_prev (&gsi);
5490 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
5491 basic_block dst_bb = inited_edge->dest;
5492
5493 /* Create false edge from call_bb to dst_bb. */
5494 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
5495 nop_edge->probability = profile_probability::even ();
5496
5497 /* Create phi node in dst block. */
5498 gphi *phi = create_phi_node (lhs, dst_bb);
5499 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
5500 add_phi_arg (phi, var, nop_edge, gimple_location (call));
5501
5502 /* Reset dominator of dst bb. */
5503 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
5504
5505 /* Reset the gsi. */
5506 gsi = gsi_for_stmt (call);
5507 }
5508 else
5509 {
5510 if (level == GOMP_DIM_GANG)
5511 {
5512 /* If there's no receiver object, propagate the incoming VAR. */
5513 tree ref_to_res = gimple_call_arg (call, 1);
5514 if (integer_zerop (ref_to_res))
5515 init = var;
5516 }
5517
5518 gimplify_assign (lhs, init, &seq);
5519 }
5520
5521 pop_gimplify_context (NULL);
5522 gsi_replace_with_seq (&gsi, seq, true);
5523 }
5524
5525 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
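/* At the vector level the reduction is an in-warp tree of shuffles; with
   PTX_VECTOR_LENGTH == 32 the unrolled sequence is roughly:

     t = shuffle (var, 16);  var = var OP t;
     t = shuffle (var, 8);   var = var OP t;
     t = shuffle (var, 4);   var = var OP t;
     t = shuffle (var, 2);   var = var OP t;
     t = shuffle (var, 1);   var = var OP t;

   Worker and gang levels instead update the accumulator in memory via
   nvptx_reduction_update. */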
5526
5527 static void
5528 nvptx_goacc_reduction_fini (gcall *call)
5529 {
5530 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5531 tree lhs = gimple_call_lhs (call);
5532 tree ref_to_res = gimple_call_arg (call, 1);
5533 tree var = gimple_call_arg (call, 2);
5534 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5535 enum tree_code op
5536 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
5537 gimple_seq seq = NULL;
5538 tree r = NULL_TREE;
5539
5540 push_gimplify_context (true);
5541
5542 if (level == GOMP_DIM_VECTOR)
5543 {
5544 /* Emit a binary shuffle reduction tree.  TODO: emit this as an
5545 actual loop, but that requires a method of emitting a unified
5546 jump at the gimple level. */
5547 for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
5548 {
5549 tree other_var = make_ssa_name (TREE_TYPE (var));
5550 nvptx_generate_vector_shuffle (gimple_location (call),
5551 other_var, var, shfl, &seq);
5552
5553 r = make_ssa_name (TREE_TYPE (var));
5554 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
5555 var, other_var), &seq);
5556 var = r;
5557 }
5558 }
5559 else
5560 {
5561 tree accum = NULL_TREE;
5562
5563 if (level == GOMP_DIM_WORKER)
5564 {
5565 /* Get reduction buffer address. */
5566 tree offset = gimple_call_arg (call, 5);
5567 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5568 tree ptr = make_ssa_name (TREE_TYPE (call));
5569
5570 gimplify_assign (ptr, call, &seq);
5571 accum = ptr;
5572 }
5573 else if (integer_zerop (ref_to_res))
5574 r = var;
5575 else
5576 accum = ref_to_res;
5577
5578 if (accum)
5579 {
5580 /* UPDATE the accumulator. */
5581 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5582 seq = NULL;
5583 r = nvptx_reduction_update (gimple_location (call), &gsi,
5584 accum, var, op);
5585 }
5586 }
5587
5588 if (lhs)
5589 gimplify_assign (lhs, r, &seq);
5590 pop_gimplify_context (NULL);
5591
5592 gsi_replace_with_seq (&gsi, seq, true);
5593 }
5594
5595 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
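/* Roughly: at worker level, reload the value from the (volatile) worker
   reduction buffer; at worker and vector levels, write it back through
   REF_TO_RES if a receiver object was provided; finally forward the
   value through the LHS. */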
5596
5597 static void
5598 nvptx_goacc_reduction_teardown (gcall *call)
5599 {
5600 gimple_stmt_iterator gsi = gsi_for_stmt (call);
5601 tree lhs = gimple_call_lhs (call);
5602 tree var = gimple_call_arg (call, 2);
5603 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
5604 gimple_seq seq = NULL;
5605
5606 push_gimplify_context (true);
5607 if (level == GOMP_DIM_WORKER)
5608 {
5609 /* Read the worker reduction buffer. */
5610 tree offset = gimple_call_arg (call, 5);
5611 tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
5612 tree ptr = make_ssa_name (TREE_TYPE (call));
5613
5614 gimplify_assign (ptr, call, &seq);
5615 var = build_simple_mem_ref (ptr);
5616 TREE_THIS_VOLATILE (var) = 1;
5617 }
5618
5619 if (level != GOMP_DIM_GANG)
5620 {
5621 /* Write to the receiver object. */
5622 tree ref_to_res = gimple_call_arg (call, 1);
5623
5624 if (!integer_zerop (ref_to_res))
5625 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
5626 }
5627
5628 if (lhs)
5629 gimplify_assign (lhs, var, &seq);
5630
5631 pop_gimplify_context (NULL);
5632
5633 gsi_replace_with_seq (&gsi, seq, true);
5634 }
5635
5636 /* NVPTX reduction expander. */
5637
5638 static void
5639 nvptx_goacc_reduction (gcall *call)
5640 {
5641 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
5642
5643 switch (code)
5644 {
5645 case IFN_GOACC_REDUCTION_SETUP:
5646 nvptx_goacc_reduction_setup (call);
5647 break;
5648
5649 case IFN_GOACC_REDUCTION_INIT:
5650 nvptx_goacc_reduction_init (call);
5651 break;
5652
5653 case IFN_GOACC_REDUCTION_FINI:
5654 nvptx_goacc_reduction_fini (call);
5655 break;
5656
5657 case IFN_GOACC_REDUCTION_TEARDOWN:
5658 nvptx_goacc_reduction_teardown (call);
5659 break;
5660
5661 default:
5662 gcc_unreachable ();
5663 }
5664 }
5665
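/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Never force constants into
   memory; the nvptx backend does not emit a constant pool. */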
5666 static bool
5667 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
5668 rtx x ATTRIBUTE_UNUSED)
5669 {
5670 return true;
5671 }
5672
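/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only V2SImode and V2DImode
   vectors are supported. */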
5673 static bool
5674 nvptx_vector_mode_supported (machine_mode mode)
5675 {
5676 return (mode == V2SImode
5677 || mode == V2DImode);
5678 }
5679
5680 /* Return the preferred mode for vectorizing scalar MODE. */
5681
5682 static machine_mode
5683 nvptx_preferred_simd_mode (scalar_mode mode)
5684 {
5685 switch (mode)
5686 {
5687 case E_DImode:
5688 return V2DImode;
5689 case E_SImode:
5690 return V2SImode;
5691
5692 default:
5693 return default_preferred_simd_mode (mode);
5694 }
5695 }
5696
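/* Return the alignment, in bits, to use for an object of TYPE whose
   suggested alignment is BASIC_ALIGN.  TImode-sized integers are limited
   to the alignment of the mode they are split into, since TImode is not
   handled as a single register. */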
5697 unsigned int
5698 nvptx_data_alignment (const_tree type, unsigned int basic_align)
5699 {
5700 if (TREE_CODE (type) == INTEGER_TYPE)
5701 {
5702 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
5703 if (size == GET_MODE_SIZE (TImode))
5704 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
5705 }
5706
5707 return basic_align;
5708 }
5709
5710 /* Implement TARGET_MODES_TIEABLE_P. */
5711
5712 static bool
5713 nvptx_modes_tieable_p (machine_mode, machine_mode)
5714 {
5715 return false;
5716 }
5717
5718 /* Implement TARGET_HARD_REGNO_NREGS. */
5719
5720 static unsigned int
5721 nvptx_hard_regno_nregs (unsigned int, machine_mode)
5722 {
5723 return 1;
5724 }
5725
5726 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
5727
5728 static bool
5729 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
5730 {
5731 return false;
5732 }
5733
5734 #undef TARGET_OPTION_OVERRIDE
5735 #define TARGET_OPTION_OVERRIDE nvptx_option_override
5736
5737 #undef TARGET_ATTRIBUTE_TABLE
5738 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
5739
5740 #undef TARGET_LRA_P
5741 #define TARGET_LRA_P hook_bool_void_false
5742
5743 #undef TARGET_LEGITIMATE_ADDRESS_P
5744 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
5745
5746 #undef TARGET_PROMOTE_FUNCTION_MODE
5747 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
5748
5749 #undef TARGET_FUNCTION_ARG
5750 #define TARGET_FUNCTION_ARG nvptx_function_arg
5751 #undef TARGET_FUNCTION_INCOMING_ARG
5752 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
5753 #undef TARGET_FUNCTION_ARG_ADVANCE
5754 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
5755 #undef TARGET_FUNCTION_ARG_BOUNDARY
5756 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
5757 #undef TARGET_PASS_BY_REFERENCE
5758 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
5759 #undef TARGET_FUNCTION_VALUE_REGNO_P
5760 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
5761 #undef TARGET_FUNCTION_VALUE
5762 #define TARGET_FUNCTION_VALUE nvptx_function_value
5763 #undef TARGET_LIBCALL_VALUE
5764 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
5765 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
5766 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
5767 #undef TARGET_GET_DRAP_RTX
5768 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
5769 #undef TARGET_SPLIT_COMPLEX_ARG
5770 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
5771 #undef TARGET_RETURN_IN_MEMORY
5772 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
5773 #undef TARGET_OMIT_STRUCT_RETURN_REG
5774 #define TARGET_OMIT_STRUCT_RETURN_REG true
5775 #undef TARGET_STRICT_ARGUMENT_NAMING
5776 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
5777 #undef TARGET_CALL_ARGS
5778 #define TARGET_CALL_ARGS nvptx_call_args
5779 #undef TARGET_END_CALL_ARGS
5780 #define TARGET_END_CALL_ARGS nvptx_end_call_args
5781
5782 #undef TARGET_ASM_FILE_START
5783 #define TARGET_ASM_FILE_START nvptx_file_start
5784 #undef TARGET_ASM_FILE_END
5785 #define TARGET_ASM_FILE_END nvptx_file_end
5786 #undef TARGET_ASM_GLOBALIZE_LABEL
5787 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
5788 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
5789 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
5790 #undef TARGET_PRINT_OPERAND
5791 #define TARGET_PRINT_OPERAND nvptx_print_operand
5792 #undef TARGET_PRINT_OPERAND_ADDRESS
5793 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
5794 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
5795 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
5796 #undef TARGET_ASM_INTEGER
5797 #define TARGET_ASM_INTEGER nvptx_assemble_integer
5798 #undef TARGET_ASM_DECL_END
5799 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
5800 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
5801 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
5802 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
5803 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
5804 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
5805 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
5806
5807 #undef TARGET_MACHINE_DEPENDENT_REORG
5808 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
5809 #undef TARGET_NO_REGISTER_ALLOCATION
5810 #define TARGET_NO_REGISTER_ALLOCATION true
5811
5812 #undef TARGET_ENCODE_SECTION_INFO
5813 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
5814 #undef TARGET_RECORD_OFFLOAD_SYMBOL
5815 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
5816
5817 #undef TARGET_VECTOR_ALIGNMENT
5818 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
5819
5820 #undef TARGET_CANNOT_COPY_INSN_P
5821 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
5822
5823 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
5824 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
5825
5826 #undef TARGET_INIT_BUILTINS
5827 #define TARGET_INIT_BUILTINS nvptx_init_builtins
5828 #undef TARGET_EXPAND_BUILTIN
5829 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
5830 #undef TARGET_BUILTIN_DECL
5831 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
5832
5833 #undef TARGET_SIMT_VF
5834 #define TARGET_SIMT_VF nvptx_simt_vf
5835
5836 #undef TARGET_GOACC_VALIDATE_DIMS
5837 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
5838
5839 #undef TARGET_GOACC_DIM_LIMIT
5840 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
5841
5842 #undef TARGET_GOACC_FORK_JOIN
5843 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
5844
5845 #undef TARGET_GOACC_REDUCTION
5846 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
5847
5848 #undef TARGET_CANNOT_FORCE_CONST_MEM
5849 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
5850
5851 #undef TARGET_VECTOR_MODE_SUPPORTED_P
5852 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
5853
5854 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
5855 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
5856 nvptx_preferred_simd_mode
5857
5858 #undef TARGET_MODES_TIEABLE_P
5859 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
5860
5861 #undef TARGET_HARD_REGNO_NREGS
5862 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
5863
5864 #undef TARGET_CAN_CHANGE_MODE_CLASS
5865 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
5866
5867 struct gcc_target targetm = TARGET_INITIALIZER;
5868
5869 #include "gt-nvptx.h"
5870