/* Target code for NVPTX.
   Copyright (C) 2014-2020 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"

/* This file should be included last.  */
#include "target-def.h"

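/* Non-zero values enable workarounds for issues observed in the PTX JIT
   compiler; the affected code paths below are guarded by these macros.  */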
#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Cooperative Thread Array) maps to the CUDA concept
   of a thread block, which has had a maximum of 1024 threads since CUDA
   version 2.x.  */
#define PTX_CTA_SIZE 1024

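/* PTX exposes 16 numbered barriers per CTA (usable with bar.sync 0..15),
   and the hardware warp size is 32 threads.  */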
#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
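/* The area is kept in the three machine-dependent flag bits just above
   SYMBOL_FLAG_MACH_DEP_SHIFT (mask 7), so all nvptx_data_area values must
   stay in the range 0..7.  */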

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with %<-fopenacc%>", optname);
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables. */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a PTX type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with PTX idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  INNER_MODE is VOIDmode for a single register;
   otherwise a non-negative SUBREG_OFFSET names the piece at that
   offset, while a negative SUBREG_OFFSET writes both pieces of the
   pair, enclosed in braces for -1 and bare for -2.  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}


/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed; determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
{
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  return gen_reg_rtx (arg.mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
			     const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */
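/* For example (illustrative only): with FOR_REG < 0 and SImode this
   appends " (.param.u32 %in_ar0" to the prototype, while in a prologue
   (FOR_REG == 0) it emits "\t.reg.u32 %ar0;" followed by
   "\tld.param.u32 %ar0, [%in_ar0];".  */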

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}

/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a PTX prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL, that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */
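/* Illustrative only: for a 16-byte frame with 8-byte alignment, and
   assuming the frame pointer's entry in reg_names is "%frame", the
   emitted PTX is roughly
     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;  */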

static void
init_frame (FILE  *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  :  "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}

/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}

/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}

/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}

/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
        << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}

/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}

/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving a non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}

/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}

/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}

1532 /* Output a return instruction.  Also copy the return value to its outgoing
1533    location.  */
1534 
1535 const char *
nvptx_output_return(void)1536 nvptx_output_return (void)
1537 {
1538   machine_mode mode = (machine_mode)cfun->machine->return_mode;
1539 
1540   if (mode != VOIDmode)
1541     fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1542 	     nvptx_ptx_type_from_mode (mode, false),
1543 	     reg_names[NVPTX_RETURN_REGNUM],
1544 	     reg_names[NVPTX_RETURN_REGNUM]);
1545 
1546   return "ret;";
1547 }
1548 
1549 /* Terminate a function by writing a closing brace to FILE.  */
1550 
1551 void
nvptx_function_end(FILE * file)1552 nvptx_function_end (FILE *file)
1553 {
1554   fprintf (file, "}\n");
1555 }
1556 
1557 /* Decide whether we can make a sibling call to a function.  For ptx, we
1558    can't.  */
1559 
1560 static bool
1561 nvptx_function_ok_for_sibcall (tree, tree)
1562 {
1563   return false;
1564 }
1565 
1566 /* Return the Dynamic ReAlignment Pointer RTX.  PTX has one only with soft stacks.  */
1567 
1568 static rtx
1569 nvptx_get_drap_rtx (void)
1570 {
1571   if (TARGET_SOFT_STACK && stack_realign_drap)
1572     return arg_pointer_rtx;
1573   return NULL_RTX;
1574 }
1575 
1576 /* Implement the TARGET_CALL_ARGS hook.  Record information about one
1577    argument to the next call.  */
1578 
1579 static void
1580 nvptx_call_args (rtx arg, tree fntype)
1581 {
1582   if (!cfun->machine->doing_call)
1583     {
1584       cfun->machine->doing_call = true;
1585       cfun->machine->is_varadic = false;
1586       cfun->machine->num_args = 0;
1587 
1588       if (fntype && stdarg_p (fntype))
1589 	{
1590 	  cfun->machine->is_varadic = true;
1591 	  cfun->machine->has_varadic = true;
1592 	  cfun->machine->num_args++;
1593 	}
1594     }
1595 
1596   if (REG_P (arg) && arg != pc_rtx)
1597     {
1598       cfun->machine->num_args++;
1599       cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1600 						  cfun->machine->call_args);
1601     }
1602 }
1603 
1604 /* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
1605    information we recorded.  */
1606 
1607 static void
1608 nvptx_end_call_args (void)
1609 {
1610   cfun->machine->doing_call = false;
1611   free_EXPR_LIST_list (&cfun->machine->call_args);
1612 }
1613 
1614 /* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
1615    track of whether calls involving static chains or varargs were seen
1616    in the current function.
1617    For libcalls, maintain a hash table of decls we have seen, and
1618    record a function decl for later when encountering a new one.  */
1619 
1620 void
1621 nvptx_expand_call (rtx retval, rtx address)
1622 {
1623   rtx callee = XEXP (address, 0);
1624   rtx varargs = NULL_RTX;
1625   unsigned parallel = 0;
1626 
1627   if (!call_insn_operand (callee, Pmode))
1628     {
1629       callee = force_reg (Pmode, callee);
1630       address = change_address (address, QImode, callee);
1631     }
1632 
1633   if (GET_CODE (callee) == SYMBOL_REF)
1634     {
1635       tree decl = SYMBOL_REF_DECL (callee);
1636       if (decl != NULL_TREE)
1637 	{
1638 	  if (DECL_STATIC_CHAIN (decl))
1639 	    cfun->machine->has_chain = true;
1640 
1641 	  tree attr = oacc_get_fn_attrib (decl);
1642 	  if (attr)
1643 	    {
1644 	      tree dims = TREE_VALUE (attr);
1645 
1646 	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1647 	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1648 		{
1649 		  if (TREE_PURPOSE (dims)
1650 		      && !integer_zerop (TREE_PURPOSE (dims)))
1651 		    break;
1652 		  /* Not on this axis.  */
1653 		  parallel ^= GOMP_DIM_MASK (ix);
1654 		  dims = TREE_CHAIN (dims);
1655 		}
1656 	    }
1657 	}
1658     }
1659 
1660   unsigned nargs = cfun->machine->num_args;
1661   if (cfun->machine->is_varadic)
1662     {
1663       varargs = gen_reg_rtx (Pmode);
1664       emit_move_insn (varargs, stack_pointer_rtx);
1665     }
1666 
1667   rtvec vec = rtvec_alloc (nargs + 1);
1668   rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1669   int vec_pos = 0;
1670 
1671   rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1672   rtx tmp_retval = retval;
1673   if (retval)
1674     {
1675       if (!nvptx_register_operand (retval, GET_MODE (retval)))
1676 	tmp_retval = gen_reg_rtx (GET_MODE (retval));
1677       call = gen_rtx_SET (tmp_retval, call);
1678     }
1679   XVECEXP (pat, 0, vec_pos++) = call;
1680 
1681   /* Construct the call insn, including a USE for each argument pseudo
1682      register.  These will be used when printing the insn.  */
1683   for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1684     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1685 
1686   if (varargs)
1687     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1688 
1689   gcc_assert (vec_pos == XVECLEN (pat, 0));
1690 
1691   nvptx_emit_forking (parallel, true);
1692   emit_call_insn (pat);
1693   nvptx_emit_joining (parallel, true);
1694 
1695   if (tmp_retval != retval)
1696     emit_move_insn (retval, tmp_retval);
1697 }
1698 
1699 /* Emit a comparison COMPARE, and return the new test to be used in the
1700    jump.  */
1701 
1702 rtx
1703 nvptx_expand_compare (rtx compare)
1704 {
1705   rtx pred = gen_reg_rtx (BImode);
1706   rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1707 			    XEXP (compare, 0), XEXP (compare, 1));
1708   emit_insn (gen_rtx_SET (pred, cmp));
1709   return gen_rtx_NE (BImode, pred, const0_rtx);
1710 }
1711 
1712 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
1713 
1714 void
1715 nvptx_expand_oacc_fork (unsigned mode)
1716 {
1717   nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1718 }
1719 
1720 void
1721 nvptx_expand_oacc_join (unsigned mode)
1722 {
1723   nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1724 }
1725 
1726 /* Generate instruction(s) to unpack a 64-bit object into two 32-bit
1727    objects.  */
1728 
1729 static rtx
1730 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1731 {
1732   rtx res;
1733 
1734   switch (GET_MODE (src))
1735     {
1736     case E_DImode:
1737       res = gen_unpackdisi2 (dst0, dst1, src);
1738       break;
1739     case E_DFmode:
1740       res = gen_unpackdfsi2 (dst0, dst1, src);
1741       break;
1742     default: gcc_unreachable ();
1743     }
1744   return res;
1745 }
1746 
1747 /* Generate instruction(s) to pack two 32-bit objects into a 64-bit
1748    object.  */
1749 
1750 static rtx
1751 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1752 {
1753   rtx res;
1754 
1755   switch (GET_MODE (dst))
1756     {
1757     case E_DImode:
1758       res = gen_packsidi2 (dst, src0, src1);
1759       break;
1760     case E_DFmode:
1761       res = gen_packsidf2 (dst, src0, src1);
1762       break;
1763     default: gcc_unreachable ();
1764     }
1765   return res;
1766 }
1767 
1768 /* Generate an instruction or sequence shuffling register SRC into DST
1769    across the lanes of a warp, selected by IDX and shuffle kind KIND.  */
1770 
1771 rtx
1772 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1773 {
1774   rtx res;
1775 
1776   switch (GET_MODE (dst))
1777     {
1778     case E_SImode:
1779       res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1780       break;
1781     case E_SFmode:
1782       res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1783       break;
1784     case E_DImode:
1785     case E_DFmode:
1786       {
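	/* A 64-bit value is shuffled as two independent 32-bit shuffles
	   of its unpacked halves, which are then repacked into DST.  */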
1787 	rtx tmp0 = gen_reg_rtx (SImode);
1788 	rtx tmp1 = gen_reg_rtx (SImode);
1789 
1790 	start_sequence ();
1791 	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1792 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1793 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1794 	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1795 	res = get_insns ();
1796 	end_sequence ();
1797       }
1798       break;
1799     case E_BImode:
1800       {
1801 	rtx tmp = gen_reg_rtx (SImode);
1802 
1803 	start_sequence ();
1804 	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1805 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1806 	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1807 	res = get_insns ();
1808 	end_sequence ();
1809       }
1810       break;
1811     case E_QImode:
1812     case E_HImode:
1813       {
1814 	rtx tmp = gen_reg_rtx (SImode);
1815 
1816 	start_sequence ();
1817 	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1818 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1819 	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1820 						    tmp)));
1821 	res = get_insns ();
1822 	end_sequence ();
1823       }
1824       break;
1825 
1826     default:
1827       gcc_unreachable ();
1828     }
1829   return res;
1830 }
1831 
1832 /* Generate an instruction or sequence to broadcast register REG
1833    across the vectors of a single warp.  */
1834 
1835 static rtx
1836 nvptx_gen_warp_bcast (rtx reg)
1837 {
1838   return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1839 }
1840 
1841 /* Structure used when generating a worker-level spill or fill.  */
1842 
1843 struct broadcast_data_t
1844 {
1845   rtx base;  /* Register holding base addr of buffer.  */
1846   rtx ptr;  /* Iteration var,  if needed.  */
1847   unsigned offset; /* Offset into worker buffer.  */
1848 };
1849 
1850 /* Direction of the spill/fill and looping setup/teardown indicator.  */
1851 
1852 enum propagate_mask
1853   {
1854     PM_read = 1 << 0,
1855     PM_write = 1 << 1,
1856     PM_loop_begin = 1 << 2,
1857     PM_loop_end = 1 << 3,
1858 
1859     PM_read_write = PM_read | PM_write
1860   };
1861 
1862 /* Generate instruction(s) to spill or fill register REG to/from the
1863    worker broadcast array.  PM indicates what is to be done, REP
1864    how many loop iterations will be executed (0 for not a loop).  */
1865 
1866 static rtx
1867 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1868 			broadcast_data_t *data, bool vector)
1869 {
1870   rtx  res;
1871   machine_mode mode = GET_MODE (reg);
1872 
1873   switch (mode)
1874     {
1875     case E_BImode:
1876       {
1877 	rtx tmp = gen_reg_rtx (SImode);
1878 
1879 	start_sequence ();
1880 	if (pm & PM_read)
1881 	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1882 	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1883 	if (pm & PM_write)
1884 	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1885 	res = get_insns ();
1886 	end_sequence ();
1887       }
1888       break;
1889 
1890     default:
1891       {
1892 	rtx addr = data->ptr;
1893 
1894 	if (!addr)
1895 	  {
1896 	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1897 
1898 	    oacc_bcast_align = MAX (oacc_bcast_align, align);
1899 	    data->offset = ROUND_UP (data->offset, align);
1900 	    addr = data->base;
1901 	    gcc_assert (data->base != NULL);
1902 	    if (data->offset)
1903 	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1904 	  }
1905 
1906 	addr = gen_rtx_MEM (mode, addr);
1907 	if (pm == PM_read)
1908 	  res = gen_rtx_SET (addr, reg);
1909 	else if (pm == PM_write)
1910 	  res = gen_rtx_SET (reg, addr);
1911 	else
1912 	  gcc_unreachable ();
1913 
1914 	if (data->ptr)
1915 	  {
1916 	    /* We're using a ptr, increment it.  */
1917 	    start_sequence ();
1918 
1919 	    emit_insn (res);
1920 	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
1921 				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1922 	    res = get_insns ();
1923 	    end_sequence ();
1924 	  }
1925 	else
1926 	  rep = 1;
1927 	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1928       }
1929       break;
1930     }
1931   return res;
1932 }
1933 
1934 /* Returns true if X is a valid address for use in a memory reference.  */
1935 
1936 static bool
1937 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1938 {
1939   enum rtx_code code = GET_CODE (x);
1940 
1941   switch (code)
1942     {
1943     case REG:
1944       return true;
1945 
1946     case PLUS:
1947       if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1948 	return true;
1949       return false;
1950 
1951     case CONST:
1952     case SYMBOL_REF:
1953     case LABEL_REF:
1954       return true;
1955 
1956     default:
1957       return false;
1958     }
1959 }
1960 
1961 /* Machinery to output constant initializers.  When beginning an
1962    initializer, we decide on a fragment size (which is visible in ptx
1963    in the type used), and then all initializer data is buffered until
1964    a fragment is filled and ready to be written out.  */
1965 
1966 static struct
1967 {
1968   unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
1969   unsigned HOST_WIDE_INT val; /* Current fragment value.  */
1970   unsigned HOST_WIDE_INT remaining; /*  Remaining bytes to be written
1971 					out.  */
1972   unsigned size;  /* Fragment size to accumulate.  */
1973   unsigned offset;  /* Offset within current fragment.  */
1974   bool started;   /* Whether we've output any initializer.  */
1975 } init_frag;
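
/* Illustrative example: with a 4-byte fragment size, assembling the
   string "abcd" one byte at a time accumulates 0x64636261 in
   init_frag.val before output_init_frag flushes the fragment.  */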
1976 
1977 /* The current fragment is full,  write it out.  SYM may provide a
1978    symbolic reference we should output,  in which case the fragment
1979    value is the addend.  */
1980 
1981 static void
1982 output_init_frag (rtx sym)
1983 {
1984   fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1985   unsigned HOST_WIDE_INT val = init_frag.val;
1986 
1987   init_frag.started = true;
1988   init_frag.val = 0;
1989   init_frag.offset = 0;
1990   init_frag.remaining--;
1991 
1992   if (sym)
1993     {
1994       bool function = (SYMBOL_REF_DECL (sym)
1995 		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1996       if (!function)
1997 	fprintf (asm_out_file, "generic(");
1998       output_address (VOIDmode, sym);
1999       if (!function)
2000 	fprintf (asm_out_file, ")");
2001       if (val)
2002 	fprintf (asm_out_file, " + ");
2003     }
2004 
2005   if (!sym || val)
2006     fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2007 }
2008 
2009 /* Add value VAL of size SIZE to the data we're emitting, and keep
2010    writing out chunks as they fill up.  */
2011 
2012 static void
2013 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2014 {
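  /* Mask VAL down to SIZE bytes.  Shifting 2 by (bits - 1) rather than
     1 by bits avoids an undefined full-width shift when SIZE is the
     width of HOST_WIDE_INT.  */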
2015   val &= ((unsigned  HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
2016 
2017   for (unsigned part = 0; size; size -= part)
2018     {
2019       val >>= part * BITS_PER_UNIT;
2020       part = init_frag.size - init_frag.offset;
2021       part = MIN (part, size);
2022 
2023       unsigned HOST_WIDE_INT partial
2024 	= val << (init_frag.offset * BITS_PER_UNIT);
2025       init_frag.val |= partial & init_frag.mask;
2026       init_frag.offset += part;
2027 
2028       if (init_frag.offset == init_frag.size)
2029 	output_init_frag (NULL);
2030     }
2031 }
2032 
2033 /* Target hook for assembling integer object X of size SIZE.  */
2034 
2035 static bool
2036 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2037 {
2038   HOST_WIDE_INT val = 0;
2039 
2040   switch (GET_CODE (x))
2041     {
2042     default:
2043       /* Let the generic machinery figure it out, usually for a
2044 	 CONST_WIDE_INT.  */
2045       return false;
2046 
2047     case CONST_INT:
2048       nvptx_assemble_value (INTVAL (x), size);
2049       break;
2050 
2051     case CONST:
2052       x = XEXP (x, 0);
2053       gcc_assert (GET_CODE (x) == PLUS);
2054       val = INTVAL (XEXP (x, 1));
2055       x = XEXP (x, 0);
2056       gcc_assert (GET_CODE (x) == SYMBOL_REF);
2057       /* FALLTHROUGH */
2058 
2059     case SYMBOL_REF:
2060       gcc_assert (size == init_frag.size);
2061       if (init_frag.offset)
2062 	sorry ("cannot emit unaligned pointers in ptx assembly");
2063 
2064       nvptx_maybe_record_fnsym (x);
2065       init_frag.val = val;
2066       output_init_frag (x);
2067       break;
2068     }
2069 
2070   return true;
2071 }
2072 
2073 /* Output SIZE zero bytes.  We ignore the FILE argument since the
2074    functions we're calling to perform the output just use
2075    asm_out_file.  */
2076 
2077 void
2078 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2079 {
2080   /* Finish the current fragment, if it's started.  */
2081   if (init_frag.offset)
2082     {
2083       unsigned part = init_frag.size - init_frag.offset;
2084       part = MIN (part, (unsigned)size);
2085       size -= part;
2086       nvptx_assemble_value (0, part);
2087     }
2088 
2089   /* If this skip doesn't terminate the initializer, write as many
2090      remaining pieces as possible directly.  */
2091   if (size < init_frag.remaining * init_frag.size)
2092     {
2093       while (size >= init_frag.size)
2094 	{
2095 	  size -= init_frag.size;
2096 	  output_init_frag (NULL_RTX);
2097 	}
2098       if (size)
2099 	nvptx_assemble_value (0, size);
2100     }
2101 }
2102 
2103 /* Output a string STR with length SIZE.  As in nvptx_output_skip we
2104    ignore the FILE arg.  */
2105 
2106 void
2107 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2108 {
2109   for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2110     nvptx_assemble_value (str[i], 1);
2111 }
2112 
2113 /* Return true if TYPE is a record type whose last field is an array
2114    without a given dimension.  */
2115 
2116 static bool
2117 flexible_array_member_type_p (const_tree type)
2118 {
2119   if (TREE_CODE (type) != RECORD_TYPE)
2120     return false;
2121 
2122   const_tree last_field = NULL_TREE;
2123   for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2124     last_field = f;
2125 
2126   if (!last_field)
2127     return false;
2128 
2129   const_tree last_field_type = TREE_TYPE (last_field);
2130   if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2131     return false;
2132 
2133   return (! TYPE_DOMAIN (last_field_type)
2134 	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2135 }
2136 
2137 /* Emit a PTX variable decl and prepare for emission of its
2138    initializer.  NAME is the symbol name and SECTION the PTX data
2139    area.  The type is TYPE, object size SIZE and alignment is ALIGN.
2140    The caller has already emitted any indentation and linkage
2141    specifier.  It is responsible for any initializer, terminating ;
2142    and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly,
2143    this is the opposite way round from what PTX wants!  */
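/* For example (illustrative; "arr" is a made-up name), a 12-byte global
   array of ints would be declared as
     .global .align 4 .u32 arr[3]
   with any initializer emitted afterwards as " = { 1, 2, 3 };" by the
   fragment machinery above.  */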
2144 
2145 static void
2146 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2147 			   const_tree type, HOST_WIDE_INT size, unsigned align,
2148 			   bool undefined = false)
2149 {
2150   bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2151     && (TYPE_DOMAIN (type) == NULL_TREE);
2152 
2153   if (undefined && flexible_array_member_type_p (type))
2154     {
2155       size = 0;
2156       atype = true;
2157     }
2158 
2159   while (TREE_CODE (type) == ARRAY_TYPE)
2160     type = TREE_TYPE (type);
2161 
2162   if (TREE_CODE (type) == VECTOR_TYPE
2163       || TREE_CODE (type) == COMPLEX_TYPE)
2164     /* Neither vector nor complex types can contain the other.  */
2165     type = TREE_TYPE (type);
2166 
2167   unsigned elt_size = int_size_in_bytes (type);
2168 
2169   /* Largest mode we're prepared to accept.  For BLKmode types we
2170      don't know if it'll contain pointer constants, so have to choose
2171      pointer size, otherwise we can choose DImode.  */
2172   machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2173 
2174   elt_size |= GET_MODE_SIZE (elt_mode);
2175   elt_size &= -elt_size; /* Extract LSB set.  */
2176 
2177   init_frag.size = elt_size;
2178   /* Avoid undefined shift behavior by using '2'.  */
2179   init_frag.mask = ((unsigned HOST_WIDE_INT)2
2180 		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
2181   init_frag.val = 0;
2182   init_frag.offset = 0;
2183   init_frag.started = false;
2184   /* Size might not be a multiple of elt size, if there's an
2185      initialized trailing struct array with smaller type than
2186      elt_size. */
2187   init_frag.remaining = (size + elt_size - 1) / elt_size;
2188 
2189   fprintf (file, "%s .align %d .u%d ",
2190 	   section, align / BITS_PER_UNIT,
2191 	   elt_size * BITS_PER_UNIT);
2192   assemble_name (file, name);
2193 
2194   if (size)
2195     /* We make everything an array, to simplify any initialization
2196        emission.  */
2197     fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2198   else if (atype)
2199     fprintf (file, "[]");
2200 }
2201 
2202 /* Called when the initializer for a decl has been completely output through
2203    combinations of the three functions above.  */
2204 
2205 static void
2206 nvptx_assemble_decl_end (void)
2207 {
2208   if (init_frag.offset)
2209     /* This can happen with a packed struct with trailing array member.  */
2210     nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2211   fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2212 }
2213 
2214 /* Output an uninitialized common or file-scope variable.  */
2215 
2216 void
2217 nvptx_output_aligned_decl (FILE *file, const char *name,
2218 			   const_tree decl, HOST_WIDE_INT size, unsigned align)
2219 {
2220   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2221 
2222   /* If this is public, it is common.  The nearest thing we have to
2223      common is weak.  */
2224   fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2225 
2226   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2227 			     TREE_TYPE (decl), size, align);
2228   nvptx_assemble_decl_end ();
2229 }
2230 
2231 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
2232    writing a constant variable EXP with NAME and SIZE and its
2233    initializer to FILE.  */
2234 
2235 static void
2236 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2237 				 const_tree exp, HOST_WIDE_INT obj_size)
2238 {
2239   write_var_marker (file, true, false, name);
2240 
2241   fprintf (file, "\t");
2242 
2243   tree type = TREE_TYPE (exp);
2244   nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2245 			     TYPE_ALIGN (type));
2246 }
2247 
2248 /* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
2249    a variable DECL with NAME to FILE.  */
2250 
2251 void
2252 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2253 {
2254   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2255 
2256   fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2257 			  : DECL_WEAK (decl) ? ".weak " : ".visible "));
2258 
2259   tree type = TREE_TYPE (decl);
2260   HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2261   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2262 			     type, obj_size, DECL_ALIGN (decl));
2263 }
2264 
2265 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */
2266 
2267 static void
2268 nvptx_globalize_label (FILE *, const char *)
2269 {
2270 }
2271 
2272 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
2273    declaration only for variable DECL with NAME to FILE.  */
2274 
2275 static void
2276 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2277 {
2278   /* The middle end can place constant pool decls into the varpool as
2279      undefined.  Until that is fixed, catch the problem here.  */
2280   if (DECL_IN_CONSTANT_POOL (decl))
2281     return;
2282 
2283   /*  We support weak definitions, and hence have the right
2284       ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
2285   if (DECL_WEAK (decl))
2286     error_at (DECL_SOURCE_LOCATION (decl),
2287 	      "PTX does not support weak declarations"
2288 	      " (only weak definitions)");
2289   write_var_marker (file, false, TREE_PUBLIC (decl), name);
2290 
2291   fprintf (file, "\t.extern ");
2292   tree size = DECL_SIZE_UNIT (decl);
2293   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2294 			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2295 			     DECL_ALIGN (decl), true);
2296   nvptx_assemble_decl_end ();
2297 }
2298 
2299 /* Output a pattern for a move instruction.  */
2300 
2301 const char *
2302 nvptx_output_mov_insn (rtx dst, rtx src)
2303 {
2304   machine_mode dst_mode = GET_MODE (dst);
2305   machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2306 			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2307   machine_mode src_inner = (GET_CODE (src) == SUBREG
2308 			    ? GET_MODE (XEXP (src, 0)) : dst_mode);
2309 
2310   rtx sym = src;
2311   if (GET_CODE (sym) == CONST)
2312     sym = XEXP (XEXP (sym, 0), 0);
2313   if (SYMBOL_REF_P (sym))
2314     {
2315       if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2316 	return "%.\tcvta%D1%t0\t%0, %1;";
2317       nvptx_maybe_record_fnsym (sym);
2318     }
2319 
2320   if (src_inner == dst_inner)
2321     return "%.\tmov%t0\t%0, %1;";
2322 
2323   if (CONSTANT_P (src))
2324     return (GET_MODE_CLASS (dst_inner) == MODE_INT
2325 	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2326 	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2327 
2328   if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2329     {
2330       if (GET_MODE_BITSIZE (dst_mode) == 128
2331 	  && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2332 	{
2333 	  /* mov.b128 is not supported.  */
2334 	  if (dst_inner == V2DImode && src_inner == TImode)
2335 	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2336 	  else if (dst_inner == TImode && src_inner == V2DImode)
2337 	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2338 
2339 	  gcc_unreachable ();
2340 	}
2341       return "%.\tmov.b%T0\t%0, %1;";
2342     }
2343 
2344   return "%.\tcvt%t0%t1\t%0, %1;";
2345 }
2346 
2347 static void nvptx_print_operand (FILE *, rtx, int);
2348 
2349 /* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
2350    involves writing .param declarations and in/out copies into them.  For
2351    indirect calls, also write the .callprototype.  */
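/* Illustrative sketch ("foo" and the register numbers are made up):
   a call to "int foo (int)" is printed roughly as
	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}
   For an indirect call, the callee is a register and a .callprototype
   label is appended after the argument list.  */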
2352 
2353 const char *
2354 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2355 {
2356   char buf[16];
2357   static int labelno;
2358   bool needs_tgt = register_operand (callee, Pmode);
2359   rtx pat = PATTERN (insn);
2360   if (GET_CODE (pat) == COND_EXEC)
2361     pat = COND_EXEC_CODE (pat);
2362   int arg_end = XVECLEN (pat, 0);
2363   tree decl = NULL_TREE;
2364 
2365   fprintf (asm_out_file, "\t{\n");
2366   if (result != NULL)
2367     fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2368 	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
2369 	     reg_names[NVPTX_RETURN_REGNUM]);
2370 
2371   /* Ensure we have a ptx declaration in the output if necessary.  */
2372   if (GET_CODE (callee) == SYMBOL_REF)
2373     {
2374       decl = SYMBOL_REF_DECL (callee);
2375       if (!decl
2376 	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2377 	nvptx_record_libfunc (callee, result, pat);
2378       else if (DECL_EXTERNAL (decl))
2379 	nvptx_record_fndecl (decl);
2380     }
2381 
2382   if (needs_tgt)
2383     {
2384       ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2385       labelno++;
2386       ASM_OUTPUT_LABEL (asm_out_file, buf);
2387       std::stringstream s;
2388       write_fn_proto_from_insn (s, NULL, result, pat);
2389       fputs (s.str().c_str(), asm_out_file);
2390     }
2391 
2392   for (int argno = 1; argno < arg_end; argno++)
2393     {
2394       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2395       machine_mode mode = GET_MODE (t);
2396       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2397 
2398       /* Mode splitting has already been done.  */
2399       fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2400 	       "\t\tst.param%s [%%out_arg%d], ",
2401 	       ptx_type, argno, ptx_type, argno);
2402       output_reg (asm_out_file, REGNO (t), VOIDmode);
2403       fprintf (asm_out_file, ";\n");
2404     }
2405 
2406   /* The '.' stands for the call's predicate, if any.  */
2407   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2408   fprintf (asm_out_file, "\t\tcall ");
2409   if (result != NULL_RTX)
2410     fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2411 
2412   if (decl)
2413     {
2414       const char *name = get_fnname_from_decl (decl);
2415       name = nvptx_name_replacement (name);
2416       assemble_name (asm_out_file, name);
2417     }
2418   else
2419     output_address (VOIDmode, callee);
2420 
2421   const char *open = "(";
2422   for (int argno = 1; argno < arg_end; argno++)
2423     {
2424       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2425       open = "";
2426     }
2427   if (decl && DECL_STATIC_CHAIN (decl))
2428     {
2429       fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2430       open = "";
2431     }
2432   if (!open[0])
2433     fprintf (asm_out_file, ")");
2434 
2435   if (needs_tgt)
2436     {
2437       fprintf (asm_out_file, ", ");
2438       assemble_name (asm_out_file, buf);
2439     }
2440   fprintf (asm_out_file, ";\n");
2441 
2442   if (find_reg_note (insn, REG_NORETURN, NULL))
2443     {
2444       /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2445 	 the flow control barrier they imply.  It can seg fault if it
2446 	 encounters what looks like an unexitable loop.  Emit a trailing
2447 	 trap and exit, which it does grok.  */
2448       fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2449       fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2450     }
2451 
2452   if (result)
2453     {
2454       static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2455 
2456       if (!rval[0])
2457 	/* We must escape the '%' that starts RETURN_REGNUM.  */
2458 	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2459 		 reg_names[NVPTX_RETURN_REGNUM]);
2460       return rval;
2461     }
2462 
2463   return "}";
2464 }
2465 
2466 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */
2467 
2468 static bool
2469 nvptx_print_operand_punct_valid_p (unsigned char c)
2470 {
2471   return c == '.' || c== '#';
2472 }
2473 
2474 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */
2475 
2476 static void
2477 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2478 {
2479   rtx off;
2480   if (GET_CODE (x) == CONST)
2481     x = XEXP (x, 0);
2482   switch (GET_CODE (x))
2483     {
2484     case PLUS:
2485       off = XEXP (x, 1);
2486       output_address (VOIDmode, XEXP (x, 0));
2487       fprintf (file, "+");
2488       output_address (VOIDmode, off);
2489       break;
2490 
2491     case SYMBOL_REF:
2492     case LABEL_REF:
2493       output_addr_const (file, x);
2494       break;
2495 
2496     default:
2497       gcc_assert (GET_CODE (x) != MEM);
2498       nvptx_print_operand (file, x, 0);
2499       break;
2500     }
2501 }
2502 
2503 /* Write assembly language output for the address ADDR to FILE.  */
2504 
2505 static void
2506 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2507 {
2508   nvptx_print_address_operand (file, addr, mode);
2509 }
2510 
2511 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2512 
2513    Meaning of CODE:
2514    . -- print the predicate for the instruction or an empty string for an
2515         unconditional one.
2516    # -- print a rounding mode for the instruction
2517 
2518    A -- print a data area for a MEM
2519    c -- print an opcode suffix for a comparison operator, including a type code
2520    D -- print a data area for a MEM operand
2521    S -- print a shuffle kind specified by CONST_INT
2522    t -- print a type opcode suffix, promoting QImode to 32 bits
2523    T -- print a type size in bits
2524    u -- print a type opcode suffix without promotions.  */
2525 
2526 static void
2527 nvptx_print_operand (FILE *file, rtx x, int code)
2528 {
2529   if (code == '.')
2530     {
2531       x = current_insn_predicate;
2532       if (x)
2533 	{
2534 	  fputs ("@", file);
2535 	  if (GET_CODE (x) == EQ)
2536 	    fputs ("!", file);
2537 	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2538 	}
2539       return;
2540     }
2541   else if (code == '#')
2542     {
2543       fputs (".rn", file);
2544       return;
2545     }
2546 
2547   enum rtx_code x_code = GET_CODE (x);
2548   machine_mode mode = GET_MODE (x);
2549 
2550   switch (code)
2551     {
2552     case 'A':
2553       x = XEXP (x, 0);
2554       /* FALLTHROUGH.  */
2555 
2556     case 'D':
2557       if (GET_CODE (x) == CONST)
2558 	x = XEXP (x, 0);
2559       if (GET_CODE (x) == PLUS)
2560 	x = XEXP (x, 0);
2561 
2562       if (GET_CODE (x) == SYMBOL_REF)
2563 	fputs (section_for_sym (x), file);
2564       break;
2565 
2566     case 't':
2567     case 'u':
2568       if (x_code == SUBREG)
2569 	{
2570 	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2571 	  if (VECTOR_MODE_P (inner_mode)
2572 	      && (GET_MODE_SIZE (mode)
2573 		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2574 	    mode = GET_MODE_INNER (inner_mode);
2575 	  else if (split_mode_p (inner_mode))
2576 	    mode = maybe_split_mode (inner_mode);
2577 	  else
2578 	    mode = inner_mode;
2579 	}
2580       fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2581       break;
2582 
2583     case 'H':
2584     case 'L':
2585       {
2586 	rtx inner_x = SUBREG_REG (x);
2587 	machine_mode inner_mode = GET_MODE (inner_x);
2588 	machine_mode split = maybe_split_mode (inner_mode);
2589 
2590 	output_reg (file, REGNO (inner_x), split,
2591 		    (code == 'H'
2592 		     ? GET_MODE_SIZE (inner_mode) / 2
2593 		     : 0));
2594       }
2595       break;
2596 
2597     case 'S':
2598       {
2599 	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2600 	/* Same order as nvptx_shuffle_kind.  */
2601 	static const char *const kinds[] =
2602 	  {".up", ".down", ".bfly", ".idx"};
2603 	fputs (kinds[kind], file);
2604       }
2605       break;
2606 
2607     case 'T':
2608       fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2609       break;
2610 
2611     case 'j':
2612       fprintf (file, "@");
2613       goto common;
2614 
2615     case 'J':
2616       fprintf (file, "@!");
2617       goto common;
2618 
2619     case 'c':
2620       mode = GET_MODE (XEXP (x, 0));
2621       switch (x_code)
2622 	{
2623 	case EQ:
2624 	  fputs (".eq", file);
2625 	  break;
2626 	case NE:
2627 	  if (FLOAT_MODE_P (mode))
2628 	    fputs (".neu", file);
2629 	  else
2630 	    fputs (".ne", file);
2631 	  break;
2632 	case LE:
2633 	case LEU:
2634 	  fputs (".le", file);
2635 	  break;
2636 	case GE:
2637 	case GEU:
2638 	  fputs (".ge", file);
2639 	  break;
2640 	case LT:
2641 	case LTU:
2642 	  fputs (".lt", file);
2643 	  break;
2644 	case GT:
2645 	case GTU:
2646 	  fputs (".gt", file);
2647 	  break;
2648 	case LTGT:
2649 	  fputs (".ne", file);
2650 	  break;
2651 	case UNEQ:
2652 	  fputs (".equ", file);
2653 	  break;
2654 	case UNLE:
2655 	  fputs (".leu", file);
2656 	  break;
2657 	case UNGE:
2658 	  fputs (".geu", file);
2659 	  break;
2660 	case UNLT:
2661 	  fputs (".ltu", file);
2662 	  break;
2663 	case UNGT:
2664 	  fputs (".gtu", file);
2665 	  break;
2666 	case UNORDERED:
2667 	  fputs (".nan", file);
2668 	  break;
2669 	case ORDERED:
2670 	  fputs (".num", file);
2671 	  break;
2672 	default:
2673 	  gcc_unreachable ();
2674 	}
2675       if (FLOAT_MODE_P (mode)
2676 	  || x_code == EQ || x_code == NE
2677 	  || x_code == GEU || x_code == GTU
2678 	  || x_code == LEU || x_code == LTU)
2679 	fputs (nvptx_ptx_type_from_mode (mode, true), file);
2680       else
2681 	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2682       break;
2683     default:
2684     common:
2685       switch (x_code)
2686 	{
2687 	case SUBREG:
2688 	  {
2689 	    rtx inner_x = SUBREG_REG (x);
2690 	    machine_mode inner_mode = GET_MODE (inner_x);
2691 	    machine_mode split = maybe_split_mode (inner_mode);
2692 
2693 	    if (VECTOR_MODE_P (inner_mode)
2694 		&& (GET_MODE_SIZE (mode)
2695 		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2696 	      {
2697 		output_reg (file, REGNO (inner_x), VOIDmode);
2698 		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2699 	      }
2700 	    else if (split_mode_p (inner_mode)
2701 		&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2702 	      output_reg (file, REGNO (inner_x), split);
2703 	    else
2704 	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2705 	  }
2706 	  break;
2707 
2708 	case REG:
2709 	  output_reg (file, REGNO (x), maybe_split_mode (mode));
2710 	  break;
2711 
2712 	case MEM:
2713 	  fputc ('[', file);
2714 	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
2715 	  fputc (']', file);
2716 	  break;
2717 
2718 	case CONST_INT:
2719 	  output_addr_const (file, x);
2720 	  break;
2721 
2722 	case CONST:
2723 	case SYMBOL_REF:
2724 	case LABEL_REF:
2725 	  /* We could use output_addr_const, but that can print things like
2726 	     "x-8", which breaks ptxas.  Need to ensure it is output as
2727 	     "x+-8".  */
2728 	  nvptx_print_address_operand (file, x, VOIDmode);
2729 	  break;
2730 
2731 	case CONST_DOUBLE:
2732 	  long vals[2];
2733 	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2734 	  vals[0] &= 0xffffffff;
2735 	  vals[1] &= 0xffffffff;
2736 	  if (mode == SFmode)
2737 	    fprintf (file, "0f%08lx", vals[0]);
2738 	  else
2739 	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2740 	  break;
2741 
2742 	case CONST_VECTOR:
2743 	  {
2744 	    unsigned n = CONST_VECTOR_NUNITS (x);
2745 	    fprintf (file, "{ ");
2746 	    for (unsigned i = 0; i < n; ++i)
2747 	      {
2748 		if (i != 0)
2749 		  fprintf (file, ", ");
2750 
2751 		rtx elem = CONST_VECTOR_ELT (x, i);
2752 		output_addr_const (file, elem);
2753 	      }
2754 	    fprintf (file, " }");
2755 	  }
2756 	  break;
2757 
2758 	default:
2759 	  output_addr_const (file, x);
2760 	}
2761     }
2762 }
2763 
2764 /* Record replacement regs used to deal with subreg operands.  */
2765 struct reg_replace
2766 {
2767   rtx replacement[MAX_RECOG_OPERANDS];
2768   machine_mode mode;
2769   int n_allocated;
2770   int n_in_use;
2771 };
2772 
2773 /* Allocate or reuse a replacement in R and return the rtx.  */
2774 
2775 static rtx
2776 get_replacement (struct reg_replace *r)
2777 {
2778   if (r->n_allocated == r->n_in_use)
2779     r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2780   return r->replacement[r->n_in_use++];
2781 }
2782 
2783 /* Clean up subreg operands.  In ptx assembly, everything is typed, and
2784    the presence of subregs would break the rules for most instructions.
2785    Replace them with a suitable new register of the right size, plus
2786    conversion copyin/copyout instructions.  */
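/* Schematically: an input operand (subreg:SI (reg:QI R) 0) is replaced
   by a fresh SImode register N, with "N = zero_extend (R)" emitted
   before the insn; an output operand additionally gets
   "R = truncate (N)" emitted after the insn.  */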
2787 
2788 static void
2789 nvptx_reorg_subreg (void)
2790 {
2791   struct reg_replace qiregs, hiregs, siregs, diregs;
2792   rtx_insn *insn, *next;
2793 
2794   qiregs.n_allocated = 0;
2795   hiregs.n_allocated = 0;
2796   siregs.n_allocated = 0;
2797   diregs.n_allocated = 0;
2798   qiregs.mode = QImode;
2799   hiregs.mode = HImode;
2800   siregs.mode = SImode;
2801   diregs.mode = DImode;
2802 
2803   for (insn = get_insns (); insn; insn = next)
2804     {
2805       next = NEXT_INSN (insn);
2806       if (!NONDEBUG_INSN_P (insn)
2807 	  || asm_noperands (PATTERN (insn)) >= 0
2808 	  || GET_CODE (PATTERN (insn)) == USE
2809 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
2810 	continue;
2811 
2812       qiregs.n_in_use = 0;
2813       hiregs.n_in_use = 0;
2814       siregs.n_in_use = 0;
2815       diregs.n_in_use = 0;
2816       extract_insn (insn);
2817       enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2818 
2819       for (int i = 0; i < recog_data.n_operands; i++)
2820 	{
2821 	  rtx op = recog_data.operand[i];
2822 	  if (GET_CODE (op) != SUBREG)
2823 	    continue;
2824 
2825 	  rtx inner = SUBREG_REG (op);
2826 
2827 	  machine_mode outer_mode = GET_MODE (op);
2828 	  machine_mode inner_mode = GET_MODE (inner);
2829 	  gcc_assert (s_ok);
2830 	  if (s_ok
2831 	      && (GET_MODE_PRECISION (inner_mode)
2832 		  >= GET_MODE_PRECISION (outer_mode)))
2833 	    continue;
2834 	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2835 	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
2836 				   : outer_mode == HImode ? &hiregs
2837 				   : outer_mode == SImode ? &siregs
2838 				   : &diregs);
2839 	  rtx new_reg = get_replacement (r);
2840 
2841 	  if (recog_data.operand_type[i] != OP_OUT)
2842 	    {
2843 	      enum rtx_code code;
2844 	      if (GET_MODE_PRECISION (inner_mode)
2845 		  < GET_MODE_PRECISION (outer_mode))
2846 		code = ZERO_EXTEND;
2847 	      else
2848 		code = TRUNCATE;
2849 
2850 	      rtx pat = gen_rtx_SET (new_reg,
2851 				     gen_rtx_fmt_e (code, outer_mode, inner));
2852 	      emit_insn_before (pat, insn);
2853 	    }
2854 
2855 	  if (recog_data.operand_type[i] != OP_IN)
2856 	    {
2857 	      enum rtx_code code;
2858 	      if (GET_MODE_PRECISION (inner_mode)
2859 		  < GET_MODE_PRECISION (outer_mode))
2860 		code = TRUNCATE;
2861 	      else
2862 		code = ZERO_EXTEND;
2863 
2864 	      rtx pat = gen_rtx_SET (inner,
2865 				     gen_rtx_fmt_e (code, inner_mode, new_reg));
2866 	      emit_insn_after (pat, insn);
2867 	    }
2868 	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2869 	}
2870     }
2871 }
2872 
2873 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2874    first use.  */
2875 
2876 static rtx
2877 nvptx_get_unisimt_master ()
2878 {
2879   rtx &master = cfun->machine->unisimt_master;
2880   return master ? master : master = gen_reg_rtx (SImode);
2881 }
2882 
2883 /* Return a BImode "predicate" register for uniform-simt, similar to above.  */
2884 
2885 static rtx
2886 nvptx_get_unisimt_predicate ()
2887 {
2888   rtx &pred = cfun->machine->unisimt_predicate;
2889   return pred ? pred : pred = gen_reg_rtx (BImode);
2890 }
2891 
2892 /* Return true if given call insn references one of the functions provided by
2893    the CUDA runtime: malloc, free, vprintf.  */
2894 
2895 static bool
2896 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2897 {
2898   rtx pat = PATTERN (insn);
2899   gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2900   pat = XVECEXP (pat, 0, 0);
2901   if (GET_CODE (pat) == SET)
2902     pat = SET_SRC (pat);
2903   gcc_checking_assert (GET_CODE (pat) == CALL
2904 		       && GET_CODE (XEXP (pat, 0)) == MEM);
2905   rtx addr = XEXP (XEXP (pat, 0), 0);
2906   if (GET_CODE (addr) != SYMBOL_REF)
2907     return false;
2908   const char *name = XSTR (addr, 0);
2909   /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2910      references with forced assembler name refer to PTX syscalls.  For vprintf,
2911      accept both normal and forced-assembler-name references.  */
2912   return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2913 	  || !strcmp (name, "*malloc")
2914 	  || !strcmp (name, "*free"));
2915 }
2916 
2917 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2918    propagate its value from lane MASTER to current lane.  */
2919 
2920 static void
2921 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2922 {
2923   rtx reg;
2924   if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2925     emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2926 }
2927 
2928 /* Adjust code for uniform-simt code generation variant by making atomics and
2929    "syscalls" conditionally executed, and inserting shuffle-based propagation
2930    for registers being set.  */
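/* Schematically, an atomic operation
     atom.op ...;
   becomes
     @%pred atom.op ...;
     <shuffle each register it sets from the master lane>;
   so only one lane of the warp performs the operation while every
   lane observes its result.  */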
2931 
2932 static void
2933 nvptx_reorg_uniform_simt ()
2934 {
2935   rtx_insn *insn, *next;
2936 
2937   for (insn = get_insns (); insn; insn = next)
2938     {
2939       next = NEXT_INSN (insn);
2940       if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2941 	  && !(NONJUMP_INSN_P (insn)
2942 	       && GET_CODE (PATTERN (insn)) == PARALLEL
2943 	       && get_attr_atomic (insn)))
2944 	continue;
2945       rtx pat = PATTERN (insn);
2946       rtx master = nvptx_get_unisimt_master ();
2947       for (int i = 0; i < XVECLEN (pat, 0); i++)
2948 	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2949       rtx pred = nvptx_get_unisimt_predicate ();
2950       pred = gen_rtx_NE (BImode, pred, const0_rtx);
2951       pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2952       validate_change (insn, &PATTERN (insn), pat, false);
2953     }
2954 }
2955 
2956 /* Offloading function attributes.  */
2957 
2958 struct offload_attrs
2959 {
2960   unsigned mask;
2961   int num_gangs;
2962   int num_workers;
2963   int vector_length;
2964 };
2965 
2966 /* Define entries for cfun->machine->axis_dim.  */
2967 
2968 #define MACH_VECTOR_LENGTH 0
2969 #define MACH_MAX_WORKERS 1
2970 
2971 static void populate_offload_attrs (offload_attrs *oa);
2972 
2973 static void
2974 init_axis_dim (void)
2975 {
2976   offload_attrs oa;
2977   int max_workers;
2978 
2979   populate_offload_attrs (&oa);
2980 
2981   if (oa.num_workers == 0)
2982     max_workers = PTX_CTA_SIZE / oa.vector_length;
2983   else
2984     max_workers = oa.num_workers;
2985 
2986   cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
2987   cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
2988   cfun->machine->axis_dim_init_p = true;
2989 }
2990 
2991 static int ATTRIBUTE_UNUSED
2992 nvptx_mach_max_workers ()
2993 {
2994   if (!cfun->machine->axis_dim_init_p)
2995     init_axis_dim ();
2996   return cfun->machine->axis_dim[MACH_MAX_WORKERS];
2997 }
2998 
2999 static int ATTRIBUTE_UNUSED
3000 nvptx_mach_vector_length ()
3001 {
3002   if (!cfun->machine->axis_dim_init_p)
3003     init_axis_dim ();
3004   return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3005 }
3006 
3007 /* Loop structure of the function.  The entire function is described as
3008    a NULL loop.  */
3009 
3010 struct parallel
3011 {
3012   /* Parent parallel.  */
3013   parallel *parent;
3014 
3015   /* Next sibling parallel.  */
3016   parallel *next;
3017 
3018   /* First child parallel.  */
3019   parallel *inner;
3020 
3021   /* Partitioning mask of the parallel.  */
3022   unsigned mask;
3023 
3024   /* Partitioning used within inner parallels. */
3025   unsigned inner_mask;
3026 
3027   /* Location of parallel forked and join.  The forked is the first
3028      block in the parallel and the join is the first block after
3029      the partition.  */
3030   basic_block forked_block;
3031   basic_block join_block;
3032 
3033   rtx_insn *forked_insn;
3034   rtx_insn *join_insn;
3035 
3036   rtx_insn *fork_insn;
3037   rtx_insn *joining_insn;
3038 
3039   /* Basic blocks in this parallel, but not in child parallels.  The
3040      FORKED and JOINING blocks are in the partition.  The FORK and JOIN
3041      blocks are not.  */
3042   auto_vec<basic_block> blocks;
3043 
3044 public:
3045   parallel (parallel *parent, unsigned mode);
3046   ~parallel ();
3047 };
3048 
3049 /* Constructor links the new parallel into its parent's chain of
3050    children.  */
3051 
3052 parallel::parallel (parallel *parent_, unsigned mask_)
3053   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3054 {
3055   forked_block = join_block = 0;
3056   forked_insn = join_insn = 0;
3057   fork_insn = joining_insn = 0;
3058 
3059   if (parent)
3060     {
3061       next = parent->inner;
3062       parent->inner = this;
3063     }
3064 }
3065 
3066 parallel::~parallel ()
3067 {
3068   delete inner;
3069   delete next;
3070 }
3071 
3072 /* Map of basic blocks to insns.  */
3073 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3074 
3075 /* A tuple of an insn of interest and the BB in which it resides.  */
3076 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3077 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3078 
3079 /* Split basic blocks such that each forked and join unspec is at
3080    the start of its basic block.  Thus afterwards each block will
3081    have a single partitioning mode.  We also do the same for
3082    return insns, as they are executed by every thread.  Populate
3083    MAP with head and tail blocks.  We also clear the BB visited
3084    flag, which is used when finding partitions (the parallel
3085    locator relies on it being clear).  */
3086 
3087 static void
3088 nvptx_split_blocks (bb_insn_map_t *map)
3089 {
3090   insn_bb_vec_t worklist;
3091   basic_block block;
3092   rtx_insn *insn;
3093 
3094   /* Locate all the reorg instructions of interest.  */
3095   FOR_ALL_BB_FN (block, cfun)
3096     {
3097       bool seen_insn = false;
3098 
3099       /* Clear visited flag, for use by parallel locator.  */
3100       block->flags &= ~BB_VISITED;
3101 
3102       FOR_BB_INSNS (block, insn)
3103 	{
3104 	  if (!INSN_P (insn))
3105 	    continue;
3106 	  switch (recog_memoized (insn))
3107 	    {
3108 	    default:
3109 	      seen_insn = true;
3110 	      continue;
3111 	    case CODE_FOR_nvptx_forked:
3112 	    case CODE_FOR_nvptx_join:
3113 	      break;
3114 
3115 	    case CODE_FOR_return:
3116 	      /* We also need to split just before return insns, as
3117 		 that insn needs executing by all threads, but the
3118 		 block it is in probably does not.  */
3119 	      break;
3120 	    }
3121 
3122 	  if (seen_insn)
3123 	    /* We've found an instruction that  must be at the start of
3124 	       a block, but isn't.  Add it to the worklist.  */
3125 	    worklist.safe_push (insn_bb_t (insn, block));
3126 	  else
3127 	    /* It was already the first instruction.  Just add it to
3128 	       the map.  */
3129 	    map->get_or_insert (block) = insn;
3130 	  seen_insn = true;
3131 	}
3132     }
3133 
3134   /* Split blocks on the worklist.  */
3135   unsigned ix;
3136   insn_bb_t *elt;
3137   basic_block remap = 0;
3138   for (ix = 0; worklist.iterate (ix, &elt); ix++)
3139     {
3140       if (remap != elt->second)
3141 	{
3142 	  block = elt->second;
3143 	  remap = block;
3144 	}
3145 
3146       /* Split block before insn.  The insn is in the new block.  */
3147       edge e = split_block (block, PREV_INSN (elt->first));
3148 
3149       block = e->dest;
3150       map->get_or_insert (block) = elt->first;
3151     }
3152 }
3153 
3154 /* Return true if MASK contains parallelism that requires shared
3155    memory to broadcast.  */
3156 
3157 static bool
3158 nvptx_needs_shared_bcast (unsigned mask)
3159 {
3160   bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3161   bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3162     && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3163 
3164   return worker || large_vector;
3165 }
3166 
3167 /* BLOCK is a basic block containing a head or tail instruction.
3168    Locate the associated prehead or pretail instruction, which must be
3169    in the single predecessor block.  */
3170 
3171 static rtx_insn *
3172 nvptx_discover_pre (basic_block block, int expected)
3173 {
3174   gcc_assert (block->preds->length () == 1);
3175   basic_block pre_block = (*block->preds)[0]->src;
3176   rtx_insn *pre_insn;
3177 
3178   for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3179        pre_insn = PREV_INSN (pre_insn))
3180     gcc_assert (pre_insn != BB_HEAD (pre_block));
3181 
3182   gcc_assert (recog_memoized (pre_insn) == expected);
3183   return pre_insn;
3184 }
3185 
3186 /* Dump this parallel and all its inner parallels.  */
3187 
3188 static void
3189 nvptx_dump_pars (parallel *par, unsigned depth)
3190 {
3191   fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3192 	   depth, par->mask,
3193 	   par->forked_block ? par->forked_block->index : -1,
3194 	   par->join_block ? par->join_block->index : -1);
3195 
3196   fprintf (dump_file, "    blocks:");
3197 
3198   basic_block block;
3199   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3200     fprintf (dump_file, " %d", block->index);
3201   fprintf (dump_file, "\n");
3202   if (par->inner)
3203     nvptx_dump_pars (par->inner, depth + 1);
3204 
3205   if (par->next)
3206     nvptx_dump_pars (par->next, depth);
3207 }
3208 
3209 /* If BLOCK contains a fork/join marker, process it to create or
3210    terminate a loop structure.  Add this block to the current loop,
3211    and then walk successor blocks.   */
3212 
3213 static parallel *
3214 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3215 {
3216   if (block->flags & BB_VISITED)
3217     return par;
3218   block->flags |= BB_VISITED;
3219 
3220   if (rtx_insn **endp = map->get (block))
3221     {
3222       rtx_insn *end = *endp;
3223 
3224       /* This is a block head or tail, or return instruction.  */
3225       switch (recog_memoized (end))
3226 	{
3227 	case CODE_FOR_return:
3228 	  /* Return instructions are in their own block, and we
3229 	     don't need to do anything more.  */
3230 	  return par;
3231 
3232 	case CODE_FOR_nvptx_forked:
3233 	  /* Loop head, create a new inner loop and add it into
3234 	     our parent's child list.  */
3235 	  {
3236 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3237 
3238 	    gcc_assert (mask);
3239 	    par = new parallel (par, mask);
3240 	    par->forked_block = block;
3241 	    par->forked_insn = end;
3242 	    if (nvptx_needs_shared_bcast (mask))
3243 	      par->fork_insn
3244 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3245 	  }
3246 	  break;
3247 
3248 	case CODE_FOR_nvptx_join:
3249 	  /* A loop tail.  Finish the current loop and return to
3250 	     parent.  */
3251 	  {
3252 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3253 
3254 	    gcc_assert (par->mask == mask);
3255 	    gcc_assert (par->join_block == NULL);
3256 	    par->join_block = block;
3257 	    par->join_insn = end;
3258 	    if (nvptx_needs_shared_bcast (mask))
3259 	      par->joining_insn
3260 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3261 	    par = par->parent;
3262 	  }
3263 	  break;
3264 
3265 	default:
3266 	  gcc_unreachable ();
3267 	}
3268     }
3269 
3270   if (par)
3271     /* Add this block onto the current loop's list of blocks.  */
3272     par->blocks.safe_push (block);
3273   else
3274     /* This must be the entry block.  Create a NULL parallel.  */
3275     par = new parallel (0, 0);
3276 
3277   /* Walk successor blocks.  */
3278   edge e;
3279   edge_iterator ei;
3280 
3281   FOR_EACH_EDGE (e, ei, block->succs)
3282     nvptx_find_par (map, par, e->dest);
3283 
3284   return par;
3285 }
3286 
3287 /* DFS walk the CFG looking for fork & join markers.  Construct
3288    loop structures as we go.  MAP is a mapping of basic blocks
3289    to head & tail markers, discovered when splitting blocks.  This
3290    speeds up the discovery.  We rely on the BB visited flag having
3291    been cleared when splitting blocks.  */
3292 
3293 static parallel *
3294 nvptx_discover_pars (bb_insn_map_t *map)
3295 {
3296   basic_block block;
3297 
3298   /* Mark exit blocks as visited.  */
3299   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3300   block->flags |= BB_VISITED;
3301 
3302   /* And entry block as not.  */
3303   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3304   block->flags &= ~BB_VISITED;
3305 
3306   parallel *par = nvptx_find_par (map, 0, block);
3307 
3308   if (dump_file)
3309     {
3310       fprintf (dump_file, "\nLoops\n");
3311       nvptx_dump_pars (par, 0);
3312       fprintf (dump_file, "\n");
3313     }
3314 
3315   return par;
3316 }
3317 
3318 /* Analyse a group of BBs within a partitioned region and create N
3319    Single-Entry-Single-Exit regions.  Some of those regions will be
3320    trivial ones consisting of a single BB.  The blocks of a
3321    partitioned region might form a set of disjoint graphs -- because
3322    the region encloses a differently partitioned subregion.
3323 
3324    We use the linear time algorithm described in 'Finding Regions Fast:
3325    Single Entry Single Exit and control Regions in Linear Time'
3326    Johnson, Pearson & Pingali.  That algorithm deals with complete
3327    CFGs, where a back edge is inserted from END to START, and thus the
3328    problem becomes one of finding equivalent loops.
3329 
3330    In this case we have a partial CFG.  We complete it by redirecting
3331    any incoming edge to the graph to be from an arbitrary external BB,
3332    and similarly redirecting any outgoing edge to go to that same BB.
3333    Thus we end up with a closed graph.
3334 
3335    The algorithm works by building a spanning tree of an undirected
3336    graph and keeping track of back edges from nodes further from the
3337    root in the tree to nodes nearer to the root in the tree.  In the
3338    description below, the root is up and the tree grows downwards.
3339 
3340    We avoid having to deal with degenerate back-edges to the same
3341    block, by splitting each BB into 3 -- one for input edges, one for
3342    the node itself and one for the output edges.  Such back edges are
3343    referred to as 'Brackets'.  Cycle equivalent nodes will have the
3344    same set of brackets.
3345 
3346    Determining bracket equivalency is done by maintaining a list of
3347    brackets in such a manner that the list length and final bracket
3348    uniquely identify the set.
3349 
3350    We use coloring to mark all BBs with cycle equivalency with the
3351    same color.  This is the output of the 'Finding Regions Fast'
3352    algorithm.  Notice it doesn't actually find the set of nodes within
3353    a particular region, just unordered sets of nodes that are the
3354    entries and exits of SESE regions.
3355 
3356    After determining cycle equivalency, we need to find the minimal
3357    set of SESE regions.  Do this with a DFS coloring walk of the
3358    complete graph.  We're either 'looking' or 'coloring'.  When
3359    looking, and we're in the subgraph, we start coloring with the color of
3360    the current node, and remember that node as the start of the
3361    current color's SESE region.  Every time we go to a new node, we
3362    decrement the count of nodes with that color.  If it reaches zero,
3363    we remember that node as the end of the current color's SESE region
3364    and return to 'looking'.  Otherwise we color the node the current
3365    color.
3366 
3367    This way we end up coloring the inside of non-trivial SESE
3368    regions with the color of that region.  */
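
/* As an illustration of the coloring walk just described: suppose a
   straight-line subgraph A->B->C whose three blocks all received color 0,
   so color_counts[0] is 3.  The walk starts out 'looking'; at A it starts
   coloring with color 0 and records A as the region start; at B the
   remaining count drops to 1; at C it drops to 0, so C is recorded as the
   region end and the walk returns to 'looking'.  The resulting SESE
   region is (A, C), with B in its interior.  */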
3369 
3370 /* A pair of BBs.  We use this to represent SESE regions.  */
3371 typedef std::pair<basic_block, basic_block> bb_pair_t;
3372 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3373 
3374 /* A node in the undirected CFG.  The discriminator SECOND indicates just
3375    above or just below the BB indicated by FIRST.  */
3376 typedef std::pair<basic_block, int> pseudo_node_t;
3377 
3378 /* A bracket indicates an edge towards the root of the spanning tree of the
3379    undirected graph.  Each bracket has a color, determined
3380    from the current set of brackets.  */
3381 struct bracket
3382 {
3383   pseudo_node_t back; /* Back target */
3384 
3385   /* Current color and size of set.  */
3386   unsigned color;
3387   unsigned size;
3388 
3389   bracket (pseudo_node_t back_)
3390   : back (back_), color (~0u), size (~0u)
3391   {
3392   }
3393 
3394   unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3395   {
3396     if (length != size)
3397       {
3398 	size = length;
3399 	color = color_counts.length ();
3400 	color_counts.quick_push (0);
3401       }
3402     color_counts[color]++;
3403     return color;
3404   }
3405 };
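
/* For illustration only (not part of the compiler): a minimal standalone
   sketch of the color-assignment rule above, using std::vector in place
   of auto_vec.  Nodes whose bracket lists end in the same bracket object
   with the same length share a color; any other combination gets a fresh
   one.

     #include <vector>
     #include <cassert>

     struct toy_bracket
     {
       unsigned color = ~0u;
       unsigned size = ~0u;

       unsigned get_color (std::vector<unsigned> &counts, unsigned len)
       {
         if (len != size)
           {
             size = len;
             color = (unsigned) counts.size ();  // Allocate a fresh color.
             counts.push_back (0);
           }
         counts[color]++;                        // One more node with this color.
         return color;
       }
     };

     int main ()
     {
       std::vector<unsigned> counts;
       toy_bracket top;
       unsigned a = top.get_color (counts, 3);   // New color 0.
       unsigned b = top.get_color (counts, 3);   // Same bracket and length: 0.
       unsigned c = top.get_color (counts, 2);   // Length changed: color 1.
       assert (a == b && a != c && counts[a] == 2 && counts[c] == 1);
       return 0;
     }  */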
3406 
3407 typedef auto_vec<bracket> bracket_vec_t;
3408 
3409 /* Basic block info for finding SESE regions.    */
3410 
3411 struct bb_sese
3412 {
3413   int node;  /* Node number in spanning tree.  */
3414   int parent; /* Parent node number.  */
3415 
3416   /* The algorithm splits each node A into Ai, A', Ao. The incoming
3417      edges arrive at pseudo-node Ai and the outgoing edges leave at
3418      pseudo-node Ao.  We have to remember which way we arrived at a
3419      particular node when generating the spanning tree.  dir > 0 means
3420      we arrived at Ai, dir < 0 means we arrived at Ao.  */
3421   int dir;
3422 
3423   /* Lowest numbered pseudo-node reached via a backedge from this
3424      node, or any descendant.  */
3425   pseudo_node_t high;
3426 
3427   int color;  /* Cycle-equivalence color  */
3428 
3429   /* Stack of brackets for this node.  */
3430   bracket_vec_t brackets;
3431 
3432   bb_sese (unsigned node_, unsigned p, int dir_)
3433   :node (node_), parent (p), dir (dir_)
3434   {
3435   }
3436   ~bb_sese ();
3437 
3438   /* Push a bracket ending at BACK.  */
3439   void push (const pseudo_node_t &back)
3440   {
3441     if (dump_file)
3442       fprintf (dump_file, "Pushing backedge %d:%+d\n",
3443 	       back.first ? back.first->index : 0, back.second);
3444     brackets.safe_push (bracket (back));
3445   }
3446 
3447   void append (bb_sese *child);
3448   void remove (const pseudo_node_t &);
3449 
3450   /* Set node's color.  */
3451   void set_color (auto_vec<unsigned> &color_counts)
3452   {
3453     color = brackets.last ().get_color (color_counts, brackets.length ());
3454   }
3455 };
3456 
3457 bb_sese::~bb_sese ()
3458 {
3459 }
3460 
3461 /* Destructively append CHILD's brackets.  */
3462 
3463 void
3464 bb_sese::append (bb_sese *child)
3465 {
3466   if (int len = child->brackets.length ())
3467     {
3468       int ix;
3469 
3470       if (dump_file)
3471 	{
3472 	  for (ix = 0; ix < len; ix++)
3473 	    {
3474 	      const pseudo_node_t &pseudo = child->brackets[ix].back;
3475 	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3476 		       child->node, pseudo.first ? pseudo.first->index : 0,
3477 		       pseudo.second);
3478 	    }
3479 	}
3480       if (!brackets.length ())
3481 	std::swap (brackets, child->brackets);
3482       else
3483 	{
3484 	  brackets.reserve (len);
3485 	  for (ix = 0; ix < len; ix++)
3486 	    brackets.quick_push (child->brackets[ix]);
3487 	}
3488     }
3489 }
3490 
3491 /* Remove brackets that terminate at PSEUDO.  */
3492 
3493 void
3494 bb_sese::remove (const pseudo_node_t &pseudo)
3495 {
3496   unsigned removed = 0;
3497   int len = brackets.length ();
3498 
3499   for (int ix = 0; ix < len; ix++)
3500     {
3501       if (brackets[ix].back == pseudo)
3502 	{
3503 	  if (dump_file)
3504 	    fprintf (dump_file, "Removing backedge %d:%+d\n",
3505 		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
3506 	  removed++;
3507 	}
3508       else if (removed)
3509 	brackets[ix-removed] = brackets[ix];
3510     }
3511   while (removed--)
3512     brackets.pop ();
3513 }
3514 
3515 /* Accessors for BB's aux pointer.  */
3516 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3517 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3518 
3519 /* DFS walk creating SESE data structures.  Only cover nodes with
3520    BB_VISITED set.  Append discovered blocks to LIST.  We number in
3521    increments of 3 so that the above and below pseudo nodes can be
3522    implicitly numbered too (block N's pseudo nodes are N-1 and N+1).  */
3523 
3524 static int
3525 nvptx_sese_number (int n, int p, int dir, basic_block b,
3526 		   auto_vec<basic_block> *list)
3527 {
3528   if (BB_GET_SESE (b))
3529     return n;
3530 
3531   if (dump_file)
3532     fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3533 	     b->index, n, p, dir);
3534 
3535   BB_SET_SESE (b, new bb_sese (n, p, dir));
3536   p = n;
3537 
3538   n += 3;
3539   list->quick_push (b);
3540 
3541   /* First walk the nodes on the 'other side' of this node, then walk
3542      the nodes on the same side.  */
3543   for (unsigned ix = 2; ix; ix--)
3544     {
3545       vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3546       size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3547 		       : offsetof (edge_def, src));
3548       edge e;
3549       edge_iterator ei;
3550 
3551       FOR_EACH_EDGE (e, ei, edges)
3552 	{
3553 	  basic_block target = *(basic_block *)((char *)e + offset);
3554 
3555 	  if (target->flags & BB_VISITED)
3556 	    n = nvptx_sese_number (n, p, dir, target, list);
3557 	}
3558       dir = -dir;
3559     }
3560   return n;
3561 }
3562 
3563 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3564    EDGES are the outgoing edges and OFFSET is the offset to the src
3565    or dst block on the edges.   */
3566 
3567 static void
3568 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3569 		   vec<edge, va_gc> *edges, size_t offset)
3570 {
3571   edge e;
3572   edge_iterator ei;
3573   int hi_back = depth;
3574   pseudo_node_t node_back (NULL, depth);
3575   int hi_child = depth;
3576   pseudo_node_t node_child (NULL, depth);
3577   basic_block child = NULL;
3578   unsigned num_children = 0;
3579   int usd = -dir * sese->dir;
3580 
3581   if (dump_file)
3582     fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3583 	     me->index, sese->node, dir);
3584 
3585   if (dir < 0)
3586     {
3587       /* This is the above pseudo-child.  It has the BB itself as an
3588 	 additional child node.  */
3589       node_child = sese->high;
3590       hi_child = node_child.second;
3591       if (node_child.first)
3592 	hi_child += BB_GET_SESE (node_child.first)->node;
3593       num_children++;
3594     }
3595 
3596   /* Examine each edge.
3597      - if it is a child (a) append its bracket list and (b) record
3598           whether it is the child with the highest reaching bracket.
3599      - if it is an edge to an ancestor, record whether it's the highest
3600           reaching backlink.  */
3601   FOR_EACH_EDGE (e, ei, edges)
3602     {
3603       basic_block target = *(basic_block *)((char *)e + offset);
3604 
3605       if (bb_sese *t_sese = BB_GET_SESE (target))
3606 	{
3607 	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3608 	    {
3609 	      /* Child node.  Append its bracket list. */
3610 	      num_children++;
3611 	      sese->append (t_sese);
3612 
3613               /* Compare its hi value.  */
3614 	      int t_hi = t_sese->high.second;
3615 
3616 	      if (basic_block child_hi_block = t_sese->high.first)
3617 		t_hi += BB_GET_SESE (child_hi_block)->node;
3618 
3619 	      if (hi_child > t_hi)
3620 		{
3621 		  hi_child = t_hi;
3622 		  node_child = t_sese->high;
3623 		  child = target;
3624 		}
3625 	    }
3626 	  else if (t_sese->node < sese->node + dir
3627 		   && !(dir < 0 && sese->parent == t_sese->node))
3628 	    {
3629 	      /* Non-parental ancestor node -- a backlink.  */
3630 	      int d = usd * t_sese->dir;
3631 	      int back = t_sese->node + d;
3632 
3633 	      if (hi_back > back)
3634 		{
3635 		  hi_back = back;
3636 		  node_back = pseudo_node_t (target, d);
3637 		}
3638 	    }
3639 	}
3640       else
3641 	{ /* Fallen off graph, backlink to entry node.  */
3642 	  hi_back = 0;
3643 	  node_back = pseudo_node_t (NULL, 0);
3644 	}
3645     }
3646 
3647   /* Remove any brackets that terminate at this pseudo node.  */
3648   sese->remove (pseudo_node_t (me, dir));
3649 
3650   /* Now push any backlinks from this pseudo node.  */
3651   FOR_EACH_EDGE (e, ei, edges)
3652     {
3653       basic_block target = *(basic_block *)((char *)e + offset);
3654       if (bb_sese *t_sese = BB_GET_SESE (target))
3655 	{
3656 	  if (t_sese->node < sese->node + dir
3657 	      && !(dir < 0 && sese->parent == t_sese->node))
3658 	    /* Non-parental ancestor node - backedge from me.  */
3659 	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
3660 	}
3661       else
3662 	{
3663           /* Back edge to entry node.  */
3664 	  sese->push (pseudo_node_t (NULL, 0));
3665 	}
3666     }
3667 
3668   /* If this node leads directly or indirectly to a no-return region of
3669      the graph, then fake a backedge to entry node.  */
3670   if (!sese->brackets.length () || !edges || !edges->length ())
3671     {
3672       hi_back = 0;
3673       node_back = pseudo_node_t (NULL, 0);
3674       sese->push (node_back);
3675     }
3676 
3677   /* Record the highest reaching backedge from us or a descendant.  */
3678   sese->high = hi_back < hi_child ? node_back : node_child;
3679 
3680   if (num_children > 1)
3681     {
3682       /* There is more than one child -- this is a Y shaped piece of
3683 	 spanning tree.  We have to insert a fake backedge from this
3684 	 node to the highest ancestor reached by not-the-highest
3685 	 reaching child.  Note that there may be multiple children
3686 	 with backedges to the same highest node.  That's ok and we
3687 	 insert the edge to that highest node.  */
3688       hi_child = depth;
3689       if (dir < 0 && child)
3690 	{
3691 	  node_child = sese->high;
3692 	  hi_child = node_child.second;
3693 	  if (node_child.first)
3694 	    hi_child += BB_GET_SESE (node_child.first)->node;
3695 	}
3696 
3697       FOR_EACH_EDGE (e, ei, edges)
3698 	{
3699 	  basic_block target = *(basic_block *)((char *)e + offset);
3700 
3701 	  if (target == child)
3702 	    /* Ignore the highest child. */
3703 	    continue;
3704 
3705 	  bb_sese *t_sese = BB_GET_SESE (target);
3706 	  if (!t_sese)
3707 	    continue;
3708 	  if (t_sese->parent != sese->node)
3709 	    /* Not a child. */
3710 	    continue;
3711 
3712 	  /* Compare its hi value.  */
3713 	  int t_hi = t_sese->high.second;
3714 
3715 	  if (basic_block child_hi_block = t_sese->high.first)
3716 	    t_hi += BB_GET_SESE (child_hi_block)->node;
3717 
3718 	  if (hi_child > t_hi)
3719 	    {
3720 	      hi_child = t_hi;
3721 	      node_child = t_sese->high;
3722 	    }
3723 	}
3724 
3725       sese->push (node_child);
3726     }
3727 }
3728 
3729 
3730 /* DFS walk of BB graph.  Color node BLOCK according to COLORING then
3731    proceed to successors.  Set SESE entry and exit nodes of
3732    REGIONS.  */
3733 
3734 static void
3735 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3736 		  basic_block block, int coloring)
3737 {
3738   bb_sese *sese = BB_GET_SESE (block);
3739 
3740   if (block->flags & BB_VISITED)
3741     {
3742       /* If we've already encountered this block, either we must not
3743 	 be coloring, or it must have been colored the current color.  */
3744       gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3745       return;
3746     }
3747 
3748   block->flags |= BB_VISITED;
3749 
3750   if (sese)
3751     {
3752       if (coloring < 0)
3753 	{
3754 	  /* Start coloring a region.  */
3755 	  regions[sese->color].first = block;
3756 	  coloring = sese->color;
3757 	}
3758 
3759       if (!--color_counts[sese->color] && sese->color == coloring)
3760 	{
3761 	  /* Found final block of SESE region.  */
3762 	  regions[sese->color].second = block;
3763 	  coloring = -1;
3764 	}
3765       else
3766 	/* Color the node, so we can assert on revisiting the node
3767 	   that the graph is indeed SESE.  */
3768 	sese->color = coloring;
3769     }
3770   else
3771     /* Fallen off the subgraph, we cannot be coloring.  */
3772     gcc_assert (coloring < 0);
3773 
3774   /* Walk each successor block.  */
3775   if (block->succs && block->succs->length ())
3776     {
3777       edge e;
3778       edge_iterator ei;
3779 
3780       FOR_EACH_EDGE (e, ei, block->succs)
3781 	nvptx_sese_color (color_counts, regions, e->dest, coloring);
3782     }
3783   else
3784     gcc_assert (coloring < 0);
3785 }
3786 
3787 /* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
3788    end up with NULL entries in it.  */
3789 
3790 static void
3791 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3792 {
3793   basic_block block;
3794   int ix;
3795 
3796   /* First clear each BB of the whole function.  */
3797   FOR_ALL_BB_FN (block, cfun)
3798     {
3799       block->flags &= ~BB_VISITED;
3800       BB_SET_SESE (block, 0);
3801     }
3802 
3803   /* Mark blocks in the function that are in this graph.  */
3804   for (ix = 0; blocks.iterate (ix, &block); ix++)
3805     block->flags |= BB_VISITED;
3806 
3807   /* Counts of nodes assigned to each color.  There cannot be more
3808      colors than blocks (and hopefully there will be fewer).  */
3809   auto_vec<unsigned> color_counts;
3810   color_counts.reserve (blocks.length ());
3811 
3812   /* Worklist of nodes in the spanning tree.  Again, there cannot be
3813      more nodes in the tree than blocks (there will be fewer if the
3814      CFG of blocks is disjoint).  */
3815   auto_vec<basic_block> spanlist;
3816   spanlist.reserve (blocks.length ());
3817 
3818   /* Make sure every block has its cycle class determined.  */
3819   for (ix = 0; blocks.iterate (ix, &block); ix++)
3820     {
3821       if (BB_GET_SESE (block))
3822 	/* We already met this block in an earlier graph solve.  */
3823 	continue;
3824 
3825       if (dump_file)
3826 	fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3827 
3828       /* Number the nodes reachable from block in initial DFS order.  */
3829       int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3830 
3831       /* Now walk in reverse DFS order to find cycle equivalents.  */
3832       while (spanlist.length ())
3833 	{
3834 	  block = spanlist.pop ();
3835 	  bb_sese *sese = BB_GET_SESE (block);
3836 
3837 	  /* Do the pseudo node below.  */
3838 	  nvptx_sese_pseudo (block, sese, depth, +1,
3839 			     sese->dir > 0 ? block->succs : block->preds,
3840 			     (sese->dir > 0 ? offsetof (edge_def, dest)
3841 			      : offsetof (edge_def, src)));
3842 	  sese->set_color (color_counts);
3843 	  /* Do the pseudo node above.  */
3844 	  nvptx_sese_pseudo (block, sese, depth, -1,
3845 			     sese->dir < 0 ? block->succs : block->preds,
3846 			     (sese->dir < 0 ? offsetof (edge_def, dest)
3847 			      : offsetof (edge_def, src)));
3848 	}
3849       if (dump_file)
3850 	fprintf (dump_file, "\n");
3851     }
3852 
3853   if (dump_file)
3854     {
3855       unsigned count;
3856       const char *comma = "";
3857 
3858       fprintf (dump_file, "Found %d cycle equivalents\n",
3859 	       color_counts.length ());
3860       for (ix = 0; color_counts.iterate (ix, &count); ix++)
3861 	{
3862 	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3863 
3864 	  comma = "";
3865 	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3866 	    if (BB_GET_SESE (block)->color == ix)
3867 	      {
3868 		block->flags |= BB_VISITED;
3869 		fprintf (dump_file, "%s%d", comma, block->index);
3870 		comma=",";
3871 	      }
3872 	  fprintf (dump_file, "}");
3873 	  comma = ", ";
3874 	}
3875       fprintf (dump_file, "\n");
3876     }
3877 
3878   /* Now we've colored every block in the subgraph.  We now need to
3879      determine the minimal set of SESE regions that cover that
3880      subgraph.  Do this with a DFS walk of the complete function.
3881      During the walk we're either 'looking' or 'coloring'.  When we
3882      reach the last node of a particular color, we stop coloring and
3883      return to looking.  */
3884 
3885   /* There cannot be more SESE regions than colors.  */
3886   regions.reserve (color_counts.length ());
3887   for (ix = color_counts.length (); ix--;)
3888     regions.quick_push (bb_pair_t (0, 0));
3889 
3890   for (ix = 0; blocks.iterate (ix, &block); ix++)
3891     block->flags &= ~BB_VISITED;
3892 
3893   nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3894 
3895   if (dump_file)
3896     {
3897       const char *comma = "";
3898       int len = regions.length ();
3899 
3900       fprintf (dump_file, "SESE regions:");
3901       for (ix = 0; ix != len; ix++)
3902 	{
3903 	  basic_block from = regions[ix].first;
3904 	  basic_block to = regions[ix].second;
3905 
3906 	  if (from)
3907 	    {
3908 	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3909 	      if (to != from)
3910 		fprintf (dump_file, "->%d", to->index);
3911 
3912 	      int color = BB_GET_SESE (from)->color;
3913 
3914 	      /* Print the blocks within the region (excluding ends).  */
3915 	      FOR_EACH_BB_FN (block, cfun)
3916 		{
3917 		  bb_sese *sese = BB_GET_SESE (block);
3918 
3919 		  if (sese && sese->color == color
3920 		      && block != from && block != to)
3921 		    fprintf (dump_file, ".%d", block->index);
3922 		}
3923 	      fprintf (dump_file, "}");
3924 	    }
3925 	  comma = ",";
3926 	}
3927       fprintf (dump_file, "\n\n");
3928     }
3929 
3930   for (ix = 0; blocks.iterate (ix, &block); ix++)
3931     delete BB_GET_SESE (block);
3932 }
3933 
3934 #undef BB_SET_SESE
3935 #undef BB_GET_SESE
3936 
3937 /* Propagate live state at the start of a partitioned region.  IS_CALL
3938    indicates whether the propagation is for a (partitioned) call
3939    instruction.  BLOCK provides the live register information, and
3940    might not contain INSN. Propagation is inserted just after INSN. RW
3941    indicates whether we are reading and/or writing state.  This
3942    separation is needed for worker-level propagation where we
3943    essentially do a spill & fill.  FN is the underlying worker
3944    function to generate the propagation instructions for a single
3945    register.  DATA is user data.
3946 
3947    Returns true if we didn't emit any instructions.
3948 
3949    We propagate the live register set for non-calls and the entire
3950    frame for calls and non-calls.  We could do better by (a)
3951    propagating just the live set that is used within the partitioned
3952    regions and (b) only propagating stack entries that are used.  The
3953    latter might be quite hard to determine.  */
3954 
3955 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
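
/* For illustration only: a rough C-level sketch of the frame-copy loop
   that nvptx_propagate builds below.  The helper 'propagate_slot' is a
   hypothetical stand-in for FN; the real code emits RTL, peels the
   single-slot case, and may perform only the read or only the write
   side.

     #include <stddef.h>
     #include <stdint.h>

     extern void propagate_slot (uint64_t *);   // hypothetical stand-in

     static void
     propagate_frame (uint64_t *frame, size_t nslots)
     {
       uint64_t *ptr = frame;
       for (size_t idx = nslots; idx > 0; idx--, ptr++)
         {
           uint64_t tmp = *ptr;       // PM_read: load the frame slot.
           propagate_slot (&tmp);     // Broadcast / spill & fill one value.
           *ptr = tmp;                // PM_write: store the frame slot.
         }
     }  */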
3956 
3957 static bool
3958 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3959 		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3960 {
3961   bitmap live = DF_LIVE_IN (block);
3962   bitmap_iterator iterator;
3963   unsigned ix;
3964   bool empty = true;
3965 
3966   /* Copy the frame array.  */
3967   HOST_WIDE_INT fs = get_frame_size ();
3968   if (fs)
3969     {
3970       rtx tmp = gen_reg_rtx (DImode);
3971       rtx idx = NULL_RTX;
3972       rtx ptr = gen_reg_rtx (Pmode);
3973       rtx pred = NULL_RTX;
3974       rtx_code_label *label = NULL;
3975 
3976       empty = false;
3977       /* The frame size might not be DImode compatible, but the frame
3978 	 array's declaration will be.  So it's ok to round up here.  */
3979       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3980       /* Detect single iteration loop. */
3981       if (fs == 1)
3982 	fs = 0;
3983 
3984       start_sequence ();
3985       emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3986       if (fs)
3987 	{
3988 	  idx = gen_reg_rtx (SImode);
3989 	  pred = gen_reg_rtx (BImode);
3990 	  label = gen_label_rtx ();
3991 
3992 	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3993 	  /* Allow worker function to initialize anything needed.  */
3994 	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
3995 	  if (init)
3996 	    emit_insn (init);
3997 	  emit_label (label);
3998 	  LABEL_NUSES (label)++;
3999 	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4000 	}
4001       if (rw & PM_read)
4002 	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4003       emit_insn (fn (tmp, rw, fs, data, vector));
4004       if (rw & PM_write)
4005 	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4006       if (fs)
4007 	{
4008 	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4009 	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4010 	  emit_insn (gen_br_true_uni (pred, label));
4011 	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4012 	  if (fini)
4013 	    emit_insn (fini);
4014 	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4015 	}
4016       emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4017       emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4018       rtx cpy = get_insns ();
4019       end_sequence ();
4020       insn = emit_insn_after (cpy, insn);
4021     }
4022 
4023   if (!is_call)
4024     /* Copy live registers.  */
4025     EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4026       {
4027 	rtx reg = regno_reg_rtx[ix];
4028 
4029 	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4030 	  {
4031 	    rtx bcast = fn (reg, rw, 0, data, vector);
4032 
4033 	    insn = emit_insn_after (bcast, insn);
4034 	    empty = false;
4035 	  }
4036       }
4037   return empty;
4038 }
4039 
4040 /* Worker for nvptx_warp_propagate.  */
4041 
4042 static rtx
4043 warp_prop_gen (rtx reg, propagate_mask pm,
4044 	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4045 	       bool ARG_UNUSED (vector))
4046 {
4047   if (!(pm & PM_read_write))
4048     return 0;
4049 
4050   return nvptx_gen_warp_bcast (reg);
4051 }
4052 
4053 /* Propagate state that is live at start of BLOCK across the vectors
4054    of a single warp.  Propagation is inserted just after INSN.
4055    IS_CALL and return as for nvptx_propagate.  */
4056 
4057 static bool
4058 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4059 {
4060   return nvptx_propagate (is_call, block, insn, PM_read_write,
4061 			  warp_prop_gen, 0, false);
4062 }
4063 
4064 /* Worker for nvptx_shared_propagate.  */
4065 
4066 static rtx
4067 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4068 		 bool vector)
4069 {
4070   broadcast_data_t *data = (broadcast_data_t *)data_;
4071 
4072   if (pm & PM_loop_begin)
4073     {
4074       /* Starting a loop, initialize pointer.    */
4075       unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4076 
4077       oacc_bcast_align = MAX (oacc_bcast_align, align);
4078       data->offset = ROUND_UP (data->offset, align);
4079 
4080       data->ptr = gen_reg_rtx (Pmode);
4081 
4082       return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4083     }
4084   else if (pm & PM_loop_end)
4085     {
4086       rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4087       data->ptr = NULL_RTX;
4088       return clobber;
4089     }
4090   else
4091     return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4092 }
4093 
4094 /* Spill or fill the state that is live at the start of BLOCK.  PRE_P
4095    indicates if this is just before partitioned mode (do spill), or
4096    just after it starts (do fill). Sequence is inserted just after
4097    INSN.  IS_CALL and return as for nvptx_propagate.  */
4098 
4099 static bool
4100 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4101 			rtx_insn *insn, bool vector)
4102 {
4103   broadcast_data_t data;
4104 
4105   data.base = gen_reg_rtx (Pmode);
4106   data.offset = 0;
4107   data.ptr = NULL_RTX;
4108 
4109   bool empty = nvptx_propagate (is_call, block, insn,
4110 				pre_p ? PM_read : PM_write, shared_prop_gen,
4111 				&data, vector);
4112   gcc_assert (empty == !data.offset);
4113   if (data.offset)
4114     {
4115       rtx bcast_sym = oacc_bcast_sym;
4116 
4117       /* Stuff was emitted, initialize the base pointer now.  */
4118       if (vector && nvptx_mach_max_workers () > 1)
4119 	{
4120 	  if (!cfun->machine->bcast_partition)
4121 	    {
4122 	      /* It would be nice to place this register in
4123 		 DATA_AREA_SHARED.  */
4124 	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4125 	    }
4126 	  if (!cfun->machine->sync_bar)
4127 	    cfun->machine->sync_bar = gen_reg_rtx (SImode);
4128 
4129 	  bcast_sym = cfun->machine->bcast_partition;
4130 	}
4131 
4132       rtx init = gen_rtx_SET (data.base, bcast_sym);
4133       emit_insn_after (init, insn);
4134 
4135       unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4136       unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4137 			   ? nvptx_mach_max_workers () + 1
4138 			   : 1);
4139 
4140       oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4141       oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4142     }
4143   return empty;
4144 }
4145 
4146 /* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
4147    which is an integer or a register.  THREADS is the number of threads
4148    controlled by the barrier.  */
4149 
4150 static rtx
4151 nvptx_cta_sync (rtx lock, int threads)
4152 {
4153   return gen_nvptx_barsync (lock, GEN_INT (threads));
4154 }
4155 
4156 #if WORKAROUND_PTXJIT_BUG
4157 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4158    real insns.  */
4159 
4160 static rtx_insn *
4161 bb_first_real_insn (basic_block bb)
4162 {
4163   rtx_insn *insn;
4164 
4165   /* Find the first real insn in BB.  */
4166   FOR_BB_INSNS (bb, insn)
4167     if (INSN_P (insn))
4168       return insn;
4169 
4170   return 0;
4171 }
4172 #endif
4173 
4174 /* Return true if INSN needs neutering.  */
4175 
4176 static bool
4177 needs_neutering_p (rtx_insn *insn)
4178 {
4179   if (!INSN_P (insn))
4180     return false;
4181 
4182   switch (recog_memoized (insn))
4183     {
4184     case CODE_FOR_nvptx_fork:
4185     case CODE_FOR_nvptx_forked:
4186     case CODE_FOR_nvptx_joining:
4187     case CODE_FOR_nvptx_join:
4188     case CODE_FOR_nvptx_barsync:
4189       return false;
4190     default:
4191       return true;
4192     }
4193 }
4194 
4195 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */
4196 
4197 static bool
4198 verify_neutering_jumps (basic_block from,
4199 			rtx_insn *vector_jump, rtx_insn *worker_jump,
4200 			rtx_insn *vector_label, rtx_insn *worker_label)
4201 {
4202   basic_block bb = from;
4203   rtx_insn *insn = BB_HEAD (bb);
4204   bool seen_worker_jump = false;
4205   bool seen_vector_jump = false;
4206   bool seen_worker_label = false;
4207   bool seen_vector_label = false;
4208   bool worker_neutered = false;
4209   bool vector_neutered = false;
4210   while (true)
4211     {
4212       if (insn == worker_jump)
4213 	{
4214 	  seen_worker_jump = true;
4215 	  worker_neutered = true;
4216 	  gcc_assert (!vector_neutered);
4217 	}
4218       else if (insn == vector_jump)
4219 	{
4220 	  seen_vector_jump = true;
4221 	  vector_neutered = true;
4222 	}
4223       else if (insn == worker_label)
4224 	{
4225 	  seen_worker_label = true;
4226 	  gcc_assert (worker_neutered);
4227 	  worker_neutered = false;
4228 	}
4229       else if (insn == vector_label)
4230 	{
4231 	  seen_vector_label = true;
4232 	  gcc_assert (vector_neutered);
4233 	  vector_neutered = false;
4234 	}
4235       else if (INSN_P (insn))
4236 	switch (recog_memoized (insn))
4237 	  {
4238 	  case CODE_FOR_nvptx_barsync:
4239 	    gcc_assert (!vector_neutered && !worker_neutered);
4240 	    break;
4241 	  default:
4242 	    break;
4243 	  }
4244 
4245       if (insn != BB_END (bb))
4246 	insn = NEXT_INSN (insn);
4247       else if (JUMP_P (insn) && single_succ_p (bb)
4248 	       && !seen_vector_jump && !seen_worker_jump)
4249 	{
4250 	  bb = single_succ (bb);
4251 	  insn = BB_HEAD (bb);
4252 	}
4253       else
4254 	break;
4255     }
4256 
4257   gcc_assert (!(vector_jump && !seen_vector_jump));
4258   gcc_assert (!(worker_jump && !seen_worker_jump));
4259 
4260   if (seen_vector_label || seen_worker_label)
4261     {
4262       gcc_assert (!(vector_label && !seen_vector_label));
4263       gcc_assert (!(worker_label && !seen_worker_label));
4264 
4265       return true;
4266     }
4267 
4268   return false;
4269 }
4270 
4271 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */
4272 
4273 static void
4274 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4275 			 rtx_insn *worker_label)
4276 {
4277   basic_block bb = to;
4278   rtx_insn *insn = BB_END (bb);
4279   bool seen_worker_label = false;
4280   bool seen_vector_label = false;
4281   while (true)
4282     {
4283       if (insn == worker_label)
4284 	{
4285 	  seen_worker_label = true;
4286 	  gcc_assert (!seen_vector_label);
4287 	}
4288       else if (insn == vector_label)
4289 	seen_vector_label = true;
4290       else if (INSN_P (insn))
4291 	switch (recog_memoized (insn))
4292 	  {
4293 	  case CODE_FOR_nvptx_barsync:
4294 	    gcc_assert (!seen_vector_label && !seen_worker_label);
4295 	    break;
4296 	  }
4297 
4298       if (insn != BB_HEAD (bb))
4299 	insn = PREV_INSN (insn);
4300       else
4301 	break;
4302     }
4303 
4304   gcc_assert (!(vector_label && !seen_vector_label));
4305   gcc_assert (!(worker_label && !seen_worker_label));
4306 }
4307 
4308 /* Single neutering according to MASK.  FROM is the incoming block and
4309    TO is the outgoing block.  These may be the same block. Insert at
4310    start of FROM:
4311 
4312      if (tid.<axis>) goto end.
4313 
4314    and insert before ending branch of TO (if there is such an insn):
4315 
4316      end:
4317      <possibly-broadcast-cond>
4318      <branch>
4319 
4320    We currently only use different FROM and TO when skipping an entire
4321    loop.  We could do more if we detected superblocks.  */
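
/* For example (illustrative PTX; the predicate registers and labels are
   invented), neutering a single block for both the worker and vector
   axes is expected to come out roughly as:

        @%rw    bra.uni $Lw;    // taken when tid.y != 0: only worker 0 falls through
        @%rv    bra     $Lv;    // taken when tid.x != 0: only lane 0 falls through
          ... block body, now executed by a single thread ...
     $Lv:
     $Lw:
          ... broadcast of any branch condition, then the branch ...  */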
4322 
4323 static void
4324 nvptx_single (unsigned mask, basic_block from, basic_block to)
4325 {
4326   rtx_insn *head = BB_HEAD (from);
4327   rtx_insn *tail = BB_END (to);
4328   unsigned skip_mask = mask;
4329 
4330   while (true)
4331     {
4332       /* Find first insn of from block.  */
4333       while (head != BB_END (from) && !needs_neutering_p (head))
4334 	head = NEXT_INSN (head);
4335 
4336       if (from == to)
4337 	break;
4338 
4339       if (!(JUMP_P (head) && single_succ_p (from)))
4340 	break;
4341 
4342       basic_block jump_target = single_succ (from);
4343       if (!single_pred_p (jump_target))
4344 	break;
4345 
4346       from = jump_target;
4347       head = BB_HEAD (from);
4348     }
4349 
4350   /* Find the last insn of the to block.  */
4351   rtx_insn *limit = from == to ? head : BB_HEAD (to);
4352   while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4353     tail = PREV_INSN (tail);
4354 
4355   /* Detect if tail is a branch.  */
4356   rtx tail_branch = NULL_RTX;
4357   rtx cond_branch = NULL_RTX;
4358   if (tail && INSN_P (tail))
4359     {
4360       tail_branch = PATTERN (tail);
4361       if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4362 	tail_branch = NULL_RTX;
4363       else
4364 	{
4365 	  cond_branch = SET_SRC (tail_branch);
4366 	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4367 	    cond_branch = NULL_RTX;
4368 	}
4369     }
4370 
4371   if (tail == head)
4372     {
4373       /* If this is empty, do nothing.  */
4374       if (!head || !needs_neutering_p (head))
4375 	return;
4376 
4377       if (cond_branch)
4378 	{
4379 	  /* If we're only doing vector single, there's no need to
4380 	     emit skip code because we'll not insert anything.  */
4381 	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4382 	    skip_mask = 0;
4383 	}
4384       else if (tail_branch)
4385 	/* Block with only unconditional branch.  Nothing to do.  */
4386 	return;
4387     }
4388 
4389   /* Insert the vector test inside the worker test.  */
4390   unsigned mode;
4391   rtx_insn *before = tail;
4392   rtx_insn *neuter_start = NULL;
4393   rtx_insn *worker_label = NULL, *vector_label = NULL;
4394   rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4395   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4396     if (GOMP_DIM_MASK (mode) & skip_mask)
4397       {
4398 	rtx_code_label *label = gen_label_rtx ();
4399 	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4400 	rtx_insn **mode_jump
4401 	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4402 	rtx_insn **mode_label
4403 	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4404 
4405 	if (!pred)
4406 	  {
4407 	    pred = gen_reg_rtx (BImode);
4408 	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4409 	  }
4410 
4411 	rtx br;
4412 	if (mode == GOMP_DIM_VECTOR)
4413 	  br = gen_br_true (pred, label);
4414 	else
4415 	  br = gen_br_true_uni (pred, label);
4416 	if (neuter_start)
4417 	  neuter_start = emit_insn_after (br, neuter_start);
4418 	else
4419 	  neuter_start = emit_insn_before (br, head);
4420 	*mode_jump = neuter_start;
4421 
4422 	LABEL_NUSES (label)++;
4423 	rtx_insn *label_insn;
4424 	if (tail_branch)
4425 	  {
4426 	    label_insn = emit_label_before (label, before);
4427 	    before = label_insn;
4428 	  }
4429 	else
4430 	  {
4431 	    label_insn = emit_label_after (label, tail);
4432 	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4433 		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4434 	      emit_insn_after (gen_exit (), label_insn);
4435 	  }
4436 
4437 	*mode_label = label_insn;
4438       }
4439 
4440   /* Now deal with propagating the branch condition.  */
4441   if (cond_branch)
4442     {
4443       rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4444 
4445       if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4446 	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4447 	{
4448 	  /* Vector mode only, do a shuffle.  */
4449 #if WORKAROUND_PTXJIT_BUG
4450 	  /* The branch condition %rcond is propagated like this:
4451 
4452 		{
4453 		    .reg .u32 %x;
4454 		    mov.u32 %x,%tid.x;
4455 		    setp.ne.u32 %rnotvzero,%x,0;
4456 		 }
4457 
4458 		 @%rnotvzero bra Lskip;
4459 		 setp.<op>.<type> %rcond,op1,op2;
4460 		 Lskip:
4461 		 selp.u32 %rcondu32,1,0,%rcond;
4462 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4463 		 setp.ne.u32 %rcond,%rcondu32,0;
4464 
4465 	     There seems to be a bug in the ptx JIT compiler (observed at driver
4466 	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
4467 	     unless %rcond is initialized to something before 'bra Lskip'.  The
4468 	     bug is not observed with ptxas from cuda 8.0.61.
4469 
4470 	     It is true that the code is non-trivial: at Lskip, %rcond is
4471 	     uninitialized in threads 1-31, and after the selp the same holds
4472 	     for %rcondu32.  But shfl propagates the defined value in thread 0
4473 	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
4474 	     0-31, and after the setp.ne %rcond is defined in threads 0-31.
4475 
4476 	     There is nothing in the PTX spec to suggest that this is wrong, or
4477 	     to explain why the extra initialization is needed.  So, we classify
4478 	     it as a JIT bug, and the extra initialization as workaround:
4479 
4480 		{
4481 		    .reg .u32 %x;
4482 		    mov.u32 %x,%tid.x;
4483 		    setp.ne.u32 %rnotvzero,%x,0;
4484 		}
4485 
4486 		+.reg .pred %rcond2;
4487 		+setp.eq.u32 %rcond2, 1, 0;
4488 
4489 		 @%rnotvzero bra Lskip;
4490 		 setp.<op>.<type> %rcond,op1,op2;
4491 		+mov.pred %rcond2, %rcond;
4492 		 Lskip:
4493 		+mov.pred %rcond, %rcond2;
4494 		 selp.u32 %rcondu32,1,0,%rcond;
4495 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4496 		 setp.ne.u32 %rcond,%rcondu32,0;
4497 	  */
4498 	  rtx_insn *label = PREV_INSN (tail);
4499 	  gcc_assert (label && LABEL_P (label));
4500 	  rtx tmp = gen_reg_rtx (BImode);
4501 	  emit_insn_before (gen_movbi (tmp, const0_rtx),
4502 			    bb_first_real_insn (from));
4503 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4504 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4505 #endif
4506 	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4507 	}
4508       else
4509 	{
4510 	  /* Includes worker mode, do spill & fill.  By construction
4511 	     we should never have worker mode only. */
4512 	  broadcast_data_t data;
4513 	  unsigned size = GET_MODE_SIZE (SImode);
4514 	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4515 	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4516 	  rtx barrier = GEN_INT (0);
4517 	  int threads = 0;
4518 
4519 	  data.base = oacc_bcast_sym;
4520 	  data.ptr = 0;
4521 
4522 	  bool use_partitioning_p = (vector && !worker
4523 				     && nvptx_mach_max_workers () > 1
4524 				     && cfun->machine->bcast_partition);
4525 	  if (use_partitioning_p)
4526 	    {
4527 	      data.base = cfun->machine->bcast_partition;
4528 	      barrier = cfun->machine->sync_bar;
4529 	      threads = nvptx_mach_vector_length ();
4530 	    }
4531 	  gcc_assert (data.base != NULL);
4532 	  gcc_assert (barrier);
4533 
4534 	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4535 	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4536 			       ? nvptx_mach_max_workers () + 1
4537 			       : 1);
4538 
4539 	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4540 	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4541 
4542 	  data.offset = 0;
4543 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4544 						    vector),
4545 			    before);
4546 
4547 	  /* Barrier so other workers can see the write.  */
4548 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4549 	  data.offset = 0;
4550 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4551 						    vector),
4552 			    tail);
4553 	  /* This barrier is needed to avoid worker zero clobbering
4554 	     the broadcast buffer before all the other workers have
4555 	     had a chance to read this instance of it.  */
4556 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4557 	}
4558 
4559       extract_insn (tail);
4560       rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4561 				 UNSPEC_BR_UNIFIED);
4562       validate_change (tail, recog_data.operand_loc[0], unsp, false);
4563     }
4564 
4565   bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4566 					    vector_label, worker_label);
4567   if (!seen_label)
4568     verify_neutering_labels (to, vector_label, worker_label);
4569 }
4570 
4571 /* PAR is a parallel that is being skipped in its entirety according to
4572    MASK.  Treat this as skipping a superblock starting at forked
4573    and ending at joining.  */
4574 
4575 static void
4576 nvptx_skip_par (unsigned mask, parallel *par)
4577 {
4578   basic_block tail = par->join_block;
4579   gcc_assert (tail->preds->length () == 1);
4580 
4581   basic_block pre_tail = (*tail->preds)[0]->src;
4582   gcc_assert (pre_tail->succs->length () == 1);
4583 
4584   nvptx_single (mask, par->forked_block, pre_tail);
4585 }
4586 
4587 /* If PAR has a single inner parallel and PAR itself only contains
4588    empty entry and exit blocks, swallow the inner PAR.  */
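
/* For example (a sketch): a worker par whose only two blocks are the
   forked and join blocks of a single inner vector par

     worker par  { forked_block, join_block }
       inner vector par  { ... }

   collapses into one par partitioned over both the worker and vector
   axes, so later propagation and neutering treat it as a single
   region.  */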
4589 
4590 static void
4591 nvptx_optimize_inner (parallel *par)
4592 {
4593   parallel *inner = par->inner;
4594 
4595   /* We mustn't be the outer dummy par.  */
4596   if (!par->mask)
4597     return;
4598 
4599   /* We must have a single inner par.  */
4600   if (!inner || inner->next)
4601     return;
4602 
4603   /* We must only contain 2 blocks ourselves -- the head and tail of
4604      the inner par.  */
4605   if (par->blocks.length () != 2)
4606     return;
4607 
4608   /* The two partitionings must be disjoint.  As we only have vector and
4609      worker partitioning, this is sufficient to guarantee the pars
4610      have adjacent partitioning.  */
4611   if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4612     /* This indicates malformed code generation.  */
4613     return;
4614 
4615   /* The outer forked insn should be immediately followed by the inner
4616      fork insn.  */
4617   rtx_insn *forked = par->forked_insn;
4618   rtx_insn *fork = BB_END (par->forked_block);
4619 
4620   if (NEXT_INSN (forked) != fork)
4621     return;
4622   gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4623 
4624   /* The outer joining insn must immediately follow the inner join
4625      insn.  */
4626   rtx_insn *joining = par->joining_insn;
4627   rtx_insn *join = inner->join_insn;
4628   if (NEXT_INSN (join) != joining)
4629     return;
4630 
4631   /* Preconditions met.  Swallow the inner par.  */
4632   if (dump_file)
4633     fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4634 	     inner->mask, inner->forked_block->index,
4635 	     inner->join_block->index,
4636 	     par->mask, par->forked_block->index, par->join_block->index);
4637 
4638   par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4639 
4640   par->blocks.reserve (inner->blocks.length ());
4641   while (inner->blocks.length ())
4642     par->blocks.quick_push (inner->blocks.pop ());
4643 
4644   par->inner = inner->inner;
4645   inner->inner = NULL;
4646 
4647   delete inner;
4648 }
4649 
4650 /* Process the parallel PAR and all its contained
4651    parallels.  We do everything but the neutering.  Return mask of
4652    partitioned modes used within this parallel.  */
4653 
4654 static unsigned
4655 nvptx_process_pars (parallel *par)
4656 {
4657   if (nvptx_optimize)
4658     nvptx_optimize_inner (par);
4659 
4660   unsigned inner_mask = par->mask;
4661 
4662   /* Do the inner parallels first.  */
4663   if (par->inner)
4664     {
4665       par->inner_mask = nvptx_process_pars (par->inner);
4666       inner_mask |= par->inner_mask;
4667     }
4668 
4669   bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4670   bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4671   bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4672 		      && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4673 
4674   if (worker || large_vector)
4675     {
4676       nvptx_shared_propagate (false, is_call, par->forked_block,
4677 			      par->forked_insn, !worker);
4678       bool no_prop_p
4679 	= nvptx_shared_propagate (true, is_call, par->forked_block,
4680 				  par->fork_insn, !worker);
4681       bool empty_loop_p
4682 	= !is_call && (NEXT_INSN (par->forked_insn)
4683 		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
4684       rtx barrier = GEN_INT (0);
4685       int threads = 0;
4686 
4687       if (!worker && cfun->machine->sync_bar)
4688 	{
4689 	  barrier = cfun->machine->sync_bar;
4690 	  threads = nvptx_mach_vector_length ();
4691 	}
4692 
4693       if (no_prop_p && empty_loop_p)
4694 	;
4695       else if (no_prop_p && is_call)
4696 	;
4697       else
4698 	{
4699 	  /* Insert begin and end synchronizations.  */
4700 	  emit_insn_before (nvptx_cta_sync (barrier, threads),
4701 			    par->forked_insn);
4702 	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4703 	}
4704     }
4705   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4706     nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4707 
4708   /* Now do siblings.  */
4709   if (par->next)
4710     inner_mask |= nvptx_process_pars (par->next);
4711   return inner_mask;
4712 }
4713 
4714 /* Neuter the parallel described by PAR.  We recurse in depth-first
4715    order.  MODES are the partitioning of the execution and OUTER is
4716    the partitioning of the parallels we are contained in.  */
4717 
4718 static void
4719 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4720 {
4721   unsigned me = (par->mask
4722 		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4723 		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4724   unsigned  skip_mask = 0, neuter_mask = 0;
4725 
4726   if (par->inner)
4727     nvptx_neuter_pars (par->inner, modes, outer | me);
4728 
4729   for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4730     {
4731       if ((outer | me) & GOMP_DIM_MASK (mode))
4732 	{} /* Mode is partitioned: no neutering.  */
4733       else if (!(modes & GOMP_DIM_MASK (mode)))
4734 	{} /* Mode is not used: nothing to do.  */
4735       else if (par->inner_mask & GOMP_DIM_MASK (mode)
4736 	       || !par->forked_insn)
4737         /* Partitioned in inner parallels, or we're not partitioned
4738 	   at all: neuter individual blocks.  */
4739 	neuter_mask |= GOMP_DIM_MASK (mode);
4740       else if (!par->parent || !par->parent->forked_insn
4741 	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4742         /* Parent isn't a parallel, or already contains this partitioning:
4743            skip the parallel at this level.  */
4744 	skip_mask |= GOMP_DIM_MASK (mode);
4745       else
4746 	{} /* Parent will skip this parallel itself.  */
4747     }
4748 
4749   if (neuter_mask)
4750     {
4751       int ix, len;
4752 
4753       if (nvptx_optimize)
4754 	{
4755 	  /* Neuter whole SESE regions.  */
4756 	  bb_pair_vec_t regions;
4757 
4758 	  nvptx_find_sese (par->blocks, regions);
4759 	  len = regions.length ();
4760 	  for (ix = 0; ix != len; ix++)
4761 	    {
4762 	      basic_block from = regions[ix].first;
4763 	      basic_block to = regions[ix].second;
4764 
4765 	      if (from)
4766 		nvptx_single (neuter_mask, from, to);
4767 	      else
4768 		gcc_assert (!to);
4769 	    }
4770 	}
4771       else
4772 	{
4773 	  /* Neuter each BB individually.  */
4774 	  len = par->blocks.length ();
4775 	  for (ix = 0; ix != len; ix++)
4776 	    {
4777 	      basic_block block = par->blocks[ix];
4778 
4779 	      nvptx_single (neuter_mask, block, block);
4780 	    }
4781 	}
4782     }
4783 
4784   if (skip_mask)
4785     nvptx_skip_par (skip_mask, par);
4786 
4787   if (par->next)
4788     nvptx_neuter_pars (par->next, modes, outer);
4789 }
4790 
4791 static void
4792 populate_offload_attrs (offload_attrs *oa)
4793 {
4794   tree attr = oacc_get_fn_attrib (current_function_decl);
4795   tree dims = TREE_VALUE (attr);
4796   unsigned ix;
4797 
4798   oa->mask = 0;
4799 
4800   for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4801     {
4802       tree t = TREE_VALUE (dims);
4803       int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4804       tree allowed = TREE_PURPOSE (dims);
4805 
4806       if (size != 1 && !(allowed && integer_zerop (allowed)))
4807 	oa->mask |= GOMP_DIM_MASK (ix);
4808 
4809       switch (ix)
4810 	{
4811 	case GOMP_DIM_GANG:
4812 	  oa->num_gangs = size;
4813 	  break;
4814 
4815 	case GOMP_DIM_WORKER:
4816 	  oa->num_workers = size;
4817 	  break;
4818 
4819 	case GOMP_DIM_VECTOR:
4820 	  oa->vector_length = size;
4821 	  break;
4822 	}
4823     }
4824 }
4825 
4826 #if WORKAROUND_PTXJIT_BUG_2
4827 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
4828    is needed in the nvptx target because the branches generated for
4829    partitioning are NONJUMP_INSN_P, not JUMP_P.  */
4830 
4831 static rtx
4832 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4833 {
4834   rtx pat;
4835   if ((strict && !JUMP_P (insn))
4836       || (!strict && !INSN_P (insn)))
4837     return NULL_RTX;
4838   pat = PATTERN (insn);
4839 
4840   /* The set is allowed to appear either as the insn pattern or
4841      the first set in a PARALLEL.  */
4842   if (GET_CODE (pat) == PARALLEL)
4843     pat = XVECEXP (pat, 0, 0);
4844   if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4845     return pat;
4846 
4847   return NULL_RTX;
4848 }
4849 
4850 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
4851 
4852 static rtx
4853 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4854 {
4855   rtx x = nvptx_pc_set (insn, strict);
4856 
4857   if (!x)
4858     return NULL_RTX;
4859   x = SET_SRC (x);
4860   if (GET_CODE (x) == LABEL_REF)
4861     return x;
4862   if (GET_CODE (x) != IF_THEN_ELSE)
4863     return NULL_RTX;
4864   if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4865     return XEXP (x, 1);
4866   if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4867     return XEXP (x, 2);
4868   return NULL_RTX;
4869 }
4870 
4871 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4872    insn between the branch and the label.  This works around a JIT bug
4873    observed at driver version 384.111, at -O0 for sm_50.  */
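
/* For illustration (PTX sketch; the predicate and label are made up),
   the shape being avoided is:

        @%p     bra     $L1;
     $L1:
        ...

   i.e. a conditional branch with no real insn before its target label.
   The loop below emits a dummy insn (fake_nop) just in front of such a
   label, so the branch no longer jumps over nothing.  */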
4874 
4875 static void
4876 prevent_branch_around_nothing (void)
4877 {
4878   rtx_insn *seen_label = NULL;
4879   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4880     {
4881       if (INSN_P (insn) && condjump_p (insn))
4882         {
4883           seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4884           continue;
4885         }
4886 
4887       if (seen_label == NULL)
4888         continue;
4889 
4890       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4891         continue;
4892 
4893       if (INSN_P (insn))
4894         switch (recog_memoized (insn))
4895           {
4896           case CODE_FOR_nvptx_fork:
4897           case CODE_FOR_nvptx_forked:
4898           case CODE_FOR_nvptx_joining:
4899           case CODE_FOR_nvptx_join:
4900             continue;
4901           default:
4902             seen_label = NULL;
4903             continue;
4904           }
4905 
4906       if (LABEL_P (insn) && insn == seen_label)
4907         emit_insn_before (gen_fake_nop (), insn);
4908 
4909       seen_label = NULL;
4910     }
4911 }
4912 #endif
4913 
4914 #ifdef WORKAROUND_PTXJIT_BUG_3
4915 /* Insert two membar.cta insns between two consecutive bar.sync insns.  This
4916    works around a hang observed at driver version 390.48 for sm_50.  */
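
/* For illustration (PTX sketch): a back-to-back pair

        bar.sync        0;
        bar.sync        0;

   becomes

        bar.sync        0;
        membar.cta;
        membar.cta;
        bar.sync        0;

   with the two memory barriers emitted immediately before the second
   bar.sync.  */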
4917 
4918 static void
4919 workaround_barsyncs (void)
4920 {
4921   bool seen_barsync = false;
4922   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4923     {
4924       if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4925 	{
4926 	  if (seen_barsync)
4927 	    {
4928 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
4929 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
4930 	    }
4931 
4932 	  seen_barsync = true;
4933 	  continue;
4934 	}
4935 
4936       if (!seen_barsync)
4937 	continue;
4938 
4939       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4940 	continue;
4941       else if (INSN_P (insn))
4942 	switch (recog_memoized (insn))
4943 	  {
4944 	  case CODE_FOR_nvptx_fork:
4945 	  case CODE_FOR_nvptx_forked:
4946 	  case CODE_FOR_nvptx_joining:
4947 	  case CODE_FOR_nvptx_join:
4948 	    continue;
4949 	  default:
4950 	    break;
4951 	  }
4952 
4953       seen_barsync = false;
4954     }
4955 }
4956 #endif
4957 
4958 /* PTX-specific reorganization
4959    - Split blocks at fork and join instructions
4960    - Compute live registers
4961    - Mark now-unused registers, so the function header doesn't declare
4962      unused registers.
4963    - Insert state propagation when entering partitioned mode
4964    - Insert neutering instructions when in single mode
4965    - Replace subregs with suitable sequences.  */
4967 
4968 static void
4969 nvptx_reorg (void)
4970 {
4971   /* We are freeing block_for_insn in the toplev to keep compatibility
4972      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
4973   compute_bb_for_insn ();
4974 
4975   thread_prologue_and_epilogue_insns ();
4976 
4977   /* Split blocks and record interesting unspecs.  */
4978   bb_insn_map_t bb_insn_map;
4979 
4980   nvptx_split_blocks (&bb_insn_map);
4981 
4982   /* Compute live registers.  */
4983   df_clear_flags (DF_LR_RUN_DCE);
4984   df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4985   df_live_add_problem ();
4986   df_live_set_all_dirty ();
4987   df_analyze ();
4988   regstat_init_n_sets_and_refs ();
4989 
4990   if (dump_file)
4991     df_dump (dump_file);
4992 
4993   /* Mark unused regs as unused.  */
4994   int max_regs = max_reg_num ();
4995   for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4996     if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4997       regno_reg_rtx[i] = const0_rtx;
4998 
4999   /* Determine launch dimensions of the function.  If it is not an
5000      offloaded function (i.e. this is a regular compilation), the
5001      function needs no neutering.  */
5002   tree attr = oacc_get_fn_attrib (current_function_decl);
5003   if (attr)
5004     {
5005       /* If we determined this mask before RTL expansion, we could
5006 	 elide emission of some levels of forks and joins.  */
5007       offload_attrs oa;
5008 
5009       populate_offload_attrs (&oa);
5010 
5011       /* If there is worker neutering, there must be vector
5012 	 neutering.  Otherwise the hardware will fail.  */
5013       gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5014 		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5015 
5016       /* Discover & process partitioned regions.  */
5017       parallel *pars = nvptx_discover_pars (&bb_insn_map);
5018       nvptx_process_pars (pars);
5019       nvptx_neuter_pars (pars, oa.mask, 0);
5020       delete pars;
5021     }
5022 
5023   /* Replace subregs.  */
5024   nvptx_reorg_subreg ();
5025 
5026   if (TARGET_UNIFORM_SIMT)
5027     nvptx_reorg_uniform_simt ();
5028 
5029 #if WORKAROUND_PTXJIT_BUG_2
5030   prevent_branch_around_nothing ();
5031 #endif
5032 
5033 #ifdef WORKAROUND_PTXJIT_BUG_3
5034   workaround_barsyncs ();
5035 #endif
5036 
5037   regstat_free_n_sets_and_refs ();
5038 
5039   df_finish_pass (true);
5040 }
5041 
5042 /* Handle a "kernel" attribute; arguments as in
5043    struct attribute_spec.handler.  */
5044 
5045 static tree
5046 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5047 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5048 {
5049   tree decl = *node;
5050 
5051   if (TREE_CODE (decl) != FUNCTION_DECL)
5052     {
5053       error ("%qE attribute only applies to functions", name);
5054       *no_add_attrs = true;
5055     }
5056   else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5057     {
5058       error ("%qE attribute requires a void return type", name);
5059       *no_add_attrs = true;
5060     }
5061 
5062   return NULL_TREE;
5063 }
5064 
5065 /* Handle a "shared" attribute; arguments as in
5066    struct attribute_spec.handler.  */
5067 
5068 static tree
5069 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5070 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5071 {
5072   tree decl = *node;
5073 
5074   if (TREE_CODE (decl) != VAR_DECL)
5075     {
5076       error ("%qE attribute only applies to variables", name);
5077       *no_add_attrs = true;
5078     }
5079   else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5080     {
5081       error ("%qE attribute not allowed with auto storage class", name);
5082       *no_add_attrs = true;
5083     }
5084 
5085   return NULL_TREE;
5086 }
5087 
5088 /* Table of valid machine attributes.  */
5089 static const struct attribute_spec nvptx_attribute_table[] =
5090 {
5091   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5092        affects_type_identity, handler, exclude } */
5093   { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
5094     NULL },
5095   { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
5096     NULL },
5097   { NULL, 0, 0, false, false, false, false, NULL, NULL }
5098 };
5099 
5100 /* Limit vector alignments to BIGGEST_ALIGNMENT.  */
5101 
5102 static HOST_WIDE_INT
5103 nvptx_vector_alignment (const_tree type)
5104 {
5105   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5106 
5107   return MIN (align, BIGGEST_ALIGNMENT);
5108 }
5109 
5110 /* Indicate that INSN cannot be duplicated.   */
5111 
5112 static bool
5113 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5114 {
5115   switch (recog_memoized (insn))
5116     {
5117     case CODE_FOR_nvptx_shufflesi:
5118     case CODE_FOR_nvptx_shufflesf:
5119     case CODE_FOR_nvptx_barsync:
5120     case CODE_FOR_nvptx_fork:
5121     case CODE_FOR_nvptx_forked:
5122     case CODE_FOR_nvptx_joining:
5123     case CODE_FOR_nvptx_join:
5124       return true;
5125     default:
5126       return false;
5127     }
5128 }
5129 
5130 /* Section anchors do not work.  Initialization for flag_section_anchor
5131    probes the existence of the anchoring target hooks and prevents
5132    anchoring if they don't exist.  However, we may be used together with
5133    a host-side compiler that does support anchoring, and hence see
5134    the anchor flag set (as it's not recalculated).  So provide an
5135    implementation denying anchoring.  */
5136 
5137 static bool
5138 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5139 {
5140   return false;
5141 }
5142 
5143 /* Record a symbol for mkoffload to enter into the mapping table.  */
5144 
5145 static void
5146 nvptx_record_offload_symbol (tree decl)
5147 {
5148   switch (TREE_CODE (decl))
5149     {
5150     case VAR_DECL:
5151       fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5152 	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5153       break;
5154 
5155     case FUNCTION_DECL:
5156       {
5157 	tree attr = oacc_get_fn_attrib (decl);
5158 	/* OpenMP offloading does not set this attribute.  */
5159 	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5160 
5161 	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5162 		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5163 
5164 	for (; dims; dims = TREE_CHAIN (dims))
5165 	  {
5166 	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5167 
5168 	    gcc_assert (!TREE_PURPOSE (dims));
5169 	    fprintf (asm_out_file, ", %#x", size);
5170 	  }
5171 
5172 	fprintf (asm_out_file, "\n");
5173       }
5174       break;
5175 
5176     default:
5177       gcc_unreachable ();
5178     }
5179 }
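
/* To illustrate (the names below are purely hypothetical): for an offloaded
   variable, and for an OpenACC offload function compiled with launch dims
   (1, 1, 32), the code above writes marker comments of the form

     //:VAR_MAP "some_var"
     //:FUNC_MAP "main._omp_fn.0", 0x1, 0x1, 0x20

   which mkoffload later scans out of the generated PTX assembly in order to
   build the host-side offload mapping tables.  */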
5180 
5181 /* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
5182    at the start of a file.  */
5183 
5184 static void
5185 nvptx_file_start (void)
5186 {
5187   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5188   fputs ("\t.version\t3.1\n", asm_out_file);
5189   if (TARGET_SM35)
5190     fputs ("\t.target\tsm_35\n", asm_out_file);
5191   else
5192     fputs ("\t.target\tsm_30\n", asm_out_file);
5193   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5194   fputs ("// END PREAMBLE\n", asm_out_file);
5195 }
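
/* For example, when targeting sm_35 (e.g. via -misa=sm_35) with 64-bit
   pointers, the preamble written above comes out as:

     // BEGIN PREAMBLE
             .version        3.1
             .target sm_35
             .address_size 64
     // END PREAMBLE
*/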
5196 
5197 /* Emit a declaration for a worker- or vector-level buffer in .shared
5198    memory.  */
5199 
5200 static void
5201 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5202 {
5203   const char *name = XSTR (sym, 0);
5204 
5205   write_var_marker (file, true, false, name);
5206   fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5207 	   align, name, size);
5208 }
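
/* For instance, a 128-byte broadcast buffer with 8-byte alignment would be
   declared as (the symbol name here is illustrative only):

     .shared .align 8 .u8 __oacc_bcast[128];
*/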
5209 
5210 /* Write out the function declarations we've collected and declare storage
5211    for the broadcast buffer.  */
5212 
5213 static void
5214 nvptx_file_end (void)
5215 {
5216   hash_table<tree_hasher>::iterator iter;
5217   tree decl;
5218   FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5219     nvptx_record_fndecl (decl);
5220   fputs (func_decls.str().c_str(), asm_out_file);
5221 
5222   if (oacc_bcast_size)
5223     write_shared_buffer (asm_out_file, oacc_bcast_sym,
5224 			 oacc_bcast_align, oacc_bcast_size);
5225 
5226   if (worker_red_size)
5227     write_shared_buffer (asm_out_file, worker_red_sym,
5228 			 worker_red_align, worker_red_size);
5229 
5230   if (vector_red_size)
5231     write_shared_buffer (asm_out_file, vector_red_sym,
5232 			 vector_red_align, vector_red_size);
5233 
5234   if (need_softstack_decl)
5235     {
5236       write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5237       /* 32 is the maximum number of warps in a block.  Even though it's an
5238          external declaration, emit the array size explicitly; otherwise, it
5239          may fail at PTX JIT time if the definition is later in link order.  */
5240       fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5241 	       POINTER_SIZE);
5242     }
5243   if (need_unisimt_decl)
5244     {
5245       write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5246       fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5247     }
5248 }
5249 
5250 /* Expander for the shuffle builtins.  */
5251 
5252 static rtx
5253 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5254 {
5255   if (ignore)
5256     return target;
5257 
5258   rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5259 			 NULL_RTX, mode, EXPAND_NORMAL);
5260   if (!REG_P (src))
5261     src = copy_to_mode_reg (mode, src);
5262 
5263   rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5264 			 NULL_RTX, SImode, EXPAND_NORMAL);
5265   rtx op = expand_expr (CALL_EXPR_ARG  (exp, 2),
5266 			NULL_RTX, SImode, EXPAND_NORMAL);
5267 
5268   if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5269     idx = copy_to_mode_reg (SImode, idx);
5270 
5271   rtx pat = nvptx_gen_shuffle (target, src, idx,
5272 			       (nvptx_shuffle_kind) INTVAL (op));
5273   if (pat)
5274     emit_insn (pat);
5275 
5276   return target;
5277 }
5278 
5279 const char *
5280 nvptx_output_red_partition (rtx dst, rtx offset)
5281 {
5282   const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5283   const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5284 
5285   if (offset == const0_rtx)
5286     fprintf (asm_out_file, zero_offset, REGNO (dst),
5287 	     REGNO (cfun->machine->red_partition));
5288   else
5289     fprintf (asm_out_file, with_offset, REGNO (dst),
5290 	     REGNO (cfun->machine->red_partition), UINTVAL (offset));
5291 
5292   return "";
5293 }
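
/* As an example, with DST being pseudo %r36, the partition base register
   being %r35 and OFFSET 16, the code above prints

     add.u64 %r36, %r35, 16; // vred buffer

   while a zero OFFSET yields a plain mov.u64 instead.  */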
5294 
5295 /* Shared-memory reduction address expander.  */
5296 
5297 static rtx
5298 nvptx_expand_shared_addr (tree exp, rtx target,
5299 			  machine_mode ARG_UNUSED (mode), int ignore,
5300 			  int vector)
5301 {
5302   if (ignore)
5303     return target;
5304 
5305   unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5306   unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5307   unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5308   rtx addr = worker_red_sym;
5309 
5310   if (vector)
5311     {
5312       offload_attrs oa;
5313 
5314       populate_offload_attrs (&oa);
5315 
5316       unsigned int psize = ROUND_UP (size + offset, align);
5317       unsigned int pnum = nvptx_mach_max_workers ();
5318       vector_red_partition = MAX (vector_red_partition, psize);
5319       vector_red_size = MAX (vector_red_size, psize * pnum);
5320       vector_red_align = MAX (vector_red_align, align);
5321 
5322       if (cfun->machine->red_partition == NULL)
5323 	cfun->machine->red_partition = gen_reg_rtx (Pmode);
5324 
5325       addr = gen_reg_rtx (Pmode);
5326       emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5327     }
5328   else
5329     {
5330       worker_red_align = MAX (worker_red_align, align);
5331       worker_red_size = MAX (worker_red_size, size + offset);
5332 
5333       if (offset)
5334 	{
5335 	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5336 	  addr = gen_rtx_CONST (Pmode, addr);
5337 	}
5338     }
5339 
5340   emit_move_insn (target, addr);
5341   return target;
5342 }
5343 
5344 /* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
5345    not require taking the address of any object, other than the memory
5346    cell being operated on.  */
5347 
5348 static rtx
5349 nvptx_expand_cmp_swap (tree exp, rtx target,
5350 		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5351 {
5352   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5353 
5354   if (!target)
5355     target = gen_reg_rtx (mode);
5356 
5357   rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5358 			 NULL_RTX, Pmode, EXPAND_NORMAL);
5359   rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5360 			 NULL_RTX, mode, EXPAND_NORMAL);
5361   rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5362 			 NULL_RTX, mode, EXPAND_NORMAL);
5363   rtx pat;
5364 
5365   mem = gen_rtx_MEM (mode, mem);
5366   if (!REG_P (cmp))
5367     cmp = copy_to_mode_reg (mode, cmp);
5368   if (!REG_P (src))
5369     src = copy_to_mode_reg (mode, src);
5370 
5371   if (mode == SImode)
5372     pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5373   else
5374     pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5375 
5376   emit_insn (pat);
5377 
5378   return target;
5379 }
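
/* At the PTX level the 32-bit builtin is ultimately emitted as a single
   atom.cas.b32 instruction (atom.cas.b64 for the 64-bit variant), possibly
   carrying a state-space qualifier, and returns the previous contents of
   the memory cell.  */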
5380 
5381 
5382 /* Codes for all the NVPTX builtins.  */
5383 enum nvptx_builtins
5384 {
5385   NVPTX_BUILTIN_SHUFFLE,
5386   NVPTX_BUILTIN_SHUFFLELL,
5387   NVPTX_BUILTIN_WORKER_ADDR,
5388   NVPTX_BUILTIN_VECTOR_ADDR,
5389   NVPTX_BUILTIN_CMP_SWAP,
5390   NVPTX_BUILTIN_CMP_SWAPLL,
5391   NVPTX_BUILTIN_MAX
5392 };
5393 
5394 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5395 
5396 /* Return the NVPTX builtin for CODE.  */
5397 
5398 static tree
5399 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5400 {
5401   if (code >= NVPTX_BUILTIN_MAX)
5402     return error_mark_node;
5403 
5404   return nvptx_builtin_decls[code];
5405 }
5406 
5407 /* Set up all builtin functions for this target.  */
5408 
5409 static void
5410 nvptx_init_builtins (void)
5411 {
5412 #define DEF(ID, NAME, T)						\
5413   (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
5414    = add_builtin_function ("__builtin_nvptx_" NAME,			\
5415 			   build_function_type_list T,			\
5416 			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5417 #define ST sizetype
5418 #define UINT unsigned_type_node
5419 #define LLUINT long_long_unsigned_type_node
5420 #define PTRVOID ptr_type_node
5421 
5422   DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5423   DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5424   DEF (WORKER_ADDR, "worker_addr",
5425        (PTRVOID, ST, UINT, UINT, NULL_TREE));
5426   DEF (VECTOR_ADDR, "vector_addr",
5427        (PTRVOID, ST, UINT, UINT, NULL_TREE));
5428   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5429   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5430 
5431 #undef DEF
5432 #undef ST
5433 #undef UINT
5434 #undef LLUINT
5435 #undef PTRVOID
5436 }
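
/* In C terms, the declarations created above amount to:

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
                                                   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
     void *__builtin_nvptx_vector_addr (size_t, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *, unsigned long long,
                                                    unsigned long long);
*/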
5437 
5438 /* Expand an expression EXP that calls a built-in function,
5439    with result going to TARGET if that's convenient
5440    (and in mode MODE if that's convenient).
5441    SUBTARGET may be used as the target for computing one of EXP's operands.
5442    IGNORE is nonzero if the value is to be ignored.  */
5443 
5444 static rtx
5445 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5446 		      machine_mode mode, int ignore)
5447 {
5448   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5449   switch (DECL_MD_FUNCTION_CODE (fndecl))
5450     {
5451     case NVPTX_BUILTIN_SHUFFLE:
5452     case NVPTX_BUILTIN_SHUFFLELL:
5453       return nvptx_expand_shuffle (exp, target, mode, ignore);
5454 
5455     case NVPTX_BUILTIN_WORKER_ADDR:
5456       return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5457 
5458     case NVPTX_BUILTIN_VECTOR_ADDR:
5459       return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5460 
5461     case NVPTX_BUILTIN_CMP_SWAP:
5462     case NVPTX_BUILTIN_CMP_SWAPLL:
5463       return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5464 
5465     default: gcc_unreachable ();
5466     }
5467 }
5468 
5469 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
5470 
5471 static int
5472 nvptx_simt_vf ()
5473 {
5474   return PTX_WARP_SIZE;
5475 }
5476 
5477 /* Return 1 if TRAIT NAME is present in the OpenMP context's
5478    device trait set, return 0 if not present in any OpenMP context in the
5479    whole translation unit, or -1 if not present in the current OpenMP context
5480    but might be present in another OpenMP context in the same TU.  */
5481 
5482 int
5483 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5484 				const char *name)
5485 {
5486   switch (trait)
5487     {
5488     case omp_device_kind:
5489       return strcmp (name, "gpu") == 0;
5490     case omp_device_arch:
5491       return strcmp (name, "nvptx") == 0;
5492     case omp_device_isa:
5493       if (strcmp (name, "sm_30") == 0)
5494 	return !TARGET_SM35;
5495       if (strcmp (name, "sm_35") == 0)
5496 	return TARGET_SM35;
5497       return 0;
5498     default:
5499       gcc_unreachable ();
5500     }
5501 }
5502 
5503 static bool
5504 nvptx_welformed_vector_length_p (int l)
5505 {
5506   gcc_assert (l > 0);
5507   return l % PTX_WARP_SIZE == 0;
5508 }
5509 
5510 static void
5511 nvptx_apply_dim_limits (int dims[])
5512 {
5513   /* Check that the vector_length is not too large.  */
5514   if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5515     dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5516 
5517   /* Check that the number of workers is not too large.  */
5518   if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5519     dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5520 
5521   /* Ensure that num_workers * vector_length <= cta size.  */
5522   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5523       && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5524     dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5525 
5526   /* If we need a per-worker barrier ...  */
5527   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5528       && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5529     /* Don't use more barriers than available.  */
5530     dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5531 				 PTX_NUM_PER_WORKER_BARRIERS);
5532 }
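
/* A worked example: with num_workers (16) and vector_length (64), the product
   16 * 64 equals PTX_CTA_SIZE, so the first checks change nothing, but since
   vector_length exceeds PTX_WARP_SIZE a per-worker barrier is required and
   num_workers is clamped to PTX_NUM_PER_WORKER_BARRIERS (15).  With
   num_workers (32) and vector_length (64) the product exceeds PTX_CTA_SIZE,
   so vector_length falls back to PTX_WARP_SIZE (32) instead.  */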
5533 
5534 /* Return true if FNDECL contains calls to vector-partitionable routines.  */
5535 
5536 static bool
5537 has_vector_partitionable_routine_calls_p (tree fndecl)
5538 {
5539   if (!fndecl)
5540     return false;
5541 
5542   basic_block bb;
5543   FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5544     for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5545 	 gsi_next_nondebug (&i))
5546       {
5547 	gimple *stmt = gsi_stmt (i);
5548 	if (gimple_code (stmt) != GIMPLE_CALL)
5549 	  continue;
5550 
5551 	tree callee = gimple_call_fndecl (stmt);
5552 	if (!callee)
5553 	  continue;
5554 
5555 	tree attrs  = oacc_get_fn_attrib (callee);
5556 	if (attrs == NULL_TREE)
5557 	  return false;
5558 
5559 	int partition_level = oacc_fn_attrib_level (attrs);
5560 	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5561 	if (!seq_routine_p)
5562 	  return true;
5563       }
5564 
5565   return false;
5566 }
5567 
5568 /* As nvptx_goacc_validate_dims, but does not return a bool to indicate whether
5569    DIMS has changed.  */
5570 
5571 static void
5572 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
5573 {
5574   bool oacc_default_dims_p = false;
5575   bool oacc_min_dims_p = false;
5576   bool offload_region_p = false;
5577   bool routine_p = false;
5578   bool routine_seq_p = false;
5579   int default_vector_length = -1;
5580 
5581   if (decl == NULL_TREE)
5582     {
5583       if (fn_level == -1)
5584 	oacc_default_dims_p = true;
5585       else if (fn_level == -2)
5586 	oacc_min_dims_p = true;
5587       else
5588 	gcc_unreachable ();
5589     }
5590   else if (fn_level == -1)
5591     offload_region_p = true;
5592   else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5593     {
5594       routine_p = true;
5595       routine_seq_p = fn_level == GOMP_DIM_MAX;
5596     }
5597   else
5598     gcc_unreachable ();
5599 
5600   if (oacc_min_dims_p)
5601     {
5602       gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5603       gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5604       gcc_assert (dims[GOMP_DIM_GANG] == 1);
5605 
5606       dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5607       return;
5608     }
5609 
5610   if (routine_p)
5611     {
5612       if (!routine_seq_p)
5613 	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5614 
5615       return;
5616     }
5617 
5618   if (oacc_default_dims_p)
5619     {
5620       /* -1  : not set
5621 	  0  : set at runtime, e.g. -fopenacc-dims=-
5622          >= 1: set at compile time, e.g. -fopenacc-dims=1.  */
5623       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5624       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5625       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5626 
5627       /* But -fopenacc-dims=- is not yet supported on trunk.  */
5628       gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5629       gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5630       gcc_assert (dims[GOMP_DIM_GANG] != 0);
5631     }
5632 
5633   if (offload_region_p)
5634     {
5635       /* -1   : not set
5636 	  0   : set using variable, e.g. num_gangs (n)
5637 	  >= 1: set using constant, e.g. num_gangs (1).  */
5638       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5639       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5640       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5641     }
5642 
5643   if (offload_region_p)
5644     default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5645   else
5646     /* oacc_default_dims_p.  */
5647     default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5648 
5649   int old_dims[GOMP_DIM_MAX];
5650   unsigned int i;
5651   for (i = 0; i < GOMP_DIM_MAX; ++i)
5652     old_dims[i] = dims[i];
5653 
5654   const char *vector_reason = NULL;
5655   if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5656     {
5657       default_vector_length = PTX_WARP_SIZE;
5658 
5659       if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5660 	{
5661 	  vector_reason = G_("using vector_length (%d) due to call to"
5662 			     " vector-partitionable routine, ignoring %d");
5663 	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5664 	}
5665     }
5666 
5667   if (dims[GOMP_DIM_VECTOR] == 0)
5668     {
5669       vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5670       dims[GOMP_DIM_VECTOR] = default_vector_length;
5671     }
5672 
5673   if (dims[GOMP_DIM_VECTOR] > 0
5674       && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5675     dims[GOMP_DIM_VECTOR] = default_vector_length;
5676 
5677   nvptx_apply_dim_limits (dims);
5678 
5679   if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5680     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5681 		vector_reason != NULL
5682 		? vector_reason
5683 		: G_("using vector_length (%d), ignoring %d"),
5684 		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5685 
5686   if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5687     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5688 		G_("using num_workers (%d), ignoring %d"),
5689 		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5690 
5691   if (oacc_default_dims_p)
5692     {
5693       if (dims[GOMP_DIM_VECTOR] < 0)
5694 	dims[GOMP_DIM_VECTOR] = default_vector_length;
5695       if (dims[GOMP_DIM_WORKER] < 0)
5696 	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5697       if (dims[GOMP_DIM_GANG] < 0)
5698 	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5699       nvptx_apply_dim_limits (dims);
5700     }
5701 
5702   if (offload_region_p)
5703     {
5704       for (i = 0; i < GOMP_DIM_MAX; i++)
5705 	{
5706 	  if (!(dims[i] < 0))
5707 	    continue;
5708 
5709 	  if ((used & GOMP_DIM_MASK (i)) == 0)
5710 	    /* Function oacc_validate_dims will apply the minimal dimension.  */
5711 	    continue;
5712 
5713 	  dims[i] = (i == GOMP_DIM_VECTOR
5714 		     ? default_vector_length
5715 		     : oacc_get_default_dim (i));
5716 	}
5717 
5718       nvptx_apply_dim_limits (dims);
5719     }
5720 }
5721 
5722 /* Validate compute dimensions of an OpenACC offload or routine, fill
5723    in non-unity defaults.  FN_LEVEL indicates the level at which a
5724    routine might spawn a loop.  It is negative for non-routines.  If
5725    DECL is null, we are validating the default dimensions.  */
5726 
5727 static bool
5728 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5729 {
5730   int old_dims[GOMP_DIM_MAX];
5731   unsigned int i;
5732 
5733   for (i = 0; i < GOMP_DIM_MAX; ++i)
5734     old_dims[i] = dims[i];
5735 
5736   nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5737 
5738   gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5739   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5740     gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5741 
5742   for (i = 0; i < GOMP_DIM_MAX; ++i)
5743     if (old_dims[i] != dims[i])
5744       return true;
5745 
5746   return false;
5747 }
5748 
5749 /* Return maximum dimension size, or zero for unbounded.  */
5750 
5751 static int
5752 nvptx_dim_limit (int axis)
5753 {
5754   switch (axis)
5755     {
5756     case GOMP_DIM_VECTOR:
5757       return PTX_MAX_VECTOR_LENGTH;
5758 
5759     default:
5760       break;
5761     }
5762   return 0;
5763 }
5764 
5765 /* Determine whether fork & joins are needed.  */
5766 
5767 static bool
5768 nvptx_goacc_fork_join (gcall *call, const int dims[],
5769 		       bool ARG_UNUSED (is_fork))
5770 {
5771   tree arg = gimple_call_arg (call, 2);
5772   unsigned axis = TREE_INT_CST_LOW (arg);
5773 
5774   /* We only care about worker and vector partitioning.  */
5775   if (axis < GOMP_DIM_WORKER)
5776     return false;
5777 
5778   /* If the size is 1, there's no partitioning.  */
5779   if (dims[axis] == 1)
5780     return false;
5781 
5782   return true;
5783 }
5784 
5785 /* Generate a PTX builtin function call that returns the address in
5786    the worker (or vector, if VECTOR is true) reduction buffer at OFFSET.
5787    TYPE is the type of the data at that location.  */
5788 
5789 static tree
5790 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5791 {
5792   enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5793   if (vector)
5794     addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5795   machine_mode mode = TYPE_MODE (type);
5796   tree fndecl = nvptx_builtin_decl (addr_dim, true);
5797   tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5798   tree align = build_int_cst (unsigned_type_node,
5799 			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5800   tree call = build_call_expr (fndecl, 3, offset, size, align);
5801 
5802   return fold_convert (build_pointer_type (type), call);
5803 }
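
/* For instance, a 'float' reduction slot at buffer offset 8 with VECTOR false
   builds a call equivalent to

     (float *) __builtin_nvptx_worker_addr (8, 4, 4)

   i.e. the offset followed by the size and alignment of the type in bytes.  */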
5804 
5805 /* Emit a SHFL.DOWN of VAR into DEST_VAR using shift amount SHIFT.  This
5806    function will cast the variable if necessary.  */
5807 
5808 static void
5809 nvptx_generate_vector_shuffle (location_t loc,
5810 			       tree dest_var, tree var, unsigned shift,
5811 			       gimple_seq *seq)
5812 {
5813   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5814   tree_code code = NOP_EXPR;
5815   tree arg_type = unsigned_type_node;
5816   tree var_type = TREE_TYPE (var);
5817   tree dest_type = var_type;
5818 
5819   if (TREE_CODE (var_type) == COMPLEX_TYPE)
5820     var_type = TREE_TYPE (var_type);
5821 
5822   if (TREE_CODE (var_type) == REAL_TYPE)
5823     code = VIEW_CONVERT_EXPR;
5824 
5825   if (TYPE_SIZE (var_type)
5826       == TYPE_SIZE (long_long_unsigned_type_node))
5827     {
5828       fn = NVPTX_BUILTIN_SHUFFLELL;
5829       arg_type = long_long_unsigned_type_node;
5830     }
5831 
5832   tree call = nvptx_builtin_decl (fn, true);
5833   tree bits = build_int_cst (unsigned_type_node, shift);
5834   tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5835   tree expr;
5836 
5837   if (var_type != dest_type)
5838     {
5839       /* Do real and imaginary parts separately.  */
5840       tree real = fold_build1 (REALPART_EXPR, var_type, var);
5841       real = fold_build1 (code, arg_type, real);
5842       real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5843       real = fold_build1 (code, var_type, real);
5844 
5845       tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5846       imag = fold_build1 (code, arg_type, imag);
5847       imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5848       imag = fold_build1 (code, var_type, imag);
5849 
5850       expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5851     }
5852   else
5853     {
5854       expr = fold_build1 (code, arg_type, var);
5855       expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5856       expr = fold_build1 (code, dest_type, expr);
5857     }
5858 
5859   gimplify_assign (dest_var, expr, seq);
5860 }
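
/* For a 'float' VAR and a SHIFT of 16, the sequence gimplified above is
   roughly

     tmp  = VIEW_CONVERT_EXPR<unsigned> (var);
     tmp2 = __builtin_nvptx_shuffle (tmp, 16, SHUFFLE_DOWN);
     dest_var = VIEW_CONVERT_EXPR<float> (tmp2);

   with a complex VAR handled as two such shuffles, one per part.  */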
5861 
5862 /* Lazily generate the global lock var decl and return its address.  */
5863 
5864 static tree
5865 nvptx_global_lock_addr ()
5866 {
5867   tree v = global_lock_var;
5868 
5869   if (!v)
5870     {
5871       tree name = get_identifier ("__reduction_lock");
5872       tree type = build_qualified_type (unsigned_type_node,
5873 					TYPE_QUAL_VOLATILE);
5874       v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5875       global_lock_var = v;
5876       DECL_ARTIFICIAL (v) = 1;
5877       DECL_EXTERNAL (v) = 1;
5878       TREE_STATIC (v) = 1;
5879       TREE_PUBLIC (v) = 1;
5880       TREE_USED (v) = 1;
5881       mark_addressable (v);
5882       mark_decl_referenced (v);
5883     }
5884 
5885   return build_fold_addr_expr (v);
5886 }
5887 
5888 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5889    GSI.  We use a lockless scheme for nearly all cases, which looks
5890    like:
5891      actual = initval(OP);
5892      do {
5893        guess = actual;
5894        write = guess OP myval;
5895        actual = cmp&swap (ptr, guess, write)
5896      } while (actual bit-different-to guess);
5897    return write;
5898 
5899    This relies on a cmp&swap instruction, which is available for 32-
5900    and 64-bit types.  Larger types must use a locking scheme.  */
5901 
5902 static tree
5903 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5904 		       tree ptr, tree var, tree_code op)
5905 {
5906   unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5907   tree_code code = NOP_EXPR;
5908   tree arg_type = unsigned_type_node;
5909   tree var_type = TREE_TYPE (var);
5910 
5911   if (TREE_CODE (var_type) == COMPLEX_TYPE
5912       || TREE_CODE (var_type) == REAL_TYPE)
5913     code = VIEW_CONVERT_EXPR;
5914 
5915   if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5916     {
5917       arg_type = long_long_unsigned_type_node;
5918       fn = NVPTX_BUILTIN_CMP_SWAPLL;
5919     }
5920 
5921   tree swap_fn = nvptx_builtin_decl (fn, true);
5922 
5923   gimple_seq init_seq = NULL;
5924   tree init_var = make_ssa_name (arg_type);
5925   tree init_expr = omp_reduction_init_op (loc, op, var_type);
5926   init_expr = fold_build1 (code, arg_type, init_expr);
5927   gimplify_assign (init_var, init_expr, &init_seq);
5928   gimple *init_end = gimple_seq_last (init_seq);
5929 
5930   gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5931 
5932   /* Split the block just after the init stmts.  */
5933   basic_block pre_bb = gsi_bb (*gsi);
5934   edge pre_edge = split_block (pre_bb, init_end);
5935   basic_block loop_bb = pre_edge->dest;
5936   pre_bb = pre_edge->src;
5937   /* Reset the iterator.  */
5938   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5939 
5940   tree expect_var = make_ssa_name (arg_type);
5941   tree actual_var = make_ssa_name (arg_type);
5942   tree write_var = make_ssa_name (arg_type);
5943 
5944   /* Build and insert the reduction calculation.  */
5945   gimple_seq red_seq = NULL;
5946   tree write_expr = fold_build1 (code, var_type, expect_var);
5947   write_expr = fold_build2 (op, var_type, write_expr, var);
5948   write_expr = fold_build1 (code, arg_type, write_expr);
5949   gimplify_assign (write_var, write_expr, &red_seq);
5950 
5951   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5952 
5953   /* Build & insert the cmp&swap sequence.  */
5954   gimple_seq latch_seq = NULL;
5955   tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5956 					ptr, expect_var, write_var);
5957   gimplify_assign (actual_var, swap_expr, &latch_seq);
5958 
5959   gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5960 				   NULL_TREE, NULL_TREE);
5961   gimple_seq_add_stmt (&latch_seq, cond);
5962 
5963   gimple *latch_end = gimple_seq_last (latch_seq);
5964   gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5965 
5966   /* Split the block just after the latch stmts.  */
5967   edge post_edge = split_block (loop_bb, latch_end);
5968   basic_block post_bb = post_edge->dest;
5969   loop_bb = post_edge->src;
5970   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5971 
5972   post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5973   post_edge->probability = profile_probability::even ();
5974   edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5975   loop_edge->probability = profile_probability::even ();
5976   set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5977   set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5978 
5979   gphi *phi = create_phi_node (expect_var, loop_bb);
5980   add_phi_arg (phi, init_var, pre_edge, loc);
5981   add_phi_arg (phi, actual_var, loop_edge, loc);
5982 
5983   loop *loop = alloc_loop ();
5984   loop->header = loop_bb;
5985   loop->latch = loop_bb;
5986   add_loop (loop, loop_bb->loop_father);
5987 
5988   return fold_build1 (code, var_type, write_var);
5989 }
5990 
5991 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
5992    GSI.  This is necessary for types larger than 64 bits, where there
5993    is no cmp&swap instruction to implement a lockless scheme.  We use
5994    a lock variable in global memory.
5995 
5996    while (cmp&swap (&lock_var, 0, 1))
5997      continue;
5998    T accum = *ptr;
5999    accum = accum OP var;
6000    *ptr = accum;
6001    cmp&swap (&lock_var, 1, 0);
6002    return accum;
6003 
6004    A lock in global memory is necessary to force execution engine
6005    descheduling and avoid resource starvation that can occur if the
6006    lock is in .shared memory.  */
6007 
6008 static tree
6009 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6010 		       tree ptr, tree var, tree_code op)
6011 {
6012   tree var_type = TREE_TYPE (var);
6013   tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6014   tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6015   tree uns_locked = build_int_cst (unsigned_type_node, 1);
6016 
6017   /* Split the block just before the gsi.  Insert a gimple nop to make
6018      this easier.  */
6019   gimple *nop = gimple_build_nop ();
6020   gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6021   basic_block entry_bb = gsi_bb (*gsi);
6022   edge entry_edge = split_block (entry_bb, nop);
6023   basic_block lock_bb = entry_edge->dest;
6024   /* Reset the iterator.  */
6025   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6026 
6027   /* Build and insert the locking sequence.  */
6028   gimple_seq lock_seq = NULL;
6029   tree lock_var = make_ssa_name (unsigned_type_node);
6030   tree lock_expr = nvptx_global_lock_addr ();
6031   lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6032 				   uns_unlocked, uns_locked);
6033   gimplify_assign (lock_var, lock_expr, &lock_seq);
6034   gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6035 				   NULL_TREE, NULL_TREE);
6036   gimple_seq_add_stmt (&lock_seq, cond);
6037   gimple *lock_end = gimple_seq_last (lock_seq);
6038   gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6039 
6040   /* Split the block just after the lock sequence.  */
6041   edge locked_edge = split_block (lock_bb, lock_end);
6042   basic_block update_bb = locked_edge->dest;
6043   lock_bb = locked_edge->src;
6044   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6045 
6046   /* Create the lock loop ... */
6047   locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6048   locked_edge->probability = profile_probability::even ();
6049   edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6050   loop_edge->probability = profile_probability::even ();
6051   set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6052   set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6053 
6054   /* ... and the loop structure.  */
6055   loop *lock_loop = alloc_loop ();
6056   lock_loop->header = lock_bb;
6057   lock_loop->latch = lock_bb;
6058   lock_loop->nb_iterations_estimate = 1;
6059   lock_loop->any_estimate = true;
6060   add_loop (lock_loop, entry_bb->loop_father);
6061 
6062   /* Build and insert the reduction calculation.  */
6063   gimple_seq red_seq = NULL;
6064   tree acc_in = make_ssa_name (var_type);
6065   tree ref_in = build_simple_mem_ref (ptr);
6066   TREE_THIS_VOLATILE (ref_in) = 1;
6067   gimplify_assign (acc_in, ref_in, &red_seq);
6068 
6069   tree acc_out = make_ssa_name (var_type);
6070   tree update_expr = fold_build2 (op, var_type, ref_in, var);
6071   gimplify_assign (acc_out, update_expr, &red_seq);
6072 
6073   tree ref_out = build_simple_mem_ref (ptr);
6074   TREE_THIS_VOLATILE (ref_out) = 1;
6075   gimplify_assign (ref_out, acc_out, &red_seq);
6076 
6077   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6078 
6079   /* Build & insert the unlock sequence.  */
6080   gimple_seq unlock_seq = NULL;
6081   tree unlock_expr = nvptx_global_lock_addr ();
6082   unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6083 				     uns_locked, uns_unlocked);
6084   gimplify_and_add (unlock_expr, &unlock_seq);
6085   gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6086 
6087   return acc_out;
6088 }
6089 
6090 /* Emit a sequence to update a reduction accumulator at *PTR with the
6091    value held in VAR using operator OP.  Return the updated value.
6092 
6093    TODO: optimize for atomic ops and independent complex ops.  */
6094 
6095 static tree
6096 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6097 			tree ptr, tree var, tree_code op)
6098 {
6099   tree type = TREE_TYPE (var);
6100   tree size = TYPE_SIZE (type);
6101 
6102   if (size == TYPE_SIZE (unsigned_type_node)
6103       || size == TYPE_SIZE (long_long_unsigned_type_node))
6104     return nvptx_lockless_update (loc, gsi, ptr, var, op);
6105   else
6106     return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6107 }
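
/* So 32- and 64-bit types (int, float, double, long long, ...) take the
   lockless cmp&swap loop, while e.g. a 128-bit type such as complex double
   falls back to the global-lock path above.  */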
6108 
6109 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
6110 
6111 static void
6112 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6113 {
6114   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6115   tree lhs = gimple_call_lhs (call);
6116   tree var = gimple_call_arg (call, 2);
6117   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6118   gimple_seq seq = NULL;
6119 
6120   push_gimplify_context (true);
6121 
6122   if (level != GOMP_DIM_GANG)
6123     {
6124       /* Copy the receiver object.  */
6125       tree ref_to_res = gimple_call_arg (call, 1);
6126 
6127       if (!integer_zerop (ref_to_res))
6128 	var = build_simple_mem_ref (ref_to_res);
6129     }
6130 
6131   if (level == GOMP_DIM_WORKER
6132       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6133     {
6134       /* Store incoming value to worker reduction buffer.  */
6135       tree offset = gimple_call_arg (call, 5);
6136       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6137 					     level == GOMP_DIM_VECTOR);
6138       tree ptr = make_ssa_name (TREE_TYPE (call));
6139 
6140       gimplify_assign (ptr, call, &seq);
6141       tree ref = build_simple_mem_ref (ptr);
6142       TREE_THIS_VOLATILE (ref) = 1;
6143       gimplify_assign (ref, var, &seq);
6144     }
6145 
6146   if (lhs)
6147     gimplify_assign (lhs, var, &seq);
6148 
6149   pop_gimplify_context (NULL);
6150   gsi_replace_with_seq (&gsi, seq, true);
6151 }
6152 
6153 /* NVPTX implementation of GOACC_REDUCTION_INIT.  */
6154 
6155 static void
6156 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6157 {
6158   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6159   tree lhs = gimple_call_lhs (call);
6160   tree var = gimple_call_arg (call, 2);
6161   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6162   enum tree_code rcode
6163     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6164   tree init = omp_reduction_init_op (gimple_location (call), rcode,
6165 				     TREE_TYPE (var));
6166   gimple_seq seq = NULL;
6167 
6168   push_gimplify_context (true);
6169 
6170   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6171     {
6172       /* Initialize the non-zero vector lanes to INIT_VAL (OP).  */
6173       tree tid = make_ssa_name (integer_type_node);
6174       tree dim_vector = gimple_call_arg (call, 3);
6175       gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6176 						     dim_vector);
6177       gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6178 					     NULL_TREE, NULL_TREE);
6179 
6180       gimple_call_set_lhs (tid_call, tid);
6181       gimple_seq_add_stmt (&seq, tid_call);
6182       gimple_seq_add_stmt (&seq, cond_stmt);
6183 
6184       /* Split the block just after the call.  */
6185       edge init_edge = split_block (gsi_bb (gsi), call);
6186       basic_block init_bb = init_edge->dest;
6187       basic_block call_bb = init_edge->src;
6188 
6189       /* Fixup flags from call_bb to init_bb.  */
6190       init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6191       init_edge->probability = profile_probability::even ();
6192 
6193       /* Set the initialization stmts.  */
6194       gimple_seq init_seq = NULL;
6195       tree init_var = make_ssa_name (TREE_TYPE (var));
6196       gimplify_assign (init_var, init, &init_seq);
6197       gsi = gsi_start_bb (init_bb);
6198       gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6199 
6200       /* Split block just after the init stmt.  */
6201       gsi_prev (&gsi);
6202       edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6203       basic_block dst_bb = inited_edge->dest;
6204 
6205       /* Create false edge from call_bb to dst_bb.  */
6206       edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6207       nop_edge->probability = profile_probability::even ();
6208 
6209       /* Create phi node in dst block.  */
6210       gphi *phi = create_phi_node (lhs, dst_bb);
6211       add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6212       add_phi_arg (phi, var, nop_edge, gimple_location (call));
6213 
6214       /* Reset dominator of dst bb.  */
6215       set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6216 
6217       /* Reset the gsi.  */
6218       gsi = gsi_for_stmt (call);
6219     }
6220   else
6221     {
6222       if (level == GOMP_DIM_GANG)
6223 	{
6224 	  /* If there's no receiver object, propagate the incoming VAR.  */
6225 	  tree ref_to_res = gimple_call_arg (call, 1);
6226 	  if (integer_zerop (ref_to_res))
6227 	    init = var;
6228 	}
6229 
6230       if (lhs != NULL_TREE)
6231 	gimplify_assign (lhs, init, &seq);
6232     }
6233 
6234   pop_gimplify_context (NULL);
6235   gsi_replace_with_seq (&gsi, seq, true);
6236 }
6237 
6238 /* NVPTX implementation of GOACC_REDUCTION_FINI.  */
6239 
6240 static void
6241 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
6242 {
6243   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6244   tree lhs = gimple_call_lhs (call);
6245   tree ref_to_res = gimple_call_arg (call, 1);
6246   tree var = gimple_call_arg (call, 2);
6247   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6248   enum tree_code op
6249     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6250   gimple_seq seq = NULL;
6251   tree r = NULL_TREE;
6252 
6253   push_gimplify_context (true);
6254 
6255   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6256     {
6257       /* Emit binary shuffle tree.  TODO: Emit this as an actual loop,
6258 	 but that requires a method of emitting a unified jump at the
6259 	 gimple level.  */
6260       for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
6261 	{
6262 	  tree other_var = make_ssa_name (TREE_TYPE (var));
6263 	  nvptx_generate_vector_shuffle (gimple_location (call),
6264 					 other_var, var, shfl, &seq);
6265 
6266 	  r = make_ssa_name (TREE_TYPE (var));
6267 	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6268 					   var, other_var), &seq);
6269 	  var = r;
6270 	}
6271     }
6272   else
6273     {
6274       tree accum = NULL_TREE;
6275 
6276       if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
6277 	{
6278 	  /* Get reduction buffer address.  */
6279 	  tree offset = gimple_call_arg (call, 5);
6280 	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6281 						 level == GOMP_DIM_VECTOR);
6282 	  tree ptr = make_ssa_name (TREE_TYPE (call));
6283 
6284 	  gimplify_assign (ptr, call, &seq);
6285 	  accum = ptr;
6286 	}
6287       else if (integer_zerop (ref_to_res))
6288 	r = var;
6289       else
6290 	accum = ref_to_res;
6291 
6292       if (accum)
6293 	{
6294 	  /* UPDATE the accumulator.  */
6295 	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6296 	  seq = NULL;
6297 	  r = nvptx_reduction_update (gimple_location (call), &gsi,
6298 				      accum, var, op);
6299 	}
6300     }
6301 
6302   if (lhs)
6303     gimplify_assign (lhs, r, &seq);
6304   pop_gimplify_context (NULL);
6305 
6306   gsi_replace_with_seq (&gsi, seq, true);
6307 }
6308 
6309 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */
6310 
6311 static void
6312 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
6313 {
6314   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6315   tree lhs = gimple_call_lhs (call);
6316   tree var = gimple_call_arg (call, 2);
6317   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6318   gimple_seq seq = NULL;
6319 
6320   push_gimplify_context (true);
6321   if (level == GOMP_DIM_WORKER
6322       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6323     {
6324       /* Read the worker reduction buffer.  */
6325       tree offset = gimple_call_arg (call, 5);
6326       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6327 					     level == GOMP_DIM_VECTOR);
6328       tree ptr = make_ssa_name (TREE_TYPE (call));
6329 
6330       gimplify_assign (ptr, call, &seq);
6331       var = build_simple_mem_ref (ptr);
6332       TREE_THIS_VOLATILE (var) = 1;
6333     }
6334 
6335   if (level != GOMP_DIM_GANG)
6336     {
6337       /* Write to the receiver object.  */
6338       tree ref_to_res = gimple_call_arg (call, 1);
6339 
6340       if (!integer_zerop (ref_to_res))
6341 	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6342     }
6343 
6344   if (lhs)
6345     gimplify_assign (lhs, var, &seq);
6346 
6347   pop_gimplify_context (NULL);
6348 
6349   gsi_replace_with_seq (&gsi, seq, true);
6350 }
6351 
6352 /* NVPTX reduction expander.  */
6353 
6354 static void
6355 nvptx_goacc_reduction (gcall *call)
6356 {
6357   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6358   offload_attrs oa;
6359 
6360   populate_offload_attrs (&oa);
6361 
6362   switch (code)
6363     {
6364     case IFN_GOACC_REDUCTION_SETUP:
6365       nvptx_goacc_reduction_setup (call, &oa);
6366       break;
6367 
6368     case IFN_GOACC_REDUCTION_INIT:
6369       nvptx_goacc_reduction_init (call, &oa);
6370       break;
6371 
6372     case IFN_GOACC_REDUCTION_FINI:
6373       nvptx_goacc_reduction_fini (call, &oa);
6374       break;
6375 
6376     case IFN_GOACC_REDUCTION_TEARDOWN:
6377       nvptx_goacc_reduction_teardown (call, &oa);
6378       break;
6379 
6380     default:
6381       gcc_unreachable ();
6382     }
6383 }
6384 
6385 static bool
6386 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6387 			      rtx x ATTRIBUTE_UNUSED)
6388 {
6389   return true;
6390 }
6391 
6392 static bool
6393 nvptx_vector_mode_supported (machine_mode mode)
6394 {
6395   return (mode == V2SImode
6396 	  || mode == V2DImode);
6397 }
6398 
6399 /* Return the preferred mode for vectorizing scalar MODE.  */
6400 
6401 static machine_mode
6402 nvptx_preferred_simd_mode (scalar_mode mode)
6403 {
6404   switch (mode)
6405     {
6406     case E_DImode:
6407       return V2DImode;
6408     case E_SImode:
6409       return V2SImode;
6410 
6411     default:
6412       return default_preferred_simd_mode (mode);
6413     }
6414 }
6415 
6416 unsigned int
6417 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6418 {
6419   if (TREE_CODE (type) == INTEGER_TYPE)
6420     {
6421       unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6422       if (size == GET_MODE_SIZE (TImode))
6423 	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6424     }
6425 
6426   return basic_align;
6427 }
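
/* E.g. an __int128 object is given the alignment of the mode it is split
   into (assuming TImode splits into DImode halves, that is 64 bits) rather
   than its natural 128-bit alignment.  */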
6428 
6429 /* Implement TARGET_MODES_TIEABLE_P.  */
6430 
6431 static bool
6432 nvptx_modes_tieable_p (machine_mode, machine_mode)
6433 {
6434   return false;
6435 }
6436 
6437 /* Implement TARGET_HARD_REGNO_NREGS.  */
6438 
6439 static unsigned int
6440 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6441 {
6442   return 1;
6443 }
6444 
6445 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
6446 
6447 static bool
6448 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6449 {
6450   return false;
6451 }
6452 
6453 static GTY(()) tree nvptx_previous_fndecl;
6454 
6455 static void
6456 nvptx_set_current_function (tree fndecl)
6457 {
6458   if (!fndecl || fndecl == nvptx_previous_fndecl)
6459     return;
6460 
6461   nvptx_previous_fndecl = fndecl;
6462   vector_red_partition = 0;
6463   oacc_bcast_partition = 0;
6464 }
6465 
6466 #undef TARGET_OPTION_OVERRIDE
6467 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6468 
6469 #undef TARGET_ATTRIBUTE_TABLE
6470 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6471 
6472 #undef TARGET_LRA_P
6473 #define TARGET_LRA_P hook_bool_void_false
6474 
6475 #undef TARGET_LEGITIMATE_ADDRESS_P
6476 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6477 
6478 #undef  TARGET_PROMOTE_FUNCTION_MODE
6479 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6480 
6481 #undef TARGET_FUNCTION_ARG
6482 #define TARGET_FUNCTION_ARG nvptx_function_arg
6483 #undef TARGET_FUNCTION_INCOMING_ARG
6484 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6485 #undef TARGET_FUNCTION_ARG_ADVANCE
6486 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6487 #undef TARGET_FUNCTION_ARG_BOUNDARY
6488 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6489 #undef TARGET_PASS_BY_REFERENCE
6490 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6491 #undef TARGET_FUNCTION_VALUE_REGNO_P
6492 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6493 #undef TARGET_FUNCTION_VALUE
6494 #define TARGET_FUNCTION_VALUE nvptx_function_value
6495 #undef TARGET_LIBCALL_VALUE
6496 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6497 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6498 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6499 #undef TARGET_GET_DRAP_RTX
6500 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6501 #undef TARGET_SPLIT_COMPLEX_ARG
6502 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6503 #undef TARGET_RETURN_IN_MEMORY
6504 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6505 #undef TARGET_OMIT_STRUCT_RETURN_REG
6506 #define TARGET_OMIT_STRUCT_RETURN_REG true
6507 #undef TARGET_STRICT_ARGUMENT_NAMING
6508 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6509 #undef TARGET_CALL_ARGS
6510 #define TARGET_CALL_ARGS nvptx_call_args
6511 #undef TARGET_END_CALL_ARGS
6512 #define TARGET_END_CALL_ARGS nvptx_end_call_args
6513 
6514 #undef TARGET_ASM_FILE_START
6515 #define TARGET_ASM_FILE_START nvptx_file_start
6516 #undef TARGET_ASM_FILE_END
6517 #define TARGET_ASM_FILE_END nvptx_file_end
6518 #undef TARGET_ASM_GLOBALIZE_LABEL
6519 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6520 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6521 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6522 #undef  TARGET_PRINT_OPERAND
6523 #define TARGET_PRINT_OPERAND nvptx_print_operand
6524 #undef  TARGET_PRINT_OPERAND_ADDRESS
6525 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6526 #undef  TARGET_PRINT_OPERAND_PUNCT_VALID_P
6527 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6528 #undef TARGET_ASM_INTEGER
6529 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6530 #undef TARGET_ASM_DECL_END
6531 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6532 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6533 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6534 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6535 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6536 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6537 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6538 
6539 #undef TARGET_MACHINE_DEPENDENT_REORG
6540 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6541 #undef TARGET_NO_REGISTER_ALLOCATION
6542 #define TARGET_NO_REGISTER_ALLOCATION true
6543 
6544 #undef TARGET_ENCODE_SECTION_INFO
6545 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6546 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6547 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6548 
6549 #undef TARGET_VECTOR_ALIGNMENT
6550 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6551 
6552 #undef TARGET_CANNOT_COPY_INSN_P
6553 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6554 
6555 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6556 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6557 
6558 #undef TARGET_INIT_BUILTINS
6559 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6560 #undef TARGET_EXPAND_BUILTIN
6561 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6562 #undef  TARGET_BUILTIN_DECL
6563 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6564 
6565 #undef TARGET_SIMT_VF
6566 #define TARGET_SIMT_VF nvptx_simt_vf
6567 
6568 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6569 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
6570 
6571 #undef TARGET_GOACC_VALIDATE_DIMS
6572 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6573 
6574 #undef TARGET_GOACC_DIM_LIMIT
6575 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6576 
6577 #undef TARGET_GOACC_FORK_JOIN
6578 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6579 
6580 #undef TARGET_GOACC_REDUCTION
6581 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6582 
6583 #undef TARGET_CANNOT_FORCE_CONST_MEM
6584 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6585 
6586 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6587 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6588 
6589 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6590 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6591     nvptx_preferred_simd_mode
6592 
6593 #undef TARGET_MODES_TIEABLE_P
6594 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6595 
6596 #undef TARGET_HARD_REGNO_NREGS
6597 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6598 
6599 #undef TARGET_CAN_CHANGE_MODE_CLASS
6600 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6601 
6602 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6603 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6604 
6605 #undef TARGET_SET_CURRENT_FUNCTION
6606 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6607 
6608 struct gcc_target targetm = TARGET_INITIALIZER;
6609 
6610 #include "gt-nvptx.h"
6611