/* Target code for NVPTX.
   Copyright (C) 2014-2018 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
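
/* The area value occupies the three flag bits above
   SYMBOL_FLAG_MACH_DEP_SHIFT (hence the mask of 7 in the getter),
   which is wide enough for every nvptx_data_area value.  */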

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers.  This is used for both
   worker-neutering and worker broadcasting.  It is shared by all
   functions emitted.  The buffer is placed in shared memory.  It'd be
   nice if PTX supported common blocks, because then this could be
   shared across TUs (taking the largest size).  */
static unsigned worker_bcast_size;
static unsigned worker_bcast_align;
static GTY(()) rtx worker_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with -fopenacc", optname);
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
  SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
  worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
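  /* The SUBREG_OFFSET convention: -1 means print a register pair with
     enclosing braces, -2 a pair without them (used when declaring the
     registers), and a non-negative value names one half of a pair by
     its byte offset within the full register.  */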
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}


/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}
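
/* For instance, with nvptx's 64-bit word, SImode, DImode, SFmode and
   DFmode scalars travel in registers, while aggregates, vector types,
   and anything wider than UNITS_PER_WORD end up in memory.  */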

/* A non-memory argument of mode MODE is being passed, determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t ARG_UNUSED (cum_v), machine_mode mode,
		    const_tree, bool named)
{
  if (mode == VOIDmode || !named)
    return NULL_RTX;

  return gen_reg_rtx (mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
			     const_tree, bool named)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (mode == VOIDmode || !named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v,
			    machine_mode ARG_UNUSED (mode),
			    const_tree ARG_UNUSED (type),
			    bool ARG_UNUSED (named))
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t ARG_UNUSED (cum),
			 machine_mode mode, const_tree type,
			 bool ARG_UNUSED (named))
{
  return pass_in_memory (mode, type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
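
/* As an illustration, for an SImode argument 0 the prototype branch
   appends " (.param.u32 %in_ar0", while the prologue branch (FOR_REG
   of zero) emits
	.reg.u32 %ar0;
	ld.param.u32 %ar0, [%in_ar0];  */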

/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
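
/* E.g. for a promoted SImode return value this produces
   "(.param.u32 %value_out) " in a prototype and "\t.reg.u32 %value;"
   in a prologue, %value being the return register's name.  */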

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}
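
/* A declaration for a visible int-returning function taking one int
   argument thus comes out as, e.g.:

   // BEGIN GLOBAL FUNCTION DECL: foo
   .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);  */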

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
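
/* For a 16-byte frame aligned to 8 bytes under the 64-bit ABI this
   emits, e.g.:
	.local .align 8 .b8 %frame_ar[16];
	.reg.u64 %frame;
	cvta.local.u64 %frame, %frame_ar;  */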

/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  fprintf (file, "\t}\n");
}
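
/* For the "y" axis and a predicate in (say) %r77 the emitted block is:
	{
		.reg.u32 %y;
		mov.u32 %y, %tid.y;
		setp.ne.u32 %r77, %y, 0;
	}  */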

/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}

/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
        << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}

/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }
  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
}

/* Output code for switching uniform-simt state.  ENTERING indicates whether
   we are entering or leaving a non-uniform execution region.  */

static void
nvptx_output_unisimt_switch (FILE *file, bool entering)
{
  if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
    return;
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
  fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
  if (!crtl->is_leaf)
    {
      int loc = REGNO (cfun->machine->unisimt_location);
      fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
    }
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
      fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
	       master, entering ? "%ustmp2" : "0");
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
    }
  fprintf (file, "\t}\n");
}

/* Output code for allocating per-lane storage and switching soft-stack pointer.
   ENTERING indicates whether we are entering or leaving non-uniform execution.
   PTR is the register pointing to allocated storage, it is assigned to on
   entering and used to restore state on leaving.  SIZE and ALIGN are used only
   on entering.  */

static void
nvptx_output_softstack_switch (FILE *file, bool entering,
			       rtx ptr, rtx size, rtx align)
{
  gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
  if (crtl->is_leaf && !cfun->machine->simt_stack_size)
    return;
  int bits = POINTER_SIZE, regno = REGNO (ptr);
  fprintf (file, "\t{\n");
  if (entering)
    {
      fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
	       cfun->machine->simt_stack_size);
      fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
      if (CONST_INT_P (size))
	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
      else
	output_reg (file, REGNO (size), VOIDmode);
      fputs (";\n", file);
      if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
	fprintf (file,
		 "\t\tand.u%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
		 bits, regno, regno, UINTVAL (align));
    }
  if (cfun->machine->has_softstack)
    {
      const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
      if (entering)
	{
	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
		   bits, regno, bits / 8, reg_stack);
	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
		   bits, reg_stack, regno, bits / 8);
	}
      else
	{
	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
		   bits, reg_stack, regno, bits / 8);
	}
      nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
    }
  fprintf (file, "\t}\n");
}

/* Output code to enter non-uniform execution region.  DEST is a register
   to hold a per-lane allocation given by SIZE and ALIGN.  */

const char *
nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
{
  nvptx_output_unisimt_switch (asm_out_file, true);
  nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
  return "";
}

/* Output code to leave non-uniform execution region.  SRC is the register
   holding per-lane storage previously allocated by omp_simt_enter insn.  */

const char *
nvptx_output_simt_exit (rtx src)
{
  nvptx_output_unisimt_switch (asm_out_file, false);
  nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
  return "";
}

/* Output instruction that sets soft stack pointer in shared memory to the
   value in register given by SRC_REGNO.  */

const char *
nvptx_output_set_softstack (unsigned src_regno)
{
  if (cfun->machine->has_softstack && !crtl->is_leaf)
    {
      fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
      output_reg (asm_out_file, src_regno, VOIDmode);
      fprintf (asm_out_file, ";\n");
    }
  return "";
}

/* Output a return instruction.  Also copy the return value to its outgoing
   location.  */

const char *
nvptx_output_return (void)
{
  machine_mode mode = (machine_mode)cfun->machine->return_mode;

  if (mode != VOIDmode)
    fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
	     nvptx_ptx_type_from_mode (mode, false),
	     reg_names[NVPTX_RETURN_REGNUM],
	     reg_names[NVPTX_RETURN_REGNUM]);

  return "ret;";
}
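
/* With an SImode return value pending, the output is thus:
	st.param.u32	[%value_out], %value;
	ret;  */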
1456 
1457 /* Terminate a function by writing a closing brace to FILE.  */
1458 
1459 void
nvptx_function_end(FILE * file)1460 nvptx_function_end (FILE *file)
1461 {
1462   fprintf (file, "}\n");
1463 }
1464 
1465 /* Decide whether we can make a sibling call to a function.  For ptx, we
1466    can't.  */
1467 
1468 static bool
nvptx_function_ok_for_sibcall(tree,tree)1469 nvptx_function_ok_for_sibcall (tree, tree)
1470 {
1471   return false;
1472 }
1473 
1474 /* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */
1475 
1476 static rtx
nvptx_get_drap_rtx(void)1477 nvptx_get_drap_rtx (void)
1478 {
1479   if (TARGET_SOFT_STACK && stack_realign_drap)
1480     return arg_pointer_rtx;
1481   return NULL_RTX;
1482 }
1483 
1484 /* Implement the TARGET_CALL_ARGS hook.  Record information about one
1485    argument to the next call.  */
1486 
1487 static void
nvptx_call_args(rtx arg,tree fntype)1488 nvptx_call_args (rtx arg, tree fntype)
1489 {
1490   if (!cfun->machine->doing_call)
1491     {
1492       cfun->machine->doing_call = true;
1493       cfun->machine->is_varadic = false;
1494       cfun->machine->num_args = 0;
1495 
1496       if (fntype && stdarg_p (fntype))
1497 	{
1498 	  cfun->machine->is_varadic = true;
1499 	  cfun->machine->has_varadic = true;
1500 	  cfun->machine->num_args++;
1501 	}
1502     }
1503 
1504   if (REG_P (arg) && arg != pc_rtx)
1505     {
1506       cfun->machine->num_args++;
1507       cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1508 						  cfun->machine->call_args);
1509     }
1510 }
1511 
1512 /* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
1513    information we recorded.  */
1514 
1515 static void
nvptx_end_call_args(void)1516 nvptx_end_call_args (void)
1517 {
1518   cfun->machine->doing_call = false;
1519   free_EXPR_LIST_list (&cfun->machine->call_args);
1520 }
1521 
1522 /* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
1523    track of whether calls involving static chains or varargs were seen
1524    in the current function.
1525    For libcalls, maintain a hash table of decls we have seen, and
1526    record a function decl for later when encountering a new one.  */
1527 
1528 void
nvptx_expand_call(rtx retval,rtx address)1529 nvptx_expand_call (rtx retval, rtx address)
1530 {
1531   rtx callee = XEXP (address, 0);
1532   rtx varargs = NULL_RTX;
1533   unsigned parallel = 0;
1534 
1535   if (!call_insn_operand (callee, Pmode))
1536     {
1537       callee = force_reg (Pmode, callee);
1538       address = change_address (address, QImode, callee);
1539     }
1540 
1541   if (GET_CODE (callee) == SYMBOL_REF)
1542     {
1543       tree decl = SYMBOL_REF_DECL (callee);
1544       if (decl != NULL_TREE)
1545 	{
1546 	  if (DECL_STATIC_CHAIN (decl))
1547 	    cfun->machine->has_chain = true;
1548 
1549 	  tree attr = oacc_get_fn_attrib (decl);
1550 	  if (attr)
1551 	    {
1552 	      tree dims = TREE_VALUE (attr);
1553 
1554 	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1555 	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1556 		{
1557 		  if (TREE_PURPOSE (dims)
1558 		      && !integer_zerop (TREE_PURPOSE (dims)))
1559 		    break;
1560 		  /* Not on this axis.  */
1561 		  parallel ^= GOMP_DIM_MASK (ix);
1562 		  dims = TREE_CHAIN (dims);
1563 		}
1564 	    }
1565 	}
1566     }
1567 
1568   unsigned nargs = cfun->machine->num_args;
1569   if (cfun->machine->is_varadic)
1570     {
1571       varargs = gen_reg_rtx (Pmode);
1572       emit_move_insn (varargs, stack_pointer_rtx);
1573     }
1574 
1575   rtvec vec = rtvec_alloc (nargs + 1);
1576   rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1577   int vec_pos = 0;
1578 
1579   rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1580   rtx tmp_retval = retval;
1581   if (retval)
1582     {
1583       if (!nvptx_register_operand (retval, GET_MODE (retval)))
1584 	tmp_retval = gen_reg_rtx (GET_MODE (retval));
1585       call = gen_rtx_SET (tmp_retval, call);
1586     }
1587   XVECEXP (pat, 0, vec_pos++) = call;
1588 
1589   /* Construct the call insn, including a USE for each argument pseudo
1590      register.  These will be used when printing the insn.  */
1591   for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1592     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1593 
1594   if (varargs)
1595     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1596 
1597   gcc_assert (vec_pos == XVECLEN (pat, 0));
1598 
1599   nvptx_emit_forking (parallel, true);
1600   emit_call_insn (pat);
1601   nvptx_emit_joining (parallel, true);
1602 
1603   if (tmp_retval != retval)
1604     emit_move_insn (retval, tmp_retval);
1605 }
1606 
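/* An informal sketch (not from the original sources): for a direct
   call "f (x, y)" returning an SImode value, the PARALLEL built above
   has roughly the shape

     (parallel [(set (reg:SI <retval>)
                     (call (mem:QI (symbol_ref "f")) (const_int 0)))
                (use (reg:SI <y>))
                (use (reg:SI <x>))])

   with the USEs in reverse order because call_args is consed up as a
   LIFO list; they keep the argument pseudos live until the insn is
   printed.  */
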
1607 /* Emit a comparison COMPARE, and return the new test to be used in the
1608    jump.  */
1609 
1610 rtx
1611 nvptx_expand_compare (rtx compare)
1612 {
1613   rtx pred = gen_reg_rtx (BImode);
1614   rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1615 			    XEXP (compare, 0), XEXP (compare, 1));
1616   emit_insn (gen_rtx_SET (pred, cmp));
1617   return gen_rtx_NE (BImode, pred, const0_rtx);
1618 }
1619 
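/* For example (informal): expanding the comparison (gt:SI a b) emits

     (set (reg:BI pred) (gt:BI a b))

   -- printed later as a PTX "setp.gt" instruction -- and hands back
   (ne:BI pred (const_int 0)) for the conditional jump to test.  */
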
1620 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
1621 
1622 void
1623 nvptx_expand_oacc_fork (unsigned mode)
1624 {
1625   nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1626 }
1627 
1628 void
1629 nvptx_expand_oacc_join (unsigned mode)
1630 {
1631   nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1632 }
1633 
1634 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1635    objects.  */
1636 
1637 static rtx
1638 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1639 {
1640   rtx res;
1641 
1642   switch (GET_MODE (src))
1643     {
1644     case E_DImode:
1645       res = gen_unpackdisi2 (dst0, dst1, src);
1646       break;
1647     case E_DFmode:
1648       res = gen_unpackdfsi2 (dst0, dst1, src);
1649       break;
1650     default: gcc_unreachable ();
1651     }
1652   return res;
1653 }
1654 
1655 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1656    object.  */
1657 
1658 static rtx
1659 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1660 {
1661   rtx res;
1662 
1663   switch (GET_MODE (dst))
1664     {
1665     case E_DImode:
1666       res = gen_packsidi2 (dst, src0, src1);
1667       break;
1668     case E_DFmode:
1669       res = gen_packsidf2 (dst, src0, src1);
1670       break;
1671     default: gcc_unreachable ();
1672     }
1673   return res;
1674 }
1675 
1676 /* Generate an instruction or sequence to shuffle SRC into DST across
1677    the lanes of a warp, using shuffle kind KIND and lane index IDX.  */
1678 
1679 rtx
1680 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1681 {
1682   rtx res;
1683 
1684   switch (GET_MODE (dst))
1685     {
1686     case E_SImode:
1687       res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1688       break;
1689     case E_SFmode:
1690       res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1691       break;
1692     case E_DImode:
1693     case E_DFmode:
1694       {
1695 	rtx tmp0 = gen_reg_rtx (SImode);
1696 	rtx tmp1 = gen_reg_rtx (SImode);
1697 
1698 	start_sequence ();
1699 	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1700 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1701 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1702 	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1703 	res = get_insns ();
1704 	end_sequence ();
1705       }
1706       break;
1707     case E_BImode:
1708       {
1709 	rtx tmp = gen_reg_rtx (SImode);
1710 
1711 	start_sequence ();
1712 	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1713 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1714 	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1715 	res = get_insns ();
1716 	end_sequence ();
1717       }
1718       break;
1719     case E_QImode:
1720     case E_HImode:
1721       {
1722 	rtx tmp = gen_reg_rtx (SImode);
1723 
1724 	start_sequence ();
1725 	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1726 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1727 	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1728 						    tmp)));
1729 	res = get_insns ();
1730 	end_sequence ();
1731       }
1732       break;
1733 
1734     default:
1735       gcc_unreachable ();
1736     }
1737   return res;
1738 }
1739 
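/* A worked example (informal): broadcasting a DFmode register from
   lane 0 takes the 64-bit path above and becomes

     unpack  src -> tmp0, tmp1      two 32-bit halves
     shuffle tmp0, shuffle tmp1     two shfl operations
     pack    tmp0, tmp1 -> dst

   since the underlying PTX shfl instruction only moves 32-bit data.  */
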
1740 /* Generate an instruction or sequence to broadcast register REG
1741    across the vectors of a single warp.  */
1742 
1743 static rtx
1744 nvptx_gen_vcast (rtx reg)
1745 {
1746   return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1747 }
1748 
1749 /* Structure used when generating a worker-level spill or fill.  */
1750 
1751 struct wcast_data_t
1752 {
1753   rtx base;  /* Register holding base addr of buffer.  */
1754   rtx ptr;  /* Iteration var, if needed.  */
1755   unsigned offset; /* Offset into worker buffer.  */
1756 };
1757 
1758 /* Direction of the spill/fill and looping setup/teardown indicator.  */
1759 
1760 enum propagate_mask
1761   {
1762     PM_read = 1 << 0,
1763     PM_write = 1 << 1,
1764     PM_loop_begin = 1 << 2,
1765     PM_loop_end = 1 << 3,
1766 
1767     PM_read_write = PM_read | PM_write
1768   };
1769 
1770 /* Generate instruction(s) to spill or fill register REG to/from the
1771    worker broadcast array.  PM indicates what is to be done, REP
1772    how many loop iterations will be executed (0 for not a loop).  */
1773 
1774 static rtx
1775 nvptx_gen_wcast (rtx reg, propagate_mask pm, unsigned rep, wcast_data_t *data)
1776 {
1777   rtx  res;
1778   machine_mode mode = GET_MODE (reg);
1779 
1780   switch (mode)
1781     {
1782     case E_BImode:
1783       {
1784 	rtx tmp = gen_reg_rtx (SImode);
1785 
1786 	start_sequence ();
1787 	if (pm & PM_read)
1788 	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1789 	emit_insn (nvptx_gen_wcast (tmp, pm, rep, data));
1790 	if (pm & PM_write)
1791 	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1792 	res = get_insns ();
1793 	end_sequence ();
1794       }
1795       break;
1796 
1797     default:
1798       {
1799 	rtx addr = data->ptr;
1800 
1801 	if (!addr)
1802 	  {
1803 	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1804 
1805 	    if (align > worker_bcast_align)
1806 	      worker_bcast_align = align;
1807 	    data->offset = (data->offset + align - 1) & ~(align - 1);
1808 	    addr = data->base;
1809 	    if (data->offset)
1810 	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1811 	  }
1812 
1813 	addr = gen_rtx_MEM (mode, addr);
1814 	if (pm == PM_read)
1815 	  res = gen_rtx_SET (addr, reg);
1816 	else if (pm == PM_write)
1817 	  res = gen_rtx_SET (reg, addr);
1818 	else
1819 	  gcc_unreachable ();
1820 
1821 	if (data->ptr)
1822 	  {
1823 	    /* We're using a ptr, increment it.  */
1824 	    start_sequence ();
1825 
1826 	    emit_insn (res);
1827 	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
1828 				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1829 	    res = get_insns ();
1830 	    end_sequence ();
1831 	  }
1832 	else
1833 	  rep = 1;
1834 	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1835       }
1836       break;
1837     }
1838   return res;
1839 }
1840 
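/* Example (a sketch): spilling an SImode register R with PM_read and
   no iteration pointer yields a single

     (set (mem:SI (plus:DI base offset)) (reg:SI R))

   whereas the looping form (data->ptr non-null) follows each access
   with an add of GET_MODE_SIZE to the pointer, so REP iterations walk
   consecutive slots of the broadcast buffer.  */
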
1841 /* Returns true if X is a valid address for use in a memory reference.  */
1842 
1843 static bool
1844 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1845 {
1846   enum rtx_code code = GET_CODE (x);
1847 
1848   switch (code)
1849     {
1850     case REG:
1851       return true;
1852 
1853     case PLUS:
1854       if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1855 	return true;
1856       return false;
1857 
1858     case CONST:
1859     case SYMBOL_REF:
1860     case LABEL_REF:
1861       return true;
1862 
1863     default:
1864       return false;
1865     }
1866 }
1867 
1868 /* Machinery to output constant initializers.  When beginning an
1869    initializer, we decide on a fragment size (which is visible in ptx
1870    in the type used), and then all initializer data is buffered until
1871    a fragment is filled and ready to be written out.  */
1872 
1873 static struct
1874 {
1875   unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
1876   unsigned HOST_WIDE_INT val; /* Current fragment value.  */
1877   unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
1878 					out.  */
1879   unsigned size;  /* Fragment size to accumulate.  */
1880   unsigned offset;  /* Offset within current fragment.  */
1881   bool started;   /* Whether we've output any initializer.  */
1882 } init_frag;
1883 
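/* For instance (informal): a 10-byte initializer emitted with a
   4-byte fragment size buffers bytes into init_frag.val, flushes one
   ".u32" element whenever offset reaches 4, and starts with
   remaining == 3, the last element being zero-padded.  */
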
1884 /* The current fragment is full; write it out.  SYM may provide a
1885    symbolic reference we should output, in which case the fragment
1886    value is the addend.  */
1887 
1888 static void
1889 output_init_frag (rtx sym)
1890 {
1891   fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1892   unsigned HOST_WIDE_INT val = init_frag.val;
1893 
1894   init_frag.started = true;
1895   init_frag.val = 0;
1896   init_frag.offset = 0;
1897   init_frag.remaining--;
1898 
1899   if (sym)
1900     {
1901       bool function = (SYMBOL_REF_DECL (sym)
1902 		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1903       if (!function)
1904 	fprintf (asm_out_file, "generic(");
1905       output_address (VOIDmode, sym);
1906       if (!function)
1907 	fprintf (asm_out_file, ")");
1908       if (val)
1909 	fprintf (asm_out_file, " + ");
1910     }
1911 
1912   if (!sym || val)
1913     fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
1914 }
1915 
1916 /* Add value VAL of size SIZE to the data we're emitting, and keep
1917    writing out chunks as they fill up.  */
1918 
1919 static void
1920 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
1921 {
1922   val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
1923 
1924   for (unsigned part = 0; size; size -= part)
1925     {
1926       val >>= part * BITS_PER_UNIT;
1927       part = init_frag.size - init_frag.offset;
1928       if (part > size)
1929 	part = size;
1930 
1931       unsigned HOST_WIDE_INT partial
1932 	= val << (init_frag.offset * BITS_PER_UNIT);
1933       init_frag.val |= partial & init_frag.mask;
1934       init_frag.offset += part;
1935 
1936       if (init_frag.offset == init_frag.size)
1937 	output_init_frag (NULL);
1938     }
1939 }
1940 
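/* Worked example (not from the original sources): with fragment size
   4 and offset 1, assembling the 2-byte value 0x0102 ORs 0x010200
   into init_frag.val and advances offset to 3; nothing is written
   until the fragment fills.  */
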
1941 /* Target hook for assembling integer object X of size SIZE.  */
1942 
1943 static bool
1944 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
1945 {
1946   HOST_WIDE_INT val = 0;
1947 
1948   switch (GET_CODE (x))
1949     {
1950     default:
1951       /* Let the generic machinery figure it out, usually for a
1952 	 CONST_WIDE_INT.  */
1953       return false;
1954 
1955     case CONST_INT:
1956       nvptx_assemble_value (INTVAL (x), size);
1957       break;
1958 
1959     case CONST:
1960       x = XEXP (x, 0);
1961       gcc_assert (GET_CODE (x) == PLUS);
1962       val = INTVAL (XEXP (x, 1));
1963       x = XEXP (x, 0);
1964       gcc_assert (GET_CODE (x) == SYMBOL_REF);
1965       /* FALLTHROUGH */
1966 
1967     case SYMBOL_REF:
1968       gcc_assert (size == init_frag.size);
1969       if (init_frag.offset)
1970 	sorry ("cannot emit unaligned pointers in ptx assembly");
1971 
1972       nvptx_maybe_record_fnsym (x);
1973       init_frag.val = val;
1974       output_init_frag (x);
1975       break;
1976     }
1977 
1978   return true;
1979 }
1980 
1981 /* Output SIZE zero bytes.  We ignore the FILE argument since the
1982    functions we're calling to perform the output just use
1983    asm_out_file.  */
1984 
1985 void
1986 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
1987 {
1988   /* Finish the current fragment, if it's started.  */
1989   if (init_frag.offset)
1990     {
1991       unsigned part = init_frag.size - init_frag.offset;
1992       if (part > size)
1993 	part = (unsigned) size;
1994       size -= part;
1995       nvptx_assemble_value (0, part);
1996     }
1997 
1998   /* If this skip doesn't terminate the initializer, write as many
1999      remaining pieces as possible directly.  */
2000   if (size < init_frag.remaining * init_frag.size)
2001     {
2002       while (size >= init_frag.size)
2003 	{
2004 	  size -= init_frag.size;
2005 	  output_init_frag (NULL_RTX);
2006 	}
2007       if (size)
2008 	nvptx_assemble_value (0, size);
2009     }
2010 }
2011 
2012 /* Output a string STR with length SIZE.  As in nvptx_output_skip we
2013    ignore the FILE arg.  */
2014 
2015 void
2016 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2017 {
2018   for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2019     nvptx_assemble_value (str[i], 1);
2020 }
2021 
2022 /* Return true if TYPE is a record type where the last field is an array without
2023    a given dimension, i.e. a flexible array member.  */
2024 
2025 static bool
2026 flexible_array_member_type_p (const_tree type)
2027 {
2028   if (TREE_CODE (type) != RECORD_TYPE)
2029     return false;
2030 
2031   const_tree last_field = NULL_TREE;
2032   for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2033     last_field = f;
2034 
2035   if (!last_field)
2036     return false;
2037 
2038   const_tree last_field_type = TREE_TYPE (last_field);
2039   if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2040     return false;
2041 
2042   return (! TYPE_DOMAIN (last_field_type)
2043 	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2044 }
2045 
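/* E.g. the classic flexible-array-member layout matches here:

     struct msg { int len; char data[]; };

   whereas "char data[1]" would not, as its domain has a maximum.  */
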
2046 /* Emit a PTX variable decl and prepare for emission of its
2047    initializer.  NAME is the symbol name and SECTION the PTX data
2048    area.  The type is TYPE, object size SIZE and alignment is ALIGN.
2049    The caller has already emitted any indentation and linkage
2050    specifier.  It is responsible for any initializer, terminating ;
2051    and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
2052    this is the opposite way round from what PTX wants them!  */
2053 
2054 static void
2055 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2056 			   const_tree type, HOST_WIDE_INT size, unsigned align,
2057 			   bool undefined = false)
2058 {
2059   bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2060     && (TYPE_DOMAIN (type) == NULL_TREE);
2061 
2062   if (undefined && flexible_array_member_type_p (type))
2063     {
2064       size = 0;
2065       atype = true;
2066     }
2067 
2068   while (TREE_CODE (type) == ARRAY_TYPE)
2069     type = TREE_TYPE (type);
2070 
2071   if (TREE_CODE (type) == VECTOR_TYPE
2072       || TREE_CODE (type) == COMPLEX_TYPE)
2073     /* Neither vector nor complex types can contain the other.  */
2074     type = TREE_TYPE (type);
2075 
2076   unsigned elt_size = int_size_in_bytes (type);
2077 
2078   /* Largest mode we're prepared to accept.  For BLKmode types we
2079      don't know if it'll contain pointer constants, so have to choose
2080      pointer size, otherwise we can choose DImode.  */
2081   machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2082 
2083   elt_size |= GET_MODE_SIZE (elt_mode);
2084   elt_size &= -elt_size; /* Extract LSB set.  */
2085 
2086   init_frag.size = elt_size;
2087   /* Avoid undefined shift behavior by using '2'.  */
2088   init_frag.mask = ((unsigned HOST_WIDE_INT)2
2089 		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
2090   init_frag.val = 0;
2091   init_frag.offset = 0;
2092   init_frag.started = false;
2093   /* Size might not be a multiple of elt size, if there's an
2094      initialized trailing struct array with smaller type than
2095      elt_size. */
2096   init_frag.remaining = (size + elt_size - 1) / elt_size;
2097 
2098   fprintf (file, "%s .align %d .u%d ",
2099 	   section, align / BITS_PER_UNIT,
2100 	   elt_size * BITS_PER_UNIT);
2101   assemble_name (file, name);
2102 
2103   if (size)
2104     /* We make everything an array, to simplify any initialization
2105        emission.  */
2106     fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2107   else if (atype)
2108     fprintf (file, "[]");
2109 }
2110 
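/* Output sketch (informal): for "int x[4]" in global memory this
   prints something like

     .global .align 4 .u32 x[4]

   leaving any "= { ... };" initializer and the terminator to the
   caller and to nvptx_assemble_decl_end.  */
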
2111 /* Called when the initializer for a decl has been completely output through
2112    combinations of the three functions above.  */
2113 
2114 static void
2115 nvptx_assemble_decl_end (void)
2116 {
2117   if (init_frag.offset)
2118     /* This can happen with a packed struct with trailing array member.  */
2119     nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2120   fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2121 }
2122 
2123 /* Output an uninitialized common or file-scope variable.  */
2124 
2125 void
2126 nvptx_output_aligned_decl (FILE *file, const char *name,
2127 			   const_tree decl, HOST_WIDE_INT size, unsigned align)
2128 {
2129   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2130 
2131   /* If this is public, it is common.  The nearest thing we have to
2132      common is weak.  */
2133   fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2134 
2135   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2136 			     TREE_TYPE (decl), size, align);
2137   nvptx_assemble_decl_end ();
2138 }
2139 
2140 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
2141    writing a constant variable EXP with NAME and SIZE and its
2142    initializer to FILE.  */
2143 
2144 static void
2145 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2146 				 const_tree exp, HOST_WIDE_INT obj_size)
2147 {
2148   write_var_marker (file, true, false, name);
2149 
2150   fprintf (file, "\t");
2151 
2152   tree type = TREE_TYPE (exp);
2153   nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2154 			     TYPE_ALIGN (type));
2155 }
2156 
2157 /* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
2158    a variable DECL with NAME to FILE.  */
2159 
2160 void
2161 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2162 {
2163   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2164 
2165   fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2166 			  : DECL_WEAK (decl) ? ".weak " : ".visible "));
2167 
2168   tree type = TREE_TYPE (decl);
2169   HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2170   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2171 			     type, obj_size, DECL_ALIGN (decl));
2172 }
2173 
2174 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */
2175 
2176 static void
2177 nvptx_globalize_label (FILE *, const char *)
2178 {
2179 }
2180 
2181 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
2182    declaration only for variable DECL with NAME to FILE.  */
2183 
2184 static void
2185 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2186 {
2187   /* The middle end can place constant pool decls into the varpool as
2188      undefined.  Until that is fixed, catch the problem here.  */
2189   if (DECL_IN_CONSTANT_POOL (decl))
2190     return;
2191 
2192   /* We support weak definitions, and hence have the right
2193      ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
2194   if (DECL_WEAK (decl))
2195     error_at (DECL_SOURCE_LOCATION (decl),
2196 	      "PTX does not support weak declarations"
2197 	      " (only weak definitions)");
2198   write_var_marker (file, false, TREE_PUBLIC (decl), name);
2199 
2200   fprintf (file, "\t.extern ");
2201   tree size = DECL_SIZE_UNIT (decl);
2202   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2203 			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2204 			     DECL_ALIGN (decl), true);
2205   nvptx_assemble_decl_end ();
2206 }
2207 
2208 /* Output a pattern for a move instruction.  */
2209 
2210 const char *
2211 nvptx_output_mov_insn (rtx dst, rtx src)
2212 {
2213   machine_mode dst_mode = GET_MODE (dst);
2214   machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2215 			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2216   machine_mode src_inner = (GET_CODE (src) == SUBREG
2217 			    ? GET_MODE (XEXP (src, 0)) : dst_mode);
2218 
2219   rtx sym = src;
2220   if (GET_CODE (sym) == CONST)
2221     sym = XEXP (XEXP (sym, 0), 0);
2222   if (SYMBOL_REF_P (sym))
2223     {
2224       if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2225 	return "%.\tcvta%D1%t0\t%0, %1;";
2226       nvptx_maybe_record_fnsym (sym);
2227     }
2228 
2229   if (src_inner == dst_inner)
2230     return "%.\tmov%t0\t%0, %1;";
2231 
2232   if (CONSTANT_P (src))
2233     return (GET_MODE_CLASS (dst_inner) == MODE_INT
2234 	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2235 	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2236 
2237   if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2238     {
2239       if (GET_MODE_BITSIZE (dst_mode) == 128
2240 	  && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2241 	{
2242 	  /* mov.b128 is not supported.  */
2243 	  if (dst_inner == V2DImode && src_inner == TImode)
2244 	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2245 	  else if (dst_inner == TImode && src_inner == V2DImode)
2246 	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2247 
2248 	  gcc_unreachable ();
2249 	}
2250       return "%.\tmov.b%T0\t%0, %1;";
2251     }
2252 
2253   return "%.\tcvt%t0%t1\t%0, %1;";
2254 }
2255 
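/* Some sample expansions of the templates above (informal): an
   SImode register-to-register copy prints as "mov.u32 %0, %1;", an
   equal-sized SImode/SFmode copy as "mov.b32 %0, %1;", and a
   narrowing DImode-to-SImode move as "cvt.u32.u64 %0, %1;".  */
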
2256 static void nvptx_print_operand (FILE *, rtx, int);
2257 
2258 /* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
2259    involves writing .param declarations and in/out copies into them.  For
2260    indirect calls, also write the .callprototype.  */
2261 
2262 const char *
2263 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2264 {
2265   char buf[16];
2266   static int labelno;
2267   bool needs_tgt = register_operand (callee, Pmode);
2268   rtx pat = PATTERN (insn);
2269   if (GET_CODE (pat) == COND_EXEC)
2270     pat = COND_EXEC_CODE (pat);
2271   int arg_end = XVECLEN (pat, 0);
2272   tree decl = NULL_TREE;
2273 
2274   fprintf (asm_out_file, "\t{\n");
2275   if (result != NULL)
2276     fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2277 	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
2278 	     reg_names[NVPTX_RETURN_REGNUM]);
2279 
2280   /* Ensure we have a ptx declaration in the output if necessary.  */
2281   if (GET_CODE (callee) == SYMBOL_REF)
2282     {
2283       decl = SYMBOL_REF_DECL (callee);
2284       if (!decl
2285 	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2286 	nvptx_record_libfunc (callee, result, pat);
2287       else if (DECL_EXTERNAL (decl))
2288 	nvptx_record_fndecl (decl);
2289     }
2290 
2291   if (needs_tgt)
2292     {
2293       ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2294       labelno++;
2295       ASM_OUTPUT_LABEL (asm_out_file, buf);
2296       std::stringstream s;
2297       write_fn_proto_from_insn (s, NULL, result, pat);
2298       fputs (s.str().c_str(), asm_out_file);
2299     }
2300 
2301   for (int argno = 1; argno < arg_end; argno++)
2302     {
2303       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2304       machine_mode mode = GET_MODE (t);
2305       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2306 
2307       /* Mode splitting has already been done.  */
2308       fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2309 	       "\t\tst.param%s [%%out_arg%d], ",
2310 	       ptx_type, argno, ptx_type, argno);
2311       output_reg (asm_out_file, REGNO (t), VOIDmode);
2312       fprintf (asm_out_file, ";\n");
2313     }
2314 
2315   /* The '.' stands for the call's predicate, if any.  */
2316   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2317   fprintf (asm_out_file, "\t\tcall ");
2318   if (result != NULL_RTX)
2319     fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2320 
2321   if (decl)
2322     {
2323       const char *name = get_fnname_from_decl (decl);
2324       name = nvptx_name_replacement (name);
2325       assemble_name (asm_out_file, name);
2326     }
2327   else
2328     output_address (VOIDmode, callee);
2329 
2330   const char *open = "(";
2331   for (int argno = 1; argno < arg_end; argno++)
2332     {
2333       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2334       open = "";
2335     }
2336   if (decl && DECL_STATIC_CHAIN (decl))
2337     {
2338       fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2339       open = "";
2340     }
2341   if (!open[0])
2342     fprintf (asm_out_file, ")");
2343 
2344   if (needs_tgt)
2345     {
2346       fprintf (asm_out_file, ", ");
2347       assemble_name (asm_out_file, buf);
2348     }
2349   fprintf (asm_out_file, ";\n");
2350 
2351   if (find_reg_note (insn, REG_NORETURN, NULL))
2352     {
2353       /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2354 	 the flow control barrier they imply.  It can seg fault if it
2355 	 encounters what looks like an unexitable loop.  Emit a trailing
2356 	 trap and exit, which it does grok.  */
2357       fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2358       fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2359     }
2360 
2361   if (result)
2362     {
2363       static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2364 
2365       if (!rval[0])
2366 	/* We must escape the '%' that starts RETURN_REGNUM.  */
2367 	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2368 		 reg_names[NVPTX_RETURN_REGNUM]);
2369       return rval;
2370     }
2371 
2372   return "}";
2373 }
2374 
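/* A sketch of the text emitted for "r = f (a)", assuming SImode
   throughout and that the return register prints as %value:

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), f, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}
*/
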
2375 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */
2376 
2377 static bool
2378 nvptx_print_operand_punct_valid_p (unsigned char c)
2379 {
2380   return c == '.' || c == '#';
2381 }
2382 
2383 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */
2384 
2385 static void
2386 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2387 {
2388   rtx off;
2389   if (GET_CODE (x) == CONST)
2390     x = XEXP (x, 0);
2391   switch (GET_CODE (x))
2392     {
2393     case PLUS:
2394       off = XEXP (x, 1);
2395       output_address (VOIDmode, XEXP (x, 0));
2396       fprintf (file, "+");
2397       output_address (VOIDmode, off);
2398       break;
2399 
2400     case SYMBOL_REF:
2401     case LABEL_REF:
2402       output_addr_const (file, x);
2403       break;
2404 
2405     default:
2406       gcc_assert (GET_CODE (x) != MEM);
2407       nvptx_print_operand (file, x, 0);
2408       break;
2409     }
2410 }
2411 
2412 /* Write assembly language output for the address ADDR to FILE.  */
2413 
2414 static void
2415 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2416 {
2417   nvptx_print_address_operand (file, addr, mode);
2418 }
2419 
2420 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2421 
2422    Meaning of CODE:
2423    . -- print the predicate for the instruction or an empty string for an
2424         unconditional one.
2425    # -- print a rounding mode for the instruction
2426 
2427    A -- print a data area for a MEM
2428    c -- print an opcode suffix for a comparison operator, including a type code
2429    D -- print a data area for a MEM operand
2430    S -- print a shuffle kind specified by CONST_INT
2431    t -- print a type opcode suffix, promoting QImode to 32 bits
2432    T -- print a type size in bits
2433    u -- print a type opcode suffix without promotions.  */
2434 
2435 static void
2436 nvptx_print_operand (FILE *file, rtx x, int code)
2437 {
2438   if (code == '.')
2439     {
2440       x = current_insn_predicate;
2441       if (x)
2442 	{
2443 	  fputs ("@", file);
2444 	  if (GET_CODE (x) == EQ)
2445 	    fputs ("!", file);
2446 	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2447 	}
2448       return;
2449     }
2450   else if (code == '#')
2451     {
2452       fputs (".rn", file);
2453       return;
2454     }
2455 
2456   enum rtx_code x_code = GET_CODE (x);
2457   machine_mode mode = GET_MODE (x);
2458 
2459   switch (code)
2460     {
2461     case 'A':
2462       x = XEXP (x, 0);
2463       /* FALLTHROUGH.  */
2464 
2465     case 'D':
2466       if (GET_CODE (x) == CONST)
2467 	x = XEXP (x, 0);
2468       if (GET_CODE (x) == PLUS)
2469 	x = XEXP (x, 0);
2470 
2471       if (GET_CODE (x) == SYMBOL_REF)
2472 	fputs (section_for_sym (x), file);
2473       break;
2474 
2475     case 't':
2476     case 'u':
2477       if (x_code == SUBREG)
2478 	{
2479 	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2480 	  if (VECTOR_MODE_P (inner_mode)
2481 	      && (GET_MODE_SIZE (mode)
2482 		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2483 	    mode = GET_MODE_INNER (inner_mode);
2484 	  else if (split_mode_p (inner_mode))
2485 	    mode = maybe_split_mode (inner_mode);
2486 	  else
2487 	    mode = inner_mode;
2488 	}
2489       fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2490       break;
2491 
2492     case 'H':
2493     case 'L':
2494       {
2495 	rtx inner_x = SUBREG_REG (x);
2496 	machine_mode inner_mode = GET_MODE (inner_x);
2497 	machine_mode split = maybe_split_mode (inner_mode);
2498 
2499 	output_reg (file, REGNO (inner_x), split,
2500 		    (code == 'H'
2501 		     ? GET_MODE_SIZE (inner_mode) / 2
2502 		     : 0));
2503       }
2504       break;
2505 
2506     case 'S':
2507       {
2508 	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2509 	/* Same order as nvptx_shuffle_kind.  */
2510 	static const char *const kinds[] =
2511 	  {".up", ".down", ".bfly", ".idx"};
2512 	fputs (kinds[kind], file);
2513       }
2514       break;
2515 
2516     case 'T':
2517       fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2518       break;
2519 
2520     case 'j':
2521       fprintf (file, "@");
2522       goto common;
2523 
2524     case 'J':
2525       fprintf (file, "@!");
2526       goto common;
2527 
2528     case 'c':
2529       mode = GET_MODE (XEXP (x, 0));
2530       switch (x_code)
2531 	{
2532 	case EQ:
2533 	  fputs (".eq", file);
2534 	  break;
2535 	case NE:
2536 	  if (FLOAT_MODE_P (mode))
2537 	    fputs (".neu", file);
2538 	  else
2539 	    fputs (".ne", file);
2540 	  break;
2541 	case LE:
2542 	case LEU:
2543 	  fputs (".le", file);
2544 	  break;
2545 	case GE:
2546 	case GEU:
2547 	  fputs (".ge", file);
2548 	  break;
2549 	case LT:
2550 	case LTU:
2551 	  fputs (".lt", file);
2552 	  break;
2553 	case GT:
2554 	case GTU:
2555 	  fputs (".gt", file);
2556 	  break;
2557 	case LTGT:
2558 	  fputs (".ne", file);
2559 	  break;
2560 	case UNEQ:
2561 	  fputs (".equ", file);
2562 	  break;
2563 	case UNLE:
2564 	  fputs (".leu", file);
2565 	  break;
2566 	case UNGE:
2567 	  fputs (".geu", file);
2568 	  break;
2569 	case UNLT:
2570 	  fputs (".ltu", file);
2571 	  break;
2572 	case UNGT:
2573 	  fputs (".gtu", file);
2574 	  break;
2575 	case UNORDERED:
2576 	  fputs (".nan", file);
2577 	  break;
2578 	case ORDERED:
2579 	  fputs (".num", file);
2580 	  break;
2581 	default:
2582 	  gcc_unreachable ();
2583 	}
2584       if (FLOAT_MODE_P (mode)
2585 	  || x_code == EQ || x_code == NE
2586 	  || x_code == GEU || x_code == GTU
2587 	  || x_code == LEU || x_code == LTU)
2588 	fputs (nvptx_ptx_type_from_mode (mode, true), file);
2589       else
2590 	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2591       break;
2592     default:
2593     common:
2594       switch (x_code)
2595 	{
2596 	case SUBREG:
2597 	  {
2598 	    rtx inner_x = SUBREG_REG (x);
2599 	    machine_mode inner_mode = GET_MODE (inner_x);
2600 	    machine_mode split = maybe_split_mode (inner_mode);
2601 
2602 	    if (VECTOR_MODE_P (inner_mode)
2603 		&& (GET_MODE_SIZE (mode)
2604 		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2605 	      {
2606 		output_reg (file, REGNO (inner_x), VOIDmode);
2607 		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2608 	      }
2609 	    else if (split_mode_p (inner_mode)
2610 		&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2611 	      output_reg (file, REGNO (inner_x), split);
2612 	    else
2613 	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2614 	  }
2615 	  break;
2616 
2617 	case REG:
2618 	  output_reg (file, REGNO (x), maybe_split_mode (mode));
2619 	  break;
2620 
2621 	case MEM:
2622 	  fputc ('[', file);
2623 	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
2624 	  fputc (']', file);
2625 	  break;
2626 
2627 	case CONST_INT:
2628 	  output_addr_const (file, x);
2629 	  break;
2630 
2631 	case CONST:
2632 	case SYMBOL_REF:
2633 	case LABEL_REF:
2634 	  /* We could use output_addr_const, but that can print things like
2635 	     "x-8", which breaks ptxas.  Need to ensure it is output as
2636 	     "x+-8".  */
2637 	  nvptx_print_address_operand (file, x, VOIDmode);
2638 	  break;
2639 
2640 	case CONST_DOUBLE:
2641 	  long vals[2];
2642 	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2643 	  vals[0] &= 0xffffffff;
2644 	  vals[1] &= 0xffffffff;
2645 	  if (mode == SFmode)
2646 	    fprintf (file, "0f%08lx", vals[0]);
2647 	  else
2648 	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2649 	  break;
2650 
2651 	case CONST_VECTOR:
2652 	  {
2653 	    unsigned n = CONST_VECTOR_NUNITS (x);
2654 	    fprintf (file, "{ ");
2655 	    for (unsigned i = 0; i < n; ++i)
2656 	      {
2657 		if (i != 0)
2658 		  fprintf (file, ", ");
2659 
2660 		rtx elem = CONST_VECTOR_ELT (x, i);
2661 		output_addr_const (file, elem);
2662 	      }
2663 	    fprintf (file, " }");
2664 	  }
2665 	  break;
2666 
2667 	default:
2668 	  output_addr_const (file, x);
2669 	}
2670     }
2671 }
2672 
2673 /* Record replacement regs used to deal with subreg operands.  */
2674 struct reg_replace
2675 {
2676   rtx replacement[MAX_RECOG_OPERANDS];
2677   machine_mode mode;
2678   int n_allocated;
2679   int n_in_use;
2680 };
2681 
2682 /* Allocate or reuse a replacement in R and return the rtx.  */
2683 
2684 static rtx
2685 get_replacement (struct reg_replace *r)
2686 {
2687   if (r->n_allocated == r->n_in_use)
2688     r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2689   return r->replacement[r->n_in_use++];
2690 }
2691 
2692 /* Clean up subreg operands.  In ptx assembly, everything is typed, and
2693    the presence of subregs would break the rules for most instructions.
2694    Replace them with a suitable new register of the right size, plus
2695    conversion copyin/copyout instructions.  */
2696 
2697 static void
2698 nvptx_reorg_subreg (void)
2699 {
2700   struct reg_replace qiregs, hiregs, siregs, diregs;
2701   rtx_insn *insn, *next;
2702 
2703   qiregs.n_allocated = 0;
2704   hiregs.n_allocated = 0;
2705   siregs.n_allocated = 0;
2706   diregs.n_allocated = 0;
2707   qiregs.mode = QImode;
2708   hiregs.mode = HImode;
2709   siregs.mode = SImode;
2710   diregs.mode = DImode;
2711 
2712   for (insn = get_insns (); insn; insn = next)
2713     {
2714       next = NEXT_INSN (insn);
2715       if (!NONDEBUG_INSN_P (insn)
2716 	  || asm_noperands (PATTERN (insn)) >= 0
2717 	  || GET_CODE (PATTERN (insn)) == USE
2718 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
2719 	continue;
2720 
2721       qiregs.n_in_use = 0;
2722       hiregs.n_in_use = 0;
2723       siregs.n_in_use = 0;
2724       diregs.n_in_use = 0;
2725       extract_insn (insn);
2726       enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2727 
2728       for (int i = 0; i < recog_data.n_operands; i++)
2729 	{
2730 	  rtx op = recog_data.operand[i];
2731 	  if (GET_CODE (op) != SUBREG)
2732 	    continue;
2733 
2734 	  rtx inner = SUBREG_REG (op);
2735 
2736 	  machine_mode outer_mode = GET_MODE (op);
2737 	  machine_mode inner_mode = GET_MODE (inner);
2738 	  gcc_assert (s_ok);
2739 	  if (s_ok
2740 	      && (GET_MODE_PRECISION (inner_mode)
2741 		  >= GET_MODE_PRECISION (outer_mode)))
2742 	    continue;
2743 	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2744 	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
2745 				   : outer_mode == HImode ? &hiregs
2746 				   : outer_mode == SImode ? &siregs
2747 				   : &diregs);
2748 	  rtx new_reg = get_replacement (r);
2749 
2750 	  if (recog_data.operand_type[i] != OP_OUT)
2751 	    {
2752 	      enum rtx_code code;
2753 	      if (GET_MODE_PRECISION (inner_mode)
2754 		  < GET_MODE_PRECISION (outer_mode))
2755 		code = ZERO_EXTEND;
2756 	      else
2757 		code = TRUNCATE;
2758 
2759 	      rtx pat = gen_rtx_SET (new_reg,
2760 				     gen_rtx_fmt_e (code, outer_mode, inner));
2761 	      emit_insn_before (pat, insn);
2762 	    }
2763 
2764 	  if (recog_data.operand_type[i] != OP_IN)
2765 	    {
2766 	      enum rtx_code code;
2767 	      if (GET_MODE_PRECISION (inner_mode)
2768 		  < GET_MODE_PRECISION (outer_mode))
2769 		code = TRUNCATE;
2770 	      else
2771 		code = ZERO_EXTEND;
2772 
2773 	      rtx pat = gen_rtx_SET (inner,
2774 				     gen_rtx_fmt_e (code, inner_mode, new_reg));
2775 	      emit_insn_after (pat, insn);
2776 	    }
2777 	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2778 	}
2779     }
2780 }
2781 
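/* Example of the rewrite above (informal): an input operand
   (subreg:SI (reg:QI q)) gains a fresh SImode pseudo r, a copy-in

     (set (reg:SI r) (zero_extend:SI (reg:QI q)))

   emitted before the insn, and the operand itself is replaced by r;
   an output operand instead gets a truncating copy-out emitted after
   the insn.  */
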
2782 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2783    first use.  */
2784 
2785 static rtx
2786 nvptx_get_unisimt_master ()
2787 {
2788   rtx &master = cfun->machine->unisimt_master;
2789   return master ? master : master = gen_reg_rtx (SImode);
2790 }
2791 
2792 /* Return a BImode "predicate" register for uniform-simt, similar to above.  */
2793 
2794 static rtx
2795 nvptx_get_unisimt_predicate ()
2796 {
2797   rtx &pred = cfun->machine->unisimt_predicate;
2798   return pred ? pred : pred = gen_reg_rtx (BImode);
2799 }
2800 
2801 /* Return true if given call insn references one of the functions provided by
2802    the CUDA runtime: malloc, free, vprintf.  */
2803 
2804 static bool
2805 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2806 {
2807   rtx pat = PATTERN (insn);
2808   gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2809   pat = XVECEXP (pat, 0, 0);
2810   if (GET_CODE (pat) == SET)
2811     pat = SET_SRC (pat);
2812   gcc_checking_assert (GET_CODE (pat) == CALL
2813 		       && GET_CODE (XEXP (pat, 0)) == MEM);
2814   rtx addr = XEXP (XEXP (pat, 0), 0);
2815   if (GET_CODE (addr) != SYMBOL_REF)
2816     return false;
2817   const char *name = XSTR (addr, 0);
2818   /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2819      references with forced assembler name refer to PTX syscalls.  For vprintf,
2820      accept both normal and forced-assembler-name references.  */
2821   return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2822 	  || !strcmp (name, "*malloc")
2823 	  || !strcmp (name, "*free"));
2824 }
2825 
2826 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2827    propagate its value from lane MASTER to current lane.  */
2828 
2829 static void
2830 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2831 {
2832   rtx reg;
2833   if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2834     emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2835 }
2836 
2837 /* Adjust code for uniform-simt code generation variant by making atomics and
2838    "syscalls" conditionally executed, and inserting shuffle-based propagation
2839    for registers being set.  */
2840 
2841 static void
2842 nvptx_reorg_uniform_simt ()
2843 {
2844   rtx_insn *insn, *next;
2845 
2846   for (insn = get_insns (); insn; insn = next)
2847     {
2848       next = NEXT_INSN (insn);
2849       if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2850 	  && !(NONJUMP_INSN_P (insn)
2851 	       && GET_CODE (PATTERN (insn)) == PARALLEL
2852 	       && get_attr_atomic (insn)))
2853 	continue;
2854       rtx pat = PATTERN (insn);
2855       rtx master = nvptx_get_unisimt_master ();
2856       for (int i = 0; i < XVECLEN (pat, 0); i++)
2857 	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2858       rtx pred = nvptx_get_unisimt_predicate ();
2859       pred = gen_rtx_NE (BImode, pred, const0_rtx);
2860       pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2861       validate_change (insn, &PATTERN (insn), pat, false);
2862     }
2863 }
2864 
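/* Sketch of the effect (not from the original sources): an atomic

     (parallel [(set r (unspec_volatile ...)) ...])

   is rewritten as

     (cond_exec (ne master-pred 0) (parallel [...]))

   followed by an idx-shuffle of R from the master lane, so a single
   lane per warp performs the side effect and every lane observes the
   same result.  */
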
2865 /* Loop structure of the function.  The entire function is described as
2866    a NULL loop.  */
2867 
2868 struct parallel
2869 {
2870   /* Parent parallel.  */
2871   parallel *parent;
2872 
2873   /* Next sibling parallel.  */
2874   parallel *next;
2875 
2876   /* First child parallel.  */
2877   parallel *inner;
2878 
2879   /* Partitioning mask of the parallel.  */
2880   unsigned mask;
2881 
2882   /* Partitioning used within inner parallels. */
2883   unsigned inner_mask;
2884 
2885   /* Location of parallel forked and join.  The forked is the first
2886      block in the parallel and the join is the first block after
2887      the partition.  */
2888   basic_block forked_block;
2889   basic_block join_block;
2890 
2891   rtx_insn *forked_insn;
2892   rtx_insn *join_insn;
2893 
2894   rtx_insn *fork_insn;
2895   rtx_insn *joining_insn;
2896 
2897   /* Basic blocks in this parallel, but not in child parallels.  The
2898      FORKED and JOINING blocks are in the partition.  The FORK and JOIN
2899      blocks are not.  */
2900   auto_vec<basic_block> blocks;
2901 
2902 public:
2903   parallel (parallel *parent, unsigned mode);
2904   ~parallel ();
2905 };
2906 
2907 /* Constructor links the new parallel into its parent's chain of
2908    children.  */
2909 
2910 parallel::parallel (parallel *parent_, unsigned mask_)
2911   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
2912 {
2913   forked_block = join_block = 0;
2914   forked_insn = join_insn = 0;
2915   fork_insn = joining_insn = 0;
2916 
2917   if (parent)
2918     {
2919       next = parent->inner;
2920       parent->inner = this;
2921     }
2922 }
2923 
2924 parallel::~parallel ()
2925 {
2926   delete inner;
2927   delete next;
2928 }
2929 
2930 /* Map of basic blocks to insns.  */
2931 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
2932 
2933 /* A tuple of an insn of interest and the BB in which it resides.  */
2934 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
2935 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
2936 
2937 /* Split basic blocks such that each forked and join unspec is at
2938    the start of its basic block.  Thus afterwards each block will
2939    have a single partitioning mode.  We also do the same for return
2940    insns, as they are executed by every thread.  Populate MAP with
2941    head and tail blocks.  We also clear the BB visited flag, which
2942    is used when finding partitions.  */
2943 
2944 
2945 static void
2946 nvptx_split_blocks (bb_insn_map_t *map)
2947 {
2948   insn_bb_vec_t worklist;
2949   basic_block block;
2950   rtx_insn *insn;
2951 
2952   /* Locate all the reorg instructions of interest.  */
2953   FOR_ALL_BB_FN (block, cfun)
2954     {
2955       bool seen_insn = false;
2956 
2957       /* Clear visited flag, for use by parallel locator.  */
2958       block->flags &= ~BB_VISITED;
2959 
2960       FOR_BB_INSNS (block, insn)
2961 	{
2962 	  if (!INSN_P (insn))
2963 	    continue;
2964 	  switch (recog_memoized (insn))
2965 	    {
2966 	    default:
2967 	      seen_insn = true;
2968 	      continue;
2969 	    case CODE_FOR_nvptx_forked:
2970 	    case CODE_FOR_nvptx_join:
2971 	      break;
2972 
2973 	    case CODE_FOR_return:
2974 	      /* We also need to split just before return insns, as
2975 		 that insn needs executing by all threads, but the
2976 		 block it is in probably does not.  */
2977 	      break;
2978 	    }
2979 
2980 	  if (seen_insn)
2981 	    /* We've found an instruction that must be at the start of
2982 	       a block, but isn't.  Add it to the worklist.  */
2983 	    worklist.safe_push (insn_bb_t (insn, block));
2984 	  else
2985 	    /* It was already the first instruction.  Just add it to
2986 	       the map.  */
2987 	    map->get_or_insert (block) = insn;
2988 	  seen_insn = true;
2989 	}
2990     }
2991 
2992   /* Split blocks on the worklist.  */
2993   unsigned ix;
2994   insn_bb_t *elt;
2995   basic_block remap = 0;
2996   for (ix = 0; worklist.iterate (ix, &elt); ix++)
2997     {
2998       if (remap != elt->second)
2999 	{
3000 	  block = elt->second;
3001 	  remap = block;
3002 	}
3003 
3004       /* Split block before insn.  The insn is in the new block.  */
3005       edge e = split_block (block, PREV_INSN (elt->first));
3006 
3007       block = e->dest;
3008       map->get_or_insert (block) = elt->first;
3009     }
3010 }
3011 
3012 /* BLOCK is a basic block containing a head or tail instruction.
3013    Locate the associated prehead or pretail instruction, which must be
3014    in the single predecessor block.  */
3015 
3016 static rtx_insn *
3017 nvptx_discover_pre (basic_block block, int expected)
3018 {
3019   gcc_assert (block->preds->length () == 1);
3020   basic_block pre_block = (*block->preds)[0]->src;
3021   rtx_insn *pre_insn;
3022 
3023   for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3024        pre_insn = PREV_INSN (pre_insn))
3025     gcc_assert (pre_insn != BB_HEAD (pre_block));
3026 
3027   gcc_assert (recog_memoized (pre_insn) == expected);
3028   return pre_insn;
3029 }
3030 
3031 /* Dump this parallel and all its inner parallels.  */
3032 
3033 static void
3034 nvptx_dump_pars (parallel *par, unsigned depth)
3035 {
3036   fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3037 	   depth, par->mask,
3038 	   par->forked_block ? par->forked_block->index : -1,
3039 	   par->join_block ? par->join_block->index : -1);
3040 
3041   fprintf (dump_file, "    blocks:");
3042 
3043   basic_block block;
3044   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3045     fprintf (dump_file, " %d", block->index);
3046   fprintf (dump_file, "\n");
3047   if (par->inner)
3048     nvptx_dump_pars (par->inner, depth + 1);
3049 
3050   if (par->next)
3051     nvptx_dump_pars (par->next, depth);
3052 }
3053 
3054 /* If BLOCK contains a fork/join marker, process it to create or
3055    terminate a loop structure.  Add this block to the current loop,
3056    and then walk successor blocks.   */
3057 
3058 static parallel *
3059 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3060 {
3061   if (block->flags & BB_VISITED)
3062     return par;
3063   block->flags |= BB_VISITED;
3064 
3065   if (rtx_insn **endp = map->get (block))
3066     {
3067       rtx_insn *end = *endp;
3068 
3069       /* This is a block head or tail, or return instruction.  */
3070       switch (recog_memoized (end))
3071 	{
3072 	case CODE_FOR_return:
3073 	  /* Return instructions are in their own block, and we
3074 	     don't need to do anything more.  */
3075 	  return par;
3076 
3077 	case CODE_FOR_nvptx_forked:
3078 	  /* Loop head, create a new inner loop and add it into
3079 	     our parent's child list.  */
3080 	  {
3081 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3082 
3083 	    gcc_assert (mask);
3084 	    par = new parallel (par, mask);
3085 	    par->forked_block = block;
3086 	    par->forked_insn = end;
3087 	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3088 	      par->fork_insn
3089 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3090 	  }
3091 	  break;
3092 
3093 	case CODE_FOR_nvptx_join:
3094 	  /* A loop tail.  Finish the current loop and return to
3095 	     parent.  */
3096 	  {
3097 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3098 
3099 	    gcc_assert (par->mask == mask);
3100 	    par->join_block = block;
3101 	    par->join_insn = end;
3102 	    if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
3103 	      par->joining_insn
3104 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3105 	    par = par->parent;
3106 	  }
3107 	  break;
3108 
3109 	default:
3110 	  gcc_unreachable ();
3111 	}
3112     }
3113 
3114   if (par)
3115     /* Add this block onto the current loop's list of blocks.  */
3116     par->blocks.safe_push (block);
3117   else
3118     /* This must be the entry block.  Create a NULL parallel.  */
3119     par = new parallel (0, 0);
3120 
3121   /* Walk successor blocks.  */
3122   edge e;
3123   edge_iterator ei;
3124 
3125   FOR_EACH_EDGE (e, ei, block->succs)
3126     nvptx_find_par (map, par, e->dest);
3127 
3128   return par;
3129 }
3130 
3131 /* DFS walk the CFG looking for fork & join markers.  Construct
3132    loop structures as we go.  MAP is a mapping of basic blocks
3133    to head & tail markers, discovered when splitting blocks.  This
3134    speeds up the discovery.  We rely on the BB visited flag having
3135    been cleared when splitting blocks.  */
3136 
3137 static parallel *
3138 nvptx_discover_pars (bb_insn_map_t *map)
3139 {
3140   basic_block block;
3141 
3142   /* Mark exit blocks as visited.  */
3143   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3144   block->flags |= BB_VISITED;
3145 
3146   /* And entry block as not.  */
3147   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3148   block->flags &= ~BB_VISITED;
3149 
3150   parallel *par = nvptx_find_par (map, 0, block);
3151 
3152   if (dump_file)
3153     {
3154       fprintf (dump_file, "\nLoops\n");
3155       nvptx_dump_pars (par, 0);
3156       fprintf (dump_file, "\n");
3157     }
3158 
3159   return par;
3160 }
3161 
3162 /* Analyse a group of BBs within a partitioned region and create N
3163    Single-Entry-Single-Exit regions.  Some of those regions will be
3164    trivial ones consisting of a single BB.  The blocks of a
3165    partitioned region might form a set of disjoint graphs -- because
3166    the region encloses a differently partitioned sub-region.
3167 
3168    We use the linear time algorithm described in 'Finding Regions Fast:
3169    Single Entry Single Exit and Control Regions in Linear Time'
3170    Johnson, Pearson & Pingali.  That algorithm deals with complete
3171    CFGs, where a back edge is inserted from END to START, and thus the
3172    problem becomes one of finding equivalent loops.
3173 
3174    In this case we have a partial CFG.  We complete it by redirecting
3175    any incoming edge to the graph to be from an arbitrary external BB,
3176    and similarly redirecting any outgoing edge to be to that BB.
3177    Thus we end up with a closed graph.
3178 
3179    The algorithm works by building a spanning tree of an undirected
3180    graph and keeping track of back edges from nodes further from the
3181    root in the tree to nodes nearer to the root in the tree.  In the
3182    description below, the root is up and the tree grows downwards.
3183 
3184    We avoid having to deal with degenerate back-edges to the same
3185    block, by splitting each BB into 3 -- one for input edges, one for
3186    the node itself and one for the output edges.  Such back edges are
3187    referred to as 'Brackets'.  Cycle equivalent nodes will have the
3188    same set of brackets.
3189 
3190    Determining bracket equivalency is done by maintaining a list of
3191    brackets in such a manner that the list length and final bracket
3192    uniquely identify the set.
3193 
3194    We use coloring to mark all BBs with cycle equivalency with the
3195    same color.  This is the output of the 'Finding Regions Fast'
3196    algorithm.  Notice it doesn't actually find the set of nodes within
3197    a particular region, just unordered sets of nodes that are the
3198    entries and exits of SESE regions.
3199 
3200    After determining cycle equivalency, we need to find the minimal
3201    set of SESE regions.  Do this with a DFS coloring walk of the
3202    complete graph.  We're either 'looking' or 'coloring'.  When
3203    looking, and we're in the subgraph, we start coloring the color of
3204    the current node, and remember that node as the start of the
3205    current color's SESE region.  Every time we go to a new node, we
3206    decrement the count of nodes with that color.  If it reaches zero,
3207    we remember that node as the end of the current color's SESE region
3208    and return to 'looking'.  Otherwise we color the node the current
3209    color.
3210 
3211    This way we end up with coloring the inside of non-trivial SESE
3212    regions with the color of that region.  */
3213 
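/* Small example (informal): in the diamond CFG A->B, A->C, B->D,
   C->D, the pseudo-nodes below A and above D are cycle equivalent, so
   the walk reports one non-trivial SESE region spanning A..D, while B
   and C remain trivial single-block regions.  */
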
3214 /* A pair of BBs.  We use this to represent SESE regions.  */
3215 typedef std::pair<basic_block, basic_block> bb_pair_t;
3216 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3217 
3218 /* A node in the undirected CFG.  The discriminator SECOND indicates just
3219    above or just below the BB indicated by FIRST.  */
3220 typedef std::pair<basic_block, int> pseudo_node_t;
3221 
3222 /* A bracket indicates an edge towards the root of the spanning tree of the
3223    undirected graph.  Each bracket has a color, determined
3224    from the current set of brackets.  */
3225 struct bracket
3226 {
3227   pseudo_node_t back; /* Back target */
3228 
3229   /* Current color and size of set.  */
3230   unsigned color;
3231   unsigned size;
3232 
3233   bracket (pseudo_node_t back_)
3234   : back (back_), color (~0u), size (~0u)
3235   {
3236   }
3237 
3238   unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3239   {
3240     if (length != size)
3241       {
3242 	size = length;
3243 	color = color_counts.length ();
3244 	color_counts.quick_push (0);
3245       }
3246     color_counts[color]++;
3247     return color;
3248   }
3249 };
3250 
3251 typedef auto_vec<bracket> bracket_vec_t;
3252 
3253 /* Basic block info for finding SESE regions.    */
3254 
3255 struct bb_sese
3256 {
3257   int node;  /* Node number in spanning tree.  */
3258   int parent; /* Parent node number.  */
3259 
3260   /* The algorithm splits each node A into Ai, A', Ao. The incoming
3261      edges arrive at pseudo-node Ai and the outgoing edges leave at
3262      pseudo-node Ao.  We have to remember which way we arrived at a
3263      particular node when generating the spanning tree.  dir > 0 means
3264      we arrived at Ai, dir < 0 means we arrived at Ao.  */
3265   int dir;
3266 
3267   /* Lowest numbered pseudo-node reached via a backedge from this
3268      node, or any descendant.  */
3269   pseudo_node_t high;
3270 
3271   int color;  /* Cycle-equivalence color  */
3272 
3273   /* Stack of brackets for this node.  */
3274   bracket_vec_t brackets;
3275 
3276   bb_sese (unsigned node_, unsigned p, int dir_)
3277   :node (node_), parent (p), dir (dir_)
3278   {
3279   }
3280   ~bb_sese ();
3281 
3282   /* Push a bracket ending at BACK.  */
3283   void push (const pseudo_node_t &back)
3284   {
3285     if (dump_file)
3286       fprintf (dump_file, "Pushing backedge %d:%+d\n",
3287 	       back.first ? back.first->index : 0, back.second);
3288     brackets.safe_push (bracket (back));
3289   }
3290 
3291   void append (bb_sese *child);
3292   void remove (const pseudo_node_t &);
3293 
3294   /* Set node's color.  */
3295   void set_color (auto_vec<unsigned> &color_counts)
3296   {
3297     color = brackets.last ().get_color (color_counts, brackets.length ());
3298   }
3299 };
3300 
3301 bb_sese::~bb_sese ()
3302 {
3303 }
3304 
3305 /* Destructively append CHILD's brackets.  */
3306 
3307 void
3308 bb_sese::append (bb_sese *child)
3309 {
3310   if (int len = child->brackets.length ())
3311     {
3312       int ix;
3313 
3314       if (dump_file)
3315 	{
3316 	  for (ix = 0; ix < len; ix++)
3317 	    {
3318 	      const pseudo_node_t &pseudo = child->brackets[ix].back;
3319 	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3320 		       child->node, pseudo.first ? pseudo.first->index : 0,
3321 		       pseudo.second);
3322 	    }
3323 	}
3324       if (!brackets.length ())
3325 	std::swap (brackets, child->brackets);
3326       else
3327 	{
3328 	  brackets.reserve (len);
3329 	  for (ix = 0; ix < len; ix++)
3330 	    brackets.quick_push (child->brackets[ix]);
3331 	}
3332     }
3333 }
3334 
3335 /* Remove brackets that terminate at PSEUDO.  */
3336 
3337 void
3338 bb_sese::remove (const pseudo_node_t &pseudo)
3339 {
3340   unsigned removed = 0;
3341   int len = brackets.length ();
3342 
3343   for (int ix = 0; ix < len; ix++)
3344     {
3345       if (brackets[ix].back == pseudo)
3346 	{
3347 	  if (dump_file)
3348 	    fprintf (dump_file, "Removing backedge %d:%+d\n",
3349 		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
3350 	  removed++;
3351 	}
3352       else if (removed)
3353 	brackets[ix-removed] = brackets[ix];
3354     }
3355   while (removed--)
3356     brackets.pop ();
3357 }
3358 
3359 /* Accessors for BB's aux pointer.  */
3360 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3361 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3362 
3363 /* DFS walk creating SESE data structures.  Only cover nodes with
3364    BB_VISITED set.  Append discovered blocks to LIST.  We number in
3365    increments of 3 so that the above and below pseudo nodes can be
3366    implicitly numbered too.  */
3367 
3368 static int
3369 nvptx_sese_number (int n, int p, int dir, basic_block b,
3370 		   auto_vec<basic_block> *list)
3371 {
3372   if (BB_GET_SESE (b))
3373     return n;
3374 
3375   if (dump_file)
3376     fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3377 	     b->index, n, p, dir);
3378 
3379   BB_SET_SESE (b, new bb_sese (n, p, dir));
3380   p = n;
3381 
3382   n += 3;
3383   list->quick_push (b);
3384 
3385   /* First walk the nodes on the 'other side' of this node, then walk
3386      the nodes on the same side.  */
3387   for (unsigned ix = 2; ix; ix--)
3388     {
3389       vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3390       size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3391 		       : offsetof (edge_def, src));
3392       edge e;
3393       edge_iterator (ei);
3394 
3395       FOR_EACH_EDGE (e, ei, edges)
3396 	{
3397 	  basic_block target = *(basic_block *)((char *)e + offset);
3398 
3399 	  if (target->flags & BB_VISITED)
3400 	    n = nvptx_sese_number (n, p, dir, target, list);
3401 	}
3402       dir = -dir;
3403     }
3404   return n;
3405 }
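
/* Illustration: with the numbering above, the k-th block reached gets
   node number 2 + 3*k, leaving node-1 and node+1 free to denote its two
   pseudo-nodes.  nvptx_sese_pseudo below relies on this when it forms
   'back = t_sese->node + d' with d equal to +1 or -1 (which pseudo-node
   is which depends on the orientation of arrival).

     // blocks (DFS order):   2   5   8  11 ...
     // one pseudo-node:      1   4   7  10 ...
     // the other:            3   6   9  12 ...
*/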
3406 
3407 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3408    EDGES are the outgoing edges and OFFSET is the offset to the src
3409    or dst block on the edges.   */
3410 
3411 static void
3412 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3413 		   vec<edge, va_gc> *edges, size_t offset)
3414 {
3415   edge e;
3416   edge_iterator (ei);
3417   int hi_back = depth;
3418   pseudo_node_t node_back (0, depth);
3419   int hi_child = depth;
3420   pseudo_node_t node_child (0, depth);
3421   basic_block child = NULL;
3422   unsigned num_children = 0;
3423   int usd = -dir * sese->dir;
3424 
3425   if (dump_file)
3426     fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3427 	     me->index, sese->node, dir);
3428 
3429   if (dir < 0)
3430     {
3431       /* This is the above pseudo-child.  It has the BB itself as an
3432 	 additional child node.  */
3433       node_child = sese->high;
3434       hi_child = node_child.second;
3435       if (node_child.first)
3436 	hi_child += BB_GET_SESE (node_child.first)->node;
3437       num_children++;
3438     }
3439 
3440   /* Examine each edge.
3441      - if it is a child (a) append its bracket list and (b) record
3442           whether it is the child with the highest reaching bracket.
3443      - if it is an edge to an ancestor, record whether it's the highest
3444           reaching backlink.  */
3445   FOR_EACH_EDGE (e, ei, edges)
3446     {
3447       basic_block target = *(basic_block *)((char *)e + offset);
3448 
3449       if (bb_sese *t_sese = BB_GET_SESE (target))
3450 	{
3451 	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3452 	    {
3453 	      /* Child node.  Append its bracket list. */
3454 	      num_children++;
3455 	      sese->append (t_sese);
3456 
3457 	      /* Compare its hi value.  */
3458 	      int t_hi = t_sese->high.second;
3459 
3460 	      if (basic_block child_hi_block = t_sese->high.first)
3461 		t_hi += BB_GET_SESE (child_hi_block)->node;
3462 
3463 	      if (hi_child > t_hi)
3464 		{
3465 		  hi_child = t_hi;
3466 		  node_child = t_sese->high;
3467 		  child = target;
3468 		}
3469 	    }
3470 	  else if (t_sese->node < sese->node + dir
3471 		   && !(dir < 0 && sese->parent == t_sese->node))
3472 	    {
3473 	      /* Non-parental ancestor node -- a backlink.  */
3474 	      int d = usd * t_sese->dir;
3475 	      int back = t_sese->node + d;
3476 
3477 	      if (hi_back > back)
3478 		{
3479 		  hi_back = back;
3480 		  node_back = pseudo_node_t (target, d);
3481 		}
3482 	    }
3483 	}
3484       else
3485 	{ /* Fallen off graph, backlink to entry node.  */
3486 	  hi_back = 0;
3487 	  node_back = pseudo_node_t (0, 0);
3488 	}
3489     }
3490 
3491   /* Remove any brackets that terminate at this pseudo node.  */
3492   sese->remove (pseudo_node_t (me, dir));
3493 
3494   /* Now push any backlinks from this pseudo node.  */
3495   FOR_EACH_EDGE (e, ei, edges)
3496     {
3497       basic_block target = *(basic_block *)((char *)e + offset);
3498       if (bb_sese *t_sese = BB_GET_SESE (target))
3499 	{
3500 	  if (t_sese->node < sese->node + dir
3501 	      && !(dir < 0 && sese->parent == t_sese->node))
3502 	    /* Non-parental ancestor node - backedge from me.  */
3503 	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
3504 	}
3505       else
3506 	{
3507 	  /* Back edge to entry node.  */
3508 	  sese->push (pseudo_node_t (0, 0));
3509 	}
3510     }
3511 
3512   /* If this node leads directly or indirectly to a no-return region of
3513      the graph, then fake a backedge to entry node.  */
3514   if (!sese->brackets.length () || !edges || !edges->length ())
3515     {
3516       hi_back = 0;
3517       node_back = pseudo_node_t (0, 0);
3518       sese->push (node_back);
3519     }
3520 
3521   /* Record the highest reaching backedge from us or a descendant.  */
3522   sese->high = hi_back < hi_child ? node_back : node_child;
3523 
3524   if (num_children > 1)
3525     {
3526       /* There is more than one child -- this is a Y shaped piece of
3527 	 spanning tree.  We have to insert a fake backedge from this
3528 	 node to the highest ancestor reached by not-the-highest
3529 	 reaching child.  Note that there may be multiple children
3530 	 with backedges to the same highest node.  That's ok and we
3531 	 insert the edge to that highest node.  */
3532       hi_child = depth;
3533       if (dir < 0 && child)
3534 	{
3535 	  node_child = sese->high;
3536 	  hi_child = node_child.second;
3537 	  if (node_child.first)
3538 	    hi_child += BB_GET_SESE (node_child.first)->node;
3539 	}
3540 
3541       FOR_EACH_EDGE (e, ei, edges)
3542 	{
3543 	  basic_block target = *(basic_block *)((char *)e + offset);
3544 
3545 	  if (target == child)
3546 	    /* Ignore the highest child. */
3547 	    continue;
3548 
3549 	  bb_sese *t_sese = BB_GET_SESE (target);
3550 	  if (!t_sese)
3551 	    continue;
3552 	  if (t_sese->parent != sese->node)
3553 	    /* Not a child. */
3554 	    continue;
3555 
3556 	  /* Compare its hi value.  */
3557 	  int t_hi = t_sese->high.second;
3558 
3559 	  if (basic_block child_hi_block = t_sese->high.first)
3560 	    t_hi += BB_GET_SESE (child_hi_block)->node;
3561 
3562 	  if (hi_child > t_hi)
3563 	    {
3564 	      hi_child = t_hi;
3565 	      node_child = t_sese->high;
3566 	    }
3567 	}
3568 
3569       sese->push (node_child);
3570     }
3571 }
3572 
3573 
3574 /* DFS walk of BB graph.  Color node BLOCK according to COLORING then
3575    proceed to successors.  Set SESE entry and exit nodes of
3576    REGIONS.  */
3577 
3578 static void
3579 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3580 		  basic_block block, int coloring)
3581 {
3582   bb_sese *sese = BB_GET_SESE (block);
3583 
3584   if (block->flags & BB_VISITED)
3585     {
3586       /* If we've already encountered this block, either we must not
3587 	 be coloring, or it must have been colored the current color.  */
3588       gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3589       return;
3590     }
3591 
3592   block->flags |= BB_VISITED;
3593 
3594   if (sese)
3595     {
3596       if (coloring < 0)
3597 	{
3598 	  /* Start coloring a region.  */
3599 	  regions[sese->color].first = block;
3600 	  coloring = sese->color;
3601 	}
3602 
3603       if (!--color_counts[sese->color] && sese->color == coloring)
3604 	{
3605 	  /* Found final block of SESE region.  */
3606 	  regions[sese->color].second = block;
3607 	  coloring = -1;
3608 	}
3609       else
3610 	/* Color the node, so we can assert on revisiting the node
3611 	   that the graph is indeed SESE.  */
3612 	sese->color = coloring;
3613     }
3614   else
3615     /* Fallen off the subgraph, we cannot be coloring.  */
3616     gcc_assert (coloring < 0);
3617 
3618   /* Walk each successor block.  */
3619   if (block->succs && block->succs->length ())
3620     {
3621       edge e;
3622       edge_iterator ei;
3623 
3624       FOR_EACH_EDGE (e, ei, block->succs)
3625 	nvptx_sese_color (color_counts, regions, e->dest, coloring);
3626     }
3627   else
3628     gcc_assert (coloring < 0);
3629 }
3630 
3631 /* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
3632    end up with NULL entries in it.  */
3633 
3634 static void
3635 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3636 {
3637   basic_block block;
3638   int ix;
3639 
3640   /* First clear each BB of the whole function.  */
3641   FOR_ALL_BB_FN (block, cfun)
3642     {
3643       block->flags &= ~BB_VISITED;
3644       BB_SET_SESE (block, 0);
3645     }
3646 
3647   /* Mark blocks in the function that are in this graph.  */
3648   for (ix = 0; blocks.iterate (ix, &block); ix++)
3649     block->flags |= BB_VISITED;
3650 
3651   /* Counts of nodes assigned to each color.  There cannot be more
3652      colors than blocks (and hopefully there will be fewer).  */
3653   auto_vec<unsigned> color_counts;
3654   color_counts.reserve (blocks.length ());
3655 
3656   /* Worklist of nodes in the spanning tree.  Again, there cannot be
3657      more nodes in the tree than blocks (there will be fewer if the
3658      CFG of blocks is disjoint).  */
3659   auto_vec<basic_block> spanlist;
3660   spanlist.reserve (blocks.length ());
3661 
3662   /* Make sure every block has its cycle class determined.  */
3663   for (ix = 0; blocks.iterate (ix, &block); ix++)
3664     {
3665       if (BB_GET_SESE (block))
3666 	/* We already met this block in an earlier graph solve.  */
3667 	continue;
3668 
3669       if (dump_file)
3670 	fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3671 
3672       /* Number the nodes reachable from block in initial DFS order.  */
3673       int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3674 
3675       /* Now walk in reverse DFS order to find cycle equivalents.  */
3676       while (spanlist.length ())
3677 	{
3678 	  block = spanlist.pop ();
3679 	  bb_sese *sese = BB_GET_SESE (block);
3680 
3681 	  /* Do the pseudo node below.  */
3682 	  nvptx_sese_pseudo (block, sese, depth, +1,
3683 			     sese->dir > 0 ? block->succs : block->preds,
3684 			     (sese->dir > 0 ? offsetof (edge_def, dest)
3685 			      : offsetof (edge_def, src)));
3686 	  sese->set_color (color_counts);
3687 	  /* Do the pseudo node above.  */
3688 	  nvptx_sese_pseudo (block, sese, depth, -1,
3689 			     sese->dir < 0 ? block->succs : block->preds,
3690 			     (sese->dir < 0 ? offsetof (edge_def, dest)
3691 			      : offsetof (edge_def, src)));
3692 	}
3693       if (dump_file)
3694 	fprintf (dump_file, "\n");
3695     }
3696 
3697   if (dump_file)
3698     {
3699       unsigned count;
3700       const char *comma = "";
3701 
3702       fprintf (dump_file, "Found %d cycle equivalents\n",
3703 	       color_counts.length ());
3704       for (ix = 0; color_counts.iterate (ix, &count); ix++)
3705 	{
3706 	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3707 
3708 	  comma = "";
3709 	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3710 	    if (BB_GET_SESE (block)->color == ix)
3711 	      {
3712 		block->flags |= BB_VISITED;
3713 		fprintf (dump_file, "%s%d", comma, block->index);
3714 		comma = ",";
3715 	      }
3716 	  fprintf (dump_file, "}");
3717 	  comma = ", ";
3718 	}
3719       fprintf (dump_file, "\n");
3720     }
3721 
3722   /* Now we've colored every block in the subgraph.  We now need to
3723      determine the minimal set of SESE regions that cover that
3724      subgraph.  Do this with a DFS walk of the complete function.
3725      During the walk we're either 'looking' or 'coloring'.  When we
3726      reach the last node of a particular color, we stop coloring and
3727      return to looking.  */
3728 
3729   /* There cannot be more SESE regions than colors.  */
3730   regions.reserve (color_counts.length ());
3731   for (ix = color_counts.length (); ix--;)
3732     regions.quick_push (bb_pair_t (0, 0));
3733 
3734   for (ix = 0; blocks.iterate (ix, &block); ix++)
3735     block->flags &= ~BB_VISITED;
3736 
3737   nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3738 
3739   if (dump_file)
3740     {
3741       const char *comma = "";
3742       int len = regions.length ();
3743 
3744       fprintf (dump_file, "SESE regions:");
3745       for (ix = 0; ix != len; ix++)
3746 	{
3747 	  basic_block from = regions[ix].first;
3748 	  basic_block to = regions[ix].second;
3749 
3750 	  if (from)
3751 	    {
3752 	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3753 	      if (to != from)
3754 		fprintf (dump_file, "->%d", to->index);
3755 
3756 	      int color = BB_GET_SESE (from)->color;
3757 
3758 	      /* Print the blocks within the region (excluding ends).  */
3759 	      FOR_EACH_BB_FN (block, cfun)
3760 		{
3761 		  bb_sese *sese = BB_GET_SESE (block);
3762 
3763 		  if (sese && sese->color == color
3764 		      && block != from && block != to)
3765 		    fprintf (dump_file, ".%d", block->index);
3766 		}
3767 	      fprintf (dump_file, "}");
3768 	    }
3769 	  comma = ",";
3770 	}
3771       fprintf (dump_file, "\n\n");
3772     }
3773 
3774   for (ix = 0; blocks.iterate (ix, &block); ix++)
3775     delete BB_GET_SESE (block);
3776 }
3777 
3778 #undef BB_SET_SESE
3779 #undef BB_GET_SESE
3780 
3781 /* Propagate live state at the start of a partitioned region.  IS_CALL
3782    indicates whether the propagation is for a (partitioned) call
3783    instruction.  BLOCK provides the live register information, and
3784    might not contain INSN. Propagation is inserted just after INSN. RW
3785    indicates whether we are reading and/or writing state.  This
3786    separation is needed for worker-level propagation where we
3787    essentially do a spill & fill.  FN is the underlying worker
3788    function to generate the propagation instructions for a single
3789    register.  DATA is user data.
3790 
3791    Returns true if we didn't emit any instructions.
3792 
3793    We propagate the live register set for non-calls and the entire
3794    frame for calls and non-calls.  We could do better by (a)
3795    propagating just the live set that is used within the partitioned
3796    regions and (b) only propagating stack entries that are used.  The
3797    latter might be quite hard to determine.  */
3798 
3799 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *);
3800 
3801 static bool
3802 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3803 		 propagate_mask rw, propagator_fn fn, void *data)
3804 {
3805   bitmap live = DF_LIVE_IN (block);
3806   bitmap_iterator iterator;
3807   unsigned ix;
3808   bool empty = true;
3809 
3810   /* Copy the frame array.  */
3811   HOST_WIDE_INT fs = get_frame_size ();
3812   if (fs)
3813     {
3814       rtx tmp = gen_reg_rtx (DImode);
3815       rtx idx = NULL_RTX;
3816       rtx ptr = gen_reg_rtx (Pmode);
3817       rtx pred = NULL_RTX;
3818       rtx_code_label *label = NULL;
3819 
3820       empty = false;
3821       /* The frame size might not be DImode compatible, but the frame
3822 	 array's declaration will be.  So it's ok to round up here.  */
3823       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3824       /* Detect single iteration loop. */
3825       if (fs == 1)
3826 	fs = 0;
3827 
3828       start_sequence ();
3829       emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3830       if (fs)
3831 	{
3832 	  idx = gen_reg_rtx (SImode);
3833 	  pred = gen_reg_rtx (BImode);
3834 	  label = gen_label_rtx ();
3835 
3836 	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3837 	  /* Allow worker function to initialize anything needed.  */
3838 	  rtx init = fn (tmp, PM_loop_begin, fs, data);
3839 	  if (init)
3840 	    emit_insn (init);
3841 	  emit_label (label);
3842 	  LABEL_NUSES (label)++;
3843 	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
3844 	}
3845       if (rw & PM_read)
3846 	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
3847       emit_insn (fn (tmp, rw, fs, data));
3848       if (rw & PM_write)
3849 	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
3850       if (fs)
3851 	{
3852 	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
3853 	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
3854 	  emit_insn (gen_br_true_uni (pred, label));
3855 	  rtx fini = fn (tmp, PM_loop_end, fs, data);
3856 	  if (fini)
3857 	    emit_insn (fini);
3858 	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
3859 	}
3860       emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
3861       emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
3862       rtx cpy = get_insns ();
3863       end_sequence ();
3864       insn = emit_insn_after (cpy, insn);
3865     }
3866 
3867   if (!is_call)
3868     /* Copy live registers.  */
3869     EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
3870       {
3871 	rtx reg = regno_reg_rtx[ix];
3872 
3873 	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
3874 	  {
3875 	    rtx bcast = fn (reg, rw, 0, data);
3876 
3877 	    insn = emit_insn_after (bcast, insn);
3878 	    empty = false;
3879 	  }
3880       }
3881   return empty;
3882 }
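
/* For reference (hypothetical names, illustration only): the RTL emitted
   for the frame copy above corresponds roughly to this source-level
   loop, stepping one DImode word at a time:

     // uint64_t tmp, *ptr = frame_pointer;
     // int idx = fs;              // fs == 0: loop degenerates to one pass
     // do
     //   {
     //     idx -= 1;
     //     tmp = *ptr;            // only if (rw & PM_read)
     //     fn (&tmp);             // spill, fill or broadcast action
     //     *ptr = tmp;            // only if (rw & PM_write)
     //     ptr += 1;
     //   }
     // while (idx != 0);
*/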
3883 
3884 /* Worker for nvptx_vpropagate.  */
3885 
3886 static rtx
3887 vprop_gen (rtx reg, propagate_mask pm,
3888 	   unsigned ARG_UNUSED (count), void *ARG_UNUSED (data))
3889 {
3890   if (!(pm & PM_read_write))
3891     return 0;
3892 
3893   return nvptx_gen_vcast (reg);
3894 }
3895 
3896 /* Propagate state that is live at start of BLOCK across the vectors
3897    of a single warp.  Propagation is inserted just after INSN.
3898    IS_CALL and return as for nvptx_propagate.  */
3899 
3900 static bool
3901 nvptx_vpropagate (bool is_call, basic_block block, rtx_insn *insn)
3902 {
3903   return nvptx_propagate (is_call, block, insn, PM_read_write, vprop_gen, 0);
3904 }
3905 
3906 /* Worker for nvptx_wpropagate.  */
3907 
3908 static rtx
3909 wprop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_)
3910 {
3911   wcast_data_t *data = (wcast_data_t *)data_;
3912 
3913   if (pm & PM_loop_begin)
3914     {
3915       /* Starting a loop, initialize pointer.    */
3916       unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
3917 
3918       if (align > worker_bcast_align)
3919 	worker_bcast_align = align;
3920       data->offset = (data->offset + align - 1) & ~(align - 1);
3921 
3922       data->ptr = gen_reg_rtx (Pmode);
3923 
3924       return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
3925     }
3926   else if (pm & PM_loop_end)
3927     {
3928       rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
3929       data->ptr = NULL_RTX;
3930       return clobber;
3931     }
3932   else
3933     return nvptx_gen_wcast (reg, pm, rep, data);
3934 }
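
/* The offset rounding in wprop_gen is the standard power-of-two align-up
   idiom.  A stand-alone equivalent (illustration only):

     static unsigned
     align_up (unsigned offset, unsigned align)  // align must be 2^n
     {
       return (offset + align - 1) & ~(align - 1);
     }

     // align_up (13, 8) == 16, align_up (16, 8) == 16.
*/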
3935 
3936 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
3937    indicates if this is just before partitioned mode (do spill), or
3938    just after it starts (do fill). Sequence is inserted just after
3939    INSN.  IS_CALL and return as for nvptx_propagate.  */
3940 
3941 static bool
3942 nvptx_wpropagate (bool pre_p, bool is_call, basic_block block, rtx_insn *insn)
3943 {
3944   wcast_data_t data;
3945 
3946   data.base = gen_reg_rtx (Pmode);
3947   data.offset = 0;
3948   data.ptr = NULL_RTX;
3949 
3950   bool empty = nvptx_propagate (is_call, block, insn,
3951 				pre_p ? PM_read : PM_write, wprop_gen, &data);
3952   gcc_assert (empty == !data.offset);
3953   if (data.offset)
3954     {
3955       /* Stuff was emitted, initialize the base pointer now.  */
3956       rtx init = gen_rtx_SET (data.base, worker_bcast_sym);
3957       emit_insn_after (init, insn);
3958 
3959       if (worker_bcast_size < data.offset)
3960 	worker_bcast_size = data.offset;
3961     }
3962   return empty;
3963 }
3964 
3965 /* Emit a worker-level synchronization barrier.  We use different
3966    markers for before and after synchronizations.  */
3967 
3968 static rtx
3969 nvptx_wsync (bool after)
3970 {
3971   return gen_nvptx_barsync (GEN_INT (after));
3972 }
3973 
3974 #if WORKAROUND_PTXJIT_BUG
3975 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
3976    real insns.  */
3977 
3978 static rtx_insn *
3979 bb_first_real_insn (basic_block bb)
3980 {
3981   rtx_insn *insn;
3982 
3983   /* Find first insn of from block.  */
3984   FOR_BB_INSNS (bb, insn)
3985     if (INSN_P (insn))
3986       return insn;
3987 
3988   return 0;
3989 }
3990 #endif
3991 
3992 /* Single neutering according to MASK.  FROM is the incoming block and
3993    TO is the outgoing block.  These may be the same block. Insert at
3994    start of FROM:
3995 
3996      if (tid.<axis>) goto end.
3997 
3998    and insert before ending branch of TO (if there is such an insn):
3999 
4000      end:
4001      <possibly-broadcast-cond>
4002      <branch>
4003 
4004    We currently only use different FROM and TO when skipping an entire
4005    loop.  We could do more if we detected superblocks.  */
4006 
4007 static void
4008 nvptx_single (unsigned mask, basic_block from, basic_block to)
4009 {
4010   rtx_insn *head = BB_HEAD (from);
4011   rtx_insn *tail = BB_END (to);
4012   unsigned skip_mask = mask;
4013 
4014   while (true)
4015     {
4016       /* Find first insn of from block.  */
4017       while (head != BB_END (from)
4018 	     && (!INSN_P (head)
4019 		 || recog_memoized (head) == CODE_FOR_nvptx_barsync))
4020 	head = NEXT_INSN (head);
4021 
4022       if (from == to)
4023 	break;
4024 
4025       if (!(JUMP_P (head) && single_succ_p (from)))
4026 	break;
4027 
4028       basic_block jump_target = single_succ (from);
4029       if (!single_pred_p (jump_target))
4030 	break;
4031 
4032       from = jump_target;
4033       head = BB_HEAD (from);
4034     }
4035 
4036   /* Find last insn of to block.  */
4037   rtx_insn *limit = from == to ? head : BB_HEAD (to);
4038   while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4039     tail = PREV_INSN (tail);
4040 
4041   /* Detect if tail is a branch.  */
4042   rtx tail_branch = NULL_RTX;
4043   rtx cond_branch = NULL_RTX;
4044   if (tail && INSN_P (tail))
4045     {
4046       tail_branch = PATTERN (tail);
4047       if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4048 	tail_branch = NULL_RTX;
4049       else
4050 	{
4051 	  cond_branch = SET_SRC (tail_branch);
4052 	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4053 	    cond_branch = NULL_RTX;
4054 	}
4055     }
4056 
4057   if (tail == head)
4058     {
4059       /* If this is empty, do nothing.  */
4060       if (!head || !INSN_P (head))
4061 	return;
4062 
4063       /* If this is a dummy insn, do nothing.  */
4064       switch (recog_memoized (head))
4065 	{
4066 	default:
4067 	  break;
4068 	case CODE_FOR_nvptx_barsync:
4069 	case CODE_FOR_nvptx_fork:
4070 	case CODE_FOR_nvptx_forked:
4071 	case CODE_FOR_nvptx_joining:
4072 	case CODE_FOR_nvptx_join:
4073 	  return;
4074 	}
4075 
4076       if (cond_branch)
4077 	{
4078 	  /* If we're only doing vector single, there's no need to
4079 	     emit skip code because we'll not insert anything.  */
4080 	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4081 	    skip_mask = 0;
4082 	}
4083       else if (tail_branch)
4084 	/* Block with only unconditional branch.  Nothing to do.  */
4085 	return;
4086     }
4087 
4088   /* Insert the vector test inside the worker test.  */
4089   unsigned mode;
4090   rtx_insn *before = tail;
4091   rtx_insn *neuter_start = NULL;
4092   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4093     if (GOMP_DIM_MASK (mode) & skip_mask)
4094       {
4095 	rtx_code_label *label = gen_label_rtx ();
4096 	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4097 
4098 	if (!pred)
4099 	  {
4100 	    pred = gen_reg_rtx (BImode);
4101 	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4102 	  }
4103 
4104 	rtx br;
4105 	if (mode == GOMP_DIM_VECTOR)
4106 	  br = gen_br_true (pred, label);
4107 	else
4108 	  br = gen_br_true_uni (pred, label);
4109 	if (neuter_start)
4110 	  neuter_start = emit_insn_after (br, neuter_start);
4111 	else
4112 	  neuter_start = emit_insn_before (br, head);
4113 
4114 	LABEL_NUSES (label)++;
4115 	if (tail_branch)
4116 	  before = emit_label_before (label, before);
4117 	else
4118 	  {
4119 	    rtx_insn *label_insn = emit_label_after (label, tail);
4120 	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4121 		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4122 	      emit_insn_after (gen_exit (), label_insn);
4123 	  }
4124       }
4125 
4126   /* Now deal with propagating the branch condition.  */
4127   if (cond_branch)
4128     {
4129       rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4130 
4131       if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask)
4132 	{
4133 	  /* Vector mode only, do a shuffle.  */
4134 #if WORKAROUND_PTXJIT_BUG
4135 	  /* The branch condition %rcond is propagated like this:
4136 
4137 		{
4138 		    .reg .u32 %x;
4139 		    mov.u32 %x,%tid.x;
4140 		    setp.ne.u32 %rnotvzero,%x,0;
4141 		 }
4142 
4143 		 @%rnotvzero bra Lskip;
4144 		 setp.<op>.<type> %rcond,op1,op2;
4145 		 Lskip:
4146 		 selp.u32 %rcondu32,1,0,%rcond;
4147 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4148 		 setp.ne.u32 %rcond,%rcondu32,0;
4149 
4150 	     There seems to be a bug in the ptx JIT compiler (observed at driver
4151 	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
4152 	     unless %rcond is initialized to something before 'bra Lskip'.  The
4153 	     bug is not observed with ptxas from cuda 8.0.61.
4154 
4155 	     It is true that the code is non-trivial: at Lskip, %rcond is
4156 	     uninitialized in threads 1-31, and after the selp the same holds
4157 	     for %rcondu32.  But shfl propagates the defined value in thread 0
4158 	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
4159 	     0-31, and after the setp.ne %rcond is defined in threads 0-31.
4160 
4161 	     There is nothing in the PTX spec to suggest that this is wrong, or
4162 	     to explain why the extra initialization is needed.  So, we classify
4163 	     it as a JIT bug, and the extra initialization as workaround:
4164 
4165 		{
4166 		    .reg .u32 %x;
4167 		    mov.u32 %x,%tid.x;
4168 		    setp.ne.u32 %rnotvzero,%x,0;
4169 		}
4170 
4171 		+.reg .pred %rcond2;
4172 		+setp.eq.u32 %rcond2, 1, 0;
4173 
4174 		 @%rnotvzero bra Lskip;
4175 		 setp.<op>.<type> %rcond,op1,op2;
4176 		+mov.pred %rcond2, %rcond;
4177 		 Lskip:
4178 		+mov.pred %rcond, %rcond2;
4179 		 selp.u32 %rcondu32,1,0,%rcond;
4180 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4181 		 setp.ne.u32 %rcond,%rcondu32,0;
4182 	  */
4183 	  rtx_insn *label = PREV_INSN (tail);
4184 	  gcc_assert (label && LABEL_P (label));
4185 	  rtx tmp = gen_reg_rtx (BImode);
4186 	  emit_insn_before (gen_movbi (tmp, const0_rtx),
4187 			    bb_first_real_insn (from));
4188 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4189 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4190 #endif
4191 	  emit_insn_before (nvptx_gen_vcast (pvar), tail);
4192 	}
4193       else
4194 	{
4195 	  /* Includes worker mode, do spill & fill.  By construction
4196 	     we should never have worker mode only. */
4197 	  wcast_data_t data;
4198 
4199 	  data.base = worker_bcast_sym;
4200 	  data.ptr = 0;
4201 
4202 	  if (worker_bcast_size < GET_MODE_SIZE (SImode))
4203 	    worker_bcast_size = GET_MODE_SIZE (SImode);
4204 
4205 	  data.offset = 0;
4206 	  emit_insn_before (nvptx_gen_wcast (pvar, PM_read, 0, &data),
4207 			    before);
4208 	  /* Barrier so other workers can see the write.  */
4209 	  emit_insn_before (nvptx_wsync (false), tail);
4210 	  data.offset = 0;
4211 	  emit_insn_before (nvptx_gen_wcast (pvar, PM_write, 0, &data), tail);
4212 	  /* This barrier is needed to avoid worker zero clobbering
4213 	     the broadcast buffer before all the other workers have
4214 	     had a chance to read this instance of it.  */
4215 	  emit_insn_before (nvptx_wsync (true), tail);
4216 	}
4217 
4218       extract_insn (tail);
4219       rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4220 				 UNSPEC_BR_UNIFIED);
4221       validate_change (tail, recog_data.operand_loc[0], unsp, false);
4222     }
4223 }
4224 
4225 /* PAR is a parallel that is being skipped in its entirety according to
4226    MASK.  Treat this as skipping a superblock starting at forked
4227    and ending at joining.  */
4228 
4229 static void
4230 nvptx_skip_par (unsigned mask, parallel *par)
4231 {
4232   basic_block tail = par->join_block;
4233   gcc_assert (tail->preds->length () == 1);
4234 
4235   basic_block pre_tail = (*tail->preds)[0]->src;
4236   gcc_assert (pre_tail->succs->length () == 1);
4237 
4238   nvptx_single (mask, par->forked_block, pre_tail);
4239 }
4240 
4241 /* If PAR has a single inner parallel and PAR itself only contains
4242    empty entry and exit blocks, swallow the inner PAR.  */
4243 
4244 static void
4245 nvptx_optimize_inner (parallel *par)
4246 {
4247   parallel *inner = par->inner;
4248 
4249   /* We mustn't be the outer dummy par.  */
4250   if (!par->mask)
4251     return;
4252 
4253   /* We must have a single inner par.  */
4254   if (!inner || inner->next)
4255     return;
4256 
4257   /* We must only contain 2 blocks ourselves -- the head and tail of
4258      the inner par.  */
4259   if (par->blocks.length () != 2)
4260     return;
4261 
4262   /* We must be disjoint partitioning.  As we only have vector and
4263      worker partitioning, this is sufficient to guarantee the pars
4264      have adjacent partitioning.  */
4265   if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4266     /* This indicates malformed code generation.  */
4267     return;
4268 
4269   /* The outer forked insn should be immediately followed by the inner
4270      fork insn.  */
4271   rtx_insn *forked = par->forked_insn;
4272   rtx_insn *fork = BB_END (par->forked_block);
4273 
4274   if (NEXT_INSN (forked) != fork)
4275     return;
4276   gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4277 
4278   /* The outer joining insn must immediately follow the inner join
4279      insn.  */
4280   rtx_insn *joining = par->joining_insn;
4281   rtx_insn *join = inner->join_insn;
4282   if (NEXT_INSN (join) != joining)
4283     return;
4284 
4285   /* Preconditions met.  Swallow the inner par.  */
4286   if (dump_file)
4287     fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4288 	     inner->mask, inner->forked_block->index,
4289 	     inner->join_block->index,
4290 	     par->mask, par->forked_block->index, par->join_block->index);
4291 
4292   par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4293 
4294   par->blocks.reserve (inner->blocks.length ());
4295   while (inner->blocks.length ())
4296     par->blocks.quick_push (inner->blocks.pop ());
4297 
4298   par->inner = inner->inner;
4299   inner->inner = NULL;
4300 
4301   delete inner;
4302 }
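
/* Shape of the transform above (informal sketch; worker around vector is
   the typical pairing, given the disjointness check):

     fork worker                  fork worker|vector
       fork vector           =>     ...body...
         ...body...                join worker|vector
       join vector
     join worker
*/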
4303 
4304 /* Process the parallel PAR and all its contained
4305    parallels.  We do everything but the neutering.  Return mask of
4306    partitioned modes used within this parallel.  */
4307 
4308 static unsigned
4309 nvptx_process_pars (parallel *par)
4310 {
4311   if (nvptx_optimize)
4312     nvptx_optimize_inner (par);
4313 
4314   unsigned inner_mask = par->mask;
4315 
4316   /* Do the inner parallels first.  */
4317   if (par->inner)
4318     {
4319       par->inner_mask = nvptx_process_pars (par->inner);
4320       inner_mask |= par->inner_mask;
4321     }
4322 
4323   bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4324 
4325   if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4326     {
4327       nvptx_wpropagate (false, is_call, par->forked_block, par->forked_insn);
4328       bool empty = nvptx_wpropagate (true, is_call,
4329 				     par->forked_block, par->fork_insn);
4330 
4331       if (!empty || !is_call)
4332 	{
4333 	  /* Insert begin and end synchronizations.  */
4334 	  emit_insn_before (nvptx_wsync (false), par->forked_insn);
4335 	  emit_insn_before (nvptx_wsync (true), par->join_insn);
4336 	}
4337     }
4338   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4339     nvptx_vpropagate (is_call, par->forked_block, par->forked_insn);
4340 
4341   /* Now do siblings.  */
4342   if (par->next)
4343     inner_mask |= nvptx_process_pars (par->next);
4344   return inner_mask;
4345 }
4346 
4347 /* Neuter the parallel described by PAR.  We recurse in depth-first
4348    order.  MODES are the partitioning of the execution and OUTER is
4349    the partitioning of the parallels we are contained in.  */
4350 
4351 static void
4352 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4353 {
4354   unsigned me = (par->mask
4355 		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4356 		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4357   unsigned skip_mask = 0, neuter_mask = 0;
4358 
4359   if (par->inner)
4360     nvptx_neuter_pars (par->inner, modes, outer | me);
4361 
4362   for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4363     {
4364       if ((outer | me) & GOMP_DIM_MASK (mode))
4365 	{} /* Mode is partitioned: no neutering.  */
4366       else if (!(modes & GOMP_DIM_MASK (mode)))
4367 	{} /* Mode is not used: nothing to do.  */
4368       else if (par->inner_mask & GOMP_DIM_MASK (mode)
4369 	       || !par->forked_insn)
4370 	/* Partitioned in inner parallels, or we're not partitioned
4371 	   at all: neuter individual blocks.  */
4372 	neuter_mask |= GOMP_DIM_MASK (mode);
4373       else if (!par->parent || !par->parent->forked_insn
4374 	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4375 	/* Parent isn't a parallel, or already contains this
4376 	   partitioning: skip the parallel at this level.  */
4377 	skip_mask |= GOMP_DIM_MASK (mode);
4378       else
4379 	{} /* Parent will skip this parallel itself.  */
4380     }
4381 
4382   if (neuter_mask)
4383     {
4384       int ix, len;
4385 
4386       if (nvptx_optimize)
4387 	{
4388 	  /* Neuter whole SESE regions.  */
4389 	  bb_pair_vec_t regions;
4390 
4391 	  nvptx_find_sese (par->blocks, regions);
4392 	  len = regions.length ();
4393 	  for (ix = 0; ix != len; ix++)
4394 	    {
4395 	      basic_block from = regions[ix].first;
4396 	      basic_block to = regions[ix].second;
4397 
4398 	      if (from)
4399 		nvptx_single (neuter_mask, from, to);
4400 	      else
4401 		gcc_assert (!to);
4402 	    }
4403 	}
4404       else
4405 	{
4406 	  /* Neuter each BB individually.  */
4407 	  len = par->blocks.length ();
4408 	  for (ix = 0; ix != len; ix++)
4409 	    {
4410 	      basic_block block = par->blocks[ix];
4411 
4412 	      nvptx_single (neuter_mask, block, block);
4413 	    }
4414 	}
4415     }
4416 
4417   if (skip_mask)
4418     nvptx_skip_par (skip_mask, par);
4419 
4420   if (par->next)
4421     nvptx_neuter_pars (par->next, modes, outer);
4422 }
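
/* The per-mode decision above, summarized (informal):

     partitioned here or in an outer par  -> nothing to neuter
     mode unused by this function         -> nothing to do
     partitioned deeper, or no fork insn  -> neuter block by block (whole
                                             SESE regions with nvptx_optimize)
     parent can't perform the skip        -> skip this parallel here
     otherwise                            -> parent will skip us itself
*/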
4423 
4424 #if WORKAROUND_PTXJIT_BUG_2
4425 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
4426    is needed in the nvptx target because the branches generated for
4427    partitioning are NONJUMP_INSN_P, not JUMP_P.  */
4428 
4429 static rtx
4430 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4431 {
4432   rtx pat;
4433   if ((strict && !JUMP_P (insn))
4434       || (!strict && !INSN_P (insn)))
4435     return NULL_RTX;
4436   pat = PATTERN (insn);
4437 
4438   /* The set is allowed to appear either as the insn pattern or
4439      the first set in a PARALLEL.  */
4440   if (GET_CODE (pat) == PARALLEL)
4441     pat = XVECEXP (pat, 0, 0);
4442   if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4443     return pat;
4444 
4445   return NULL_RTX;
4446 }
4447 
4448 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
4449 
4450 static rtx
4451 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4452 {
4453   rtx x = nvptx_pc_set (insn, strict);
4454 
4455   if (!x)
4456     return NULL_RTX;
4457   x = SET_SRC (x);
4458   if (GET_CODE (x) == LABEL_REF)
4459     return x;
4460   if (GET_CODE (x) != IF_THEN_ELSE)
4461     return NULL_RTX;
4462   if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4463     return XEXP (x, 1);
4464   if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4465     return XEXP (x, 2);
4466   return NULL_RTX;
4467 }
4468 
4469 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4470    insn in between the branch and the label.  This works around a JIT bug
4471    observed at driver version 384.111, at -O0 for sm_50.  */
4472 
4473 static void
4474 prevent_branch_around_nothing (void)
4475 {
4476   rtx_insn *seen_label = NULL;
4477   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4478     {
4479       if (INSN_P (insn) && condjump_p (insn))
4480 	{
4481 	  seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4482 	  continue;
4483 	}
4484 
4485       if (seen_label == NULL)
4486 	continue;
4487 
4488       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4489 	continue;
4490 
4491       if (INSN_P (insn))
4492 	switch (recog_memoized (insn))
4493 	  {
4494 	  case CODE_FOR_nvptx_fork:
4495 	  case CODE_FOR_nvptx_forked:
4496 	  case CODE_FOR_nvptx_joining:
4497 	  case CODE_FOR_nvptx_join:
4498 	    continue;
4499 	  default:
4500 	    seen_label = NULL;
4501 	    continue;
4502 	  }
4503 
4504       if (LABEL_P (insn) && insn == seen_label)
4505 	emit_insn_before (gen_fake_nop (), insn);
4506 
4507       seen_label = NULL;
4508     }
4509 }
4510 #endif
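
/* At the PTX level, the workaround above amounts to (informal sketch):

	@ %p bra $L;			@ %p bra $L;
     $L:			=>	<fake nop>;
	...			     $L:
					...
*/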
4511 
4512 /* PTX-specific reorganization
4513    - Split blocks at fork and join instructions
4514    - Compute live registers
4515    - Mark now-unused registers, so function begin doesn't declare
4516    unused registers.
4517    - Insert state propagation when entering partitioned mode
4518    - Insert neutering instructions when in single mode
4519    - Replace subregs with suitable sequences.
4520 */
4521 
4522 static void
4523 nvptx_reorg (void)
4524 {
4525   /* We are freeing block_for_insn in the toplev to keep compatibility
4526      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
4527   compute_bb_for_insn ();
4528 
4529   thread_prologue_and_epilogue_insns ();
4530 
4531   /* Split blocks and record interesting unspecs.  */
4532   bb_insn_map_t bb_insn_map;
4533 
4534   nvptx_split_blocks (&bb_insn_map);
4535 
4536   /* Compute live regs */
4537   df_clear_flags (DF_LR_RUN_DCE);
4538   df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
4539   df_live_add_problem ();
4540   df_live_set_all_dirty ();
4541   df_analyze ();
4542   regstat_init_n_sets_and_refs ();
4543 
4544   if (dump_file)
4545     df_dump (dump_file);
4546 
4547   /* Mark unused regs as unused.  */
4548   int max_regs = max_reg_num ();
4549   for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
4550     if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
4551       regno_reg_rtx[i] = const0_rtx;
4552 
4553   /* Determine launch dimensions of the function.  If it is not an
4554      offloaded function (i.e. this is a regular compiler), the
4555      function has no neutering.  */
4556   tree attr = oacc_get_fn_attrib (current_function_decl);
4557   if (attr)
4558     {
4559       /* If we determined this mask before RTL expansion, we could
4560 	 elide emission of some levels of forks and joins.  */
4561       unsigned mask = 0;
4562       tree dims = TREE_VALUE (attr);
4563       unsigned ix;
4564 
4565       for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4566 	{
4567 	  int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4568 	  tree allowed = TREE_PURPOSE (dims);
4569 
4570 	  if (size != 1 && !(allowed && integer_zerop (allowed)))
4571 	    mask |= GOMP_DIM_MASK (ix);
4572 	}
4573       /* If there is worker neutering, there must be vector
4574 	 neutering.  Otherwise the hardware will fail.  */
4575       gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
4576 		  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4577 
4578       /* Discover & process partitioned regions.  */
4579       parallel *pars = nvptx_discover_pars (&bb_insn_map);
4580       nvptx_process_pars (pars);
4581       nvptx_neuter_pars (pars, mask, 0);
4582       delete pars;
4583     }
4584 
4585   /* Replace subregs.  */
4586   nvptx_reorg_subreg ();
4587 
4588   if (TARGET_UNIFORM_SIMT)
4589     nvptx_reorg_uniform_simt ();
4590 
4591 #if WORKAROUND_PTXJIT_BUG_2
4592   prevent_branch_around_nothing ();
4593 #endif
4594 
4595   regstat_free_n_sets_and_refs ();
4596 
4597   df_finish_pass (true);
4598 }
4599 
4600 /* Handle a "kernel" attribute; arguments as in
4601    struct attribute_spec.handler.  */
4602 
4603 static tree
4604 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4605 			       int ARG_UNUSED (flags), bool *no_add_attrs)
4606 {
4607   tree decl = *node;
4608 
4609   if (TREE_CODE (decl) != FUNCTION_DECL)
4610     {
4611       error ("%qE attribute only applies to functions", name);
4612       *no_add_attrs = true;
4613     }
4614   else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
4615     {
4616       error ("%qE attribute requires a void return type", name);
4617       *no_add_attrs = true;
4618     }
4619 
4620   return NULL_TREE;
4621 }
4622 
4623 /* Handle a "shared" attribute; arguments as in
4624    struct attribute_spec.handler.  */
4625 
4626 static tree
4627 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
4628 			       int ARG_UNUSED (flags), bool *no_add_attrs)
4629 {
4630   tree decl = *node;
4631 
4632   if (TREE_CODE (decl) != VAR_DECL)
4633     {
4634       error ("%qE attribute only applies to variables", name);
4635       *no_add_attrs = true;
4636     }
4637   else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
4638     {
4639       error ("%qE attribute not allowed with auto storage class", name);
4640       *no_add_attrs = true;
4641     }
4642 
4643   return NULL_TREE;
4644 }
4645 
4646 /* Table of valid machine attributes.  */
4647 static const struct attribute_spec nvptx_attribute_table[] =
4648 {
4649   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
4650        affects_type_identity, handler, exclude } */
4651   { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
4652     NULL },
4653   { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
4654     NULL },
4655   { NULL, 0, 0, false, false, false, false, NULL, NULL }
4656 };
4657 
4658 /* Limit vector alignments to BIGGEST_ALIGNMENT.  */
4659 
4660 static HOST_WIDE_INT
4661 nvptx_vector_alignment (const_tree type)
4662 {
4663   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
4664 
4665   return MIN (align, BIGGEST_ALIGNMENT);
4666 }
4667 
4668 /* Indicate that INSN cannot be duplicated.   */
4669 
4670 static bool
4671 nvptx_cannot_copy_insn_p (rtx_insn *insn)
4672 {
4673   switch (recog_memoized (insn))
4674     {
4675     case CODE_FOR_nvptx_shufflesi:
4676     case CODE_FOR_nvptx_shufflesf:
4677     case CODE_FOR_nvptx_barsync:
4678     case CODE_FOR_nvptx_fork:
4679     case CODE_FOR_nvptx_forked:
4680     case CODE_FOR_nvptx_joining:
4681     case CODE_FOR_nvptx_join:
4682       return true;
4683     default:
4684       return false;
4685     }
4686 }
4687 
4688 /* Section anchors do not work.  Initialization for flag_section_anchor
4689    probes the existence of the anchoring target hooks and prevents
4690    anchoring if they don't exist.  However, we may be used with
4691    a host-side compiler that does support anchoring, and hence see
4692    the anchor flag set (as it's not recalculated).  So provide an
4693    implementation denying anchoring.  */
4694 
4695 static bool
4696 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
4697 {
4698   return false;
4699 }
4700 
4701 /* Record a symbol for mkoffload to enter into the mapping table.  */
4702 
4703 static void
4704 nvptx_record_offload_symbol (tree decl)
4705 {
4706   switch (TREE_CODE (decl))
4707     {
4708     case VAR_DECL:
4709       fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
4710 	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4711       break;
4712 
4713     case FUNCTION_DECL:
4714       {
4715 	tree attr = oacc_get_fn_attrib (decl);
4716 	/* OpenMP offloading does not set this attribute.  */
4717 	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
4718 
4719 	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
4720 		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
4721 
4722 	for (; dims; dims = TREE_CHAIN (dims))
4723 	  {
4724 	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
4725 
4726 	    gcc_assert (!TREE_PURPOSE (dims));
4727 	    fprintf (asm_out_file, ", %#x", size);
4728 	  }
4729 
4730 	fprintf (asm_out_file, "\n");
4731       }
4732       break;
4733 
4734     default:
4735       gcc_unreachable ();
4736     }
4737 }
4738 
4739 /* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
4740    at the start of a file.  */
4741 
4742 static void
4743 nvptx_file_start (void)
4744 {
4745   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
4746   fputs ("\t.version\t3.1\n", asm_out_file);
4747   fputs ("\t.target\tsm_30\n", asm_out_file);
4748   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
4749   fputs ("// END PREAMBLE\n", asm_out_file);
4750 }
4751 
4752 /* Emit a declaration for a worker-level buffer in .shared memory.  */
4753 
4754 static void
4755 write_worker_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
4756 {
4757   const char *name = XSTR (sym, 0);
4758 
4759   write_var_marker (file, true, false, name);
4760   fprintf (file, ".shared .align %d .u8 %s[%d];\n",
4761 	   align, name, size);
4762 }
4763 
4764 /* Write out the function declarations we've collected and declare storage
4765    for the broadcast buffer.  */
4766 
4767 static void
4768 nvptx_file_end (void)
4769 {
4770   hash_table<tree_hasher>::iterator iter;
4771   tree decl;
4772   FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
4773     nvptx_record_fndecl (decl);
4774   fputs (func_decls.str().c_str(), asm_out_file);
4775 
4776   if (worker_bcast_size)
4777     write_worker_buffer (asm_out_file, worker_bcast_sym,
4778 			 worker_bcast_align, worker_bcast_size);
4779 
4780   if (worker_red_size)
4781     write_worker_buffer (asm_out_file, worker_red_sym,
4782 			 worker_red_align, worker_red_size);
4783 
4784   if (need_softstack_decl)
4785     {
4786       write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
4787       /* 32 is the maximum number of warps in a block.  Even though it's an
4788          external declaration, emit the array size explicitly; otherwise, it
4789          may fail at PTX JIT time if the definition is later in link order.  */
4790       fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
4791 	       POINTER_SIZE);
4792     }
4793   if (need_unisimt_decl)
4794     {
4795       write_var_marker (asm_out_file, false, true, "__nvptx_uni");
4796       fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
4797     }
4798 }
4799 
4800 /* Expander for the shuffle builtins.  */
4801 
4802 static rtx
4803 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
4804 {
4805   if (ignore)
4806     return target;
4807 
4808   rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
4809 			 NULL_RTX, mode, EXPAND_NORMAL);
4810   if (!REG_P (src))
4811     src = copy_to_mode_reg (mode, src);
4812 
4813   rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
4814 			 NULL_RTX, SImode, EXPAND_NORMAL);
4815   rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
4816 			NULL_RTX, SImode, EXPAND_NORMAL);
4817 
4818   if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
4819     idx = copy_to_mode_reg (SImode, idx);
4820 
4821   rtx pat = nvptx_gen_shuffle (target, src, idx,
4822 			       (nvptx_shuffle_kind) INTVAL (op));
4823   if (pat)
4824     emit_insn (pat);
4825 
4826   return target;
4827 }
4828 
4829 /* Worker reduction address expander.  */
4830 
4831 static rtx
4832 nvptx_expand_worker_addr (tree exp, rtx target,
4833 			  machine_mode ARG_UNUSED (mode), int ignore)
4834 {
4835   if (ignore)
4836     return target;
4837 
4838   unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
4839   if (align > worker_red_align)
4840     worker_red_align = align;
4841 
4842   unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
4843   unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
4844   if (size + offset > worker_red_size)
4845     worker_red_size = size + offset;
4846 
4847   rtx addr = worker_red_sym;
4848   if (offset)
4849     {
4850       addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
4851       addr = gen_rtx_CONST (Pmode, addr);
4852     }
4853 
4854   emit_move_insn (target, addr);
4855 
4856   return target;
4857 }
4858 
4859 /* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
4860    not require taking the address of any object, other than the memory
4861    cell being operated on.  */
4862 
4863 static rtx
4864 nvptx_expand_cmp_swap (tree exp, rtx target,
4865 		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
4866 {
4867   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
4868 
4869   if (!target)
4870     target = gen_reg_rtx (mode);
4871 
4872   rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
4873 			 NULL_RTX, Pmode, EXPAND_NORMAL);
4874   rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
4875 			 NULL_RTX, mode, EXPAND_NORMAL);
4876   rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
4877 			 NULL_RTX, mode, EXPAND_NORMAL);
4878   rtx pat;
4879 
4880   mem = gen_rtx_MEM (mode, mem);
4881   if (!REG_P (cmp))
4882     cmp = copy_to_mode_reg (mode, cmp);
4883   if (!REG_P (src))
4884     src = copy_to_mode_reg (mode, src);
4885 
4886   if (mode == SImode)
4887     pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
4888   else
4889     pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
4890 
4891   emit_insn (pat);
4892 
4893   return target;
4894 }
4895 
4896 
4897 /* Codes for all the NVPTX builtins.  */
4898 enum nvptx_builtins
4899 {
4900   NVPTX_BUILTIN_SHUFFLE,
4901   NVPTX_BUILTIN_SHUFFLELL,
4902   NVPTX_BUILTIN_WORKER_ADDR,
4903   NVPTX_BUILTIN_CMP_SWAP,
4904   NVPTX_BUILTIN_CMP_SWAPLL,
4905   NVPTX_BUILTIN_MAX
4906 };
4907 
4908 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
4909 
4910 /* Return the NVPTX builtin for CODE.  */
4911 
4912 static tree
4913 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
4914 {
4915   if (code >= NVPTX_BUILTIN_MAX)
4916     return error_mark_node;
4917 
4918   return nvptx_builtin_decls[code];
4919 }
4920 
4921 /* Set up all builtin functions for this target.  */
4922 
4923 static void
4924 nvptx_init_builtins (void)
4925 {
4926 #define DEF(ID, NAME, T)						\
4927   (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
4928    = add_builtin_function ("__builtin_nvptx_" NAME,			\
4929 			   build_function_type_list T,			\
4930 			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
4931 #define ST sizetype
4932 #define UINT unsigned_type_node
4933 #define LLUINT long_long_unsigned_type_node
4934 #define PTRVOID ptr_type_node
4935 
4936   DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
4937   DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
4938   DEF (WORKER_ADDR, "worker_addr",
4939        (PTRVOID, ST, UINT, UINT, NULL_TREE));
4940   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
4941   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
4942 
4943 #undef DEF
4944 #undef ST
4945 #undef UINT
4946 #undef LLUINT
4947 #undef PTRVOID
4948 }
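
/* Illustrative device-side use of one of these builtins (a sketch only;
   assumes compilation for the nvptx target and the SHUFFLE_DOWN value of
   nvptx_shuffle_kind):

     // unsigned v = <per-lane value>;
     // for (int i = 16; i; i >>= 1)   // warp of 32 lanes
     //   v += __builtin_nvptx_shuffle (v, i, SHUFFLE_DOWN);
     // Lane 0 of the warp now holds the sum over all 32 lanes.
*/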

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_worker_addr (exp, target, mode, ignore);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    default: gcc_unreachable ();
    }
}

/* Define dimension sizes for known hardware.  */
#define PTX_VECTOR_LENGTH 32
#define PTX_WORKER_LENGTH 32
#define PTX_GANG_DEFAULT  0 /* Defer to runtime.  */

/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */

static int
nvptx_simt_vf ()
{
  return PTX_VECTOR_LENGTH;
}

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  If
   DECL is null, we are validating the default dimensions.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level)
{
  bool changed = false;

  /* The vector size must be 32, unless this is a SEQ routine.  */
  if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1
      && dims[GOMP_DIM_VECTOR] >= 0
      && dims[GOMP_DIM_VECTOR] != PTX_VECTOR_LENGTH)
    {
      if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0)
	warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		    dims[GOMP_DIM_VECTOR]
		    ? G_("using vector_length (%d), ignoring %d")
		    : G_("using vector_length (%d), ignoring runtime setting"),
		    PTX_VECTOR_LENGTH, dims[GOMP_DIM_VECTOR]);
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      changed = true;
    }

  /* Check that num_workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    {
      warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		  "using num_workers (%d), ignoring %d",
		  PTX_WORKER_LENGTH, dims[GOMP_DIM_WORKER]);
      dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      changed = true;
    }

  if (!decl)
    {
      dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT;
      changed = true;
    }

  return changed;
}
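
/* Illustration only: with the limits above, a construct such as

     #pragma acc parallel vector_length (64) num_workers (64)

   is diagnosed and clamped to vector_length (32) and num_workers (32),
   while validating the default dimensions (DECL null) yields
   vector = 32, worker = 32, gang = 0, the last meaning "defer to the
   runtime".  */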

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_WORKER:
      return PTX_WORKER_LENGTH;

    case GOMP_DIM_VECTOR:
      return PTX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_worker_red_addr (tree type, tree offset)
{
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (NVPTX_BUILTIN_WORKER_ADDR, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}
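
/* Illustration only: for a 4-byte float at OFFSET the call built
   above is morally equivalent to the source expression

     (float *) __builtin_nvptx_worker_addr (offset, 4, 4);

   i.e. size and alignment are passed as byte counts derived from the
   type's mode.  */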

/* Emit a SHFL.DOWN using lane offset SHIFT of VAR into DEST_VAR.  This
   function will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval(OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
   return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}
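
/* Illustration only: a source-level sketch of the lockless scheme
   above for a 32-bit unsigned PLUS reduction, using the cmp_swap
   builtin defined earlier in this file.  This is a reading aid, not
   code the compiler emits (it builds the equivalent GIMPLE directly),
   hence guarded out.  */
#if 0
static unsigned
lockless_update_plus (unsigned *ptr, unsigned myval)
{
  unsigned guess, write;
  unsigned actual = 0;		/* initval for PLUS.  */

  do
    {
      guess = actual;
      write = guess + myval;
      /* Atomically: if *ptr == guess, store write; return old *ptr.  */
      actual = __builtin_nvptx_cmp_swap (ptr, guess, write);
    }
  while (actual != guess);

  return write;
}
#endif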

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ... */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  locked_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}
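
/* Illustration only: a source-level sketch of the locking scheme
   above, again using the cmp_swap builtin and the __reduction_lock
   variable created by nvptx_global_lock_addr.  Guarded out; the
   compiler builds the equivalent GIMPLE directly.  */
#if 0
extern volatile unsigned __reduction_lock;

static long double
lockfull_update_plus (long double *ptr, long double var)
{
  /* Spin until we swap the lock from unlocked (0) to locked (1);
     cmp_swap returns the old value, so 0 means we got the lock.  */
  while (__builtin_nvptx_cmp_swap ((void *) &__reduction_lock, 0, 1))
    continue;

  long double accum = *ptr;
  accum = accum + var;
  *ptr = accum;

  /* Release the lock: swap it back from 1 to 0.  */
  __builtin_nvptx_cmp_swap ((void *) &__reduction_lock, 1, 0);
  return accum;
}
#endif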

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op);
}

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER)
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Initialize vector lanes other than lane zero to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
	 but that requires a method of emitting a unified jump at the
	 gimple level.  */
      for (int shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl = shfl >> 1)
	{
	  tree other_var = make_ssa_name (TREE_TYPE (var));
	  nvptx_generate_vector_shuffle (gimple_location (call),
					 other_var, var, shfl, &seq);

	  r = make_ssa_name (TREE_TYPE (var));
	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
					   var, other_var), &seq);
	  var = r;
	}
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER)
	{
	  /* Get reduction buffer address.  */
	  tree offset = gimple_call_arg (call, 5);
	  tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
	  tree ptr = make_ssa_name (TREE_TYPE (call));

	  gimplify_assign (ptr, call, &seq);
	  accum = ptr;
	}
      else if (integer_zerop (ref_to_res))
	r = var;
      else
	accum = ref_to_res;

      if (accum)
	{
	  /* UPDATE the accumulator.  */
	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
	  seq = NULL;
	  r = nvptx_reduction_update (gimple_location (call), &gsi,
				      accum, var, op);
	}
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
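
/* Illustration only: the shuffle tree built above computes, at the
   source level, something equivalent to the following warp-wide
   reduction for a 32-bit unsigned PLUS (using the shuffle builtin and
   the SHUFFLE_DOWN kind defined earlier in this file).  Guarded out;
   a reading aid, not code the compiler emits.  */
#if 0
static unsigned
warp_reduce_plus (unsigned var)
{
  /* Halve the stride each step: lanes 16..31 feed 0..15, and so on,
     so after five steps lane 0 holds the sum of all 32 lanes.  */
  for (unsigned shfl = PTX_VECTOR_LENGTH / 2; shfl > 0; shfl >>= 1)
    var += __builtin_nvptx_shuffle (var, shfl, SHUFFLE_DOWN);
  return var;
}
#endif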

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER)
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_worker_red_addr (TREE_TYPE (var), offset);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX reduction expander.  */

static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call);
      break;

    default:
      gcc_unreachable ();
    }
}

static bool
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
			      rtx x ATTRIBUTE_UNUSED)
{
  return true;
}

static bool
nvptx_vector_mode_supported (machine_mode mode)
{
  return (mode == V2SImode
	  || mode == V2DImode);
}

/* Return the preferred mode for vectorizing scalar MODE.  */

static machine_mode
nvptx_preferred_simd_mode (scalar_mode mode)
{
  switch (mode)
    {
    case E_DImode:
      return V2DImode;
    case E_SImode:
      return V2SImode;

    default:
      return default_preferred_simd_mode (mode);
    }
}

unsigned int
nvptx_data_alignment (const_tree type, unsigned int basic_align)
{
  if (TREE_CODE (type) == INTEGER_TYPE)
    {
      unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
      if (size == GET_MODE_SIZE (TImode))
	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
    }

  return basic_align;
}
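
/* Illustration only: with the above, a 16-byte integer type (TImode,
   which this backend splits into a pair of DImode registers) gets the
   64-bit alignment of the split half rather than its full size; every
   other type keeps BASIC_ALIGN.  */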

/* Implement TARGET_MODES_TIEABLE_P.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}

/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef  TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef  TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef  TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef  TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef  TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"