/* Target code for NVPTX.
   Copyright (C) 2014-2021 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"
#include "opts.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1
/* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA concept
   of a thread block, which has had a maximum number of threads of 1024 since
   CUDA version 2.x.  */
#define PTX_CTA_SIZE 1024

#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
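
/* The area lives in a 3-bit field (mask 7) starting at
   SYMBOL_FLAG_MACH_DEP_SHIFT, which is wide enough for all values up
   to DATA_AREA_PARAM (5).  A minimal sketch of the round trip, for
   illustration only:

     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   Note that SET_SYMBOL_DATA_AREA only ORs bits in, so each symbol is
   expected to be tagged once.  */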

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with %<-fopenacc%>", optname);
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
			      &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a PTX type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with PTX idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}
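
/* For example, nvptx_ptx_type_from_mode (SImode, false) yields ".u32",
   and with PROMOTE set a QImode value also comes back as ".u32", so
   register declarations avoid sub-word PTX types (see the comment
   above about PTX idiosyncrasies).  Illustrative only.  */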

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* Return NULL if NAME contains no dot.  Otherwise return a copy of NAME
   with the dots replaced with dollar signs.  */

static char *
nvptx_replace_dot (const char *name)
{
  if (strchr (name, '.') == NULL)
    return NULL;

  char *p = xstrdup (name);
  for (size_t i = 0; i < strlen (p); ++i)
    if (p[i] == '.')
      p[i] = '$';
  return p;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
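
/* For illustration: a TImode pseudo 23 splits into two DImode halves
   named by subreg byte offset, printing as "{%r23$8,%r23$0}" with the
   default SUBREG_OFFSET of -1; passing -2 (as the register-declaration
   loop in nvptx_declare_function_name does) omits the braces.  */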

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}


/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed; determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}
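
/* For instance, a QImode or HImode argument widens to SImode here, and
   an SFmode argument to an unprototyped callee undergoes the K&R
   default promotion to DFmode; word-sized and larger modes pass
   through unchanged.  */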

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
{
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  return gen_reg_rtx (arg.mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
			     const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg_type.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
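
/* As a concrete illustration, derived from the branches above, a
   prototyped SImode parameter 0 produces roughly:

     prototype (FOR_REG < 0):    " (.param.u32 %in_ar0"
     prologue  (FOR_REG == 0):   "\t.reg.u32 %ar0;\n"
                                 "\tld.param.u32 %ar0, [%in_ar0];\n"  */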

/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Helper function for write_fn_proto.  */

static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
		  const char *name, const_tree decl)
{
  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}
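
/* Putting the pieces together, a public definition of "int foo (int)"
   is preceded by a declaration along the lines of (illustrative):

     // BEGIN GLOBAL FUNCTION DECL: foo
     .visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0);  */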

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static void
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  const char *replacement = nvptx_name_replacement (name);
  char *replaced_dots = NULL;
  if (replacement != name)
    name = replacement;
  else
    {
      replaced_dots = nvptx_replace_dot (name);
      if (replaced_dots)
	name = replaced_dots;
    }
  if (name[0] == '*')
    name++;

  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    write_fn_proto_1 (s, false, name, decl);

  write_fn_proto_1 (s, is_defn, name, decl);

  if (replaced_dots)
    XDELETE (replaced_dots);
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a PTX prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a PTX
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  :  "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
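
/* For a 16-byte, 8-aligned frame on a 64-bit target this emits,
   roughly (illustrative):

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   and for a zero-sized frame just the register declaration followed by
   "mov.u64 %frame, 0;".  */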

/* Emit soft stack frame setup sequence.  */

static void
init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
{
  /* Maintain 64-bit stack alignment.  */
  unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
  size = ROUND_UP (size, keep_align);
  int bits = POINTER_SIZE;
  const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
  const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
  const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
  const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
  fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
  fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
  fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
	   bits == 64 ? ".wide" : ".lo", bits / 8);
  fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);

  /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);

  /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
  fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
	   bits, reg_sspprev, reg_sspslot);

  /* Initialize %frame = %sspprev - size.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_frame, reg_sspprev, size);

  /* Apply alignment, if larger than 64.  */
  if (alignment > keep_align)
    fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
	     bits, reg_frame, reg_frame, -alignment);

  size = crtl->outgoing_args_size;
  gcc_assert (size % keep_align == 0);

  /* Initialize %stack.  */
  fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
	   bits, reg_stack, reg_frame, size);

  if (!crtl->is_leaf)
    fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
	     bits, reg_sspslot, reg_stack);
  fprintf (file, "\t}\n");
  cfun->machine->has_softstack = true;
  need_softstack_decl = true;
}

/* Emit code to initialize the REGNO predicate register to indicate
   whether we are not lane zero on the NAME axis.  */

static void
nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
  fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
  if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
	       "// vector reduction buffer\n",
	       REGNO (cfun->machine->red_partition),
	       vector_red_partition);
    }
  /* Verify vector_red_size.  */
  gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
	      <= vector_red_size);
  fprintf (file, "\t}\n");
}

/* Emit code to initialize OpenACC worker broadcast and synchronization
   registers.  */

static void
nvptx_init_oacc_workers (FILE *file)
{
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
      fprintf (file, "\t\t.reg.u64\t%%y64;\n");
    }
  fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
  if (cfun->machine->bcast_partition)
    {
      fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
      fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
      fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
      fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
	       "// vector broadcast offset\n",
	       REGNO (cfun->machine->bcast_partition),
	       oacc_bcast_partition);
    }
  /* Verify oacc_bcast_size.  */
  gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
	      <= oacc_bcast_size);
  if (cfun->machine->sync_bar)
    fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
	     "// vector synchronization barrier\n",
	     REGNO (cfun->machine->sync_bar));
  fprintf (file, "\t}\n");
}

/* Emit code to initialize predicate and master lane index registers for
   -muniform-simt code generation variant.  */

static void
nvptx_init_unisimt_predicate (FILE *file)
{
  cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
  int loc = REGNO (cfun->machine->unisimt_location);
  int bits = POINTER_SIZE;
  fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
  fprintf (file, "\t{\n");
  fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
  fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
  fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
  fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
	   bits == 64 ? ".wide" : ".lo");
  fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
  fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
  if (cfun->machine->unisimt_predicate)
    {
      int master = REGNO (cfun->machine->unisimt_master);
      int pred = REGNO (cfun->machine->unisimt_predicate);
      fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
      fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
      /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
      fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
      /* Compute predicate as 'tid.x == master'.  */
      fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
    }
  fprintf (file, "\t}\n");
  need_unisimt_decl = true;
}

/* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:

   extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
   void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
   {
     __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
     __nvptx_uni[tid.y] = 0;
     gomp_nvptx_main (ORIG, arg);
   }
   ORIG itself should not be emitted as a PTX .entry function.  */

static void
write_omp_entry (FILE *file, const char *name, const char *orig)
{
  static bool gomp_nvptx_main_declared;
  if (!gomp_nvptx_main_declared)
    {
      gomp_nvptx_main_declared = true;
      write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
      func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
        << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
    }
  /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
#define NTID_Y "%ntid.y"
#define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
{\n\
	.reg.u32 %r<3>;\n\
	.reg.u" PS " %R<4>;\n\
	mov.u32 %r0, %tid.y;\n\
	mov.u32 %r1, " NTID_Y ";\n\
	mov.u32 %r2, %ctaid.x;\n\
	cvt.u" PS ".u32 %R1, %r0;\n\
	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
	mov.u" PS " %R0, __nvptx_stacks;\n\
	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
	ld.param.u" PS " %R2, [%stack];\n\
	ld.param.u" PS " %R3, [%sz];\n\
	add.u" PS " %R2, %R2, %R3;\n\
	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
	st.shared.u" PS " [%R0], %R2;\n\
	mov.u" PS " %R0, __nvptx_uni;\n\
	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
	mov.u32 %r0, 0;\n\
	st.shared.u32 [%R0], %r0;\n\
	mov.u" PS " %R0, \0;\n\
	ld.param.u" PS " %R1, [%arg];\n\
	{\n\
		.param.u" PS " %P<2>;\n\
		st.param.u" PS " [%P0], %R0;\n\
		st.param.u" PS " [%P1], %R1;\n\
		call.uni gomp_nvptx_main, (%P0, %P1);\n\
	}\n\
	ret.uni;\n\
}\n"
  static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
  static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  ");
#undef ENTRY_TEMPLATE
#undef NTID_Y
  const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
  /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
  const char *entry_2 = entry_1 + strlen (entry64) + 1;
  fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
  need_softstack_decl = need_unisimt_decl = true;
}

/* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
   function, including local var decls and copies from the arguments to
   local regs.  */

void
nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
{
  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);
  int argno = 0;

  if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
      && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
    {
      char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
      sprintf (buf, "%s$impl", name);
      write_omp_entry (file, name, buf);
      name = buf;
    }
  /* We construct the initial part of the function into a string
     stream, in order to share the prototype writing code.  */
  std::stringstream s;
  write_fn_proto (s, true, name, decl);
  s << "{\n";

  bool return_in_mem = write_return_type (s, false, result_type);
  if (return_in_mem)
    argno = write_arg_type (s, 0, argno, ptr_type_node, true);

  /* Declare and initialize incoming arguments.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args != NULL_TREE; args = TREE_CHAIN (args))
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      argno = write_arg_type (s, 0, argno, type, prototyped);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
			    true);

  if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
    write_arg_type (s, STATIC_CHAIN_REGNUM,
		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
		    true);

  fprintf (file, "%s", s.str().c_str());

  /* Usually 'crtl->is_leaf' is computed during register allocator
     initialization (which is not done on NVPTX) or for pressure-sensitive
     optimizations.  Initialize it here, except if already set.  */
  if (!crtl->is_leaf)
    crtl->is_leaf = leaf_function_p ();

  HOST_WIDE_INT sz = get_frame_size ();
  bool need_frameptr = sz || cfun->machine->has_chain;
  int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
  if (!TARGET_SOFT_STACK)
    {
      /* Declare a local var for outgoing varargs.  */
      if (cfun->machine->has_varadic)
	init_frame (file, STACK_POINTER_REGNUM,
		    UNITS_PER_WORD, crtl->outgoing_args_size);

      /* Declare a local variable for the frame.  Force its size to be
	 DImode-compatible.  */
      if (need_frameptr)
	init_frame (file, FRAME_POINTER_REGNUM, alignment,
		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
    }
  else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
    init_softstack_frame (file, alignment, sz);

  if (cfun->machine->has_simtreg)
    {
      unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
      unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
      align = MAX (align, GET_MODE_SIZE (DImode));
      if (!crtl->is_leaf || cfun->calls_alloca)
	simtsz = HOST_WIDE_INT_M1U;
      if (simtsz == HOST_WIDE_INT_M1U)
	simtsz = nvptx_softstack_size;
      if (cfun->machine->has_softstack)
	simtsz += POINTER_SIZE / 8;
      simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
      if (align > GET_MODE_SIZE (DImode))
	simtsz += align - GET_MODE_SIZE (DImode);
      if (simtsz)
	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
		HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
    }

  /* Restore the vector reduction partition register, if necessary.
     FIXME: Find out when and why this is necessary, and fix it.  */
  if (cfun->machine->red_partition)
    regno_reg_rtx[REGNO (cfun->machine->red_partition)]
      = cfun->machine->red_partition;

  /* Declare the pseudos we have as ptx registers.  */
  int maxregs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
    {
      if (regno_reg_rtx[i] != const0_rtx)
	{
	  machine_mode mode = PSEUDO_REGNO_MODE (i);
	  machine_mode split = maybe_split_mode (mode);

	  if (split_mode_p (mode))
	    mode = split;
	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
	  output_reg (file, i, split, -2);
	  fprintf (file, ";\n");
	}
    }

  /* Emit axis predicates.  */
  if (cfun->machine->axis_predicate[0])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[0]), "y");
  if (cfun->machine->axis_predicate[1])
    nvptx_init_axis_predicate (file,
			       REGNO (cfun->machine->axis_predicate[1]), "x");
  if (cfun->machine->unisimt_predicate
      || (cfun->machine->has_simtreg && !crtl->is_leaf))
    nvptx_init_unisimt_predicate (file);
  if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
    nvptx_init_oacc_workers (file);
}
1462 
1463 /* Output code for switching uniform-simt state.  ENTERING indicates whether
1464    we are entering or leaving non-uniform execution region.  */
1465 
1466 static void
nvptx_output_unisimt_switch(FILE * file,bool entering)1467 nvptx_output_unisimt_switch (FILE *file, bool entering)
1468 {
1469   if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1470     return;
1471   fprintf (file, "\t{\n");
1472   fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1473   fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1474   if (!crtl->is_leaf)
1475     {
1476       int loc = REGNO (cfun->machine->unisimt_location);
1477       fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1478     }
1479   if (cfun->machine->unisimt_predicate)
1480     {
1481       int master = REGNO (cfun->machine->unisimt_master);
1482       int pred = REGNO (cfun->machine->unisimt_predicate);
1483       fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1484       fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1485 	       master, entering ? "%ustmp2" : "0");
1486       fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1487     }
1488   fprintf (file, "\t}\n");
1489 }
1490 
1491 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1492    ENTERING indicates whether we are entering or leaving non-uniform execution.
1493    PTR is the register pointing to allocated storage, it is assigned to on
1494    entering and used to restore state on leaving.  SIZE and ALIGN are used only
1495    on entering.  */
1496 
1497 static void
nvptx_output_softstack_switch(FILE * file,bool entering,rtx ptr,rtx size,rtx align)1498 nvptx_output_softstack_switch (FILE *file, bool entering,
1499 			       rtx ptr, rtx size, rtx align)
1500 {
1501   gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1502   if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1503     return;
1504   int bits = POINTER_SIZE, regno = REGNO (ptr);
1505   fprintf (file, "\t{\n");
1506   if (entering)
1507     {
1508       fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1509 	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1510 	       cfun->machine->simt_stack_size);
1511       fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1512       if (CONST_INT_P (size))
1513 	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1514 		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1515       else
1516 	output_reg (file, REGNO (size), VOIDmode);
1517       fputs (";\n", file);
1518       if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1519 	fprintf (file,
1520 		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1521 		 bits, regno, regno, UINTVAL (align));
1522     }
1523   if (cfun->machine->has_softstack)
1524     {
1525       const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1526       if (entering)
1527 	{
1528 	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1529 		   bits, regno, bits / 8, reg_stack);
1530 	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1531 		   bits, reg_stack, regno, bits / 8);
1532 	}
1533       else
1534 	{
1535 	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1536 		   bits, reg_stack, regno, bits / 8);
1537 	}
1538       nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1539     }
1540   fprintf (file, "\t}\n");
1541 }
1542 
1543 /* Output code to enter non-uniform execution region.  DEST is a register
1544    to hold a per-lane allocation given by SIZE and ALIGN.  */
1545 
1546 const char *
nvptx_output_simt_enter(rtx dest,rtx size,rtx align)1547 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1548 {
1549   nvptx_output_unisimt_switch (asm_out_file, true);
1550   nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1551   return "";
1552 }
1553 
1554 /* Output code to leave non-uniform execution region.  SRC is the register
1555    holding per-lane storage previously allocated by omp_simt_enter insn.  */
1556 
1557 const char *
1558 nvptx_output_simt_exit (rtx src)
1559 {
1560   nvptx_output_unisimt_switch (asm_out_file, false);
1561   nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1562   return "";
1563 }
1564 
1565 /* Output instruction that sets soft stack pointer in shared memory to the
1566    value in register given by SRC_REGNO.  */
1567 
1568 const char *
1569 nvptx_output_set_softstack (unsigned src_regno)
1570 {
1571   if (cfun->machine->has_softstack && !crtl->is_leaf)
1572     {
1573       fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1574 	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1575       output_reg (asm_out_file, src_regno, VOIDmode);
1576       fprintf (asm_out_file, ";\n");
1577     }
1578   return "";
1579 }

1580 /* Output a return instruction.  Also copy the return value to its outgoing
1581    location.  */
1582 
1583 const char *
1584 nvptx_output_return (void)
1585 {
1586   machine_mode mode = (machine_mode)cfun->machine->return_mode;
1587 
1588   if (mode != VOIDmode)
1589     fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1590 	     nvptx_ptx_type_from_mode (mode, false),
1591 	     reg_names[NVPTX_RETURN_REGNUM],
1592 	     reg_names[NVPTX_RETURN_REGNUM]);
1593 
1594   return "ret;";
1595 }
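
/* For example, a function returning a 32-bit value prints (assuming the
   return register's name is "%value"):

	st.param.u32	[%value_out], %value;
	ret;
*/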
1596 
1597 /* Terminate a function by writing a closing brace to FILE.  */
1598 
1599 void
1600 nvptx_function_end (FILE *file)
1601 {
1602   fprintf (file, "}\n");
1603 }
1604 
1605 /* Decide whether we can make a sibling call to a function.  For ptx, we
1606    can't.  */
1607 
1608 static bool
1609 nvptx_function_ok_for_sibcall (tree, tree)
1610 {
1611   return false;
1612 }
1613 
1614 /* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */
1615 
1616 static rtx
1617 nvptx_get_drap_rtx (void)
1618 {
1619   if (TARGET_SOFT_STACK && stack_realign_drap)
1620     return arg_pointer_rtx;
1621   return NULL_RTX;
1622 }
1623 
1624 /* Implement the TARGET_CALL_ARGS hook.  Record information about one
1625    argument to the next call.  */
1626 
1627 static void
1628 nvptx_call_args (rtx arg, tree fntype)
1629 {
1630   if (!cfun->machine->doing_call)
1631     {
1632       cfun->machine->doing_call = true;
1633       cfun->machine->is_varadic = false;
1634       cfun->machine->num_args = 0;
1635 
1636       if (fntype && stdarg_p (fntype))
1637 	{
1638 	  cfun->machine->is_varadic = true;
1639 	  cfun->machine->has_varadic = true;
1640 	  cfun->machine->num_args++;
1641 	}
1642     }
1643 
1644   if (REG_P (arg) && arg != pc_rtx)
1645     {
1646       cfun->machine->num_args++;
1647       cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1648 						  cfun->machine->call_args);
1649     }
1650 }
1651 
1652 /* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
1653    information we recorded.  */
1654 
1655 static void
1656 nvptx_end_call_args (void)
1657 {
1658   cfun->machine->doing_call = false;
1659   free_EXPR_LIST_list (&cfun->machine->call_args);
1660 }
1661 
1662 /* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
1663    track of whether calls involving static chains or varargs were seen
1664    in the current function.
1665    For libcalls, maintain a hash table of decls we have seen, and
1666    record a function decl for later when encountering a new one.  */
1667 
1668 void
1669 nvptx_expand_call (rtx retval, rtx address)
1670 {
1671   rtx callee = XEXP (address, 0);
1672   rtx varargs = NULL_RTX;
1673   unsigned parallel = 0;
1674 
1675   if (!call_insn_operand (callee, Pmode))
1676     {
1677       callee = force_reg (Pmode, callee);
1678       address = change_address (address, QImode, callee);
1679     }
1680 
1681   if (GET_CODE (callee) == SYMBOL_REF)
1682     {
1683       tree decl = SYMBOL_REF_DECL (callee);
1684       if (decl != NULL_TREE)
1685 	{
1686 	  if (DECL_STATIC_CHAIN (decl))
1687 	    cfun->machine->has_chain = true;
1688 
1689 	  tree attr = oacc_get_fn_attrib (decl);
1690 	  if (attr)
1691 	    {
1692 	      tree dims = TREE_VALUE (attr);
1693 
1694 	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1695 	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1696 		{
1697 		  if (TREE_PURPOSE (dims)
1698 		      && !integer_zerop (TREE_PURPOSE (dims)))
1699 		    break;
1700 		  /* Not on this axis.  */
1701 		  parallel ^= GOMP_DIM_MASK (ix);
1702 		  dims = TREE_CHAIN (dims);
1703 		}
1704 	    }
1705 	}
1706     }
1707 
1708   unsigned nargs = cfun->machine->num_args;
1709   if (cfun->machine->is_varadic)
1710     {
1711       varargs = gen_reg_rtx (Pmode);
1712       emit_move_insn (varargs, stack_pointer_rtx);
1713     }
1714 
1715   rtvec vec = rtvec_alloc (nargs + 1);
1716   rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1717   int vec_pos = 0;
1718 
1719   rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1720   rtx tmp_retval = retval;
1721   if (retval)
1722     {
1723       if (!nvptx_register_operand (retval, GET_MODE (retval)))
1724 	tmp_retval = gen_reg_rtx (GET_MODE (retval));
1725       call = gen_rtx_SET (tmp_retval, call);
1726     }
1727   XVECEXP (pat, 0, vec_pos++) = call;
1728 
1729   /* Construct the call insn, including a USE for each argument pseudo
1730      register.  These will be used when printing the insn.  */
1731   for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1732     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1733 
1734   if (varargs)
1735     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1736 
1737   gcc_assert (vec_pos == XVECLEN (pat, 0));
1738 
1739   nvptx_emit_forking (parallel, true);
1740   emit_call_insn (pat);
1741   nvptx_emit_joining (parallel, true);
1742 
1743   if (tmp_retval != retval)
1744     emit_move_insn (retval, tmp_retval);
1745 }
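
/* The pattern built above is a PARALLEL: the (possibly SET-wrapped) CALL
   first, then one USE per argument pseudo.  A sketch for a two-argument
   call with a return value (register numbers hypothetical):

	(parallel [(set (reg:SI 23)
			(call (mem:QI (symbol_ref "foo")) (const_int 0)))
		   (use (reg:SI 24))
		   (use (reg:SI 25))])
*/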
1746 
1747 /* Emit a comparison COMPARE, and return the new test to be used in the
1748    jump.  */
1749 
1750 rtx
1751 nvptx_expand_compare (rtx compare)
1752 {
1753   rtx pred = gen_reg_rtx (BImode);
1754   rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1755 			    XEXP (compare, 0), XEXP (compare, 1));
1756   emit_insn (gen_rtx_SET (pred, cmp));
1757   return gen_rtx_NE (BImode, pred, const0_rtx);
1758 }
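
/* E.g. for COMPARE (lt (reg:SI 20) (reg:SI 21)), this emits a SET of a new
   BImode predicate register to the comparison and returns
   (ne (reg:BI 22) (const_int 0)) for use as the jump condition (register
   numbers hypothetical).  */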
1759 
1760 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
1761 
1762 void
1763 nvptx_expand_oacc_fork (unsigned mode)
1764 {
1765   nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1766 }
1767 
1768 void
1769 nvptx_expand_oacc_join (unsigned mode)
1770 {
1771   nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1772 }
1773 
1774 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1775    objects.  */
1776 
1777 static rtx
1778 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1779 {
1780   rtx res;
1781 
1782   switch (GET_MODE (src))
1783     {
1784     case E_DImode:
1785       res = gen_unpackdisi2 (dst0, dst1, src);
1786       break;
1787     case E_DFmode:
1788       res = gen_unpackdfsi2 (dst0, dst1, src);
1789       break;
1790     default: gcc_unreachable ();
1791     }
1792   return res;
1793 }
1794 
1795 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1796    object.  */
1797 
1798 static rtx
1799 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1800 {
1801   rtx res;
1802 
1803   switch (GET_MODE (dst))
1804     {
1805     case E_DImode:
1806       res = gen_packsidi2 (dst, src0, src1);
1807       break;
1808     case E_DFmode:
1809       res = gen_packsidf2 (dst, src0, src1);
1810       break;
1811     default: gcc_unreachable ();
1812     }
1813   return res;
1814 }
1815 
1816 /* Generate an instruction or sequence to shuffle register SRC into DST
1817    within a single warp, using shuffle kind KIND and lane operand IDX.  */
1818 
1819 rtx
1820 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1821 {
1822   rtx res;
1823 
1824   switch (GET_MODE (dst))
1825     {
1826     case E_SImode:
1827       res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1828       break;
1829     case E_SFmode:
1830       res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1831       break;
1832     case E_DImode:
1833     case E_DFmode:
1834       {
1835 	rtx tmp0 = gen_reg_rtx (SImode);
1836 	rtx tmp1 = gen_reg_rtx (SImode);
1837 
1838 	start_sequence ();
1839 	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1840 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1841 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1842 	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1843 	res = get_insns ();
1844 	end_sequence ();
1845       }
1846       break;
1847     case E_V2SImode:
1848       {
1849 	rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
1850 	rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
1851 	rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
1852 	rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
1853 	rtx tmp0 = gen_reg_rtx (SImode);
1854 	rtx tmp1 = gen_reg_rtx (SImode);
1855 	start_sequence ();
1856 	emit_insn (gen_movsi (tmp0, src0));
1857 	emit_insn (gen_movsi (tmp1, src1));
1858 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1859 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1860 	emit_insn (gen_movsi (dst0, tmp0));
1861 	emit_insn (gen_movsi (dst1, tmp1));
1862 	res = get_insns ();
1863 	end_sequence ();
1864       }
1865       break;
1866     case E_V2DImode:
1867       {
1868 	rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
1869 	rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
1870 	rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
1871 	rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
1872 	rtx tmp0 = gen_reg_rtx (DImode);
1873 	rtx tmp1 = gen_reg_rtx (DImode);
1874 	start_sequence ();
1875 	emit_insn (gen_movdi (tmp0, src0));
1876 	emit_insn (gen_movdi (tmp1, src1));
1877 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1878 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1879 	emit_insn (gen_movdi (dst0, tmp0));
1880 	emit_insn (gen_movdi (dst1, tmp1));
1881 	res = get_insns ();
1882 	end_sequence ();
1883       }
1884       break;
1885     case E_BImode:
1886       {
1887 	rtx tmp = gen_reg_rtx (SImode);
1888 
1889 	start_sequence ();
1890 	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1891 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1892 	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1893 	res = get_insns ();
1894 	end_sequence ();
1895       }
1896       break;
1897     case E_QImode:
1898     case E_HImode:
1899       {
1900 	rtx tmp = gen_reg_rtx (SImode);
1901 
1902 	start_sequence ();
1903 	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1904 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1905 	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1906 						    tmp)));
1907 	res = get_insns ();
1908 	end_sequence ();
1909       }
1910       break;
1911 
1912     default:
1913       gcc_unreachable ();
1914     }
1915   return res;
1916 }
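
/* E.g. a DImode shuffle is decomposed above into an unpack into two fresh
   SImode registers, an SImode shuffle of each half, and a pack of the two
   halves back into the DImode destination, since the PTX shfl instruction
   only operates on 32-bit values.  */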
1917 
1918 /* Generate an instruction or sequence to broadcast register REG
1919    across the vectors of a single warp.  */
1920 
1921 static rtx
1922 nvptx_gen_warp_bcast (rtx reg)
1923 {
1924   return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1925 }
1926 
1927 /* Structure used when generating a worker-level spill or fill.  */
1928 
1929 struct broadcast_data_t
1930 {
1931   rtx base;  /* Register holding base addr of buffer.  */
1932   rtx ptr;  /* Iteration var,  if needed.  */
1933   unsigned offset; /* Offset into worker buffer.  */
1934 };
1935 
1936 /* Direction of the spill/fill and looping setup/teardown indicator.  */
1937 
1938 enum propagate_mask
1939   {
1940     PM_read = 1 << 0,
1941     PM_write = 1 << 1,
1942     PM_loop_begin = 1 << 2,
1943     PM_loop_end = 1 << 3,
1944 
1945     PM_read_write = PM_read | PM_write
1946   };
1947 
1948 /* Generate instruction(s) to spill or fill register REG to/from the
1949    worker broadcast array.  PM indicates what is to be done, REP
1950    how many loop iterations will be executed (0 for not a loop).  */
1951 
1952 static rtx
1953 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1954 			broadcast_data_t *data, bool vector)
1955 {
1956   rtx  res;
1957   machine_mode mode = GET_MODE (reg);
1958 
1959   switch (mode)
1960     {
1961     case E_BImode:
1962       {
1963 	rtx tmp = gen_reg_rtx (SImode);
1964 
1965 	start_sequence ();
1966 	if (pm & PM_read)
1967 	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1968 	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1969 	if (pm & PM_write)
1970 	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1971 	res = get_insns ();
1972 	end_sequence ();
1973       }
1974       break;
1975 
1976     default:
1977       {
1978 	rtx addr = data->ptr;
1979 
1980 	if (!addr)
1981 	  {
1982 	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1983 
1984 	    oacc_bcast_align = MAX (oacc_bcast_align, align);
1985 	    data->offset = ROUND_UP (data->offset, align);
1986 	    addr = data->base;
1987 	    gcc_assert (data->base != NULL);
1988 	    if (data->offset)
1989 	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1990 	  }
1991 
1992 	addr = gen_rtx_MEM (mode, addr);
1993 	if (pm == PM_read)
1994 	  res = gen_rtx_SET (addr, reg);
1995 	else if (pm == PM_write)
1996 	  res = gen_rtx_SET (reg, addr);
1997 	else
1998 	  gcc_unreachable ();
1999 
2000 	if (data->ptr)
2001 	  {
2002 	    /* We're using a ptr, increment it.  */
2003 	    start_sequence ();
2004 
2005 	    emit_insn (res);
2006 	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
2007 				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
2008 	    res = get_insns ();
2009 	    end_sequence ();
2010 	  }
2011 	else
2012 	  rep = 1;
2013 	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
2014       }
2015       break;
2016     }
2017   return res;
2018 }
2019 
2020 /* Returns true if X is a valid address for use in a memory reference.  */
2021 
2022 static bool
2023 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2024 {
2025   enum rtx_code code = GET_CODE (x);
2026 
2027   switch (code)
2028     {
2029     case REG:
2030       return true;
2031 
2032     case PLUS:
2033       if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2034 	return true;
2035       return false;
2036 
2037     case CONST:
2038     case SYMBOL_REF:
2039     case LABEL_REF:
2040       return true;
2041 
2042     default:
2043       return false;
2044     }
2045 }
2046 
2047 /* Machinery to output constant initializers.  When beginning an
2048    initializer, we decide on a fragment size (which is visible in ptx
2049    in the type used), and then all initializer data is buffered until
2050    a fragment is filled and ready to be written out.  */
2051 
2052 static struct
2053 {
2054   unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
2055   unsigned HOST_WIDE_INT val; /* Current fragment value.  */
2056   unsigned HOST_WIDE_INT remaining; /*  Remaining bytes to be written
2057 					out.  */
2058   unsigned size;  /* Fragment size to accumulate.  */
2059   unsigned offset;  /* Offset within current fragment.  */
2060   bool started;   /* Whether we've output any initializer.  */
2061 } init_frag;
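
/* Worked example: a global "int x[3] = {1, 2, 3};" selects a 4-byte
   fragment, so init_frag.size is 4 and init_frag.remaining starts at 3;
   as each fragment fills it is flushed by output_init_frag, giving
   (a sketch):

	.visible .global .align 4 .u32 x[3] = { 1, 2, 3 };
*/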
2062 
2063 /* The current fragment is full,  write it out.  SYM may provide a
2064    symbolic reference we should output,  in which case the fragment
2065    value is the addend.  */
2066 
2067 static void
2068 output_init_frag (rtx sym)
2069 {
2070   fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
2071   unsigned HOST_WIDE_INT val = init_frag.val;
2072 
2073   init_frag.started = true;
2074   init_frag.val = 0;
2075   init_frag.offset = 0;
2076   init_frag.remaining--;
2077 
2078   if (sym)
2079     {
2080       bool function = (SYMBOL_REF_DECL (sym)
2081 		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2082       if (!function)
2083 	fprintf (asm_out_file, "generic(");
2084       output_address (VOIDmode, sym);
2085       if (!function)
2086 	fprintf (asm_out_file, ")");
2087       if (val)
2088 	fprintf (asm_out_file, " + ");
2089     }
2090 
2091   if (!sym || val)
2092     fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2093 }
2094 
2095 /* Add value VAL of size SIZE to the data we're emitting, and keep
2096    writing out chunks as they fill up.  */
2097 
2098 static void
2099 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2100 {
2101   bool negative_p
2102     = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));
2103 
2104   /* Avoid undefined behaviour.  */
2105   if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
2106     val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;
2107 
2108   for (unsigned part = 0; size; size -= part)
2109     {
2110       if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
2111 	/* Avoid undefined behaviour.  */
2112 	val = negative_p ? -1 : 0;
2113       else
2114 	val >>= (part * BITS_PER_UNIT);
2115       part = init_frag.size - init_frag.offset;
2116       part = MIN (part, size);
2117 
2118       unsigned HOST_WIDE_INT partial
2119 	= val << (init_frag.offset * BITS_PER_UNIT);
2120       init_frag.val |= partial & init_frag.mask;
2121       init_frag.offset += part;
2122 
2123       if (init_frag.offset == init_frag.size)
2124 	output_init_frag (NULL);
2125     }
2126 }
2127 
2128 /* Target hook for assembling integer object X of size SIZE.  */
2129 
2130 static bool
2131 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2132 {
2133   HOST_WIDE_INT val = 0;
2134 
2135   switch (GET_CODE (x))
2136     {
2137     default:
2138       /* Let the generic machinery figure it out, usually for a
2139 	 CONST_WIDE_INT.  */
2140       return false;
2141 
2142     case CONST_INT:
2143       nvptx_assemble_value (INTVAL (x), size);
2144       break;
2145 
2146     case CONST:
2147       x = XEXP (x, 0);
2148       gcc_assert (GET_CODE (x) == PLUS);
2149       val = INTVAL (XEXP (x, 1));
2150       x = XEXP (x, 0);
2151       gcc_assert (GET_CODE (x) == SYMBOL_REF);
2152       gcc_fallthrough (); /* FALLTHROUGH */
2153 
2154     case SYMBOL_REF:
2155       gcc_assert (size == init_frag.size);
2156       if (init_frag.offset)
2157 	sorry ("cannot emit unaligned pointers in ptx assembly");
2158 
2159       nvptx_maybe_record_fnsym (x);
2160       init_frag.val = val;
2161       output_init_frag (x);
2162       break;
2163     }
2164 
2165   return true;
2166 }
2167 
2168 /* Output SIZE zero bytes.  We ignore the FILE argument since the
2169    functions we're calling to perform the output just use
2170    asm_out_file.  */
2171 
2172 void
2173 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2174 {
2175   /* Finish the current fragment, if it's started.  */
2176   if (init_frag.offset)
2177     {
2178       unsigned part = init_frag.size - init_frag.offset;
2179       part = MIN (part, (unsigned)size);
2180       size -= part;
2181       nvptx_assemble_value (0, part);
2182     }
2183 
2184   /* If this skip doesn't terminate the initializer, write as many
2185      remaining pieces as possible directly.  */
2186   if (size < init_frag.remaining * init_frag.size)
2187     {
2188       while (size >= init_frag.size)
2189 	{
2190 	  size -= init_frag.size;
2191 	  output_init_frag (NULL_RTX);
2192 	}
2193       if (size)
2194 	nvptx_assemble_value (0, size);
2195     }
2196 }
2197 
2198 /* Output a string STR with length SIZE.  As in nvptx_output_skip we
2199    ignore the FILE arg.  */
2200 
2201 void
2202 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2203 {
2204   for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2205     nvptx_assemble_value (str[i], 1);
2206 }
2207 
2208 /* Return true if TYPE is a record type whose last field is an array with
2209    no specified dimension.  */
2210 
2211 static bool
2212 flexible_array_member_type_p (const_tree type)
2213 {
2214   if (TREE_CODE (type) != RECORD_TYPE)
2215     return false;
2216 
2217   const_tree last_field = NULL_TREE;
2218   for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2219     last_field = f;
2220 
2221   if (!last_field)
2222     return false;
2223 
2224   const_tree last_field_type = TREE_TYPE (last_field);
2225   if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2226     return false;
2227 
2228   return (! TYPE_DOMAIN (last_field_type)
2229 	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2230 }
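
/* For example, this returns true for the C type

	struct buf { int len; char data[]; };

   whose trailing array type has no domain maximum.  */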
2231 
2232 /* Emit a PTX variable decl and prepare for emission of its
2233    initializer.  NAME is the symbol name and SECTION the PTX data
2234    area.  The type is TYPE, object size SIZE and alignment is ALIGN.
2235    The caller has already emitted any indentation and linkage
2236    specifier.  It is responsible for any initializer, terminating ;
2237    and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly
2238    this is the opposite way round to what PTX wants!  */
2239 
2240 static void
2241 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2242 			   const_tree type, HOST_WIDE_INT size, unsigned align,
2243 			   bool undefined = false)
2244 {
2245   bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2246     && (TYPE_DOMAIN (type) == NULL_TREE);
2247 
2248   if (undefined && flexible_array_member_type_p (type))
2249     {
2250       size = 0;
2251       atype = true;
2252     }
2253 
2254   while (TREE_CODE (type) == ARRAY_TYPE)
2255     type = TREE_TYPE (type);
2256 
2257   if (TREE_CODE (type) == VECTOR_TYPE
2258       || TREE_CODE (type) == COMPLEX_TYPE)
2259     /* Neither vector nor complex types can contain the other.  */
2260     type = TREE_TYPE (type);
2261 
2262   unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);
2263 
2264   /* Largest mode we're prepared to accept.  For BLKmode types we
2265      don't know if it'll contain pointer constants, so have to choose
2266      pointer size, otherwise we can choose DImode.  */
2267   machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2268 
2269   elt_size |= GET_MODE_SIZE (elt_mode);
2270   elt_size &= -elt_size; /* Extract LSB set.  */
2271 
2272   init_frag.size = elt_size;
2273   /* Avoid undefined shift behavior by using '2'.  */
2274   init_frag.mask = ((unsigned HOST_WIDE_INT)2
2275 		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
2276   init_frag.val = 0;
2277   init_frag.offset = 0;
2278   init_frag.started = false;
2279   /* Size might not be a multiple of elt size, if there's an
2280      initialized trailing struct array with smaller type than
2281      elt_size. */
2282   init_frag.remaining = (size + elt_size - 1) / elt_size;
2283 
2284   fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
2285 	   section, align / BITS_PER_UNIT,
2286 	   elt_size * BITS_PER_UNIT);
2287   assemble_name (file, name);
2288 
2289   if (size)
2290     /* We make everything an array, to simplify any initialization
2291        emission.  */
2292     fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
2293   else if (atype)
2294     fprintf (file, "[]");
2295 }
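
/* E.g. for a file-scope "char s[10]" in the global area, this prints
   (a sketch; the caller supplies linkage, any initializer and the
   terminating semicolon):

	.global .align 1 .u8 s[10]
*/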
2296 
2297 /* Called when the initializer for a decl has been completely output through
2298    combinations of the three functions above.  */
2299 
2300 static void
2301 nvptx_assemble_decl_end (void)
2302 {
2303   if (init_frag.offset)
2304     /* This can happen with a packed struct with trailing array member.  */
2305     nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2306   fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2307 }
2308 
2309 /* Output an uninitialized common or file-scope variable.  */
2310 
2311 void
2312 nvptx_output_aligned_decl (FILE *file, const char *name,
2313 			   const_tree decl, HOST_WIDE_INT size, unsigned align)
2314 {
2315   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2316 
2317   /* If this is public, it is common.  The nearest thing we have to
2318      common is weak.  */
2319   fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2320 
2321   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2322 			     TREE_TYPE (decl), size, align);
2323   nvptx_assemble_decl_end ();
2324 }
2325 
2326 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
2327    writing a constant variable EXP with NAME and SIZE and its
2328    initializer to FILE.  */
2329 
2330 static void
2331 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2332 				 const_tree exp, HOST_WIDE_INT obj_size)
2333 {
2334   write_var_marker (file, true, false, name);
2335 
2336   fprintf (file, "\t");
2337 
2338   tree type = TREE_TYPE (exp);
2339   nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2340 			     TYPE_ALIGN (type));
2341 }
2342 
2343 /* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
2344    a variable DECL with NAME to FILE.  */
2345 
2346 void
2347 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2348 {
2349   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2350 
2351   fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2352 			  : DECL_WEAK (decl) ? ".weak " : ".visible "));
2353 
2354   tree type = TREE_TYPE (decl);
2355   HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2356   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2357 			     type, obj_size, DECL_ALIGN (decl));
2358 }
2359 
2360 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */
2361 
2362 static void
2363 nvptx_globalize_label (FILE *, const char *)
2364 {
2365 }
2366 
2367 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
2368    declaration only for variable DECL with NAME to FILE.  */
2369 
2370 static void
2371 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2372 {
2373   /* The middle end can place constant pool decls into the varpool as
2374      undefined.  Until that is fixed, catch the problem here.  */
2375   if (DECL_IN_CONSTANT_POOL (decl))
2376     return;
2377 
2378   /*  We support weak definitions, and hence have the right
2379       ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
2380   if (DECL_WEAK (decl))
2381     error_at (DECL_SOURCE_LOCATION (decl),
2382 	      "PTX does not support weak declarations"
2383 	      " (only weak definitions)");
2384   write_var_marker (file, false, TREE_PUBLIC (decl), name);
2385 
2386   fprintf (file, "\t.extern ");
2387   tree size = DECL_SIZE_UNIT (decl);
2388   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2389 			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2390 			     DECL_ALIGN (decl), true);
2391   nvptx_assemble_decl_end ();
2392 }
2393 
2394 /* Output a pattern for a move instruction.  */
2395 
2396 const char *
2397 nvptx_output_mov_insn (rtx dst, rtx src)
2398 {
2399   machine_mode dst_mode = GET_MODE (dst);
2400   machine_mode src_mode = GET_MODE (src);
2401   machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2402 			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2403   machine_mode src_inner = (GET_CODE (src) == SUBREG
2404 			    ? GET_MODE (XEXP (src, 0)) : src_mode);
2405 
2406   rtx sym = src;
2407   if (GET_CODE (sym) == CONST)
2408     sym = XEXP (XEXP (sym, 0), 0);
2409   if (SYMBOL_REF_P (sym))
2410     {
2411       if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2412 	return "%.\tcvta%D1%t0\t%0, %1;";
2413       nvptx_maybe_record_fnsym (sym);
2414     }
2415 
2416   if (src_inner == dst_inner)
2417     return "%.\tmov%t0\t%0, %1;";
2418 
2419   if (CONSTANT_P (src))
2420     return (GET_MODE_CLASS (dst_inner) == MODE_INT
2421 	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2422 	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2423 
2424   if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2425     {
2426       if (GET_MODE_BITSIZE (dst_mode) == 128
2427 	  && GET_MODE_BITSIZE (src_mode) == 128)
2428 	{
2429 	  /* mov.b128 is not supported.  */
2430 	  if (dst_inner == V2DImode && src_inner == TImode)
2431 	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2432 	  else if (dst_inner == TImode && src_inner == V2DImode)
2433 	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2434 
2435 	  gcc_unreachable ();
2436 	}
2437       return "%.\tmov.b%T0\t%0, %1;";
2438     }
2439 
2440   if (GET_MODE_BITSIZE (src_inner) == 128
2441       && GET_MODE_BITSIZE (src_mode) == 64)
2442     return "%.\tmov.b%T0\t%0, %1;";
2443 
2444   return "%.\tcvt%t0%t1\t%0, %1;";
2445 }
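
/* Examples of the templates selected above (sketches, with %r0/%r1
   standing for the operands):

	mov.u32	%r0, %r1;	// SImode to SImode
	mov.b32	%r0, %r1;	// SFmode bits into same-size SImode
	cvt.u64.u32	%r0, %r1;	// SImode widened to DImode

   A symbol residing in a non-generic data area instead yields a cvta
   instruction, e.g. cvta.shared.u64.  */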
2446 
2447 static void nvptx_print_operand (FILE *, rtx, int);
2448 
2449 /* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
2450    involves writing .param declarations and in/out copies into them.  For
2451    indirect calls, also write the .callprototype.  */
2452 
2453 const char *
2454 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2455 {
2456   char buf[16];
2457   static int labelno;
2458   bool needs_tgt = register_operand (callee, Pmode);
2459   rtx pat = PATTERN (insn);
2460   if (GET_CODE (pat) == COND_EXEC)
2461     pat = COND_EXEC_CODE (pat);
2462   int arg_end = XVECLEN (pat, 0);
2463   tree decl = NULL_TREE;
2464 
2465   fprintf (asm_out_file, "\t{\n");
2466   if (result != NULL)
2467     fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2468 	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
2469 	     reg_names[NVPTX_RETURN_REGNUM]);
2470 
2471   /* Ensure we have a ptx declaration in the output if necessary.  */
2472   if (GET_CODE (callee) == SYMBOL_REF)
2473     {
2474       decl = SYMBOL_REF_DECL (callee);
2475       if (!decl
2476 	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2477 	nvptx_record_libfunc (callee, result, pat);
2478       else if (DECL_EXTERNAL (decl))
2479 	nvptx_record_fndecl (decl);
2480     }
2481 
2482   if (needs_tgt)
2483     {
2484       ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2485       labelno++;
2486       ASM_OUTPUT_LABEL (asm_out_file, buf);
2487       std::stringstream s;
2488       write_fn_proto_from_insn (s, NULL, result, pat);
2489       fputs (s.str().c_str(), asm_out_file);
2490     }
2491 
2492   for (int argno = 1; argno < arg_end; argno++)
2493     {
2494       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2495       machine_mode mode = GET_MODE (t);
2496       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2497 
2498       /* Mode splitting has already been done.  */
2499       fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2500 	       "\t\tst.param%s [%%out_arg%d], ",
2501 	       ptx_type, argno, ptx_type, argno);
2502       output_reg (asm_out_file, REGNO (t), VOIDmode);
2503       fprintf (asm_out_file, ";\n");
2504     }
2505 
2506   /* The '.' stands for the call's predicate, if any.  */
2507   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2508   fprintf (asm_out_file, "\t\tcall ");
2509   if (result != NULL_RTX)
2510     fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2511 
2512   if (decl)
2513     {
2514       char *replaced_dots = NULL;
2515       const char *name = get_fnname_from_decl (decl);
2516       const char *replacement = nvptx_name_replacement (name);
2517       if (replacement != name)
2518 	name = replacement;
2519       else
2520 	{
2521 	  replaced_dots = nvptx_replace_dot (name);
2522 	  if (replaced_dots)
2523 	    name = replaced_dots;
2524 	}
2525       assemble_name (asm_out_file, name);
2526       if (replaced_dots)
2527 	XDELETE (replaced_dots);
2528     }
2529   else
2530     output_address (VOIDmode, callee);
2531 
2532   const char *open = "(";
2533   for (int argno = 1; argno < arg_end; argno++)
2534     {
2535       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2536       open = "";
2537     }
2538   if (decl && DECL_STATIC_CHAIN (decl))
2539     {
2540       fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2541       open = "";
2542     }
2543   if (!open[0])
2544     fprintf (asm_out_file, ")");
2545 
2546   if (needs_tgt)
2547     {
2548       fprintf (asm_out_file, ", ");
2549       assemble_name (asm_out_file, buf);
2550     }
2551   fprintf (asm_out_file, ";\n");
2552 
2553   if (find_reg_note (insn, REG_NORETURN, NULL))
2554     {
2555       /* No return functions confuse the PTX JIT, as it doesn't realize
2556 	 the flow control barrier they imply.  It can seg fault if it
2557 	 encounters what looks like an unexitable loop.  Emit a trailing
2558 	 trap and exit, which it does grok.  */
2559       fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2560       fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2561     }
2562 
2563   if (result)
2564     {
2565       static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2566 
2567       if (!rval[0])
2568 	/* We must escape the '%' that starts RETURN_REGNUM.  */
2569 	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2570 		 reg_names[NVPTX_RETURN_REGNUM]);
2571       return rval;
2572     }
2573 
2574   return "}";
2575 }
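
/* A sketch of the output for a direct call to "int foo (int)" with a
   result (hypothetical register numbers, assuming the return register
   prints as "%value"):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32	%r24, [%value_in];
	}
*/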
2576 
2577 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */
2578 
2579 static bool
2580 nvptx_print_operand_punct_valid_p (unsigned char c)
2581 {
2582   return c == '.' || c == '#';
2583 }
2584 
2585 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */
2586 
2587 static void
2588 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2589 {
2590   rtx off;
2591   if (GET_CODE (x) == CONST)
2592     x = XEXP (x, 0);
2593   switch (GET_CODE (x))
2594     {
2595     case PLUS:
2596       off = XEXP (x, 1);
2597       output_address (VOIDmode, XEXP (x, 0));
2598       fprintf (file, "+");
2599       output_address (VOIDmode, off);
2600       break;
2601 
2602     case SYMBOL_REF:
2603     case LABEL_REF:
2604       output_addr_const (file, x);
2605       break;
2606 
2607     default:
2608       gcc_assert (GET_CODE (x) != MEM);
2609       nvptx_print_operand (file, x, 0);
2610       break;
2611     }
2612 }
2613 
2614 /* Write assembly language output for the address ADDR to FILE.  */
2615 
2616 static void
2617 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2618 {
2619   nvptx_print_address_operand (file, addr, mode);
2620 }
2621 
2622 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2623 
2624    Meaning of CODE:
2625    . -- print the predicate for the instruction or an empty string for an
2626         unconditional one.
2627    # -- print a rounding mode for the instruction
2628 
2629    A -- print a data area for a MEM
2630    c -- print an opcode suffix for a comparison operator, including a type code
2631    D -- print a data area for a MEM operand
2632    S -- print a shuffle kind specified by CONST_INT
2633    t -- print a type opcode suffix, promoting QImode to 32 bits
2634    T -- print a type size in bits
2635    u -- print a type opcode suffix without promotions.  */
2636 
2637 static void
2638 nvptx_print_operand (FILE *file, rtx x, int code)
2639 {
2640   if (code == '.')
2641     {
2642       x = current_insn_predicate;
2643       if (x)
2644 	{
2645 	  fputs ("@", file);
2646 	  if (GET_CODE (x) == EQ)
2647 	    fputs ("!", file);
2648 	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2649 	}
2650       return;
2651     }
2652   else if (code == '#')
2653     {
2654       fputs (".rn", file);
2655       return;
2656     }
2657 
2658   enum rtx_code x_code = GET_CODE (x);
2659   machine_mode mode = GET_MODE (x);
2660 
2661   switch (code)
2662     {
2663     case 'A':
2664       x = XEXP (x, 0);
2665       gcc_fallthrough (); /* FALLTHROUGH. */
2666 
2667     case 'D':
2668       if (GET_CODE (x) == CONST)
2669 	x = XEXP (x, 0);
2670       if (GET_CODE (x) == PLUS)
2671 	x = XEXP (x, 0);
2672 
2673       if (GET_CODE (x) == SYMBOL_REF)
2674 	fputs (section_for_sym (x), file);
2675       break;
2676 
2677     case 't':
2678     case 'u':
2679       if (x_code == SUBREG)
2680 	{
2681 	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2682 	  if (VECTOR_MODE_P (inner_mode)
2683 	      && (GET_MODE_SIZE (mode)
2684 		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2685 	    mode = GET_MODE_INNER (inner_mode);
2686 	  else if (split_mode_p (inner_mode))
2687 	    mode = maybe_split_mode (inner_mode);
2688 	  else
2689 	    mode = inner_mode;
2690 	}
2691       fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2692       break;
2693 
2694     case 'H':
2695     case 'L':
2696       {
2697 	rtx inner_x = SUBREG_REG (x);
2698 	machine_mode inner_mode = GET_MODE (inner_x);
2699 	machine_mode split = maybe_split_mode (inner_mode);
2700 
2701 	output_reg (file, REGNO (inner_x), split,
2702 		    (code == 'H'
2703 		     ? GET_MODE_SIZE (inner_mode) / 2
2704 		     : 0));
2705       }
2706       break;
2707 
2708     case 'S':
2709       {
2710 	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2711 	/* Same order as nvptx_shuffle_kind.  */
2712 	static const char *const kinds[] =
2713 	  {".up", ".down", ".bfly", ".idx"};
2714 	fputs (kinds[kind], file);
2715       }
2716       break;
2717 
2718     case 'T':
2719       fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2720       break;
2721 
2722     case 'j':
2723       fprintf (file, "@");
2724       goto common;
2725 
2726     case 'J':
2727       fprintf (file, "@!");
2728       goto common;
2729 
2730     case 'c':
2731       mode = GET_MODE (XEXP (x, 0));
2732       switch (x_code)
2733 	{
2734 	case EQ:
2735 	  fputs (".eq", file);
2736 	  break;
2737 	case NE:
2738 	  if (FLOAT_MODE_P (mode))
2739 	    fputs (".neu", file);
2740 	  else
2741 	    fputs (".ne", file);
2742 	  break;
2743 	case LE:
2744 	case LEU:
2745 	  fputs (".le", file);
2746 	  break;
2747 	case GE:
2748 	case GEU:
2749 	  fputs (".ge", file);
2750 	  break;
2751 	case LT:
2752 	case LTU:
2753 	  fputs (".lt", file);
2754 	  break;
2755 	case GT:
2756 	case GTU:
2757 	  fputs (".gt", file);
2758 	  break;
2759 	case LTGT:
2760 	  fputs (".ne", file);
2761 	  break;
2762 	case UNEQ:
2763 	  fputs (".equ", file);
2764 	  break;
2765 	case UNLE:
2766 	  fputs (".leu", file);
2767 	  break;
2768 	case UNGE:
2769 	  fputs (".geu", file);
2770 	  break;
2771 	case UNLT:
2772 	  fputs (".ltu", file);
2773 	  break;
2774 	case UNGT:
2775 	  fputs (".gtu", file);
2776 	  break;
2777 	case UNORDERED:
2778 	  fputs (".nan", file);
2779 	  break;
2780 	case ORDERED:
2781 	  fputs (".num", file);
2782 	  break;
2783 	default:
2784 	  gcc_unreachable ();
2785 	}
2786       if (FLOAT_MODE_P (mode)
2787 	  || x_code == EQ || x_code == NE
2788 	  || x_code == GEU || x_code == GTU
2789 	  || x_code == LEU || x_code == LTU)
2790 	fputs (nvptx_ptx_type_from_mode (mode, true), file);
2791       else
2792 	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2793       break;
2794     default:
2795     common:
2796       switch (x_code)
2797 	{
2798 	case SUBREG:
2799 	  {
2800 	    rtx inner_x = SUBREG_REG (x);
2801 	    machine_mode inner_mode = GET_MODE (inner_x);
2802 	    machine_mode split = maybe_split_mode (inner_mode);
2803 
2804 	    if (VECTOR_MODE_P (inner_mode)
2805 		&& (GET_MODE_SIZE (mode)
2806 		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2807 	      {
2808 		output_reg (file, REGNO (inner_x), VOIDmode);
2809 		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2810 	      }
2811 	    else if (split_mode_p (inner_mode)
2812 		&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2813 	      output_reg (file, REGNO (inner_x), split);
2814 	    else
2815 	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2816 	  }
2817 	  break;
2818 
2819 	case REG:
2820 	  output_reg (file, REGNO (x), maybe_split_mode (mode));
2821 	  break;
2822 
2823 	case MEM:
2824 	  fputc ('[', file);
2825 	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
2826 	  fputc (']', file);
2827 	  break;
2828 
2829 	case CONST_INT:
2830 	  output_addr_const (file, x);
2831 	  break;
2832 
2833 	case CONST:
2834 	case SYMBOL_REF:
2835 	case LABEL_REF:
2836 	  /* We could use output_addr_const, but that can print things like
2837 	     "x-8", which breaks ptxas.  Need to ensure it is output as
2838 	     "x+-8".  */
2839 	  nvptx_print_address_operand (file, x, VOIDmode);
2840 	  break;
2841 
2842 	case CONST_DOUBLE:
2843 	  long vals[2];
2844 	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2845 	  vals[0] &= 0xffffffff;
2846 	  vals[1] &= 0xffffffff;
2847 	  if (mode == SFmode)
2848 	    fprintf (file, "0f%08lx", vals[0]);
2849 	  else
2850 	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2851 	  break;
2852 
2853 	case CONST_VECTOR:
2854 	  {
2855 	    unsigned n = CONST_VECTOR_NUNITS (x);
2856 	    fprintf (file, "{ ");
2857 	    for (unsigned i = 0; i < n; ++i)
2858 	      {
2859 		if (i != 0)
2860 		  fprintf (file, ", ");
2861 
2862 		rtx elem = CONST_VECTOR_ELT (x, i);
2863 		output_addr_const (file, elem);
2864 	      }
2865 	    fprintf (file, " }");
2866 	  }
2867 	  break;
2868 
2869 	default:
2870 	  output_addr_const (file, x);
2871 	}
2872     }
2873 }
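
/* E.g. with an SImode register as operand 0, "%t0" and "%u0" both print
   ".u32" and "%T0" prints "32"; for a QImode register "%t0" promotes to
   ".u32" while "%u0" prints ".u8"; "%S0" on (const_int 3) prints
   ".idx".  */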
2874 
2875 /* Record replacement regs used to deal with subreg operands.  */
2876 struct reg_replace
2877 {
2878   rtx replacement[MAX_RECOG_OPERANDS];
2879   machine_mode mode;
2880   int n_allocated;
2881   int n_in_use;
2882 };
2883 
2884 /* Allocate or reuse a replacement in R and return the rtx.  */
2885 
2886 static rtx
2887 get_replacement (struct reg_replace *r)
2888 {
2889   if (r->n_allocated == r->n_in_use)
2890     r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2891   return r->replacement[r->n_in_use++];
2892 }
2893 
2894 /* Clean up subreg operands.  In ptx assembly, everything is typed, and
2895    the presence of subregs would break the rules for most instructions.
2896    Replace them with a suitable new register of the right size, plus
2897    conversion copyin/copyout instructions.  */
2898 
2899 static void
2900 nvptx_reorg_subreg (void)
2901 {
2902   struct reg_replace qiregs, hiregs, siregs, diregs;
2903   rtx_insn *insn, *next;
2904 
2905   qiregs.n_allocated = 0;
2906   hiregs.n_allocated = 0;
2907   siregs.n_allocated = 0;
2908   diregs.n_allocated = 0;
2909   qiregs.mode = QImode;
2910   hiregs.mode = HImode;
2911   siregs.mode = SImode;
2912   diregs.mode = DImode;
2913 
2914   for (insn = get_insns (); insn; insn = next)
2915     {
2916       next = NEXT_INSN (insn);
2917       if (!NONDEBUG_INSN_P (insn)
2918 	  || asm_noperands (PATTERN (insn)) >= 0
2919 	  || GET_CODE (PATTERN (insn)) == USE
2920 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
2921 	continue;
2922 
2923       qiregs.n_in_use = 0;
2924       hiregs.n_in_use = 0;
2925       siregs.n_in_use = 0;
2926       diregs.n_in_use = 0;
2927       extract_insn (insn);
2928       enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2929 
2930       for (int i = 0; i < recog_data.n_operands; i++)
2931 	{
2932 	  rtx op = recog_data.operand[i];
2933 	  if (GET_CODE (op) != SUBREG)
2934 	    continue;
2935 
2936 	  rtx inner = SUBREG_REG (op);
2937 
2938 	  machine_mode outer_mode = GET_MODE (op);
2939 	  machine_mode inner_mode = GET_MODE (inner);
2940 	  gcc_assert (s_ok);
2941 	  if (s_ok
2942 	      && (GET_MODE_PRECISION (inner_mode)
2943 		  >= GET_MODE_PRECISION (outer_mode)))
2944 	    continue;
2945 	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2946 	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
2947 				   : outer_mode == HImode ? &hiregs
2948 				   : outer_mode == SImode ? &siregs
2949 				   : &diregs);
2950 	  rtx new_reg = get_replacement (r);
2951 
2952 	  if (recog_data.operand_type[i] != OP_OUT)
2953 	    {
2954 	      enum rtx_code code;
2955 	      if (GET_MODE_PRECISION (inner_mode)
2956 		  < GET_MODE_PRECISION (outer_mode))
2957 		code = ZERO_EXTEND;
2958 	      else
2959 		code = TRUNCATE;
2960 
2961 	      rtx pat = gen_rtx_SET (new_reg,
2962 				     gen_rtx_fmt_e (code, outer_mode, inner));
2963 	      emit_insn_before (pat, insn);
2964 	    }
2965 
2966 	  if (recog_data.operand_type[i] != OP_IN)
2967 	    {
2968 	      enum rtx_code code;
2969 	      if (GET_MODE_PRECISION (inner_mode)
2970 		  < GET_MODE_PRECISION (outer_mode))
2971 		code = TRUNCATE;
2972 	      else
2973 		code = ZERO_EXTEND;
2974 
2975 	      rtx pat = gen_rtx_SET (inner,
2976 				     gen_rtx_fmt_e (code, inner_mode, new_reg));
2977 	      emit_insn_after (pat, insn);
2978 	    }
2979 	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2980 	}
2981     }
2982 }
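
/* For instance, a paradoxical input operand (subreg:SI (reg:QI 30) 0) is
   replaced by a fresh SImode pseudo, preceded by the copy-in

	(set (reg:SI 40) (zero_extend:SI (reg:QI 30)))

   and, for an output operand, followed by a truncating copy-out back to
   the inner register (register numbers hypothetical).  */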
2983 
2984 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2985    first use.  */
2986 
2987 static rtx
2988 nvptx_get_unisimt_master ()
2989 {
2990   rtx &master = cfun->machine->unisimt_master;
2991   return master ? master : master = gen_reg_rtx (SImode);
2992 }
2993 
2994 /* Return a BImode "predicate" register for uniform-simt, similar to above.  */
2995 
2996 static rtx
2997 nvptx_get_unisimt_predicate ()
2998 {
2999   rtx &pred = cfun->machine->unisimt_predicate;
3000   return pred ? pred : pred = gen_reg_rtx (BImode);
3001 }
3002 
3003 /* Return true if given call insn references one of the functions provided by
3004    the CUDA runtime: malloc, free, vprintf.  */
3005 
3006 static bool
3007 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
3008 {
3009   rtx pat = PATTERN (insn);
3010   gcc_checking_assert (GET_CODE (pat) == PARALLEL);
3011   pat = XVECEXP (pat, 0, 0);
3012   if (GET_CODE (pat) == SET)
3013     pat = SET_SRC (pat);
3014   gcc_checking_assert (GET_CODE (pat) == CALL
3015 		       && GET_CODE (XEXP (pat, 0)) == MEM);
3016   rtx addr = XEXP (XEXP (pat, 0), 0);
3017   if (GET_CODE (addr) != SYMBOL_REF)
3018     return false;
3019   const char *name = XSTR (addr, 0);
3020   /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
3021      references with forced assembler name refer to PTX syscalls.  For vprintf,
3022      accept both normal and forced-assembler-name references.  */
3023   return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
3024 	  || !strcmp (name, "*malloc")
3025 	  || !strcmp (name, "*free"));
3026 }
3027 
3028 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3029    propagate its value from lane MASTER to current lane.  */
3030 
3031 static void
3032 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3033 {
3034   rtx reg;
3035   if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
3036     emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
3037 }
3038 
3039 /* Adjust code for uniform-simt code generation variant by making atomics and
3040    "syscalls" conditionally executed, and inserting shuffle-based propagation
3041    for registers being set.  */
3042 
3043 static void
3044 nvptx_reorg_uniform_simt ()
3045 {
3046   rtx_insn *insn, *next;
3047 
3048   for (insn = get_insns (); insn; insn = next)
3049     {
3050       next = NEXT_INSN (insn);
3051       if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
3052 	  && !(NONJUMP_INSN_P (insn)
3053 	       && GET_CODE (PATTERN (insn)) == PARALLEL
3054 	       && get_attr_atomic (insn)))
3055 	continue;
3056       rtx pat = PATTERN (insn);
3057       rtx master = nvptx_get_unisimt_master ();
3058       for (int i = 0; i < XVECLEN (pat, 0); i++)
3059 	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
3060       rtx pred = nvptx_get_unisimt_predicate ();
3061       pred = gen_rtx_NE (BImode, pred, const0_rtx);
3062       pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
3063       validate_change (insn, &PATTERN (insn), pat, false);
3064     }
3065 }
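
/* E.g. an atomic insn's PARALLEL pattern becomes

	(cond_exec (ne (reg:BI pred) (const_int 0)) (parallel [...]))

   so that only the master lane executes it, and a warp shuffle emitted
   after the insn propagates each register it sets from the master lane
   to the other lanes.  */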
3066 
3067 /* Offloading function attributes.  */
3068 
3069 struct offload_attrs
3070 {
3071   unsigned mask;
3072   int num_gangs;
3073   int num_workers;
3074   int vector_length;
3075 };
3076 
3077 /* Define entries for cfun->machine->axis_dim.  */
3078 
3079 #define MACH_VECTOR_LENGTH 0
3080 #define MACH_MAX_WORKERS 1
3081 
3082 static void populate_offload_attrs (offload_attrs *oa);
3083 
3084 static void
3085 init_axis_dim (void)
3086 {
3087   offload_attrs oa;
3088   int max_workers;
3089 
3090   populate_offload_attrs (&oa);
3091 
3092   if (oa.num_workers == 0)
3093     max_workers = PTX_CTA_SIZE / oa.vector_length;
3094   else
3095     max_workers = oa.num_workers;
3096 
3097   cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3098   cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3099   cfun->machine->axis_dim_init_p = true;
3100 }
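
/* For example, with a vector length of 32 and num_workers deferred to the
   runtime (0), max_workers is PTX_CTA_SIZE / 32 = 1024 / 32 = 32.  */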
3101 
3102 static int ATTRIBUTE_UNUSED
3103 nvptx_mach_max_workers ()
3104 {
3105   if (!cfun->machine->axis_dim_init_p)
3106     init_axis_dim ();
3107   return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3108 }
3109 
3110 static int ATTRIBUTE_UNUSED
3111 nvptx_mach_vector_length ()
3112 {
3113   if (!cfun->machine->axis_dim_init_p)
3114     init_axis_dim ();
3115   return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3116 }
3117 
3118 /* Loop structure of the function.  The entire function is described as
3119    a NULL loop.  */
3120 
3121 struct parallel
3122 {
3123   /* Parent parallel.  */
3124   parallel *parent;
3125 
3126   /* Next sibling parallel.  */
3127   parallel *next;
3128 
3129   /* First child parallel.  */
3130   parallel *inner;
3131 
3132   /* Partitioning mask of the parallel.  */
3133   unsigned mask;
3134 
3135   /* Partitioning used within inner parallels. */
3136   unsigned inner_mask;
3137 
3138   /* Location of parallel forked and join.  The forked is the first
3139      block in the parallel and the join is the first block after the
3140      partition.  */
3141   basic_block forked_block;
3142   basic_block join_block;
3143 
3144   rtx_insn *forked_insn;
3145   rtx_insn *join_insn;
3146 
3147   rtx_insn *fork_insn;
3148   rtx_insn *joining_insn;
3149 
3150   /* Basic blocks in this parallel, but not in child parallels.  The
3151      FORKED and JOINING blocks are in the partition.  The FORK and JOIN
3152      blocks are not.  */
3153   auto_vec<basic_block> blocks;
3154 
3155 public:
3156   parallel (parallel *parent, unsigned mode);
3157   ~parallel ();
3158 };
3159 
3160 /* Constructor links the new parallel into its parent's chain of
3161    children.  */
3162 
3163 parallel::parallel (parallel *parent_, unsigned mask_)
3164   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3165 {
3166   forked_block = join_block = 0;
3167   forked_insn = join_insn = 0;
3168   fork_insn = joining_insn = 0;
3169 
3170   if (parent)
3171     {
3172       next = parent->inner;
3173       parent->inner = this;
3174     }
3175 }
3176 
3177 parallel::~parallel ()
3178 {
3179   delete inner;
3180   delete next;
3181 }
3182 
3183 /* Map of basic blocks to insns */
3184 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3185 
3186 /* A tuple of an insn of interest and the BB in which it resides.  */
3187 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3188 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3189 
3190 /* Split basic blocks so that each forked and join unspec is at the
3191    start of its basic block.  Thus afterwards each block will have a
3192    single partitioning mode.  We do the same for return insns, as they
3193    are executed by every thread.  Populate MAP with the head and tail
3194    blocks.  We also clear the BB visited flag, which is used when
3195    finding partitions.  */
3197 
3198 static void
3199 nvptx_split_blocks (bb_insn_map_t *map)
3200 {
3201   insn_bb_vec_t worklist;
3202   basic_block block;
3203   rtx_insn *insn;
3204 
3205   /* Locate all the reorg instructions of interest.  */
3206   FOR_ALL_BB_FN (block, cfun)
3207     {
3208       bool seen_insn = false;
3209 
3210       /* Clear visited flag, for use by parallel locator.  */
3211       block->flags &= ~BB_VISITED;
3212 
3213       FOR_BB_INSNS (block, insn)
3214 	{
3215 	  if (!INSN_P (insn))
3216 	    continue;
3217 	  switch (recog_memoized (insn))
3218 	    {
3219 	    default:
3220 	      seen_insn = true;
3221 	      continue;
3222 	    case CODE_FOR_nvptx_forked:
3223 	    case CODE_FOR_nvptx_join:
3224 	      break;
3225 
3226 	    case CODE_FOR_return:
3227 	      /* We also need to split just before return insns, as
3228 		 that insn needs executing by all threads, but the
3229 		 block it is in probably does not.  */
3230 	      break;
3231 	    }
3232 
3233 	  if (seen_insn)
3234 	    /* We've found an instruction that must be at the start of
3235 	       a block, but isn't.  Add it to the worklist.  */
3236 	    worklist.safe_push (insn_bb_t (insn, block));
3237 	  else
3238 	    /* It was already the first instruction.  Just add it to
3239 	       the map.  */
3240 	    map->get_or_insert (block) = insn;
3241 	  seen_insn = true;
3242 	}
3243     }
3244 
3245   /* Split blocks on the worklist.  */
3246   unsigned ix;
3247   insn_bb_t *elt;
3248   basic_block remap = 0;
3249   for (ix = 0; worklist.iterate (ix, &elt); ix++)
3250     {
3251       if (remap != elt->second)
3252 	{
3253 	  block = elt->second;
3254 	  remap = block;
3255 	}
3256 
3257       /* Split block before insn.  The insn is in the new block.  */
3258       edge e = split_block (block, PREV_INSN (elt->first));
3259 
3260       block = e->dest;
3261       map->get_or_insert (block) = elt->first;
3262     }
3263 }
3264 
3265 /* Return true if MASK contains parallelism that requires shared
3266    memory to broadcast.  */
3267 
3268 static bool
3269 nvptx_needs_shared_bcast (unsigned mask)
3270 {
3271   bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3272   bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3273     && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3274 
3275   return worker || large_vector;
3276 }
3277 
3278 /* BLOCK is a basic block containing a head or tail instruction.
3279    Locate the associated prehead or pretail instruction, which must be
3280    in the single predecessor block.  */
3281 
3282 static rtx_insn *
3283 nvptx_discover_pre (basic_block block, int expected)
3284 {
3285   gcc_assert (block->preds->length () == 1);
3286   basic_block pre_block = (*block->preds)[0]->src;
3287   rtx_insn *pre_insn;
3288 
3289   for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3290        pre_insn = PREV_INSN (pre_insn))
3291     gcc_assert (pre_insn != BB_HEAD (pre_block));
3292 
3293   gcc_assert (recog_memoized (pre_insn) == expected);
3294   return pre_insn;
3295 }
3296 
3297 /* Dump this parallel and all its inner parallels.  */
3298 
3299 static void
3300 nvptx_dump_pars (parallel *par, unsigned depth)
3301 {
3302   fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3303 	   depth, par->mask,
3304 	   par->forked_block ? par->forked_block->index : -1,
3305 	   par->join_block ? par->join_block->index : -1);
3306 
3307   fprintf (dump_file, "    blocks:");
3308 
3309   basic_block block;
3310   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3311     fprintf (dump_file, " %d", block->index);
3312   fprintf (dump_file, "\n");
3313   if (par->inner)
3314     nvptx_dump_pars (par->inner, depth + 1);
3315 
3316   if (par->next)
3317     nvptx_dump_pars (par->next, depth);
3318 }
3319 
3320 /* If BLOCK contains a fork/join marker, process it to create or
3321    terminate a loop structure.  Add this block to the current loop,
3322    and then walk successor blocks.   */
3323 
3324 static parallel *
3325 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3326 {
3327   if (block->flags & BB_VISITED)
3328     return par;
3329   block->flags |= BB_VISITED;
3330 
3331   if (rtx_insn **endp = map->get (block))
3332     {
3333       rtx_insn *end = *endp;
3334 
3335       /* This is a block head or tail, or return instruction.  */
3336       switch (recog_memoized (end))
3337 	{
3338 	case CODE_FOR_return:
3339 	  /* Return instructions are in their own block, and we
3340 	     don't need to do anything more.  */
3341 	  return par;
3342 
3343 	case CODE_FOR_nvptx_forked:
3344 	  /* Loop head, create a new inner loop and add it into
3345 	     our parent's child list.  */
3346 	  {
3347 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3348 
3349 	    gcc_assert (mask);
3350 	    par = new parallel (par, mask);
3351 	    par->forked_block = block;
3352 	    par->forked_insn = end;
3353 	    if (nvptx_needs_shared_bcast (mask))
3354 	      par->fork_insn
3355 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3356 	  }
3357 	  break;
3358 
3359 	case CODE_FOR_nvptx_join:
3360 	  /* A loop tail.  Finish the current loop and return to
3361 	     parent.  */
3362 	  {
3363 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3364 
3365 	    gcc_assert (par->mask == mask);
3366 	    gcc_assert (par->join_block == NULL);
3367 	    par->join_block = block;
3368 	    par->join_insn = end;
3369 	    if (nvptx_needs_shared_bcast (mask))
3370 	      par->joining_insn
3371 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3372 	    par = par->parent;
3373 	  }
3374 	  break;
3375 
3376 	default:
3377 	  gcc_unreachable ();
3378 	}
3379     }
3380 
3381   if (par)
3382     /* Add this block onto the current loop's list of blocks.  */
3383     par->blocks.safe_push (block);
3384   else
3385     /* This must be the entry block.  Create a NULL parallel.  */
3386     par = new parallel (0, 0);
3387 
3388   /* Walk successor blocks.  */
3389   edge e;
3390   edge_iterator ei;
3391 
3392   FOR_EACH_EDGE (e, ei, block->succs)
3393     nvptx_find_par (map, par, e->dest);
3394 
3395   return par;
3396 }
3397 
3398 /* DFS walk the CFG looking for fork & join markers.  Construct
3399    loop structures as we go.  MAP is a mapping of basic blocks
3400    to head & tail markers, discovered when splitting blocks.  This
3401    speeds up the discovery.  We rely on the BB visited flag having
3402    been cleared when splitting blocks.  */
3403 
3404 static parallel *
3405 nvptx_discover_pars (bb_insn_map_t *map)
3406 {
3407   basic_block block;
3408 
3409   /* Mark exit blocks as visited.  */
3410   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3411   block->flags |= BB_VISITED;
3412 
3413   /* And entry block as not.  */
3414   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3415   block->flags &= ~BB_VISITED;
3416 
3417   parallel *par = nvptx_find_par (map, 0, block);
3418 
3419   if (dump_file)
3420     {
3421       fprintf (dump_file, "\nLoops\n");
3422       nvptx_dump_pars (par, 0);
3423       fprintf (dump_file, "\n");
3424     }
3425 
3426   return par;
3427 }
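
/* Illustrative example (indices invented): for a function with a
   worker loop that itself contains a vector loop, the discovered
   structure is a tree of parallels along the lines of

	par (mask 0)		-- dummy par holding the entry block
	  par (mask WORKER)	-- forked_block .. join_block
	    par (mask VECTOR)	-- nested inside the worker region

   which is what nvptx_dump_pars prints, one parallel per line with
   increasing depth, when dumping is enabled.  */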
3428 
3429 /* Analyse a group of BBs within a partitioned region and create N
3430    Single-Entry-Single-Exit regions.  Some of those regions will be
3431    trivial ones consisting of a single BB.  The blocks of a
3432    partitioned region might form a set of disjoint graphs -- because
3433    the region encloses a differently partitioned subregion.
3434 
3435    We use the linear time algorithm described in 'Finding Regions Fast:
3436    Single Entry Single Exit and Control Regions in Linear Time'
3437    Johnson, Pearson & Pingali.  That algorithm deals with complete
3438    CFGs, where a back edge is inserted from END to START, and thus the
3439    problem becomes one of finding equivalent loops.
3440 
3441    In this case we have a partial CFG.  We complete it by redirecting
3442    any incoming edge to the graph to be from an arbitrary external BB,
3443    and similarly redirecting any outgoing edge to be to that BB.
3444    Thus we end up with a closed graph.
3445 
3446    The algorithm works by building a spanning tree of an undirected
3447    graph and keeping track of back edges from nodes further from the
3448    root in the tree to nodes nearer to the root in the tree.  In the
3449    description below, the root is up and the tree grows downwards.
3450 
3451    We avoid having to deal with degenerate back-edges to the same
3452    block, by splitting each BB into 3 -- one for input edges, one for
3453    the node itself and one for the output edges.  Such back edges are
3454    referred to as 'Brackets'.  Cycle equivalent nodes will have the
3455    same set of brackets.
3456 
3457    Determining bracket equivalency is done by maintaining a list of
3458    brackets in such a manner that the list length and final bracket
3459    uniquely identify the set.
3460 
3461    We use coloring to mark all BBs with cycle equivalency with the
3462    same color.  This is the output of the 'Finding Regions Fast'
3463    algorithm.  Notice it doesn't actually find the set of nodes within
3464    a particular region, just unordered sets of nodes that are the
3465    entries and exits of SESE regions.
3466 
3467    After determining cycle equivalency, we need to find the minimal
3468    set of SESE regions.  Do this with a DFS coloring walk of the
3469    complete graph.  We're either 'looking' or 'coloring'.  When
3470    looking, and we're in the subgraph, we start coloring the color of
3471    the current node, and remember that node as the start of the
3472    current color's SESE region.  Every time we go to a new node, we
3473    decrement the count of nodes with that color.  If it reaches zero,
3474    we remember that node as the end of the current color's SESE region
3475    and return to 'looking'.  Otherwise we color the node the current
3476    color.
3477 
3478    This way we end up with coloring the inside of non-trivial SESE
3479    regions with the color of that region.  */
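
/* A rough intuition, not a precise trace of the algorithm (graph and
   block names invented):

	entry -> A -> B -> D -> exit
		 |	   ^
		 +--> C ---+

   Once the closing exit->entry edge is added, every undirected cycle
   through A also passes through D, so A and D accumulate identical
   bracket sets, receive the same color, and become the entry and exit
   of one SESE region.  B and C lie on different cycles, get their own
   colors, and form trivial single-block regions.  */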
3480 
3481 /* A pair of BBs.  We use this to represent SESE regions.  */
3482 typedef std::pair<basic_block, basic_block> bb_pair_t;
3483 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3484 
3485 /* A node in the undirected CFG.  The discriminator SECOND indicates just
3486    above or just below the BB indicated by FIRST.  */
3487 typedef std::pair<basic_block, int> pseudo_node_t;
3488 
3489 /* A bracket indicates an edge towards the root of the spanning tree of the
3490    undirected graph.  Each bracket has a color, determined
3491    from the current set of brackets.  */
3492 struct bracket
3493 {
3494   pseudo_node_t back; /* Back target */
3495 
3496   /* Current color and size of set.  */
3497   unsigned color;
3498   unsigned size;
3499 
3500   bracket (pseudo_node_t back_)
3501   : back (back_), color (~0u), size (~0u)
3502   {
3503   }
3504 
3505   unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3506   {
3507     if (length != size)
3508       {
3509 	size = length;
3510 	color = color_counts.length ();
3511 	color_counts.quick_push (0);
3512       }
3513     color_counts[color]++;
3514     return color;
3515   }
3516 };
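
/* Note on get_color: each bracket caches the (size, color) pair it
   last handed out.  A node asks its topmost bracket for a color,
   passing the current list length; if that length differs from the
   cached size, the bracket set cannot be the one previously seen, so
   a fresh color is allocated.  Either way the population count of the
   returned color in COLOR_COUNTS is incremented.  This is how 'list
   length plus final bracket' uniquely identifies a set.  */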
3517 
3518 typedef auto_vec<bracket> bracket_vec_t;
3519 
3520 /* Basic block info for finding SESE regions.    */
3521 
3522 struct bb_sese
3523 {
3524   int node;  /* Node number in spanning tree.  */
3525   int parent; /* Parent node number.  */
3526 
3527   /* The algorithm splits each node A into Ai, A', Ao. The incoming
3528      edges arrive at pseudo-node Ai and the outgoing edges leave at
3529      pseudo-node Ao.  We have to remember which way we arrived at a
3530      particular node when generating the spanning tree.  dir > 0 means
3531      we arrived at Ai, dir < 0 means we arrived at Ao.  */
3532   int dir;
3533 
3534   /* Lowest numbered pseudo-node reached via a backedge from this
3535      node, or any descendant.  */
3536   pseudo_node_t high;
3537 
3538   int color;  /* Cycle-equivalence color  */
3539 
3540   /* Stack of brackets for this node.  */
3541   bracket_vec_t brackets;
3542 
3543   bb_sese (unsigned node_, unsigned p, int dir_)
3544   :node (node_), parent (p), dir (dir_)
3545   {
3546   }
3547   ~bb_sese ();
3548 
3549   /* Push a bracket ending at BACK.  */
3550   void push (const pseudo_node_t &back)
3551   {
3552     if (dump_file)
3553       fprintf (dump_file, "Pushing backedge %d:%+d\n",
3554 	       back.first ? back.first->index : 0, back.second);
3555     brackets.safe_push (bracket (back));
3556   }
3557 
3558   void append (bb_sese *child);
3559   void remove (const pseudo_node_t &);
3560 
3561   /* Set node's color.  */
3562   void set_color (auto_vec<unsigned> &color_counts)
3563   {
3564     color = brackets.last ().get_color (color_counts, brackets.length ());
3565   }
3566 };
3567 
3568 bb_sese::~bb_sese ()
3569 {
3570 }
3571 
3572 /* Destructively append CHILD's brackets.  */
3573 
3574 void
3575 bb_sese::append (bb_sese *child)
3576 {
3577   if (int len = child->brackets.length ())
3578     {
3579       int ix;
3580 
3581       if (dump_file)
3582 	{
3583 	  for (ix = 0; ix < len; ix++)
3584 	    {
3585 	      const pseudo_node_t &pseudo = child->brackets[ix].back;
3586 	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3587 		       child->node, pseudo.first ? pseudo.first->index : 0,
3588 		       pseudo.second);
3589 	    }
3590 	}
3591       if (!brackets.length ())
3592 	std::swap (brackets, child->brackets);
3593       else
3594 	{
3595 	  brackets.reserve (len);
3596 	  for (ix = 0; ix < len; ix++)
3597 	    brackets.quick_push (child->brackets[ix]);
3598 	}
3599     }
3600 }
3601 
3602 /* Remove brackets that terminate at PSEUDO.  */
3603 
3604 void
3605 bb_sese::remove (const pseudo_node_t &pseudo)
3606 {
3607   unsigned removed = 0;
3608   int len = brackets.length ();
3609 
3610   for (int ix = 0; ix < len; ix++)
3611     {
3612       if (brackets[ix].back == pseudo)
3613 	{
3614 	  if (dump_file)
3615 	    fprintf (dump_file, "Removing backedge %d:%+d\n",
3616 		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
3617 	  removed++;
3618 	}
3619       else if (removed)
3620 	brackets[ix-removed] = brackets[ix];
3621     }
3622   while (removed--)
3623     brackets.pop ();
3624 }
3625 
3626 /* Accessors for BB's aux pointer.  */
3627 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3628 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3629 
3630 /* DFS walk creating SESE data structures.  Only cover nodes with
3631    BB_VISITED set.  Append discovered blocks to LIST.  We number in
3632    increments of 3 so that the above and below pseudo nodes can be
3633    implicitly numbered too.  */
3634 
3635 static int
3636 nvptx_sese_number (int n, int p, int dir, basic_block b,
3637 		   auto_vec<basic_block> *list)
3638 {
3639   if (BB_GET_SESE (b))
3640     return n;
3641 
3642   if (dump_file)
3643     fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3644 	     b->index, n, p, dir);
3645 
3646   BB_SET_SESE (b, new bb_sese (n, p, dir));
3647   p = n;
3648 
3649   n += 3;
3650   list->quick_push (b);
3651 
3652   /* First walk the nodes on the 'other side' of this node, then walk
3653      the nodes on the same side.  */
3654   for (unsigned ix = 2; ix; ix--)
3655     {
3656       vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3657       size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3658 		       : offsetof (edge_def, src));
3659       edge e;
3660       edge_iterator ei;
3661 
3662       FOR_EACH_EDGE (e, ei, edges)
3663 	{
3664 	  basic_block target = *(basic_block *)((char *)e + offset);
3665 
3666 	  if (target->flags & BB_VISITED)
3667 	    n = nvptx_sese_number (n, p, dir, target, list);
3668 	}
3669       dir = -dir;
3670     }
3671   return n;
3672 }
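
/* Numbering example (values as assigned by the code): the first block
   visited gets node 2, the next 5, then 8, and so on.  The pseudo
   nodes of block N are implicitly N-1 and N+1 -- which of the two is
   the incoming (Ai) or outgoing (Ao) side depends on the orientation
   DIR in which the walk reached the block; see the 'node + d'
   arithmetic with d = +/-1 in nvptx_sese_pseudo below.  */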
3673 
3674 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3675    EDGES are the outgoing edges and OFFSET is the offset to the src
3676    or dst block on the edges.   */
3677 
3678 static void
3679 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3680 		   vec<edge, va_gc> *edges, size_t offset)
3681 {
3682   edge e;
3683   edge_iterator ei;
3684   int hi_back = depth;
3685   pseudo_node_t node_back (nullptr, depth);
3686   int hi_child = depth;
3687   pseudo_node_t node_child (nullptr, depth);
3688   basic_block child = NULL;
3689   unsigned num_children = 0;
3690   int usd = -dir * sese->dir;
3691 
3692   if (dump_file)
3693     fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3694 	     me->index, sese->node, dir);
3695 
3696   if (dir < 0)
3697     {
3698       /* This is the above pseudo-child.  It has the BB itself as an
3699 	 additional child node.  */
3700       node_child = sese->high;
3701       hi_child = node_child.second;
3702       if (node_child.first)
3703 	hi_child += BB_GET_SESE (node_child.first)->node;
3704       num_children++;
3705     }
3706 
3707   /* Examine each edge.
3708      - if it is a child (a) append its bracket list and (b) record
3709           whether it is the child with the highest reaching bracket.
3710      - if it is an edge to ancestor, record whether it's the highest
3711           reaching backlink.  */
3712   FOR_EACH_EDGE (e, ei, edges)
3713     {
3714       basic_block target = *(basic_block *)((char *)e + offset);
3715 
3716       if (bb_sese *t_sese = BB_GET_SESE (target))
3717 	{
3718 	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3719 	    {
3720 	      /* Child node.  Append its bracket list. */
3721 	      num_children++;
3722 	      sese->append (t_sese);
3723 
3724 	      /* Compare its hi value.  */
3725 	      int t_hi = t_sese->high.second;
3726 
3727 	      if (basic_block child_hi_block = t_sese->high.first)
3728 		t_hi += BB_GET_SESE (child_hi_block)->node;
3729 
3730 	      if (hi_child > t_hi)
3731 		{
3732 		  hi_child = t_hi;
3733 		  node_child = t_sese->high;
3734 		  child = target;
3735 		}
3736 	    }
3737 	  else if (t_sese->node < sese->node + dir
3738 		   && !(dir < 0 && sese->parent == t_sese->node))
3739 	    {
3740 	      /* Non-parental ancestor node -- a backlink.  */
3741 	      int d = usd * t_sese->dir;
3742 	      int back = t_sese->node + d;
3743 
3744 	      if (hi_back > back)
3745 		{
3746 		  hi_back = back;
3747 		  node_back = pseudo_node_t (target, d);
3748 		}
3749 	    }
3750 	}
3751       else
3752 	{ /* Fallen off graph, backlink to entry node.  */
3753 	  hi_back = 0;
3754 	  node_back = pseudo_node_t (nullptr, 0);
3755 	}
3756     }
3757 
3758   /* Remove any brackets that terminate at this pseudo node.  */
3759   sese->remove (pseudo_node_t (me, dir));
3760 
3761   /* Now push any backlinks from this pseudo node.  */
3762   FOR_EACH_EDGE (e, ei, edges)
3763     {
3764       basic_block target = *(basic_block *)((char *)e + offset);
3765       if (bb_sese *t_sese = BB_GET_SESE (target))
3766 	{
3767 	  if (t_sese->node < sese->node + dir
3768 	      && !(dir < 0 && sese->parent == t_sese->node))
3769 	    /* Non-parental ancestor node - backedge from me.  */
3770 	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
3771 	}
3772       else
3773 	{
3774 	  /* back edge to entry node */
3775 	  sese->push (pseudo_node_t (nullptr, 0));
3776 	}
3777     }
3778 
3779   /* If this node leads directly or indirectly to a no-return region
3780      of the graph, then fake a backedge to the entry node.  */
3781   if (!sese->brackets.length () || !edges || !edges->length ())
3782     {
3783       hi_back = 0;
3784       node_back = pseudo_node_t (nullptr, 0);
3785       sese->push (node_back);
3786     }
3787 
3788   /* Record the highest reaching backedge from us or a descendant.  */
3789   sese->high = hi_back < hi_child ? node_back : node_child;
3790 
3791   if (num_children > 1)
3792     {
3793       /* There is more than one child -- this is a Y shaped piece of
3794 	 spanning tree.  We have to insert a fake backedge from this
3795 	 node to the highest ancestor reached by not-the-highest
3796 	 reaching child.  Note that there may be multiple children
3797 	 with backedges to the same highest node.  That's ok and we
3798 	 insert the edge to that highest node.  */
3799       hi_child = depth;
3800       if (dir < 0 && child)
3801 	{
3802 	  node_child = sese->high;
3803 	  hi_child = node_child.second;
3804 	  if (node_child.first)
3805 	    hi_child += BB_GET_SESE (node_child.first)->node;
3806 	}
3807 
3808       FOR_EACH_EDGE (e, ei, edges)
3809 	{
3810 	  basic_block target = *(basic_block *)((char *)e + offset);
3811 
3812 	  if (target == child)
3813 	    /* Ignore the highest child. */
3814 	    continue;
3815 
3816 	  bb_sese *t_sese = BB_GET_SESE (target);
3817 	  if (!t_sese)
3818 	    continue;
3819 	  if (t_sese->parent != sese->node)
3820 	    /* Not a child. */
3821 	    continue;
3822 
3823 	  /* Compare its hi value.  */
3824 	  int t_hi = t_sese->high.second;
3825 
3826 	  if (basic_block child_hi_block = t_sese->high.first)
3827 	    t_hi += BB_GET_SESE (child_hi_block)->node;
3828 
3829 	  if (hi_child > t_hi)
3830 	    {
3831 	      hi_child = t_hi;
3832 	      node_child = t_sese->high;
3833 	    }
3834 	}
3835 
3836       sese->push (node_child);
3837     }
3838 }
3839 
3840 
3841 /* DFS walk of BB graph.  Color node BLOCK according to COLORING then
3842    proceed to successors.  Set SESE entry and exit nodes of
3843    REGIONS.  */
3844 
3845 static void
3846 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3847 		  basic_block block, int coloring)
3848 {
3849   bb_sese *sese = BB_GET_SESE (block);
3850 
3851   if (block->flags & BB_VISITED)
3852     {
3853       /* If we've already encountered this block, either we must not
3854 	 be coloring, or it must have been colored the current color.  */
3855       gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3856       return;
3857     }
3858 
3859   block->flags |= BB_VISITED;
3860 
3861   if (sese)
3862     {
3863       if (coloring < 0)
3864 	{
3865 	  /* Start coloring a region.  */
3866 	  regions[sese->color].first = block;
3867 	  coloring = sese->color;
3868 	}
3869 
3870       if (!--color_counts[sese->color] && sese->color == coloring)
3871 	{
3872 	  /* Found final block of SESE region.  */
3873 	  regions[sese->color].second = block;
3874 	  coloring = -1;
3875 	}
3876       else
3877 	/* Color the node, so we can assert on revisiting the node
3878 	   that the graph is indeed SESE.  */
3879 	sese->color = coloring;
3880     }
3881   else
3882     /* Fallen off the subgraph, we cannot be coloring.  */
3883     gcc_assert (coloring < 0);
3884 
3885   /* Walk each successor block.  */
3886   if (block->succs && block->succs->length ())
3887     {
3888       edge e;
3889       edge_iterator ei;
3890 
3891       FOR_EACH_EDGE (e, ei, block->succs)
3892 	nvptx_sese_color (color_counts, regions, e->dest, coloring);
3893     }
3894   else
3895     gcc_assert (coloring < 0);
3896 }
3897 
3898 /* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
3899    end up with NULL entries in it.  */
3900 
3901 static void
3902 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3903 {
3904   basic_block block;
3905   int ix;
3906 
3907   /* First clear each BB of the whole function.  */
3908   FOR_ALL_BB_FN (block, cfun)
3909     {
3910       block->flags &= ~BB_VISITED;
3911       BB_SET_SESE (block, 0);
3912     }
3913 
3914   /* Mark blocks in the function that are in this graph.  */
3915   for (ix = 0; blocks.iterate (ix, &block); ix++)
3916     block->flags |= BB_VISITED;
3917 
3918   /* Counts of nodes assigned to each color.  There cannot be more
3919      colors than blocks (and hopefully there will be fewer).  */
3920   auto_vec<unsigned> color_counts;
3921   color_counts.reserve (blocks.length ());
3922 
3923   /* Worklist of nodes in the spanning tree.  Again, there cannot be
3924      more nodes in the tree than blocks (there will be fewer if the
3925      CFG of blocks is disjoint).  */
3926   auto_vec<basic_block> spanlist;
3927   spanlist.reserve (blocks.length ());
3928 
3929   /* Make sure every block has its cycle class determined.  */
3930   for (ix = 0; blocks.iterate (ix, &block); ix++)
3931     {
3932       if (BB_GET_SESE (block))
3933 	/* We already met this block in an earlier graph solve.  */
3934 	continue;
3935 
3936       if (dump_file)
3937 	fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3938 
3939       /* Number the nodes reachable from block in initial DFS order.  */
3940       int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3941 
3942       /* Now walk in reverse DFS order to find cycle equivalents.  */
3943       while (spanlist.length ())
3944 	{
3945 	  block = spanlist.pop ();
3946 	  bb_sese *sese = BB_GET_SESE (block);
3947 
3948 	  /* Do the pseudo node below.  */
3949 	  nvptx_sese_pseudo (block, sese, depth, +1,
3950 			     sese->dir > 0 ? block->succs : block->preds,
3951 			     (sese->dir > 0 ? offsetof (edge_def, dest)
3952 			      : offsetof (edge_def, src)));
3953 	  sese->set_color (color_counts);
3954 	  /* Do the pseudo node above.  */
3955 	  nvptx_sese_pseudo (block, sese, depth, -1,
3956 			     sese->dir < 0 ? block->succs : block->preds,
3957 			     (sese->dir < 0 ? offsetof (edge_def, dest)
3958 			      : offsetof (edge_def, src)));
3959 	}
3960       if (dump_file)
3961 	fprintf (dump_file, "\n");
3962     }
3963 
3964   if (dump_file)
3965     {
3966       unsigned count;
3967       const char *comma = "";
3968 
3969       fprintf (dump_file, "Found %d cycle equivalents\n",
3970 	       color_counts.length ());
3971       for (ix = 0; color_counts.iterate (ix, &count); ix++)
3972 	{
3973 	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3974 
3975 	  comma = "";
3976 	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3977 	    if (BB_GET_SESE (block)->color == ix)
3978 	      {
3979 		block->flags |= BB_VISITED;
3980 		fprintf (dump_file, "%s%d", comma, block->index);
3981 		comma=",";
3982 	      }
3983 	  fprintf (dump_file, "}");
3984 	  comma = ", ";
3985 	}
3986       fprintf (dump_file, "\n");
3987     }
3988 
3989   /* Now we've colored every block in the subgraph.  We now need to
3990      determine the minimal set of SESE regions that cover that
3991      subgraph.  Do this with a DFS walk of the complete function.
3992      During the walk we're either 'looking' or 'coloring'.  When we
3993      reach the last node of a particular color, we stop coloring and
3994      return to looking.  */
3995 
3996   /* There cannot be more SESE regions than colors.  */
3997   regions.reserve (color_counts.length ());
3998   for (ix = color_counts.length (); ix--;)
3999     regions.quick_push (bb_pair_t (0, 0));
4000 
4001   for (ix = 0; blocks.iterate (ix, &block); ix++)
4002     block->flags &= ~BB_VISITED;
4003 
4004   nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
4005 
4006   if (dump_file)
4007     {
4008       const char *comma = "";
4009       int len = regions.length ();
4010 
4011       fprintf (dump_file, "SESE regions:");
4012       for (ix = 0; ix != len; ix++)
4013 	{
4014 	  basic_block from = regions[ix].first;
4015 	  basic_block to = regions[ix].second;
4016 
4017 	  if (from)
4018 	    {
4019 	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
4020 	      if (to != from)
4021 		fprintf (dump_file, "->%d", to->index);
4022 
4023 	      int color = BB_GET_SESE (from)->color;
4024 
4025 	      /* Print the blocks within the region (excluding ends).  */
4026 	      FOR_EACH_BB_FN (block, cfun)
4027 		{
4028 		  bb_sese *sese = BB_GET_SESE (block);
4029 
4030 		  if (sese && sese->color == color
4031 		      && block != from && block != to)
4032 		    fprintf (dump_file, ".%d", block->index);
4033 		}
4034 	      fprintf (dump_file, "}");
4035 	    }
4036 	  comma = ",";
4037 	}
4038       fprintf (dump_file, "\n\n");
4039     }
4040 
4041   for (ix = 0; blocks.iterate (ix, &block); ix++)
4042     delete BB_GET_SESE (block);
4043 }
4044 
4045 #undef BB_SET_SESE
4046 #undef BB_GET_SESE
4047 
4048 /* Propagate live state at the start of a partitioned region.  IS_CALL
4049    indicates whether the propagation is for a (partitioned) call
4050    instruction.  BLOCK provides the live register information, and
4051    might not contain INSN. Propagation is inserted just after INSN. RW
4052    indicates whether we are reading and/or writing state.  This
4053    separation is needed for worker-level propagation where we
4054    essentially do a spill & fill.  FN is the underlying worker
4055    function to generate the propagation instructions for a single
4056    register.  DATA is user data.
4057 
4058    Returns true if we didn't emit any instructions.
4059 
4060    We propagate the live register set for non-calls and the entire
4061    frame for calls and non-calls.  We could do better by (a)
4062    propagating just the live set that is used within the partitioned
4063    regions and (b) only propagating stack entries that are used.  The
4064    latter might be quite hard to determine.  */
4065 
4066 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
4067 
4068 static bool
4069 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
4070 		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
4071 {
4072   bitmap live = DF_LIVE_IN (block);
4073   bitmap_iterator iterator;
4074   unsigned ix;
4075   bool empty = true;
4076 
4077   /* Copy the frame array.  */
4078   HOST_WIDE_INT fs = get_frame_size ();
4079   if (fs)
4080     {
4081       rtx tmp = gen_reg_rtx (DImode);
4082       rtx idx = NULL_RTX;
4083       rtx ptr = gen_reg_rtx (Pmode);
4084       rtx pred = NULL_RTX;
4085       rtx_code_label *label = NULL;
4086 
4087       empty = false;
4088       /* The frame size might not be DImode compatible, but the frame
4089 	 array's declaration will be.  So it's ok to round up here.  */
4090       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
4091       /* Detect single iteration loop. */
4092       if (fs == 1)
4093 	fs = 0;
4094 
4095       start_sequence ();
4096       emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
4097       if (fs)
4098 	{
4099 	  idx = gen_reg_rtx (SImode);
4100 	  pred = gen_reg_rtx (BImode);
4101 	  label = gen_label_rtx ();
4102 
4103 	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
4104 	  /* Allow worker function to initialize anything needed.  */
4105 	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4106 	  if (init)
4107 	    emit_insn (init);
4108 	  emit_label (label);
4109 	  LABEL_NUSES (label)++;
4110 	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4111 	}
4112       if (rw & PM_read)
4113 	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4114       emit_insn (fn (tmp, rw, fs, data, vector));
4115       if (rw & PM_write)
4116 	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4117       if (fs)
4118 	{
4119 	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4120 	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4121 	  emit_insn (gen_br_true_uni (pred, label));
4122 	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4123 	  if (fini)
4124 	    emit_insn (fini);
4125 	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4126 	}
4127       emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4128       emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4129       rtx cpy = get_insns ();
4130       end_sequence ();
4131       insn = emit_insn_after (cpy, insn);
4132     }
4133 
4134   if (!is_call)
4135     /* Copy live registers.  */
4136     EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4137       {
4138 	rtx reg = regno_reg_rtx[ix];
4139 
4140 	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4141 	  {
4142 	    rtx bcast = fn (reg, rw, 0, data, vector);
4143 
4144 	    insn = emit_insn_after (bcast, insn);
4145 	    empty = false;
4146 	  }
4147       }
4148   return empty;
4149 }
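
/* The frame-copy sequence built above corresponds roughly to this C
   sketch (illustrative only -- the real output is RTL, and FN may emit
   extra setup/teardown at PM_loop_begin/PM_loop_end):

	uint64_t *ptr = frame_pointer;
	uint64_t tmp;
	int idx = frame_size_in_dwords;	  // loop omitted when just 1
	do
	  {
	    idx--;
	    tmp = *ptr;		// PM_read: spill one DImode chunk
	    fn (tmp, ...);	// worker callback, e.g. a broadcast
	    *ptr = tmp;		// PM_write: fill it back
	    ptr++;
	  }
	while (idx != 0);
*/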
4150 
4151 /* Worker for nvptx_warp_propagate.  */
4152 
4153 static rtx
4154 warp_prop_gen (rtx reg, propagate_mask pm,
4155 	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4156 	       bool ARG_UNUSED (vector))
4157 {
4158   if (!(pm & PM_read_write))
4159     return 0;
4160 
4161   return nvptx_gen_warp_bcast (reg);
4162 }
4163 
4164 /* Propagate state that is live at start of BLOCK across the vectors
4165    of a single warp.  Propagation is inserted just after INSN.
4166    IS_CALL and return as for nvptx_propagate.  */
4167 
4168 static bool
4169 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4170 {
4171   return nvptx_propagate (is_call, block, insn, PM_read_write,
4172 			  warp_prop_gen, 0, false);
4173 }
4174 
4175 /* Worker for nvptx_shared_propagate.  */
4176 
4177 static rtx
4178 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4179 		 bool vector)
4180 {
4181   broadcast_data_t *data = (broadcast_data_t *)data_;
4182 
4183   if (pm & PM_loop_begin)
4184     {
4185       /* Starting a loop, initialize pointer.    */
4186       unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4187 
4188       oacc_bcast_align = MAX (oacc_bcast_align, align);
4189       data->offset = ROUND_UP (data->offset, align);
4190 
4191       data->ptr = gen_reg_rtx (Pmode);
4192 
4193       return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4194     }
4195   else if (pm & PM_loop_end)
4196     {
4197       rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4198       data->ptr = NULL_RTX;
4199       return clobber;
4200     }
4201   else
4202     return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4203 }
4204 
4205 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
4206    indicates if this is just before partitioned mode (do spill), or
4207    just after it starts (do fill). Sequence is inserted just after
4208    INSN.  IS_CALL and return as for nvptx_propagate.  */
4209 
4210 static bool
4211 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4212 			rtx_insn *insn, bool vector)
4213 {
4214   broadcast_data_t data;
4215 
4216   data.base = gen_reg_rtx (Pmode);
4217   data.offset = 0;
4218   data.ptr = NULL_RTX;
4219 
4220   bool empty = nvptx_propagate (is_call, block, insn,
4221 				pre_p ? PM_read : PM_write, shared_prop_gen,
4222 				&data, vector);
4223   gcc_assert (empty == !data.offset);
4224   if (data.offset)
4225     {
4226       rtx bcast_sym = oacc_bcast_sym;
4227 
4228       /* Stuff was emitted, initialize the base pointer now.  */
4229       if (vector && nvptx_mach_max_workers () > 1)
4230 	{
4231 	  if (!cfun->machine->bcast_partition)
4232 	    {
4233 	      /* It would be nice to place this register in
4234 		 DATA_AREA_SHARED.  */
4235 	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4236 	    }
4237 	  if (!cfun->machine->sync_bar)
4238 	    cfun->machine->sync_bar = gen_reg_rtx (SImode);
4239 
4240 	  bcast_sym = cfun->machine->bcast_partition;
4241 	}
4242 
4243       rtx init = gen_rtx_SET (data.base, bcast_sym);
4244       emit_insn_after (init, insn);
4245 
4246       unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4247       unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4248 			   ? nvptx_mach_max_workers () + 1
4249 			   : 1);
4250 
4251       oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4252       oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4253     }
4254   return empty;
4255 }
4256 
4257 /* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
4258    which is an integer or a register.  THREADS is the number of threads
4259    controlled by the barrier.  */
4260 
4261 static rtx
4262 nvptx_cta_sync (rtx lock, int threads)
4263 {
4264   return gen_nvptx_barsync (lock, GEN_INT (threads));
4265 }
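
/* The emitted insn becomes a PTX barrier; sketched here (the exact
   syntax is determined by the nvptx_barsync template in the .md file):

	bar.sync  %r0;		// THREADS == 0: all CTA threads
	bar.sync  %r0, 64;	// otherwise: gate only 64 threads

   LOCK selects one of the 16 per-CTA barriers (PTX_CTA_NUM_BARRIERS).  */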
4266 
4267 #if WORKAROUND_PTXJIT_BUG
4268 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4269    real insns.  */
4270 
4271 static rtx_insn *
4272 bb_first_real_insn (basic_block bb)
4273 {
4274   rtx_insn *insn;
4275 
4276   /* Find the first real insn of BB.  */
4277   FOR_BB_INSNS (bb, insn)
4278     if (INSN_P (insn))
4279       return insn;
4280 
4281   return 0;
4282 }
4283 #endif
4284 
4285 /* Return true if INSN needs neutering.  */
4286 
4287 static bool
4288 needs_neutering_p (rtx_insn *insn)
4289 {
4290   if (!INSN_P (insn))
4291     return false;
4292 
4293   switch (recog_memoized (insn))
4294     {
4295     case CODE_FOR_nvptx_fork:
4296     case CODE_FOR_nvptx_forked:
4297     case CODE_FOR_nvptx_joining:
4298     case CODE_FOR_nvptx_join:
4299     case CODE_FOR_nvptx_barsync:
4300       return false;
4301     default:
4302       return true;
4303     }
4304 }
4305 
4306 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */
4307 
4308 static bool
4309 verify_neutering_jumps (basic_block from,
4310 			rtx_insn *vector_jump, rtx_insn *worker_jump,
4311 			rtx_insn *vector_label, rtx_insn *worker_label)
4312 {
4313   basic_block bb = from;
4314   rtx_insn *insn = BB_HEAD (bb);
4315   bool seen_worker_jump = false;
4316   bool seen_vector_jump = false;
4317   bool seen_worker_label = false;
4318   bool seen_vector_label = false;
4319   bool worker_neutered = false;
4320   bool vector_neutered = false;
4321   while (true)
4322     {
4323       if (insn == worker_jump)
4324 	{
4325 	  seen_worker_jump = true;
4326 	  worker_neutered = true;
4327 	  gcc_assert (!vector_neutered);
4328 	}
4329       else if (insn == vector_jump)
4330 	{
4331 	  seen_vector_jump = true;
4332 	  vector_neutered = true;
4333 	}
4334       else if (insn == worker_label)
4335 	{
4336 	  seen_worker_label = true;
4337 	  gcc_assert (worker_neutered);
4338 	  worker_neutered = false;
4339 	}
4340       else if (insn == vector_label)
4341 	{
4342 	  seen_vector_label = true;
4343 	  gcc_assert (vector_neutered);
4344 	  vector_neutered = false;
4345 	}
4346       else if (INSN_P (insn))
4347 	switch (recog_memoized (insn))
4348 	  {
4349 	  case CODE_FOR_nvptx_barsync:
4350 	    gcc_assert (!vector_neutered && !worker_neutered);
4351 	    break;
4352 	  default:
4353 	    break;
4354 	  }
4355 
4356       if (insn != BB_END (bb))
4357 	insn = NEXT_INSN (insn);
4358       else if (JUMP_P (insn) && single_succ_p (bb)
4359 	       && !seen_vector_jump && !seen_worker_jump)
4360 	{
4361 	  bb = single_succ (bb);
4362 	  insn = BB_HEAD (bb);
4363 	}
4364       else
4365 	break;
4366     }
4367 
4368   gcc_assert (!(vector_jump && !seen_vector_jump));
4369   gcc_assert (!(worker_jump && !seen_worker_jump));
4370 
4371   if (seen_vector_label || seen_worker_label)
4372     {
4373       gcc_assert (!(vector_label && !seen_vector_label));
4374       gcc_assert (!(worker_label && !seen_worker_label));
4375 
4376       return true;
4377     }
4378 
4379   return false;
4380 }
4381 
4382 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */
4383 
4384 static void
4385 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4386 			 rtx_insn *worker_label)
4387 {
4388   basic_block bb = to;
4389   rtx_insn *insn = BB_END (bb);
4390   bool seen_worker_label = false;
4391   bool seen_vector_label = false;
4392   while (true)
4393     {
4394       if (insn == worker_label)
4395 	{
4396 	  seen_worker_label = true;
4397 	  gcc_assert (!seen_vector_label);
4398 	}
4399       else if (insn == vector_label)
4400 	seen_vector_label = true;
4401       else if (INSN_P (insn))
4402 	switch (recog_memoized (insn))
4403 	  {
4404 	  case CODE_FOR_nvptx_barsync:
4405 	    gcc_assert (!seen_vector_label && !seen_worker_label);
4406 	    break;
4407 	  }
4408 
4409       if (insn != BB_HEAD (bb))
4410 	insn = PREV_INSN (insn);
4411       else
4412 	break;
4413     }
4414 
4415   gcc_assert (!(vector_label && !seen_vector_label));
4416   gcc_assert (!(worker_label && !seen_worker_label));
4417 }
4418 
4419 /* Single neutering according to MASK.  FROM is the incoming block and
4420    TO is the outgoing block.  These may be the same block. Insert at
4421    start of FROM:
4422 
4423      if (tid.<axis>) goto end.
4424 
4425    and insert before ending branch of TO (if there is such an insn):
4426 
4427      end:
4428      <possibly-broadcast-cond>
4429      <branch>
4430 
4431    We currently only use different FROM and TO when skipping an entire
4432    loop.  We could do more if we detected superblocks.  */
4433 
4434 static void
4435 nvptx_single (unsigned mask, basic_block from, basic_block to)
4436 {
4437   rtx_insn *head = BB_HEAD (from);
4438   rtx_insn *tail = BB_END (to);
4439   unsigned skip_mask = mask;
4440 
4441   while (true)
4442     {
4443       /* Find first insn of from block.  */
4444       while (head != BB_END (from) && !needs_neutering_p (head))
4445 	head = NEXT_INSN (head);
4446 
4447       if (from == to)
4448 	break;
4449 
4450       if (!(JUMP_P (head) && single_succ_p (from)))
4451 	break;
4452 
4453       basic_block jump_target = single_succ (from);
4454       if (!single_pred_p (jump_target))
4455 	break;
4456 
4457       from = jump_target;
4458       head = BB_HEAD (from);
4459     }
4460 
4461   /* Find the last insn of the to block.  */
4462   rtx_insn *limit = from == to ? head : BB_HEAD (to);
4463   while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4464     tail = PREV_INSN (tail);
4465 
4466   /* Detect if tail is a branch.  */
4467   rtx tail_branch = NULL_RTX;
4468   rtx cond_branch = NULL_RTX;
4469   if (tail && INSN_P (tail))
4470     {
4471       tail_branch = PATTERN (tail);
4472       if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4473 	tail_branch = NULL_RTX;
4474       else
4475 	{
4476 	  cond_branch = SET_SRC (tail_branch);
4477 	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4478 	    cond_branch = NULL_RTX;
4479 	}
4480     }
4481 
4482   if (tail == head)
4483     {
4484       /* If this is empty, do nothing.  */
4485       if (!head || !needs_neutering_p (head))
4486 	return;
4487 
4488       if (cond_branch)
4489 	{
4490 	  /* If we're only doing vector single, there's no need to
4491 	     emit skip code because we'll not insert anything.  */
4492 	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4493 	    skip_mask = 0;
4494 	}
4495       else if (tail_branch)
4496 	/* Block with only unconditional branch.  Nothing to do.  */
4497 	return;
4498     }
4499 
4500   /* Insert the vector test inside the worker test.  */
4501   unsigned mode;
4502   rtx_insn *before = tail;
4503   rtx_insn *neuter_start = NULL;
4504   rtx_insn *worker_label = NULL, *vector_label = NULL;
4505   rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4506   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4507     if (GOMP_DIM_MASK (mode) & skip_mask)
4508       {
4509 	rtx_code_label *label = gen_label_rtx ();
4510 	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4511 	rtx_insn **mode_jump
4512 	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4513 	rtx_insn **mode_label
4514 	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4515 
4516 	if (!pred)
4517 	  {
4518 	    pred = gen_reg_rtx (BImode);
4519 	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4520 	  }
4521 
4522 	rtx br;
4523 	if (mode == GOMP_DIM_VECTOR)
4524 	  br = gen_br_true (pred, label);
4525 	else
4526 	  br = gen_br_true_uni (pred, label);
4527 	if (neuter_start)
4528 	  neuter_start = emit_insn_after (br, neuter_start);
4529 	else
4530 	  neuter_start = emit_insn_before (br, head);
4531 	*mode_jump = neuter_start;
4532 
4533 	LABEL_NUSES (label)++;
4534 	rtx_insn *label_insn;
4535 	if (tail_branch)
4536 	  {
4537 	    label_insn = emit_label_before (label, before);
4538 	    before = label_insn;
4539 	  }
4540 	else
4541 	  {
4542 	    label_insn = emit_label_after (label, tail);
4543 	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4544 		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4545 	      emit_insn_after (gen_exit (), label_insn);
4546 	  }
4547 
4548 	*mode_label = label_insn;
4549       }
4550 
4551   /* Now deal with propagating the branch condition.  */
4552   if (cond_branch)
4553     {
4554       rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4555 
4556       if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4557 	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4558 	{
4559 	  /* Vector mode only, do a shuffle.  */
4560 #if WORKAROUND_PTXJIT_BUG
4561 	  /* The branch condition %rcond is propagated like this:
4562 
4563 		{
4564 		    .reg .u32 %x;
4565 		    mov.u32 %x,%tid.x;
4566 		    setp.ne.u32 %rnotvzero,%x,0;
4567 		 }
4568 
4569 		 @%rnotvzero bra Lskip;
4570 		 setp.<op>.<type> %rcond,op1,op2;
4571 		 Lskip:
4572 		 selp.u32 %rcondu32,1,0,%rcond;
4573 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4574 		 setp.ne.u32 %rcond,%rcondu32,0;
4575 
4576 	     There seems to be a bug in the ptx JIT compiler (observed at driver
4577 	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
4578 	     unless %rcond is initialized to something before 'bra Lskip'.  The
4579 	     bug is not observed with ptxas from cuda 8.0.61.
4580 
4581 	     It is true that the code is non-trivial: at Lskip, %rcond is
4582 	     uninitialized in threads 1-31, and after the selp the same holds
4583 	     for %rcondu32.  But shfl propagates the defined value in thread 0
4584 	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
4585 	     0-31, and after the setp.ne %rcond is defined in threads 0-31.
4586 
4587 	     There is nothing in the PTX spec to suggest that this is wrong, or
4588 	     to explain why the extra initialization is needed.  So, we classify
4589 	     it as a JIT bug, and the extra initialization as workaround:
4590 
4591 		{
4592 		    .reg .u32 %x;
4593 		    mov.u32 %x,%tid.x;
4594 		    setp.ne.u32 %rnotvzero,%x,0;
4595 		}
4596 
4597 		+.reg .pred %rcond2;
4598 		+setp.eq.u32 %rcond2, 1, 0;
4599 
4600 		 @%rnotvzero bra Lskip;
4601 		 setp.<op>.<type> %rcond,op1,op2;
4602 		+mov.pred %rcond2, %rcond;
4603 		 Lskip:
4604 		+mov.pred %rcond, %rcond2;
4605 		 selp.u32 %rcondu32,1,0,%rcond;
4606 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4607 		 setp.ne.u32 %rcond,%rcondu32,0;
4608 	  */
4609 	  rtx_insn *label = PREV_INSN (tail);
4610 	  gcc_assert (label && LABEL_P (label));
4611 	  rtx tmp = gen_reg_rtx (BImode);
4612 	  emit_insn_before (gen_movbi (tmp, const0_rtx),
4613 			    bb_first_real_insn (from));
4614 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4615 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4616 #endif
4617 	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4618 	}
4619       else
4620 	{
4621 	  /* Includes worker mode, do spill & fill.  By construction
4622 	     we should never have worker mode only. */
4623 	  broadcast_data_t data;
4624 	  unsigned size = GET_MODE_SIZE (SImode);
4625 	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4626 	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4627 	  rtx barrier = GEN_INT (0);
4628 	  int threads = 0;
4629 
4630 	  data.base = oacc_bcast_sym;
4631 	  data.ptr = 0;
4632 
4633 	  bool use_partitioning_p = (vector && !worker
4634 				     && nvptx_mach_max_workers () > 1
4635 				     && cfun->machine->bcast_partition);
4636 	  if (use_partitioning_p)
4637 	    {
4638 	      data.base = cfun->machine->bcast_partition;
4639 	      barrier = cfun->machine->sync_bar;
4640 	      threads = nvptx_mach_vector_length ();
4641 	    }
4642 	  gcc_assert (data.base != NULL);
4643 	  gcc_assert (barrier);
4644 
4645 	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4646 	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4647 			       ? nvptx_mach_max_workers () + 1
4648 			       : 1);
4649 
4650 	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4651 	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4652 
4653 	  data.offset = 0;
4654 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4655 						    vector),
4656 			    before);
4657 
4658 	  /* Barrier so other workers can see the write.  */
4659 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4660 	  data.offset = 0;
4661 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4662 						    vector),
4663 			    tail);
4664 	  /* This barrier is needed to avoid worker zero clobbering
4665 	     the broadcast buffer before all the other workers have
4666 	     had a chance to read this instance of it.  */
4667 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4668 	}
4669 
4670       extract_insn (tail);
4671       rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4672 				 UNSPEC_BR_UNIFIED);
4673       validate_change (tail, recog_data.operand_loc[0], unsp, false);
4674     }
4675 
4676   bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4677 					    vector_label, worker_label);
4678   if (!seen_label)
4679     verify_neutering_labels (to, vector_label, worker_label);
4680 }
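
/* Sketch of what nvptx_single inserts for a worker-single region
   (illustrative PTX; register and label names invented):

	@%worker_nonzero bra $Lw_skip;	// only worker 0 falls through
	... single-mode code ...
     $Lw_skip:
	... condition broadcast, then the original branch ...

   with an analogous inner test on the vector axis when vector
   neutering is requested as well.  */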
4681 
4682 /* PAR is a parallel that is being skipped in its entirety according to
4683    MASK.  Treat this as skipping a superblock starting at forked
4684    and ending at joining.  */
4685 
4686 static void
4687 nvptx_skip_par (unsigned mask, parallel *par)
4688 {
4689   basic_block tail = par->join_block;
4690   gcc_assert (tail->preds->length () == 1);
4691 
4692   basic_block pre_tail = (*tail->preds)[0]->src;
4693   gcc_assert (pre_tail->succs->length () == 1);
4694 
4695   nvptx_single (mask, par->forked_block, pre_tail);
4696 }
4697 
4698 /* If PAR has a single inner parallel and PAR itself only contains
4699    empty entry and exit blocks, swallow the inner PAR.  */
4700 
4701 static void
4702 nvptx_optimize_inner (parallel *par)
4703 {
4704   parallel *inner = par->inner;
4705 
4706   /* We mustn't be the outer dummy par.  */
4707   if (!par->mask)
4708     return;
4709 
4710   /* We must have a single inner par.  */
4711   if (!inner || inner->next)
4712     return;
4713 
4714   /* We must only contain 2 blocks ourselves -- the head and tail of
4715      the inner par.  */
4716   if (par->blocks.length () != 2)
4717     return;
4718 
4719   /* The two partitionings must be disjoint.  As we only have vector
4720      and worker partitioning, this is sufficient to guarantee that
4721      the pars have adjacent partitioning.  */
4722   if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4723     /* This indicates malformed code generation.  */
4724     return;
4725 
4726   /* The outer forked insn should be immediately followed by the inner
4727      fork insn.  */
4728   rtx_insn *forked = par->forked_insn;
4729   rtx_insn *fork = BB_END (par->forked_block);
4730 
4731   if (NEXT_INSN (forked) != fork)
4732     return;
4733   gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4734 
4735   /* The outer joining insn must immediately follow the inner join
4736      insn.  */
4737   rtx_insn *joining = par->joining_insn;
4738   rtx_insn *join = inner->join_insn;
4739   if (NEXT_INSN (join) != joining)
4740     return;
4741 
4742   /* Preconditions met.  Swallow the inner par.  */
4743   if (dump_file)
4744     fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4745 	     inner->mask, inner->forked_block->index,
4746 	     inner->join_block->index,
4747 	     par->mask, par->forked_block->index, par->join_block->index);
4748 
4749   par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4750 
4751   par->blocks.reserve (inner->blocks.length ());
4752   while (inner->blocks.length ())
4753     par->blocks.quick_push (inner->blocks.pop ());
4754 
4755   par->inner = inner->inner;
4756   inner->inner = NULL;
4757 
4758   delete inner;
4759 }
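
/* Before/after sketch of the merge (block indices invented):

	par mask=WORKER, blocks {3,6}	      par mask=WORKER|VECTOR,
	  par mask=VECTOR, blocks {4,5}  =>	blocks {3,6,4,5}

   a worker par whose only blocks are the fork and join blocks of a
   single inner vector par collapses into one worker+vector par,
   saving a redundant level of neutering.  */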
4760 
4761 /* Process the parallel PAR and all its contained
4762    parallels.  We do everything but the neutering.  Return mask of
4763    partitioned modes used within this parallel.  */
4764 
4765 static unsigned
4766 nvptx_process_pars (parallel *par)
4767 {
4768   if (nvptx_optimize)
4769     nvptx_optimize_inner (par);
4770 
4771   unsigned inner_mask = par->mask;
4772 
4773   /* Do the inner parallels first.  */
4774   if (par->inner)
4775     {
4776       par->inner_mask = nvptx_process_pars (par->inner);
4777       inner_mask |= par->inner_mask;
4778     }
4779 
4780   bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4781   bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4782   bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4783 		      && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4784 
4785   if (worker || large_vector)
4786     {
4787       nvptx_shared_propagate (false, is_call, par->forked_block,
4788 			      par->forked_insn, !worker);
4789       bool no_prop_p
4790 	= nvptx_shared_propagate (true, is_call, par->forked_block,
4791 				  par->fork_insn, !worker);
4792       bool empty_loop_p
4793 	= !is_call && (NEXT_INSN (par->forked_insn)
4794 		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
4795       rtx barrier = GEN_INT (0);
4796       int threads = 0;
4797 
4798       if (!worker && cfun->machine->sync_bar)
4799 	{
4800 	  barrier = cfun->machine->sync_bar;
4801 	  threads = nvptx_mach_vector_length ();
4802 	}
4803 
4804       if (no_prop_p && empty_loop_p)
4805 	;
4806       else if (no_prop_p && is_call)
4807 	;
4808       else
4809 	{
4810 	  /* Insert begin and end synchronizations.  */
4811 	  emit_insn_before (nvptx_cta_sync (barrier, threads),
4812 			    par->forked_insn);
4813 	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4814 	}
4815     }
4816   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4817     nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4818 
4819   /* Now do siblings.  */
4820   if (par->next)
4821     inner_mask |= nvptx_process_pars (par->next);
4822   return inner_mask;
4823 }
4824 
4825 /* Neuter the parallel described by PAR.  We recurse in depth-first
4826    order.  MODES are the partitioning of the execution and OUTER is
4827    the partitioning of the parallels we are contained in.  */
4828 
4829 static void
4830 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4831 {
4832   unsigned me = (par->mask
4833 		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4834 		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4835   unsigned  skip_mask = 0, neuter_mask = 0;
4836 
4837   if (par->inner)
4838     nvptx_neuter_pars (par->inner, modes, outer | me);
4839 
4840   for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4841     {
4842       if ((outer | me) & GOMP_DIM_MASK (mode))
4843 	{} /* Mode is partitioned: no neutering.  */
4844       else if (!(modes & GOMP_DIM_MASK (mode)))
4845 	{} /* Mode is not used: nothing to do.  */
4846       else if (par->inner_mask & GOMP_DIM_MASK (mode)
4847 	       || !par->forked_insn)
4848 	/* Partitioned in inner parallels, or we're not partitioned
4849 	   at all: neuter individual blocks.  */
4850 	neuter_mask |= GOMP_DIM_MASK (mode);
4851       else if (!par->parent || !par->parent->forked_insn
4852 	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4853 	/* Parent isn't a parallel, or already contains this
4854 	   partitioning: skip the parallel at this level.  */
4855 	skip_mask |= GOMP_DIM_MASK (mode);
4856       else
4857 	{} /* Parent will skip this parallel itself.  */
4858     }
4859 
4860   if (neuter_mask)
4861     {
4862       int ix, len;
4863 
4864       if (nvptx_optimize)
4865 	{
4866 	  /* Neuter whole SESE regions.  */
4867 	  bb_pair_vec_t regions;
4868 
4869 	  nvptx_find_sese (par->blocks, regions);
4870 	  len = regions.length ();
4871 	  for (ix = 0; ix != len; ix++)
4872 	    {
4873 	      basic_block from = regions[ix].first;
4874 	      basic_block to = regions[ix].second;
4875 
4876 	      if (from)
4877 		nvptx_single (neuter_mask, from, to);
4878 	      else
4879 		gcc_assert (!to);
4880 	    }
4881 	}
4882       else
4883 	{
4884 	  /* Neuter each BB individually.  */
4885 	  len = par->blocks.length ();
4886 	  for (ix = 0; ix != len; ix++)
4887 	    {
4888 	      basic_block block = par->blocks[ix];
4889 
4890 	      nvptx_single (neuter_mask, block, block);
4891 	    }
4892 	}
4893     }
4894 
4895   if (skip_mask)
4896     nvptx_skip_par (skip_mask, par);
4897 
4898   if (par->next)
4899     nvptx_neuter_pars (par->next, modes, outer);
4900 }
4901 
4902 static void
4903 populate_offload_attrs (offload_attrs *oa)
4904 {
4905   tree attr = oacc_get_fn_attrib (current_function_decl);
4906   tree dims = TREE_VALUE (attr);
4907   unsigned ix;
4908 
4909   oa->mask = 0;
4910 
4911   for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4912     {
4913       tree t = TREE_VALUE (dims);
4914       int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4915       tree allowed = TREE_PURPOSE (dims);
4916 
4917       if (size != 1 && !(allowed && integer_zerop (allowed)))
4918 	oa->mask |= GOMP_DIM_MASK (ix);
4919 
4920       switch (ix)
4921 	{
4922 	case GOMP_DIM_GANG:
4923 	  oa->num_gangs = size;
4924 	  break;
4925 
4926 	case GOMP_DIM_WORKER:
4927 	  oa->num_workers = size;
4928 	  break;
4929 
4930 	case GOMP_DIM_VECTOR:
4931 	  oa->vector_length = size;
4932 	  break;
4933 	}
4934     }
4935 }
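
/* Worked example (values invented; the attribute itself is built by
   the OpenACC middle end): a routine whose "oacc function" dimensions
   are (32, 1, dynamic) yields num_gangs == 32, num_workers == 1 and
   vector_length == -1, with the gang and vector bits set in MASK.  An
   axis stays out of the mask when its size is 1 or when its
   TREE_PURPOSE forces it to zero.  */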
4936 
4937 #if WORKAROUND_PTXJIT_BUG_2
4938 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
4939    is needed in the nvptx target because the branches generated for
4940    partitioning are NONJUMP_INSN_P, not JUMP_P.  */
4941 
4942 static rtx
4943 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4944 {
4945   rtx pat;
4946   if ((strict && !JUMP_P (insn))
4947       || (!strict && !INSN_P (insn)))
4948     return NULL_RTX;
4949   pat = PATTERN (insn);
4950 
4951   /* The set is allowed to appear either as the insn pattern or
4952      the first set in a PARALLEL.  */
4953   if (GET_CODE (pat) == PARALLEL)
4954     pat = XVECEXP (pat, 0, 0);
4955   if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4956     return pat;
4957 
4958   return NULL_RTX;
4959 }
4960 
4961 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
4962 
4963 static rtx
4964 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4965 {
4966   rtx x = nvptx_pc_set (insn, strict);
4967 
4968   if (!x)
4969     return NULL_RTX;
4970   x = SET_SRC (x);
4971   if (GET_CODE (x) == LABEL_REF)
4972     return x;
4973   if (GET_CODE (x) != IF_THEN_ELSE)
4974     return NULL_RTX;
4975   if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4976     return XEXP (x, 1);
4977   if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4978     return XEXP (x, 2);
4979   return NULL_RTX;
4980 }
4981 
4982 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4983    insn in between the branch and the label.  This works around a JIT bug
4984    observed at driver version 384.111, at -O0 for sm_50.  */
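
/* Sketch of the problematic shape (illustrative PTX; the register and
   label names are made up):

	@%p0	bra	$L2;
     $L2:

   With no PTX instruction between the branch and its target label, the
   JIT miscompiled; emitting a fake nop in front of the label breaks up
   the empty span.  */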
4985 
4986 static void
4987 prevent_branch_around_nothing (void)
4988 {
4989   rtx_insn *seen_label = NULL;
4990   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4991     {
4992       if (INSN_P (insn) && condjump_p (insn))
4993 	{
4994 	  seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4995 	  continue;
4996 	}
4997 
4998       if (seen_label == NULL)
4999 	continue;
5000 
5001       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5002 	continue;
5003 
5004       if (INSN_P (insn))
5005 	switch (recog_memoized (insn))
5006 	  {
5007 	  case CODE_FOR_nvptx_fork:
5008 	  case CODE_FOR_nvptx_forked:
5009 	  case CODE_FOR_nvptx_joining:
5010 	  case CODE_FOR_nvptx_join:
5011 	    continue;
5012 	  default:
5013 	    seen_label = NULL;
5014 	    continue;
5015 	  }
5016 
5017       if (LABEL_P (insn) && insn == seen_label)
5018 	emit_insn_before (gen_fake_nop (), insn);
5019 
5020       seen_label = NULL;
5021     }
5022 }
5023 #endif
5024 
5025 #if WORKAROUND_PTXJIT_BUG_3
5026 /* Insert two membar.cta insns in between two subsequent bar.sync insns.  This
5027    works around a hang observed at driver version 390.48 for sm_50.  */
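
/* Illustration (an addition): a generated sequence

	bar.sync	0;
	bar.sync	0;

   becomes

	bar.sync	0;
	membar.cta;
	membar.cta;
	bar.sync	0;

   Fork and join markers between the two bar.sync insns do not reset
   the detection, since they expand to no PTX instructions.  */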
5028 
5029 static void
5030 workaround_barsyncs (void)
5031 {
5032   bool seen_barsync = false;
5033   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5034     {
5035       if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
5036 	{
5037 	  if (seen_barsync)
5038 	    {
5039 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
5040 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
5041 	    }
5042 
5043 	  seen_barsync = true;
5044 	  continue;
5045 	}
5046 
5047       if (!seen_barsync)
5048 	continue;
5049 
5050       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5051 	continue;
5052       else if (INSN_P (insn))
5053 	switch (recog_memoized (insn))
5054 	  {
5055 	  case CODE_FOR_nvptx_fork:
5056 	  case CODE_FOR_nvptx_forked:
5057 	  case CODE_FOR_nvptx_joining:
5058 	  case CODE_FOR_nvptx_join:
5059 	    continue;
5060 	  default:
5061 	    break;
5062 	  }
5063 
5064       seen_barsync = false;
5065     }
5066 }
5067 #endif
5068 
5069 /* PTX-specific reorganization
5070    - Split blocks at fork and join instructions
5071    - Compute live registers
5072    - Mark now-unused registers, so function begin doesn't declare
5073      unused registers.
5074    - Insert state propagation when entering partitioned mode
5075    - Insert neutering instructions when in single mode
5076    - Replace subregs with suitable sequences.
5077 */
5078 
5079 static void
5080 nvptx_reorg (void)
5081 {
5082   /* We are freeing block_for_insn in the toplev to keep compatibility
5083      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
5084   compute_bb_for_insn ();
5085 
5086   thread_prologue_and_epilogue_insns ();
5087 
5088   /* Split blocks and record interesting unspecs.  */
5089   bb_insn_map_t bb_insn_map;
5090 
5091   nvptx_split_blocks (&bb_insn_map);
5092 
5093   /* Compute live regs */
5094   df_clear_flags (DF_LR_RUN_DCE);
5095   df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5096   df_live_add_problem ();
5097   df_live_set_all_dirty ();
5098   df_analyze ();
5099   regstat_init_n_sets_and_refs ();
5100 
5101   if (dump_file)
5102     df_dump (dump_file);
5103 
5104   /* Mark unused regs as unused.  */
5105   int max_regs = max_reg_num ();
5106   for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5107     if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5108       regno_reg_rtx[i] = const0_rtx;
5109 
5110   /* Determine launch dimensions of the function.  If it is not an
5111      offloaded function (i.e. this is a regular compiler), the
5112      function has no neutering.  */
5113   tree attr = oacc_get_fn_attrib (current_function_decl);
5114   if (attr)
5115     {
5116       /* If we determined this mask before RTL expansion, we could
5117 	 elide emission of some levels of forks and joins.  */
5118       offload_attrs oa;
5119 
5120       populate_offload_attrs (&oa);
5121 
5122       /* If there is worker neutering, there must be vector
5123 	 neutering.  Otherwise the hardware will fail.  */
5124       gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5125 		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5126 
5127       /* Discover & process partitioned regions.  */
5128       parallel *pars = nvptx_discover_pars (&bb_insn_map);
5129       nvptx_process_pars (pars);
5130       nvptx_neuter_pars (pars, oa.mask, 0);
5131       delete pars;
5132     }
5133 
5134   /* Replace subregs.  */
5135   nvptx_reorg_subreg ();
5136 
5137   if (TARGET_UNIFORM_SIMT)
5138     nvptx_reorg_uniform_simt ();
5139 
5140 #if WORKAROUND_PTXJIT_BUG_2
5141   prevent_branch_around_nothing ();
5142 #endif
5143 
5144 #if WORKAROUND_PTXJIT_BUG_3
5145   workaround_barsyncs ();
5146 #endif
5147 
5148   regstat_free_n_sets_and_refs ();
5149 
5150   df_finish_pass (true);
5151 }
5152 
5153 /* Handle a "kernel" attribute; arguments as in
5154    struct attribute_spec.handler.  */
5155 
5156 static tree
5157 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5158 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5159 {
5160   tree decl = *node;
5161 
5162   if (TREE_CODE (decl) != FUNCTION_DECL)
5163     {
5164       error ("%qE attribute only applies to functions", name);
5165       *no_add_attrs = true;
5166     }
5167   else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5168     {
5169       error ("%qE attribute requires a void return type", name);
5170       *no_add_attrs = true;
5171     }
5172 
5173   return NULL_TREE;
5174 }
5175 
5176 /* Handle a "shared" attribute; arguments as in
5177    struct attribute_spec.handler.  */
5178 
5179 static tree
5180 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5181 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5182 {
5183   tree decl = *node;
5184 
5185   if (TREE_CODE (decl) != VAR_DECL)
5186     {
5187       error ("%qE attribute only applies to variables", name);
5188       *no_add_attrs = true;
5189     }
5190   else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5191     {
5192       error ("%qE attribute not allowed with auto storage class", name);
5193       *no_add_attrs = true;
5194     }
5195 
5196   return NULL_TREE;
5197 }
5198 
5199 /* Table of valid machine attributes.  */
5200 static const struct attribute_spec nvptx_attribute_table[] =
5201 {
5202   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5203        affects_type_identity, handler, exclude } */
5204   { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
5205     NULL },
5206   { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
5207     NULL },
5208   { NULL, 0, 0, false, false, false, false, NULL, NULL }
5209 };
5210 
5211 /* Limit vector alignments to BIGGEST_ALIGNMENT.  */
5212 
5213 static HOST_WIDE_INT
5214 nvptx_vector_alignment (const_tree type)
5215 {
5216   unsigned HOST_WIDE_INT align;
5217   tree size = TYPE_SIZE (type);
5218 
5219   /* Ensure align is not bigger than BIGGEST_ALIGNMENT.  */
5220   if (tree_fits_uhwi_p (size))
5221     {
5222       align = tree_to_uhwi (size);
5223       align = MIN (align, BIGGEST_ALIGNMENT);
5224     }
5225   else
5226     align = BIGGEST_ALIGNMENT;
5227 
5228   /* Ensure align is not smaller than mode alignment.  */
5229   align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5230 
5231   return align;
5232 }
5233 
5234 /* Indicate that INSN cannot be duplicated.   */
5235 
5236 static bool
5237 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5238 {
5239   switch (recog_memoized (insn))
5240     {
5241     case CODE_FOR_nvptx_shufflesi:
5242     case CODE_FOR_nvptx_shufflesf:
5243     case CODE_FOR_nvptx_barsync:
5244     case CODE_FOR_nvptx_fork:
5245     case CODE_FOR_nvptx_forked:
5246     case CODE_FOR_nvptx_joining:
5247     case CODE_FOR_nvptx_join:
5248       return true;
5249     default:
5250       return false;
5251     }
5252 }
5253 
5254 /* Section anchors do not work.  Initialization for flag_section_anchor
5255    probes the existence of the anchoring target hooks and prevents
5256    anchoring if they don't exist.  However, we may be being used with
5257    a host-side compiler that does support anchoring, and hence see
5258    the anchor flag set (as it's not recalculated).  So provide an
5259    implementation denying anchoring.  */
5260 
5261 static bool
5262 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5263 {
5264   return false;
5265 }
5266 
5267 /* Record a symbol for mkoffload to enter into the mapping table.  */
5268 
5269 static void
5270 nvptx_record_offload_symbol (tree decl)
5271 {
5272   switch (TREE_CODE (decl))
5273     {
5274     case VAR_DECL:
5275       fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5276 	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5277       break;
5278 
5279     case FUNCTION_DECL:
5280       {
5281 	tree attr = oacc_get_fn_attrib (decl);
5282 	/* OpenMP offloading does not set this attribute.  */
5283 	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5284 
5285 	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5286 		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5287 
5288 	for (; dims; dims = TREE_CHAIN (dims))
5289 	  {
5290 	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5291 
5292 	    gcc_assert (!TREE_PURPOSE (dims));
5293 	    fprintf (asm_out_file, ", %#x", size);
5294 	  }
5295 
5296 	fprintf (asm_out_file, "\n");
5297       }
5298       break;
5299 
5300     default:
5301       gcc_unreachable ();
5302     }
5303 }
5304 
5305 /* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
5306    at the start of a file.  */
5307 
5308 static void
5309 nvptx_file_start (void)
5310 {
5311   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5312   fputs ("\t.version\t3.1\n", asm_out_file);
5313   if (TARGET_SM35)
5314     fputs ("\t.target\tsm_35\n", asm_out_file);
5315   else
5316     fputs ("\t.target\tsm_30\n", asm_out_file);
5317   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5318   fputs ("// END PREAMBLE\n", asm_out_file);
5319 }
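
/* For example, a 64-bit compile targeting sm_35 begins its assembler
   output with:

   // BEGIN PREAMBLE
	.version	3.1
	.target	sm_35
	.address_size 64
   // END PREAMBLE  */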
5320 
5321 /* Emit a declaration for a worker and vector-level buffer in .shared
5322    memory.  */
5323 
5324 static void
5325 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5326 {
5327   const char *name = XSTR (sym, 0);
5328 
5329   write_var_marker (file, true, false, name);
5330   fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5331 	   align, name, size);
5332 }
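
/* A typical emitted declaration (illustrative; the symbol name,
   alignment and size depend on the reductions and broadcasts present
   in the translation unit):

   .shared .align 8 .u8 __worker_red[8];  */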
5333 
5334 /* Write out the function declarations we've collected and declare storage
5335    for the broadcast buffer.  */
5336 
5337 static void
5338 nvptx_file_end (void)
5339 {
5340   hash_table<tree_hasher>::iterator iter;
5341   tree decl;
5342   FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5343     nvptx_record_fndecl (decl);
5344   fputs (func_decls.str().c_str(), asm_out_file);
5345 
5346   if (oacc_bcast_size)
5347     write_shared_buffer (asm_out_file, oacc_bcast_sym,
5348 			 oacc_bcast_align, oacc_bcast_size);
5349 
5350   if (worker_red_size)
5351     write_shared_buffer (asm_out_file, worker_red_sym,
5352 			 worker_red_align, worker_red_size);
5353 
5354   if (vector_red_size)
5355     write_shared_buffer (asm_out_file, vector_red_sym,
5356 			 vector_red_align, vector_red_size);
5357 
5358   if (need_softstack_decl)
5359     {
5360       write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5361       /* 32 is the maximum number of warps in a block.  Even though it's an
5362          external declaration, emit the array size explicitly; otherwise, it
5363          may fail at PTX JIT time if the definition is later in link order.  */
5364       fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5365 	       POINTER_SIZE);
5366     }
5367   if (need_unisimt_decl)
5368     {
5369       write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5370       fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5371     }
5372 }
5373 
5374 /* Expander for the shuffle builtins.  */
5375 
5376 static rtx
5377 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5378 {
5379   if (ignore)
5380     return target;
5381 
5382   rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5383 			 NULL_RTX, mode, EXPAND_NORMAL);
5384   if (!REG_P (src))
5385     src = copy_to_mode_reg (mode, src);
5386 
5387   rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5388 			 NULL_RTX, SImode, EXPAND_NORMAL);
5389   rtx op = expand_expr (CALL_EXPR_ARG  (exp, 2),
5390 			NULL_RTX, SImode, EXPAND_NORMAL);
5391 
5392   if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5393     idx = copy_to_mode_reg (SImode, idx);
5394 
5395   rtx pat = nvptx_gen_shuffle (target, src, idx,
5396 			       (nvptx_shuffle_kind) INTVAL (op));
5397   if (pat)
5398     emit_insn (pat);
5399 
5400   return target;
5401 }
5402 
5403 const char *
5404 nvptx_output_red_partition (rtx dst, rtx offset)
5405 {
5406   const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5407   const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5408 
5409   if (offset == const0_rtx)
5410     fprintf (asm_out_file, zero_offset, REGNO (dst),
5411 	     REGNO (cfun->machine->red_partition));
5412   else
5413     fprintf (asm_out_file, with_offset, REGNO (dst),
5414 	     REGNO (cfun->machine->red_partition), UINTVAL (offset));
5415 
5416   return "";
5417 }
5418 
5419 /* Shared-memory reduction address expander.  */
5420 
5421 static rtx
5422 nvptx_expand_shared_addr (tree exp, rtx target,
5423 			  machine_mode ARG_UNUSED (mode), int ignore,
5424 			  int vector)
5425 {
5426   if (ignore)
5427     return target;
5428 
5429   unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5430   unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5431   unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5432   rtx addr = worker_red_sym;
5433 
5434   if (vector)
5435     {
5436       offload_attrs oa;
5437 
5438       populate_offload_attrs (&oa);
5439 
5440       unsigned int psize = ROUND_UP (size + offset, align);
5441       unsigned int pnum = nvptx_mach_max_workers ();
5442       vector_red_partition = MAX (vector_red_partition, psize);
5443       vector_red_size = MAX (vector_red_size, psize * pnum);
5444       vector_red_align = MAX (vector_red_align, align);
5445 
5446       if (cfun->machine->red_partition == NULL)
5447 	cfun->machine->red_partition = gen_reg_rtx (Pmode);
5448 
5449       addr = gen_reg_rtx (Pmode);
5450       emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5451     }
5452   else
5453     {
5454       worker_red_align = MAX (worker_red_align, align);
5455       worker_red_size = MAX (worker_red_size, size + offset);
5456 
5457       if (offset)
5458 	{
5459 	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5460 	  addr = gen_rtx_CONST (Pmode, addr);
5461 	}
5462    }
5463 
5464   emit_move_insn (target, addr);
5465   return target;
5466 }
5467 
5468 /* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
5469    not require taking the address of any object, other than the memory
5470    cell being operated on.  */
5471 
5472 static rtx
5473 nvptx_expand_cmp_swap (tree exp, rtx target,
5474 		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5475 {
5476   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5477 
5478   if (!target)
5479     target = gen_reg_rtx (mode);
5480 
5481   rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5482 			 NULL_RTX, Pmode, EXPAND_NORMAL);
5483   rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5484 			 NULL_RTX, mode, EXPAND_NORMAL);
5485   rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5486 			 NULL_RTX, mode, EXPAND_NORMAL);
5487   rtx pat;
5488 
5489   mem = gen_rtx_MEM (mode, mem);
5490   if (!REG_P (cmp))
5491     cmp = copy_to_mode_reg (mode, cmp);
5492   if (!REG_P (src))
5493     src = copy_to_mode_reg (mode, src);
5494 
5495   if (mode == SImode)
5496     pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5497   else
5498     pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5499 
5500   emit_insn (pat);
5501 
5502   return target;
5503 }
5504 
5505 
5506 /* Codes for all the NVPTX builtins.  */
5507 enum nvptx_builtins
5508 {
5509   NVPTX_BUILTIN_SHUFFLE,
5510   NVPTX_BUILTIN_SHUFFLELL,
5511   NVPTX_BUILTIN_WORKER_ADDR,
5512   NVPTX_BUILTIN_VECTOR_ADDR,
5513   NVPTX_BUILTIN_CMP_SWAP,
5514   NVPTX_BUILTIN_CMP_SWAPLL,
5515   NVPTX_BUILTIN_MAX
5516 };
5517 
5518 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5519 
5520 /* Return the NVPTX builtin for CODE.  */
5521 
5522 static tree
5523 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5524 {
5525   if (code >= NVPTX_BUILTIN_MAX)
5526     return error_mark_node;
5527 
5528   return nvptx_builtin_decls[code];
5529 }
5530 
5531 /* Set up all builtin functions for this target.  */
5532 
5533 static void
5534 nvptx_init_builtins (void)
5535 {
5536 #define DEF(ID, NAME, T)						\
5537   (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
5538    = add_builtin_function ("__builtin_nvptx_" NAME,			\
5539 			   build_function_type_list T,			\
5540 			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5541 #define ST sizetype
5542 #define UINT unsigned_type_node
5543 #define LLUINT long_long_unsigned_type_node
5544 #define PTRVOID ptr_type_node
5545 
5546   DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5547   DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5548   DEF (WORKER_ADDR, "worker_addr",
5549        (PTRVOID, ST, UINT, UINT, NULL_TREE));
5550   DEF (VECTOR_ADDR, "vector_addr",
5551        (PTRVOID, ST, UINT, UINT, NULL_TREE));
5552   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5553   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5554 
5555 #undef DEF
5556 #undef ST
5557 #undef UINT
5558 #undef LLUINT
5559 #undef PTRVOID
5560 }
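
/* Usage sketch (an addition; these builtins are synthesized by the
   compiler itself rather than written by users): the vector shuffle
   generated later in this file is equivalent to

     unsigned other = __builtin_nvptx_shuffle (v, 16, kind);

   where kind is an nvptx_shuffle_kind value such as SHUFFLE_DOWN.  */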
5561 
5562 /* Expand an expression EXP that calls a built-in function,
5563    with result going to TARGET if that's convenient
5564    (and in mode MODE if that's convenient).
5565    SUBTARGET may be used as the target for computing one of EXP's operands.
5566    IGNORE is nonzero if the value is to be ignored.  */
5567 
5568 static rtx
5569 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5570 		      machine_mode mode, int ignore)
5571 {
5572   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5573   switch (DECL_MD_FUNCTION_CODE (fndecl))
5574     {
5575     case NVPTX_BUILTIN_SHUFFLE:
5576     case NVPTX_BUILTIN_SHUFFLELL:
5577       return nvptx_expand_shuffle (exp, target, mode, ignore);
5578 
5579     case NVPTX_BUILTIN_WORKER_ADDR:
5580       return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5581 
5582     case NVPTX_BUILTIN_VECTOR_ADDR:
5583       return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5584 
5585     case NVPTX_BUILTIN_CMP_SWAP:
5586     case NVPTX_BUILTIN_CMP_SWAPLL:
5587       return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5588 
5589     default: gcc_unreachable ();
5590     }
5591 }
5592 
5593 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
5594 
5595 static int
5596 nvptx_simt_vf ()
5597 {
5598   return PTX_WARP_SIZE;
5599 }
5600 
5601 /* Return 1 if TRAIT NAME is present in the OpenMP context's
5602    device trait set, return 0 if not present in any OpenMP context in the
5603    whole translation unit, or -1 if not present in the current OpenMP context
5604    but might be present in another OpenMP context in the same TU.  */
5605 
5606 int
5607 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5608 				const char *name)
5609 {
5610   switch (trait)
5611     {
5612     case omp_device_kind:
5613       return strcmp (name, "gpu") == 0;
5614     case omp_device_arch:
5615       return strcmp (name, "nvptx") == 0;
5616     case omp_device_isa:
5617       if (strcmp (name, "sm_30") == 0)
5618 	return !TARGET_SM35;
5619       if (strcmp (name, "sm_35") == 0)
5620 	return TARGET_SM35;
5621       return 0;
5622     default:
5623       gcc_unreachable ();
5624     }
5625 }
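
/* For example (illustrative), a context selector such as
   match (device={kind(gpu), arch(nvptx), isa(sm_30)}) matches only
   when compiling without -misa=sm_35, while isa(sm_70) yields 0 for
   any nvptx compilation.  */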
5626 
5627 static bool
5628 nvptx_welformed_vector_length_p (int l)
5629 {
5630   gcc_assert (l > 0);
5631   return l % PTX_WARP_SIZE == 0;
5632 }
5633 
5634 static void
5635 nvptx_apply_dim_limits (int dims[])
5636 {
5637   /* Check that the vector_length is not too large.  */
5638   if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5639     dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5640 
5641   /* Check that the number of workers is not too large.  */
5642   if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5643     dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5644 
5645   /* Ensure that num_worker * vector_length <= cta size.  */
5646   if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
5647       && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5648     dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5649 
5650   /* If we need a per-worker barrier ... .  */
5651   if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
5652       && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5653     /* Don't use more barriers than available.  */
5654     dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5655 				 PTX_NUM_PER_WORKER_BARRIERS);
5656 }
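
/* Worked example (an addition): for dims[] = {some gangs, 32 workers,
   vector_length 128}, the first two caps are met, but 32 * 128 exceeds
   PTX_CTA_SIZE (1024), so the vector length is demoted to
   PTX_WARP_SIZE (32); the per-worker-barrier clamp then no longer
   triggers, as the vector length no longer exceeds the warp size.  */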
5657 
5658 /* Return true if FNDECL contains calls to vector-partitionable routines.  */
5659 
5660 static bool
5661 has_vector_partitionable_routine_calls_p (tree fndecl)
5662 {
5663   if (!fndecl)
5664     return false;
5665 
5666   basic_block bb;
5667   FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5668     for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5669 	 gsi_next_nondebug (&i))
5670       {
5671 	gimple *stmt = gsi_stmt (i);
5672 	if (gimple_code (stmt) != GIMPLE_CALL)
5673 	  continue;
5674 
5675 	tree callee = gimple_call_fndecl (stmt);
5676 	if (!callee)
5677 	  continue;
5678 
5679 	tree attrs  = oacc_get_fn_attrib (callee);
5680 	if (attrs == NULL_TREE)
5681 	  return false;
5682 
5683 	int partition_level = oacc_fn_attrib_level (attrs);
5684 	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5685 	if (!seq_routine_p)
5686 	  return true;
5687       }
5688 
5689   return false;
5690 }
5691 
5692 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5693    DIMS has changed.  */
5694 
5695 static void
5696 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
5697 {
5698   bool oacc_default_dims_p = false;
5699   bool oacc_min_dims_p = false;
5700   bool offload_region_p = false;
5701   bool routine_p = false;
5702   bool routine_seq_p = false;
5703   int default_vector_length = -1;
5704 
5705   if (decl == NULL_TREE)
5706     {
5707       if (fn_level == -1)
5708 	oacc_default_dims_p = true;
5709       else if (fn_level == -2)
5710 	oacc_min_dims_p = true;
5711       else
5712 	gcc_unreachable ();
5713     }
5714   else if (fn_level == -1)
5715     offload_region_p = true;
5716   else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5717     {
5718       routine_p = true;
5719       routine_seq_p = fn_level == GOMP_DIM_MAX;
5720     }
5721   else
5722     gcc_unreachable ();
5723 
5724   if (oacc_min_dims_p)
5725     {
5726       gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5727       gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5728       gcc_assert (dims[GOMP_DIM_GANG] == 1);
5729 
5730       dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5731       return;
5732     }
5733 
5734   if (routine_p)
5735     {
5736       if (!routine_seq_p)
5737 	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5738 
5739       return;
5740     }
5741 
5742   if (oacc_default_dims_p)
5743     {
5744       /* -1  : not set
5745 	  0  : set at runtime, f.i. -fopenacc-dims=-
5746          >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
5747       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5748       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5749       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5750 
5751       /* But -fopenacc-dims=- is not yet supported on trunk.  */
5752       gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5753       gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5754       gcc_assert (dims[GOMP_DIM_GANG] != 0);
5755     }
5756 
5757   if (offload_region_p)
5758     {
5759       /* -1   : not set
5760 	  0   : set using variable, f.i. num_gangs (n)
5761 	  >= 1: set using constant, f.i. num_gangs (1).  */
5762       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5763       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5764       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5765     }
5766 
5767   if (offload_region_p)
5768     default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5769   else
5770     /* oacc_default_dims_p.  */
5771     default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5772 
5773   int old_dims[GOMP_DIM_MAX];
5774   unsigned int i;
5775   for (i = 0; i < GOMP_DIM_MAX; ++i)
5776     old_dims[i] = dims[i];
5777 
5778   const char *vector_reason = NULL;
5779   if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5780     {
5781       default_vector_length = PTX_WARP_SIZE;
5782 
5783       if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5784 	{
5785 	  vector_reason = G_("using vector_length (%d) due to call to"
5786 			     " vector-partitionable routine, ignoring %d");
5787 	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5788 	}
5789     }
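
  /* E.g. (illustrative): vector_length (64) on a region that calls a
     vector-partitionable routine is demoted to PTX_WARP_SIZE here, and
     the warning further below reports the ignored value 64.  */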
5790 
5791   if (dims[GOMP_DIM_VECTOR] == 0)
5792     {
5793       vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5794       dims[GOMP_DIM_VECTOR] = default_vector_length;
5795     }
5796 
5797   if (dims[GOMP_DIM_VECTOR] > 0
5798       && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5799     dims[GOMP_DIM_VECTOR] = default_vector_length;
5800 
5801   nvptx_apply_dim_limits (dims);
5802 
5803   if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5804     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5805 		vector_reason != NULL
5806 		? vector_reason
5807 		: G_("using vector_length (%d), ignoring %d"),
5808 		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5809 
5810   if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5811     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5812 		G_("using num_workers (%d), ignoring %d"),
5813 		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5814 
5815   if (oacc_default_dims_p)
5816     {
5817       if (dims[GOMP_DIM_VECTOR] < 0)
5818 	dims[GOMP_DIM_VECTOR] = default_vector_length;
5819       if (dims[GOMP_DIM_WORKER] < 0)
5820 	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5821       if (dims[GOMP_DIM_GANG] < 0)
5822 	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5823       nvptx_apply_dim_limits (dims);
5824     }
5825 
5826   if (offload_region_p)
5827     {
5828       for (i = 0; i < GOMP_DIM_MAX; i++)
5829 	{
5830 	  if (!(dims[i] < 0))
5831 	    continue;
5832 
5833 	  if ((used & GOMP_DIM_MASK (i)) == 0)
5834 	    /* Function oacc_validate_dims will apply the minimal dimension.  */
5835 	    continue;
5836 
5837 	  dims[i] = (i == GOMP_DIM_VECTOR
5838 		     ? default_vector_length
5839 		     : oacc_get_default_dim (i));
5840 	}
5841 
5842       nvptx_apply_dim_limits (dims);
5843     }
5844 }
5845 
5846 /* Validate compute dimensions of an OpenACC offload or routine, fill
5847    in non-unity defaults.  FN_LEVEL indicates the level at which a
5848    routine might spawn a loop.  It is negative for non-routines.  If
5849    DECL is null, we are validating the default dimensions.  */
5850 
5851 static bool
5852 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5853 {
5854   int old_dims[GOMP_DIM_MAX];
5855   unsigned int i;
5856 
5857   for (i = 0; i < GOMP_DIM_MAX; ++i)
5858     old_dims[i] = dims[i];
5859 
5860   nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5861 
5862   gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5863   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5864     gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5865 
5866   for (i = 0; i < GOMP_DIM_MAX; ++i)
5867     if (old_dims[i] != dims[i])
5868       return true;
5869 
5870   return false;
5871 }
5872 
5873 /* Return maximum dimension size, or zero for unbounded.  */
5874 
5875 static int
5876 nvptx_dim_limit (int axis)
5877 {
5878   switch (axis)
5879     {
5880     case GOMP_DIM_VECTOR:
5881       return PTX_MAX_VECTOR_LENGTH;
5882 
5883     default:
5884       break;
5885     }
5886   return 0;
5887 }
5888 
5889 /* Determine whether fork & joins are needed.  */
5890 
5891 static bool
5892 nvptx_goacc_fork_join (gcall *call, const int dims[],
5893 		       bool ARG_UNUSED (is_fork))
5894 {
5895   tree arg = gimple_call_arg (call, 2);
5896   unsigned axis = TREE_INT_CST_LOW (arg);
5897 
5898   /* We only care about worker and vector partitioning.  */
5899   if (axis < GOMP_DIM_WORKER)
5900     return false;
5901 
5902   /* If the size is 1, there's no partitioning.  */
5903   if (dims[axis] == 1)
5904     return false;
5905 
5906   return true;
5907 }
5908 
5909 /* Generate a PTX builtin function call that returns the address in
5910    the worker reduction buffer at OFFSET.  TYPE is the type of the
5911    data at that location.  */
5912 
5913 static tree
5914 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5915 {
5916   enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5917   if (vector)
5918     addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5919   machine_mode mode = TYPE_MODE (type);
5920   tree fndecl = nvptx_builtin_decl (addr_dim, true);
5921   tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5922   tree align = build_int_cst (unsigned_type_node,
5923 			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5924   tree call = build_call_expr (fndecl, 3, offset, size, align);
5925 
5926   return fold_convert (build_pointer_type (type), call);
5927 }
5928 
5929 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR.  This function
5930    will cast the variable if necessary.  */
5931 
5932 static void
5933 nvptx_generate_vector_shuffle (location_t loc,
5934 			       tree dest_var, tree var, unsigned shift,
5935 			       gimple_seq *seq)
5936 {
5937   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5938   tree_code code = NOP_EXPR;
5939   tree arg_type = unsigned_type_node;
5940   tree var_type = TREE_TYPE (var);
5941   tree dest_type = var_type;
5942 
5943   if (TREE_CODE (var_type) == COMPLEX_TYPE)
5944     var_type = TREE_TYPE (var_type);
5945 
5946   if (TREE_CODE (var_type) == REAL_TYPE)
5947     code = VIEW_CONVERT_EXPR;
5948 
5949   if (TYPE_SIZE (var_type)
5950       == TYPE_SIZE (long_long_unsigned_type_node))
5951     {
5952       fn = NVPTX_BUILTIN_SHUFFLELL;
5953       arg_type = long_long_unsigned_type_node;
5954     }
5955 
5956   tree call = nvptx_builtin_decl (fn, true);
5957   tree bits = build_int_cst (unsigned_type_node, shift);
5958   tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5959   tree expr;
5960 
5961   if (var_type != dest_type)
5962     {
5963       /* Do real and imaginary parts separately.  */
5964       tree real = fold_build1 (REALPART_EXPR, var_type, var);
5965       real = fold_build1 (code, arg_type, real);
5966       real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5967       real = fold_build1 (code, var_type, real);
5968 
5969       tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5970       imag = fold_build1 (code, arg_type, imag);
5971       imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5972       imag = fold_build1 (code, var_type, imag);
5973 
5974       expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5975     }
5976   else
5977     {
5978       expr = fold_build1 (code, arg_type, var);
5979       expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5980       expr = fold_build1 (code, dest_type, expr);
5981     }
5982 
5983   gimplify_assign (dest_var, expr, seq);
5984 }
5985 
5986 /* Lazily generate the global lock var decl and return its address.  */
5987 
5988 static tree
5989 nvptx_global_lock_addr ()
5990 {
5991   tree v = global_lock_var;
5992 
5993   if (!v)
5994     {
5995       tree name = get_identifier ("__reduction_lock");
5996       tree type = build_qualified_type (unsigned_type_node,
5997 					TYPE_QUAL_VOLATILE);
5998       v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5999       global_lock_var = v;
6000       DECL_ARTIFICIAL (v) = 1;
6001       DECL_EXTERNAL (v) = 1;
6002       TREE_STATIC (v) = 1;
6003       TREE_PUBLIC (v) = 1;
6004       TREE_USED (v) = 1;
6005       mark_addressable (v);
6006       mark_decl_referenced (v);
6007     }
6008 
6009   return build_fold_addr_expr (v);
6010 }
6011 
6012 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
6013    GSI.  We use a lockless scheme for nearly all cases, which looks
6014    like:
6015      actual = initval(OP);
6016      do {
6017        guess = actual;
6018        write = guess OP myval;
6019        actual = cmp&swap (ptr, guess, write)
6020      } while (actual bit-different-to guess);
6021    return write;
6022 
6023    This relies on a cmp&swap instruction, which is available for 32-
6024    and 64-bit types.  Larger types must use a locking scheme.  */
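
/* A C-level rendering of that loop, using the cmp_swap builtin defined
   earlier (an illustrative addition; the real code below builds the
   equivalent in GIMPLE):

     unsigned guess, write, actual = initval (op);
     do
       {
	 guess = actual;
	 write = guess OP myval;
	 actual = __builtin_nvptx_cmp_swap (ptr, guess, write);
       }
     while (actual != guess);  */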
6025 
6026 static tree
6027 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
6028 		       tree ptr, tree var, tree_code op)
6029 {
6030   unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
6031   tree_code code = NOP_EXPR;
6032   tree arg_type = unsigned_type_node;
6033   tree var_type = TREE_TYPE (var);
6034 
6035   if (TREE_CODE (var_type) == COMPLEX_TYPE
6036       || TREE_CODE (var_type) == REAL_TYPE)
6037     code = VIEW_CONVERT_EXPR;
6038 
6039   if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
6040     {
6041       arg_type = long_long_unsigned_type_node;
6042       fn = NVPTX_BUILTIN_CMP_SWAPLL;
6043     }
6044 
6045   tree swap_fn = nvptx_builtin_decl (fn, true);
6046 
6047   gimple_seq init_seq = NULL;
6048   tree init_var = make_ssa_name (arg_type);
6049   tree init_expr = omp_reduction_init_op (loc, op, var_type);
6050   init_expr = fold_build1 (code, arg_type, init_expr);
6051   gimplify_assign (init_var, init_expr, &init_seq);
6052   gimple *init_end = gimple_seq_last (init_seq);
6053 
6054   gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
6055 
6056   /* Split the block just after the init stmts.  */
6057   basic_block pre_bb = gsi_bb (*gsi);
6058   edge pre_edge = split_block (pre_bb, init_end);
6059   basic_block loop_bb = pre_edge->dest;
6060   pre_bb = pre_edge->src;
6061   /* Reset the iterator.  */
6062   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6063 
6064   tree expect_var = make_ssa_name (arg_type);
6065   tree actual_var = make_ssa_name (arg_type);
6066   tree write_var = make_ssa_name (arg_type);
6067 
6068   /* Build and insert the reduction calculation.  */
6069   gimple_seq red_seq = NULL;
6070   tree write_expr = fold_build1 (code, var_type, expect_var);
6071   write_expr = fold_build2 (op, var_type, write_expr, var);
6072   write_expr = fold_build1 (code, arg_type, write_expr);
6073   gimplify_assign (write_var, write_expr, &red_seq);
6074 
6075   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6076 
6077   /* Build & insert the cmp&swap sequence.  */
6078   gimple_seq latch_seq = NULL;
6079   tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
6080 					ptr, expect_var, write_var);
6081   gimplify_assign (actual_var, swap_expr, &latch_seq);
6082 
6083   gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
6084 				   NULL_TREE, NULL_TREE);
6085   gimple_seq_add_stmt (&latch_seq, cond);
6086 
6087   gimple *latch_end = gimple_seq_last (latch_seq);
6088   gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
6089 
6090   /* Split the block just after the latch stmts.  */
6091   edge post_edge = split_block (loop_bb, latch_end);
6092   basic_block post_bb = post_edge->dest;
6093   loop_bb = post_edge->src;
6094   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6095 
6096   post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6097   post_edge->probability = profile_probability::even ();
6098   edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
6099   loop_edge->probability = profile_probability::even ();
6100   set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
6101   set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
6102 
6103   gphi *phi = create_phi_node (expect_var, loop_bb);
6104   add_phi_arg (phi, init_var, pre_edge, loc);
6105   add_phi_arg (phi, actual_var, loop_edge, loc);
6106 
6107   loop *loop = alloc_loop ();
6108   loop->header = loop_bb;
6109   loop->latch = loop_bb;
6110   add_loop (loop, loop_bb->loop_father);
6111 
6112   return fold_build1 (code, var_type, write_var);
6113 }
6114 
6115 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6116    GSI.  This is necessary for types larger than 64 bits, where there
6117    is no cmp&swap instruction to implement a lockless scheme.  We use
6118    a lock variable in global memory.
6119 
6120    while (cmp&swap (&lock_var, 0, 1))
6121      continue;
6122    T accum = *ptr;
6123    accum = accum OP var;
6124    *ptr = accum;
6125    cmp&swap (&lock_var, 1, 0);
6126    return accum;
6127 
6128    A lock in global memory is necessary to force execution engine
6129    descheduling and avoid resource starvation that can occur if the
6130    lock is in .shared memory.  */
6131 
6132 static tree
6133 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6134 		       tree ptr, tree var, tree_code op)
6135 {
6136   tree var_type = TREE_TYPE (var);
6137   tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6138   tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6139   tree uns_locked = build_int_cst (unsigned_type_node, 1);
6140 
6141   /* Split the block just before the gsi.  Insert a gimple nop to make
6142      this easier.  */
6143   gimple *nop = gimple_build_nop ();
6144   gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6145   basic_block entry_bb = gsi_bb (*gsi);
6146   edge entry_edge = split_block (entry_bb, nop);
6147   basic_block lock_bb = entry_edge->dest;
6148   /* Reset the iterator.  */
6149   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6150 
6151   /* Build and insert the locking sequence.  */
6152   gimple_seq lock_seq = NULL;
6153   tree lock_var = make_ssa_name (unsigned_type_node);
6154   tree lock_expr = nvptx_global_lock_addr ();
6155   lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6156 				   uns_unlocked, uns_locked);
6157   gimplify_assign (lock_var, lock_expr, &lock_seq);
6158   gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6159 				   NULL_TREE, NULL_TREE);
6160   gimple_seq_add_stmt (&lock_seq, cond);
6161   gimple *lock_end = gimple_seq_last (lock_seq);
6162   gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6163 
6164   /* Split the block just after the lock sequence.  */
6165   edge locked_edge = split_block (lock_bb, lock_end);
6166   basic_block update_bb = locked_edge->dest;
6167   lock_bb = locked_edge->src;
6168   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6169 
6170   /* Create the lock loop ... */
6171   locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6172   locked_edge->probability = profile_probability::even ();
6173   edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6174   loop_edge->probability = profile_probability::even ();
6175   set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6176   set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6177 
6178   /* ... and the loop structure.  */
6179   loop *lock_loop = alloc_loop ();
6180   lock_loop->header = lock_bb;
6181   lock_loop->latch = lock_bb;
6182   lock_loop->nb_iterations_estimate = 1;
6183   lock_loop->any_estimate = true;
6184   add_loop (lock_loop, entry_bb->loop_father);
6185 
6186   /* Build and insert the reduction calculation.  */
6187   gimple_seq red_seq = NULL;
6188   tree acc_in = make_ssa_name (var_type);
6189   tree ref_in = build_simple_mem_ref (ptr);
6190   TREE_THIS_VOLATILE (ref_in) = 1;
6191   gimplify_assign (acc_in, ref_in, &red_seq);
6192 
6193   tree acc_out = make_ssa_name (var_type);
6194   tree update_expr = fold_build2 (op, var_type, ref_in, var);
6195   gimplify_assign (acc_out, update_expr, &red_seq);
6196 
6197   tree ref_out = build_simple_mem_ref (ptr);
6198   TREE_THIS_VOLATILE (ref_out) = 1;
6199   gimplify_assign (ref_out, acc_out, &red_seq);
6200 
6201   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6202 
6203   /* Build & insert the unlock sequence.  */
6204   gimple_seq unlock_seq = NULL;
6205   tree unlock_expr = nvptx_global_lock_addr ();
6206   unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6207 				     uns_locked, uns_unlocked);
6208   gimplify_and_add (unlock_expr, &unlock_seq);
6209   gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6210 
6211   return acc_out;
6212 }
6213 
6214 /* Emit a sequence to update a reduction accumulator at *PTR with the
6215    value held in VAR using operator OP.  Return the updated value.
6216 
6217    TODO: optimize for atomic ops and independent complex ops.  */
6218 
6219 static tree
6220 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6221 			tree ptr, tree var, tree_code op)
6222 {
6223   tree type = TREE_TYPE (var);
6224   tree size = TYPE_SIZE (type);
6225 
6226   if (size == TYPE_SIZE (unsigned_type_node)
6227       || size == TYPE_SIZE (long_long_unsigned_type_node))
6228     return nvptx_lockless_update (loc, gsi, ptr, var, op);
6229   else
6230     return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6231 }
6232 
6233 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
6234 
6235 static void
6236 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6237 {
6238   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6239   tree lhs = gimple_call_lhs (call);
6240   tree var = gimple_call_arg (call, 2);
6241   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6242   gimple_seq seq = NULL;
6243 
6244   push_gimplify_context (true);
6245 
6246   if (level != GOMP_DIM_GANG)
6247     {
6248       /* Copy the receiver object.  */
6249       tree ref_to_res = gimple_call_arg (call, 1);
6250 
6251       if (!integer_zerop (ref_to_res))
6252 	var = build_simple_mem_ref (ref_to_res);
6253     }
6254 
6255   if (level == GOMP_DIM_WORKER
6256       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6257     {
6258       /* Store incoming value to worker reduction buffer.  */
6259       tree offset = gimple_call_arg (call, 5);
6260       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6261 					     level == GOMP_DIM_VECTOR);
6262       tree ptr = make_ssa_name (TREE_TYPE (call));
6263 
6264       gimplify_assign (ptr, call, &seq);
6265       tree ref = build_simple_mem_ref (ptr);
6266       TREE_THIS_VOLATILE (ref) = 1;
6267       gimplify_assign (ref, var, &seq);
6268     }
6269 
6270   if (lhs)
6271     gimplify_assign (lhs, var, &seq);
6272 
6273   pop_gimplify_context (NULL);
6274   gsi_replace_with_seq (&gsi, seq, true);
6275 }
6276 
6277 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
6278 
6279 static void
6280 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6281 {
6282   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6283   tree lhs = gimple_call_lhs (call);
6284   tree var = gimple_call_arg (call, 2);
6285   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6286   enum tree_code rcode
6287     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6288   tree init = omp_reduction_init_op (gimple_location (call), rcode,
6289 				     TREE_TYPE (var));
6290   gimple_seq seq = NULL;
6291 
6292   push_gimplify_context (true);
6293 
6294   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6295     {
6296       /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
6297       tree tid = make_ssa_name (integer_type_node);
6298       tree dim_vector = gimple_call_arg (call, 3);
6299       gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6300 						     dim_vector);
6301       gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6302 					     NULL_TREE, NULL_TREE);
6303 
6304       gimple_call_set_lhs (tid_call, tid);
6305       gimple_seq_add_stmt (&seq, tid_call);
6306       gimple_seq_add_stmt (&seq, cond_stmt);
6307 
6308       /* Split the block just after the call.  */
6309       edge init_edge = split_block (gsi_bb (gsi), call);
6310       basic_block init_bb = init_edge->dest;
6311       basic_block call_bb = init_edge->src;
6312 
6313       /* Fixup flags from call_bb to init_bb.  */
6314       init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6315       init_edge->probability = profile_probability::even ();
6316 
6317       /* Set the initialization stmts.  */
6318       gimple_seq init_seq = NULL;
6319       tree init_var = make_ssa_name (TREE_TYPE (var));
6320       gimplify_assign (init_var, init, &init_seq);
6321       gsi = gsi_start_bb (init_bb);
6322       gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6323 
6324       /* Split block just after the init stmt.  */
6325       gsi_prev (&gsi);
6326       edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6327       basic_block dst_bb = inited_edge->dest;
6328 
6329       /* Create false edge from call_bb to dst_bb.  */
6330       edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6331       nop_edge->probability = profile_probability::even ();
6332 
6333       /* Create phi node in dst block.  */
6334       gphi *phi = create_phi_node (lhs, dst_bb);
6335       add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6336       add_phi_arg (phi, var, nop_edge, gimple_location (call));
6337 
6338       /* Reset dominator of dst bb.  */
6339       set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6340 
6341       /* Reset the gsi.  */
6342       gsi = gsi_for_stmt (call);
6343     }
6344   else
6345     {
6346       if (level == GOMP_DIM_GANG)
6347 	{
6348 	  /* If there's no receiver object, propagate the incoming VAR.  */
6349 	  tree ref_to_res = gimple_call_arg (call, 1);
6350 	  if (integer_zerop (ref_to_res))
6351 	    init = var;
6352 	}
6353 
6354       if (lhs != NULL_TREE)
6355 	gimplify_assign (lhs, init, &seq);
6356     }
6357 
6358   pop_gimplify_context (NULL);
6359   gsi_replace_with_seq (&gsi, seq, true);
6360 }
6361 
6362 /* NVPTX implementation of GOACC_REDUCTION_FINI.  */
6363 
6364 static void
6365 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
6366 {
6367   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6368   tree lhs = gimple_call_lhs (call);
6369   tree ref_to_res = gimple_call_arg (call, 1);
6370   tree var = gimple_call_arg (call, 2);
6371   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6372   enum tree_code op
6373     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6374   gimple_seq seq = NULL;
6375   tree r = NULL_TREE;
6376 
6377   push_gimplify_context (true);
6378 
6379   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6380     {
6381       /* Emit binary shuffle tree.  TODO. Emit this as an actual loop,
6382 	 but that requires a method of emitting a unified jump at the
6383 	 gimple level.  */
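      /* Illustration (an addition): with PTX_WARP_SIZE == 32 the loop
	 below emits five combine steps, shuffling by 16, 8, 4, 2 and
	 1 lanes, after which lane 0 holds the reduction of all 32
	 lane-local values.  */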
6384       for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
6385 	{
6386 	  tree other_var = make_ssa_name (TREE_TYPE (var));
6387 	  nvptx_generate_vector_shuffle (gimple_location (call),
6388 					 other_var, var, shfl, &seq);
6389 
6390 	  r = make_ssa_name (TREE_TYPE (var));
6391 	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6392 					   var, other_var), &seq);
6393 	  var = r;
6394 	}
6395     }
6396   else
6397     {
6398       tree accum = NULL_TREE;
6399 
6400       if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
6401 	{
6402 	  /* Get reduction buffer address.  */
6403 	  tree offset = gimple_call_arg (call, 5);
6404 	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6405 						 level == GOMP_DIM_VECTOR);
6406 	  tree ptr = make_ssa_name (TREE_TYPE (call));
6407 
6408 	  gimplify_assign (ptr, call, &seq);
6409 	  accum = ptr;
6410 	}
6411       else if (integer_zerop (ref_to_res))
6412 	r = var;
6413       else
6414 	accum = ref_to_res;
6415 
6416       if (accum)
6417 	{
6418 	  /* UPDATE the accumulator.  */
6419 	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6420 	  seq = NULL;
6421 	  r = nvptx_reduction_update (gimple_location (call), &gsi,
6422 				      accum, var, op);
6423 	}
6424     }
6425 
6426   if (lhs)
6427     gimplify_assign (lhs, r, &seq);
6428   pop_gimplify_context (NULL);
6429 
6430   gsi_replace_with_seq (&gsi, seq, true);
6431 }
6432 
6433 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */
6434 
6435 static void
6436 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
6437 {
6438   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6439   tree lhs = gimple_call_lhs (call);
6440   tree var = gimple_call_arg (call, 2);
6441   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6442   gimple_seq seq = NULL;
6443 
6444   push_gimplify_context (true);
6445   if (level == GOMP_DIM_WORKER
6446       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6447     {
6448       /* Read the worker reduction buffer.  */
6449       tree offset = gimple_call_arg (call, 5);
6450       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6451 					     level == GOMP_DIM_VECTOR);
6452       tree ptr = make_ssa_name (TREE_TYPE (call));
6453 
6454       gimplify_assign (ptr, call, &seq);
6455       var = build_simple_mem_ref (ptr);
6456       TREE_THIS_VOLATILE (var) = 1;
6457     }
6458 
6459   if (level != GOMP_DIM_GANG)
6460     {
6461       /* Write to the receiver object.  */
6462       tree ref_to_res = gimple_call_arg (call, 1);
6463 
6464       if (!integer_zerop (ref_to_res))
6465 	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6466     }
6467 
6468   if (lhs)
6469     gimplify_assign (lhs, var, &seq);
6470 
6471   pop_gimplify_context (NULL);
6472 
6473   gsi_replace_with_seq (&gsi, seq, true);
6474 }
6475 
6476 /* NVPTX reduction expander.  */
6477 
6478 static void
6479 nvptx_goacc_reduction (gcall *call)
6480 {
6481   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6482   offload_attrs oa;
6483 
6484   populate_offload_attrs (&oa);
6485 
6486   switch (code)
6487     {
6488     case IFN_GOACC_REDUCTION_SETUP:
6489       nvptx_goacc_reduction_setup (call, &oa);
6490       break;
6491 
6492     case IFN_GOACC_REDUCTION_INIT:
6493       nvptx_goacc_reduction_init (call, &oa);
6494       break;
6495 
6496     case IFN_GOACC_REDUCTION_FINI:
6497       nvptx_goacc_reduction_fini (call, &oa);
6498       break;
6499 
6500     case IFN_GOACC_REDUCTION_TEARDOWN:
6501       nvptx_goacc_reduction_teardown (call, &oa);
6502       break;
6503 
6504     default:
6505       gcc_unreachable ();
6506     }
6507 }
6508 
6509 static bool
6510 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6511 			      rtx x ATTRIBUTE_UNUSED)
6512 {
6513   return true;
6514 }
6515 
6516 static bool
6517 nvptx_vector_mode_supported (machine_mode mode)
6518 {
6519   return (mode == V2SImode
6520 	  || mode == V2DImode);
6521 }
6522 
6523 /* Return the preferred mode for vectorizing scalar MODE.  */
6524 
6525 static machine_mode
6526 nvptx_preferred_simd_mode (scalar_mode mode)
6527 {
6528   switch (mode)
6529     {
6530     case E_DImode:
6531       return V2DImode;
6532     case E_SImode:
6533       return V2SImode;
6534 
6535     default:
6536       return default_preferred_simd_mode (mode);
6537     }
6538 }
6539 
6540 unsigned int
6541 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6542 {
6543   if (TREE_CODE (type) == INTEGER_TYPE)
6544     {
6545       unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6546       if (size == GET_MODE_SIZE (TImode))
6547 	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6548     }
6549 
6550   return basic_align;
6551 }
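
/* E.g. (an illustrative note): a 128-bit integer object is aligned to
   the bitsize of the mode TImode is split into rather than to its full
   128-bit size, matching how TImode values are handled as register
   pairs.  */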
6552 
6553 /* Implement TARGET_MODES_TIEABLE_P.  */
6554 
6555 static bool
6556 nvptx_modes_tieable_p (machine_mode, machine_mode)
6557 {
6558   return false;
6559 }
6560 
6561 /* Implement TARGET_HARD_REGNO_NREGS.  */
6562 
6563 static unsigned int
6564 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6565 {
6566   return 1;
6567 }
6568 
6569 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
6570 
6571 static bool
nvptx_can_change_mode_class(machine_mode,machine_mode,reg_class_t)6572 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6573 {
6574   return false;
6575 }
6576 
6577 /* Implement TARGET_TRULY_NOOP_TRUNCATION.  */
6578 
6579 static bool
nvptx_truly_noop_truncation(poly_uint64,poly_uint64)6580 nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
6581 {
6582   return false;
6583 }
6584 
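/* The FNDECL most recently seen by nvptx_set_current_function, cached
   so that switching to the same function repeatedly is a no-op.  */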
static GTY(()) tree nvptx_previous_fndecl;

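/* Implement TARGET_SET_CURRENT_FUNCTION.  On switching to a new
   function, reset the per-function state kept in vector_red_partition
   and oacc_bcast_partition.  */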
static void
nvptx_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == nvptx_previous_fndecl)
    return;

  nvptx_previous_fndecl = fndecl;
  vector_red_partition = 0;
  oacc_bcast_partition = 0;
}

/* Implement TARGET_LIBC_HAS_FUNCTION.  */

bool
nvptx_libc_has_function (enum function_class fn_class, tree type)
{
  if (fn_class == function_sincos)
    {
      if (type != NULL_TREE)
	/* Currently, newlib does not support sincosl.  */
	return type == float_type_node || type == double_type_node;
      else
	return true;
    }

  return default_libc_has_function (fn_class, type);
}

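/* Populate the target hook vector: undo each default definition from
   target-def.h, then redirect the hook to its nvptx implementation.  */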
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef  TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef  TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef  TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef  TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef  TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

#undef TARGET_LIBC_HAS_FUNCTION
#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function

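/* The exported target hook vector, built from TARGET_INITIALIZER using
   the overrides above.  */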
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"