1 /* Target code for NVPTX.
2 Copyright (C) 2014-2021 Free Software Foundation, Inc.
3 Contributed by Bernd Schmidt <bernds@codesourcery.com>
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "omp-offload.h"
63 #include "gomp-constants.h"
64 #include "dumpfile.h"
65 #include "internal-fn.h"
66 #include "gimple-iterator.h"
67 #include "stringpool.h"
68 #include "attribs.h"
69 #include "tree-vrp.h"
70 #include "tree-ssa-operands.h"
71 #include "tree-ssanames.h"
72 #include "gimplify.h"
73 #include "tree-phinodes.h"
74 #include "cfgloop.h"
75 #include "fold-const.h"
76 #include "intl.h"
77 #include "opts.h"
78
79 /* This file should be included last. */
80 #include "target-def.h"
81
82 #define WORKAROUND_PTXJIT_BUG 1
83 #define WORKAROUND_PTXJIT_BUG_2 1
84 #define WORKAROUND_PTXJIT_BUG_3 1
85
86 /* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA concept of a
87 thread block, which has had a maximum number of threads of 1024 since CUDA version
88 2.x. */
89 #define PTX_CTA_SIZE 1024
90
91 #define PTX_CTA_NUM_BARRIERS 16
92 #define PTX_WARP_SIZE 32
93
94 #define PTX_PER_CTA_BARRIER 0
95 #define PTX_NUM_PER_CTA_BARRIERS 1
96 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
97 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
98
99 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
100 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
101 #define PTX_WORKER_LENGTH 32
102 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */
103
104 /* The various PTX memory areas an object might reside in. */
105 enum nvptx_data_area
106 {
107 DATA_AREA_GENERIC,
108 DATA_AREA_GLOBAL,
109 DATA_AREA_SHARED,
110 DATA_AREA_LOCAL,
111 DATA_AREA_CONST,
112 DATA_AREA_PARAM,
113 DATA_AREA_MAX
114 };
115
116 /* We record the data area in the target symbol flags. */
117 #define SYMBOL_DATA_AREA(SYM) \
118 (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
119 & 7)
120 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
121 (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
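/* A minimal usage sketch; the area value fits in the three flag bits above
   SYMBOL_FLAG_MACH_DEP_SHIFT:
       SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
       gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);  */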
122
123 /* Record the function decls we've written, and the libfuncs and function
124 decls corresponding to them. */
125 static std::stringstream func_decls;
126
127 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
128 {
129 static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
130 static bool equal (rtx a, rtx b) { return a == b; }
131 };
132
133 static GTY((cache))
134 hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
135
136 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
137 {
138 static hashval_t hash (tree t) { return htab_hash_pointer (t); }
139 static bool equal (tree a, tree b) { return a == b; }
140 };
141
142 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
143 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
144
145 /* Buffer needed to broadcast across workers and vectors. This is
146 used for both worker-neutering and worker broadcasting, and
147 vector-neutering and broadcasting when vector_length > 32. It is
148 shared by all functions emitted. The buffer is placed in shared
149 memory. It'd be nice if PTX supported common blocks, because then
150 this could be shared across TUs (taking the largest size). */
151 static unsigned oacc_bcast_size;
152 static unsigned oacc_bcast_partition;
153 static unsigned oacc_bcast_align;
154 static GTY(()) rtx oacc_bcast_sym;
155
156 /* Buffer needed for worker reductions. This has to be distinct from
157 the worker broadcast array, as both may be live concurrently. */
158 static unsigned worker_red_size;
159 static unsigned worker_red_align;
160 static GTY(()) rtx worker_red_sym;
161
162 /* Buffer needed for vector reductions, when vector_length >
163 PTX_WARP_SIZE. This has to be distinct from the worker broadcast
164 array, as both may be live concurrently. */
165 static unsigned vector_red_size;
166 static unsigned vector_red_align;
167 static unsigned vector_red_partition;
168 static GTY(()) rtx vector_red_sym;
169
170 /* Global lock variable, needed for 128-bit worker & gang reductions. */
171 static GTY(()) tree global_lock_var;
172
173 /* True if any function references __nvptx_stacks. */
174 static bool need_softstack_decl;
175
176 /* True if any function references __nvptx_uni. */
177 static bool need_unisimt_decl;
178
179 static int nvptx_mach_max_workers ();
180
181 /* Allocate a new, cleared machine_function structure. */
182
183 static struct machine_function *
184 nvptx_init_machine_status (void)
185 {
186 struct machine_function *p = ggc_cleared_alloc<machine_function> ();
187 p->return_mode = VOIDmode;
188 return p;
189 }
190
191 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
192 and -fopenacc is also enabled. */
193
194 static void
195 diagnose_openacc_conflict (bool optval, const char *optname)
196 {
197 if (flag_openacc && optval)
198 error ("option %s is not supported together with %<-fopenacc%>", optname);
199 }
200
201 /* Implement TARGET_OPTION_OVERRIDE. */
202
203 static void
204 nvptx_option_override (void)
205 {
206 init_machine_status = nvptx_init_machine_status;
207
208 /* Set toplevel_reorder, unless explicitly disabled. We need
209 reordering so that we emit necessary assembler decls of
210 undeclared variables. */
211 if (!global_options_set.x_flag_toplevel_reorder)
212 flag_toplevel_reorder = 1;
213
214 debug_nonbind_markers_p = 0;
215
216 /* Set flag_no_common, unless explicitly disabled. We fake common
217 using .weak, and that's not entirely accurate, so avoid it
218 unless forced. */
219 if (!global_options_set.x_flag_no_common)
220 flag_no_common = 1;
221
222 /* The patch area requires nops, which we don't have. */
223 HOST_WIDE_INT patch_area_size, patch_area_entry;
224 parse_and_check_patch_area (flag_patchable_function_entry, false,
225 &patch_area_size, &patch_area_entry);
226 if (patch_area_size > 0)
227 sorry ("not generating patch area, nops not supported");
228
229 /* Assumes that it will see only hard registers. */
230 flag_var_tracking = 0;
231
232 if (nvptx_optimize < 0)
233 nvptx_optimize = optimize > 0;
234
235 declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
236 needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
237 declared_libfuncs_htab
238 = hash_table<declared_libfunc_hasher>::create_ggc (17);
239
240 oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
241 SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
242 oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
243 oacc_bcast_partition = 0;
244
245 worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
246 SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
247 worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
248
249 vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
250 SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
251 vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
252 vector_red_partition = 0;
253
254 diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
255 diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
256 diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
257
258 if (TARGET_GOMP)
259 target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
260 }
261
262 /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to
263 deal with ptx idiosyncrasies. */
264
265 const char *
266 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
267 {
268 switch (mode)
269 {
270 case E_BLKmode:
271 return ".b8";
272 case E_BImode:
273 return ".pred";
274 case E_QImode:
275 if (promote)
276 return ".u32";
277 else
278 return ".u8";
279 case E_HImode:
280 return ".u16";
281 case E_SImode:
282 return ".u32";
283 case E_DImode:
284 return ".u64";
285
286 case E_SFmode:
287 return ".f32";
288 case E_DFmode:
289 return ".f64";
290
291 case E_V2SImode:
292 return ".v2.u32";
293 case E_V2DImode:
294 return ".v2.u64";
295
296 default:
297 gcc_unreachable ();
298 }
299 }
300
301 /* Encode the PTX data area that DECL (which might not actually be a
302 _DECL) should reside in. */
303
304 static void
305 nvptx_encode_section_info (tree decl, rtx rtl, int first)
306 {
307 default_encode_section_info (decl, rtl, first);
308 if (first && MEM_P (rtl))
309 {
310 nvptx_data_area area = DATA_AREA_GENERIC;
311
312 if (TREE_CONSTANT (decl))
313 area = DATA_AREA_CONST;
314 else if (TREE_CODE (decl) == VAR_DECL)
315 {
316 if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
317 {
318 area = DATA_AREA_SHARED;
319 if (DECL_INITIAL (decl))
320 error ("static initialization of variable %q+D in %<.shared%>"
321 " memory is not supported", decl);
322 }
323 else
324 area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
325 }
326
327 SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
328 }
329 }
330
331 /* Return the PTX name of the data area in which SYM should be
332 placed. The symbol must have already been processed by
333 nvptx_encode_section_info, or equivalent. */
334
335 static const char *
336 section_for_sym (rtx sym)
337 {
338 nvptx_data_area area = SYMBOL_DATA_AREA (sym);
339 /* Same order as nvptx_data_area enum. */
340 static char const *const areas[] =
341 {"", ".global", ".shared", ".local", ".const", ".param"};
342
343 return areas[area];
344 }
345
346 /* Similarly for a decl. */
347
348 static const char *
349 section_for_decl (const_tree decl)
350 {
351 return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
352 }
353
354 /* Check NAME for special function names and redirect them by returning a
355 replacement. This applies to malloc, free and realloc, for which we
356 want to use libgcc wrappers, and call, which triggers a bug in
357 ptxas. We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
358 not active in an offload compiler -- the names are all set by the
359 host-side compiler. */
360
361 static const char *
362 nvptx_name_replacement (const char *name)
363 {
364 if (strcmp (name, "call") == 0)
365 return "__nvptx_call";
366 if (strcmp (name, "malloc") == 0)
367 return "__nvptx_malloc";
368 if (strcmp (name, "free") == 0)
369 return "__nvptx_free";
370 if (strcmp (name, "realloc") == 0)
371 return "__nvptx_realloc";
372 return name;
373 }
374
375 /* Return NULL if NAME contains no dot. Otherwise return a copy of NAME
376 with the dots replaced with dollar signs. */
377
378 static char *
379 nvptx_replace_dot (const char *name)
380 {
381 if (strchr (name, '.') == NULL)
382 return NULL;
383
384 char *p = xstrdup (name);
385 for (size_t i = 0; i < strlen (p); ++i)
386 if (p[i] == '.')
387 p[i] = '$';
388 return p;
389 }
390
391 /* If MODE should be treated as two registers of an inner mode, return
392 that inner mode. Otherwise return VOIDmode. */
393
394 static machine_mode
395 maybe_split_mode (machine_mode mode)
396 {
397 if (COMPLEX_MODE_P (mode))
398 return GET_MODE_INNER (mode);
399
400 if (mode == TImode)
401 return DImode;
402
403 return VOIDmode;
404 }
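/* For example, TImode is treated as two DImode registers and DCmode as two
   DFmode registers, whereas SImode is not split (VOIDmode is returned).  */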
405
406 /* Return true if mode should be treated as two registers. */
407
408 static bool
409 split_mode_p (machine_mode mode)
410 {
411 return maybe_split_mode (mode) != VOIDmode;
412 }
413
414 /* Output a register, subreg, or register pair (with optional
415 enclosing braces). */
416
417 static void
418 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
419 int subreg_offset = -1)
420 {
421 if (inner_mode == VOIDmode)
422 {
423 if (HARD_REGISTER_NUM_P (regno))
424 fprintf (file, "%s", reg_names[regno]);
425 else
426 fprintf (file, "%%r%d", regno);
427 }
428 else if (subreg_offset >= 0)
429 {
430 output_reg (file, regno, VOIDmode);
431 fprintf (file, "$%d", subreg_offset);
432 }
433 else
434 {
435 if (subreg_offset == -1)
436 fprintf (file, "{");
437 output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
438 fprintf (file, ",");
439 output_reg (file, regno, inner_mode, 0);
440 if (subreg_offset == -1)
441 fprintf (file, "}");
442 }
443 }
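/* For illustration: for a TImode pseudo (say register 23), the declaration
   path in nvptx_declare_function_name passes -2 and this prints
   "%r23$8,%r23$0", while an operand use passes -1 and prints the braced
   pair "{%r23$8,%r23$0}".  */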
444
445 /* Emit forking instructions for MASK. */
446
447 static void
448 nvptx_emit_forking (unsigned mask, bool is_call)
449 {
450 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
451 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
452 if (mask)
453 {
454 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
455
456 /* Emit fork at all levels. This helps form SESE regions, as
457 it creates a block with a single successor before entering a
458 partitioned region. That is a good candidate for the end of
459 an SESE region. */
460 emit_insn (gen_nvptx_fork (op));
461 emit_insn (gen_nvptx_forked (op));
462 }
463 }
464
465 /* Emit joining instructions for MASK. */
466
467 static void
468 nvptx_emit_joining (unsigned mask, bool is_call)
469 {
470 mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
471 | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
472 if (mask)
473 {
474 rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
475
476 /* Emit joining for all non-call pars to ensure there's a single
477 predecessor for the block the join insn ends up in. This is
478 needed for skipping entire loops. */
479 emit_insn (gen_nvptx_joining (op));
480 emit_insn (gen_nvptx_join (op));
481 }
482 }
483
484
485 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
486 returned in memory. Integer and floating types supported by the
487 machine are passed in registers, everything else is passed in
488 memory. Complex types are split. */
489
490 static bool
491 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
492 {
493 if (type)
494 {
495 if (AGGREGATE_TYPE_P (type))
496 return true;
497 if (TREE_CODE (type) == VECTOR_TYPE)
498 return true;
499 }
500
501 if (!for_return && COMPLEX_MODE_P (mode))
502 /* Complex types are passed as two underlying args. */
503 mode = GET_MODE_INNER (mode);
504
505 if (GET_MODE_CLASS (mode) != MODE_INT
506 && GET_MODE_CLASS (mode) != MODE_FLOAT)
507 return true;
508
509 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
510 return true;
511
512 return false;
513 }
514
515 /* A non-memory argument of mode MODE is being passed; determine the mode it
516 should be promoted to. This is also used for determining return
517 type promotion. */
518
519 static machine_mode
520 promote_arg (machine_mode mode, bool prototyped)
521 {
522 if (!prototyped && mode == SFmode)
523 /* K&R float promotion for unprototyped functions. */
524 mode = DFmode;
525 else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
526 mode = SImode;
527
528 return mode;
529 }
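/* E.g. QImode and HImode arguments are widened to SImode, and a float
   argument of an unprototyped callee is promoted to DFmode.  */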
530
531 /* A non-memory return type of MODE is being returned. Determine the
532 mode it should be promoted to. */
533
534 static machine_mode
535 promote_return (machine_mode mode)
536 {
537 return promote_arg (mode, true);
538 }
539
540 /* Implement TARGET_FUNCTION_ARG. */
541
542 static rtx
543 nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
544 {
545 if (arg.end_marker_p () || !arg.named)
546 return NULL_RTX;
547
548 return gen_reg_rtx (arg.mode);
549 }
550
551 /* Implement TARGET_FUNCTION_INCOMING_ARG. */
552
553 static rtx
554 nvptx_function_incoming_arg (cumulative_args_t cum_v,
555 const function_arg_info &arg)
556 {
557 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
558
559 if (arg.end_marker_p () || !arg.named)
560 return NULL_RTX;
561
562 /* No need to deal with split modes here, the only case that can
563 happen is complex modes and those are dealt with by
564 TARGET_SPLIT_COMPLEX_ARG. */
565 return gen_rtx_UNSPEC (arg.mode,
566 gen_rtvec (1, GEN_INT (cum->count)),
567 UNSPEC_ARG_REG);
568 }
569
570 /* Implement TARGET_FUNCTION_ARG_ADVANCE. */
571
572 static void
573 nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
574 {
575 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
576
577 cum->count++;
578 }
579
580 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
581
582 For nvptx, this is only used for variadic args. The type has already
583 been promoted and/or converted to invisible reference. */
584
585 static unsigned
586 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
587 {
588 return GET_MODE_ALIGNMENT (mode);
589 }
590
591 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
592
593 For nvptx, we know how to handle functions declared as stdarg: by
594 passing an extra pointer to the unnamed arguments. However, the
595 Fortran frontend can produce a different situation, where a
596 function pointer is declared with no arguments, but the actual
597 function and calls to it take more arguments. In that case, we
598 want to ensure the call matches the definition of the function. */
599
600 static bool
601 nvptx_strict_argument_naming (cumulative_args_t cum_v)
602 {
603 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
604
605 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
606 }
607
608 /* Implement TARGET_LIBCALL_VALUE. */
609
610 static rtx
611 nvptx_libcall_value (machine_mode mode, const_rtx)
612 {
613 if (!cfun || !cfun->machine->doing_call)
614 /* Pretend to return in a hard reg for early uses before pseudos can be
615 generated. */
616 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
617
618 return gen_reg_rtx (mode);
619 }
620
621 /* TARGET_FUNCTION_VALUE implementation. Returns an RTX representing the place
622 where function FUNC returns or receives a value of data type TYPE. */
623
624 static rtx
625 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
626 bool outgoing)
627 {
628 machine_mode mode = promote_return (TYPE_MODE (type));
629
630 if (outgoing)
631 {
632 gcc_assert (cfun);
633 cfun->machine->return_mode = mode;
634 return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
635 }
636
637 return nvptx_libcall_value (mode, NULL_RTX);
638 }
639
640 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. */
641
642 static bool
643 nvptx_function_value_regno_p (const unsigned int regno)
644 {
645 return regno == NVPTX_RETURN_REGNUM;
646 }
647
648 /* Types with a mode other than those supported by the machine are passed by
649 reference in memory. */
650
651 static bool
652 nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
653 {
654 return pass_in_memory (arg.mode, arg.type, false);
655 }
656
657 /* Implement TARGET_RETURN_IN_MEMORY. */
658
659 static bool
660 nvptx_return_in_memory (const_tree type, const_tree)
661 {
662 return pass_in_memory (TYPE_MODE (type), type, true);
663 }
664
665 /* Implement TARGET_PROMOTE_FUNCTION_MODE. */
666
667 static machine_mode
668 nvptx_promote_function_mode (const_tree type, machine_mode mode,
669 int *ARG_UNUSED (punsignedp),
670 const_tree funtype, int for_return)
671 {
672 return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
673 }
674
675 /* Helper for write_arg. Emit a single PTX argument of MODE, either
676 in a prototype, or as copy in a function prologue. ARGNO is the
677 index of this argument in the PTX function. FOR_REG is negative,
678 if we're emitting the PTX prototype. It is zero if we're copying
679 to an argument register and it is greater than zero if we're
680 copying to a specific hard register. */
681
682 static int
683 write_arg_mode (std::stringstream &s, int for_reg, int argno,
684 machine_mode mode)
685 {
686 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
687
688 if (for_reg < 0)
689 {
690 /* Writing PTX prototype. */
691 s << (argno ? ", " : " (");
692 s << ".param" << ptx_type << " %in_ar" << argno;
693 }
694 else
695 {
696 s << "\t.reg" << ptx_type << " ";
697 if (for_reg)
698 s << reg_names[for_reg];
699 else
700 s << "%ar" << argno;
701 s << ";\n";
702 if (argno >= 0)
703 {
704 s << "\tld.param" << ptx_type << " ";
705 if (for_reg)
706 s << reg_names[for_reg];
707 else
708 s << "%ar" << argno;
709 s << ", [%in_ar" << argno << "];\n";
710 }
711 }
712 return argno + 1;
713 }
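/* A sketch of the text this produces for an SImode argument 0: the
   prototype path (for_reg < 0) appends " (.param.u32 %in_ar0", while the
   prologue path with for_reg == 0 emits ".reg.u32 %ar0;" followed by
   "ld.param.u32 %ar0, [%in_ar0];".  */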
714
715 /* Process function parameter TYPE to emit one or more PTX
716 arguments. S, FOR_REG and ARGNO as for write_arg_mode. PROTOTYPED
717 is true, if this is a prototyped function, rather than an old-style
718 C declaration. Returns the next argument number to use.
719
720 The promotion behavior here must match the regular GCC function
721 parameter marshalling machinery. */
722
723 static int
724 write_arg_type (std::stringstream &s, int for_reg, int argno,
725 tree type, bool prototyped)
726 {
727 machine_mode mode = TYPE_MODE (type);
728
729 if (mode == VOIDmode)
730 return argno;
731
732 if (pass_in_memory (mode, type, false))
733 mode = Pmode;
734 else
735 {
736 bool split = TREE_CODE (type) == COMPLEX_TYPE;
737
738 if (split)
739 {
740 /* Complex types are sent as two separate args. */
741 type = TREE_TYPE (type);
742 mode = TYPE_MODE (type);
743 prototyped = true;
744 }
745
746 mode = promote_arg (mode, prototyped);
747 if (split)
748 argno = write_arg_mode (s, for_reg, argno, mode);
749 }
750
751 return write_arg_mode (s, for_reg, argno, mode);
752 }
753
754 /* Emit a PTX return as a prototype or function prologue declaration
755 for MODE. */
756
757 static void
758 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
759 {
760 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
761 const char *pfx = "\t.reg";
762 const char *sfx = ";\n";
763
764 if (for_proto)
765 pfx = "(.param", sfx = "_out) ";
766
767 s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
768 }
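/* Sketch for an SImode return, assuming NVPTX_RETURN_REGNUM prints as
   %value: a prototype gets "(.param.u32 %value_out) " and a function
   prologue gets ".reg.u32 %value;".  */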
769
770 /* Process a function return TYPE to emit a PTX return as a prototype
771 or function prologue declaration. Returns true if return is via an
772 additional pointer parameter. The promotion behavior here must
773 match the regular GCC function return marshalling. */
774
775 static bool
776 write_return_type (std::stringstream &s, bool for_proto, tree type)
777 {
778 machine_mode mode = TYPE_MODE (type);
779
780 if (mode == VOIDmode)
781 return false;
782
783 bool return_in_mem = pass_in_memory (mode, type, true);
784
785 if (return_in_mem)
786 {
787 if (for_proto)
788 return return_in_mem;
789
790 /* Named return values can cause us to return a pointer as well
791 as expect an argument for the return location. This is
792 optimization-level specific, so no caller can make use of
793 this data, but more importantly for us, we must ensure it
794 doesn't change the PTX prototype. */
795 mode = (machine_mode) cfun->machine->return_mode;
796
797 if (mode == VOIDmode)
798 return return_in_mem;
799
800 /* Clear return_mode to inhibit copy of retval to non-existent
801 retval parameter. */
802 cfun->machine->return_mode = VOIDmode;
803 }
804 else
805 mode = promote_return (mode);
806
807 write_return_mode (s, for_proto, mode);
808
809 return return_in_mem;
810 }
811
812 /* Look for attributes in ATTRS that would indicate we must write a function
813 as a .entry kernel rather than a .func. Return true if one is found. */
814
815 static bool
816 write_as_kernel (tree attrs)
817 {
818 return (lookup_attribute ("kernel", attrs) != NULL_TREE
819 || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
820 && lookup_attribute ("oacc function", attrs) != NULL_TREE));
821 /* For OpenMP target regions, the corresponding kernel entry is emitted from
822 write_omp_entry as a separate function. */
823 }
824
825 /* Emit a linker marker for a function decl or defn. */
826
827 static void
828 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
829 const char *name)
830 {
831 s << "\n// BEGIN";
832 if (globalize)
833 s << " GLOBAL";
834 s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
835 s << name << "\n";
836 }
837
838 /* Emit a linker marker for a variable decl or defn. */
839
840 static void
841 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
842 {
843 fprintf (file, "\n// BEGIN%s VAR %s: ",
844 globalize ? " GLOBAL" : "",
845 is_defn ? "DEF" : "DECL");
846 assemble_name_raw (file, name);
847 fputs ("\n", file);
848 }
849
850 /* Helper function for write_fn_proto. */
851
852 static void
853 write_fn_proto_1 (std::stringstream &s, bool is_defn,
854 const char *name, const_tree decl)
855 {
856 write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
857
858 /* PTX declaration. */
859 if (DECL_EXTERNAL (decl))
860 s << ".extern ";
861 else if (TREE_PUBLIC (decl))
862 s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
863 s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
864
865 tree fntype = TREE_TYPE (decl);
866 tree result_type = TREE_TYPE (fntype);
867
868 /* atomic_compare_exchange_$n builtins have an exceptional calling
869 convention. */
870 int not_atomic_weak_arg = -1;
871 if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
872 switch (DECL_FUNCTION_CODE (decl))
873 {
874 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
875 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
876 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
877 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
878 case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
879 /* These atomics skip the 'weak' parm in an actual library
880 call. We must skip it in the prototype too. */
881 not_atomic_weak_arg = 3;
882 break;
883
884 default:
885 break;
886 }
887
888 /* Declare the result. */
889 bool return_in_mem = write_return_type (s, true, result_type);
890
891 s << name;
892
893 int argno = 0;
894
895 /* Emit argument list. */
896 if (return_in_mem)
897 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
898
899 /* We get:
900 NULL in TYPE_ARG_TYPES, for old-style functions
901 NULL in DECL_ARGUMENTS, for builtin functions without another
902 declaration.
903 So we have to pick the best one we have. */
904 tree args = TYPE_ARG_TYPES (fntype);
905 bool prototyped = true;
906 if (!args)
907 {
908 args = DECL_ARGUMENTS (decl);
909 prototyped = false;
910 }
911
912 for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
913 {
914 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
915
916 if (not_atomic_weak_arg)
917 argno = write_arg_type (s, -1, argno, type, prototyped);
918 else
919 gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
920 }
921
922 if (stdarg_p (fntype))
923 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
924
925 if (DECL_STATIC_CHAIN (decl))
926 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
927
928 if (!argno && strcmp (name, "main") == 0)
929 {
930 argno = write_arg_type (s, -1, argno, integer_type_node, true);
931 argno = write_arg_type (s, -1, argno, ptr_type_node, true);
932 }
933
934 if (argno)
935 s << ")";
936
937 s << (is_defn ? "\n" : ";\n");
938 }
939
940 /* Write a .func or .entry declaration or definition along with
941 a helper comment for use by ld. S is the stream to write to, DECL
942 the decl for the function with name NAME. For definitions, emit
943 a declaration too. */
944
945 static void
946 write_fn_proto (std::stringstream &s, bool is_defn,
947 const char *name, const_tree decl)
948 {
949 const char *replacement = nvptx_name_replacement (name);
950 char *replaced_dots = NULL;
951 if (replacement != name)
952 name = replacement;
953 else
954 {
955 replaced_dots = nvptx_replace_dot (name);
956 if (replaced_dots)
957 name = replaced_dots;
958 }
959 if (name[0] == '*')
960 name++;
961
962 if (is_defn)
963 /* Emit a declaration. The PTX assembler gets upset without it. */
964 write_fn_proto_1 (s, false, name, decl);
965
966 write_fn_proto_1 (s, is_defn, name, decl);
967
968 if (replaced_dots)
969 XDELETE (replaced_dots);
970 }
971
972 /* Construct a function declaration from a call insn. This can be
973 necessary for two reasons - either we have an indirect call which
974 requires a .callprototype declaration, or we have a libcall
975 generated by emit_library_call for which no decl exists. */
976
977 static void
978 write_fn_proto_from_insn (std::stringstream &s, const char *name,
979 rtx result, rtx pat)
980 {
981 char *replaced_dots = NULL;
982
983 if (!name)
984 {
985 s << "\t.callprototype ";
986 name = "_";
987 }
988 else
989 {
990 const char *replacement = nvptx_name_replacement (name);
991 if (replacement != name)
992 name = replacement;
993 else
994 {
995 replaced_dots = nvptx_replace_dot (name);
996 if (replaced_dots)
997 name = replaced_dots;
998 }
999 write_fn_marker (s, false, true, name);
1000 s << "\t.extern .func ";
1001 }
1002
1003 if (result != NULL_RTX)
1004 write_return_mode (s, true, GET_MODE (result));
1005
1006 s << name;
1007 if (replaced_dots)
1008 XDELETE (replaced_dots);
1009
1010 int arg_end = XVECLEN (pat, 0);
1011 for (int i = 1; i < arg_end; i++)
1012 {
1013 /* We don't have to deal with mode splitting & promotion here,
1014 as that was already done when generating the call
1015 sequence. */
1016 machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
1017
1018 write_arg_mode (s, -1, i - 1, mode);
1019 }
1020 if (arg_end != 1)
1021 s << ")";
1022 s << ";\n";
1023 }
1024
1025 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
1026 table and write a ptx prototype. These are emitted at end of
1027 compilation. */
1028
1029 static void
1030 nvptx_record_fndecl (tree decl)
1031 {
1032 tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
1033 if (*slot == NULL)
1034 {
1035 *slot = decl;
1036 const char *name = get_fnname_from_decl (decl);
1037 write_fn_proto (func_decls, false, name, decl);
1038 }
1039 }
1040
1041 /* Record a libcall or unprototyped external function. CALLEE is the
1042 SYMBOL_REF. Insert into the libfunc hash table and emit a ptx
1043 declaration for it. */
1044
1045 static void
1046 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
1047 {
1048 rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1049 if (*slot == NULL)
1050 {
1051 *slot = callee;
1052
1053 const char *name = XSTR (callee, 0);
1054 write_fn_proto_from_insn (func_decls, name, retval, pat);
1055 }
1056 }
1057
1058 /* DECL is an external FUNCTION_DECL that we're referencing. If it
1059 is prototyped, record it now. Otherwise record it as needed at end
1060 of compilation, when we might have more information about it. */
1061
1062 void
1063 nvptx_record_needed_fndecl (tree decl)
1064 {
1065 if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1066 {
1067 tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1068 if (*slot == NULL)
1069 *slot = decl;
1070 }
1071 else
1072 nvptx_record_fndecl (decl);
1073 }
1074
1075 /* SYM is a SYMBOL_REF. If it refers to an external function, record
1076 it as needed. */
1077
1078 static void
1079 nvptx_maybe_record_fnsym (rtx sym)
1080 {
1081 tree decl = SYMBOL_REF_DECL (sym);
1082
1083 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1084 nvptx_record_needed_fndecl (decl);
1085 }
1086
1087 /* Emit a local array to hold some part of a conventional stack frame
1088 and initialize REGNO to point to it. If the size is zero, it'll
1089 never be valid to dereference, so we can simply initialize to
1090 zero. */
1091
1092 static void
1093 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1094 {
1095 if (size)
1096 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1097 align, reg_names[regno], size);
1098 fprintf (file, "\t.reg.u%d %s;\n",
1099 POINTER_SIZE, reg_names[regno]);
1100 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1101 : "\tmov.u%d %s, 0;\n"),
1102 POINTER_SIZE, reg_names[regno], reg_names[regno]);
1103 }
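/* Sketch of the output for a 16-byte frame with 8-byte alignment on a
   64-bit target, assuming REGNO prints as %frame:
       .local .align 8 .b8 %frame_ar[16];
       .reg.u64 %frame;
       cvta.local.u64 %frame, %frame_ar;  */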
1104
1105 /* Emit soft stack frame setup sequence. */
1106
1107 static void
1108 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1109 {
1110 /* Maintain 64-bit stack alignment. */
1111 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1112 size = ROUND_UP (size, keep_align);
1113 int bits = POINTER_SIZE;
1114 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1115 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1116 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1117 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1118 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1119 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1120 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1121 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1122 fprintf (file, "\t{\n");
1123 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1124 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1125 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1126 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1127 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1128 bits == 64 ? ".wide" : ".lo", bits / 8);
1129 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1130
1131 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1132 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1133
1134 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1135 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1136 bits, reg_sspprev, reg_sspslot);
1137
1138 /* Initialize %frame = %sspprev - size. */
1139 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1140 bits, reg_frame, reg_sspprev, size);
1141
1142 /* Apply alignment, if larger than 64. */
1143 if (alignment > keep_align)
1144 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1145 bits, reg_frame, reg_frame, -alignment);
1146
1147 size = crtl->outgoing_args_size;
1148 gcc_assert (size % keep_align == 0);
1149
1150 /* Initialize %stack. */
1151 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1152 bits, reg_stack, reg_frame, size);
1153
1154 if (!crtl->is_leaf)
1155 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1156 bits, reg_sspslot, reg_stack);
1157 fprintf (file, "\t}\n");
1158 cfun->machine->has_softstack = true;
1159 need_softstack_decl = true;
1160 }
1161
1162 /* Emit code to initialize the REGNO predicate register to indicate
1163 whether we are not lane zero on the NAME axis. */
1164
1165 static void
1166 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1167 {
1168 fprintf (file, "\t{\n");
1169 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1170 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1171 {
1172 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1173 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1174 }
1175 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1176 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1177 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1178 {
1179 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1180 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1181 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1182 "// vector reduction buffer\n",
1183 REGNO (cfun->machine->red_partition),
1184 vector_red_partition);
1185 }
1186 /* Verify vector_red_size. */
1187 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1188 <= vector_red_size);
1189 fprintf (file, "\t}\n");
1190 }
1191
1192 /* Emit code to initialize OpenACC worker broadcast and synchronization
1193 registers. */
1194
1195 static void
1196 nvptx_init_oacc_workers (FILE *file)
1197 {
1198 fprintf (file, "\t{\n");
1199 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1200 if (cfun->machine->bcast_partition)
1201 {
1202 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1203 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1204 }
1205 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1206 if (cfun->machine->bcast_partition)
1207 {
1208 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1209 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1210 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1211 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1212 "// vector broadcast offset\n",
1213 REGNO (cfun->machine->bcast_partition),
1214 oacc_bcast_partition);
1215 }
1216 /* Verify oacc_bcast_size. */
1217 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1218 <= oacc_bcast_size);
1219 if (cfun->machine->sync_bar)
1220 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1221 "// vector synchronization barrier\n",
1222 REGNO (cfun->machine->sync_bar));
1223 fprintf (file, "\t}\n");
1224 }
1225
1226 /* Emit code to initialize predicate and master lane index registers for
1227 -muniform-simt code generation variant. */
1228
1229 static void
1230 nvptx_init_unisimt_predicate (FILE *file)
1231 {
1232 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1233 int loc = REGNO (cfun->machine->unisimt_location);
1234 int bits = POINTER_SIZE;
1235 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1236 fprintf (file, "\t{\n");
1237 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1238 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1239 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1240 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1241 bits == 64 ? ".wide" : ".lo");
1242 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1243 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1244 if (cfun->machine->unisimt_predicate)
1245 {
1246 int master = REGNO (cfun->machine->unisimt_master);
1247 int pred = REGNO (cfun->machine->unisimt_predicate);
1248 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1249 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1250 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1251 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1252 /* Compute predicate as 'tid.x == master'. */
1253 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1254 }
1255 fprintf (file, "\t}\n");
1256 need_unisimt_decl = true;
1257 }
1258
1259 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1260
1261 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1262 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1263 {
1264 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1265 __nvptx_uni[tid.y] = 0;
1266 gomp_nvptx_main (ORIG, arg);
1267 }
1268 ORIG itself should not be emitted as a PTX .entry function. */
1269
1270 static void
1271 write_omp_entry (FILE *file, const char *name, const char *orig)
1272 {
1273 static bool gomp_nvptx_main_declared;
1274 if (!gomp_nvptx_main_declared)
1275 {
1276 gomp_nvptx_main_declared = true;
1277 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1278 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1279 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1280 }
1281 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1282 #define NTID_Y "%ntid.y"
1283 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1284 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1285 {\n\
1286 .reg.u32 %r<3>;\n\
1287 .reg.u" PS " %R<4>;\n\
1288 mov.u32 %r0, %tid.y;\n\
1289 mov.u32 %r1, " NTID_Y ";\n\
1290 mov.u32 %r2, %ctaid.x;\n\
1291 cvt.u" PS ".u32 %R1, %r0;\n\
1292 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1293 mov.u" PS " %R0, __nvptx_stacks;\n\
1294 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1295 ld.param.u" PS " %R2, [%stack];\n\
1296 ld.param.u" PS " %R3, [%sz];\n\
1297 add.u" PS " %R2, %R2, %R3;\n\
1298 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1299 st.shared.u" PS " [%R0], %R2;\n\
1300 mov.u" PS " %R0, __nvptx_uni;\n\
1301 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1302 mov.u32 %r0, 0;\n\
1303 st.shared.u32 [%R0], %r0;\n\
1304 mov.u" PS " %R0, \0;\n\
1305 ld.param.u" PS " %R1, [%arg];\n\
1306 {\n\
1307 .param.u" PS " %P<2>;\n\
1308 st.param.u" PS " [%P0], %R0;\n\
1309 st.param.u" PS " [%P1], %R1;\n\
1310 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1311 }\n\
1312 ret.uni;\n\
1313 }\n"
1314 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1315 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1316 #undef ENTRY_TEMPLATE
1317 #undef NTID_Y
1318 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1319 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1320 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1321 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1322 need_softstack_decl = need_unisimt_decl = true;
1323 }
1324
1325 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1326 function, including local var decls and copies from the arguments to
1327 local regs. */
1328
1329 void
1330 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1331 {
1332 tree fntype = TREE_TYPE (decl);
1333 tree result_type = TREE_TYPE (fntype);
1334 int argno = 0;
1335
1336 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1337 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1338 {
1339 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1340 sprintf (buf, "%s$impl", name);
1341 write_omp_entry (file, name, buf);
1342 name = buf;
1343 }
1344 /* We construct the initial part of the function into a string
1345 stream, in order to share the prototype writing code. */
1346 std::stringstream s;
1347 write_fn_proto (s, true, name, decl);
1348 s << "{\n";
1349
1350 bool return_in_mem = write_return_type (s, false, result_type);
1351 if (return_in_mem)
1352 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1353
1354 /* Declare and initialize incoming arguments. */
1355 tree args = TYPE_ARG_TYPES (fntype);
1356 bool prototyped = true;
1357 if (!args)
1358 {
1359 args = DECL_ARGUMENTS (decl);
1360 prototyped = false;
1361 }
1362
1363 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1364 {
1365 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1366
1367 argno = write_arg_type (s, 0, argno, type, prototyped);
1368 }
1369
1370 if (stdarg_p (fntype))
1371 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1372 true);
1373
1374 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1375 write_arg_type (s, STATIC_CHAIN_REGNUM,
1376 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1377 true);
1378
1379 fprintf (file, "%s", s.str().c_str());
1380
1381 /* Usually 'crtl->is_leaf' is computed during register allocator
1382 initialization (which is not done on NVPTX) or for pressure-sensitive
1383 optimizations. Initialize it here, except if already set. */
1384 if (!crtl->is_leaf)
1385 crtl->is_leaf = leaf_function_p ();
1386
1387 HOST_WIDE_INT sz = get_frame_size ();
1388 bool need_frameptr = sz || cfun->machine->has_chain;
1389 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1390 if (!TARGET_SOFT_STACK)
1391 {
1392 /* Declare a local var for outgoing varargs. */
1393 if (cfun->machine->has_varadic)
1394 init_frame (file, STACK_POINTER_REGNUM,
1395 UNITS_PER_WORD, crtl->outgoing_args_size);
1396
1397 /* Declare a local variable for the frame. Force its size to be
1398 DImode-compatible. */
1399 if (need_frameptr)
1400 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1401 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1402 }
1403 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1404 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1405 init_softstack_frame (file, alignment, sz);
1406
1407 if (cfun->machine->has_simtreg)
1408 {
1409 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1410 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1411 align = MAX (align, GET_MODE_SIZE (DImode));
1412 if (!crtl->is_leaf || cfun->calls_alloca)
1413 simtsz = HOST_WIDE_INT_M1U;
1414 if (simtsz == HOST_WIDE_INT_M1U)
1415 simtsz = nvptx_softstack_size;
1416 if (cfun->machine->has_softstack)
1417 simtsz += POINTER_SIZE / 8;
1418 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1419 if (align > GET_MODE_SIZE (DImode))
1420 simtsz += align - GET_MODE_SIZE (DImode);
1421 if (simtsz)
1422 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1423 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1424 }
1425
1426 /* Restore the vector reduction partition register, if necessary.
1427 FIXME: Find out when and why this is necessary, and fix it. */
1428 if (cfun->machine->red_partition)
1429 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1430 = cfun->machine->red_partition;
1431
1432 /* Declare the pseudos we have as ptx registers. */
1433 int maxregs = max_reg_num ();
1434 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1435 {
1436 if (regno_reg_rtx[i] != const0_rtx)
1437 {
1438 machine_mode mode = PSEUDO_REGNO_MODE (i);
1439 machine_mode split = maybe_split_mode (mode);
1440
1441 if (split_mode_p (mode))
1442 mode = split;
1443 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1444 output_reg (file, i, split, -2);
1445 fprintf (file, ";\n");
1446 }
1447 }
1448
1449 /* Emit axis predicates. */
1450 if (cfun->machine->axis_predicate[0])
1451 nvptx_init_axis_predicate (file,
1452 REGNO (cfun->machine->axis_predicate[0]), "y");
1453 if (cfun->machine->axis_predicate[1])
1454 nvptx_init_axis_predicate (file,
1455 REGNO (cfun->machine->axis_predicate[1]), "x");
1456 if (cfun->machine->unisimt_predicate
1457 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1458 nvptx_init_unisimt_predicate (file);
1459 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1460 nvptx_init_oacc_workers (file);
1461 }
1462
1463 /* Output code for switching uniform-simt state. ENTERING indicates whether
1464 we are entering or leaving a non-uniform execution region. */
1465
1466 static void
1467 nvptx_output_unisimt_switch (FILE *file, bool entering)
1468 {
1469 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1470 return;
1471 fprintf (file, "\t{\n");
1472 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1473 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1474 if (!crtl->is_leaf)
1475 {
1476 int loc = REGNO (cfun->machine->unisimt_location);
1477 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1478 }
1479 if (cfun->machine->unisimt_predicate)
1480 {
1481 int master = REGNO (cfun->machine->unisimt_master);
1482 int pred = REGNO (cfun->machine->unisimt_predicate);
1483 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1484 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1485 master, entering ? "%ustmp2" : "0");
1486 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1487 }
1488 fprintf (file, "\t}\n");
1489 }
1490
1491 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1492 ENTERING indicates whether we are entering or leaving non-uniform execution.
1493 PTR is the register pointing to allocated storage, it is assigned to on
1494 entering and used to restore state on leaving. SIZE and ALIGN are used only
1495 on entering. */
1496
1497 static void
1498 nvptx_output_softstack_switch (FILE *file, bool entering,
1499 rtx ptr, rtx size, rtx align)
1500 {
1501 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1502 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1503 return;
1504 int bits = POINTER_SIZE, regno = REGNO (ptr);
1505 fprintf (file, "\t{\n");
1506 if (entering)
1507 {
1508 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1509 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1510 cfun->machine->simt_stack_size);
1511 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1512 if (CONST_INT_P (size))
1513 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1514 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1515 else
1516 output_reg (file, REGNO (size), VOIDmode);
1517 fputs (";\n", file);
1518 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1519 fprintf (file,
1520 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1521 bits, regno, regno, UINTVAL (align));
1522 }
1523 if (cfun->machine->has_softstack)
1524 {
1525 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1526 if (entering)
1527 {
1528 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1529 bits, regno, bits / 8, reg_stack);
1530 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1531 bits, reg_stack, regno, bits / 8);
1532 }
1533 else
1534 {
1535 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1536 bits, reg_stack, regno, bits / 8);
1537 }
1538 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1539 }
1540 fprintf (file, "\t}\n");
1541 }
1542
1543 /* Output code to enter non-uniform execution region. DEST is a register
1544 to hold a per-lane allocation given by SIZE and ALIGN. */
1545
1546 const char *
1547 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1548 {
1549 nvptx_output_unisimt_switch (asm_out_file, true);
1550 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1551 return "";
1552 }
1553
1554 /* Output code to leave non-uniform execution region. SRC is the register
1555 holding per-lane storage previously allocated by omp_simt_enter insn. */
1556
1557 const char *
1558 nvptx_output_simt_exit (rtx src)
1559 {
1560 nvptx_output_unisimt_switch (asm_out_file, false);
1561 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1562 return "";
1563 }
1564
1565 /* Output instruction that sets soft stack pointer in shared memory to the
1566 value in register given by SRC_REGNO. */
1567
1568 const char *
1569 nvptx_output_set_softstack (unsigned src_regno)
1570 {
1571 if (cfun->machine->has_softstack && !crtl->is_leaf)
1572 {
1573 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1574 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1575 output_reg (asm_out_file, src_regno, VOIDmode);
1576 fprintf (asm_out_file, ";\n");
1577 }
1578 return "";
1579 }
1580 /* Output a return instruction. Also copy the return value to its outgoing
1581 location. */
1582
1583 const char *
1584 nvptx_output_return (void)
1585 {
1586 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1587
1588 if (mode != VOIDmode)
1589 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1590 nvptx_ptx_type_from_mode (mode, false),
1591 reg_names[NVPTX_RETURN_REGNUM],
1592 reg_names[NVPTX_RETURN_REGNUM]);
1593
1594 return "ret;";
1595 }
1596
1597 /* Terminate a function by writing a closing brace to FILE. */
1598
1599 void
1600 nvptx_function_end (FILE *file)
1601 {
1602 fprintf (file, "}\n");
1603 }
1604
1605 /* Decide whether we can make a sibling call to a function. For ptx, we
1606 can't. */
1607
1608 static bool
1609 nvptx_function_ok_for_sibcall (tree, tree)
1610 {
1611 return false;
1612 }
1613
1614 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1615
1616 static rtx
1617 nvptx_get_drap_rtx (void)
1618 {
1619 if (TARGET_SOFT_STACK && stack_realign_drap)
1620 return arg_pointer_rtx;
1621 return NULL_RTX;
1622 }
1623
1624 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1625 argument to the next call. */
1626
1627 static void
1628 nvptx_call_args (rtx arg, tree fntype)
1629 {
1630 if (!cfun->machine->doing_call)
1631 {
1632 cfun->machine->doing_call = true;
1633 cfun->machine->is_varadic = false;
1634 cfun->machine->num_args = 0;
1635
1636 if (fntype && stdarg_p (fntype))
1637 {
1638 cfun->machine->is_varadic = true;
1639 cfun->machine->has_varadic = true;
1640 cfun->machine->num_args++;
1641 }
1642 }
1643
1644 if (REG_P (arg) && arg != pc_rtx)
1645 {
1646 cfun->machine->num_args++;
1647 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1648 cfun->machine->call_args);
1649 }
1650 }
1651
1652 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1653 information we recorded. */
1654
1655 static void
1656 nvptx_end_call_args (void)
1657 {
1658 cfun->machine->doing_call = false;
1659 free_EXPR_LIST_list (&cfun->machine->call_args);
1660 }
1661
1662 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1663 track of whether calls involving static chains or varargs were seen
1664 in the current function.
1665 For libcalls, maintain a hash table of decls we have seen, and
1666 record a function decl for later when encountering a new one. */
1667
1668 void
1669 nvptx_expand_call (rtx retval, rtx address)
1670 {
1671 rtx callee = XEXP (address, 0);
1672 rtx varargs = NULL_RTX;
1673 unsigned parallel = 0;
1674
1675 if (!call_insn_operand (callee, Pmode))
1676 {
1677 callee = force_reg (Pmode, callee);
1678 address = change_address (address, QImode, callee);
1679 }
1680
1681 if (GET_CODE (callee) == SYMBOL_REF)
1682 {
1683 tree decl = SYMBOL_REF_DECL (callee);
1684 if (decl != NULL_TREE)
1685 {
1686 if (DECL_STATIC_CHAIN (decl))
1687 cfun->machine->has_chain = true;
1688
1689 tree attr = oacc_get_fn_attrib (decl);
1690 if (attr)
1691 {
1692 tree dims = TREE_VALUE (attr);
1693
1694 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1695 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1696 {
1697 if (TREE_PURPOSE (dims)
1698 && !integer_zerop (TREE_PURPOSE (dims)))
1699 break;
1700 /* Not on this axis. */
1701 parallel ^= GOMP_DIM_MASK (ix);
1702 dims = TREE_CHAIN (dims);
1703 }
1704 }
1705 }
1706 }
1707
1708 unsigned nargs = cfun->machine->num_args;
1709 if (cfun->machine->is_varadic)
1710 {
1711 varargs = gen_reg_rtx (Pmode);
1712 emit_move_insn (varargs, stack_pointer_rtx);
1713 }
1714
1715 rtvec vec = rtvec_alloc (nargs + 1);
1716 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1717 int vec_pos = 0;
1718
1719 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1720 rtx tmp_retval = retval;
1721 if (retval)
1722 {
1723 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1724 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1725 call = gen_rtx_SET (tmp_retval, call);
1726 }
1727 XVECEXP (pat, 0, vec_pos++) = call;
1728
1729 /* Construct the call insn, including a USE for each argument pseudo
1730 register. These will be used when printing the insn. */
1731 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1732 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1733
1734 if (varargs)
1735 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1736
1737 gcc_assert (vec_pos == XVECLEN (pat, 0));
1738
1739 nvptx_emit_forking (parallel, true);
1740 emit_call_insn (pat);
1741 nvptx_emit_joining (parallel, true);
1742
1743 if (tmp_retval != retval)
1744 emit_move_insn (retval, tmp_retval);
1745 }
1746
1747 /* Emit a comparison COMPARE, and return the new test to be used in the
1748 jump. */
1749
1750 rtx
1751 nvptx_expand_compare (rtx compare)
1752 {
1753 rtx pred = gen_reg_rtx (BImode);
1754 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1755 XEXP (compare, 0), XEXP (compare, 1));
1756 emit_insn (gen_rtx_SET (pred, cmp));
1757 return gen_rtx_NE (BImode, pred, const0_rtx);
1758 }
1759
1760 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1761
1762 void
1763 nvptx_expand_oacc_fork (unsigned mode)
1764 {
1765 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1766 }
1767
1768 void
1769 nvptx_expand_oacc_join (unsigned mode)
1770 {
1771 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1772 }
1773
1774 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1775 objects. */
1776
1777 static rtx
1778 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1779 {
1780 rtx res;
1781
1782 switch (GET_MODE (src))
1783 {
1784 case E_DImode:
1785 res = gen_unpackdisi2 (dst0, dst1, src);
1786 break;
1787 case E_DFmode:
1788 res = gen_unpackdfsi2 (dst0, dst1, src);
1789 break;
1790 default: gcc_unreachable ();
1791 }
1792 return res;
1793 }
1794
1795 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1796 object. */
1797
1798 static rtx
1799 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1800 {
1801 rtx res;
1802
1803 switch (GET_MODE (dst))
1804 {
1805 case E_DImode:
1806 res = gen_packsidi2 (dst, src0, src1);
1807 break;
1808 case E_DFmode:
1809 res = gen_packsidf2 (dst, src0, src1);
1810 break;
1811 default: gcc_unreachable ();
1812 }
1813 return res;
1814 }
1815
1816 /* Generate an instruction or sequence to shuffle register SRC into DST
1817 across the lanes of a single warp, using index IDX and shuffle kind KIND. */
1818
1819 rtx
1820 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1821 {
1822 rtx res;
1823
1824 switch (GET_MODE (dst))
1825 {
1826 case E_SImode:
1827 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1828 break;
1829 case E_SFmode:
1830 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1831 break;
1832 case E_DImode:
1833 case E_DFmode:
1834 {
1835 rtx tmp0 = gen_reg_rtx (SImode);
1836 rtx tmp1 = gen_reg_rtx (SImode);
1837
1838 start_sequence ();
1839 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1840 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1841 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1842 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1843 res = get_insns ();
1844 end_sequence ();
1845 }
1846 break;
1847 case E_V2SImode:
1848 {
1849 rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
1850 rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
1851 rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
1852 rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
1853 rtx tmp0 = gen_reg_rtx (SImode);
1854 rtx tmp1 = gen_reg_rtx (SImode);
1855 start_sequence ();
1856 emit_insn (gen_movsi (tmp0, src0));
1857 emit_insn (gen_movsi (tmp1, src1));
1858 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1859 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1860 emit_insn (gen_movsi (dst0, tmp0));
1861 emit_insn (gen_movsi (dst1, tmp1));
1862 res = get_insns ();
1863 end_sequence ();
1864 }
1865 break;
1866 case E_V2DImode:
1867 {
1868 rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
1869 rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
1870 rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
1871 rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
1872 rtx tmp0 = gen_reg_rtx (DImode);
1873 rtx tmp1 = gen_reg_rtx (DImode);
1874 start_sequence ();
1875 emit_insn (gen_movdi (tmp0, src0));
1876 emit_insn (gen_movdi (tmp1, src1));
1877 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1878 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1879 emit_insn (gen_movdi (dst0, tmp0));
1880 emit_insn (gen_movdi (dst1, tmp1));
1881 res = get_insns ();
1882 end_sequence ();
1883 }
1884 break;
1885 case E_BImode:
1886 {
1887 rtx tmp = gen_reg_rtx (SImode);
1888
1889 start_sequence ();
1890 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1891 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1892 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1893 res = get_insns ();
1894 end_sequence ();
1895 }
1896 break;
1897 case E_QImode:
1898 case E_HImode:
1899 {
1900 rtx tmp = gen_reg_rtx (SImode);
1901
1902 start_sequence ();
1903 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1904 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1905 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1906 tmp)));
1907 res = get_insns ();
1908 end_sequence ();
1909 }
1910 break;
1911
1912 default:
1913 gcc_unreachable ();
1914 }
1915 return res;
1916 }
1917
1918 /* Generate an instruction or sequence to broadcast register REG
1919 across the vectors of a single warp. */
1920
1921 static rtx
1922 nvptx_gen_warp_bcast (rtx reg)
1923 {
1924 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1925 }
1926
1927 /* Structure used when generating a worker-level spill or fill. */
1928
1929 struct broadcast_data_t
1930 {
1931 rtx base; /* Register holding base addr of buffer. */
1932 rtx ptr; /* Iteration var, if needed. */
1933 unsigned offset; /* Offset into worker buffer. */
1934 };
1935
1936 /* Direction of the spill/fill and looping setup/teardown indicator. */
1937
1938 enum propagate_mask
1939 {
1940 PM_read = 1 << 0,
1941 PM_write = 1 << 1,
1942 PM_loop_begin = 1 << 2,
1943 PM_loop_end = 1 << 3,
1944
1945 PM_read_write = PM_read | PM_write
1946 };
1947
1948 /* Generate instruction(s) to spill or fill register REG to/from the
1949 worker broadcast array. PM indicates what is to be done, REP
1950 how many loop iterations will be executed (0 for not a loop). */
1951
1952 static rtx
1953 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1954 broadcast_data_t *data, bool vector)
1955 {
1956 rtx res;
1957 machine_mode mode = GET_MODE (reg);
1958
1959 switch (mode)
1960 {
1961 case E_BImode:
1962 {
1963 rtx tmp = gen_reg_rtx (SImode);
1964
1965 start_sequence ();
1966 if (pm & PM_read)
1967 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1968 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1969 if (pm & PM_write)
1970 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1971 res = get_insns ();
1972 end_sequence ();
1973 }
1974 break;
1975
1976 default:
1977 {
1978 rtx addr = data->ptr;
1979
1980 if (!addr)
1981 {
1982 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1983
1984 oacc_bcast_align = MAX (oacc_bcast_align, align);
1985 data->offset = ROUND_UP (data->offset, align);
1986 addr = data->base;
1987 gcc_assert (data->base != NULL);
1988 if (data->offset)
1989 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1990 }
1991
1992 addr = gen_rtx_MEM (mode, addr);
1993 if (pm == PM_read)
1994 res = gen_rtx_SET (addr, reg);
1995 else if (pm == PM_write)
1996 res = gen_rtx_SET (reg, addr);
1997 else
1998 gcc_unreachable ();
1999
2000 if (data->ptr)
2001 {
2002 /* We're using a ptr, increment it. */
2003 start_sequence ();
2004
2005 emit_insn (res);
2006 emit_insn (gen_adddi3 (data->ptr, data->ptr,
2007 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
2008 res = get_insns ();
2009 end_sequence ();
2010 }
2011 else
2012 rep = 1;
2013 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
2014 }
2015 break;
2016 }
2017 return res;
2018 }
2019
2020 /* Returns true if X is a valid address for use in a memory reference. */
2021
2022 static bool
2023 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2024 {
2025 enum rtx_code code = GET_CODE (x);
2026
2027 switch (code)
2028 {
2029 case REG:
2030 return true;
2031
2032 case PLUS:
2033 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2034 return true;
2035 return false;
2036
2037 case CONST:
2038 case SYMBOL_REF:
2039 case LABEL_REF:
2040 return true;
2041
2042 default:
2043 return false;
2044 }
2045 }
2046
2047 /* Machinery to output constant initializers. When beginning an
2048 initializer, we decide on a fragment size (which is visible in ptx
2049 in the type used), and then all initializer data is buffered until
2050 a fragment is filled and ready to be written out. */
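/* A hedged illustration (fragment size and byte values hypothetical):
   with a 4-byte fragment, assembling the bytes 0x11, 0x22, 0x33, 0x44
   one at a time accumulates init_frag.val = 0x44332211 (each byte is
   shifted to the current offset); once the fragment is full,
   output_init_frag prints it as the next decimal element of the
   initializer list, e.g. " = { 1144201745".  */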
2051
2052 static struct
2053 {
2054 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
2055 unsigned HOST_WIDE_INT val; /* Current fragment value. */
2056 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
2057 out. */
2058 unsigned size; /* Fragment size to accumulate. */
2059 unsigned offset; /* Offset within current fragment. */
2060 bool started; /* Whether we've output any initializer. */
2061 } init_frag;
2062
2063 /* The current fragment is full; write it out. SYM may provide a
2064 symbolic reference we should output, in which case the fragment
2065 value is the addend. */
2066
2067 static void
2068 output_init_frag (rtx sym)
2069 {
2070 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
2071 unsigned HOST_WIDE_INT val = init_frag.val;
2072
2073 init_frag.started = true;
2074 init_frag.val = 0;
2075 init_frag.offset = 0;
2076 init_frag.remaining--;
2077
2078 if (sym)
2079 {
2080 bool function = (SYMBOL_REF_DECL (sym)
2081 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2082 if (!function)
2083 fprintf (asm_out_file, "generic(");
2084 output_address (VOIDmode, sym);
2085 if (!function)
2086 fprintf (asm_out_file, ")");
2087 if (val)
2088 fprintf (asm_out_file, " + ");
2089 }
2090
2091 if (!sym || val)
2092 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2093 }
2094
2095 /* Add value VAL of size SIZE to the data we're emitting, and keep
2096 writing out chunks as they fill up. */
2097
2098 static void
2099 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2100 {
2101 bool negative_p
2102 = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));
2103
2104 /* Avoid undefined behaviour. */
2105 if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
2106 val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;
2107
2108 for (unsigned part = 0; size; size -= part)
2109 {
2110 if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
2111 /* Avoid undefined behaviour. */
2112 val = negative_p ? -1 : 0;
2113 else
2114 val >>= (part * BITS_PER_UNIT);
2115 part = init_frag.size - init_frag.offset;
2116 part = MIN (part, size);
2117
2118 unsigned HOST_WIDE_INT partial
2119 = val << (init_frag.offset * BITS_PER_UNIT);
2120 init_frag.val |= partial & init_frag.mask;
2121 init_frag.offset += part;
2122
2123 if (init_frag.offset == init_frag.size)
2124 output_init_frag (NULL);
2125 }
2126 }
2127
2128 /* Target hook for assembling integer object X of size SIZE. */
2129
2130 static bool
2131 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2132 {
2133 HOST_WIDE_INT val = 0;
2134
2135 switch (GET_CODE (x))
2136 {
2137 default:
2138 /* Let the generic machinery figure it out, usually for a
2139 CONST_WIDE_INT. */
2140 return false;
2141
2142 case CONST_INT:
2143 nvptx_assemble_value (INTVAL (x), size);
2144 break;
2145
2146 case CONST:
2147 x = XEXP (x, 0);
2148 gcc_assert (GET_CODE (x) == PLUS);
2149 val = INTVAL (XEXP (x, 1));
2150 x = XEXP (x, 0);
2151 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2152 gcc_fallthrough (); /* FALLTHROUGH */
2153
2154 case SYMBOL_REF:
2155 gcc_assert (size == init_frag.size);
2156 if (init_frag.offset)
2157 sorry ("cannot emit unaligned pointers in ptx assembly");
2158
2159 nvptx_maybe_record_fnsym (x);
2160 init_frag.val = val;
2161 output_init_frag (x);
2162 break;
2163 }
2164
2165 return true;
2166 }
2167
2168 /* Output SIZE zero bytes. We ignore the FILE argument since the
2169 functions we're calling to perform the output just use
2170 asm_out_file. */
2171
2172 void
2173 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2174 {
2175 /* Finish the current fragment, if it's started. */
2176 if (init_frag.offset)
2177 {
2178 unsigned part = init_frag.size - init_frag.offset;
2179 part = MIN (part, (unsigned)size);
2180 size -= part;
2181 nvptx_assemble_value (0, part);
2182 }
2183
2184 /* If this skip doesn't terminate the initializer, write as many
2185 remaining pieces as possible directly. */
2186 if (size < init_frag.remaining * init_frag.size)
2187 {
2188 while (size >= init_frag.size)
2189 {
2190 size -= init_frag.size;
2191 output_init_frag (NULL_RTX);
2192 }
2193 if (size)
2194 nvptx_assemble_value (0, size);
2195 }
2196 }
2197
2198 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2199 ignore the FILE arg. */
2200
2201 void
2202 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2203 {
2204 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2205 nvptx_assemble_value (str[i], 1);
2206 }
2207
2208 /* Return true if TYPE is a record type where the last field is an array without
2209 a given dimension. */
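/* A hypothetical example of such a type:
     struct line { int length; char contents[]; };
   where "contents" is the trailing array without a given dimension.  */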
2210
2211 static bool
2212 flexible_array_member_type_p (const_tree type)
2213 {
2214 if (TREE_CODE (type) != RECORD_TYPE)
2215 return false;
2216
2217 const_tree last_field = NULL_TREE;
2218 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2219 last_field = f;
2220
2221 if (!last_field)
2222 return false;
2223
2224 const_tree last_field_type = TREE_TYPE (last_field);
2225 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2226 return false;
2227
2228 return (! TYPE_DOMAIN (last_field_type)
2229 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2230 }
2231
2232 /* Emit a PTX variable decl and prepare for emission of its
2233 initializer. NAME is the symbol name and SECTION the PTX data
2234 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2235 The caller has already emitted any indentation and linkage
2236 specifier. It is responsible for any initializer, terminating ;
2237 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2238 this is the opposite way round from how PTX wants them! */
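/* A hedged sketch of the output (names illustrative): for a file-scope
   "int x[4]" placed in the global data area with 4-byte alignment, the
   code below selects a 4-byte fragment and emits

       .global .align 4 .u32 x[4]

   leaving any initializer, the terminating ';' and the newline to the
   caller.  */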
2239
2240 static void
2241 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2242 const_tree type, HOST_WIDE_INT size, unsigned align,
2243 bool undefined = false)
2244 {
2245 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2246 && (TYPE_DOMAIN (type) == NULL_TREE);
2247
2248 if (undefined && flexible_array_member_type_p (type))
2249 {
2250 size = 0;
2251 atype = true;
2252 }
2253
2254 while (TREE_CODE (type) == ARRAY_TYPE)
2255 type = TREE_TYPE (type);
2256
2257 if (TREE_CODE (type) == VECTOR_TYPE
2258 || TREE_CODE (type) == COMPLEX_TYPE)
2259 /* Neither vector nor complex types can contain the other. */
2260 type = TREE_TYPE (type);
2261
2262 unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);
2263
2264 /* Largest mode we're prepared to accept. For BLKmode types we
2265 don't know if it'll contain pointer constants, so have to choose
2266 pointer size, otherwise we can choose DImode. */
2267 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2268
2269 elt_size |= GET_MODE_SIZE (elt_mode);
2270 elt_size &= -elt_size; /* Extract LSB set. */
2271
2272 init_frag.size = elt_size;
2273 /* Avoid undefined shift behavior by using '2'. */
2274 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2275 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2276 init_frag.val = 0;
2277 init_frag.offset = 0;
2278 init_frag.started = false;
2279 /* Size might not be a multiple of elt size, if there's an
2280 initialized trailing struct array with smaller type than
2281 elt_size. */
2282 init_frag.remaining = (size + elt_size - 1) / elt_size;
2283
2284 fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
2285 section, align / BITS_PER_UNIT,
2286 elt_size * BITS_PER_UNIT);
2287 assemble_name (file, name);
2288
2289 if (size)
2290 /* We make everything an array, to simplify any initialization
2291 emission. */
2292 fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
2293 else if (atype)
2294 fprintf (file, "[]");
2295 }
2296
2297 /* Called when the initializer for a decl has been completely output through
2298 combinations of the three functions above. */
2299
2300 static void
2301 nvptx_assemble_decl_end (void)
2302 {
2303 if (init_frag.offset)
2304 /* This can happen with a packed struct with trailing array member. */
2305 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2306 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2307 }
2308
2309 /* Output an uninitialized common or file-scope variable. */
2310
2311 void
2312 nvptx_output_aligned_decl (FILE *file, const char *name,
2313 const_tree decl, HOST_WIDE_INT size, unsigned align)
2314 {
2315 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2316
2317 /* If this is public, it is common. The nearest thing we have to
2318 common is weak. */
2319 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2320
2321 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2322 TREE_TYPE (decl), size, align);
2323 nvptx_assemble_decl_end ();
2324 }
2325
2326 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2327 writing a constant variable EXP with NAME and SIZE and its
2328 initializer to FILE. */
2329
2330 static void
2331 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2332 const_tree exp, HOST_WIDE_INT obj_size)
2333 {
2334 write_var_marker (file, true, false, name);
2335
2336 fprintf (file, "\t");
2337
2338 tree type = TREE_TYPE (exp);
2339 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2340 TYPE_ALIGN (type));
2341 }
2342
2343 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2344 a variable DECL with NAME to FILE. */
2345
2346 void
2347 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2348 {
2349 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2350
2351 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2352 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2353
2354 tree type = TREE_TYPE (decl);
2355 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2356 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2357 type, obj_size, DECL_ALIGN (decl));
2358 }
2359
2360 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2361
2362 static void
2363 nvptx_globalize_label (FILE *, const char *)
2364 {
2365 }
2366
2367 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2368 declaration only for variable DECL with NAME to FILE. */
2369
2370 static void
2371 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2372 {
2373 /* The middle end can place constant pool decls into the varpool as
2374 undefined. Until that is fixed, catch the problem here. */
2375 if (DECL_IN_CONSTANT_POOL (decl))
2376 return;
2377
2378 /* We support weak definitions, and hence have the right
2379 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2380 if (DECL_WEAK (decl))
2381 error_at (DECL_SOURCE_LOCATION (decl),
2382 "PTX does not support weak declarations"
2383 " (only weak definitions)");
2384 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2385
2386 fprintf (file, "\t.extern ");
2387 tree size = DECL_SIZE_UNIT (decl);
2388 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2389 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2390 DECL_ALIGN (decl), true);
2391 nvptx_assemble_decl_end ();
2392 }
2393
2394 /* Output a pattern for a move instruction. */
2395
2396 const char *
2397 nvptx_output_mov_insn (rtx dst, rtx src)
2398 {
2399 machine_mode dst_mode = GET_MODE (dst);
2400 machine_mode src_mode = GET_MODE (src);
2401 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2402 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2403 machine_mode src_inner = (GET_CODE (src) == SUBREG
2404 ? GET_MODE (XEXP (src, 0)) : dst_mode);
2405
2406 rtx sym = src;
2407 if (GET_CODE (sym) == CONST)
2408 sym = XEXP (XEXP (sym, 0), 0);
2409 if (SYMBOL_REF_P (sym))
2410 {
2411 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2412 return "%.\tcvta%D1%t0\t%0, %1;";
2413 nvptx_maybe_record_fnsym (sym);
2414 }
2415
2416 if (src_inner == dst_inner)
2417 return "%.\tmov%t0\t%0, %1;";
2418
2419 if (CONSTANT_P (src))
2420 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2421 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2422 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2423
2424 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2425 {
2426 if (GET_MODE_BITSIZE (dst_mode) == 128
2427 && GET_MODE_BITSIZE (src_mode) == 128)
2428 {
2429 /* mov.b128 is not supported. */
2430 if (dst_inner == V2DImode && src_inner == TImode)
2431 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2432 else if (dst_inner == TImode && src_inner == V2DImode)
2433 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2434
2435 gcc_unreachable ();
2436 }
2437 return "%.\tmov.b%T0\t%0, %1;";
2438 }
2439
2440 if (GET_MODE_BITSIZE (src_inner) == 128
2441 && GET_MODE_BITSIZE (src_mode) == 64)
2442 return "%.\tmov.b%T0\t%0, %1;";
2443
2444 return "%.\tcvt%t0%t1\t%0, %1;";
2445 }
2446
2447 static void nvptx_print_operand (FILE *, rtx, int);
2448
2449 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2450 involves writing .param declarations and in/out copies into them. For
2451 indirect calls, also write the .callprototype. */
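/* A hedged sketch of the emitted sequence for a direct call to
   "int foo (int)" (register names illustrative):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}
*/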
2452
2453 const char *
2454 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2455 {
2456 char buf[16];
2457 static int labelno;
2458 bool needs_tgt = register_operand (callee, Pmode);
2459 rtx pat = PATTERN (insn);
2460 if (GET_CODE (pat) == COND_EXEC)
2461 pat = COND_EXEC_CODE (pat);
2462 int arg_end = XVECLEN (pat, 0);
2463 tree decl = NULL_TREE;
2464
2465 fprintf (asm_out_file, "\t{\n");
2466 if (result != NULL)
2467 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2468 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2469 reg_names[NVPTX_RETURN_REGNUM]);
2470
2471 /* Ensure we have a ptx declaration in the output if necessary. */
2472 if (GET_CODE (callee) == SYMBOL_REF)
2473 {
2474 decl = SYMBOL_REF_DECL (callee);
2475 if (!decl
2476 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2477 nvptx_record_libfunc (callee, result, pat);
2478 else if (DECL_EXTERNAL (decl))
2479 nvptx_record_fndecl (decl);
2480 }
2481
2482 if (needs_tgt)
2483 {
2484 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2485 labelno++;
2486 ASM_OUTPUT_LABEL (asm_out_file, buf);
2487 std::stringstream s;
2488 write_fn_proto_from_insn (s, NULL, result, pat);
2489 fputs (s.str().c_str(), asm_out_file);
2490 }
2491
2492 for (int argno = 1; argno < arg_end; argno++)
2493 {
2494 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2495 machine_mode mode = GET_MODE (t);
2496 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2497
2498 /* Mode splitting has already been done. */
2499 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2500 "\t\tst.param%s [%%out_arg%d], ",
2501 ptx_type, argno, ptx_type, argno);
2502 output_reg (asm_out_file, REGNO (t), VOIDmode);
2503 fprintf (asm_out_file, ";\n");
2504 }
2505
2506 /* The '.' stands for the call's predicate, if any. */
2507 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2508 fprintf (asm_out_file, "\t\tcall ");
2509 if (result != NULL_RTX)
2510 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2511
2512 if (decl)
2513 {
2514 char *replaced_dots = NULL;
2515 const char *name = get_fnname_from_decl (decl);
2516 const char *replacement = nvptx_name_replacement (name);
2517 if (replacement != name)
2518 name = replacement;
2519 else
2520 {
2521 replaced_dots = nvptx_replace_dot (name);
2522 if (replaced_dots)
2523 name = replaced_dots;
2524 }
2525 assemble_name (asm_out_file, name);
2526 if (replaced_dots)
2527 XDELETE (replaced_dots);
2528 }
2529 else
2530 output_address (VOIDmode, callee);
2531
2532 const char *open = "(";
2533 for (int argno = 1; argno < arg_end; argno++)
2534 {
2535 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2536 open = "";
2537 }
2538 if (decl && DECL_STATIC_CHAIN (decl))
2539 {
2540 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2541 open = "";
2542 }
2543 if (!open[0])
2544 fprintf (asm_out_file, ")");
2545
2546 if (needs_tgt)
2547 {
2548 fprintf (asm_out_file, ", ");
2549 assemble_name (asm_out_file, buf);
2550 }
2551 fprintf (asm_out_file, ";\n");
2552
2553 if (find_reg_note (insn, REG_NORETURN, NULL))
2554 {
2555 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2556 the flow control barrier they imply. It can seg fault if it
2557 encounters what looks like an unexitable loop. Emit a trailing
2558 trap and exit, which it does grok. */
2559 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2560 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2561 }
2562
2563 if (result)
2564 {
2565 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2566
2567 if (!rval[0])
2568 /* We must escape the '%' that starts RETURN_REGNUM. */
2569 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2570 reg_names[NVPTX_RETURN_REGNUM]);
2571 return rval;
2572 }
2573
2574 return "}";
2575 }
2576
2577 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2578
2579 static bool
2580 nvptx_print_operand_punct_valid_p (unsigned char c)
2581 {
2582 return c == '.' || c== '#';
2583 }
2584
2585 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2586
2587 static void
2588 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2589 {
2590 rtx off;
2591 if (GET_CODE (x) == CONST)
2592 x = XEXP (x, 0);
2593 switch (GET_CODE (x))
2594 {
2595 case PLUS:
2596 off = XEXP (x, 1);
2597 output_address (VOIDmode, XEXP (x, 0));
2598 fprintf (file, "+");
2599 output_address (VOIDmode, off);
2600 break;
2601
2602 case SYMBOL_REF:
2603 case LABEL_REF:
2604 output_addr_const (file, x);
2605 break;
2606
2607 default:
2608 gcc_assert (GET_CODE (x) != MEM);
2609 nvptx_print_operand (file, x, 0);
2610 break;
2611 }
2612 }
2613
2614 /* Write assembly language output for the address ADDR to FILE. */
2615
2616 static void
2617 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2618 {
2619 nvptx_print_address_operand (file, addr, mode);
2620 }
2621
2622 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2623
2624 Meaning of CODE:
2625 . -- print the predicate for the instruction or an empty string for an
2626 unconditional one.
2627 # -- print a rounding mode for the instruction
2628
2629 A -- print a data area for a MEM
2630 c -- print an opcode suffix for a comparison operator, including a type code
2631 D -- print a data area for a MEM operand
2632 S -- print a shuffle kind specified by CONST_INT
2633 t -- print a type opcode suffix, promoting QImode to 32 bits
2634 T -- print a type size in bits
2635 u -- print a type opcode suffix without promotions. */
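/* A hedged example (modes illustrative): for an SImode operand 0,
   "%t0" prints ".u32" and "%T0" prints "32"; for a QImode operand,
   "%t0" still prints ".u32" (promoted) whereas "%u0" prints ".u8".  */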
2636
2637 static void
2638 nvptx_print_operand (FILE *file, rtx x, int code)
2639 {
2640 if (code == '.')
2641 {
2642 x = current_insn_predicate;
2643 if (x)
2644 {
2645 fputs ("@", file);
2646 if (GET_CODE (x) == EQ)
2647 fputs ("!", file);
2648 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2649 }
2650 return;
2651 }
2652 else if (code == '#')
2653 {
2654 fputs (".rn", file);
2655 return;
2656 }
2657
2658 enum rtx_code x_code = GET_CODE (x);
2659 machine_mode mode = GET_MODE (x);
2660
2661 switch (code)
2662 {
2663 case 'A':
2664 x = XEXP (x, 0);
2665 gcc_fallthrough (); /* FALLTHROUGH. */
2666
2667 case 'D':
2668 if (GET_CODE (x) == CONST)
2669 x = XEXP (x, 0);
2670 if (GET_CODE (x) == PLUS)
2671 x = XEXP (x, 0);
2672
2673 if (GET_CODE (x) == SYMBOL_REF)
2674 fputs (section_for_sym (x), file);
2675 break;
2676
2677 case 't':
2678 case 'u':
2679 if (x_code == SUBREG)
2680 {
2681 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2682 if (VECTOR_MODE_P (inner_mode)
2683 && (GET_MODE_SIZE (mode)
2684 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2685 mode = GET_MODE_INNER (inner_mode);
2686 else if (split_mode_p (inner_mode))
2687 mode = maybe_split_mode (inner_mode);
2688 else
2689 mode = inner_mode;
2690 }
2691 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2692 break;
2693
2694 case 'H':
2695 case 'L':
2696 {
2697 rtx inner_x = SUBREG_REG (x);
2698 machine_mode inner_mode = GET_MODE (inner_x);
2699 machine_mode split = maybe_split_mode (inner_mode);
2700
2701 output_reg (file, REGNO (inner_x), split,
2702 (code == 'H'
2703 ? GET_MODE_SIZE (inner_mode) / 2
2704 : 0));
2705 }
2706 break;
2707
2708 case 'S':
2709 {
2710 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2711 /* Same order as nvptx_shuffle_kind. */
2712 static const char *const kinds[] =
2713 {".up", ".down", ".bfly", ".idx"};
2714 fputs (kinds[kind], file);
2715 }
2716 break;
2717
2718 case 'T':
2719 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2720 break;
2721
2722 case 'j':
2723 fprintf (file, "@");
2724 goto common;
2725
2726 case 'J':
2727 fprintf (file, "@!");
2728 goto common;
2729
2730 case 'c':
2731 mode = GET_MODE (XEXP (x, 0));
2732 switch (x_code)
2733 {
2734 case EQ:
2735 fputs (".eq", file);
2736 break;
2737 case NE:
2738 if (FLOAT_MODE_P (mode))
2739 fputs (".neu", file);
2740 else
2741 fputs (".ne", file);
2742 break;
2743 case LE:
2744 case LEU:
2745 fputs (".le", file);
2746 break;
2747 case GE:
2748 case GEU:
2749 fputs (".ge", file);
2750 break;
2751 case LT:
2752 case LTU:
2753 fputs (".lt", file);
2754 break;
2755 case GT:
2756 case GTU:
2757 fputs (".gt", file);
2758 break;
2759 case LTGT:
2760 fputs (".ne", file);
2761 break;
2762 case UNEQ:
2763 fputs (".equ", file);
2764 break;
2765 case UNLE:
2766 fputs (".leu", file);
2767 break;
2768 case UNGE:
2769 fputs (".geu", file);
2770 break;
2771 case UNLT:
2772 fputs (".ltu", file);
2773 break;
2774 case UNGT:
2775 fputs (".gtu", file);
2776 break;
2777 case UNORDERED:
2778 fputs (".nan", file);
2779 break;
2780 case ORDERED:
2781 fputs (".num", file);
2782 break;
2783 default:
2784 gcc_unreachable ();
2785 }
2786 if (FLOAT_MODE_P (mode)
2787 || x_code == EQ || x_code == NE
2788 || x_code == GEU || x_code == GTU
2789 || x_code == LEU || x_code == LTU)
2790 fputs (nvptx_ptx_type_from_mode (mode, true), file);
2791 else
2792 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2793 break;
2794 default:
2795 common:
2796 switch (x_code)
2797 {
2798 case SUBREG:
2799 {
2800 rtx inner_x = SUBREG_REG (x);
2801 machine_mode inner_mode = GET_MODE (inner_x);
2802 machine_mode split = maybe_split_mode (inner_mode);
2803
2804 if (VECTOR_MODE_P (inner_mode)
2805 && (GET_MODE_SIZE (mode)
2806 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2807 {
2808 output_reg (file, REGNO (inner_x), VOIDmode);
2809 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2810 }
2811 else if (split_mode_p (inner_mode)
2812 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2813 output_reg (file, REGNO (inner_x), split);
2814 else
2815 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2816 }
2817 break;
2818
2819 case REG:
2820 output_reg (file, REGNO (x), maybe_split_mode (mode));
2821 break;
2822
2823 case MEM:
2824 fputc ('[', file);
2825 nvptx_print_address_operand (file, XEXP (x, 0), mode);
2826 fputc (']', file);
2827 break;
2828
2829 case CONST_INT:
2830 output_addr_const (file, x);
2831 break;
2832
2833 case CONST:
2834 case SYMBOL_REF:
2835 case LABEL_REF:
2836 /* We could use output_addr_const, but that can print things like
2837 "x-8", which breaks ptxas. Need to ensure it is output as
2838 "x+-8". */
2839 nvptx_print_address_operand (file, x, VOIDmode);
2840 break;
2841
2842 case CONST_DOUBLE:
2843 long vals[2];
2844 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2845 vals[0] &= 0xffffffff;
2846 vals[1] &= 0xffffffff;
2847 if (mode == SFmode)
2848 fprintf (file, "0f%08lx", vals[0]);
2849 else
2850 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2851 break;
2852
2853 case CONST_VECTOR:
2854 {
2855 unsigned n = CONST_VECTOR_NUNITS (x);
2856 fprintf (file, "{ ");
2857 for (unsigned i = 0; i < n; ++i)
2858 {
2859 if (i != 0)
2860 fprintf (file, ", ");
2861
2862 rtx elem = CONST_VECTOR_ELT (x, i);
2863 output_addr_const (file, elem);
2864 }
2865 fprintf (file, " }");
2866 }
2867 break;
2868
2869 default:
2870 output_addr_const (file, x);
2871 }
2872 }
2873 }
2874
2875 /* Record replacement regs used to deal with subreg operands. */
2876 struct reg_replace
2877 {
2878 rtx replacement[MAX_RECOG_OPERANDS];
2879 machine_mode mode;
2880 int n_allocated;
2881 int n_in_use;
2882 };
2883
2884 /* Allocate or reuse a replacement in R and return the rtx. */
2885
2886 static rtx
2887 get_replacement (struct reg_replace *r)
2888 {
2889 if (r->n_allocated == r->n_in_use)
2890 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2891 return r->replacement[r->n_in_use++];
2892 }
2893
2894 /* Clean up subreg operands. In ptx assembly, everything is typed, and
2895 the presence of subregs would break the rules for most instructions.
2896 Replace them with a suitable new register of the right size, plus
2897 conversion copyin/copyout instructions. */
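/* A hedged illustration (register numbers hypothetical): an input
   operand (subreg:SI (reg:QI 23) 0) is replaced by a fresh SImode
   register, with
     (set (reg:SI 24) (zero_extend:SI (reg:QI 23)))
   emitted before the insn; for an output operand the corresponding
   truncate is emitted after the insn to copy the value back.  */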
2898
2899 static void
2900 nvptx_reorg_subreg (void)
2901 {
2902 struct reg_replace qiregs, hiregs, siregs, diregs;
2903 rtx_insn *insn, *next;
2904
2905 qiregs.n_allocated = 0;
2906 hiregs.n_allocated = 0;
2907 siregs.n_allocated = 0;
2908 diregs.n_allocated = 0;
2909 qiregs.mode = QImode;
2910 hiregs.mode = HImode;
2911 siregs.mode = SImode;
2912 diregs.mode = DImode;
2913
2914 for (insn = get_insns (); insn; insn = next)
2915 {
2916 next = NEXT_INSN (insn);
2917 if (!NONDEBUG_INSN_P (insn)
2918 || asm_noperands (PATTERN (insn)) >= 0
2919 || GET_CODE (PATTERN (insn)) == USE
2920 || GET_CODE (PATTERN (insn)) == CLOBBER)
2921 continue;
2922
2923 qiregs.n_in_use = 0;
2924 hiregs.n_in_use = 0;
2925 siregs.n_in_use = 0;
2926 diregs.n_in_use = 0;
2927 extract_insn (insn);
2928 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2929
2930 for (int i = 0; i < recog_data.n_operands; i++)
2931 {
2932 rtx op = recog_data.operand[i];
2933 if (GET_CODE (op) != SUBREG)
2934 continue;
2935
2936 rtx inner = SUBREG_REG (op);
2937
2938 machine_mode outer_mode = GET_MODE (op);
2939 machine_mode inner_mode = GET_MODE (inner);
2940 gcc_assert (s_ok);
2941 if (s_ok
2942 && (GET_MODE_PRECISION (inner_mode)
2943 >= GET_MODE_PRECISION (outer_mode)))
2944 continue;
2945 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2946 struct reg_replace *r = (outer_mode == QImode ? &qiregs
2947 : outer_mode == HImode ? &hiregs
2948 : outer_mode == SImode ? &siregs
2949 : &diregs);
2950 rtx new_reg = get_replacement (r);
2951
2952 if (recog_data.operand_type[i] != OP_OUT)
2953 {
2954 enum rtx_code code;
2955 if (GET_MODE_PRECISION (inner_mode)
2956 < GET_MODE_PRECISION (outer_mode))
2957 code = ZERO_EXTEND;
2958 else
2959 code = TRUNCATE;
2960
2961 rtx pat = gen_rtx_SET (new_reg,
2962 gen_rtx_fmt_e (code, outer_mode, inner));
2963 emit_insn_before (pat, insn);
2964 }
2965
2966 if (recog_data.operand_type[i] != OP_IN)
2967 {
2968 enum rtx_code code;
2969 if (GET_MODE_PRECISION (inner_mode)
2970 < GET_MODE_PRECISION (outer_mode))
2971 code = TRUNCATE;
2972 else
2973 code = ZERO_EXTEND;
2974
2975 rtx pat = gen_rtx_SET (inner,
2976 gen_rtx_fmt_e (code, inner_mode, new_reg));
2977 emit_insn_after (pat, insn);
2978 }
2979 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2980 }
2981 }
2982 }
2983
2984 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2985 first use. */
2986
2987 static rtx
2988 nvptx_get_unisimt_master ()
2989 {
2990 rtx &master = cfun->machine->unisimt_master;
2991 return master ? master : master = gen_reg_rtx (SImode);
2992 }
2993
2994 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
2995
2996 static rtx
2997 nvptx_get_unisimt_predicate ()
2998 {
2999 rtx &pred = cfun->machine->unisimt_predicate;
3000 return pred ? pred : pred = gen_reg_rtx (BImode);
3001 }
3002
3003 /* Return true if the given call insn references one of the functions
3004 provided by the CUDA runtime: malloc, free, vprintf. */
3005
3006 static bool
3007 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
3008 {
3009 rtx pat = PATTERN (insn);
3010 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
3011 pat = XVECEXP (pat, 0, 0);
3012 if (GET_CODE (pat) == SET)
3013 pat = SET_SRC (pat);
3014 gcc_checking_assert (GET_CODE (pat) == CALL
3015 && GET_CODE (XEXP (pat, 0)) == MEM);
3016 rtx addr = XEXP (XEXP (pat, 0), 0);
3017 if (GET_CODE (addr) != SYMBOL_REF)
3018 return false;
3019 const char *name = XSTR (addr, 0);
3020 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
3021 references with forced assembler name refer to PTX syscalls. For vprintf,
3022 accept both normal and forced-assembler-name references. */
3023 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
3024 || !strcmp (name, "*malloc")
3025 || !strcmp (name, "*free"));
3026 }
3027
3028 /* If the SET subexpression of INSN sets a register, emit a shuffle
3029 instruction to propagate its value from lane MASTER to the current lane. */
3030
3031 static void
3032 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3033 {
3034 rtx reg;
3035 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
3036 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
3037 }
3038
3039 /* Adjust code for the uniform-simt code generation variant by making
3040 atomics and "syscalls" conditionally executed, and inserting
3041 shuffle-based propagation for registers being set. */
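/* A hedged sketch of the result (PTX syntax approximate): an atomic
   such as
     atom.global.add.u32 %r1, [%r2], %r3;
   becomes
     @%p atom.global.add.u32 %r1, [%r2], %r3;
     shfl.idx.b32 %r1, %r1, %master, 31;
   so that only the master lane performs the operation and its result
   is then propagated to the other lanes of the warp.  */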
3042
3043 static void
3044 nvptx_reorg_uniform_simt ()
3045 {
3046 rtx_insn *insn, *next;
3047
3048 for (insn = get_insns (); insn; insn = next)
3049 {
3050 next = NEXT_INSN (insn);
3051 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
3052 && !(NONJUMP_INSN_P (insn)
3053 && GET_CODE (PATTERN (insn)) == PARALLEL
3054 && get_attr_atomic (insn)))
3055 continue;
3056 rtx pat = PATTERN (insn);
3057 rtx master = nvptx_get_unisimt_master ();
3058 for (int i = 0; i < XVECLEN (pat, 0); i++)
3059 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
3060 rtx pred = nvptx_get_unisimt_predicate ();
3061 pred = gen_rtx_NE (BImode, pred, const0_rtx);
3062 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
3063 validate_change (insn, &PATTERN (insn), pat, false);
3064 }
3065 }
3066
3067 /* Offloading function attributes. */
3068
3069 struct offload_attrs
3070 {
3071 unsigned mask;
3072 int num_gangs;
3073 int num_workers;
3074 int vector_length;
3075 };
3076
3077 /* Define entries for cfun->machine->axis_dim. */
3078
3079 #define MACH_VECTOR_LENGTH 0
3080 #define MACH_MAX_WORKERS 1
3081
3082 static void populate_offload_attrs (offload_attrs *oa);
3083
3084 static void
3085 init_axis_dim (void)
3086 {
3087 offload_attrs oa;
3088 int max_workers;
3089
3090 populate_offload_attrs (&oa);
3091
3092 if (oa.num_workers == 0)
3093 max_workers = PTX_CTA_SIZE / oa.vector_length;
3094 else
3095 max_workers = oa.num_workers;
3096
3097 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3098 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3099 cfun->machine->axis_dim_init_p = true;
3100 }
3101
3102 static int ATTRIBUTE_UNUSED
3103 nvptx_mach_max_workers ()
3104 {
3105 if (!cfun->machine->axis_dim_init_p)
3106 init_axis_dim ();
3107 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3108 }
3109
3110 static int ATTRIBUTE_UNUSED
3111 nvptx_mach_vector_length ()
3112 {
3113 if (!cfun->machine->axis_dim_init_p)
3114 init_axis_dim ();
3115 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3116 }
3117
3118 /* Loop structure of the function. The entire function is described as
3119 a NULL loop. */
3120
3121 struct parallel
3122 {
3123 /* Parent parallel. */
3124 parallel *parent;
3125
3126 /* Next sibling parallel. */
3127 parallel *next;
3128
3129 /* First child parallel. */
3130 parallel *inner;
3131
3132 /* Partitioning mask of the parallel. */
3133 unsigned mask;
3134
3135 /* Partitioning used within inner parallels. */
3136 unsigned inner_mask;
3137
3138 /* Location of the parallel's forked and join markers. The forked is
3139 the first block in the parallel and the join is the first block
3140 after the partition. */
3141 basic_block forked_block;
3142 basic_block join_block;
3143
3144 rtx_insn *forked_insn;
3145 rtx_insn *join_insn;
3146
3147 rtx_insn *fork_insn;
3148 rtx_insn *joining_insn;
3149
3150 /* Basic blocks in this parallel, but not in child parallels. The
3151 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3152 blocks are not. */
3153 auto_vec<basic_block> blocks;
3154
3155 public:
3156 parallel (parallel *parent, unsigned mode);
3157 ~parallel ();
3158 };
3159
3160 /* Constructor links the new parallel into its parent's chain of
3161 children. */
3162
3163 parallel::parallel (parallel *parent_, unsigned mask_)
3164 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3165 {
3166 forked_block = join_block = 0;
3167 forked_insn = join_insn = 0;
3168 fork_insn = joining_insn = 0;
3169
3170 if (parent)
3171 {
3172 next = parent->inner;
3173 parent->inner = this;
3174 }
3175 }
3176
3177 parallel::~parallel ()
3178 {
3179 delete inner;
3180 delete next;
3181 }
3182
3183 /* Map of basic blocks to insns */
3184 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3185
3186 /* A tuple of an insn of interest and the BB in which it resides. */
3187 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3188 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3189
3190 /* Split basic blocks such that each forked and join unspec is at
3191 the start of its basic block. Thus afterwards each block will
3192 have a single partitioning mode. We also do the same for return
3193 insns, as they are executed by every thread, but the block
3194 containing them probably is not. Populate MAP with
3195 head and tail blocks. We also clear the BB visited flag, which is
3196 used when finding partitions. */
3197
3198 static void
3199 nvptx_split_blocks (bb_insn_map_t *map)
3200 {
3201 insn_bb_vec_t worklist;
3202 basic_block block;
3203 rtx_insn *insn;
3204
3205 /* Locate all the reorg instructions of interest. */
3206 FOR_ALL_BB_FN (block, cfun)
3207 {
3208 bool seen_insn = false;
3209
3210 /* Clear visited flag, for use by parallel locator */
3211 block->flags &= ~BB_VISITED;
3212
3213 FOR_BB_INSNS (block, insn)
3214 {
3215 if (!INSN_P (insn))
3216 continue;
3217 switch (recog_memoized (insn))
3218 {
3219 default:
3220 seen_insn = true;
3221 continue;
3222 case CODE_FOR_nvptx_forked:
3223 case CODE_FOR_nvptx_join:
3224 break;
3225
3226 case CODE_FOR_return:
3227 /* We also need to split just before return insns, as
3228 that insn needs executing by all threads, but the
3229 block it is in probably does not. */
3230 break;
3231 }
3232
3233 if (seen_insn)
3234 /* We've found an instruction that must be at the start of
3235 a block, but isn't. Add it to the worklist. */
3236 worklist.safe_push (insn_bb_t (insn, block));
3237 else
3238 /* It was already the first instruction. Just add it to
3239 the map. */
3240 map->get_or_insert (block) = insn;
3241 seen_insn = true;
3242 }
3243 }
3244
3245 /* Split blocks on the worklist. */
3246 unsigned ix;
3247 insn_bb_t *elt;
3248 basic_block remap = 0;
3249 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3250 {
3251 if (remap != elt->second)
3252 {
3253 block = elt->second;
3254 remap = block;
3255 }
3256
3257 /* Split block before insn. The insn is in the new block */
3258 edge e = split_block (block, PREV_INSN (elt->first));
3259
3260 block = e->dest;
3261 map->get_or_insert (block) = elt->first;
3262 }
3263 }
3264
3265 /* Return true if MASK contains parallelism that requires shared
3266 memory to broadcast. */
3267
3268 static bool
3269 nvptx_needs_shared_bcast (unsigned mask)
3270 {
3271 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3272 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3273 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3274
3275 return worker || large_vector;
3276 }
3277
3278 /* BLOCK is a basic block containing a head or tail instruction.
3279 Locate the associated prehead or pretail instruction, which must be
3280 in the single predecessor block. */
3281
3282 static rtx_insn *
3283 nvptx_discover_pre (basic_block block, int expected)
3284 {
3285 gcc_assert (block->preds->length () == 1);
3286 basic_block pre_block = (*block->preds)[0]->src;
3287 rtx_insn *pre_insn;
3288
3289 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3290 pre_insn = PREV_INSN (pre_insn))
3291 gcc_assert (pre_insn != BB_HEAD (pre_block));
3292
3293 gcc_assert (recog_memoized (pre_insn) == expected);
3294 return pre_insn;
3295 }
3296
3297 /* Dump this parallel and all its inner parallels. */
3298
3299 static void
3300 nvptx_dump_pars (parallel *par, unsigned depth)
3301 {
3302 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3303 depth, par->mask,
3304 par->forked_block ? par->forked_block->index : -1,
3305 par->join_block ? par->join_block->index : -1);
3306
3307 fprintf (dump_file, " blocks:");
3308
3309 basic_block block;
3310 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3311 fprintf (dump_file, " %d", block->index);
3312 fprintf (dump_file, "\n");
3313 if (par->inner)
3314 nvptx_dump_pars (par->inner, depth + 1);
3315
3316 if (par->next)
3317 nvptx_dump_pars (par->next, depth);
3318 }
3319
3320 /* If BLOCK contains a fork/join marker, process it to create or
3321 terminate a loop structure. Add this block to the current loop,
3322 and then walk successor blocks. */
3323
3324 static parallel *
3325 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3326 {
3327 if (block->flags & BB_VISITED)
3328 return par;
3329 block->flags |= BB_VISITED;
3330
3331 if (rtx_insn **endp = map->get (block))
3332 {
3333 rtx_insn *end = *endp;
3334
3335 /* This is a block head or tail, or return instruction. */
3336 switch (recog_memoized (end))
3337 {
3338 case CODE_FOR_return:
3339 /* Return instructions are in their own block, and we
3340 don't need to do anything more. */
3341 return par;
3342
3343 case CODE_FOR_nvptx_forked:
3344 /* Loop head, create a new inner loop and add it into
3345 our parent's child list. */
3346 {
3347 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3348
3349 gcc_assert (mask);
3350 par = new parallel (par, mask);
3351 par->forked_block = block;
3352 par->forked_insn = end;
3353 if (nvptx_needs_shared_bcast (mask))
3354 par->fork_insn
3355 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3356 }
3357 break;
3358
3359 case CODE_FOR_nvptx_join:
3360 /* A loop tail. Finish the current loop and return to
3361 parent. */
3362 {
3363 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3364
3365 gcc_assert (par->mask == mask);
3366 gcc_assert (par->join_block == NULL);
3367 par->join_block = block;
3368 par->join_insn = end;
3369 if (nvptx_needs_shared_bcast (mask))
3370 par->joining_insn
3371 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3372 par = par->parent;
3373 }
3374 break;
3375
3376 default:
3377 gcc_unreachable ();
3378 }
3379 }
3380
3381 if (par)
3382 /* Add this block onto the current loop's list of blocks. */
3383 par->blocks.safe_push (block);
3384 else
3385 /* This must be the entry block. Create a NULL parallel. */
3386 par = new parallel (0, 0);
3387
3388 /* Walk successor blocks. */
3389 edge e;
3390 edge_iterator ei;
3391
3392 FOR_EACH_EDGE (e, ei, block->succs)
3393 nvptx_find_par (map, par, e->dest);
3394
3395 return par;
3396 }
3397
3398 /* DFS walk the CFG looking for fork & join markers. Construct
3399 loop structures as we go. MAP is a mapping of basic blocks
3400 to head & tail markers, discovered when splitting blocks. This
3401 speeds up the discovery. We rely on the BB visited flag having
3402 been cleared when splitting blocks. */
3403
3404 static parallel *
3405 nvptx_discover_pars (bb_insn_map_t *map)
3406 {
3407 basic_block block;
3408
3409 /* Mark exit blocks as visited. */
3410 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3411 block->flags |= BB_VISITED;
3412
3413 /* And entry block as not. */
3414 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3415 block->flags &= ~BB_VISITED;
3416
3417 parallel *par = nvptx_find_par (map, 0, block);
3418
3419 if (dump_file)
3420 {
3421 fprintf (dump_file, "\nLoops\n");
3422 nvptx_dump_pars (par, 0);
3423 fprintf (dump_file, "\n");
3424 }
3425
3426 return par;
3427 }
3428
3429 /* Analyse a group of BBs within a partitioned region and create N
3430 Single-Entry-Single-Exit regions. Some of those regions will be
3431 trivial ones consisting of a single BB. The blocks of a
3432 partitioned region might form a set of disjoint graphs -- because
3433 the region encloses a differently partitioned sub region.
3434
3435 We use the linear time algorithm described in 'Finding Regions Fast:
3436 Single Entry Single Exit and Control Regions in Linear Time'
3437 Johnson, Pearson & Pingali. That algorithm deals with complete
3438 CFGs, where a back edge is inserted from END to START, and thus the
3439 problem becomes one of finding equivalent loops.
3440
3441 In this case we have a partial CFG. We complete it by redirecting
3442 any incoming edge to the graph to be from an arbitrary external BB,
3443 and similarly redirecting any outgoing edge to be to that BB.
3444 Thus we end up with a closed graph.
3445
3446 The algorithm works by building a spanning tree of an undirected
3447 graph and keeping track of back edges from nodes further from the
3448 root in the tree to nodes nearer to the root in the tree. In the
3449 description below, the root is up and the tree grows downwards.
3450
3451 We avoid having to deal with degenerate back-edges to the same
3452 block, by splitting each BB into 3 -- one for input edges, one for
3453 the node itself and one for the output edges. Such back edges are
3454 referred to as 'Brackets'. Cycle equivalent nodes will have the
3455 same set of brackets.
3456
3457 Determining bracket equivalency is done by maintaining a list of
3458 brackets in such a manner that the list length and final bracket
3459 uniquely identify the set.
3460
3461 We use coloring to mark all BBs with cycle equivalency with the
3462 same color. This is the output of the 'Finding Regions Fast'
3463 algorithm. Notice it doesn't actually find the set of nodes within
3464 a particular region, just unordered sets of nodes that are the
3465 entries and exits of SESE regions.
3466
3467 After determining cycle equivalency, we need to find the minimal
3468 set of SESE regions. Do this with a DFS coloring walk of the
3469 complete graph. We're either 'looking' or 'coloring'. When
3470 looking, and we're in the subgraph, we start coloring the color of
3471 the current node, and remember that node as the start of the
3472 current color's SESE region. Every time we go to a new node, we
3473 decrement the count of nodes with that color. If it reaches zero,
3474 we remember that node as the end of the current color's SESE region
3475 and return to 'looking'. Otherwise we color the node the current
3476 color.
3477
3478 This way we end up with coloring the inside of non-trivial SESE
3479 regions with the color of that region. */
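/* A hedged illustration (block names hypothetical): for a diamond

	A -> B -> D
	A -> C -> D

   completed with a back edge from D to A, the pseudo-nodes around A
   and D end up with the same bracket set and hence the same color, so
   (A, D) is discovered as a non-trivial SESE region, while B and C
   each form trivial single-block regions.  */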
3480
3481 /* A pair of BBs. We use this to represent SESE regions. */
3482 typedef std::pair<basic_block, basic_block> bb_pair_t;
3483 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3484
3485 /* A node in the undirected CFG. The discriminator SECOND indicates just
3486 above or just below the BB indicated by FIRST. */
3487 typedef std::pair<basic_block, int> pseudo_node_t;
3488
3489 /* A bracket indicates an edge towards the root of the spanning tree of the
3490 undirected graph. Each bracket has a color, determined
3491 from the current set of brackets. */
3492 struct bracket
3493 {
3494 pseudo_node_t back; /* Back target */
3495
3496 /* Current color and size of set. */
3497 unsigned color;
3498 unsigned size;
3499
3500 bracket (pseudo_node_t back_)
3501 : back (back_), color (~0u), size (~0u)
3502 {
3503 }
3504
3505 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3506 {
3507 if (length != size)
3508 {
3509 size = length;
3510 color = color_counts.length ();
3511 color_counts.quick_push (0);
3512 }
3513 color_counts[color]++;
3514 return color;
3515 }
3516 };
3517
3518 typedef auto_vec<bracket> bracket_vec_t;
3519
3520 /* Basic block info for finding SESE regions. */
3521
3522 struct bb_sese
3523 {
3524 int node; /* Node number in spanning tree. */
3525 int parent; /* Parent node number. */
3526
3527 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3528 edges arrive at pseudo-node Ai and the outgoing edges leave at
3529 pseudo-node Ao. We have to remember which way we arrived at a
3530 particular node when generating the spanning tree. dir > 0 means
3531 we arrived at Ai, dir < 0 means we arrived at Ao. */
3532 int dir;
3533
3534 /* Lowest numbered pseudo-node reached via a backedge from this
3535 node, or any descendant. */
3536 pseudo_node_t high;
3537
3538 int color; /* Cycle-equivalence color */
3539
3540 /* Stack of brackets for this node. */
3541 bracket_vec_t brackets;
3542
3543 bb_sese (unsigned node_, unsigned p, int dir_)
3544 :node (node_), parent (p), dir (dir_)
3545 {
3546 }
3547 ~bb_sese ();
3548
3549 /* Push a bracket ending at BACK. */
3550 void push (const pseudo_node_t &back)
3551 {
3552 if (dump_file)
3553 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3554 back.first ? back.first->index : 0, back.second);
3555 brackets.safe_push (bracket (back));
3556 }
3557
3558 void append (bb_sese *child);
3559 void remove (const pseudo_node_t &);
3560
3561 /* Set node's color. */
3562 void set_color (auto_vec<unsigned> &color_counts)
3563 {
3564 color = brackets.last ().get_color (color_counts, brackets.length ());
3565 }
3566 };
3567
3568 bb_sese::~bb_sese ()
3569 {
3570 }
3571
3572 /* Destructively append CHILD's brackets. */
3573
3574 void
3575 bb_sese::append (bb_sese *child)
3576 {
3577 if (int len = child->brackets.length ())
3578 {
3579 int ix;
3580
3581 if (dump_file)
3582 {
3583 for (ix = 0; ix < len; ix++)
3584 {
3585 const pseudo_node_t &pseudo = child->brackets[ix].back;
3586 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3587 child->node, pseudo.first ? pseudo.first->index : 0,
3588 pseudo.second);
3589 }
3590 }
3591 if (!brackets.length ())
3592 std::swap (brackets, child->brackets);
3593 else
3594 {
3595 brackets.reserve (len);
3596 for (ix = 0; ix < len; ix++)
3597 brackets.quick_push (child->brackets[ix]);
3598 }
3599 }
3600 }
3601
3602 /* Remove brackets that terminate at PSEUDO. */
3603
3604 void
3605 bb_sese::remove (const pseudo_node_t &pseudo)
3606 {
3607 unsigned removed = 0;
3608 int len = brackets.length ();
3609
3610 for (int ix = 0; ix < len; ix++)
3611 {
3612 if (brackets[ix].back == pseudo)
3613 {
3614 if (dump_file)
3615 fprintf (dump_file, "Removing backedge %d:%+d\n",
3616 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3617 removed++;
3618 }
3619 else if (removed)
3620 brackets[ix-removed] = brackets[ix];
3621 }
3622 while (removed--)
3623 brackets.pop ();
3624 }
3625
3626 /* Accessors for BB's aux pointer. */
3627 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3628 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3629
3630 /* DFS walk creating SESE data structures. Only cover nodes with
3631 BB_VISITED set. Append discovered blocks to LIST. We number in
3632 increments of 3 so that the above and below pseudo nodes can be
3633 implicitly numbered too. */
3634
3635 static int
3636 nvptx_sese_number (int n, int p, int dir, basic_block b,
3637 auto_vec<basic_block> *list)
3638 {
3639 if (BB_GET_SESE (b))
3640 return n;
3641
3642 if (dump_file)
3643 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3644 b->index, n, p, dir);
3645
3646 BB_SET_SESE (b, new bb_sese (n, p, dir));
3647 p = n;
3648
3649 n += 3;
3650 list->quick_push (b);
3651
3652 /* First walk the nodes on the 'other side' of this node, then walk
3653 the nodes on the same side. */
3654 for (unsigned ix = 2; ix; ix--)
3655 {
3656 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3657 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3658 : offsetof (edge_def, src));
3659 edge e;
3660 edge_iterator ei;
3661
3662 FOR_EACH_EDGE (e, ei, edges)
3663 {
3664 basic_block target = *(basic_block *)((char *)e + offset);
3665
3666 if (target->flags & BB_VISITED)
3667 n = nvptx_sese_number (n, p, dir, target, list);
3668 }
3669 dir = -dir;
3670 }
3671 return n;
3672 }
3673
3674 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3675 EDGES are the outgoing edges and OFFSET is the offset to the src
3676 or dst block on the edges. */
3677
3678 static void
3679 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3680 vec<edge, va_gc> *edges, size_t offset)
3681 {
3682 edge e;
3683 edge_iterator ei;
3684 int hi_back = depth;
3685 pseudo_node_t node_back (nullptr, depth);
3686 int hi_child = depth;
3687 pseudo_node_t node_child (nullptr, depth);
3688 basic_block child = NULL;
3689 unsigned num_children = 0;
3690 int usd = -dir * sese->dir;
3691
3692 if (dump_file)
3693 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3694 me->index, sese->node, dir);
3695
3696 if (dir < 0)
3697 {
3698 /* This is the above pseudo-child. It has the BB itself as an
3699 additional child node. */
3700 node_child = sese->high;
3701 hi_child = node_child.second;
3702 if (node_child.first)
3703 hi_child += BB_GET_SESE (node_child.first)->node;
3704 num_children++;
3705 }
3706
3707 /* Examine each edge.
3708 - if it is a child (a) append its bracket list and (b) record
3709 whether it is the child with the highest reaching bracket.
3710 - if it is an edge to ancestor, record whether it's the highest
3711 reaching backlink. */
3712 FOR_EACH_EDGE (e, ei, edges)
3713 {
3714 basic_block target = *(basic_block *)((char *)e + offset);
3715
3716 if (bb_sese *t_sese = BB_GET_SESE (target))
3717 {
3718 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3719 {
3720 /* Child node. Append its bracket list. */
3721 num_children++;
3722 sese->append (t_sese);
3723
3724 /* Compare its hi value. */
3725 int t_hi = t_sese->high.second;
3726
3727 if (basic_block child_hi_block = t_sese->high.first)
3728 t_hi += BB_GET_SESE (child_hi_block)->node;
3729
3730 if (hi_child > t_hi)
3731 {
3732 hi_child = t_hi;
3733 node_child = t_sese->high;
3734 child = target;
3735 }
3736 }
3737 else if (t_sese->node < sese->node + dir
3738 && !(dir < 0 && sese->parent == t_sese->node))
3739 {
3740 /* Non-parental ancestor node -- a backlink. */
3741 int d = usd * t_sese->dir;
3742 int back = t_sese->node + d;
3743
3744 if (hi_back > back)
3745 {
3746 hi_back = back;
3747 node_back = pseudo_node_t (target, d);
3748 }
3749 }
3750 }
3751 else
3752 { /* Fallen off graph, backlink to entry node. */
3753 hi_back = 0;
3754 node_back = pseudo_node_t (nullptr, 0);
3755 }
3756 }
3757
3758 /* Remove any brackets that terminate at this pseudo node. */
3759 sese->remove (pseudo_node_t (me, dir));
3760
3761 /* Now push any backlinks from this pseudo node. */
3762 FOR_EACH_EDGE (e, ei, edges)
3763 {
3764 basic_block target = *(basic_block *)((char *)e + offset);
3765 if (bb_sese *t_sese = BB_GET_SESE (target))
3766 {
3767 if (t_sese->node < sese->node + dir
3768 && !(dir < 0 && sese->parent == t_sese->node))
3769 /* Non-parental ancestor node - backedge from me. */
3770 sese->push (pseudo_node_t (target, usd * t_sese->dir));
3771 }
3772 else
3773 {
3774 /* Back edge to entry node. */
3775 sese->push (pseudo_node_t (nullptr, 0));
3776 }
3777 }
3778
3779 /* If this node leads directly or indirectly to a no-return region of
3780 the graph, then fake a backedge to entry node. */
3781 if (!sese->brackets.length () || !edges || !edges->length ())
3782 {
3783 hi_back = 0;
3784 node_back = pseudo_node_t (nullptr, 0);
3785 sese->push (node_back);
3786 }
3787
3788 /* Record the highest reaching backedge from us or a descendant. */
3789 sese->high = hi_back < hi_child ? node_back : node_child;
3790
3791 if (num_children > 1)
3792 {
3793 /* There is more than one child -- this is a Y shaped piece of
3794 spanning tree. We have to insert a fake backedge from this
3795 node to the highest ancestor reached by not-the-highest
3796 reaching child. Note that there may be multiple children
3797 with backedges to the same highest node. That's ok and we
3798 insert the edge to that highest node. */
3799 hi_child = depth;
3800 if (dir < 0 && child)
3801 {
3802 node_child = sese->high;
3803 hi_child = node_child.second;
3804 if (node_child.first)
3805 hi_child += BB_GET_SESE (node_child.first)->node;
3806 }
3807
3808 FOR_EACH_EDGE (e, ei, edges)
3809 {
3810 basic_block target = *(basic_block *)((char *)e + offset);
3811
3812 if (target == child)
3813 /* Ignore the highest child. */
3814 continue;
3815
3816 bb_sese *t_sese = BB_GET_SESE (target);
3817 if (!t_sese)
3818 continue;
3819 if (t_sese->parent != sese->node)
3820 /* Not a child. */
3821 continue;
3822
3823 /* Compare its hi value. */
3824 int t_hi = t_sese->high.second;
3825
3826 if (basic_block child_hi_block = t_sese->high.first)
3827 t_hi += BB_GET_SESE (child_hi_block)->node;
3828
3829 if (hi_child > t_hi)
3830 {
3831 hi_child = t_hi;
3832 node_child = t_sese->high;
3833 }
3834 }
3835
3836 sese->push (node_child);
3837 }
3838 }
3839
3840
3841 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
3842 proceed to successors. Set SESE entry and exit nodes of
3843 REGIONS. */
3844
3845 static void
3846 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3847 basic_block block, int coloring)
3848 {
3849 bb_sese *sese = BB_GET_SESE (block);
3850
3851 if (block->flags & BB_VISITED)
3852 {
3853 /* If we've already encountered this block, either we must not
3854 be coloring, or it must have been colored the current color. */
3855 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3856 return;
3857 }
3858
3859 block->flags |= BB_VISITED;
3860
3861 if (sese)
3862 {
3863 if (coloring < 0)
3864 {
3865 /* Start coloring a region. */
3866 regions[sese->color].first = block;
3867 coloring = sese->color;
3868 }
3869
3870 if (!--color_counts[sese->color] && sese->color == coloring)
3871 {
3872 /* Found final block of SESE region. */
3873 regions[sese->color].second = block;
3874 coloring = -1;
3875 }
3876 else
3877 /* Color the node, so we can assert on revisiting the node
3878 that the graph is indeed SESE. */
3879 sese->color = coloring;
3880 }
3881 else
3882 /* Fallen off the subgraph, we cannot be coloring. */
3883 gcc_assert (coloring < 0);
3884
3885 /* Walk each successor block. */
3886 if (block->succs && block->succs->length ())
3887 {
3888 edge e;
3889 edge_iterator ei;
3890
3891 FOR_EACH_EDGE (e, ei, block->succs)
3892 nvptx_sese_color (color_counts, regions, e->dest, coloring);
3893 }
3894 else
3895 gcc_assert (coloring < 0);
3896 }
3897
3898 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
3899 end up with NULL entries in it. */
3900
3901 static void
3902 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3903 {
3904 basic_block block;
3905 int ix;
3906
3907 /* First clear each BB of the whole function. */
3908 FOR_ALL_BB_FN (block, cfun)
3909 {
3910 block->flags &= ~BB_VISITED;
3911 BB_SET_SESE (block, 0);
3912 }
3913
3914 /* Mark blocks in the function that are in this graph. */
3915 for (ix = 0; blocks.iterate (ix, &block); ix++)
3916 block->flags |= BB_VISITED;
3917
3918 /* Counts of nodes assigned to each color. There cannot be more
3919 colors than blocks (and hopefully there will be fewer). */
3920 auto_vec<unsigned> color_counts;
3921 color_counts.reserve (blocks.length ());
3922
3923 /* Worklist of nodes in the spanning tree. Again, there cannot be
3924 more nodes in the tree than blocks (there will be fewer if the
3925 CFG of blocks is disjoint). */
3926 auto_vec<basic_block> spanlist;
3927 spanlist.reserve (blocks.length ());
3928
3929 /* Make sure every block has its cycle class determined. */
3930 for (ix = 0; blocks.iterate (ix, &block); ix++)
3931 {
3932 if (BB_GET_SESE (block))
3933 /* We already met this block in an earlier graph solve. */
3934 continue;
3935
3936 if (dump_file)
3937 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3938
3939 /* Number the nodes reachable from block in initial DFS order. */
3940 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3941
3942 /* Now walk in reverse DFS order to find cycle equivalents. */
3943 while (spanlist.length ())
3944 {
3945 block = spanlist.pop ();
3946 bb_sese *sese = BB_GET_SESE (block);
3947
3948 /* Do the pseudo node below. */
3949 nvptx_sese_pseudo (block, sese, depth, +1,
3950 sese->dir > 0 ? block->succs : block->preds,
3951 (sese->dir > 0 ? offsetof (edge_def, dest)
3952 : offsetof (edge_def, src)));
3953 sese->set_color (color_counts);
3954 /* Do the pseudo node above. */
3955 nvptx_sese_pseudo (block, sese, depth, -1,
3956 sese->dir < 0 ? block->succs : block->preds,
3957 (sese->dir < 0 ? offsetof (edge_def, dest)
3958 : offsetof (edge_def, src)));
3959 }
3960 if (dump_file)
3961 fprintf (dump_file, "\n");
3962 }
3963
3964 if (dump_file)
3965 {
3966 unsigned count;
3967 const char *comma = "";
3968
3969 fprintf (dump_file, "Found %d cycle equivalents\n",
3970 color_counts.length ());
3971 for (ix = 0; color_counts.iterate (ix, &count); ix++)
3972 {
3973 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3974
3975 comma = "";
3976 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3977 if (BB_GET_SESE (block)->color == ix)
3978 {
3979 block->flags |= BB_VISITED;
3980 fprintf (dump_file, "%s%d", comma, block->index);
3981 comma=",";
3982 }
3983 fprintf (dump_file, "}");
3984 comma = ", ";
3985 }
3986 fprintf (dump_file, "\n");
3987 }
3988
3989 /* Now we've colored every block in the subgraph. We now need to
3990 determine the minimal set of SESE regions that cover that
3991 subgraph. Do this with a DFS walk of the complete function.
3992 During the walk we're either 'looking' or 'coloring'. When we
3993 reach the last node of a particular color, we stop coloring and
3994 return to looking. */
3995
3996 /* There cannot be more SESE regions than colors. */
3997 regions.reserve (color_counts.length ());
3998 for (ix = color_counts.length (); ix--;)
3999 regions.quick_push (bb_pair_t (0, 0));
4000
4001 for (ix = 0; blocks.iterate (ix, &block); ix++)
4002 block->flags &= ~BB_VISITED;
4003
4004 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
4005
4006 if (dump_file)
4007 {
4008 const char *comma = "";
4009 int len = regions.length ();
4010
4011 fprintf (dump_file, "SESE regions:");
4012 for (ix = 0; ix != len; ix++)
4013 {
4014 basic_block from = regions[ix].first;
4015 basic_block to = regions[ix].second;
4016
4017 if (from)
4018 {
4019 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
4020 if (to != from)
4021 fprintf (dump_file, "->%d", to->index);
4022
4023 int color = BB_GET_SESE (from)->color;
4024
4025 /* Print the blocks within the region (excluding ends). */
4026 FOR_EACH_BB_FN (block, cfun)
4027 {
4028 bb_sese *sese = BB_GET_SESE (block);
4029
4030 if (sese && sese->color == color
4031 && block != from && block != to)
4032 fprintf (dump_file, ".%d", block->index);
4033 }
4034 fprintf (dump_file, "}");
4035 }
4036 comma = ",";
4037 }
4038 fprintf (dump_file, "\n\n");
4039 }
4040
4041 for (ix = 0; blocks.iterate (ix, &block); ix++)
4042 delete BB_GET_SESE (block);
4043 }
4044
4045 #undef BB_SET_SESE
4046 #undef BB_GET_SESE
4047
4048 /* Propagate live state at the start of a partitioned region. IS_CALL
4049 indicates whether the propagation is for a (partitioned) call
4050 instruction. BLOCK provides the live register information, and
4051 might not contain INSN. Propagation is inserted just after INSN. RW
4052 indicates whether we are reading and/or writing state. This
4053 separation is needed for worker-level propagation where we
4054 essentially do a spill & fill. FN is the underlying worker
4055 function to generate the propagation instructions for a single
4056 register. DATA is user data.
4057
4058 Returns true if we didn't emit any instructions.
4059
4060 We propagate the live register set for non-calls and the entire
4061 frame for calls and non-calls. We could do better by (a)
4062 propagating just the live set that is used within the partitioned
4063 regions and (b) only propagating stack entries that are used. The
4064 latter might be quite hard to determine. */
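/* For the frame-copying part below, the emitted sequence amounts to
   roughly the following loop (a pseudo-code sketch; the actual insns are
   generated by the code underneath and are parameterized by FN; for a
   single-word frame the loop structure is omitted):

     ptr = frame_pointer;
     idx = <frame size in DImode words>;
   loop:
     idx -= 1;
     tmp = *ptr;		// only if RW includes PM_read
     FN (tmp, ...);		// spill, fill or broadcast one word
     *ptr = tmp;		// only if RW includes PM_write
     ptr += 8;
     if (idx != 0) goto loop;  */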
4065
4066 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
4067
4068 static bool
4069 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
4070 propagate_mask rw, propagator_fn fn, void *data, bool vector)
4071 {
4072 bitmap live = DF_LIVE_IN (block);
4073 bitmap_iterator iterator;
4074 unsigned ix;
4075 bool empty = true;
4076
4077 /* Copy the frame array. */
4078 HOST_WIDE_INT fs = get_frame_size ();
4079 if (fs)
4080 {
4081 rtx tmp = gen_reg_rtx (DImode);
4082 rtx idx = NULL_RTX;
4083 rtx ptr = gen_reg_rtx (Pmode);
4084 rtx pred = NULL_RTX;
4085 rtx_code_label *label = NULL;
4086
4087 empty = false;
4088 /* The frame size might not be DImode compatible, but the frame
4089 array's declaration will be. So it's ok to round up here. */
4090 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
4091 /* Detect single iteration loop. */
4092 if (fs == 1)
4093 fs = 0;
4094
4095 start_sequence ();
4096 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
4097 if (fs)
4098 {
4099 idx = gen_reg_rtx (SImode);
4100 pred = gen_reg_rtx (BImode);
4101 label = gen_label_rtx ();
4102
4103 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
4104 /* Allow worker function to initialize anything needed. */
4105 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4106 if (init)
4107 emit_insn (init);
4108 emit_label (label);
4109 LABEL_NUSES (label)++;
4110 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4111 }
4112 if (rw & PM_read)
4113 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4114 emit_insn (fn (tmp, rw, fs, data, vector));
4115 if (rw & PM_write)
4116 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4117 if (fs)
4118 {
4119 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4120 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4121 emit_insn (gen_br_true_uni (pred, label));
4122 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4123 if (fini)
4124 emit_insn (fini);
4125 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4126 }
4127 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4128 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4129 rtx cpy = get_insns ();
4130 end_sequence ();
4131 insn = emit_insn_after (cpy, insn);
4132 }
4133
4134 if (!is_call)
4135 /* Copy live registers. */
4136 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4137 {
4138 rtx reg = regno_reg_rtx[ix];
4139
4140 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4141 {
4142 rtx bcast = fn (reg, rw, 0, data, vector);
4143
4144 insn = emit_insn_after (bcast, insn);
4145 empty = false;
4146 }
4147 }
4148 return empty;
4149 }
4150
4151 /* Worker for nvptx_warp_propagate. */
4152
4153 static rtx
4154 warp_prop_gen (rtx reg, propagate_mask pm,
4155 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4156 bool ARG_UNUSED (vector))
4157 {
4158 if (!(pm & PM_read_write))
4159 return 0;
4160
4161 return nvptx_gen_warp_bcast (reg);
4162 }
4163
4164 /* Propagate state that is live at start of BLOCK across the vectors
4165 of a single warp. Propagation is inserted just after INSN.
4166 IS_CALL and return as for nvptx_propagate. */
4167
4168 static bool
4169 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4170 {
4171 return nvptx_propagate (is_call, block, insn, PM_read_write,
4172 warp_prop_gen, 0, false);
4173 }
4174
4175 /* Worker for nvptx_shared_propagate. */
4176
4177 static rtx
4178 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4179 bool vector)
4180 {
4181 broadcast_data_t *data = (broadcast_data_t *)data_;
4182
4183 if (pm & PM_loop_begin)
4184 {
4185 /* Starting a loop, initialize pointer. */
4186 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4187
4188 oacc_bcast_align = MAX (oacc_bcast_align, align);
4189 data->offset = ROUND_UP (data->offset, align);
4190
4191 data->ptr = gen_reg_rtx (Pmode);
4192
4193 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4194 }
4195 else if (pm & PM_loop_end)
4196 {
4197 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4198 data->ptr = NULL_RTX;
4199 return clobber;
4200 }
4201 else
4202 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4203 }
4204
4205 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4206 indicates if this is just before partitioned mode (do spill), or
4207 just after it starts (do fill). Sequence is inserted just after
4208 INSN. IS_CALL and return as for nvptx_propagate. */
4209
4210 static bool
4211 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4212 rtx_insn *insn, bool vector)
4213 {
4214 broadcast_data_t data;
4215
4216 data.base = gen_reg_rtx (Pmode);
4217 data.offset = 0;
4218 data.ptr = NULL_RTX;
4219
4220 bool empty = nvptx_propagate (is_call, block, insn,
4221 pre_p ? PM_read : PM_write, shared_prop_gen,
4222 &data, vector);
4223 gcc_assert (empty == !data.offset);
4224 if (data.offset)
4225 {
4226 rtx bcast_sym = oacc_bcast_sym;
4227
4228 /* Stuff was emitted, initialize the base pointer now. */
4229 if (vector && nvptx_mach_max_workers () > 1)
4230 {
4231 if (!cfun->machine->bcast_partition)
4232 {
4233 /* It would be nice to place this register in
4234 DATA_AREA_SHARED. */
4235 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4236 }
4237 if (!cfun->machine->sync_bar)
4238 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4239
4240 bcast_sym = cfun->machine->bcast_partition;
4241 }
4242
4243 rtx init = gen_rtx_SET (data.base, bcast_sym);
4244 emit_insn_after (init, insn);
4245
4246 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4247 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4248 ? nvptx_mach_max_workers () + 1
4249 : 1);
4250
4251 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4252 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4253 }
4254 return empty;
4255 }
4256
4257 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4258 which is an integer or a register. THREADS is the number of threads
4259 controlled by the barrier. */
4260
4261 static rtx
4262 nvptx_cta_sync (rtx lock, int threads)
4263 {
4264 return gen_nvptx_barsync (lock, GEN_INT (threads));
4265 }
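/* For instance, nvptx_cta_sync (GEN_INT (0), 0) yields the insn that the
   nvptx_barsync pattern prints as a plain "bar.sync 0;" covering the whole
   CTA, while a nonzero THREADS adds the thread count as a second operand.
   (A sketch of the output only; the exact text comes from nvptx.md.)  */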
4266
4267 #if WORKAROUND_PTXJIT_BUG
4268 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4269 real insns. */
4270
4271 static rtx_insn *
4272 bb_first_real_insn (basic_block bb)
4273 {
4274 rtx_insn *insn;
4275
4276 /* Find the first real insn in BB. */
4277 FOR_BB_INSNS (bb, insn)
4278 if (INSN_P (insn))
4279 return insn;
4280
4281 return 0;
4282 }
4283 #endif
4284
4285 /* Return true if INSN needs neutering. */
4286
4287 static bool
4288 needs_neutering_p (rtx_insn *insn)
4289 {
4290 if (!INSN_P (insn))
4291 return false;
4292
4293 switch (recog_memoized (insn))
4294 {
4295 case CODE_FOR_nvptx_fork:
4296 case CODE_FOR_nvptx_forked:
4297 case CODE_FOR_nvptx_joining:
4298 case CODE_FOR_nvptx_join:
4299 case CODE_FOR_nvptx_barsync:
4300 return false;
4301 default:
4302 return true;
4303 }
4304 }
4305
4306 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4307
4308 static bool
4309 verify_neutering_jumps (basic_block from,
4310 rtx_insn *vector_jump, rtx_insn *worker_jump,
4311 rtx_insn *vector_label, rtx_insn *worker_label)
4312 {
4313 basic_block bb = from;
4314 rtx_insn *insn = BB_HEAD (bb);
4315 bool seen_worker_jump = false;
4316 bool seen_vector_jump = false;
4317 bool seen_worker_label = false;
4318 bool seen_vector_label = false;
4319 bool worker_neutered = false;
4320 bool vector_neutered = false;
4321 while (true)
4322 {
4323 if (insn == worker_jump)
4324 {
4325 seen_worker_jump = true;
4326 worker_neutered = true;
4327 gcc_assert (!vector_neutered);
4328 }
4329 else if (insn == vector_jump)
4330 {
4331 seen_vector_jump = true;
4332 vector_neutered = true;
4333 }
4334 else if (insn == worker_label)
4335 {
4336 seen_worker_label = true;
4337 gcc_assert (worker_neutered);
4338 worker_neutered = false;
4339 }
4340 else if (insn == vector_label)
4341 {
4342 seen_vector_label = true;
4343 gcc_assert (vector_neutered);
4344 vector_neutered = false;
4345 }
4346 else if (INSN_P (insn))
4347 switch (recog_memoized (insn))
4348 {
4349 case CODE_FOR_nvptx_barsync:
4350 gcc_assert (!vector_neutered && !worker_neutered);
4351 break;
4352 default:
4353 break;
4354 }
4355
4356 if (insn != BB_END (bb))
4357 insn = NEXT_INSN (insn);
4358 else if (JUMP_P (insn) && single_succ_p (bb)
4359 && !seen_vector_jump && !seen_worker_jump)
4360 {
4361 bb = single_succ (bb);
4362 insn = BB_HEAD (bb);
4363 }
4364 else
4365 break;
4366 }
4367
4368 gcc_assert (!(vector_jump && !seen_vector_jump));
4369 gcc_assert (!(worker_jump && !seen_worker_jump));
4370
4371 if (seen_vector_label || seen_worker_label)
4372 {
4373 gcc_assert (!(vector_label && !seen_vector_label));
4374 gcc_assert (!(worker_label && !seen_worker_label));
4375
4376 return true;
4377 }
4378
4379 return false;
4380 }
4381
4382 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4383
4384 static void
4385 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4386 rtx_insn *worker_label)
4387 {
4388 basic_block bb = to;
4389 rtx_insn *insn = BB_END (bb);
4390 bool seen_worker_label = false;
4391 bool seen_vector_label = false;
4392 while (true)
4393 {
4394 if (insn == worker_label)
4395 {
4396 seen_worker_label = true;
4397 gcc_assert (!seen_vector_label);
4398 }
4399 else if (insn == vector_label)
4400 seen_vector_label = true;
4401 else if (INSN_P (insn))
4402 switch (recog_memoized (insn))
4403 {
4404 case CODE_FOR_nvptx_barsync:
4405 gcc_assert (!seen_vector_label && !seen_worker_label);
4406 break;
4407 }
4408
4409 if (insn != BB_HEAD (bb))
4410 insn = PREV_INSN (insn);
4411 else
4412 break;
4413 }
4414
4415 gcc_assert (!(vector_label && !seen_vector_label));
4416 gcc_assert (!(worker_label && !seen_worker_label));
4417 }
4418
4419 /* Single neutering according to MASK. FROM is the incoming block and
4420 TO is the outgoing block. These may be the same block. Insert at
4421 start of FROM:
4422
4423 if (tid.<axis>) goto end.
4424
4425 and insert before ending branch of TO (if there is such an insn):
4426
4427 end:
4428 <possibly-broadcast-cond>
4429 <branch>
4430
4431 We currently only use different FROM and TO when skipping an entire
4432 loop. We could do more if we detected superblocks. */
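/* A sketch of the resulting layout when both the worker and vector levels
   are neutered (predicate and label names hypothetical):

	@%wpred	bra.uni	Lw;	// inserted at start of FROM
	@%vpred	bra	Lv;
	...			// partitioned code, now run by one thread
     Lv:
     Lw:
	<possibly-broadcast-cond>
	<branch>		// original ending branch of TO
   */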
4433
4434 static void
4435 nvptx_single (unsigned mask, basic_block from, basic_block to)
4436 {
4437 rtx_insn *head = BB_HEAD (from);
4438 rtx_insn *tail = BB_END (to);
4439 unsigned skip_mask = mask;
4440
4441 while (true)
4442 {
4443 /* Find first insn of from block. */
4444 while (head != BB_END (from) && !needs_neutering_p (head))
4445 head = NEXT_INSN (head);
4446
4447 if (from == to)
4448 break;
4449
4450 if (!(JUMP_P (head) && single_succ_p (from)))
4451 break;
4452
4453 basic_block jump_target = single_succ (from);
4454 if (!single_pred_p (jump_target))
4455 break;
4456
4457 from = jump_target;
4458 head = BB_HEAD (from);
4459 }
4460
4461 /* Find the last insn of the to block. */
4462 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4463 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4464 tail = PREV_INSN (tail);
4465
4466 /* Detect if tail is a branch. */
4467 rtx tail_branch = NULL_RTX;
4468 rtx cond_branch = NULL_RTX;
4469 if (tail && INSN_P (tail))
4470 {
4471 tail_branch = PATTERN (tail);
4472 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4473 tail_branch = NULL_RTX;
4474 else
4475 {
4476 cond_branch = SET_SRC (tail_branch);
4477 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4478 cond_branch = NULL_RTX;
4479 }
4480 }
4481
4482 if (tail == head)
4483 {
4484 /* If this is empty, do nothing. */
4485 if (!head || !needs_neutering_p (head))
4486 return;
4487
4488 if (cond_branch)
4489 {
4490 /* If we're only doing vector single, there's no need to
4491 emit skip code because we'll not insert anything. */
4492 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4493 skip_mask = 0;
4494 }
4495 else if (tail_branch)
4496 /* Block with only unconditional branch. Nothing to do. */
4497 return;
4498 }
4499
4500 /* Insert the vector test inside the worker test. */
4501 unsigned mode;
4502 rtx_insn *before = tail;
4503 rtx_insn *neuter_start = NULL;
4504 rtx_insn *worker_label = NULL, *vector_label = NULL;
4505 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4506 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4507 if (GOMP_DIM_MASK (mode) & skip_mask)
4508 {
4509 rtx_code_label *label = gen_label_rtx ();
4510 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4511 rtx_insn **mode_jump
4512 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4513 rtx_insn **mode_label
4514 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4515
4516 if (!pred)
4517 {
4518 pred = gen_reg_rtx (BImode);
4519 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4520 }
4521
4522 rtx br;
4523 if (mode == GOMP_DIM_VECTOR)
4524 br = gen_br_true (pred, label);
4525 else
4526 br = gen_br_true_uni (pred, label);
4527 if (neuter_start)
4528 neuter_start = emit_insn_after (br, neuter_start);
4529 else
4530 neuter_start = emit_insn_before (br, head);
4531 *mode_jump = neuter_start;
4532
4533 LABEL_NUSES (label)++;
4534 rtx_insn *label_insn;
4535 if (tail_branch)
4536 {
4537 label_insn = emit_label_before (label, before);
4538 before = label_insn;
4539 }
4540 else
4541 {
4542 label_insn = emit_label_after (label, tail);
4543 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4544 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4545 emit_insn_after (gen_exit (), label_insn);
4546 }
4547
4548 *mode_label = label_insn;
4549 }
4550
4551 /* Now deal with propagating the branch condition. */
4552 if (cond_branch)
4553 {
4554 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4555
4556 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4557 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4558 {
4559 /* Vector mode only, do a shuffle. */
4560 #if WORKAROUND_PTXJIT_BUG
4561 /* The branch condition %rcond is propagated like this:
4562
4563 {
4564 .reg .u32 %x;
4565 mov.u32 %x,%tid.x;
4566 setp.ne.u32 %rnotvzero,%x,0;
4567 }
4568
4569 @%rnotvzero bra Lskip;
4570 setp.<op>.<type> %rcond,op1,op2;
4571 Lskip:
4572 selp.u32 %rcondu32,1,0,%rcond;
4573 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4574 setp.ne.u32 %rcond,%rcondu32,0;
4575
4576 There seems to be a bug in the ptx JIT compiler (observed at driver
4577 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4578 unless %rcond is initialized to something before 'bra Lskip'. The
4579 bug is not observed with ptxas from cuda 8.0.61.
4580
4581 It is true that the code is non-trivial: at Lskip, %rcond is
4582 uninitialized in threads 1-31, and after the selp the same holds
4583 for %rcondu32. But shfl propagates the defined value in thread 0
4584 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4585 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4586
4587 There is nothing in the PTX spec to suggest that this is wrong, or
4588 to explain why the extra initialization is needed. So, we classify
4589 it as a JIT bug, and the extra initialization as workaround:
4590
4591 {
4592 .reg .u32 %x;
4593 mov.u32 %x,%tid.x;
4594 setp.ne.u32 %rnotvzero,%x,0;
4595 }
4596
4597 +.reg .pred %rcond2;
4598 +setp.eq.u32 %rcond2, 1, 0;
4599
4600 @%rnotvzero bra Lskip;
4601 setp.<op>.<type> %rcond,op1,op2;
4602 +mov.pred %rcond2, %rcond;
4603 Lskip:
4604 +mov.pred %rcond, %rcond2;
4605 selp.u32 %rcondu32,1,0,%rcond;
4606 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4607 setp.ne.u32 %rcond,%rcondu32,0;
4608 */
4609 rtx_insn *label = PREV_INSN (tail);
4610 gcc_assert (label && LABEL_P (label));
4611 rtx tmp = gen_reg_rtx (BImode);
4612 emit_insn_before (gen_movbi (tmp, const0_rtx),
4613 bb_first_real_insn (from));
4614 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4615 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4616 #endif
4617 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4618 }
4619 else
4620 {
4621 /* Includes worker mode, do spill & fill. By construction
4622 we should never have worker mode only. */
4623 broadcast_data_t data;
4624 unsigned size = GET_MODE_SIZE (SImode);
4625 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4626 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4627 rtx barrier = GEN_INT (0);
4628 int threads = 0;
4629
4630 data.base = oacc_bcast_sym;
4631 data.ptr = 0;
4632
4633 bool use_partitioning_p = (vector && !worker
4634 && nvptx_mach_max_workers () > 1
4635 && cfun->machine->bcast_partition);
4636 if (use_partitioning_p)
4637 {
4638 data.base = cfun->machine->bcast_partition;
4639 barrier = cfun->machine->sync_bar;
4640 threads = nvptx_mach_vector_length ();
4641 }
4642 gcc_assert (data.base != NULL);
4643 gcc_assert (barrier);
4644
4645 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4646 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4647 ? nvptx_mach_max_workers () + 1
4648 : 1);
4649
4650 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4651 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4652
4653 data.offset = 0;
4654 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4655 vector),
4656 before);
4657
4658 /* Barrier so other workers can see the write. */
4659 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4660 data.offset = 0;
4661 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4662 vector),
4663 tail);
4664 /* This barrier is needed to avoid worker zero clobbering
4665 the broadcast buffer before all the other workers have
4666 had a chance to read this instance of it. */
4667 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4668 }
4669
4670 extract_insn (tail);
4671 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4672 UNSPEC_BR_UNIFIED);
4673 validate_change (tail, recog_data.operand_loc[0], unsp, false);
4674 }
4675
4676 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4677 vector_label, worker_label);
4678 if (!seen_label)
4679 verify_neutering_labels (to, vector_label, worker_label);
4680 }
4681
4682 /* PAR is a parallel that is being skipped in its entirety according to
4683 MASK. Treat this as skipping a superblock starting at forked
4684 and ending at joining. */
4685
4686 static void
4687 nvptx_skip_par (unsigned mask, parallel *par)
4688 {
4689 basic_block tail = par->join_block;
4690 gcc_assert (tail->preds->length () == 1);
4691
4692 basic_block pre_tail = (*tail->preds)[0]->src;
4693 gcc_assert (pre_tail->succs->length () == 1);
4694
4695 nvptx_single (mask, par->forked_block, pre_tail);
4696 }
4697
4698 /* If PAR has a single inner parallel and PAR itself only contains
4699 empty entry and exit blocks, swallow the inner PAR. */
4700
4701 static void
4702 nvptx_optimize_inner (parallel *par)
4703 {
4704 parallel *inner = par->inner;
4705
4706 /* We mustn't be the outer dummy par. */
4707 if (!par->mask)
4708 return;
4709
4710 /* We must have a single inner par. */
4711 if (!inner || inner->next)
4712 return;
4713
4714 /* We must only contain 2 blocks ourselves -- the head and tail of
4715 the inner par. */
4716 if (par->blocks.length () != 2)
4717 return;
4718
4719 /* The two partitionings must be disjoint. As we only have vector and
4720 worker partitioning, this is sufficient to guarantee the pars
4721 have adjacent partitioning. */
4722 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4723 /* This indicates malformed code generation. */
4724 return;
4725
4726 /* The outer forked insn should be immediately followed by the inner
4727 fork insn. */
4728 rtx_insn *forked = par->forked_insn;
4729 rtx_insn *fork = BB_END (par->forked_block);
4730
4731 if (NEXT_INSN (forked) != fork)
4732 return;
4733 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4734
4735 /* The outer joining insn must immediately follow the inner join
4736 insn. */
4737 rtx_insn *joining = par->joining_insn;
4738 rtx_insn *join = inner->join_insn;
4739 if (NEXT_INSN (join) != joining)
4740 return;
4741
4742 /* Preconditions met. Swallow the inner par. */
4743 if (dump_file)
4744 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4745 inner->mask, inner->forked_block->index,
4746 inner->join_block->index,
4747 par->mask, par->forked_block->index, par->join_block->index);
4748
4749 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4750
4751 par->blocks.reserve (inner->blocks.length ());
4752 while (inner->blocks.length ())
4753 par->blocks.quick_push (inner->blocks.pop ());
4754
4755 par->inner = inner->inner;
4756 inner->inner = NULL;
4757
4758 delete inner;
4759 }
4760
4761 /* Process the parallel PAR and all its contained
4762 parallels. We do everything but the neutering. Return mask of
4763 partitioned modes used within this parallel. */
4764
4765 static unsigned
4766 nvptx_process_pars (parallel *par)
4767 {
4768 if (nvptx_optimize)
4769 nvptx_optimize_inner (par);
4770
4771 unsigned inner_mask = par->mask;
4772
4773 /* Do the inner parallels first. */
4774 if (par->inner)
4775 {
4776 par->inner_mask = nvptx_process_pars (par->inner);
4777 inner_mask |= par->inner_mask;
4778 }
4779
4780 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4781 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4782 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4783 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4784
4785 if (worker || large_vector)
4786 {
4787 nvptx_shared_propagate (false, is_call, par->forked_block,
4788 par->forked_insn, !worker);
4789 bool no_prop_p
4790 = nvptx_shared_propagate (true, is_call, par->forked_block,
4791 par->fork_insn, !worker);
4792 bool empty_loop_p
4793 = !is_call && (NEXT_INSN (par->forked_insn)
4794 && NEXT_INSN (par->forked_insn) == par->joining_insn);
4795 rtx barrier = GEN_INT (0);
4796 int threads = 0;
4797
4798 if (!worker && cfun->machine->sync_bar)
4799 {
4800 barrier = cfun->machine->sync_bar;
4801 threads = nvptx_mach_vector_length ();
4802 }
4803
4804 if (no_prop_p && empty_loop_p)
4805 ;
4806 else if (no_prop_p && is_call)
4807 ;
4808 else
4809 {
4810 /* Insert begin and end synchronizations. */
4811 emit_insn_before (nvptx_cta_sync (barrier, threads),
4812 par->forked_insn);
4813 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4814 }
4815 }
4816 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4817 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4818
4819 /* Now do siblings. */
4820 if (par->next)
4821 inner_mask |= nvptx_process_pars (par->next);
4822 return inner_mask;
4823 }
4824
4825 /* Neuter the parallel described by PAR. We recurse in depth-first
4826 order. MODES are the partitioning of the execution and OUTER is
4827 the partitioning of the parallels we are contained in. */
4828
4829 static void
4830 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4831 {
4832 unsigned me = (par->mask
4833 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4834 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4835 unsigned skip_mask = 0, neuter_mask = 0;
4836
4837 if (par->inner)
4838 nvptx_neuter_pars (par->inner, modes, outer | me);
4839
4840 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4841 {
4842 if ((outer | me) & GOMP_DIM_MASK (mode))
4843 {} /* Mode is partitioned: no neutering. */
4844 else if (!(modes & GOMP_DIM_MASK (mode)))
4845 {} /* Mode is not used: nothing to do. */
4846 else if (par->inner_mask & GOMP_DIM_MASK (mode)
4847 || !par->forked_insn)
4848 /* Partitioned in inner parallels, or we're not partitioned
4849 at all: neuter individual blocks. */
4850 neuter_mask |= GOMP_DIM_MASK (mode);
4851 else if (!par->parent || !par->parent->forked_insn
4852 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4853 /* Parent isn't a parallel, or already contains this partitioning:
4854 skip the parallel at this level. */
4855 skip_mask |= GOMP_DIM_MASK (mode);
4856 else
4857 {} /* Parent will skip this parallel itself. */
4858 }
4859
4860 if (neuter_mask)
4861 {
4862 int ix, len;
4863
4864 if (nvptx_optimize)
4865 {
4866 /* Neuter whole SESE regions. */
4867 bb_pair_vec_t regions;
4868
4869 nvptx_find_sese (par->blocks, regions);
4870 len = regions.length ();
4871 for (ix = 0; ix != len; ix++)
4872 {
4873 basic_block from = regions[ix].first;
4874 basic_block to = regions[ix].second;
4875
4876 if (from)
4877 nvptx_single (neuter_mask, from, to);
4878 else
4879 gcc_assert (!to);
4880 }
4881 }
4882 else
4883 {
4884 /* Neuter each BB individually. */
4885 len = par->blocks.length ();
4886 for (ix = 0; ix != len; ix++)
4887 {
4888 basic_block block = par->blocks[ix];
4889
4890 nvptx_single (neuter_mask, block, block);
4891 }
4892 }
4893 }
4894
4895 if (skip_mask)
4896 nvptx_skip_par (skip_mask, par);
4897
4898 if (par->next)
4899 nvptx_neuter_pars (par->next, modes, outer);
4900 }
4901
4902 static void
4903 populate_offload_attrs (offload_attrs *oa)
4904 {
4905 tree attr = oacc_get_fn_attrib (current_function_decl);
4906 tree dims = TREE_VALUE (attr);
4907 unsigned ix;
4908
4909 oa->mask = 0;
4910
4911 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4912 {
4913 tree t = TREE_VALUE (dims);
4914 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4915 tree allowed = TREE_PURPOSE (dims);
4916
4917 if (size != 1 && !(allowed && integer_zerop (allowed)))
4918 oa->mask |= GOMP_DIM_MASK (ix);
4919
4920 switch (ix)
4921 {
4922 case GOMP_DIM_GANG:
4923 oa->num_gangs = size;
4924 break;
4925
4926 case GOMP_DIM_WORKER:
4927 oa->num_workers = size;
4928 break;
4929
4930 case GOMP_DIM_VECTOR:
4931 oa->vector_length = size;
4932 break;
4933 }
4934 }
4935 }
4936
4937 #if WORKAROUND_PTXJIT_BUG_2
4938 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
4939 is needed in the nvptx target because the branches generated for
4940 partitioning are NONJUMP_INSN_P, not JUMP_P. */
4941
4942 static rtx
4943 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4944 {
4945 rtx pat;
4946 if ((strict && !JUMP_P (insn))
4947 || (!strict && !INSN_P (insn)))
4948 return NULL_RTX;
4949 pat = PATTERN (insn);
4950
4951 /* The set is allowed to appear either as the insn pattern or
4952 the first set in a PARALLEL. */
4953 if (GET_CODE (pat) == PARALLEL)
4954 pat = XVECEXP (pat, 0, 0);
4955 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4956 return pat;
4957
4958 return NULL_RTX;
4959 }
4960
4961 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
4962
4963 static rtx
4964 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4965 {
4966 rtx x = nvptx_pc_set (insn, strict);
4967
4968 if (!x)
4969 return NULL_RTX;
4970 x = SET_SRC (x);
4971 if (GET_CODE (x) == LABEL_REF)
4972 return x;
4973 if (GET_CODE (x) != IF_THEN_ELSE)
4974 return NULL_RTX;
4975 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4976 return XEXP (x, 1);
4977 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4978 return XEXP (x, 2);
4979 return NULL_RTX;
4980 }
4981
4982 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4983 insn in between the branch and the label. This works around a JIT bug
4984 observed at driver version 384.111, at -O0 for sm_50. */
4985
4986 static void
4987 prevent_branch_around_nothing (void)
4988 {
4989 rtx_insn *seen_label = NULL;
4990 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4991 {
4992 if (INSN_P (insn) && condjump_p (insn))
4993 {
4994 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4995 continue;
4996 }
4997
4998 if (seen_label == NULL)
4999 continue;
5000
5001 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5002 continue;
5003
5004 if (INSN_P (insn))
5005 switch (recog_memoized (insn))
5006 {
5007 case CODE_FOR_nvptx_fork:
5008 case CODE_FOR_nvptx_forked:
5009 case CODE_FOR_nvptx_joining:
5010 case CODE_FOR_nvptx_join:
5011 continue;
5012 default:
5013 seen_label = NULL;
5014 continue;
5015 }
5016
5017 if (LABEL_P (insn) && insn == seen_label)
5018 emit_insn_before (gen_fake_nop (), insn);
5019
5020 seen_label = NULL;
5021 }
5022 }
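/* Sketch of the rewrite (predicate and label names hypothetical): a
   conditional branch immediately followed by its own target,

	@%p	bra	$L1;
     $L1:

   has a fake_nop insn inserted in between, so the JIT no longer sees a
   branch around nothing.  */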
5023 #endif
5024
5025 #ifdef WORKAROUND_PTXJIT_BUG_3
5026 /* Insert two membar.cta insns in between two consecutive bar.sync insns. This
5027 works around a hang observed at driver version 390.48 for sm_50. */
5028
5029 static void
5030 workaround_barsyncs (void)
5031 {
5032 bool seen_barsync = false;
5033 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5034 {
5035 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
5036 {
5037 if (seen_barsync)
5038 {
5039 emit_insn_before (gen_nvptx_membar_cta (), insn);
5040 emit_insn_before (gen_nvptx_membar_cta (), insn);
5041 }
5042
5043 seen_barsync = true;
5044 continue;
5045 }
5046
5047 if (!seen_barsync)
5048 continue;
5049
5050 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5051 continue;
5052 else if (INSN_P (insn))
5053 switch (recog_memoized (insn))
5054 {
5055 case CODE_FOR_nvptx_fork:
5056 case CODE_FOR_nvptx_forked:
5057 case CODE_FOR_nvptx_joining:
5058 case CODE_FOR_nvptx_join:
5059 continue;
5060 default:
5061 break;
5062 }
5063
5064 seen_barsync = false;
5065 }
5066 }
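/* Sketch of the rewrite (barrier operands illustrative): two back-to-back
   CTA barriers

	bar.sync 0;
	bar.sync 0;

   become

	bar.sync 0;
	membar.cta;
	membar.cta;
	bar.sync 0;

   Only notes and the fork/forked/joining/join markers may sit between the
   two bar.syncs for the workaround to trigger.  */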
5067 #endif
5068
5069 /* PTX-specific reorganization
5070 - Split blocks at fork and join instructions
5071 - Compute live registers
5072 - Mark now-unused registers, so the function header doesn't declare
5073 unused registers.
5074 - Insert state propagation when entering partitioned mode
5075 - Insert neutering instructions when in single mode
5076 - Replace subregs with suitable sequences.
5077 */
5078
5079 static void
5080 nvptx_reorg (void)
5081 {
5082 /* We are freeing block_for_insn in the toplev to keep compatibility
5083 with old MDEP_REORGS that are not CFG based. Recompute it now. */
5084 compute_bb_for_insn ();
5085
5086 thread_prologue_and_epilogue_insns ();
5087
5088 /* Split blocks and record interesting unspecs. */
5089 bb_insn_map_t bb_insn_map;
5090
5091 nvptx_split_blocks (&bb_insn_map);
5092
5093 /* Compute live regs */
5094 df_clear_flags (DF_LR_RUN_DCE);
5095 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5096 df_live_add_problem ();
5097 df_live_set_all_dirty ();
5098 df_analyze ();
5099 regstat_init_n_sets_and_refs ();
5100
5101 if (dump_file)
5102 df_dump (dump_file);
5103
5104 /* Mark unused regs as unused. */
5105 int max_regs = max_reg_num ();
5106 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5107 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5108 regno_reg_rtx[i] = const0_rtx;
5109
5110 /* Determine launch dimensions of the function. If it is not an
5111 offloaded function (i.e. this is a regular compilation), the
5112 function has no neutering. */
5113 tree attr = oacc_get_fn_attrib (current_function_decl);
5114 if (attr)
5115 {
5116 /* If we determined this mask before RTL expansion, we could
5117 elide emission of some levels of forks and joins. */
5118 offload_attrs oa;
5119
5120 populate_offload_attrs (&oa);
5121
5122 /* If there is worker neutering, there must be vector
5123 neutering. Otherwise the hardware will fail. */
5124 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5125 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5126
5127 /* Discover & process partitioned regions. */
5128 parallel *pars = nvptx_discover_pars (&bb_insn_map);
5129 nvptx_process_pars (pars);
5130 nvptx_neuter_pars (pars, oa.mask, 0);
5131 delete pars;
5132 }
5133
5134 /* Replace subregs. */
5135 nvptx_reorg_subreg ();
5136
5137 if (TARGET_UNIFORM_SIMT)
5138 nvptx_reorg_uniform_simt ();
5139
5140 #if WORKAROUND_PTXJIT_BUG_2
5141 prevent_branch_around_nothing ();
5142 #endif
5143
5144 #ifdef WORKAROUND_PTXJIT_BUG_3
5145 workaround_barsyncs ();
5146 #endif
5147
5148 regstat_free_n_sets_and_refs ();
5149
5150 df_finish_pass (true);
5151 }
5152
5153 /* Handle a "kernel" attribute; arguments as in
5154 struct attribute_spec.handler. */
5155
5156 static tree
5157 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5158 int ARG_UNUSED (flags), bool *no_add_attrs)
5159 {
5160 tree decl = *node;
5161
5162 if (TREE_CODE (decl) != FUNCTION_DECL)
5163 {
5164 error ("%qE attribute only applies to functions", name);
5165 *no_add_attrs = true;
5166 }
5167 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5168 {
5169 error ("%qE attribute requires a void return type", name);
5170 *no_add_attrs = true;
5171 }
5172
5173 return NULL_TREE;
5174 }
5175
5176 /* Handle a "shared" attribute; arguments as in
5177 struct attribute_spec.handler. */
5178
5179 static tree
5180 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5181 int ARG_UNUSED (flags), bool *no_add_attrs)
5182 {
5183 tree decl = *node;
5184
5185 if (TREE_CODE (decl) != VAR_DECL)
5186 {
5187 error ("%qE attribute only applies to variables", name);
5188 *no_add_attrs = true;
5189 }
5190 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5191 {
5192 error ("%qE attribute not allowed with auto storage class", name);
5193 *no_add_attrs = true;
5194 }
5195
5196 return NULL_TREE;
5197 }
5198
5199 /* Table of valid machine attributes. */
5200 static const struct attribute_spec nvptx_attribute_table[] =
5201 {
5202 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5203 affects_type_identity, handler, exclude } */
5204 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
5205 NULL },
5206 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
5207 NULL },
5208 { NULL, 0, 0, false, false, false, false, NULL, NULL }
5209 };
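/* For reference, a minimal sketch of how these attributes are spelled in
   user code (hypothetical declarations, not from this file):

     extern int counter __attribute__((shared));	// placed in .shared
     void __attribute__((kernel)) entry (void *args);	// PTX .entry function
   */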
5210
5211 /* Limit vector alignments to BIGGEST_ALIGNMENT. */
5212
5213 static HOST_WIDE_INT
5214 nvptx_vector_alignment (const_tree type)
5215 {
5216 unsigned HOST_WIDE_INT align;
5217 tree size = TYPE_SIZE (type);
5218
5219 /* Ensure align is not bigger than BIGGEST_ALIGNMENT. */
5220 if (tree_fits_uhwi_p (size))
5221 {
5222 align = tree_to_uhwi (size);
5223 align = MIN (align, BIGGEST_ALIGNMENT);
5224 }
5225 else
5226 align = BIGGEST_ALIGNMENT;
5227
5228 /* Ensure align is not smaller than mode alignment. */
5229 align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5230
5231 return align;
5232 }
5233
5234 /* Indicate that INSN cannot be duplicated. */
5235
5236 static bool
5237 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5238 {
5239 switch (recog_memoized (insn))
5240 {
5241 case CODE_FOR_nvptx_shufflesi:
5242 case CODE_FOR_nvptx_shufflesf:
5243 case CODE_FOR_nvptx_barsync:
5244 case CODE_FOR_nvptx_fork:
5245 case CODE_FOR_nvptx_forked:
5246 case CODE_FOR_nvptx_joining:
5247 case CODE_FOR_nvptx_join:
5248 return true;
5249 default:
5250 return false;
5251 }
5252 }
5253
5254 /* Section anchors do not work. Initialization for flag_section_anchor
5255 probes the existence of the anchoring target hooks and prevents
5256 anchoring if they don't exist. However, we may be being used with
5257 a host-side compiler that does support anchoring, and hence see
5258 the anchor flag set (as it's not recalculated). So provide an
5259 implementation denying anchoring. */
5260
5261 static bool
5262 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5263 {
5264 return false;
5265 }
5266
5267 /* Record a symbol for mkoffload to enter into the mapping table. */
5268
5269 static void
5270 nvptx_record_offload_symbol (tree decl)
5271 {
5272 switch (TREE_CODE (decl))
5273 {
5274 case VAR_DECL:
5275 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5276 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5277 break;
5278
5279 case FUNCTION_DECL:
5280 {
5281 tree attr = oacc_get_fn_attrib (decl);
5282 /* OpenMP offloading does not set this attribute. */
5283 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5284
5285 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5286 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5287
5288 for (; dims; dims = TREE_CHAIN (dims))
5289 {
5290 int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5291
5292 gcc_assert (!TREE_PURPOSE (dims));
5293 fprintf (asm_out_file, ", %#x", size);
5294 }
5295
5296 fprintf (asm_out_file, "\n");
5297 }
5298 break;
5299
5300 default:
5301 gcc_unreachable ();
5302 }
5303 }
5304
5305 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects
5306 at the start of a file. */
5307
5308 static void
5309 nvptx_file_start (void)
5310 {
5311 fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5312 fputs ("\t.version\t3.1\n", asm_out_file);
5313 if (TARGET_SM35)
5314 fputs ("\t.target\tsm_35\n", asm_out_file);
5315 else
5316 fputs ("\t.target\tsm_30\n", asm_out_file);
5317 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5318 fputs ("// END PREAMBLE\n", asm_out_file);
5319 }
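/* For a default 64-bit, sm_30 configuration the preamble comes out roughly
   as:

     // BEGIN PREAMBLE
	     .version	3.1
	     .target	sm_30
	     .address_size 64
     // END PREAMBLE
   */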
5320
5321 /* Emit a declaration for a worker and vector-level buffer in .shared
5322 memory. */
5323
5324 static void
5325 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5326 {
5327 const char *name = XSTR (sym, 0);
5328
5329 write_var_marker (file, true, false, name);
5330 fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5331 align, name, size);
5332 }
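/* E.g. write_shared_buffer (file, sym, 8, 128), with SYM naming a
   hypothetical "__shared_buf", prints the marker from write_var_marker
   followed by:

     .shared .align 8 .u8 __shared_buf[128];  */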
5333
5334 /* Write out the function declarations we've collected and declare storage
5335 for the broadcast buffer. */
5336
5337 static void
5338 nvptx_file_end (void)
5339 {
5340 hash_table<tree_hasher>::iterator iter;
5341 tree decl;
5342 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5343 nvptx_record_fndecl (decl);
5344 fputs (func_decls.str().c_str(), asm_out_file);
5345
5346 if (oacc_bcast_size)
5347 write_shared_buffer (asm_out_file, oacc_bcast_sym,
5348 oacc_bcast_align, oacc_bcast_size);
5349
5350 if (worker_red_size)
5351 write_shared_buffer (asm_out_file, worker_red_sym,
5352 worker_red_align, worker_red_size);
5353
5354 if (vector_red_size)
5355 write_shared_buffer (asm_out_file, vector_red_sym,
5356 vector_red_align, vector_red_size);
5357
5358 if (need_softstack_decl)
5359 {
5360 write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5361 /* 32 is the maximum number of warps in a block. Even though it's an
5362 external declaration, emit the array size explicitly; otherwise, it
5363 may fail at PTX JIT time if the definition is later in link order. */
5364 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5365 POINTER_SIZE);
5366 }
5367 if (need_unisimt_decl)
5368 {
5369 write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5370 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5371 }
5372 }
5373
5374 /* Expander for the shuffle builtins. */
5375
5376 static rtx
5377 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5378 {
5379 if (ignore)
5380 return target;
5381
5382 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5383 NULL_RTX, mode, EXPAND_NORMAL);
5384 if (!REG_P (src))
5385 src = copy_to_mode_reg (mode, src);
5386
5387 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5388 NULL_RTX, SImode, EXPAND_NORMAL);
5389 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5390 NULL_RTX, SImode, EXPAND_NORMAL);
5391
5392 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5393 idx = copy_to_mode_reg (SImode, idx);
5394
5395 rtx pat = nvptx_gen_shuffle (target, src, idx,
5396 (nvptx_shuffle_kind) INTVAL (op));
5397 if (pat)
5398 emit_insn (pat);
5399
5400 return target;
5401 }
5402
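/* Output the PTX for computing the address of this function's vector
   reduction buffer partition into DST: a plain copy of the per-function
   partition base register when OFFSET is zero, otherwise that register
   plus OFFSET.  */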
5403 const char *
5404 nvptx_output_red_partition (rtx dst, rtx offset)
5405 {
5406 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5407 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5408
5409 if (offset == const0_rtx)
5410 fprintf (asm_out_file, zero_offset, REGNO (dst),
5411 REGNO (cfun->machine->red_partition));
5412 else
5413 fprintf (asm_out_file, with_offset, REGNO (dst),
5414 REGNO (cfun->machine->red_partition), UINTVAL (offset));
5415
5416 return "";
5417 }
5418
5419 /* Shared-memory reduction address expander. */
5420
5421 static rtx
5422 nvptx_expand_shared_addr (tree exp, rtx target,
5423 machine_mode ARG_UNUSED (mode), int ignore,
5424 int vector)
5425 {
5426 if (ignore)
5427 return target;
5428
5429 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5430 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5431 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5432 rtx addr = worker_red_sym;
5433
5434 if (vector)
5435 {
5436 offload_attrs oa;
5437
5438 populate_offload_attrs (&oa);
5439
5440 unsigned int psize = ROUND_UP (size + offset, align);
5441 unsigned int pnum = nvptx_mach_max_workers ();
5442 vector_red_partition = MAX (vector_red_partition, psize);
5443 vector_red_size = MAX (vector_red_size, psize * pnum);
5444 vector_red_align = MAX (vector_red_align, align);
5445
5446 if (cfun->machine->red_partition == NULL)
5447 cfun->machine->red_partition = gen_reg_rtx (Pmode);
5448
5449 addr = gen_reg_rtx (Pmode);
5450 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5451 }
5452 else
5453 {
5454 worker_red_align = MAX (worker_red_align, align);
5455 worker_red_size = MAX (worker_red_size, size + offset);
5456
5457 if (offset)
5458 {
5459 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5460 addr = gen_rtx_CONST (Pmode, addr);
5461 }
5462 }
5463
5464 emit_move_insn (target, addr);
5465 return target;
5466 }
5467
5468 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do
5469 not require taking the address of any object, other than the memory
5470 cell being operated on. */
5471
5472 static rtx
5473 nvptx_expand_cmp_swap (tree exp, rtx target,
5474 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5475 {
5476 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5477
5478 if (!target)
5479 target = gen_reg_rtx (mode);
5480
5481 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5482 NULL_RTX, Pmode, EXPAND_NORMAL);
5483 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5484 NULL_RTX, mode, EXPAND_NORMAL);
5485 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5486 NULL_RTX, mode, EXPAND_NORMAL);
5487 rtx pat;
5488
5489 mem = gen_rtx_MEM (mode, mem);
5490 if (!REG_P (cmp))
5491 cmp = copy_to_mode_reg (mode, cmp);
5492 if (!REG_P (src))
5493 src = copy_to_mode_reg (mode, src);
5494
5495 if (mode == SImode)
5496 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5497 else
5498 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5499
5500 emit_insn (pat);
5501
5502 return target;
5503 }
5504
5505
5506 /* Codes for all the NVPTX builtins. */
5507 enum nvptx_builtins
5508 {
5509 NVPTX_BUILTIN_SHUFFLE,
5510 NVPTX_BUILTIN_SHUFFLELL,
5511 NVPTX_BUILTIN_WORKER_ADDR,
5512 NVPTX_BUILTIN_VECTOR_ADDR,
5513 NVPTX_BUILTIN_CMP_SWAP,
5514 NVPTX_BUILTIN_CMP_SWAPLL,
5515 NVPTX_BUILTIN_MAX
5516 };
5517
5518 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5519
5520 /* Return the NVPTX builtin for CODE. */
5521
5522 static tree
5523 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5524 {
5525 if (code >= NVPTX_BUILTIN_MAX)
5526 return error_mark_node;
5527
5528 return nvptx_builtin_decls[code];
5529 }
5530
5531 /* Set up all builtin functions for this target. */
5532
5533 static void
5534 nvptx_init_builtins (void)
5535 {
5536 #define DEF(ID, NAME, T) \
5537 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \
5538 = add_builtin_function ("__builtin_nvptx_" NAME, \
5539 build_function_type_list T, \
5540 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5541 #define ST sizetype
5542 #define UINT unsigned_type_node
5543 #define LLUINT long_long_unsigned_type_node
5544 #define PTRVOID ptr_type_node
5545
5546 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5547 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5548 DEF (WORKER_ADDR, "worker_addr",
5549 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5550 DEF (VECTOR_ADDR, "vector_addr",
5551 (PTRVOID, ST, UINT, UINT, NULL_TREE));
5552 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5553 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5554
5555 #undef DEF
5556 #undef ST
5557 #undef UINT
5558 #undef LLUINT
5559 #undef PTRVOID
5560 }
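
/* These builtins are only generated internally, for instance by
   nvptx_generate_vector_shuffle and nvptx_lockless_update below; they are
   not intended as a user-visible API.  A generated shuffle call is
   conceptually

     other = __builtin_nvptx_shuffle (val, shift, SHUFFLE_DOWN);

   which reads VAL from the lane SHIFT positions higher in the warp.  */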
5561
5562 /* Expand an expression EXP that calls a built-in function,
5563 with result going to TARGET if that's convenient
5564 (and in mode MODE if that's convenient).
5565 SUBTARGET may be used as the target for computing one of EXP's operands.
5566 IGNORE is nonzero if the value is to be ignored. */
5567
5568 static rtx
5569 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5570 machine_mode mode, int ignore)
5571 {
5572 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5573 switch (DECL_MD_FUNCTION_CODE (fndecl))
5574 {
5575 case NVPTX_BUILTIN_SHUFFLE:
5576 case NVPTX_BUILTIN_SHUFFLELL:
5577 return nvptx_expand_shuffle (exp, target, mode, ignore);
5578
5579 case NVPTX_BUILTIN_WORKER_ADDR:
5580 return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5581
5582 case NVPTX_BUILTIN_VECTOR_ADDR:
5583 return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5584
5585 case NVPTX_BUILTIN_CMP_SWAP:
5586 case NVPTX_BUILTIN_CMP_SWAPLL:
5587 return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5588
5589 default: gcc_unreachable ();
5590 }
5591 }
5592
5593 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */
5594
5595 static int
5596 nvptx_simt_vf ()
5597 {
5598 return PTX_WARP_SIZE;
5599 }
5600
5601 /* Return 1 if TRAIT NAME is present in the OpenMP context's
5602 device trait set, return 0 if not present in any OpenMP context in the
5603 whole translation unit, or -1 if not present in the current OpenMP context
5604 but might be present in another OpenMP context in the same TU. */
5605
5606 int
5607 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5608 const char *name)
5609 {
5610 switch (trait)
5611 {
5612 case omp_device_kind:
5613 return strcmp (name, "gpu") == 0;
5614 case omp_device_arch:
5615 return strcmp (name, "nvptx") == 0;
5616 case omp_device_isa:
5617 if (strcmp (name, "sm_30") == 0)
5618 return !TARGET_SM35;
5619 if (strcmp (name, "sm_35") == 0)
5620 return TARGET_SM35;
5621 return 0;
5622 default:
5623 gcc_unreachable ();
5624 }
5625 }
5626
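/* Return true if a vector_length of L is valid for PTX: L must be a
   positive multiple of the warp size.  */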
5627 static bool
5628 nvptx_welformed_vector_length_p (int l)
5629 {
5630 gcc_assert (l > 0);
5631 return l % PTX_WARP_SIZE == 0;
5632 }
5633
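/* Clamp the launch dimensions in DIMS to the limits of the PTX target:
   the maximum vector length, the maximum number of workers, the CTA size,
   and the number of per-worker barriers available.  */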
5634 static void
5635 nvptx_apply_dim_limits (int dims[])
5636 {
5637 /* Check that the vector_length is not too large. */
5638 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5639 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5640
5641 /* Check that the number of workers is not too large. */
5642 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5643 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5644
5645 /* Ensure that num_worker * vector_length <= cta size. */
5646 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5647 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5648 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5649
5650 /* If we need a per-worker barrier ...  */
5651 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5652 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5653 /* Don't use more barriers than available. */
5654 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5655 PTX_NUM_PER_WORKER_BARRIERS);
5656 }
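
/* For example (an illustrative request, not a particular test case):
   num_workers (64) with vector_length (2048) is first reduced to 64 x 1024
   by the vector-length limit, then to 32 x 1024 by the worker limit, and
   finally, because 32 * 1024 exceeds the CTA size, the vector length falls
   back to the warp size, giving 32 workers x 32 vector lanes.  */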
5657
5658 /* Return true if FNDECL contains calls to vector-partitionable routines. */
5659
5660 static bool
5661 has_vector_partitionable_routine_calls_p (tree fndecl)
5662 {
5663 if (!fndecl)
5664 return false;
5665
5666 basic_block bb;
5667 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5668 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5669 gsi_next_nondebug (&i))
5670 {
5671 gimple *stmt = gsi_stmt (i);
5672 if (gimple_code (stmt) != GIMPLE_CALL)
5673 continue;
5674
5675 tree callee = gimple_call_fndecl (stmt);
5676 if (!callee)
5677 continue;
5678
5679 tree attrs = oacc_get_fn_attrib (callee);
5680 if (attrs == NULL_TREE)
5681 return false;
5682
5683 int partition_level = oacc_fn_attrib_level (attrs);
5684 bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5685 if (!seq_routine_p)
5686 return true;
5687 }
5688
5689 return false;
5690 }
5691
5692 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5693 DIMS has changed. */
5694
5695 static void
5696 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
5697 {
5698 bool oacc_default_dims_p = false;
5699 bool oacc_min_dims_p = false;
5700 bool offload_region_p = false;
5701 bool routine_p = false;
5702 bool routine_seq_p = false;
5703 int default_vector_length = -1;
5704
5705 if (decl == NULL_TREE)
5706 {
5707 if (fn_level == -1)
5708 oacc_default_dims_p = true;
5709 else if (fn_level == -2)
5710 oacc_min_dims_p = true;
5711 else
5712 gcc_unreachable ();
5713 }
5714 else if (fn_level == -1)
5715 offload_region_p = true;
5716 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5717 {
5718 routine_p = true;
5719 routine_seq_p = fn_level == GOMP_DIM_MAX;
5720 }
5721 else
5722 gcc_unreachable ();
5723
5724 if (oacc_min_dims_p)
5725 {
5726 gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5727 gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5728 gcc_assert (dims[GOMP_DIM_GANG] == 1);
5729
5730 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5731 return;
5732 }
5733
5734 if (routine_p)
5735 {
5736 if (!routine_seq_p)
5737 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5738
5739 return;
5740 }
5741
5742 if (oacc_default_dims_p)
5743 {
5744 /* -1 : not set
5745 0 : set at runtime, f.i. -fopenacc-dims=-
5746 >= 1: set at compile time, f.i. -fopenacc-dims=1. */
5747 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5748 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5749 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5750
5751 /* But -fopenacc-dims=- is not yet supported on trunk. */
5752 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5753 gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5754 gcc_assert (dims[GOMP_DIM_GANG] != 0);
5755 }
5756
5757 if (offload_region_p)
5758 {
5759 /* -1 : not set
5760 0 : set using variable, f.i. num_gangs (n)
5761 >= 1: set using constant, f.i. num_gangs (1). */
5762 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5763 gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5764 gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5765 }
5766
5767 if (offload_region_p)
5768 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5769 else
5770 /* oacc_default_dims_p. */
5771 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5772
5773 int old_dims[GOMP_DIM_MAX];
5774 unsigned int i;
5775 for (i = 0; i < GOMP_DIM_MAX; ++i)
5776 old_dims[i] = dims[i];
5777
5778 const char *vector_reason = NULL;
5779 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5780 {
5781 default_vector_length = PTX_WARP_SIZE;
5782
5783 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5784 {
5785 vector_reason = G_("using vector_length (%d) due to call to"
5786 " vector-partitionable routine, ignoring %d");
5787 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5788 }
5789 }
5790
5791 if (dims[GOMP_DIM_VECTOR] == 0)
5792 {
5793 vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5794 dims[GOMP_DIM_VECTOR] = default_vector_length;
5795 }
5796
5797 if (dims[GOMP_DIM_VECTOR] > 0
5798 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5799 dims[GOMP_DIM_VECTOR] = default_vector_length;
5800
5801 nvptx_apply_dim_limits (dims);
5802
5803 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5804 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5805 vector_reason != NULL
5806 ? vector_reason
5807 : G_("using vector_length (%d), ignoring %d"),
5808 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5809
5810 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5811 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5812 G_("using num_workers (%d), ignoring %d"),
5813 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5814
5815 if (oacc_default_dims_p)
5816 {
5817 if (dims[GOMP_DIM_VECTOR] < 0)
5818 dims[GOMP_DIM_VECTOR] = default_vector_length;
5819 if (dims[GOMP_DIM_WORKER] < 0)
5820 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5821 if (dims[GOMP_DIM_GANG] < 0)
5822 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5823 nvptx_apply_dim_limits (dims);
5824 }
5825
5826 if (offload_region_p)
5827 {
5828 for (i = 0; i < GOMP_DIM_MAX; i++)
5829 {
5830 if (!(dims[i] < 0))
5831 continue;
5832
5833 if ((used & GOMP_DIM_MASK (i)) == 0)
5834 /* Function oacc_validate_dims will apply the minimal dimension. */
5835 continue;
5836
5837 dims[i] = (i == GOMP_DIM_VECTOR
5838 ? default_vector_length
5839 : oacc_get_default_dim (i));
5840 }
5841
5842 nvptx_apply_dim_limits (dims);
5843 }
5844 }
5845
5846 /* Validate compute dimensions of an OpenACC offload or routine, fill
5847 in non-unity defaults. FN_LEVEL indicates the level at which a
5848 routine might spawn a loop. It is negative for non-routines. If
5849 DECL is null, we are validating the default dimensions. */
5850
5851 static bool
5852 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5853 {
5854 int old_dims[GOMP_DIM_MAX];
5855 unsigned int i;
5856
5857 for (i = 0; i < GOMP_DIM_MAX; ++i)
5858 old_dims[i] = dims[i];
5859
5860 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5861
5862 gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5863 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5864 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5865
5866 for (i = 0; i < GOMP_DIM_MAX; ++i)
5867 if (old_dims[i] != dims[i])
5868 return true;
5869
5870 return false;
5871 }
5872
5873 /* Return maximum dimension size, or zero for unbounded. */
5874
5875 static int
5876 nvptx_dim_limit (int axis)
5877 {
5878 switch (axis)
5879 {
5880 case GOMP_DIM_VECTOR:
5881 return PTX_MAX_VECTOR_LENGTH;
5882
5883 default:
5884 break;
5885 }
5886 return 0;
5887 }
5888
5889 /* Determine whether fork & joins are needed. */
5890
5891 static bool
5892 nvptx_goacc_fork_join (gcall *call, const int dims[],
5893 bool ARG_UNUSED (is_fork))
5894 {
5895 tree arg = gimple_call_arg (call, 2);
5896 unsigned axis = TREE_INT_CST_LOW (arg);
5897
5898 /* We only care about worker and vector partitioning. */
5899 if (axis < GOMP_DIM_WORKER)
5900 return false;
5901
5902 /* If the size is 1, there's no partitioning. */
5903 if (dims[axis] == 1)
5904 return false;
5905
5906 return true;
5907 }
5908
5909 /* Generate a PTX builtin function call that returns the address in
5910 the worker reduction buffer at OFFSET. TYPE is the type of the
5911 data at that location. */
5912
5913 static tree
5914 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5915 {
5916 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5917 if (vector)
5918 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5919 machine_mode mode = TYPE_MODE (type);
5920 tree fndecl = nvptx_builtin_decl (addr_dim, true);
5921 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5922 tree align = build_int_cst (unsigned_type_node,
5923 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5924 tree call = build_call_expr (fndecl, 3, offset, size, align);
5925
5926 return fold_convert (build_pointer_type (type), call);
5927 }
5928
5929 /* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
5930 will cast the variable if necessary. */
5931
5932 static void
5933 nvptx_generate_vector_shuffle (location_t loc,
5934 tree dest_var, tree var, unsigned shift,
5935 gimple_seq *seq)
5936 {
5937 unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5938 tree_code code = NOP_EXPR;
5939 tree arg_type = unsigned_type_node;
5940 tree var_type = TREE_TYPE (var);
5941 tree dest_type = var_type;
5942
5943 if (TREE_CODE (var_type) == COMPLEX_TYPE)
5944 var_type = TREE_TYPE (var_type);
5945
5946 if (TREE_CODE (var_type) == REAL_TYPE)
5947 code = VIEW_CONVERT_EXPR;
5948
5949 if (TYPE_SIZE (var_type)
5950 == TYPE_SIZE (long_long_unsigned_type_node))
5951 {
5952 fn = NVPTX_BUILTIN_SHUFFLELL;
5953 arg_type = long_long_unsigned_type_node;
5954 }
5955
5956 tree call = nvptx_builtin_decl (fn, true);
5957 tree bits = build_int_cst (unsigned_type_node, shift);
5958 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5959 tree expr;
5960
5961 if (var_type != dest_type)
5962 {
5963 /* Do real and imaginary parts separately. */
5964 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5965 real = fold_build1 (code, arg_type, real);
5966 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5967 real = fold_build1 (code, var_type, real);
5968
5969 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5970 imag = fold_build1 (code, arg_type, imag);
5971 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5972 imag = fold_build1 (code, var_type, imag);
5973
5974 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5975 }
5976 else
5977 {
5978 expr = fold_build1 (code, arg_type, var);
5979 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5980 expr = fold_build1 (code, dest_type, expr);
5981 }
5982
5983 gimplify_assign (dest_var, expr, seq);
5984 }
5985
5986 /* Lazily generate the global lock var decl and return its address. */
5987
5988 static tree
5989 nvptx_global_lock_addr ()
5990 {
5991 tree v = global_lock_var;
5992
5993 if (!v)
5994 {
5995 tree name = get_identifier ("__reduction_lock");
5996 tree type = build_qualified_type (unsigned_type_node,
5997 TYPE_QUAL_VOLATILE);
5998 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5999 global_lock_var = v;
6000 DECL_ARTIFICIAL (v) = 1;
6001 DECL_EXTERNAL (v) = 1;
6002 TREE_STATIC (v) = 1;
6003 TREE_PUBLIC (v) = 1;
6004 TREE_USED (v) = 1;
6005 mark_addressable (v);
6006 mark_decl_referenced (v);
6007 }
6008
6009 return build_fold_addr_expr (v);
6010 }
6011
6012 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
6013 GSI.  We use a lockless scheme for nearly all cases, which looks
6014 like:
6015 actual = initval(OP);
6016 do {
6017 guess = actual;
6018 write = guess OP myval;
6019 actual = cmp&swap (ptr, guess, write)
6020 } while (actual bit-different-to guess);
6021 return write;
6022
6023 This relies on a cmp&swap instruction, which is available for 32-
6024 and 64-bit types. Larger types must use a locking scheme. */
6025
6026 static tree
6027 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
6028 tree ptr, tree var, tree_code op)
6029 {
6030 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
6031 tree_code code = NOP_EXPR;
6032 tree arg_type = unsigned_type_node;
6033 tree var_type = TREE_TYPE (var);
6034
6035 if (TREE_CODE (var_type) == COMPLEX_TYPE
6036 || TREE_CODE (var_type) == REAL_TYPE)
6037 code = VIEW_CONVERT_EXPR;
6038
6039 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
6040 {
6041 arg_type = long_long_unsigned_type_node;
6042 fn = NVPTX_BUILTIN_CMP_SWAPLL;
6043 }
6044
6045 tree swap_fn = nvptx_builtin_decl (fn, true);
6046
6047 gimple_seq init_seq = NULL;
6048 tree init_var = make_ssa_name (arg_type);
6049 tree init_expr = omp_reduction_init_op (loc, op, var_type);
6050 init_expr = fold_build1 (code, arg_type, init_expr);
6051 gimplify_assign (init_var, init_expr, &init_seq);
6052 gimple *init_end = gimple_seq_last (init_seq);
6053
6054 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
6055
6056 /* Split the block just after the init stmts. */
6057 basic_block pre_bb = gsi_bb (*gsi);
6058 edge pre_edge = split_block (pre_bb, init_end);
6059 basic_block loop_bb = pre_edge->dest;
6060 pre_bb = pre_edge->src;
6061 /* Reset the iterator. */
6062 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6063
6064 tree expect_var = make_ssa_name (arg_type);
6065 tree actual_var = make_ssa_name (arg_type);
6066 tree write_var = make_ssa_name (arg_type);
6067
6068 /* Build and insert the reduction calculation. */
6069 gimple_seq red_seq = NULL;
6070 tree write_expr = fold_build1 (code, var_type, expect_var);
6071 write_expr = fold_build2 (op, var_type, write_expr, var);
6072 write_expr = fold_build1 (code, arg_type, write_expr);
6073 gimplify_assign (write_var, write_expr, &red_seq);
6074
6075 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6076
6077 /* Build & insert the cmp&swap sequence. */
6078 gimple_seq latch_seq = NULL;
6079 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
6080 ptr, expect_var, write_var);
6081 gimplify_assign (actual_var, swap_expr, &latch_seq);
6082
6083 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
6084 NULL_TREE, NULL_TREE);
6085 gimple_seq_add_stmt (&latch_seq, cond);
6086
6087 gimple *latch_end = gimple_seq_last (latch_seq);
6088 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
6089
6090 /* Split the block just after the latch stmts. */
6091 edge post_edge = split_block (loop_bb, latch_end);
6092 basic_block post_bb = post_edge->dest;
6093 loop_bb = post_edge->src;
6094 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6095
6096 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6097 post_edge->probability = profile_probability::even ();
6098 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
6099 loop_edge->probability = profile_probability::even ();
6100 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
6101 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
6102
6103 gphi *phi = create_phi_node (expect_var, loop_bb);
6104 add_phi_arg (phi, init_var, pre_edge, loc);
6105 add_phi_arg (phi, actual_var, loop_edge, loc);
6106
6107 loop *loop = alloc_loop ();
6108 loop->header = loop_bb;
6109 loop->latch = loop_bb;
6110 add_loop (loop, loop_bb->loop_father);
6111
6112 return fold_build1 (code, var_type, write_var);
6113 }
6114
6115 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6116 GSI. This is necessary for types larger than 64 bits, where there
6117 is no cmp&swap instruction to implement a lockless scheme. We use
6118 a lock variable in global memory.
6119
6120 while (cmp&swap (&lock_var, 0, 1))
6121 continue;
6122 T accum = *ptr;
6123 accum = accum OP var;
6124 *ptr = accum;
6125 cmp&swap (&lock_var, 1, 0);
6126 return accum;
6127
6128 A lock in global memory is necessary to force execution engine
6129 descheduling and avoid resource starvation that can occur if the
6130 lock is in .shared memory. */
6131
6132 static tree
6133 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6134 tree ptr, tree var, tree_code op)
6135 {
6136 tree var_type = TREE_TYPE (var);
6137 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6138 tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6139 tree uns_locked = build_int_cst (unsigned_type_node, 1);
6140
6141 /* Split the block just before the gsi. Insert a gimple nop to make
6142 this easier. */
6143 gimple *nop = gimple_build_nop ();
6144 gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6145 basic_block entry_bb = gsi_bb (*gsi);
6146 edge entry_edge = split_block (entry_bb, nop);
6147 basic_block lock_bb = entry_edge->dest;
6148 /* Reset the iterator. */
6149 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6150
6151 /* Build and insert the locking sequence. */
6152 gimple_seq lock_seq = NULL;
6153 tree lock_var = make_ssa_name (unsigned_type_node);
6154 tree lock_expr = nvptx_global_lock_addr ();
6155 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6156 uns_unlocked, uns_locked);
6157 gimplify_assign (lock_var, lock_expr, &lock_seq);
6158 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6159 NULL_TREE, NULL_TREE);
6160 gimple_seq_add_stmt (&lock_seq, cond);
6161 gimple *lock_end = gimple_seq_last (lock_seq);
6162 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6163
6164 /* Split the block just after the lock sequence. */
6165 edge locked_edge = split_block (lock_bb, lock_end);
6166 basic_block update_bb = locked_edge->dest;
6167 lock_bb = locked_edge->src;
6168 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6169
6170 /* Create the lock loop ... */
6171 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6172 locked_edge->probability = profile_probability::even ();
6173 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6174 loop_edge->probability = profile_probability::even ();
6175 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6176 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6177
6178 /* ... and the loop structure. */
6179 loop *lock_loop = alloc_loop ();
6180 lock_loop->header = lock_bb;
6181 lock_loop->latch = lock_bb;
6182 lock_loop->nb_iterations_estimate = 1;
6183 lock_loop->any_estimate = true;
6184 add_loop (lock_loop, entry_bb->loop_father);
6185
6186 /* Build and insert the reduction calculation. */
6187 gimple_seq red_seq = NULL;
6188 tree acc_in = make_ssa_name (var_type);
6189 tree ref_in = build_simple_mem_ref (ptr);
6190 TREE_THIS_VOLATILE (ref_in) = 1;
6191 gimplify_assign (acc_in, ref_in, &red_seq);
6192
6193 tree acc_out = make_ssa_name (var_type);
6194 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6195 gimplify_assign (acc_out, update_expr, &red_seq);
6196
6197 tree ref_out = build_simple_mem_ref (ptr);
6198 TREE_THIS_VOLATILE (ref_out) = 1;
6199 gimplify_assign (ref_out, acc_out, &red_seq);
6200
6201 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6202
6203 /* Build & insert the unlock sequence. */
6204 gimple_seq unlock_seq = NULL;
6205 tree unlock_expr = nvptx_global_lock_addr ();
6206 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6207 uns_locked, uns_unlocked);
6208 gimplify_and_add (unlock_expr, &unlock_seq);
6209 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6210
6211 return acc_out;
6212 }
6213
6214 /* Emit a sequence to update a reduction accumulator at *PTR with the
6215 value held in VAR using operator OP. Return the updated value.
6216
6217 TODO: optimize for atomic ops and independent complex ops. */
6218
6219 static tree
6220 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6221 tree ptr, tree var, tree_code op)
6222 {
6223 tree type = TREE_TYPE (var);
6224 tree size = TYPE_SIZE (type);
6225
6226 if (size == TYPE_SIZE (unsigned_type_node)
6227 || size == TYPE_SIZE (long_long_unsigned_type_node))
6228 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6229 else
6230 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6231 }
6232
6233 /* NVPTX implementation of GOACC_REDUCTION_SETUP. */
6234
6235 static void
6236 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6237 {
6238 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6239 tree lhs = gimple_call_lhs (call);
6240 tree var = gimple_call_arg (call, 2);
6241 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6242 gimple_seq seq = NULL;
6243
6244 push_gimplify_context (true);
6245
6246 if (level != GOMP_DIM_GANG)
6247 {
6248 /* Copy the receiver object. */
6249 tree ref_to_res = gimple_call_arg (call, 1);
6250
6251 if (!integer_zerop (ref_to_res))
6252 var = build_simple_mem_ref (ref_to_res);
6253 }
6254
6255 if (level == GOMP_DIM_WORKER
6256 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6257 {
6258 /* Store incoming value to worker reduction buffer. */
6259 tree offset = gimple_call_arg (call, 5);
6260 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6261 level == GOMP_DIM_VECTOR);
6262 tree ptr = make_ssa_name (TREE_TYPE (call));
6263
6264 gimplify_assign (ptr, call, &seq);
6265 tree ref = build_simple_mem_ref (ptr);
6266 TREE_THIS_VOLATILE (ref) = 1;
6267 gimplify_assign (ref, var, &seq);
6268 }
6269
6270 if (lhs)
6271 gimplify_assign (lhs, var, &seq);
6272
6273 pop_gimplify_context (NULL);
6274 gsi_replace_with_seq (&gsi, seq, true);
6275 }
6276
6277 /* NVPTX implementation of GOACC_REDUCTION_INIT. */
6278
6279 static void
6280 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6281 {
6282 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6283 tree lhs = gimple_call_lhs (call);
6284 tree var = gimple_call_arg (call, 2);
6285 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6286 enum tree_code rcode
6287 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6288 tree init = omp_reduction_init_op (gimple_location (call), rcode,
6289 TREE_TYPE (var));
6290 gimple_seq seq = NULL;
6291
6292 push_gimplify_context (true);
6293
6294 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6295 {
6296 /* Initialize the vector lanes other than lane 0 to INIT_VAL (OP).  */
6297 tree tid = make_ssa_name (integer_type_node);
6298 tree dim_vector = gimple_call_arg (call, 3);
6299 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6300 dim_vector);
6301 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6302 NULL_TREE, NULL_TREE);
6303
6304 gimple_call_set_lhs (tid_call, tid);
6305 gimple_seq_add_stmt (&seq, tid_call);
6306 gimple_seq_add_stmt (&seq, cond_stmt);
6307
6308 /* Split the block just after the call. */
6309 edge init_edge = split_block (gsi_bb (gsi), call);
6310 basic_block init_bb = init_edge->dest;
6311 basic_block call_bb = init_edge->src;
6312
6313 /* Fixup flags from call_bb to init_bb. */
6314 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6315 init_edge->probability = profile_probability::even ();
6316
6317 /* Set the initialization stmts. */
6318 gimple_seq init_seq = NULL;
6319 tree init_var = make_ssa_name (TREE_TYPE (var));
6320 gimplify_assign (init_var, init, &init_seq);
6321 gsi = gsi_start_bb (init_bb);
6322 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6323
6324 /* Split block just after the init stmt. */
6325 gsi_prev (&gsi);
6326 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6327 basic_block dst_bb = inited_edge->dest;
6328
6329 /* Create false edge from call_bb to dst_bb. */
6330 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6331 nop_edge->probability = profile_probability::even ();
6332
6333 /* Create phi node in dst block. */
6334 gphi *phi = create_phi_node (lhs, dst_bb);
6335 add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6336 add_phi_arg (phi, var, nop_edge, gimple_location (call));
6337
6338 /* Reset dominator of dst bb. */
6339 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6340
6341 /* Reset the gsi. */
6342 gsi = gsi_for_stmt (call);
6343 }
6344 else
6345 {
6346 if (level == GOMP_DIM_GANG)
6347 {
6348 /* If there's no receiver object, propagate the incoming VAR. */
6349 tree ref_to_res = gimple_call_arg (call, 1);
6350 if (integer_zerop (ref_to_res))
6351 init = var;
6352 }
6353
6354 if (lhs != NULL_TREE)
6355 gimplify_assign (lhs, init, &seq);
6356 }
6357
6358 pop_gimplify_context (NULL);
6359 gsi_replace_with_seq (&gsi, seq, true);
6360 }
6361
6362 /* NVPTX implementation of GOACC_REDUCTION_FINI. */
6363
6364 static void
6365 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
6366 {
6367 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6368 tree lhs = gimple_call_lhs (call);
6369 tree ref_to_res = gimple_call_arg (call, 1);
6370 tree var = gimple_call_arg (call, 2);
6371 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6372 enum tree_code op
6373 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6374 gimple_seq seq = NULL;
6375 tree r = NULL_TREE;
6376
6377 push_gimplify_context (true);
6378
6379 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6380 {
6381 /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
6382 but that requires a method of emitting a unified jump at the
6383 gimple level. */
6384 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
6385 {
6386 tree other_var = make_ssa_name (TREE_TYPE (var));
6387 nvptx_generate_vector_shuffle (gimple_location (call),
6388 other_var, var, shfl, &seq);
6389
6390 r = make_ssa_name (TREE_TYPE (var));
6391 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6392 var, other_var), &seq);
6393 var = r;
6394 }
6395 }
6396 else
6397 {
6398 tree accum = NULL_TREE;
6399
6400 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
6401 {
6402 /* Get reduction buffer address. */
6403 tree offset = gimple_call_arg (call, 5);
6404 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6405 level == GOMP_DIM_VECTOR);
6406 tree ptr = make_ssa_name (TREE_TYPE (call));
6407
6408 gimplify_assign (ptr, call, &seq);
6409 accum = ptr;
6410 }
6411 else if (integer_zerop (ref_to_res))
6412 r = var;
6413 else
6414 accum = ref_to_res;
6415
6416 if (accum)
6417 {
6418 /* UPDATE the accumulator. */
6419 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6420 seq = NULL;
6421 r = nvptx_reduction_update (gimple_location (call), &gsi,
6422 accum, var, op);
6423 }
6424 }
6425
6426 if (lhs)
6427 gimplify_assign (lhs, r, &seq);
6428 pop_gimplify_context (NULL);
6429
6430 gsi_replace_with_seq (&gsi, seq, true);
6431 }
6432
6433 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */
6434
6435 static void
6436 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
6437 {
6438 gimple_stmt_iterator gsi = gsi_for_stmt (call);
6439 tree lhs = gimple_call_lhs (call);
6440 tree var = gimple_call_arg (call, 2);
6441 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6442 gimple_seq seq = NULL;
6443
6444 push_gimplify_context (true);
6445 if (level == GOMP_DIM_WORKER
6446 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6447 {
6448 /* Read the worker reduction buffer. */
6449 tree offset = gimple_call_arg (call, 5);
6450 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6451 level == GOMP_DIM_VECTOR);
6452 tree ptr = make_ssa_name (TREE_TYPE (call));
6453
6454 gimplify_assign (ptr, call, &seq);
6455 var = build_simple_mem_ref (ptr);
6456 TREE_THIS_VOLATILE (var) = 1;
6457 }
6458
6459 if (level != GOMP_DIM_GANG)
6460 {
6461 /* Write to the receiver object. */
6462 tree ref_to_res = gimple_call_arg (call, 1);
6463
6464 if (!integer_zerop (ref_to_res))
6465 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6466 }
6467
6468 if (lhs)
6469 gimplify_assign (lhs, var, &seq);
6470
6471 pop_gimplify_context (NULL);
6472
6473 gsi_replace_with_seq (&gsi, seq, true);
6474 }
6475
6476 /* NVPTX reduction expander. */
6477
6478 static void
6479 nvptx_goacc_reduction (gcall *call)
6480 {
6481 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6482 offload_attrs oa;
6483
6484 populate_offload_attrs (&oa);
6485
6486 switch (code)
6487 {
6488 case IFN_GOACC_REDUCTION_SETUP:
6489 nvptx_goacc_reduction_setup (call, &oa);
6490 break;
6491
6492 case IFN_GOACC_REDUCTION_INIT:
6493 nvptx_goacc_reduction_init (call, &oa);
6494 break;
6495
6496 case IFN_GOACC_REDUCTION_FINI:
6497 nvptx_goacc_reduction_fini (call, &oa);
6498 break;
6499
6500 case IFN_GOACC_REDUCTION_TEARDOWN:
6501 nvptx_goacc_reduction_teardown (call, &oa);
6502 break;
6503
6504 default:
6505 gcc_unreachable ();
6506 }
6507 }
6508
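/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Returning true unconditionally
   prevents constants from ever being forced into memory.  */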
6509 static bool
6510 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6511 rtx x ATTRIBUTE_UNUSED)
6512 {
6513 return true;
6514 }
6515
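/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only V2SImode and V2DImode
   vectors are supported.  */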
6516 static bool
6517 nvptx_vector_mode_supported (machine_mode mode)
6518 {
6519 return (mode == V2SImode
6520 || mode == V2DImode);
6521 }
6522
6523 /* Return the preferred mode for vectorizing scalar MODE. */
6524
6525 static machine_mode
6526 nvptx_preferred_simd_mode (scalar_mode mode)
6527 {
6528 switch (mode)
6529 {
6530 case E_DImode:
6531 return V2DImode;
6532 case E_SImode:
6533 return V2SImode;
6534
6535 default:
6536 return default_preferred_simd_mode (mode);
6537 }
6538 }
6539
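/* Adjust the alignment used for an integer object of TYPE from BASIC_ALIGN:
   TImode-sized integers only need the alignment of the mode they are split
   into.  */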
6540 unsigned int
6541 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6542 {
6543 if (TREE_CODE (type) == INTEGER_TYPE)
6544 {
6545 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6546 if (size == GET_MODE_SIZE (TImode))
6547 return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6548 }
6549
6550 return basic_align;
6551 }
6552
6553 /* Implement TARGET_MODES_TIEABLE_P. */
6554
6555 static bool
6556 nvptx_modes_tieable_p (machine_mode, machine_mode)
6557 {
6558 return false;
6559 }
6560
6561 /* Implement TARGET_HARD_REGNO_NREGS. */
6562
6563 static unsigned int
6564 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6565 {
6566 return 1;
6567 }
6568
6569 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
6570
6571 static bool
6572 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6573 {
6574 return false;
6575 }
6576
6577 /* Implement TARGET_TRULY_NOOP_TRUNCATION. */
6578
6579 static bool
6580 nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
6581 {
6582 return false;
6583 }
6584
6585 static GTY(()) tree nvptx_previous_fndecl;
6586
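/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset the per-function shared
   memory partitioning state when switching to a new function.  */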
6587 static void
6588 nvptx_set_current_function (tree fndecl)
6589 {
6590 if (!fndecl || fndecl == nvptx_previous_fndecl)
6591 return;
6592
6593 nvptx_previous_fndecl = fndecl;
6594 vector_red_partition = 0;
6595 oacc_bcast_partition = 0;
6596 }
6597
6598 /* Implement TARGET_LIBC_HAS_FUNCTION. */
6599
6600 bool
6601 nvptx_libc_has_function (enum function_class fn_class, tree type)
6602 {
6603 if (fn_class == function_sincos)
6604 {
6605 if (type != NULL_TREE)
6606 /* Currently, newlib does not support sincosl. */
6607 return type == float_type_node || type == double_type_node;
6608 else
6609 return true;
6610 }
6611
6612 return default_libc_has_function (fn_class, type);
6613 }
6614
6615 #undef TARGET_OPTION_OVERRIDE
6616 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6617
6618 #undef TARGET_ATTRIBUTE_TABLE
6619 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6620
6621 #undef TARGET_LRA_P
6622 #define TARGET_LRA_P hook_bool_void_false
6623
6624 #undef TARGET_LEGITIMATE_ADDRESS_P
6625 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6626
6627 #undef TARGET_PROMOTE_FUNCTION_MODE
6628 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6629
6630 #undef TARGET_FUNCTION_ARG
6631 #define TARGET_FUNCTION_ARG nvptx_function_arg
6632 #undef TARGET_FUNCTION_INCOMING_ARG
6633 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6634 #undef TARGET_FUNCTION_ARG_ADVANCE
6635 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6636 #undef TARGET_FUNCTION_ARG_BOUNDARY
6637 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6638 #undef TARGET_PASS_BY_REFERENCE
6639 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6640 #undef TARGET_FUNCTION_VALUE_REGNO_P
6641 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6642 #undef TARGET_FUNCTION_VALUE
6643 #define TARGET_FUNCTION_VALUE nvptx_function_value
6644 #undef TARGET_LIBCALL_VALUE
6645 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6646 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6647 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6648 #undef TARGET_GET_DRAP_RTX
6649 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6650 #undef TARGET_SPLIT_COMPLEX_ARG
6651 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6652 #undef TARGET_RETURN_IN_MEMORY
6653 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6654 #undef TARGET_OMIT_STRUCT_RETURN_REG
6655 #define TARGET_OMIT_STRUCT_RETURN_REG true
6656 #undef TARGET_STRICT_ARGUMENT_NAMING
6657 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6658 #undef TARGET_CALL_ARGS
6659 #define TARGET_CALL_ARGS nvptx_call_args
6660 #undef TARGET_END_CALL_ARGS
6661 #define TARGET_END_CALL_ARGS nvptx_end_call_args
6662
6663 #undef TARGET_ASM_FILE_START
6664 #define TARGET_ASM_FILE_START nvptx_file_start
6665 #undef TARGET_ASM_FILE_END
6666 #define TARGET_ASM_FILE_END nvptx_file_end
6667 #undef TARGET_ASM_GLOBALIZE_LABEL
6668 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6669 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6670 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6671 #undef TARGET_PRINT_OPERAND
6672 #define TARGET_PRINT_OPERAND nvptx_print_operand
6673 #undef TARGET_PRINT_OPERAND_ADDRESS
6674 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6675 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
6676 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6677 #undef TARGET_ASM_INTEGER
6678 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6679 #undef TARGET_ASM_DECL_END
6680 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6681 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6682 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6683 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6684 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6685 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6686 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6687
6688 #undef TARGET_MACHINE_DEPENDENT_REORG
6689 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6690 #undef TARGET_NO_REGISTER_ALLOCATION
6691 #define TARGET_NO_REGISTER_ALLOCATION true
6692
6693 #undef TARGET_ENCODE_SECTION_INFO
6694 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6695 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6696 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6697
6698 #undef TARGET_VECTOR_ALIGNMENT
6699 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6700
6701 #undef TARGET_CANNOT_COPY_INSN_P
6702 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6703
6704 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6705 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6706
6707 #undef TARGET_INIT_BUILTINS
6708 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6709 #undef TARGET_EXPAND_BUILTIN
6710 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6711 #undef TARGET_BUILTIN_DECL
6712 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6713
6714 #undef TARGET_SIMT_VF
6715 #define TARGET_SIMT_VF nvptx_simt_vf
6716
6717 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6718 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
6719
6720 #undef TARGET_GOACC_VALIDATE_DIMS
6721 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6722
6723 #undef TARGET_GOACC_DIM_LIMIT
6724 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6725
6726 #undef TARGET_GOACC_FORK_JOIN
6727 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6728
6729 #undef TARGET_GOACC_REDUCTION
6730 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6731
6732 #undef TARGET_CANNOT_FORCE_CONST_MEM
6733 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6734
6735 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6736 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6737
6738 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6739 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6740 nvptx_preferred_simd_mode
6741
6742 #undef TARGET_MODES_TIEABLE_P
6743 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6744
6745 #undef TARGET_HARD_REGNO_NREGS
6746 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6747
6748 #undef TARGET_CAN_CHANGE_MODE_CLASS
6749 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6750
6751 #undef TARGET_TRULY_NOOP_TRUNCATION
6752 #define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation
6753
6754 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6755 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6756
6757 #undef TARGET_SET_CURRENT_FUNCTION
6758 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6759
6760 #undef TARGET_LIBC_HAS_FUNCTION
6761 #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
6762
6763 struct gcc_target targetm = TARGET_INITIALIZER;
6764
6765 #include "gt-nvptx.h"
6766